mirror of
https://github.com/Vision-CAIR/MiniGPT-4.git
synced 2025-04-09 12:30:45 +00:00
Use ijson to load the JSON item by item, so that dataset/convert_cc_sbu.py and dataset/convert_laion.py can convert the datasets on machines with low RAM.
23 lines
723 B
Python
23 lines
723 B
Python
import csv

import ijson
|
|
|
|
# specify input and output file paths
input_file = 'laion_synthetic_filtered_large.json'
output_file = 'laion_synthetic_filtered_large.tsv'


def write_tsv(records, out_file):
    """Write an iterable of dict records to ``out_file`` as tab-separated rows.

    The keys of the first record become the header row and fix the column
    order for every following record. Each value is converted with ``str()``
    before being written. Fields that contain a tab, a newline or a double
    quote are quoted by ``csv.writer`` so that one record always remains one
    parseable row.

    Args:
        records: Iterable of mappings; consumed lazily, one record at a time.
        out_file: Writable text file object (ideally opened with
            ``newline=''`` as the ``csv`` module recommends).

    Returns:
        The number of data rows written (the header row is not counted).

    Raises:
        KeyError: If a later record lacks one of the header keys.
    """
    writer = csv.writer(out_file, delimiter='\t', lineterminator='\n')
    headers = None
    row_count = 0

    for record in records:
        # The first record defines the header; nothing is written for
        # an empty input.
        if headers is None:
            headers = list(record)
            writer.writerow(headers)

        # str() keeps the original textual form of every value
        # (e.g. None is written as "None", not as an empty field).
        writer.writerow([str(record[key]) for key in headers])
        row_count += 1

    return row_count


def main():
    """Stream ``input_file`` through ijson and write it to ``output_file``."""
    # Binary mode for the input: ijson decodes the bytes itself and its
    # documentation recommends binary file objects. The output encoding is
    # pinned to UTF-8 so the result does not depend on the machine's locale.
    with open(input_file, 'rb') as in_file, \
            open(output_file, 'w', encoding='utf-8', newline='') as out_file:
        # 'item' selects each element of the top-level JSON array, so only
        # one record is held in memory at a time.
        write_tsv(ijson.items(in_file, 'item'), out_file)


if __name__ == '__main__':
    main()