Mirror of https://github.com/Vision-CAIR/MiniGPT-4.git, synced 2025-04-05 02:20:47 +00:00
Using ijson to avoid loading the full JSON into memory.
Using ijson to load items one by one, so that dataset/convert_cc_sbu.py and dataset/convert_laion.py can convert the datasets on machines with low RAM.
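For context, the pattern both converted scripts rely on is ijson.items(), which parses a top-level JSON array lazily and yields one element at a time instead of materializing the whole list the way json.load() does. Below is a minimal sketch of that pattern on its own, assuming a hypothetical sample.json containing an array of objects; the 'url' and 'caption' field names are illustrative, not taken from the dataset files:

import ijson

# sample.json is a hypothetical file holding a top-level JSON array,
# e.g. [{"url": "...", "caption": "..."}, ...]
with open('sample.json', 'rb') as f:
    # 'item' addresses each element of the top-level array; ijson parses
    # and yields the objects one by one, so memory use stays flat.
    for obj in ijson.items(f, 'item'):
        print(obj.get('url'), obj.get('caption'))

This is also why the headers variable in the diffs below starts as None: with streaming there is no full list to take data[0].keys() from, so the header row is derived from the first object that comes off the stream.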
This commit is contained in:
parent ef1ac08ce3
commit a8eb69ecd1
dataset/convert_cc_sbu.py
@@ -1,20 +1,22 @@
-import json
-import csv
+import ijson
 
 # specify input and output file paths
 input_file = 'ccs_synthetic_filtered_large.json'
 output_file = 'ccs_synthetic_filtered_large.tsv'
 
-# load JSON data from input file
-with open(input_file, 'r') as f:
-    data = json.load(f)
+# set header to None
+headers = None
 
-# extract header and data from JSON
-header = data[0].keys()
-rows = [x.values() for x in data]
-
-# write data to TSV file
-with open(output_file, 'w') as f:
-    writer = csv.writer(f, delimiter='\t')
-    writer.writerow(header)
-    writer.writerows(rows)
+# load JSON data from input file and open the output file at same time
+with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
+    objects = ijson.items(in_file, 'item')
+
+    for obj in objects:
+        # extract header and data from JSON
+        if headers is None:
+            headers = list(obj.keys())
+            out_file.write('\t'.join(headers) + '\n')
+
+        # write data to TSV file line by line
+        row = '\t'.join(str(obj[key]) for key in headers)
+        out_file.write(row + '\n')
dataset/convert_laion.py
@@ -1,20 +1,22 @@
-import json
-import csv
+import ijson
 
 # specify input and output file paths
 input_file = 'laion_synthetic_filtered_large.json'
 output_file = 'laion_synthetic_filtered_large.tsv'
 
-# load JSON data from input file
-with open(input_file, 'r') as f:
-    data = json.load(f)
+# set header to None
+headers = None
 
-# extract header and data from JSON
-header = data[0].keys()
-rows = [x.values() for x in data]
-
-# write data to TSV file
-with open(output_file, 'w') as f:
-    writer = csv.writer(f, delimiter='\t')
-    writer.writerow(header)
-    writer.writerows(rows)
+# load JSON data from input file and open the output file at same time
+with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
+    objects = ijson.items(in_file, 'item')
+
+    for obj in objects:
+        # extract header and data from JSON
+        if headers is None:
+            headers = list(obj.keys())
+            out_file.write('\t'.join(headers) + '\n')
+
+        # write data to TSV file line by line
+        row = '\t'.join(str(obj[key]) for key in headers)
+        out_file.write(row + '\n')