Mirror of https://github.com/Vision-CAIR/MiniGPT-4.git, synced 2025-04-06 19:10:45 +00:00
Use ijson to avoid loading the full JSON into memory.
Use ijson to load items one by one, so that dataset/convert_cc_sbu.py and dataset/convert_laion.py can convert the datasets on machines with low RAM.
This commit is contained in:
parent ef1ac08ce3
commit a8eb69ecd1
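For context, ijson parses the file incrementally and yields one record at a time instead of materializing the whole array. A minimal sketch of the pattern, assuming the input is a single top-level JSON array of objects ('data.json' and process() are placeholders, not names from this commit):

import ijson

with open('data.json', 'r') as f:        # placeholder input path
    # the 'item' prefix matches each element of a top-level JSON array
    for obj in ijson.items(f, 'item'):
        process(obj)                     # hypothetical per-record handler

Peak memory stays at roughly one record, which is what lets the two converter scripts below run on low-RAM machines.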
dataset/convert_cc_sbu.py
@@ -1,20 +1,22 @@
-import json
+import ijson
 import csv
 
 # specify input and output file paths
 input_file = 'ccs_synthetic_filtered_large.json'
 output_file = 'ccs_synthetic_filtered_large.tsv'
 
-# load JSON data from input file
-with open(input_file, 'r') as f:
-    data = json.load(f)
-
-# extract header and data from JSON
-header = data[0].keys()
-rows = [x.values() for x in data]
+# set header to None
+headers = None
+# load JSON data from input file and open the output file at same time
+with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
+    objects = ijson.items(in_file, 'item')
 
-# write data to TSV file
-with open(output_file, 'w') as f:
-    writer = csv.writer(f, delimiter='\t')
-    writer.writerow(header)
-    writer.writerows(rows)
+    for obj in objects:
+        # extract header and data from JSON
+        if headers is None:
+            headers = list(obj.keys())
+            out_file.write('\t'.join(headers) + '\n')
+
+        # write data to TSV file line by line
+        row = '\t'.join(str(obj[key]) for key in headers)
+        out_file.write(row + '\n')
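A note on the design choice above: the rewritten script emits TSV by joining field values with '\t' itself, so the csv import is now unused and any field containing a tab or newline would be written unescaped. A hedged sketch of a variant that keeps csv's quoting while still streaming (not what this commit does; the paths reuse the hard-coded names above):

import csv
import ijson

input_file = 'ccs_synthetic_filtered_large.json'
output_file = 'ccs_synthetic_filtered_large.tsv'

with open(input_file, 'r') as in_file, open(output_file, 'w', newline='') as out_file:
    writer = csv.writer(out_file, delimiter='\t')
    headers = None
    for obj in ijson.items(in_file, 'item'):
        if headers is None:
            headers = list(obj.keys())
            writer.writerow(headers)                    # write the header row once
        writer.writerow([obj[key] for key in headers])  # csv handles escaping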
dataset/convert_laion.py
@@ -1,20 +1,22 @@
-import json
+import ijson
 import csv
 
 # specify input and output file paths
 input_file = 'laion_synthetic_filtered_large.json'
 output_file = 'laion_synthetic_filtered_large.tsv'
 
-# load JSON data from input file
-with open(input_file, 'r') as f:
-    data = json.load(f)
-
-# extract header and data from JSON
-header = data[0].keys()
-rows = [x.values() for x in data]
+# set header to None
+headers = None
+# load JSON data from input file and open the output file at same time
+with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
+    objects = ijson.items(in_file, 'item')
 
-# write data to TSV file
-with open(output_file, 'w') as f:
-    writer = csv.writer(f, delimiter='\t')
-    writer.writerow(header)
-    writer.writerows(rows)
+    for obj in objects:
+        # extract header and data from JSON
+        if headers is None:
+            headers = list(obj.keys())
+            out_file.write('\t'.join(headers) + '\n')
+
+        # write data to TSV file line by line
+        row = '\t'.join(str(obj[key]) for key in headers)
+        out_file.write(row + '\n')
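Usage is unchanged for both converters; the only new requirement is the ijson package (pip install ijson). With the JSON files in the working directory, as the hard-coded relative paths assume, python dataset/convert_cc_sbu.py and python dataset/convert_laion.py should now run in roughly constant memory regardless of input size.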