Using ijson to avoid loading the full JSON file into memory.

ijson reads the input JSON item by item, so the datasets can be converted
with dataset/convert_cc_sbu.py and dataset/convert_laion.py on machines
with low RAM.
This commit is contained in:
SamimAB 2023-09-20 23:32:58 +05:30
parent ef1ac08ce3
commit a8eb69ecd1
2 changed files with 32 additions and 28 deletions

View File

@ -1,20 +1,22 @@
import json import ijson
import csv
# specify input and output file paths # specify input and output file paths
input_file = 'ccs_synthetic_filtered_large.json' input_file = 'ccs_synthetic_filtered_large.json'
output_file = 'ccs_synthetic_filtered_large.tsv' output_file = 'ccs_synthetic_filtered_large.tsv'
# load JSON data from input file # set header to None
with open(input_file, 'r') as f: headers = None
data = json.load(f)
# load JSON data from the input file and open the output file at the same time
with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
objects = ijson.items(in_file, 'item')
for obj in objects:
# extract header and data from JSON # extract header and data from JSON
header = data[0].keys() if headers is None:
rows = [x.values() for x in data] headers = list(obj.keys())
out_file.write('\t'.join(headers) + '\n')
# write data to TSV file # write data to TSV file line by line
with open(output_file, 'w') as f: row = '\t'.join(str(obj[key]) for key in headers)
writer = csv.writer(f, delimiter='\t') out_file.write(row + '\n')
writer.writerow(header)
writer.writerows(rows)

View File

@ -1,20 +1,22 @@
import json import ijson
import csv
# specify input and output file paths # specify input and output file paths
input_file = 'laion_synthetic_filtered_large.json' input_file = 'laion_synthetic_filtered_large.json'
output_file = 'laion_synthetic_filtered_large.tsv' output_file = 'laion_synthetic_filtered_large.tsv'
# load JSON data from input file # set header to None
with open(input_file, 'r') as f: headers = None
data = json.load(f)
# load JSON data from the input file and open the output file at the same time
with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
objects = ijson.items(in_file, 'item')
for obj in objects:
# extract header and data from JSON # extract header and data from JSON
header = data[0].keys() if headers is None:
rows = [x.values() for x in data] headers = list(obj.keys())
out_file.write('\t'.join(headers) + '\n')
# write data to TSV file # write data to TSV file line by line
with open(output_file, 'w') as f: row = '\t'.join(str(obj[key]) for key in headers)
writer = csv.writer(f, delimiter='\t') out_file.write(row + '\n')
writer.writerow(header)
writer.writerows(rows)