Use ijson to avoid loading the full JSON file into memory.

Load the JSON item by item with ijson, so that dataset/convert_cc_sbu.py and
dataset/convert_laion.py can convert the datasets on machines with low RAM.
SamimAB 2023-09-20 23:32:58 +05:30
parent ef1ac08ce3
commit a8eb69ecd1
2 changed files with 32 additions and 28 deletions
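For reference, a minimal sketch (not part of the commit) of the streaming pattern both scripts now use: ijson.items(f, 'item') yields the elements of a top-level JSON array one at a time, so only the current record is held in memory. The file name below is a placeholder, not one of the dataset files.

import ijson

# Stream the elements of a top-level JSON array one by one instead of
# parsing the whole document with json.load(); the 'item' prefix selects
# each array element, so only the current object is kept in memory.
with open('some_array.json', 'rb') as f:   # placeholder path
    for obj in ijson.items(f, 'item'):
        # obj is a plain dict for one record; process or write it immediately
        print(obj)

Because rows are written as they are parsed, peak memory no longer grows with the size of the whole input file, only with the size of a single record.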

dataset/convert_cc_sbu.py

@@ -1,20 +1,22 @@
-import json
 import csv
+import ijson
 
 # specify input and output file paths
 input_file = 'ccs_synthetic_filtered_large.json'
 output_file = 'ccs_synthetic_filtered_large.tsv'
 
-# load JSON data from input file
-with open(input_file, 'r') as f:
-    data = json.load(f)
+# set header to None
+headers = None
 
-# extract header and data from JSON
-header = data[0].keys()
-rows = [x.values() for x in data]
-
-# write data to TSV file
-with open(output_file, 'w') as f:
-    writer = csv.writer(f, delimiter='\t')
-    writer.writerow(header)
-    writer.writerows(rows)
+# load JSON data from input file and open the output file at same time
+with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
+    objects = ijson.items(in_file, 'item')
+    for obj in objects:
+        # extract header and data from JSON
+        if headers is None:
+            headers = list(obj.keys())
+            out_file.write('\t'.join(headers) + '\n')
+        # write data to TSV file line by line
+        row = '\t'.join(str(obj[key]) for key in headers)
+        out_file.write(row + '\n')

dataset/convert_laion.py

@@ -1,20 +1,22 @@
-import json
 import csv
+import ijson
 
 # specify input and output file paths
 input_file = 'laion_synthetic_filtered_large.json'
 output_file = 'laion_synthetic_filtered_large.tsv'
 
-# load JSON data from input file
-with open(input_file, 'r') as f:
-    data = json.load(f)
+# set header to None
+headers = None
 
-# extract header and data from JSON
-header = data[0].keys()
-rows = [x.values() for x in data]
-
-# write data to TSV file
-with open(output_file, 'w') as f:
-    writer = csv.writer(f, delimiter='\t')
-    writer.writerow(header)
-    writer.writerows(rows)
+# load JSON data from input file and open the output file at same time
+with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
+    objects = ijson.items(in_file, 'item')
+    for obj in objects:
+        # extract header and data from JSON
+        if headers is None:
+            headers = list(obj.keys())
+            out_file.write('\t'.join(headers) + '\n')
+        # write data to TSV file line by line
+        row = '\t'.join(str(obj[key]) for key in headers)
+        out_file.write(row + '\n')