diff --git a/dataset/convert_cc_sbu.py b/dataset/convert_cc_sbu.py
index 8c325ed..cc9b1a3 100644
--- a/dataset/convert_cc_sbu.py
+++ b/dataset/convert_cc_sbu.py
@@ -1,20 +1,26 @@
-import json
 import csv
 
+import ijson
+
 # specify input and output file paths
 input_file = 'ccs_synthetic_filtered_large.json'
 output_file = 'ccs_synthetic_filtered_large.tsv'
 
-# load JSON data from input file
-with open(input_file, 'r') as f:
-    data = json.load(f)
+# header row is taken from the keys of the first JSON object
+headers = None
 
-# extract header and data from JSON
-header = data[0].keys()
-rows = [x.values() for x in data]
-
-# write data to TSV file
-with open(output_file, 'w') as f:
-    writer = csv.writer(f, delimiter='\t')
-    writer.writerow(header)
-    writer.writerows(rows)
+# stream the JSON array item by item instead of loading it all into memory;
+# ijson expects a binary file object, and the csv module wants newline=''
+with open(input_file, 'rb') as in_file, \
+        open(output_file, 'w', newline='', encoding='utf-8') as out_file:
+    # csv.writer quotes fields containing tabs, quotes or line breaks
+    writer = csv.writer(out_file, delimiter='\t', lineterminator='\n')
+
+    for obj in ijson.items(in_file, 'item'):
+        # extract header from the first object
+        if headers is None:
+            headers = list(obj.keys())
+            writer.writerow(headers)
+
+        # write data to TSV file row by row, in header order
+        writer.writerow([obj[key] for key in headers])
diff --git a/dataset/convert_laion.py b/dataset/convert_laion.py
index b793579..b0bcbc5 100644
--- a/dataset/convert_laion.py
+++ b/dataset/convert_laion.py
@@ -1,20 +1,26 @@
-import json
 import csv
 
+import ijson
+
 # specify input and output file paths
 input_file = 'laion_synthetic_filtered_large.json'
 output_file = 'laion_synthetic_filtered_large.tsv'
 
-# load JSON data from input file
-with open(input_file, 'r') as f:
-    data = json.load(f)
+# header row is taken from the keys of the first JSON object
+headers = None
 
-# extract header and data from JSON
-header = data[0].keys()
-rows = [x.values() for x in data]
-
-# write data to TSV file
-with open(output_file, 'w') as f:
-    writer = csv.writer(f, delimiter='\t')
-    writer.writerow(header)
-    writer.writerows(rows)
+# stream the JSON array item by item instead of loading it all into memory;
+# ijson expects a binary file object, and the csv module wants newline=''
+with open(input_file, 'rb') as in_file, \
+        open(output_file, 'w', newline='', encoding='utf-8') as out_file:
+    # csv.writer quotes fields containing tabs, quotes or line breaks
+    writer = csv.writer(out_file, delimiter='\t', lineterminator='\n')
+
+    for obj in ijson.items(in_file, 'item'):
+        # extract header from the first object
+        if headers is None:
+            headers = list(obj.keys())
+            writer.writerow(headers)
+
+        # write data to TSV file row by row, in header order
+        writer.writerow([obj[key] for key in headers])