From 64472dedb1745b077258abc8172dc5b326b2571e Mon Sep 17 00:00:00 2001
From: Bingyi Kang
Date: Wed, 24 May 2023 14:16:50 +0800
Subject: [PATCH] Audio dataset (#1)

* add audio dataset setup

---------

Co-authored-by: bingyikang
---
 dataset/audio/README.md   |  55 ++++++++++
 dataset/audio/make_tar.py | 216 ++++++++++++++++++++++++++++++++++++++
 dataset/audio/process.py  | 160 ++++++++++++++++++++++
 dataset/audio/setup.sh    |  45 ++++++++
 4 files changed, 476 insertions(+)
 create mode 100644 dataset/audio/README.md
 create mode 100644 dataset/audio/make_tar.py
 create mode 100644 dataset/audio/process.py
 create mode 100644 dataset/audio/setup.sh

diff --git a/dataset/audio/README.md b/dataset/audio/README.md
new file mode 100644
index 0000000..3820ef2
--- /dev/null
+++ b/dataset/audio/README.md
@@ -0,0 +1,55 @@
+## Audio Dataset
+
+## Stage 1: Pretraining
+We mainly use the [WavCaps](https://github.com/XinhaoMei/WavCaps) dataset for pre-training.
+
+### Download
+
+```bash
+# install git-lfs
+sudo apt update
+sudo apt-get install git-lfs
+
+git clone https://huggingface.co/datasets/cvssp/WavCaps
+cd WavCaps
+git lfs pull --include "*"
+```
+
+### Processing
+
+1. Merge the zip shards:
+```bash
+zip -s- FILE_NAME.zip -O COMBINED_FILE.zip
+```
+
+2. Extract the raw audio data:
+```bash
+unzip COMBINED_FILE.zip -d /target/dir
+```
+
+3. Create a JSON annotation file for each example. Either pass the data and JSON paths explicitly, as below, or edit the defaults in `dataset/audio/process.py`:
+```bash
+python3 dataset/audio/process.py --dataset test --data_dir /path/to/data --json_path /path/to/json
+```
+
+4. Pack the (audio, annotation) pairs into tar files:
+```bash
+python3 dataset/audio/make_tar.py --input /path/to/data --output /path/to/web_dataset \
+    --dataclass none --filename filename --num_element 500
+```
+
+To view a tar file:
+```bash
+tar tf filename.tar | sed 10q
+```
+
+**To set everything up in one line:**
+```bash
+# DATASET is one of: soundbible, bbc, audioset, freesound
+DATASET=soundbible bash dataset/audio/setup.sh
+```
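+
+### Sanity check in Python
+
+Beyond `tar tf`, a minimal sketch for reading a shard back with the `webdataset` reader (the shard path is a placeholder; `make_tar.py` names shards `filename{:06d}.tar`):
+```python
+import webdataset as wds
+
+dataset = wds.WebDataset("/path/to/web_dataset/filename000000.tar")
+for sample in dataset:
+    print(sample["__key__"])        # shared basename of the (.flac, .json) pair
+    print(sample["json"].decode())  # the raw annotation for this clip
+    break
+```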
diff --git a/dataset/audio/make_tar.py b/dataset/audio/make_tar.py
new file mode 100644
index 0000000..c8f5d2e
--- /dev/null
+++ b/dataset/audio/make_tar.py
@@ -0,0 +1,216 @@
+import argparse
+import io
+import json
+import os
+import random
+import tarfile
+from glob import glob
+
+import librosa
+import numpy as np
+import soundfile as sf
+import webdataset as wds
+from tqdm import tqdm
+
+
+def tardir(
+    file_path, tar_name, n_entry_each, audio_ext=".flac", text_ext=".json", shuffle=True, start_idx=0, delete_file=False
+):
+    """
+    Create tar shards from the (audio, text) file pairs found in a folder.
+    @param file_path    | string  | the path where the audio and text files are located
+    @param tar_name     | string  | the tar name prefix; shards are written as tar_name{:06d}.tar
+    @param n_entry_each | int     | how many (audio, text) pairs go into each tar
+    @param audio_ext    | string  | the extension of the audio files
+    @param text_ext     | string  | the extension of the text files
+    @param shuffle      | boolean | True to shuffle the file order before packing
+    @param start_idx    | int     | the start index used to number the tars
+    @param delete_file  | boolean | True to delete the audio and text files after packing
+    """
+    filelist = glob(file_path + '/*' + audio_ext)
+    if shuffle:
+        random.shuffle(filelist)
+    count = 0
+    n_split = len(filelist) // n_entry_each
+    if n_split * n_entry_each != len(filelist):
+        n_split += 1
+    size_dict = {
+        os.path.basename(tar_name) + "{:06d}".format(i) + ".tar": n_entry_each
+        for i in range(n_split)
+    }
+    if n_split * n_entry_each != len(filelist):
+        # the last shard holds the remainder
+        size_dict[os.path.basename(tar_name) + "{:06d}".format(n_split - 1) + ".tar"] = (
+            len(filelist) - (n_split - 1) * n_entry_each
+        )
+    for i in tqdm(range(start_idx, n_split + start_idx), desc='Creating .tar file:'):
+        with tarfile.open(tar_name + "{:06d}".format(i) + ".tar", "w") as tar_handle:
+            for j in range(count, len(filelist)):
+                audio_file_path = filelist[j]
+                # the annotation sits next to the audio file, same basename
+                text_file_path = os.path.splitext(audio_file_path)[0] + text_ext
+                tar_handle.add(audio_file_path)
+                tar_handle.add(text_file_path)
+                if delete_file:
+                    os.remove(audio_file_path)
+                    os.remove(text_file_path)
+                if (j + 1) % n_entry_each == 0:
+                    count = j + 1
+                    break
+    # record the number of entries per shard
+    json_object = json.dumps(size_dict, indent=4)
+    with open(os.path.join(os.path.dirname(tar_name), "sizes.json"), "w") as outfile:
+        outfile.write(json_object)
+    return size_dict
+
+
+def packup(input, output, filename, dataclass='all', num_element=512, start_idx=0, delete_file=False):
+    if not os.path.exists(os.path.join(input, dataclass)):
+        print("Dataclass folder {} does not exist. Skipping it.".format(dataclass))
+        return
+    os.makedirs(os.path.join(output, dataclass), exist_ok=True)
+    tardir(
+        os.path.join(input, dataclass),
+        os.path.join(output, dataclass, filename),
+        num_element,
+        start_idx=start_idx,
+        delete_file=delete_file,
+    )
+
+
+def load_from_tar(
+    file_path,
+    file_path_type="local",
+    audio_ext="flac",
+    text_ext="json",
+    samplerate=32000,
+    mono=True,
+    max_len=1000000,
+    dtype="float64",
+    res_type="kaiser_best",
+):
+    """
+    Load a tar file into three aligned lists: (audios, texts, names).
+    @param file_path      | string  | the path of the tar file
+    @param file_path_type | string  | controls the address prefix in case it was omitted:
+                                      if "local" and 'file:\\' is missing, it is prepended automatically
+    @param audio_ext      | string  | the extension of the audio entries
+    @param text_ext       | string  | the extension of the text entries
+    @param samplerate     | int     | the target sample rate of the audio
+    @param mono           | boolean | whether to downmix the audio to a mono channel
+    @param max_len        | int     | max length of the audio; longer clips are randomly cropped,
+                                      shorter ones are zero-padded
+    @param dtype          | string  | the dtype of the audio sample representation
+    @param res_type       | string  | the resampling method
+    """
+    if file_path_type == "local" and ("file:\\" not in file_path):
+        file_path = "file:\\" + file_path
+    dataset = wds.WebDataset(file_path)
+    audios = []
+    texts = []
+    names = []
+    for sample in dataset:
+        for key, value in sample.items():
+            if key == audio_ext:
+                audio_data, orig_sr = sf.read(io.BytesIO(value))
+                if mono and audio_data.ndim > 1:
+                    # soundfile returns (frames, channels); librosa expects channels first
+                    audio_data = librosa.to_mono(audio_data.T)
+                if samplerate is not None:
+                    audio_data = librosa.resample(
+                        audio_data,
+                        orig_sr=orig_sr,
+                        target_sr=samplerate,
+                        res_type=res_type,
+                    )
+                if len(audio_data) > max_len:
+                    # random crop: keep max_len samples, offset either from the
+                    # left or from the right with equal probability
+                    overflow = len(audio_data) - max_len
+                    idx = np.random.randint(0, overflow + 1)
+                    if np.random.rand() > 0.5:
+                        audio_data = audio_data[idx : idx + max_len]
+                    else:
+                        audio_data = audio_data[
+                            len(audio_data) - idx - max_len : len(audio_data) - idx
+                        ]
+                else:
+                    audio_data = np.pad(
+                        audio_data,
+                        (0, max_len - len(audio_data)),
+                        mode="constant",
+                        constant_values=0,
+                    )
+                audios.append((audio_data, samplerate))
+            elif key == text_ext:
+                texts.append(value)
+            elif key == "__key__":
+                names.append(value)
+    return audios, texts, names
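+
+# Example usage of load_from_tar (a sketch; the shard path is hypothetical):
+#   audios, texts, names = load_from_tar(
+#       "/path/to/web_dataset/filename000000.tar",
+#       samplerate=32000,
+#       max_len=32000 * 10,  # keep at most 10 seconds at 32 kHz
+#   )
+#   audio_data, sr = audios[0]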
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="input folder, expecting subdirectories like train, valid or test",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        help="output folder, generating tar files at output/dataclass/filename_{}.tar",
+    )
+    parser.add_argument(
+        "--filename",
+        type=str,
+        default="",
+        help="the filename prefix of the tars, generating tar files at output/dataclass/filename_{}.tar",
+    )
+    parser.add_argument(
+        "--dataclass", type=str, default="all", help="train, test, valid or all"
+    )
+    parser.add_argument(
+        "--num_element", type=int, default=512, help="pairs of (audio, text) to include in a single tar"
+    )
+    parser.add_argument(
+        "--start_idx", type=int, default=0, help="start index of the tar shards"
+    )
+    parser.add_argument(
+        "--delete_file", action='store_true', help="delete the input files while making tars"
+    )
+    args = parser.parse_args()
+
+    if args.dataclass == "all":
+        for x in ["train", "valid", "test"]:
+            packup(args.input, args.output, args.filename, x, args.num_element, args.start_idx, args.delete_file)
+    elif args.dataclass == "none":
+        # the input folder itself holds the (audio, text) pairs
+        os.makedirs(args.output, exist_ok=True)
+        tardir(
+            args.input,
+            os.path.join(args.output, args.filename),
+            args.num_element,
+            start_idx=args.start_idx,
+            delete_file=args.delete_file,
+        )
+    else:  # a custom dataclass name
+        packup(args.input, args.output, args.filename, args.dataclass, args.num_element, args.start_idx, args.delete_file)
\ No newline at end of file
diff --git a/dataset/audio/process.py b/dataset/audio/process.py
new file mode 100644
index 0000000..67e5c3e
--- /dev/null
+++ b/dataset/audio/process.py
@@ -0,0 +1,160 @@
+"""
+Code for processing the WavCaps dataset.
+"""
+import argparse
+import glob
+import json
+import os
+
+from tqdm import tqdm
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--dataset", type=str, default='bbc', choices=['bbc', 'audioset', 'soundbible', 'freesound', 'test'])
+parser.add_argument(
+    "--data_root",
+    type=str,
+    default="/mnt/bn/zilongdata-hl/dataset/wavcaps",
+)
+parser.add_argument(
+    "--data_dir",
+    type=str,
+    default="/mnt/bn/zilongdata-hl/dataset/wavcaps/raw_datasets/test/",
+)
+parser.add_argument(
+    "--json_path",
+    type=str,
+    default="/mnt/bn/zilongdata-hl/dataset/wavcaps/WavCaps/json_files/test.json",
+)
+args = parser.parse_args()
+
+DATA_DIRS = {
+    "bbc": "raw_datasets/BBC_Sound_Effects_flac/",
+    "audioset": "raw_datasets/AudioSet_SL_flac/",
+    "soundbible": "raw_datasets/SoundBible_flac/",
+    "freesound": "raw_datasets/FreeSound_flac/",
+}
+
+JSON_PATHS = {
+    "bbc": "WavCaps/json_files/BBC_Sound_Effects/bbc_final.json",
+    "audioset": "WavCaps/json_files/AudioSet_SL/as_final.json",
+    "soundbible": "WavCaps/json_files/SoundBible/sb_final.json",
+    "freesound": "WavCaps/json_files/FreeSound/fsd_final.json",
+}
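+
+# Expected layout under --data_root after setup.sh has downloaded and unpacked
+# a dataset (SoundBible shown as an example):
+#   raw_datasets/SoundBible_flac/*.flac
+#   WavCaps/json_files/SoundBible/sb_final.json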
+""" +import argparse + +import os +from tqdm import tqdm +import glob +import numpy as np +import json + +parser = argparse.ArgumentParser() +parser.add_argument("--dataset", type=str, default='bbc', choices=['bbc', 'audioset', 'soundbible', 'freesound', 'test']) +parser.add_argument( + "--data_root", + type=str, + default="/mnt/bn/zilongdata-hl/dataset/wavcaps" +) +parser.add_argument( + "--data_dir", + type=str, + default="/mnt/bn/zilongdata-hl/dataset/wavcaps/raw_datasets/test/" +) +parser.add_argument( + "--json_path", + type=str, + default="/mnt/bn/zilongdata-hl/dataset/wavcaps/WavCaps/json_files/test.json", +) +args = parser.parse_args() + +DATA_DIRS = { + "bbc": "raw_datasets/BBC_Sound_Effects_flac/", + "audioset": "raw_datasets/AudioSet_SL_flac/", + "soundbible": "raw_datasets/SoundBible_flac/", + "freesound": "raw_datasets/FreeSound_flac/", +} + +JSON_PATHS = { + "bbc": "WavCaps/json_files/BBC_Sound_Effects/bbc_final.json", + "audioset": "WavCaps/json_files/AudioSet_SL/as_final.json", + "soundbible": "WavCaps/json_files/SoundBible/sb_final.json", + "freesound": "WavCaps/json_files/FreeSound/fsd_final.json", +} + + + +def load_audioset_json(fname): + """A sample example: + { + 'id': 'Yb0RFKhbpFJA.wav', + 'caption': 'Wind and a man speaking are heard, accompanied by buzzing and ticking.', + 'audio': 'wav_path', + 'duration': 10.0 + } + """ + with open(fname) as f: + data = json.load(f) + + for sample in data['data']: + yield sample['id'].split('.')[0], sample['caption'], sample + + +def load_soundbible_json(fname): + """A sample example: + { + 'title': 'Airplane Landing Airport', + 'description': 'Large commercial airplane landing at an airport runway.', + 'author': 'Daniel Simion', + 'href': '2219-Airplane-Landing-Airport.html', + 'caption': 'An airplane is landing.', + 'id': '2219', + 'duration': 14.1424375, + 'audio': 'wav_path', + 'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav' + } + """ + with open(fname) as f: + data = json.load(f) + + for sample in data['data']: + yield sample['id'], sample['caption'], sample + + +def load_freesound_json(fname): + """A sample example: + { 'id': '180913', + 'file_name': 'UK Mello.wav', + 'href': '/people/Tempouser/sounds/180913/', + 'tags': ['Standard', 'ringtone', 'basic', 'traditional'], + 'description': 'Standard traditional basic ringtone, in mello tone.', + 'author': 'Tempouser', + 'duration': 3.204375, + 'download_link': 'https://freesound.org/people/Tempouser/sounds/180913/download/180913__tempouser__uk-mello.wav', + 'caption': 'A traditional ringtone is playing.', + 'audio': 'wav_path' + } + """ + with open(fname) as f: + data = json.load(f) + + for sample in data['data']: + yield sample['id'], sample['caption'], sample + + +def load_bbc_json(fname): + """A sample example: + { + 'description': "Timber & Wood - Rip saw, carpenters' workshop.", + 'category': "['Machines']", + 'caption': "Someone is using a rip saw in a carpenter's workshop.", + 'id': '07066104', + 'duration': 138.36, + 'audio': 'wav_path', + 'download_link': 'https://sound-effects-media.bbcrewind.co.uk/zip/07066104.wav.zip' + } + """ + with open(fname) as f: + data = json.load(f) + + for sample in data['data']: + yield sample['id'], sample['caption'], sample + + +def load_test_json(fname): + """Using SoundBible as a text example.""" + with open(fname) as f: + data = json.load(f) + + for sample in data['data']: + yield sample['id'], sample['caption'], sample + +if __name__ == '__main__': + if args.dataset in DATA_DIRS: + data_dir = 
+
+
+if __name__ == '__main__':
+    if args.dataset in DATA_DIRS:
+        data_dir = os.path.join(args.data_root, DATA_DIRS[args.dataset])
+        json_path = os.path.join(args.data_root, JSON_PATHS[args.dataset])
+    else:
+        data_dir = args.data_dir
+        json_path = args.json_path
+
+    file_list = glob.glob(f'{data_dir}/*.flac')
+    for data_id, unused_caption, meta_data in tqdm(list(globals()[f'load_{args.dataset}_json'](json_path))):
+        file_name = os.path.join(data_dir, data_id + '.flac')
+        json_save_path = os.path.join(data_dir, data_id + '.json')
+
+        assert os.path.exists(file_name), f'{file_name} does not exist!'
+        file_list.remove(file_name)
+        with open(json_save_path, 'w') as f:
+            json.dump(meta_data, f)
+
+    # remove audio files that have no annotation
+    if len(file_list) > 0:
+        for f in file_list:
+            os.remove(f)
diff --git a/dataset/audio/setup.sh b/dataset/audio/setup.sh
new file mode 100644
index 0000000..2010c4b
--- /dev/null
+++ b/dataset/audio/setup.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# To set up a dataset, run
+# `DATASET=soundbible bash setup.sh`
+
+DATA_ROOT=/mnt/bn/zilongdata-hl/dataset/wavcaps
+DATASET=${DATASET:-soundbible}
+
+if [[ $DATASET == soundbible ]]; then
+    DATA_FOLDER=SoundBible
+    NUM_ELEMENT=2000
+elif [[ $DATASET == bbc ]]; then
+    DATA_FOLDER=BBC_Sound_Effects
+    NUM_ELEMENT=500
+elif [[ $DATASET == audioset ]]; then
+    DATA_FOLDER=AudioSet_SL
+    NUM_ELEMENT=2000
+elif [[ $DATASET == freesound ]]; then
+    DATA_FOLDER=FreeSound
+    NUM_ELEMENT=500
+else
+    echo "${DATASET} not found!"
+    exit 1
+fi
+
+CODE_PATH=$(pwd)
+
+# Merge zip shards
+cd ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}
+zip -s- ${DATA_FOLDER}.zip -O ${DATA_FOLDER}_combined.zip
+
+# Extract the combined zip file; the archive nests the waveforms deep inside,
+# so flatten them into raw_datasets/${DATA_FOLDER}_flac
+cd ${DATA_ROOT}
+unzip ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}/${DATA_FOLDER}_combined.zip -d raw_datasets/${DATA_FOLDER}
+mv raw_datasets/${DATA_FOLDER}/mnt/fast/nobackup/scratch4weeks/xm00178/WavCaps/data/waveforms/${DATA_FOLDER}_flac raw_datasets/${DATA_FOLDER}_flac
+rm -rf raw_datasets/${DATA_FOLDER}
+
+# Process raw data to create json annotation files
+cd $CODE_PATH
+python3 dataset/audio/process.py --data_root ${DATA_ROOT} --dataset ${DATASET}
+
+# Pack up tar files
+python3 dataset/audio/make_tar.py --input ${DATA_ROOT}/raw_datasets/${DATA_FOLDER}_flac \
+    --output ${DATA_ROOT}/web_datasets/${DATA_FOLDER} \
+    --dataclass none --filename ${DATA_FOLDER} --num_element ${NUM_ELEMENT}
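+
+# Optional sanity check: list the first few entries of the first shard
+# (with the default start index, shards are named ${DATA_FOLDER}000000.tar, ...)
+# tar tf ${DATA_ROOT}/web_datasets/${DATA_FOLDER}/${DATA_FOLDER}000000.tar | sed 10q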