Audio dataset (#1)

* add audio dataset setup

---------

Co-authored-by: bingyikang <bingyikang@bytedance.com>
This commit is contained in:
Bingyi Kang 2023-05-24 14:16:50 +08:00 committed by GitHub
parent 2d2d781469
commit 64472dedb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 476 additions and 0 deletions

55
dataset/audio/README.md Normal file
View File

@ -0,0 +1,55 @@
## Audio Dataset
## Stage1: Pretraining
We mainly use [WavCaps](https://github.com/XinhaoMei/WavCaps) dataset for pre-training.
### Download
```Bash
# install git-lfs
sudo apt update
sudo apt-get install git-lfs
git clone https://huggingface.co/datasets/cvssp/WavCaps
cd WavCaps
git lfs pull --include "*"
```
### Processing
1. Extract zip file
```bash
# merge shards first
zip -s- FILE_NAME.zip -O COMBINED_FILE.zip
unzip COMBINED_FILE.zip
```
2. Processing
Extract raw audio data
```bash
unzip COMBINED_FILE.zip -d /target/dir
```
Create json files (annotations) for each example. Before processing, modify `dataset/audio/process.py` to set data and json path.
```bash
python3 dataset/audio/process.py --dataset test --data_dir /path/to/data --json_path /path/to/json
```
3. Pack with tar
```bash
python3 dataset/audio/make_tar.py --input /path/to/data --output /path/to/web_dataset \
--dataclass none --filename filename --num_element 500
```
To view tar file
```
tar tf filename.tar | sed 10q
```
**To set everything up in one line:**
```bash
# DATASET must be one of: soundbible, bbc, audioset, freesound
DATASET=soundbible bash dataset/audio/setup.sh
```

216
dataset/audio/make_tar.py Normal file
View File

@ -0,0 +1,216 @@
import argparse
import librosa
import json
import webdataset as wds
import numpy as np
import os
import random
import io
from glob import glob
from itertools import islice
import scipy.signal as sps
import soundfile as sf
from tqdm import tqdm
import tarfile
def tardir(
    file_path, tar_name, n_entry_each, audio_ext=".flac", text_ext=".json", shuffle=True, start_idx=0, delete_file=False
):
    """
    Pack (audio, text) file pairs found in ``file_path`` into numbered tar shards.

    @param file_path | string | the path where audio and text files are located
    @param tar_name | string | shard name prefix; shards are written as
                               ``tar_name`` + zero-padded index + ``.tar``
    @param n_entry_each | int | how many pairs of (audio, text) go in one tar
    @param audio_ext | string | the extension of the audio files
    @param text_ext | string | the extension of the text files
    @param shuffle | boolean | True to shuffle the file sequence before packing
    @param start_idx | int | the start index of the tar shard numbering
    @param delete_file | boolean | True to delete the audio and text files after packing

    Returns a dict mapping shard basename -> number of pairs it contains; the
    same mapping is also written to ``sizes.json`` next to the shards.
    """
    filelist = glob(file_path + '/*' + audio_ext)
    if shuffle:
        random.shuffle(filelist)
    count = 0
    # Number of shards needed; the last one may be only partially filled.
    n_split = len(filelist) // n_entry_each
    if n_split * n_entry_each != len(filelist):
        n_split += 1
    # FIX: manifest keys now include start_idx so they match the tar files
    # actually written below (the original always numbered keys from 0).
    size_dict = {
        os.path.basename(tar_name) + "{:06d}".format(i + start_idx) + ".tar": n_entry_each
        for i in range(n_split)
    }
    if n_split * n_entry_each != len(filelist):
        # The final shard holds only the remainder.
        size_dict[os.path.basename(tar_name) + "{:06d}".format(n_split - 1 + start_idx) + ".tar"] = (
            len(filelist) - (n_split - 1) * n_entry_each
        )
    for i in tqdm(range(start_idx, n_split + start_idx), desc='Creating .tar file:'):
        with tarfile.open(tar_name + "{:06d}".format(i) + ".tar", "w") as tar_handle:
            for j in range(count, len(filelist)):
                audio = filelist[j]
                basename = ".".join(audio.split(".")[:-1])
                # FIX: glob() already returns paths rooted at file_path, so
                # joining with file_path again broke relative inputs (for
                # absolute inputs os.path.join was a no-op). Use the glob
                # results directly.
                text_file_path = basename + text_ext
                audio_file_path = audio
                tar_handle.add(audio_file_path)
                tar_handle.add(text_file_path)
                if delete_file:
                    os.remove(audio_file_path)
                    os.remove(text_file_path)
                if (j + 1) % n_entry_each == 0:
                    # Remember where the next shard starts, then move on.
                    count = j + 1
                    break
            # (The explicit tar_handle.close() the original carried here was
            # redundant: the `with` statement already closes the handle.)
    # Serialize the shard-size manifest next to the tar shards.
    json_object = json.dumps(size_dict, indent=4)
    with open(os.path.join(os.path.dirname(tar_name), "sizes.json"), "w") as outfile:
        outfile.write(json_object)
    return size_dict
def packup(input, output, filename, dataclass='all', num_element=512, start_idx=0, delete_file=False):
    """
    Pack the ``dataclass`` subfolder of ``input`` into tar shards under
    ``output/dataclass`` via :func:`tardir`.

    @param input | string | root folder containing per-dataclass subfolders
    @param output | string | root folder to write ``dataclass/filename*.tar`` into
    @param filename | string | shard name prefix passed to tardir
    @param dataclass | string | subfolder name, e.g. 'train'/'valid'/'test'
    @param num_element | int | pairs of (audio, text) per shard
    @param start_idx | int | start index of the shard numbering
    @param delete_file | boolean | True to delete source files after packing

    Returns None. Skips (with a message) when the input subfolder is missing.
    """
    if not os.path.exists(os.path.join(input, dataclass)):
        print(
            "Dataclass {} does not exist, this folder does not exist. Skipping it.".format(
                dataclass
            )
        )
        return
    # Ensure the output folder exists, then pack. (The original duplicated the
    # tardir() call across both branches of an exists() check.)
    os.makedirs(os.path.join(output, dataclass), exist_ok=True)
    tardir(
        os.path.join(input, dataclass),
        os.path.join(output, dataclass, filename),
        num_element,
        start_idx=start_idx,
        delete_file=delete_file,
    )
    return
def load_from_tar(
    file_path,
    file_path_type="local",
    audio_ext="flac",
    text_ext="json",
    samplerate=32000,
    mono=True,
    max_len=1000000,
    dtype="float64",
    res_type="kaiser_best",
):
    """
    Load a webdataset tar into three parallel lists: (audios, texts, names).

    @param file_path | string | the path where the tar file is located
    @param file_path_type | string | controls the address prefix; if "local"
           and 'file:\\' is not already present, it is prepended automatically
    @param audio_ext | string | the extension of the audio entries in the tar
    @param text_ext | string | the extension of the text entries in the tar
    @param samplerate | int | target sample rate; None skips resampling
    @param mono | boolean | if True, downmix the decoded audio to mono
    @param max_len | int | max length of the audio; longer clips are randomly
           cropped, shorter ones are zero-padded on the right
    @param dtype | string | intended sample dtype (currently unused in the body)
    @param res_type | string | resampling method passed to librosa.resample
    """
    if file_path_type == "local" and ("file:\\" not in file_path):
        # NOTE(review): 'file:\\' uses a backslash; webdataset URLs normally
        # use 'file:' with forward slashes — confirm this prefix is intended.
        file_path = "file:\\" + file_path
    dataset = wds.WebDataset(file_path)
    audios = []
    texts = []
    names = []
    for sample in dataset:
        for key, value in sample.items():
            if key == audio_ext:
                # Decode the raw audio bytes stored in the tar entry.
                audio_data, orig_sr = sf.read(io.BytesIO(value))
                if samplerate is not None:
                    audio_data = librosa.resample(
                        audio_data,
                        orig_sr=orig_sr,
                        target_sr=samplerate,
                        res_type=res_type,
                    )
                if len(audio_data) > max_len:
                    # Random crop: half the time a window starting at idx from
                    # the front, otherwise a window measured from the back.
                    overflow = len(audio_data) - max_len
                    idx = np.random.randint(0, overflow + 1)
                    if np.random.rand() > 0.5:
                        audio_data = audio_data[idx : idx + max_len]
                    else:
                        audio_data = audio_data[
                            len(audio_data)
                            + 1
                            - idx
                            - max_len : len(audio_data)
                            + 1
                            - idx
                        ]
                else:
                    # Right-pad with zeros up to exactly max_len samples.
                    audio_data = np.pad(
                        audio_data,
                        (0, max_len - len(audio_data)),
                        mode="constant",
                        constant_values=0,
                    )
                if mono:
                    # NOTE(review): sf.read returns (frames, channels) for
                    # multichannel audio while librosa.to_mono expects
                    # (channels, n) — confirm multichannel clips are handled.
                    audio_data = librosa.to_mono(audio_data)
                audios.append((audio_data, samplerate))
            elif key == text_ext:
                texts.append(value)
            elif key == "__key__":
                # webdataset exposes the per-sample basename under '__key__'.
                names.append(value)
    return audios, texts, names
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--input",
type=str,
help="input folder, expecting subdirectory like train, valid or test",
)
parser.add_argument(
"--output",
type=str,
help="output, generating tar files at output/dataclass/filename_{}.tar",
)
parser.add_argument(
"--filename",
type=str,
default="",
help="the filename of the tar, generating tar files at output/dataclass/filename_{}.tar",
)
parser.add_argument(
"--dataclass", type=str, default="all", help="train or test or valid or all"
)
parser.add_argument(
"--num_element", type=int, default=512, help="pairs of (audio, text) to be included in a single tar"
)
parser.add_argument(
"--start_idx", type=int, default=0, help="start index of the tar"
)
parser.add_argument(
"--delete_file", action='store_true', help="delete the input file when making tars"
)
args = parser.parse_args()
if args.dataclass == "all":
for x in ["train", "valid", "test"]:
packup(args.input, args.output, args.filename, x, args.num_element, args.start_idx, args.delete_file)
elif args.dataclass == "none":
os.makedirs(args.output, exist_ok=True)
tardir(
args.input,
os.path.join(args.output, args.filename),
args.num_element,
start_idx=args.start_idx,
delete_file=args.delete_file,
)
else: # if dataclass is in other name
packup(args.input, args.output, args.filename, args.dataclass, args.num_element, args.start_idx, args.delete_file)

160
dataset/audio/process.py Normal file
View File

@ -0,0 +1,160 @@
"""
Code for processing WavCaps dataset.
"""
import argparse
import os
from tqdm import tqdm
import glob
import numpy as np
import json
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, default='bbc', choices=['bbc', 'audioset', 'soundbible', 'freesound', 'test'])
parser.add_argument(
"--data_root",
type=str,
default="/mnt/bn/zilongdata-hl/dataset/wavcaps"
)
parser.add_argument(
"--data_dir",
type=str,
default="/mnt/bn/zilongdata-hl/dataset/wavcaps/raw_datasets/test/"
)
parser.add_argument(
"--json_path",
type=str,
default="/mnt/bn/zilongdata-hl/dataset/wavcaps/WavCaps/json_files/test.json",
)
args = parser.parse_args()
DATA_DIRS = {
"bbc": "raw_datasets/BBC_Sound_Effects_flac/",
"audioset": "raw_datasets/AudioSet_SL_flac/",
"soundbible": "raw_datasets/SoundBible_flac/",
"freesound": "raw_datasets/FreeSound_flac/",
}
JSON_PATHS = {
"bbc": "WavCaps/json_files/BBC_Sound_Effects/bbc_final.json",
"audioset": "WavCaps/json_files/AudioSet_SL/as_final.json",
"soundbible": "WavCaps/json_files/SoundBible/sb_final.json",
"freesound": "WavCaps/json_files/FreeSound/fsd_final.json",
}
def load_audioset_json(fname):
    """Yield (id, caption, sample) triples from an AudioSet WavCaps json file.

    The stored ids carry a file extension (e.g. ``Yb0RFKhbpFJA.wav``), which
    is stripped before yielding.

    A sample entry::

        {
            'id': 'Yb0RFKhbpFJA.wav',
            'caption': 'Wind and a man speaking are heard, accompanied by buzzing and ticking.',
            'audio': 'wav_path',
            'duration': 10.0
        }
    """
    with open(fname) as handle:
        payload = json.load(handle)
    for entry in payload['data']:
        clip_id = entry['id'].split('.')[0]
        yield clip_id, entry['caption'], entry
def load_soundbible_json(fname):
    """Yield (id, caption, sample) triples from a SoundBible WavCaps json file.

    A sample entry::

        {
            'title': 'Airplane Landing Airport',
            'description': 'Large commercial airplane landing at an airport runway.',
            'author': 'Daniel Simion',
            'href': '2219-Airplane-Landing-Airport.html',
            'caption': 'An airplane is landing.',
            'id': '2219',
            'duration': 14.1424375,
            'audio': 'wav_path',
            'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'
        }
    """
    with open(fname) as src:
        records = json.load(src)['data']
    for record in records:
        yield record['id'], record['caption'], record
def load_freesound_json(fname):
    """Yield (id, caption, sample) triples from a FreeSound WavCaps json file.

    A sample entry::

        {
            'id': '180913',
            'file_name': 'UK Mello.wav',
            'href': '/people/Tempouser/sounds/180913/',
            'tags': ['Standard', 'ringtone', 'basic', 'traditional'],
            'description': 'Standard traditional basic ringtone, in mello tone.',
            'author': 'Tempouser',
            'duration': 3.204375,
            'download_link': 'https://freesound.org/people/Tempouser/sounds/180913/download/180913__tempouser__uk-mello.wav',
            'caption': 'A traditional ringtone is playing.',
            'audio': 'wav_path'
        }
    """
    with open(fname) as src:
        content = json.load(src)
    yield from ((item['id'], item['caption'], item) for item in content['data'])
def load_bbc_json(fname):
    """Yield (id, caption, sample) triples from a BBC Sound Effects WavCaps json file.

    A sample entry::

        {
            'description': "Timber & Wood - Rip saw, carpenters' workshop.",
            'category': "['Machines']",
            'caption': "Someone is using a rip saw in a carpenter's workshop.",
            'id': '07066104',
            'duration': 138.36,
            'audio': 'wav_path',
            'download_link': 'https://sound-effects-media.bbcrewind.co.uk/zip/07066104.wav.zip'
        }
    """
    with open(fname) as fh:
        parsed = json.load(fh)
    for rec in parsed['data']:
        triple = (rec['id'], rec['caption'], rec)
        yield triple
def load_test_json(fname):
    """Using SoundBible as a text example.

    Yields (id, caption, sample) triples, identical in shape to the other
    ``load_*_json`` loaders.
    """
    with open(fname) as fh:
        entries = json.load(fh)['data']
    for entry in entries:
        yield entry['id'], entry['caption'], entry
if __name__ == '__main__':
    # Resolve data/json locations: known datasets live under --data_root,
    # anything else (e.g. 'test') uses the explicit --data_dir/--json_path.
    if args.dataset in DATA_DIRS:
        data_dir = os.path.join(args.data_root, DATA_DIRS[args.dataset])
        json_path = os.path.join(args.data_root, JSON_PATHS[args.dataset])
    else:
        data_dir = args.data_dir
        json_path = args.json_path
    file_list = glob.glob(f'{data_dir}/*.flac')
    # Dispatch to load_<dataset>_json and write one .json annotation per clip.
    for data_id, unused_caption, meta_data in tqdm(list(globals()[f'load_{args.dataset}_json'](json_path))):
        file_name = os.path.join(data_dir, data_id + '.flac')
        json_save_path = os.path.join(data_dir, data_id + '.json')
        # FIX: check existence BEFORE list removal. The original removed the
        # path from file_list first, so a missing clip raised ValueError from
        # list.remove and the intended assertion message was unreachable.
        assert os.path.exists(file_name), f'{file_name} does not exist!'
        file_list.remove(file_name)
        with open(json_save_path, 'w') as f:
            json.dump(meta_data, f)
    if len(file_list) > 0:
        # Clips left over have no caption entry; drop them so the later
        # tar-packing step only sees complete (audio, json) pairs.
        for f in file_list:
            os.remove(f)

45
dataset/audio/setup.sh Normal file
View File

@ -0,0 +1,45 @@
#!/bin/bash
# Set up one WavCaps subset end-to-end: merge the split zip shards, extract
# the flac files, generate per-clip json annotations, and pack webdataset tars.
#
# Usage: `DATASET=soundbible bash setup.sh`
# DATASET must be one of: soundbible, bbc, audioset, freesound.

DATA_ROOT=/mnt/bn/zilongdata-hl/dataset/wavcaps

# Default to soundbible when DATASET is unset or empty.
# FIX: the original used `${DATASET-:soundbible}`, which expands to the
# literal string `:soundbible` when DATASET is unset (typo for `:-`), so the
# default never matched any branch below.
DATASET=${DATASET:-soundbible}

if [[ $DATASET == soundbible ]]; then
    DATA_FOLDER=SoundBible
    NUM_ELEMENT=2000
elif [[ $DATASET == bbc ]]; then
    DATA_FOLDER=BBC_Sound_Effects
    NUM_ELEMENT=500
elif [[ $DATASET == audioset ]]; then
    DATA_FOLDER=AudioSet_SL
    NUM_ELEMENT=2000
elif [[ $DATASET == freesound ]]; then
    DATA_FOLDER=FreeSound
    NUM_ELEMENT=500
else
    echo "${DATASET} not found!"
    # FIX: signal failure to callers (the original exited with status 0).
    exit 1
fi

CODE_PATH=$(pwd)

# Merge the split zip shards into a single archive.
cd ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}
zip -s- ${DATA_FOLDER}.zip -O ${DATA_FOLDER}_combined.zip

# Extract the combined zip file.
cd ${DATA_ROOT}
unzip ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}/${DATA_FOLDER}_combined.zip -d raw_datasets/${DATA_FOLDER}
# The archives carry the uploader's absolute directory layout; flatten it.
mv raw_datasets/${DATA_FOLDER}/mnt/fast/nobackup/scratch4weeks/xm00178/WavCaps/data/waveforms/${DATA_FOLDER}_flac raw_datasets/${DATA_FOLDER}_flac
rm -rf raw_datasets/${DATA_FOLDER}

# Process raw data to create json annotation files.
cd $CODE_PATH
python3 dataset/audio/process.py --data_root ${DATA_ROOT} --dataset ${DATASET}

# Pack up tar files.
python3 dataset/audio/make_tar.py --input ${DATA_ROOT}/raw_datasets/${DATA_FOLDER}_flac \
    --output ${DATA_ROOT}/web_datasets/${DATA_FOLDER} \
    --dataclass none --filename ${DATA_FOLDER} --num_element ${NUM_ELEMENT}