Audio dataset (#1)

* add audio dataset setup

---------

Co-authored-by: bingyikang <bingyikang@bytedance.com>
This commit is contained in:
Bingyi Kang 2023-05-24 14:16:50 +08:00 committed by GitHub
parent 2d2d781469
commit 64472dedb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 476 additions and 0 deletions

55
dataset/audio/README.md Normal file
View File

@ -0,0 +1,55 @@
## Audio Dataset
## Stage1: Pretraining
We mainly use [WavCaps](https://github.com/XinhaoMei/WavCaps) dataset for pre-training.
### Download
```Bash
# install git-lfs
sudo apt update
sudo apt-get install git-lfs
git clone https://huggingface.co/datasets/cvssp/WavCaps
cd WavCaps
git lfs pull --include "*"
```
### Processing
1. Extract zip file
```bash
# merge shards first
zip -s- FILE_NAME.zip -O COMBINED_FILE.zip
unzip COMBINED_FILE.zip
```
2. Processing
Extract raw audio data
```bash
unzip COMBINED_FILE.zip -d /target/dir
```
Create json files (annotations) for each example. Before processing, modify `dataset/audio/process.py` to set data and json path.
```bash
python3 dataset/audio/process.py --dataset test --data_dir /path/to/data --json_path /path/to/json
```
3. Pack with tar
```bash
python3 dataset/audio/make_tar.py --input /path/to/data --output /path/to/web_dataset \
--dataclass none --filename filename --num_element 500
```
To view tar file
```
tar tf filename.tar | sed 10q
```
**To set everything up in one line:**
```bash
# DATASET must be one of: soundbible, bbc, audioset, freesound
DATASET=soundbible bash dataset/audio/setup.sh
```

216
dataset/audio/make_tar.py Normal file
View File

@ -0,0 +1,216 @@
import argparse
import librosa
import json
import webdataset as wds
import numpy as np
import os
import random
import io
from glob import glob
from itertools import islice
import scipy.signal as sps
import soundfile as sf
from tqdm import tqdm
import tarfile
def tardir(
    file_path, tar_name, n_entry_each, audio_ext=".flac", text_ext=".json", shuffle=True, start_idx=0, delete_file=False
):
    """
    Pack (audio, text) file pairs found in ``file_path`` into numbered tar shards.

    @param file_path | string | the path where audio and text files are located
    @param tar_name | string | shard name prefix; shards are written as
                               ``tar_name`` + zero-padded index + ``.tar``
    @param n_entry_each | int | how many pairs of (audio, text) go in one tar
    @param audio_ext | string | the extension of the audio files
    @param text_ext | string | the extension of the text files
    @param shuffle | boolean | True to shuffle the file sequence before packing
    @param start_idx | int | the start index of the tar shard numbering
    @param delete_file | boolean | True to delete the audio and text files after packing

    Returns a dict mapping shard basename -> number of pairs it contains; the
    same mapping is also written to ``sizes.json`` next to the shards.
    """
    filelist = glob(file_path + '/*' + audio_ext)
    if shuffle:
        random.shuffle(filelist)
    count = 0
    # Number of shards needed; the last one may be only partially filled.
    n_split = len(filelist) // n_entry_each
    if n_split * n_entry_each != len(filelist):
        n_split += 1
    # FIX: manifest keys now include start_idx so they match the tar files
    # actually written below (the original always numbered keys from 0).
    size_dict = {
        os.path.basename(tar_name) + "{:06d}".format(i + start_idx) + ".tar": n_entry_each
        for i in range(n_split)
    }
    if n_split * n_entry_each != len(filelist):
        # The final shard holds only the remainder.
        size_dict[os.path.basename(tar_name) + "{:06d}".format(n_split - 1 + start_idx) + ".tar"] = (
            len(filelist) - (n_split - 1) * n_entry_each
        )
    for i in tqdm(range(start_idx, n_split + start_idx), desc='Creating .tar file:'):
        with tarfile.open(tar_name + "{:06d}".format(i) + ".tar", "w") as tar_handle:
            for j in range(count, len(filelist)):
                audio = filelist[j]
                basename = ".".join(audio.split(".")[:-1])
                # FIX: glob() already returns paths rooted at file_path, so
                # joining with file_path again broke relative inputs (for
                # absolute inputs os.path.join was a no-op). Use the glob
                # results directly.
                text_file_path = basename + text_ext
                audio_file_path = audio
                tar_handle.add(audio_file_path)
                tar_handle.add(text_file_path)
                if delete_file:
                    os.remove(audio_file_path)
                    os.remove(text_file_path)
                if (j + 1) % n_entry_each == 0:
                    # Remember where the next shard starts, then move on.
                    count = j + 1
                    break
            # (The explicit tar_handle.close() the original carried here was
            # redundant: the `with` statement already closes the handle.)
    # Serialize the shard-size manifest next to the tar shards.
    json_object = json.dumps(size_dict, indent=4)
    with open(os.path.join(os.path.dirname(tar_name), "sizes.json"), "w") as outfile:
        outfile.write(json_object)
    return size_dict
def packup(input, output, filename, dataclass='all', num_element=512, start_idx=0, delete_file=False):
    """
    Pack the ``dataclass`` subfolder of ``input`` into tar shards under
    ``output/dataclass`` via :func:`tardir`.

    @param input | string | root folder containing per-dataclass subfolders
    @param output | string | root folder to write ``dataclass/filename*.tar`` into
    @param filename | string | shard name prefix passed to tardir
    @param dataclass | string | subfolder name, e.g. 'train'/'valid'/'test'
    @param num_element | int | pairs of (audio, text) per shard
    @param start_idx | int | start index of the shard numbering
    @param delete_file | boolean | True to delete source files after packing

    Returns None. Skips (with a message) when the input subfolder is missing.
    """
    if not os.path.exists(os.path.join(input, dataclass)):
        print(
            "Dataclass {} does not exist, this folder does not exist. Skipping it.".format(
                dataclass
            )
        )
        return
    # Ensure the output folder exists, then pack. (The original duplicated the
    # tardir() call across both branches of an exists() check.)
    os.makedirs(os.path.join(output, dataclass), exist_ok=True)
    tardir(
        os.path.join(input, dataclass),
        os.path.join(output, dataclass, filename),
        num_element,
        start_idx=start_idx,
        delete_file=delete_file,
    )
    return
def load_from_tar(
    file_path,
    file_path_type="local",
    audio_ext="flac",
    text_ext="json",
    samplerate=32000,
    mono=True,
    max_len=1000000,
    dtype="float64",
    res_type="kaiser_best",
):
    """
    Load a webdataset tar into three parallel lists: (audios, texts, names).

    @param file_path | string | the path where the tar file is located
    @param file_path_type | string | controls the address prefix; if "local"
           and 'file:\\' is not already present, it is prepended automatically
    @param audio_ext | string | the extension of the audio entries in the tar
    @param text_ext | string | the extension of the text entries in the tar
    @param samplerate | int | target sample rate; None skips resampling
    @param mono | boolean | if True, downmix the decoded audio to mono
    @param max_len | int | max length of the audio; longer clips are randomly
           cropped, shorter ones are zero-padded on the right
    @param dtype | string | intended sample dtype (currently unused in the body)
    @param res_type | string | resampling method passed to librosa.resample
    """
    if file_path_type == "local" and ("file:\\" not in file_path):
        # NOTE(review): 'file:\\' uses a backslash; webdataset URLs normally
        # use 'file:' with forward slashes — confirm this prefix is intended.
        file_path = "file:\\" + file_path
    dataset = wds.WebDataset(file_path)
    audios = []
    texts = []
    names = []
    for sample in dataset:
        for key, value in sample.items():
            if key == audio_ext:
                # Decode the raw audio bytes stored in the tar entry.
                audio_data, orig_sr = sf.read(io.BytesIO(value))
                if samplerate is not None:
                    audio_data = librosa.resample(
                        audio_data,
                        orig_sr=orig_sr,
                        target_sr=samplerate,
                        res_type=res_type,
                    )
                if len(audio_data) > max_len:
                    # Random crop: half the time a window starting at idx from
                    # the front, otherwise a window measured from the back.
                    overflow = len(audio_data) - max_len
                    idx = np.random.randint(0, overflow + 1)
                    if np.random.rand() > 0.5:
                        audio_data = audio_data[idx : idx + max_len]
                    else:
                        audio_data = audio_data[
                            len(audio_data)
                            + 1
                            - idx
                            - max_len : len(audio_data)
                            + 1
                            - idx
                        ]
                else:
                    # Right-pad with zeros up to exactly max_len samples.
                    audio_data = np.pad(
                        audio_data,
                        (0, max_len - len(audio_data)),
                        mode="constant",
                        constant_values=0,
                    )
                if mono:
                    # NOTE(review): sf.read returns (frames, channels) for
                    # multichannel audio while librosa.to_mono expects
                    # (channels, n) — confirm multichannel clips are handled.
                    audio_data = librosa.to_mono(audio_data)
                audios.append((audio_data, samplerate))
            elif key == text_ext:
                texts.append(value)
            elif key == "__key__":
                # webdataset exposes the per-sample basename under '__key__'.
                names.append(value)
    return audios, texts, names
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--input",
type=str,
help="input folder, expecting subdirectory like train, valid or test",
)
parser.add_argument(
"--output",
type=str,
help="output, generating tar files at output/dataclass/filename_{}.tar",
)
parser.add_argument(
"--filename",
type=str,
default="",
help="the filename of the tar, generating tar files at output/dataclass/filename_{}.tar",
)
parser.add_argument(
"--dataclass", type=str, default="all", help="train or test or valid or all"
)
parser.add_argument(
"--num_element", type=int, default=512, help="pairs of (audio, text) to be included in a single tar"
)
parser.add_argument(
"--start_idx", type=int, default=0, help="start index of the tar"
)
parser.add_argument(
"--delete_file", action='store_true', help="delete the input file when making tars"
)
args = parser.parse_args()
if args.dataclass == "all":
for x in ["train", "valid", "test"]:
packup(args.input, args.output, args.filename, x, args.num_element, args.start_idx, args.delete_file)
elif args.dataclass == "none":
os.makedirs(args.output, exist_ok=True)
tardir(
args.input,
os.path.join(args.output, args.filename),
args.num_element,
start_idx=args.start_idx,
delete_file=args.delete_file,
)
else: # if dataclass is in other name
packup(args.input, args.output, args.filename, args.dataclass, args.num_element, args.start_idx, args.delete_file)

160
dataset/audio/process.py Normal file
View File

@ -0,0 +1,160 @@
"""
Code for processing WavCaps dataset.
"""
import argparse
import os
from tqdm import tqdm
import glob
import numpy as np
import json
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, default='bbc', choices=['bbc', 'audioset', 'soundbible', 'freesound', 'test'])
parser.add_argument(
"--data_root",
type=str,
default="/mnt/bn/zilongdata-hl/dataset/wavcaps"
)
parser.add_argument(
"--data_dir",
type=str,
default="/mnt/bn/zilongdata-hl/dataset/wavcaps/raw_datasets/test/"
)
parser.add_argument(
"--json_path",
type=str,
default="/mnt/bn/zilongdata-hl/dataset/wavcaps/WavCaps/json_files/test.json",
)
args = parser.parse_args()
DATA_DIRS = {
"bbc": "raw_datasets/BBC_Sound_Effects_flac/",
"audioset": "raw_datasets/AudioSet_SL_flac/",
"soundbible": "raw_datasets/SoundBible_flac/",
"freesound": "raw_datasets/FreeSound_flac/",
}
JSON_PATHS = {
"bbc": "WavCaps/json_files/BBC_Sound_Effects/bbc_final.json",
"audioset": "WavCaps/json_files/AudioSet_SL/as_final.json",
"soundbible": "WavCaps/json_files/SoundBible/sb_final.json",
"freesound": "WavCaps/json_files/FreeSound/fsd_final.json",
}
def load_audioset_json(fname):
    """Yield (id, caption, sample) triples from an AudioSet WavCaps json file.

    The stored ids carry a file extension (e.g. ``Yb0RFKhbpFJA.wav``), which
    is stripped before yielding.

    A sample entry::

        {
            'id': 'Yb0RFKhbpFJA.wav',
            'caption': 'Wind and a man speaking are heard, accompanied by buzzing and ticking.',
            'audio': 'wav_path',
            'duration': 10.0
        }
    """
    with open(fname) as handle:
        payload = json.load(handle)
    for entry in payload['data']:
        clip_id = entry['id'].split('.')[0]
        yield clip_id, entry['caption'], entry
def load_soundbible_json(fname):
    """Yield (id, caption, sample) triples from a SoundBible WavCaps json file.

    A sample entry::

        {
            'title': 'Airplane Landing Airport',
            'description': 'Large commercial airplane landing at an airport runway.',
            'author': 'Daniel Simion',
            'href': '2219-Airplane-Landing-Airport.html',
            'caption': 'An airplane is landing.',
            'id': '2219',
            'duration': 14.1424375,
            'audio': 'wav_path',
            'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'
        }
    """
    with open(fname) as src:
        records = json.load(src)['data']
    for record in records:
        yield record['id'], record['caption'], record
def load_freesound_json(fname):
    """Yield (id, caption, sample) triples from a FreeSound WavCaps json file.

    A sample entry::

        {
            'id': '180913',
            'file_name': 'UK Mello.wav',
            'href': '/people/Tempouser/sounds/180913/',
            'tags': ['Standard', 'ringtone', 'basic', 'traditional'],
            'description': 'Standard traditional basic ringtone, in mello tone.',
            'author': 'Tempouser',
            'duration': 3.204375,
            'download_link': 'https://freesound.org/people/Tempouser/sounds/180913/download/180913__tempouser__uk-mello.wav',
            'caption': 'A traditional ringtone is playing.',
            'audio': 'wav_path'
        }
    """
    with open(fname) as src:
        content = json.load(src)
    yield from ((item['id'], item['caption'], item) for item in content['data'])
def load_bbc_json(fname):
    """Yield (id, caption, sample) triples from a BBC Sound Effects WavCaps json file.

    A sample entry::

        {
            'description': "Timber & Wood - Rip saw, carpenters' workshop.",
            'category': "['Machines']",
            'caption': "Someone is using a rip saw in a carpenter's workshop.",
            'id': '07066104',
            'duration': 138.36,
            'audio': 'wav_path',
            'download_link': 'https://sound-effects-media.bbcrewind.co.uk/zip/07066104.wav.zip'
        }
    """
    with open(fname) as fh:
        parsed = json.load(fh)
    for rec in parsed['data']:
        triple = (rec['id'], rec['caption'], rec)
        yield triple
def load_test_json(fname):
    """Using SoundBible as a text example.

    Yields (id, caption, sample) triples, identical in shape to the other
    ``load_*_json`` loaders.
    """
    with open(fname) as fh:
        entries = json.load(fh)['data']
    for entry in entries:
        yield entry['id'], entry['caption'], entry
if __name__ == '__main__':
    # Resolve data/json locations: known datasets live under --data_root,
    # anything else (e.g. 'test') uses the explicit --data_dir/--json_path.
    if args.dataset in DATA_DIRS:
        data_dir = os.path.join(args.data_root, DATA_DIRS[args.dataset])
        json_path = os.path.join(args.data_root, JSON_PATHS[args.dataset])
    else:
        data_dir = args.data_dir
        json_path = args.json_path
    file_list = glob.glob(f'{data_dir}/*.flac')
    # Dispatch to load_<dataset>_json and write one .json annotation per clip.
    for data_id, unused_caption, meta_data in tqdm(list(globals()[f'load_{args.dataset}_json'](json_path))):
        file_name = os.path.join(data_dir, data_id + '.flac')
        json_save_path = os.path.join(data_dir, data_id + '.json')
        # FIX: check existence BEFORE list removal. The original removed the
        # path from file_list first, so a missing clip raised ValueError from
        # list.remove and the intended assertion message was unreachable.
        assert os.path.exists(file_name), f'{file_name} does not exist!'
        file_list.remove(file_name)
        with open(json_save_path, 'w') as f:
            json.dump(meta_data, f)
    if len(file_list) > 0:
        # Clips left over have no caption entry; drop them so the later
        # tar-packing step only sees complete (audio, json) pairs.
        for f in file_list:
            os.remove(f)

45
dataset/audio/setup.sh Normal file
View File

@ -0,0 +1,45 @@
#!/bin/bash
# Set up one WavCaps subset end-to-end: merge the split zip shards, extract
# the flac files, generate per-clip json annotations, and pack webdataset tars.
#
# Usage: `DATASET=soundbible bash setup.sh`
# DATASET must be one of: soundbible, bbc, audioset, freesound.

DATA_ROOT=/mnt/bn/zilongdata-hl/dataset/wavcaps

# Default to soundbible when DATASET is unset or empty.
# FIX: the original used `${DATASET-:soundbible}`, which expands to the
# literal string `:soundbible` when DATASET is unset (typo for `:-`), so the
# default never matched any branch below.
DATASET=${DATASET:-soundbible}

if [[ $DATASET == soundbible ]]; then
    DATA_FOLDER=SoundBible
    NUM_ELEMENT=2000
elif [[ $DATASET == bbc ]]; then
    DATA_FOLDER=BBC_Sound_Effects
    NUM_ELEMENT=500
elif [[ $DATASET == audioset ]]; then
    DATA_FOLDER=AudioSet_SL
    NUM_ELEMENT=2000
elif [[ $DATASET == freesound ]]; then
    DATA_FOLDER=FreeSound
    NUM_ELEMENT=500
else
    echo "${DATASET} not found!"
    # FIX: signal failure to callers (the original exited with status 0).
    exit 1
fi

CODE_PATH=$(pwd)

# Merge the split zip shards into a single archive.
cd ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}
zip -s- ${DATA_FOLDER}.zip -O ${DATA_FOLDER}_combined.zip

# Extract the combined zip file.
cd ${DATA_ROOT}
unzip ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}/${DATA_FOLDER}_combined.zip -d raw_datasets/${DATA_FOLDER}
# The archives carry the uploader's absolute directory layout; flatten it.
mv raw_datasets/${DATA_FOLDER}/mnt/fast/nobackup/scratch4weeks/xm00178/WavCaps/data/waveforms/${DATA_FOLDER}_flac raw_datasets/${DATA_FOLDER}_flac
rm -rf raw_datasets/${DATA_FOLDER}

# Process raw data to create json annotation files.
cd $CODE_PATH
python3 dataset/audio/process.py --data_root ${DATA_ROOT} --dataset ${DATASET}

# Pack up tar files.
python3 dataset/audio/make_tar.py --input ${DATA_ROOT}/raw_datasets/${DATA_FOLDER}_flac \
    --output ${DATA_ROOT}/web_datasets/${DATA_FOLDER} \
    --dataclass none --filename ${DATA_FOLDER} --num_element ${NUM_ELEMENT}