mirror of https://github.com/Vision-CAIR/MiniGPT-4.git
synced 2025-04-06 19:10:45 +00:00

Audio dataset (#1)

* add audio dataset setup

Co-authored-by: bingyikang <bingyikang@bytedance.com>

This commit is contained in:
parent 2d2d781469
commit 64472dedb1

dataset/audio/README.md (new file, 55 lines)
@@ -0,0 +1,55 @@
## Audio Dataset

## Stage 1: Pretraining

We mainly use the [WavCaps](https://github.com/XinhaoMei/WavCaps) dataset for pre-training.

### Download

```bash
# install git-lfs
sudo apt update
sudo apt-get install git-lfs

git clone https://huggingface.co/datasets/cvssp/WavCaps
cd WavCaps
git lfs pull --include "*"
```

### Processing

1. Extract the zip file:

```bash
# merge shards first
zip -s- FILE_NAME.zip -O COMBINED_FILE.zip
# extract the raw audio data
unzip COMBINED_FILE.zip -d /target/dir
```

2. Create a json annotation file for each example. Before processing, modify `dataset/audio/process.py` to set the data and json paths (a sample of the resulting annotation follows the command):

```bash
python3 dataset/audio/process.py --dataset test --data_dir /path/to/data --json_path /path/to/json
```
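Each annotation is saved next to the audio clip it describes (e.g. `2219.flac` / `2219.json`). As a rough sketch of what one record holds — this is the SoundBible sample documented in `dataset/audio/process.py`, not real output:

```
{
    'title': 'Airplane Landing Airport',
    'description': 'Large commercial airplane landing at an airport runway.',
    'author': 'Daniel Simion',
    'href': '2219-Airplane-Landing-Airport.html',
    'caption': 'An airplane is landing.',
    'id': '2219',
    'duration': 14.1424375,
    'audio': 'wav_path',
    'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'
}
```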
3. Pack the (audio, json) pairs into tar files:

```bash
python3 dataset/audio/make_tar.py --input /path/to/data --output /path/to/web_dataset \
    --dataclass none --filename filename --num_element 500
```

To inspect the first few entries of a tar file:

```
tar tf filename.tar | sed 10q
```

**To set everything up in one line:**

```bash
# DATASET is one of: soundbible, bbc, audioset, freesound
DATASET=soundbible bash dataset/audio/setup.sh
```
dataset/audio/make_tar.py (new file, 216 lines)
@@ -0,0 +1,216 @@
import argparse
import io
import json
import os
import random
import tarfile
from glob import glob

import librosa
import numpy as np
import soundfile as sf
import webdataset as wds
from tqdm import tqdm


def tardir(
    file_path, tar_name, n_entry_each, audio_ext=".flac", text_ext=".json",
    shuffle=True, start_idx=0, delete_file=False
):
    """
    Create tar shards from the audio and text files in one folder.

    @param file_path    | string  | the path where the audio and text files are located
    @param tar_name     | string  | the tar name prefix; a 6-digit shard index and ".tar" are appended
    @param n_entry_each | int     | how many (audio, text) pairs go into each tar
    @param audio_ext    | string  | the extension of the audio files
    @param text_ext     | string  | the extension of the text files
    @param shuffle      | boolean | True to shuffle the file order before packing
    @param start_idx    | int     | the start index of the tar shards
    @param delete_file  | boolean | True to delete the audio and text files after packing
    """
    filelist = glob(file_path + '/*' + audio_ext)

    if shuffle:
        random.shuffle(filelist)
    count = 0
    n_split = len(filelist) // n_entry_each
    if n_split * n_entry_each != len(filelist):
        n_split += 1
    size_dict = {
        os.path.basename(tar_name) + "{:06d}".format(i) + ".tar": n_entry_each
        for i in range(n_split)
    }
    if n_split * n_entry_each != len(filelist):
        # the last shard only holds the remainder
        size_dict[os.path.basename(tar_name) + "{:06d}".format(n_split - 1) + ".tar"] = (
            len(filelist) - (n_split - 1) * n_entry_each
        )
    for i in tqdm(range(start_idx, n_split + start_idx), desc='Creating .tar file:'):
        with tarfile.open(tar_name + "{:06d}".format(i) + ".tar", "w") as tar_handle:
            for j in range(count, len(filelist)):
                # glob() already returns paths rooted at file_path
                audio_file_path = filelist[j]
                text_file_path = ".".join(audio_file_path.split(".")[:-1]) + text_ext
                tar_handle.add(audio_file_path)
                tar_handle.add(text_file_path)
                if delete_file:
                    os.remove(audio_file_path)
                    os.remove(text_file_path)
                if (j + 1) % n_entry_each == 0:
                    count = j + 1
                    break
    # record how many samples each shard holds, next to the tars
    json_object = json.dumps(size_dict, indent=4)
    with open(os.path.join(os.path.dirname(tar_name), "sizes.json"), "w") as outfile:
        outfile.write(json_object)
    return size_dict


def packup(input, output, filename, dataclass='all', num_element=512, start_idx=0, delete_file=False):
    if not os.path.exists(os.path.join(input, dataclass)):
        print("Dataclass folder {} does not exist. Skipping it.".format(dataclass))
        return
    os.makedirs(os.path.join(output, dataclass), exist_ok=True)
    tardir(
        os.path.join(input, dataclass),
        os.path.join(output, dataclass, filename),
        num_element,
        start_idx=start_idx,
        delete_file=delete_file,
    )


def load_from_tar(
    file_path,
    file_path_type="local",
    audio_ext="flac",
    text_ext="json",
    samplerate=32000,
    mono=True,
    max_len=1000000,
    dtype="float64",
    res_type="kaiser_best",
):
    """
    Load tar shards into three parallel lists: (audios, texts, names).

    @param file_path      | string  | the path where the tar files are located
    @param file_path_type | string  | guards the URL prefix in case it was left out:
                                      if "local" and the 'file:' prefix is missing, it is added automatically
    @param audio_ext      | string  | the extension of the audio files
    @param text_ext       | string  | the extension of the text files
    @param samplerate     | int     | the target sample rate of the audio
    @param mono           | boolean | True to downmix the audio to a single channel
    @param max_len        | int     | max length of the audio; longer clips are randomly cropped, shorter ones zero-padded
    @param dtype          | string  | the dtype of the audio sample representation
    @param res_type       | string  | the resampling method
    """
    if file_path_type == "local" and ("file:" not in file_path):
        file_path = "file:" + file_path
    dataset = wds.WebDataset(file_path)
    audios = []
    texts = []
    names = []
    for sample in dataset:
        for key, value in sample.items():
            if key == audio_ext:
                audio_data, orig_sr = sf.read(io.BytesIO(value))
                if audio_data.ndim > 1:
                    # soundfile returns (frames, channels); librosa expects (channels, frames)
                    audio_data = audio_data.T
                if mono:
                    audio_data = librosa.to_mono(audio_data)
                if samplerate is not None:
                    audio_data = librosa.resample(
                        audio_data,
                        orig_sr=orig_sr,
                        target_sr=samplerate,
                        res_type=res_type,
                    )
                if len(audio_data) > max_len:
                    # random crop to max_len, taken from the front or the back at random
                    overflow = len(audio_data) - max_len
                    idx = np.random.randint(0, overflow + 1)
                    if np.random.rand() > 0.5:
                        audio_data = audio_data[idx: idx + max_len]
                    else:
                        audio_data = audio_data[
                            len(audio_data) + 1 - idx - max_len: len(audio_data) + 1 - idx
                        ]
                else:
                    # zero-pad short clips up to max_len
                    audio_data = np.pad(
                        audio_data,
                        (0, max_len - len(audio_data)),
                        mode="constant",
                        constant_values=0,
                    )
                audios.append((audio_data, samplerate))
            elif key == text_ext:
                texts.append(value)
            elif key == "__key__":
                names.append(value)
    return audios, texts, names


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        type=str,
        help="input folder, expecting subdirectories like train, valid or test",
    )
    parser.add_argument(
        "--output",
        type=str,
        help="output folder, generating tar files at output/dataclass/filename_{}.tar",
    )
    parser.add_argument(
        "--filename",
        type=str,
        default="",
        help="the filename of the tar, generating tar files at output/dataclass/filename_{}.tar",
    )
    parser.add_argument(
        "--dataclass", type=str, default="all", help="train, test, valid or all"
    )
    parser.add_argument(
        "--num_element", type=int, default=512, help="pairs of (audio, text) to include in a single tar"
    )
    parser.add_argument(
        "--start_idx", type=int, default=0, help="start index of the tar shards"
    )
    parser.add_argument(
        "--delete_file", action='store_true', help="delete the input files while making tars"
    )
    args = parser.parse_args()

    if args.dataclass == "all":
        for x in ["train", "valid", "test"]:
            packup(args.input, args.output, args.filename, x, args.num_element, args.start_idx, args.delete_file)
    elif args.dataclass == "none":
        # no train/valid/test subdirectory: pack the input folder directly
        os.makedirs(args.output, exist_ok=True)
        tardir(
            args.input,
            os.path.join(args.output, args.filename),
            args.num_element,
            start_idx=args.start_idx,
            delete_file=args.delete_file,
        )
    else:  # a custom dataclass name
        packup(args.input, args.output, args.filename, args.dataclass, args.num_element, args.start_idx, args.delete_file)
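A minimal usage sketch for `load_from_tar` — the shard path below and the `dataset.audio.make_tar` import path are assumptions for illustration, not something the commit pins down:

```python
# hypothetical quick check of one packed shard
from dataset.audio.make_tar import load_from_tar

audios, texts, names = load_from_tar(
    "/mnt/bn/zilongdata-hl/dataset/wavcaps/web_datasets/SoundBible/SoundBible000000.tar"
)
print(names[0], audios[0][0].shape)  # first sample's key and waveform shape
```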
dataset/audio/process.py (new file, 160 lines)
@@ -0,0 +1,160 @@
"""
Code for processing the WavCaps dataset.
"""
import argparse
import glob
import json
import os

from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, default='bbc', choices=['bbc', 'audioset', 'soundbible', 'freesound', 'test'])
parser.add_argument(
    "--data_root",
    type=str,
    default="/mnt/bn/zilongdata-hl/dataset/wavcaps"
)
parser.add_argument(
    "--data_dir",
    type=str,
    default="/mnt/bn/zilongdata-hl/dataset/wavcaps/raw_datasets/test/"
)
parser.add_argument(
    "--json_path",
    type=str,
    default="/mnt/bn/zilongdata-hl/dataset/wavcaps/WavCaps/json_files/test.json",
)
args = parser.parse_args()

DATA_DIRS = {
    "bbc": "raw_datasets/BBC_Sound_Effects_flac/",
    "audioset": "raw_datasets/AudioSet_SL_flac/",
    "soundbible": "raw_datasets/SoundBible_flac/",
    "freesound": "raw_datasets/FreeSound_flac/",
}

JSON_PATHS = {
    "bbc": "WavCaps/json_files/BBC_Sound_Effects/bbc_final.json",
    "audioset": "WavCaps/json_files/AudioSet_SL/as_final.json",
    "soundbible": "WavCaps/json_files/SoundBible/sb_final.json",
    "freesound": "WavCaps/json_files/FreeSound/fsd_final.json",
}


def load_audioset_json(fname):
    """A sample entry:
    {
        'id': 'Yb0RFKhbpFJA.wav',
        'caption': 'Wind and a man speaking are heard, accompanied by buzzing and ticking.',
        'audio': 'wav_path',
        'duration': 10.0
    }
    """
    with open(fname) as f:
        data = json.load(f)

    for sample in data['data']:
        # AudioSet ids carry a '.wav' suffix; strip it to match the flac filenames
        yield sample['id'].split('.')[0], sample['caption'], sample


def load_soundbible_json(fname):
    """A sample entry:
    {
        'title': 'Airplane Landing Airport',
        'description': 'Large commercial airplane landing at an airport runway.',
        'author': 'Daniel Simion',
        'href': '2219-Airplane-Landing-Airport.html',
        'caption': 'An airplane is landing.',
        'id': '2219',
        'duration': 14.1424375,
        'audio': 'wav_path',
        'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'
    }
    """
    with open(fname) as f:
        data = json.load(f)

    for sample in data['data']:
        yield sample['id'], sample['caption'], sample


def load_freesound_json(fname):
    """A sample entry:
    {
        'id': '180913',
        'file_name': 'UK Mello.wav',
        'href': '/people/Tempouser/sounds/180913/',
        'tags': ['Standard', 'ringtone', 'basic', 'traditional'],
        'description': 'Standard traditional basic ringtone, in mello tone.',
        'author': 'Tempouser',
        'duration': 3.204375,
        'download_link': 'https://freesound.org/people/Tempouser/sounds/180913/download/180913__tempouser__uk-mello.wav',
        'caption': 'A traditional ringtone is playing.',
        'audio': 'wav_path'
    }
    """
    with open(fname) as f:
        data = json.load(f)

    for sample in data['data']:
        yield sample['id'], sample['caption'], sample


def load_bbc_json(fname):
    """A sample entry:
    {
        'description': "Timber & Wood - Rip saw, carpenters' workshop.",
        'category': "['Machines']",
        'caption': "Someone is using a rip saw in a carpenter's workshop.",
        'id': '07066104',
        'duration': 138.36,
        'audio': 'wav_path',
        'download_link': 'https://sound-effects-media.bbcrewind.co.uk/zip/07066104.wav.zip'
    }
    """
    with open(fname) as f:
        data = json.load(f)

    for sample in data['data']:
        yield sample['id'], sample['caption'], sample


def load_test_json(fname):
    """Uses SoundBible as a test example."""
    with open(fname) as f:
        data = json.load(f)

    for sample in data['data']:
        yield sample['id'], sample['caption'], sample


if __name__ == '__main__':
    if args.dataset in DATA_DIRS:
        data_dir = os.path.join(args.data_root, DATA_DIRS[args.dataset])
        json_path = os.path.join(args.data_root, JSON_PATHS[args.dataset])
    else:
        data_dir = args.data_dir
        json_path = args.json_path

    file_list = glob.glob(f'{data_dir}/*.flac')
    for data_id, unused_caption, meta_data in tqdm(list(globals()[f'load_{args.dataset}_json'](json_path))):
        file_name = os.path.join(data_dir, data_id + '.flac')
        json_save_path = os.path.join(data_dir, data_id + '.json')

        assert os.path.exists(file_name), f'{file_name} does not exist!'
        file_list.remove(file_name)
        with open(json_save_path, 'w') as f:
            json.dump(meta_data, f)

    # drop any audio files that have no annotation
    for f in file_list:
        os.remove(f)
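`process.py` picks its loader through `globals()[f'load_{args.dataset}_json']`. An explicit dispatch table is a safer equivalent — a sketch of the idea, not what the commit ships:

```python
# hypothetical replacement for the globals() lookup in process.py
LOADERS = {
    "bbc": load_bbc_json,
    "audioset": load_audioset_json,
    "soundbible": load_soundbible_json,
    "freesound": load_freesound_json,
    "test": load_test_json,
}
samples = LOADERS[args.dataset](json_path)  # a clear KeyError instead of a NameError
```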
dataset/audio/setup.sh (new file, 45 lines)
@@ -0,0 +1,45 @@
#!/bin/bash
# To set up a dataset, run
# `DATASET=soundbible bash setup.sh`

DATA_ROOT=/mnt/bn/zilongdata-hl/dataset/wavcaps
DATASET=${DATASET:-soundbible}

if [[ $DATASET == soundbible ]]; then
    DATA_FOLDER=SoundBible
    NUM_ELEMENT=2000
elif [[ $DATASET == bbc ]]; then
    DATA_FOLDER=BBC_Sound_Effects
    NUM_ELEMENT=500
elif [[ $DATASET == audioset ]]; then
    DATA_FOLDER=AudioSet_SL
    NUM_ELEMENT=2000
elif [[ $DATASET == freesound ]]; then
    DATA_FOLDER=FreeSound
    NUM_ELEMENT=500
else
    echo "${DATASET} not found!"
    exit 1
fi

CODE_PATH=$(pwd)

# Merge the zip shards
cd ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}
zip -s- ${DATA_FOLDER}.zip -O ${DATA_FOLDER}_combined.zip

# Extract the combined zip file; the archive stores long internal paths,
# so move the flac folder up and drop the wrapper directories
cd ${DATA_ROOT}
unzip ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}/${DATA_FOLDER}_combined.zip -d raw_datasets/${DATA_FOLDER}
mv raw_datasets/${DATA_FOLDER}/mnt/fast/nobackup/scratch4weeks/xm00178/WavCaps/data/waveforms/${DATA_FOLDER}_flac raw_datasets/${DATA_FOLDER}_flac
rm -rf raw_datasets/${DATA_FOLDER}

# Process raw data to create json annotation files
cd $CODE_PATH
python3 dataset/audio/process.py --data_root ${DATA_ROOT} --dataset ${DATASET}

# Pack up tar files
python3 dataset/audio/make_tar.py --input ${DATA_ROOT}/raw_datasets/${DATA_FOLDER}_flac \
    --output ${DATA_ROOT}/web_datasets/${DATA_FOLDER} \
    --dataclass none --filename ${DATA_FOLDER} --num_element ${NUM_ELEMENT}
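After setup, each dataset sits under `web_datasets/<DATA_FOLDER>` as numbered `.tar` shards plus a `sizes.json`. A minimal sketch of streaming the packed pairs back, mirroring how `load_from_tar` reads them — the shard range is hypothetical; check `sizes.json` for the real count:

```python
import io
import json

import soundfile as sf
import webdataset as wds

# hypothetical shard range for the SoundBible split
shards = "/mnt/bn/zilongdata-hl/dataset/wavcaps/web_datasets/SoundBible/SoundBible{000000..000002}.tar"
for sample in wds.WebDataset(shards):
    audio, sr = sf.read(io.BytesIO(sample["flac"]))  # raw flac bytes -> waveform
    meta = json.loads(sample["json"])                # the per-example annotation
    print(sample["__key__"], sr, meta["caption"])
    break
```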