Audio dataset (#1)
* add audio dataset setup

Co-authored-by: bingyikang <bingyikang@bytedance.com>
commit 64472dedb1 (parent 2d2d781469)
dataset/audio/README.md (new file, 55 lines)
## Audio Dataset

## Stage 1: Pretraining

We mainly use the [WavCaps](https://github.com/XinhaoMei/WavCaps) dataset for pre-training.

### Download

```bash
# install git-lfs
sudo apt update
sudo apt-get install git-lfs

git clone https://huggingface.co/datasets/cvssp/WavCaps
cd WavCaps
git lfs pull --include "*"
```
### Processing

1. Extract the zip file

```bash
# merge the split shards first
zip -s- FILE_NAME.zip -O COMBINED_FILE.zip
unzip COMBINED_FILE.zip
```

2. Processing

Extract the raw audio data:

```bash
unzip COMBINED_FILE.zip -d /target/dir
```
Create a json file (annotation) for each example. Before processing, modify `dataset/audio/process.py` to set the data and json paths, or pass them on the command line:

```bash
python3 dataset/audio/process.py --dataset test --data_dir /path/to/data --json_path /path/to/json
```
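
To sanity-check the result, every clip should now have a sibling annotation; a minimal sketch (the data directory is a placeholder):

```python
# Check that every .flac has a .json annotation written next to it.
import glob
import os

flacs = glob.glob("/path/to/data/*.flac")
missing = [f for f in flacs if not os.path.exists(f.replace(".flac", ".json"))]
print(f"{len(flacs)} clips, {len(missing)} missing annotations")
```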
3. Pack with tar

```bash
python3 dataset/audio/make_tar.py --input /path/to/data --output /path/to/web_dataset \
    --dataclass none --filename filename --num_element 500
```

To view the first entries of a tar file:

```bash
tar tf filename.tar | sed 10q
```
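The shards can also be inspected programmatically; a minimal sketch using `webdataset` (the shard path is a placeholder):

```python
# Iterate the first few samples of a packed shard; each sample carries the
# audio bytes under "flac" and the annotation bytes under "json".
import webdataset as wds

dataset = wds.WebDataset("/path/to/web_dataset/filename000000.tar")
for i, sample in enumerate(dataset):
    print(sample["__key__"], sorted(sample.keys()))
    if i >= 9:
        break
```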
**To set everything up in one line:**

```bash
# DATASET is one of: soundbible, bbc, audioset, freesound
DATASET=soundbible bash dataset/audio/setup.sh
```
dataset/audio/make_tar.py (new file, 216 lines)
import argparse
import io
import json
import os
import random
import tarfile
from glob import glob

import librosa
import numpy as np
import soundfile as sf
import webdataset as wds
from tqdm import tqdm


def tardir(
    file_path, tar_name, n_entry_each, audio_ext=".flac", text_ext=".json",
    shuffle=True, start_idx=0, delete_file=False
):
    """
    Create the tar shards that pack the audio and text files found in one folder.
    @param file_path    | string  | the path where the audio and text files are located
    @param tar_name     | string  | the tar name prefix; a shard index and ".tar" are appended
    @param n_entry_each | int     | how many (audio, text) pairs go into each tar
    @param audio_ext    | string  | the extension of the audio files
    @param text_ext     | string  | the extension of the text files
    @param shuffle      | boolean | True to shuffle the file order before packing
    @param start_idx    | int     | the start index of the tar shards
    @param delete_file  | boolean | True to delete the audio and text files after packing
    """
    filelist = glob(file_path + "/*" + audio_ext)

    if shuffle:
        random.shuffle(filelist)
    count = 0
    n_split = len(filelist) // n_entry_each
    if n_split * n_entry_each != len(filelist):
        n_split += 1
    size_dict = {
        os.path.basename(tar_name) + "{:06d}".format(i) + ".tar": n_entry_each
        for i in range(n_split)
    }
    if n_split * n_entry_each != len(filelist):
        # the last shard only holds the remainder
        size_dict[os.path.basename(tar_name) + "{:06d}".format(n_split - 1) + ".tar"] = (
            len(filelist) - (n_split - 1) * n_entry_each
        )
    for i in tqdm(range(start_idx, n_split + start_idx), desc="Creating .tar file:"):
        with tarfile.open(tar_name + "{:06d}".format(i) + ".tar", "w") as tar_handle:
            for j in range(count, len(filelist)):
                audio = filelist[j]
                basename = ".".join(audio.split(".")[:-1])
                # glob returns full paths, so with absolute inputs os.path.join
                # simply yields the sibling annotation path
                text_file_path = os.path.join(file_path, basename + text_ext)
                audio_file_path = os.path.join(file_path, audio)
                tar_handle.add(audio_file_path)
                tar_handle.add(text_file_path)
                if delete_file:
                    os.remove(audio_file_path)
                    os.remove(text_file_path)
                if (j + 1) % n_entry_each == 0:
                    count = j + 1
                    break
    # write the shard -> size mapping next to the tars
    json_object = json.dumps(size_dict, indent=4)
    with open(os.path.join(os.path.dirname(tar_name), "sizes.json"), "w") as outfile:
        outfile.write(json_object)
    return size_dict


def packup(input, output, filename, dataclass="all", num_element=512, start_idx=0, delete_file=False):
    if not os.path.exists(os.path.join(input, dataclass)):
        print("Dataclass {} does not exist. Skipping it.".format(dataclass))
        return
    os.makedirs(os.path.join(output, dataclass), exist_ok=True)
    tardir(
        os.path.join(input, dataclass),
        os.path.join(output, dataclass, filename),
        num_element,
        start_idx=start_idx,
        delete_file=delete_file,
    )


def load_from_tar(
    file_path,
    file_path_type="local",
    audio_ext="flac",
    text_ext="json",
    samplerate=32000,
    mono=True,
    max_len=1000000,
    dtype="float64",
    res_type="kaiser_best",
):
    """
    Load a tar shard into three parallel lists: (audios, texts, names).
    @param file_path      | string  | the path where the audio and text files are located
    @param file_path_type | string  | controls the address prefix in case it was omitted:
                                      if "local" and 'file:\\' is not already present, it is prepended automatically
    @param audio_ext      | string  | the extension of the audio files
    @param text_ext       | string  | the extension of the text files
    @param samplerate     | int     | the target sample rate of the audio
    @param mono           | boolean | whether to downmix the audio to a mono channel
    @param max_len        | int     | max length of the audio; longer clips are randomly cropped, shorter ones are padded
    @param dtype          | string  | the dtype of the audio sample representation
    @param res_type       | string  | the resampling method passed to librosa
    """
    if file_path_type == "local" and ("file:\\" not in file_path):
        file_path = "file:\\" + file_path
    dataset = wds.WebDataset(file_path)
    audios = []
    texts = []
    names = []
    for sample in dataset:
        for key, value in sample.items():
            if key == audio_ext:
                audio_data, orig_sr = sf.read(io.BytesIO(value))
                if samplerate is not None:
                    audio_data = librosa.resample(
                        audio_data,
                        orig_sr=orig_sr,
                        target_sr=samplerate,
                        res_type=res_type,
                    )
                if len(audio_data) > max_len:
                    # random crop: take a max_len window measured from either end
                    overflow = len(audio_data) - max_len
                    idx = np.random.randint(0, overflow + 1)
                    if np.random.rand() > 0.5:
                        audio_data = audio_data[idx : idx + max_len]
                    else:
                        audio_data = audio_data[
                            len(audio_data) + 1 - idx - max_len : len(audio_data) + 1 - idx
                        ]
                else:
                    # zero-pad to max_len
                    audio_data = np.pad(
                        audio_data,
                        (0, max_len - len(audio_data)),
                        mode="constant",
                        constant_values=0,
                    )
                if mono:
                    audio_data = librosa.to_mono(audio_data)
                audios.append((audio_data, samplerate))
            elif key == text_ext:
                texts.append(value)
            elif key == "__key__":
                names.append(value)
    return audios, texts, names


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        type=str,
        help="input folder, expecting subdirectories like train, valid or test",
    )
    parser.add_argument(
        "--output",
        type=str,
        help="output folder, generating tar files at output/dataclass/filename_{}.tar",
    )
    parser.add_argument(
        "--filename",
        type=str,
        default="",
        help="the filename prefix of the tars, generating tar files at output/dataclass/filename_{}.tar",
    )
    parser.add_argument(
        "--dataclass", type=str, default="all", help="train/valid/test, all, none, or a custom subdirectory name"
    )
    parser.add_argument(
        "--num_element", type=int, default=512, help="pairs of (audio, text) to include in a single tar"
    )
    parser.add_argument(
        "--start_idx", type=int, default=0, help="start index of the tar shards"
    )
    parser.add_argument(
        "--delete_file", action="store_true", help="delete the input files while making the tars"
    )
    args = parser.parse_args()

    if args.dataclass == "all":
        for x in ["train", "valid", "test"]:
            packup(args.input, args.output, args.filename, x, args.num_element, args.start_idx, args.delete_file)
    elif args.dataclass == "none":
        # pack the input folder directly, without a dataclass subdirectory
        os.makedirs(args.output, exist_ok=True)
        tardir(
            args.input,
            os.path.join(args.output, args.filename),
            args.num_element,
            start_idx=args.start_idx,
            delete_file=args.delete_file,
        )
    else:  # a custom dataclass name
        packup(args.input, args.output, args.filename, args.dataclass, args.num_element, args.start_idx, args.delete_file)
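
For a quick end-to-end check, `load_from_tar` above can read a shard back into memory. A minimal sketch (the shard path is a placeholder; passing a `file_path_type` other than `"local"` skips the `file:\` prefixing so webdataset opens the plain local path):

```python
# Sketch: read one shard back and inspect the first decoded clip.
from dataset.audio.make_tar import load_from_tar

audios, texts, names = load_from_tar(
    "/path/to/web_dataset/filename000000.tar",
    file_path_type="none",  # use the path as-is
)
waveform, sr = audios[0]    # (np.ndarray, target samplerate)
print(names[0], sr, waveform.shape)
```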
dataset/audio/process.py (new file, 160 lines)
"""
Code for processing the WavCaps dataset.
"""
import argparse
import glob
import json
import os

from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, default="bbc", choices=["bbc", "audioset", "soundbible", "freesound", "test"])
parser.add_argument(
    "--data_root",
    type=str,
    default="/mnt/bn/zilongdata-hl/dataset/wavcaps",
)
parser.add_argument(
    "--data_dir",
    type=str,
    default="/mnt/bn/zilongdata-hl/dataset/wavcaps/raw_datasets/test/",
)
parser.add_argument(
    "--json_path",
    type=str,
    default="/mnt/bn/zilongdata-hl/dataset/wavcaps/WavCaps/json_files/test.json",
)
args = parser.parse_args()

DATA_DIRS = {
    "bbc": "raw_datasets/BBC_Sound_Effects_flac/",
    "audioset": "raw_datasets/AudioSet_SL_flac/",
    "soundbible": "raw_datasets/SoundBible_flac/",
    "freesound": "raw_datasets/FreeSound_flac/",
}

JSON_PATHS = {
    "bbc": "WavCaps/json_files/BBC_Sound_Effects/bbc_final.json",
    "audioset": "WavCaps/json_files/AudioSet_SL/as_final.json",
    "soundbible": "WavCaps/json_files/SoundBible/sb_final.json",
    "freesound": "WavCaps/json_files/FreeSound/fsd_final.json",
}


def load_audioset_json(fname):
    """An example entry:
    {
        'id': 'Yb0RFKhbpFJA.wav',
        'caption': 'Wind and a man speaking are heard, accompanied by buzzing and ticking.',
        'audio': 'wav_path',
        'duration': 10.0
    }
    """
    with open(fname) as f:
        data = json.load(f)

    for sample in data["data"]:
        # AudioSet ids carry a trailing ".wav"; strip it to match the flac names
        yield sample["id"].split(".")[0], sample["caption"], sample


def load_soundbible_json(fname):
    """An example entry:
    {
        'title': 'Airplane Landing Airport',
        'description': 'Large commercial airplane landing at an airport runway.',
        'author': 'Daniel Simion',
        'href': '2219-Airplane-Landing-Airport.html',
        'caption': 'An airplane is landing.',
        'id': '2219',
        'duration': 14.1424375,
        'audio': 'wav_path',
        'download_link': 'http://soundbible.com/grab.php?id=2219&type=wav'
    }
    """
    with open(fname) as f:
        data = json.load(f)

    for sample in data["data"]:
        yield sample["id"], sample["caption"], sample


def load_freesound_json(fname):
    """An example entry:
    {
        'id': '180913',
        'file_name': 'UK Mello.wav',
        'href': '/people/Tempouser/sounds/180913/',
        'tags': ['Standard', 'ringtone', 'basic', 'traditional'],
        'description': 'Standard traditional basic ringtone, in mello tone.',
        'author': 'Tempouser',
        'duration': 3.204375,
        'download_link': 'https://freesound.org/people/Tempouser/sounds/180913/download/180913__tempouser__uk-mello.wav',
        'caption': 'A traditional ringtone is playing.',
        'audio': 'wav_path'
    }
    """
    with open(fname) as f:
        data = json.load(f)

    for sample in data["data"]:
        yield sample["id"], sample["caption"], sample


def load_bbc_json(fname):
    """An example entry:
    {
        'description': "Timber & Wood - Rip saw, carpenters' workshop.",
        'category': "['Machines']",
        'caption': "Someone is using a rip saw in a carpenter's workshop.",
        'id': '07066104',
        'duration': 138.36,
        'audio': 'wav_path',
        'download_link': 'https://sound-effects-media.bbcrewind.co.uk/zip/07066104.wav.zip'
    }
    """
    with open(fname) as f:
        data = json.load(f)

    for sample in data["data"]:
        yield sample["id"], sample["caption"], sample


def load_test_json(fname):
    """Uses SoundBible as a test example."""
    with open(fname) as f:
        data = json.load(f)

    for sample in data["data"]:
        yield sample["id"], sample["caption"], sample


if __name__ == "__main__":
    if args.dataset in DATA_DIRS:
        data_dir = os.path.join(args.data_root, DATA_DIRS[args.dataset])
        json_path = os.path.join(args.data_root, JSON_PATHS[args.dataset])
    else:
        data_dir = args.data_dir
        json_path = args.json_path

    file_list = glob.glob(f"{data_dir}/*.flac")
    for data_id, unused_caption, meta_data in tqdm(list(globals()[f"load_{args.dataset}_json"](json_path))):
        file_name = os.path.join(data_dir, data_id + ".flac")
        json_save_path = os.path.join(data_dir, data_id + ".json")
        file_list.remove(file_name)

        assert os.path.exists(file_name), f"{file_name} does not exist!"
        # write the metadata entry next to its audio clip
        with open(json_save_path, "w") as f:
            json.dump(meta_data, f)

    # remove clips that have no annotation entry
    if len(file_list) > 0:
        for f in file_list:
            os.remove(f)
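
Each annotation written by the script is just the corresponding WavCaps metadata entry; a minimal sketch of reading one back (placeholder path, using the SoundBible example from the docstring above):

```python
# Read the metadata saved next to a clip and print its caption.
import json

with open("/path/to/data/2219.json") as f:
    meta = json.load(f)
print(meta["caption"])  # e.g. "An airplane is landing."
```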
dataset/audio/setup.sh (new file, 45 lines)
#!/bin/bash
# To set up a dataset, run
# `DATASET=soundbible bash setup.sh`

DATA_ROOT=/mnt/bn/zilongdata-hl/dataset/wavcaps
DATASET=${DATASET:-soundbible}

if [[ $DATASET == soundbible ]]; then
    DATA_FOLDER=SoundBible
    NUM_ELEMENT=2000
elif [[ $DATASET == bbc ]]; then
    DATA_FOLDER=BBC_Sound_Effects
    NUM_ELEMENT=500
elif [[ $DATASET == audioset ]]; then
    DATA_FOLDER=AudioSet_SL
    NUM_ELEMENT=2000
elif [[ $DATASET == freesound ]]; then
    DATA_FOLDER=FreeSound
    NUM_ELEMENT=500
else
    echo "${DATASET} not found!"
    exit 1
fi

CODE_PATH=$(pwd)

# Merge the split zip files
cd ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}
zip -s- ${DATA_FOLDER}.zip -O ${DATA_FOLDER}_combined.zip

# Extract the combined zip file
cd ${DATA_ROOT}
unzip ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}/${DATA_FOLDER}_combined.zip -d raw_datasets/${DATA_FOLDER}
# the archives carry a long internal path; move the flac folder up and drop the rest
mv raw_datasets/${DATA_FOLDER}/mnt/fast/nobackup/scratch4weeks/xm00178/WavCaps/data/waveforms/${DATA_FOLDER}_flac raw_datasets/${DATA_FOLDER}_flac
rm -rf raw_datasets/${DATA_FOLDER}

# Process the raw data to create json annotation files
cd $CODE_PATH
python3 dataset/audio/process.py --data_root ${DATA_ROOT} --dataset ${DATASET}

# Pack up the tar files
python3 dataset/audio/make_tar.py --input ${DATA_ROOT}/raw_datasets/${DATA_FOLDER}_flac \
    --output ${DATA_ROOT}/web_datasets/${DATA_FOLDER} \
    --dataclass none --filename ${DATA_FOLDER} --num_element ${NUM_ELEMENT}
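
After a successful run, the layout under ${DATA_ROOT} should look roughly like this (a sketch for DATASET=soundbible; shard counts depend on the dataset):

    raw_datasets/SoundBible_flac/   # flac clips with sibling .json annotations
    web_datasets/SoundBible/
        SoundBible000000.tar        # (audio, text) pairs, NUM_ELEMENT per shard
        SoundBible000001.tar
        ...
        sizes.json                  # shard name -> number of pairs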