mirror of
https://github.com/Vision-CAIR/MiniGPT-4.git
synced 2025-04-23 22:20:50 +00:00
* add audio dataset setup --------- Co-authored-by: bingyikang <bingyikang@bytedance.com>
46 lines
1.4 KiB
Bash
46 lines
1.4 KiB
Bash
#!/bin/bash
|
|
# TO setup dataset, run
|
|
# `DATASET=soundbible bash setup.sh`
|
|
|
|
|
|
DATA_ROOT=/mnt/bn/zilongdata-hl/dataset/wavcaps
|
|
DATASET=${DATASET-:soundbible}
|
|
|
|
if [[ $DATASET == soundbible ]]; then
|
|
DATA_FOLDER=SoundBible
|
|
NUM_ELEMENT=2000
|
|
elif [[ $DATASET == bbc ]]; then
|
|
DATA_FOLDER=BBC_Sound_Effects
|
|
NUM_ELEMENT=500
|
|
elif [[ $DATASET == audioset ]]; then
|
|
DATA_FOLDER=AudioSet_SL
|
|
NUM_ELEMENT=2000
|
|
elif [[ $DATASET == freesound ]]; then
|
|
DATA_FOLDER=FreeSound
|
|
NUM_ELEMENT=500
|
|
else
|
|
echo "${DATASET} not found!"
|
|
exit
|
|
fi
|
|
|
|
CODE_PATH=$(pwd)
|
|
|
|
# Merge zip files
|
|
cd ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}
|
|
zip -s- ${DATA_FOLDER}.zip -O ${DATA_FOLDER}_combined.zip
|
|
|
|
# Extract zip file
|
|
cd ${DATA_ROOT}
|
|
unzip ${DATA_ROOT}/WavCaps/Zip_files/${DATA_FOLDER}/${DATA_FOLDER}_combined.zip -d raw_datasets/${DATA_FOLDER}
|
|
mv raw_datasets/${DATA_FOLDER}/mnt/fast/nobackup/scratch4weeks/xm00178/WavCaps/data/waveforms/${DATA_FOLDER}_flac raw_datasets/${DATA_FOLDER}_flac
|
|
rm -rf raw_datasets/${DATA_FOLDER}
|
|
|
|
# Process raw data to create json annotation files
|
|
cd $CODE_PATH
|
|
python3 dataset/audio/process.py --data_root ${DATA_ROOT} --dataset ${DATASET}
|
|
|
|
# Pack up tar files
|
|
python3 dataset/audio/make_tar.py --input ${DATA_ROOT}/raw_datasets/${DATA_FOLDER}_flac \
|
|
--output ${DATA_ROOT}/web_datasets/${DATA_FOLDER} \
|
|
--dataclass none --filename ${DATA_FOLDER} --num_element ${NUM_ELEMENT}
|