From 98558dd4fe11cfbb139f970be26886b534119c34 Mon Sep 17 00:00:00 2001
From: junchen14
Date: Mon, 23 Oct 2023 09:43:07 +0300
Subject: [PATCH] prepare dataset readme

---
 .gitignore                             | 180 +++++++++++
 dataset/README_MINIGPTv2_FINETUNE.md   | 133 ++++++++
 train_configs/minigpt_v2_finetune.yaml | 422 ++++++++++++-------------
 3 files changed, 524 insertions(+), 211 deletions(-)
 create mode 100755 .gitignore
 create mode 100644 dataset/README_MINIGPTv2_FINETUNE.md

diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..50d0c3e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,180 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+wandb/
+jobs/logs/
+*.out
+*ipynb
+.history/
+*.json
+*.sh
+.ipynb_common
+logs/
+results/
+prompts/
+output/
+ckpt/
+divide_vqa.py
+
+
+slurm*
+sbatch_generate*
\ No newline at end of file
diff --git a/dataset/README_MINIGPTv2_FINETUNE.md b/dataset/README_MINIGPTv2_FINETUNE.md
new file mode 100644
index 0000000..2d5c825
--- /dev/null
+++ b/dataset/README_MINIGPTv2_FINETUNE.md
@@ -0,0 +1,133 @@
+## Download the COCO captions, RefCOCO, RefCOCO+, RefCOCOg, Visual Genome, TextCaps, LLaVA, GQA, AOK-VQA, OK-VQA, OCR-VQA, filtered Flickr-30k, multi-task conversation, and Unnatural Instructions datasets
+
+### COCO captions
+
+### RefCOCO, RefCOCO+, RefCOCOg
+
+### Visual Genome
+
+### TextCaps
+
+### LLaVA
+
+### GQA
+
+### OK-VQA
+
+### AOK-VQA
+
+### OCR-VQA
+
+### Filtered Flickr-30k
+
+### Multi-task conversation
+
+### Unnatural Instructions
+
+
+### Pre-training datasets download
+We use the filtered synthetic captions prepared by BLIP. For more details about these datasets, please refer to [BLIP](https://github.com/salesforce/BLIP).
+
+Storing the LAION and CC3M+CC12M+SBU datasets requires roughly 2.3 TB of disk space.
+
+Image source | Filtered synthetic caption by ViT-L
+--- | :---:
+CC3M+CC12M+SBU | Download
+LAION115M | Download
+
+Downloading them gives you two JSON files:
+```
+ccs_synthetic_filtered_large.json
+laion_synthetic_filtered_large.json
+```
+
+## Prepare the data step by step
+
+### Set up the dataset folder and move the annotation files to the data storage folder
+```
+export MINIGPT4_DATASET=/YOUR/PATH/FOR/LARGE/DATASET/
+mkdir -p ${MINIGPT4_DATASET}/cc_sbu
+mkdir -p ${MINIGPT4_DATASET}/laion
+mv ccs_synthetic_filtered_large.json ${MINIGPT4_DATASET}/cc_sbu
+mv laion_synthetic_filtered_large.json ${MINIGPT4_DATASET}/laion
+```
+
+### Copy the conversion scripts to the data storage folder
+```
+cp convert_cc_sbu.py ${MINIGPT4_DATASET}/cc_sbu
+cp download_cc_sbu.sh ${MINIGPT4_DATASET}/cc_sbu
+cp convert_laion.py ${MINIGPT4_DATASET}/laion
+cp download_laion.sh ${MINIGPT4_DATASET}/laion
+```
+
+### Convert the laion and cc_sbu annotation files to the img2dataset format
+```
+cd ${MINIGPT4_DATASET}/cc_sbu
+python convert_cc_sbu.py
+
+cd ${MINIGPT4_DATASET}/laion
+python convert_laion.py
+```
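+
+For a sanity check, the conversion these scripts perform amounts to roughly the following. This is a minimal sketch rather than the repository's actual code: it assumes each record in the BLIP JSON carries `url` and `caption` keys, and the real convert_laion.py / convert_cc_sbu.py may differ in details.
+```
+import json
+
+# Read the BLIP annotation JSON (assumed schema: a list of records with
+# "url" and "caption" keys) and write the TSV layout img2dataset expects.
+with open("laion_synthetic_filtered_large.json") as f:
+    records = json.load(f)
+
+with open("laion_synthetic_filtered_large.tsv", "w") as f:
+    f.write("url\tcaption\n")
+    for r in records:
+        # Strip tabs and newlines so each record stays on one TSV row.
+        caption = r["caption"].replace("\t", " ").replace("\n", " ")
+        f.write(f"{r['url']}\t{caption}\n")
+```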
+
+### Download the datasets with img2dataset
+```
+cd ${MINIGPT4_DATASET}/cc_sbu
+sh download_cc_sbu.sh
+cd ${MINIGPT4_DATASET}/laion
+sh download_laion.sh
+```
+
+
+The final dataset structure:
+
+```
+.
+├── ${MINIGPT4_DATASET}
+│   ├── cc_sbu
+│   │   ├── convert_cc_sbu.py
+│   │   ├── download_cc_sbu.sh
+│   │   ├── ccs_synthetic_filtered_large.json
+│   │   ├── ccs_synthetic_filtered_large.tsv
+│   │   └── cc_sbu_dataset
+│   │       ├── 00000.tar
+│   │       ├── 00000.parquet
+│   │       ...
+│   ├── laion
+│   │   ├── convert_laion.py
+│   │   ├── download_laion.sh
+│   │   ├── laion_synthetic_filtered_large.json
+│   │   ├── laion_synthetic_filtered_large.tsv
+│   │   └── laion_dataset
+│   │       ├── 00000.tar
+│   │       ├── 00000.parquet
+│   │       ...
+...
+```
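+
+The download step above is a thin wrapper around [img2dataset](https://github.com/rom1504/img2dataset). As a rough sketch (the exact options in download_laion.sh may differ), an equivalent call through img2dataset's Python API looks like this; the worker counts and image size below are illustrative values, not the repository's settings:
+```
+from img2dataset import download  # pip install img2dataset
+
+# Fetch the images listed in the TSV into numbered webdataset shards
+# (00000.tar plus 00000.parquet metadata, then 00001, ...).
+download(
+    url_list="laion_synthetic_filtered_large.tsv",
+    input_format="tsv",
+    url_col="url",
+    caption_col="caption",
+    output_format="webdataset",
+    output_folder="laion_dataset",
+    processes_count=16,
+    thread_count=64,
+    image_size=256,
+)
+```
+Each shard pairs a .tar of images with a .parquet metadata file, which is the layout shown in the tree above.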
+
+
+## Set up the dataset configuration files
+
+Then set the LAION dataset loading path
+[here](../minigpt4/configs/datasets/laion/defaults.yaml#L5) at Line 5 to
+`${MINIGPT4_DATASET}/laion/laion_dataset/{00000..10488}.tar`
+
+and the Conceptual Caption and SBU dataset loading path
+[here](../minigpt4/configs/datasets/cc_sbu/defaults.yaml#L5) at Line 5 to
+`${MINIGPT4_DATASET}/cc_sbu/cc_sbu_dataset/{00000..01255}.tar`.
+
diff --git a/train_configs/minigpt_v2_finetune.yaml b/train_configs/minigpt_v2_finetune.yaml
index 60d7537..7bf6bbf 100644
--- a/train_configs/minigpt_v2_finetune.yaml
+++ b/train_configs/minigpt_v2_finetune.yaml
@@ -29,244 +29,244 @@ datasets:
         name: "blip_caption"
     sample_ratio: 40
 
-  # llava_conversation: # 77k
-  #   batch_size: 2
-  #   vis_processor:
-  #     train:
-  #       name: "blip2_image_train"
-  #       image_size: 448
-  #   text_processor:
-  #     train:
-  #       name: "blip_caption"
-  #   sample_ratio: 10
+  llava_conversation: # 77k
+    batch_size: 2
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 448
+    text_processor:
+      train:
+        name: "blip_caption"
+    sample_ratio: 10
 
-  # unnatural_instruction:
-  #   batch_size: 1
-  #   vis_processor:
-  #     train:
-  #       name: "blip2_image_train"
-  #       image_size: 448
-  #   text_processor:
-  #     train:
-  #       name: "blip_caption"
-  #   sample_ratio: 15
+  unnatural_instruction:
+    batch_size: 1
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 448
+    text_processor:
+      train:
+        name: "blip_caption"
+    sample_ratio: 15
 
-  # refvg:
-  #   batch_size: 6
-  #   vis_processor:
-  #     train:
-  #       name: "blip2_image_train"
-  #       image_size: 448
-  #   text_processor:
-  #     train:
-  #       name: "blip_caption"
-  #   sample_ratio: 40
+  refvg:
+    batch_size: 6
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 448
+    text_processor:
+      train:
+        name: "blip_caption"
+    sample_ratio: 40
 
-  # llava_detail: #23K
-  #   batch_size: 4
-  #   vis_processor:
-  #     train:
-  #       name: "blip2_image_train"
-  #       image_size: 448
-  #   text_processor:
-  #     train:
-  #       name: "blip_caption"
-  #   sample_ratio: 20
+  llava_detail: #23K
+    batch_size: 4
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 448
+    text_processor:
+      train:
+        name: "blip_caption"
+    sample_ratio: 20
 
-  # llava_reason: # 77k
-  #   batch_size: 4
-  #   vis_processor:
-  #     train:
-  #       name: "blip2_image_train"
-  #       image_size: 448
-  #   text_processor:
-  #     train:
-  #       name: "blip_caption"
-  #   sample_ratio: 80
+  llava_reason: # 77k
+    batch_size: 4
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 448
+    text_processor:
+      train:
+        name: "blip_caption"
+    sample_ratio: 80
 
-  # flickr_grounded_caption:
-  #   batch_size: 2
-  #   vis_processor:
-  #     train:
-  #       name: "blip2_image_train"
-  #       image_size: 448
-  #   text_processor:
-  #     train:
-  #       name: "blip_caption"
-  #   sample_ratio: 80
+  flickr_grounded_caption:
+    batch_size: 2
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 448
+    text_processor:
+      train:
+        name: "blip_caption"
+    sample_ratio: 80
 
-  # flickr_CaptionToPhrase:
-  #   batch_size: 2
-  #   vis_processor:
-  #     train:
-  #       name: "blip2_image_train"
- # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 80 + flickr_CaptionToPhrase: + batch_size: 2 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 80 - # flickr_ObjectToPhrase: - # batch_size: 2 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 80 + flickr_ObjectToPhrase: + batch_size: 2 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 80 - # coco_caption: - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 10 + coco_caption: + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 10 - # textcaps_caption: # - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 10 + textcaps_caption: # + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 10 - # refcoco: # 142k - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 15 + refcoco: # 142k + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 15 - # refcocop: - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 15 + refcocop: + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 15 - # refcocog: - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 15 + refcocog: + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 15 - # invrefcoco: - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 10 + invrefcoco: + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 10 - # invrefcocop: - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 10 + invrefcocop: + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 10 - # invrefcocog: - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 10 + invrefcocog: + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: 
"blip_caption" + sample_ratio: 10 - # coco_vqa: # 82K - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 15 + coco_vqa: # 82K + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 15 - # ok_vqa: # 9k - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 8 + ok_vqa: # 9k + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 8 - # aok_vqa: # 17k - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 12 + aok_vqa: # 17k + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 12 - # gqa: # 82K - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 40 + gqa: # 82K + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 40 - # ocrvqa: # 800K - # batch_size: 6 - # vis_processor: - # train: - # name: "blip2_image_train" - # image_size: 448 - # text_processor: - # train: - # name: "blip_caption" - # sample_ratio: 30 + ocrvqa: # 800K + batch_size: 6 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 30 run: @@ -285,7 +285,7 @@ run: iters_per_epoch: 1000 seed: 42 - output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_conversation_correct_best_v7_ablation1_v5_v6_again_system_prompt" + output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_finetune_test_online" amp: True resume_ckpt_path: null