Merge pull request #398 from junchen14/main

update finetuning code
This commit is contained in:
Jun Chen 2023-10-25 00:06:21 -07:00 committed by GitHub
commit 2f63e6e2ce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
51 changed files with 3247 additions and 24 deletions

184
.gitignore vendored Executable file
View File

@ -0,0 +1,184 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
wandb/
jobs/logs/
*.out
*ipynb
.history/
*.json
*.sh
.ipynb_common
logs/
results/
prompts/
output/
ckpt/
divide_vqa.py
jobs/
*.slurm
slurm*
sbatch_generate*
eval_data/
dataset/Evaluation.md
jupyter_notebook.slurm

24
MiniGPTv2_Train.md Normal file
View File

@ -0,0 +1,24 @@
## Finetuning MiniGPT-v2
You first need to prepare the dataset; follow our [dataset preparation guide](dataset/README_MINIGPTv2_FINETUNE.md).
In train_configs/minigptv2_finetune.yaml, set up the following paths:
- llama_model checkpoint path: "/path/to/llama_checkpoint"
- ckpt (pretrained checkpoint to finetune from): "/path/to/pretrained_checkpoint"
- ckpt save path: "/path/to/save_checkpoint"
For ckpt, you may load from our pretrained model checkpoints:
| MiniGPT-v2 (after stage-2) | MiniGPT-v2 (after stage-3) | MiniGPT-v2 (online developing demo) |
|------------------------------|------------------------------|------------------------------|
| [Download](https://drive.google.com/file/d/1Vi_E7ZtZXRAQcyz4f8E6LtLh2UXABCmu/view?usp=sharing) |[Download](https://drive.google.com/file/d/1jAbxUiyl04SFJMN4sF1vvUU69Etuz4qa/view?usp=sharing) | [Download](https://drive.google.com/file/d/1aVbfW7nkCSYx99_vCRyP1sOlQiWVSnAl/view?usp=sharing) |
```bash
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigptv2_finetune.yaml
```
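In the command above, `NUM_GPU` is the number of GPUs on your node. If you prefer to script the config edits instead of editing the YAML by hand, here is a minimal sketch; it assumes PyYAML is installed and that the checkpoint paths live under the `model` and `run` sections with the key names shown, so verify them against your local train_configs/minigptv2_finetune.yaml first.

```python
# Hypothetical helper: patch the finetune config before launching torchrun.
# The key names (llama_model, ckpt, output_dir) are assumptions; adjust to your config.
import yaml

cfg_path = "train_configs/minigptv2_finetune.yaml"
with open(cfg_path) as f:
    cfg = yaml.safe_load(f)

cfg["model"]["llama_model"] = "/path/to/llama_checkpoint"    # LLaMA weights
cfg["model"]["ckpt"] = "/path/to/pretrained_checkpoint"      # checkpoint to finetune from
cfg["run"]["output_dir"] = "/path/to/save_checkpoint"        # where new checkpoints are saved

with open(cfg_path, "w") as f:
    yaml.safe_dump(cfg, f, sort_keys=False)
```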

View File

@ -23,6 +23,8 @@ Deyao Zhu*, Jun Chen*, Xiaoqian Shen, Xiang Li, Mohamed Elhoseiny
## News
[Oct.24 2023] We release the finetuning code of our MiniGPT-v2.
[Oct.13 2023] Breaking! We release the first major update with our MiniGPT-v2
[Aug.28 2023] We now provide a llama 2 version of MiniGPT-4
@ -63,7 +65,7 @@ Git clone our repository, creating a python environment and activate it via the
git clone https://github.com/Vision-CAIR/MiniGPT-4.git
cd MiniGPT-4
conda env create -f environment.yml
conda activate minigpt4
conda activate minigptv
```
@ -93,9 +95,10 @@ Then, set the variable *llama_model* in the model config file to the LLM weight
Download the pretrained model checkpoints
| MiniGPT-v2 (LLaMA-2 Chat 7B) |
|------------------------------|
| [Download](https://drive.google.com/file/d/1aVbfW7nkCSYx99_vCRyP1sOlQiWVSnAl/view?usp=sharing) |
| MiniGPT-v2 (after stage-2) | MiniGPT-v2 (after stage-3) | MiniGPT-v2 (online developing demo)|
|------------------------------|------------------------------|------------------------------|
| [Download](https://drive.google.com/file/d/1Vi_E7ZtZXRAQcyz4f8E6LtLh2UXABCmu/view?usp=sharing) |[Download](https://drive.google.com/file/d/1jAbxUiyl04SFJMN4sF1vvUU69Etuz4qa/view?usp=sharing) | [Download](https://drive.google.com/file/d/1aVbfW7nkCSYx99_vCRyP1sOlQiWVSnAl/view?usp=sharing) |
For **MiniGPT-v2**, set the path to the pretrained checkpoint in the evaluation config file
in [eval_configs/minigptv2_eval.yaml](eval_configs/minigptv2_eval.yaml#L10) at Line 8.
@ -146,6 +149,7 @@ Thanks [@WangRongsheng](https://github.com/WangRongsheng), you can also run Mini
### Training
For training details of MiniGPT-4, check [here](MiniGPT4_Train.md).
For finetuning details of MiniGPT-v2, check [here](MiniGPTv2_Train.md).

View File

@ -0,0 +1,285 @@
## Download the datasets for finetuning MiniGPT-v2
Download the datasets listed below (a minimal download sketch follows the table).
Image source | Download path
--- | :---:
COCO 2014 images | <a href="http://images.cocodataset.org/zips/train2014.zip">images</a> &nbsp;&nbsp; <a href="https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json"> captions</a>
COCO VQA | <a href="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json">vqa train</a> &nbsp;&nbsp; <a href="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json"> vqa val</a>
Visual Genome | <a href="https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip">images part1</a> &nbsp;&nbsp; <a href="https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip">images part2</a> &nbsp;&nbsp; <a href="https://homes.cs.washington.edu/~ranjay/visualgenome/data/dataset/image_data.json.zip"> image meta data </a>
TextCaps | <a href="https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip">images</a> &nbsp;&nbsp; <a href="https://dl.fbaipublicfiles.com/textvqa/data/textcaps/TextCaps_0.1_train.json"> annotations</a>
RefCOCO | <a href="https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip"> annotations </a>
RefCOCO+ | <a href="https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip"> annotations </a>
RefCOCOg | <a href="https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip"> annotations </a>
OKVQA | <a href="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json"> annotations </a>
AOK-VQA | <a href="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json"> annotations </a>
OCR-VQA | <a href="https://drive.google.com/drive/folders/1_GYPY5UkUy7HIcR0zq3ZCFgeZN7BAfm_?usp=sharing"> annotations </a>
GQA | <a href="https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip">images</a> &nbsp;&nbsp; <a href="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json"> annotations </a>
Filtered flickr-30k | <a href="https://drive.google.com/drive/folders/19c_ggBI77AvdtYlPbuI0ZpnPz73T5teX?usp=sharing"> annotations </a>
Multi-task conversation | <a href="https://drive.google.com/file/d/11HHqB2c29hbSk-WLxdta-nG8UCUrcCN1/view?usp=sharing"> annotations </a>
Filtered unnatural instruction | <a href="https://drive.google.com/file/d/1lXNnBcb5WU-sc8Fe2T2N8J0NRw4sBLev/view?usp=sharing"> annotations </a>
LLaVA | <a href="https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/complex_reasoning_77k.json"> Complex reasoning </a> &nbsp;&nbsp;<a href="https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/detail_23k.json"> Detailed description </a> &nbsp;&nbsp; <a href="https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/conversation_58k.json"> Conversation </a>
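If you want to fetch the annotation files from a script rather than a browser, a minimal sketch is shown below; the target directory is a placeholder, and you can swap in any URL from the table.

```python
# Minimal download sketch for one annotation file from the table above.
import os
import urllib.request

url = "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json"
out_dir = os.path.join(os.environ.get("MINIGPTv2_DATASET", "."), "coco_captions", "annotations")
os.makedirs(out_dir, exist_ok=True)
urllib.request.urlretrieve(url, os.path.join(out_dir, os.path.basename(url)))
print("saved", os.path.join(out_dir, os.path.basename(url)))
```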
### COCO captions
Download the COCO 2014 images and captions
coco 2014 images path
```
${MINIGPTv2_DATASET}
├── coco
│ ├── images
...
```
coco caption annotation path
```
${MINIGPTv2_DATASET}
├── coco_captions
│ └── annotations
│ ├── coco_karpathy_train.json
...
```
Set **image_path** to the COCO 2014 image folder.
Similarly, set **ann_path** to the coco_karpathy_train.json path (a quick layout check is sketched after the config link below).
- [minigpt4/configs/datasets/coco/caption.yaml](../minigpt4/configs/datasets/coco/caption.yaml)
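Before training, you can sanity-check the layout with a small sketch that mirrors how the caption dataset in this PR resolves image files (it joins **image_path** with the basename of each annotation's `image` field); the paths below are placeholders.

```python
# Quick layout check for the COCO caption data.
import json
import os

image_path = "/path/to/coco/images"                        # value set in caption.yaml
ann_path = "/path/to/coco_caption/coco_karpathy_train.json"

anns = json.load(open(ann_path))
if isinstance(anns, dict):                                 # some files wrap the list in "annotations"
    anns = anns["annotations"]

missing = [a for a in anns[:1000]
           if not os.path.exists(os.path.join(image_path, a["image"].split("/")[-1]))]
print(f"checked {min(1000, len(anns))} annotations, {len(missing)} missing image files")
```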
### COCO VQA
Download the VQA v2 train and validation JSON files
```
├── ${MINIGPTv2_DATASET}
│ ├── vqav2
│ ├── vqa_train.json
| ├── vqa_val.json
```
Set **image_path** to the COCO 2014 image folder.
Similarly, set **ann_path** to the vqa_train.json and vqa_val.json paths
- [minigpt4/configs/datasets/coco/defaults_vqa.yaml](../minigpt4/configs/datasets/coco/defaults_vqa.yaml)
### Visual Genome
Download the Visual Genome images and annotation files
```
${MINIGPTv2_DATASET}
├── visual_genome
│ ├── VG_100K
│ ├── VG_100K_2
│ └── region_descriptions.json
│ └── image_data.json
...
```
Set **data_dir** to the visual_genome folder in the config below (this dataset uses a single **data_dir** entry rather than separate image and annotation paths); a quick count check is sketched after the config link.
- [minigpt4/configs/datasets/vg/ref.yaml](../minigpt4/configs/datasets/vg/ref.yaml)
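A small check (paths are placeholders) to confirm the two image folders and the metadata file line up:

```python
# Count Visual Genome images on disk and compare with image_data.json entries.
import json
import os

vg_root = "/path/to/visual_genome"
on_disk = sum(len(os.listdir(os.path.join(vg_root, d))) for d in ("VG_100K", "VG_100K_2"))
meta = json.load(open(os.path.join(vg_root, "image_data.json")))
print(f"{on_disk} images on disk, {len(meta)} entries in image_data.json")
```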
### TextCaps
Download the TextCaps images and annotation files
```
├── ${MINIGPTv2_DATASET}
│ ├── textcaps
│ ├── train_images
│ ├── TextCaps_0.1_train.json
```
Set **image_path** to TextCaps train_images folder.
Similarly, set **ann_path** to the TextCaps_0.1_train.json path
- [minigpt4/configs/datasets/textcaps/caption.yaml](../minigpt4/configs/datasets/textcaps/caption.yaml)
### RefCOCO, RefCOCO+, RefCOCOg
Download the RefCOCO, RefCOCO+, RefCOCOg annotation files
```
${MINIGPTv2_DATASET}
├── refcoco_annotations
│ ├── refcoco
│ │ ├── instances.json
│ │ ├── refs(google).p
│ │ └── refs(unc).p
│ ├── refcoco+
│ │ ├── instances.json
│ │ └── refs(unc).p
│ └── refcocog
│ ├── instances.json
│ ├── refs(google).p
│           └── refs(umd).p
...
```
Set **image_path** to the COCO 2014 image folder.
Similarly, set **ann_path** in all of the following configs to the *refcoco_annotations* folder above, which contains refcoco, refcoco+, and refcocog (a quick check of the annotation files is sketched after the config links).
- [minigpt4/configs/datasets/coco_bbox/refcoco.yaml](../minigpt4/configs/datasets/coco_bbox/refcoco.yaml)
- [minigpt4/configs/datasets/coco_bbox/refcocog.yaml](../minigpt4/configs/datasets/coco_bbox/refcocog.yaml)
- [minigpt4/configs/datasets/coco_bbox/refcocop.yaml](../minigpt4/configs/datasets/coco_bbox/refcocop.yaml)
- [minigpt4/configs/datasets/coco_bbox/invrefcoco.yaml](../minigpt4/configs/datasets/coco_bbox/invrefcoco.yaml)
- [minigpt4/configs/datasets/coco_bbox/invrefcocog.yaml](../minigpt4/configs/datasets/coco_bbox/invrefcocog.yaml)
- [minigpt4/configs/datasets/coco_bbox/invrefcocop.yaml](../minigpt4/configs/datasets/coco_bbox/invrefcocop.yaml)
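The REFER loader added in this PR reads `refs(<splitBy>).p` with pickle and `instances.json` with json from each sub-folder; a quick check that the files unpack as expected (paths are placeholders) is:

```python
# Sanity-check one RefCOCO annotation folder the way the REFER class reads it.
import json
import os
import pickle

ann_dir = "/path/to/refcoco_annotations/refcoco"     # or refcoco+ / refcocog
refs = pickle.load(open(os.path.join(ann_dir, "refs(unc).p"), "rb"))
instances = json.load(open(os.path.join(ann_dir, "instances.json")))

train_refs = [r for r in refs if r["split"] == "train"]
print(len(refs), "refs total,", len(train_refs), "in the train split")
print(len(instances["annotations"]), "box annotations,", len(instances["images"]), "images")
```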
### OKVQA
```
Location_you_like
├── ${MINIGPTv2_DATASET}
│ ├── okvqa
│ ├── okvqa_train.json
```
Set **image_path** to the COCO 2014 image folder.
Similarly, set **ann_path** to the location of the OKVQA dataset
- [minigpt4/configs/datasets/okvqa/defaults.yaml](../minigpt4/configs/datasets/okvqa/defaults.yaml)
The original OK-VQA questions and annotations can also be downloaded directly:
- [OK-VQA Input Questions](https://okvqa.allenai.org/static/data/OpenEnded_mscoco_train2014_questions.json.zip)
- [OK-VQA Annotations](https://okvqa.allenai.org/static/data/mscoco_train2014_annotations.json.zip)
### AOK-VQA
Download the AOK-VQA annotation dataset
```
export AOKVQA_DIR=YOUR_DATASET_PATH
mkdir -p ${AOKVQA_DIR}
curl -fsSL https://prior-datasets.s3.us-east-2.amazonaws.com/aokvqa/aokvqa_v1p0.tar.gz | tar xvz -C ${AOKVQA_DIR}
```
```
Location_you_like
├── ${MINIGPTv2_DATASET}
│ ├── aokvqa
│ ├── aokvqa_v1p0_train.json
```
Set **image_path** to the COCO 2014 image folder.
Similarly, set **ann_path** to the location of the AOKVQA dataset
- [minigpt4/configs/datasets/aokvqa/defaults.yaml](../minigpt4/configs/datasets/aokvqa/defaults.yaml)
### OCR-VQA
Download the OCR-VQA annotation files, and download the images with the loadDataset.py script provided by OCR-VQA (a rough equivalent is sketched after the config link below).
```
Location_you_like
├── ${MINIGPTv2_DATASET}
│ ├── ocrvqa
│ ├── images
│ ├── dataset.json
```
Set **image_path** as the ocrvqa/images folder.
Similarly, set **ann_path** to the dataset.json
- [minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml](../minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml)
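If you cannot run the official loadDataset.py, a rough equivalent is sketched below; the `imageURL` field name and the extension handling are assumptions based on the official loader, so check them against your copy of dataset.json first.

```python
# Rough stand-in for OCR-VQA's loadDataset.py: fetch every image listed in dataset.json.
import json
import os
import urllib.request

ann_path = "/path/to/ocrvqa/dataset.json"
image_dir = "/path/to/ocrvqa/images"
os.makedirs(image_dir, exist_ok=True)

data = json.load(open(ann_path))
for image_id, entry in data.items():
    url = entry["imageURL"]                      # assumed field name
    ext = os.path.splitext(url)[1] or ".jpg"
    out = os.path.join(image_dir, image_id + ext)
    if not os.path.exists(out):
        urllib.request.urlretrieve(url, out)
```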
### GQA
Download the GQA annotation files and images
```
Location_you_like
├── ${MINIGPTv2_DATASET}
│ ├── gqa
│ ├── images
│ ├── train_balanced_questions.json
```
Set **image_path** as the gqa/images folder.
Similarly, set **ann_path** to the train_balanced_questions.json
- [minigpt4/configs/datasets/gqa/balanced_val.yaml](../minigpt4/configs/datasets/gqa/balanced_val.yaml)
### Filtered Flickr-30k
Download the filtered Flickr-30k images (fill out this [form](https://forms.illinois.edu/sec/229675) on the official website or download from [Kaggle](https://www.kaggle.com/datasets/hsankesara/flickr-image-dataset/download?datasetVersionNumber=1)) and the annotation files
```
${MINIGPTv2_DATASET}
├── filtered_flickr
│ ├── images
│ ├── captiontobbox.json
│ ├── groundedcaption.json
│ └── phrasetobbox.json
...
```
Set **image_path** to the Flickr-30k images folder.
Similarly, set **ann_path** to groundedcaption.json, captiontobbox.json, and phrasetobbox.json for the
grounded image caption, caption-to-bbox, and phrase-to-bbox datasets, respectively.
- [minigpt4/configs/datasets/flickr/default.yaml](../minigpt4/configs/datasets/flickr/default.yaml)
- [minigpt4/configs/datasets/flickr/caption_to_phrase.yaml](../minigpt4/configs/datasets/flickr/caption_to_phrase.yaml)
- [minigpt4/configs/datasets/flickr/object_to_phrase.yaml](../minigpt4/configs/datasets/flickr/object_to_phrase.yaml)
### Multi-task conversation
Download the multi-task conversation dataset
```
Location_you_like
${MINIGPTv2_DATASET}
├── multitask_conversation
│ └── multitask_conversation.json
...
```
Set **image_path** as the COCO 2014 images folder.
Similarly, set **ann_path** to the multitask_conversation.json file path
- [minigpt4/configs/datasets/multitask_conversation/default.yaml](../minigpt4/configs/datasets/multitask_conversation/default.yaml)
### Unnatural instruction
Download the filtered unnatural instruction annotation files (we removed the very long sentences from the original Unnatural Instructions dataset)
```
Location_you_like
├── ${MINIGPTv2_DATASET}
│ ├── unnatural_instructions
│ ├── filtered_unnatural_instruction.json
```
This dataset is text-only, so there is no image path to set.
Set **ann_path** to the filtered_unnatural_instruction.json file path
- [minigpt4/configs/datasets/nlp/unnatural_instruction.yaml](../minigpt4/configs/datasets/nlp/unnatural_instruction.yaml)
### LLaVA
Download the three LLaVA annotation files (Complex reasoning, Detailed description, and Conversation) linked in the table above.
```
Location_you_like
├── ${MINIGPTv2_DATASET}
│ ├── llava
│ ├── conversation_58k.json
│ ├── detail_23k.json
│ ├── complex_reasoning_77k.json
```
Set **image_path** to the COCO 2014 image folder.
Similarly, set **ann_path** to the previously downloaded conversation_58k.json,
detail_23k.json, and complex_reasoning_77k.json in conversation.yaml, detail.yaml, and reason.yaml, respectively (a quick parse check is sketched after the config links).
- [minigpt4/configs/datasets/llava/conversation.yaml](../minigpt4/configs/datasets/llava/conversation.yaml)
- [minigpt4/configs/datasets/llava/detail.yaml](../minigpt4/configs/datasets/llava/detail.yaml)
- [minigpt4/configs/datasets/llava/reason.yaml](../minigpt4/configs/datasets/llava/reason.yaml)
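A quick parse check (paths are placeholders) to confirm the three files downloaded completely:

```python
# Confirm the three LLaVA instruction files parse and report their sizes.
import json

for name in ("conversation_58k.json", "detail_23k.json", "complex_reasoning_77k.json"):
    entries = json.load(open(f"/path/to/llava/{name}"))
    print(name, len(entries), "entries")
```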

View File

@ -1,4 +1,4 @@
name: minigpt4
name: minigptv
channels:
- pytorch
- defaults

View File

@ -0,0 +1,20 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
aok_vqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
train:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
storage:
- /path/to/aokvqa_v1p0_train.json
images:
storage: /path/to/coco/images

View File

@ -0,0 +1,21 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
coco_caption: # name of the dataset builder
# dataset_card: dataset_card/coco_caption.md
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
train:
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
md5: aa31ac474cf6250ebb81d18348a07ed8
storage: /path/to/coco_caption/coco_karpathy_train.json
images:
storage: /path/to/coco/images

View File

@ -0,0 +1,24 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
coco_vqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
annotations:
train:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
storage:
- /path/to/vqav2/vqa_train.json
- /path/to/vqav2/vqa_val.json
images:
storage: /path/to/coco/images

View File

@ -0,0 +1,8 @@
datasets:
invrefcoco:
data_type: images
build_info:
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: invrefcoco
splitBy: unc

View File

@ -0,0 +1,8 @@
datasets:
invrefcocog:
data_type: images
build_info:
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: invrefcocog
splitBy: umd

View File

@ -0,0 +1,8 @@
datasets:
invrefcocop:
data_type: images
build_info:
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: invrefcoco+
splitBy: unc

View File

@ -0,0 +1,8 @@
datasets:
refcoco:
data_type: images
build_info:
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: refcoco
splitBy: unc

View File

@ -0,0 +1,8 @@
datasets:
refcocog:
data_type: images
build_info:
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: refcocog
splitBy: umd

View File

@ -0,0 +1,8 @@
datasets:
refcocop:
data_type: images
build_info:
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: refcoco+
splitBy: unc

View File

@ -0,0 +1,6 @@
datasets:
flickr_CaptionToPhrase:
data_type: images
build_info:
image_path: /path/to/filtered_flickr/images
ann_path: /path/to/filtered_flickr/captiontobbox.json

View File

@ -0,0 +1,6 @@
datasets:
flickr_grounded_caption:
data_type: images
build_info:
image_path: /path/to/filtered_flickr/images
ann_path: /path/to/filtered_flickr/groundedcaption.json

View File

@ -0,0 +1,6 @@
datasets:
flickr_ObjectToPhrase:
data_type: images
build_info:
image_path: /path/to/filtered_flickr/images
ann_path: /path/to/filtered_flickr/phrasetobbox.json

View File

@ -0,0 +1,21 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
gqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
train:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
storage:
- /path/to/gqa/train_balanced_questions.json
images:
storage: /path/to/gqa/images

View File

@ -0,0 +1,7 @@
datasets:
llava_conversation:
data_type: images
build_info:
image_path: /path/to/coco/images
ann_path: /path/to/llava/conversation_58k.json

View File

@ -0,0 +1,6 @@
datasets:
llava_detail:
data_type: images
build_info:
image_path: /path/to/coco/images
ann_path: /path/to/llava/detail_23k.json

View File

@ -0,0 +1,7 @@
datasets:
llava_reason:
data_type: images
build_info:
image_path: /path/to/coco/images
ann_path: /path/to/llava/complex_reasoning_77k.json

View File

@ -0,0 +1,7 @@
datasets:
multitask_conversation:
data_type: images
build_info:
image_path: /path/to/coco/images
ann_path: /path/to/multitask_conversation/multi_task_conversation.json

View File

@ -0,0 +1,5 @@
datasets:
unnatural_instruction:
data_type: text
build_info:
ann_path: /path/to/unnatural_instructions/filtered_unnatural_instruction.json

View File

@ -0,0 +1,6 @@
datasets:
ocrvqa:
data_type: images
build_info:
image_path: /path/to/ocrvqa/images
ann_path: /path/to/ocrvqa/dataset.json

View File

@ -0,0 +1,21 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
ok_vqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
train:
url:
# TODO make this order insensitive
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
storage:
- /path/to/okvqa/okvqa_train.json
images:
storage: /path/to/coco/images

View File

@ -0,0 +1,9 @@
datasets:
textcaps_caption:
data_type: images
build_info:
image_path: /path/to/textcaps/train_images
ann_path: /path/to/textcaps/TextCaps_0.1_train.json

View File

@ -0,0 +1,5 @@
datasets:
refvg:
data_type: images
build_info:
data_dir: /path/to/visual_genome

View File

@ -6,6 +6,425 @@ from minigpt4.common.registry import registry
from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from minigpt4.datasets.datasets.laion_dataset import LaionDataset
from minigpt4.datasets.datasets.cc_sbu_dataset import CCSBUDataset, CCSBUAlignDataset
from minigpt4.datasets.datasets.text_caps import TextCapDataset
from minigpt4.datasets.datasets.llava_dataset import LlavaDetailDataset, LlavaReasonDataset, LlavaConversationDataset
from minigpt4.datasets.datasets.unnatural_instruction import UnnaturalDataset
from minigpt4.datasets.datasets.multitask_conversation import MultiTaskConversationDataset
from minigpt4.datasets.datasets.flickr import GroundedDetailDataset,CaptionToObjectDataset,PhraseToObjectDataset
from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset
from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset
from minigpt4.datasets.datasets.gqa_datasets import GQADataset
from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset
from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset
from minigpt4.datasets.datasets.ocrvqa_dataset import OCRVQADataset
from minigpt4.datasets.datasets.coco_caption import COCOCapDataset
@registry.register_builder("multitask_conversation")
class MultitaskConversationBuilder(BaseDatasetBuilder):
train_dataset_cls = MultiTaskConversationDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/multitask_conversation/default.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("unnatural_instruction")
class UnnaturalInstructionBuilder(BaseDatasetBuilder):
train_dataset_cls = UnnaturalDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/nlp/unnatural_instruction.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
)
return datasets
@registry.register_builder("llava_detail")
class LlavaDetailBuilder(BaseDatasetBuilder):
train_dataset_cls = LlavaDetailDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/llava/detail.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("llava_reason")
class LlavaReasonBuilder(BaseDatasetBuilder):
train_dataset_cls = LlavaReasonDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/llava/reason.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("llava_conversation")
class LlavaConversationBuilder(BaseDatasetBuilder):
train_dataset_cls = LlavaConversationDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/llava/conversation.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
class AllRefCOCOBuilder(BaseDatasetBuilder):
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
image_path = build_info.image_path
ann_path = build_info.ann_path
datasets = dict()
if not os.path.exists(image_path):
warnings.warn("image path {} does not exist.".format(image_path))
if not os.path.exists(ann_path):
warnings.warn("ann path {} does not exist.".format(ann_path))
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=ann_path,
vis_root=image_path,
dataset=build_info.dataset,
splitBy=build_info.splitBy
)
return datasets
@registry.register_builder("refcoco")
class RefCOCOBuilder(AllRefCOCOBuilder):
train_dataset_cls = ReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/refcoco.yaml",
}
@registry.register_builder("refcocop")
class RefCOCOPBuilder(AllRefCOCOBuilder):
train_dataset_cls = ReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/refcocop.yaml",
}
@registry.register_builder("refcocog")
class RefCOCOGBuilder(AllRefCOCOBuilder):
train_dataset_cls = ReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/refcocog.yaml",
}
@registry.register_builder("invrefcoco")
class InvRefCOCOBuilder(AllRefCOCOBuilder):
train_dataset_cls = InvReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/invrefcoco.yaml",
}
@registry.register_builder("invrefcocop")
class InvRefCOCOPBuilder(AllRefCOCOBuilder):
train_dataset_cls = InvReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/invrefcocop.yaml",
}
@registry.register_builder("invrefcocog")
class InvRefCOCOGBuilder(AllRefCOCOBuilder):
train_dataset_cls = InvReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/invrefcocog.yaml",
}
@registry.register_builder("refvg")
class RefVisualGenomeBuilder(BaseDatasetBuilder):
train_dataset_cls = ReferVisualGenomeDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/vg/ref.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
data_dir = build_info.data_dir
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
data_dir=data_dir,
)
return datasets
@registry.register_builder("textcaps_caption")
class TextcapCaptionBuilder(BaseDatasetBuilder):
train_dataset_cls = TextCapDataset
DATASET_CONFIG_DICT = {"default": "configs/datasets/textcaps/caption.yaml"}
def _download_ann(self):
pass
def _download_vis(self):
pass
def build(self):
self.build_processors()
build_info = self.config.build_info
datasets = dict()
split = "train"
# create datasets
# [NOTE] return inner_datasets (wds.DataPipeline)
dataset_cls = self.train_dataset_cls
datasets[split] = dataset_cls(
vis_processor=self.vis_processors[split],
text_processor=self.text_processors[split],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("coco_vqa")
class COCOVQABuilder(BaseDatasetBuilder):
train_dataset_cls = COCOVQADataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco/defaults_vqa.yaml",
}
@registry.register_builder("ok_vqa")
class OKVQABuilder(COCOVQABuilder):
DATASET_CONFIG_DICT = {
"default": "configs/datasets/okvqa/defaults.yaml",
}
@registry.register_builder("aok_vqa")
class AOKVQABuilder(BaseDatasetBuilder):
train_dataset_cls = AOKVQADataset
DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"}
@registry.register_builder("gqa")
class GQABuilder(BaseDatasetBuilder):
train_dataset_cls = GQADataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/gqa/balanced_val.yaml",
}
@registry.register_builder("flickr_grounded_caption")
class GroundedCaptionBuilder(BaseDatasetBuilder):
train_dataset_cls = GroundedDetailDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/flickr/default.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("flickr_CaptionToPhrase")
class CaptionToPhraseBuilder(BaseDatasetBuilder):
train_dataset_cls = CaptionToObjectDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/flickr/caption_to_phrase.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("flickr_ObjectToPhrase")
class ObjectToPhraseBuilder(BaseDatasetBuilder):
train_dataset_cls = PhraseToObjectDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/flickr/object_to_phrase.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
class DocumentVQABuilder(BaseDatasetBuilder):
def _download_ann(self):
pass
def _download_vis(self):
pass
def build(self):
self.build_processors()
build_info = self.config.build_info
datasets = dict()
split = "train"
dataset_cls = self.train_dataset_cls
datasets[split] = dataset_cls(
vis_processor=self.vis_processors[split],
text_processor=self.text_processors[split],
vis_root=build_info.image_path,
ann_path=build_info.ann_path
)
return datasets
@registry.register_builder("ocrvqa")
class OCRVQABuilder(DocumentVQABuilder):
train_dataset_cls = OCRVQADataset
DATASET_CONFIG_DICT = {"default": "configs/datasets/ocrvqa/ocrvqa.yaml"}
@registry.register_builder("cc_sbu")
@ -72,6 +491,17 @@ class LaionBuilder(BaseDatasetBuilder):
return datasets
@registry.register_builder("coco_caption")
class COCOCapBuilder(BaseDatasetBuilder):
train_dataset_cls = COCOCapDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco/caption.yaml",
}
@registry.register_builder("cc_sbu_align")
class CCSBUAlignBuilder(BaseDatasetBuilder):
train_dataset_cls = CCSBUAlignDataset

View File

@ -0,0 +1,116 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
from collections import OrderedDict
import json
import os
import random
import torch
from PIL import Image
from minigpt4.datasets.datasets.vqa_datasets import VQADataset #, VQAEvalDataset
class __DisplMixin:
def displ_item(self, index):
sample, ann = self.__getitem__(index), self.annotation[index]
return OrderedDict(
{
"file": ann["image"],
"question": ann["question"],
"question_id": ann["question_id"],
"direct_answers": "; ".join(ann["direct_answers"]),
"choices": "; ".join(ann["choices"]),
"correct_choice": ann["choices"][ann["correct_choice_idx"]],
"image": sample["image"],
}
)
class AOKVQADataset(VQADataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool =[
"[vqa] {}",
"[vqa] Based on the image, respond to this question with a short answer: {}"
]
exist_annotation = []
for ann in self.annotation:
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
if os.path.exists(image_path):
exist_annotation.append(ann)
self.annotation = exist_annotation
def get_data(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
answer_key = "direct_answers"
answer_weight = {}
for answer in ann[answer_key]:
if answer in answer_weight.keys():
answer_weight[answer] += 1 / len(ann[answer_key])
else:
answer_weight[answer] = 1 / len(ann[answer_key])
answers = list(answer_weight.keys())
weights = list(answer_weight.values())
answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights
return {
"image": image,
"question": question,
"answer": answer,
}
def __getitem__(self, index):
data = self.get_data(index)
question = self.text_processor(data["question"])
instruction = random.choice(self.instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
answer = self.text_processor(data['answer'])
return {
"image": data['image'],
"instruction_input": instruction,
"answer": answer,
}
class AOKVQGDataset(AOKVQADataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool = [
'Given the image, generate a question whose answer is: {}',
'Based on the image, provide a question with the answer: {}',
'Given the visual representation, create a question for which the answer is "{}"',
'From the image provided, craft a question that leads to the reply: {}',
'Considering the picture, come up with a question where the answer is: {}',
'Taking the image into account, generate a question that has the answer: {}'
]
def __getitem__(self, index):
data = self.get_data(index)
instruction = random.choice(self.instruction_pool).format(data['answer'])
return {
"image": data['image'],
"instruction_input": instruction,
"answer": data['question'],
}

View File

@ -12,6 +12,8 @@ from torch.utils.data import Dataset, ConcatDataset
from torch.utils.data.dataloader import default_collate
class BaseDataset(Dataset):
def __init__(
self, vis_processor=None, text_processor=None, vis_root=None, ann_paths=[]
@ -23,8 +25,15 @@ class BaseDataset(Dataset):
self.vis_root = vis_root
self.annotation = []
# print("ann paths", ann_paths)
for ann_path in ann_paths:
# some annotation files wrap the list in {"annotations": [...]}, others are a plain list
ann = json.load(open(ann_path, "r"))
if isinstance(ann, dict):
self.annotation.extend(ann['annotations'])
else:
self.annotation.extend(ann)
self.vis_processor = vis_processor
self.text_processor = text_processor
@ -46,6 +55,7 @@ class BaseDataset(Dataset):
ann[key] = str(idx)
class ConcatDataset(ConcatDataset):
def __init__(self, datasets: Iterable[Dataset]) -> None:
super().__init__(datasets)

View File

@ -10,6 +10,7 @@ from collections import OrderedDict
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from PIL import Image
import random
class __DisplMixin:
@ -60,6 +61,71 @@ class CaptionDataset(BaseDataset, __DisplMixin):
}
class COCOCaptionDataset(BaseDataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.img_ids = {}
n = 0
# keep only annotations whose image belongs to the train split
self.filter_annotation = []
for ann in self.annotation:
if "train" in ann["image"]:
self.filter_annotation.append(ann)
self.annotation = self.filter_annotation
for ann in self.annotation:
img_id = ann["image_id"]
if img_id not in self.img_ids.keys():
self.img_ids[img_id] = n
n += 1
self.instruction_pool = [
'Briefly describe this image.',
'Provide a concise depiction of this image.',
'Present a short description of this image.',
'Summarize this image in a few words.',
'A short image caption:',
'A short image description:',
'A photo of ',
'An image that shows ',
'Write a short description for the image. ',
'Write a description for the photo.',
'Provide a description of what is presented in the photo.',
'Briefly describe the content of the image.',
'Can you briefly explain what you see in the image?',
'Could you use a few words to describe what you perceive in the photo?',
'Please provide a short depiction of the picture.',
'Using language, provide a short account of the image.',
'Use a few words to illustrate what is happening in the picture.',
]
def __getitem__(self, index):
# TODO this assumes image input, not general enough
ann = self.annotation[index]
img_file = ann["image"].split("/")[-1]
image_path = os.path.join(self.vis_root, img_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
caption = self.text_processor(ann["caption"])
instruction = random.choice(self.instruction_pool)
instruction = "<Img><ImageHere></Img> [caption] {} ".format(instruction)
return {
"image": image,
"answer": caption,
"instruction_input": instruction,
}
class CaptionEvalDataset(BaseDataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""

View File

@ -0,0 +1,120 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import os
import json
import torch
import numpy as np
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
from minigpt4.datasets.datasets.caption_datasets import COCOCaptionDataset, CaptionEvalDataset
COCOCapDataset = COCOCaptionDataset
class COCOCapEvalDataset(CaptionEvalDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
split (string): val or test
"""
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]
return {
"image": image,
"image_id": img_id,
"instance_id": ann["instance_id"],
}
class NoCapsEvalDataset(CaptionEvalDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
split (string): val or test
"""
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
img_id = ann["img_id"]
return {
"image": image,
"image_id": img_id,
"instance_id": ann["instance_id"],
}
class RefCOCOEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_id = data['img_id']
sent = data['sents']
image_path = os.path.join(self.root_path, f'{img_id[:27]}.jpg')
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
question = f"[refer] tell me the location of {sent}?"
return image, question, img_id
class EvalCaptionData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
ann = dict()
for item in self.loaded_data:
image_id = item['image_id']
ann[image_id] = item['image']
self.ann = [{'image_id':image_id, 'image': ann[image_id]} for image_id in ann]
def __len__(self):
return len(self.ann)
def __getitem__(self, idx):
data = self.ann[idx]
image_id = data['image_id']
img_file = data['image'].split('/')[-1]
image_path = os.path.join(self.root_path, img_file)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
question = f"[caption] please describe this image?"
return image, question, image_id

View File

@ -0,0 +1,348 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class ReferCOCODataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path, dataset='refcoco', splitBy='unc'):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.refer = REFER(ann_path, vis_root, dataset, splitBy)
self.ref_ids = self.refer.getRefIds(split="train")
self.instruction_pool = [
"[refer] {}",
"[refer] give me the location of {}",
"[refer] where is {} ?",
"[refer] from this image, tell me the location of {}",
"[refer] the location of {} is",
"[refer] could you tell me the location for {} ?",
"[refer] where can I locate the {} ?",
]
def __len__(self):
return len(self.ref_ids)
def preprocess(self, index):
ref_id = self.ref_ids[index]
ref = self.refer.loadRefs(ref_id)[0]
image_file = 'COCO_train2014_{:0>12}.jpg'.format(ref["image_id"])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image_orig_size = image.size
image = self.vis_processor(image)
# bounding boxes are expressed on a fixed 100x100 grid, independent of the processed image size
image_new_size = [100, 100]
sample_sentence = random.choice(ref['sentences'])['raw']
refer_sentence = self.text_processor(sample_sentence)
bbox = self.refer.getRefBox(ref['ref_id'])
bbox = [
bbox[0] / image_orig_size[0] * image_new_size[0],
bbox[1] / image_orig_size[1] * image_new_size[1],
(bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
(bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
]
bbox = [int(x) for x in bbox]
bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
return {
"image": image,
"refer_sentence": refer_sentence,
"bbox": bbox,
"image_id": ref['image_id'],
}
def __getitem__(self, index):
data = self.preprocess(index)
instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": data['image'],
"instruction_input": instruction,
"answer": data['bbox'],
"image_id": data['image_id'],
}
class InvReferCOCODataset(ReferCOCODataset):
def __init__(self, *args, **kwargs):
super(InvReferCOCODataset, self).__init__(*args, **kwargs)
self.instruction_pool = [
"[identify] {}",
"[identify] what object is in this location {}",
"[identify] identify the object present at this location {}",
"[identify] what is it in {}",
"[identify] describe this object in {}",
"[identify] this {} is",
"[identify] the object in {} is",
]
def __getitem__(self, index):
data = self.preprocess(index)
instruction = random.choice(self.instruction_pool).format(data['bbox'])
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": data['image'],
"instruction_input": instruction,
"answer": self.text_processor(data['refer_sentence']),
"image_id": data['image_id'],
}
class REFER:
def __init__(self, data_root, vis_root, dataset='refcoco', splitBy='unc'):
# provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
# also provide dataset name and splitBy information
# e.g., dataset = 'refcoco', splitBy = 'unc'
dataset = dataset.split('inv')[-1] # inv dataset is stored in the same path as normal dataset
print('loading dataset %s into memory...' % dataset)
self.ann_dir = os.path.join(data_root, dataset)
if dataset in ['refcoco', 'refcoco+', 'refcocog']:
self.vis_root = vis_root
elif dataset == 'refclef':
raise ValueError('No RefClef image data')
else:
raise ValueError('No refer dataset is called [%s]' % dataset)
# load refs from data/dataset/refs(dataset).json
tic = time.time()
ref_file = os.path.join(self.ann_dir, 'refs(' + splitBy + ').p')
self.data = {}
self.data['dataset'] = dataset
self.data['refs'] = pickle.load(open(ref_file, 'rb'))
# load annotations from data/dataset/instances.json
instances_file = os.path.join(self.ann_dir, 'instances.json')
instances = json.load(open(instances_file, 'r'))
self.data['images'] = instances['images']
self.data['annotations'] = instances['annotations']
self.data['categories'] = instances['categories']
# create index
self.createIndex()
print('DONE (t=%.2fs)' % (time.time() - tic))
def createIndex(self):
# create sets of mapping
# 1) Refs: {ref_id: ref}
# 2) Anns: {ann_id: ann}
# 3) Imgs: {image_id: image}
# 4) Cats: {category_id: category_name}
# 5) Sents: {sent_id: sent}
# 6) imgToRefs: {image_id: refs}
# 7) imgToAnns: {image_id: anns}
# 8) refToAnn: {ref_id: ann}
# 9) annToRef: {ann_id: ref}
# 10) catToRefs: {category_id: refs}
# 11) sentToRef: {sent_id: ref}
# 12) sentToTokens: {sent_id: tokens}
print('creating index...')
# fetch info from instances
Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
for ann in self.data['annotations']:
Anns[ann['id']] = ann
imgToAnns[ann['image_id']] = imgToAnns.get(ann['image_id'], []) + [ann]
for img in self.data['images']:
Imgs[img['id']] = img
for cat in self.data['categories']:
Cats[cat['id']] = cat['name']
# fetch info from refs
Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
Sents, sentToRef, sentToTokens = {}, {}, {}
for ref in self.data['refs']:
# ids
ref_id = ref['ref_id']
ann_id = ref['ann_id']
category_id = ref['category_id']
image_id = ref['image_id']
# add mapping related to ref
Refs[ref_id] = ref
imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
refToAnn[ref_id] = Anns[ann_id]
annToRef[ann_id] = ref
# add mapping of sent
for sent in ref['sentences']:
Sents[sent['sent_id']] = sent
sentToRef[sent['sent_id']] = ref
sentToTokens[sent['sent_id']] = sent['tokens']
# create class members
self.Refs = Refs
self.Anns = Anns
self.Imgs = Imgs
self.Cats = Cats
self.Sents = Sents
self.imgToRefs = imgToRefs
self.imgToAnns = imgToAnns
self.refToAnn = refToAnn
self.annToRef = annToRef
self.catToRefs = catToRefs
self.sentToRef = sentToRef
self.sentToTokens = sentToTokens
print('index created.')
def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''):
image_ids = image_ids if type(image_ids) == list else [image_ids]
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
refs = self.data['refs']
else:
if not len(image_ids) == 0:
refs = [self.imgToRefs[image_id] for image_id in image_ids]
else:
refs = self.data['refs']
if not len(cat_ids) == 0:
refs = [ref for ref in refs if ref['category_id'] in cat_ids]
if not len(ref_ids) == 0:
refs = [ref for ref in refs if ref['ref_id'] in ref_ids]
if not len(split) == 0:
if split in ['testA', 'testB', 'testC']:
refs = [ref for ref in refs if
split[-1] in ref['split']] # we also consider testAB, testBC, ...
elif split in ['testAB', 'testBC', 'testAC']:
refs = [ref for ref in refs if ref['split'] == split] # rarely used I guess...
elif split == 'test':
refs = [ref for ref in refs if 'test' in ref['split']]
elif split == 'train' or split == 'val':
refs = [ref for ref in refs if ref['split'] == split]
else:
raise ValueError('No such split [%s]' % split)
ref_ids = [ref['ref_id'] for ref in refs]
return ref_ids
def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
image_ids = image_ids if type(image_ids) == list else [image_ids]
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
ann_ids = [ann['id'] for ann in self.data['annotations']]
else:
if not len(image_ids) == 0:
lists = [self.imgToAnns[image_id] for image_id in image_ids if image_id in self.imgToAnns] # list of [anns]
anns = list(itertools.chain.from_iterable(lists))
else:
anns = self.data['annotations']
if not len(cat_ids) == 0:
anns = [ann for ann in anns if ann['category_id'] in cat_ids]
ann_ids = [ann['id'] for ann in anns]
if not len(ref_ids) == 0:
ids = set(ann_ids).intersection(set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids]))
return ann_ids
def getImgIds(self, ref_ids=[]):
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if not len(ref_ids) == 0:
image_ids = list(set([self.Refs[ref_id]['image_id'] for ref_id in ref_ids]))
else:
image_ids = self.Imgs.keys()
return image_ids
def getCatIds(self):
return self.Cats.keys()
def loadRefs(self, ref_ids=[]):
if type(ref_ids) == list:
return [self.Refs[ref_id] for ref_id in ref_ids]
elif type(ref_ids) == int:
return [self.Refs[ref_ids]]
def loadAnns(self, ann_ids=[]):
if type(ann_ids) == list:
return [self.Anns[ann_id] for ann_id in ann_ids]
elif type(ann_ids) == int:
return [self.Anns[ann_ids]]
def loadImgs(self, image_ids=[]):
if type(image_ids) == list:
return [self.Imgs[image_id] for image_id in image_ids]
elif type(image_ids) == int:
return [self.Imgs[image_ids]]
def loadCats(self, cat_ids=[]):
if type(cat_ids) == list:
return [self.Cats[cat_id] for cat_id in cat_ids]
elif type(cat_ids) == int:
return [self.Cats[cat_ids]]
def getRefBox(self, ref_id):
ref = self.Refs[ref_id]
ann = self.refToAnn[ref_id]
return ann['bbox'] # [x, y, w, h]
def showRef(self, ref, seg_box='box'):
ax = plt.gca()
# show image
image = self.Imgs[ref['image_id']]
I = io.imread(os.path.join(self.vis_root, image['file_name']))
ax.imshow(I)
# show refer expression
for sid, sent in enumerate(ref['sentences']):
print('%s. %s' % (sid + 1, sent['sent']))
# show segmentations
if seg_box == 'seg':
ann_id = ref['ann_id']
ann = self.Anns[ann_id]
polygons = []
color = []
c = 'none'
if type(ann['segmentation'][0]) == list:
# polygon used for refcoco*
for seg in ann['segmentation']:
poly = np.array(seg).reshape((len(seg) // 2, 2))  # integer division; float '/' would break reshape in Python 3
polygons.append(Polygon(poly, True, alpha=0.4))
color.append(c)
p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 1, 0, 0), linewidths=3, alpha=1)
ax.add_collection(p) # thick yellow polygon
p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 0, 0, 0), linewidths=1, alpha=1)
ax.add_collection(p) # thin red polygon
else:
# mask used for refclef
raise NotImplementedError('RefClef is not downloaded')
# show bounding-box
elif seg_box == 'box':
ann_id = ref['ann_id']
ann = self.Anns[ann_id]
bbox = self.getRefBox(ref['ref_id'])
box_plot = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor='green', linewidth=3)
ax.add_patch(box_plot)

View File

@ -0,0 +1,145 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import os
import json
import random
from PIL import Image
from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset
from collections import OrderedDict
class __DisplMixin:
def displ_item(self, index):
sample, ann = self.__getitem__(index), self.annotation[index]
return OrderedDict(
{
"file": ann["image"],
"question": ann["question"],
"question_id": ann["question_id"],
"answers": "; ".join(ann["answer"]),
"image": sample["image"],
}
)
class COCOVQADataset(VQADataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool =[
"[vqa] {}",
"[vqa] Based on the image, respond to this question with a short answer: {}"
]
exist_annotation = []
for ann in self.annotation:
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
if os.path.exists(image_path):
exist_annotation.append(ann)
self.annotation = exist_annotation
def get_data(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
question_id = ann["question_id"]
answer_weight = {}
for answer in ann["answer"]:
if answer in answer_weight.keys():
answer_weight[answer] += 1 / len(ann["answer"])
else:
answer_weight[answer] = 1 / len(ann["answer"])
answers = list(answer_weight.keys())
weights = list(answer_weight.values())
answer = random.choices(answers, weights=weights, k=1)[0]  # randomly sample one answer according to its weight
return {
"image": image,
"question": question,
"question_id": question_id,
"answer": answer,
}
def __getitem__(self, index):
data = self.get_data(index)
instruction = random.choice(self.instruction_pool).format(data['question'])
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": data['image'],
"question_id": data["question_id"],
"instruction_input": instruction,
"answer": self.text_processor(data['answer']),
}
class COCOVQAEvalDataset(VQAEvalDataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.instruction_pool = [
'Question: {} Short answer:',
]
self.vis_root = vis_root
self.annotation = json.load(open(ann_paths[0]))
answer_list_path = ann_paths[1]
if os.path.exists(answer_list_path):
self.answer_list = json.load(open(answer_list_path))
else:
self.answer_list = None
try:
self.coco_fmt_qust_file = ann_paths[2]
self.coco_fmt_anno_file = ann_paths[3]
except IndexError:
self.coco_fmt_qust_file = None
self.coco_fmt_anno_file = None
self.vis_processor = vis_processor
self.text_processor = text_processor
self._add_instance_ids()
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
instruction = random.choice(self.instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": image,
'image_path': image_path,
"question": question,
"question_id": ann["question_id"],
"instruction_input": instruction,
"instance_id": ann["instance_id"],
}
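The multi-answer handling in COCOVQADataset.get_data above weights each annotated answer by its frequency and then samples one answer per item. A small worked sketch of that logic with a hypothetical answer list:

import random

raw_answers = ["bus", "bus", "bus", "car"]          # hypothetical VQA annotation
answer_weight = {}
for answer in raw_answers:
    answer_weight[answer] = answer_weight.get(answer, 0) + 1 / len(raw_answers)
# answer_weight == {"bus": 0.75, "car": 0.25}
answer = random.choices(list(answer_weight.keys()),
                        weights=list(answer_weight.values()), k=1)[0]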

View File

@ -0,0 +1,159 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class GroundedDetailDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.instruction_pool = [
'[grounding] please describe this image in details',
'[grounding] describe this image as detailed as possible',
'[grounding] summarize this image in details',
'[grounding] give a thorough description of what you see in this image',
]
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
image_file = '{}.jpg'.format(info['image_id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
answer = info['grounded_caption']
instruction = random.choice(self.instruction_pool)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": info['image_id'],
}
class CaptionToObjectDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.instruction_pool = [
'[detection] {}',
]
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
image_file = '{}.jpg'.format(info['image_id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
input = info["caption"]
answer = info["output"]
instruction = random.choice(self.instruction_pool).format(input)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
print("CaptionToObject instruction", instruction)
print("CaptionToObject answer", answer)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": info['image_id'],
}
class PhraseToObjectDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.instruction_pool = [
'[detection] {}',
]
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
image_file = '{}.jpg'.format(info['image_id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
input = info["phrase"]
answer = "<p>"+input+"</p> "+info["bbox"]
instruction = random.choice(self.instruction_pool).format(input)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
print("PhraseToObject instruction", instruction)
print("PhraseToObject answer", answer)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": info['image_id'],
}
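PhraseToObjectDataset above builds its target by wrapping the phrase in <p> tags and appending the box string stored in the annotation; a made-up example of the resulting answer (the exact box format comes from the annotation file, so it is only assumed here):

phrase = "a red umbrella"                  # hypothetical annotation fields
bbox_str = "{<32><48><55><70>}"
answer = "<p>" + phrase + "</p> " + bbox_str
# -> "<p>a red umbrella</p> {<32><48><55><70>}"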

View File

@ -0,0 +1,60 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import os
import json
from PIL import Image
from minigpt4.datasets.datasets.vqa_datasets import VQADataset
from collections import OrderedDict
import random
class __DisplMixin:
def displ_item(self, index):
sample, ann = self.__getitem__(index), self.annotation[index]
return OrderedDict(
{
"file": ann["image"],
"question": ann["question"],
"question_id": ann["question_id"],
"answers": "; ".join(ann["answer"]),
"image": sample["image"],
}
)
class GQADataset(VQADataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool =[
"[vqa] {}",
"[vqa] Based on the image, respond to this question with a short answer: {}"
]
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
instruction = random.choice(self.instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
answers = self.text_processor(ann["answer"])
return {
"image": image,
"instruction_input": instruction,
"answer": answers,
}
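The VQA-style datasets in this PR all build the model input the same way: a [vqa] template is filled with the question and then wrapped in the image tag. For a hypothetical question the resulting instruction_input looks like this:

import random

instruction_pool = [
    "[vqa] {}",
    "[vqa] Based on the image, respond to this question with a short answer: {}",
]
question = "What color is the bus?"                       # hypothetical
instruction = random.choice(instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
# e.g. "<Img><ImageHere></Img> [vqa] What color is the bus? "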

View File

@ -0,0 +1,150 @@
import os
import json
import pickle
import random
import time
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class LlavaDetailDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
answer = info['conversations'][1]['value']
instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": info['id'],
}
class LlavaReasonDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
answer = info['conversations'][1]['value']
instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": info['id'],
}
class LlavaConversationDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.ann=[]
with open(ann_path, 'r') as f:
self.ann = json.load(f)
self.connect_sym = "!@#"
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
first_instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
first_instruction = '<Img><ImageHere></Img> {} '.format(first_instruction)
questions = [first_instruction]
answers = []
for i, item in enumerate(info["conversations"][1:]):
if i % 2 ==0: # assistant
assistant_answer = item["value"]
answers.append(assistant_answer)
else:
human_instruction = item["value"]+" "
questions.append(human_instruction)
questions = self.connect_sym.join(questions)
answers = self.connect_sym.join(answers)
return {
"image": image,
"conv_q": questions,
'conv_a': answers,
"image_id": info['id'],
"connect_sym": self.connect_sym
}
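LlavaConversationDataset above (and MultiTaskConversationDataset in the next file) packs all turns of a conversation into two strings joined by connect_sym, so downstream code can split them back into aligned question/answer lists. A minimal sketch, assuming dataset is an instance of the class above:

sample = dataset[0]                         # hypothetical instance of the class above
sym = sample["connect_sym"]                 # "!@#"
questions = sample["conv_q"].split(sym)     # first turn already wrapped with <Img><ImageHere></Img>
answers = sample["conv_a"].split(sym)
for q, a in zip(questions, answers):
    print("Q:", q)
    print("A:", a)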

View File

@ -0,0 +1,75 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class MultiTaskConversationDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
with open(ann_path, 'r') as f:
self.ann = json.load(f)
self.connect_sym = "!@#"
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
first_instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
first_instruction = '<Img><ImageHere></Img> {} '.format(first_instruction)
questions = [first_instruction]
answers = []
for i, item in enumerate(info["conversations"][1:]):
if i % 2 ==0: # assistant
assistant_answer = item["value"]
answers.append(assistant_answer)
else:
human_instruction = item["value"]+" "
questions.append(human_instruction)
questions = self.connect_sym.join(questions)
answers = self.connect_sym.join(answers)
return {
"image": image,
"conv_q": questions,
'conv_a': answers,
"image_id": info['id'],
"connect_sym": self.connect_sym
}

View File

@ -0,0 +1,77 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class OCRVQADataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.data = self.create_data(ann_path)
self.instruction_pool =[
"[vqa] {}",
"[vqa] Based on the image, respond to this question with a short answer: {}"
]
def create_data(self, ann_path):
processed_data = []
with open(ann_path, 'r') as f:
data = json.load(f)
for k in data.keys():
if data[k]['split'] != 1: continue # 1 for training, 2 for validation, 3 for test
ext = os.path.splitext(data[k]['imageURL'])[1]
imageFile = k + ext
assert len(data[k]['questions']) == len(data[k]['answers'])
for q, a in zip(data[k]['questions'], data[k]['answers']):
processed_data.append(
{'question': q,
'answer': a,
'image_path': imageFile,
'image_id': k,
'title': data[k]['title'],
'genre': data[k]['genre'],
}
)
return processed_data
def __len__(self):
return len(self.data)
def __getitem__(self, index):
sample = self.data[index]
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(sample["question"])
answer = self.text_processor(sample["answer"])
instruction = random.choice(self.instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": sample['image_id']
}
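create_data above reads an OCR-VQA style JSON keyed by image id and keeps only split 1 (training) entries, emitting one record per question/answer pair. An illustrative annotation entry with just the fields the code accesses (values are made up):

data = {
    "0001": {                                            # hypothetical image id
        "split": 1,                                      # 1 train, 2 validation, 3 test
        "imageURL": "http://example.com/covers/0001.jpg",
        "questions": ["Who is the author of this book?"],
        "answers": ["Jane Doe"],
        "title": "An Example Title",
        "genre": "Fiction",
    }
}
# create_data keeps this entry (split == 1) and stores the image as "0001.jpg"
# (image id plus the extension taken from imageURL).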

View File

@ -0,0 +1,77 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class TextCapDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.instruction_pool = [
'Briefly describe this image.',
'Provide a concise depiction of this image.',
'Present a short description of this image.',
'Summarize this image in a few words.',
'A short image caption:',
'A short image description:',
'A photo of ',
'An image that shows ',
'Write a short description for the image. ',
'Write a description for the photo.',
'Provide a description of what is presented in the photo.',
'Briefly describe the content of the image.',
'Can you briefly explain what you see in the image?',
'Could you use a few words to describe what you perceive in the photo?',
'Please provide a short depiction of the picture.',
'Using language, provide a short account of the image.',
'Use a few words to illustrate what is happening in the picture.',
]
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann["data"])
def __getitem__(self, index):
info = self.ann["data"][index]
image_file = '{}.jpg'.format(info['image_id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
caption = info["caption_str"]
caption = self.text_processor(caption)
instruction = "<Img><ImageHere></Img> [caption] {} ".format(random.choice(self.instruction_pool))
return {
"image": image,
"instruction_input": instruction,
"answer": caption,
}

View File

@ -0,0 +1,46 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class UnnaturalDataset(Dataset):
def __init__(self, text_processor, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.text_processor = text_processor
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]["instances"][0]
instruction = info["instruction_with_input"]
constraints = info["constraints"]
answer = info["output"]
if constraints is not None:
instruction = instruction+" "+constraints
return {
"instruction_input": self.text_processor(instruction),
"answer": self.text_processor(answer),
}

View File

@ -0,0 +1,90 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
from torch.utils.data import Dataset
from visual_genome import local
class ReferVisualGenomeDataset(Dataset):
def __init__(self, vis_processor, text_processor, data_dir):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.data_dir = data_dir
self.vis_processor = vis_processor
self.text_processor = text_processor
all_regions = local.get_all_region_descriptions(self.data_dir)
all_regions = [region for regions in all_regions for region in regions]
# follow OFA practice, only regions smaller than 16384 pixels are used for refer
self.regions = [region for region in all_regions if region.width * region.height < 16384]
self.instruction_pool = [
"[refer] {}",
"[refer] give me the location of {}",
"[refer] where is {} ?",
"[refer] from this image, tell me the location of {}",
"[refer] the location of {} is",
"[refer] could you tell me the location for {} ?",
"[refer] where can I locate the {} ?",
]
def __len__(self):
return len(self.regions)
def preprocess(self, index):
region = self.regions[index]
image_file = region.image.url.split('/')[-2:]
image_path = os.path.join(self.data_dir, *image_file)
image = Image.open(image_path).convert("RGB")
image_orig_size = image.size
image = self.vis_processor(image)
image_new_size = [100,100]
sample_sentence = region.phrase
refer_sentence = self.text_processor(sample_sentence)
bbox = [region.x, region.y, region.width, region.height]
bbox = [
bbox[0] / image_orig_size[0] * image_new_size[0],
bbox[1] / image_orig_size[1] * image_new_size[1],
(bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
(bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
]
bbox = [int(x) for x in bbox]
bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
return {
"image": image,
"refer_sentence": refer_sentence,
"bbox": bbox,
"image_id": region.image.id,
}
def __getitem__(self, index):
data = self.preprocess(index)
instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": data['image'],
"instruction_input": instruction,
"answer": data['bbox'],
"image_id": data['image_id'],
}
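preprocess above rescales Visual Genome region boxes from pixel coordinates into a 100 x 100 grid and serialises them as bracketed tokens, which become the answer text. A worked sketch with made-up numbers:

image_orig_size = (640, 480)               # hypothetical original (width, height)
image_new_size = [100, 100]
x, y, w, h = 320, 120, 64, 48              # hypothetical region box in pixels
bbox = [
    x / image_orig_size[0] * image_new_size[0],
    y / image_orig_size[1] * image_new_size[1],
    (x + w) / image_orig_size[0] * image_new_size[0],
    (y + h) / image_orig_size[1] * image_new_size[1],
]
bbox = [int(v) for v in bbox]              # [50, 25, 60, 35]
print("{{<{}><{}><{}><{}>}}".format(*bbox))  # {<50><25><60><35>}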

View File

@ -0,0 +1,223 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import torch
from PIL import Image
import os
from minigpt4.datasets.datasets.base_dataset import BaseDataset
class VQADataset(BaseDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
# def collater(self, samples):
# image_list, question_list, answer_list, weight_list = [], [], [], []
# num_answers = []
# for sample in samples:
# image_list.append(sample["image"])
# question_list.append(sample["question"])
# weight_list.extend(sample["weights"])
# answers = sample["answer"]
# answer_list.extend(answers)
# num_answers.append(len(answers))
# return {
# "image": torch.stack(image_list, dim=0),
# "text_input": question_list,
# "answer": answer_list,
# "weight": torch.Tensor(weight_list),
# "n_answers": torch.LongTensor(num_answers),
# }
class VQAEvalDataset(BaseDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
class OKVQAEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_id = data['image_id']
question = data['question']
question_id = data['question_id']
img_file = '{:0>12}.jpg'.format(img_id)
image_path = os.path.join(self.root_path, img_file)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, question_id, img_id
class VizWizEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_id = data['image']
question = data['question']
answers = data['answers']
answers = '_'.join([answer['answer'] for answer in answers])
image_path = os.path.join(self.root_path, img_id)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
# question = f"[vqa] Based on the image, respond to this question with a short answer: {question} "
question = f"[vqa] Based on the image, respond to this question with a short answer: {question} and reply 'unanswerable' if you could not answer it"
return image, question, answers
class AOKVQADAEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_file = data['image']
question = data['question']
question_id = data['question_id']
image_path = os.path.join(self.root_path, img_file)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, question_id
class AOKVQAMCEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_file = data['image']
question = data['question']
question_id = data['question_id']
image_path = os.path.join(self.root_path, img_file)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image).half().cuda()
candidates=data['choices']
# question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, question_id, candidates
class IconQAEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
image_id = data['image_id']
question = data['question']
image_path = os.path.join(self.root_path, image_id, 'image.png')
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image).half().cuda()
candidates = '_'.join(data['choices'])
answer = data['answer']
# question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, candidates, answer
class GQAEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
ann = self.loaded_data[idx]
image_id = ann["image"]
image_path = os.path.join(self.root_path, f"{image_id}")
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = ann["question"]
# question = f'Question: {question} Short answer: '
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
labels = ann["answer"]
return image, question, labels
class HMEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
ann = self.loaded_data[idx]
image_id = ann["img"]
image_path = os.path.join(self.root_path, f"{image_id}")
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = ann["text"]
question = f"[vqa] This is an image with: \"{question}\" written on it. Is it hateful? Answer:"
labels = ann["label"]
return image, question, labels
class VSREvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
ann = self.loaded_data[idx]
image_path = os.path.join(self.root_path, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = ann["caption"]
question = f'[vqa] Based on the image, is this statement true or false? {question}'
question_id = ann["image"].split('.')[0]
labels = 'true' if ann["label"] == 1 else 'false'
return image, question, labels
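OKVQAEvalData above locates COCO images by zero-padding the numeric image id to twelve digits; the other eval datasets use the file name stored in the annotation directly. For example:

img_id = 397133                            # hypothetical COCO image id
print('{:0>12}.jpg'.format(img_id))        # 000000397133.jpg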

View File

@ -172,12 +172,12 @@ class MiniGPTBase(BaseModel):
batch_size = len(conv_q)
for batch_idx in range(batch_size):
questions, answers = conv_q[batch_idx], conv_a[batch_idx]
questions = [self.llama_tokenizer(q,
questions = [self.llama_tokenizer(self.llama_tokenizer.bos_token + q,
return_tensors="pt",
add_special_tokens=False).to(self.device) for q in questions[1:]] # the first question is handled in the prompt wrap function, skip it
answers = [self.llama_tokenizer(q,
answers = [self.llama_tokenizer(a + self.end_sym,
return_tensors="pt",
add_special_tokens=False).to(self.device) for q in answers]
add_special_tokens=False).to(self.device) for a in answers]
cur_id = []
cur_target = []
for i in range(len(questions)):

View File

@ -14,7 +14,7 @@ from minigpt4.common.dist_utils import get_rank, get_world_size, is_main_process
from minigpt4.common.logger import MetricLogger, SmoothedValue
from minigpt4.common.registry import registry
from minigpt4.datasets.data_utils import prepare_sample
import wandb
class BaseTask:
def __init__(self, **kwargs):
@ -234,7 +234,9 @@ class BaseTask:
else:
optimizer.step()
optimizer.zero_grad()
# if self.cfg.wandb_log:
if self.cfg.run_cfg.wandb_log:
wandb.log({"epoch": inner_epoch, "loss": loss})
metric_logger.update(loss=loss.item())
metric_logger.update(lr=optimizer.param_groups[0]["lr"])

View File

@ -12,6 +12,7 @@ import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import wandb
import minigpt4.tasks as tasks
from minigpt4.common.config import Config
@ -43,10 +44,7 @@ def parse_args():
"in xxx=yyy format will be merged into config file (deprecate), "
"change to --cfg-options instead.",
)
args = parser.parse_args()
# if 'LOCAL_RANK' not in os.environ:
# os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
@ -77,22 +75,25 @@ def main():
# set before init_distributed_mode() to ensure the same job_id shared across all ranks.
job_id = now()
cfg = Config(parse_args())
args = parse_args()
cfg = Config(args)
init_distributed_mode(cfg.run_cfg)
setup_seeds(cfg)
# set after init_distributed_mode() to only log on master.
setup_logger()
cfg.pretty_print()
task = tasks.setup_task(cfg)
datasets = task.build_datasets(cfg)
model = task.build_model(cfg)
if cfg.run_cfg.wandb_log:
wandb.login()
wandb.init(project="minigptv", name=cfg.run_cfg.job_name)
wandb.watch(model)
runner = get_runner_class(cfg)(
cfg=cfg, job_id=job_id, task=task, model=model, datasets=datasets
)

View File

@ -53,3 +53,6 @@ run:
world_size: 1
dist_url: "env://"
distributed: True
wandb_log: True
job_name: minigpt4_llama2_pretrain

View File

@ -47,3 +47,6 @@ run:
world_size: 1
dist_url: "env://"
distributed: True
wandb_log: True
job_name: minigpt4_llama2_finetune

View File

@ -53,3 +53,6 @@ run:
world_size: 1
dist_url: "env://"
distributed: True
wandb_log: True
job_name: minigpt4_pretrain

View File

@ -47,3 +47,6 @@ run:
world_size: 1
dist_url: "env://"
distributed: True
wandb_log: True
job_name: minigpt4_finetune

View File

@ -0,0 +1,294 @@
model:
arch: minigpt_v2
model_type: pretrain
max_txt_len: 1024
image_size: 448
end_sym: "</s>"
llama_model: "/path/to/llama_checkpoint"
ckpt: "/path/to/pretrained_checkpoint"
use_grad_checkpoint: True
chat_template: True
lora_r: 64
lora_alpha: 16
datasets:
multitask_conversation:
batch_size: 2
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 50
llava_conversation:
batch_size: 2
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 30
unnatural_instruction:
batch_size: 1
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 10
refvg:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 40
llava_detail:
batch_size: 4
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 20
llava_reason:
batch_size: 4
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 80
flickr_grounded_caption:
batch_size: 2
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 80
flickr_CaptionToPhrase:
batch_size: 2
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 80
flickr_ObjectToPhrase:
batch_size: 2
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 80
coco_caption:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 10
textcaps_caption:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 30
refcoco:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 25
refcocop:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 25
refcocog:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 25
invrefcoco:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 10
invrefcocop:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 10
invrefcocog:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 10
coco_vqa:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 15
ok_vqa:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 8
aok_vqa:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 12
gqa:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 50
ocrvqa:
batch_size: 6
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 30
run:
task: image_text_pretrain
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 1e-5
min_lr: 8e-5
warmup_lr: 1e-6
weight_decay: 0.05
max_epoch: 50
num_workers: 6
warmup_steps: 1000
iters_per_epoch: 1000
seed: 42
output_dir: "/path/to/save_checkpoint"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: ["train"]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
wandb_log: True
job_name: minigptv2_finetune
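The sample_ratio fields above weight how often each dataset is drawn during finetuning; the exact mixing is handled by the repo's data loading code, but conceptually it is ratio-proportional sampling. A minimal sketch, independent of the actual loader implementation:

import random

# hypothetical subset of the ratios declared above
ratios = {"llava_reason": 80, "multitask_conversation": 50, "ok_vqa": 8}
names = list(ratios.keys())
weights = list(ratios.values())

def next_dataset_name():
    # draw a dataset name with probability proportional to its sample_ratio
    return random.choices(names, weights=weights, k=1)[0]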