Mirror of https://github.com/Vision-CAIR/MiniGPT-4.git (synced 2025-04-05 10:30:45 +00:00)

commit 0e5d34ad2e (parent 10f61a4dd8): add finetuning code
jobs/srun_test.sh (new file, 30 lines)
@@ -0,0 +1,30 @@
cd ..

job_name=minigpt4_v2_test

read LOWERPORT UPPERPORT < /proc/sys/net/ipv4/ip_local_port_range
while :
do
    PORT="`shuf -i $LOWERPORT-$UPPERPORT -n 1`"
    ss -lpn | grep -q ":$PORT " || break
done

#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/336_final_v1_gqa.yaml

#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/448_final_v1_gqa_ablation2.yaml

torchrun --master-port ${PORT} --nproc-per-node 2 train.py --cfg-path train_configs/minigpt_v2_finetune.yaml

#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path finetune_conversation_ablation/conversation_v2_last_336_test.yaml

#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/336_final_v1_13B.yaml

# torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/448_v2_llama2.yaml
#accelerate launch train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/336_v2_llama2.yaml

# torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/336_v2_llama2_clip_encoder.yaml

#best_data_ratio_336_full_dataset_lr2e4_v1.yaml
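The loop above keeps drawing random ports from the kernel's ephemeral range until `ss` reports one that is unused, then passes it to torchrun as the rendezvous port. A minimal Python sketch of the same idea (a hypothetical helper, not part of the repo) lets the OS pick a free port by binding to port 0:

```python
import socket

def find_free_port() -> int:
    """Bind to port 0 so the OS assigns an unused port, then release it."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]

if __name__ == "__main__":
    # e.g. export MASTER_PORT=$(python find_free_port.py) before launching torchrun
    print(find_free_port())
```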
minigpt4/configs/datasets/aokvqa/defaults.yaml (new executable file, 29 lines)
@@ -0,0 +1,29 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  aok_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      # annotations:
      #   train:
      #     url:
      #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
      #     storage:
      #       - /path/to/aokvqa/annotations/aokvqa_v1p0_train.json
      # images:
      #   storage: /path/to/coco/images/

      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
          storage:
            - /ibex/project/c2133/minigpt4_v2_dataset/aokvqa/annotations/aokvqa_v1p0_train.json
      images:
        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/
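These dataset configs follow the LAVIS layout: the key under `datasets` (`aok_vqa` here) names the registered builder, and `build_info` carries annotation URLs plus local `storage` and image paths. A hedged sketch of loading one of these files and overriding the cluster-specific paths with OmegaConf (the config library used by LAVIS-style code; treat the exact override values as placeholders):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("minigpt4/configs/datasets/aokvqa/defaults.yaml")
info = cfg.datasets.aok_vqa.build_info

# Point the hard-coded /ibex/... paths at your own copies of the data.
info.annotations.train.storage = ["/data/aokvqa/annotations/aokvqa_v1p0_train.json"]
info.images.storage = "/data/coco/images/"

print(OmegaConf.to_yaml(cfg.datasets.aok_vqa))
```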
minigpt4/configs/datasets/coco/caption.yaml (new file, 38 lines)
@@ -0,0 +1,38 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_caption: # name of the dataset builder
    # dataset_card: dataset_card/coco_caption.md
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    # build_info:
    #   # Be careful not to append minus sign (-) before split to avoid itemizing
    #   annotations:
    #     train:
    #       url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
    #       md5: aa31ac474cf6250ebb81d18348a07ed8
    #       storage: /path/to/coco_caption/annotations/coco_karpathy_train.json
    #   images:
    #     storage: /path/to/coco/images/

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
          md5: aa31ac474cf6250ebb81d18348a07ed8
          storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_train.json
        # val:
        #   url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
        #   md5: b273847456ef5580e33713b1f7de52a0
        #   storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_val.json
        # test:
        #   url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
        #   md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
        #   storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_test.json
      images:
        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
minigpt4/configs/datasets/coco/defaults_vqa.yaml (new executable file, 33 lines)
@@ -0,0 +1,33 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # annotations:
      #   train:
      #     url:
      #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
      #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
      #     storage:
      #       - /path/to/vqav2/annotations/vqa_train.json
      #       - /path/to/vqav2/coco/annotations/vqa_val.json
      #   images:
      #     storage: /path/to/coco/images/

      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
          storage:
            - /ibex/project/c2133/minigpt4_v2_dataset/vqav2/annotations/vqa_train.json
            - /ibex/project/c2133/minigpt4_v2_dataset/vqav2/coco/annotations/vqa_val.json
      images:
        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
minigpt4/configs/datasets/coco_bbox/invrefcoco.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  invrefcoco:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: invrefcoco
      splitBy: unc
minigpt4/configs/datasets/coco_bbox/invrefcocog.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  invrefcocog:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: invrefcocog
      splitBy: umd
minigpt4/configs/datasets/coco_bbox/invrefcocop.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  invrefcocop:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: invrefcoco+
      splitBy: unc
minigpt4/configs/datasets/coco_bbox/refcoco.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  refcoco:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: refcoco
      splitBy: unc
minigpt4/configs/datasets/coco_bbox/refcocog.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  refcocog:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: refcocog
      splitBy: umd
minigpt4/configs/datasets/coco_bbox/refcocop.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  refcocop:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: refcoco+
      splitBy: unc
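All six refcoco-family configs expose the same four `build_info` fields (`image_path`, `ann_path`, `dataset`, `splitBy`), which the builders added later in this commit forward to the REFER annotation loader. Note that `image_path` must point at the COCO train2014 jpegs, because the dataset classes compose filenames from the image id; a small sketch of that naming convention (hypothetical helper, mirroring the format string used in `ReferCOCODataset.preprocess`):

```python
import os

def coco_train2014_path(image_root: str, image_id: int) -> str:
    """Build the file path the refcoco datasets expect under image_path."""
    # Filenames are zero-padded to 12 digits, e.g. COCO_train2014_000000000009.jpg
    return os.path.join(image_root, "COCO_train2014_{:0>12}.jpg".format(image_id))

print(coco_train2014_path("/ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train", 9))
```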
minigpt4/configs/datasets/flickr/caption_to_phrase.yaml (new executable file, 6 lines)
@@ -0,0 +1,6 @@
datasets:
  CaptionToPhrase:
    data_type: images
    build_info:
      image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_grounding_phrase5_v2_last.json
minigpt4/configs/datasets/flickr/default.yaml (new executable file, 6 lines)
@@ -0,0 +1,6 @@
datasets:
  grounded_detailed_image_caption:
    data_type: images
    build_info:
      image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_grounding_phrase5_last.json
minigpt4/configs/datasets/flickr/object_to_phrase.yaml (new executable file, 6 lines)
@@ -0,0 +1,6 @@
datasets:
  ObjectToPhrase:
    data_type: images
    build_info:
      image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_phrase2bbox_resample_last.json
minigpt4/configs/datasets/gqa/balanced_val.yaml (new file, 33 lines)
@@ -0,0 +1,33 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    # build_info:
    #   # Be careful not to append minus sign (-) before split to avoid itemizing
    #   annotations:
    #     train:
    #       url:
    #         - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
    #       storage:
    #         - /path/to/gqa/annotations/train_balanced_questions.json
    #   images:
    #     storage: /path/to/gqa/images/

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
          storage:
            - /ibex/project/c2133/minigpt4_v2_dataset/gqa/annotations/train_balanced_questions.json
      images:
        storage: /ibex/project/c2133/minigpt4_v2_dataset/gqa/images_copy/
minigpt4/configs/datasets/llava/conversation.yaml (new executable file, 12 lines)
@@ -0,0 +1,12 @@
datasets:
  # llava_conversation:
  #   data_type: images
  #   build_info:
  #     image_path: /path/to/coco/images
  #     ann_path: /path/to/llava/conversation_58k.json

  llava_conversation:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/conversation_58k.json
minigpt4/configs/datasets/llava/detail.yaml (new executable file, 12 lines)
@@ -0,0 +1,12 @@
datasets:
  # llava_detail:
  #   data_type: images
  #   build_info:
  #     image_path: /path/to/coco/images
  #     ann_path: /path/to/llava/detail_23k.json

  llava_detail:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/detail_23k.json
minigpt4/configs/datasets/llava/reason.yaml (new executable file, 12 lines)
@@ -0,0 +1,12 @@
datasets:
  # llava_reason:
  #   data_type: images
  #   build_info:
  #     image_path: /path/to/coco/images
  #     ann_path: /path/to/llava/complex_reasoning_77k.json

  llava_reason:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/complex_reasoning_77k.json
@@ -0,0 +1,14 @@
datasets:
  # multitask_conversation:
  #   data_type: images
  #   build_info:
  #     image_path: /path/to/coco/images
  #     ann_path: /path/to/multitask_conversation/multi_task_conversation.json

  multitask_conversation:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/multitask_conversation/multi_task_conversation.json
minigpt4/configs/datasets/nlp/unnatural_instruction.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
datasets:
  # unnatural_instruction:
  #   data_type: text
  #   build_info:
  #     ann_path: /path/to/unnatural-instructions/data/unnatural_instruction_filer.json

  unnatural_instruction:
    data_type: text
    build_info:
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/unnatural-instructions/data/unnatural_instruction_filer.json
minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml (new executable file, 12 lines)
@@ -0,0 +1,12 @@
datasets:
  # ocrvqa:
  #   data_type: images
  #   build_info:
  #     image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images
  #     ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json

  ocrvqa:
    data_type: images
    build_info:
      image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json
minigpt4/configs/datasets/okvqa/defaults.yaml (new executable file, 36 lines)
@@ -0,0 +1,36 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  ok_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    # build_info:
    #   # Be careful not to append minus sign (-) before split to avoid itemizing
    #   annotations:
    #     train:
    #       url:
    #         # TODO make this order insensitive
    #         - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
    #       storage:
    #         - /path/to/okvqa/annotations/okvqa_train.json
    #   images:
    #     storage: /path/to/okvqa/images

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
          storage:
            - /ibex/project/c2133/minigpt4_v2_dataset/okvqa_v2/annotations/okvqa_train.json
      images:
        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
minigpt4/configs/datasets/textcaps/caption.yaml (new executable file, 16 lines)
@@ -0,0 +1,16 @@
datasets:
  # textcaps_caption:
  #   data_type: images
  #   build_info:
  #     image_path: /path/to/TextCaps/train_images
  #     ann_path: /path/to/TextCaps/TextCaps_0.1_train.json

  textcaps_caption:
    data_type: images

    build_info:
      image_path: /ibex/project/c2133/minigpt4_v2_dataset/TextCaps/train_images
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/TextCaps/TextCaps_0.1_train.json
minigpt4/configs/datasets/vg/ref.yaml (new executable file, 10 lines)
@@ -0,0 +1,10 @@
datasets:
  # refvg:
  #   data_type: images
  #   build_info:
  #     data_dir: /path/to/visual_genome

  refvg:
    data_type: images
    build_info:
      data_dir: /ibex/project/c2133/minigpt4_v2_dataset/visual_genome
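Each of the config keys above is consumed by a builder class registered in the diff that follows: at training time a builder is looked up by name, loads its default YAML, and returns the train split. A rough sketch of that lookup (the helper names follow the registry pattern used below and are an assumption about MiniGPT-4's internals, not a documented API):

```python
from minigpt4.common.registry import registry

# Any name passed to @registry.register_builder(...) below can be retrieved here.
builder_cls = registry.get_builder_class("refvg")
builder = builder_cls()                # loads DATASET_CONFIG_DICT["default"]
datasets = builder.build_datasets()
train_set = datasets["train"]
```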
@@ -6,6 +6,418 @@ from minigpt4.common.registry import registry
from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from minigpt4.datasets.datasets.laion_dataset import LaionDataset
from minigpt4.datasets.datasets.cc_sbu_dataset import CCSBUDataset, CCSBUAlignDataset
from minigpt4.datasets.datasets.text_caps import TextCapDataset
from minigpt4.datasets.datasets.llava_dataset import LlavaDetailDataset, LlavaReasonDataset, LlavaConversationDataset
from minigpt4.datasets.datasets.unnatural_instruction import UnnaturalDataset
from minigpt4.datasets.datasets.multitask_conversation import MultiTaskConversationDataset
from minigpt4.datasets.datasets.flickr import GroundedDetailDataset, CaptionToObjectDataset, PhraseToObjectDataset
from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset
from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset
from minigpt4.datasets.datasets.gqa_datasets import GQADataset
from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset
from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset
from minigpt4.datasets.datasets.doc_dataset import OCRVQADataset


@registry.register_builder("multitask_conversation")
class MultitaskConversationBuilder(BaseDatasetBuilder):
    train_dataset_cls = MultiTaskConversationDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/multitask_conversation/default.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("unnatural_instruction")
class UnnaturalInstructionBuilder(BaseDatasetBuilder):
    train_dataset_cls = UnnaturalDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/nlp/unnatural_instruction.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets (text-only, so no vis_processor is needed)
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
        )

        return datasets


@registry.register_builder("llava_detail")
class LlavaDetailBuilder(BaseDatasetBuilder):
    train_dataset_cls = LlavaDetailDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/llava/detail.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("llava_reason")
class LlavaReasonBuilder(BaseDatasetBuilder):
    train_dataset_cls = LlavaReasonDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/llava/reason.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("llava_conversation")
class LlavaConversationBuilder(BaseDatasetBuilder):
    train_dataset_cls = LlavaConversationDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/llava/conversation.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


class AllRefCOCOBuilder(BaseDatasetBuilder):

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()

        build_info = self.config.build_info
        image_path = build_info.image_path
        ann_path = build_info.ann_path

        datasets = dict()

        if not os.path.exists(image_path):
            warnings.warn("image path {} does not exist.".format(image_path))
        if not os.path.exists(ann_path):
            warnings.warn("ann path {} does not exist.".format(ann_path))

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=ann_path,
            vis_root=image_path,
            dataset=build_info.dataset,
            splitBy=build_info.splitBy
        )

        return datasets


@registry.register_builder("refcoco")
class RefCOCOBuilder(AllRefCOCOBuilder):
    train_dataset_cls = ReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/refcoco.yaml",
    }


@registry.register_builder("refcocop")
class RefCOCOPBuilder(AllRefCOCOBuilder):
    train_dataset_cls = ReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/refcocop.yaml",
    }


@registry.register_builder("refcocog")
class RefCOCOGBuilder(AllRefCOCOBuilder):
    train_dataset_cls = ReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/refcocog.yaml",
    }


@registry.register_builder("invrefcoco")
class InvRefCOCOBuilder(AllRefCOCOBuilder):
    train_dataset_cls = InvReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/invrefcoco.yaml",
    }


@registry.register_builder("invrefcocop")
class InvRefCOCOPBuilder(AllRefCOCOBuilder):
    train_dataset_cls = InvReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/invrefcocop.yaml",
    }


@registry.register_builder("invrefcocog")
class InvRefCOCOGBuilder(AllRefCOCOBuilder):
    train_dataset_cls = InvReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/invrefcocog.yaml",
    }


@registry.register_builder("refvg")
class RefVisualGenomeBuilder(BaseDatasetBuilder):
    train_dataset_cls = ReferVisualGenomeDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/vg/ref.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()

        build_info = self.config.build_info
        data_dir = build_info.data_dir
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            data_dir=data_dir,
        )

        return datasets


@registry.register_builder("textcaps_caption")
class TextcapCaptionBuilder(BaseDatasetBuilder):
    train_dataset_cls = TextCapDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/textcaps/caption.yaml"}

    def _download_ann(self):
        pass

    def _download_vis(self):
        pass

    def build(self):
        self.build_processors()

        build_info = self.config.build_info

        datasets = dict()
        split = "train"

        # create datasets
        # [NOTE] return inner_datasets (wds.DataPipeline)
        dataset_cls = self.train_dataset_cls
        datasets[split] = dataset_cls(
            vis_processor=self.vis_processors[split],
            text_processor=self.text_processors[split],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("coco_vqa")
class COCOVQABuilder(BaseDatasetBuilder):
    train_dataset_cls = COCOVQADataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco/defaults_vqa.yaml",
    }


@registry.register_builder("aok_vqa")
class AOKVQABuilder(BaseDatasetBuilder):
    train_dataset_cls = AOKVQADataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"}


@registry.register_builder("gqa")
class GQABuilder(BaseDatasetBuilder):
    train_dataset_cls = GQADataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/gqa/balanced_val.yaml",
    }


@registry.register_builder("grounded_detailed_image_caption")
class GroundedCaptionBuilder(BaseDatasetBuilder):
    train_dataset_cls = GroundedDetailDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/flickr/default.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("CaptionToPhrase")
class CaptionToPhraseBuilder(BaseDatasetBuilder):
    train_dataset_cls = CaptionToObjectDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/flickr/caption_to_phrase.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("ObjectToPhrase")
class ObjectToPhraseBuilder(BaseDatasetBuilder):
    train_dataset_cls = PhraseToObjectDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/flickr/object_to_phrase.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


class DocumentVQABuilder(BaseDatasetBuilder):
    def _download_ann(self):
        pass

    def _download_vis(self):
        pass

    def build(self):
        self.build_processors()
        build_info = self.config.build_info

        datasets = dict()
        split = "train"

        dataset_cls = self.train_dataset_cls
        datasets[split] = dataset_cls(
            vis_processor=self.vis_processors[split],
            text_processor=self.text_processors[split],
            vis_root=build_info.image_path,
            ann_path=build_info.ann_path
        )

        return datasets


@registry.register_builder("ocrvqa")
class OCRVQABuilder(DocumentVQABuilder):
    train_dataset_cls = OCRVQADataset
    DATASET_CONFIG_DICT = {"default": "configs/datasets/ocrvqa/ocrvqa.yaml"}


@registry.register_builder("cc_sbu")
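Every builder above follows the same recipe: register a name, point `train_dataset_cls` at a dataset class, point `DATASET_CONFIG_DICT["default"]` at one of the YAML files added earlier, and (for image datasets) construct the dataset from `build_info.ann_path` and `build_info.image_path`. A hedged sketch of adding a new dataset with this pattern; `MyImageTextDataset` and the config path are hypothetical placeholders:

```python
import logging

from minigpt4.common.registry import registry
from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
# Hypothetical dataset class, shown only to illustrate the registration pattern.
from minigpt4.datasets.datasets.my_dataset import MyImageTextDataset


@registry.register_builder("my_dataset")
class MyDatasetBuilder(BaseDatasetBuilder):
    train_dataset_cls = MyImageTextDataset
    DATASET_CONFIG_DICT = {"default": "configs/datasets/my_dataset/default.yaml"}

    def build_datasets(self):
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        return {
            "train": self.train_dataset_cls(
                vis_processor=self.vis_processors["train"],
                text_processor=self.text_processors["train"],
                ann_path=build_info.ann_path,
                vis_root=build_info.image_path,
            )
        }
```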
minigpt4/datasets/datasets/aok_vqa_datasets.py (new executable file, 212 lines)
@@ -0,0 +1,212 @@
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from collections import OrderedDict
import json
import os
import random
import torch

from PIL import Image

from minigpt4.datasets.datasets.vqa_datasets import VQADataset  #, VQAEvalDataset


class __DisplMixin:
    def displ_item(self, index):
        sample, ann = self.__getitem__(index), self.annotation[index]
        return OrderedDict(
            {
                "file": ann["image"],
                "question": ann["question"],
                "question_id": ann["question_id"],
                "direct_answers": "; ".join(ann["direct_answers"]),
                "choices": "; ".join(ann["choices"]),
                "correct_choice": ann["choices"][ann["correct_choice_idx"]],
                "image": sample["image"],
            }
        )


class AOKVQADataset(VQADataset, __DisplMixin):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

        self.instruction_pool = [
            "[vqa] {}",
            "[vqa] Based on the image, respond to this question with a short answer: {}"
        ]

        # keep only annotations whose image actually exists under vis_root
        exist_annotation = []
        for ann in self.annotation:
            image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
            if os.path.exists(image_path):
                exist_annotation.append(ann)
        self.annotation = exist_annotation

    def get_data(self, index):
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
        image = Image.open(image_path).convert("RGB")

        image = self.vis_processor(image)
        question = self.text_processor(ann["question"])

        answer_key = "direct_answers"

        # print("answer key", answer_key)
        # for answer in ann[answer_key]:
        #     print(answer)

        answer_weight = {}
        for answer in ann[answer_key]:
            if answer in answer_weight.keys():
                answer_weight[answer] += 1 / len(ann[answer_key])
            else:
                answer_weight[answer] = 1 / len(ann[answer_key])

        answers = list(answer_weight.keys())
        weights = list(answer_weight.values())

        answer = random.choices(answers, weights=weights, k=1)[0]  # randomly sample an answer according to weights

        return {
            "image": image,
            "question": question,
            "answer": answer,
        }

    def __getitem__(self, index):
        data = self.get_data(index)
        question = self.text_processor(data["question"])
        instruction = random.choice(self.instruction_pool).format(question)

        instruction = "<Img><ImageHere></Img> {} ".format(instruction)

        answer = self.text_processor(data['answer'])

        return {
            "image": data['image'],
            "instruction_input": instruction,
            "answer": answer,
        }


class AOKVQGDataset(AOKVQADataset):

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        self.instruction_pool = [
            'Given the image, generate a question whose answer is: {}',
            'Based on the image, provide a question with the answer: {}',
            'Given the visual representation, create a question for which the answer is "{}"',
            'From the image provided, craft a question that leads to the reply: {}',
            'Considering the picture, come up with a question where the answer is: {}',
            'Taking the image into account, generate a question that has the answer: {}'
        ]

    def __getitem__(self, index):
        data = self.get_data(index)
        instruction = random.choice(self.instruction_pool).format(data['answer'])

        return {
            "image": data['image'],
            "instruction_input": instruction,
            "answer": data['question'],
        }


# class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
#     def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
#         """
#         vis_root (string): Root directory of images (e.g. coco/images/)
#         ann_root (string): directory to store the annotation file
#         """
#
#         self.vis_root = vis_root
#
#         self.annotation = json.load(open(ann_paths[0]))
#
#         answer_list_path = ann_paths[1]
#         if os.path.exists(answer_list_path):
#             self.answer_list = json.load(open(answer_list_path))
#         else:
#             self.answer_list = None
#
#         try:
#             self.coco_fmt_qust_file = ann_paths[2]
#             self.coco_fmt_anno_file = ann_paths[3]
#         except IndexError:
#             self.coco_fmt_qust_file = None
#             self.coco_fmt_anno_file = None
#
#         self.vis_processor = vis_processor
#         self.text_processor = text_processor
#
#         self._add_instance_ids()
#
#     def collater(self, samples):
#         (
#             image_list,
#             question_list,
#             question_id_list,
#             instance_id_list,
#             choices_list,
#             correct_choice_idx_list,
#             direct_answers_list,
#         ) = ([], [], [], [], [], [], [])
#
#         for sample in samples:
#             image_list.append(sample["image"])
#             question_list.append(sample["text_input"])
#             question_id_list.append(sample["question_id"])
#             instance_id_list.append(sample["instance_id"])
#             choices_list.append(sample["choices"])
#             correct_choice_idx_list.append(sample["correct_choice_idx"])
#             direct_answers_list.append(sample["direct_answers"])
#
#         return {
#             "image": torch.stack(image_list, dim=0),
#             "text_input": question_list,
#             "question_id": question_id_list,
#             "instance_id": instance_id_list,
#             "choices": choices_list,
#             "correct_choice_idx": correct_choice_idx_list,
#             "direct_answers": direct_answers_list,
#         }
#
#     def __getitem__(self, index):
#         ann = self.annotation[index]
#
#         image_path = os.path.join(self.vis_root, ann["image"])
#         image = Image.open(image_path).convert("RGB")
#
#         image = self.vis_processor(image)
#         question = self.text_processor(ann["question"])
#
#         choices = ann["choices"]
#         if "correct_choice_idx" in ann:
#             correct_choice_idx = ann["correct_choice_idx"]
#         else:
#             correct_choice_idx = None
#
#         if "direct_answers" in ann:
#             direct_answers = ann["direct_answers"]
#         else:
#             direct_answers = None
#
#         return {
#             "image": image,
#             "text_input": question,
#             "question_id": ann["question_id"],
#             "instance_id": ann["instance_id"],
#             "choices": choices,
#             "correct_choice_idx": correct_choice_idx,
#             "direct_answers": direct_answers,
#         }
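`AOKVQADataset.get_data` picks one of the ten `direct_answers` per sample each epoch, with probability proportional to how many annotators gave that answer. The same weighting can be expressed as a small standalone sketch (hypothetical helper mirroring the loop above):

```python
import random
from collections import Counter

def sample_weighted_answer(direct_answers):
    """Pick one answer with probability proportional to annotator agreement."""
    counts = Counter(direct_answers)
    answers = list(counts.keys())
    weights = [c / len(direct_answers) for c in counts.values()]
    return random.choices(answers, weights=weights, k=1)[0]

# e.g. sample_weighted_answer(["red", "red", "crimson"]) returns "red" about 2/3 of the time
```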
minigpt4/datasets/datasets/coco_caption.py (new executable file, 122 lines)
@@ -0,0 +1,122 @@
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os
import json
import torch
import numpy as np

from PIL import Image
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True

from minigpt4.datasets.datasets.caption_datasets import COCOCaptionDataset, CaptionEvalDataset

COCOCapDataset = COCOCaptionDataset


class COCOCapEvalDataset(CaptionEvalDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        image = Image.open(image_path).convert("RGB")

        image = self.vis_processor(image)

        img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]

        return {
            "image": image,
            "image_id": img_id,
            "instance_id": ann["instance_id"],
        }


class NoCapsEvalDataset(CaptionEvalDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        image = Image.open(image_path).convert("RGB")

        image = self.vis_processor(image)

        img_id = ann["img_id"]

        return {
            "image": image,
            "image_id": img_id,
            "instance_id": ann["instance_id"],
        }


class RefCOCOEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        img_id = data['img_id']
        sent = data['sents']
        image_path = os.path.join(self.root_path, f'{img_id[:27]}.jpg')
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image)
        # question = f"[refer] {sent}"
        question = f"[refer] where is {sent}?"
        # question = f"where is the bounding box location of {sent}?"
        return image, question, img_id


class EvalCaptionData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor
        ann = dict()
        for item in self.loaded_data:
            image_id = item['image_id']
            ann[image_id] = item['image']
        self.ann = [{'image_id': image_id, 'image': ann[image_id]} for image_id in ann]

    def __len__(self):
        return len(self.ann)

    def __getitem__(self, idx):
        data = self.ann[idx]
        image_id = data['image_id']
        img_file = data['image'].split('/')[-1]
        image_path = os.path.join(self.root_path, img_file)
        image = Image.open(image_path).convert('RGB')

        image = self.vis_processor(image)
        question = "[caption] please describe this image?"
        return image, question, image_id
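`RefCOCOEvalData` returns plain `(image, question, img_id)` tuples, so PyTorch's default collation stacks the image tensors and batches the strings into lists. A hedged usage sketch; the annotation file path is a placeholder (the records are assumed to carry `img_id` and `sents` as the class above expects) and the torchvision transform stands in for the model's real eval `vis_processor`:

```python
import json
from torch.utils.data import DataLoader
from torchvision import transforms
from minigpt4.datasets.datasets.coco_caption import RefCOCOEvalData

# Placeholder processor; in practice use the vis_processor built for the model.
vis_processor = transforms.Compose([transforms.Resize((448, 448)), transforms.ToTensor()])

loaded_data = json.load(open("refcoco_testA.json"))  # assumed list of {"img_id": ..., "sents": ...}
dataset = RefCOCOEvalData(loaded_data, vis_processor, root_path="/data/coco/train2014")
loader = DataLoader(dataset, batch_size=8)

for images, questions, img_ids in loader:
    # images: (8, 3, 448, 448) tensor; questions and img_ids: lists of strings
    pass
```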
667
minigpt4/datasets/datasets/coco_dataset.py
Executable file
667
minigpt4/datasets/datasets/coco_dataset.py
Executable file
@ -0,0 +1,667 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
import threading
|
||||
|
||||
# Global lock
|
||||
lock = threading.Lock()
|
||||
|
||||
def sample_object_bbox(objects, bbox):
|
||||
|
||||
|
||||
|
||||
zipped_list = list(zip(objects, bbox))
|
||||
|
||||
# Shuffle the zipped list
|
||||
random.shuffle(zipped_list)
|
||||
|
||||
# Generate the new string with interleaved format
|
||||
# interleaved_list = str([{'{},{}'.format(obj, str(bbox).replace("[","").replace("]","") )} for obj, bbox in zipped_list])
|
||||
|
||||
# print("objects", objects)
|
||||
# print("bbox",bbox)
|
||||
|
||||
interleaved_list = str([{'{},{}'.format(obj, bbox.strip())} for obj, bbox in zipped_list]).replace("'","").replace("[","").replace("]","")
|
||||
|
||||
# interleaved_list = " "+interleaved_list
|
||||
# print(interleaved_list)
|
||||
return interleaved_list
|
||||
|
||||
def bbox_to_object(objects, bbox):
|
||||
|
||||
index_sample = random.sample(range(len(objects)),1)[0]
|
||||
|
||||
sample_object = str(objects[index_sample])
|
||||
sample_bbox = bbox[index_sample]
|
||||
# sample_center_point = center_point[index_sample]
|
||||
|
||||
sample_bbox = r"{"+str(sample_bbox) + "}"
|
||||
return sample_bbox, sample_object
|
||||
|
||||
def object_to_bbox(objects, bbox, center_point):
|
||||
index_sample = random.sample(range(len(objects)),1)[0]
|
||||
|
||||
sample_object = objects[index_sample]
|
||||
sample_bbox = bbox[index_sample]
|
||||
sample_center_point = center_point[index_sample]
|
||||
|
||||
instruction = "what is object and the bounding box in the center coordinate of "+str(sample_center_point)+"? "
|
||||
answer = "{"+str(sample_object)+","+str(sample_bbox)+"}"
|
||||
|
||||
|
||||
|
||||
return instruction, answer
|
||||
|
||||
|
||||
class COCOBBOXDataset(BaseDataset):
|
||||
def __init__(self, vis_processor, text_processor, location):
|
||||
super().__init__(vis_processor=vis_processor, text_processor=text_processor)
|
||||
|
||||
print("coco box dataset")
|
||||
self.inner_dataset = wds.DataPipeline(
|
||||
wds.ResampledShards(location),
|
||||
wds.tarfile_to_samples(handler=wds.warn_and_continue),
|
||||
wds.shuffle(1000, handler=wds.warn_and_continue),
|
||||
wds.decode("pilrgb", handler=wds.warn_and_continue),
|
||||
wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
|
||||
wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
|
||||
wds.map(self.to_dict, handler=wds.warn_and_continue),
|
||||
)
|
||||
|
||||
def to_dict(self, sample):
|
||||
objects = sample[1]["objects"]
|
||||
boxes = sample[1]["bbox"]
|
||||
caption = sample[1]["caption"]
|
||||
|
||||
|
||||
new_bboxes = []
|
||||
|
||||
image_size = sample[0].shape[1]
|
||||
image_size = 100
|
||||
for index in range(len(boxes)):
|
||||
box = boxes[index]
|
||||
x1 = int(box[0]*image_size)
|
||||
y1 = int(box[1]*image_size)
|
||||
x2 = x1 + int(box[2]*image_size)
|
||||
y2 = y1 + int(box[3]*image_size)
|
||||
assert x1>=0 and x1<=image_size
|
||||
assert x2>=0 and x2<=image_size
|
||||
assert y1>=0 and y1<=image_size
|
||||
assert y2>=0 and y2<=image_size
|
||||
|
||||
new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
|
||||
# new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
|
||||
new_bboxes.append(new_bbox)
|
||||
|
||||
instruction = r"Given an image, identify the objects and their bounding boxes in the format of {object,x1 y1 x2 y2}. "
|
||||
instruction = "<Img><ImageHere></Img> {}".format(self.text_processor(instruction))
|
||||
|
||||
answer = sample_object_bbox(objects, new_bboxes)
|
||||
|
||||
# print("instruction",instruction)
|
||||
# print("answer", answer)
|
||||
|
||||
return {
|
||||
"image": sample[0],
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"data_type": "bbox",
|
||||
"question_split": True
|
||||
}
|
||||
|
||||
|
||||
class COCOBboxToObjectDataset(BaseDataset):
|
||||
def __init__(self, vis_processor, text_processor, location):
|
||||
super().__init__(vis_processor=vis_processor, text_processor=text_processor)
|
||||
|
||||
|
||||
self.inner_dataset = wds.DataPipeline(
|
||||
wds.ResampledShards(location),
|
||||
wds.tarfile_to_samples(handler=wds.warn_and_continue),
|
||||
wds.shuffle(1000, handler=wds.warn_and_continue),
|
||||
wds.decode("pilrgb", handler=wds.warn_and_continue),
|
||||
wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
|
||||
wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
|
||||
wds.map(self.to_dict, handler=wds.warn_and_continue),
|
||||
)
|
||||
|
||||
|
||||
self.instruction_pool = [
|
||||
"<Img><ImageHere></Img> what object is in this bounding box location {} ",
|
||||
"<Img><ImageHere></Img> what object is in this location {} ",
|
||||
"<Img><ImageHere></Img> identify the object present at this location {} ",
|
||||
"<Img><ImageHere></Img> what is it in bounding box location{} ",
|
||||
"<Img><ImageHere></Img> describe this object in {} ",
|
||||
"<Img><ImageHere></Img> this {} is ",
|
||||
"<Img><ImageHere></Img> the object in {} is ",
|
||||
"<Img><ImageHere></Img> please tell me what is inside the bounding box position {} ",
|
||||
"<Img><ImageHere></Img> what can you find in the bounding box area at position {}? ",
|
||||
"<Img><ImageHere></Img> what is the object occupying this area {} ",
|
||||
"<Img><ImageHere></Img> could you identify the content within the bounding box located at {} ",
|
||||
]
|
||||
|
||||
def to_dict(self, sample):
|
||||
|
||||
objects = sample[1]["objects"]
|
||||
boxes = sample[1]["bbox"]
|
||||
|
||||
new_bboxes = []
|
||||
|
||||
image_size = sample[0].shape[1]
|
||||
image_size=100
|
||||
for index in range(len(boxes)):
|
||||
box = boxes[index]
|
||||
x1 = int(box[0]*image_size)
|
||||
y1 = int(box[1]*image_size)
|
||||
x2 = x1 + int(box[2]*image_size)
|
||||
y2 = y1 + int(box[3]*image_size)
|
||||
assert x1>=0 and x1<=image_size
|
||||
assert x2>=0 and x2<=image_size
|
||||
assert y1>=0 and y1<=image_size
|
||||
assert y2>=0 and y2<=image_size
|
||||
|
||||
new_bbox = "<"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
|
||||
new_bboxes.append(new_bbox)
|
||||
|
||||
bbox, object = bbox_to_object(objects, new_bboxes)
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(bbox)
|
||||
return {
|
||||
"image": sample[0],
|
||||
"instruction_input": instruction,
|
||||
"answer": self.text_processor(object),
|
||||
"data_type": "bbox",
|
||||
"question_split": True
|
||||
}
|
||||
|
||||
|
||||
|
||||
# class ReferCOCODataset(Dataset):
|
||||
# def __init__(self, vis_processor, text_processor, vis_root, ann_path, dataset='refcoco', splitBy='unc'):
|
||||
# """
|
||||
# vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
# ann_root (string): directory to store the annotation file
|
||||
# """
|
||||
# self.vis_root = vis_root
|
||||
|
||||
# self.vis_processor = vis_processor
|
||||
# self.text_processor = text_processor
|
||||
|
||||
# self.refer = REFER(ann_path, vis_root, dataset, splitBy)
|
||||
# self.ref_ids = self.refer.getRefIds()
|
||||
|
||||
|
||||
# self.instruction_pool = [
|
||||
# "[refer] {}",
|
||||
# "[refer] give me the location of {}",
|
||||
# "[refer] where is {} ?",
|
||||
# "[refer] from this image, tell me the location of {}",
|
||||
# "[refer] the location of {} is",
|
||||
# "[refer] could you tell me the location for {} ?",
|
||||
# "[refer] where can I locate the {} ?",
|
||||
# ]
|
||||
|
||||
|
||||
# def __len__(self):
|
||||
# return len(self.ref_ids)
|
||||
|
||||
# def preprocess(self, index):
|
||||
# ref_id = self.ref_ids[index]
|
||||
# ref = self.refer.loadRefs(ref_id)[0]
|
||||
|
||||
# image_file = 'COCO_train2014_{:0>12}.jpg'.format(ref["image_id"])
|
||||
# image_path = os.path.join(self.vis_root, image_file)
|
||||
# image = Image.open(image_path).convert("RGB")
|
||||
# image_orig_size = image.size
|
||||
# image = self.vis_processor(image)
|
||||
# image_new_size = [image.shape[1], image.shape[2]]
|
||||
|
||||
# image_new_size = [100,100]
|
||||
|
||||
# sample_sentence = random.choice(ref['sentences'])['raw']
|
||||
|
||||
# refer_sentence = self.text_processor(sample_sentence)
|
||||
|
||||
|
||||
# bbox = self.refer.getRefBox(ref['ref_id'])
|
||||
|
||||
# bbox_to_save = bbox
|
||||
# image_id_to_save = ref["image_id"]
|
||||
# ref_id_to_save = ref_id
|
||||
|
||||
# item = {"image":image_id_to_save,"bbox":bbox_to_save,"ref id":ref_id_to_save, "sentence":refer_sentence}
|
||||
|
||||
|
||||
# def save_to_file():
|
||||
# with lock:
|
||||
# with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json", "r") as f:
|
||||
# refer_json = json.load(f)
|
||||
|
||||
# if ref_id_to_save not in refer_json.keys():
|
||||
# print(item)
|
||||
# refer_json[ref_id_to_save] = item
|
||||
|
||||
# with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json", "w") as f:
|
||||
# json.dump(refer_json, f)
|
||||
|
||||
|
||||
# save_to_file()
|
||||
# # with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json","r") as f:
|
||||
# # refer_json = json.load(open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json"))
|
||||
|
||||
# # if ref_id_to_save not in refer_json.keys():
|
||||
# # print(item)
|
||||
# # refer_json[ref_id_to_save] = item
|
||||
|
||||
# # with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json","w") as f:
|
||||
# # json.dump(refer_json,f)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# bbox = [
|
||||
# bbox[0] / image_orig_size[0] * image_new_size[0],
|
||||
# bbox[1] / image_orig_size[1] * image_new_size[1],
|
||||
# (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
|
||||
# (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
|
||||
# ]
|
||||
# bbox = [int(x) for x in bbox]
|
||||
# bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
|
||||
# return {
|
||||
# "image": image,
|
||||
# "refer_sentence": refer_sentence,
|
||||
# "bbox": bbox,
|
||||
# "image_id": ref['image_id'],
|
||||
# }
|
||||
|
||||
# def __getitem__(self, index):
|
||||
# data = self.preprocess(index)
|
||||
# instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
|
||||
|
||||
# instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
# return {
|
||||
# "image": data['image'],
|
||||
# "instruction_input": instruction,
|
||||
# "answer": data['bbox'],
|
||||
# "image_id": data['image_id'],
|
||||
# }
|
||||
|
||||
|
||||
# class InvReferCOCODataset(ReferCOCODataset):
|
||||
# def __init__(self, *args, **kwargs):
|
||||
# super(InvReferCOCODataset, self).__init__(*args, **kwargs)
|
||||
|
||||
# self.instruction_pool = [
|
||||
# "[identify] {}",
|
||||
# "[identify] what object is in this location {}",
|
||||
# "[identify] identify the object present at this location {}",
|
||||
# "[identify] what is it in {}",
|
||||
# "[identify] describe this object in {}",
|
||||
# "[identify] this {} is",
|
||||
# "[identify] the object in {} is",
|
||||
# ]
|
||||
|
||||
# def __getitem__(self, index):
|
||||
# data = self.preprocess(index)
|
||||
|
||||
# instruction = random.choice(self.instruction_pool).format(data['bbox'])
|
||||
|
||||
# instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
# return {
|
||||
# "image": data['image'],
|
||||
# "instruction_input": instruction,
|
||||
# "answer": self.text_processor(data['refer_sentence']),
|
||||
# "image_id": data['image_id'],
|
||||
# }
|
||||
|
||||
|
||||
class ReferCOCODataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path, dataset='refcoco', splitBy='unc'):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.refer = REFER(ann_path, vis_root, dataset, splitBy)
|
||||
self.ref_ids = self.refer.getRefIds(split="train")
|
||||
|
||||
print(dataset, len(self.ref_ids))
|
||||
|
||||
self.instruction_pool = [
|
||||
"[refer] {}",
|
||||
"[refer] give me the location of {}",
|
||||
"[refer] where is {} ?",
|
||||
"[refer] from this image, tell me the location of {}",
|
||||
"[refer] the location of {} is",
|
||||
"[refer] could you tell me the location for {} ?",
|
||||
"[refer] where can I locate the {} ?",
|
||||
]
|
||||
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ref_ids)
|
||||
|
||||
def preprocess(self, index):
|
||||
ref_id = self.ref_ids[index]
|
||||
ref = self.refer.loadRefs(ref_id)[0]
|
||||
|
||||
image_file = 'COCO_train2014_{:0>12}.jpg'.format(ref["image_id"])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image_orig_size = image.size
|
||||
image = self.vis_processor(image)
|
||||
        # image_new_size = [image.shape[1], image.shape[2]]

        # bounding boxes are expressed on a fixed 100x100 coordinate grid
        image_new_size = [100, 100]
|
||||
|
||||
sample_sentence = random.choice(ref['sentences'])['raw']
|
||||
refer_sentence = self.text_processor(sample_sentence)
|
||||
|
||||
|
||||
bbox = self.refer.getRefBox(ref['ref_id'])
|
||||
bbox = [
|
||||
bbox[0] / image_orig_size[0] * image_new_size[0],
|
||||
bbox[1] / image_orig_size[1] * image_new_size[1],
|
||||
(bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
|
||||
(bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
|
||||
]
|
||||
bbox = [int(x) for x in bbox]
|
||||
bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
|
||||
return {
|
||||
"image": image,
|
||||
"refer_sentence": refer_sentence,
|
||||
"bbox": bbox,
|
||||
"image_id": ref['image_id'],
|
||||
}
|
||||
|
||||
def __getitem__(self, index):
|
||||
data = self.preprocess(index)
|
||||
instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
|
||||
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": data['image'],
|
||||
"instruction_input": instruction,
|
||||
"answer": data['bbox'],
|
||||
"image_id": data['image_id'],
|
||||
}
|
||||
|
||||
|
||||
class InvReferCOCODataset(ReferCOCODataset):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(InvReferCOCODataset, self).__init__(*args, **kwargs)
|
||||
|
||||
self.instruction_pool = [
|
||||
"[identify] {}",
|
||||
"[identify] what object is in this location {}",
|
||||
"[identify] identify the object present at this location {}",
|
||||
"[identify] what is it in {}",
|
||||
"[identify] describe this object in {}",
|
||||
"[identify] this {} is",
|
||||
"[identify] the object in {} is",
|
||||
]
|
||||
|
||||
def __getitem__(self, index):
|
||||
data = self.preprocess(index)
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(data['bbox'])
|
||||
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": data['image'],
|
||||
"instruction_input": instruction,
|
||||
"answer": self.text_processor(data['refer_sentence']),
|
||||
"image_id": data['image_id'],
|
||||
}
|
||||
|
||||
|
||||
class REFER:
|
||||
def __init__(self, data_root, vis_root, dataset='refcoco', splitBy='unc'):
|
||||
# provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
|
||||
# also provide dataset name and splitBy information
|
||||
# e.g., dataset = 'refcoco', splitBy = 'unc'
|
||||
dataset = dataset.split('inv')[-1] # inv dataset is stored in the same path as normal dataset
|
||||
print('loading dataset %s into memory...' % dataset)
|
||||
self.ann_dir = os.path.join(data_root, dataset)
|
||||
        if dataset in ['refcoco', 'refcoco+', 'refcocog']:
            self.vis_root = vis_root
        elif dataset == 'refclef':
            # raising a bare string is invalid in Python 3; use a proper exception
            raise ValueError('No RefClef image data')
        else:
            raise ValueError('No refer dataset is called [%s]' % dataset)
|
||||
|
||||
# load refs from data/dataset/refs(dataset).json
|
||||
tic = time.time()
|
||||
ref_file = os.path.join(self.ann_dir, 'refs(' + splitBy + ').p')
|
||||
self.data = {}
|
||||
self.data['dataset'] = dataset
|
||||
self.data['refs'] = pickle.load(open(ref_file, 'rb'))
|
||||
|
||||
# load annotations from data/dataset/instances.json
|
||||
instances_file = os.path.join(self.ann_dir, 'instances.json')
|
||||
instances = json.load(open(instances_file, 'r'))
|
||||
self.data['images'] = instances['images']
|
||||
self.data['annotations'] = instances['annotations']
|
||||
self.data['categories'] = instances['categories']
|
||||
|
||||
# create index
|
||||
self.createIndex()
|
||||
print('DONE (t=%.2fs)' % (time.time() - tic))
|
||||
|
||||
def createIndex(self):
|
||||
# create sets of mapping
|
||||
# 1) Refs: {ref_id: ref}
|
||||
# 2) Anns: {ann_id: ann}
|
||||
# 3) Imgs: {image_id: image}
|
||||
# 4) Cats: {category_id: category_name}
|
||||
# 5) Sents: {sent_id: sent}
|
||||
# 6) imgToRefs: {image_id: refs}
|
||||
# 7) imgToAnns: {image_id: anns}
|
||||
# 8) refToAnn: {ref_id: ann}
|
||||
# 9) annToRef: {ann_id: ref}
|
||||
# 10) catToRefs: {category_id: refs}
|
||||
# 11) sentToRef: {sent_id: ref}
|
||||
# 12) sentToTokens: {sent_id: tokens}
|
||||
print('creating index...')
|
||||
# fetch info from instances
|
||||
Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
|
||||
for ann in self.data['annotations']:
|
||||
Anns[ann['id']] = ann
|
||||
imgToAnns[ann['image_id']] = imgToAnns.get(ann['image_id'], []) + [ann]
|
||||
for img in self.data['images']:
|
||||
Imgs[img['id']] = img
|
||||
for cat in self.data['categories']:
|
||||
Cats[cat['id']] = cat['name']
|
||||
|
||||
# fetch info from refs
|
||||
Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
|
||||
Sents, sentToRef, sentToTokens = {}, {}, {}
|
||||
for ref in self.data['refs']:
|
||||
# ids
|
||||
ref_id = ref['ref_id']
|
||||
ann_id = ref['ann_id']
|
||||
category_id = ref['category_id']
|
||||
image_id = ref['image_id']
|
||||
|
||||
# add mapping related to ref
|
||||
Refs[ref_id] = ref
|
||||
imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
|
||||
catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
|
||||
refToAnn[ref_id] = Anns[ann_id]
|
||||
annToRef[ann_id] = ref
|
||||
|
||||
# add mapping of sent
|
||||
for sent in ref['sentences']:
|
||||
Sents[sent['sent_id']] = sent
|
||||
sentToRef[sent['sent_id']] = ref
|
||||
sentToTokens[sent['sent_id']] = sent['tokens']
|
||||
|
||||
# create class members
|
||||
self.Refs = Refs
|
||||
self.Anns = Anns
|
||||
self.Imgs = Imgs
|
||||
self.Cats = Cats
|
||||
self.Sents = Sents
|
||||
self.imgToRefs = imgToRefs
|
||||
self.imgToAnns = imgToAnns
|
||||
self.refToAnn = refToAnn
|
||||
self.annToRef = annToRef
|
||||
self.catToRefs = catToRefs
|
||||
self.sentToRef = sentToRef
|
||||
self.sentToTokens = sentToTokens
|
||||
print('index created.')
|
||||
|
||||
def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''):
|
||||
image_ids = image_ids if type(image_ids) == list else [image_ids]
|
||||
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
|
||||
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
||||
|
||||
if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
|
||||
refs = self.data['refs']
|
||||
else:
|
||||
if not len(image_ids) == 0:
|
||||
refs = [self.imgToRefs[image_id] for image_id in image_ids]
|
||||
else:
|
||||
refs = self.data['refs']
|
||||
if not len(cat_ids) == 0:
|
||||
refs = [ref for ref in refs if ref['category_id'] in cat_ids]
|
||||
if not len(ref_ids) == 0:
|
||||
refs = [ref for ref in refs if ref['ref_id'] in ref_ids]
|
||||
if not len(split) == 0:
|
||||
if split in ['testA', 'testB', 'testC']:
|
||||
refs = [ref for ref in refs if
|
||||
split[-1] in ref['split']] # we also consider testAB, testBC, ...
|
||||
elif split in ['testAB', 'testBC', 'testAC']:
|
||||
refs = [ref for ref in refs if ref['split'] == split] # rarely used I guess...
|
||||
elif split == 'test':
|
||||
refs = [ref for ref in refs if 'test' in ref['split']]
|
||||
elif split == 'train' or split == 'val':
|
||||
refs = [ref for ref in refs if ref['split'] == split]
|
||||
                else:
                    # raising a bare string is invalid in Python 3; use a proper exception
                    raise ValueError('No such split [%s]' % split)
|
||||
ref_ids = [ref['ref_id'] for ref in refs]
|
||||
return ref_ids
|
||||
|
||||
def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
|
||||
image_ids = image_ids if type(image_ids) == list else [image_ids]
|
||||
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
|
||||
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
||||
|
||||
if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
|
||||
ann_ids = [ann['id'] for ann in self.data['annotations']]
|
||||
else:
|
||||
if not len(image_ids) == 0:
|
||||
lists = [self.imgToAnns[image_id] for image_id in image_ids if image_id in self.imgToAnns] # list of [anns]
|
||||
anns = list(itertools.chain.from_iterable(lists))
|
||||
else:
|
||||
anns = self.data['annotations']
|
||||
if not len(cat_ids) == 0:
|
||||
anns = [ann for ann in anns if ann['category_id'] in cat_ids]
|
||||
ann_ids = [ann['id'] for ann in anns]
|
||||
            if not len(ref_ids) == 0:
                # keep only the annotations that belong to the requested ref_ids
                ann_ids = list(set(ann_ids).intersection(set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids])))
        return ann_ids
|
||||
|
||||
def getImgIds(self, ref_ids=[]):
|
||||
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
||||
|
||||
if not len(ref_ids) == 0:
|
||||
image_ids = list(set([self.Refs[ref_id]['image_id'] for ref_id in ref_ids]))
|
||||
else:
|
||||
image_ids = self.Imgs.keys()
|
||||
return image_ids
|
||||
|
||||
def getCatIds(self):
|
||||
return self.Cats.keys()
|
||||
|
||||
def loadRefs(self, ref_ids=[]):
|
||||
if type(ref_ids) == list:
|
||||
return [self.Refs[ref_id] for ref_id in ref_ids]
|
||||
elif type(ref_ids) == int:
|
||||
return [self.Refs[ref_ids]]
|
||||
|
||||
def loadAnns(self, ann_ids=[]):
|
||||
if type(ann_ids) == list:
|
||||
return [self.Anns[ann_id] for ann_id in ann_ids]
|
||||
elif type(ann_ids) == int:
|
||||
return [self.Anns[ann_ids]]
|
||||
|
||||
def loadImgs(self, image_ids=[]):
|
||||
if type(image_ids) == list:
|
||||
return [self.Imgs[image_id] for image_id in image_ids]
|
||||
elif type(image_ids) == int:
|
||||
return [self.Imgs[image_ids]]
|
||||
|
||||
def loadCats(self, cat_ids=[]):
|
||||
if type(cat_ids) == list:
|
||||
return [self.Cats[cat_id] for cat_id in cat_ids]
|
||||
elif type(cat_ids) == int:
|
||||
return [self.Cats[cat_ids]]
|
||||
|
||||
def getRefBox(self, ref_id):
|
||||
ref = self.Refs[ref_id]
|
||||
ann = self.refToAnn[ref_id]
|
||||
return ann['bbox'] # [x, y, w, h]
|
||||
|
||||
def showRef(self, ref, seg_box='box'):
|
||||
ax = plt.gca()
|
||||
# show image
|
||||
image = self.Imgs[ref['image_id']]
|
||||
I = io.imread(os.path.join(self.vis_root, image['file_name']))
|
||||
ax.imshow(I)
|
||||
# show refer expression
|
||||
for sid, sent in enumerate(ref['sentences']):
|
||||
print('%s. %s' % (sid + 1, sent['sent']))
|
||||
# show segmentations
|
||||
if seg_box == 'seg':
|
||||
ann_id = ref['ann_id']
|
||||
ann = self.Anns[ann_id]
|
||||
polygons = []
|
||||
color = []
|
||||
c = 'none'
|
||||
if type(ann['segmentation'][0]) == list:
|
||||
# polygon used for refcoco*
|
||||
for seg in ann['segmentation']:
|
||||
                    poly = np.array(seg).reshape((len(seg) // 2, 2))  # integer division needed in Python 3
|
||||
polygons.append(Polygon(poly, True, alpha=0.4))
|
||||
color.append(c)
|
||||
p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 1, 0, 0), linewidths=3, alpha=1)
|
||||
ax.add_collection(p) # thick yellow polygon
|
||||
p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 0, 0, 0), linewidths=1, alpha=1)
|
||||
ax.add_collection(p) # thin red polygon
|
||||
else:
|
||||
# mask used for refclef
|
||||
raise NotImplementedError('RefClef is not downloaded')
|
||||
# show bounding-box
|
||||
elif seg_box == 'box':
|
||||
ann_id = ref['ann_id']
|
||||
ann = self.Anns[ann_id]
|
||||
bbox = self.getRefBox(ref['ref_id'])
|
||||
box_plot = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor='green', linewidth=3)
|
||||
ax.add_patch(box_plot)
|
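For reference, the box string produced by ReferCOCODataset.preprocess above is a plain token on a 100x100 grid. A minimal sketch of the same conversion; the helper name box_to_token is ours and not part of the diff:

def box_to_token(bbox, image_orig_size, grid=100):
    # bbox is COCO-style [x, y, w, h]; image_orig_size is (width, height) as returned by PIL
    w, h = image_orig_size
    x1 = int(bbox[0] / w * grid)
    y1 = int(bbox[1] / h * grid)
    x2 = int((bbox[0] + bbox[2]) / w * grid)
    y2 = int((bbox[1] + bbox[3]) / h * grid)
    return "{{<{}><{}><{}><{}>}}".format(x1, y1, x2, y2)

# box_to_token([120, 60, 240, 180], (640, 480)) -> "{<18><12><56><50>}"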
184
minigpt4/datasets/datasets/coco_vqa_datasets.py
Executable file
@ -0,0 +1,184 @@
|
||||
"""
|
||||
Copyright (c) 2022, salesforce.com, inc.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import random
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
|
||||
class __DisplMixin:
|
||||
def displ_item(self, index):
|
||||
sample, ann = self.__getitem__(index), self.annotation[index]
|
||||
|
||||
return OrderedDict(
|
||||
{
|
||||
"file": ann["image"],
|
||||
"question": ann["question"],
|
||||
"question_id": ann["question_id"],
|
||||
"answers": "; ".join(ann["answer"]),
|
||||
"image": sample["image"],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class COCOVQADataset(VQADataset, __DisplMixin):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
|
||||
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
|
||||
|
||||
self.instruction_pool =[
|
||||
"[vqa] {}",
|
||||
"[vqa] Based on the image, respond to this question with a short answer: {}"
|
||||
]
|
||||
|
||||
exist_annotation = []
|
||||
for ann in self.annotation:
|
||||
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
|
||||
if os.path.exists(image_path):
|
||||
exist_annotation.append(ann)
|
||||
self.annotation = exist_annotation
|
||||
|
||||
|
||||
def get_data(self, index):
|
||||
ann = self.annotation[index]
|
||||
|
||||
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
|
||||
image = self.vis_processor(image)
|
||||
question = self.text_processor(ann["question"])
|
||||
question_id = ann["question_id"]
|
||||
|
||||
answer_weight = {}
|
||||
for answer in ann["answer"]:
|
||||
if answer in answer_weight.keys():
|
||||
answer_weight[answer] += 1 / len(ann["answer"])
|
||||
else:
|
||||
answer_weight[answer] = 1 / len(ann["answer"])
|
||||
|
||||
answers = list(answer_weight.keys())
|
||||
weights = list(answer_weight.values())
|
||||
|
||||
answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights
|
||||
|
||||
# if "unk" in answer:
|
||||
# print("cocovqa", answer)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"question": question,
|
||||
"question_id": question_id,
|
||||
"answer": answer,
|
||||
}
|
||||
|
||||
def __getitem__(self, index):
|
||||
data = self.get_data(index)
|
||||
instruction = random.choice(self.instruction_pool).format(data['question'])
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": data['image'],
|
||||
"question_id": data["question_id"],
|
||||
"instruction_input": instruction,
|
||||
"answer": self.text_processor(data['answer']),
|
||||
}
|
||||
|
||||
|
||||
class COCOVQGDataset(COCOVQADataset):
|
||||
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
|
||||
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
|
||||
self.instruction_pool = [
|
||||
'Given the image, generate a question whose answer is: {}',
|
||||
'Based on the image, provide a question with the answer: {}',
|
||||
'Given the visual representation, create a question for which the answer is "{}"',
|
||||
'From the image provided, craft a question that leads to the reply: {}',
|
||||
'Considering the picture, come up with a question where the answer is: {}',
|
||||
            'Taking the image into account, generate a question that has the answer: {}'
|
||||
]
|
||||
|
||||
def __getitem__(self, index):
|
||||
data = self.get_data(index)
|
||||
instruction = random.choice(self.instruction_pool).format(data['answer'])
|
||||
instruction = "<Img><ImageHere></Img> {}".format(instruction)
|
||||
|
||||
return {
|
||||
"image": data['image'],
|
||||
"question_id": data["question_id"],
|
||||
"instruction_input": instruction,
|
||||
"answer": data['question'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
class COCOVQAEvalDataset(VQAEvalDataset, __DisplMixin):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
|
||||
self.instruction_pool = [
|
||||
# '{}',
|
||||
# 'Question: {}',
|
||||
# '{} A short answer to the question is',
|
||||
# 'Q: {} A:',
|
||||
'Question: {} Short answer:',
|
||||
# 'Given the image, answer the following question with no more than three words. {}',
|
||||
# 'Based on the image, respond to this question with a short answer: {}.',
|
||||
# 'Use the provided image to answer the question: {} Provide your answer as short as possible.',
|
||||
# 'What is the answer to the following question? "{}"',
|
||||
# 'The question "{}" can be answered using the image. A short answer is'
|
||||
]
|
||||
# print('vis_root', vis_root)
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.annotation = json.load(open(ann_paths[0]))
|
||||
|
||||
answer_list_path = ann_paths[1]
|
||||
if os.path.exists(answer_list_path):
|
||||
self.answer_list = json.load(open(answer_list_path))
|
||||
else:
|
||||
self.answer_list = None
|
||||
|
||||
try:
|
||||
self.coco_fmt_qust_file = ann_paths[2]
|
||||
self.coco_fmt_anno_file = ann_paths[3]
|
||||
except IndexError:
|
||||
self.coco_fmt_qust_file = None
|
||||
self.coco_fmt_anno_file = None
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self._add_instance_ids()
|
||||
|
||||
def __getitem__(self, index):
|
||||
ann = self.annotation[index]
|
||||
|
||||
image_path = os.path.join(self.vis_root, ann["image"])
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
|
||||
image = self.vis_processor(image)
|
||||
question = self.text_processor(ann["question"])
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(question)
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
'image_path': image_path,
|
||||
"question": question,
|
||||
"question_id": ann["question_id"],
|
||||
"instruction_input": instruction,
|
||||
"instance_id": ann["instance_id"],
|
||||
}
|
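A small worked example of the weighted answer sampling in COCOVQADataset.get_data above; the annotator answers are made up for illustration:

import random

annotator_answers = ["2", "2", "two", "2", "2", "two", "2", "2", "2", "two"]
answer_weight = {}
for answer in annotator_answers:
    # each of the ten VQA annotator answers contributes 1/10; duplicates accumulate
    answer_weight[answer] = answer_weight.get(answer, 0) + 1 / len(annotator_answers)

# answer_weight -> {"2": 0.7, "two": 0.3} (up to floating point error)
answer = random.choices(list(answer_weight.keys()), weights=list(answer_weight.values()), k=1)[0]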
290
minigpt4/datasets/datasets/doc_dataset.py
Executable file
@ -0,0 +1,290 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
|
||||
|
||||
class SingleSlideVQADataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
self.data = self.create_data(ann_path)
|
||||
|
||||
# self.instruction_pool = [
|
||||
# "###Human: <Img><ImageHere></Img> {}###Assistant: ",
|
||||
# "###Human: <Img><ImageHere></Img> From this slide, {}###Assistant: ",
|
||||
# ]
|
||||
self.instruction_pool = [
|
||||
"<Img><ImageHere></Img> {}",
|
||||
"<Img><ImageHere></Img> From this slide, {}",
|
||||
]
|
||||
def create_data(self, ann_path):
|
||||
with open(ann_path, 'r') as f:
|
||||
samples = f.readlines()
|
||||
data = []
|
||||
for sample in samples:
|
||||
sample = json.loads(sample)
|
||||
if len(sample['evidence_pages']) != 1: continue # skip questions that need more than one slide page
|
||||
page = sample['evidence_pages'][0]
|
||||
image_name = 'slide_{}_1024.jpg'.format(page)
|
||||
# assert [int(image_name.split('-')[-2]) for image_name in image_names] == list(range(1, 21)) # check the format
|
||||
image_path = os.path.join(sample['deck_name'], image_name)
|
||||
data.append({
|
||||
'qa_id': sample['qa_id'],
|
||||
'question': sample['question'],
|
||||
'answer': sample['answer'],
|
||||
'image_path': image_path
|
||||
})
|
||||
|
||||
print("single slide ",len(data))
|
||||
return data
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
sample = self.data[index]
|
||||
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
# instruction = self.text_processor(sample["question"])
|
||||
instruction = random.choice(self.instruction_pool).format(self.text_processor(sample["question"]))
|
||||
|
||||
# instruction = random.choice(self.instruction_pool).format(self.text_processor(sample["question"]))
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": sample['answer'],
|
||||
"qa_id": sample['qa_id'],
|
||||
}
|
||||
|
||||
|
||||
class OCRVQADataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
self.data = self.create_data(ann_path)
|
||||
|
||||
self.instruction_pool =[
|
||||
"[vqa] {}",
|
||||
"[vqa] Based on the image, respond to this question with a short answer: {}"
|
||||
]
|
||||
|
||||
def create_data(self, ann_path):
|
||||
processed_data = []
|
||||
with open(ann_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
for k in data.keys():
|
||||
if data[k]['split'] != 1: continue # 1 for training, 2 for validation, 3 for test
|
||||
ext = os.path.splitext(data[k]['imageURL'])[1]
|
||||
imageFile = k + ext
|
||||
assert len(data[k]['questions']) == len(data[k]['answers'])
|
||||
for q, a in zip(data[k]['questions'], data[k]['answers']):
|
||||
processed_data.append(
|
||||
{'question': q,
|
||||
'answer': a,
|
||||
'image_path': imageFile,
|
||||
'image_id': k,
|
||||
'title': data[k]['title'],
|
||||
'genre': data[k]['genre'],
|
||||
}
|
||||
)
|
||||
print("ocr vqa", len(processed_data))
|
||||
return processed_data
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
sample = self.data[index]
|
||||
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
question = self.text_processor(sample["question"])
|
||||
answer = self.text_processor(sample["answer"])
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(question)
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": sample['image_id']
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class TextOCRDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
self.data = self.create_data(ann_path)
|
||||
|
||||
self.instruction_pool = [
|
||||
"<Img><ImageHere></Img> [OCR] {}"
|
||||
]
|
||||
|
||||
def create_data(self, ann_path):
|
||||
processed_data = []
|
||||
with open(ann_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
for k in data["anns"].keys():
|
||||
# ext = os.path.splitext(data[k]['imageURL'])[1]
|
||||
imageFile = data["anns"][k]["image_id"]+".jpg"
|
||||
bbox = data["anns"][k]["bbox"]
|
||||
text = data["anns"][k]["utf8_string"]
|
||||
# assert len(data[k]['questions']) == len(data[k]['answers'])
|
||||
# for q, a in zip(data[k]['questions'], data[k]['answers']):
|
||||
|
||||
processed_data.append(
|
||||
{'bbox': bbox,
|
||||
'answer': text,
|
||||
'image_path': imageFile,
|
||||
'image_id': k,
|
||||
}
|
||||
)
|
||||
|
||||
return processed_data
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
sample = self.data[index]
|
||||
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
|
||||
width, height = image.size
|
||||
image = self.vis_processor(image)
|
||||
|
||||
new_bbox =""
|
||||
image_size = 100
|
||||
bbox = sample['bbox']
|
||||
for index in range(len(bbox)):
|
||||
|
||||
x1 = int(bbox[0]/width*image_size)
|
||||
y1 = int(bbox[1]/height*image_size)
|
||||
x2 = x1 + int(bbox[2]/width*image_size)
|
||||
y2 = y1 + int(bbox[3]/height*image_size)
|
||||
assert x1>=0 and x1<=image_size
|
||||
assert x2>=0 and x2<=image_size
|
||||
assert y1>=0 and y1<=image_size
|
||||
assert y2>=0 and y2<=image_size
|
||||
|
||||
new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(new_bbox)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": sample['answer'],
|
||||
"image_id": sample['image_id']
|
||||
}
|
||||
|
||||
|
||||
|
||||
class PlotVQADataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
self.data = self.create_data(ann_path)
|
||||
|
||||
self.instruction_pool = [
|
||||
'{}',
|
||||
'Question: {}',
|
||||
'{} A short answer to the question is',
|
||||
'Q: {} A:',
|
||||
'Question: {} Short answer:',
|
||||
# 'Given the image, answer the following question with no more than three words. {}',
|
||||
'Based on the image, respond to this question with a short answer: {}.',
|
||||
'Use the provided image to answer the question: {} Provide your answer as short as possible.',
|
||||
'What is the answer to the following question? "{}"',
|
||||
'The question "{}" can be answered using the image. A short answer is'
|
||||
]
|
||||
|
||||
def create_data(self, ann_path):
|
||||
processed_data = []
|
||||
with open(ann_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
for da in data["qa_pairs"]:
|
||||
# ext = os.path.splitext(data[k]['imageURL'])[1]
|
||||
|
||||
imageFile = str(da["image_index"])+".png"
|
||||
question = da["question_string"]
|
||||
answer = str(da["answer"])
|
||||
# assert len(data[k]['questions']) == len(data[k]['answers'])
|
||||
# for q, a in zip(data[k]['questions'], data[k]['answers']):
|
||||
|
||||
processed_data.append(
|
||||
{'question': question,
|
||||
'answer': answer,
|
||||
'image_path': imageFile,
|
||||
'image_id': str(da["image_index"]),
|
||||
}
|
||||
)
|
||||
|
||||
return processed_data
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
sample = self.data[index]
|
||||
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
|
||||
# width, height = image.size
|
||||
image = self.vis_processor(image)
|
||||
|
||||
|
||||
# image_shape = image.shape
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(sample["question"])
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(instruction)
|
||||
|
||||
answer = sample["answer"]
|
||||
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": sample['image_id']
|
||||
}
|
||||
|
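A rough usage sketch for the classes in this file; the processors and paths below are placeholders, not values from the diff. Each dataset here returns a dict with at least "image", "instruction_input" and "answer", so the trainer can consume them uniformly.

from minigpt4.datasets.datasets.doc_dataset import OCRVQADataset

dataset = OCRVQADataset(
    vis_processor=lambda img: img,    # placeholder; a real vision processor returns a tensor
    text_processor=lambda txt: txt,   # placeholder text processor
    vis_root="/path/to/ocrvqa/images",         # hypothetical path
    ann_path="/path/to/ocrvqa/dataset.json",   # hypothetical annotation file
)
sample = dataset[0]
# sample -> {"image": ..., "instruction_input": "<Img><ImageHere></Img> [vqa] ...", "answer": ..., "image_id": ...}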
159
minigpt4/datasets/datasets/flickr.py
Executable file
@ -0,0 +1,159 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
|
||||
|
||||
class GroundedDetailDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.instruction_pool = [
|
||||
'[grounding] please describe this image in details',
|
||||
'[grounding] describe this image as detailed as possible',
|
||||
'[grounding] summarize this image in details',
|
||||
'[grounding] give a thorough description of what you see in this image',
|
||||
]
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
|
||||
image_file = '{}.jpg'.format(info['image_id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
answer = info['grounded_caption']
|
||||
|
||||
instruction = random.choice(self.instruction_pool)
|
||||
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": info['image_id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class CaptionToObjectDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.instruction_pool = [
|
||||
'[detection] {}',
|
||||
]
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
|
||||
image_file = '{}.jpg'.format(info['image_id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
input = info["caption"]
|
||||
answer = info["output"]
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(input)
|
||||
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": info['image_id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class PhraseToObjectDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.instruction_pool = [
|
||||
'[detection] {}',
|
||||
]
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
|
||||
image_file = '{}.jpg'.format(info['image_id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
input = info["phrase"]
|
||||
answer = "<p>"+input+"</p> "+info["bbox"]
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(input)
|
||||
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": info['image_id'],
|
||||
}
|
65
minigpt4/datasets/datasets/gqa_datasets.py
Executable file
@ -0,0 +1,65 @@
|
||||
"""
|
||||
Copyright (c) 2022, salesforce.com, inc.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from minigpt4.datasets.datasets.vqa_datasets import VQADataset
|
||||
|
||||
from collections import OrderedDict
|
||||
import random
|
||||
|
||||
class __DisplMixin:
|
||||
def displ_item(self, index):
|
||||
sample, ann = self.__getitem__(index), self.annotation[index]
|
||||
|
||||
return OrderedDict(
|
||||
{
|
||||
"file": ann["image"],
|
||||
"question": ann["question"],
|
||||
"question_id": ann["question_id"],
|
||||
"answers": "; ".join(ann["answer"]),
|
||||
"image": sample["image"],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class GQADataset(VQADataset, __DisplMixin):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
|
||||
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
|
||||
self.instruction_pool =[
|
||||
"[vqa] {}",
|
||||
"[vqa] Based on the image, respond to this question with a short answer: {}"
|
||||
]
|
||||
|
||||
def __getitem__(self, index):
|
||||
ann = self.annotation[index]
|
||||
|
||||
image_path = os.path.join(self.vis_root, ann["image"])
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
|
||||
image = self.vis_processor(image)
|
||||
question = self.text_processor(ann["question"])
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(question)
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
answers = self.text_processor(ann["answer"])
|
||||
# if "unk" in answers:
|
||||
# print("gqa",answers)
|
||||
|
||||
# print(answers)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answers,
|
||||
# "weights": weights,
|
||||
}
|
||||
|
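The VQA-style datasets above (COCO VQA, OCR-VQA, GQA) share the same prompt convention: a task tag plus the question, wrapped in an image placeholder. A minimal sketch, with a made-up example question:

import random

instruction_pool = [
    "[vqa] {}",
    "[vqa] Based on the image, respond to this question with a short answer: {}",
]
question = "What color is the bus?"
instruction = random.choice(instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
# e.g. "<Img><ImageHere></Img> [vqa] What color is the bus? "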
390
minigpt4/datasets/datasets/llava_dataset.py
Executable file
@ -0,0 +1,390 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
# import iterto
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
|
||||
class LlavaDetailDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
answer = info['conversations'][1]['value']
|
||||
instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
|
||||
|
||||
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": info['id'],
|
||||
}
|
||||
|
||||
class LlavaReasonDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
answer = info['conversations'][1]['value']
|
||||
instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
|
||||
|
||||
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
|
||||
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
# answer = self.text_processor(answer)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": info['id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class MiniGPT4v(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
|
||||
self.instruction_pool = [
|
||||
'please describe this image as detailed as possible',
|
||||
'What do you see happening in this image?',
|
||||
"Can you elaborate on the elements of the picture provided?",
|
||||
"Describe the following image.",
|
||||
"Write a detailed description of the given image.",
|
||||
"Write a detailed description of the given image.",
|
||||
"Explain the visual content of the image in great detail"
|
||||
]
|
||||
self.ann=[]
|
||||
|
||||
with open(ann_path,"r") as f:
|
||||
for line in f.readlines():
|
||||
self.ann.append(json.loads(line))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
|
||||
# print("info keys",info.keys())
|
||||
if "image_path" in info.keys():
|
||||
image_path = "/ibex/reference/CV/COCO/cocoapi/data/2017/images/jpeg/train/"+info['image_path']
|
||||
|
||||
else:
|
||||
# print("coming here?")
|
||||
image_file = "images/"+info["image"]
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
# print(image_path)
|
||||
|
||||
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
if "question" in info.keys():
|
||||
question = info['question']
|
||||
else:
|
||||
question = random.sample(self.instruction_pool,1)[0]
|
||||
|
||||
|
||||
answer = info["caption"]
|
||||
|
||||
|
||||
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
|
||||
|
||||
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
# answer = self.text_processor(answer)
|
||||
# print("image path", image_path)
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
# "image_id": info['id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class MiniGPT4v_emotion(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
|
||||
self.instruction_pool = [
|
||||
'please describe this image as detailed as possible',
|
||||
'What do you see happening in this image?',
|
||||
"Can you elaborate on the elements of the picture provided?",
|
||||
"Describe the following image",
|
||||
"Write a detailed description of the given image",
|
||||
"Write a detailed description of the given image",
|
||||
"Explain the visual content of the image in great detail"
|
||||
]
|
||||
# self.ann=[]
|
||||
|
||||
with open(ann_path,"r") as f:
|
||||
# for line in f.readlines():
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
|
||||
# print("info keys",info.keys())
|
||||
|
||||
# print("coming here?")
|
||||
image_file = info["link"]
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
# print("image path",image_path)
|
||||
# print(image_path)
|
||||
|
||||
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
question = random.sample(self.instruction_pool,1)[0]
|
||||
|
||||
|
||||
answer = info["caption"]
|
||||
|
||||
|
||||
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
|
||||
|
||||
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
# answer = self.text_processor(answer)
|
||||
# print("image path", image_path)
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
# "image_id": info['id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class MiniGPT4v_laion(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
|
||||
self.instruction_pool = [
|
||||
'please describe this image as detailed as possible',
|
||||
'What do you see happening in this image?',
|
||||
"Can you elaborate on the elements of the picture provided?",
|
||||
"Describe the following image",
|
||||
"Write a detailed description of the given image",
|
||||
"Write a detailed description of the given image",
|
||||
"Explain the visual content of the image in great detail"
|
||||
]
|
||||
# self.ann=[]
|
||||
|
||||
with open(ann_path,"r") as f:
|
||||
# for line in f.readlines():
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
|
||||
# print("info keys",info.keys())
|
||||
|
||||
# print("coming here?")
|
||||
image_file = info["link"]
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
# print(image_path)
|
||||
# print(image_path)
|
||||
|
||||
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
question = random.sample(self.instruction_pool,1)[0]
|
||||
|
||||
|
||||
answer = info["caption"]
|
||||
|
||||
|
||||
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
|
||||
|
||||
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
# answer = self.text_processor(answer)
|
||||
# print("image path", image_path)
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
# "image_id": info['id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
class Minigpt2_conversation(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
answer = info['conversations'][1]['value']
|
||||
instruction = info['conversations'][0]['value']
|
||||
|
||||
# print("instruction",instruction)
|
||||
# print("answer", answer)
|
||||
|
||||
return {
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
}
|
||||
|
||||
|
||||
class LlavaConversationDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.ann=[]
|
||||
|
||||
|
||||
# with open(ann_path, 'r') as f:
|
||||
# self.ann = json.load(f)
|
||||
|
||||
self.connect_sym = "!@#"
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
first_instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
|
||||
first_instruction = '<Img><ImageHere></Img> {} '.format(first_instruction)
|
||||
|
||||
questions = [first_instruction]
|
||||
answers = []
|
||||
|
||||
for i, item in enumerate(info["conversations"][1:]):
|
||||
if i % 2 ==0: # assistant
|
||||
assistant_answer = item["value"]
|
||||
answers.append(assistant_answer)
|
||||
else:
|
||||
human_instruction = item["value"]+" "
|
||||
questions.append(human_instruction)
|
||||
|
||||
questions = self.connect_sym.join(questions)
|
||||
# questions = questions.replace("\\\\","\\")
|
||||
answers = self.connect_sym.join(answers)
|
||||
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"conv_q": questions,
|
||||
'conv_a': answers,
|
||||
"image_id": info['id'],
|
||||
"connect_sym": self.connect_sym
|
||||
}
|
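The conversation datasets above pack multi-turn data by joining turns with a separator string and splitting them again at training time. A toy example of the same scheme; the dialogue is made up:

connect_sym = "!@#"
turns = [
    "<Img><ImageHere></Img> What is on the table? ",
    "Is it full? ",
]
replies = ["A cup of coffee.", "Yes, it looks full."]

conv_q = connect_sym.join(turns)
conv_a = connect_sym.join(replies)
# later: conv_q.split(connect_sym) recovers the individual questions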
75
minigpt4/datasets/datasets/multitask_conversation.py
Normal file
@ -0,0 +1,75 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
|
||||
|
||||
|
||||
|
||||
class MultiTaskConversationDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
self.connect_sym = "!@#"
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
first_instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
|
||||
first_instruction = '<Img><ImageHere></Img> {} '.format(first_instruction)
|
||||
|
||||
questions = [first_instruction]
|
||||
answers = []
|
||||
|
||||
for i, item in enumerate(info["conversations"][1:]):
|
||||
if i % 2 ==0: # assistant
|
||||
assistant_answer = item["value"]
|
||||
answers.append(assistant_answer)
|
||||
else:
|
||||
human_instruction = item["value"]+" "
|
||||
questions.append(human_instruction)
|
||||
|
||||
questions = self.connect_sym.join(questions)
|
||||
answers = self.connect_sym.join(answers)
|
||||
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"conv_q": questions,
|
||||
'conv_a': answers,
|
||||
"image_id": info['id'],
|
||||
"connect_sym": self.connect_sym
|
||||
}
|
186
minigpt4/datasets/datasets/text_caps.py
Executable file
@ -0,0 +1,186 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class TextCapDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.instruction_pool = [
|
||||
# "generate a short image caption incorporating text in the image",
|
||||
# "generate a brief image description combining the text shown in the image",
|
||||
# "what text is writen in this image?",
|
||||
# "describe the text that you can see from this image",
|
||||
# "What does the text in the image say?"
|
||||
'Briefly describe this image.',
|
||||
'Provide a concise depiction of this image.',
|
||||
'Present a short description of this image.',
|
||||
'Summarize this image in a few words.',
|
||||
'A short image caption:',
|
||||
'A short image description:',
|
||||
'A photo of ',
|
||||
'An image that shows ',
|
||||
'Write a short description for the image. ',
|
||||
'Write a description for the photo.',
|
||||
'Provide a description of what is presented in the photo.',
|
||||
            'Briefly describe the content of the image.',
            'Can you briefly explain what you see in the image?',
            'Could you use a few words to describe what you perceive in the photo?',
            'Please provide a short depiction of the picture.',
            'Using language, provide a short account of the image.',
            'Use a few words to illustrate what is happening in the picture.',
        ]

        with open(ann_path, 'r') as f:
            self.ann = json.load(f)

    def __len__(self):
        return len(self.ann["data"])

    def __getitem__(self, index):
        info = self.ann["data"][index]

        image_file = '{}.jpg'.format(info['image_id'])
        image_path = os.path.join(self.vis_root, image_file)
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)

        caption = info["caption_str"]
        caption = self.text_processor(caption)

        instruction = "<Img><ImageHere></Img> [caption] {} ".format(random.choice(self.instruction_pool))
        return {
            "image": image,
            "instruction_input": instruction,
            "answer": caption,
            "data_type": "bbox",
            "question_split": True
        }


class TextCapBboxToObjectDataset(Dataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_path):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_path (string): path to the annotation file
        """
        self.vis_root = vis_root

        self.vis_processor = vis_processor
        self.text_processor = text_processor

        # self.instruction_pool = [
        #     "<Img><ImageHere></Img> What text does it show in {} ",
        #     "<Img><ImageHere></Img> Extract the text from {} ",
        #     "<Img><ImageHere></Img> What is the textual content in {} ",
        #     "<Img><ImageHere></Img> Extract the textual information present in the {} ",
        #     "<Img><ImageHere></Img> What is the text written within this defined region {}",
        #     "<Img><ImageHere></Img> Transcribe the text located inside {}",
        #     "<Img><ImageHere></Img> Can you read and extract the text from this specific area {}",
        # ]

        self.instruction_pool = [
            "<Img><ImageHere></Img> [OCR] {}"
        ]

        with open(ann_path, 'r') as f:
            self.ann = json.load(f)

        # Keep only samples whose OCR boxes lie fully inside the normalized [0, 1] image range.
        self.new_ann = {"data": []}
        for da in self.ann["data"]:
            if da["ocr_info"] != []:
                ocr_info_filter = []
                for d in da["ocr_info"]:
                    if (d["bounding_box"]["width"] + d["bounding_box"]["top_left_x"]) <= 1.0 \
                            and (d["bounding_box"]["height"] + d["bounding_box"]["top_left_y"]) <= 1.0 \
                            and d["bounding_box"]["top_left_x"] >= 0 and d["bounding_box"]["top_left_y"] >= 0:
                        ocr_info_filter.append(d)
                if ocr_info_filter != []:
                    da["ocr_info"] = ocr_info_filter
                    self.new_ann["data"].append(da)
        self.ann = self.new_ann

    def __len__(self):
        return len(self.ann["data"])

    def __getitem__(self, index):
        info = self.ann["data"][index]

        image_file = '{}.jpg'.format(info['image_id'])
        image_path = os.path.join(self.vis_root, image_file)
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)

        # Location tokens are expressed on a fixed 100x100 grid.
        image_size = 100

        ocr_info = info["ocr_info"]
        sampled_ocr = random.sample(ocr_info, 1)[0]

        word_text = sampled_ocr["word"]
        width = sampled_ocr["bounding_box"]["width"]
        height = sampled_ocr["bounding_box"]["height"]
        top_left_x = sampled_ocr["bounding_box"]["top_left_x"]
        top_left_y = sampled_ocr["bounding_box"]["top_left_y"]

        x1 = int(top_left_x * image_size)
        y1 = int(top_left_y * image_size)
        x2 = x1 + int(width * image_size)
        y2 = y1 + int(height * image_size)
        assert 0 <= x1 <= image_size
        assert 0 <= x2 <= image_size
        assert 0 <= y1 <= image_size
        assert 0 <= y2 <= image_size

        word_bbox = "{<" + str(x1) + "><" + str(y1) + "><" + str(x2) + "><" + str(y2) + ">}"

        instruction = random.choice(self.instruction_pool).format(word_bbox)
        return {
            "image": image,
            "instruction_input": instruction,
            "answer": word_text,
            "data_type": "bbox",
            "question_split": True
        }
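For reference, a minimal standalone sketch of the location-token format built above, assuming boxes arrive as normalized (top_left_x, top_left_y, width, height) values in [0, 1]; the helper names are illustrative and not part of this diff:

import re

def bbox_to_token(box, image_size=100):
    # box = (top_left_x, top_left_y, width, height), normalized to [0, 1],
    # mirroring the TextCaps OCR annotation layout used above.
    x, y, w, h = box
    x1, y1 = int(x * image_size), int(y * image_size)
    x2, y2 = x1 + int(w * image_size), y1 + int(h * image_size)
    return "{{<{}><{}><{}><{}>}}".format(x1, y1, x2, y2)

def token_to_bbox(token):
    # Recover the four integer corners from a location token string.
    return [int(v) for v in re.findall(r"<(\d+)>", token)]

# bbox_to_token((0.1, 0.2, 0.3, 0.4)) -> "{<10><20><40><60>}"
# token_to_bbox("{<10><20><40><60>}") -> [10, 20, 40, 60]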
52
minigpt4/datasets/datasets/unnatural_instruction.py
Executable file
@ -0,0 +1,52 @@
import os
import json
import pickle
import random
import time
import itertools

import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds

from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset


class UnnaturalDataset(Dataset):
    def __init__(self, text_processor, ann_path):
        """
        text_processor: processor applied to both the instruction and the answer
        ann_path (string): path to the unnatural-instructions annotation file
        """
        self.text_processor = text_processor

        with open(ann_path, 'r') as f:
            self.ann = json.load(f)

        # with open(ann_path, 'r') as f:
        #     for data in f.readlines():
        #         data = json.loads(data)
        #         self.ann.append(data)

    def __len__(self):
        return len(self.ann)

    def __getitem__(self, index):
        info = self.ann[index]["instances"][0]
        instruction = info["instruction_with_input"]
        constraints = info["constraints"]
        answer = info["output"]
        if constraints is not None:
            instruction = instruction + " " + constraints

        # Text-only sample: no "image" key is returned.
        return {
            "instruction_input": self.text_processor(instruction),
            "answer": self.text_processor(answer),
        }
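A minimal usage sketch for the text-only dataset above, assuming a local unnatural-instructions JSON file; the path and the identity text processor are placeholders:

from torch.utils.data import DataLoader

dataset = UnnaturalDataset(text_processor=lambda t: t,
                           ann_path="/path/to/unnatural_instructions.json")
loader = DataLoader(dataset, batch_size=4, shuffle=True)

for batch in loader:
    # Samples are text-only, so the default collate returns lists of strings.
    print(batch["instruction_input"][0])
    print(batch["answer"][0])
    break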
98
minigpt4/datasets/datasets/vg_dataset.py
Executable file
@ -0,0 +1,98 @@
import os
import json
import pickle
import random
import time
import itertools

import numpy as np
from PIL import Image
from torch.utils.data import Dataset
from visual_genome import local

import threading

# Global lock
lock = threading.Lock()


class ReferVisualGenomeDataset(Dataset):
    def __init__(self, vis_processor, text_processor, data_dir):
        """
        data_dir (string): directory containing the Visual Genome images and region descriptions
        """
        self.data_dir = data_dir

        self.vis_processor = vis_processor
        self.text_processor = text_processor

        all_regions = local.get_all_region_descriptions(self.data_dir)
        all_regions = [region for regions in all_regions for region in regions]

        # Following OFA practice, only regions smaller than 16384 pixels are used for referring.
        self.regions = [region for region in all_regions if region.width * region.height < 16384]

        print('Visual Genome grounding', len(self.regions))

        self.instruction_pool = [
            "[refer] {}",
            "[refer] give me the location of {}",
            "[refer] where is {} ?",
            "[refer] from this image, tell me the location of {}",
            "[refer] the location of {} is",
            "[refer] could you tell me the location for {} ?",
            "[refer] where can I locate the {} ?",
        ]

    def __len__(self):
        return len(self.regions)

    def preprocess(self, index):
        region = self.regions[index]
        image_file = region.image.url.split('/')[-2:]
        image_path = os.path.join(self.data_dir, *image_file)
        image = Image.open(image_path).convert("RGB")
        image_orig_size = image.size
        image = self.vis_processor(image)
        image_new_size = [100, 100]

        sample_sentence = region.phrase
        refer_sentence = self.text_processor(sample_sentence)

        # Convert (x, y, w, h) in original pixels to (x1, y1, x2, y2) corners on a 100x100 grid.
        bbox = [region.x, region.y, region.width, region.height]
        bbox = [
            bbox[0] / image_orig_size[0] * image_new_size[0],
            bbox[1] / image_orig_size[1] * image_new_size[1],
            (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
            (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
        ]
        bbox = [int(x) for x in bbox]
        bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
        return {
            "image": image,
            "refer_sentence": refer_sentence,
            "bbox": bbox,
            "image_id": region.image.id,
        }

    def __getitem__(self, index):
        data = self.preprocess(index)
        instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
        instruction = "<Img><ImageHere></Img> {} ".format(instruction)

        return {
            "image": data['image'],
            "instruction_input": instruction,
            "answer": data['bbox'],
            "image_id": data['image_id'],
        }
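The coordinate conversion in preprocess above can be summarized as a small standalone function, shown here as an illustrative sketch (the function name is not part of the codebase):

def region_to_grid_bbox(x, y, w, h, orig_w, orig_h, grid=100):
    # Map a pixel-space (x, y, w, h) region to integer (x1, y1, x2, y2) corners
    # on a fixed grid, as done in ReferVisualGenomeDataset.preprocess.
    x1 = int(x / orig_w * grid)
    y1 = int(y / orig_h * grid)
    x2 = int((x + w) / orig_w * grid)
    y2 = int((y + h) / orig_h * grid)
    return "{{<{}><{}><{}><{}>}}".format(x1, y1, x2, y2)

# region_to_grid_bbox(120, 80, 240, 160, orig_w=480, orig_h=320) -> "{<25><25><75><75>}"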
223
minigpt4/datasets/datasets/vqa_datasets.py
Executable file
@ -0,0 +1,223 @@
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import torch
from PIL import Image
import os

from minigpt4.datasets.datasets.base_dataset import BaseDataset


class VQADataset(BaseDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    # def collater(self, samples):
    #     image_list, question_list, answer_list, weight_list = [], [], [], []
    #     num_answers = []
    #     for sample in samples:
    #         image_list.append(sample["image"])
    #         question_list.append(sample["question"])
    #         weight_list.extend(sample["weights"])
    #         answers = sample["answer"]
    #         answer_list.extend(answers)
    #         num_answers.append(len(answers))
    #     return {
    #         "image": torch.stack(image_list, dim=0),
    #         "text_input": question_list,
    #         "answer": answer_list,
    #         "weight": torch.Tensor(weight_list),
    #         "n_answers": torch.LongTensor(num_answers),
    #     }


class VQAEvalDataset(BaseDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)


class OKVQAEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        img_id = data['image_id']
        question = data['question']
        question_id = data['question_id']
        img_file = '{:0>12}.jpg'.format(img_id)
        image_path = os.path.join(self.root_path, img_file)
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image)
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
        # question = f"[vqa] {question} "
        return image, question, question_id, img_id


class VizWizEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        img_id = data['image']
        question = data['question']
        answers = data['answers']
        answers = '_'.join([answer['answer'] for answer in answers])
        image_path = os.path.join(self.root_path, img_id)
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image)
        # question = f"[vqa] Based on the image, respond to this question with a short answer: {question} "
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question} and reply 'unanswerable' if you could not answer it"
        return image, question, answers


class AOKVQADAEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        img_file = data['image']
        question = data['question']
        question_id = data['question_id']
        image_path = os.path.join(self.root_path, img_file)
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image)
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
        # question = f"[vqa] {question} "
        return image, question, question_id


class AOKVQAMCEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        img_file = data['image']
        question = data['question']
        question_id = data['question_id']
        image_path = os.path.join(self.root_path, img_file)
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image).half().cuda()
        candidates = data['choices']
        # question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
        # question = f"[vqa] {question} "
        return image, question, question_id, candidates


class IconQAEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        image_id = data['image_id']
        question = data['question']
        image_path = os.path.join(self.root_path, image_id, 'image.png')
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image).half().cuda()
        candidates = '_'.join(data['choices'])
        answer = data['answer']
        # question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
        # question = f"[vqa] {question} "
        return image, question, candidates, answer


class GQAEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        ann = self.loaded_data[idx]
        image_id = ann["image"]
        image_path = os.path.join(self.root_path, f"{image_id}")
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)
        question = ann["question"]
        # question = f'Question: {question} Short answer: '
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
        # question = f"[vqa] {question} "
        labels = ann["answer"]

        return image, question, labels


class HMEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        ann = self.loaded_data[idx]
        image_id = ann["img"]
        image_path = os.path.join(self.root_path, f"{image_id}")
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)
        question = ann["text"]
        question = f"[vqa] This is an image with: \"{question}\" written on it. Is it hateful? Answer:"
        labels = ann["label"]

        return image, question, labels


class VSREvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        ann = self.loaded_data[idx]
        image_path = os.path.join(self.root_path, ann["image"])
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)
        question = ann["caption"]
        question = f'[vqa] Based on the image, is this statement true or false? {question}'
        question_id = ann["image"].split('.')[0]
        labels = 'true' if ann["label"] == 1 else 'false'

        return image, question, labels
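A hedged sketch of wrapping one of these eval datasets in a DataLoader; the annotation file, image root, and torchvision-based stand-in processor are placeholders for the repo's real BLIP-2 image processor and GQA files:

import json
from torch.utils.data import DataLoader
from torchvision import transforms

# Stand-in image processor purely for illustration.
vis_processor = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
])

with open("/path/to/gqa/annotations.json") as f:
    loaded_data = json.load(f)

eval_set = GQAEvalData(loaded_data, vis_processor, root_path="/path/to/gqa/images")
loader = DataLoader(eval_set, batch_size=8)

for images, questions, labels in loader:
    # Tuple-style samples collate into a stacked image tensor plus lists of strings.
    print(images.shape, len(questions), len(labels))
    break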
300
train_configs/minigpt_v2_finetune.yaml
Normal file
@ -0,0 +1,300 @@
model:
  arch: minigpt_v2
  model_type: pretrain
  freeze_vit: True
  freeze_qformer: True
  max_txt_len: 1024
  low_resource: False
  image_size: 448
  end_sym: "</s>"
  llama_model: "/ibex/project/c2133/llama_v2/llama-2-7b-chat-pytorch_update"
  ckpt: "/ibex/project/c2090/minigpt4_ckpt/448_perforamnce_correct_v10_vg/20230925064/checkpoint_32.pth"
  use_grad_checkpoint: True
  chat_template: True
  lora_r: 64
  lora_alpha: 16


datasets:

  multitask_conversation:
    batch_size_train: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 40

  llava_conversation: # 77k
    batch_size_train: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10

  # unnatural_instruction:
  #   batch_size: 1
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refvg:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 40

  # llava_detail: #23K
  #   batch_size: 4
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 20

  # llava_reason: # 77k
  #   batch_size: 4
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # grounded_detailed_image_caption:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # CaptionToPhrase:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # ObjectToPhrase:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # coco_caption:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # textcaps_caption: #
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # refcoco: # 142k
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # invrefcoco:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # invrefcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # invrefcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # coco_vqa: # 82K
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # ok_vqa: # 9k
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 8

  # aok_vqa: # 17k
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 12

  # gqa: # 82K
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 40

  # ocrvqa: # 800K
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 30


run:
  task: image_text_pretrain
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 8e-5
  warmup_lr: 1e-6

  weight_decay: 0.05
  max_epoch: 50
  num_workers: 6
  warmup_steps: 1000
  iters_per_epoch: 1000

  seed: 42
  output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_conversation_correct_best_v7_ablation1_v5_v6_again_system_prompt"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
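A small sketch for sanity-checking the config above before launching a run, assuming it is saved as train_configs/minigpt_v2_finetune.yaml; plain PyYAML is used here only for inspection:

import yaml

with open("train_configs/minigpt_v2_finetune.yaml") as f:
    cfg = yaml.safe_load(f)

# The paths below (LLaMA weights, stage checkpoint, output dir) are the values
# most likely to need editing on a new machine.
print("llama_model:", cfg["model"]["llama_model"])
print("ckpt:", cfg["model"]["ckpt"])
print("output_dir:", cfg["run"]["output_dir"])
print("active datasets:", list(cfg["datasets"].keys()))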