diff --git a/README.md b/README.md
index c294ff5..e5a4a1b 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,7 @@ in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Lin
For MiniGPT-v2, run
```
-python demo_v2.py --cfg-path eval_configs/minigpt4v2_eval.yaml --gpu-id 0
+python demo_v2.py --cfg-path eval_configs/minigptv2_eval.yaml --gpu-id 0
```
For MiniGPT-4 (Vicuna version), run
diff --git a/jobs/srun_test.sh b/jobs/srun_test.sh
new file mode 100644
index 0000000..3dd5709
--- /dev/null
+++ b/jobs/srun_test.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+cd ..
+
+job_name=minigpt4_v2_test
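+# Pick a random free TCP port from the local ephemeral range to pass to torchrun as --master-port.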
+read LOWERPORT UPPERPORT < /proc/sys/net/ipv4/ip_local_port_range
+while :
+do
+ PORT="`shuf -i $LOWERPORT-$UPPERPORT -n 1`"
+ ss -lpn | grep -q ":$PORT " || break
+done
+
+
+#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/336_final_v1_gqa.yaml
+
+
+#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/448_final_v1_gqa_ablation2.yaml
+torchrun --master-port ${PORT} --nproc-per-node 2 train.py --cfg-path train_configs/minigpt_v2_finetune.yaml
+
+#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path finetune_conversation_ablation/conversation_v2_last_336_test.yaml
+
+#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/336_final_v1_13B.yaml
+
+# torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/448_v2_llama2.yaml
+#accelerate launch train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/336_v2_llama2.yaml
+
+
+# torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/336_v2_llama2_clip_encoder.yaml
+
+#best_data_ratio_336_full_dataset_lr2e4_v1.yaml
+
diff --git a/minigpt4/configs/datasets/aokvqa/defaults.yaml b/minigpt4/configs/datasets/aokvqa/defaults.yaml
new file mode 100755
index 0000000..79d2054
--- /dev/null
+++ b/minigpt4/configs/datasets/aokvqa/defaults.yaml
@@ -0,0 +1,29 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+datasets:
+ aok_vqa:
+ # data_dir: ${env.data_dir}/datasets
+ data_type: images # [images|videos|features]
+
+ build_info:
+ # Be careful not to append minus sign (-) before split to avoid itemizing
+ # annotations:
+ # train:
+ # url:
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
+ # storage:
+ # - /path/to/aokvqa/annotations/aokvqa_v1p0_train.json
+ # images:
+ # storage: /path/to/coco/images/
+
+ annotations:
+ train:
+ url:
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
+ storage:
+ - /ibex/project/c2133/minigpt4_v2_dataset/aokvqa/annotations/aokvqa_v1p0_train.json
+ images:
+ storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/coco/caption.yaml b/minigpt4/configs/datasets/coco/caption.yaml
new file mode 100644
index 0000000..873c286
--- /dev/null
+++ b/minigpt4/configs/datasets/coco/caption.yaml
@@ -0,0 +1,38 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+datasets:
+ coco_caption: # name of the dataset builder
+ # dataset_card: dataset_card/coco_caption.md
+ # data_dir: ${env.data_dir}/datasets
+ data_type: images # [images|videos|features]
+
+ # build_info:
+ # # Be careful not to append minus sign (-) before split to avoid itemizing
+ # annotations:
+ # train:
+ # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
+ # md5: aa31ac474cf6250ebb81d18348a07ed8
+ # storage: /path/to/coco_caption/annotations/coco_karpathy_train.json
+ # images:
+ # storage: /path/to/coco/images/
+
+ build_info:
+ # Be careful not to append minus sign (-) before split to avoid itemizing
+ annotations:
+ train:
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
+ md5: aa31ac474cf6250ebb81d18348a07ed8
+ storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_train.json
+ # val:
+ # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
+ # md5: b273847456ef5580e33713b1f7de52a0
+ # storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_val.json
+ # test:
+ # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
+ # md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
+ # storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_test.json
+ images:
+ storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/coco/defaults_vqa.yaml b/minigpt4/configs/datasets/coco/defaults_vqa.yaml
new file mode 100755
index 0000000..87ae494
--- /dev/null
+++ b/minigpt4/configs/datasets/coco/defaults_vqa.yaml
@@ -0,0 +1,33 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+datasets:
+ coco_vqa:
+ # data_dir: ${env.data_dir}/datasets
+ data_type: images # [images|videos|features]
+
+ build_info:
+
+ # annotations:
+ # train:
+ # url:
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
+ # storage:
+ # - /path/to/vqav2/annotations/vqa_train.json
+ # - /path/to/vqav2/coco/annotations/vqa_val.json
+ # images:
+ # storage: /path/to/coco/images/
+
+ annotations:
+ train:
+ url:
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
+ storage:
+ - /ibex/project/c2133/minigpt4_v2_dataset/vqav2/annotations/vqa_train.json
+ - /ibex/project/c2133/minigpt4_v2_dataset/vqav2/coco/annotations/vqa_val.json
+ images:
+ storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/coco_bbox/invrefcoco.yaml b/minigpt4/configs/datasets/coco_bbox/invrefcoco.yaml
new file mode 100755
index 0000000..580694b
--- /dev/null
+++ b/minigpt4/configs/datasets/coco_bbox/invrefcoco.yaml
@@ -0,0 +1,8 @@
+datasets:
+ invrefcoco:
+ data_type: images
+ build_info:
+ image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
+ ann_path: /ibex/project/c2133/object_detection_datasets/
+ dataset: invrefcoco
+ splitBy: unc
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/coco_bbox/invrefcocog.yaml b/minigpt4/configs/datasets/coco_bbox/invrefcocog.yaml
new file mode 100755
index 0000000..67af2eb
--- /dev/null
+++ b/minigpt4/configs/datasets/coco_bbox/invrefcocog.yaml
@@ -0,0 +1,8 @@
+datasets:
+ invrefcocog:
+ data_type: images
+ build_info:
+ image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
+ ann_path: /ibex/project/c2133/object_detection_datasets/
+ dataset: invrefcocog
+ splitBy: umd
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/coco_bbox/invrefcocop.yaml b/minigpt4/configs/datasets/coco_bbox/invrefcocop.yaml
new file mode 100755
index 0000000..576004e
--- /dev/null
+++ b/minigpt4/configs/datasets/coco_bbox/invrefcocop.yaml
@@ -0,0 +1,8 @@
+datasets:
+ invrefcocop:
+ data_type: images
+ build_info:
+ image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
+ ann_path: /ibex/project/c2133/object_detection_datasets/
+ dataset: invrefcoco+
+ splitBy: unc
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/coco_bbox/refcoco.yaml b/minigpt4/configs/datasets/coco_bbox/refcoco.yaml
new file mode 100755
index 0000000..edf16ba
--- /dev/null
+++ b/minigpt4/configs/datasets/coco_bbox/refcoco.yaml
@@ -0,0 +1,8 @@
+datasets:
+ refcoco:
+ data_type: images
+ build_info:
+ image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
+ ann_path: /ibex/project/c2133/object_detection_datasets/
+ dataset: refcoco
+ splitBy: unc
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/coco_bbox/refcocog.yaml b/minigpt4/configs/datasets/coco_bbox/refcocog.yaml
new file mode 100755
index 0000000..5ed7cc9
--- /dev/null
+++ b/minigpt4/configs/datasets/coco_bbox/refcocog.yaml
@@ -0,0 +1,8 @@
+datasets:
+ refcocog:
+ data_type: images
+ build_info:
+ image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
+ ann_path: /ibex/project/c2133/object_detection_datasets/
+ dataset: refcocog
+ splitBy: umd
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/coco_bbox/refcocop.yaml b/minigpt4/configs/datasets/coco_bbox/refcocop.yaml
new file mode 100755
index 0000000..4e3af6f
--- /dev/null
+++ b/minigpt4/configs/datasets/coco_bbox/refcocop.yaml
@@ -0,0 +1,8 @@
+datasets:
+ refcocop:
+ data_type: images
+ build_info:
+ image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
+ ann_path: /ibex/project/c2133/object_detection_datasets/
+ dataset: refcoco+
+ splitBy: unc
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/flickr/caption_to_phrase.yaml b/minigpt4/configs/datasets/flickr/caption_to_phrase.yaml
new file mode 100755
index 0000000..0901b6f
--- /dev/null
+++ b/minigpt4/configs/datasets/flickr/caption_to_phrase.yaml
@@ -0,0 +1,6 @@
+datasets:
+ CaptionToPhrase:
+ data_type: images
+ build_info:
+ image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
+ ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_grounding_phrase5_v2_last.json
diff --git a/minigpt4/configs/datasets/flickr/default.yaml b/minigpt4/configs/datasets/flickr/default.yaml
new file mode 100755
index 0000000..a732dd4
--- /dev/null
+++ b/minigpt4/configs/datasets/flickr/default.yaml
@@ -0,0 +1,6 @@
+datasets:
+ grounded_detailed_image_caption:
+ data_type: images
+ build_info:
+ image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
+ ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_grounding_phrase5_last.json
diff --git a/minigpt4/configs/datasets/flickr/object_to_phrase.yaml b/minigpt4/configs/datasets/flickr/object_to_phrase.yaml
new file mode 100755
index 0000000..c0189b6
--- /dev/null
+++ b/minigpt4/configs/datasets/flickr/object_to_phrase.yaml
@@ -0,0 +1,6 @@
+datasets:
+ ObjectToPhrase:
+ data_type: images
+ build_info:
+ image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
+ ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_phrase2bbox_resample_last.json
diff --git a/minigpt4/configs/datasets/gqa/balanced_val.yaml b/minigpt4/configs/datasets/gqa/balanced_val.yaml
new file mode 100644
index 0000000..5a9e55c
--- /dev/null
+++ b/minigpt4/configs/datasets/gqa/balanced_val.yaml
@@ -0,0 +1,33 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+datasets:
+ gqa:
+ # data_dir: ${env.data_dir}/datasets
+ data_type: images # [images|videos|features]
+
+ # build_info:
+ # # Be careful not to append minus sign (-) before split to avoid itemizing
+ # annotations:
+ # train:
+ # url:
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
+ # storage:
+ # - /path/to/gqa/annotations/train_balanced_questions.json
+
+ # images:
+ # storage: /path/to/gqa/images/
+
+
+ build_info:
+ # Be careful not to append minus sign (-) before split to avoid itemizing
+ annotations:
+ train:
+ url:
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
+ storage:
+ - /ibex/project/c2133/minigpt4_v2_dataset/gqa/annotations/train_balanced_questions.json
+ images:
+ storage: /ibex/project/c2133/minigpt4_v2_dataset/gqa/images_copy/
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/llava/conversation.yaml b/minigpt4/configs/datasets/llava/conversation.yaml
new file mode 100755
index 0000000..6978069
--- /dev/null
+++ b/minigpt4/configs/datasets/llava/conversation.yaml
@@ -0,0 +1,12 @@
+datasets:
+ # llava_conversation:
+ # data_type: images
+ # build_info:
+ # image_path: /path/to/coco/images
+ # ann_path: /path/to/llava/conversation_58k.json
+
+ llava_conversation:
+ data_type: images
+ build_info:
+ image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
+ ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/conversation_58k.json
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/llava/detail.yaml b/minigpt4/configs/datasets/llava/detail.yaml
new file mode 100755
index 0000000..f4d0f0a
--- /dev/null
+++ b/minigpt4/configs/datasets/llava/detail.yaml
@@ -0,0 +1,12 @@
+datasets:
+ # llava_detail:
+ # data_type: images
+ # build_info:
+ # image_path: /path/to/coco/images
+ # ann_path: /path/to/llava/detail_23k.json
+
+ llava_detail:
+ data_type: images
+ build_info:
+ image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
+ ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/detail_23k.json
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/llava/reason.yaml b/minigpt4/configs/datasets/llava/reason.yaml
new file mode 100755
index 0000000..ea6cb06
--- /dev/null
+++ b/minigpt4/configs/datasets/llava/reason.yaml
@@ -0,0 +1,12 @@
+datasets:
+ # llava_reason:
+ # data_type: images
+ # build_info:
+ # image_path: /path/to/coco/images
+ # ann_path: /path/to/llava/complex_reasoning_77k.json
+
+ llava_reason:
+ data_type: images
+ build_info:
+ image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
+ ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/complex_reasoning_77k.json
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/multitask_conversation/default.yaml b/minigpt4/configs/datasets/multitask_conversation/default.yaml
new file mode 100644
index 0000000..29200ee
--- /dev/null
+++ b/minigpt4/configs/datasets/multitask_conversation/default.yaml
@@ -0,0 +1,14 @@
+datasets:
+ # multitask_conversation:
+ # data_type: images
+ # build_info:
+
+ # image_path: /path/to/coco/images
+ # ann_path: /path/to/multitask_conversation/multi_task_conversation.json
+
+ multitask_conversation:
+ data_type: images
+ build_info:
+
+ image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
+ ann_path: /ibex/project/c2133/minigpt4_v2_dataset/multitask_conversation/multi_task_conversation.json
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/nlp/unnatural_instruction.yaml b/minigpt4/configs/datasets/nlp/unnatural_instruction.yaml
new file mode 100644
index 0000000..d9f31de
--- /dev/null
+++ b/minigpt4/configs/datasets/nlp/unnatural_instruction.yaml
@@ -0,0 +1,10 @@
+datasets:
+ # unnatural_instruction:
+ # data_type: text
+ # build_info:
+ # ann_path: /path/to/unnatural-instructions/data/unnatural_instruction_filer.json
+
+ unnatural_instruction:
+ data_type: text
+ build_info:
+ ann_path: /ibex/project/c2133/minigpt4_v2_dataset/unnatural-instructions/data/unnatural_instruction_filer.json
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml b/minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml
new file mode 100755
index 0000000..d2f6a94
--- /dev/null
+++ b/minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml
@@ -0,0 +1,12 @@
+datasets:
+ # ocrvqa:
+ # data_type: images
+ # build_info:
+ # image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images
+ # ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json
+
+ ocrvqa:
+ data_type: images
+ build_info:
+ image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images
+ ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/okvqa/defaults.yaml b/minigpt4/configs/datasets/okvqa/defaults.yaml
new file mode 100755
index 0000000..402212c
--- /dev/null
+++ b/minigpt4/configs/datasets/okvqa/defaults.yaml
@@ -0,0 +1,36 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+datasets:
+ ok_vqa:
+ # data_dir: ${env.data_dir}/datasets
+ data_type: images # [images|videos|features]
+
+ # build_info:
+ # # Be careful not to append minus sign (-) before split to avoid itemizing
+ # annotations:
+ # train:
+ # url:
+ # # TODO make this order insensitive
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
+ # storage:
+ # - /path/to/okvqa/annotations/okvqa_train.json
+ # images:
+ # storage: /path/to/okvqa/images
+
+
+ build_info:
+ # Be careful not to append minus sign (-) before split to avoid itemizing
+ annotations:
+ train:
+ url:
+ # TODO make this order insensitive
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
+ # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
+ storage:
+ - /ibex/project/c2133/minigpt4_v2_dataset/okvqa_v2/annotations/okvqa_train.json
+ images:
+ storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
\ No newline at end of file
diff --git a/minigpt4/configs/datasets/textcaps/caption.yaml b/minigpt4/configs/datasets/textcaps/caption.yaml
new file mode 100755
index 0000000..61a92c7
--- /dev/null
+++ b/minigpt4/configs/datasets/textcaps/caption.yaml
@@ -0,0 +1,16 @@
+datasets:
+ # textcaps_caption:
+ # data_type: images
+
+ # build_info:
+ # image_path: /path/to/TextCaps/train_images
+ # ann_path: /path/to/TextCaps/TextCaps_0.1_train.json
+
+ textcaps_caption:
+ data_type: images
+
+ build_info:
+ image_path: /ibex/project/c2133/minigpt4_v2_dataset/TextCaps/train_images
+ ann_path: /ibex/project/c2133/minigpt4_v2_dataset/TextCaps/TextCaps_0.1_train.json
+
+
diff --git a/minigpt4/configs/datasets/vg/ref.yaml b/minigpt4/configs/datasets/vg/ref.yaml
new file mode 100755
index 0000000..8b793a2
--- /dev/null
+++ b/minigpt4/configs/datasets/vg/ref.yaml
@@ -0,0 +1,10 @@
+datasets:
+ # refvg:
+ # data_type: images
+ # build_info:
+ # data_dir: /path/to/visual_genome
+
+ refvg:
+ data_type: images
+ build_info:
+ data_dir: /ibex/project/c2133/minigpt4_v2_dataset/visual_genome
\ No newline at end of file
diff --git a/minigpt4/datasets/builders/image_text_pair_builder.py b/minigpt4/datasets/builders/image_text_pair_builder.py
index e5d66b8..c393c4f 100644
--- a/minigpt4/datasets/builders/image_text_pair_builder.py
+++ b/minigpt4/datasets/builders/image_text_pair_builder.py
@@ -6,6 +6,418 @@ from minigpt4.common.registry import registry
from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from minigpt4.datasets.datasets.laion_dataset import LaionDataset
from minigpt4.datasets.datasets.cc_sbu_dataset import CCSBUDataset, CCSBUAlignDataset
+from minigpt4.datasets.datasets.text_caps import TextCapDataset
+from minigpt4.datasets.datasets.llava_dataset import LlavaDetailDataset, LlavaReasonDataset, LlavaConversationDataset
+from minigpt4.datasets.datasets.unnatural_instruction import UnnaturalDataset
+from minigpt4.datasets.datasets.multitask_conversation import MultiTaskConversationDataset
+from minigpt4.datasets.datasets.flickr import GroundedDetailDataset,CaptionToObjectDataset,PhraseToObjectDataset
+from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset
+from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset
+from minigpt4.datasets.datasets.gqa_datasets import GQADataset
+from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset
+from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset
+from minigpt4.datasets.datasets.doc_dataset import OCRVQADataset
+
+
+
+@registry.register_builder("multitask_conversation")
+class MultitaskConversationBuilder(BaseDatasetBuilder):
+ train_dataset_cls = MultiTaskConversationDataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/multitask_conversation/default.yaml",
+ }
+
+ def build_datasets(self):
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ self.build_processors()
+ build_info = self.config.build_info
+ datasets = dict()
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls
+ datasets['train'] = dataset_cls(
+ vis_processor=self.vis_processors["train"],
+ text_processor=self.text_processors["train"],
+ ann_path=build_info.ann_path,
+ vis_root=build_info.image_path,
+ )
+
+ return datasets
+
+
+@registry.register_builder("unnatural_instruction")
+class UnnaturalInstructionBuilder(BaseDatasetBuilder):
+ train_dataset_cls = UnnaturalDataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/nlp/unnatural_instruction.yaml",
+ }
+
+ def build_datasets(self):
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ self.build_processors()
+ build_info = self.config.build_info
+ datasets = dict()
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls
+ datasets['train'] = dataset_cls(
+ text_processor=self.text_processors["train"],
+ ann_path=build_info.ann_path,
+ )
+
+ return datasets
+
+
+
+@registry.register_builder("llava_detail")
+class LlavaDetailBuilder(BaseDatasetBuilder):
+ train_dataset_cls = LlavaDetailDataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/llava/detail.yaml",
+ }
+
+ def build_datasets(self):
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ self.build_processors()
+ build_info = self.config.build_info
+ datasets = dict()
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls
+ datasets['train'] = dataset_cls(
+ vis_processor=self.vis_processors["train"],
+ text_processor=self.text_processors["train"],
+ ann_path=build_info.ann_path,
+ vis_root=build_info.image_path,
+ )
+
+ return datasets
+
+
+
+@registry.register_builder("llava_reason")
+class LlavaReasonBuilder(BaseDatasetBuilder):
+ train_dataset_cls = LlavaReasonDataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/llava/reason.yaml",
+ }
+
+ def build_datasets(self):
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ self.build_processors()
+ build_info = self.config.build_info
+ datasets = dict()
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls
+ datasets['train'] = dataset_cls(
+ vis_processor=self.vis_processors["train"],
+ text_processor=self.text_processors["train"],
+ ann_path=build_info.ann_path,
+ vis_root=build_info.image_path,
+ )
+
+ return datasets
+
+@registry.register_builder("llava_conversation")
+class LlavaConversationBuilder(BaseDatasetBuilder):
+ train_dataset_cls = LlavaConversationDataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/llava/conversation.yaml",
+ }
+
+ def build_datasets(self):
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ self.build_processors()
+ build_info = self.config.build_info
+ datasets = dict()
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls
+ datasets['train'] = dataset_cls(
+ vis_processor=self.vis_processors["train"],
+ text_processor=self.text_processors["train"],
+ ann_path=build_info.ann_path,
+ vis_root=build_info.image_path,
+ )
+
+ return datasets
+
+
+class AllRefCOCOBuilder(BaseDatasetBuilder):
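+ # Shared base builder for the RefCOCO-style referring expression datasets; subclasses only set the dataset class and the default config path.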
+
+ def build_datasets(self):
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ self.build_processors()
+
+ build_info = self.config.build_info
+ image_path = build_info.image_path
+ ann_path = build_info.ann_path
+
+ datasets = dict()
+
+ if not os.path.exists(image_path):
+ warnings.warn("image path {} does not exist.".format(image_path))
+ if not os.path.exists(ann_path):
+ warnings.warn("ann path {} does not exist.".format(ann_path))
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls
+ datasets['train'] = dataset_cls(
+ vis_processor=self.vis_processors["train"],
+ text_processor=self.text_processors["train"],
+ ann_path=ann_path,
+ vis_root=image_path,
+ dataset=build_info.dataset,
+ splitBy=build_info.splitBy
+ )
+
+ return datasets
+
+
+@registry.register_builder("refcoco")
+class RefCOCOBuilder(AllRefCOCOBuilder):
+ train_dataset_cls = ReferCOCODataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/coco_bbox/refcoco.yaml",
+ }
+
+@registry.register_builder("refcocop")
+class RefCOCOPBuilder(AllRefCOCOBuilder):
+ train_dataset_cls = ReferCOCODataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/coco_bbox/refcocop.yaml",
+ }
+
+
+@registry.register_builder("refcocog")
+class RefCOCOGBuilder(AllRefCOCOBuilder):
+ train_dataset_cls = ReferCOCODataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/coco_bbox/refcocog.yaml",
+ }
+
+@registry.register_builder("invrefcoco")
+class InvRefCOCOBuilder(AllRefCOCOBuilder):
+ train_dataset_cls = InvReferCOCODataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/coco_bbox/invrefcoco.yaml",
+ }
+
+
+@registry.register_builder("invrefcocop")
+class InvRefCOCOPBuilder(AllRefCOCOBuilder):
+ train_dataset_cls = InvReferCOCODataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/coco_bbox/invrefcocop.yaml",
+ }
+
+
+@registry.register_builder("invrefcocog")
+class InvRefCOCOGBuilder(AllRefCOCOBuilder):
+ train_dataset_cls = InvReferCOCODataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/coco_bbox/invrefcocog.yaml",
+ }
+
+@registry.register_builder("refvg")
+class RefVisualGenomeBuilder(BaseDatasetBuilder):
+ train_dataset_cls = ReferVisualGenomeDataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/vg/ref.yaml",
+ }
+
+ def build_datasets(self):
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ self.build_processors()
+
+ build_info = self.config.build_info
+ data_dir = build_info.data_dir
+ datasets = dict()
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls
+ datasets['train'] = dataset_cls(
+ vis_processor=self.vis_processors["train"],
+ text_processor=self.text_processors["train"],
+ data_dir=data_dir,
+ )
+
+ return datasets
+
+
+@registry.register_builder("textcaps_caption")
+class TextcapCaptionBuilder(BaseDatasetBuilder):
+ train_dataset_cls = TextCapDataset
+
+ DATASET_CONFIG_DICT = {"default": "configs/datasets/textcaps/caption.yaml"}
+
+ def _download_ann(self):
+ pass
+
+ def _download_vis(self):
+ pass
+
+ def build(self):
+ self.build_processors()
+
+ build_info = self.config.build_info
+
+ datasets = dict()
+ split = "train"
+
+ # create datasets
+ # [NOTE] return inner_datasets (wds.DataPipeline)
+ dataset_cls = self.train_dataset_cls
+ datasets[split] = dataset_cls(
+ vis_processor=self.vis_processors[split],
+ text_processor=self.text_processors[split],
+ ann_path=build_info.ann_path,
+ vis_root=build_info.image_path,
+ )
+
+ return datasets
+
+@registry.register_builder("coco_vqa")
+class COCOVQABuilder(BaseDatasetBuilder):
+ train_dataset_cls = COCOVQADataset
+
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/coco/defaults_vqa.yaml",
+ }
+
+@registry.register_builder("aok_vqa")
+class AOKVQABuilder(BaseDatasetBuilder):
+ train_dataset_cls = AOKVQADataset
+
+ DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"}
+
+
+@registry.register_builder("gqa")
+class GQABuilder(BaseDatasetBuilder):
+ train_dataset_cls = GQADataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/gqa/balanced_val.yaml",
+ }
+
+
+
+
+@registry.register_builder("grounded_detailed_image_caption")
+class GroundedCaptionBuilder(BaseDatasetBuilder):
+ train_dataset_cls = GroundedDetailDataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/flickr/default.yaml",
+ }
+
+ def build_datasets(self):
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ self.build_processors()
+ build_info = self.config.build_info
+ datasets = dict()
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls
+ datasets['train'] = dataset_cls(
+ vis_processor=self.vis_processors["train"],
+ text_processor=self.text_processors["train"],
+ ann_path=build_info.ann_path,
+ vis_root=build_info.image_path,
+ )
+
+ return datasets
+
+
+@registry.register_builder("CaptionToPhrase")
+class CaptionToPhraseBuilder(BaseDatasetBuilder):
+ train_dataset_cls = CaptionToObjectDataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/flickr/caption_to_phrase.yaml",
+ }
+
+ def build_datasets(self):
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ self.build_processors()
+ build_info = self.config.build_info
+ datasets = dict()
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls
+ datasets['train'] = dataset_cls(
+ vis_processor=self.vis_processors["train"],
+ text_processor=self.text_processors["train"],
+ ann_path=build_info.ann_path,
+ vis_root=build_info.image_path,
+ )
+
+ return datasets
+
+@registry.register_builder("ObjectToPhrase")
+class ObjectToPhraseBuilder(BaseDatasetBuilder):
+ train_dataset_cls = PhraseToObjectDataset
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/flickr/object_to_phrase.yaml",
+ }
+
+ def build_datasets(self):
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ self.build_processors()
+ build_info = self.config.build_info
+ datasets = dict()
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls
+ datasets['train'] = dataset_cls(
+ vis_processor=self.vis_processors["train"],
+ text_processor=self.text_processors["train"],
+ ann_path=build_info.ann_path,
+ vis_root=build_info.image_path,
+ )
+
+ return datasets
+
+
+
+
+class DocumentVQABuilder(BaseDatasetBuilder):
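+ # Generic builder for document/OCR VQA data that is already on disk: the download hooks are no-ops and image/annotation paths come straight from build_info.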
+ def _download_ann(self):
+ pass
+
+ def _download_vis(self):
+ pass
+
+ def build(self):
+ self.build_processors()
+ build_info = self.config.build_info
+
+ datasets = dict()
+ split = "train"
+
+ dataset_cls = self.train_dataset_cls
+ datasets[split] = dataset_cls(
+ vis_processor=self.vis_processors[split],
+ text_processor=self.text_processors[split],
+ vis_root=build_info.image_path,
+ ann_path=build_info.ann_path
+ )
+
+ return datasets
+
+
+@registry.register_builder("ocrvqa")
+class OCRVQABuilder(DocumentVQABuilder):
+ train_dataset_cls = OCRVQADataset
+ DATASET_CONFIG_DICT = {"default": "configs/datasets/ocrvqa/ocrvqa.yaml"}
@registry.register_builder("cc_sbu")
diff --git a/minigpt4/datasets/datasets/aok_vqa_datasets.py b/minigpt4/datasets/datasets/aok_vqa_datasets.py
new file mode 100755
index 0000000..b65b42d
--- /dev/null
+++ b/minigpt4/datasets/datasets/aok_vqa_datasets.py
@@ -0,0 +1,212 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from collections import OrderedDict
+import json
+import os
+import random
+import torch
+
+from PIL import Image
+
+from minigpt4.datasets.datasets.vqa_datasets import VQADataset #, VQAEvalDataset
+
+
+class __DisplMixin:
+ def displ_item(self, index):
+ sample, ann = self.__getitem__(index), self.annotation[index]
+ return OrderedDict(
+ {
+ "file": ann["image"],
+ "question": ann["question"],
+ "question_id": ann["question_id"],
+ "direct_answers": "; ".join(ann["direct_answers"]),
+ "choices": "; ".join(ann["choices"]),
+ "correct_choice": ann["choices"][ann["correct_choice_idx"]],
+ "image": sample["image"],
+ }
+ )
+
+
+class AOKVQADataset(VQADataset, __DisplMixin):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+ self.instruction_pool = [
+ "[vqa] {}",
+ "[vqa] Based on the image, respond to this question with a short answer: {}"
+ ]
+
+ exist_annotation = []
+ for ann in self.annotation:
+ image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
+ if os.path.exists(image_path):
+ exist_annotation.append(ann)
+ self.annotation = exist_annotation
+
+ def get_data(self, index):
+ ann = self.annotation[index]
+
+ image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
+ image = Image.open(image_path).convert("RGB")
+
+ image = self.vis_processor(image)
+ question = self.text_processor(ann["question"])
+
+ answer_key = "direct_answers"
+
+ # print("answer key", answer_key)
+ # for answer in ann[answer_key]:
+ # print(answer)
+
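+ # Weight each distinct direct answer by its annotator frequency, then sample one answer in proportion to those weights.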
+ answer_weight = {}
+ for answer in ann[answer_key]:
+ if answer in answer_weight.keys():
+ answer_weight[answer] += 1 / len(ann[answer_key])
+ else:
+ answer_weight[answer] = 1 / len(ann[answer_key])
+
+ answers = list(answer_weight.keys())
+ weights = list(answer_weight.values())
+
+ answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights
+
+ return {
+ "image": image,
+ "question": question,
+ "answer": answer,
+ }
+
+ def __getitem__(self, index):
+ data = self.get_data(index)
+ question = self.text_processor(data["question"])
+ instruction = random.choice(self.instruction_pool).format(question)
+
+ instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+ answer = self.text_processor(data['answer'])
+
+
+ return {
+ "image": data['image'],
+ "instruction_input": instruction,
+ "answer": answer,
+ }
+
+
+class AOKVQGDataset(AOKVQADataset):
+
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+ self.instruction_pool = [
+ 'Given the image, generate a question whose answer is: {}',
+ 'Based on the image, provide a question with the answer: {}',
+ 'Given the visual representation, create a question for which the answer is "{}"',
+ 'From the image provided, craft a question that leads to the reply: {}',
+ 'Considering the picture, come up with a question where the answer is: {}',
+ 'Taking the image into account, generate a question that has the answer: {}'
+ ]
+
+ def __getitem__(self, index):
+ data = self.get_data(index)
+ instruction = random.choice(self.instruction_pool).format(data['answer'])
+
+ return {
+ "image": data['image'],
+ "instruction_input": instruction,
+ "answer": data['question'],
+ }
+
+
+# class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
+# def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+# """
+# vis_root (string): Root directory of images (e.g. coco/images/)
+# ann_root (string): directory to store the annotation file
+# """
+#
+# self.vis_root = vis_root
+#
+# self.annotation = json.load(open(ann_paths[0]))
+#
+# answer_list_path = ann_paths[1]
+# if os.path.exists(answer_list_path):
+# self.answer_list = json.load(open(answer_list_path))
+# else:
+# self.answer_list = None
+#
+# try:
+# self.coco_fmt_qust_file = ann_paths[2]
+# self.coco_fmt_anno_file = ann_paths[3]
+# except IndexError:
+# self.coco_fmt_qust_file = None
+# self.coco_fmt_anno_file = None
+#
+# self.vis_processor = vis_processor
+# self.text_processor = text_processor
+#
+# self._add_instance_ids()
+#
+# def collater(self, samples):
+# (
+# image_list,
+# question_list,
+# question_id_list,
+# instance_id_list,
+# choices_list,
+# correct_choice_idx_list,
+# direct_answers_list,
+# ) = ([], [], [], [], [], [], [])
+#
+# for sample in samples:
+# image_list.append(sample["image"])
+# question_list.append(sample["text_input"])
+# question_id_list.append(sample["question_id"])
+# instance_id_list.append(sample["instance_id"])
+# choices_list.append(sample["choices"])
+# correct_choice_idx_list.append(sample["correct_choice_idx"])
+# direct_answers_list.append(sample["direct_answers"])
+#
+# return {
+# "image": torch.stack(image_list, dim=0),
+# "text_input": question_list,
+# "question_id": question_id_list,
+# "instance_id": instance_id_list,
+# "choices": choices_list,
+# "correct_choice_idx": correct_choice_idx_list,
+# "direct_answers": direct_answers_list,
+# }
+#
+# def __getitem__(self, index):
+# ann = self.annotation[index]
+#
+# image_path = os.path.join(self.vis_root, ann["image"])
+# image = Image.open(image_path).convert("RGB")
+#
+# image = self.vis_processor(image)
+# question = self.text_processor(ann["question"])
+#
+# choices = ann["choices"]
+# if "correct_choice_idx" in ann:
+# correct_choice_idx = ann["correct_choice_idx"]
+# else:
+# correct_choice_idx = None
+#
+# if "direct_answers" in ann:
+# direct_answers = ann["direct_answers"]
+# else:
+# direct_answers = None
+#
+# return {
+# "image": image,
+# "text_input": question,
+# "question_id": ann["question_id"],
+# "instance_id": ann["instance_id"],
+# "choices": choices,
+# "correct_choice_idx": correct_choice_idx,
+# "direct_answers": direct_answers,
+# }
diff --git a/minigpt4/datasets/datasets/coco_caption.py b/minigpt4/datasets/datasets/coco_caption.py
new file mode 100755
index 0000000..3df5279
--- /dev/null
+++ b/minigpt4/datasets/datasets/coco_caption.py
@@ -0,0 +1,122 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import os
+import json
+import torch
+import numpy as np
+
+from PIL import Image
+from PIL import ImageFile
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+from minigpt4.datasets.datasets.caption_datasets import COCOCaptionDataset, CaptionEvalDataset
+
+COCOCapDataset = COCOCaptionDataset
+
+
+
+
+
+class COCOCapEvalDataset(CaptionEvalDataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ split (string): val or test
+ """
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+ def __getitem__(self, index):
+ ann = self.annotation[index]
+
+ image_path = os.path.join(self.vis_root, ann["image"])
+ image = Image.open(image_path).convert("RGB")
+
+ image = self.vis_processor(image)
+
+ img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]
+
+ return {
+ "image": image,
+ "image_id": img_id,
+ "instance_id": ann["instance_id"],
+ }
+
+
+class NoCapsEvalDataset(CaptionEvalDataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ split (string): val or test
+ """
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+ def __getitem__(self, index):
+ ann = self.annotation[index]
+
+ image_path = os.path.join(self.vis_root, ann["image"])
+ image = Image.open(image_path).convert("RGB")
+
+ image = self.vis_processor(image)
+
+ img_id = ann["img_id"]
+
+ return {
+ "image": image,
+ "image_id": img_id,
+ "instance_id": ann["instance_id"],
+ }
+
+
+class RefCOCOEvalData(torch.utils.data.Dataset):
+ def __init__(self, loaded_data, vis_processor, root_path):
+ self.loaded_data = loaded_data
+ self.root_path = root_path
+ self.vis_processor = vis_processor
+
+ def __len__(self):
+ return len(self.loaded_data)
+
+ def __getitem__(self, idx):
+ data = self.loaded_data[idx]
+ img_id = data['img_id']
+ sent = data['sents']
+ image_path = os.path.join(self.root_path, f'{img_id[:27]}.jpg')
+ image = Image.open(image_path).convert('RGB')
+ image = self.vis_processor(image)
+ # question = f"[refer] {sent}"
+ question = f"[refer] where is {sent}?"
+ # question = f"where is the bounding box location of {sent}?"
+ return image, question, img_id
+
+class EvalCaptionData(torch.utils.data.Dataset):
+ def __init__(self, loaded_data, vis_processor, root_path):
+ self.loaded_data = loaded_data
+ self.root_path = root_path
+ self.vis_processor = vis_processor
+ ann = dict()
+ for item in self.loaded_data:
+ image_id = item['image_id']
+ ann[image_id] = item['image']
+ self.ann = [{'image_id':image_id, 'image': ann[image_id]} for image_id in ann]
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, idx):
+ data = self.ann[idx]
+ image_id = data['image_id']
+ img_file = data['image'].split('/')[-1]
+ image_path = os.path.join(self.root_path, img_file)
+ image = Image.open(image_path).convert('RGB')
+
+ image = self.vis_processor(image)
+ question = f"[caption] please describe this image?"
+ return image, question, image_id
diff --git a/minigpt4/datasets/datasets/coco_dataset.py b/minigpt4/datasets/datasets/coco_dataset.py
new file mode 100755
index 0000000..e89ba34
--- /dev/null
+++ b/minigpt4/datasets/datasets/coco_dataset.py
@@ -0,0 +1,667 @@
+import os
+import json
+import pickle
+import random
+import time
+import itertools
+
+import numpy as np
+from PIL import Image
+import skimage.io as io
+import matplotlib.pyplot as plt
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Polygon, Rectangle
+from torch.utils.data import Dataset
+import webdataset as wds
+
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
+import threading
+
+# Global lock
+lock = threading.Lock()
+
+def sample_object_bbox(objects, bbox):
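+ # Pair each object with its bounding-box string, shuffle the pairs, and return them as a single string of "{object,<x1><y1><x2><y2>}" entries.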
+
+
+
+ zipped_list = list(zip(objects, bbox))
+
+ # Shuffle the zipped list
+ random.shuffle(zipped_list)
+
+ # Generate the new string with interleaved format
+ # interleaved_list = str([{'{},{}'.format(obj, str(bbox).replace("[","").replace("]","") )} for obj, bbox in zipped_list])
+
+ # print("objects", objects)
+ # print("bbox",bbox)
+
+ interleaved_list = str([{'{},{}'.format(obj, bbox.strip())} for obj, bbox in zipped_list]).replace("'","").replace("[","").replace("]","")
+
+ # interleaved_list = " "+interleaved_list
+ # print(interleaved_list)
+ return interleaved_list
+
+def bbox_to_object(objects, bbox):
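+ # Randomly pick one (object, bbox) pair; return the bbox wrapped in braces and the object name as a question/answer pair.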
+
+ index_sample = random.sample(range(len(objects)),1)[0]
+
+ sample_object = str(objects[index_sample])
+ sample_bbox = bbox[index_sample]
+ # sample_center_point = center_point[index_sample]
+
+ sample_bbox = r"{"+str(sample_bbox) + "}"
+ return sample_bbox, sample_object
+
+def object_to_bbox(objects, bbox, center_point):
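+ # Randomly pick one object and build a question about its center point, with "{object,bbox}" as the expected answer.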
+ index_sample = random.sample(range(len(objects)),1)[0]
+
+ sample_object = objects[index_sample]
+ sample_bbox = bbox[index_sample]
+ sample_center_point = center_point[index_sample]
+
+ instruction = "what is object and the bounding box in the center coordinate of "+str(sample_center_point)+"? "
+ answer = "{"+str(sample_object)+","+str(sample_bbox)+"}"
+
+
+
+ return instruction, answer
+
+
+class COCOBBOXDataset(BaseDataset):
+ def __init__(self, vis_processor, text_processor, location):
+ super().__init__(vis_processor=vis_processor, text_processor=text_processor)
+
+ print("coco box dataset")
+ self.inner_dataset = wds.DataPipeline(
+ wds.ResampledShards(location),
+ wds.tarfile_to_samples(handler=wds.warn_and_continue),
+ wds.shuffle(1000, handler=wds.warn_and_continue),
+ wds.decode("pilrgb", handler=wds.warn_and_continue),
+ wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
+ wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
+ wds.map(self.to_dict, handler=wds.warn_and_continue),
+ )
+
+ def to_dict(self, sample):
+ objects = sample[1]["objects"]
+ boxes = sample[1]["bbox"]
+ caption = sample[1]["caption"]
+
+
+ new_bboxes = []
+
+ image_size = sample[0].shape[1]
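+ # Scale the normalized (x, y, w, h) boxes onto a fixed 0-100 grid so coordinates become integers in the <x1><y1><x2><y2> format.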
+ image_size = 100
+ for index in range(len(boxes)):
+ box = boxes[index]
+ x1 = int(box[0]*image_size)
+ y1 = int(box[1]*image_size)
+ x2 = x1 + int(box[2]*image_size)
+ y2 = y1 + int(box[3]*image_size)
+ assert x1>=0 and x1<=image_size
+ assert x2>=0 and x2<=image_size
+ assert y1>=0 and y1<=image_size
+ assert y2>=0 and y2<=image_size
+
+ new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
+ # new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
+ new_bboxes.append(new_bbox)
+
+ instruction = r"Given an image, identify the objects and their bounding boxes in the format of {object,x1 y1 x2 y2}. "
+ instruction = "<Img><ImageHere></Img> {}".format(self.text_processor(instruction))
+
+ answer = sample_object_bbox(objects, new_bboxes)
+
+ # print("instruction",instruction)
+ # print("answer", answer)
+
+ return {
+ "image": sample[0],
+ "instruction_input": instruction,
+ "answer": answer,
+ "data_type": "bbox",
+ "question_split": True
+ }
+
+
+class COCOBboxToObjectDataset(BaseDataset):
+ def __init__(self, vis_processor, text_processor, location):
+ super().__init__(vis_processor=vis_processor, text_processor=text_processor)
+
+
+ self.inner_dataset = wds.DataPipeline(
+ wds.ResampledShards(location),
+ wds.tarfile_to_samples(handler=wds.warn_and_continue),
+ wds.shuffle(1000, handler=wds.warn_and_continue),
+ wds.decode("pilrgb", handler=wds.warn_and_continue),
+ wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
+ wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
+ wds.map(self.to_dict, handler=wds.warn_and_continue),
+ )
+
+
+ self.instruction_pool = [
+ "<Img><ImageHere></Img> what object is in this bounding box location {} ",
+ "<Img><ImageHere></Img> what object is in this location {} ",
+ "<Img><ImageHere></Img> identify the object present at this location {} ",
+ "<Img><ImageHere></Img> what is it in bounding box location{} ",
+ "<Img><ImageHere></Img> describe this object in {} ",
+ "<Img><ImageHere></Img> this {} is ",
+ "<Img><ImageHere></Img> the object in {} is ",
+ "<Img><ImageHere></Img> please tell me what is inside the bounding box position {} ",
+ "<Img><ImageHere></Img> what can you find in the bounding box area at position {}? ",
+ "<Img><ImageHere></Img> what is the object occupying this area {} ",
+ "<Img><ImageHere></Img> could you identify the content within the bounding box located at {} ",
+ ]
+
+ def to_dict(self, sample):
+
+ objects = sample[1]["objects"]
+ boxes = sample[1]["bbox"]
+
+ new_bboxes = []
+
+ image_size = sample[0].shape[1]
+ image_size=100
+ for index in range(len(boxes)):
+ box = boxes[index]
+ x1 = int(box[0]*image_size)
+ y1 = int(box[1]*image_size)
+ x2 = x1 + int(box[2]*image_size)
+ y2 = y1 + int(box[3]*image_size)
+ assert x1>=0 and x1<=image_size
+ assert x2>=0 and x2<=image_size
+ assert y1>=0 and y1<=image_size
+ assert y2>=0 and y2<=image_size
+
+ new_bbox = "<"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
+ new_bboxes.append(new_bbox)
+
+ bbox, object = bbox_to_object(objects, new_bboxes)
+
+ instruction = random.choice(self.instruction_pool).format(bbox)
+ return {
+ "image": sample[0],
+ "instruction_input": instruction,
+ "answer": self.text_processor(object),
+ "data_type": "bbox",
+ "question_split": True
+ }
+
+
+
+# class ReferCOCODataset(Dataset):
+# def __init__(self, vis_processor, text_processor, vis_root, ann_path, dataset='refcoco', splitBy='unc'):
+# """
+# vis_root (string): Root directory of images (e.g. coco/images/)
+# ann_root (string): directory to store the annotation file
+# """
+# self.vis_root = vis_root
+
+# self.vis_processor = vis_processor
+# self.text_processor = text_processor
+
+# self.refer = REFER(ann_path, vis_root, dataset, splitBy)
+# self.ref_ids = self.refer.getRefIds()
+
+
+# self.instruction_pool = [
+# "[refer] {}",
+# "[refer] give me the location of {}",
+# "[refer] where is {} ?",
+# "[refer] from this image, tell me the location of {}",
+# "[refer] the location of {} is",
+# "[refer] could you tell me the location for {} ?",
+# "[refer] where can I locate the {} ?",
+# ]
+
+
+# def __len__(self):
+# return len(self.ref_ids)
+
+# def preprocess(self, index):
+# ref_id = self.ref_ids[index]
+# ref = self.refer.loadRefs(ref_id)[0]
+
+# image_file = 'COCO_train2014_{:0>12}.jpg'.format(ref["image_id"])
+# image_path = os.path.join(self.vis_root, image_file)
+# image = Image.open(image_path).convert("RGB")
+# image_orig_size = image.size
+# image = self.vis_processor(image)
+# image_new_size = [image.shape[1], image.shape[2]]
+
+# image_new_size = [100,100]
+
+# sample_sentence = random.choice(ref['sentences'])['raw']
+
+# refer_sentence = self.text_processor(sample_sentence)
+
+
+# bbox = self.refer.getRefBox(ref['ref_id'])
+
+# bbox_to_save = bbox
+# image_id_to_save = ref["image_id"]
+# ref_id_to_save = ref_id
+
+# item = {"image":image_id_to_save,"bbox":bbox_to_save,"ref id":ref_id_to_save, "sentence":refer_sentence}
+
+
+# def save_to_file():
+# with lock:
+# with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json", "r") as f:
+# refer_json = json.load(f)
+
+# if ref_id_to_save not in refer_json.keys():
+# print(item)
+# refer_json[ref_id_to_save] = item
+
+# with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json", "w") as f:
+# json.dump(refer_json, f)
+
+
+# save_to_file()
+# # with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json","r") as f:
+# # refer_json = json.load(open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json"))
+
+# # if ref_id_to_save not in refer_json.keys():
+# # print(item)
+# # refer_json[ref_id_to_save] = item
+
+# # with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json","w") as f:
+# # json.dump(refer_json,f)
+
+
+
+
+
+
+
+# bbox = [
+# bbox[0] / image_orig_size[0] * image_new_size[0],
+# bbox[1] / image_orig_size[1] * image_new_size[1],
+# (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
+# (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
+# ]
+# bbox = [int(x) for x in bbox]
+# bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
+# return {
+# "image": image,
+# "refer_sentence": refer_sentence,
+# "bbox": bbox,
+# "image_id": ref['image_id'],
+# }
+
+# def __getitem__(self, index):
+# data = self.preprocess(index)
+# instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
+
+# instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+# return {
+# "image": data['image'],
+# "instruction_input": instruction,
+# "answer": data['bbox'],
+# "image_id": data['image_id'],
+# }
+
+
+# class InvReferCOCODataset(ReferCOCODataset):
+# def __init__(self, *args, **kwargs):
+# super(InvReferCOCODataset, self).__init__(*args, **kwargs)
+
+# self.instruction_pool = [
+# "[identify] {}",
+# "[identify] what object is in this location {}",
+# "[identify] identify the object present at this location {}",
+# "[identify] what is it in {}",
+# "[identify] describe this object in {}",
+# "[identify] this {} is",
+# "[identify] the object in {} is",
+# ]
+
+# def __getitem__(self, index):
+# data = self.preprocess(index)
+
+# instruction = random.choice(self.instruction_pool).format(data['bbox'])
+
+# instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+# return {
+# "image": data['image'],
+# "instruction_input": instruction,
+# "answer": self.text_processor(data['refer_sentence']),
+# "image_id": data['image_id'],
+# }
+
+
+class ReferCOCODataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path, dataset='refcoco', splitBy='unc'):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ self.refer = REFER(ann_path, vis_root, dataset, splitBy)
+ self.ref_ids = self.refer.getRefIds(split="train")
+
+ print(dataset, len(self.ref_ids))
+
+ self.instruction_pool = [
+ "[refer] {}",
+ "[refer] give me the location of {}",
+ "[refer] where is {} ?",
+ "[refer] from this image, tell me the location of {}",
+ "[refer] the location of {} is",
+ "[refer] could you tell me the location for {} ?",
+ "[refer] where can I locate the {} ?",
+ ]
+
+
+ def __len__(self):
+ return len(self.ref_ids)
+
+ def preprocess(self, index):
+ ref_id = self.ref_ids[index]
+ ref = self.refer.loadRefs(ref_id)[0]
+
+ image_file = 'COCO_train2014_{:0>12}.jpg'.format(ref["image_id"])
+ image_path = os.path.join(self.vis_root, image_file)
+ image = Image.open(image_path).convert("RGB")
+ image_orig_size = image.size
+ image = self.vis_processor(image)
+ image_new_size = [image.shape[1], image.shape[2]]
+
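+ # Rescale the reference box from the original image size onto a fixed 100x100 grid so the answer uses 0-100 coordinates in the "{<x1><y1><x2><y2>}" format.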
+ image_new_size = [100,100]
+
+ sample_sentence = random.choice(ref['sentences'])['raw']
+ refer_sentence = self.text_processor(sample_sentence)
+
+
+ bbox = self.refer.getRefBox(ref['ref_id'])
+ bbox = [
+ bbox[0] / image_orig_size[0] * image_new_size[0],
+ bbox[1] / image_orig_size[1] * image_new_size[1],
+ (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
+ (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
+ ]
+ bbox = [int(x) for x in bbox]
+ bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
+ return {
+ "image": image,
+ "refer_sentence": refer_sentence,
+ "bbox": bbox,
+ "image_id": ref['image_id'],
+ }
+
+ def __getitem__(self, index):
+ data = self.preprocess(index)
+ instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
+
+ instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+ return {
+ "image": data['image'],
+ "instruction_input": instruction,
+ "answer": data['bbox'],
+ "image_id": data['image_id'],
+ }
+
+
+class InvReferCOCODataset(ReferCOCODataset):
+ def __init__(self, *args, **kwargs):
+ super(InvReferCOCODataset, self).__init__(*args, **kwargs)
+
+ self.instruction_pool = [
+ "[identify] {}",
+ "[identify] what object is in this location {}",
+ "[identify] identify the object present at this location {}",
+ "[identify] what is it in {}",
+ "[identify] describe this object in {}",
+ "[identify] this {} is",
+ "[identify] the object in {} is",
+ ]
+
+ def __getitem__(self, index):
+ data = self.preprocess(index)
+
+ instruction = random.choice(self.instruction_pool).format(data['bbox'])
+
+ instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+ return {
+ "image": data['image'],
+ "instruction_input": instruction,
+ "answer": self.text_processor(data['refer_sentence']),
+ "image_id": data['image_id'],
+ }
+
+
+class REFER:
+ def __init__(self, data_root, vis_root, dataset='refcoco', splitBy='unc'):
+ # provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
+ # also provide dataset name and splitBy information
+ # e.g., dataset = 'refcoco', splitBy = 'unc'
+ dataset = dataset.split('inv')[-1] # inv dataset is stored in the same path as normal dataset
+ print('loading dataset %s into memory...' % dataset)
+ self.ann_dir = os.path.join(data_root, dataset)
+ if dataset in ['refcoco', 'refcoco+', 'refcocog']:
+ self.vis_root = vis_root
+ elif dataset == 'refclef':
+ raise ValueError('No RefClef image data')
+ else:
+ raise ValueError('No refer dataset is called [%s]' % dataset)
+
+ # load refs from data/dataset/refs(dataset).json
+ tic = time.time()
+ ref_file = os.path.join(self.ann_dir, 'refs(' + splitBy + ').p')
+ self.data = {}
+ self.data['dataset'] = dataset
+ self.data['refs'] = pickle.load(open(ref_file, 'rb'))
+
+ # load annotations from data/dataset/instances.json
+ instances_file = os.path.join(self.ann_dir, 'instances.json')
+ instances = json.load(open(instances_file, 'r'))
+ self.data['images'] = instances['images']
+ self.data['annotations'] = instances['annotations']
+ self.data['categories'] = instances['categories']
+
+ # create index
+ self.createIndex()
+ print('DONE (t=%.2fs)' % (time.time() - tic))
+
+ def createIndex(self):
+ # create sets of mapping
+ # 1) Refs: {ref_id: ref}
+ # 2) Anns: {ann_id: ann}
+ # 3) Imgs: {image_id: image}
+ # 4) Cats: {category_id: category_name}
+ # 5) Sents: {sent_id: sent}
+ # 6) imgToRefs: {image_id: refs}
+ # 7) imgToAnns: {image_id: anns}
+ # 8) refToAnn: {ref_id: ann}
+ # 9) annToRef: {ann_id: ref}
+ # 10) catToRefs: {category_id: refs}
+ # 11) sentToRef: {sent_id: ref}
+ # 12) sentToTokens: {sent_id: tokens}
+ print('creating index...')
+ # fetch info from instances
+ Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
+ for ann in self.data['annotations']:
+ Anns[ann['id']] = ann
+ imgToAnns[ann['image_id']] = imgToAnns.get(ann['image_id'], []) + [ann]
+ for img in self.data['images']:
+ Imgs[img['id']] = img
+ for cat in self.data['categories']:
+ Cats[cat['id']] = cat['name']
+
+ # fetch info from refs
+ Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
+ Sents, sentToRef, sentToTokens = {}, {}, {}
+ for ref in self.data['refs']:
+ # ids
+ ref_id = ref['ref_id']
+ ann_id = ref['ann_id']
+ category_id = ref['category_id']
+ image_id = ref['image_id']
+
+ # add mapping related to ref
+ Refs[ref_id] = ref
+ imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
+ catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
+ refToAnn[ref_id] = Anns[ann_id]
+ annToRef[ann_id] = ref
+
+ # add mapping of sent
+ for sent in ref['sentences']:
+ Sents[sent['sent_id']] = sent
+ sentToRef[sent['sent_id']] = ref
+ sentToTokens[sent['sent_id']] = sent['tokens']
+
+ # create class members
+ self.Refs = Refs
+ self.Anns = Anns
+ self.Imgs = Imgs
+ self.Cats = Cats
+ self.Sents = Sents
+ self.imgToRefs = imgToRefs
+ self.imgToAnns = imgToAnns
+ self.refToAnn = refToAnn
+ self.annToRef = annToRef
+ self.catToRefs = catToRefs
+ self.sentToRef = sentToRef
+ self.sentToTokens = sentToTokens
+ print('index created.')
+
+ def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''):
+ image_ids = image_ids if type(image_ids) == list else [image_ids]
+ cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
+ ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
+
+ if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
+ refs = self.data['refs']
+ else:
+ if not len(image_ids) == 0:
+ refs = [self.imgToRefs[image_id] for image_id in image_ids]
+ else:
+ refs = self.data['refs']
+ if not len(cat_ids) == 0:
+ refs = [ref for ref in refs if ref['category_id'] in cat_ids]
+ if not len(ref_ids) == 0:
+ refs = [ref for ref in refs if ref['ref_id'] in ref_ids]
+ if not len(split) == 0:
+ if split in ['testA', 'testB', 'testC']:
+ refs = [ref for ref in refs if
+ split[-1] in ref['split']] # we also consider testAB, testBC, ...
+ elif split in ['testAB', 'testBC', 'testAC']:
+ refs = [ref for ref in refs if ref['split'] == split] # rarely used I guess...
+ elif split == 'test':
+ refs = [ref for ref in refs if 'test' in ref['split']]
+ elif split == 'train' or split == 'val':
+ refs = [ref for ref in refs if ref['split'] == split]
+ else:
+                    raise ValueError('No such split [%s]' % split)
+ ref_ids = [ref['ref_id'] for ref in refs]
+ return ref_ids
+
+ def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
+ image_ids = image_ids if type(image_ids) == list else [image_ids]
+ cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
+ ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
+
+ if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
+ ann_ids = [ann['id'] for ann in self.data['annotations']]
+ else:
+ if not len(image_ids) == 0:
+ lists = [self.imgToAnns[image_id] for image_id in image_ids if image_id in self.imgToAnns] # list of [anns]
+ anns = list(itertools.chain.from_iterable(lists))
+ else:
+ anns = self.data['annotations']
+ if not len(cat_ids) == 0:
+ anns = [ann for ann in anns if ann['category_id'] in cat_ids]
+ ann_ids = [ann['id'] for ann in anns]
+ if not len(ref_ids) == 0:
+ ids = set(ann_ids).intersection(set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids]))
+ return ann_ids
+
+ def getImgIds(self, ref_ids=[]):
+ ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
+
+ if not len(ref_ids) == 0:
+ image_ids = list(set([self.Refs[ref_id]['image_id'] for ref_id in ref_ids]))
+ else:
+ image_ids = self.Imgs.keys()
+ return image_ids
+
+ def getCatIds(self):
+ return self.Cats.keys()
+
+ def loadRefs(self, ref_ids=[]):
+ if type(ref_ids) == list:
+ return [self.Refs[ref_id] for ref_id in ref_ids]
+ elif type(ref_ids) == int:
+ return [self.Refs[ref_ids]]
+
+ def loadAnns(self, ann_ids=[]):
+ if type(ann_ids) == list:
+ return [self.Anns[ann_id] for ann_id in ann_ids]
+ elif type(ann_ids) == int:
+ return [self.Anns[ann_ids]]
+
+ def loadImgs(self, image_ids=[]):
+ if type(image_ids) == list:
+ return [self.Imgs[image_id] for image_id in image_ids]
+ elif type(image_ids) == int:
+ return [self.Imgs[image_ids]]
+
+ def loadCats(self, cat_ids=[]):
+ if type(cat_ids) == list:
+ return [self.Cats[cat_id] for cat_id in cat_ids]
+ elif type(cat_ids) == int:
+ return [self.Cats[cat_ids]]
+
+ def getRefBox(self, ref_id):
+ ref = self.Refs[ref_id]
+ ann = self.refToAnn[ref_id]
+ return ann['bbox'] # [x, y, w, h]
+
+ def showRef(self, ref, seg_box='box'):
+ ax = plt.gca()
+ # show image
+ image = self.Imgs[ref['image_id']]
+ I = io.imread(os.path.join(self.vis_root, image['file_name']))
+ ax.imshow(I)
+ # show refer expression
+ for sid, sent in enumerate(ref['sentences']):
+ print('%s. %s' % (sid + 1, sent['sent']))
+ # show segmentations
+ if seg_box == 'seg':
+ ann_id = ref['ann_id']
+ ann = self.Anns[ann_id]
+ polygons = []
+ color = []
+ c = 'none'
+ if type(ann['segmentation'][0]) == list:
+ # polygon used for refcoco*
+ for seg in ann['segmentation']:
+                    poly = np.array(seg).reshape((len(seg) // 2, 2))
+ polygons.append(Polygon(poly, True, alpha=0.4))
+ color.append(c)
+ p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 1, 0, 0), linewidths=3, alpha=1)
+ ax.add_collection(p) # thick yellow polygon
+ p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 0, 0, 0), linewidths=1, alpha=1)
+ ax.add_collection(p) # thin red polygon
+ else:
+ # mask used for refclef
+ raise NotImplementedError('RefClef is not downloaded')
+ # show bounding-box
+ elif seg_box == 'box':
+ ann_id = ref['ann_id']
+ ann = self.Anns[ann_id]
+ bbox = self.getRefBox(ref['ref_id'])
+ box_plot = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor='green', linewidth=3)
+ ax.add_patch(box_plot)
diff --git a/minigpt4/datasets/datasets/coco_vqa_datasets.py b/minigpt4/datasets/datasets/coco_vqa_datasets.py
new file mode 100755
index 0000000..a60426d
--- /dev/null
+++ b/minigpt4/datasets/datasets/coco_vqa_datasets.py
@@ -0,0 +1,184 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import os
+import json
+import random
+
+from PIL import Image
+
+from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset
+
+from collections import OrderedDict
+
+
+class __DisplMixin:
+ def displ_item(self, index):
+ sample, ann = self.__getitem__(index), self.annotation[index]
+
+ return OrderedDict(
+ {
+ "file": ann["image"],
+ "question": ann["question"],
+ "question_id": ann["question_id"],
+ "answers": "; ".join(ann["answer"]),
+ "image": sample["image"],
+ }
+ )
+
+
+class COCOVQADataset(VQADataset, __DisplMixin):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+ self.instruction_pool =[
+ "[vqa] {}",
+ "[vqa] Based on the image, respond to this question with a short answer: {}"
+ ]
+
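+        # keep only annotations whose image file is actually present under vis_root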
+ exist_annotation = []
+ for ann in self.annotation:
+ image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
+ if os.path.exists(image_path):
+ exist_annotation.append(ann)
+ self.annotation = exist_annotation
+
+
+ def get_data(self, index):
+ ann = self.annotation[index]
+
+ image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
+ image = Image.open(image_path).convert("RGB")
+
+ image = self.vis_processor(image)
+ question = self.text_processor(ann["question"])
+ question_id = ann["question_id"]
+
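+        # weight each unique ground-truth answer by its frequency among the annotators,
+        # then sample one answer below according to these weights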
+ answer_weight = {}
+ for answer in ann["answer"]:
+ if answer in answer_weight.keys():
+ answer_weight[answer] += 1 / len(ann["answer"])
+ else:
+ answer_weight[answer] = 1 / len(ann["answer"])
+
+ answers = list(answer_weight.keys())
+ weights = list(answer_weight.values())
+
+ answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights
+
+ # if "unk" in answer:
+ # print("cocovqa", answer)
+
+ return {
+ "image": image,
+ "question": question,
+ "question_id": question_id,
+ "answer": answer,
+ }
+
+ def __getitem__(self, index):
+ data = self.get_data(index)
+ instruction = random.choice(self.instruction_pool).format(data['question'])
+        instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+ return {
+ "image": data['image'],
+ "question_id": data["question_id"],
+ "instruction_input": instruction,
+ "answer": self.text_processor(data['answer']),
+ }
+
+
+class COCOVQGDataset(COCOVQADataset):
+
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+ self.instruction_pool = [
+ 'Given the image, generate a question whose answer is: {}',
+ 'Based on the image, provide a question with the answer: {}',
+ 'Given the visual representation, create a question for which the answer is "{}"',
+ 'From the image provided, craft a question that leads to the reply: {}',
+ 'Considering the picture, come up with a question where the answer is: {}',
+            'Taking the image into account, generate a question that has the answer: {}'
+ ]
+
+ def __getitem__(self, index):
+ data = self.get_data(index)
+ instruction = random.choice(self.instruction_pool).format(data['answer'])
+        instruction = "<Img><ImageHere></Img> {}".format(instruction)
+
+ return {
+ "image": data['image'],
+ "question_id": data["question_id"],
+ "instruction_input": instruction,
+ "answer": data['question'],
+ }
+
+
+
+class COCOVQAEvalDataset(VQAEvalDataset, __DisplMixin):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+
+ self.instruction_pool = [
+# '{}',
+# 'Question: {}',
+# '{} A short answer to the question is',
+# 'Q: {} A:',
+ 'Question: {} Short answer:',
+# 'Given the image, answer the following question with no more than three words. {}',
+# 'Based on the image, respond to this question with a short answer: {}.',
+# 'Use the provided image to answer the question: {} Provide your answer as short as possible.',
+# 'What is the answer to the following question? "{}"',
+# 'The question "{}" can be answered using the image. A short answer is'
+ ]
+# print('vis_root', vis_root)
+ self.vis_root = vis_root
+
+ self.annotation = json.load(open(ann_paths[0]))
+
+ answer_list_path = ann_paths[1]
+ if os.path.exists(answer_list_path):
+ self.answer_list = json.load(open(answer_list_path))
+ else:
+ self.answer_list = None
+
+ try:
+ self.coco_fmt_qust_file = ann_paths[2]
+ self.coco_fmt_anno_file = ann_paths[3]
+ except IndexError:
+ self.coco_fmt_qust_file = None
+ self.coco_fmt_anno_file = None
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ self._add_instance_ids()
+
+ def __getitem__(self, index):
+ ann = self.annotation[index]
+
+ image_path = os.path.join(self.vis_root, ann["image"])
+ image = Image.open(image_path).convert("RGB")
+
+ image = self.vis_processor(image)
+ question = self.text_processor(ann["question"])
+
+ instruction = random.choice(self.instruction_pool).format(question)
+        instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+ return {
+ "image": image,
+ 'image_path': image_path,
+ "question": question,
+ "question_id": ann["question_id"],
+ "instruction_input": instruction,
+ "instance_id": ann["instance_id"],
+ }
diff --git a/minigpt4/datasets/datasets/doc_dataset.py b/minigpt4/datasets/datasets/doc_dataset.py
new file mode 100755
index 0000000..adc2d6c
--- /dev/null
+++ b/minigpt4/datasets/datasets/doc_dataset.py
@@ -0,0 +1,290 @@
+import os
+import json
+import pickle
+import random
+import time
+import itertools
+
+import numpy as np
+from PIL import Image
+import skimage.io as io
+import matplotlib.pyplot as plt
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Polygon, Rectangle
+from torch.utils.data import Dataset
+import webdataset as wds
+
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
+
+
+class SingleSlideVQADataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+ self.data = self.create_data(ann_path)
+
+ # self.instruction_pool = [
+        #     "###Human: <Img><ImageHere></Img> {}###Assistant: ",
+        #     "###Human: <Img><ImageHere></Img> From this slide, {}###Assistant: ",
+ # ]
+ self.instruction_pool = [
+            "<Img><ImageHere></Img> {}",
+            "<Img><ImageHere></Img> From this slide, {}",
+        ]
+
+ def create_data(self, ann_path):
+ with open(ann_path, 'r') as f:
+ samples = f.readlines()
+ data = []
+ for sample in samples:
+ sample = json.loads(sample)
+ if len(sample['evidence_pages']) != 1: continue # skip questions that need more than one slide page
+ page = sample['evidence_pages'][0]
+ image_name = 'slide_{}_1024.jpg'.format(page)
+ # assert [int(image_name.split('-')[-2]) for image_name in image_names] == list(range(1, 21)) # check the format
+ image_path = os.path.join(sample['deck_name'], image_name)
+ data.append({
+ 'qa_id': sample['qa_id'],
+ 'question': sample['question'],
+ 'answer': sample['answer'],
+ 'image_path': image_path
+ })
+
+ print("single slide ",len(data))
+ return data
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, index):
+ sample = self.data[index]
+ image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
+ image = self.vis_processor(image)
+
+ # instruction = self.text_processor(sample["question"])
+ instruction = random.choice(self.instruction_pool).format(self.text_processor(sample["question"]))
+
+ # instruction = random.choice(self.instruction_pool).format(self.text_processor(sample["question"]))
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": sample['answer'],
+ "qa_id": sample['qa_id'],
+ }
+
+
+class OCRVQADataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+ self.data = self.create_data(ann_path)
+
+ self.instruction_pool =[
+ "[vqa] {}",
+ "[vqa] Based on the image, respond to this question with a short answer: {}"
+ ]
+
+ def create_data(self, ann_path):
+ processed_data = []
+ with open(ann_path, 'r') as f:
+ data = json.load(f)
+ for k in data.keys():
+ if data[k]['split'] != 1: continue # 1 for training, 2 for validation, 3 for test
+ ext = os.path.splitext(data[k]['imageURL'])[1]
+ imageFile = k + ext
+ assert len(data[k]['questions']) == len(data[k]['answers'])
+ for q, a in zip(data[k]['questions'], data[k]['answers']):
+ processed_data.append(
+ {'question': q,
+ 'answer': a,
+ 'image_path': imageFile,
+ 'image_id': k,
+ 'title': data[k]['title'],
+ 'genre': data[k]['genre'],
+ }
+ )
+ print("ocr vqa", len(processed_data))
+ return processed_data
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, index):
+ sample = self.data[index]
+ image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
+ image = self.vis_processor(image)
+ question = self.text_processor(sample["question"])
+ answer = self.text_processor(sample["answer"])
+
+ instruction = random.choice(self.instruction_pool).format(question)
+        instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": answer,
+ "image_id": sample['image_id']
+ }
+
+
+
+
+
+class TextOCRDataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+ self.data = self.create_data(ann_path)
+
+ self.instruction_pool = [
+            "<Img><ImageHere></Img> [OCR] {}"
+ ]
+
+ def create_data(self, ann_path):
+ processed_data = []
+ with open(ann_path, 'r') as f:
+ data = json.load(f)
+ for k in data["anns"].keys():
+ # ext = os.path.splitext(data[k]['imageURL'])[1]
+ imageFile = data["anns"][k]["image_id"]+".jpg"
+ bbox = data["anns"][k]["bbox"]
+ text = data["anns"][k]["utf8_string"]
+ # assert len(data[k]['questions']) == len(data[k]['answers'])
+ # for q, a in zip(data[k]['questions'], data[k]['answers']):
+
+ processed_data.append(
+ {'bbox': bbox,
+ 'answer': text,
+ 'image_path': imageFile,
+ 'image_id': k,
+ }
+ )
+
+ return processed_data
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, index):
+ sample = self.data[index]
+ image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
+ width, height = image.size
+ image = self.vis_processor(image)
+
+        # scale the pixel-space (x, y, w, h) box to the 100x100 grid and serialize it as location tokens
+        image_size = 100
+        bbox = sample['bbox']
+        x1 = int(bbox[0] / width * image_size)
+        y1 = int(bbox[1] / height * image_size)
+        x2 = x1 + int(bbox[2] / width * image_size)
+        y2 = y1 + int(bbox[3] / height * image_size)
+        assert 0 <= x1 <= image_size
+        assert 0 <= x2 <= image_size
+        assert 0 <= y1 <= image_size
+        assert 0 <= y2 <= image_size
+
+        new_bbox = " <" + str(x1) + "><" + str(y1) + "><" + str(x2) + "><" + str(y2) + ">"
+
+ instruction = random.choice(self.instruction_pool).format(new_bbox)
+
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": sample['answer'],
+ "image_id": sample['image_id']
+ }
+
+
+
+class PlotVQADataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+ self.data = self.create_data(ann_path)
+
+ self.instruction_pool = [
+ '{}',
+ 'Question: {}',
+ '{} A short answer to the question is',
+ 'Q: {} A:',
+ 'Question: {} Short answer:',
+ # 'Given the image, answer the following question with no more than three words. {}',
+ 'Based on the image, respond to this question with a short answer: {}.',
+ 'Use the provided image to answer the question: {} Provide your answer as short as possible.',
+ 'What is the answer to the following question? "{}"',
+ 'The question "{}" can be answered using the image. A short answer is'
+ ]
+
+ def create_data(self, ann_path):
+ processed_data = []
+ with open(ann_path, 'r') as f:
+ data = json.load(f)
+ for da in data["qa_pairs"]:
+ # ext = os.path.splitext(data[k]['imageURL'])[1]
+
+ imageFile = str(da["image_index"])+".png"
+ question = da["question_string"]
+ answer = str(da["answer"])
+ # assert len(data[k]['questions']) == len(data[k]['answers'])
+ # for q, a in zip(data[k]['questions'], data[k]['answers']):
+
+ processed_data.append(
+ {'question': question,
+ 'answer': answer,
+ 'image_path': imageFile,
+ 'image_id': str(da["image_index"]),
+ }
+ )
+
+ return processed_data
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, index):
+ sample = self.data[index]
+ image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
+ # width, height = image.size
+ image = self.vis_processor(image)
+
+
+ # image_shape = image.shape
+        instruction = "<Img><ImageHere></Img> {} ".format(sample["question"])
+
+ instruction = random.choice(self.instruction_pool).format(instruction)
+
+ answer = sample["answer"]
+
+
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": answer,
+ "image_id": sample['image_id']
+ }
+
diff --git a/minigpt4/datasets/datasets/flickr.py b/minigpt4/datasets/datasets/flickr.py
new file mode 100755
index 0000000..68355f7
--- /dev/null
+++ b/minigpt4/datasets/datasets/flickr.py
@@ -0,0 +1,159 @@
+import os
+import json
+import pickle
+import random
+import time
+import itertools
+
+import numpy as np
+from PIL import Image
+import skimage.io as io
+import matplotlib.pyplot as plt
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Polygon, Rectangle
+from torch.utils.data import Dataset
+import webdataset as wds
+
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
+
+
+class GroundedDetailDataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ self.instruction_pool = [
+ '[grounding] please describe this image in details',
+ '[grounding] describe this image as detailed as possible',
+ '[grounding] summarize this image in details',
+ '[grounding] give a thorough description of what you see in this image',
+ ]
+
+ with open(ann_path, 'r') as f:
+ self.ann = json.load(f)
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]
+
+ # image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
+ image_file = '{}.jpg'.format(info['image_id'])
+ image_path = os.path.join(self.vis_root, image_file)
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+
+ answer = info['grounded_caption']
+
+ instruction = random.choice(self.instruction_pool)
+
+        instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": answer,
+ "image_id": info['image_id'],
+ }
+
+
+
+
+class CaptionToObjectDataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ self.instruction_pool = [
+ '[detection] {}',
+ ]
+
+ with open(ann_path, 'r') as f:
+ self.ann = json.load(f)
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]
+
+ # image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
+ image_file = '{}.jpg'.format(info['image_id'])
+ image_path = os.path.join(self.vis_root, image_file)
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+
+ input = info["caption"]
+ answer = info["output"]
+
+ instruction = random.choice(self.instruction_pool).format(input)
+
+        instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": answer,
+ "image_id": info['image_id'],
+ }
+
+
+
+
+class PhraseToObjectDataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ self.instruction_pool = [
+ '[detection] {}',
+ ]
+
+ with open(ann_path, 'r') as f:
+ self.ann = json.load(f)
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]
+
+ # image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
+ image_file = '{}.jpg'.format(info['image_id'])
+ image_path = os.path.join(self.vis_root, image_file)
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+
+ input = info["phrase"]
+        answer = "<p>"+input+"</p> "+info["bbox"]
+
+ instruction = random.choice(self.instruction_pool).format(input)
+
+        instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": answer,
+ "image_id": info['image_id'],
+ }
diff --git a/minigpt4/datasets/datasets/gqa_datasets.py b/minigpt4/datasets/datasets/gqa_datasets.py
new file mode 100755
index 0000000..053f9eb
--- /dev/null
+++ b/minigpt4/datasets/datasets/gqa_datasets.py
@@ -0,0 +1,65 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import os
+import json
+
+from PIL import Image
+
+from minigpt4.datasets.datasets.vqa_datasets import VQADataset
+
+from collections import OrderedDict
+import random
+
+class __DisplMixin:
+ def displ_item(self, index):
+ sample, ann = self.__getitem__(index), self.annotation[index]
+
+ return OrderedDict(
+ {
+ "file": ann["image"],
+ "question": ann["question"],
+ "question_id": ann["question_id"],
+ "answers": "; ".join(ann["answer"]),
+ "image": sample["image"],
+ }
+ )
+
+
+class GQADataset(VQADataset, __DisplMixin):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+ self.instruction_pool =[
+ "[vqa] {}",
+ "[vqa] Based on the image, respond to this question with a short answer: {}"
+ ]
+
+ def __getitem__(self, index):
+ ann = self.annotation[index]
+
+ image_path = os.path.join(self.vis_root, ann["image"])
+ image = Image.open(image_path).convert("RGB")
+
+ image = self.vis_processor(image)
+ question = self.text_processor(ann["question"])
+
+ instruction = random.choice(self.instruction_pool).format(question)
+        instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+ answers = self.text_processor(ann["answer"])
+ # if "unk" in answers:
+ # print("gqa",answers)
+
+ # print(answers)
+
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": answers,
+ # "weights": weights,
+ }
+
diff --git a/minigpt4/datasets/datasets/llava_dataset.py b/minigpt4/datasets/datasets/llava_dataset.py
new file mode 100755
index 0000000..27b034c
--- /dev/null
+++ b/minigpt4/datasets/datasets/llava_dataset.py
@@ -0,0 +1,390 @@
+import os
+import json
+import pickle
+import random
+import time
+# import iterto
+import numpy as np
+from PIL import Image
+import skimage.io as io
+import matplotlib.pyplot as plt
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Polygon, Rectangle
+from torch.utils.data import Dataset
+import webdataset as wds
+
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
+
+class LlavaDetailDataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ with open(ann_path, 'r') as f:
+ self.ann = json.load(f)
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]
+
+ image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
+ image_path = os.path.join(self.vis_root, image_file)
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+
+ answer = info['conversations'][1]['value']
+        instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
+
+        instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
+
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": answer,
+ "image_id": info['id'],
+ }
+
+class LlavaReasonDataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ with open(ann_path, 'r') as f:
+ self.ann = json.load(f)
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]
+
+ image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
+ image_path = os.path.join(self.vis_root, image_file)
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+
+ answer = info['conversations'][1]['value']
+        instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
+
+        instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
+
+        # instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
+ # answer = self.text_processor(answer)
+
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": answer,
+ "image_id": info['id'],
+ }
+
+
+
+
+class MiniGPT4v(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+
+ self.instruction_pool = [
+ 'please describe this image as detailed as possible',
+ 'What do you see happening in this image?',
+ "Can you elaborate on the elements of the picture provided?",
+ "Describe the following image.",
+ "Write a detailed description of the given image.",
+ "Write a detailed description of the given image.",
+ "Explain the visual content of the image in great detail"
+ ]
+ self.ann=[]
+
+ with open(ann_path,"r") as f:
+ for line in f.readlines():
+ self.ann.append(json.loads(line))
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]
+
+ # image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
+ # print("info keys",info.keys())
+ if "image_path" in info.keys():
+ image_path = "/ibex/reference/CV/COCO/cocoapi/data/2017/images/jpeg/train/"+info['image_path']
+
+ else:
+ # print("coming here?")
+ image_file = "images/"+info["image"]
+ image_path = os.path.join(self.vis_root, image_file)
+ # print(image_path)
+
+
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+ if "question" in info.keys():
+ question = info['question']
+ else:
+ question = random.sample(self.instruction_pool,1)[0]
+
+
+ answer = info["caption"]
+
+
+        instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
+
+        # instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
+ # answer = self.text_processor(answer)
+ # print("image path", image_path)
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": answer,
+ # "image_id": info['id'],
+ }
+
+
+
+
+class MiniGPT4v_emotion(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+
+ self.instruction_pool = [
+ 'please describe this image as detailed as possible',
+ 'What do you see happening in this image?',
+ "Can you elaborate on the elements of the picture provided?",
+ "Describe the following image",
+ "Write a detailed description of the given image",
+ "Write a detailed description of the given image",
+ "Explain the visual content of the image in great detail"
+ ]
+ # self.ann=[]
+
+ with open(ann_path,"r") as f:
+ # for line in f.readlines():
+ self.ann = json.load(f)
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]
+
+ # image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
+ # print("info keys",info.keys())
+
+ # print("coming here?")
+ image_file = info["link"]
+ image_path = os.path.join(self.vis_root, image_file)
+ # print("image path",image_path)
+ # print(image_path)
+
+
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+
+ question = random.sample(self.instruction_pool,1)[0]
+
+
+ answer = info["caption"]
+
+
+        instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
+
+        # instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
+ # answer = self.text_processor(answer)
+ # print("image path", image_path)
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": answer,
+ # "image_id": info['id'],
+ }
+
+
+
+
+class MiniGPT4v_laion(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+
+ self.instruction_pool = [
+ 'please describe this image as detailed as possible',
+ 'What do you see happening in this image?',
+ "Can you elaborate on the elements of the picture provided?",
+ "Describe the following image",
+ "Write a detailed description of the given image",
+ "Write a detailed description of the given image",
+ "Explain the visual content of the image in great detail"
+ ]
+ # self.ann=[]
+
+ with open(ann_path,"r") as f:
+ # for line in f.readlines():
+ self.ann = json.load(f)
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]
+
+ # image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
+ # print("info keys",info.keys())
+
+ # print("coming here?")
+ image_file = info["link"]
+ image_path = os.path.join(self.vis_root, image_file)
+ # print(image_path)
+ # print(image_path)
+
+
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+
+ question = random.sample(self.instruction_pool,1)[0]
+
+
+ answer = info["caption"]
+
+
+        instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
+
+        # instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
+ # answer = self.text_processor(answer)
+ # print("image path", image_path)
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": answer,
+ # "image_id": info['id'],
+ }
+
+
+
+class Minigpt2_conversation(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ with open(ann_path, 'r') as f:
+ self.ann = json.load(f)
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]
+
+ answer = info['conversations'][1]['value']
+ instruction = info['conversations'][0]['value']
+
+ # print("instruction",instruction)
+ # print("answer", answer)
+
+ return {
+ "instruction_input": instruction,
+ "answer": answer,
+ }
+
+
+class LlavaConversationDataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ self.ann=[]
+
+
+ # with open(ann_path, 'r') as f:
+ # self.ann = json.load(f)
+
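+        # separator used to join the multi-turn questions/answers into single strings
+        # (split back on this symbol downstream during training)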
+ self.connect_sym = "!@#"
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]
+
+ image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
+ image_path = os.path.join(self.vis_root, image_file)
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+
+        first_instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
+        first_instruction = '<Img><ImageHere></Img> {} '.format(first_instruction)
+
+ questions = [first_instruction]
+ answers = []
+
+ for i, item in enumerate(info["conversations"][1:]):
+ if i % 2 ==0: # assistant
+ assistant_answer = item["value"]
+ answers.append(assistant_answer)
+ else:
+ human_instruction = item["value"]+" "
+ questions.append(human_instruction)
+
+ questions = self.connect_sym.join(questions)
+ # questions = questions.replace("\\\\","\\")
+ answers = self.connect_sym.join(answers)
+
+
+ return {
+ "image": image,
+ "conv_q": questions,
+ 'conv_a': answers,
+ "image_id": info['id'],
+ "connect_sym": self.connect_sym
+ }
\ No newline at end of file
diff --git a/minigpt4/datasets/datasets/multitask_conversation.py b/minigpt4/datasets/datasets/multitask_conversation.py
new file mode 100644
index 0000000..3b13e52
--- /dev/null
+++ b/minigpt4/datasets/datasets/multitask_conversation.py
@@ -0,0 +1,75 @@
+import os
+import json
+import pickle
+import random
+import time
+import itertools
+
+import numpy as np
+from PIL import Image
+import skimage.io as io
+import matplotlib.pyplot as plt
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Polygon, Rectangle
+from torch.utils.data import Dataset
+import webdataset as wds
+
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
+
+
+
+
+class MultiTaskConversationDataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+
+ with open(ann_path, 'r') as f:
+ self.ann = json.load(f)
+
+ self.connect_sym = "!@#"
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]
+
+ image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
+ image_path = os.path.join(self.vis_root, image_file)
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+
+        first_instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
+        first_instruction = '<Img><ImageHere></Img> {} '.format(first_instruction)
+
+ questions = [first_instruction]
+ answers = []
+
+ for i, item in enumerate(info["conversations"][1:]):
+ if i % 2 ==0: # assistant
+ assistant_answer = item["value"]
+ answers.append(assistant_answer)
+ else:
+ human_instruction = item["value"]+" "
+ questions.append(human_instruction)
+
+ questions = self.connect_sym.join(questions)
+ answers = self.connect_sym.join(answers)
+
+
+ return {
+ "image": image,
+ "conv_q": questions,
+ 'conv_a': answers,
+ "image_id": info['id'],
+ "connect_sym": self.connect_sym
+ }
\ No newline at end of file
diff --git a/minigpt4/datasets/datasets/text_caps.py b/minigpt4/datasets/datasets/text_caps.py
new file mode 100755
index 0000000..21446c2
--- /dev/null
+++ b/minigpt4/datasets/datasets/text_caps.py
@@ -0,0 +1,186 @@
+import os
+import json
+import pickle
+import random
+import time
+import itertools
+
+import numpy as np
+from PIL import Image
+import skimage.io as io
+import matplotlib.pyplot as plt
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Polygon, Rectangle
+from torch.utils.data import Dataset
+import webdataset as wds
+
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
+
+
+
+
+
+class TextCapDataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ self.instruction_pool = [
+ # "generate a short image caption incorporating text in the image",
+ # "generate a brief image description combining the text shown in the image",
+            # "what text is written in this image?",
+ # "describe the text that you can see from this image",
+ # "What does the text in the image say?"
+ 'Briefly describe this image.',
+ 'Provide a concise depiction of this image.',
+ 'Present a short description of this image.',
+ 'Summarize this image in a few words.',
+ 'A short image caption:',
+ 'A short image description:',
+ 'A photo of ',
+ 'An image that shows ',
+ 'Write a short description for the image. ',
+ 'Write a description for the photo.',
+ 'Provide a description of what is presented in the photo.',
+ 'Briefly describe the content of the image.',
+ 'Can you briefly explain what you see in the image?',
+ 'Could you use a few words to describe what you perceive in the photo?',
+ 'Please provide a short depiction of the picture.',
+ 'Using language, provide a short account of the image.',
+ 'Use a few words to illustrate what is happening in the picture.',
+ ]
+
+ with open(ann_path, 'r') as f:
+ self.ann = json.load(f)
+
+
+ def __len__(self):
+ return len(self.ann["data"])
+
+
+ def __getitem__(self, index):
+ info = self.ann["data"][index]
+
+ image_file = '{}.jpg'.format(info['image_id'])
+
+ image_path = os.path.join(self.vis_root, image_file)
+ image = Image.open(image_path).convert("RGB")
+ # image_width,image_length = image.size
+ image = self.vis_processor(image)
+
+ # ocr_info = self.ann[index]["data"]
+ caption = info["caption_str"]
+ caption = self.text_processor(caption)
+
+ # instruction = random.choice(self.instruction_pool).format(word_bbox)
+        instruction = "<Img><ImageHere></Img> [caption] {} ".format(random.choice(self.instruction_pool))
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": caption,
+ "data_type": "bbox",
+ "question_split": True
+ }
+
+
+
+class TextCapBboxToObjectDataset(Dataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.vis_root = vis_root
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ # self.instruction_pool = [
+        #     "<Img><ImageHere></Img> What text does it show in {} ",
+        #     "<Img><ImageHere></Img> Extract the text from {} ",
+        #     "<Img><ImageHere></Img> What is the textual content in {} ",
+        #     "<Img><ImageHere></Img> Extract the textual information present in the {} ",
+        #     "<Img><ImageHere></Img> What is the text written within this defined region {}",
+        #     "<Img><ImageHere></Img> Transcribe the text located inside {}",
+        #     "<Img><ImageHere></Img> Can you read and extract the text from this specific area {}",
+ # ]
+
+ self.instruction_pool = [
+            "<Img><ImageHere></Img> [OCR] {}"
+ ]
+ with open(ann_path, 'r') as f:
+ self.ann = json.load(f)
+
+ self.new_ann = {"data":[]}
+ for da in self.ann["data"]:
+ if da["ocr_info"] !=[]:
+ ocr_info_filter = []
+ for d in da["ocr_info"]:
+ if (d["bounding_box"]["width"]+d["bounding_box"]["top_left_x"])<=1.0 and (d["bounding_box"]["height"]+d["bounding_box"]["top_left_y"]) <=1.0 \
+ and d["bounding_box"]["top_left_x"]>=0 and d["bounding_box"]["top_left_y"]>=0:
+ ocr_info_filter.append(d)
+ if ocr_info_filter !=[]:
+ da["ocr_info"]=ocr_info_filter
+ self.new_ann["data"].append(da)
+ self.ann = self.new_ann
+
+
+ def __len__(self):
+ return len(self.ann["data"])
+
+
+ def __getitem__(self, index):
+
+ info = self.ann["data"][index]
+
+
+ image_file = '{}.jpg'.format(info['image_id'])
+
+ image_path = os.path.join(self.vis_root, image_file)
+ image = Image.open(image_path).convert("RGB")
+ # image_width,image_length = image.size
+ image = self.vis_processor(image)
+
+
+
+ image_size = 100
+
+ ocr_info = info["ocr_info"]
+
+ sampled_ocr = random.sample(ocr_info,1)[0]
+
+ # print("sampled ocr", sampled_ocr)
+
+ word_text = sampled_ocr["word"]
+ width = sampled_ocr["bounding_box"]["width"]
+ height = sampled_ocr["bounding_box"]["height"]
+ top_left_x = sampled_ocr["bounding_box"]["top_left_x"]
+ top_left_y = sampled_ocr["bounding_box"]["top_left_y"]
+
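+        # the OCR box is given as normalized [0, 1] fractions; scale it to the 100-point
+        # grid used by the grounding tokens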
+ x1 = int(top_left_x*image_size)
+ y1 = int(top_left_y*image_size)
+ x2 = x1 + int(width*image_size)
+ y2 = y1 + int(height*image_size)
+ assert x1>=0 and x1<=image_size
+ assert x2>=0 and x2<=image_size
+ assert y1>=0 and y1<=image_size
+ assert y2>=0 and y2<=image_size
+
+
+ word_bbox = "{<"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">}"
+
+ instruction = random.choice(self.instruction_pool).format(word_bbox)
+ return {
+ "image": image,
+ "instruction_input": instruction,
+ "answer": word_text,
+ "data_type": "bbox",
+ "question_split": True
+ }
\ No newline at end of file
diff --git a/minigpt4/datasets/datasets/unnatural_instruction.py b/minigpt4/datasets/datasets/unnatural_instruction.py
new file mode 100755
index 0000000..4857006
--- /dev/null
+++ b/minigpt4/datasets/datasets/unnatural_instruction.py
@@ -0,0 +1,52 @@
+import os
+import json
+import pickle
+import random
+import time
+import itertools
+
+import numpy as np
+from PIL import Image
+import skimage.io as io
+import matplotlib.pyplot as plt
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Polygon, Rectangle
+from torch.utils.data import Dataset
+import webdataset as wds
+
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
+
+
+class UnnaturalDataset(Dataset):
+ def __init__(self, text_processor, ann_path):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.text_processor = text_processor
+
+ with open(ann_path, 'r') as f:
+ self.ann = json.load(f)
+
+ # with open(ann_path, 'r') as f:
+ # for data in f.readlines():
+ # data = json.loads(data)
+ # self.ann.append(data)
+
+ def __len__(self):
+ return len(self.ann)
+
+ def __getitem__(self, index):
+ info = self.ann[index]["instances"][0]
+ instruction = info["instruction_with_input"]
+ constraints = info["constraints"]
+ answer = info["output"]
+        if constraints is not None:
+ instruction = instruction+" "+constraints
+
+ return {
+ # "image":None,
+ "instruction_input": self.text_processor(instruction),
+ "answer": self.text_processor(answer),
+ }
diff --git a/minigpt4/datasets/datasets/vg_dataset.py b/minigpt4/datasets/datasets/vg_dataset.py
new file mode 100755
index 0000000..3042edd
--- /dev/null
+++ b/minigpt4/datasets/datasets/vg_dataset.py
@@ -0,0 +1,98 @@
+import os
+import json
+import pickle
+import random
+import time
+import itertools
+
+import numpy as np
+from PIL import Image
+from torch.utils.data import Dataset
+from visual_genome import local
+
+
+import threading
+
+# Global lock
+lock = threading.Lock()
+
+
+class ReferVisualGenomeDataset(Dataset):
+ def __init__(self, vis_processor, text_processor, data_dir):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ self.data_dir = data_dir
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ all_regions = local.get_all_region_descriptions(self.data_dir)
+ all_regions = [region for regions in all_regions for region in regions]
+
+ # follow OFA practice, only regions smaller than 16384 pixels are used for refer
+ self.regions = [region for region in all_regions if region.width * region.height < 16384]
+
+ print('Visual Genome grounding', len(self.regions))
+
+
+ self.instruction_pool = [
+ "[refer] {}",
+ "[refer] give me the location of {}",
+ "[refer] where is {} ?",
+ "[refer] from this image, tell me the location of {}",
+ "[refer] the location of {} is",
+ "[refer] could you tell me the location for {} ?",
+ "[refer] where can I locate the {} ?",
+ ]
+
+
+ def __len__(self):
+ return len(self.regions)
+
+ def preprocess(self, index):
+ region = self.regions[index]
+ image_file = region.image.url.split('/')[-2:]
+ image_path = os.path.join(self.data_dir, *image_file)
+ image = Image.open(image_path).convert("RGB")
+ image_orig_size = image.size
+ image = self.vis_processor(image)
+ image_new_size = [100,100]
+
+ sample_sentence = region.phrase
+ refer_sentence = self.text_processor(sample_sentence)
+
+ bbox = [region.x, region.y, region.width, region.height]
+
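+        # rescale the region box from original image pixels to the fixed 100x100 grid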
+ bbox = [
+ bbox[0] / image_orig_size[0] * image_new_size[0],
+ bbox[1] / image_orig_size[1] * image_new_size[1],
+ (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
+ (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
+ ]
+ bbox = [int(x) for x in bbox]
+ bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
+ return {
+ "image": image,
+ "refer_sentence": refer_sentence,
+ "bbox": bbox,
+ "image_id": region.image.id,
+ }
+
+ def __getitem__(self, index):
+ data = self.preprocess(index)
+ instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
+
+        instruction = "<Img><ImageHere></Img> {} ".format(instruction)
+
+ # assert False
+
+ return {
+ "image": data['image'],
+ "instruction_input": instruction,
+ "answer": data['bbox'],
+ "image_id": data['image_id'],
+ }
+
+
diff --git a/minigpt4/datasets/datasets/vqa_datasets.py b/minigpt4/datasets/datasets/vqa_datasets.py
new file mode 100755
index 0000000..5cdc0fa
--- /dev/null
+++ b/minigpt4/datasets/datasets/vqa_datasets.py
@@ -0,0 +1,223 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import torch
+from PIL import Image
+import os
+
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+
+
+class VQADataset(BaseDataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+ # def collater(self, samples):
+ # image_list, question_list, answer_list, weight_list = [], [], [], []
+
+ # num_answers = []
+
+ # for sample in samples:
+ # image_list.append(sample["image"])
+ # question_list.append(sample["question"])
+
+ # weight_list.extend(sample["weights"])
+
+ # answers = sample["answer"]
+
+ # answer_list.extend(answers)
+ # num_answers.append(len(answers))
+
+ # return {
+ # "image": torch.stack(image_list, dim=0),
+ # "text_input": question_list,
+ # "answer": answer_list,
+ # "weight": torch.Tensor(weight_list),
+ # "n_answers": torch.LongTensor(num_answers),
+ # }
+
+
+class VQAEvalDataset(BaseDataset):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+
+class OKVQAEvalData(torch.utils.data.Dataset):
+ def __init__(self, loaded_data, vis_processor, root_path):
+ self.loaded_data = loaded_data
+ self.root_path = root_path
+ self.vis_processor = vis_processor
+
+ def __len__(self):
+ return len(self.loaded_data)
+
+ def __getitem__(self, idx):
+ data = self.loaded_data[idx]
+ img_id = data['image_id']
+ question = data['question']
+ question_id = data['question_id']
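+        # images are expected to be named by their zero-padded 12-digit COCO image id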
+ img_file = '{:0>12}.jpg'.format(img_id)
+ image_path = os.path.join(self.root_path, img_file)
+ image = Image.open(image_path).convert('RGB')
+ image = self.vis_processor(image)
+ question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
+ # question = f"[vqa] {question} "
+ return image, question, question_id, img_id
+
+class VizWizEvalData(torch.utils.data.Dataset):
+ def __init__(self, loaded_data, vis_processor, root_path):
+ self.loaded_data = loaded_data
+ self.root_path = root_path
+ self.vis_processor = vis_processor
+
+ def __len__(self):
+ return len(self.loaded_data)
+
+ def __getitem__(self, idx):
+ data = self.loaded_data[idx]
+ img_id = data['image']
+ question = data['question']
+ answers = data['answers']
+ answers = '_'.join([answer['answer'] for answer in answers])
+ image_path = os.path.join(self.root_path, img_id)
+ image = Image.open(image_path).convert('RGB')
+ image = self.vis_processor(image)
+ # question = f"[vqa] Based on the image, respond to this question with a short answer: {question} "
+ question = f"[vqa] Based on the image, respond to this question with a short answer: {question} and reply 'unanswerable' if you could not answer it"
+ return image, question, answers
+
+class AOKVQADAEvalData(torch.utils.data.Dataset):
+ def __init__(self, loaded_data, vis_processor, root_path):
+ self.loaded_data = loaded_data
+ self.root_path = root_path
+ self.vis_processor = vis_processor
+
+ def __len__(self):
+ return len(self.loaded_data)
+
+ def __getitem__(self, idx):
+ data = self.loaded_data[idx]
+ img_file = data['image']
+ question = data['question']
+ question_id = data['question_id']
+ image_path = os.path.join(self.root_path, img_file)
+ image = Image.open(image_path).convert('RGB')
+ image = self.vis_processor(image)
+ question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
+ # question = f"[vqa] {question} "
+ return image, question, question_id
+
+class AOKVQAMCEvalData(torch.utils.data.Dataset):
+ def __init__(self, loaded_data, vis_processor, root_path):
+ self.loaded_data = loaded_data
+ self.root_path = root_path
+ self.vis_processor = vis_processor
+
+ def __len__(self):
+ return len(self.loaded_data)
+
+ def __getitem__(self, idx):
+ data = self.loaded_data[idx]
+ img_file = data['image']
+ question = data['question']
+ question_id = data['question_id']
+ image_path = os.path.join(self.root_path, img_file)
+ image = Image.open(image_path).convert('RGB')
+ image = self.vis_processor(image).half().cuda()
+ candidates=data['choices']
+ # question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
+ question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
+ # question = f"[vqa] {question} "
+ return image, question, question_id, candidates
+
+class IconQAEvalData(torch.utils.data.Dataset):
+ def __init__(self, loaded_data, vis_processor, root_path):
+ self.loaded_data = loaded_data
+ self.root_path = root_path
+ self.vis_processor = vis_processor
+
+ def __len__(self):
+ return len(self.loaded_data)
+
+ def __getitem__(self, idx):
+ data = self.loaded_data[idx]
+ image_id = data['image_id']
+ question = data['question']
+ image_path = os.path.join(self.root_path, image_id, 'image.png')
+ image = Image.open(image_path).convert('RGB')
+ image = self.vis_processor(image).half().cuda()
+ candidates = '_'.join(data['choices'])
+ answer = data['answer']
+ # question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
+ question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
+ # question = f"[vqa] {question} "
+ return image, question, candidates, answer
+
+class GQAEvalData(torch.utils.data.Dataset):
+ def __init__(self, loaded_data, vis_processor, root_path):
+ self.loaded_data = loaded_data
+ self.root_path = root_path
+ self.vis_processor = vis_processor
+
+ def __len__(self):
+ return len(self.loaded_data)
+
+ def __getitem__(self, idx):
+ ann = self.loaded_data[idx]
+ image_id = ann["image"]
+ image_path = os.path.join(self.root_path, f"{image_id}")
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+ question = ann["question"]
+ # question = f'Question: {question} Short answer: '
+ question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
+ # question = f"[vqa] {question} "
+ labels = ann["answer"]
+
+ return image, question, labels
+
+class HMEvalData(torch.utils.data.Dataset):
+ def __init__(self, loaded_data, vis_processor, root_path):
+ self.loaded_data = loaded_data
+ self.root_path = root_path
+ self.vis_processor = vis_processor
+
+ def __len__(self):
+ return len(self.loaded_data)
+
+ def __getitem__(self, idx):
+ ann = self.loaded_data[idx]
+ image_id = ann["img"]
+ image_path = os.path.join(self.root_path, f"{image_id}")
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+ question = ann["text"]
+ question = f"[vqa] This is an image with: \"{question}\" written on it. Is it hateful? Answer:"
+ labels = ann["label"]
+
+ return image, question, labels
+
+class VSREvalData(torch.utils.data.Dataset):
+ def __init__(self, loaded_data, vis_processor, root_path):
+ self.loaded_data = loaded_data
+ self.root_path = root_path
+ self.vis_processor = vis_processor
+
+ def __len__(self):
+ return len(self.loaded_data)
+
+ def __getitem__(self, idx):
+ ann = self.loaded_data[idx]
+ image_path = os.path.join(self.root_path, ann["image"])
+ image = Image.open(image_path).convert("RGB")
+ image = self.vis_processor(image)
+ question = ann["caption"]
+ question = f'[vqa] Based on the image, is this statement true or false? {question}'
+ question_id = ann["image"].split('.')[0]  # derived here but not included in the return below
+ labels = 'true' if ann["label"] == 1 else 'false'
+
+ return image, question, labels
\ No newline at end of file
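
For orientation (not part of the patch), here is a minimal usage sketch of one of these eval loaders. `vis_processor`, `model`, and the paths below are placeholders, and the commented `model.generate(...)` call is only an assumption about how the surrounding eval code consumes the batches:

```
import json
from torch.utils.data import DataLoader

# Hypothetical annotation/image locations -- adjust to your setup.
annotations = json.load(open("/path/to/gqa/test_balanced.json"))
dataset = GQAEvalData(annotations, vis_processor, root_path="/path/to/gqa/images")
loader = DataLoader(dataset, batch_size=8, shuffle=False)

for images, questions, labels in loader:
    # images: (B, 3, H, W) float tensor; questions / labels: lists of B strings.
    images = images.half().cuda()  # GQAEvalData leaves tensors on CPU, unlike the A-OKVQA/IconQA classes
    # answers = model.generate(images, questions, max_new_tokens=20)
```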
diff --git a/train_configs/minigpt_v2_finetune.yaml b/train_configs/minigpt_v2_finetune.yaml
new file mode 100644
index 0000000..be5c7c6
--- /dev/null
+++ b/train_configs/minigpt_v2_finetune.yaml
@@ -0,0 +1,300 @@
+model:
+ arch: minigpt_v2
+ model_type: pretrain
+ freeze_vit: True
+ freeze_qformer: True
+ max_txt_len: 1024
+ low_resource: False
+ image_size: 448
+ end_sym: "&lt;/s&gt;"
+ llama_model: "/ibex/project/c2133/llama_v2/llama-2-7b-chat-pytorch_update"
+ ckpt: "/ibex/project/c2090/minigpt4_ckpt/448_perforamnce_correct_v10_vg/20230925064/checkpoint_32.pth"
+ use_grad_checkpoint: True
+ chat_template: True
+ lora_r: 64
+ lora_alpha: 16
+
+
+datasets:
+
+
+ multitask_conversation:
+ batch_size_train: 2
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 448
+ text_processor:
+ train:
+ name: "blip_caption"
+ sample_ratio: 40
+
+ llava_conversation: # 77k
+ batch_size_train: 2
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 448
+ text_processor:
+ train:
+ name: "blip_caption"
+ sample_ratio: 10
+
+
+
+
+ # unnatural_instruction:
+ # batch_size: 1
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 15
+
+
+ # refvg:
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 40
+
+ # llava_detail: #23K
+ # batch_size: 4
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 20
+
+ # llava_reason: # 77k
+ # batch_size: 4
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 80
+
+
+ # grounded_detailed_image_caption:
+ # batch_size: 2
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 80
+
+ # CaptionToPhrase:
+ # batch_size: 2
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 80
+
+ # ObjectToPhrase:
+ # batch_size: 2
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 80
+
+ # coco_caption:
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 10
+
+
+ # textcaps_caption: #
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 10
+
+ # refcoco: # 142k
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 15
+
+
+ # refcocop:
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 15
+
+ # refcocog:
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 15
+
+
+
+ # invrefcoco:
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 10
+
+ # invrefcocop:
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 10
+
+ # invrefcocog:
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 10
+
+
+ # coco_vqa: # 82K
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 15
+
+ # ok_vqa: # 9k
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 8
+
+ # aok_vqa: # 17k
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 12
+
+ # gqa: # 82K
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 40
+
+ # ocrvqa: # 800K
+ # batch_size: 6
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 448
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # sample_ratio: 30
+
+
+run:
+ task: image_text_pretrain
+ # optimizer
+ lr_sched: "linear_warmup_cosine_lr"
+ init_lr: 1e-5
+ min_lr: 8e-5
+ warmup_lr: 1e-6
+
+ weight_decay: 0.05
+ max_epoch: 50
+ num_workers: 6
+ warmup_steps: 1000
+ iters_per_epoch: 1000
+
+ seed: 42
+ output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_conversation_correct_best_v7_ablation1_v5_v6_again_system_prompt"
+
+ amp: True
+ resume_ckpt_path: null
+
+ evaluate: False
+ train_splits: ["train"]
+
+ device: "cuda"
+ world_size: 1
+ dist_url: "env://"
+ distributed: True
\ No newline at end of file
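
Read together, the `run` section describes a linear warm-up from `warmup_lr` to `init_lr` over the first `warmup_steps` iterations, followed by a cosine anneal between `init_lr` and `min_lr` across the remaining `max_epoch * iters_per_epoch` steps. The helper below is a simplified per-step sketch of that curve, not the repo's `linear_warmup_cosine_lr` implementation (which may step the cosine per epoch rather than per iteration):

```
import math

def linear_warmup_cosine_lr(step, init_lr=1e-5, min_lr=8e-5, warmup_lr=1e-6,
                            warmup_steps=1000, total_steps=50 * 1000):
    """Simplified per-step view of the schedule implied by the run section above."""
    if step < warmup_steps:
        # Linear ramp: warmup_lr -> init_lr over the first warmup_steps iterations.
        return warmup_lr + (init_lr - warmup_lr) * step / warmup_steps
    # Cosine anneal between init_lr and min_lr over the remaining iterations.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (init_lr - min_lr) * (1 + math.cos(math.pi * progress))
```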