From 0e5d34ad2ead3eb3ec67fe8c4e40bb398d96555c Mon Sep 17 00:00:00 2001
From: junchen14
Date: Sun, 22 Oct 2023 21:37:45 +0300
Subject: [PATCH] add finetuning code

---
 jobs/srun_test.sh                              |  30 +
 .../configs/datasets/aokvqa/defaults.yaml      |  29 +
 minigpt4/configs/datasets/coco/caption.yaml    |  38 +
 .../configs/datasets/coco/defaults_vqa.yaml    |  33 +
 .../datasets/coco_bbox/invrefcoco.yaml         |   8 +
 .../datasets/coco_bbox/invrefcocog.yaml        |   8 +
 .../datasets/coco_bbox/invrefcocop.yaml        |   8 +
 .../configs/datasets/coco_bbox/refcoco.yaml    |   8 +
 .../configs/datasets/coco_bbox/refcocog.yaml   |   8 +
 .../configs/datasets/coco_bbox/refcocop.yaml   |   8 +
 .../datasets/flickr/caption_to_phrase.yaml     |   6 +
 minigpt4/configs/datasets/flickr/default.yaml  |   6 +
 .../datasets/flickr/object_to_phrase.yaml      |   6 +
 .../configs/datasets/gqa/balanced_val.yaml     |  33 +
 .../configs/datasets/llava/conversation.yaml   |  12 +
 minigpt4/configs/datasets/llava/detail.yaml    |  12 +
 minigpt4/configs/datasets/llava/reason.yaml    |  12 +
 .../multitask_conversation/default.yaml        |  14 +
 .../datasets/nlp/unnatural_instruction.yaml    |  10 +
 minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml   |  12 +
 minigpt4/configs/datasets/okvqa/defaults.yaml  |  36 +
 .../configs/datasets/textcaps/caption.yaml     |  16 +
 minigpt4/configs/datasets/vg/ref.yaml          |  10 +
 .../builders/image_text_pair_builder.py        | 412 +++++++++++
 .../datasets/datasets/aok_vqa_datasets.py      | 212 ++++++
 minigpt4/datasets/datasets/coco_caption.py     | 122 ++++
 minigpt4/datasets/datasets/coco_dataset.py     | 667 ++++++++++++++++++
 .../datasets/datasets/coco_vqa_datasets.py     | 184 +++++
 minigpt4/datasets/datasets/doc_dataset.py      | 290 ++++++++
 minigpt4/datasets/datasets/flickr.py           | 159 +++++
 minigpt4/datasets/datasets/gqa_datasets.py     |  65 ++
 minigpt4/datasets/datasets/llava_dataset.py    | 390 ++++++++++
 .../datasets/multitask_conversation.py         |  75 ++
 minigpt4/datasets/datasets/text_caps.py        | 186 +++++
 .../datasets/unnatural_instruction.py          |  52 ++
 minigpt4/datasets/datasets/vg_dataset.py       |  98 +++
 minigpt4/datasets/datasets/vqa_datasets.py     | 223 ++++++
 train_configs/minigpt_v2_finetune.yaml         | 300 ++++++++
 38 files changed, 3798 insertions(+)
 create mode 100644 jobs/srun_test.sh
 create mode 100755 minigpt4/configs/datasets/aokvqa/defaults.yaml
 create mode 100644 minigpt4/configs/datasets/coco/caption.yaml
 create mode 100755 minigpt4/configs/datasets/coco/defaults_vqa.yaml
 create mode 100755 minigpt4/configs/datasets/coco_bbox/invrefcoco.yaml
 create mode 100755 minigpt4/configs/datasets/coco_bbox/invrefcocog.yaml
 create mode 100755 minigpt4/configs/datasets/coco_bbox/invrefcocop.yaml
 create mode 100755 minigpt4/configs/datasets/coco_bbox/refcoco.yaml
 create mode 100755 minigpt4/configs/datasets/coco_bbox/refcocog.yaml
 create mode 100755 minigpt4/configs/datasets/coco_bbox/refcocop.yaml
 create mode 100755 minigpt4/configs/datasets/flickr/caption_to_phrase.yaml
 create mode 100755 minigpt4/configs/datasets/flickr/default.yaml
 create mode 100755 minigpt4/configs/datasets/flickr/object_to_phrase.yaml
 create mode 100644 minigpt4/configs/datasets/gqa/balanced_val.yaml
 create mode 100755 minigpt4/configs/datasets/llava/conversation.yaml
 create mode 100755 minigpt4/configs/datasets/llava/detail.yaml
 create mode 100755 minigpt4/configs/datasets/llava/reason.yaml
 create mode 100644 minigpt4/configs/datasets/multitask_conversation/default.yaml
 create mode 100644 minigpt4/configs/datasets/nlp/unnatural_instruction.yaml
 create mode 100755 minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml
 create mode 100755 minigpt4/configs/datasets/okvqa/defaults.yaml
 create mode 100755 minigpt4/configs/datasets/textcaps/caption.yaml
 create mode 100755 minigpt4/configs/datasets/vg/ref.yaml
 create mode 100755 minigpt4/datasets/datasets/aok_vqa_datasets.py
 create mode 100755 minigpt4/datasets/datasets/coco_caption.py
 create mode 100755 minigpt4/datasets/datasets/coco_dataset.py
 create mode 100755 minigpt4/datasets/datasets/coco_vqa_datasets.py
 create mode 100755 minigpt4/datasets/datasets/doc_dataset.py
 create mode 100755 minigpt4/datasets/datasets/flickr.py
 create mode 100755 minigpt4/datasets/datasets/gqa_datasets.py
 create mode 100755 minigpt4/datasets/datasets/llava_dataset.py
 create mode 100644 minigpt4/datasets/datasets/multitask_conversation.py
 create mode 100755 minigpt4/datasets/datasets/text_caps.py
 create mode 100755 minigpt4/datasets/datasets/unnatural_instruction.py
 create mode 100755 minigpt4/datasets/datasets/vg_dataset.py
 create mode 100755 minigpt4/datasets/datasets/vqa_datasets.py
 create mode 100644 train_configs/minigpt_v2_finetune.yaml

diff --git a/jobs/srun_test.sh b/jobs/srun_test.sh
new file mode 100644
index 0000000..3dd5709
--- /dev/null
+++ b/jobs/srun_test.sh
@@ -0,0 +1,30 @@
+
+cd ..
+
+job_name=minigpt4_v2_test
+read LOWERPORT UPPERPORT < /proc/sys/net/ipv4/ip_local_port_range
+while :
+do
+    PORT="`shuf -i $LOWERPORT-$UPPERPORT -n 1`"
+    ss -lpn | grep -q ":$PORT " || break
+done
+
+
+#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/336_final_v1_gqa.yaml
+
+
+#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/448_final_v1_gqa_ablation2.yaml
+torchrun --master-port ${PORT} --nproc-per-node 2 train.py --cfg-path train_configs/minigpt_v2_finetune.yaml
+
+#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path finetune_conversation_ablation/conversation_v2_last_336_test.yaml
+
+#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/336_final_v1_13B.yaml
+
+# torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/448_v2_llama2.yaml
+#accelerate launch train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/336_v2_llama2.yaml
+
+
+# torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/336_v2_llama2_clip_encoder.yaml
+
+#best_data_ratio_336_full_dataset_lr2e4_v1.yaml
+
diff --git a/minigpt4/configs/datasets/aokvqa/defaults.yaml b/minigpt4/configs/datasets/aokvqa/defaults.yaml
new file mode 100755
index 0000000..79d2054
--- /dev/null
+++ b/minigpt4/configs/datasets/aokvqa/defaults.yaml
@@ -0,0 +1,29 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + aok_vqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + # annotations: + # train: + # url: + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json + # storage: + # - /path/to/aokvqa/annotations/aokvqa_v1p0_train.json + # images: + # storage: /path/to/coco/images/ + + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json + storage: + - /ibex/project/c2133/minigpt4_v2_dataset/aokvqa/annotations/aokvqa_v1p0_train.json + images: + storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/ \ No newline at end of file diff --git a/minigpt4/configs/datasets/coco/caption.yaml b/minigpt4/configs/datasets/coco/caption.yaml new file mode 100644 index 0000000..873c286 --- /dev/null +++ b/minigpt4/configs/datasets/coco/caption.yaml @@ -0,0 +1,38 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + coco_caption: # name of the dataset builder + # dataset_card: dataset_card/coco_caption.md + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + # build_info: + # # Be careful not to append minus sign (-) before split to avoid itemizing + # annotations: + # train: + # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json + # md5: aa31ac474cf6250ebb81d18348a07ed8 + # storage: /path/to/coco_caption/annotations/coco_karpathy_train.json + # images: + # storage: /path/to/coco/images/ + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json + md5: aa31ac474cf6250ebb81d18348a07ed8 + storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_train.json + # val: + # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json + # md5: b273847456ef5580e33713b1f7de52a0 + # storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_val.json + # test: + # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json + # md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 + # storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_test.json + images: + storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg \ No newline at end of file diff --git a/minigpt4/configs/datasets/coco/defaults_vqa.yaml b/minigpt4/configs/datasets/coco/defaults_vqa.yaml new file mode 100755 index 0000000..87ae494 --- /dev/null +++ b/minigpt4/configs/datasets/coco/defaults_vqa.yaml @@ -0,0 +1,33 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + coco_vqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + + # annotations: + # train: + # url: + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json + # storage: + # - /path/to/vqav2/annotations/vqa_train.json + # - /path/to/vqav2/coco/annotations/vqa_val.json + # images: + # storage: /path/to/coco/images/ + + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json + storage: + - /ibex/project/c2133/minigpt4_v2_dataset/vqav2/annotations/vqa_train.json + - /ibex/project/c2133/minigpt4_v2_dataset/vqav2/coco/annotations/vqa_val.json + images: + storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg \ No newline at end of file diff --git a/minigpt4/configs/datasets/coco_bbox/invrefcoco.yaml b/minigpt4/configs/datasets/coco_bbox/invrefcoco.yaml new file mode 100755 index 0000000..580694b --- /dev/null +++ b/minigpt4/configs/datasets/coco_bbox/invrefcoco.yaml @@ -0,0 +1,8 @@ +datasets: + invrefcoco: + data_type: images + build_info: + image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train + ann_path: /ibex/project/c2133/object_detection_datasets/ + dataset: invrefcoco + splitBy: unc \ No newline at end of file diff --git a/minigpt4/configs/datasets/coco_bbox/invrefcocog.yaml b/minigpt4/configs/datasets/coco_bbox/invrefcocog.yaml new file mode 100755 index 0000000..67af2eb --- /dev/null +++ b/minigpt4/configs/datasets/coco_bbox/invrefcocog.yaml @@ -0,0 +1,8 @@ +datasets: + invrefcocog: + data_type: images + build_info: + image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train + ann_path: /ibex/project/c2133/object_detection_datasets/ + dataset: invrefcocog + splitBy: umd \ No newline at end of file diff --git a/minigpt4/configs/datasets/coco_bbox/invrefcocop.yaml b/minigpt4/configs/datasets/coco_bbox/invrefcocop.yaml new file mode 100755 index 0000000..576004e --- /dev/null +++ b/minigpt4/configs/datasets/coco_bbox/invrefcocop.yaml @@ -0,0 +1,8 @@ +datasets: + invrefcocop: + data_type: images + build_info: + image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train + ann_path: /ibex/project/c2133/object_detection_datasets/ + dataset: invrefcoco+ + splitBy: unc \ No newline at end of file diff --git a/minigpt4/configs/datasets/coco_bbox/refcoco.yaml b/minigpt4/configs/datasets/coco_bbox/refcoco.yaml new file mode 100755 index 0000000..edf16ba --- /dev/null +++ b/minigpt4/configs/datasets/coco_bbox/refcoco.yaml @@ -0,0 +1,8 @@ +datasets: + refcoco: + data_type: images + build_info: + image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train + ann_path: /ibex/project/c2133/object_detection_datasets/ + dataset: refcoco + splitBy: unc \ No newline at end of file diff --git a/minigpt4/configs/datasets/coco_bbox/refcocog.yaml b/minigpt4/configs/datasets/coco_bbox/refcocog.yaml new file mode 100755 index 0000000..5ed7cc9 --- /dev/null +++ b/minigpt4/configs/datasets/coco_bbox/refcocog.yaml @@ -0,0 +1,8 @@ +datasets: + refcocog: + data_type: images + build_info: + image_path: 
/ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train + ann_path: /ibex/project/c2133/object_detection_datasets/ + dataset: refcocog + splitBy: umd \ No newline at end of file diff --git a/minigpt4/configs/datasets/coco_bbox/refcocop.yaml b/minigpt4/configs/datasets/coco_bbox/refcocop.yaml new file mode 100755 index 0000000..4e3af6f --- /dev/null +++ b/minigpt4/configs/datasets/coco_bbox/refcocop.yaml @@ -0,0 +1,8 @@ +datasets: + refcocop: + data_type: images + build_info: + image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train + ann_path: /ibex/project/c2133/object_detection_datasets/ + dataset: refcoco+ + splitBy: unc \ No newline at end of file diff --git a/minigpt4/configs/datasets/flickr/caption_to_phrase.yaml b/minigpt4/configs/datasets/flickr/caption_to_phrase.yaml new file mode 100755 index 0000000..0901b6f --- /dev/null +++ b/minigpt4/configs/datasets/flickr/caption_to_phrase.yaml @@ -0,0 +1,6 @@ +datasets: + CaptionToPhrase: + data_type: images + build_info: + image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images + ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_grounding_phrase5_v2_last.json diff --git a/minigpt4/configs/datasets/flickr/default.yaml b/minigpt4/configs/datasets/flickr/default.yaml new file mode 100755 index 0000000..a732dd4 --- /dev/null +++ b/minigpt4/configs/datasets/flickr/default.yaml @@ -0,0 +1,6 @@ +datasets: + grounded_detailed_image_caption: + data_type: images + build_info: + image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images + ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_grounding_phrase5_last.json diff --git a/minigpt4/configs/datasets/flickr/object_to_phrase.yaml b/minigpt4/configs/datasets/flickr/object_to_phrase.yaml new file mode 100755 index 0000000..c0189b6 --- /dev/null +++ b/minigpt4/configs/datasets/flickr/object_to_phrase.yaml @@ -0,0 +1,6 @@ +datasets: + ObjectToPhrase: + data_type: images + build_info: + image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images + ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_phrase2bbox_resample_last.json diff --git a/minigpt4/configs/datasets/gqa/balanced_val.yaml b/minigpt4/configs/datasets/gqa/balanced_val.yaml new file mode 100644 index 0000000..5a9e55c --- /dev/null +++ b/minigpt4/configs/datasets/gqa/balanced_val.yaml @@ -0,0 +1,33 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + gqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + # build_info: + # # Be careful not to append minus sign (-) before split to avoid itemizing + # annotations: + # train: + # url: + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json + # storage: + # - /path/to/gqa/annotations/train_balanced_questions.json + + # images: + # storage: /path/to/gqa/images/ + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json + storage: + - /ibex/project/c2133/minigpt4_v2_dataset/gqa/annotations/train_balanced_questions.json + images: + storage: /ibex/project/c2133/minigpt4_v2_dataset/gqa/images_copy/ \ No newline at end of file diff --git a/minigpt4/configs/datasets/llava/conversation.yaml b/minigpt4/configs/datasets/llava/conversation.yaml new file mode 100755 index 0000000..6978069 --- /dev/null +++ b/minigpt4/configs/datasets/llava/conversation.yaml @@ -0,0 +1,12 @@ +datasets: + # llava_conversation: + # data_type: images + # build_info: + # image_path: /path/to/coco/images + # ann_path: /path/to/llava/conversation_58k.json + + llava_conversation: + data_type: images + build_info: + image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train + ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/conversation_58k.json \ No newline at end of file diff --git a/minigpt4/configs/datasets/llava/detail.yaml b/minigpt4/configs/datasets/llava/detail.yaml new file mode 100755 index 0000000..f4d0f0a --- /dev/null +++ b/minigpt4/configs/datasets/llava/detail.yaml @@ -0,0 +1,12 @@ +datasets: + # llava_detail: + # data_type: images + # build_info: + # image_path: /path/to/coco/images + # ann_path: /path/to/llava/detail_23k.json + + llava_detail: + data_type: images + build_info: + image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train + ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/detail_23k.json \ No newline at end of file diff --git a/minigpt4/configs/datasets/llava/reason.yaml b/minigpt4/configs/datasets/llava/reason.yaml new file mode 100755 index 0000000..ea6cb06 --- /dev/null +++ b/minigpt4/configs/datasets/llava/reason.yaml @@ -0,0 +1,12 @@ +datasets: + # llava_reason: + # data_type: images + # build_info: + # image_path: /path/to/coco/images + # ann_path: /path/to/llava/complex_reasoning_77k.json + + llava_reason: + data_type: images + build_info: + image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train + ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/complex_reasoning_77k.json \ No newline at end of file diff --git a/minigpt4/configs/datasets/multitask_conversation/default.yaml b/minigpt4/configs/datasets/multitask_conversation/default.yaml new file mode 100644 index 0000000..29200ee --- /dev/null +++ b/minigpt4/configs/datasets/multitask_conversation/default.yaml @@ -0,0 +1,14 @@ +datasets: + # multitask_conversation: + # data_type: images + # build_info: + + # image_path: /path/to/coco/images + # ann_path: /path/to/multitask_conversation/multi_task_conversation.json + + multitask_conversation: + data_type: images + build_info: + + image_path: 
/ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train + ann_path: /ibex/project/c2133/minigpt4_v2_dataset/multitask_conversation/multi_task_conversation.json \ No newline at end of file diff --git a/minigpt4/configs/datasets/nlp/unnatural_instruction.yaml b/minigpt4/configs/datasets/nlp/unnatural_instruction.yaml new file mode 100644 index 0000000..d9f31de --- /dev/null +++ b/minigpt4/configs/datasets/nlp/unnatural_instruction.yaml @@ -0,0 +1,10 @@ +datasets: + # unnatural_instruction: + # data_type: text + # build_info: + # ann_path: /path/to/unnatural-instructions/data/unnatural_instruction_filer.json + + unnatural_instruction: + data_type: text + build_info: + ann_path: /ibex/project/c2133/minigpt4_v2_dataset/unnatural-instructions/data/unnatural_instruction_filer.json \ No newline at end of file diff --git a/minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml b/minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml new file mode 100755 index 0000000..d2f6a94 --- /dev/null +++ b/minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml @@ -0,0 +1,12 @@ +datasets: + # ocrvqa: + # data_type: images + # build_info: + # image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images + # ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json + + ocrvqa: + data_type: images + build_info: + image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images + ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json \ No newline at end of file diff --git a/minigpt4/configs/datasets/okvqa/defaults.yaml b/minigpt4/configs/datasets/okvqa/defaults.yaml new file mode 100755 index 0000000..402212c --- /dev/null +++ b/minigpt4/configs/datasets/okvqa/defaults.yaml @@ -0,0 +1,36 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + ok_vqa: + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + # build_info: + # # Be careful not to append minus sign (-) before split to avoid itemizing + # annotations: + # train: + # url: + # # TODO make this order insensitive + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json + # storage: + # - /path/to/okvqa/annotations/okvqa_train.json + # images: + # storage: /path/to/okvqa/images + + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + train: + url: + # TODO make this order insensitive + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json + # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json + storage: + - /ibex/project/c2133/minigpt4_v2_dataset/okvqa_v2/annotations/okvqa_train.json + images: + storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg \ No newline at end of file diff --git a/minigpt4/configs/datasets/textcaps/caption.yaml b/minigpt4/configs/datasets/textcaps/caption.yaml new file mode 100755 index 0000000..61a92c7 --- /dev/null +++ b/minigpt4/configs/datasets/textcaps/caption.yaml @@ -0,0 +1,16 @@ +datasets: + # textcaps_caption: + # data_type: images + + # build_info: + # image_path: /path/to/TextCaps/train_images + # ann_path: /path/to/TextCaps/TextCaps_0.1_train.json + + 
textcaps_caption: + data_type: images + + build_info: + image_path: /ibex/project/c2133/minigpt4_v2_dataset/TextCaps/train_images + ann_path: /ibex/project/c2133/minigpt4_v2_dataset/TextCaps/TextCaps_0.1_train.json + + diff --git a/minigpt4/configs/datasets/vg/ref.yaml b/minigpt4/configs/datasets/vg/ref.yaml new file mode 100755 index 0000000..8b793a2 --- /dev/null +++ b/minigpt4/configs/datasets/vg/ref.yaml @@ -0,0 +1,10 @@ +datasets: + # refvg: + # data_type: images + # build_info: + # data_dir: /path/to/visual_genome + + refvg: + data_type: images + build_info: + data_dir: /ibex/project/c2133/minigpt4_v2_dataset/visual_genome \ No newline at end of file diff --git a/minigpt4/datasets/builders/image_text_pair_builder.py b/minigpt4/datasets/builders/image_text_pair_builder.py index e5d66b8..c393c4f 100644 --- a/minigpt4/datasets/builders/image_text_pair_builder.py +++ b/minigpt4/datasets/builders/image_text_pair_builder.py @@ -6,6 +6,418 @@ from minigpt4.common.registry import registry from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder from minigpt4.datasets.datasets.laion_dataset import LaionDataset from minigpt4.datasets.datasets.cc_sbu_dataset import CCSBUDataset, CCSBUAlignDataset +from minigpt4.datasets.datasets.text_caps import TextCapDataset +from minigpt4.datasets.datasets.llava_dataset import LlavaDetailDataset, LlavaReasonDataset, LlavaConversationDataset +from minigpt4.datasets.datasets.unnatural_instruction import UnnaturalDataset +from minigpt4.datasets.datasets.multitask_conversation import MultiTaskConversationDataset +from minigpt4.datasets.datasets.flickr import GroundedDetailDataset,CaptionToObjectDataset,PhraseToObjectDataset +from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset +from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset +from minigpt4.datasets.datasets.gqa_datasets import GQADataset +from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset +from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset +from minigpt4.datasets.datasets.doc_dataset import OCRVQADataset + + + +@registry.register_builder("multitask_conversation") +class MultitaskConversationBuilder(BaseDatasetBuilder): + train_dataset_cls = MultiTaskConversationDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/multitask_conversation/default.yaml", + } + + def build_datasets(self): + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. + logging.info("Building datasets...") + self.build_processors() + build_info = self.config.build_info + datasets = dict() + + # create datasets + dataset_cls = self.train_dataset_cls + datasets['train'] = dataset_cls( + vis_processor=self.vis_processors["train"], + text_processor=self.text_processors["train"], + ann_path=build_info.ann_path, + vis_root=build_info.image_path, + ) + + return datasets + + +@registry.register_builder("unnatural_instruction") +class UnnaturalInstructionBuilder(BaseDatasetBuilder): + train_dataset_cls = UnnaturalDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/nlp/unnatural_instruction.yaml", + } + + def build_datasets(self): + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. 
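+        # note: this is a text-only instruction dataset, so no vis_processor is built here;
+        # only the text processor and the annotation path are forwarded to UnnaturalDataset below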
+ logging.info("Building datasets...") + self.build_processors() + build_info = self.config.build_info + datasets = dict() + + # create datasets + dataset_cls = self.train_dataset_cls + datasets['train'] = dataset_cls( + text_processor=self.text_processors["train"], + ann_path=build_info.ann_path, + ) + + return datasets + + + +@registry.register_builder("llava_detail") +class LlavaDetailBuilder(BaseDatasetBuilder): + train_dataset_cls = LlavaDetailDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/llava/detail.yaml", + } + + def build_datasets(self): + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. + logging.info("Building datasets...") + self.build_processors() + build_info = self.config.build_info + datasets = dict() + + # create datasets + dataset_cls = self.train_dataset_cls + datasets['train'] = dataset_cls( + vis_processor=self.vis_processors["train"], + text_processor=self.text_processors["train"], + ann_path=build_info.ann_path, + vis_root=build_info.image_path, + ) + + return datasets + + + +@registry.register_builder("llava_reason") +class LlavaReasonBuilder(BaseDatasetBuilder): + train_dataset_cls = LlavaReasonDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/llava/reason.yaml", + } + + def build_datasets(self): + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. + logging.info("Building datasets...") + self.build_processors() + build_info = self.config.build_info + datasets = dict() + + # create datasets + dataset_cls = self.train_dataset_cls + datasets['train'] = dataset_cls( + vis_processor=self.vis_processors["train"], + text_processor=self.text_processors["train"], + ann_path=build_info.ann_path, + vis_root=build_info.image_path, + ) + + return datasets + +@registry.register_builder("llava_conversation") +class LlavaReasonBuilder(BaseDatasetBuilder): + train_dataset_cls = LlavaConversationDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/llava/conversation.yaml", + } + + def build_datasets(self): + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. + logging.info("Building datasets...") + self.build_processors() + build_info = self.config.build_info + datasets = dict() + + # create datasets + dataset_cls = self.train_dataset_cls + datasets['train'] = dataset_cls( + vis_processor=self.vis_processors["train"], + text_processor=self.text_processors["train"], + ann_path=build_info.ann_path, + vis_root=build_info.image_path, + ) + + return datasets + + +class AllRefCOCOBuilder(BaseDatasetBuilder): + + def build_datasets(self): + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. 
+ logging.info("Building datasets...") + self.build_processors() + + build_info = self.config.build_info + image_path = build_info.image_path + ann_path = build_info.ann_path + + datasets = dict() + + if not os.path.exists(image_path): + warnings.warn("image path {} does not exist.".format(image_path)) + if not os.path.exists(ann_path): + warnings.warn("ann path {} does not exist.".format(ann_path)) + + # create datasets + dataset_cls = self.train_dataset_cls + datasets['train'] = dataset_cls( + vis_processor=self.vis_processors["train"], + text_processor=self.text_processors["train"], + ann_path=ann_path, + vis_root=image_path, + dataset=build_info.dataset, + splitBy=build_info.splitBy + ) + + return datasets + + +@registry.register_builder("refcoco") +class RefCOCOBuilder(AllRefCOCOBuilder): + train_dataset_cls = ReferCOCODataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco_bbox/refcoco.yaml", + } + +@registry.register_builder("refcocop") +class RefCOCOPBuilder(AllRefCOCOBuilder): + train_dataset_cls = ReferCOCODataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco_bbox/refcocop.yaml", + } + + +@registry.register_builder("refcocog") +class RefCOCOGBuilder(AllRefCOCOBuilder): + train_dataset_cls = ReferCOCODataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco_bbox/refcocog.yaml", + } + +@registry.register_builder("invrefcoco") +class RefCOCOBuilder(AllRefCOCOBuilder): + train_dataset_cls = InvReferCOCODataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco_bbox/invrefcoco.yaml", + } + + +@registry.register_builder("invrefcocop") +class RefCOCOPBuilder(AllRefCOCOBuilder): + train_dataset_cls = InvReferCOCODataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco_bbox/invrefcocop.yaml", + } + + +@registry.register_builder("invrefcocog") +class RefCOCOGBuilder(AllRefCOCOBuilder): + train_dataset_cls = InvReferCOCODataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco_bbox/invrefcocog.yaml", + } + +@registry.register_builder("refvg") +class RefVisualGenomeBuilder(BaseDatasetBuilder): + train_dataset_cls = ReferVisualGenomeDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/vg/ref.yaml", + } + + def build_datasets(self): + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. 
+ logging.info("Building datasets...") + self.build_processors() + + build_info = self.config.build_info + data_dir = build_info.data_dir + datasets = dict() + + # create datasets + dataset_cls = self.train_dataset_cls + datasets['train'] = dataset_cls( + vis_processor=self.vis_processors["train"], + text_processor=self.text_processors["train"], + data_dir=data_dir, + ) + + return datasets + + +@registry.register_builder("textcaps_caption") +class TextcapCaptionBuilder(BaseDatasetBuilder): + train_dataset_cls = TextCapDataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/textcaps/caption.yaml"} + + def _download_ann(self): + pass + + def _download_vis(self): + pass + + def build(self): + self.build_processors() + + build_info = self.config.build_info + + datasets = dict() + split = "train" + + # create datasets + # [NOTE] return inner_datasets (wds.DataPipeline) + dataset_cls = self.train_dataset_cls + datasets[split] = dataset_cls( + vis_processor=self.vis_processors[split], + text_processor=self.text_processors[split], + ann_path=build_info.ann_path, + vis_root=build_info.image_path, + ) + + return datasets + +@registry.register_builder("coco_vqa") +class COCOVQABuilder(BaseDatasetBuilder): + train_dataset_cls = COCOVQADataset + + DATASET_CONFIG_DICT = { + "default": "configs/datasets/coco/defaults_vqa.yaml", + } + +@registry.register_builder("aok_vqa") +class AOKVQABuilder(BaseDatasetBuilder): + train_dataset_cls = AOKVQADataset + + DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"} + + +@registry.register_builder("gqa") +class GQABuilder(BaseDatasetBuilder): + train_dataset_cls = GQADataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/gqa/balanced_val.yaml", + } + + + + +@registry.register_builder("grounded_detailed_image_caption") +class GroundedCaptionBuilder(BaseDatasetBuilder): + train_dataset_cls = GroundedDetailDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/flickr/default.yaml", + } + + def build_datasets(self): + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. + logging.info("Building datasets...") + self.build_processors() + build_info = self.config.build_info + datasets = dict() + + # create datasets + dataset_cls = self.train_dataset_cls + datasets['train'] = dataset_cls( + vis_processor=self.vis_processors["train"], + text_processor=self.text_processors["train"], + ann_path=build_info.ann_path, + vis_root=build_info.image_path, + ) + + return datasets + + +@registry.register_builder("CaptionToPhrase") +class CaptionToPhraseBuilder(BaseDatasetBuilder): + train_dataset_cls = CaptionToObjectDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/flickr/caption_to_phrase.yaml", + } + + def build_datasets(self): + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. 
+ logging.info("Building datasets...") + self.build_processors() + build_info = self.config.build_info + datasets = dict() + + # create datasets + dataset_cls = self.train_dataset_cls + datasets['train'] = dataset_cls( + vis_processor=self.vis_processors["train"], + text_processor=self.text_processors["train"], + ann_path=build_info.ann_path, + vis_root=build_info.image_path, + ) + + return datasets + +@registry.register_builder("ObjectToPhrase") +class CaptionToPhraseBuilder(BaseDatasetBuilder): + train_dataset_cls = PhraseToObjectDataset + DATASET_CONFIG_DICT = { + "default": "configs/datasets/flickr/object_to_phrase.yaml", + } + + def build_datasets(self): + # at this point, all the annotations and image/videos should be all downloaded to the specified locations. + logging.info("Building datasets...") + self.build_processors() + build_info = self.config.build_info + datasets = dict() + + # create datasets + dataset_cls = self.train_dataset_cls + datasets['train'] = dataset_cls( + vis_processor=self.vis_processors["train"], + text_processor=self.text_processors["train"], + ann_path=build_info.ann_path, + vis_root=build_info.image_path, + ) + + return datasets + + + + +class DocumentVQABuilder(BaseDatasetBuilder): + def _download_ann(self): + pass + + def _download_vis(self): + pass + + def build(self): + self.build_processors() + build_info = self.config.build_info + + datasets = dict() + split = "train" + + dataset_cls = self.train_dataset_cls + datasets[split] = dataset_cls( + vis_processor=self.vis_processors[split], + text_processor=self.text_processors[split], + vis_root=build_info.image_path, + ann_path=build_info.ann_path + ) + + return datasets + + +@registry.register_builder("ocrvqa") +class OCRVQABuilder(DocumentVQABuilder): + train_dataset_cls = OCRVQADataset + DATASET_CONFIG_DICT = {"default": "configs/datasets/ocrvqa/ocrvqa.yaml"} @registry.register_builder("cc_sbu") diff --git a/minigpt4/datasets/datasets/aok_vqa_datasets.py b/minigpt4/datasets/datasets/aok_vqa_datasets.py new file mode 100755 index 0000000..b65b42d --- /dev/null +++ b/minigpt4/datasets/datasets/aok_vqa_datasets.py @@ -0,0 +1,212 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from collections import OrderedDict +import json +import os +import random +import torch + +from PIL import Image + +from minigpt4.datasets.datasets.vqa_datasets import VQADataset #, VQAEvalDataset + + +class __DisplMixin: + def displ_item(self, index): + sample, ann = self.__getitem__(index), self.annotation[index] + return OrderedDict( + { + "file": ann["image"], + "question": ann["question"], + "question_id": ann["question_id"], + "direct_answers": "; ".join(ann["direct_answers"]), + "choices": "; ".join(ann["choices"]), + "correct_choice": ann["choices"][ann["correct_choice_idx"]], + "image": sample["image"], + } + ) + + +class AOKVQADataset(VQADataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.instruction_pool =[ + "[vqa] {}", + "[vqa] Based on the image, respond to this question with a short answer: {}" + ] + + exist_annotation = [] + for ann in self.annotation: + image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1]) + if os.path.exists(image_path): + exist_annotation.append(ann) + self.annotation = exist_annotation + + def get_data(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + answer_key = "direct_answers" + + # print("answer key", answer_key) + # for answer in ann[answer_key]: + # print(answer) + + answer_weight = {} + for answer in ann[answer_key]: + if answer in answer_weight.keys(): + answer_weight[answer] += 1 / len(ann[answer_key]) + else: + answer_weight[answer] = 1 / len(ann[answer_key]) + + answers = list(answer_weight.keys()) + weights = list(answer_weight.values()) + + answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights + + return { + "image": image, + "question": question, + "answer": answer, + } + + def __getitem__(self, index): + data = self.get_data(index) + question = self.text_processor(data["question"]) + instruction = random.choice(self.instruction_pool).format(question) + + instruction = " {} ".format(instruction) + + answer = self.text_processor(data['answer']) + + + return { + "image": data['image'], + "instruction_input": instruction, + "answer": answer, + } + + +class AOKVQGDataset(AOKVQADataset): + + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + self.instruction_pool = [ + 'Given the image, generate a question whose answer is: {}', + 'Based on the image, provide a question with the answer: {}', + 'Given the visual representation, create a question for which the answer is "{}"', + 'From the image provided, craft a question that leads to the reply: {}', + 'Considering the picture, come up with a question where the answer is: {}', + 'Taking the image into account, generate an question that has the answer: {}' + ] + + def __getitem__(self, index): + data = self.get_data(index) + instruction = random.choice(self.instruction_pool).format(data['answer']) + + return { + "image": data['image'], + "instruction_input": instruction, + "answer": data['question'], + } + + +# class AOKVQAEvalDataset(VQAEvalDataset, 
__DisplMixin): +# def __init__(self, vis_processor, text_processor, vis_root, ann_paths): +# """ +# vis_root (string): Root directory of images (e.g. coco/images/) +# ann_root (string): directory to store the annotation file +# """ +# +# self.vis_root = vis_root +# +# self.annotation = json.load(open(ann_paths[0])) +# +# answer_list_path = ann_paths[1] +# if os.path.exists(answer_list_path): +# self.answer_list = json.load(open(answer_list_path)) +# else: +# self.answer_list = None +# +# try: +# self.coco_fmt_qust_file = ann_paths[2] +# self.coco_fmt_anno_file = ann_paths[3] +# except IndexError: +# self.coco_fmt_qust_file = None +# self.coco_fmt_anno_file = None +# +# self.vis_processor = vis_processor +# self.text_processor = text_processor +# +# self._add_instance_ids() +# +# def collater(self, samples): +# ( +# image_list, +# question_list, +# question_id_list, +# instance_id_list, +# choices_list, +# correct_choice_idx_list, +# direct_answers_list, +# ) = ([], [], [], [], [], [], []) +# +# for sample in samples: +# image_list.append(sample["image"]) +# question_list.append(sample["text_input"]) +# question_id_list.append(sample["question_id"]) +# instance_id_list.append(sample["instance_id"]) +# choices_list.append(sample["choices"]) +# correct_choice_idx_list.append(sample["correct_choice_idx"]) +# direct_answers_list.append(sample["direct_answers"]) +# +# return { +# "image": torch.stack(image_list, dim=0), +# "text_input": question_list, +# "question_id": question_id_list, +# "instance_id": instance_id_list, +# "choices": choices_list, +# "correct_choice_idx": correct_choice_idx_list, +# "direct_answers": direct_answers_list, +# } +# +# def __getitem__(self, index): +# ann = self.annotation[index] +# +# image_path = os.path.join(self.vis_root, ann["image"]) +# image = Image.open(image_path).convert("RGB") +# +# image = self.vis_processor(image) +# question = self.text_processor(ann["question"]) +# +# choices = ann["choices"] +# if "correct_choice_idx" in ann: +# correct_choice_idx = ann["correct_choice_idx"] +# else: +# correct_choice_idx = None +# +# if "direct_answers" in ann: +# direct_answers = ann["direct_answers"] +# else: +# direct_answers = None +# +# return { +# "image": image, +# "text_input": question, +# "question_id": ann["question_id"], +# "instance_id": ann["instance_id"], +# "choices": choices, +# "correct_choice_idx": correct_choice_idx, +# "direct_answers": direct_answers, +# } diff --git a/minigpt4/datasets/datasets/coco_caption.py b/minigpt4/datasets/datasets/coco_caption.py new file mode 100755 index 0000000..3df5279 --- /dev/null +++ b/minigpt4/datasets/datasets/coco_caption.py @@ -0,0 +1,122 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +import json +import torch +import numpy as np + +from PIL import Image +from PIL import ImageFile + +ImageFile.LOAD_TRUNCATED_IMAGES = True + +from minigpt4.datasets.datasets.caption_datasets import COCOCaptionDataset, CaptionEvalDataset + +COCOCapDataset = COCOCaptionDataset + + + + + +class COCOCapEvalDataset(CaptionEvalDataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + split (string): val or test + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + + img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1] + + return { + "image": image, + "image_id": img_id, + "instance_id": ann["instance_id"], + } + + +class NoCapsEvalDataset(CaptionEvalDataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + split (string): val or test + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + + img_id = ann["img_id"] + + return { + "image": image, + "image_id": img_id, + "instance_id": ann["instance_id"], + } + + +class RefCOCOEvalData(torch.utils.data.Dataset): + def __init__(self, loaded_data, vis_processor, root_path): + self.loaded_data = loaded_data + self.root_path = root_path + self.vis_processor = vis_processor + + def __len__(self): + return len(self.loaded_data) + + def __getitem__(self, idx): + data = self.loaded_data[idx] + img_id = data['img_id'] + sent = data['sents'] + image_path = os.path.join(self.root_path, f'{img_id[:27]}.jpg') + image = Image.open(image_path).convert('RGB') + image = self.vis_processor(image) + # question = f"[refer] {sent}" + question = f"[refer] where is {sent}?" + # question = f"where is the bounding box location of {sent}?" + return image, question, img_id + +class EvalCaptionData(torch.utils.data.Dataset): + def __init__(self, loaded_data, vis_processor, root_path): + self.loaded_data = loaded_data + self.root_path = root_path + self.vis_processor = vis_processor + ann = dict() + for item in self.loaded_data: + image_id = item['image_id'] + ann[image_id] = item['image'] + self.ann = [{'image_id':image_id, 'image': ann[image_id]} for image_id in ann] + + def __len__(self): + return len(self.ann) + + def __getitem__(self, idx): + data = self.ann[idx] + image_id = data['image_id'] + img_file = data['image'].split('/')[-1] + image_path = os.path.join(self.root_path, img_file) + image = Image.open(image_path).convert('RGB') + + image = self.vis_processor(image) + question = f"[caption] please describe this image?" 
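+        # every evaluation image is paired with the same fixed prompt; the leading [caption] tag is
+        # the task identifier, mirroring the [vqa]/[refer]/[identify] tags used in the training datasets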
+ return image, question, image_id diff --git a/minigpt4/datasets/datasets/coco_dataset.py b/minigpt4/datasets/datasets/coco_dataset.py new file mode 100755 index 0000000..e89ba34 --- /dev/null +++ b/minigpt4/datasets/datasets/coco_dataset.py @@ -0,0 +1,667 @@ +import os +import json +import pickle +import random +import time +import itertools + +import numpy as np +from PIL import Image +import skimage.io as io +import matplotlib.pyplot as plt +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon, Rectangle +from torch.utils.data import Dataset +import webdataset as wds + +from minigpt4.datasets.datasets.base_dataset import BaseDataset +from minigpt4.datasets.datasets.caption_datasets import CaptionDataset +import threading + +# Global lock +lock = threading.Lock() + +def sample_object_bbox(objects, bbox): + + + + zipped_list = list(zip(objects, bbox)) + + # Shuffle the zipped list + random.shuffle(zipped_list) + + # Generate the new string with interleaved format + # interleaved_list = str([{'{},{}'.format(obj, str(bbox).replace("[","").replace("]","") )} for obj, bbox in zipped_list]) + + # print("objects", objects) + # print("bbox",bbox) + + interleaved_list = str([{'{},{}'.format(obj, bbox.strip())} for obj, bbox in zipped_list]).replace("'","").replace("[","").replace("]","") + + # interleaved_list = " "+interleaved_list + # print(interleaved_list) + return interleaved_list + +def bbox_to_object(objects, bbox): + + index_sample = random.sample(range(len(objects)),1)[0] + + sample_object = str(objects[index_sample]) + sample_bbox = bbox[index_sample] + # sample_center_point = center_point[index_sample] + + sample_bbox = r"{"+str(sample_bbox) + "}" + return sample_bbox, sample_object + +def object_to_bbox(objects, bbox, center_point): + index_sample = random.sample(range(len(objects)),1)[0] + + sample_object = objects[index_sample] + sample_bbox = bbox[index_sample] + sample_center_point = center_point[index_sample] + + instruction = "what is object and the bounding box in the center coordinate of "+str(sample_center_point)+"? 
" + answer = "{"+str(sample_object)+","+str(sample_bbox)+"}" + + + + return instruction, answer + + +class COCOBBOXDataset(BaseDataset): + def __init__(self, vis_processor, text_processor, location): + super().__init__(vis_processor=vis_processor, text_processor=text_processor) + + print("coco box dataset") + self.inner_dataset = wds.DataPipeline( + wds.ResampledShards(location), + wds.tarfile_to_samples(handler=wds.warn_and_continue), + wds.shuffle(1000, handler=wds.warn_and_continue), + wds.decode("pilrgb", handler=wds.warn_and_continue), + wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), + wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), + wds.map(self.to_dict, handler=wds.warn_and_continue), + ) + + def to_dict(self, sample): + objects = sample[1]["objects"] + boxes = sample[1]["bbox"] + caption = sample[1]["caption"] + + + new_bboxes = [] + + image_size = sample[0].shape[1] + image_size = 100 + for index in range(len(boxes)): + box = boxes[index] + x1 = int(box[0]*image_size) + y1 = int(box[1]*image_size) + x2 = x1 + int(box[2]*image_size) + y2 = y1 + int(box[3]*image_size) + assert x1>=0 and x1<=image_size + assert x2>=0 and x2<=image_size + assert y1>=0 and y1<=image_size + assert y2>=0 and y2<=image_size + + new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">" + # new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">" + new_bboxes.append(new_bbox) + + instruction = r"Given an image, identify the objects and their bounding boxes in the format of {object,x1 y1 x2 y2}. " + instruction = " {}".format(self.text_processor(instruction)) + + answer = sample_object_bbox(objects, new_bboxes) + + # print("instruction",instruction) + # print("answer", answer) + + return { + "image": sample[0], + "instruction_input": instruction, + "answer": answer, + "data_type": "bbox", + "question_split": True + } + + +class COCOBboxToObjectDataset(BaseDataset): + def __init__(self, vis_processor, text_processor, location): + super().__init__(vis_processor=vis_processor, text_processor=text_processor) + + + self.inner_dataset = wds.DataPipeline( + wds.ResampledShards(location), + wds.tarfile_to_samples(handler=wds.warn_and_continue), + wds.shuffle(1000, handler=wds.warn_and_continue), + wds.decode("pilrgb", handler=wds.warn_and_continue), + wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), + wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), + wds.map(self.to_dict, handler=wds.warn_and_continue), + ) + + + self.instruction_pool = [ + " what object is in this bounding box location {} ", + " what object is in this location {} ", + " identify the object present at this location {} ", + " what is it in bounding box location{} ", + " describe this object in {} ", + " this {} is ", + " the object in {} is ", + " please tell me what is inside the bounding box position {} ", + " what can you find in the bounding box area at position {}? 
", + " what is the object occupying this area {} ", + " could you identify the content within the bounding box located at {} ", + ] + + def to_dict(self, sample): + + objects = sample[1]["objects"] + boxes = sample[1]["bbox"] + + new_bboxes = [] + + image_size = sample[0].shape[1] + image_size=100 + for index in range(len(boxes)): + box = boxes[index] + x1 = int(box[0]*image_size) + y1 = int(box[1]*image_size) + x2 = x1 + int(box[2]*image_size) + y2 = y1 + int(box[3]*image_size) + assert x1>=0 and x1<=image_size + assert x2>=0 and x2<=image_size + assert y1>=0 and y1<=image_size + assert y2>=0 and y2<=image_size + + new_bbox = "<"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">" + new_bboxes.append(new_bbox) + + bbox, object = bbox_to_object(objects, new_bboxes) + + instruction = random.choice(self.instruction_pool).format(bbox) + return { + "image": sample[0], + "instruction_input": instruction, + "answer": self.text_processor(object), + "data_type": "bbox", + "question_split": True + } + + + +# class ReferCOCODataset(Dataset): +# def __init__(self, vis_processor, text_processor, vis_root, ann_path, dataset='refcoco', splitBy='unc'): +# """ +# vis_root (string): Root directory of images (e.g. coco/images/) +# ann_root (string): directory to store the annotation file +# """ +# self.vis_root = vis_root + +# self.vis_processor = vis_processor +# self.text_processor = text_processor + +# self.refer = REFER(ann_path, vis_root, dataset, splitBy) +# self.ref_ids = self.refer.getRefIds() + + +# self.instruction_pool = [ +# "[refer] {}", +# "[refer] give me the location of {}", +# "[refer] where is {} ?", +# "[refer] from this image, tell me the location of {}", +# "[refer] the location of {} is", +# "[refer] could you tell me the location for {} ?", +# "[refer] where can I locate the {} ?", +# ] + + +# def __len__(self): +# return len(self.ref_ids) + +# def preprocess(self, index): +# ref_id = self.ref_ids[index] +# ref = self.refer.loadRefs(ref_id)[0] + +# image_file = 'COCO_train2014_{:0>12}.jpg'.format(ref["image_id"]) +# image_path = os.path.join(self.vis_root, image_file) +# image = Image.open(image_path).convert("RGB") +# image_orig_size = image.size +# image = self.vis_processor(image) +# image_new_size = [image.shape[1], image.shape[2]] + +# image_new_size = [100,100] + +# sample_sentence = random.choice(ref['sentences'])['raw'] + +# refer_sentence = self.text_processor(sample_sentence) + + +# bbox = self.refer.getRefBox(ref['ref_id']) + +# bbox_to_save = bbox +# image_id_to_save = ref["image_id"] +# ref_id_to_save = ref_id + +# item = {"image":image_id_to_save,"bbox":bbox_to_save,"ref id":ref_id_to_save, "sentence":refer_sentence} + + +# def save_to_file(): +# with lock: +# with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json", "r") as f: +# refer_json = json.load(f) + +# if ref_id_to_save not in refer_json.keys(): +# print(item) +# refer_json[ref_id_to_save] = item + +# with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json", "w") as f: +# json.dump(refer_json, f) + + +# save_to_file() +# # with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json","r") as f: +# # refer_json = json.load(open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json")) + +# # if ref_id_to_save not in refer_json.keys(): +# # print(item) +# # refer_json[ref_id_to_save] = item + +# # with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json","w") as f: +# # json.dump(refer_json,f) + + + + + + + +# bbox = [ +# 
bbox[0] / image_orig_size[0] * image_new_size[0], +# bbox[1] / image_orig_size[1] * image_new_size[1], +# (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0], +# (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1] +# ] +# bbox = [int(x) for x in bbox] +# bbox = "{{<{}><{}><{}><{}>}}".format(*bbox) +# return { +# "image": image, +# "refer_sentence": refer_sentence, +# "bbox": bbox, +# "image_id": ref['image_id'], +# } + +# def __getitem__(self, index): +# data = self.preprocess(index) +# instruction = random.choice(self.instruction_pool).format(data['refer_sentence']) + +# instruction = " {} ".format(instruction) + +# return { +# "image": data['image'], +# "instruction_input": instruction, +# "answer": data['bbox'], +# "image_id": data['image_id'], +# } + + +# class InvReferCOCODataset(ReferCOCODataset): +# def __init__(self, *args, **kwargs): +# super(InvReferCOCODataset, self).__init__(*args, **kwargs) + +# self.instruction_pool = [ +# "[identify] {}", +# "[identify] what object is in this location {}", +# "[identify] identify the object present at this location {}", +# "[identify] what is it in {}", +# "[identify] describe this object in {}", +# "[identify] this {} is", +# "[identify] the object in {} is", +# ] + +# def __getitem__(self, index): +# data = self.preprocess(index) + +# instruction = random.choice(self.instruction_pool).format(data['bbox']) + +# instruction = " {} ".format(instruction) + +# return { +# "image": data['image'], +# "instruction_input": instruction, +# "answer": self.text_processor(data['refer_sentence']), +# "image_id": data['image_id'], +# } + + +class ReferCOCODataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path, dataset='refcoco', splitBy='unc'): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self.refer = REFER(ann_path, vis_root, dataset, splitBy) + self.ref_ids = self.refer.getRefIds(split="train") + + print(dataset, len(self.ref_ids)) + + self.instruction_pool = [ + "[refer] {}", + "[refer] give me the location of {}", + "[refer] where is {} ?", + "[refer] from this image, tell me the location of {}", + "[refer] the location of {} is", + "[refer] could you tell me the location for {} ?", + "[refer] where can I locate the {} ?", + ] + + + def __len__(self): + return len(self.ref_ids) + + def preprocess(self, index): + ref_id = self.ref_ids[index] + ref = self.refer.loadRefs(ref_id)[0] + + image_file = 'COCO_train2014_{:0>12}.jpg'.format(ref["image_id"]) + image_path = os.path.join(self.vis_root, image_file) + image = Image.open(image_path).convert("RGB") + image_orig_size = image.size + image = self.vis_processor(image) + image_new_size = [image.shape[1], image.shape[2]] + + image_new_size = [100,100] + + sample_sentence = random.choice(ref['sentences'])['raw'] + refer_sentence = self.text_processor(sample_sentence) + + + bbox = self.refer.getRefBox(ref['ref_id']) + bbox = [ + bbox[0] / image_orig_size[0] * image_new_size[0], + bbox[1] / image_orig_size[1] * image_new_size[1], + (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0], + (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1] + ] + bbox = [int(x) for x in bbox] + bbox = "{{<{}><{}><{}><{}>}}".format(*bbox) + return { + "image": image, + "refer_sentence": refer_sentence, + "bbox": bbox, + "image_id": ref['image_id'], + } + + def __getitem__(self, index): + data = self.preprocess(index) + instruction = random.choice(self.instruction_pool).format(data['refer_sentence']) + + instruction = " {} ".format(instruction) + + return { + "image": data['image'], + "instruction_input": instruction, + "answer": data['bbox'], + "image_id": data['image_id'], + } + + +class InvReferCOCODataset(ReferCOCODataset): + def __init__(self, *args, **kwargs): + super(InvReferCOCODataset, self).__init__(*args, **kwargs) + + self.instruction_pool = [ + "[identify] {}", + "[identify] what object is in this location {}", + "[identify] identify the object present at this location {}", + "[identify] what is it in {}", + "[identify] describe this object in {}", + "[identify] this {} is", + "[identify] the object in {} is", + ] + + def __getitem__(self, index): + data = self.preprocess(index) + + instruction = random.choice(self.instruction_pool).format(data['bbox']) + + instruction = " {} ".format(instruction) + + return { + "image": data['image'], + "instruction_input": instruction, + "answer": self.text_processor(data['refer_sentence']), + "image_id": data['image_id'], + } + + +class REFER: + def __init__(self, data_root, vis_root, dataset='refcoco', splitBy='unc'): + # provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog + # also provide dataset name and splitBy information + # e.g., dataset = 'refcoco', splitBy = 'unc' + dataset = dataset.split('inv')[-1] # inv dataset is stored in the same path as normal dataset + print('loading dataset %s into memory...' 
% dataset) + self.ann_dir = os.path.join(data_root, dataset) + if dataset in ['refcoco', 'refcoco+', 'refcocog']: + self.vis_root = vis_root + elif dataset == 'refclef': + raise 'No RefClef image data' + else: + raise 'No refer dataset is called [%s]' % dataset + + # load refs from data/dataset/refs(dataset).json + tic = time.time() + ref_file = os.path.join(self.ann_dir, 'refs(' + splitBy + ').p') + self.data = {} + self.data['dataset'] = dataset + self.data['refs'] = pickle.load(open(ref_file, 'rb')) + + # load annotations from data/dataset/instances.json + instances_file = os.path.join(self.ann_dir, 'instances.json') + instances = json.load(open(instances_file, 'r')) + self.data['images'] = instances['images'] + self.data['annotations'] = instances['annotations'] + self.data['categories'] = instances['categories'] + + # create index + self.createIndex() + print('DONE (t=%.2fs)' % (time.time() - tic)) + + def createIndex(self): + # create sets of mapping + # 1) Refs: {ref_id: ref} + # 2) Anns: {ann_id: ann} + # 3) Imgs: {image_id: image} + # 4) Cats: {category_id: category_name} + # 5) Sents: {sent_id: sent} + # 6) imgToRefs: {image_id: refs} + # 7) imgToAnns: {image_id: anns} + # 8) refToAnn: {ref_id: ann} + # 9) annToRef: {ann_id: ref} + # 10) catToRefs: {category_id: refs} + # 11) sentToRef: {sent_id: ref} + # 12) sentToTokens: {sent_id: tokens} + print('creating index...') + # fetch info from instances + Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {} + for ann in self.data['annotations']: + Anns[ann['id']] = ann + imgToAnns[ann['image_id']] = imgToAnns.get(ann['image_id'], []) + [ann] + for img in self.data['images']: + Imgs[img['id']] = img + for cat in self.data['categories']: + Cats[cat['id']] = cat['name'] + + # fetch info from refs + Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {} + Sents, sentToRef, sentToTokens = {}, {}, {} + for ref in self.data['refs']: + # ids + ref_id = ref['ref_id'] + ann_id = ref['ann_id'] + category_id = ref['category_id'] + image_id = ref['image_id'] + + # add mapping related to ref + Refs[ref_id] = ref + imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref] + catToRefs[category_id] = catToRefs.get(category_id, []) + [ref] + refToAnn[ref_id] = Anns[ann_id] + annToRef[ann_id] = ref + + # add mapping of sent + for sent in ref['sentences']: + Sents[sent['sent_id']] = sent + sentToRef[sent['sent_id']] = ref + sentToTokens[sent['sent_id']] = sent['tokens'] + + # create class members + self.Refs = Refs + self.Anns = Anns + self.Imgs = Imgs + self.Cats = Cats + self.Sents = Sents + self.imgToRefs = imgToRefs + self.imgToAnns = imgToAnns + self.refToAnn = refToAnn + self.annToRef = annToRef + self.catToRefs = catToRefs + self.sentToRef = sentToRef + self.sentToTokens = sentToTokens + print('index created.') + + def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''): + image_ids = image_ids if type(image_ids) == list else [image_ids] + cat_ids = cat_ids if type(cat_ids) == list else [cat_ids] + ref_ids = ref_ids if type(ref_ids) == list else [ref_ids] + + if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0: + refs = self.data['refs'] + else: + if not len(image_ids) == 0: + refs = [self.imgToRefs[image_id] for image_id in image_ids] + else: + refs = self.data['refs'] + if not len(cat_ids) == 0: + refs = [ref for ref in refs if ref['category_id'] in cat_ids] + if not len(ref_ids) == 0: + refs = [ref for ref in refs if ref['ref_id'] in ref_ids] + if not len(split) == 0: + if split in ['testA', 'testB', 'testC']: 
+ refs = [ref for ref in refs if + split[-1] in ref['split']] # we also consider testAB, testBC, ... + elif split in ['testAB', 'testBC', 'testAC']: + refs = [ref for ref in refs if ref['split'] == split] # rarely used I guess... + elif split == 'test': + refs = [ref for ref in refs if 'test' in ref['split']] + elif split == 'train' or split == 'val': + refs = [ref for ref in refs if ref['split'] == split] + else: + raise 'No such split [%s]' % split + ref_ids = [ref['ref_id'] for ref in refs] + return ref_ids + + def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]): + image_ids = image_ids if type(image_ids) == list else [image_ids] + cat_ids = cat_ids if type(cat_ids) == list else [cat_ids] + ref_ids = ref_ids if type(ref_ids) == list else [ref_ids] + + if len(image_ids) == len(cat_ids) == len(ref_ids) == 0: + ann_ids = [ann['id'] for ann in self.data['annotations']] + else: + if not len(image_ids) == 0: + lists = [self.imgToAnns[image_id] for image_id in image_ids if image_id in self.imgToAnns] # list of [anns] + anns = list(itertools.chain.from_iterable(lists)) + else: + anns = self.data['annotations'] + if not len(cat_ids) == 0: + anns = [ann for ann in anns if ann['category_id'] in cat_ids] + ann_ids = [ann['id'] for ann in anns] + if not len(ref_ids) == 0: + ids = set(ann_ids).intersection(set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids])) + return ann_ids + + def getImgIds(self, ref_ids=[]): + ref_ids = ref_ids if type(ref_ids) == list else [ref_ids] + + if not len(ref_ids) == 0: + image_ids = list(set([self.Refs[ref_id]['image_id'] for ref_id in ref_ids])) + else: + image_ids = self.Imgs.keys() + return image_ids + + def getCatIds(self): + return self.Cats.keys() + + def loadRefs(self, ref_ids=[]): + if type(ref_ids) == list: + return [self.Refs[ref_id] for ref_id in ref_ids] + elif type(ref_ids) == int: + return [self.Refs[ref_ids]] + + def loadAnns(self, ann_ids=[]): + if type(ann_ids) == list: + return [self.Anns[ann_id] for ann_id in ann_ids] + elif type(ann_ids) == int: + return [self.Anns[ann_ids]] + + def loadImgs(self, image_ids=[]): + if type(image_ids) == list: + return [self.Imgs[image_id] for image_id in image_ids] + elif type(image_ids) == int: + return [self.Imgs[image_ids]] + + def loadCats(self, cat_ids=[]): + if type(cat_ids) == list: + return [self.Cats[cat_id] for cat_id in cat_ids] + elif type(cat_ids) == int: + return [self.Cats[cat_ids]] + + def getRefBox(self, ref_id): + ref = self.Refs[ref_id] + ann = self.refToAnn[ref_id] + return ann['bbox'] # [x, y, w, h] + + def showRef(self, ref, seg_box='box'): + ax = plt.gca() + # show image + image = self.Imgs[ref['image_id']] + I = io.imread(os.path.join(self.vis_root, image['file_name'])) + ax.imshow(I) + # show refer expression + for sid, sent in enumerate(ref['sentences']): + print('%s. 
%s' % (sid + 1, sent['sent'])) + # show segmentations + if seg_box == 'seg': + ann_id = ref['ann_id'] + ann = self.Anns[ann_id] + polygons = [] + color = [] + c = 'none' + if type(ann['segmentation'][0]) == list: + # polygon used for refcoco* + for seg in ann['segmentation']: + poly = np.array(seg).reshape((len(seg) / 2, 2)) + polygons.append(Polygon(poly, True, alpha=0.4)) + color.append(c) + p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 1, 0, 0), linewidths=3, alpha=1) + ax.add_collection(p) # thick yellow polygon + p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 0, 0, 0), linewidths=1, alpha=1) + ax.add_collection(p) # thin red polygon + else: + # mask used for refclef + raise NotImplementedError('RefClef is not downloaded') + # show bounding-box + elif seg_box == 'box': + ann_id = ref['ann_id'] + ann = self.Anns[ann_id] + bbox = self.getRefBox(ref['ref_id']) + box_plot = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor='green', linewidth=3) + ax.add_patch(box_plot) diff --git a/minigpt4/datasets/datasets/coco_vqa_datasets.py b/minigpt4/datasets/datasets/coco_vqa_datasets.py new file mode 100755 index 0000000..a60426d --- /dev/null +++ b/minigpt4/datasets/datasets/coco_vqa_datasets.py @@ -0,0 +1,184 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +import json +import random + +from PIL import Image + +from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset + +from collections import OrderedDict + + +class __DisplMixin: + def displ_item(self, index): + sample, ann = self.__getitem__(index), self.annotation[index] + + return OrderedDict( + { + "file": ann["image"], + "question": ann["question"], + "question_id": ann["question_id"], + "answers": "; ".join(ann["answer"]), + "image": sample["image"], + } + ) + + +class COCOVQADataset(VQADataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.instruction_pool =[ + "[vqa] {}", + "[vqa] Based on the image, respond to this question with a short answer: {}" + ] + + exist_annotation = [] + for ann in self.annotation: + image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1]) + if os.path.exists(image_path): + exist_annotation.append(ann) + self.annotation = exist_annotation + + + def get_data(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + question_id = ann["question_id"] + + answer_weight = {} + for answer in ann["answer"]: + if answer in answer_weight.keys(): + answer_weight[answer] += 1 / len(ann["answer"]) + else: + answer_weight[answer] = 1 / len(ann["answer"]) + + answers = list(answer_weight.keys()) + weights = list(answer_weight.values()) + + answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights + + # if "unk" in answer: + # print("cocovqa", answer) + + return { + "image": image, + "question": question, + "question_id": question_id, + "answer": answer, + } + + def __getitem__(self, index): + data = self.get_data(index) + instruction = 
random.choice(self.instruction_pool).format(data['question']) + instruction = " {} ".format(instruction) + + return { + "image": data['image'], + "question_id": data["question_id"], + "instruction_input": instruction, + "answer": self.text_processor(data['answer']), + } + + +class COCOVQGDataset(COCOVQADataset): + + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + self.instruction_pool = [ + 'Given the image, generate a question whose answer is: {}', + 'Based on the image, provide a question with the answer: {}', + 'Given the visual representation, create a question for which the answer is "{}"', + 'From the image provided, craft a question that leads to the reply: {}', + 'Considering the picture, come up with a question where the answer is: {}', + 'Taking the image into account, generate an question that has the answer: {}' + ] + + def __getitem__(self, index): + data = self.get_data(index) + instruction = random.choice(self.instruction_pool).format(data['answer']) + instruction = " {}".format(instruction) + + return { + "image": data['image'], + "question_id": data["question_id"], + "instruction_input": instruction, + "answer": data['question'], + } + + + +class COCOVQAEvalDataset(VQAEvalDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + + self.instruction_pool = [ +# '{}', +# 'Question: {}', +# '{} A short answer to the question is', +# 'Q: {} A:', + 'Question: {} Short answer:', +# 'Given the image, answer the following question with no more than three words. {}', +# 'Based on the image, respond to this question with a short answer: {}.', +# 'Use the provided image to answer the question: {} Provide your answer as short as possible.', +# 'What is the answer to the following question? "{}"', +# 'The question "{}" can be answered using the image. 
A short answer is' + ] +# print('vis_root', vis_root) + self.vis_root = vis_root + + self.annotation = json.load(open(ann_paths[0])) + + answer_list_path = ann_paths[1] + if os.path.exists(answer_list_path): + self.answer_list = json.load(open(answer_list_path)) + else: + self.answer_list = None + + try: + self.coco_fmt_qust_file = ann_paths[2] + self.coco_fmt_anno_file = ann_paths[3] + except IndexError: + self.coco_fmt_qust_file = None + self.coco_fmt_anno_file = None + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self._add_instance_ids() + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + instruction = random.choice(self.instruction_pool).format(question) + instruction = " {} ".format(instruction) + + return { + "image": image, + 'image_path': image_path, + "question": question, + "question_id": ann["question_id"], + "instruction_input": instruction, + "instance_id": ann["instance_id"], + } diff --git a/minigpt4/datasets/datasets/doc_dataset.py b/minigpt4/datasets/datasets/doc_dataset.py new file mode 100755 index 0000000..adc2d6c --- /dev/null +++ b/minigpt4/datasets/datasets/doc_dataset.py @@ -0,0 +1,290 @@ +import os +import json +import pickle +import random +import time +import itertools + +import numpy as np +from PIL import Image +import skimage.io as io +import matplotlib.pyplot as plt +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon, Rectangle +from torch.utils.data import Dataset +import webdataset as wds + +from minigpt4.datasets.datasets.base_dataset import BaseDataset +from minigpt4.datasets.datasets.caption_datasets import CaptionDataset + + +class SingleSlideVQADataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + self.data = self.create_data(ann_path) + + # self.instruction_pool = [ + # "###Human: {}###Assistant: ", + # "###Human: From this slide, {}###Assistant: ", + # ] + self.instruction_pool = [ + " {}", + " From this slide, {}", + ] + def create_data(self, ann_path): + with open(ann_path, 'r') as f: + samples = f.readlines() + data = [] + for sample in samples: + sample = json.loads(sample) + if len(sample['evidence_pages']) != 1: continue # skip questions that need more than one slide page + page = sample['evidence_pages'][0] + image_name = 'slide_{}_1024.jpg'.format(page) + # assert [int(image_name.split('-')[-2]) for image_name in image_names] == list(range(1, 21)) # check the format + image_path = os.path.join(sample['deck_name'], image_name) + data.append({ + 'qa_id': sample['qa_id'], + 'question': sample['question'], + 'answer': sample['answer'], + 'image_path': image_path + }) + + print("single slide ",len(data)) + return data + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + sample = self.data[index] + image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB") + image = self.vis_processor(image) + + # instruction = self.text_processor(sample["question"]) + instruction = random.choice(self.instruction_pool).format(self.text_processor(sample["question"])) + + # instruction = random.choice(self.instruction_pool).format(self.text_processor(sample["question"])) + return { + "image": image, + "instruction_input": instruction, + "answer": sample['answer'], + "qa_id": sample['qa_id'], + } + + +class OCRVQADataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + self.data = self.create_data(ann_path) + + self.instruction_pool =[ + "[vqa] {}", + "[vqa] Based on the image, respond to this question with a short answer: {}" + ] + + def create_data(self, ann_path): + processed_data = [] + with open(ann_path, 'r') as f: + data = json.load(f) + for k in data.keys(): + if data[k]['split'] != 1: continue # 1 for training, 2 for validation, 3 for test + ext = os.path.splitext(data[k]['imageURL'])[1] + imageFile = k + ext + assert len(data[k]['questions']) == len(data[k]['answers']) + for q, a in zip(data[k]['questions'], data[k]['answers']): + processed_data.append( + {'question': q, + 'answer': a, + 'image_path': imageFile, + 'image_id': k, + 'title': data[k]['title'], + 'genre': data[k]['genre'], + } + ) + print("ocr vqa", len(processed_data)) + return processed_data + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + sample = self.data[index] + image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB") + image = self.vis_processor(image) + question = self.text_processor(sample["question"]) + answer = self.text_processor(sample["answer"]) + + instruction = random.choice(self.instruction_pool).format(question) + instruction = " {} ".format(instruction) + return { + "image": image, + "instruction_input": instruction, + "answer": answer, + "image_id": sample['image_id'] + } + + + + + +class TextOCRDataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + self.data = self.create_data(ann_path) + + self.instruction_pool = [ + " [OCR] {}" + ] + + def create_data(self, ann_path): + processed_data = [] + with open(ann_path, 'r') as f: + data = json.load(f) + for k in data["anns"].keys(): + # ext = os.path.splitext(data[k]['imageURL'])[1] + imageFile = data["anns"][k]["image_id"]+".jpg" + bbox = data["anns"][k]["bbox"] + text = data["anns"][k]["utf8_string"] + # assert len(data[k]['questions']) == len(data[k]['answers']) + # for q, a in zip(data[k]['questions'], data[k]['answers']): + + processed_data.append( + {'bbox': bbox, + 'answer': text, + 'image_path': imageFile, + 'image_id': k, + } + ) + + return processed_data + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + sample = self.data[index] + image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB") + width, height = image.size + image = self.vis_processor(image) + + new_bbox ="" + image_size = 100 + bbox = sample['bbox'] + for index in range(len(bbox)): + + x1 = int(bbox[0]/width*image_size) + y1 = int(bbox[1]/height*image_size) + x2 = x1 + int(bbox[2]/width*image_size) + y2 = y1 + int(bbox[3]/height*image_size) + assert x1>=0 and x1<=image_size + assert x2>=0 and x2<=image_size + assert y1>=0 and y1<=image_size + assert y2>=0 and y2<=image_size + + new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">" + + instruction = random.choice(self.instruction_pool).format(new_bbox) + + return { + "image": image, + "instruction_input": instruction, + "answer": sample['answer'], + "image_id": sample['image_id'] + } + + + +class 
PlotVQADataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + self.data = self.create_data(ann_path) + + self.instruction_pool = [ + '{}', + 'Question: {}', + '{} A short answer to the question is', + 'Q: {} A:', + 'Question: {} Short answer:', + # 'Given the image, answer the following question with no more than three words. {}', + 'Based on the image, respond to this question with a short answer: {}.', + 'Use the provided image to answer the question: {} Provide your answer as short as possible.', + 'What is the answer to the following question? "{}"', + 'The question "{}" can be answered using the image. A short answer is' + ] + + def create_data(self, ann_path): + processed_data = [] + with open(ann_path, 'r') as f: + data = json.load(f) + for da in data["qa_pairs"]: + # ext = os.path.splitext(data[k]['imageURL'])[1] + + imageFile = str(da["image_index"])+".png" + question = da["question_string"] + answer = str(da["answer"]) + # assert len(data[k]['questions']) == len(data[k]['answers']) + # for q, a in zip(data[k]['questions'], data[k]['answers']): + + processed_data.append( + {'question': question, + 'answer': answer, + 'image_path': imageFile, + 'image_id': str(da["image_index"]), + } + ) + + return processed_data + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + sample = self.data[index] + image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB") + # width, height = image.size + image = self.vis_processor(image) + + + # image_shape = image.shape + instruction = " {} ".format(sample["question"]) + + instruction = random.choice(self.instruction_pool).format(instruction) + + answer = sample["answer"] + + + return { + "image": image, + "instruction_input": instruction, + "answer": answer, + "image_id": sample['image_id'] + } + diff --git a/minigpt4/datasets/datasets/flickr.py b/minigpt4/datasets/datasets/flickr.py new file mode 100755 index 0000000..68355f7 --- /dev/null +++ b/minigpt4/datasets/datasets/flickr.py @@ -0,0 +1,159 @@ +import os +import json +import pickle +import random +import time +import itertools + +import numpy as np +from PIL import Image +import skimage.io as io +import matplotlib.pyplot as plt +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon, Rectangle +from torch.utils.data import Dataset +import webdataset as wds + +from minigpt4.datasets.datasets.base_dataset import BaseDataset +from minigpt4.datasets.datasets.caption_datasets import CaptionDataset + + +class GroundedDetailDataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self.instruction_pool = [ + '[grounding] please describe this image in details', + '[grounding] describe this image as detailed as possible', + '[grounding] summarize this image in details', + '[grounding] give a thorough description of what you see in this image', + ] + + with open(ann_path, 'r') as f: + self.ann = json.load(f) + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index] + + # image_file = 'COCO_train2014_{}.jpg'.format(info['image_id']) + image_file = '{}.jpg'.format(info['image_id']) + image_path = os.path.join(self.vis_root, image_file) + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + + answer = info['grounded_caption'] + + instruction = random.choice(self.instruction_pool) + + instruction = " {} ".format(instruction) + + return { + "image": image, + "instruction_input": instruction, + "answer": answer, + "image_id": info['image_id'], + } + + + + +class CaptionToObjectDataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self.instruction_pool = [ + '[detection] {}', + ] + + with open(ann_path, 'r') as f: + self.ann = json.load(f) + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index] + + # image_file = 'COCO_train2014_{}.jpg'.format(info['image_id']) + image_file = '{}.jpg'.format(info['image_id']) + image_path = os.path.join(self.vis_root, image_file) + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + + input = info["caption"] + answer = info["output"] + + instruction = random.choice(self.instruction_pool).format(input) + + instruction = " {} ".format(instruction) + + return { + "image": image, + "instruction_input": instruction, + "answer": answer, + "image_id": info['image_id'], + } + + + + +class PhraseToObjectDataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self.instruction_pool = [ + '[detection] {}', + ] + + with open(ann_path, 'r') as f: + self.ann = json.load(f) + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index] + + # image_file = 'COCO_train2014_{}.jpg'.format(info['image_id']) + image_file = '{}.jpg'.format(info['image_id']) + image_path = os.path.join(self.vis_root, image_file) + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + + input = info["phrase"] + answer = "

"+input+"

"+info["bbox"] + + instruction = random.choice(self.instruction_pool).format(input) + + instruction = " {} ".format(instruction) + + return { + "image": image, + "instruction_input": instruction, + "answer": answer, + "image_id": info['image_id'], + } diff --git a/minigpt4/datasets/datasets/gqa_datasets.py b/minigpt4/datasets/datasets/gqa_datasets.py new file mode 100755 index 0000000..053f9eb --- /dev/null +++ b/minigpt4/datasets/datasets/gqa_datasets.py @@ -0,0 +1,65 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +import json + +from PIL import Image + +from minigpt4.datasets.datasets.vqa_datasets import VQADataset + +from collections import OrderedDict +import random + +class __DisplMixin: + def displ_item(self, index): + sample, ann = self.__getitem__(index), self.annotation[index] + + return OrderedDict( + { + "file": ann["image"], + "question": ann["question"], + "question_id": ann["question_id"], + "answers": "; ".join(ann["answer"]), + "image": sample["image"], + } + ) + + +class GQADataset(VQADataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + self.instruction_pool =[ + "[vqa] {}", + "[vqa] Based on the image, respond to this question with a short answer: {}" + ] + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + instruction = random.choice(self.instruction_pool).format(question) + instruction = " {} ".format(instruction) + + answers = self.text_processor(ann["answer"]) + # if "unk" in answers: + # print("gqa",answers) + + # print(answers) + + return { + "image": image, + "instruction_input": instruction, + "answer": answers, + # "weights": weights, + } + diff --git a/minigpt4/datasets/datasets/llava_dataset.py b/minigpt4/datasets/datasets/llava_dataset.py new file mode 100755 index 0000000..27b034c --- /dev/null +++ b/minigpt4/datasets/datasets/llava_dataset.py @@ -0,0 +1,390 @@ +import os +import json +import pickle +import random +import time +# import iterto +import numpy as np +from PIL import Image +import skimage.io as io +import matplotlib.pyplot as plt +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon, Rectangle +from torch.utils.data import Dataset +import webdataset as wds + +from minigpt4.datasets.datasets.base_dataset import BaseDataset +from minigpt4.datasets.datasets.caption_datasets import CaptionDataset + +class LlavaDetailDataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + with open(ann_path, 'r') as f: + self.ann = json.load(f) + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index] + + image_file = 'COCO_train2014_{}.jpg'.format(info['id']) + image_path = os.path.join(self.vis_root, image_file) + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + + answer = info['conversations'][1]['value'] + instruction = info['conversations'][0]['value'].replace('', '').replace('\n', '').strip() + + instruction = ' {} '.format(self.text_processor(instruction)) + + return { + "image": image, + "instruction_input": instruction, + "answer": answer, + "image_id": info['id'], + } + +class LlavaReasonDataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + with open(ann_path, 'r') as f: + self.ann = json.load(f) + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index] + + image_file = 'COCO_train2014_{}.jpg'.format(info['id']) + image_path = os.path.join(self.vis_root, image_file) + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + + answer = info['conversations'][1]['value'] + instruction = info['conversations'][0]['value'].replace('', '').replace('\n', '').strip() + + instruction = ' {} '.format(self.text_processor(instruction)) + + # instruction = ' {} '.format(self.text_processor(instruction)) + # answer = self.text_processor(answer) + + return { + "image": image, + "instruction_input": instruction, + "answer": answer, + "image_id": info['id'], + } + + + + +class MiniGPT4v(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + + self.instruction_pool = [ + 'please describe this image as detailed as possible', + 'What do you see happening in this image?', + "Can you elaborate on the elements of the picture provided?", + "Describe the following image.", + "Write a detailed description of the given image.", + "Write a detailed description of the given image.", + "Explain the visual content of the image in great detail" + ] + self.ann=[] + + with open(ann_path,"r") as f: + for line in f.readlines(): + self.ann.append(json.loads(line)) + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index] + + # image_file = 'COCO_train2014_{}.jpg'.format(info['image_path']) + # print("info keys",info.keys()) + if "image_path" in info.keys(): + image_path = "/ibex/reference/CV/COCO/cocoapi/data/2017/images/jpeg/train/"+info['image_path'] + + else: + # print("coming here?") + image_file = "images/"+info["image"] + image_path = os.path.join(self.vis_root, image_file) + # print(image_path) + + + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + if "question" in info.keys(): + question = info['question'] + else: + question = random.sample(self.instruction_pool,1)[0] + + + answer = info["caption"] + + + instruction = ' {} '.format(self.text_processor(question)) + + # instruction = ' {} '.format(self.text_processor(instruction)) + # answer = self.text_processor(answer) + # print("image path", image_path) + return { + "image": image, + "instruction_input": instruction, + "answer": answer, + # "image_id": info['id'], + } + + + + +class MiniGPT4v_emotion(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + + self.instruction_pool = [ + 'please describe this image as detailed as possible', + 'What do you see happening in this image?', + "Can you elaborate on the elements of the picture provided?", + "Describe the following image", + "Write a detailed description of the given image", + "Write a detailed description of the given image", + "Explain the visual content of the image in great detail" + ] + # self.ann=[] + + with open(ann_path,"r") as f: + # for line in f.readlines(): + self.ann = json.load(f) + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index] + + # image_file = 'COCO_train2014_{}.jpg'.format(info['image_path']) + # print("info keys",info.keys()) + + # print("coming here?") + image_file = info["link"] + image_path = os.path.join(self.vis_root, image_file) + # print("image path",image_path) + # print(image_path) + + + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + + question = random.sample(self.instruction_pool,1)[0] + + + answer = info["caption"] + + + instruction = ' {} '.format(self.text_processor(question)) + + # instruction = ' {} '.format(self.text_processor(instruction)) + # answer = self.text_processor(answer) + # print("image path", image_path) + return { + "image": image, + "instruction_input": instruction, + "answer": answer, + # "image_id": info['id'], + } + + + + +class MiniGPT4v_laion(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + + self.instruction_pool = [ + 'please describe this image as detailed as possible', + 'What do you see happening in this image?', + "Can you elaborate on the elements of the picture provided?", + "Describe the following image", + "Write a detailed description of the given image", + "Write a detailed description of the given image", + "Explain the visual content of the image in great detail" + ] + # self.ann=[] + + with open(ann_path,"r") as f: + # for line in f.readlines(): + self.ann = json.load(f) + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index] + + # image_file = 'COCO_train2014_{}.jpg'.format(info['image_path']) + # print("info keys",info.keys()) + + # print("coming here?") + image_file = info["link"] + image_path = os.path.join(self.vis_root, image_file) + # print(image_path) + # print(image_path) + + + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + + question = random.sample(self.instruction_pool,1)[0] + + + answer = info["caption"] + + + instruction = ' {} '.format(self.text_processor(question)) + + # instruction = ' {} '.format(self.text_processor(instruction)) + # answer = self.text_processor(answer) + # print("image path", image_path) + return { + "image": image, + "instruction_input": instruction, + "answer": answer, + # "image_id": info['id'], + } + + + +class Minigpt2_conversation(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + with open(ann_path, 'r') as f: + self.ann = json.load(f) + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index] + + answer = info['conversations'][1]['value'] + instruction = info['conversations'][0]['value'] + + # print("instruction",instruction) + # print("answer", answer) + + return { + "instruction_input": instruction, + "answer": answer, + } + + +class LlavaConversationDataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self.ann=[] + + + # with open(ann_path, 'r') as f: + # self.ann = json.load(f) + + self.connect_sym = "!@#" + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index] + + image_file = 'COCO_train2014_{}.jpg'.format(info['id']) + image_path = os.path.join(self.vis_root, image_file) + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + + first_instruction = info['conversations'][0]['value'].replace('', '').replace('\n', '').strip() + first_instruction = ' {} '.format(first_instruction) + + questions = [first_instruction] + answers = [] + + for i, item in enumerate(info["conversations"][1:]): + if i % 2 ==0: # assistant + assistant_answer = item["value"] + answers.append(assistant_answer) + else: + human_instruction = item["value"]+" " + questions.append(human_instruction) + + questions = self.connect_sym.join(questions) + # questions = questions.replace("\\\\","\\") + answers = self.connect_sym.join(answers) + + + return { + "image": image, + "conv_q": questions, + 'conv_a': answers, + "image_id": info['id'], + "connect_sym": self.connect_sym + } \ No newline at end of file diff --git a/minigpt4/datasets/datasets/multitask_conversation.py b/minigpt4/datasets/datasets/multitask_conversation.py new file mode 100644 index 0000000..3b13e52 --- /dev/null +++ b/minigpt4/datasets/datasets/multitask_conversation.py @@ -0,0 +1,75 @@ +import os +import json +import pickle +import random +import time +import itertools + +import numpy as np +from PIL import Image +import skimage.io as io +import matplotlib.pyplot as plt +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon, Rectangle +from torch.utils.data import Dataset +import webdataset as wds + +from minigpt4.datasets.datasets.base_dataset import BaseDataset +from minigpt4.datasets.datasets.caption_datasets import CaptionDataset + + + + +class MultiTaskConversationDataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + + with open(ann_path, 'r') as f: + self.ann = json.load(f) + + self.connect_sym = "!@#" + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index] + + image_file = 'COCO_train2014_{}.jpg'.format(info['id']) + image_path = os.path.join(self.vis_root, image_file) + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + + first_instruction = info['conversations'][0]['value'].replace('', '').replace('\n', '').strip() + first_instruction = ' {} '.format(first_instruction) + + questions = [first_instruction] + answers = [] + + for i, item in enumerate(info["conversations"][1:]): + if i % 2 ==0: # assistant + assistant_answer = item["value"] + answers.append(assistant_answer) + else: + human_instruction = item["value"]+" " + questions.append(human_instruction) + + questions = self.connect_sym.join(questions) + answers = self.connect_sym.join(answers) + + + return { + "image": image, + "conv_q": questions, + 'conv_a': answers, + "image_id": info['id'], + "connect_sym": self.connect_sym + } \ No newline at end of file diff --git a/minigpt4/datasets/datasets/text_caps.py b/minigpt4/datasets/datasets/text_caps.py new file mode 100755 index 0000000..21446c2 --- /dev/null +++ b/minigpt4/datasets/datasets/text_caps.py @@ -0,0 +1,186 @@ +import os +import json +import pickle +import random +import time +import itertools + +import numpy as np +from PIL import Image +import skimage.io as io +import matplotlib.pyplot as plt +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon, Rectangle +from torch.utils.data import Dataset +import webdataset as wds + +from minigpt4.datasets.datasets.base_dataset import BaseDataset +from minigpt4.datasets.datasets.caption_datasets import CaptionDataset + + + + + +class TextCapDataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + self.instruction_pool = [ + # "generate a short image caption incorporating text in the image", + # "generate a brief image description combining the text shown in the image", + # "what text is writen in this image?", + # "describe the text that you can see from this image", + # "What does the text in the image say?" + 'Briefly describe this image.', + 'Provide a concise depiction of this image.', + 'Present a short description of this image.', + 'Summarize this image in a few words.', + 'A short image caption:', + 'A short image description:', + 'A photo of ', + 'An image that shows ', + 'Write a short description for the image. 
', + 'Write a description for the photo.', + 'Provide a description of what is presented in the photo.', + 'Briefly describe the content of the image.', + 'Can you briefly explain what you see in the image?', + 'Could you use a few words to describe what you perceive in the photo?', + 'Please provide a short depiction of the picture.', + 'Using language, provide a short account of the image.', + 'Use a few words to illustrate what is happening in the picture.', + ] + + with open(ann_path, 'r') as f: + self.ann = json.load(f) + + + def __len__(self): + return len(self.ann["data"]) + + + def __getitem__(self, index): + info = self.ann["data"][index] + + image_file = '{}.jpg'.format(info['image_id']) + + image_path = os.path.join(self.vis_root, image_file) + image = Image.open(image_path).convert("RGB") + # image_width,image_length = image.size + image = self.vis_processor(image) + + # ocr_info = self.ann[index]["data"] + caption = info["caption_str"] + caption = self.text_processor(caption) + + # instruction = random.choice(self.instruction_pool).format(word_bbox) + instruction = " [caption] {} ".format(random.choice(self.instruction_pool)) + return { + "image": image, + "instruction_input": instruction, + "answer": caption, + "data_type": "bbox", + "question_split": True + } + + + +class TextCapBboxToObjectDataset(Dataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_path): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = vis_root + + self.vis_processor = vis_processor + self.text_processor = text_processor + + # self.instruction_pool = [ + # " What text does it show in {} ", + # " Extract the text from {} ", + # " What is the textual content in {} ", + # " Extract the textual information present in the {} ", + # " What is the text written within this defined region {}", + # " Transcribe the text located inside {}", + # " Can you read and extract the text from this specific area {}", + # ] + + self.instruction_pool = [ + " [OCR] {}" + ] + with open(ann_path, 'r') as f: + self.ann = json.load(f) + + self.new_ann = {"data":[]} + for da in self.ann["data"]: + if da["ocr_info"] !=[]: + ocr_info_filter = [] + for d in da["ocr_info"]: + if (d["bounding_box"]["width"]+d["bounding_box"]["top_left_x"])<=1.0 and (d["bounding_box"]["height"]+d["bounding_box"]["top_left_y"]) <=1.0 \ + and d["bounding_box"]["top_left_x"]>=0 and d["bounding_box"]["top_left_y"]>=0: + ocr_info_filter.append(d) + if ocr_info_filter !=[]: + da["ocr_info"]=ocr_info_filter + self.new_ann["data"].append(da) + self.ann = self.new_ann + + + def __len__(self): + return len(self.ann["data"]) + + + def __getitem__(self, index): + + info = self.ann["data"][index] + + + image_file = '{}.jpg'.format(info['image_id']) + + image_path = os.path.join(self.vis_root, image_file) + image = Image.open(image_path).convert("RGB") + # image_width,image_length = image.size + image = self.vis_processor(image) + + + + image_size = 100 + + ocr_info = info["ocr_info"] + + sampled_ocr = random.sample(ocr_info,1)[0] + + # print("sampled ocr", sampled_ocr) + + word_text = sampled_ocr["word"] + width = sampled_ocr["bounding_box"]["width"] + height = sampled_ocr["bounding_box"]["height"] + top_left_x = sampled_ocr["bounding_box"]["top_left_x"] + top_left_y = sampled_ocr["bounding_box"]["top_left_y"] + + x1 = int(top_left_x*image_size) + y1 = int(top_left_y*image_size) + x2 = x1 + int(width*image_size) + y2 = y1 + 
int(height*image_size) + assert x1>=0 and x1<=image_size + assert x2>=0 and x2<=image_size + assert y1>=0 and y1<=image_size + assert y2>=0 and y2<=image_size + + + word_bbox = "{<"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">}" + + instruction = random.choice(self.instruction_pool).format(word_bbox) + return { + "image": image, + "instruction_input": instruction, + "answer": word_text, + "data_type": "bbox", + "question_split": True + } \ No newline at end of file diff --git a/minigpt4/datasets/datasets/unnatural_instruction.py b/minigpt4/datasets/datasets/unnatural_instruction.py new file mode 100755 index 0000000..4857006 --- /dev/null +++ b/minigpt4/datasets/datasets/unnatural_instruction.py @@ -0,0 +1,52 @@ +import os +import json +import pickle +import random +import time +import itertools + +import numpy as np +from PIL import Image +import skimage.io as io +import matplotlib.pyplot as plt +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon, Rectangle +from torch.utils.data import Dataset +import webdataset as wds + +from minigpt4.datasets.datasets.base_dataset import BaseDataset +from minigpt4.datasets.datasets.caption_datasets import CaptionDataset + + +class UnnaturalDataset(Dataset): + def __init__(self, text_processor, ann_path): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + self.text_processor = text_processor + + with open(ann_path, 'r') as f: + self.ann = json.load(f) + + # with open(ann_path, 'r') as f: + # for data in f.readlines(): + # data = json.loads(data) + # self.ann.append(data) + + def __len__(self): + return len(self.ann) + + def __getitem__(self, index): + info = self.ann[index]["instances"][0] + instruction = info["instruction_with_input"] + constraints = info["constraints"] + answer = info["output"] + if constraints != None: + instruction = instruction+" "+constraints + + return { + # "image":None, + "instruction_input": self.text_processor(instruction), + "answer": self.text_processor(answer), + } diff --git a/minigpt4/datasets/datasets/vg_dataset.py b/minigpt4/datasets/datasets/vg_dataset.py new file mode 100755 index 0000000..3042edd --- /dev/null +++ b/minigpt4/datasets/datasets/vg_dataset.py @@ -0,0 +1,98 @@ +import os +import json +import pickle +import random +import time +import itertools + +import numpy as np +from PIL import Image +from torch.utils.data import Dataset +from visual_genome import local + + +import threading + +# Global lock +lock = threading.Lock() + + +class ReferVisualGenomeDataset(Dataset): + def __init__(self, vis_processor, text_processor, data_dir): + """ + vis_root (string): Root directory of images (e.g. 
coco/images/) + ann_root (string): directory to store the annotation file + """ + self.data_dir = data_dir + + self.vis_processor = vis_processor + self.text_processor = text_processor + + all_regions = local.get_all_region_descriptions(self.data_dir) + all_regions = [region for regions in all_regions for region in regions] + + # follow OFA practice, only regions smaller than 16384 pixels are used for refer + self.regions = [region for region in all_regions if region.width * region.height < 16384] + + print('Visual Genome grounding', len(self.regions)) + + + self.instruction_pool = [ + "[refer] {}", + "[refer] give me the location of {}", + "[refer] where is {} ?", + "[refer] from this image, tell me the location of {}", + "[refer] the location of {} is", + "[refer] could you tell me the location for {} ?", + "[refer] where can I locate the {} ?", + ] + + + def __len__(self): + return len(self.regions) + + def preprocess(self, index): + region = self.regions[index] + image_file = region.image.url.split('/')[-2:] + image_path = os.path.join(self.data_dir, *image_file) + image = Image.open(image_path).convert("RGB") + image_orig_size = image.size + image = self.vis_processor(image) + image_new_size = [100,100] + + sample_sentence = region.phrase + refer_sentence = self.text_processor(sample_sentence) + + bbox = [region.x, region.y, region.width, region.height] + + bbox = [ + bbox[0] / image_orig_size[0] * image_new_size[0], + bbox[1] / image_orig_size[1] * image_new_size[1], + (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0], + (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1] + ] + bbox = [int(x) for x in bbox] + bbox = "{{<{}><{}><{}><{}>}}".format(*bbox) + return { + "image": image, + "refer_sentence": refer_sentence, + "bbox": bbox, + "image_id": region.image.id, + } + + def __getitem__(self, index): + data = self.preprocess(index) + instruction = random.choice(self.instruction_pool).format(data['refer_sentence']) + + instruction = " {} ".format(instruction) + + # assert False + + return { + "image": data['image'], + "instruction_input": instruction, + "answer": data['bbox'], + "image_id": data['image_id'], + } + + diff --git a/minigpt4/datasets/datasets/vqa_datasets.py b/minigpt4/datasets/datasets/vqa_datasets.py new file mode 100755 index 0000000..5cdc0fa --- /dev/null +++ b/minigpt4/datasets/datasets/vqa_datasets.py @@ -0,0 +1,223 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import torch +from PIL import Image +import os + +from minigpt4.datasets.datasets.base_dataset import BaseDataset + + +class VQADataset(BaseDataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + # def collater(self, samples): + # image_list, question_list, answer_list, weight_list = [], [], [], [] + + # num_answers = [] + + # for sample in samples: + # image_list.append(sample["image"]) + # question_list.append(sample["question"]) + + # weight_list.extend(sample["weights"]) + + # answers = sample["answer"] + + # answer_list.extend(answers) + # num_answers.append(len(answers)) + + # return { + # "image": torch.stack(image_list, dim=0), + # "text_input": question_list, + # "answer": answer_list, + # "weight": torch.Tensor(weight_list), + # "n_answers": torch.LongTensor(num_answers), + # } + + +class VQAEvalDataset(BaseDataset): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + +class OKVQAEvalData(torch.utils.data.Dataset): + def __init__(self, loaded_data, vis_processor, root_path): + self.loaded_data = loaded_data + self.root_path = root_path + self.vis_processor = vis_processor + + def __len__(self): + return len(self.loaded_data) + + def __getitem__(self, idx): + data = self.loaded_data[idx] + img_id = data['image_id'] + question = data['question'] + question_id = data['question_id'] + img_file = '{:0>12}.jpg'.format(img_id) + image_path = os.path.join(self.root_path, img_file) + image = Image.open(image_path).convert('RGB') + image = self.vis_processor(image) + question = f"[vqa] Based on the image, respond to this question with a short answer: {question}" + # question = f"[vqa] {question} " + return image, question, question_id, img_id + +class VizWizEvalData(torch.utils.data.Dataset): + def __init__(self, loaded_data, vis_processor, root_path): + self.loaded_data = loaded_data + self.root_path = root_path + self.vis_processor = vis_processor + + def __len__(self): + return len(self.loaded_data) + + def __getitem__(self, idx): + data = self.loaded_data[idx] + img_id = data['image'] + question = data['question'] + answers = data['answers'] + answers = '_'.join([answer['answer'] for answer in answers]) + image_path = os.path.join(self.root_path, img_id) + image = Image.open(image_path).convert('RGB') + image = self.vis_processor(image) + # question = f"[vqa] Based on the image, respond to this question with a short answer: {question} " + question = f"[vqa] Based on the image, respond to this question with a short answer: {question} and reply 'unanswerable' if you could not answer it" + return image, question, answers + +class AOKVQADAEvalData(torch.utils.data.Dataset): + def __init__(self, loaded_data, vis_processor, root_path): + self.loaded_data = loaded_data + self.root_path = root_path + self.vis_processor = vis_processor + + def __len__(self): + return len(self.loaded_data) + + def __getitem__(self, idx): + data = self.loaded_data[idx] + img_file = data['image'] + question = data['question'] + question_id = data['question_id'] + image_path = os.path.join(self.root_path, img_file) + image = Image.open(image_path).convert('RGB') + image = self.vis_processor(image) + question = f"[vqa] Based on the image, respond to this question 
with a short answer: {question}" + # question = f"[vqa] {question} " + return image, question, question_id + +class AOKVQAMCEvalData(torch.utils.data.Dataset): + def __init__(self, loaded_data, vis_processor, root_path): + self.loaded_data = loaded_data + self.root_path = root_path + self.vis_processor = vis_processor + + def __len__(self): + return len(self.loaded_data) + + def __getitem__(self, idx): + data = self.loaded_data[idx] + img_file = data['image'] + question = data['question'] + question_id = data['question_id'] + image_path = os.path.join(self.root_path, img_file) + image = Image.open(image_path).convert('RGB') + image = self.vis_processor(image).half().cuda() + candidates=data['choices'] + # question = f"Given this image, choose one correct answer from {candidates} for this question: {question} " + question = f"[vqa] Based on the image, respond to this question with a short answer: {question}" + # question = f"[vqa] {question} " + return image, question, question_id, candidates + +class IconQAEvalData(torch.utils.data.Dataset): + def __init__(self, loaded_data, vis_processor, root_path): + self.loaded_data = loaded_data + self.root_path = root_path + self.vis_processor = vis_processor + + def __len__(self): + return len(self.loaded_data) + + def __getitem__(self, idx): + data = self.loaded_data[idx] + image_id = data['image_id'] + question = data['question'] + image_path = os.path.join(self.root_path, image_id, 'image.png') + image = Image.open(image_path).convert('RGB') + image = self.vis_processor(image).half().cuda() + candidates = '_'.join(data['choices']) + answer = data['answer'] + # question = f"Given this image, choose one correct answer from {candidates} for this question: {question} " + question = f"[vqa] Based on the image, respond to this question with a short answer: {question}" + # question = f"[vqa] {question} " + return image, question, candidates, answer + +class GQAEvalData(torch.utils.data.Dataset): + def __init__(self, loaded_data, vis_processor, root_path): + self.loaded_data = loaded_data + self.root_path = root_path + self.vis_processor = vis_processor + + def __len__(self): + return len(self.loaded_data) + + def __getitem__(self, idx): + ann = self.loaded_data[idx] + image_id = ann["image"] + image_path = os.path.join(self.root_path, f"{image_id}") + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + question = ann["question"] + # question = f'Question: {question} Short answer: ' + question = f"[vqa] Based on the image, respond to this question with a short answer: {question}" + # question = f"[vqa] {question} " + labels = ann["answer"] + + return image, question, labels + +class HMEvalData(torch.utils.data.Dataset): + def __init__(self, loaded_data, vis_processor, root_path): + self.loaded_data = loaded_data + self.root_path = root_path + self.vis_processor = vis_processor + + def __len__(self): + return len(self.loaded_data) + + def __getitem__(self, idx): + ann = self.loaded_data[idx] + image_id = ann["img"] + image_path = os.path.join(self.root_path, f"{image_id}") + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + question = ann["text"] + question = f"[vqa] This is an image with: \"{question}\" written on it. Is it hateful? 
Answer:" + labels = ann["label"] + + return image, question, labels + +class VSREvalData(torch.utils.data.Dataset): + def __init__(self, loaded_data, vis_processor, root_path): + self.loaded_data = loaded_data + self.root_path = root_path + self.vis_processor = vis_processor + + def __len__(self): + return len(self.loaded_data) + + def __getitem__(self, idx): + ann = self.loaded_data[idx] + image_path = os.path.join(self.root_path, ann["image"]) + image = Image.open(image_path).convert("RGB") + image = self.vis_processor(image) + question = ann["caption"] + question = f'[vqa] Based on the image, is this statement true or false? {question}' + question_id = ann["image"].split('.')[0] + labels = 'true' if ann["label"] == 1 else 'false' + + return image, question, labels \ No newline at end of file diff --git a/train_configs/minigpt_v2_finetune.yaml b/train_configs/minigpt_v2_finetune.yaml new file mode 100644 index 0000000..be5c7c6 --- /dev/null +++ b/train_configs/minigpt_v2_finetune.yaml @@ -0,0 +1,300 @@ +model: + arch: minigpt_v2 + model_type: pretrain + freeze_vit: True + freeze_qformer: True + max_txt_len: 1024 + low_resource: False + image_size: 448 + end_sym: "" + llama_model: "/ibex/project/c2133/llama_v2/llama-2-7b-chat-pytorch_update" + ckpt: "/ibex/project/c2090/minigpt4_ckpt/448_perforamnce_correct_v10_vg/20230925064/checkpoint_32.pth" + use_grad_checkpoint: True + chat_template: True + lora_r: 64 + lora_alpha: 16 + + +datasets: + + + multitask_conversation: + batch_size_train: 2 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 40 + + llava_conversation: # 77k + batch_size_train: 2 + vis_processor: + train: + name: "blip2_image_train" + image_size: 448 + text_processor: + train: + name: "blip_caption" + sample_ratio: 10 + + + + + # unnatural_instruction: + # batch_size: 1 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 15 + + + # refvg: + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 40 + + # llava_detail: #23K + # batch_size: 4 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 20 + + # llava_reason: # 77k + # batch_size: 4 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 80 + + + # grounded_detailed_image_caption: + # batch_size: 2 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 80 + + # CaptionToPhrase: + # batch_size: 2 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 80 + + # ObjectToPhrase: + # batch_size: 2 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 80 + + # coco_caption: + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 10 + + + # textcaps_caption: # + # batch_size: 6 + # vis_processor: + # train: + # name: 
"blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 10 + + # refcoco: # 142k + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 15 + + + # refcocop: + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 15 + + # refcocog: + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 15 + + + + # invrefcoco: + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 10 + + # invrefcocop: + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 10 + + # invrefcocog: + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 10 + + + # coco_vqa: # 82K + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 15 + + # ok_vqa: # 9k + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 8 + + # aok_vqa: # 17k + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 12 + + # gqa: # 82K + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 40 + + # ocrvqa: # 800K + # batch_size: 6 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 448 + # text_processor: + # train: + # name: "blip_caption" + # sample_ratio: 30 + + +run: + task: image_text_pretrain + # optimizer + lr_sched: "linear_warmup_cosine_lr" + init_lr: 1e-5 + min_lr: 8e-5 + warmup_lr: 1e-6 + + weight_decay: 0.05 + max_epoch: 50 + num_workers: 6 + warmup_steps: 1000 + iters_per_epoch: 1000 + + seed: 42 + output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_conversation_correct_best_v7_ablation1_v5_v6_again_system_prompt" + + amp: True + resume_ckpt_path: null + + evaluate: False + train_splits: ["train"] + + device: "cuda" + world_size: 1 + dist_url: "env://" + distributed: True \ No newline at end of file