Mirror of https://github.com/Vision-CAIR/MiniGPT-4.git (synced 2025-04-05 10:30:45 +00:00)

commit 0e5d34ad2e (parent 10f61a4dd8): add finetuning code
jobs/srun_test.sh (new file, 30 lines)
@@ -0,0 +1,30 @@
cd ..

job_name=minigpt4_v2_test

read LOWERPORT UPPERPORT < /proc/sys/net/ipv4/ip_local_port_range
while :
do
    PORT="`shuf -i $LOWERPORT-$UPPERPORT -n 1`"
    ss -lpn | grep -q ":$PORT " || break
done

#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/336_final_v1_gqa.yaml

#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/448_final_v1_gqa_ablation2.yaml

torchrun --master-port ${PORT} --nproc-per-node 2 train.py --cfg-path train_configs/minigpt_v2_finetune.yaml

#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path finetune_conversation_ablation/conversation_v2_last_336_test.yaml

#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/336_final_v1_13B.yaml

# torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/448_v2_llama2.yaml
#accelerate launch train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/336_v2_llama2.yaml

# torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/336_v2_llama2_clip_encoder.yaml

#best_data_ratio_336_full_dataset_lr2e4_v1.yaml
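The loop above keeps drawing random ports from the kernel's ephemeral range until `ss` reports one that is unused, then passes it to torchrun as the rendezvous port. A minimal Python sketch of the same idea (a hypothetical helper, not part of the repo) lets the OS pick a free port by binding to port 0:

```python
import socket

def find_free_port() -> int:
    """Bind to port 0 so the OS assigns an unused port, then release it."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]

if __name__ == "__main__":
    # e.g. export MASTER_PORT=$(python find_free_port.py) before launching torchrun
    print(find_free_port())
```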
minigpt4/configs/datasets/aokvqa/defaults.yaml (new executable file, 29 lines)
@@ -0,0 +1,29 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  aok_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      # annotations:
      #   train:
      #     url:
      #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
      #     storage:
      #       - /path/to/aokvqa/annotations/aokvqa_v1p0_train.json
      # images:
      #   storage: /path/to/coco/images/

      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
          storage:
            - /ibex/project/c2133/minigpt4_v2_dataset/aokvqa/annotations/aokvqa_v1p0_train.json
      images:
        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/
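These dataset configs follow the LAVIS layout: the key under `datasets` (`aok_vqa` here) names the registered builder, and `build_info` carries annotation URLs plus local `storage` and image paths. A hedged sketch of loading one of these files and overriding the cluster-specific paths with OmegaConf (the config library used by LAVIS-style code; treat the exact override values as placeholders):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("minigpt4/configs/datasets/aokvqa/defaults.yaml")
info = cfg.datasets.aok_vqa.build_info

# Point the hard-coded /ibex/... paths at your own copies of the data.
info.annotations.train.storage = ["/data/aokvqa/annotations/aokvqa_v1p0_train.json"]
info.images.storage = "/data/coco/images/"

print(OmegaConf.to_yaml(cfg.datasets.aok_vqa))
```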
minigpt4/configs/datasets/coco/caption.yaml (new file, 38 lines)
@@ -0,0 +1,38 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_caption: # name of the dataset builder
    # dataset_card: dataset_card/coco_caption.md
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    # build_info:
    #   # Be careful not to append minus sign (-) before split to avoid itemizing
    #   annotations:
    #     train:
    #       url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
    #       md5: aa31ac474cf6250ebb81d18348a07ed8
    #       storage: /path/to/coco_caption/annotations/coco_karpathy_train.json
    #   images:
    #     storage: /path/to/coco/images/

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
          md5: aa31ac474cf6250ebb81d18348a07ed8
          storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_train.json
        # val:
        #   url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
        #   md5: b273847456ef5580e33713b1f7de52a0
        #   storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_val.json
        # test:
        #   url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
        #   md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
        #   storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_test.json
      images:
        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
minigpt4/configs/datasets/coco/defaults_vqa.yaml (new executable file, 33 lines)
@@ -0,0 +1,33 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # annotations:
      #   train:
      #     url:
      #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
      #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
      #     storage:
      #       - /path/to/vqav2/annotations/vqa_train.json
      #       - /path/to/vqav2/coco/annotations/vqa_val.json
      #   images:
      #     storage: /path/to/coco/images/

      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
          storage:
            - /ibex/project/c2133/minigpt4_v2_dataset/vqav2/annotations/vqa_train.json
            - /ibex/project/c2133/minigpt4_v2_dataset/vqav2/coco/annotations/vqa_val.json
      images:
        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
minigpt4/configs/datasets/coco_bbox/invrefcoco.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  invrefcoco:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: invrefcoco
      splitBy: unc
minigpt4/configs/datasets/coco_bbox/invrefcocog.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  invrefcocog:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: invrefcocog
      splitBy: umd
minigpt4/configs/datasets/coco_bbox/invrefcocop.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  invrefcocop:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: invrefcoco+
      splitBy: unc
minigpt4/configs/datasets/coco_bbox/refcoco.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  refcoco:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: refcoco
      splitBy: unc
minigpt4/configs/datasets/coco_bbox/refcocog.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  refcocog:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: refcocog
      splitBy: umd
minigpt4/configs/datasets/coco_bbox/refcocop.yaml (new executable file, 8 lines)
@@ -0,0 +1,8 @@
datasets:
  refcocop:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/object_detection_datasets/
      dataset: refcoco+
      splitBy: unc
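All six refcoco-family configs expose the same four `build_info` fields (`image_path`, `ann_path`, `dataset`, `splitBy`), which the builders added later in this commit forward to the REFER annotation loader. Note that `image_path` must point at the COCO train2014 jpegs, because the dataset classes compose filenames from the image id; a small sketch of that naming convention (hypothetical helper, mirroring the format string used in `ReferCOCODataset.preprocess`):

```python
import os

def coco_train2014_path(image_root: str, image_id: int) -> str:
    """Build the file path the refcoco datasets expect under image_path."""
    # Filenames are zero-padded to 12 digits, e.g. COCO_train2014_000000000009.jpg
    return os.path.join(image_root, "COCO_train2014_{:0>12}.jpg".format(image_id))

print(coco_train2014_path("/ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train", 9))
```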
minigpt4/configs/datasets/flickr/caption_to_phrase.yaml (new executable file, 6 lines)
@@ -0,0 +1,6 @@
datasets:
  CaptionToPhrase:
    data_type: images
    build_info:
      image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_grounding_phrase5_v2_last.json
minigpt4/configs/datasets/flickr/default.yaml (new executable file, 6 lines)
@@ -0,0 +1,6 @@
datasets:
  grounded_detailed_image_caption:
    data_type: images
    build_info:
      image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_grounding_phrase5_last.json
minigpt4/configs/datasets/flickr/object_to_phrase.yaml (new executable file, 6 lines)
@@ -0,0 +1,6 @@
datasets:
  ObjectToPhrase:
    data_type: images
    build_info:
      image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_phrase2bbox_resample_last.json
minigpt4/configs/datasets/gqa/balanced_val.yaml (new file, 33 lines)
@@ -0,0 +1,33 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    # build_info:
    #   # Be careful not to append minus sign (-) before split to avoid itemizing
    #   annotations:
    #     train:
    #       url:
    #         - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
    #       storage:
    #         - /path/to/gqa/annotations/train_balanced_questions.json
    #   images:
    #     storage: /path/to/gqa/images/

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
          storage:
            - /ibex/project/c2133/minigpt4_v2_dataset/gqa/annotations/train_balanced_questions.json
      images:
        storage: /ibex/project/c2133/minigpt4_v2_dataset/gqa/images_copy/
minigpt4/configs/datasets/llava/conversation.yaml (new executable file, 12 lines)
@@ -0,0 +1,12 @@
datasets:
  # llava_conversation:
  #   data_type: images
  #   build_info:
  #     image_path: /path/to/coco/images
  #     ann_path: /path/to/llava/conversation_58k.json

  llava_conversation:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/conversation_58k.json
minigpt4/configs/datasets/llava/detail.yaml (new executable file, 12 lines)
@@ -0,0 +1,12 @@
datasets:
  # llava_detail:
  #   data_type: images
  #   build_info:
  #     image_path: /path/to/coco/images
  #     ann_path: /path/to/llava/detail_23k.json

  llava_detail:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/detail_23k.json
minigpt4/configs/datasets/llava/reason.yaml (new executable file, 12 lines)
@@ -0,0 +1,12 @@
datasets:
  # llava_reason:
  #   data_type: images
  #   build_info:
  #     image_path: /path/to/coco/images
  #     ann_path: /path/to/llava/complex_reasoning_77k.json

  llava_reason:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/complex_reasoning_77k.json
@@ -0,0 +1,14 @@
datasets:
  # multitask_conversation:
  #   data_type: images
  #   build_info:
  #     image_path: /path/to/coco/images
  #     ann_path: /path/to/multitask_conversation/multi_task_conversation.json

  multitask_conversation:
    data_type: images
    build_info:
      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/multitask_conversation/multi_task_conversation.json
minigpt4/configs/datasets/nlp/unnatural_instruction.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
datasets:
  # unnatural_instruction:
  #   data_type: text
  #   build_info:
  #     ann_path: /path/to/unnatural-instructions/data/unnatural_instruction_filer.json

  unnatural_instruction:
    data_type: text
    build_info:
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/unnatural-instructions/data/unnatural_instruction_filer.json
minigpt4/configs/datasets/ocrvqa/ocrvqa.yaml (new executable file, 12 lines)
@@ -0,0 +1,12 @@
datasets:
  # ocrvqa:
  #   data_type: images
  #   build_info:
  #     image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images
  #     ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json

  ocrvqa:
    data_type: images
    build_info:
      image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json
minigpt4/configs/datasets/okvqa/defaults.yaml (new executable file, 36 lines)
@@ -0,0 +1,36 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  ok_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    # build_info:
    #   # Be careful not to append minus sign (-) before split to avoid itemizing
    #   annotations:
    #     train:
    #       url:
    #         # TODO make this order insensitive
    #         - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
    #       storage:
    #         - /path/to/okvqa/annotations/okvqa_train.json
    #   images:
    #     storage: /path/to/okvqa/images

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
          storage:
            - /ibex/project/c2133/minigpt4_v2_dataset/okvqa_v2/annotations/okvqa_train.json
      images:
        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
minigpt4/configs/datasets/textcaps/caption.yaml (new executable file, 16 lines)
@@ -0,0 +1,16 @@
datasets:
  # textcaps_caption:
  #   data_type: images
  #   build_info:
  #     image_path: /path/to/TextCaps/train_images
  #     ann_path: /path/to/TextCaps/TextCaps_0.1_train.json

  textcaps_caption:
    data_type: images

    build_info:
      image_path: /ibex/project/c2133/minigpt4_v2_dataset/TextCaps/train_images
      ann_path: /ibex/project/c2133/minigpt4_v2_dataset/TextCaps/TextCaps_0.1_train.json
minigpt4/configs/datasets/vg/ref.yaml (new executable file, 10 lines)
@@ -0,0 +1,10 @@
datasets:
  # refvg:
  #   data_type: images
  #   build_info:
  #     data_dir: /path/to/visual_genome

  refvg:
    data_type: images
    build_info:
      data_dir: /ibex/project/c2133/minigpt4_v2_dataset/visual_genome
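Each of the config keys above is consumed by a builder class registered in the diff that follows: at training time a builder is looked up by name, loads its default YAML, and returns the train split. A rough sketch of that lookup (the helper names follow the registry pattern used below and are an assumption about MiniGPT-4's internals, not a documented API):

```python
from minigpt4.common.registry import registry

# Any name passed to @registry.register_builder(...) below can be retrieved here.
builder_cls = registry.get_builder_class("refvg")
builder = builder_cls()                # loads DATASET_CONFIG_DICT["default"]
datasets = builder.build_datasets()
train_set = datasets["train"]
```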
@@ -6,6 +6,418 @@ from minigpt4.common.registry import registry
from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from minigpt4.datasets.datasets.laion_dataset import LaionDataset
from minigpt4.datasets.datasets.cc_sbu_dataset import CCSBUDataset, CCSBUAlignDataset
from minigpt4.datasets.datasets.text_caps import TextCapDataset
from minigpt4.datasets.datasets.llava_dataset import LlavaDetailDataset, LlavaReasonDataset, LlavaConversationDataset
from minigpt4.datasets.datasets.unnatural_instruction import UnnaturalDataset
from minigpt4.datasets.datasets.multitask_conversation import MultiTaskConversationDataset
from minigpt4.datasets.datasets.flickr import GroundedDetailDataset, CaptionToObjectDataset, PhraseToObjectDataset
from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset
from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset
from minigpt4.datasets.datasets.gqa_datasets import GQADataset
from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset
from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset
from minigpt4.datasets.datasets.doc_dataset import OCRVQADataset


@registry.register_builder("multitask_conversation")
class MultitaskConversationBuilder(BaseDatasetBuilder):
    train_dataset_cls = MultiTaskConversationDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/multitask_conversation/default.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("unnatural_instruction")
class UnnaturalInstructionBuilder(BaseDatasetBuilder):
    train_dataset_cls = UnnaturalDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/nlp/unnatural_instruction.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets (text-only, so no vis_processor is needed)
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
        )

        return datasets


@registry.register_builder("llava_detail")
class LlavaDetailBuilder(BaseDatasetBuilder):
    train_dataset_cls = LlavaDetailDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/llava/detail.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("llava_reason")
class LlavaReasonBuilder(BaseDatasetBuilder):
    train_dataset_cls = LlavaReasonDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/llava/reason.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("llava_conversation")
class LlavaConversationBuilder(BaseDatasetBuilder):
    train_dataset_cls = LlavaConversationDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/llava/conversation.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


class AllRefCOCOBuilder(BaseDatasetBuilder):

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()

        build_info = self.config.build_info
        image_path = build_info.image_path
        ann_path = build_info.ann_path

        datasets = dict()

        if not os.path.exists(image_path):
            warnings.warn("image path {} does not exist.".format(image_path))
        if not os.path.exists(ann_path):
            warnings.warn("ann path {} does not exist.".format(ann_path))

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=ann_path,
            vis_root=image_path,
            dataset=build_info.dataset,
            splitBy=build_info.splitBy
        )

        return datasets


@registry.register_builder("refcoco")
class RefCOCOBuilder(AllRefCOCOBuilder):
    train_dataset_cls = ReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/refcoco.yaml",
    }


@registry.register_builder("refcocop")
class RefCOCOPBuilder(AllRefCOCOBuilder):
    train_dataset_cls = ReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/refcocop.yaml",
    }


@registry.register_builder("refcocog")
class RefCOCOGBuilder(AllRefCOCOBuilder):
    train_dataset_cls = ReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/refcocog.yaml",
    }


@registry.register_builder("invrefcoco")
class InvRefCOCOBuilder(AllRefCOCOBuilder):
    train_dataset_cls = InvReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/invrefcoco.yaml",
    }


@registry.register_builder("invrefcocop")
class InvRefCOCOPBuilder(AllRefCOCOBuilder):
    train_dataset_cls = InvReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/invrefcocop.yaml",
    }


@registry.register_builder("invrefcocog")
class InvRefCOCOGBuilder(AllRefCOCOBuilder):
    train_dataset_cls = InvReferCOCODataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco_bbox/invrefcocog.yaml",
    }


@registry.register_builder("refvg")
class RefVisualGenomeBuilder(BaseDatasetBuilder):
    train_dataset_cls = ReferVisualGenomeDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/vg/ref.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()

        build_info = self.config.build_info
        data_dir = build_info.data_dir
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            data_dir=data_dir,
        )

        return datasets


@registry.register_builder("textcaps_caption")
class TextcapCaptionBuilder(BaseDatasetBuilder):
    train_dataset_cls = TextCapDataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/textcaps/caption.yaml"}

    def _download_ann(self):
        pass

    def _download_vis(self):
        pass

    def build(self):
        self.build_processors()

        build_info = self.config.build_info

        datasets = dict()
        split = "train"

        # create datasets
        # [NOTE] return inner_datasets (wds.DataPipeline)
        dataset_cls = self.train_dataset_cls
        datasets[split] = dataset_cls(
            vis_processor=self.vis_processors[split],
            text_processor=self.text_processors[split],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("coco_vqa")
class COCOVQABuilder(BaseDatasetBuilder):
    train_dataset_cls = COCOVQADataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/coco/defaults_vqa.yaml",
    }


@registry.register_builder("aok_vqa")
class AOKVQABuilder(BaseDatasetBuilder):
    train_dataset_cls = AOKVQADataset

    DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"}


@registry.register_builder("gqa")
class GQABuilder(BaseDatasetBuilder):
    train_dataset_cls = GQADataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/gqa/balanced_val.yaml",
    }


@registry.register_builder("grounded_detailed_image_caption")
class GroundedCaptionBuilder(BaseDatasetBuilder):
    train_dataset_cls = GroundedDetailDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/flickr/default.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("CaptionToPhrase")
class CaptionToPhraseBuilder(BaseDatasetBuilder):
    train_dataset_cls = CaptionToObjectDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/flickr/caption_to_phrase.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


@registry.register_builder("ObjectToPhrase")
class ObjectToPhraseBuilder(BaseDatasetBuilder):
    train_dataset_cls = PhraseToObjectDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/flickr/object_to_phrase.yaml",
    }

    def build_datasets(self):
        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        datasets = dict()

        # create datasets
        dataset_cls = self.train_dataset_cls
        datasets['train'] = dataset_cls(
            vis_processor=self.vis_processors["train"],
            text_processor=self.text_processors["train"],
            ann_path=build_info.ann_path,
            vis_root=build_info.image_path,
        )

        return datasets


class DocumentVQABuilder(BaseDatasetBuilder):
    def _download_ann(self):
        pass

    def _download_vis(self):
        pass

    def build(self):
        self.build_processors()
        build_info = self.config.build_info

        datasets = dict()
        split = "train"

        dataset_cls = self.train_dataset_cls
        datasets[split] = dataset_cls(
            vis_processor=self.vis_processors[split],
            text_processor=self.text_processors[split],
            vis_root=build_info.image_path,
            ann_path=build_info.ann_path
        )

        return datasets


@registry.register_builder("ocrvqa")
class OCRVQABuilder(DocumentVQABuilder):
    train_dataset_cls = OCRVQADataset
    DATASET_CONFIG_DICT = {"default": "configs/datasets/ocrvqa/ocrvqa.yaml"}


@registry.register_builder("cc_sbu")
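Every builder above follows the same recipe: register a name, point `train_dataset_cls` at a dataset class, point `DATASET_CONFIG_DICT["default"]` at one of the YAML files added earlier, and (for image datasets) construct the dataset from `build_info.ann_path` and `build_info.image_path`. A hedged sketch of adding a new dataset with this pattern; `MyImageTextDataset` and the config path are hypothetical placeholders:

```python
import logging

from minigpt4.common.registry import registry
from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
# Hypothetical dataset class, shown only to illustrate the registration pattern.
from minigpt4.datasets.datasets.my_dataset import MyImageTextDataset


@registry.register_builder("my_dataset")
class MyDatasetBuilder(BaseDatasetBuilder):
    train_dataset_cls = MyImageTextDataset
    DATASET_CONFIG_DICT = {"default": "configs/datasets/my_dataset/default.yaml"}

    def build_datasets(self):
        logging.info("Building datasets...")
        self.build_processors()
        build_info = self.config.build_info
        return {
            "train": self.train_dataset_cls(
                vis_processor=self.vis_processors["train"],
                text_processor=self.text_processors["train"],
                ann_path=build_info.ann_path,
                vis_root=build_info.image_path,
            )
        }
```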
minigpt4/datasets/datasets/aok_vqa_datasets.py (new executable file, 212 lines)
@@ -0,0 +1,212 @@
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from collections import OrderedDict
import json
import os
import random
import torch

from PIL import Image

from minigpt4.datasets.datasets.vqa_datasets import VQADataset  #, VQAEvalDataset


class __DisplMixin:
    def displ_item(self, index):
        sample, ann = self.__getitem__(index), self.annotation[index]
        return OrderedDict(
            {
                "file": ann["image"],
                "question": ann["question"],
                "question_id": ann["question_id"],
                "direct_answers": "; ".join(ann["direct_answers"]),
                "choices": "; ".join(ann["choices"]),
                "correct_choice": ann["choices"][ann["correct_choice_idx"]],
                "image": sample["image"],
            }
        )


class AOKVQADataset(VQADataset, __DisplMixin):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

        self.instruction_pool = [
            "[vqa] {}",
            "[vqa] Based on the image, respond to this question with a short answer: {}"
        ]

        # keep only annotations whose image actually exists under vis_root
        exist_annotation = []
        for ann in self.annotation:
            image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
            if os.path.exists(image_path):
                exist_annotation.append(ann)
        self.annotation = exist_annotation

    def get_data(self, index):
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
        image = Image.open(image_path).convert("RGB")

        image = self.vis_processor(image)
        question = self.text_processor(ann["question"])

        answer_key = "direct_answers"

        # print("answer key", answer_key)
        # for answer in ann[answer_key]:
        #     print(answer)

        answer_weight = {}
        for answer in ann[answer_key]:
            if answer in answer_weight.keys():
                answer_weight[answer] += 1 / len(ann[answer_key])
            else:
                answer_weight[answer] = 1 / len(ann[answer_key])

        answers = list(answer_weight.keys())
        weights = list(answer_weight.values())

        answer = random.choices(answers, weights=weights, k=1)[0]  # randomly sample an answer according to weights

        return {
            "image": image,
            "question": question,
            "answer": answer,
        }

    def __getitem__(self, index):
        data = self.get_data(index)
        question = self.text_processor(data["question"])
        instruction = random.choice(self.instruction_pool).format(question)

        instruction = "<Img><ImageHere></Img> {} ".format(instruction)

        answer = self.text_processor(data['answer'])

        return {
            "image": data['image'],
            "instruction_input": instruction,
            "answer": answer,
        }


class AOKVQGDataset(AOKVQADataset):

    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        self.instruction_pool = [
            'Given the image, generate a question whose answer is: {}',
            'Based on the image, provide a question with the answer: {}',
            'Given the visual representation, create a question for which the answer is "{}"',
            'From the image provided, craft a question that leads to the reply: {}',
            'Considering the picture, come up with a question where the answer is: {}',
            'Taking the image into account, generate a question that has the answer: {}'
        ]

    def __getitem__(self, index):
        data = self.get_data(index)
        instruction = random.choice(self.instruction_pool).format(data['answer'])

        return {
            "image": data['image'],
            "instruction_input": instruction,
            "answer": data['question'],
        }


# class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
#     def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
#         """
#         vis_root (string): Root directory of images (e.g. coco/images/)
#         ann_root (string): directory to store the annotation file
#         """
#
#         self.vis_root = vis_root
#
#         self.annotation = json.load(open(ann_paths[0]))
#
#         answer_list_path = ann_paths[1]
#         if os.path.exists(answer_list_path):
#             self.answer_list = json.load(open(answer_list_path))
#         else:
#             self.answer_list = None
#
#         try:
#             self.coco_fmt_qust_file = ann_paths[2]
#             self.coco_fmt_anno_file = ann_paths[3]
#         except IndexError:
#             self.coco_fmt_qust_file = None
#             self.coco_fmt_anno_file = None
#
#         self.vis_processor = vis_processor
#         self.text_processor = text_processor
#
#         self._add_instance_ids()
#
#     def collater(self, samples):
#         (
#             image_list,
#             question_list,
#             question_id_list,
#             instance_id_list,
#             choices_list,
#             correct_choice_idx_list,
#             direct_answers_list,
#         ) = ([], [], [], [], [], [], [])
#
#         for sample in samples:
#             image_list.append(sample["image"])
#             question_list.append(sample["text_input"])
#             question_id_list.append(sample["question_id"])
#             instance_id_list.append(sample["instance_id"])
#             choices_list.append(sample["choices"])
#             correct_choice_idx_list.append(sample["correct_choice_idx"])
#             direct_answers_list.append(sample["direct_answers"])
#
#         return {
#             "image": torch.stack(image_list, dim=0),
#             "text_input": question_list,
#             "question_id": question_id_list,
#             "instance_id": instance_id_list,
#             "choices": choices_list,
#             "correct_choice_idx": correct_choice_idx_list,
#             "direct_answers": direct_answers_list,
#         }
#
#     def __getitem__(self, index):
#         ann = self.annotation[index]
#
#         image_path = os.path.join(self.vis_root, ann["image"])
#         image = Image.open(image_path).convert("RGB")
#
#         image = self.vis_processor(image)
#         question = self.text_processor(ann["question"])
#
#         choices = ann["choices"]
#         if "correct_choice_idx" in ann:
#             correct_choice_idx = ann["correct_choice_idx"]
#         else:
#             correct_choice_idx = None
#
#         if "direct_answers" in ann:
#             direct_answers = ann["direct_answers"]
#         else:
#             direct_answers = None
#
#         return {
#             "image": image,
#             "text_input": question,
#             "question_id": ann["question_id"],
#             "instance_id": ann["instance_id"],
#             "choices": choices,
#             "correct_choice_idx": correct_choice_idx,
#             "direct_answers": direct_answers,
#         }
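`AOKVQADataset.get_data` picks one of the ten `direct_answers` per sample each epoch, with probability proportional to how many annotators gave that answer. The same weighting can be expressed as a small standalone sketch (hypothetical helper mirroring the loop above):

```python
import random
from collections import Counter

def sample_weighted_answer(direct_answers):
    """Pick one answer with probability proportional to annotator agreement."""
    counts = Counter(direct_answers)
    answers = list(counts.keys())
    weights = [c / len(direct_answers) for c in counts.values()]
    return random.choices(answers, weights=weights, k=1)[0]

# e.g. sample_weighted_answer(["red", "red", "crimson"]) returns "red" about 2/3 of the time
```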
minigpt4/datasets/datasets/coco_caption.py (new executable file, 122 lines)
@@ -0,0 +1,122 @@
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os
import json
import torch
import numpy as np

from PIL import Image
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True

from minigpt4.datasets.datasets.caption_datasets import COCOCaptionDataset, CaptionEvalDataset

COCOCapDataset = COCOCaptionDataset


class COCOCapEvalDataset(CaptionEvalDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        image = Image.open(image_path).convert("RGB")

        image = self.vis_processor(image)

        img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]

        return {
            "image": image,
            "image_id": img_id,
            "instance_id": ann["instance_id"],
        }


class NoCapsEvalDataset(CaptionEvalDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_root (string): directory to store the annotation file
        split (string): val or test
        """
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    def __getitem__(self, index):
        ann = self.annotation[index]

        image_path = os.path.join(self.vis_root, ann["image"])
        image = Image.open(image_path).convert("RGB")

        image = self.vis_processor(image)

        img_id = ann["img_id"]

        return {
            "image": image,
            "image_id": img_id,
            "instance_id": ann["instance_id"],
        }


class RefCOCOEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        img_id = data['img_id']
        sent = data['sents']
        image_path = os.path.join(self.root_path, f'{img_id[:27]}.jpg')
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image)
        # question = f"[refer] {sent}"
        question = f"[refer] where is {sent}?"
        # question = f"where is the bounding box location of {sent}?"
        return image, question, img_id


class EvalCaptionData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor
        ann = dict()
        for item in self.loaded_data:
            image_id = item['image_id']
            ann[image_id] = item['image']
        self.ann = [{'image_id': image_id, 'image': ann[image_id]} for image_id in ann]

    def __len__(self):
        return len(self.ann)

    def __getitem__(self, idx):
        data = self.ann[idx]
        image_id = data['image_id']
        img_file = data['image'].split('/')[-1]
        image_path = os.path.join(self.root_path, img_file)
        image = Image.open(image_path).convert('RGB')

        image = self.vis_processor(image)
        question = "[caption] please describe this image?"
        return image, question, image_id
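`RefCOCOEvalData` returns plain `(image, question, img_id)` tuples, so PyTorch's default collation stacks the image tensors and batches the strings into lists. A hedged usage sketch; the annotation file path is a placeholder (the records are assumed to carry `img_id` and `sents` as the class above expects) and the torchvision transform stands in for the model's real eval `vis_processor`:

```python
import json
from torch.utils.data import DataLoader
from torchvision import transforms
from minigpt4.datasets.datasets.coco_caption import RefCOCOEvalData

# Placeholder processor; in practice use the vis_processor built for the model.
vis_processor = transforms.Compose([transforms.Resize((448, 448)), transforms.ToTensor()])

loaded_data = json.load(open("refcoco_testA.json"))  # assumed list of {"img_id": ..., "sents": ...}
dataset = RefCOCOEvalData(loaded_data, vis_processor, root_path="/data/coco/train2014")
loader = DataLoader(dataset, batch_size=8)

for images, questions, img_ids in loader:
    # images: (8, 3, 448, 448) tensor; questions and img_ids: lists of strings
    pass
```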
667
minigpt4/datasets/datasets/coco_dataset.py
Executable file
667
minigpt4/datasets/datasets/coco_dataset.py
Executable file
@ -0,0 +1,667 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
import threading
|
||||
|
||||
# Global lock
|
||||
lock = threading.Lock()
|
||||
|
||||
def sample_object_bbox(objects, bbox):
|
||||
|
||||
|
||||
|
||||
zipped_list = list(zip(objects, bbox))
|
||||
|
||||
# Shuffle the zipped list
|
||||
random.shuffle(zipped_list)
|
||||
|
||||
# Generate the new string with interleaved format
|
||||
# interleaved_list = str([{'{},{}'.format(obj, str(bbox).replace("[","").replace("]","") )} for obj, bbox in zipped_list])
|
||||
|
||||
# print("objects", objects)
|
||||
# print("bbox",bbox)
|
||||
|
||||
interleaved_list = str([{'{},{}'.format(obj, bbox.strip())} for obj, bbox in zipped_list]).replace("'","").replace("[","").replace("]","")
|
||||
|
||||
# interleaved_list = " "+interleaved_list
|
||||
# print(interleaved_list)
|
||||
return interleaved_list
|
||||
|
||||
def bbox_to_object(objects, bbox):
|
||||
|
||||
index_sample = random.sample(range(len(objects)),1)[0]
|
||||
|
||||
sample_object = str(objects[index_sample])
|
||||
sample_bbox = bbox[index_sample]
|
||||
# sample_center_point = center_point[index_sample]
|
||||
|
||||
sample_bbox = r"{"+str(sample_bbox) + "}"
|
||||
return sample_bbox, sample_object
|
||||
|
||||
def object_to_bbox(objects, bbox, center_point):
|
||||
index_sample = random.sample(range(len(objects)),1)[0]
|
||||
|
||||
sample_object = objects[index_sample]
|
||||
sample_bbox = bbox[index_sample]
|
||||
sample_center_point = center_point[index_sample]
|
||||
|
||||
instruction = "what is object and the bounding box in the center coordinate of "+str(sample_center_point)+"? "
|
||||
answer = "{"+str(sample_object)+","+str(sample_bbox)+"}"
|
||||
|
||||
|
||||
|
||||
return instruction, answer
|
||||
|
||||
|
||||
class COCOBBOXDataset(BaseDataset):
|
||||
def __init__(self, vis_processor, text_processor, location):
|
||||
super().__init__(vis_processor=vis_processor, text_processor=text_processor)
|
||||
|
||||
print("coco box dataset")
|
||||
self.inner_dataset = wds.DataPipeline(
|
||||
wds.ResampledShards(location),
|
||||
wds.tarfile_to_samples(handler=wds.warn_and_continue),
|
||||
wds.shuffle(1000, handler=wds.warn_and_continue),
|
||||
wds.decode("pilrgb", handler=wds.warn_and_continue),
|
||||
wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
|
||||
wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
|
||||
wds.map(self.to_dict, handler=wds.warn_and_continue),
|
||||
)
|
||||
|
||||
def to_dict(self, sample):
|
||||
objects = sample[1]["objects"]
|
||||
boxes = sample[1]["bbox"]
|
||||
caption = sample[1]["caption"]
|
||||
|
||||
|
||||
new_bboxes = []
|
||||
|
||||
image_size = sample[0].shape[1]
|
||||
image_size = 100
|
||||
for index in range(len(boxes)):
|
||||
box = boxes[index]
|
||||
x1 = int(box[0]*image_size)
|
||||
y1 = int(box[1]*image_size)
|
||||
x2 = x1 + int(box[2]*image_size)
|
||||
y2 = y1 + int(box[3]*image_size)
|
||||
assert x1>=0 and x1<=image_size
|
||||
assert x2>=0 and x2<=image_size
|
||||
assert y1>=0 and y1<=image_size
|
||||
assert y2>=0 and y2<=image_size
|
||||
|
||||
new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
|
||||
# new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
|
||||
new_bboxes.append(new_bbox)
|
||||
|
||||
instruction = r"Given an image, identify the objects and their bounding boxes in the format of {object,x1 y1 x2 y2}. "
|
||||
instruction = "<Img><ImageHere></Img> {}".format(self.text_processor(instruction))
|
||||
|
||||
answer = sample_object_bbox(objects, new_bboxes)
|
||||
|
||||
# print("instruction",instruction)
|
||||
# print("answer", answer)
|
||||
|
||||
return {
|
||||
"image": sample[0],
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"data_type": "bbox",
|
||||
"question_split": True
|
||||
}
|
||||
|
||||
|
||||
class COCOBboxToObjectDataset(BaseDataset):
|
||||
def __init__(self, vis_processor, text_processor, location):
|
||||
super().__init__(vis_processor=vis_processor, text_processor=text_processor)
|
||||
|
||||
|
||||
self.inner_dataset = wds.DataPipeline(
|
||||
wds.ResampledShards(location),
|
||||
wds.tarfile_to_samples(handler=wds.warn_and_continue),
|
||||
wds.shuffle(1000, handler=wds.warn_and_continue),
|
||||
wds.decode("pilrgb", handler=wds.warn_and_continue),
|
||||
wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
|
||||
wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
|
||||
wds.map(self.to_dict, handler=wds.warn_and_continue),
|
||||
)
|
||||
|
||||
|
||||
self.instruction_pool = [
|
||||
"<Img><ImageHere></Img> what object is in this bounding box location {} ",
|
||||
"<Img><ImageHere></Img> what object is in this location {} ",
|
||||
"<Img><ImageHere></Img> identify the object present at this location {} ",
|
||||
"<Img><ImageHere></Img> what is it in bounding box location{} ",
|
||||
"<Img><ImageHere></Img> describe this object in {} ",
|
||||
"<Img><ImageHere></Img> this {} is ",
|
||||
"<Img><ImageHere></Img> the object in {} is ",
|
||||
"<Img><ImageHere></Img> please tell me what is inside the bounding box position {} ",
|
||||
"<Img><ImageHere></Img> what can you find in the bounding box area at position {}? ",
|
||||
"<Img><ImageHere></Img> what is the object occupying this area {} ",
|
||||
"<Img><ImageHere></Img> could you identify the content within the bounding box located at {} ",
|
||||
]
|
||||
|
||||
def to_dict(self, sample):
|
||||
|
||||
objects = sample[1]["objects"]
|
||||
boxes = sample[1]["bbox"]
|
||||
|
||||
new_bboxes = []
|
||||
|
||||
image_size = sample[0].shape[1]
|
||||
image_size=100
|
||||
for index in range(len(boxes)):
|
||||
box = boxes[index]
|
||||
x1 = int(box[0]*image_size)
|
||||
y1 = int(box[1]*image_size)
|
||||
x2 = x1 + int(box[2]*image_size)
|
||||
y2 = y1 + int(box[3]*image_size)
|
||||
assert x1>=0 and x1<=image_size
|
||||
assert x2>=0 and x2<=image_size
|
||||
assert y1>=0 and y1<=image_size
|
||||
assert y2>=0 and y2<=image_size
|
||||
|
||||
new_bbox = "<"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
|
||||
new_bboxes.append(new_bbox)
|
||||
|
||||
bbox, object = bbox_to_object(objects, new_bboxes)
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(bbox)
|
||||
return {
|
||||
"image": sample[0],
|
||||
"instruction_input": instruction,
|
||||
"answer": self.text_processor(object),
|
||||
"data_type": "bbox",
|
||||
"question_split": True
|
||||
}
|
||||
|
||||
|
||||
|
||||
# class ReferCOCODataset(Dataset):
|
||||
# def __init__(self, vis_processor, text_processor, vis_root, ann_path, dataset='refcoco', splitBy='unc'):
|
||||
# """
|
||||
# vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
# ann_root (string): directory to store the annotation file
|
||||
# """
|
||||
# self.vis_root = vis_root
|
||||
|
||||
# self.vis_processor = vis_processor
|
||||
# self.text_processor = text_processor
|
||||
|
||||
# self.refer = REFER(ann_path, vis_root, dataset, splitBy)
|
||||
# self.ref_ids = self.refer.getRefIds()
|
||||
|
||||
|
||||
# self.instruction_pool = [
|
||||
# "[refer] {}",
|
||||
# "[refer] give me the location of {}",
|
||||
# "[refer] where is {} ?",
|
||||
# "[refer] from this image, tell me the location of {}",
|
||||
# "[refer] the location of {} is",
|
||||
# "[refer] could you tell me the location for {} ?",
|
||||
# "[refer] where can I locate the {} ?",
|
||||
# ]
|
||||
|
||||
|
||||
# def __len__(self):
|
||||
# return len(self.ref_ids)
|
||||
|
||||
# def preprocess(self, index):
|
||||
# ref_id = self.ref_ids[index]
|
||||
# ref = self.refer.loadRefs(ref_id)[0]
|
||||
|
||||
# image_file = 'COCO_train2014_{:0>12}.jpg'.format(ref["image_id"])
|
||||
# image_path = os.path.join(self.vis_root, image_file)
|
||||
# image = Image.open(image_path).convert("RGB")
|
||||
# image_orig_size = image.size
|
||||
# image = self.vis_processor(image)
|
||||
# image_new_size = [image.shape[1], image.shape[2]]
|
||||
|
||||
# image_new_size = [100,100]
|
||||
|
||||
# sample_sentence = random.choice(ref['sentences'])['raw']
|
||||
|
||||
# refer_sentence = self.text_processor(sample_sentence)
|
||||
|
||||
|
||||
# bbox = self.refer.getRefBox(ref['ref_id'])
|
||||
|
||||
# bbox_to_save = bbox
|
||||
# image_id_to_save = ref["image_id"]
|
||||
# ref_id_to_save = ref_id
|
||||
|
||||
# item = {"image":image_id_to_save,"bbox":bbox_to_save,"ref id":ref_id_to_save, "sentence":refer_sentence}
|
||||
|
||||
|
||||
# def save_to_file():
|
||||
# with lock:
|
||||
# with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json", "r") as f:
|
||||
# refer_json = json.load(f)
|
||||
|
||||
# if ref_id_to_save not in refer_json.keys():
|
||||
# print(item)
|
||||
# refer_json[ref_id_to_save] = item
|
||||
|
||||
# with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json", "w") as f:
|
||||
# json.dump(refer_json, f)
|
||||
|
||||
|
||||
# save_to_file()
|
||||
# # with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json","r") as f:
|
||||
# # refer_json = json.load(open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json"))
|
||||
|
||||
# # if ref_id_to_save not in refer_json.keys():
|
||||
# # print(item)
|
||||
# # refer_json[ref_id_to_save] = item
|
||||
|
||||
# # with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json","w") as f:
|
||||
# # json.dump(refer_json,f)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# bbox = [
|
||||
# bbox[0] / image_orig_size[0] * image_new_size[0],
|
||||
# bbox[1] / image_orig_size[1] * image_new_size[1],
|
||||
# (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
|
||||
# (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
|
||||
# ]
|
||||
# bbox = [int(x) for x in bbox]
|
||||
# bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
|
||||
# return {
|
||||
# "image": image,
|
||||
# "refer_sentence": refer_sentence,
|
||||
# "bbox": bbox,
|
||||
# "image_id": ref['image_id'],
|
||||
# }
|
||||
|
||||
# def __getitem__(self, index):
|
||||
# data = self.preprocess(index)
|
||||
# instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
|
||||
|
||||
# instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
# return {
|
||||
# "image": data['image'],
|
||||
# "instruction_input": instruction,
|
||||
# "answer": data['bbox'],
|
||||
# "image_id": data['image_id'],
|
||||
# }
|
||||
|
||||
|
||||
# class InvReferCOCODataset(ReferCOCODataset):
|
||||
# def __init__(self, *args, **kwargs):
|
||||
# super(InvReferCOCODataset, self).__init__(*args, **kwargs)
|
||||
|
||||
# self.instruction_pool = [
|
||||
# "[identify] {}",
|
||||
# "[identify] what object is in this location {}",
|
||||
# "[identify] identify the object present at this location {}",
|
||||
# "[identify] what is it in {}",
|
||||
# "[identify] describe this object in {}",
|
||||
# "[identify] this {} is",
|
||||
# "[identify] the object in {} is",
|
||||
# ]
|
||||
|
||||
# def __getitem__(self, index):
|
||||
# data = self.preprocess(index)
|
||||
|
||||
# instruction = random.choice(self.instruction_pool).format(data['bbox'])
|
||||
|
||||
# instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
# return {
|
||||
# "image": data['image'],
|
||||
# "instruction_input": instruction,
|
||||
# "answer": self.text_processor(data['refer_sentence']),
|
||||
# "image_id": data['image_id'],
|
||||
# }
|
||||
|
||||
|
||||
class ReferCOCODataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path, dataset='refcoco', splitBy='unc'):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.refer = REFER(ann_path, vis_root, dataset, splitBy)
|
||||
self.ref_ids = self.refer.getRefIds(split="train")
|
||||
|
||||
print(dataset, len(self.ref_ids))
|
||||
|
||||
self.instruction_pool = [
|
||||
"[refer] {}",
|
||||
"[refer] give me the location of {}",
|
||||
"[refer] where is {} ?",
|
||||
"[refer] from this image, tell me the location of {}",
|
||||
"[refer] the location of {} is",
|
||||
"[refer] could you tell me the location for {} ?",
|
||||
"[refer] where can I locate the {} ?",
|
||||
]
|
||||
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ref_ids)
|
||||
|
||||
def preprocess(self, index):
|
||||
ref_id = self.ref_ids[index]
|
||||
ref = self.refer.loadRefs(ref_id)[0]
|
||||
|
||||
image_file = 'COCO_train2014_{:0>12}.jpg'.format(ref["image_id"])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image_orig_size = image.size
|
||||
image = self.vis_processor(image)
|
||||
        # image_new_size = [image.shape[1], image.shape[2]]

        # bounding boxes are expressed on a fixed 100x100 coordinate grid
        image_new_size = [100, 100]
|
||||
|
||||
sample_sentence = random.choice(ref['sentences'])['raw']
|
||||
refer_sentence = self.text_processor(sample_sentence)
|
||||
|
||||
|
||||
bbox = self.refer.getRefBox(ref['ref_id'])
|
||||
bbox = [
|
||||
bbox[0] / image_orig_size[0] * image_new_size[0],
|
||||
bbox[1] / image_orig_size[1] * image_new_size[1],
|
||||
(bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
|
||||
(bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
|
||||
]
|
||||
bbox = [int(x) for x in bbox]
|
||||
bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
|
||||
return {
|
||||
"image": image,
|
||||
"refer_sentence": refer_sentence,
|
||||
"bbox": bbox,
|
||||
"image_id": ref['image_id'],
|
||||
}
|
||||
|
||||
def __getitem__(self, index):
|
||||
data = self.preprocess(index)
|
||||
instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
|
||||
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": data['image'],
|
||||
"instruction_input": instruction,
|
||||
"answer": data['bbox'],
|
||||
"image_id": data['image_id'],
|
||||
}
|
||||
|
||||
|
||||
class InvReferCOCODataset(ReferCOCODataset):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(InvReferCOCODataset, self).__init__(*args, **kwargs)
|
||||
|
||||
self.instruction_pool = [
|
||||
"[identify] {}",
|
||||
"[identify] what object is in this location {}",
|
||||
"[identify] identify the object present at this location {}",
|
||||
"[identify] what is it in {}",
|
||||
"[identify] describe this object in {}",
|
||||
"[identify] this {} is",
|
||||
"[identify] the object in {} is",
|
||||
]
|
||||
|
||||
def __getitem__(self, index):
|
||||
data = self.preprocess(index)
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(data['bbox'])
|
||||
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": data['image'],
|
||||
"instruction_input": instruction,
|
||||
"answer": self.text_processor(data['refer_sentence']),
|
||||
"image_id": data['image_id'],
|
||||
}
|
||||
|
||||
|
||||
class REFER:
|
||||
def __init__(self, data_root, vis_root, dataset='refcoco', splitBy='unc'):
|
||||
# provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
|
||||
# also provide dataset name and splitBy information
|
||||
# e.g., dataset = 'refcoco', splitBy = 'unc'
|
||||
dataset = dataset.split('inv')[-1] # inv dataset is stored in the same path as normal dataset
|
||||
print('loading dataset %s into memory...' % dataset)
|
||||
self.ann_dir = os.path.join(data_root, dataset)
|
||||
        if dataset in ['refcoco', 'refcoco+', 'refcocog']:
            self.vis_root = vis_root
        elif dataset == 'refclef':
            # raising a bare string is invalid in Python 3; use a proper exception
            raise ValueError('No RefClef image data')
        else:
            raise ValueError('No refer dataset is called [%s]' % dataset)
|
||||
|
||||
# load refs from data/dataset/refs(dataset).json
|
||||
tic = time.time()
|
||||
ref_file = os.path.join(self.ann_dir, 'refs(' + splitBy + ').p')
|
||||
self.data = {}
|
||||
self.data['dataset'] = dataset
|
||||
self.data['refs'] = pickle.load(open(ref_file, 'rb'))
|
||||
|
||||
# load annotations from data/dataset/instances.json
|
||||
instances_file = os.path.join(self.ann_dir, 'instances.json')
|
||||
instances = json.load(open(instances_file, 'r'))
|
||||
self.data['images'] = instances['images']
|
||||
self.data['annotations'] = instances['annotations']
|
||||
self.data['categories'] = instances['categories']
|
||||
|
||||
# create index
|
||||
self.createIndex()
|
||||
print('DONE (t=%.2fs)' % (time.time() - tic))
|
||||
|
||||
def createIndex(self):
|
||||
# create sets of mapping
|
||||
# 1) Refs: {ref_id: ref}
|
||||
# 2) Anns: {ann_id: ann}
|
||||
# 3) Imgs: {image_id: image}
|
||||
# 4) Cats: {category_id: category_name}
|
||||
# 5) Sents: {sent_id: sent}
|
||||
# 6) imgToRefs: {image_id: refs}
|
||||
# 7) imgToAnns: {image_id: anns}
|
||||
# 8) refToAnn: {ref_id: ann}
|
||||
# 9) annToRef: {ann_id: ref}
|
||||
# 10) catToRefs: {category_id: refs}
|
||||
# 11) sentToRef: {sent_id: ref}
|
||||
# 12) sentToTokens: {sent_id: tokens}
|
||||
print('creating index...')
|
||||
# fetch info from instances
|
||||
Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
|
||||
for ann in self.data['annotations']:
|
||||
Anns[ann['id']] = ann
|
||||
imgToAnns[ann['image_id']] = imgToAnns.get(ann['image_id'], []) + [ann]
|
||||
for img in self.data['images']:
|
||||
Imgs[img['id']] = img
|
||||
for cat in self.data['categories']:
|
||||
Cats[cat['id']] = cat['name']
|
||||
|
||||
# fetch info from refs
|
||||
Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
|
||||
Sents, sentToRef, sentToTokens = {}, {}, {}
|
||||
for ref in self.data['refs']:
|
||||
# ids
|
||||
ref_id = ref['ref_id']
|
||||
ann_id = ref['ann_id']
|
||||
category_id = ref['category_id']
|
||||
image_id = ref['image_id']
|
||||
|
||||
# add mapping related to ref
|
||||
Refs[ref_id] = ref
|
||||
imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
|
||||
catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
|
||||
refToAnn[ref_id] = Anns[ann_id]
|
||||
annToRef[ann_id] = ref
|
||||
|
||||
# add mapping of sent
|
||||
for sent in ref['sentences']:
|
||||
Sents[sent['sent_id']] = sent
|
||||
sentToRef[sent['sent_id']] = ref
|
||||
sentToTokens[sent['sent_id']] = sent['tokens']
|
||||
|
||||
# create class members
|
||||
self.Refs = Refs
|
||||
self.Anns = Anns
|
||||
self.Imgs = Imgs
|
||||
self.Cats = Cats
|
||||
self.Sents = Sents
|
||||
self.imgToRefs = imgToRefs
|
||||
self.imgToAnns = imgToAnns
|
||||
self.refToAnn = refToAnn
|
||||
self.annToRef = annToRef
|
||||
self.catToRefs = catToRefs
|
||||
self.sentToRef = sentToRef
|
||||
self.sentToTokens = sentToTokens
|
||||
print('index created.')
|
||||
|
||||
def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''):
|
||||
image_ids = image_ids if type(image_ids) == list else [image_ids]
|
||||
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
|
||||
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
||||
|
||||
if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
|
||||
refs = self.data['refs']
|
||||
else:
|
||||
if not len(image_ids) == 0:
|
||||
refs = [self.imgToRefs[image_id] for image_id in image_ids]
|
||||
else:
|
||||
refs = self.data['refs']
|
||||
if not len(cat_ids) == 0:
|
||||
refs = [ref for ref in refs if ref['category_id'] in cat_ids]
|
||||
if not len(ref_ids) == 0:
|
||||
refs = [ref for ref in refs if ref['ref_id'] in ref_ids]
|
||||
if not len(split) == 0:
|
||||
if split in ['testA', 'testB', 'testC']:
|
||||
refs = [ref for ref in refs if
|
||||
split[-1] in ref['split']] # we also consider testAB, testBC, ...
|
||||
elif split in ['testAB', 'testBC', 'testAC']:
|
||||
refs = [ref for ref in refs if ref['split'] == split] # rarely used I guess...
|
||||
elif split == 'test':
|
||||
refs = [ref for ref in refs if 'test' in ref['split']]
|
||||
elif split == 'train' or split == 'val':
|
||||
refs = [ref for ref in refs if ref['split'] == split]
|
||||
                else:
                    # raising a bare string is invalid in Python 3; use a proper exception
                    raise ValueError('No such split [%s]' % split)
|
||||
ref_ids = [ref['ref_id'] for ref in refs]
|
||||
return ref_ids
|
||||
|
||||
def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
|
||||
image_ids = image_ids if type(image_ids) == list else [image_ids]
|
||||
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
|
||||
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
||||
|
||||
if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
|
||||
ann_ids = [ann['id'] for ann in self.data['annotations']]
|
||||
else:
|
||||
if not len(image_ids) == 0:
|
||||
lists = [self.imgToAnns[image_id] for image_id in image_ids if image_id in self.imgToAnns] # list of [anns]
|
||||
anns = list(itertools.chain.from_iterable(lists))
|
||||
else:
|
||||
anns = self.data['annotations']
|
||||
if not len(cat_ids) == 0:
|
||||
anns = [ann for ann in anns if ann['category_id'] in cat_ids]
|
||||
ann_ids = [ann['id'] for ann in anns]
|
||||
            if not len(ref_ids) == 0:
                # keep only the annotations that belong to the requested ref_ids
                ann_ids = list(set(ann_ids).intersection(set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids])))
        return ann_ids
|
||||
|
||||
def getImgIds(self, ref_ids=[]):
|
||||
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
||||
|
||||
if not len(ref_ids) == 0:
|
||||
image_ids = list(set([self.Refs[ref_id]['image_id'] for ref_id in ref_ids]))
|
||||
else:
|
||||
image_ids = self.Imgs.keys()
|
||||
return image_ids
|
||||
|
||||
def getCatIds(self):
|
||||
return self.Cats.keys()
|
||||
|
||||
def loadRefs(self, ref_ids=[]):
|
||||
if type(ref_ids) == list:
|
||||
return [self.Refs[ref_id] for ref_id in ref_ids]
|
||||
elif type(ref_ids) == int:
|
||||
return [self.Refs[ref_ids]]
|
||||
|
||||
def loadAnns(self, ann_ids=[]):
|
||||
if type(ann_ids) == list:
|
||||
return [self.Anns[ann_id] for ann_id in ann_ids]
|
||||
elif type(ann_ids) == int:
|
||||
return [self.Anns[ann_ids]]
|
||||
|
||||
def loadImgs(self, image_ids=[]):
|
||||
if type(image_ids) == list:
|
||||
return [self.Imgs[image_id] for image_id in image_ids]
|
||||
elif type(image_ids) == int:
|
||||
return [self.Imgs[image_ids]]
|
||||
|
||||
def loadCats(self, cat_ids=[]):
|
||||
if type(cat_ids) == list:
|
||||
return [self.Cats[cat_id] for cat_id in cat_ids]
|
||||
elif type(cat_ids) == int:
|
||||
return [self.Cats[cat_ids]]
|
||||
|
||||
def getRefBox(self, ref_id):
|
||||
ref = self.Refs[ref_id]
|
||||
ann = self.refToAnn[ref_id]
|
||||
return ann['bbox'] # [x, y, w, h]
|
||||
|
||||
def showRef(self, ref, seg_box='box'):
|
||||
ax = plt.gca()
|
||||
# show image
|
||||
image = self.Imgs[ref['image_id']]
|
||||
I = io.imread(os.path.join(self.vis_root, image['file_name']))
|
||||
ax.imshow(I)
|
||||
# show refer expression
|
||||
for sid, sent in enumerate(ref['sentences']):
|
||||
print('%s. %s' % (sid + 1, sent['sent']))
|
||||
# show segmentations
|
||||
if seg_box == 'seg':
|
||||
ann_id = ref['ann_id']
|
||||
ann = self.Anns[ann_id]
|
||||
polygons = []
|
||||
color = []
|
||||
c = 'none'
|
||||
if type(ann['segmentation'][0]) == list:
|
||||
# polygon used for refcoco*
|
||||
for seg in ann['segmentation']:
|
||||
                    poly = np.array(seg).reshape((len(seg) // 2, 2))  # integer division needed in Python 3
|
||||
polygons.append(Polygon(poly, True, alpha=0.4))
|
||||
color.append(c)
|
||||
p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 1, 0, 0), linewidths=3, alpha=1)
|
||||
ax.add_collection(p) # thick yellow polygon
|
||||
p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 0, 0, 0), linewidths=1, alpha=1)
|
||||
ax.add_collection(p) # thin red polygon
|
||||
else:
|
||||
# mask used for refclef
|
||||
raise NotImplementedError('RefClef is not downloaded')
|
||||
# show bounding-box
|
||||
elif seg_box == 'box':
|
||||
ann_id = ref['ann_id']
|
||||
ann = self.Anns[ann_id]
|
||||
bbox = self.getRefBox(ref['ref_id'])
|
||||
box_plot = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor='green', linewidth=3)
|
||||
ax.add_patch(box_plot)
|
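For reference, the box string produced by ReferCOCODataset.preprocess above is a plain token on a 100x100 grid. A minimal sketch of the same conversion; the helper name box_to_token is ours and not part of the diff:

def box_to_token(bbox, image_orig_size, grid=100):
    # bbox is COCO-style [x, y, w, h]; image_orig_size is (width, height) as returned by PIL
    w, h = image_orig_size
    x1 = int(bbox[0] / w * grid)
    y1 = int(bbox[1] / h * grid)
    x2 = int((bbox[0] + bbox[2]) / w * grid)
    y2 = int((bbox[1] + bbox[3]) / h * grid)
    return "{{<{}><{}><{}><{}>}}".format(x1, y1, x2, y2)

# box_to_token([120, 60, 240, 180], (640, 480)) -> "{<18><12><56><50>}"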
184
minigpt4/datasets/datasets/coco_vqa_datasets.py
Executable file
@ -0,0 +1,184 @@
|
||||
"""
|
||||
Copyright (c) 2022, salesforce.com, inc.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import random
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
|
||||
class __DisplMixin:
|
||||
def displ_item(self, index):
|
||||
sample, ann = self.__getitem__(index), self.annotation[index]
|
||||
|
||||
return OrderedDict(
|
||||
{
|
||||
"file": ann["image"],
|
||||
"question": ann["question"],
|
||||
"question_id": ann["question_id"],
|
||||
"answers": "; ".join(ann["answer"]),
|
||||
"image": sample["image"],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class COCOVQADataset(VQADataset, __DisplMixin):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
|
||||
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
|
||||
|
||||
self.instruction_pool =[
|
||||
"[vqa] {}",
|
||||
"[vqa] Based on the image, respond to this question with a short answer: {}"
|
||||
]
|
||||
|
||||
exist_annotation = []
|
||||
for ann in self.annotation:
|
||||
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
|
||||
if os.path.exists(image_path):
|
||||
exist_annotation.append(ann)
|
||||
self.annotation = exist_annotation
|
||||
|
||||
|
||||
def get_data(self, index):
|
||||
ann = self.annotation[index]
|
||||
|
||||
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
|
||||
image = self.vis_processor(image)
|
||||
question = self.text_processor(ann["question"])
|
||||
question_id = ann["question_id"]
|
||||
|
||||
answer_weight = {}
|
||||
for answer in ann["answer"]:
|
||||
if answer in answer_weight.keys():
|
||||
answer_weight[answer] += 1 / len(ann["answer"])
|
||||
else:
|
||||
answer_weight[answer] = 1 / len(ann["answer"])
|
||||
|
||||
answers = list(answer_weight.keys())
|
||||
weights = list(answer_weight.values())
|
||||
|
||||
answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights
|
||||
|
||||
# if "unk" in answer:
|
||||
# print("cocovqa", answer)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"question": question,
|
||||
"question_id": question_id,
|
||||
"answer": answer,
|
||||
}
|
||||
|
||||
def __getitem__(self, index):
|
||||
data = self.get_data(index)
|
||||
instruction = random.choice(self.instruction_pool).format(data['question'])
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": data['image'],
|
||||
"question_id": data["question_id"],
|
||||
"instruction_input": instruction,
|
||||
"answer": self.text_processor(data['answer']),
|
||||
}
|
||||
|
||||
|
||||
class COCOVQGDataset(COCOVQADataset):
|
||||
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
|
||||
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
|
||||
self.instruction_pool = [
|
||||
'Given the image, generate a question whose answer is: {}',
|
||||
'Based on the image, provide a question with the answer: {}',
|
||||
'Given the visual representation, create a question for which the answer is "{}"',
|
||||
'From the image provided, craft a question that leads to the reply: {}',
|
||||
'Considering the picture, come up with a question where the answer is: {}',
|
||||
            'Taking the image into account, generate a question that has the answer: {}'
|
||||
]
|
||||
|
||||
def __getitem__(self, index):
|
||||
data = self.get_data(index)
|
||||
instruction = random.choice(self.instruction_pool).format(data['answer'])
|
||||
instruction = "<Img><ImageHere></Img> {}".format(instruction)
|
||||
|
||||
return {
|
||||
"image": data['image'],
|
||||
"question_id": data["question_id"],
|
||||
"instruction_input": instruction,
|
||||
"answer": data['question'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
class COCOVQAEvalDataset(VQAEvalDataset, __DisplMixin):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
|
||||
self.instruction_pool = [
|
||||
# '{}',
|
||||
# 'Question: {}',
|
||||
# '{} A short answer to the question is',
|
||||
# 'Q: {} A:',
|
||||
'Question: {} Short answer:',
|
||||
# 'Given the image, answer the following question with no more than three words. {}',
|
||||
# 'Based on the image, respond to this question with a short answer: {}.',
|
||||
# 'Use the provided image to answer the question: {} Provide your answer as short as possible.',
|
||||
# 'What is the answer to the following question? "{}"',
|
||||
# 'The question "{}" can be answered using the image. A short answer is'
|
||||
]
|
||||
# print('vis_root', vis_root)
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.annotation = json.load(open(ann_paths[0]))
|
||||
|
||||
answer_list_path = ann_paths[1]
|
||||
if os.path.exists(answer_list_path):
|
||||
self.answer_list = json.load(open(answer_list_path))
|
||||
else:
|
||||
self.answer_list = None
|
||||
|
||||
try:
|
||||
self.coco_fmt_qust_file = ann_paths[2]
|
||||
self.coco_fmt_anno_file = ann_paths[3]
|
||||
except IndexError:
|
||||
self.coco_fmt_qust_file = None
|
||||
self.coco_fmt_anno_file = None
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self._add_instance_ids()
|
||||
|
||||
def __getitem__(self, index):
|
||||
ann = self.annotation[index]
|
||||
|
||||
image_path = os.path.join(self.vis_root, ann["image"])
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
|
||||
image = self.vis_processor(image)
|
||||
question = self.text_processor(ann["question"])
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(question)
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
'image_path': image_path,
|
||||
"question": question,
|
||||
"question_id": ann["question_id"],
|
||||
"instruction_input": instruction,
|
||||
"instance_id": ann["instance_id"],
|
||||
}
|
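A small worked example of the weighted answer sampling in COCOVQADataset.get_data above; the annotator answers are made up for illustration:

import random

annotator_answers = ["2", "2", "two", "2", "2", "two", "2", "2", "2", "two"]
answer_weight = {}
for answer in annotator_answers:
    # each of the ten VQA annotator answers contributes 1/10; duplicates accumulate
    answer_weight[answer] = answer_weight.get(answer, 0) + 1 / len(annotator_answers)

# answer_weight -> {"2": 0.7, "two": 0.3} (up to floating point error)
answer = random.choices(list(answer_weight.keys()), weights=list(answer_weight.values()), k=1)[0]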
290
minigpt4/datasets/datasets/doc_dataset.py
Executable file
@ -0,0 +1,290 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
|
||||
|
||||
class SingleSlideVQADataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
self.data = self.create_data(ann_path)
|
||||
|
||||
# self.instruction_pool = [
|
||||
# "###Human: <Img><ImageHere></Img> {}###Assistant: ",
|
||||
# "###Human: <Img><ImageHere></Img> From this slide, {}###Assistant: ",
|
||||
# ]
|
||||
self.instruction_pool = [
|
||||
"<Img><ImageHere></Img> {}",
|
||||
"<Img><ImageHere></Img> From this slide, {}",
|
||||
]
|
||||
def create_data(self, ann_path):
|
||||
with open(ann_path, 'r') as f:
|
||||
samples = f.readlines()
|
||||
data = []
|
||||
for sample in samples:
|
||||
sample = json.loads(sample)
|
||||
if len(sample['evidence_pages']) != 1: continue # skip questions that need more than one slide page
|
||||
page = sample['evidence_pages'][0]
|
||||
image_name = 'slide_{}_1024.jpg'.format(page)
|
||||
# assert [int(image_name.split('-')[-2]) for image_name in image_names] == list(range(1, 21)) # check the format
|
||||
image_path = os.path.join(sample['deck_name'], image_name)
|
||||
data.append({
|
||||
'qa_id': sample['qa_id'],
|
||||
'question': sample['question'],
|
||||
'answer': sample['answer'],
|
||||
'image_path': image_path
|
||||
})
|
||||
|
||||
print("single slide ",len(data))
|
||||
return data
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
sample = self.data[index]
|
||||
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
# instruction = self.text_processor(sample["question"])
|
||||
instruction = random.choice(self.instruction_pool).format(self.text_processor(sample["question"]))
|
||||
|
||||
# instruction = random.choice(self.instruction_pool).format(self.text_processor(sample["question"]))
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": sample['answer'],
|
||||
"qa_id": sample['qa_id'],
|
||||
}
|
||||
|
||||
|
||||
class OCRVQADataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
self.data = self.create_data(ann_path)
|
||||
|
||||
self.instruction_pool =[
|
||||
"[vqa] {}",
|
||||
"[vqa] Based on the image, respond to this question with a short answer: {}"
|
||||
]
|
||||
|
||||
def create_data(self, ann_path):
|
||||
processed_data = []
|
||||
with open(ann_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
for k in data.keys():
|
||||
if data[k]['split'] != 1: continue # 1 for training, 2 for validation, 3 for test
|
||||
ext = os.path.splitext(data[k]['imageURL'])[1]
|
||||
imageFile = k + ext
|
||||
assert len(data[k]['questions']) == len(data[k]['answers'])
|
||||
for q, a in zip(data[k]['questions'], data[k]['answers']):
|
||||
processed_data.append(
|
||||
{'question': q,
|
||||
'answer': a,
|
||||
'image_path': imageFile,
|
||||
'image_id': k,
|
||||
'title': data[k]['title'],
|
||||
'genre': data[k]['genre'],
|
||||
}
|
||||
)
|
||||
print("ocr vqa", len(processed_data))
|
||||
return processed_data
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
sample = self.data[index]
|
||||
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
question = self.text_processor(sample["question"])
|
||||
answer = self.text_processor(sample["answer"])
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(question)
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": sample['image_id']
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class TextOCRDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
self.data = self.create_data(ann_path)
|
||||
|
||||
self.instruction_pool = [
|
||||
"<Img><ImageHere></Img> [OCR] {}"
|
||||
]
|
||||
|
||||
def create_data(self, ann_path):
|
||||
processed_data = []
|
||||
with open(ann_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
for k in data["anns"].keys():
|
||||
# ext = os.path.splitext(data[k]['imageURL'])[1]
|
||||
imageFile = data["anns"][k]["image_id"]+".jpg"
|
||||
bbox = data["anns"][k]["bbox"]
|
||||
text = data["anns"][k]["utf8_string"]
|
||||
# assert len(data[k]['questions']) == len(data[k]['answers'])
|
||||
# for q, a in zip(data[k]['questions'], data[k]['answers']):
|
||||
|
||||
processed_data.append(
|
||||
{'bbox': bbox,
|
||||
'answer': text,
|
||||
'image_path': imageFile,
|
||||
'image_id': k,
|
||||
}
|
||||
)
|
||||
|
||||
return processed_data
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
sample = self.data[index]
|
||||
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
|
||||
width, height = image.size
|
||||
image = self.vis_processor(image)
|
||||
|
||||
new_bbox =""
|
||||
image_size = 100
|
||||
bbox = sample['bbox']
|
||||
for index in range(len(bbox)):
|
||||
|
||||
x1 = int(bbox[0]/width*image_size)
|
||||
y1 = int(bbox[1]/height*image_size)
|
||||
x2 = x1 + int(bbox[2]/width*image_size)
|
||||
y2 = y1 + int(bbox[3]/height*image_size)
|
||||
assert x1>=0 and x1<=image_size
|
||||
assert x2>=0 and x2<=image_size
|
||||
assert y1>=0 and y1<=image_size
|
||||
assert y2>=0 and y2<=image_size
|
||||
|
||||
new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(new_bbox)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": sample['answer'],
|
||||
"image_id": sample['image_id']
|
||||
}
|
||||
|
||||
|
||||
|
||||
class PlotVQADataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
self.data = self.create_data(ann_path)
|
||||
|
||||
self.instruction_pool = [
|
||||
'{}',
|
||||
'Question: {}',
|
||||
'{} A short answer to the question is',
|
||||
'Q: {} A:',
|
||||
'Question: {} Short answer:',
|
||||
# 'Given the image, answer the following question with no more than three words. {}',
|
||||
'Based on the image, respond to this question with a short answer: {}.',
|
||||
'Use the provided image to answer the question: {} Provide your answer as short as possible.',
|
||||
'What is the answer to the following question? "{}"',
|
||||
'The question "{}" can be answered using the image. A short answer is'
|
||||
]
|
||||
|
||||
def create_data(self, ann_path):
|
||||
processed_data = []
|
||||
with open(ann_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
for da in data["qa_pairs"]:
|
||||
# ext = os.path.splitext(data[k]['imageURL'])[1]
|
||||
|
||||
imageFile = str(da["image_index"])+".png"
|
||||
question = da["question_string"]
|
||||
answer = str(da["answer"])
|
||||
# assert len(data[k]['questions']) == len(data[k]['answers'])
|
||||
# for q, a in zip(data[k]['questions'], data[k]['answers']):
|
||||
|
||||
processed_data.append(
|
||||
{'question': question,
|
||||
'answer': answer,
|
||||
'image_path': imageFile,
|
||||
'image_id': str(da["image_index"]),
|
||||
}
|
||||
)
|
||||
|
||||
return processed_data
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
sample = self.data[index]
|
||||
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
|
||||
# width, height = image.size
|
||||
image = self.vis_processor(image)
|
||||
|
||||
|
||||
# image_shape = image.shape
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(sample["question"])
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(instruction)
|
||||
|
||||
answer = sample["answer"]
|
||||
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": sample['image_id']
|
||||
}
|
||||
|
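A rough usage sketch for the classes in this file; the processors and paths below are placeholders, not values from the diff. Each dataset here returns a dict with at least "image", "instruction_input" and "answer", so the trainer can consume them uniformly.

from minigpt4.datasets.datasets.doc_dataset import OCRVQADataset

dataset = OCRVQADataset(
    vis_processor=lambda img: img,    # placeholder; a real vision processor returns a tensor
    text_processor=lambda txt: txt,   # placeholder text processor
    vis_root="/path/to/ocrvqa/images",         # hypothetical path
    ann_path="/path/to/ocrvqa/dataset.json",   # hypothetical annotation file
)
sample = dataset[0]
# sample -> {"image": ..., "instruction_input": "<Img><ImageHere></Img> [vqa] ...", "answer": ..., "image_id": ...}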
159
minigpt4/datasets/datasets/flickr.py
Executable file
@ -0,0 +1,159 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
|
||||
|
||||
class GroundedDetailDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.instruction_pool = [
|
||||
'[grounding] please describe this image in details',
|
||||
'[grounding] describe this image as detailed as possible',
|
||||
'[grounding] summarize this image in details',
|
||||
'[grounding] give a thorough description of what you see in this image',
|
||||
]
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
|
||||
image_file = '{}.jpg'.format(info['image_id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
answer = info['grounded_caption']
|
||||
|
||||
instruction = random.choice(self.instruction_pool)
|
||||
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": info['image_id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class CaptionToObjectDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.instruction_pool = [
|
||||
'[detection] {}',
|
||||
]
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
|
||||
image_file = '{}.jpg'.format(info['image_id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
input = info["caption"]
|
||||
answer = info["output"]
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(input)
|
||||
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": info['image_id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class PhraseToObjectDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.instruction_pool = [
|
||||
'[detection] {}',
|
||||
]
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
|
||||
image_file = '{}.jpg'.format(info['image_id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
input = info["phrase"]
|
||||
answer = "<p>"+input+"</p> "+info["bbox"]
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(input)
|
||||
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": info['image_id'],
|
||||
}
|
65
minigpt4/datasets/datasets/gqa_datasets.py
Executable file
@ -0,0 +1,65 @@
|
||||
"""
|
||||
Copyright (c) 2022, salesforce.com, inc.
|
||||
All rights reserved.
|
||||
SPDX-License-Identifier: BSD-3-Clause
|
||||
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from minigpt4.datasets.datasets.vqa_datasets import VQADataset
|
||||
|
||||
from collections import OrderedDict
|
||||
import random
|
||||
|
||||
class __DisplMixin:
|
||||
def displ_item(self, index):
|
||||
sample, ann = self.__getitem__(index), self.annotation[index]
|
||||
|
||||
return OrderedDict(
|
||||
{
|
||||
"file": ann["image"],
|
||||
"question": ann["question"],
|
||||
"question_id": ann["question_id"],
|
||||
"answers": "; ".join(ann["answer"]),
|
||||
"image": sample["image"],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class GQADataset(VQADataset, __DisplMixin):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
|
||||
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
|
||||
self.instruction_pool =[
|
||||
"[vqa] {}",
|
||||
"[vqa] Based on the image, respond to this question with a short answer: {}"
|
||||
]
|
||||
|
||||
def __getitem__(self, index):
|
||||
ann = self.annotation[index]
|
||||
|
||||
image_path = os.path.join(self.vis_root, ann["image"])
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
|
||||
image = self.vis_processor(image)
|
||||
question = self.text_processor(ann["question"])
|
||||
|
||||
instruction = random.choice(self.instruction_pool).format(question)
|
||||
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
|
||||
|
||||
answers = self.text_processor(ann["answer"])
|
||||
# if "unk" in answers:
|
||||
# print("gqa",answers)
|
||||
|
||||
# print(answers)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answers,
|
||||
# "weights": weights,
|
||||
}
|
||||
|
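The VQA-style datasets above (COCO VQA, OCR-VQA, GQA) share the same prompt convention: a task tag plus the question, wrapped in an image placeholder. A minimal sketch, with a made-up example question:

import random

instruction_pool = [
    "[vqa] {}",
    "[vqa] Based on the image, respond to this question with a short answer: {}",
]
question = "What color is the bus?"
instruction = random.choice(instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
# e.g. "<Img><ImageHere></Img> [vqa] What color is the bus? "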
390
minigpt4/datasets/datasets/llava_dataset.py
Executable file
@ -0,0 +1,390 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
# import iterto
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
|
||||
class LlavaDetailDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
answer = info['conversations'][1]['value']
|
||||
instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
|
||||
|
||||
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": info['id'],
|
||||
}
|
||||
|
||||
class LlavaReasonDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
answer = info['conversations'][1]['value']
|
||||
instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
|
||||
|
||||
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
|
||||
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
# answer = self.text_processor(answer)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
"image_id": info['id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class MiniGPT4v(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
|
||||
self.instruction_pool = [
|
||||
'please describe this image as detailed as possible',
|
||||
'What do you see happening in this image?',
|
||||
"Can you elaborate on the elements of the picture provided?",
|
||||
"Describe the following image.",
|
||||
"Write a detailed description of the given image.",
|
||||
"Write a detailed description of the given image.",
|
||||
"Explain the visual content of the image in great detail"
|
||||
]
|
||||
self.ann=[]
|
||||
|
||||
with open(ann_path,"r") as f:
|
||||
for line in f.readlines():
|
||||
self.ann.append(json.loads(line))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
|
||||
# print("info keys",info.keys())
|
||||
if "image_path" in info.keys():
|
||||
image_path = "/ibex/reference/CV/COCO/cocoapi/data/2017/images/jpeg/train/"+info['image_path']
|
||||
|
||||
else:
|
||||
# print("coming here?")
|
||||
image_file = "images/"+info["image"]
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
# print(image_path)
|
||||
|
||||
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
if "question" in info.keys():
|
||||
question = info['question']
|
||||
else:
|
||||
question = random.sample(self.instruction_pool,1)[0]
|
||||
|
||||
|
||||
answer = info["caption"]
|
||||
|
||||
|
||||
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
|
||||
|
||||
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
# answer = self.text_processor(answer)
|
||||
# print("image path", image_path)
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
# "image_id": info['id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class MiniGPT4v_emotion(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
|
||||
self.instruction_pool = [
|
||||
'please describe this image as detailed as possible',
|
||||
'What do you see happening in this image?',
|
||||
"Can you elaborate on the elements of the picture provided?",
|
||||
"Describe the following image",
|
||||
"Write a detailed description of the given image",
|
||||
"Write a detailed description of the given image",
|
||||
"Explain the visual content of the image in great detail"
|
||||
]
|
||||
# self.ann=[]
|
||||
|
||||
with open(ann_path,"r") as f:
|
||||
# for line in f.readlines():
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
|
||||
# print("info keys",info.keys())
|
||||
|
||||
# print("coming here?")
|
||||
image_file = info["link"]
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
# print("image path",image_path)
|
||||
# print(image_path)
|
||||
|
||||
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
question = random.sample(self.instruction_pool,1)[0]
|
||||
|
||||
|
||||
answer = info["caption"]
|
||||
|
||||
|
||||
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
|
||||
|
||||
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
# answer = self.text_processor(answer)
|
||||
# print("image path", image_path)
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
# "image_id": info['id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class MiniGPT4v_laion(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
|
||||
self.instruction_pool = [
|
||||
'please describe this image as detailed as possible',
|
||||
'What do you see happening in this image?',
|
||||
"Can you elaborate on the elements of the picture provided?",
|
||||
"Describe the following image",
|
||||
"Write a detailed description of the given image",
|
||||
"Write a detailed description of the given image",
|
||||
"Explain the visual content of the image in great detail"
|
||||
]
|
||||
# self.ann=[]
|
||||
|
||||
with open(ann_path,"r") as f:
|
||||
# for line in f.readlines():
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
|
||||
# print("info keys",info.keys())
|
||||
|
||||
# print("coming here?")
|
||||
image_file = info["link"]
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
# print(image_path)
|
||||
# print(image_path)
|
||||
|
||||
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
question = random.sample(self.instruction_pool,1)[0]
|
||||
|
||||
|
||||
answer = info["caption"]
|
||||
|
||||
|
||||
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
|
||||
|
||||
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
|
||||
# answer = self.text_processor(answer)
|
||||
# print("image path", image_path)
|
||||
return {
|
||||
"image": image,
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
# "image_id": info['id'],
|
||||
}
|
||||
|
||||
|
||||
|
||||
class Minigpt2_conversation(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
answer = info['conversations'][1]['value']
|
||||
instruction = info['conversations'][0]['value']
|
||||
|
||||
# print("instruction",instruction)
|
||||
# print("answer", answer)
|
||||
|
||||
return {
|
||||
"instruction_input": instruction,
|
||||
"answer": answer,
|
||||
}
|
||||
|
||||
|
||||
class LlavaConversationDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.ann=[]
|
||||
|
||||
|
||||
# with open(ann_path, 'r') as f:
|
||||
# self.ann = json.load(f)
|
||||
|
||||
self.connect_sym = "!@#"
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
first_instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
|
||||
first_instruction = '<Img><ImageHere></Img> {} '.format(first_instruction)
|
||||
|
||||
questions = [first_instruction]
|
||||
answers = []
|
||||
|
||||
for i, item in enumerate(info["conversations"][1:]):
|
||||
if i % 2 ==0: # assistant
|
||||
assistant_answer = item["value"]
|
||||
answers.append(assistant_answer)
|
||||
else:
|
||||
human_instruction = item["value"]+" "
|
||||
questions.append(human_instruction)
|
||||
|
||||
questions = self.connect_sym.join(questions)
|
||||
# questions = questions.replace("\\\\","\\")
|
||||
answers = self.connect_sym.join(answers)
|
||||
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"conv_q": questions,
|
||||
'conv_a': answers,
|
||||
"image_id": info['id'],
|
||||
"connect_sym": self.connect_sym
|
||||
}
|
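The conversation datasets above pack multi-turn data by joining turns with a separator string and splitting them again at training time. A toy example of the same scheme; the dialogue is made up:

connect_sym = "!@#"
turns = [
    "<Img><ImageHere></Img> What is on the table? ",
    "Is it full? ",
]
replies = ["A cup of coffee.", "Yes, it looks full."]

conv_q = connect_sym.join(turns)
conv_a = connect_sym.join(replies)
# later: conv_q.split(connect_sym) recovers the individual questions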
75
minigpt4/datasets/datasets/multitask_conversation.py
Normal file
@ -0,0 +1,75 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
|
||||
|
||||
|
||||
|
||||
class MultiTaskConversationDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
|
||||
with open(ann_path, 'r') as f:
|
||||
self.ann = json.load(f)
|
||||
|
||||
self.connect_sym = "!@#"
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ann)
|
||||
|
||||
def __getitem__(self, index):
|
||||
info = self.ann[index]
|
||||
|
||||
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
|
||||
image_path = os.path.join(self.vis_root, image_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
image = self.vis_processor(image)
|
||||
|
||||
first_instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
|
||||
first_instruction = '<Img><ImageHere></Img> {} '.format(first_instruction)
|
||||
|
||||
questions = [first_instruction]
|
||||
answers = []
|
||||
|
||||
for i, item in enumerate(info["conversations"][1:]):
|
||||
if i % 2 ==0: # assistant
|
||||
assistant_answer = item["value"]
|
||||
answers.append(assistant_answer)
|
||||
else:
|
||||
human_instruction = item["value"]+" "
|
||||
questions.append(human_instruction)
|
||||
|
||||
questions = self.connect_sym.join(questions)
|
||||
answers = self.connect_sym.join(answers)
|
||||
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"conv_q": questions,
|
||||
'conv_a': answers,
|
||||
"image_id": info['id'],
|
||||
"connect_sym": self.connect_sym
|
||||
}
|
186
minigpt4/datasets/datasets/text_caps.py
Executable file
@ -0,0 +1,186 @@
|
||||
import os
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import skimage.io as io
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.collections import PatchCollection
|
||||
from matplotlib.patches import Polygon, Rectangle
|
||||
from torch.utils.data import Dataset
|
||||
import webdataset as wds
|
||||
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class TextCapDataset(Dataset):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
self.vis_root = vis_root
|
||||
|
||||
self.vis_processor = vis_processor
|
||||
self.text_processor = text_processor
|
||||
|
||||
self.instruction_pool = [
|
||||
# "generate a short image caption incorporating text in the image",
|
||||
# "generate a brief image description combining the text shown in the image",
|
||||
# "what text is writen in this image?",
|
||||
# "describe the text that you can see from this image",
|
||||
# "What does the text in the image say?"
|
||||
'Briefly describe this image.',
|
||||
'Provide a concise depiction of this image.',
|
||||
'Present a short description of this image.',
|
||||
'Summarize this image in a few words.',
|
||||
'A short image caption:',
|
||||
'A short image description:',
|
||||
'A photo of ',
|
||||
'An image that shows ',
|
||||
'Write a short description for the image. ',
|
||||
'Write a description for the photo.',
|
||||
'Provide a description of what is presented in the photo.',
|
||||
            'Briefly describe the content of the image.',
            'Can you briefly explain what you see in the image?',
            'Could you use a few words to describe what you perceive in the photo?',
            'Please provide a short depiction of the picture.',
            'Using language, provide a short account of the image.',
            'Use a few words to illustrate what is happening in the picture.',
        ]

        with open(ann_path, 'r') as f:
            self.ann = json.load(f)

    def __len__(self):
        return len(self.ann["data"])

    def __getitem__(self, index):
        info = self.ann["data"][index]

        image_file = '{}.jpg'.format(info['image_id'])
        image_path = os.path.join(self.vis_root, image_file)
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)

        caption = info["caption_str"]
        caption = self.text_processor(caption)

        instruction = "<Img><ImageHere></Img> [caption] {} ".format(random.choice(self.instruction_pool))
        return {
            "image": image,
            "instruction_input": instruction,
            "answer": caption,
            "data_type": "bbox",
            "question_split": True
        }


class TextCapBboxToObjectDataset(Dataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_path):
        """
        vis_root (string): Root directory of images (e.g. coco/images/)
        ann_path (string): path to the annotation file
        """
        self.vis_root = vis_root

        self.vis_processor = vis_processor
        self.text_processor = text_processor

        # self.instruction_pool = [
        #     "<Img><ImageHere></Img> What text does it show in {} ",
        #     "<Img><ImageHere></Img> Extract the text from {} ",
        #     "<Img><ImageHere></Img> What is the textual content in {} ",
        #     "<Img><ImageHere></Img> Extract the textual information present in the {} ",
        #     "<Img><ImageHere></Img> What is the text written within this defined region {}",
        #     "<Img><ImageHere></Img> Transcribe the text located inside {}",
        #     "<Img><ImageHere></Img> Can you read and extract the text from this specific area {}",
        # ]

        self.instruction_pool = [
            "<Img><ImageHere></Img> [OCR] {}"
        ]

        with open(ann_path, 'r') as f:
            self.ann = json.load(f)

        # Keep only samples whose OCR boxes lie fully inside the normalized [0, 1] image range.
        self.new_ann = {"data": []}
        for da in self.ann["data"]:
            if da["ocr_info"] != []:
                ocr_info_filter = []
                for d in da["ocr_info"]:
                    if (d["bounding_box"]["width"] + d["bounding_box"]["top_left_x"]) <= 1.0 \
                            and (d["bounding_box"]["height"] + d["bounding_box"]["top_left_y"]) <= 1.0 \
                            and d["bounding_box"]["top_left_x"] >= 0 and d["bounding_box"]["top_left_y"] >= 0:
                        ocr_info_filter.append(d)
                if ocr_info_filter != []:
                    da["ocr_info"] = ocr_info_filter
                    self.new_ann["data"].append(da)
        self.ann = self.new_ann

    def __len__(self):
        return len(self.ann["data"])

    def __getitem__(self, index):
        info = self.ann["data"][index]

        image_file = '{}.jpg'.format(info['image_id'])
        image_path = os.path.join(self.vis_root, image_file)
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)

        # Location tokens are expressed on a fixed 100x100 grid.
        image_size = 100

        ocr_info = info["ocr_info"]
        sampled_ocr = random.sample(ocr_info, 1)[0]

        word_text = sampled_ocr["word"]
        width = sampled_ocr["bounding_box"]["width"]
        height = sampled_ocr["bounding_box"]["height"]
        top_left_x = sampled_ocr["bounding_box"]["top_left_x"]
        top_left_y = sampled_ocr["bounding_box"]["top_left_y"]

        x1 = int(top_left_x * image_size)
        y1 = int(top_left_y * image_size)
        x2 = x1 + int(width * image_size)
        y2 = y1 + int(height * image_size)
        assert 0 <= x1 <= image_size
        assert 0 <= x2 <= image_size
        assert 0 <= y1 <= image_size
        assert 0 <= y2 <= image_size

        word_bbox = "{<" + str(x1) + "><" + str(y1) + "><" + str(x2) + "><" + str(y2) + ">}"

        instruction = random.choice(self.instruction_pool).format(word_bbox)
        return {
            "image": image,
            "instruction_input": instruction,
            "answer": word_text,
            "data_type": "bbox",
            "question_split": True
        }
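For reference, a minimal standalone sketch of the location-token format built above, assuming boxes arrive as normalized (top_left_x, top_left_y, width, height) values in [0, 1]; the helper names are illustrative and not part of this diff:

import re

def bbox_to_token(box, image_size=100):
    # box = (top_left_x, top_left_y, width, height), normalized to [0, 1],
    # mirroring the TextCaps OCR annotation layout used above.
    x, y, w, h = box
    x1, y1 = int(x * image_size), int(y * image_size)
    x2, y2 = x1 + int(w * image_size), y1 + int(h * image_size)
    return "{{<{}><{}><{}><{}>}}".format(x1, y1, x2, y2)

def token_to_bbox(token):
    # Recover the four integer corners from a location token string.
    return [int(v) for v in re.findall(r"<(\d+)>", token)]

# bbox_to_token((0.1, 0.2, 0.3, 0.4)) -> "{<10><20><40><60>}"
# token_to_bbox("{<10><20><40><60>}") -> [10, 20, 40, 60]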
52
minigpt4/datasets/datasets/unnatural_instruction.py
Executable file
@ -0,0 +1,52 @@
import os
import json
import pickle
import random
import time
import itertools

import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds

from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset


class UnnaturalDataset(Dataset):
    def __init__(self, text_processor, ann_path):
        """
        text_processor: processor applied to both the instruction and the answer
        ann_path (string): path to the unnatural-instructions annotation file
        """
        self.text_processor = text_processor

        with open(ann_path, 'r') as f:
            self.ann = json.load(f)

        # with open(ann_path, 'r') as f:
        #     for data in f.readlines():
        #         data = json.loads(data)
        #         self.ann.append(data)

    def __len__(self):
        return len(self.ann)

    def __getitem__(self, index):
        info = self.ann[index]["instances"][0]
        instruction = info["instruction_with_input"]
        constraints = info["constraints"]
        answer = info["output"]
        if constraints is not None:
            instruction = instruction + " " + constraints

        # Text-only sample: no "image" key is returned.
        return {
            "instruction_input": self.text_processor(instruction),
            "answer": self.text_processor(answer),
        }
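A minimal usage sketch for the text-only dataset above, assuming a local unnatural-instructions JSON file; the path and the identity text processor are placeholders:

from torch.utils.data import DataLoader

dataset = UnnaturalDataset(text_processor=lambda t: t,
                           ann_path="/path/to/unnatural_instructions.json")
loader = DataLoader(dataset, batch_size=4, shuffle=True)

for batch in loader:
    # Samples are text-only, so the default collate returns lists of strings.
    print(batch["instruction_input"][0])
    print(batch["answer"][0])
    break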
98
minigpt4/datasets/datasets/vg_dataset.py
Executable file
@ -0,0 +1,98 @@
import os
import json
import pickle
import random
import time
import itertools

import numpy as np
from PIL import Image
from torch.utils.data import Dataset
from visual_genome import local

import threading

# Global lock
lock = threading.Lock()


class ReferVisualGenomeDataset(Dataset):
    def __init__(self, vis_processor, text_processor, data_dir):
        """
        data_dir (string): directory containing the Visual Genome images and region descriptions
        """
        self.data_dir = data_dir

        self.vis_processor = vis_processor
        self.text_processor = text_processor

        all_regions = local.get_all_region_descriptions(self.data_dir)
        all_regions = [region for regions in all_regions for region in regions]

        # Following OFA practice, only regions smaller than 16384 pixels are used for referring.
        self.regions = [region for region in all_regions if region.width * region.height < 16384]

        print('Visual Genome grounding', len(self.regions))

        self.instruction_pool = [
            "[refer] {}",
            "[refer] give me the location of {}",
            "[refer] where is {} ?",
            "[refer] from this image, tell me the location of {}",
            "[refer] the location of {} is",
            "[refer] could you tell me the location for {} ?",
            "[refer] where can I locate the {} ?",
        ]

    def __len__(self):
        return len(self.regions)

    def preprocess(self, index):
        region = self.regions[index]
        image_file = region.image.url.split('/')[-2:]
        image_path = os.path.join(self.data_dir, *image_file)
        image = Image.open(image_path).convert("RGB")
        image_orig_size = image.size
        image = self.vis_processor(image)
        image_new_size = [100, 100]

        sample_sentence = region.phrase
        refer_sentence = self.text_processor(sample_sentence)

        # Convert (x, y, w, h) in original pixels to (x1, y1, x2, y2) corners on a 100x100 grid.
        bbox = [region.x, region.y, region.width, region.height]
        bbox = [
            bbox[0] / image_orig_size[0] * image_new_size[0],
            bbox[1] / image_orig_size[1] * image_new_size[1],
            (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
            (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
        ]
        bbox = [int(x) for x in bbox]
        bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
        return {
            "image": image,
            "refer_sentence": refer_sentence,
            "bbox": bbox,
            "image_id": region.image.id,
        }

    def __getitem__(self, index):
        data = self.preprocess(index)
        instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
        instruction = "<Img><ImageHere></Img> {} ".format(instruction)

        return {
            "image": data['image'],
            "instruction_input": instruction,
            "answer": data['bbox'],
            "image_id": data['image_id'],
        }
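The coordinate conversion in preprocess above can be summarized as a small standalone function, shown here as an illustrative sketch (the function name is not part of the codebase):

def region_to_grid_bbox(x, y, w, h, orig_w, orig_h, grid=100):
    # Map a pixel-space (x, y, w, h) region to integer (x1, y1, x2, y2) corners
    # on a fixed grid, as done in ReferVisualGenomeDataset.preprocess.
    x1 = int(x / orig_w * grid)
    y1 = int(y / orig_h * grid)
    x2 = int((x + w) / orig_w * grid)
    y2 = int((y + h) / orig_h * grid)
    return "{{<{}><{}><{}><{}>}}".format(x1, y1, x2, y2)

# region_to_grid_bbox(120, 80, 240, 160, orig_w=480, orig_h=320) -> "{<25><25><75><75>}"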
223
minigpt4/datasets/datasets/vqa_datasets.py
Executable file
@ -0,0 +1,223 @@
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import torch
from PIL import Image
import os

from minigpt4.datasets.datasets.base_dataset import BaseDataset


class VQADataset(BaseDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)

    # def collater(self, samples):
    #     image_list, question_list, answer_list, weight_list = [], [], [], []
    #     num_answers = []
    #     for sample in samples:
    #         image_list.append(sample["image"])
    #         question_list.append(sample["question"])
    #         weight_list.extend(sample["weights"])
    #         answers = sample["answer"]
    #         answer_list.extend(answers)
    #         num_answers.append(len(answers))
    #     return {
    #         "image": torch.stack(image_list, dim=0),
    #         "text_input": question_list,
    #         "answer": answer_list,
    #         "weight": torch.Tensor(weight_list),
    #         "n_answers": torch.LongTensor(num_answers),
    #     }


class VQAEvalDataset(BaseDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)


class OKVQAEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        img_id = data['image_id']
        question = data['question']
        question_id = data['question_id']
        img_file = '{:0>12}.jpg'.format(img_id)
        image_path = os.path.join(self.root_path, img_file)
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image)
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
        # question = f"[vqa] {question} "
        return image, question, question_id, img_id


class VizWizEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        img_id = data['image']
        question = data['question']
        answers = data['answers']
        answers = '_'.join([answer['answer'] for answer in answers])
        image_path = os.path.join(self.root_path, img_id)
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image)
        # question = f"[vqa] Based on the image, respond to this question with a short answer: {question} "
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question} and reply 'unanswerable' if you could not answer it"
        return image, question, answers


class AOKVQADAEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        img_file = data['image']
        question = data['question']
        question_id = data['question_id']
        image_path = os.path.join(self.root_path, img_file)
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image)
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
        # question = f"[vqa] {question} "
        return image, question, question_id


class AOKVQAMCEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        img_file = data['image']
        question = data['question']
        question_id = data['question_id']
        image_path = os.path.join(self.root_path, img_file)
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image).half().cuda()
        candidates = data['choices']
        # question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
        # question = f"[vqa] {question} "
        return image, question, question_id, candidates


class IconQAEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        data = self.loaded_data[idx]
        image_id = data['image_id']
        question = data['question']
        image_path = os.path.join(self.root_path, image_id, 'image.png')
        image = Image.open(image_path).convert('RGB')
        image = self.vis_processor(image).half().cuda()
        candidates = '_'.join(data['choices'])
        answer = data['answer']
        # question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
        # question = f"[vqa] {question} "
        return image, question, candidates, answer


class GQAEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        ann = self.loaded_data[idx]
        image_id = ann["image"]
        image_path = os.path.join(self.root_path, f"{image_id}")
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)
        question = ann["question"]
        # question = f'Question: {question} Short answer: '
        question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
        # question = f"[vqa] {question} "
        labels = ann["answer"]

        return image, question, labels


class HMEvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        ann = self.loaded_data[idx]
        image_id = ann["img"]
        image_path = os.path.join(self.root_path, f"{image_id}")
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)
        question = ann["text"]
        question = f"[vqa] This is an image with: \"{question}\" written on it. Is it hateful? Answer:"
        labels = ann["label"]

        return image, question, labels


class VSREvalData(torch.utils.data.Dataset):
    def __init__(self, loaded_data, vis_processor, root_path):
        self.loaded_data = loaded_data
        self.root_path = root_path
        self.vis_processor = vis_processor

    def __len__(self):
        return len(self.loaded_data)

    def __getitem__(self, idx):
        ann = self.loaded_data[idx]
        image_path = os.path.join(self.root_path, ann["image"])
        image = Image.open(image_path).convert("RGB")
        image = self.vis_processor(image)
        question = ann["caption"]
        question = f'[vqa] Based on the image, is this statement true or false? {question}'
        question_id = ann["image"].split('.')[0]
        labels = 'true' if ann["label"] == 1 else 'false'

        return image, question, labels
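A hedged sketch of wrapping one of these eval datasets in a DataLoader; the annotation file, image root, and torchvision-based stand-in processor are placeholders for the repo's real BLIP-2 image processor and GQA files:

import json
from torch.utils.data import DataLoader
from torchvision import transforms

# Stand-in image processor purely for illustration.
vis_processor = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
])

with open("/path/to/gqa/annotations.json") as f:
    loaded_data = json.load(f)

eval_set = GQAEvalData(loaded_data, vis_processor, root_path="/path/to/gqa/images")
loader = DataLoader(eval_set, batch_size=8)

for images, questions, labels in loader:
    # Tuple-style samples collate into a stacked image tensor plus lists of strings.
    print(images.shape, len(questions), len(labels))
    break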
300
train_configs/minigpt_v2_finetune.yaml
Normal file
@ -0,0 +1,300 @@
model:
  arch: minigpt_v2
  model_type: pretrain
  freeze_vit: True
  freeze_qformer: True
  max_txt_len: 1024
  low_resource: False
  image_size: 448
  end_sym: "</s>"
  llama_model: "/ibex/project/c2133/llama_v2/llama-2-7b-chat-pytorch_update"
  ckpt: "/ibex/project/c2090/minigpt4_ckpt/448_perforamnce_correct_v10_vg/20230925064/checkpoint_32.pth"
  use_grad_checkpoint: True
  chat_template: True
  lora_r: 64
  lora_alpha: 16


datasets:

  multitask_conversation:
    batch_size_train: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 40

  llava_conversation: # 77k
    batch_size_train: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10

  # unnatural_instruction:
  #   batch_size: 1
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refvg:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 40

  # llava_detail: #23K
  #   batch_size: 4
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 20

  # llava_reason: # 77k
  #   batch_size: 4
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # grounded_detailed_image_caption:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # CaptionToPhrase:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # ObjectToPhrase:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # coco_caption:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # textcaps_caption: #
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # refcoco: # 142k
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # invrefcoco:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # invrefcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # invrefcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # coco_vqa: # 82K
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # ok_vqa: # 9k
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 8

  # aok_vqa: # 17k
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 12

  # gqa: # 82K
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 40

  # ocrvqa: # 800K
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 30


run:
  task: image_text_pretrain
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 8e-5
  warmup_lr: 1e-6

  weight_decay: 0.05
  max_epoch: 50
  num_workers: 6
  warmup_steps: 1000
  iters_per_epoch: 1000

  seed: 42
  output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_conversation_correct_best_v7_ablation1_v5_v6_again_system_prompt"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
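A small sketch for sanity-checking the config above before launching a run, assuming it is saved as train_configs/minigpt_v2_finetune.yaml; plain PyYAML is used here only for inspection:

import yaml

with open("train_configs/minigpt_v2_finetune.yaml") as f:
    cfg = yaml.safe_load(f)

# The paths below (LLaMA weights, stage checkpoint, output dir) are the values
# most likely to need editing on a new machine.
print("llama_model:", cfg["model"]["llama_model"])
print("ckpt:", cfg["model"]["ckpt"])
print("output_dir:", cfg["run"]["output_dir"])
print("active datasets:", list(cfg["datasets"].keys()))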