Merge branch 'main' of github.com:TsuTikgiau/MiniGPT-4 into main

Deyao Zhu 2023-10-22 22:32:01 +03:00
commit ba6cdb6992
39 changed files with 3799 additions and 1 deletion

@@ -115,7 +115,7 @@ in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Lin
For MiniGPT-v2, run
```
-python demo_v2.py --cfg-path eval_configs/minigpt4v2_eval.yaml --gpu-id 0
+python demo_v2.py --cfg-path eval_configs/minigptv2_eval.yaml --gpu-id 0
```
For MiniGPT-4 (Vicuna version), run

jobs/srun_test.sh
@@ -0,0 +1,30 @@
cd ..
job_name=minigpt4_v2_test
read LOWERPORT UPPERPORT < /proc/sys/net/ipv4/ip_local_port_range
# pick a random free port for torchrun's rendezvous: sample the kernel's ephemeral
# range and retry until `ss` shows the port unused (a Python sketch follows this script)
while :
do
PORT="`shuf -i $LOWERPORT-$UPPERPORT -n 1`"
ss -lpn | grep -q ":$PORT " || break
done
#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/336_final_v1_gqa.yaml
#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/448_final_v1_gqa_ablation2.yaml
torchrun --master-port ${PORT} --nproc-per-node 2 train.py --cfg-path train_configs/minigpt_v2_finetune.yaml
#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path finetune_conversation_ablation/conversation_v2_last_336_test.yaml
#torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_llama2/336_final_v1_13B.yaml
# torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/448_v2_llama2.yaml
#accelerate launch train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/336_v2_llama2.yaml
# torchrun --master-port ${PORT} --nproc-per-node 2 train.py --job_name ${job_name} --cfg-path train_configs_final_ablations/336_v2_llama2_clip_encoder.yaml
#best_data_ratio_336_full_dataset_lr2e4_v1.yaml
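
The loop above reserves a rendezvous port for torchrun by sampling the kernel's ephemeral port range and probing each candidate with `ss` until one is free. A minimal Python sketch of the same idea (a hypothetical helper, not part of the repo) simply lets the OS hand out an unused port by binding to port 0:

```
import socket

def pick_free_port() -> int:
    """Return a TCP port that is currently unused on this host."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))              # port 0: the kernel assigns a free ephemeral port
        return s.getsockname()[1]

if __name__ == "__main__":
    # the printed value could be passed to torchrun as --master-port
    print(pick_free_port())
```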

@@ -0,0 +1,29 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
aok_vqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
# annotations:
# train:
# url:
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
# storage:
# - /path/to/aokvqa/annotations/aokvqa_v1p0_train.json
# images:
# storage: /path/to/coco/images/
annotations:
train:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
storage:
- /ibex/project/c2133/minigpt4_v2_dataset/aokvqa/annotations/aokvqa_v1p0_train.json
images:
storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/

@@ -0,0 +1,38 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
coco_caption: # name of the dataset builder
# dataset_card: dataset_card/coco_caption.md
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
# build_info:
# # Be careful not to append minus sign (-) before split to avoid itemizing
# annotations:
# train:
# url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
# md5: aa31ac474cf6250ebb81d18348a07ed8
# storage: /path/to/coco_caption/annotations/coco_karpathy_train.json
# images:
# storage: /path/to/coco/images/
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
train:
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
md5: aa31ac474cf6250ebb81d18348a07ed8
storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_train.json
# val:
# url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
# md5: b273847456ef5580e33713b1f7de52a0
# storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_val.json
# test:
# url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
# md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
# storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_test.json
images:
storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg

@@ -0,0 +1,33 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
coco_vqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
# annotations:
# train:
# url:
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
# storage:
# - /path/to/vqav2/annotations/vqa_train.json
# - /path/to/vqav2/coco/annotations/vqa_val.json
# images:
# storage: /path/to/coco/images/
annotations:
train:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
storage:
- /ibex/project/c2133/minigpt4_v2_dataset/vqav2/annotations/vqa_train.json
- /ibex/project/c2133/minigpt4_v2_dataset/vqav2/coco/annotations/vqa_val.json
images:
storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg

@@ -0,0 +1,8 @@
datasets:
invrefcoco:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2133/object_detection_datasets/
dataset: invrefcoco
splitBy: unc

@@ -0,0 +1,8 @@
datasets:
invrefcocog:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2133/object_detection_datasets/
dataset: invrefcocog
splitBy: umd

@@ -0,0 +1,8 @@
datasets:
invrefcocop:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2133/object_detection_datasets/
dataset: invrefcoco+
splitBy: unc

@@ -0,0 +1,8 @@
datasets:
refcoco:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2133/object_detection_datasets/
dataset: refcoco
splitBy: unc

@@ -0,0 +1,8 @@
datasets:
refcocog:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2133/object_detection_datasets/
dataset: refcocog
splitBy: umd

@@ -0,0 +1,8 @@
datasets:
refcocop:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2133/object_detection_datasets/
dataset: refcoco+
splitBy: unc

@@ -0,0 +1,6 @@
datasets:
CaptionToPhrase:
data_type: images
build_info:
image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_grounding_phrase5_v2_last.json

@@ -0,0 +1,6 @@
datasets:
grounded_detailed_image_caption:
data_type: images
build_info:
image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_grounding_phrase5_last.json

@@ -0,0 +1,6 @@
datasets:
ObjectToPhrase:
data_type: images
build_info:
image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
ann_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/train_phrase2bbox_resample_last.json

@@ -0,0 +1,33 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
gqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
# build_info:
# # Be careful not to append minus sign (-) before split to avoid itemizing
# annotations:
# train:
# url:
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
# storage:
# - /path/to/gqa/annotations/train_balanced_questions.json
# images:
# storage: /path/to/gqa/images/
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
train:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
storage:
- /ibex/project/c2133/minigpt4_v2_dataset/gqa/annotations/train_balanced_questions.json
images:
storage: /ibex/project/c2133/minigpt4_v2_dataset/gqa/images_copy/

@@ -0,0 +1,12 @@
datasets:
# llava_conversation:
# data_type: images
# build_info:
# image_path: /path/to/coco/images
# ann_path: /path/to/llava/conversation_58k.json
llava_conversation:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/conversation_58k.json

@@ -0,0 +1,12 @@
datasets:
# llava_detail:
# data_type: images
# build_info:
# image_path: /path/to/coco/images
# ann_path: /path/to/llava/detail_23k.json
llava_detail:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/detail_23k.json

@@ -0,0 +1,12 @@
datasets:
# llava_reason:
# data_type: images
# build_info:
# image_path: /path/to/coco/images
# ann_path: /path/to/llava/complex_reasoning_77k.json
llava_reason:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2133/minigpt4_v2_dataset/llava/complex_reasoning_77k.json

@@ -0,0 +1,14 @@
datasets:
# multitask_conversation:
# data_type: images
# build_info:
# image_path: /path/to/coco/images
# ann_path: /path/to/multitask_conversation/multi_task_conversation.json
multitask_conversation:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2133/minigpt4_v2_dataset/multitask_conversation/multi_task_conversation.json

@@ -0,0 +1,10 @@
datasets:
# unnatural_instruction:
# data_type: text
# build_info:
# ann_path: /path/to/unnatural-instructions/data/unnatural_instruction_filer.json
unnatural_instruction:
data_type: text
build_info:
ann_path: /ibex/project/c2133/minigpt4_v2_dataset/unnatural-instructions/data/unnatural_instruction_filer.json

@@ -0,0 +1,12 @@
datasets:
# ocrvqa:
# data_type: images
# build_info:
# image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images
# ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json
ocrvqa:
data_type: images
build_info:
image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images
ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json

@@ -0,0 +1,36 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
ok_vqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
# build_info:
# # Be careful not to append minus sign (-) before split to avoid itemizing
# annotations:
# train:
# url:
# # TODO make this order insensitive
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
# storage:
# - /path/to/okvqa/annotations/okvqa_train.json
# images:
# storage: /path/to/okvqa/images
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
train:
url:
# TODO make this order insensitive
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
storage:
- /ibex/project/c2133/minigpt4_v2_dataset/okvqa_v2/annotations/okvqa_train.json
images:
storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg

@@ -0,0 +1,16 @@
datasets:
# textcaps_caption:
# data_type: images
# build_info:
# image_path: /path/to/TextCaps/train_images
# ann_path: /path/to/TextCaps/TextCaps_0.1_train.json
textcaps_caption:
data_type: images
build_info:
image_path: /ibex/project/c2133/minigpt4_v2_dataset/TextCaps/train_images
ann_path: /ibex/project/c2133/minigpt4_v2_dataset/TextCaps/TextCaps_0.1_train.json

@@ -0,0 +1,10 @@
datasets:
# refvg:
# data_type: images
# build_info:
# data_dir: /path/to/visual_genome
refvg:
data_type: images
build_info:
data_dir: /ibex/project/c2133/minigpt4_v2_dataset/visual_genome

@@ -6,6 +6,418 @@ from minigpt4.common.registry import registry
from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from minigpt4.datasets.datasets.laion_dataset import LaionDataset
from minigpt4.datasets.datasets.cc_sbu_dataset import CCSBUDataset, CCSBUAlignDataset
from minigpt4.datasets.datasets.text_caps import TextCapDataset
from minigpt4.datasets.datasets.llava_dataset import LlavaDetailDataset, LlavaReasonDataset, LlavaConversationDataset
from minigpt4.datasets.datasets.unnatural_instruction import UnnaturalDataset
from minigpt4.datasets.datasets.multitask_conversation import MultiTaskConversationDataset
from minigpt4.datasets.datasets.flickr import GroundedDetailDataset,CaptionToObjectDataset,PhraseToObjectDataset
from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset
from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset
from minigpt4.datasets.datasets.gqa_datasets import GQADataset
from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset
from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset
from minigpt4.datasets.datasets.doc_dataset import OCRVQADataset
@registry.register_builder("multitask_conversation")
class MultitaskConversationBuilder(BaseDatasetBuilder):
train_dataset_cls = MultiTaskConversationDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/multitask_conversation/default.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("unnatural_instruction")
class UnnaturalInstructionBuilder(BaseDatasetBuilder):
train_dataset_cls = UnnaturalDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/nlp/unnatural_instruction.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
)
return datasets
@registry.register_builder("llava_detail")
class LlavaDetailBuilder(BaseDatasetBuilder):
train_dataset_cls = LlavaDetailDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/llava/detail.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("llava_reason")
class LlavaReasonBuilder(BaseDatasetBuilder):
train_dataset_cls = LlavaReasonDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/llava/reason.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("llava_conversation")
class LlavaConversationBuilder(BaseDatasetBuilder):
train_dataset_cls = LlavaConversationDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/llava/conversation.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
class AllRefCOCOBuilder(BaseDatasetBuilder):
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
image_path = build_info.image_path
ann_path = build_info.ann_path
datasets = dict()
if not os.path.exists(image_path):
warnings.warn("image path {} does not exist.".format(image_path))
if not os.path.exists(ann_path):
warnings.warn("ann path {} does not exist.".format(ann_path))
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=ann_path,
vis_root=image_path,
dataset=build_info.dataset,
splitBy=build_info.splitBy
)
return datasets
@registry.register_builder("refcoco")
class RefCOCOBuilder(AllRefCOCOBuilder):
train_dataset_cls = ReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/refcoco.yaml",
}
@registry.register_builder("refcocop")
class RefCOCOPBuilder(AllRefCOCOBuilder):
train_dataset_cls = ReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/refcocop.yaml",
}
@registry.register_builder("refcocog")
class RefCOCOGBuilder(AllRefCOCOBuilder):
train_dataset_cls = ReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/refcocog.yaml",
}
@registry.register_builder("invrefcoco")
class InvRefCOCOBuilder(AllRefCOCOBuilder):
train_dataset_cls = InvReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/invrefcoco.yaml",
}
@registry.register_builder("invrefcocop")
class InvRefCOCOPBuilder(AllRefCOCOBuilder):
train_dataset_cls = InvReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/invrefcocop.yaml",
}
@registry.register_builder("invrefcocog")
class InvRefCOCOGBuilder(AllRefCOCOBuilder):
train_dataset_cls = InvReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/invrefcocog.yaml",
}
@registry.register_builder("refvg")
class RefVisualGenomeBuilder(BaseDatasetBuilder):
train_dataset_cls = ReferVisualGenomeDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/vg/ref.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
data_dir = build_info.data_dir
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
data_dir=data_dir,
)
return datasets
@registry.register_builder("textcaps_caption")
class TextcapCaptionBuilder(BaseDatasetBuilder):
train_dataset_cls = TextCapDataset
DATASET_CONFIG_DICT = {"default": "configs/datasets/textcaps/caption.yaml"}
def _download_ann(self):
pass
def _download_vis(self):
pass
def build(self):
self.build_processors()
build_info = self.config.build_info
datasets = dict()
split = "train"
# create datasets
# [NOTE] return inner_datasets (wds.DataPipeline)
dataset_cls = self.train_dataset_cls
datasets[split] = dataset_cls(
vis_processor=self.vis_processors[split],
text_processor=self.text_processors[split],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("coco_vqa")
class COCOVQABuilder(BaseDatasetBuilder):
train_dataset_cls = COCOVQADataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco/defaults_vqa.yaml",
}
@registry.register_builder("aok_vqa")
class AOKVQABuilder(BaseDatasetBuilder):
train_dataset_cls = AOKVQADataset
DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"}
@registry.register_builder("gqa")
class GQABuilder(BaseDatasetBuilder):
train_dataset_cls = GQADataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/gqa/balanced_val.yaml",
}
@registry.register_builder("grounded_detailed_image_caption")
class GroundedCaptionBuilder(BaseDatasetBuilder):
train_dataset_cls = GroundedDetailDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/flickr/default.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("CaptionToPhrase")
class CaptionToPhraseBuilder(BaseDatasetBuilder):
train_dataset_cls = CaptionToObjectDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/flickr/caption_to_phrase.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("ObjectToPhrase")
class ObjectToPhraseBuilder(BaseDatasetBuilder):
train_dataset_cls = PhraseToObjectDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/flickr/object_to_phrase.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
class DocumentVQABuilder(BaseDatasetBuilder):
def _download_ann(self):
pass
def _download_vis(self):
pass
def build(self):
self.build_processors()
build_info = self.config.build_info
datasets = dict()
split = "train"
dataset_cls = self.train_dataset_cls
datasets[split] = dataset_cls(
vis_processor=self.vis_processors[split],
text_processor=self.text_processors[split],
vis_root=build_info.image_path,
ann_path=build_info.ann_path
)
return datasets
@registry.register_builder("ocrvqa")
class OCRVQABuilder(DocumentVQABuilder):
train_dataset_cls = OCRVQADataset
DATASET_CONFIG_DICT = {"default": "configs/datasets/ocrvqa/ocrvqa.yaml"}
@registry.register_builder("cc_sbu")
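
Each `@registry.register_builder(...)` decorator above makes its builder retrievable by the dataset name used in the training configs, and `DATASET_CONFIG_DICT` points at the default dataset YAMLs shown earlier in this commit. A minimal usage sketch, assuming the LAVIS-style registry API that MiniGPT-4 inherits (`get_builder_class` plus the builder's `build_datasets` method; the exact config plumbing may differ):

```
from minigpt4.common.registry import registry

# look up the builder class registered under a dataset name (assumed API)
builder_cls = registry.get_builder_class("refcoco")   # -> RefCOCOBuilder

# builder = builder_cls()               # would load DATASET_CONFIG_DICT["default"]
# datasets = builder.build_datasets()   # -> {"train": ReferCOCODataset(...)}
```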

@@ -0,0 +1,212 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
from collections import OrderedDict
import json
import os
import random
import torch
from PIL import Image
from minigpt4.datasets.datasets.vqa_datasets import VQADataset #, VQAEvalDataset
class __DisplMixin:
def displ_item(self, index):
sample, ann = self.__getitem__(index), self.annotation[index]
return OrderedDict(
{
"file": ann["image"],
"question": ann["question"],
"question_id": ann["question_id"],
"direct_answers": "; ".join(ann["direct_answers"]),
"choices": "; ".join(ann["choices"]),
"correct_choice": ann["choices"][ann["correct_choice_idx"]],
"image": sample["image"],
}
)
class AOKVQADataset(VQADataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool =[
"[vqa] {}",
"[vqa] Based on the image, respond to this question with a short answer: {}"
]
exist_annotation = []
for ann in self.annotation:
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
if os.path.exists(image_path):
exist_annotation.append(ann)
self.annotation = exist_annotation
def get_data(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
answer_key = "direct_answers"
# print("answer key", answer_key)
# for answer in ann[answer_key]:
# print(answer)
answer_weight = {}
for answer in ann[answer_key]:
if answer in answer_weight.keys():
answer_weight[answer] += 1 / len(ann[answer_key])
else:
answer_weight[answer] = 1 / len(ann[answer_key])
answers = list(answer_weight.keys())
weights = list(answer_weight.values())
answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights
return {
"image": image,
"question": question,
"answer": answer,
}
def __getitem__(self, index):
data = self.get_data(index)
question = self.text_processor(data["question"])
instruction = random.choice(self.instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
answer = self.text_processor(data['answer'])
return {
"image": data['image'],
"instruction_input": instruction,
"answer": answer,
}
class AOKVQGDataset(AOKVQADataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool = [
'Given the image, generate a question whose answer is: {}',
'Based on the image, provide a question with the answer: {}',
'Given the visual representation, create a question for which the answer is "{}"',
'From the image provided, craft a question that leads to the reply: {}',
'Considering the picture, come up with a question where the answer is: {}',
'Taking the image into account, generate a question that has the answer: {}'
]
def __getitem__(self, index):
data = self.get_data(index)
instruction = random.choice(self.instruction_pool).format(data['answer'])
return {
"image": data['image'],
"instruction_input": instruction,
"answer": data['question'],
}
# class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
# def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
# """
# vis_root (string): Root directory of images (e.g. coco/images/)
# ann_root (string): directory to store the annotation file
# """
#
# self.vis_root = vis_root
#
# self.annotation = json.load(open(ann_paths[0]))
#
# answer_list_path = ann_paths[1]
# if os.path.exists(answer_list_path):
# self.answer_list = json.load(open(answer_list_path))
# else:
# self.answer_list = None
#
# try:
# self.coco_fmt_qust_file = ann_paths[2]
# self.coco_fmt_anno_file = ann_paths[3]
# except IndexError:
# self.coco_fmt_qust_file = None
# self.coco_fmt_anno_file = None
#
# self.vis_processor = vis_processor
# self.text_processor = text_processor
#
# self._add_instance_ids()
#
# def collater(self, samples):
# (
# image_list,
# question_list,
# question_id_list,
# instance_id_list,
# choices_list,
# correct_choice_idx_list,
# direct_answers_list,
# ) = ([], [], [], [], [], [], [])
#
# for sample in samples:
# image_list.append(sample["image"])
# question_list.append(sample["text_input"])
# question_id_list.append(sample["question_id"])
# instance_id_list.append(sample["instance_id"])
# choices_list.append(sample["choices"])
# correct_choice_idx_list.append(sample["correct_choice_idx"])
# direct_answers_list.append(sample["direct_answers"])
#
# return {
# "image": torch.stack(image_list, dim=0),
# "text_input": question_list,
# "question_id": question_id_list,
# "instance_id": instance_id_list,
# "choices": choices_list,
# "correct_choice_idx": correct_choice_idx_list,
# "direct_answers": direct_answers_list,
# }
#
# def __getitem__(self, index):
# ann = self.annotation[index]
#
# image_path = os.path.join(self.vis_root, ann["image"])
# image = Image.open(image_path).convert("RGB")
#
# image = self.vis_processor(image)
# question = self.text_processor(ann["question"])
#
# choices = ann["choices"]
# if "correct_choice_idx" in ann:
# correct_choice_idx = ann["correct_choice_idx"]
# else:
# correct_choice_idx = None
#
# if "direct_answers" in ann:
# direct_answers = ann["direct_answers"]
# else:
# direct_answers = None
#
# return {
# "image": image,
# "text_input": question,
# "question_id": ann["question_id"],
# "instance_id": ann["instance_id"],
# "choices": choices,
# "correct_choice_idx": correct_choice_idx,
# "direct_answers": direct_answers,
# }
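
`AOKVQADataset.get_data` above converts the repeated entries in `direct_answers` into sampling weights, so answers given by more annotators are drawn more often. A small self-contained illustration of that weighting step (toy annotations, not from the dataset):

```
import random

direct_answers = ["dog", "dog", "dog", "puppy"]   # toy annotator answers

answer_weight = {}
for answer in direct_answers:
    answer_weight[answer] = answer_weight.get(answer, 0) + 1 / len(direct_answers)

# answer_weight == {"dog": 0.75, "puppy": 0.25}
answers = list(answer_weight.keys())
weights = list(answer_weight.values())
print(random.choices(answers, weights=weights, k=1)[0])   # "dog" ~75% of the time
```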

@@ -0,0 +1,122 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import os
import json
import torch
import numpy as np
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
from minigpt4.datasets.datasets.caption_datasets import COCOCaptionDataset, CaptionEvalDataset
COCOCapDataset = COCOCaptionDataset
class COCOCapEvalDataset(CaptionEvalDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
split (string): val or test
"""
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]
return {
"image": image,
"image_id": img_id,
"instance_id": ann["instance_id"],
}
class NoCapsEvalDataset(CaptionEvalDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
split (string): val or test
"""
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
img_id = ann["img_id"]
return {
"image": image,
"image_id": img_id,
"instance_id": ann["instance_id"],
}
class RefCOCOEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_id = data['img_id']
sent = data['sents']
image_path = os.path.join(self.root_path, f'{img_id[:27]}.jpg')
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
# question = f"[refer] {sent}"
question = f"[refer] where is {sent}?"
# question = f"where is the bounding box location of {sent}?"
return image, question, img_id
class EvalCaptionData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
ann = dict()
for item in self.loaded_data:
image_id = item['image_id']
ann[image_id] = item['image']
self.ann = [{'image_id':image_id, 'image': ann[image_id]} for image_id in ann]
def __len__(self):
return len(self.ann)
def __getitem__(self, idx):
data = self.ann[idx]
image_id = data['image_id']
img_file = data['image'].split('/')[-1]
image_path = os.path.join(self.root_path, img_file)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
question = f"[caption] please describe this image?"
return image, question, image_id
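
`COCOCapEvalDataset` above recovers the numeric image id from the COCO file name with `strip(".jpg")`; a quick check of what that expression produces for a standard COCO 2014 file name (toy input):

```
fname = "COCO_val2014_000000391895.jpg"   # standard COCO 2014 naming
img_id = fname.split("/")[-1].strip(".jpg").split("_")[-1]
print(img_id)                             # 000000391895
# caveat: str.strip removes a *set* of characters from the ends, not a suffix;
# this works only because COCO ids are digits and never end in '.', 'j', 'p' or 'g'
```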

@@ -0,0 +1,667 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
import threading
# Global lock
lock = threading.Lock()
def sample_object_bbox(objects, bbox):
zipped_list = list(zip(objects, bbox))
# Shuffle the zipped list
random.shuffle(zipped_list)
# Generate the new string with interleaved format
# interleaved_list = str([{'{},{}'.format(obj, str(bbox).replace("[","").replace("]","") )} for obj, bbox in zipped_list])
# print("objects", objects)
# print("bbox",bbox)
interleaved_list = str([{'{},{}'.format(obj, bbox.strip())} for obj, bbox in zipped_list]).replace("'","").replace("[","").replace("]","")
# interleaved_list = " "+interleaved_list
# print(interleaved_list)
return interleaved_list
def bbox_to_object(objects, bbox):
index_sample = random.sample(range(len(objects)),1)[0]
sample_object = str(objects[index_sample])
sample_bbox = bbox[index_sample]
# sample_center_point = center_point[index_sample]
sample_bbox = r"{"+str(sample_bbox) + "}"
return sample_bbox, sample_object
def object_to_bbox(objects, bbox, center_point):
index_sample = random.sample(range(len(objects)),1)[0]
sample_object = objects[index_sample]
sample_bbox = bbox[index_sample]
sample_center_point = center_point[index_sample]
instruction = "what is object and the bounding box in the center coordinate of "+str(sample_center_point)+"? "
answer = "{"+str(sample_object)+","+str(sample_bbox)+"}"
return instruction, answer
class COCOBBOXDataset(BaseDataset):
def __init__(self, vis_processor, text_processor, location):
super().__init__(vis_processor=vis_processor, text_processor=text_processor)
print("coco box dataset")
self.inner_dataset = wds.DataPipeline(
wds.ResampledShards(location),
wds.tarfile_to_samples(handler=wds.warn_and_continue),
wds.shuffle(1000, handler=wds.warn_and_continue),
wds.decode("pilrgb", handler=wds.warn_and_continue),
wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
wds.map(self.to_dict, handler=wds.warn_and_continue),
)
def to_dict(self, sample):
objects = sample[1]["objects"]
boxes = sample[1]["bbox"]
caption = sample[1]["caption"]
new_bboxes = []
image_size = sample[0].shape[1]
image_size = 100  # quantize box coordinates to a fixed 100x100 grid (overrides the processed image size read above)
for index in range(len(boxes)):
box = boxes[index]
x1 = int(box[0]*image_size)
y1 = int(box[1]*image_size)
x2 = x1 + int(box[2]*image_size)
y2 = y1 + int(box[3]*image_size)
assert x1>=0 and x1<=image_size
assert x2>=0 and x2<=image_size
assert y1>=0 and y1<=image_size
assert y2>=0 and y2<=image_size
new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
# new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
new_bboxes.append(new_bbox)
instruction = r"Given an image, identify the objects and their bounding boxes in the format of {object,x1 y1 x2 y2}. "
instruction = "<Img><ImageHere></Img> {}".format(self.text_processor(instruction))
answer = sample_object_bbox(objects, new_bboxes)
# print("instruction",instruction)
# print("answer", answer)
return {
"image": sample[0],
"instruction_input": instruction,
"answer": answer,
"data_type": "bbox",
"question_split": True
}
class COCOBboxToObjectDataset(BaseDataset):
def __init__(self, vis_processor, text_processor, location):
super().__init__(vis_processor=vis_processor, text_processor=text_processor)
self.inner_dataset = wds.DataPipeline(
wds.ResampledShards(location),
wds.tarfile_to_samples(handler=wds.warn_and_continue),
wds.shuffle(1000, handler=wds.warn_and_continue),
wds.decode("pilrgb", handler=wds.warn_and_continue),
wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
wds.map(self.to_dict, handler=wds.warn_and_continue),
)
self.instruction_pool = [
"<Img><ImageHere></Img> what object is in this bounding box location {} ",
"<Img><ImageHere></Img> what object is in this location {} ",
"<Img><ImageHere></Img> identify the object present at this location {} ",
"<Img><ImageHere></Img> what is it in bounding box location{} ",
"<Img><ImageHere></Img> describe this object in {} ",
"<Img><ImageHere></Img> this {} is ",
"<Img><ImageHere></Img> the object in {} is ",
"<Img><ImageHere></Img> please tell me what is inside the bounding box position {} ",
"<Img><ImageHere></Img> what can you find in the bounding box area at position {}? ",
"<Img><ImageHere></Img> what is the object occupying this area {} ",
"<Img><ImageHere></Img> could you identify the content within the bounding box located at {} ",
]
def to_dict(self, sample):
objects = sample[1]["objects"]
boxes = sample[1]["bbox"]
new_bboxes = []
image_size = sample[0].shape[1]
image_size = 100  # same fixed 100x100 coordinate grid as in COCOBBOXDataset
for index in range(len(boxes)):
box = boxes[index]
x1 = int(box[0]*image_size)
y1 = int(box[1]*image_size)
x2 = x1 + int(box[2]*image_size)
y2 = y1 + int(box[3]*image_size)
assert x1>=0 and x1<=image_size
assert x2>=0 and x2<=image_size
assert y1>=0 and y1<=image_size
assert y2>=0 and y2<=image_size
new_bbox = "<"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
new_bboxes.append(new_bbox)
bbox, object = bbox_to_object(objects, new_bboxes)
instruction = random.choice(self.instruction_pool).format(bbox)
return {
"image": sample[0],
"instruction_input": instruction,
"answer": self.text_processor(object),
"data_type": "bbox",
"question_split": True
}
# class ReferCOCODataset(Dataset):
# def __init__(self, vis_processor, text_processor, vis_root, ann_path, dataset='refcoco', splitBy='unc'):
# """
# vis_root (string): Root directory of images (e.g. coco/images/)
# ann_root (string): directory to store the annotation file
# """
# self.vis_root = vis_root
# self.vis_processor = vis_processor
# self.text_processor = text_processor
# self.refer = REFER(ann_path, vis_root, dataset, splitBy)
# self.ref_ids = self.refer.getRefIds()
# self.instruction_pool = [
# "[refer] {}",
# "[refer] give me the location of {}",
# "[refer] where is {} ?",
# "[refer] from this image, tell me the location of {}",
# "[refer] the location of {} is",
# "[refer] could you tell me the location for {} ?",
# "[refer] where can I locate the {} ?",
# ]
# def __len__(self):
# return len(self.ref_ids)
# def preprocess(self, index):
# ref_id = self.ref_ids[index]
# ref = self.refer.loadRefs(ref_id)[0]
# image_file = 'COCO_train2014_{:0>12}.jpg'.format(ref["image_id"])
# image_path = os.path.join(self.vis_root, image_file)
# image = Image.open(image_path).convert("RGB")
# image_orig_size = image.size
# image = self.vis_processor(image)
# image_new_size = [image.shape[1], image.shape[2]]
# image_new_size = [100,100]
# sample_sentence = random.choice(ref['sentences'])['raw']
# refer_sentence = self.text_processor(sample_sentence)
# bbox = self.refer.getRefBox(ref['ref_id'])
# bbox_to_save = bbox
# image_id_to_save = ref["image_id"]
# ref_id_to_save = ref_id
# item = {"image":image_id_to_save,"bbox":bbox_to_save,"ref id":ref_id_to_save, "sentence":refer_sentence}
# def save_to_file():
# with lock:
# with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json", "r") as f:
# refer_json = json.load(f)
# if ref_id_to_save not in refer_json.keys():
# print(item)
# refer_json[ref_id_to_save] = item
# with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json", "w") as f:
# json.dump(refer_json, f)
# save_to_file()
# # with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json","r") as f:
# # refer_json = json.load(open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json"))
# # if ref_id_to_save not in refer_json.keys():
# # print(item)
# # refer_json[ref_id_to_save] = item
# # with open("/ibex/project/c2133/minigpt4_v2_dataset/refercoco_record/save.json","w") as f:
# # json.dump(refer_json,f)
# bbox = [
# bbox[0] / image_orig_size[0] * image_new_size[0],
# bbox[1] / image_orig_size[1] * image_new_size[1],
# (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
# (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
# ]
# bbox = [int(x) for x in bbox]
# bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
# return {
# "image": image,
# "refer_sentence": refer_sentence,
# "bbox": bbox,
# "image_id": ref['image_id'],
# }
# def __getitem__(self, index):
# data = self.preprocess(index)
# instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
# instruction = "<Img><ImageHere></Img> {} ".format(instruction)
# return {
# "image": data['image'],
# "instruction_input": instruction,
# "answer": data['bbox'],
# "image_id": data['image_id'],
# }
# class InvReferCOCODataset(ReferCOCODataset):
# def __init__(self, *args, **kwargs):
# super(InvReferCOCODataset, self).__init__(*args, **kwargs)
# self.instruction_pool = [
# "[identify] {}",
# "[identify] what object is in this location {}",
# "[identify] identify the object present at this location {}",
# "[identify] what is it in {}",
# "[identify] describe this object in {}",
# "[identify] this {} is",
# "[identify] the object in {} is",
# ]
# def __getitem__(self, index):
# data = self.preprocess(index)
# instruction = random.choice(self.instruction_pool).format(data['bbox'])
# instruction = "<Img><ImageHere></Img> {} ".format(instruction)
# return {
# "image": data['image'],
# "instruction_input": instruction,
# "answer": self.text_processor(data['refer_sentence']),
# "image_id": data['image_id'],
# }
class ReferCOCODataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path, dataset='refcoco', splitBy='unc'):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.refer = REFER(ann_path, vis_root, dataset, splitBy)
self.ref_ids = self.refer.getRefIds(split="train")
print(dataset, len(self.ref_ids))
self.instruction_pool = [
"[refer] {}",
"[refer] give me the location of {}",
"[refer] where is {} ?",
"[refer] from this image, tell me the location of {}",
"[refer] the location of {} is",
"[refer] could you tell me the location for {} ?",
"[refer] where can I locate the {} ?",
]
def __len__(self):
return len(self.ref_ids)
def preprocess(self, index):
ref_id = self.ref_ids[index]
ref = self.refer.loadRefs(ref_id)[0]
image_file = 'COCO_train2014_{:0>12}.jpg'.format(ref["image_id"])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image_orig_size = image.size
image = self.vis_processor(image)
image_new_size = [image.shape[1], image.shape[2]]
image_new_size = [100, 100]  # boxes are expressed on a fixed 100x100 grid rather than the processed image size
sample_sentence = random.choice(ref['sentences'])['raw']
refer_sentence = self.text_processor(sample_sentence)
bbox = self.refer.getRefBox(ref['ref_id'])
bbox = [
bbox[0] / image_orig_size[0] * image_new_size[0],
bbox[1] / image_orig_size[1] * image_new_size[1],
(bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
(bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
]
bbox = [int(x) for x in bbox]
bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
return {
"image": image,
"refer_sentence": refer_sentence,
"bbox": bbox,
"image_id": ref['image_id'],
}
def __getitem__(self, index):
data = self.preprocess(index)
instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": data['image'],
"instruction_input": instruction,
"answer": data['bbox'],
"image_id": data['image_id'],
}
class InvReferCOCODataset(ReferCOCODataset):
def __init__(self, *args, **kwargs):
super(InvReferCOCODataset, self).__init__(*args, **kwargs)
self.instruction_pool = [
"[identify] {}",
"[identify] what object is in this location {}",
"[identify] identify the object present at this location {}",
"[identify] what is it in {}",
"[identify] describe this object in {}",
"[identify] this {} is",
"[identify] the object in {} is",
]
def __getitem__(self, index):
data = self.preprocess(index)
instruction = random.choice(self.instruction_pool).format(data['bbox'])
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": data['image'],
"instruction_input": instruction,
"answer": self.text_processor(data['refer_sentence']),
"image_id": data['image_id'],
}
class REFER:
def __init__(self, data_root, vis_root, dataset='refcoco', splitBy='unc'):
# provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
# also provide dataset name and splitBy information
# e.g., dataset = 'refcoco', splitBy = 'unc'
dataset = dataset.split('inv')[-1] # inv dataset is stored in the same path as normal dataset
print('loading dataset %s into memory...' % dataset)
self.ann_dir = os.path.join(data_root, dataset)
if dataset in ['refcoco', 'refcoco+', 'refcocog']:
self.vis_root = vis_root
elif dataset == 'refclef':
raise ValueError('No RefClef image data')
else:
raise ValueError('No refer dataset is called [%s]' % dataset)
# load refs from data/dataset/refs(dataset).json
tic = time.time()
ref_file = os.path.join(self.ann_dir, 'refs(' + splitBy + ').p')
self.data = {}
self.data['dataset'] = dataset
self.data['refs'] = pickle.load(open(ref_file, 'rb'))
# load annotations from data/dataset/instances.json
instances_file = os.path.join(self.ann_dir, 'instances.json')
instances = json.load(open(instances_file, 'r'))
self.data['images'] = instances['images']
self.data['annotations'] = instances['annotations']
self.data['categories'] = instances['categories']
# create index
self.createIndex()
print('DONE (t=%.2fs)' % (time.time() - tic))
def createIndex(self):
# create sets of mapping
# 1) Refs: {ref_id: ref}
# 2) Anns: {ann_id: ann}
# 3) Imgs: {image_id: image}
# 4) Cats: {category_id: category_name}
# 5) Sents: {sent_id: sent}
# 6) imgToRefs: {image_id: refs}
# 7) imgToAnns: {image_id: anns}
# 8) refToAnn: {ref_id: ann}
# 9) annToRef: {ann_id: ref}
# 10) catToRefs: {category_id: refs}
# 11) sentToRef: {sent_id: ref}
# 12) sentToTokens: {sent_id: tokens}
print('creating index...')
# fetch info from instances
Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
for ann in self.data['annotations']:
Anns[ann['id']] = ann
imgToAnns[ann['image_id']] = imgToAnns.get(ann['image_id'], []) + [ann]
for img in self.data['images']:
Imgs[img['id']] = img
for cat in self.data['categories']:
Cats[cat['id']] = cat['name']
# fetch info from refs
Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
Sents, sentToRef, sentToTokens = {}, {}, {}
for ref in self.data['refs']:
# ids
ref_id = ref['ref_id']
ann_id = ref['ann_id']
category_id = ref['category_id']
image_id = ref['image_id']
# add mapping related to ref
Refs[ref_id] = ref
imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
refToAnn[ref_id] = Anns[ann_id]
annToRef[ann_id] = ref
# add mapping of sent
for sent in ref['sentences']:
Sents[sent['sent_id']] = sent
sentToRef[sent['sent_id']] = ref
sentToTokens[sent['sent_id']] = sent['tokens']
# create class members
self.Refs = Refs
self.Anns = Anns
self.Imgs = Imgs
self.Cats = Cats
self.Sents = Sents
self.imgToRefs = imgToRefs
self.imgToAnns = imgToAnns
self.refToAnn = refToAnn
self.annToRef = annToRef
self.catToRefs = catToRefs
self.sentToRef = sentToRef
self.sentToTokens = sentToTokens
print('index created.')
def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''):
image_ids = image_ids if type(image_ids) == list else [image_ids]
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
refs = self.data['refs']
else:
if not len(image_ids) == 0:
refs = [self.imgToRefs[image_id] for image_id in image_ids]
else:
refs = self.data['refs']
if not len(cat_ids) == 0:
refs = [ref for ref in refs if ref['category_id'] in cat_ids]
if not len(ref_ids) == 0:
refs = [ref for ref in refs if ref['ref_id'] in ref_ids]
if not len(split) == 0:
if split in ['testA', 'testB', 'testC']:
refs = [ref for ref in refs if
split[-1] in ref['split']] # we also consider testAB, testBC, ...
elif split in ['testAB', 'testBC', 'testAC']:
refs = [ref for ref in refs if ref['split'] == split] # rarely used I guess...
elif split == 'test':
refs = [ref for ref in refs if 'test' in ref['split']]
elif split == 'train' or split == 'val':
refs = [ref for ref in refs if ref['split'] == split]
else:
raise ValueError('No such split [%s]' % split)
ref_ids = [ref['ref_id'] for ref in refs]
return ref_ids
def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
image_ids = image_ids if type(image_ids) == list else [image_ids]
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
ann_ids = [ann['id'] for ann in self.data['annotations']]
else:
if not len(image_ids) == 0:
lists = [self.imgToAnns[image_id] for image_id in image_ids if image_id in self.imgToAnns] # list of [anns]
anns = list(itertools.chain.from_iterable(lists))
else:
anns = self.data['annotations']
if not len(cat_ids) == 0:
anns = [ann for ann in anns if ann['category_id'] in cat_ids]
ann_ids = [ann['id'] for ann in anns]
if not len(ref_ids) == 0:
ids = set(ann_ids).intersection(set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids]))
return ann_ids
def getImgIds(self, ref_ids=[]):
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if not len(ref_ids) == 0:
image_ids = list(set([self.Refs[ref_id]['image_id'] for ref_id in ref_ids]))
else:
image_ids = self.Imgs.keys()
return image_ids
def getCatIds(self):
return self.Cats.keys()
def loadRefs(self, ref_ids=[]):
if type(ref_ids) == list:
return [self.Refs[ref_id] for ref_id in ref_ids]
elif type(ref_ids) == int:
return [self.Refs[ref_ids]]
def loadAnns(self, ann_ids=[]):
if type(ann_ids) == list:
return [self.Anns[ann_id] for ann_id in ann_ids]
elif type(ann_ids) == int:
return [self.Anns[ann_ids]]
def loadImgs(self, image_ids=[]):
if type(image_ids) == list:
return [self.Imgs[image_id] for image_id in image_ids]
elif type(image_ids) == int:
return [self.Imgs[image_ids]]
def loadCats(self, cat_ids=[]):
if type(cat_ids) == list:
return [self.Cats[cat_id] for cat_id in cat_ids]
elif type(cat_ids) == int:
return [self.Cats[cat_ids]]
def getRefBox(self, ref_id):
ref = self.Refs[ref_id]
ann = self.refToAnn[ref_id]
return ann['bbox'] # [x, y, w, h]
def showRef(self, ref, seg_box='box'):
ax = plt.gca()
# show image
image = self.Imgs[ref['image_id']]
I = io.imread(os.path.join(self.vis_root, image['file_name']))
ax.imshow(I)
# show refer expression
for sid, sent in enumerate(ref['sentences']):
print('%s. %s' % (sid + 1, sent['sent']))
# show segmentations
if seg_box == 'seg':
ann_id = ref['ann_id']
ann = self.Anns[ann_id]
polygons = []
color = []
c = 'none'
if type(ann['segmentation'][0]) == list:
# polygon used for refcoco*
for seg in ann['segmentation']:
poly = np.array(seg).reshape((len(seg) // 2, 2))  # integer division: reshape needs int dimensions
polygons.append(Polygon(poly, True, alpha=0.4))
color.append(c)
p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 1, 0, 0), linewidths=3, alpha=1)
ax.add_collection(p) # thick yellow polygon
p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 0, 0, 0), linewidths=1, alpha=1)
ax.add_collection(p) # thin red polygon
else:
# mask used for refclef
raise NotImplementedError('RefClef is not downloaded')
# show bounding-box
elif seg_box == 'box':
ann_id = ref['ann_id']
ann = self.Anns[ann_id]
bbox = self.getRefBox(ref['ref_id'])
box_plot = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor='green', linewidth=3)
ax.add_patch(box_plot)
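
`ReferCOCODataset.preprocess` above converts the REFER annotation's `[x, y, w, h]` box into corner coordinates on a fixed 100x100 grid and renders it as the `{<x1><y1><x2><y2>}` string the model is trained to emit. A standalone sketch of that conversion (toy numbers; the dataset class performs the same arithmetic):

```
def bbox_to_token(bbox, image_orig_size, grid=100):
    """[x, y, w, h] in original-image pixels -> '{<x1><y1><x2><y2>}' on a grid x grid scale."""
    w0, h0 = image_orig_size
    x1 = bbox[0] / w0 * grid
    y1 = bbox[1] / h0 * grid
    x2 = (bbox[0] + bbox[2]) / w0 * grid
    y2 = (bbox[1] + bbox[3]) / h0 * grid
    return "{{<{}><{}><{}><{}>}}".format(*[int(v) for v in (x1, y1, x2, y2)])

print(bbox_to_token([120, 60, 200, 150], image_orig_size=(640, 480)))   # {<18><12><50><43>}
```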

@@ -0,0 +1,184 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import os
import json
import random
from PIL import Image
from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset
from collections import OrderedDict
class __DisplMixin:
def displ_item(self, index):
sample, ann = self.__getitem__(index), self.annotation[index]
return OrderedDict(
{
"file": ann["image"],
"question": ann["question"],
"question_id": ann["question_id"],
"answers": "; ".join(ann["answer"]),
"image": sample["image"],
}
)
class COCOVQADataset(VQADataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool =[
"[vqa] {}",
"[vqa] Based on the image, respond to this question with a short answer: {}"
]
exist_annotation = []
for ann in self.annotation:
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
if os.path.exists(image_path):
exist_annotation.append(ann)
self.annotation = exist_annotation
def get_data(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
question_id = ann["question_id"]
answer_weight = {}
for answer in ann["answer"]:
if answer in answer_weight.keys():
answer_weight[answer] += 1 / len(ann["answer"])
else:
answer_weight[answer] = 1 / len(ann["answer"])
answers = list(answer_weight.keys())
weights = list(answer_weight.values())
answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights
# if "unk" in answer:
# print("cocovqa", answer)
return {
"image": image,
"question": question,
"question_id": question_id,
"answer": answer,
}
def __getitem__(self, index):
data = self.get_data(index)
instruction = random.choice(self.instruction_pool).format(data['question'])
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": data['image'],
"question_id": data["question_id"],
"instruction_input": instruction,
"answer": self.text_processor(data['answer']),
}
class COCOVQGDataset(COCOVQADataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool = [
'Given the image, generate a question whose answer is: {}',
'Based on the image, provide a question with the answer: {}',
'Given the visual representation, create a question for which the answer is "{}"',
'From the image provided, craft a question that leads to the reply: {}',
'Considering the picture, come up with a question where the answer is: {}',
'Taking the image into account, generate a question that has the answer: {}'
]
def __getitem__(self, index):
data = self.get_data(index)
instruction = random.choice(self.instruction_pool).format(data['answer'])
instruction = "<Img><ImageHere></Img> {}".format(instruction)
return {
"image": data['image'],
"question_id": data["question_id"],
"instruction_input": instruction,
"answer": data['question'],
}
class COCOVQAEvalDataset(VQAEvalDataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.instruction_pool = [
# '{}',
# 'Question: {}',
# '{} A short answer to the question is',
# 'Q: {} A:',
'Question: {} Short answer:',
# 'Given the image, answer the following question with no more than three words. {}',
# 'Based on the image, respond to this question with a short answer: {}.',
# 'Use the provided image to answer the question: {} Provide your answer as short as possible.',
# 'What is the answer to the following question? "{}"',
# 'The question "{}" can be answered using the image. A short answer is'
]
# print('vis_root', vis_root)
self.vis_root = vis_root
self.annotation = json.load(open(ann_paths[0]))
answer_list_path = ann_paths[1]
if os.path.exists(answer_list_path):
self.answer_list = json.load(open(answer_list_path))
else:
self.answer_list = None
try:
self.coco_fmt_qust_file = ann_paths[2]
self.coco_fmt_anno_file = ann_paths[3]
except IndexError:
self.coco_fmt_qust_file = None
self.coco_fmt_anno_file = None
self.vis_processor = vis_processor
self.text_processor = text_processor
self._add_instance_ids()
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
instruction = random.choice(self.instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": image,
'image_path': image_path,
"question": question,
"question_id": ann["question_id"],
"instruction_input": instruction,
"instance_id": ann["instance_id"],
}
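A minimal standalone sketch of the weighted answer sampling done in COCOVQADataset.get_data above; the annotation dict here is a made-up toy example:
```
import random

# toy VQA-style annotation with ten human answers
ann = {"answer": ["bus", "bus", "bus", "car", "car", "truck", "bus", "bus", "car", "bus"]}

# each distinct answer gets weight = (its count) / (total number of answers)
answer_weight = {}
for answer in ann["answer"]:
    answer_weight[answer] = answer_weight.get(answer, 0) + 1 / len(ann["answer"])

answers = list(answer_weight.keys())
weights = list(answer_weight.values())

# one answer is drawn per __getitem__ call, proportionally to its frequency
sampled = random.choices(answers, weights=weights, k=1)[0]
print(sampled)  # "bus" most of the time
```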

View File

@ -0,0 +1,290 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class SingleSlideVQADataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.data = self.create_data(ann_path)
# self.instruction_pool = [
# "###Human: <Img><ImageHere></Img> {}###Assistant: ",
# "###Human: <Img><ImageHere></Img> From this slide, {}###Assistant: ",
# ]
self.instruction_pool = [
"<Img><ImageHere></Img> {}",
"<Img><ImageHere></Img> From this slide, {}",
]
def create_data(self, ann_path):
with open(ann_path, 'r') as f:
samples = f.readlines()
data = []
for sample in samples:
sample = json.loads(sample)
if len(sample['evidence_pages']) != 1: continue # skip questions that need more than one slide page
page = sample['evidence_pages'][0]
image_name = 'slide_{}_1024.jpg'.format(page)
# assert [int(image_name.split('-')[-2]) for image_name in image_names] == list(range(1, 21)) # check the format
image_path = os.path.join(sample['deck_name'], image_name)
data.append({
'qa_id': sample['qa_id'],
'question': sample['question'],
'answer': sample['answer'],
'image_path': image_path
})
print("single slide ",len(data))
return data
def __len__(self):
return len(self.data)
def __getitem__(self, index):
sample = self.data[index]
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
image = self.vis_processor(image)
# instruction = self.text_processor(sample["question"])
instruction = random.choice(self.instruction_pool).format(self.text_processor(sample["question"]))
# instruction = random.choice(self.instruction_pool).format(self.text_processor(sample["question"]))
return {
"image": image,
"instruction_input": instruction,
"answer": sample['answer'],
"qa_id": sample['qa_id'],
}
class OCRVQADataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.data = self.create_data(ann_path)
self.instruction_pool =[
"[vqa] {}",
"[vqa] Based on the image, respond to this question with a short answer: {}"
]
def create_data(self, ann_path):
processed_data = []
with open(ann_path, 'r') as f:
data = json.load(f)
for k in data.keys():
if data[k]['split'] != 1: continue # 1 for training, 2 for validation, 3 for test
ext = os.path.splitext(data[k]['imageURL'])[1]
imageFile = k + ext
assert len(data[k]['questions']) == len(data[k]['answers'])
for q, a in zip(data[k]['questions'], data[k]['answers']):
processed_data.append(
{'question': q,
'answer': a,
'image_path': imageFile,
'image_id': k,
'title': data[k]['title'],
'genre': data[k]['genre'],
}
)
print("ocr vqa", len(processed_data))
return processed_data
def __len__(self):
return len(self.data)
def __getitem__(self, index):
sample = self.data[index]
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(sample["question"])
answer = self.text_processor(sample["answer"])
instruction = random.choice(self.instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": sample['image_id']
}
class TextOCRDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.data = self.create_data(ann_path)
self.instruction_pool = [
"<Img><ImageHere></Img> [OCR] {}"
]
def create_data(self, ann_path):
processed_data = []
with open(ann_path, 'r') as f:
data = json.load(f)
for k in data["anns"].keys():
# ext = os.path.splitext(data[k]['imageURL'])[1]
imageFile = data["anns"][k]["image_id"]+".jpg"
bbox = data["anns"][k]["bbox"]
text = data["anns"][k]["utf8_string"]
# assert len(data[k]['questions']) == len(data[k]['answers'])
# for q, a in zip(data[k]['questions'], data[k]['answers']):
processed_data.append(
{'bbox': bbox,
'answer': text,
'image_path': imageFile,
'image_id': k,
}
)
return processed_data
def __len__(self):
return len(self.data)
def __getitem__(self, index):
sample = self.data[index]
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
width, height = image.size
image = self.vis_processor(image)
new_bbox =""
image_size = 100
bbox = sample['bbox']
# bbox is a flat [x, y, w, h] pixel box; rescale it to the 100-unit grid used for location tokens
x1 = int(bbox[0]/width*image_size)
y1 = int(bbox[1]/height*image_size)
x2 = x1 + int(bbox[2]/width*image_size)
y2 = y1 + int(bbox[3]/height*image_size)
assert 0 <= x1 <= image_size and 0 <= x2 <= image_size
assert 0 <= y1 <= image_size and 0 <= y2 <= image_size
new_bbox = " <"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">"
instruction = random.choice(self.instruction_pool).format(new_bbox)
return {
"image": image,
"instruction_input": instruction,
"answer": sample['answer'],
"image_id": sample['image_id']
}
class PlotVQADataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.data = self.create_data(ann_path)
self.instruction_pool = [
'{}',
'Question: {}',
'{} A short answer to the question is',
'Q: {} A:',
'Question: {} Short answer:',
# 'Given the image, answer the following question with no more than three words. {}',
'Based on the image, respond to this question with a short answer: {}.',
'Use the provided image to answer the question: {} Provide your answer as short as possible.',
'What is the answer to the following question? "{}"',
'The question "{}" can be answered using the image. A short answer is'
]
def create_data(self, ann_path):
processed_data = []
with open(ann_path, 'r') as f:
data = json.load(f)
for da in data["qa_pairs"]:
# ext = os.path.splitext(data[k]['imageURL'])[1]
imageFile = str(da["image_index"])+".png"
question = da["question_string"]
answer = str(da["answer"])
# assert len(data[k]['questions']) == len(data[k]['answers'])
# for q, a in zip(data[k]['questions'], data[k]['answers']):
processed_data.append(
{'question': question,
'answer': answer,
'image_path': imageFile,
'image_id': str(da["image_index"]),
}
)
return processed_data
def __len__(self):
return len(self.data)
def __getitem__(self, index):
sample = self.data[index]
image = Image.open(os.path.join(self.vis_root, sample['image_path'])).convert("RGB")
# width, height = image.size
image = self.vis_processor(image)
# image_shape = image.shape
instruction = "<Img><ImageHere></Img> {} ".format(sample["question"])
instruction = random.choice(self.instruction_pool).format(instruction)
answer = sample["answer"]
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": sample['image_id']
}
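For reference, the pixel-to-grid rescaling done in TextOCRDataset.__getitem__ above can be written as a small helper; the box and image dimensions below are invented:
```
def bbox_to_grid_string(bbox, width, height, image_size=100):
    """Rescale an [x, y, w, h] pixel box onto an image_size x image_size grid
    and serialize it as location tokens (the dataset prepends a space)."""
    x1 = int(bbox[0] / width * image_size)
    y1 = int(bbox[1] / height * image_size)
    x2 = x1 + int(bbox[2] / width * image_size)
    y2 = y1 + int(bbox[3] / height * image_size)
    assert 0 <= x1 <= image_size and 0 <= x2 <= image_size
    assert 0 <= y1 <= image_size and 0 <= y2 <= image_size
    return "<{}><{}><{}><{}>".format(x1, y1, x2, y2)

# e.g. a 320x240 pixel box at (256, 192) on a 1024x768 image
print(bbox_to_grid_string([256, 192, 320, 240], width=1024, height=768))  # <25><25><56><56>
```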

View File

@ -0,0 +1,159 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class GroundedDetailDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.instruction_pool = [
'[grounding] please describe this image in details',
'[grounding] describe this image as detailed as possible',
'[grounding] summarize this image in details',
'[grounding] give a thorough description of what you see in this image',
]
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
image_file = '{}.jpg'.format(info['image_id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
answer = info['grounded_caption']
instruction = random.choice(self.instruction_pool)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": info['image_id'],
}
class CaptionToObjectDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.instruction_pool = [
'[detection] {}',
]
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
image_file = '{}.jpg'.format(info['image_id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
input = info["caption"]
answer = info["output"]
instruction = random.choice(self.instruction_pool).format(input)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": info['image_id'],
}
class PhraseToObjectDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.instruction_pool = [
'[detection] {}',
]
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_id'])
image_file = '{}.jpg'.format(info['image_id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
input = info["phrase"]
answer = "<p>"+input+"</p> "+info["bbox"]
instruction = random.choice(self.instruction_pool).format(input)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": info['image_id'],
}
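A quick illustration of the instruction/answer strings produced by PhraseToObjectDataset above; the phrase and box values are fabricated:
```
# fabricated annotation entry
info = {"phrase": "a red umbrella", "bbox": "{<12><34><56><78>}"}

instruction_pool = ['[detection] {}']
instruction = instruction_pool[0].format(info["phrase"])
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
answer = "<p>" + info["phrase"] + "</p> " + info["bbox"]

print(instruction)  # '<Img><ImageHere></Img> [detection] a red umbrella '
print(answer)       # '<p>a red umbrella</p> {<12><34><56><78>}'
```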

View File

@ -0,0 +1,65 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import os
import json
from PIL import Image
from minigpt4.datasets.datasets.vqa_datasets import VQADataset
from collections import OrderedDict
import random
class __DisplMixin:
def displ_item(self, index):
sample, ann = self.__getitem__(index), self.annotation[index]
return OrderedDict(
{
"file": ann["image"],
"question": ann["question"],
"question_id": ann["question_id"],
"answers": "; ".join(ann["answer"]),
"image": sample["image"],
}
)
class GQADataset(VQADataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool =[
"[vqa] {}",
"[vqa] Based on the image, respond to this question with a short answer: {}"
]
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
instruction = random.choice(self.instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
answers = self.text_processor(ann["answer"])
# if "unk" in answers:
# print("gqa",answers)
# print(answers)
return {
"image": image,
"instruction_input": instruction,
"answer": answers,
# "weights": weights,
}

View File

@ -0,0 +1,390 @@
import os
import json
import pickle
import random
import time
# import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class LlavaDetailDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
answer = info['conversations'][1]['value']
instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": info['id'],
}
class LlavaReasonDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
answer = info['conversations'][1]['value']
instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
# answer = self.text_processor(answer)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
"image_id": info['id'],
}
class MiniGPT4v(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.instruction_pool = [
'please describe this image as detailed as possible',
'What do you see happening in this image?',
"Can you elaborate on the elements of the picture provided?",
"Describe the following image.",
"Write a detailed description of the given image.",
"Write a detailed description of the given image.",
"Explain the visual content of the image in great detail"
]
self.ann=[]
with open(ann_path,"r") as f:
for line in f.readlines():
self.ann.append(json.loads(line))
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
# print("info keys",info.keys())
if "image_path" in info.keys():
image_path = "/ibex/reference/CV/COCO/cocoapi/data/2017/images/jpeg/train/"+info['image_path']
else:
# print("coming here?")
image_file = "images/"+info["image"]
image_path = os.path.join(self.vis_root, image_file)
# print(image_path)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
if "question" in info.keys():
question = info['question']
else:
question = random.sample(self.instruction_pool,1)[0]
answer = info["caption"]
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
# answer = self.text_processor(answer)
# print("image path", image_path)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
# "image_id": info['id'],
}
class MiniGPT4v_emotion(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.instruction_pool = [
'please describe this image as detailed as possible',
'What do you see happening in this image?',
"Can you elaborate on the elements of the picture provided?",
"Describe the following image",
"Write a detailed description of the given image",
"Write a detailed description of the given image",
"Explain the visual content of the image in great detail"
]
# self.ann=[]
with open(ann_path,"r") as f:
# for line in f.readlines():
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
# print("info keys",info.keys())
# print("coming here?")
image_file = info["link"]
image_path = os.path.join(self.vis_root, image_file)
# print("image path",image_path)
# print(image_path)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = random.sample(self.instruction_pool,1)[0]
answer = info["caption"]
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
# answer = self.text_processor(answer)
# print("image path", image_path)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
# "image_id": info['id'],
}
class MiniGPT4v_laion(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.instruction_pool = [
'please describe this image as detailed as possible',
'What do you see happening in this image?',
"Can you elaborate on the elements of the picture provided?",
"Describe the following image",
"Write a detailed description of the given image",
"Write a detailed description of the given image",
"Explain the visual content of the image in great detail"
]
# self.ann=[]
with open(ann_path,"r") as f:
# for line in f.readlines():
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
# image_file = 'COCO_train2014_{}.jpg'.format(info['image_path'])
# print("info keys",info.keys())
# print("coming here?")
image_file = info["link"]
image_path = os.path.join(self.vis_root, image_file)
# print(image_path)
# print(image_path)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = random.sample(self.instruction_pool,1)[0]
answer = info["caption"]
instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(question))
# instruction = '<Img><ImageHere></Img> {} '.format(self.text_processor(instruction))
# answer = self.text_processor(answer)
# print("image path", image_path)
return {
"image": image,
"instruction_input": instruction,
"answer": answer,
# "image_id": info['id'],
}
class Minigpt2_conversation(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
answer = info['conversations'][1]['value']
instruction = info['conversations'][0]['value']
# print("instruction",instruction)
# print("answer", answer)
return {
"instruction_input": instruction,
"answer": answer,
}
class LlavaConversationDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.ann = []
with open(ann_path, 'r') as f:
self.ann = json.load(f)
self.connect_sym = "!@#"
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
first_instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
first_instruction = '<Img><ImageHere></Img> {} '.format(first_instruction)
questions = [first_instruction]
answers = []
for i, item in enumerate(info["conversations"][1:]):
if i % 2 ==0: # assistant
assistant_answer = item["value"]
answers.append(assistant_answer)
else:
human_instruction = item["value"]+" "
questions.append(human_instruction)
questions = self.connect_sym.join(questions)
# questions = questions.replace("\\\\","\\")
answers = self.connect_sym.join(answers)
return {
"image": image,
"conv_q": questions,
'conv_a': answers,
"image_id": info['id'],
"connect_sym": self.connect_sym
}
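A short sketch of how the conversation strings packed by LlavaConversationDataset above can be unpacked again by a consumer; the turn contents are invented:
```
connect_sym = "!@#"

# what conv_q / conv_a might look like for a two-round conversation (toy content)
questions = connect_sym.join([
    "<Img><ImageHere></Img> What is on the table? ",
    "What color is it? ",
])
answers = connect_sym.join([
    "A laptop.",
    "Silver.",
])

# a training loop can recover the aligned per-turn lists like this
for q, a in zip(questions.split(connect_sym), answers.split(connect_sym)):
    print(repr(q), "->", repr(a))
```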

View File

@ -0,0 +1,75 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class MultiTaskConversationDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
with open(ann_path, 'r') as f:
self.ann = json.load(f)
self.connect_sym = "!@#"
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]
image_file = 'COCO_train2014_{}.jpg'.format(info['id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
first_instruction = info['conversations'][0]['value'].replace('<image>', '').replace('\n', '').strip()
first_instruction = '<Img><ImageHere></Img> {} '.format(first_instruction)
questions = [first_instruction]
answers = []
for i, item in enumerate(info["conversations"][1:]):
if i % 2 ==0: # assistant
assistant_answer = item["value"]
answers.append(assistant_answer)
else:
human_instruction = item["value"]+" "
questions.append(human_instruction)
questions = self.connect_sym.join(questions)
answers = self.connect_sym.join(answers)
return {
"image": image,
"conv_q": questions,
'conv_a': answers,
"image_id": info['id'],
"connect_sym": self.connect_sym
}

View File

@ -0,0 +1,186 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class TextCapDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
self.instruction_pool = [
# "generate a short image caption incorporating text in the image",
# "generate a brief image description combining the text shown in the image",
# "what text is written in this image?",
# "describe the text that you can see from this image",
# "What does the text in the image say?"
'Briefly describe this image.',
'Provide a concise depiction of this image.',
'Present a short description of this image.',
'Summarize this image in a few words.',
'A short image caption:',
'A short image description:',
'A photo of ',
'An image that shows ',
'Write a short description for the image. ',
'Write a description for the photo.',
'Provide a description of what is presented in the photo.',
'Briefly describe the content of the image.',
'Can you briefly explain what you see in the image?',
'Could you use a few words to describe what you perceive in the photo?',
'Please provide a short depiction of the picture.',
'Using language, provide a short account of the image.',
'Use a few words to illustrate what is happening in the picture.',
]
with open(ann_path, 'r') as f:
self.ann = json.load(f)
def __len__(self):
return len(self.ann["data"])
def __getitem__(self, index):
info = self.ann["data"][index]
image_file = '{}.jpg'.format(info['image_id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
# image_width,image_length = image.size
image = self.vis_processor(image)
# ocr_info = self.ann[index]["data"]
caption = info["caption_str"]
caption = self.text_processor(caption)
# instruction = random.choice(self.instruction_pool).format(word_bbox)
instruction = "<Img><ImageHere></Img> [caption] {} ".format(random.choice(self.instruction_pool))
return {
"image": image,
"instruction_input": instruction,
"answer": caption,
"data_type": "bbox",
"question_split": True
}
class TextCapBboxToObjectDataset(Dataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.vis_processor = vis_processor
self.text_processor = text_processor
# self.instruction_pool = [
# "<Img><ImageHere></Img> What text does it show in {} ",
# "<Img><ImageHere></Img> Extract the text from {} ",
# "<Img><ImageHere></Img> What is the textual content in {} ",
# "<Img><ImageHere></Img> Extract the textual information present in the {} ",
# "<Img><ImageHere></Img> What is the text written within this defined region {}",
# "<Img><ImageHere></Img> Transcribe the text located inside {}",
# "<Img><ImageHere></Img> Can you read and extract the text from this specific area {}",
# ]
self.instruction_pool = [
"<Img><ImageHere></Img> [OCR] {}"
]
with open(ann_path, 'r') as f:
self.ann = json.load(f)
self.new_ann = {"data":[]}
for da in self.ann["data"]:
if da["ocr_info"] !=[]:
ocr_info_filter = []
for d in da["ocr_info"]:
if (d["bounding_box"]["width"]+d["bounding_box"]["top_left_x"])<=1.0 and (d["bounding_box"]["height"]+d["bounding_box"]["top_left_y"]) <=1.0 \
and d["bounding_box"]["top_left_x"]>=0 and d["bounding_box"]["top_left_y"]>=0:
ocr_info_filter.append(d)
if ocr_info_filter !=[]:
da["ocr_info"]=ocr_info_filter
self.new_ann["data"].append(da)
self.ann = self.new_ann
def __len__(self):
return len(self.ann["data"])
def __getitem__(self, index):
info = self.ann["data"][index]
image_file = '{}.jpg'.format(info['image_id'])
image_path = os.path.join(self.vis_root, image_file)
image = Image.open(image_path).convert("RGB")
# image_width,image_length = image.size
image = self.vis_processor(image)
image_size = 100
ocr_info = info["ocr_info"]
sampled_ocr = random.sample(ocr_info,1)[0]
# print("sampled ocr", sampled_ocr)
word_text = sampled_ocr["word"]
width = sampled_ocr["bounding_box"]["width"]
height = sampled_ocr["bounding_box"]["height"]
top_left_x = sampled_ocr["bounding_box"]["top_left_x"]
top_left_y = sampled_ocr["bounding_box"]["top_left_y"]
x1 = int(top_left_x*image_size)
y1 = int(top_left_y*image_size)
x2 = x1 + int(width*image_size)
y2 = y1 + int(height*image_size)
assert x1>=0 and x1<=image_size
assert x2>=0 and x2<=image_size
assert y1>=0 and y1<=image_size
assert y2>=0 and y2<=image_size
word_bbox = "{<"+str(x1)+"><"+str(y1)+"><"+str(x2)+"><"+str(y2)+">}"
instruction = random.choice(self.instruction_pool).format(word_bbox)
return {
"image": image,
"instruction_input": instruction,
"answer": word_text,
"data_type": "bbox",
"question_split": True
}
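The bounding-box filter in TextCapBboxToObjectDataset.__init__ above can be summarized as a small predicate; the OCR records below are fabricated:
```
def box_in_unit_square(b):
    """True if a fractional [top_left_x, top_left_y, width, height] box stays inside the image."""
    return (b["top_left_x"] >= 0 and b["top_left_y"] >= 0
            and b["top_left_x"] + b["width"] <= 1.0
            and b["top_left_y"] + b["height"] <= 1.0)

ocr_info = [
    {"word": "STOP", "bounding_box": {"top_left_x": 0.1, "top_left_y": 0.2, "width": 0.3, "height": 0.1}},
    {"word": "noise", "bounding_box": {"top_left_x": 0.9, "top_left_y": 0.9, "width": 0.3, "height": 0.3}},
]
kept = [d for d in ocr_info if box_in_unit_square(d["bounding_box"])]
print([d["word"] for d in kept])  # ['STOP']
```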

View File

@ -0,0 +1,52 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from torch.utils.data import Dataset
import webdataset as wds
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
class UnnaturalDataset(Dataset):
def __init__(self, text_processor, ann_path):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.text_processor = text_processor
with open(ann_path, 'r') as f:
self.ann = json.load(f)
# with open(ann_path, 'r') as f:
# for data in f.readlines():
# data = json.loads(data)
# self.ann.append(data)
def __len__(self):
return len(self.ann)
def __getitem__(self, index):
info = self.ann[index]["instances"][0]
instruction = info["instruction_with_input"]
constraints = info["constraints"]
answer = info["output"]
if constraints is not None:
instruction = instruction + " " + constraints
return {
# "image":None,
"instruction_input": self.text_processor(instruction),
"answer": self.text_processor(answer),
}
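For reference, a toy record in the shape UnnaturalDataset expects, run through the same instruction/constraints concatenation as __getitem__ above (the content itself is invented):
```
import json

record = {
    "instances": [{
        "instruction_with_input": "Rewrite the sentence 'the cat sat' in passive voice.",
        "constraints": "Answer in one sentence.",
        "output": "The sitting was done by the cat.",
    }]
}

info = record["instances"][0]
instruction = info["instruction_with_input"]
if info["constraints"] is not None:
    instruction = instruction + " " + info["constraints"]
print(json.dumps({"instruction_input": instruction, "answer": info["output"]}, indent=2))
```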

View File

@ -0,0 +1,98 @@
import os
import json
import pickle
import random
import time
import itertools
import numpy as np
from PIL import Image
from torch.utils.data import Dataset
from visual_genome import local
import threading
# Global lock
lock = threading.Lock()
class ReferVisualGenomeDataset(Dataset):
def __init__(self, vis_processor, text_processor, data_dir):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.data_dir = data_dir
self.vis_processor = vis_processor
self.text_processor = text_processor
all_regions = local.get_all_region_descriptions(self.data_dir)
all_regions = [region for regions in all_regions for region in regions]
# follow OFA practice, only regions smaller than 16384 pixels are used for refer
self.regions = [region for region in all_regions if region.width * region.height < 16384]
print('Visual Genome grounding', len(self.regions))
self.instruction_pool = [
"[refer] {}",
"[refer] give me the location of {}",
"[refer] where is {} ?",
"[refer] from this image, tell me the location of {}",
"[refer] the location of {} is",
"[refer] could you tell me the location for {} ?",
"[refer] where can I locate the {} ?",
]
def __len__(self):
return len(self.regions)
def preprocess(self, index):
region = self.regions[index]
image_file = region.image.url.split('/')[-2:]
image_path = os.path.join(self.data_dir, *image_file)
image = Image.open(image_path).convert("RGB")
image_orig_size = image.size
image = self.vis_processor(image)
image_new_size = [100,100]
sample_sentence = region.phrase
refer_sentence = self.text_processor(sample_sentence)
bbox = [region.x, region.y, region.width, region.height]
bbox = [
bbox[0] / image_orig_size[0] * image_new_size[0],
bbox[1] / image_orig_size[1] * image_new_size[1],
(bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
(bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1]
]
bbox = [int(x) for x in bbox]
bbox = "{{<{}><{}><{}><{}>}}".format(*bbox)
return {
"image": image,
"refer_sentence": refer_sentence,
"bbox": bbox,
"image_id": region.image.id,
}
def __getitem__(self, index):
data = self.preprocess(index)
instruction = random.choice(self.instruction_pool).format(data['refer_sentence'])
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
# assert False
return {
"image": data['image'],
"instruction_input": instruction,
"answer": data['bbox'],
"image_id": data['image_id'],
}
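A tiny sketch of the area filter and box formatting applied by ReferVisualGenomeDataset above; the region and image sizes are hypothetical:
```
# hypothetical Visual Genome region: top-left corner plus width/height in pixels
region = {"x": 100, "y": 50, "width": 80, "height": 120}
image_orig_size = (640, 480)   # (width, height) of the source image
image_new_size = [100, 100]    # grid used for the location tokens

# regions of 16384 pixels or more are dropped, following the filter above
assert region["width"] * region["height"] < 16384

bbox = [region["x"], region["y"], region["width"], region["height"]]
bbox = [
    bbox[0] / image_orig_size[0] * image_new_size[0],
    bbox[1] / image_orig_size[1] * image_new_size[1],
    (bbox[0] + bbox[2]) / image_orig_size[0] * image_new_size[0],
    (bbox[1] + bbox[3]) / image_orig_size[1] * image_new_size[1],
]
print("{{<{}><{}><{}><{}>}}".format(*[int(v) for v in bbox]))  # {<15><10><28><35>}
```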

View File

@ -0,0 +1,223 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import torch
from PIL import Image
import os
from minigpt4.datasets.datasets.base_dataset import BaseDataset
class VQADataset(BaseDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
# def collater(self, samples):
# image_list, question_list, answer_list, weight_list = [], [], [], []
# num_answers = []
# for sample in samples:
# image_list.append(sample["image"])
# question_list.append(sample["question"])
# weight_list.extend(sample["weights"])
# answers = sample["answer"]
# answer_list.extend(answers)
# num_answers.append(len(answers))
# return {
# "image": torch.stack(image_list, dim=0),
# "text_input": question_list,
# "answer": answer_list,
# "weight": torch.Tensor(weight_list),
# "n_answers": torch.LongTensor(num_answers),
# }
class VQAEvalDataset(BaseDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
class OKVQAEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_id = data['image_id']
question = data['question']
question_id = data['question_id']
img_file = '{:0>12}.jpg'.format(img_id)
image_path = os.path.join(self.root_path, img_file)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, question_id, img_id
class VizWizEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_id = data['image']
question = data['question']
answers = data['answers']
answers = '_'.join([answer['answer'] for answer in answers])
image_path = os.path.join(self.root_path, img_id)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
# question = f"[vqa] Based on the image, respond to this question with a short answer: {question} "
question = f"[vqa] Based on the image, respond to this question with a short answer: {question} and reply 'unanswerable' if you could not answer it"
return image, question, answers
class AOKVQADAEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_file = data['image']
question = data['question']
question_id = data['question_id']
image_path = os.path.join(self.root_path, img_file)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, question_id
class AOKVQAMCEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_file = data['image']
question = data['question']
question_id = data['question_id']
image_path = os.path.join(self.root_path, img_file)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image).half().cuda()
candidates=data['choices']
# question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, question_id, candidates
class IconQAEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
image_id = data['image_id']
question = data['question']
image_path = os.path.join(self.root_path, image_id, 'image.png')
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image).half().cuda()
candidates = '_'.join(data['choices'])
answer = data['answer']
# question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, candidates, answer
class GQAEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
ann = self.loaded_data[idx]
image_id = ann["image"]
image_path = os.path.join(self.root_path, f"{image_id}")
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = ann["question"]
# question = f'Question: {question} Short answer: '
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
labels = ann["answer"]
return image, question, labels
class HMEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
ann = self.loaded_data[idx]
image_id = ann["img"]
image_path = os.path.join(self.root_path, f"{image_id}")
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = ann["text"]
question = f"[vqa] This is an image with: \"{question}\" written on it. Is it hateful? Answer:"
labels = ann["label"]
return image, question, labels
class VSREvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
ann = self.loaded_data[idx]
image_path = os.path.join(self.root_path, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = ann["caption"]
question = f'[vqa] Based on the image, is this statement true or false? {question}'
question_id = ann["image"].split('.')[0]
labels = 'true' if ann["label"] == 1 else 'false'
return image, question, labels
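These eval wrappers return plain tuples, so they batch directly with a torch DataLoader; a usage sketch with placeholder paths and a stand-in image transform (the real pipeline builds the processor from the eval config):
```
import json
from torch.utils.data import DataLoader
from torchvision import transforms

from minigpt4.datasets.datasets.vqa_datasets import OKVQAEvalData

# placeholders: point these at a real OK-VQA test annotation file and image root
loaded_data = json.load(open("/path/to/okvqa_test_split.json"))
root_path = "/path/to/coco/images/val2014"

# stand-in processor for illustration only
vis_processor = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
])

dataset = OKVQAEvalData(loaded_data, vis_processor, root_path)
loader = DataLoader(dataset, batch_size=8, shuffle=False, num_workers=2)

for images, questions, question_ids, img_ids in loader:
    # images: (B, 3, 448, 448) tensor; questions: tuple of prompt strings
    pass
```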

View File

@ -0,0 +1,300 @@
model:
arch: minigpt_v2
model_type: pretrain
freeze_vit: True
freeze_qformer: True
max_txt_len: 1024
low_resource: False
image_size: 448
end_sym: "</s>"
llama_model: "/ibex/project/c2133/llama_v2/llama-2-7b-chat-pytorch_update"
ckpt: "/ibex/project/c2090/minigpt4_ckpt/448_perforamnce_correct_v10_vg/20230925064/checkpoint_32.pth"
use_grad_checkpoint: True
chat_template: True
lora_r: 64
lora_alpha: 16
datasets:
multitask_conversation:
batch_size_train: 2
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 40
llava_conversation: # 77k
batch_size_train: 2
vis_processor:
train:
name: "blip2_image_train"
image_size: 448
text_processor:
train:
name: "blip_caption"
sample_ratio: 10
# unnatural_instruction:
# batch_size: 1
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 15
# refvg:
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 40
# llava_detail: #23K
# batch_size: 4
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 20
# llava_reason: # 77k
# batch_size: 4
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 80
# grounded_detailed_image_caption:
# batch_size: 2
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 80
# CaptionToPhrase:
# batch_size: 2
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 80
# ObjectToPhrase:
# batch_size: 2
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 80
# coco_caption:
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 10
# textcaps_caption: #
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 10
# refcoco: # 142k
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 15
# refcocop:
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 15
# refcocog:
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 15
# invrefcoco:
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 10
# invrefcocop:
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 10
# invrefcocog:
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 10
# coco_vqa: # 82K
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 15
# ok_vqa: # 9k
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 8
# aok_vqa: # 17k
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 12
# gqa: # 82K
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 40
# ocrvqa: # 800K
# batch_size: 6
# vis_processor:
# train:
# name: "blip2_image_train"
# image_size: 448
# text_processor:
# train:
# name: "blip_caption"
# sample_ratio: 30
run:
task: image_text_pretrain
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 1e-5
min_lr: 8e-5
warmup_lr: 1e-6
weight_decay: 0.05
max_epoch: 50
num_workers: 6
warmup_steps: 1000
iters_per_epoch: 1000
seed: 42
output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_conversation_correct_best_v7_ablation1_v5_v6_again_system_prompt"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: ["train"]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
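For quick inspection, a config like the one above can be read back with PyYAML; the file path below is a placeholder:
```
import yaml

with open("train_configs/minigpt_v2_finetune.yaml") as f:  # placeholder path
    cfg = yaml.safe_load(f)

run = cfg["run"]
print(run["task"], run["lr_sched"], run["init_lr"], run["max_epoch"])
```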