hanziwang 2024-03-29 08:25:42 +00:00
commit 5bec4d0608
246 changed files with 45100 additions and 811 deletions

Binary file not shown.

File diff suppressed because it is too large.

File diff suppressed because it is too large.

Binary file not shown.

command.txt Normal file
View File

@@ -0,0 +1,2 @@
chmod +x *.sh
tensorboard --bind_all --logdir

View File

@@ -116,6 +116,7 @@ def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
                               max_new_tokens=300,
                               max_length=2000)[0]
     chatbot[-1][1] = llm_message
+    print(llm_message)
     return chatbot, chat_state, img_list

View File

@@ -30,7 +30,7 @@ from minigpt4.tasks import *
 def parse_args():
     parser = argparse.ArgumentParser(description="Demo")
-    parser.add_argument("--cfg-path", default='eval_configs/minigptv2_eval.yaml',
+    parser.add_argument("--cfg-path", default='minigpt4/projects/minigpt/eval/minigptv2_eval.yaml',
                         help="path to configuration file.")
     parser.add_argument("--gpu-id", type=int, default=0, help="specify the gpu to load the model.")
     parser.add_argument(

View File

@@ -11,10 +11,10 @@ After the first stage, the visual features are mapped and can be understood by t
 model.
 To launch the first stage training, run the following command. In our experiments, we use 4 A100.
 You can change the save path in the config file
-[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml)
+[minigpt4/projects/minigpt/train/minigpt4_stage1_pretrain.yaml](minigpt4/projects/minigpt/train/minigpt4_stage1_pretrain.yaml)
 
 ```bash
-torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
+torchrun --nproc-per-node NUM_GPU train.py --cfg-path minigpt4/projects/minigpt/train/minigpt4_stage1_pretrain.yaml
 ```
 
 A MiniGPT-4 checkpoint with only stage one training can be downloaded
@@ -30,12 +30,12 @@ To download and prepare our second stage dataset, please check our
 [second stage dataset preparation instruction](dataset/README_2_STAGE.md).
 To launch the second stage alignment,
 first specify the path to the checkpoint file trained in stage 1 in
-[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage2_finetune.yaml).
+[minigpt4/projects/minigpt/train/minigpt4_stage1_pretrain.yaml](minigpt4/projects/minigpt/train/minigpt4_stage2_finetune.yaml).
 You can also specify the output path there.
 Then, run the following command. In our experiments, we use 1 A100.
 
 ```bash
-torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage2_finetune.yaml
+torchrun --nproc-per-node NUM_GPU train.py --cfg-path minigpt4/projects/minigpt/train/minigpt4_stage2_finetune.yaml
 ```
 After the second stage alignment, MiniGPT-4 is able to talk about the image coherently and user-friendly.

View File

@@ -4,7 +4,7 @@
 You firstly need to prepare the dataset. you can follow this step to prepare the dataset.
 our [dataset preparation](dataset/README_MINIGPTv2_FINETUNE.md).
 
-In the train_configs/minigptv2_finetune.yaml, you need to set up the following paths:
+In the minigpt4/projects/minigpt/train/minigptv2_finetune.yaml, you need to set up the following paths:
 
 llama_model checkpoint path: "/path/to/llama_checkpoint"
@@ -19,6 +19,6 @@ For ckpt, you may load from our pretrained model checkpoints:
 ```bash
-torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigptv2_finetune.yaml
+torchrun --nproc-per-node NUM_GPU train.py --cfg-path minigpt4/projects/minigpt/train/minigptv2_finetune.yaml
 ```

View File

@@ -82,13 +82,13 @@ Download the corresponding LLM weights from the following huggingface space via
 Then, set the variable *llama_model* in the model config file to the LLM weight path.
 
 * For MiniGPT-v2, set the LLM path
-[here](minigpt4/configs/models/minigpt_v2.yaml#L15) at Line 14.
+[here](minigpt4/configs/models/minigpt/minigpt_v2.yaml#L15) at Line 14.
 
 * For MiniGPT-4 (Llama2), set the LLM path
-[here](minigpt4/configs/models/minigpt4_llama2.yaml#L15) at Line 15.
+[here](minigpt4/configs/models/minigpt/minigpt4_llama2.yaml#L15) at Line 15.
 
 * For MiniGPT-4 (Vicuna), set the LLM path
-[here](minigpt4/configs/models/minigpt4_vicuna0.yaml#L18) at Line 18
+[here](minigpt4/configs/models/minigpt/minigpt4_vicuna0.yaml#L18) at Line 18
 
 **3. Prepare the pretrained model checkpoints**
@@ -101,7 +101,7 @@ Download the pretrained model checkpoints
 For **MiniGPT-v2**, set the path to the pretrained checkpoint in the evaluation config file
-in [eval_configs/minigptv2_eval.yaml](eval_configs/minigptv2_eval.yaml#L10) at Line 8.
+in [minigpt4/projects/minigpt/eval/minigptv2_eval.yaml](minigpt4/projects/minigpt/eval/minigptv2_eval.yaml#L10) at Line 8.
@@ -110,7 +110,7 @@ in [eval_configs/minigptv2_eval.yaml](eval_configs/minigptv2_eval.yaml#L10) at L
 | [Download](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing) | [Download](https://drive.google.com/file/d/11nAPjEok8eAGGEG1N2vXo3kBLCg0WgUk/view?usp=sharing) |
 
 For **MiniGPT-4**, set the path to the pretrained checkpoint in the evaluation config file
-in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Line 8 for Vicuna version or [eval_configs/minigpt4_llama2_eval.yaml](eval_configs/minigpt4_llama2_eval.yaml#L10) for LLama2 version.
+in [minigpt4/projects/minigpt/eval/minigpt4_eval.yaml](minigpt4/projects/minigpt/eval/minigpt4_eval.yaml#L10) at Line 8 for Vicuna version or [minigpt4/projects/minigpt/eval/minigpt4_llama2_eval.yaml](minigpt4/projects/minigpt/eval/minigpt4_llama2_eval.yaml#L10) for LLama2 version.
@@ -118,19 +118,19 @@ in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Lin
 For MiniGPT-v2, run
 ```
-python demo_v2.py --cfg-path eval_configs/minigptv2_eval.yaml --gpu-id 0
+python demo_v2.py --cfg-path minigpt4/projects/minigpt/eval/minigptv2_eval.yaml --gpu-id 0
 ```
 
 For MiniGPT-4 (Vicuna version), run
 ```
-python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
+python demo.py --cfg-path minigpt4/projects/minigpt/eval/minigpt4_eval.yaml --gpu-id 0
 ```
 
 For MiniGPT-4 (Llama2 version), run
 ```
-python demo.py --cfg-path eval_configs/minigpt4_llama2_eval.yaml --gpu-id 0
+python demo.py --cfg-path minigpt4/projects/minigpt/eval/minigpt4_llama2_eval.yaml --gpu-id 0
 ```
@@ -139,9 +139,9 @@ This configuration requires about 23G GPU memory for 13B LLM and 11.5G GPU memor
 For more powerful GPUs, you can run the model
 in 16 bit by setting `low_resource` to `False` in the relevant config file:
-* MiniGPT-v2: [minigptv2_eval.yaml](eval_configs/minigptv2_eval.yaml#6)
+* MiniGPT-v2: [minigptv2_eval.yaml](minigpt4/projects/minigpt/eval/minigptv2_eval.yaml#6)
-* MiniGPT-4 (Llama2): [minigpt4_llama2_eval.yaml](eval_configs/minigpt4_llama2_eval.yaml#6)
+* MiniGPT-4 (Llama2): [minigpt4_llama2_eval.yaml](minigpt4/projects/minigpt/eval/minigpt4_llama2_eval.yaml#6)
-* MiniGPT-4 (Vicuna): [minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#6)
+* MiniGPT-4 (Vicuna): [minigpt4_eval.yaml](minigpt4/projects/minigpt/eval/minigpt4_eval.yaml#6)
 
 Thanks [@WangRongsheng](https://github.com/WangRongsheng), you can also run MiniGPT-4 on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing)

View File

@@ -1,4 +1,4 @@
-name: minigptv
+name: promptmoe
 channels:
   - pytorch
   - defaults
@@ -31,3 +31,5 @@ dependencies:
     - accelerate==0.20.3
     - bitsandbytes==0.37.0
     - wandb
+    - visual_genome
+    - scikit-image
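The two dependencies added here can be sanity-checked once the environment is rebuilt. A minimal check, assuming the usual import names for these packages (`visual_genome` for visual_genome, `skimage` for scikit-image):

```python
# Quick import check for the newly added dependencies.
# Import names are assumptions based on the usual PyPI packaging.
import visual_genome   # provided by the visual_genome package
import skimage         # provided by scikit-image

print("visual_genome OK, scikit-image", skimage.__version__)
```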

evaluate.py Normal file
View File

@@ -0,0 +1,92 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import argparse
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import minigpt4.tasks as tasks
from minigpt4.common.config import Config
from minigpt4.common.dist_utils import get_rank, init_distributed_mode
from minigpt4.common.logger import setup_logger
from minigpt4.common.optims import (
LinearWarmupCosineLRScheduler,
LinearWarmupStepLRScheduler,
)
from minigpt4.common.utils import now
# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners.runner_base import RunnerBase
from minigpt4.tasks import *
def parse_args():
parser = argparse.ArgumentParser(description="Training")
parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
parser.add_argument(
"--options",
nargs="+",
help="override some settings in the used config, the key-value pair "
"in xxx=yyy format will be merged into config file (deprecate), "
"change to --cfg-options instead.",
)
args = parser.parse_args()
# if 'LOCAL_RANK' not in os.environ:
# os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
def setup_seeds(config):
seed = config.run_cfg.seed + get_rank()
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
cudnn.benchmark = False
cudnn.deterministic = True
def main():
# allow auto-dl completes on main process without timeout when using NCCL backend.
# os.environ["NCCL_BLOCKING_WAIT"] = "1"
# set before init_distributed_mode() to ensure the same job_id shared across all ranks.
job_id = now()
cfg = Config(parse_args())
init_distributed_mode(cfg.run_cfg)
setup_seeds(cfg)
# set after init_distributed_mode() to only log on master.
setup_logger()
cfg.pretty_print()
task = tasks.setup_task(cfg)
datasets = task.build_datasets(cfg)
model = task.build_model(cfg)
runner = RunnerBase(
cfg=cfg, job_id=job_id, task=task, model=model, datasets=datasets
)
runner.evaluate(skip_reload=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,94 @@
import os
import json
import pandas as pd
from tqdm import tqdm
from pycocoevalcap.eval import COCOEvalCap
from collections import defaultdict
class COCO_Annotation:
def __init__(self, annotation_file):
self.coco_cn_file = annotation_file
self.imgToAnns = self.build_imgToAnns()
def build_imgToAnns(self):
imgToAnns = defaultdict(list)
with open(self.coco_cn_file, "r", encoding="UTF-8") as fin:
for line in fin:
line = line.strip()
temp = eval(line)
annotations = temp['annotations']
for ann in annotations:
image_id = str(ann['image_id']).zfill(6)
imgToAnns[image_id].append({'image_id':image_id,'caption':ann['caption'],'image': ann['image_id']})
return imgToAnns
def getImgIds(self):
return self.imgToAnns.keys()
class COCO_Result:
def __init__(self,result_file):
self.coco_cn_file = result_file
self.imgToAnns = self.build_imgToAnns()
def build_imgToAnns(self):
imgToAnns = dict()
data = json.load(open(self.coco_cn_file, "r"))
for d in data:
tmp = {
'image_id':d['question_id'][-6:],
'caption':d['answer']
}
imgToAnns[d['question_id'][-6:]] = [tmp]
return imgToAnns
def coco_caption_eval(results_file, split_name):
files = {
"val":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json",
"test":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json"
}
# create coco object and coco_result object
annotation_file = files[split_name]
coco = COCO_Annotation(annotation_file)
coco_result = COCO_Result(results_file)
# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)
# evaluate on a subset of images by setting
# coco_eval.params['image_id'] = coco_result.getImgIds()
# please remove this line when evaluating the full validation set
# coco_eval.params['image_id'] = coco_result.getImgIds()
# evaluate results
# SPICE will take a few minutes the first time, but speeds up due to caching
coco_eval.evaluate()
# print output evaluation scores
for metric, score in coco_eval.eval.items():
print(f"{metric}: {score:.3f}")
return coco_eval
def main():
result_file = "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/mix_coco_gqa_cap_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0302/20240302231/result/val_vqa_result_coco_cap.json"
split_name = "val"
coco_val = coco_caption_eval(result_file, split_name)
agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"]
# log_stats = {split_name: {k: v for k, v in coco_val.eval.items()}}
# with open(
# os.path.join(registry.get_path("output_dir"), "evaluate.txt"), "a"
# ) as f:
# f.write(json.dumps(log_stats) + "\n")
coco_res = {k: v for k, v in coco_val.eval.items()}
coco_res["agg_metrics"] = agg_metrics
print(coco_res)
main()
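For reference, a minimal sketch of calling `coco_caption_eval` above on a different result file; the path below is a placeholder rather than a file from this repository:

```python
# Hypothetical usage of coco_caption_eval defined above.
# The result file is a placeholder path; it should contain a JSON list of
# {"question_id": ..., "answer": ...} records, as parsed by COCO_Result.
result_file = "/path/to/result/val_vqa_result_coco_cap.json"

coco_val = coco_caption_eval(result_file, "val")

# The script above aggregates CIDEr + BLEU-4 into a single score.
print(coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"])
```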

Binary files not shown (multiple example images deleted; previews omitted).

View File

@@ -0,0 +1,8 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
__author__ = "aagrawal"

View File

@@ -0,0 +1,127 @@
from collections import defaultdict
from pycocoevalcap.eval import COCOEvalCap
import json
class COCO_Annotation:
def __init__(self, annotation_file):
self.coco_cn_file = annotation_file
self.imgToAnns = self.build_imgToAnns()
def build_imgToAnns(self):
imgToAnns = defaultdict(list)
with open(self.coco_cn_file, "r", encoding="UTF-8") as fin:
for line in fin:
line = line.strip()
temp = eval(line)
annotations = temp['annotations']
for ann in annotations:
image_id = str(ann['image_id']).zfill(6)
imgToAnns[image_id].append({'image_id':image_id,'caption':ann['caption'],'image': ann['image_id']})
return imgToAnns
def getImgIds(self):
return self.imgToAnns.keys()
class COCO_Result:
def __init__(self,result_file):
self.coco_cn_file = result_file
self.imgToAnns = self.build_imgToAnns()
def build_imgToAnns(self):
imgToAnns = dict()
data = json.load(open(self.coco_cn_file, "r"))
for d in data:
tmp = {
'image_id':d['question_id'][-6:],
'caption':d['answer']
}
imgToAnns[d['question_id'][-6:]] = [tmp]
return imgToAnns
def coco_caption_eval(coco_gt_root, results_file, split_name):
files = {
"val":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json",
"test":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json"
}
# create coco object and coco_result object
annotation_file = files[split_name]
coco = COCO_Annotation(annotation_file)
coco_result = COCO_Result(results_file)
# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)
# evaluate on a subset of images by setting
# coco_eval.params['image_id'] = coco_result.getImgIds()
# please remove this line when evaluating the full validation set
# coco_eval.params['image_id'] = coco_result.getImgIds()
# evaluate results
# SPICE will take a few minutes the first time, but speeds up due to caching
coco_eval.evaluate()
# print output evaluation scores
for metric, score in coco_eval.eval.items():
print(f"{metric}: {score:.3f}")
return coco_eval
class TextCap_Annotation:
def __init__(self, annotation_file):
self.anno_file = annotation_file
self.imgToAnns = self.build_imgToAnns()
def build_imgToAnns(self):
imgToAnns = defaultdict(list)
annotations = json.load(open(self.anno_file,"r"))['data']
for ann in annotations:
image_id = str(ann['image_name'])
imgToAnns[image_id].append({
'image_id':image_id,
# 'caption':ann['reference_strs'],
'caption':ann['caption_str'],
'image': ann['image_path']
})
return imgToAnns
def getImgIds(self):
return self.imgToAnns.keys()
class TextCap_Result:
def __init__(self,result_file):
self.result_file = result_file
self.imgToAnns = self.build_imgToAnns()
def build_imgToAnns(self):
imgToAnns = dict()
data = json.load(open(self.result_file, "r"))
for d in data:
tmp = {
'image_id':d['question_id'], # actually image_id
'caption':d['answer']
}
imgToAnns[d['question_id']] = [tmp]
return imgToAnns
def textcaps_caption_eval(annotation_file, results_file):
# create coco object and coco_result object
anno = TextCap_Annotation(annotation_file)
result = TextCap_Result(results_file)
# create coco_eval object by taking coco and coco_result
text_eval = COCOEvalCap(anno, result)
# SPICE will take a few minutes the first time, but speeds up due to caching
text_eval.evaluate()
# print output evaluation scores
for metric, score in text_eval.eval.items():
print(f"{metric}: {score:.3f}")
return text_eval
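As a rough illustration of the input `TextCap_Result` expects (inferred from the parsing above, not a documented schema), the result file is a JSON list in which `question_id` actually carries the image name and `answer` carries the generated caption:

```python
import json

# Hypothetical result file for textcaps_caption_eval; file name, ids, and
# captions are all placeholders.
predictions = [
    {"question_id": "0054c91f5a921e1f", "answer": "a red stop sign on a pole"},
    {"question_id": "00a2f5e33cbf1260", "answer": "a book lying open on a wooden table"},
]
with open("textcaps_val_result.json", "w") as f:
    json.dump(predictions, f)
```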

View File

@@ -29,6 +29,7 @@ class Config:
         runner_config = self.build_runner_config(config)
         model_config = self.build_model_config(config, **user_config)
         dataset_config = self.build_dataset_config(config)
+        evaluation_dataset_config = self.build_evaluation_dataset_config(config)
 
         # Validate the user-provided runner configuration
         # model and dataset configuration are supposed to be validated by the respective classes
@@ -37,7 +38,7 @@
 
         # Override the default configuration with user options.
         self.config = OmegaConf.merge(
-            runner_config, model_config, dataset_config, user_config
+            runner_config, model_config, dataset_config, evaluation_dataset_config, user_config
         )
 
     def _validate_runner_config(self, runner_config):
@@ -111,6 +112,29 @@
         return dataset_config
 
+    @staticmethod
+    def build_evaluation_dataset_config(config):
+        # from Minigpt-v2
+        datasets = config.get("evaluation_datasets", None)
+        # if datasets is None:
+        #     raise KeyError(
+        #         "Expecting 'datasets' as the root key for dataset configuration."
+        #     )
+
+        dataset_config = OmegaConf.create()
+
+        if datasets is not None:
+            for dataset_name in datasets:
+                builder_cls = registry.get_builder_class(dataset_name)
+
+                # hierarchy override, customized config > default config
+                dataset_config = OmegaConf.merge(
+                    dataset_config,
+                    {"evaluation_datasets": {dataset_name: config["evaluation_datasets"][dataset_name]}},
+                )
+
+        return dataset_config
+
     def _convert_to_dot_list(self, opts):
         if opts is None:
             opts = []
@@ -136,6 +160,10 @@
     def datasets_cfg(self):
         return self.config.datasets
 
+    @property
+    def evaluation_datasets_cfg(self):
+        return self.config.evaluation_datasets
+
     @property
     def model_cfg(self):
         return self.config.model
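A small sketch of what the new `evaluation_datasets` handling amounts to, assuming an OmegaConf config whose root has an `evaluation_datasets` key; the dataset name and fields below are illustrative only, and the registry builder lookup is omitted:

```python
from omegaconf import OmegaConf

# Illustrative config; the keys under `gqa` are placeholders, not the real schema.
cfg = OmegaConf.create("""
evaluation_datasets:
  gqa:
    batch_size: 10
    max_new_tokens: 20
""")

# Loosely mirrors build_evaluation_dataset_config: keep each per-dataset block
# under the `evaluation_datasets` root, merged into a fresh config.
eval_cfg = OmegaConf.create()
for name in cfg.get("evaluation_datasets", {}):
    eval_cfg = OmegaConf.merge(
        eval_cfg, {"evaluation_datasets": {name: cfg.evaluation_datasets[name]}}
    )

print(OmegaConf.to_yaml(eval_cfg))
```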

View File

@@ -0,0 +1,76 @@
import argparse
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from minigpt4.common.registry import registry
from minigpt4.common.config import Config
# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *
def eval_parser():
parser = argparse.ArgumentParser(description="Demo")
parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
parser.add_argument("--name", type=str, default='A2', help="evaluation name")
parser.add_argument("--ckpt", type=str, help="path to configuration file.")
parser.add_argument("--eval_opt", type=str, default='all', help="path to configuration file.")
parser.add_argument("--max_new_tokens", type=int, default=10, help="max number of generated tokens")
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--lora_r", type=int, default=64, help="lora rank of the model")
parser.add_argument("--lora_alpha", type=int, default=16, help="lora alpha")
parser.add_argument(
"--options",
nargs="+",
help="override some settings in the used config, the key-value pair "
"in xxx=yyy format will be merged into config file (deprecate), "
"change to --cfg-options instead.",
)
return parser
def prepare_texts(texts, conv_temp):
convs = [conv_temp.copy() for _ in range(len(texts))]
[conv.append_message(
conv.roles[0], '<Img><ImageHere></Img> {}'.format(text)) for conv, text in zip(convs, texts)]
[conv.append_message(conv.roles[1], None) for conv in convs]
texts = [conv.get_prompt() for conv in convs]
return texts
def init_model(args):
print('Initialization Model')
cfg = Config(args)
# cfg.model_cfg.ckpt = args.ckpt
# cfg.model_cfg.lora_r = args.lora_r
# cfg.model_cfg.lora_alpha = args.lora_alpha
model_config = cfg.model_cfg
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to('cuda:0')
# import pudb; pudb.set_trace()
key = list(cfg.datasets_cfg.keys())[0]
vis_processor_cfg = cfg.datasets_cfg.get(key).vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
print('Initialization Finished')
return model, vis_processor
def computeIoU(bbox1, bbox2):
x1, y1, x2, y2 = bbox1
x3, y3, x4, y4 = bbox2
intersection_x1 = max(x1, x3)
intersection_y1 = max(y1, y3)
intersection_x2 = min(x2, x4)
intersection_y2 = min(y2, y4)
intersection_area = max(0, intersection_x2 - intersection_x1 + 1) * max(0, intersection_y2 - intersection_y1 + 1)
bbox1_area = (x2 - x1 + 1) * (y2 - y1 + 1)
bbox2_area = (x4 - x3 + 1) * (y4 - y3 + 1)
union_area = bbox1_area + bbox2_area - intersection_area
iou = intersection_area / union_area
return iou
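A worked example for `computeIoU` above (boxes use inclusive pixel coordinates, hence the `+ 1` terms): two 10x10 boxes overlapping in a 5x5 region give IoU = 25 / (100 + 100 - 25), roughly 0.143.

```python
# Worked example for computeIoU: boxes are (x1, y1, x2, y2) with inclusive corners.
box_a = (0, 0, 9, 9)      # 10 x 10 = 100 pixels
box_b = (5, 5, 14, 14)    # 10 x 10 = 100 pixels, overlapping box_a in a 5 x 5 patch

iou = computeIoU(box_a, box_b)
print(round(iou, 3))      # 25 / 175 -> 0.143
```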

View File

@@ -2,13 +2,14 @@
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
-For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 """
 
 import datetime
 import logging
 import time
 from collections import defaultdict, deque
+from torch.utils.tensorboard import SummaryWriter
 
 import torch
 import torch.distributed as dist
@@ -80,9 +81,10 @@ class SmoothedValue(object):
 
 class MetricLogger(object):
-    def __init__(self, delimiter="\t"):
+    def __init__(self, delimiter="\t",writer: SummaryWriter=None):
         self.meters = defaultdict(SmoothedValue)
         self.delimiter = delimiter
+        self.writer = writer
 
     def update(self, **kwargs):
         for k, v in kwargs.items():
@@ -91,6 +93,10 @@ class MetricLogger(object):
             assert isinstance(v, (float, int))
             self.meters[k].update(v)
 
+    def update_writer(self, it):
+        for name, meter in self.meters.items():
+            self.writer.add_scalar(name, meter, )
+
     def __getattr__(self, attr):
         if attr in self.meters:
             return self.meters[attr]
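The intent of the new `writer` argument is TensorBoard scalar logging, which also matches the `tensorboard --bind_all --logdir` line added in command.txt. A standalone sketch of that wiring, independent of MetricLogger; the log directory and tag are placeholders. Note that `add_scalar` expects a plain number as its second argument (for a meter, something like its running average) and a step index as the third.

```python
from torch.utils.tensorboard import SummaryWriter

# Placeholder log directory; point tensorboard --logdir at the same path.
writer = SummaryWriter(log_dir="runs/example")

for step in range(100):
    loss = 1.0 / (step + 1)                      # dummy value standing in for a real metric
    writer.add_scalar("train/loss", loss, step)  # (tag, scalar_value, global_step)

writer.close()
```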

View File

@@ -0,0 +1,8 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
__author__ = "aagrawal"

View File

@@ -0,0 +1,211 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
__author__ = "aagrawal"
__version__ = "0.9"
# Interface for accessing the VQA dataset.
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).
# The following functions are defined:
# VQA - VQA class that loads VQA annotation file and prepares data structures.
# getQuesIds - Get question ids that satisfy given filter conditions.
# getImgIds - Get image ids that satisfy given filter conditions.
# loadQA - Load questions and answers with the specified question ids.
# showQA - Display the specified questions and answers.
# loadRes - Load result file and create result object.
# Help on each function can be accessed by: "help(COCO.function)"
import json
import datetime
import copy
class VQA:
def __init__(self, annotation_file=None, question_file=None):
"""
Constructor of VQA helper class for reading and visualizing questions and answers.
:param annotation_file (str): location of VQA annotation file
:return:
"""
# load dataset
self.dataset = {}
self.questions = {}
self.qa = {}
self.qqa = {}
self.imgToQA = {}
if not annotation_file == None and not question_file == None:
print("loading VQA annotations and questions into memory...")
time_t = datetime.datetime.utcnow()
dataset = json.load(open(annotation_file, "r"))
questions = json.load(open(question_file, "r"))
self.dataset = dataset
self.questions = questions
self.createIndex()
def createIndex(self):
# create index
print("creating index...")
imgToQA = {ann["image_id"]: [] for ann in self.dataset["annotations"]}
qa = {ann["question_id"]: [] for ann in self.dataset["annotations"]}
qqa = {ann["question_id"]: [] for ann in self.dataset["annotations"]}
for ann in self.dataset["annotations"]:
imgToQA[ann["image_id"]] += [ann]
qa[ann["question_id"]] = ann
for ques in self.questions["questions"]:
qqa[ques["question_id"]] = ques
print("index created!")
# create class members
self.qa = qa
self.qqa = qqa
self.imgToQA = imgToQA
def info(self):
"""
Print information about the VQA annotation file.
:return:
"""
for key, value in self.datset["info"].items():
print("%s: %s" % (key, value))
def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
"""
Get question ids that satisfy given filter conditions. default skips that filter
:param imgIds (int array) : get question ids for given imgs
quesTypes (str array) : get question ids for given question types
ansTypes (str array) : get question ids for given answer types
:return: ids (int array) : integer array of question ids
"""
imgIds = imgIds if type(imgIds) == list else [imgIds]
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
anns = self.dataset["annotations"]
else:
if not len(imgIds) == 0:
anns = sum(
[self.imgToQA[imgId] for imgId in imgIds if imgId in self.imgToQA],
[],
)
else:
anns = self.dataset["annotations"]
anns = (
anns
if len(quesTypes) == 0
else [ann for ann in anns if ann["question_type"] in quesTypes]
)
anns = (
anns
if len(ansTypes) == 0
else [ann for ann in anns if ann["answer_type"] in ansTypes]
)
ids = [ann["question_id"] for ann in anns]
return ids
def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
"""
Get image ids that satisfy given filter conditions. default skips that filter
:param quesIds (int array) : get image ids for given question ids
quesTypes (str array) : get image ids for given question types
ansTypes (str array) : get image ids for given answer types
:return: ids (int array) : integer array of image ids
"""
quesIds = quesIds if type(quesIds) == list else [quesIds]
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
anns = self.dataset["annotations"]
else:
if not len(quesIds) == 0:
anns = sum(
[self.qa[quesId] for quesId in quesIds if quesId in self.qa], []
)
else:
anns = self.dataset["annotations"]
anns = (
anns
if len(quesTypes) == 0
else [ann for ann in anns if ann["question_type"] in quesTypes]
)
anns = (
anns
if len(ansTypes) == 0
else [ann for ann in anns if ann["answer_type"] in ansTypes]
)
ids = [ann["image_id"] for ann in anns]
return ids
def loadQA(self, ids=[]):
"""
Load questions and answers with the specified question ids.
:param ids (int array) : integer ids specifying question ids
:return: qa (object array) : loaded qa objects
"""
if type(ids) == list:
return [self.qa[id] for id in ids]
elif type(ids) == int:
return [self.qa[ids]]
def showQA(self, anns):
"""
Display the specified annotations.
:param anns (array of object): annotations to display
:return: None
"""
if len(anns) == 0:
return 0
for ann in anns:
quesId = ann["question_id"]
print("Question: %s" % (self.qqa[quesId]["question"]))
for ans in ann["answers"]:
print("Answer %d: %s" % (ans["answer_id"], ans["answer"]))
def loadRes(self, resFile, quesFile):
"""
Load result file and return a result object.
:param resFile (str) : file name of result file
:return: res (obj) : result api object
"""
res = VQA()
res.questions = json.load(open(quesFile))
res.dataset["info"] = copy.deepcopy(self.questions["info"])
res.dataset["task_type"] = copy.deepcopy(self.questions["task_type"])
res.dataset["data_type"] = copy.deepcopy(self.questions["data_type"])
res.dataset["data_subtype"] = copy.deepcopy(self.questions["data_subtype"])
res.dataset["license"] = copy.deepcopy(self.questions["license"])
print("Loading and preparing results... ")
time_t = datetime.datetime.utcnow()
anns = json.load(open(resFile))
assert type(anns) == list, "results is not an array of objects"
annsQuesIds = [ann["question_id"] for ann in anns]
assert set(annsQuesIds) == set(
self.getQuesIds()
), "Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file."
for ann in anns:
quesId = ann["question_id"]
if res.dataset["task_type"] == "Multiple Choice":
assert (
ann["answer"] in self.qqa[quesId]["multiple_choices"]
), "predicted answer is not one of the multiple choices"
qaAnn = self.qa[quesId]
ann["image_id"] = qaAnn["image_id"]
ann["question_type"] = qaAnn["question_type"]
ann["answer_type"] = qaAnn["answer_type"]
print(
"DONE (t=%0.2fs)" % ((datetime.datetime.utcnow() - time_t).total_seconds())
)
res.dataset["annotations"] = anns
res.createIndex()
return res
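A short usage sketch for the VQA helper above; the annotation and question files are placeholders standing in for the v2_mscoco_val2014_annotations.json and v2_OpenEnded_mscoco_val2014_questions.json files referenced in the VQAv2 configs:

```python
# Hypothetical usage of the VQA helper class defined above; paths are placeholders.
anno_file = "/path/to/VQAv2/v2_mscoco_val2014_annotations.json"
ques_file = "/path/to/VQAv2/v2_OpenEnded_mscoco_val2014_questions.json"

vqa = VQA(anno_file, ques_file)

ques_ids = vqa.getQuesIds(imgIds=[262148])   # image id is illustrative
anns = vqa.loadQA(ques_ids)
vqa.showQA(anns)                             # prints each question with its answers
```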

View File

@@ -0,0 +1,324 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
# coding=utf-8
__author__ = "aagrawal"
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).
import sys
import re
class VQAEval:
def __init__(self, vqa=None, vqaRes=None, n=2):
self.n = n
self.accuracy = {}
self.evalQA = {}
self.evalQuesType = {}
self.evalAnsType = {}
self.vqa = vqa # annotation
self.vqaRes = vqaRes # predict answers
if vqa is not None:
self.params = {"question_id": vqa.getQuesIds()}
self.contractions = {
"aint": "ain't",
"arent": "aren't",
"cant": "can't",
"couldve": "could've",
"couldnt": "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
"didnt": "didn't",
"doesnt": "doesn't",
"dont": "don't",
"hadnt": "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
"hasnt": "hasn't",
"havent": "haven't",
"hed": "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
"hes": "he's",
"howd": "how'd",
"howll": "how'll",
"hows": "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
"Im": "I'm",
"Ive": "I've",
"isnt": "isn't",
"itd": "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
"itll": "it'll",
"let's": "let's",
"maam": "ma'am",
"mightnt": "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
"mightve": "might've",
"mustnt": "mustn't",
"mustve": "must've",
"neednt": "needn't",
"notve": "not've",
"oclock": "o'clock",
"oughtnt": "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
"shant": "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
"shouldve": "should've",
"shouldnt": "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": "somebodyd",
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
"somebodyll": "somebody'll",
"somebodys": "somebody's",
"someoned": "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
"someonell": "someone'll",
"someones": "someone's",
"somethingd": "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
"somethingll": "something'll",
"thats": "that's",
"thered": "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
"therere": "there're",
"theres": "there's",
"theyd": "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
"theyll": "they'll",
"theyre": "they're",
"theyve": "they've",
"twas": "'twas",
"wasnt": "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
"weve": "we've",
"werent": "weren't",
"whatll": "what'll",
"whatre": "what're",
"whats": "what's",
"whatve": "what've",
"whens": "when's",
"whered": "where'd",
"wheres": "where's",
"whereve": "where've",
"whod": "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
"wholl": "who'll",
"whos": "who's",
"whove": "who've",
"whyll": "why'll",
"whyre": "why're",
"whys": "why's",
"wont": "won't",
"wouldve": "would've",
"wouldnt": "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
"yall": "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
"youd": "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
"youll": "you'll",
"youre": "you're",
"youve": "you've",
}
self.manualMap = {
"none": "0",
"zero": "0",
"one": "1",
"two": "2",
"three": "3",
"four": "4",
"five": "5",
"six": "6",
"seven": "7",
"eight": "8",
"nine": "9",
"ten": "10",
}
self.articles = ["a", "an", "the"]
self.periodStrip = re.compile("(?!<=\d)(\.)(?!\d)")
self.commaStrip = re.compile("(\d)(,)(\d)")
self.punct = [
";",
r"/",
"[",
"]",
'"',
"{",
"}",
"(",
")",
"=",
"+",
"\\",
"_",
"-",
">",
"<",
"@",
"`",
",",
"?",
"!",
]
def evaluate(self, quesIds=None):
if quesIds == None:
quesIds = [quesId for quesId in self.params["question_id"]]
gts = {}
res = {}
for quesId in quesIds:
gts[quesId] = self.vqa.qa[quesId]
res[quesId] = self.vqaRes.qa[quesId]
# =================================================
# Compute accuracy
# =================================================
accQA = []
accQuesType = {}
accAnsType = {}
print("computing accuracy")
step = 0
for quesId in quesIds:
resAns = res[quesId]["answer"]
resAns = resAns.replace("\n", " ")
resAns = resAns.replace("\t", " ")
resAns = resAns.strip()
resAns = self.processPunctuation(resAns)
resAns = self.processDigitArticle(resAns)
gtAcc = []
gtAnswers = [ans["answer"] for ans in gts[quesId]["answers"]]
if len(set(gtAnswers)) > 1:
for ansDic in gts[quesId]["answers"]:
ansDic["answer"] = self.processPunctuation(ansDic["answer"])
for gtAnsDatum in gts[quesId]["answers"]:
otherGTAns = [
item for item in gts[quesId]["answers"] if item != gtAnsDatum
]
matchingAns = [item for item in otherGTAns if item["answer"] == resAns]
acc = min(1, float(len(matchingAns)) / 3)
gtAcc.append(acc)
quesType = gts[quesId]["question_type"]
ansType = gts[quesId]["answer_type"]
avgGTAcc = float(sum(gtAcc)) / len(gtAcc)
accQA.append(avgGTAcc)
if quesType not in accQuesType:
accQuesType[quesType] = []
accQuesType[quesType].append(avgGTAcc)
if ansType not in accAnsType:
accAnsType[ansType] = []
accAnsType[ansType].append(avgGTAcc)
self.setEvalQA(quesId, avgGTAcc)
self.setEvalQuesType(quesId, quesType, avgGTAcc)
self.setEvalAnsType(quesId, ansType, avgGTAcc)
if step % 100 == 0:
self.updateProgress(step / float(len(quesIds)))
step = step + 1
self.setAccuracy(accQA, accQuesType, accAnsType)
print("Done computing accuracy")
def processPunctuation(self, inText):
outText = inText
for p in self.punct:
if (p + " " in inText or " " + p in inText) or (
re.search(self.commaStrip, inText) != None
):
outText = outText.replace(p, "")
else:
outText = outText.replace(p, " ")
outText = self.periodStrip.sub("", outText, re.UNICODE)
return outText
def processDigitArticle(self, inText):
outText = []
tempText = inText.lower().split()
for word in tempText:
word = self.manualMap.setdefault(word, word)
if word not in self.articles:
outText.append(word)
else:
pass
for wordId, word in enumerate(outText):
if word in self.contractions:
outText[wordId] = self.contractions[word]
outText = " ".join(outText)
return outText
def setAccuracy(self, accQA, accQuesType, accAnsType):
self.accuracy["overall"] = round(100 * float(sum(accQA)) / len(accQA), self.n)
self.accuracy["perQuestionType"] = {
quesType: round(
100 * float(sum(accQuesType[quesType])) / len(accQuesType[quesType]),
self.n,
)
for quesType in accQuesType
}
self.accuracy["perAnswerType"] = {
ansType: round(
100 * float(sum(accAnsType[ansType])) / len(accAnsType[ansType]), self.n
)
for ansType in accAnsType
}
def setEvalQA(self, quesId, acc):
self.evalQA[quesId] = round(100 * acc, self.n)
def setEvalQuesType(self, quesId, quesType, acc):
if quesType not in self.evalQuesType:
self.evalQuesType[quesType] = {}
self.evalQuesType[quesType][quesId] = round(100 * acc, self.n)
def setEvalAnsType(self, quesId, ansType, acc):
if ansType not in self.evalAnsType:
self.evalAnsType[ansType] = {}
self.evalAnsType[ansType][quesId] = round(100 * acc, self.n)
def updateProgress(self, progress):
barLength = 20
status = ""
if isinstance(progress, int):
progress = float(progress)
if not isinstance(progress, float):
progress = 0
status = "error: progress var must be float\r\n"
if progress < 0:
progress = 0
status = "Halt...\r\n"
if progress >= 1:
progress = 1
status = "Done...\r\n"
block = int(round(barLength * progress))
text = "\rFinshed Percent: [{0}] {1}% {2}".format(
"#" * block + "-" * (barLength - block), int(progress * 100), status
)
sys.stdout.write(text)
sys.stdout.flush()
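A small example of the answer-normalization path in `VQAEval` above; the object can be constructed without annotations, and the punctuation / digit-article processing applied on its own:

```python
# VQAEval's normalization helpers are self-contained, so no annotations are needed here.
ev = VQAEval(vqa=None, vqaRes=None, n=2)

ans = "Two dogs!"
ans = ev.processPunctuation(ans)     # strips the "!"
ans = ev.processDigitArticle(ans)    # lowercases and maps number words to digits
print(ans)                           # -> "2 dogs"
```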

View File

@@ -15,6 +15,16 @@ datasets:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
          storage:
-           - /path/to/aokvqa_v1p0_train.json
+           - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_train.json
+       val:
+         url:
+           - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
+         storage:
+           - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
+       test:
+         url:
+           - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
+         storage:
+           - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
       images:
-        storage: /path/to/coco/images
+        storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO

View File

@@ -14,8 +14,18 @@ datasets:
       annotations:
         train:
           url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
-          md5: aa31ac474cf6250ebb81d18348a07ed8
-          storage: /path/to/coco_caption/coco_karpathy_train.json
-      images:
-        storage: /path/to/coco/images
+          # md5: aa31ac474cf6250ebb81d18348a07ed8
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_train.json
+        val:
+          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
+        test:
+          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
+      images:
+        storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO

View File

@@ -0,0 +1,26 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
coco_caption: # name of the dataset builder
# dataset_card: dataset_card/coco_caption.md
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
val:
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
test:
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
images:
storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO

View File

@@ -13,12 +13,36 @@ datasets:
       annotations:
         train:
           url:
-            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
-            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json # 443752
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json # 214352
           storage:
-            - /path/to/vqav2/vqa_train.json
-            - /path/to/vqav2/vqa_val.json
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val.json
+            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_train_part100.json
+            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_val_part100.json
+        val:
+          url:
+            # TODO make this order insensitive
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val_eval.json
+            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_val_eval_part100.json
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_OpenEnded_mscoco_val2014_questions.json
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_mscoco_val2014_annotations.json
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_test.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_test.json
+            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_test_part100.json
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
       images:
-        storage: /path/to/coco/images
+        storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO

View File

@@ -0,0 +1,39 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
coco_vqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
annotations:
val:
url:
# TODO make this order insensitive
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val_eval.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_OpenEnded_mscoco_val2014_questions.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_mscoco_val2014_annotations.json
test:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_test.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_test.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
images:
storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO

View File

@@ -0,0 +1,48 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
coco_vqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
annotations:
train:
url:
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json # 443752
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json # 214352
storage:
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_train_part100.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_val_part100.json
val:
url:
# TODO make this order insensitive
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val_eval.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_val_eval_part100.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_OpenEnded_mscoco_val2014_questions.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_mscoco_val2014_annotations.json
test:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_test.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_test.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_test_part100.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
images:
storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO

View File

@@ -2,7 +2,7 @@ datasets:
   invrefcoco:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: invrefcoco
       splitBy: unc

View File

@@ -2,7 +2,7 @@ datasets:
   invrefcocog:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: invrefcocog
       splitBy: umd

View File

@@ -2,7 +2,7 @@ datasets:
   invrefcocop:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: invrefcoco+
       splitBy: unc

View File

@@ -2,7 +2,7 @@ datasets:
   refcoco:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: refcoco
       splitBy: unc

View File

@@ -2,7 +2,7 @@ datasets:
   refcocog:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: refcocog
       splitBy: umd

View File

@@ -2,7 +2,7 @@ datasets:
   refcocop:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: refcoco+
       splitBy: unc

View File

@@ -0,0 +1,30 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
gqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
train:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/train_balanced_questions.json
val:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/testdev_balanced_questions.json
test:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/test_balanced_questions.json
images:
storage: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images/

View File

@@ -11,11 +11,15 @@ datasets:
     build_info:
       # Be careful not to append minus sign (-) before split to avoid itemizing
       annotations:
-        train:
+        val:
           url:
-            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
           storage:
-            - /path/to/gqa/train_balanced_questions.json
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/testdev_balanced_questions.json
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/test_balanced_questions.json
       images:
-        storage: /path/to/gqa/images
+        storage: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images/

View File

@@ -0,0 +1,30 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
gqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
train:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/train_balanced_questions_90k.json
val:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/testdev_balanced_questions.json
test:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/test_balanced_questions.json
images:
storage: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images/

View File

@@ -3,5 +3,6 @@ datasets:
   llava_conversation:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
-      ann_path: /path/to/llava/conversation_58k.json
+      # ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/multimodal-sft/llava_150k/en/conversation_58k.json
+      ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/multimodal-sft/llava_150k/en/llava_conver_single_turn_257k_clean_v2.json

@ -2,5 +2,5 @@ datasets:
llava_detail: llava_detail:
data_type: images data_type: images
build_info: build_info:
image_path: /path/to/coco/images image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
ann_path: /path/to/llava/detail_23k.json ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/multimodal-sft/llava_150k/en/detail_23k.json

@ -0,0 +1,12 @@
datasets:
llava_mix:
data_type: images
build_info:
ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/llava_v1_5_mix665k/llava_v1_5_mix665k.json
image_path_coco: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
image_path_gqa: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images
image_path_ocr: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/images
image_path_text: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/train_images
# image_path_vg: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VG
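`llava_mix` keeps one mixed annotation file but four image roots. The sketch below mirrors the `vis_roots` dict assembled in `LlavaMixBuilder` further down in this diff; how a sample selects its root is not shown, so the per-sample `source` key used here is an assumption for illustration.

```python
import os

# Mirrors the vis_roots mapping built in LlavaMixBuilder later in this commit.
vis_roots = {
    "coco": "/mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014",
    "gqa": "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images",
    "ocr": "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/images",
    "text": "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/train_images",
}

def resolve_image(sample):
    """Illustrative only: pick the image root by a per-sample source tag."""
    root = vis_roots[sample["source"]]           # 'source' key is assumed, not shown in the diff
    return os.path.join(root, sample["image"])

print(resolve_image({"source": "gqa", "image": "2354786.jpg"}))
```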

@ -0,0 +1,6 @@
datasets:
llava_pretrain:
data_type: images
build_info:
image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/llava-cc3m-595k/images
ann_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/llava-cc3m-595k/chat.json

@ -3,5 +3,5 @@ datasets:
llava_reason: llava_reason:
data_type: images data_type: images
build_info: build_info:
image_path: /path/to/coco/images image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
ann_path: /path/to/llava/complex_reasoning_77k.json ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/multimodal-sft/llava_150k/en/complex_reasoning_77k.json

@ -0,0 +1,10 @@
datasets:
llava_mix:
data_type: images
build_info:
ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/llava_v1_5_mix665k/mix_coco_gqa_162k.json
image_path_coco: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
image_path_gqa: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images
image_path_ocr: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/images
image_path_text: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/train_images

@ -3,5 +3,5 @@ datasets:
data_type: images data_type: images
build_info: build_info:
image_path: /path/to/coco/images image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
ann_path: /path/to/multitask_conversation/multi_task_conversation.json ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/minigptv2_llava_multitask_conv/multitask_conversation.json

@ -2,4 +2,4 @@ datasets:
unnatural_instruction: unnatural_instruction:
data_type: text data_type: text
build_info: build_info:
ann_path: /path/to/unnatural_instructions/filtered_unnatural_instruction.json ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/unnatural_instructions/filtered_unnatural_instruction.json

@ -2,5 +2,5 @@ datasets:
ocrvqa: ocrvqa:
data_type: images data_type: images
build_info: build_info:
image_path: /path/to/ocrvqa/images image_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/image # 207572
ann_path: /path/to/ocrvqa/dataset.json ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/dataset.json

@ -15,7 +15,38 @@ datasets:
url: url:
# TODO make this order insensitive # TODO make this order insensitive
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
storage: storage:
- /path/to/okvqa/okvqa_train.json - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train_part100.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_train2014_questions.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_train2014_annotations.json
val:
url:
# TODO make this order insensitive
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval_part100.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_val2014_annotations.json
test:
url:
# TODO make this order insensitive
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval_part100.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_val2014_annotations.json
images: images:
storage: /path/to/coco/images storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO

@ -0,0 +1,42 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
datasets:
ok_vqa:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
val:
url:
# TODO make this order insensitive
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_val2014_annotations.json
test:
url:
# TODO make this order insensitive
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval_part100.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_val2014_annotations.json
images:
storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
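The `val`/`test` lists above are order-sensitive (hence the `TODO make this order insensitive` comments): the eval dataset classes added in this commit index `ann_paths` positionally, e.g. `AOKVQAEvalDataset` later in this diff loads `ann_paths[0]` and reads `ann_paths[2]`/`ann_paths[3]` as the COCO-format question/annotation files. A small sketch of that positional convention:

```python
# Positions follow the storage list in the YAML above.
ann_paths = [
    "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json",                         # [0] eval samples
    "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json",                # [1] answer list
    "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json",     # [2] COCO-format questions
    "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_val2014_annotations.json",             # [3] COCO-format annotations
]

eval_annotations, answer_list = ann_paths[0], ann_paths[1]
coco_fmt_qust_file, coco_fmt_anno_file = ann_paths[2], ann_paths[3]
```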

@ -3,7 +3,15 @@ datasets:
data_type: images data_type: images
build_info: build_info:
image_path: /path/to/textcaps/train_images annotations:
ann_path: /path/to/textcaps/TextCaps_0.1_train.json train:
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextCap/TextCaps_0.1_train.json
val:
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextCap/TextCaps_0.1_val.json
test:
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextCap/TextCaps_0.1_test.json
images:
storage: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA

@ -0,0 +1,17 @@
datasets:
text_vqa:
data_type: images
build_info:
annotations:
train:
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/TextVQA_0.5.1_train.json
val:
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/TextVQA_0.5.1_val.json
test:
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/TextVQA_0.5.1_test.json
images:
storage: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA

@ -0,0 +1,59 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: blip2_t5_instruct_pro_moe
model_type: flant5xxl
load_finetuned: False
load_pretrained: True
vit_model: eva_clip_g
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2-flant5-xxl/blip2_pretrained_flant5xxl.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
# Q-Former
num_query_token: 32
qformer_text_input: True
# T5
t5_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/google-flan-t5-xxl"
prompt: ""
max_txt_len: 256
max_output_txt_len: 256
# freeze
freeze_vit: True
freeze_llm: True
freeze_qformer: False
freeze_t5_proj: False
# moe
moe_position: "pre" # post (position to insert PromptMoE Part)
embed_extract: "blip2_pretrain" # t5, random (way to extract embeddings of task instruction if moe_position is pre)
repeat_to_init_qt_candidates: True
num_qt_candidates: 20
moe_topk: 2
eval_gate_save: False
train_gate_save: False
gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/flant5xxl/prompt_moe/llava_st_257k_raw_train_qf_train_qt_linear_gate_textt5_20ex_3loss_textinqf_epo3_1012/"
preprocess:
vis_processor:
train:
name: "blip_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"

@ -0,0 +1,56 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: blip2_t5_qformer_moe
model_type: flant5xxl
load_finetuned: False
load_pretrained: True
vit_model: eva_clip_g
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2-flant5-xxl/blip2_pretrained_flant5xxl.pth"
finetuned: ""
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2-flant5-xxl/blip2_pretrained_flant5xxl.pth"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
# Q-Former
num_query_token: 32
qformer_text_input: True
# T5
t5_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/google-flan-t5-xxl"
prompt: ""
max_txt_len: 256
max_output_txt_len: 256
# freeze
freeze_vit: True
freeze_llm: True
freeze_qformer: False
freeze_t5_proj: False
# moe
moebert_expert_num: 5
moebert_route_method: "gate-sentence"
moebert_load_balance: 0.1
moe_topk: 2
preprocess:
vis_processor:
train:
name: "blip_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"

@ -0,0 +1,44 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: instruct_vicuna7b
load_finetuned: False
load_pretrained: True
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/instruct_blip_vicuna7b_trimmed/instruct_blip_vicuna7b_trimmed.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# Q-Former
num_query_token: 32
# path to Vicuna checkpoint
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
# generation configs
prompt: ""
preprocess:
vis_processor:
train:
name: "blip_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"

@ -0,0 +1,42 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: pretrain
load_finetuned: False
# pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth"
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_pretrained/blip2_pretrained.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# Q-Former
num_query_token: 32
# moe
use_moeqformer: True
use_route_moe: True
moebert_expert_num: 5
moebert_num_beams: 2
preprocess:
vis_processor:
train:
name: "blip_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"

@ -0,0 +1,43 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: instruct_vicuna7b
load_finetuned: False
load_pretrained: True
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# Q-Former
num_query_token: 32
# path to Vicuna checkpoint
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
# generation configs
prompt: ""
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"

@ -0,0 +1,59 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: blip2_vicuna_instruct
load_finetuned: False
load_pretrained: True
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
finetuned: ""
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
# Q-Former
num_query_token: 32
qformer_text_input: True
# path to Vicuna checkpoint
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
prompt: ""
max_txt_len: 256
max_output_txt_len: 256
# freeze
freeze_vit: True
freeze_llm: True
freeze_qformer: False
freeze_t5_proj: False
# moe
general_version: 'route_moe'
moebert_route_method: "post-route"
moebert_load_balance: 0.05
moebert_expert_num: 3
moebert_num_beams: 3
moe_weight_type: 'ffn_prob'
use_balance_loss: False
ln_position: "out"
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"

@ -0,0 +1,59 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: blip2_vicuna_instruct
load_finetuned: False
load_pretrained: True
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
finetuned: ""
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
# Q-Former
num_query_token: 32
qformer_text_input: True
# path to Vicuna checkpoint
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
prompt: ""
max_txt_len: 256
max_output_txt_len: 256
# freeze
freeze_vit: True
freeze_llm: True
freeze_qformer: False
freeze_t5_proj: False
# moe
general_version: 'uni_route_moe'
moebert_route_method: "post-route-uni"
moebert_load_balance: 0.05
moebert_expert_num: 3
moebert_num_beams: 3
moe_weight_type: 'ffn_prob'
use_balance_loss: False
ln_position: "in"
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"

@ -0,0 +1,60 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: blip2_vicuna_instruct
load_finetuned: False
load_pretrained: True
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/instruct_blip_vicuna7b_trimmed/instruct_blip_vicuna7b_trimmed.pth"
finetuned: ""
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
# Q-Former
num_query_token: 32
qformer_text_input: True
# path to Vicuna checkpoint
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
prompt: ""
max_txt_len: 256
max_output_txt_len: 256
# freeze
freeze_vit: True
freeze_llm: True
freeze_qformer: False
freeze_t5_proj: False
# moe
general_version: "naive_moe"
moebert_expert_num: 5
moebert_route_method: "gate-sentence-post"
moebert_load_balance: 0
moe_topk: 1
use_balance_loss: False
moe_weight_type: 'average'
preprocess:
vis_processor:
train:
name: "blip_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"

@ -15,7 +15,7 @@ model:
# generation configs # generation configs
prompt: "" prompt: ""
llama_model: "please set this value to the path of vicuna model" llama_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
preprocess: preprocess:
vis_processor: vis_processor:

@ -11,7 +11,7 @@ model:
# generation configs # generation configs
prompt: "" prompt: ""
llama_model: "please set this value to the path of llama2-chat-7b" llama_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/llama_2_7b_chat"
lora_r: 64 lora_r: 64
lora_alpha: 16 lora_alpha: 16

@ -208,7 +208,8 @@ class BaseDatasetBuilder:
ann_paths = abs_ann_paths ann_paths = abs_ann_paths
# visual data storage path # visual data storage path
vis_path = os.path.join(vis_info.storage, split) # vis_path = os.path.join(vis_info.storage, split)
vis_path = os.path.join(vis_info.storage)
if not os.path.isabs(vis_path): if not os.path.isabs(vis_path):
# vis_path = os.path.join(utils.get_cache_path(), vis_path) # vis_path = os.path.join(utils.get_cache_path(), vis_path)
@ -219,12 +220,14 @@ class BaseDatasetBuilder:
# create datasets # create datasets
dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls
print(dataset_cls)
datasets[split] = dataset_cls( datasets[split] = dataset_cls(
vis_processor=vis_processor, vis_processor=vis_processor,
text_processor=text_processor, text_processor=text_processor,
ann_paths=ann_paths, ann_paths=ann_paths,
vis_root=vis_path, vis_root=vis_path,
) )
print("{} Length {} : {}".format(dataset_cls.__name__, split, len(datasets[split]))) # print class name
return datasets return datasets
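Two behavioural changes in this hunk: the image root is no longer suffixed with the split name, and every built split now logs its dataset class and length. A tiny sketch of the path change, assuming a COCO-style image root:

```python
import os

storage = "/mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO"
for split in ("train", "val", "test"):
    before = os.path.join(storage, split)  # old behaviour: per-split sub-directory
    after = os.path.join(storage)          # patched behaviour: one shared image root
    print(split, before, "->", after)
```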

@ -6,19 +6,18 @@ from minigpt4.common.registry import registry
from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from minigpt4.datasets.datasets.laion_dataset import LaionDataset from minigpt4.datasets.datasets.laion_dataset import LaionDataset
from minigpt4.datasets.datasets.cc_sbu_dataset import CCSBUDataset, CCSBUAlignDataset from minigpt4.datasets.datasets.cc_sbu_dataset import CCSBUDataset, CCSBUAlignDataset
from minigpt4.datasets.datasets.text_caps import TextCapDataset from minigpt4.datasets.datasets.text_caps import TextCapDataset, TextCapEvalDataset
from minigpt4.datasets.datasets.llava_dataset import LlavaDetailDataset, LlavaReasonDataset, LlavaConversationDataset from minigpt4.datasets.datasets.text_vqa_dataset import TextVQADataset, TextVQAEvalDataset
from minigpt4.datasets.datasets.llava_dataset import LlavaDetailDataset, LlavaReasonDataset, LlavaConversationDataset, LlavaMixDataset, LlavaPretrainDataset
from minigpt4.datasets.datasets.unnatural_instruction import UnnaturalDataset from minigpt4.datasets.datasets.unnatural_instruction import UnnaturalDataset
from minigpt4.datasets.datasets.multitask_conversation import MultiTaskConversationDataset from minigpt4.datasets.datasets.multitask_conversation import MultiTaskConversationDataset
from minigpt4.datasets.datasets.flickr import GroundedDetailDataset,CaptionToObjectDataset,PhraseToObjectDataset from minigpt4.datasets.datasets.flickr import GroundedDetailDataset,CaptionToObjectDataset,PhraseToObjectDataset
from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset from minigpt4.datasets.datasets.gqa_datasets import GQADataset, GQAEvalDataset
from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset
from minigpt4.datasets.datasets.gqa_datasets import GQADataset from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset
from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset from minigpt4.datasets.datasets.ok_vqa_datasets import OKVQADataset, OKVQAEvalDataset
from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset
from minigpt4.datasets.datasets.ocrvqa_dataset import OCRVQADataset from minigpt4.datasets.datasets.ocrvqa_dataset import OCRVQADataset
from minigpt4.datasets.datasets.coco_caption import COCOCapDataset from minigpt4.datasets.datasets.coco_caption import COCOCapDataset, COCOCapEvalDataset
@registry.register_builder("multitask_conversation") @registry.register_builder("multitask_conversation")
class MultitaskConversationBuilder(BaseDatasetBuilder): class MultitaskConversationBuilder(BaseDatasetBuilder):
@ -29,7 +28,7 @@ class MultitaskConversationBuilder(BaseDatasetBuilder):
def build_datasets(self): def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations. # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...") logging.info("[multitask_conversation]: Building datasets...")
self.build_processors() self.build_processors()
build_info = self.config.build_info build_info = self.config.build_info
datasets = dict() datasets = dict()
@ -55,7 +54,7 @@ class UnnaturalInstructionBuilder(BaseDatasetBuilder):
def build_datasets(self): def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations. # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...") logging.info("[unnatural_instruction]: Building datasets...")
self.build_processors() self.build_processors()
build_info = self.config.build_info build_info = self.config.build_info
datasets = dict() datasets = dict()
@ -66,6 +65,7 @@ class UnnaturalInstructionBuilder(BaseDatasetBuilder):
text_processor=self.text_processors["train"], text_processor=self.text_processors["train"],
ann_path=build_info.ann_path, ann_path=build_info.ann_path,
) )
print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
return datasets return datasets
@ -80,7 +80,7 @@ class LlavaDetailBuilder(BaseDatasetBuilder):
def build_datasets(self): def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations. # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...") logging.info("[llava_detail]: Building datasets...")
self.build_processors() self.build_processors()
build_info = self.config.build_info build_info = self.config.build_info
datasets = dict() datasets = dict()
@ -93,11 +93,10 @@ class LlavaDetailBuilder(BaseDatasetBuilder):
ann_path=build_info.ann_path, ann_path=build_info.ann_path,
vis_root=build_info.image_path, vis_root=build_info.image_path,
) )
print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
return datasets return datasets
@registry.register_builder("llava_reason") @registry.register_builder("llava_reason")
class LlavaReasonBuilder(BaseDatasetBuilder): class LlavaReasonBuilder(BaseDatasetBuilder):
train_dataset_cls = LlavaReasonDataset train_dataset_cls = LlavaReasonDataset
@ -107,7 +106,7 @@ class LlavaReasonBuilder(BaseDatasetBuilder):
def build_datasets(self): def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations. # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...") logging.info("[llava_reason]: Building datasets...")
self.build_processors() self.build_processors()
build_info = self.config.build_info build_info = self.config.build_info
datasets = dict() datasets = dict()
@ -120,9 +119,37 @@ class LlavaReasonBuilder(BaseDatasetBuilder):
ann_path=build_info.ann_path, ann_path=build_info.ann_path,
vis_root=build_info.image_path, vis_root=build_info.image_path,
) )
print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
return datasets return datasets
@registry.register_builder("llava_pretrain")
class LlavaPretrainBuilder(BaseDatasetBuilder):
train_dataset_cls = LlavaPretrainDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/llava/pretrain_cap.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("[llava_pretrain]: Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
return datasets
@registry.register_builder("llava_conversation") @registry.register_builder("llava_conversation")
class LlavaReasonBuilder(BaseDatasetBuilder): class LlavaReasonBuilder(BaseDatasetBuilder):
train_dataset_cls = LlavaConversationDataset train_dataset_cls = LlavaConversationDataset
@ -132,7 +159,7 @@ class LlavaReasonBuilder(BaseDatasetBuilder):
def build_datasets(self): def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations. # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...") logging.info("[llava_conversation]: Building datasets...")
self.build_processors() self.build_processors()
build_info = self.config.build_info build_info = self.config.build_info
datasets = dict() datasets = dict()
@ -145,6 +172,49 @@ class LlavaReasonBuilder(BaseDatasetBuilder):
ann_path=build_info.ann_path, ann_path=build_info.ann_path,
vis_root=build_info.image_path, vis_root=build_info.image_path,
) )
print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
return datasets
@registry.register_builder("llava_mix")
class LlavaMixBuilder(BaseDatasetBuilder):
train_dataset_cls = LlavaMixDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/llava/mix.yaml",
"mix_coco_gqa": "configs/datasets/mix_vqa/mix_vqa.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("[llava_mix]: Building datasets...")
self.build_processors()
build_info = self.config.build_info
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
vis_roots = {
'coco':build_info.image_path_coco,
'gqa':build_info.image_path_gqa,
'ocr':build_info.image_path_ocr,
'text':build_info.image_path_text,
# 'vg':build_info.image_path_vg,
}
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
ann_path=build_info.ann_path,
vis_root=vis_roots,
)
print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
# vis_roots = {
# 'coco':'/mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014',
# 'gqa':'/mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images',
# 'ocr':'/mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/images',
# 'text':'/mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/train_images',
# # 'vg':build_info.image_path_vg,
# }
return datasets return datasets
@ -153,7 +223,7 @@ class AllRefCOCOBuilder(BaseDatasetBuilder):
def build_datasets(self): def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations. # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...") logging.info("[AllRefCOCOBuilder]: Building datasets...")
self.build_processors() self.build_processors()
build_info = self.config.build_info build_info = self.config.build_info
@ -181,81 +251,10 @@ class AllRefCOCOBuilder(BaseDatasetBuilder):
return datasets return datasets
@registry.register_builder("refcoco")
class RefCOCOBuilder(AllRefCOCOBuilder):
train_dataset_cls = ReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/refcoco.yaml",
}
@registry.register_builder("refcocop")
class RefCOCOPBuilder(AllRefCOCOBuilder):
train_dataset_cls = ReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/refcocop.yaml",
}
@registry.register_builder("refcocog")
class RefCOCOGBuilder(AllRefCOCOBuilder):
train_dataset_cls = ReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/refcocog.yaml",
}
@registry.register_builder("invrefcoco")
class RefCOCOBuilder(AllRefCOCOBuilder):
train_dataset_cls = InvReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/invrefcoco.yaml",
}
@registry.register_builder("invrefcocop")
class RefCOCOPBuilder(AllRefCOCOBuilder):
train_dataset_cls = InvReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/invrefcocop.yaml",
}
@registry.register_builder("invrefcocog")
class RefCOCOGBuilder(AllRefCOCOBuilder):
train_dataset_cls = InvReferCOCODataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco_bbox/invrefcocog.yaml",
}
@registry.register_builder("refvg")
class RefVisualGenomeBuilder(BaseDatasetBuilder):
train_dataset_cls = ReferVisualGenomeDataset
DATASET_CONFIG_DICT = {
"default": "configs/datasets/vg/ref.yaml",
}
def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...")
self.build_processors()
build_info = self.config.build_info
data_dir = build_info.data_dir
datasets = dict()
# create datasets
dataset_cls = self.train_dataset_cls
datasets['train'] = dataset_cls(
vis_processor=self.vis_processors["train"],
text_processor=self.text_processors["train"],
data_dir=data_dir,
)
return datasets
@registry.register_builder("textcaps_caption") @registry.register_builder("textcaps_caption")
class TextcapCaptionBuilder(BaseDatasetBuilder): class TextcapCaptionBuilder(BaseDatasetBuilder):
train_dataset_cls = TextCapDataset train_dataset_cls = TextCapDataset
eval_dataset_cls = TextCapEvalDataset
DATASET_CONFIG_DICT = {"default": "configs/datasets/textcaps/caption.yaml"} DATASET_CONFIG_DICT = {"default": "configs/datasets/textcaps/caption.yaml"}
@ -265,44 +264,45 @@ class TextcapCaptionBuilder(BaseDatasetBuilder):
def _download_vis(self): def _download_vis(self):
pass pass
def build(self): @registry.register_builder("text_vqa")
self.build_processors() class TextVQABuilder(BaseDatasetBuilder):
train_dataset_cls = TextVQADataset
eval_dataset_cls = TextVQAEvalDataset
build_info = self.config.build_info DATASET_CONFIG_DICT = {"default": "configs/datasets/textvqa/vqa.yaml"}
datasets = dict() def _download_ann(self):
split = "train" pass
# create datasets def _download_vis(self):
# [NOTE] return inner_datasets (wds.DataPipeline) pass
dataset_cls = self.train_dataset_cls
datasets[split] = dataset_cls(
vis_processor=self.vis_processors[split],
text_processor=self.text_processors[split],
ann_path=build_info.ann_path,
vis_root=build_info.image_path,
)
return datasets
@registry.register_builder("coco_vqa") @registry.register_builder("coco_vqa")
class COCOVQABuilder(BaseDatasetBuilder): class COCOVQABuilder(BaseDatasetBuilder):
train_dataset_cls = COCOVQADataset train_dataset_cls = COCOVQADataset
eval_dataset_cls = COCOVQAEvalDataset
DATASET_CONFIG_DICT = { DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco/defaults_vqa.yaml", "default": "configs/datasets/coco/defaults_vqa.yaml",
"vqa_v2_eval": "configs/datasets/coco/defaults_vqa_eval.yaml",
"vqa_v2_part": "configs/datasets/coco/defaults_vqa_part.yaml",
} }
@registry.register_builder("ok_vqa") @registry.register_builder("ok_vqa")
class OKVQABuilder(COCOVQABuilder): class OKVQABuilder(COCOVQABuilder):
train_dataset_cls = OKVQADataset
eval_dataset_cls = OKVQAEvalDataset
DATASET_CONFIG_DICT = { DATASET_CONFIG_DICT = {
"default": "configs/datasets/okvqa/defaults.yaml", "default": "configs/datasets/okvqa/defaults.yaml",
"ok_vqa_eval": "configs/datasets/okvqa/eval.yaml",
} }
@registry.register_builder("aok_vqa") @registry.register_builder("aok_vqa")
class AOKVQABuilder(BaseDatasetBuilder): class AOKVQABuilder(BaseDatasetBuilder):
train_dataset_cls = AOKVQADataset train_dataset_cls = AOKVQADataset
eval_dataset_cls = AOKVQAEvalDataset
DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"} DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"}
@ -310,13 +310,15 @@ class AOKVQABuilder(BaseDatasetBuilder):
@registry.register_builder("gqa") @registry.register_builder("gqa")
class GQABuilder(BaseDatasetBuilder): class GQABuilder(BaseDatasetBuilder):
train_dataset_cls = GQADataset train_dataset_cls = GQADataset
eval_dataset_cls = GQAEvalDataset
DATASET_CONFIG_DICT = { DATASET_CONFIG_DICT = {
"default": "configs/datasets/gqa/balanced_val.yaml", "balanced_sft_raw": "configs/datasets/gqa/balanced_sft_raw.yaml",
"balanced_sft_raw_eval":"configs/datasets/gqa/balanced_sft_raw_eval.yaml",
"balanced_sft_raw_part":"configs/datasets/gqa/balanced_sft_raw_part.yaml",
} }
@registry.register_builder("flickr_grounded_caption") @registry.register_builder("flickr_grounded_caption")
class GroundedCaptionBuilder(BaseDatasetBuilder): class GroundedCaptionBuilder(BaseDatasetBuilder):
train_dataset_cls = GroundedDetailDataset train_dataset_cls = GroundedDetailDataset
@ -326,7 +328,7 @@ class GroundedCaptionBuilder(BaseDatasetBuilder):
def build_datasets(self): def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations. # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...") logging.info("[flickr_grounded_caption]: Building datasets...")
self.build_processors() self.build_processors()
build_info = self.config.build_info build_info = self.config.build_info
datasets = dict() datasets = dict()
@ -352,7 +354,7 @@ class CaptionToPhraseBuilder(BaseDatasetBuilder):
def build_datasets(self): def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations. # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...") logging.info("[flickr_CaptionToPhrase]: Building datasets...")
self.build_processors() self.build_processors()
build_info = self.config.build_info build_info = self.config.build_info
datasets = dict() datasets = dict()
@ -377,7 +379,7 @@ class CaptionToPhraseBuilder(BaseDatasetBuilder):
def build_datasets(self): def build_datasets(self):
# at this point, all the annotations and image/videos should be all downloaded to the specified locations. # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
logging.info("Building datasets...") logging.info("[flickr_ObjectToPhrase]: Building datasets...")
self.build_processors() self.build_processors()
build_info = self.config.build_info build_info = self.config.build_info
datasets = dict() datasets = dict()
@ -394,8 +396,6 @@ class CaptionToPhraseBuilder(BaseDatasetBuilder):
return datasets return datasets
class DocumentVQABuilder(BaseDatasetBuilder): class DocumentVQABuilder(BaseDatasetBuilder):
def _download_ann(self): def _download_ann(self):
pass pass
@ -417,6 +417,7 @@ class DocumentVQABuilder(BaseDatasetBuilder):
vis_root=build_info.image_path, vis_root=build_info.image_path,
ann_path=build_info.ann_path ann_path=build_info.ann_path
) )
print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
return datasets return datasets
@ -495,9 +496,11 @@ class LaionBuilder(BaseDatasetBuilder):
@registry.register_builder("coco_caption") @registry.register_builder("coco_caption")
class COCOCapBuilder(BaseDatasetBuilder): class COCOCapBuilder(BaseDatasetBuilder):
train_dataset_cls = COCOCapDataset train_dataset_cls = COCOCapDataset
eval_dataset_cls = COCOCapEvalDataset
DATASET_CONFIG_DICT = { DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco/caption.yaml", "default": "configs/datasets/coco/caption.yaml",
"coco_cap_eval": "configs/datasets/coco/caption_eval.yaml",
} }
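The builders above now carry an `eval_dataset_cls` and extra `DATASET_CONFIG_DICT` keys (e.g. `balanced_sft_raw_eval`, `ok_vqa_eval`, `coco_cap_eval`). A hypothetical registration following the same pattern is sketched below; `my_vqa` and its second config key do not exist in the repo and are only placeholders.

```python
from minigpt4.common.registry import registry
from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from minigpt4.datasets.datasets.text_vqa_dataset import TextVQADataset, TextVQAEvalDataset

@registry.register_builder("my_vqa")          # hypothetical name, for illustration only
class MyVQABuilder(BaseDatasetBuilder):
    train_dataset_cls = TextVQADataset        # reuse dataset classes imported in this file
    eval_dataset_cls = TextVQAEvalDataset
    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/textvqa/vqa.yaml",
        "my_vqa_eval": "configs/datasets/textvqa/vqa.yaml",  # placeholder variant key
    }
```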

@ -13,7 +13,7 @@ import torch
from PIL import Image from PIL import Image
from minigpt4.datasets.datasets.vqa_datasets import VQADataset #, VQAEvalDataset from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset
class __DisplMixin: class __DisplMixin:
@ -37,80 +37,191 @@ class AOKVQADataset(VQADataset, __DisplMixin):
super().__init__(vis_processor, text_processor, vis_root, ann_paths) super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool =[ self.instruction_pool =[
"[vqa] {}", '{} Choose from {}.',
"[vqa] Based on the image, respond to this question with a short answer: {}" 'Q: {} Multi Choices: {} A: ',
'Question: {} Multi Choices: {} Answer: ',
"{} Choose one from the following possible answers: {}. ",
'{} Choose from {}. The answer is',
] ]
exist_annotation = [] exist_annotation = []
for ann in self.annotation: for ann in self.annotation:
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1]) # image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
image_path = os.path.join(self.vis_root, ann["image"])
if os.path.exists(image_path): if os.path.exists(image_path):
exist_annotation.append(ann) exist_annotation.append(ann)
self.annotation = exist_annotation self.annotation = exist_annotation
self.source = 'aokvqa'
def get_data(self, index): def get_data(self, index):
ann = self.annotation[index] ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1]) # image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB") image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image) image = self.vis_processor(image)
question = self.text_processor(ann["question"]) question = self.text_processor(ann["question"])
answer_key = "direct_answers" answer_lst = ann["choices"]
direct_answers = ann["direct_answers"]
answer_weight = {} final_answer = random.choices(direct_answers, k=1)[0]
for answer in ann[answer_key]: for answer in answer_lst:
if answer in answer_weight.keys(): if answer in direct_answers:
answer_weight[answer] += 1 / len(ann[answer_key]) final_answer = answer
else:
answer_weight[answer] = 1 / len(ann[answer_key])
answers = list(answer_weight.keys())
weights = list(answer_weight.values())
answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights
return { return {
"image": image, "image": image,
"image_id": ann["image"],
"question": question, "question": question,
"answer": answer, "answer": final_answer,
"choices": ", ".join(answer_lst)
} }
def __getitem__(self, index): def __getitem__(self, index):
data = self.get_data(index) data = self.get_data(index)
question = self.text_processor(data["question"]) question = self.text_processor(data["question"])
instruction = random.choice(self.instruction_pool).format(question)
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
answer = self.text_processor(data['answer']) answer = self.text_processor(data['answer'])
q_input = question
llm_input = random.choice(self.instruction_pool).format(question, data["choices"])
return { return {
"image": data['image'], "image": data['image'],
"instruction_input": instruction, "image_id": data["image_id"],
# "q_input": q_input,
"q_input": llm_input,
"llm_input": llm_input,
"text_input": question,
"text_output": answer,
"answer": answer, "answer": answer,
"source": 'aokvqa',
} }
class AOKVQGDataset(AOKVQADataset): class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths): def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths) """
self.instruction_pool = [ vis_root (string): Root directory of images (e.g. coco/images/)
'Given the image, generate a question whose answer is: {}', ann_root (string): directory to store the annotation file
'Based on the image, provide a question with the answer: {}', """
'Given the visual representation, create a question for which the answer is "{}"',
'From the image provided, craft a question that leads to the reply: {}', self.vis_root = vis_root
'Considering the picture, come up with a question where the answer is: {}',
'Taking the image into account, generate an question that has the answer: {}' self.annotation = json.load(open(ann_paths[0]))
self.instruction_pool =[
'{} Choose from {}.',
'Q: {} Multi Choices: {} A: ',
'Question: {} Multi Choices: {} Answer: ',
"{} Choose one from the following possible answers: {}. ",
'{} Choose from {}. The answer is',
] ]
def __getitem__(self, index): try:
data = self.get_data(index) self.coco_fmt_qust_file = ann_paths[2]
instruction = random.choice(self.instruction_pool).format(data['answer']) self.coco_fmt_anno_file = ann_paths[3]
except IndexError:
self.coco_fmt_qust_file = None
self.coco_fmt_anno_file = None
self.vis_processor = vis_processor
self.text_processor = text_processor
self.source = 'aokvqa'
self.annotation_add = self.get_data()
def collater(self, samples):
(
image_list,
question_list,
question_id_list,
choices_list,
correct_choice_idx_list,
direct_answers_list,
llm_input_list,
q_input_list,
gt_answers_list,
source_list,
) = ([], [], [], [], [], [], [], [], [], [])
for sample in samples:
image_list.append(sample["image"])
question_list.append(sample["text_input"])
question_id_list.append(sample["question_id"])
choices_list.append(sample["choices"])
correct_choice_idx_list.append(sample["correct_choice_idx"])
direct_answers_list.append(sample["direct_answers"])
llm_input_list.append(sample["llm_input"])
q_input_list.append(sample["q_input"])
gt_answers_list.append(sample["gt_answers"])
source_list.append(sample["source"])
return { return {
"image": data['image'], "image": torch.stack(image_list, dim=0),
"instruction_input": instruction, "text_input": question_list,
"answer": data['question'], "question_id": question_id_list,
"choices": choices_list,
"correct_choice_idx": correct_choice_idx_list,
"direct_answers": direct_answers_list,
"llm_input": llm_input_list,
"q_input": llm_input_list,
# "q_input": q_input_list,
"gt_answers": gt_answers_list,
"source": source_list,
} }
def get_data(self):
import numpy as np
ann_instruct = list()
for i in range(len(self.annotation)):
ann = self.annotation[i].copy()
j = i % len(self.instruction_pool)
question = self.text_processor(ann["question"])
choices = ann["choices"]
llm_input = self.instruction_pool[j].format(question, ", ".join(choices))
ann['llm_input'] = llm_input
ann_instruct.append(ann)
np.random.seed(10)
np.random.shuffle(ann_instruct)
return ann_instruct
def __getitem__(self, index):
# ann = self.annotation[index]
ann = self.annotation_add[index]
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
if "direct_answers" in ann:
direct_answers = ann["direct_answers"]
else:
direct_answers = None
choices = ann["choices"]
if "correct_choice_idx" in ann:
correct_choice_idx = ann["correct_choice_idx"]
correct_answer = choices[correct_choice_idx]
else:
correct_choice_idx = None
correct_answer = direct_answers
llm_input = ann.get("llm_input",random.choice(self.instruction_pool).format(question))
# llm_input = random.choice(self.instruction_pool).format(question, ", ".join(choices))
return {
"image": image,
# "q_input": question,
"q_input": llm_input,
"llm_input": llm_input,
"text_input": question,
"question_id": ann["question_id"],
"choices": choices,
"correct_choice_idx": correct_choice_idx,
"gt_answers": correct_answer,
"direct_answers": direct_answers,
"source": 'aokvqa',
}
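`AOKVQAEvalDataset.get_data()` pre-assigns one template per sample by cycling through `instruction_pool` (`i % len(pool)`) and then shuffles with a fixed seed, while `__getitem__` falls back to a random template when no `llm_input` was stored. A small standalone illustration of that prompt construction:

```python
import random

instruction_pool = [
    '{} Choose from {}.',
    'Q: {} Multi Choices: {} A: ',
    'Question: {} Multi Choices: {} Answer: ',
    "{} Choose one from the following possible answers: {}. ",
    '{} Choose from {}. The answer is',
]

question = "What is the man holding?"
choices = ["umbrella", "surfboard", "kite", "bat"]

for i in range(3):  # mimics the i % len(pool) cycling used in get_data()
    template = instruction_pool[i % len(instruction_pool)]
    print(template.format(question, ", ".join(choices)))

# __getitem__ fallback when no pre-built llm_input exists for the sample:
print(random.choice(instruction_pool).format(question, ", ".join(choices)))
```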

Some files were not shown because too many files have changed in this diff.