Merge branch 'dev' of https://github.com/Hannahhhhhhhhhhhh/PromptMoE into main
BIN
MiniGPTv2.pdf
6290
Post_Route_Universal_PromptMoE_RawProb_backward_graph
Normal file
BIN
Post_Route_Universal_PromptMoE_RawProb_backward_graph.pdf
Normal file
5294
Pre_PromptMoE_RawProb_backward_graph
Normal file
BIN
Pre_PromptMoE_RawProb_backward_graph.pdf
Normal file
2
command.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
chmod +x *.sh
|
||||||
|
tensorboard --bind_all --logdir
|
1
demo.py
@ -116,6 +116,7 @@ def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
|
|||||||
max_new_tokens=300,
|
max_new_tokens=300,
|
||||||
max_length=2000)[0]
|
max_length=2000)[0]
|
||||||
chatbot[-1][1] = llm_message
|
chatbot[-1][1] = llm_message
|
||||||
|
print(llm_message)
|
||||||
return chatbot, chat_state, img_list
|
return chatbot, chat_state, img_list
|
||||||
|
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ from minigpt4.tasks import *
|
|||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = argparse.ArgumentParser(description="Demo")
|
parser = argparse.ArgumentParser(description="Demo")
|
||||||
parser.add_argument("--cfg-path", default='eval_configs/minigptv2_eval.yaml',
|
parser.add_argument("--cfg-path", default='minigpt4/projects/minigpt/eval/minigptv2_eval.yaml',
|
||||||
help="path to configuration file.")
|
help="path to configuration file.")
|
||||||
parser.add_argument("--gpu-id", type=int, default=0, help="specify the gpu to load the model.")
|
parser.add_argument("--gpu-id", type=int, default=0, help="specify the gpu to load the model.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
@ -11,10 +11,10 @@ After the first stage, the visual features are mapped and can be understood by t
|
|||||||
model.
|
model.
|
||||||
To launch the first stage training, run the following command. In our experiments, we use 4 A100.
|
To launch the first stage training, run the following command. In our experiments, we use 4 A100.
|
||||||
You can change the save path in the config file
|
You can change the save path in the config file
|
||||||
[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml)
|
[minigpt4/projects/minigpt/train/minigpt4_stage1_pretrain.yaml](minigpt4/projects/minigpt/train/minigpt4_stage1_pretrain.yaml)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
|
torchrun --nproc-per-node NUM_GPU train.py --cfg-path minigpt4/projects/minigpt/train/minigpt4_stage1_pretrain.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
A MiniGPT-4 checkpoint with only stage one training can be downloaded
|
A MiniGPT-4 checkpoint with only stage one training can be downloaded
|
||||||
@ -30,12 +30,12 @@ To download and prepare our second stage dataset, please check our
|
|||||||
[second stage dataset preparation instruction](dataset/README_2_STAGE.md).
|
[second stage dataset preparation instruction](dataset/README_2_STAGE.md).
|
||||||
To launch the second stage alignment,
|
To launch the second stage alignment,
|
||||||
first specify the path to the checkpoint file trained in stage 1 in
|
first specify the path to the checkpoint file trained in stage 1 in
|
||||||
[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage2_finetune.yaml).
|
[minigpt4/projects/minigpt/train/minigpt4_stage1_pretrain.yaml](minigpt4/projects/minigpt/train/minigpt4_stage2_finetune.yaml).
|
||||||
You can also specify the output path there.
|
You can also specify the output path there.
|
||||||
Then, run the following command. In our experiments, we use 1 A100.
|
Then, run the following command. In our experiments, we use 1 A100.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage2_finetune.yaml
|
torchrun --nproc-per-node NUM_GPU train.py --cfg-path minigpt4/projects/minigpt/train/minigpt4_stage2_finetune.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
After the second stage alignment, MiniGPT-4 is able to talk about the image coherently and user-friendly.
|
After the second stage alignment, MiniGPT-4 is able to talk about the image coherently and user-friendly.
|
@ -4,7 +4,7 @@
|
|||||||
You firstly need to prepare the dataset. you can follow this step to prepare the dataset.
|
You firstly need to prepare the dataset. you can follow this step to prepare the dataset.
|
||||||
our [dataset preparation](dataset/README_MINIGPTv2_FINETUNE.md).
|
our [dataset preparation](dataset/README_MINIGPTv2_FINETUNE.md).
|
||||||
|
|
||||||
In the train_configs/minigptv2_finetune.yaml, you need to set up the following paths:
|
In the minigpt4/projects/minigpt/train/minigptv2_finetune.yaml, you need to set up the following paths:
|
||||||
|
|
||||||
llama_model checkpoint path: "/path/to/llama_checkpoint"
|
llama_model checkpoint path: "/path/to/llama_checkpoint"
|
||||||
|
|
||||||
@ -19,6 +19,6 @@ For ckpt, you may load from our pretrained model checkpoints:
|
|||||||
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigptv2_finetune.yaml
|
torchrun --nproc-per-node NUM_GPU train.py --cfg-path minigpt4/projects/minigpt/train/minigptv2_finetune.yaml
|
||||||
```
|
```
|
||||||
|
|
@ -82,13 +82,13 @@ Download the corresponding LLM weights from the following huggingface space via
|
|||||||
Then, set the variable *llama_model* in the model config file to the LLM weight path.
|
Then, set the variable *llama_model* in the model config file to the LLM weight path.
|
||||||
|
|
||||||
* For MiniGPT-v2, set the LLM path
|
* For MiniGPT-v2, set the LLM path
|
||||||
[here](minigpt4/configs/models/minigpt_v2.yaml#L15) at Line 14.
|
[here](minigpt4/configs/models/minigpt/minigpt_v2.yaml#L15) at Line 14.
|
||||||
|
|
||||||
* For MiniGPT-4 (Llama2), set the LLM path
|
* For MiniGPT-4 (Llama2), set the LLM path
|
||||||
[here](minigpt4/configs/models/minigpt4_llama2.yaml#L15) at Line 15.
|
[here](minigpt4/configs/models/minigpt/minigpt4_llama2.yaml#L15) at Line 15.
|
||||||
|
|
||||||
* For MiniGPT-4 (Vicuna), set the LLM path
|
* For MiniGPT-4 (Vicuna), set the LLM path
|
||||||
[here](minigpt4/configs/models/minigpt4_vicuna0.yaml#L18) at Line 18
|
[here](minigpt4/configs/models/minigpt/minigpt4_vicuna0.yaml#L18) at Line 18
|
||||||
|
|
||||||
**3. Prepare the pretrained model checkpoints**
|
**3. Prepare the pretrained model checkpoints**
|
||||||
|
|
||||||
@ -101,7 +101,7 @@ Download the pretrained model checkpoints
|
|||||||
|
|
||||||
|
|
||||||
For **MiniGPT-v2**, set the path to the pretrained checkpoint in the evaluation config file
|
For **MiniGPT-v2**, set the path to the pretrained checkpoint in the evaluation config file
|
||||||
in [eval_configs/minigptv2_eval.yaml](eval_configs/minigptv2_eval.yaml#L10) at Line 8.
|
in [minigpt4/projects/minigpt/eval/minigptv2_eval.yaml](minigpt4/projects/minigpt/eval/minigptv2_eval.yaml#L10) at Line 8.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -110,7 +110,7 @@ in [eval_configs/minigptv2_eval.yaml](eval_configs/minigptv2_eval.yaml#L10) at L
|
|||||||
| [Download](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing) | [Download](https://drive.google.com/file/d/11nAPjEok8eAGGEG1N2vXo3kBLCg0WgUk/view?usp=sharing) |
|
| [Download](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing) | [Download](https://drive.google.com/file/d/11nAPjEok8eAGGEG1N2vXo3kBLCg0WgUk/view?usp=sharing) |
|
||||||
|
|
||||||
For **MiniGPT-4**, set the path to the pretrained checkpoint in the evaluation config file
|
For **MiniGPT-4**, set the path to the pretrained checkpoint in the evaluation config file
|
||||||
in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Line 8 for Vicuna version or [eval_configs/minigpt4_llama2_eval.yaml](eval_configs/minigpt4_llama2_eval.yaml#L10) for LLama2 version.
|
in [minigpt4/projects/minigpt/eval/minigpt4_eval.yaml](minigpt4/projects/minigpt/eval/minigpt4_eval.yaml#L10) at Line 8 for Vicuna version or [minigpt4/projects/minigpt/eval/minigpt4_llama2_eval.yaml](minigpt4/projects/minigpt/eval/minigpt4_llama2_eval.yaml#L10) for LLama2 version.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -118,19 +118,19 @@ in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Lin
|
|||||||
|
|
||||||
For MiniGPT-v2, run
|
For MiniGPT-v2, run
|
||||||
```
|
```
|
||||||
python demo_v2.py --cfg-path eval_configs/minigptv2_eval.yaml --gpu-id 0
|
python demo_v2.py --cfg-path minigpt4/projects/minigpt/eval/minigptv2_eval.yaml --gpu-id 0
|
||||||
```
|
```
|
||||||
|
|
||||||
For MiniGPT-4 (Vicuna version), run
|
For MiniGPT-4 (Vicuna version), run
|
||||||
|
|
||||||
```
|
```
|
||||||
python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
|
python demo.py --cfg-path minigpt4/projects/minigpt/eval/minigpt4_eval.yaml --gpu-id 0
|
||||||
```
|
```
|
||||||
|
|
||||||
For MiniGPT-4 (Llama2 version), run
|
For MiniGPT-4 (Llama2 version), run
|
||||||
|
|
||||||
```
|
```
|
||||||
python demo.py --cfg-path eval_configs/minigpt4_llama2_eval.yaml --gpu-id 0
|
python demo.py --cfg-path minigpt4/projects/minigpt/eval/minigpt4_llama2_eval.yaml --gpu-id 0
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
@ -139,9 +139,9 @@ This configuration requires about 23G GPU memory for 13B LLM and 11.5G GPU memor
|
|||||||
For more powerful GPUs, you can run the model
|
For more powerful GPUs, you can run the model
|
||||||
in 16 bit by setting `low_resource` to `False` in the relevant config file:
|
in 16 bit by setting `low_resource` to `False` in the relevant config file:
|
||||||
|
|
||||||
* MiniGPT-v2: [minigptv2_eval.yaml](eval_configs/minigptv2_eval.yaml#6)
|
* MiniGPT-v2: [minigptv2_eval.yaml](minigpt4/projects/minigpt/eval/minigptv2_eval.yaml#6)
|
||||||
* MiniGPT-4 (Llama2): [minigpt4_llama2_eval.yaml](eval_configs/minigpt4_llama2_eval.yaml#6)
|
* MiniGPT-4 (Llama2): [minigpt4_llama2_eval.yaml](minigpt4/projects/minigpt/eval/minigpt4_llama2_eval.yaml#6)
|
||||||
* MiniGPT-4 (Vicuna): [minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#6)
|
* MiniGPT-4 (Vicuna): [minigpt4_eval.yaml](minigpt4/projects/minigpt/eval/minigpt4_eval.yaml#6)
|
||||||
|
|
||||||
Thanks [@WangRongsheng](https://github.com/WangRongsheng), you can also run MiniGPT-4 on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing)
|
Thanks [@WangRongsheng](https://github.com/WangRongsheng), you can also run MiniGPT-4 on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing)
|
||||||
|
|
@ -1,4 +1,4 @@
|
|||||||
name: minigptv
|
name: promptmoe
|
||||||
channels:
|
channels:
|
||||||
- pytorch
|
- pytorch
|
||||||
- defaults
|
- defaults
|
||||||
@ -31,3 +31,5 @@ dependencies:
|
|||||||
- accelerate==0.20.3
|
- accelerate==0.20.3
|
||||||
- bitsandbytes==0.37.0
|
- bitsandbytes==0.37.0
|
||||||
- wandb
|
- wandb
|
||||||
|
- visual_genome
|
||||||
|
- scikit-image
|
||||||
|
92
evaluate.py
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
"""
|
||||||
|
Copyright (c) 2022, salesforce.com, inc.
|
||||||
|
All rights reserved.
|
||||||
|
SPDX-License-Identifier: BSD-3-Clause
|
||||||
|
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import random
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.backends.cudnn as cudnn
|
||||||
|
|
||||||
|
import minigpt4.tasks as tasks
|
||||||
|
from minigpt4.common.config import Config
|
||||||
|
from minigpt4.common.dist_utils import get_rank, init_distributed_mode
|
||||||
|
from minigpt4.common.logger import setup_logger
|
||||||
|
from minigpt4.common.optims import (
|
||||||
|
LinearWarmupCosineLRScheduler,
|
||||||
|
LinearWarmupStepLRScheduler,
|
||||||
|
)
|
||||||
|
from minigpt4.common.utils import now
|
||||||
|
|
||||||
|
# imports modules for registration
|
||||||
|
from minigpt4.datasets.builders import *
|
||||||
|
from minigpt4.models import *
|
||||||
|
from minigpt4.processors import *
|
||||||
|
from minigpt4.runners.runner_base import RunnerBase
|
||||||
|
from minigpt4.tasks import *
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description="Training")
|
||||||
|
|
||||||
|
parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--options",
|
||||||
|
nargs="+",
|
||||||
|
help="override some settings in the used config, the key-value pair "
|
||||||
|
"in xxx=yyy format will be merged into config file (deprecate), "
|
||||||
|
"change to --cfg-options instead.",
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
# if 'LOCAL_RANK' not in os.environ:
|
||||||
|
# os.environ['LOCAL_RANK'] = str(args.local_rank)
|
||||||
|
|
||||||
|
return args
|
||||||
|
|
||||||
|
|
||||||
|
def setup_seeds(config):
|
||||||
|
seed = config.run_cfg.seed + get_rank()
|
||||||
|
|
||||||
|
random.seed(seed)
|
||||||
|
np.random.seed(seed)
|
||||||
|
torch.manual_seed(seed)
|
||||||
|
|
||||||
|
cudnn.benchmark = False
|
||||||
|
cudnn.deterministic = True
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# allow auto-dl completes on main process without timeout when using NCCL backend.
|
||||||
|
# os.environ["NCCL_BLOCKING_WAIT"] = "1"
|
||||||
|
|
||||||
|
# set before init_distributed_mode() to ensure the same job_id shared across all ranks.
|
||||||
|
job_id = now()
|
||||||
|
|
||||||
|
cfg = Config(parse_args())
|
||||||
|
|
||||||
|
init_distributed_mode(cfg.run_cfg)
|
||||||
|
|
||||||
|
setup_seeds(cfg)
|
||||||
|
|
||||||
|
# set after init_distributed_mode() to only log on master.
|
||||||
|
setup_logger()
|
||||||
|
|
||||||
|
cfg.pretty_print()
|
||||||
|
|
||||||
|
task = tasks.setup_task(cfg)
|
||||||
|
datasets = task.build_datasets(cfg)
|
||||||
|
model = task.build_model(cfg)
|
||||||
|
|
||||||
|
runner = RunnerBase(
|
||||||
|
cfg=cfg, job_id=job_id, task=task, model=model, datasets=datasets
|
||||||
|
)
|
||||||
|
runner.evaluate(skip_reload=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
94
evaluation/coco_caption.py
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
import os
|
||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from pycocoevalcap.eval import COCOEvalCap
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
class COCO_Annotation:
|
||||||
|
def __init__(self, annotation_file):
|
||||||
|
self.coco_cn_file = annotation_file
|
||||||
|
self.imgToAnns = self.build_imgToAnns()
|
||||||
|
|
||||||
|
def build_imgToAnns(self):
|
||||||
|
imgToAnns = defaultdict(list)
|
||||||
|
with open(self.coco_cn_file, "r", encoding="UTF-8") as fin:
|
||||||
|
for line in fin:
|
||||||
|
line = line.strip()
|
||||||
|
temp = eval(line)
|
||||||
|
annotations = temp['annotations']
|
||||||
|
for ann in annotations:
|
||||||
|
image_id = str(ann['image_id']).zfill(6)
|
||||||
|
imgToAnns[image_id].append({'image_id':image_id,'caption':ann['caption'],'image': ann['image_id']})
|
||||||
|
return imgToAnns
|
||||||
|
|
||||||
|
def getImgIds(self):
|
||||||
|
return self.imgToAnns.keys()
|
||||||
|
|
||||||
|
class COCO_Result:
|
||||||
|
def __init__(self,result_file):
|
||||||
|
self.coco_cn_file = result_file
|
||||||
|
self.imgToAnns = self.build_imgToAnns()
|
||||||
|
|
||||||
|
def build_imgToAnns(self):
|
||||||
|
imgToAnns = dict()
|
||||||
|
data = json.load(open(self.coco_cn_file, "r"))
|
||||||
|
for d in data:
|
||||||
|
tmp = {
|
||||||
|
'image_id':d['question_id'][-6:],
|
||||||
|
'caption':d['answer']
|
||||||
|
}
|
||||||
|
imgToAnns[d['question_id'][-6:]] = [tmp]
|
||||||
|
return imgToAnns
|
||||||
|
|
||||||
|
def coco_caption_eval(results_file, split_name):
|
||||||
|
files = {
|
||||||
|
"val":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json",
|
||||||
|
"test":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json"
|
||||||
|
}
|
||||||
|
|
||||||
|
# create coco object and coco_result object
|
||||||
|
annotation_file = files[split_name]
|
||||||
|
coco = COCO_Annotation(annotation_file)
|
||||||
|
coco_result = COCO_Result(results_file)
|
||||||
|
|
||||||
|
# create coco_eval object by taking coco and coco_result
|
||||||
|
coco_eval = COCOEvalCap(coco, coco_result)
|
||||||
|
|
||||||
|
# evaluate on a subset of images by setting
|
||||||
|
# coco_eval.params['image_id'] = coco_result.getImgIds()
|
||||||
|
# please remove this line when evaluating the full validation set
|
||||||
|
# coco_eval.params['image_id'] = coco_result.getImgIds()
|
||||||
|
|
||||||
|
# evaluate results
|
||||||
|
# SPICE will take a few minutes the first time, but speeds up due to caching
|
||||||
|
coco_eval.evaluate()
|
||||||
|
|
||||||
|
# print output evaluation scores
|
||||||
|
for metric, score in coco_eval.eval.items():
|
||||||
|
print(f"{metric}: {score:.3f}")
|
||||||
|
|
||||||
|
return coco_eval
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
result_file = "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/mix_coco_gqa_cap_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0302/20240302231/result/val_vqa_result_coco_cap.json"
|
||||||
|
split_name = "val"
|
||||||
|
coco_val = coco_caption_eval(result_file, split_name)
|
||||||
|
|
||||||
|
agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"]
|
||||||
|
|
||||||
|
# log_stats = {split_name: {k: v for k, v in coco_val.eval.items()}}
|
||||||
|
# with open(
|
||||||
|
# os.path.join(registry.get_path("output_dir"), "evaluate.txt"), "a"
|
||||||
|
# ) as f:
|
||||||
|
# f.write(json.dumps(log_stats) + "\n")
|
||||||
|
|
||||||
|
coco_res = {k: v for k, v in coco_val.eval.items()}
|
||||||
|
coco_res["agg_metrics"] = agg_metrics
|
||||||
|
|
||||||
|
print(coco_res)
|
||||||
|
|
||||||
|
|
||||||
|
main()
|
Before Width: | Height: | Size: 380 KiB |
Before Width: | Height: | Size: 457 KiB |
Before Width: | Height: | Size: 538 KiB |
Before Width: | Height: | Size: 586 KiB |
Before Width: | Height: | Size: 679 KiB |
Before Width: | Height: | Size: 555 KiB |
Before Width: | Height: | Size: 468 KiB |
Before Width: | Height: | Size: 658 KiB |
Before Width: | Height: | Size: 690 KiB |
Before Width: | Height: | Size: 586 KiB |
Before Width: | Height: | Size: 713 KiB |
Before Width: | Height: | Size: 597 KiB |
Before Width: | Height: | Size: 190 KiB |
Before Width: | Height: | Size: 603 KiB |
Before Width: | Height: | Size: 634 KiB |
Before Width: | Height: | Size: 249 KiB |
Before Width: | Height: | Size: 305 KiB |
Before Width: | Height: | Size: 588 KiB |
Before Width: | Height: | Size: 805 KiB |
Before Width: | Height: | Size: 853 KiB |
Before Width: | Height: | Size: 567 KiB |
Before Width: | Height: | Size: 712 KiB |
Before Width: | Height: | Size: 519 KiB |
Before Width: | Height: | Size: 565 KiB |
Before Width: | Height: | Size: 91 KiB |
Before Width: | Height: | Size: 83 KiB |
Before Width: | Height: | Size: 1.5 MiB |
Before Width: | Height: | Size: 1.2 MiB |
Before Width: | Height: | Size: 92 KiB |
Before Width: | Height: | Size: 25 KiB |
Before Width: | Height: | Size: 116 KiB |
Before Width: | Height: | Size: 865 KiB |
8
minigpt4/common/caption_tools/__init__.py
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
"""
|
||||||
|
Copyright (c) 2022, salesforce.com, inc.
|
||||||
|
All rights reserved.
|
||||||
|
SPDX-License-Identifier: BSD-3-Clause
|
||||||
|
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||||
|
"""
|
||||||
|
|
||||||
|
__author__ = "aagrawal"
|
127
minigpt4/common/caption_tools/caption_utils.py
Normal file
@ -0,0 +1,127 @@
|
|||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
from pycocoevalcap.eval import COCOEvalCap
|
||||||
|
import json
|
||||||
|
|
||||||
|
class COCO_Annotation:
|
||||||
|
def __init__(self, annotation_file):
|
||||||
|
self.coco_cn_file = annotation_file
|
||||||
|
self.imgToAnns = self.build_imgToAnns()
|
||||||
|
|
||||||
|
def build_imgToAnns(self):
|
||||||
|
imgToAnns = defaultdict(list)
|
||||||
|
with open(self.coco_cn_file, "r", encoding="UTF-8") as fin:
|
||||||
|
for line in fin:
|
||||||
|
line = line.strip()
|
||||||
|
temp = eval(line)
|
||||||
|
annotations = temp['annotations']
|
||||||
|
for ann in annotations:
|
||||||
|
image_id = str(ann['image_id']).zfill(6)
|
||||||
|
imgToAnns[image_id].append({'image_id':image_id,'caption':ann['caption'],'image': ann['image_id']})
|
||||||
|
return imgToAnns
|
||||||
|
|
||||||
|
def getImgIds(self):
|
||||||
|
return self.imgToAnns.keys()
|
||||||
|
|
||||||
|
class COCO_Result:
|
||||||
|
def __init__(self,result_file):
|
||||||
|
self.coco_cn_file = result_file
|
||||||
|
self.imgToAnns = self.build_imgToAnns()
|
||||||
|
|
||||||
|
def build_imgToAnns(self):
|
||||||
|
imgToAnns = dict()
|
||||||
|
data = json.load(open(self.coco_cn_file, "r"))
|
||||||
|
for d in data:
|
||||||
|
tmp = {
|
||||||
|
'image_id':d['question_id'][-6:],
|
||||||
|
'caption':d['answer']
|
||||||
|
}
|
||||||
|
imgToAnns[d['question_id'][-6:]] = [tmp]
|
||||||
|
return imgToAnns
|
||||||
|
|
||||||
|
def coco_caption_eval(coco_gt_root, results_file, split_name):
|
||||||
|
files = {
|
||||||
|
"val":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json",
|
||||||
|
"test":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json"
|
||||||
|
}
|
||||||
|
|
||||||
|
# create coco object and coco_result object
|
||||||
|
annotation_file = files[split_name]
|
||||||
|
coco = COCO_Annotation(annotation_file)
|
||||||
|
coco_result = COCO_Result(results_file)
|
||||||
|
|
||||||
|
# create coco_eval object by taking coco and coco_result
|
||||||
|
coco_eval = COCOEvalCap(coco, coco_result)
|
||||||
|
|
||||||
|
# evaluate on a subset of images by setting
|
||||||
|
# coco_eval.params['image_id'] = coco_result.getImgIds()
|
||||||
|
# please remove this line when evaluating the full validation set
|
||||||
|
# coco_eval.params['image_id'] = coco_result.getImgIds()
|
||||||
|
|
||||||
|
# evaluate results
|
||||||
|
# SPICE will take a few minutes the first time, but speeds up due to caching
|
||||||
|
coco_eval.evaluate()
|
||||||
|
|
||||||
|
# print output evaluation scores
|
||||||
|
for metric, score in coco_eval.eval.items():
|
||||||
|
print(f"{metric}: {score:.3f}")
|
||||||
|
|
||||||
|
return coco_eval
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class TextCap_Annotation:
|
||||||
|
def __init__(self, annotation_file):
|
||||||
|
self.anno_file = annotation_file
|
||||||
|
self.imgToAnns = self.build_imgToAnns()
|
||||||
|
|
||||||
|
def build_imgToAnns(self):
|
||||||
|
imgToAnns = defaultdict(list)
|
||||||
|
annotations = json.load(open(self.anno_file,"r"))['data']
|
||||||
|
for ann in annotations:
|
||||||
|
image_id = str(ann['image_name'])
|
||||||
|
imgToAnns[image_id].append({
|
||||||
|
'image_id':image_id,
|
||||||
|
# 'caption':ann['reference_strs'],
|
||||||
|
'caption':ann['caption_str'],
|
||||||
|
'image': ann['image_path']
|
||||||
|
})
|
||||||
|
return imgToAnns
|
||||||
|
|
||||||
|
def getImgIds(self):
|
||||||
|
return self.imgToAnns.keys()
|
||||||
|
|
||||||
|
class TextCap_Result:
|
||||||
|
def __init__(self,result_file):
|
||||||
|
self.result_file = result_file
|
||||||
|
self.imgToAnns = self.build_imgToAnns()
|
||||||
|
|
||||||
|
def build_imgToAnns(self):
|
||||||
|
imgToAnns = dict()
|
||||||
|
data = json.load(open(self.result_file, "r"))
|
||||||
|
for d in data:
|
||||||
|
tmp = {
|
||||||
|
'image_id':d['question_id'], # actually image_id
|
||||||
|
'caption':d['answer']
|
||||||
|
}
|
||||||
|
imgToAnns[d['question_id']] = [tmp]
|
||||||
|
return imgToAnns
|
||||||
|
|
||||||
|
|
||||||
|
def textcaps_caption_eval(annotation_file, results_file):
|
||||||
|
|
||||||
|
# create coco object and coco_result object
|
||||||
|
anno = TextCap_Annotation(annotation_file)
|
||||||
|
result = TextCap_Result(results_file)
|
||||||
|
|
||||||
|
# create coco_eval object by taking coco and coco_result
|
||||||
|
text_eval = COCOEvalCap(anno, result)
|
||||||
|
|
||||||
|
# SPICE will take a few minutes the first time, but speeds up due to caching
|
||||||
|
text_eval.evaluate()
|
||||||
|
|
||||||
|
# print output evaluation scores
|
||||||
|
for metric, score in text_eval.eval.items():
|
||||||
|
print(f"{metric}: {score:.3f}")
|
||||||
|
|
||||||
|
return text_eval
|
@ -29,6 +29,7 @@ class Config:
|
|||||||
runner_config = self.build_runner_config(config)
|
runner_config = self.build_runner_config(config)
|
||||||
model_config = self.build_model_config(config, **user_config)
|
model_config = self.build_model_config(config, **user_config)
|
||||||
dataset_config = self.build_dataset_config(config)
|
dataset_config = self.build_dataset_config(config)
|
||||||
|
evaluation_dataset_config = self.build_evaluation_dataset_config(config)
|
||||||
|
|
||||||
# Validate the user-provided runner configuration
|
# Validate the user-provided runner configuration
|
||||||
# model and dataset configuration are supposed to be validated by the respective classes
|
# model and dataset configuration are supposed to be validated by the respective classes
|
||||||
@ -37,7 +38,7 @@ class Config:
|
|||||||
|
|
||||||
# Override the default configuration with user options.
|
# Override the default configuration with user options.
|
||||||
self.config = OmegaConf.merge(
|
self.config = OmegaConf.merge(
|
||||||
runner_config, model_config, dataset_config, user_config
|
runner_config, model_config, dataset_config, evaluation_dataset_config, user_config
|
||||||
)
|
)
|
||||||
|
|
||||||
def _validate_runner_config(self, runner_config):
|
def _validate_runner_config(self, runner_config):
|
||||||
@ -111,6 +112,29 @@ class Config:
|
|||||||
|
|
||||||
return dataset_config
|
return dataset_config
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def build_evaluation_dataset_config(config):
|
||||||
|
# from Minigpt-v2
|
||||||
|
datasets = config.get("evaluation_datasets", None)
|
||||||
|
# if datasets is None:
|
||||||
|
# raise KeyError(
|
||||||
|
# "Expecting 'datasets' as the root key for dataset configuration."
|
||||||
|
# )
|
||||||
|
|
||||||
|
dataset_config = OmegaConf.create()
|
||||||
|
|
||||||
|
if datasets is not None:
|
||||||
|
for dataset_name in datasets:
|
||||||
|
builder_cls = registry.get_builder_class(dataset_name)
|
||||||
|
|
||||||
|
# hierarchy override, customized config > default config
|
||||||
|
dataset_config = OmegaConf.merge(
|
||||||
|
dataset_config,
|
||||||
|
{"evaluation_datasets": {dataset_name: config["evaluation_datasets"][dataset_name]}},
|
||||||
|
)
|
||||||
|
|
||||||
|
return dataset_config
|
||||||
|
|
||||||
def _convert_to_dot_list(self, opts):
|
def _convert_to_dot_list(self, opts):
|
||||||
if opts is None:
|
if opts is None:
|
||||||
opts = []
|
opts = []
|
||||||
@ -136,6 +160,10 @@ class Config:
|
|||||||
def datasets_cfg(self):
|
def datasets_cfg(self):
|
||||||
return self.config.datasets
|
return self.config.datasets
|
||||||
|
|
||||||
|
@property
|
||||||
|
def evaluation_datasets_cfg(self):
|
||||||
|
return self.config.evaluation_datasets
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def model_cfg(self):
|
def model_cfg(self):
|
||||||
return self.config.model
|
return self.config.model
|
||||||
|
76
minigpt4/common/eval_utils.py
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
import argparse
|
||||||
|
import numpy as np
|
||||||
|
from nltk.translate.bleu_score import sentence_bleu
|
||||||
|
|
||||||
|
from minigpt4.common.registry import registry
|
||||||
|
from minigpt4.common.config import Config
|
||||||
|
|
||||||
|
# imports modules for registration
|
||||||
|
from minigpt4.datasets.builders import *
|
||||||
|
from minigpt4.models import *
|
||||||
|
from minigpt4.processors import *
|
||||||
|
from minigpt4.runners import *
|
||||||
|
from minigpt4.tasks import *
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def eval_parser():
|
||||||
|
parser = argparse.ArgumentParser(description="Demo")
|
||||||
|
parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
|
||||||
|
parser.add_argument("--name", type=str, default='A2', help="evaluation name")
|
||||||
|
parser.add_argument("--ckpt", type=str, help="path to configuration file.")
|
||||||
|
parser.add_argument("--eval_opt", type=str, default='all', help="path to configuration file.")
|
||||||
|
parser.add_argument("--max_new_tokens", type=int, default=10, help="max number of generated tokens")
|
||||||
|
parser.add_argument("--batch_size", type=int, default=32)
|
||||||
|
parser.add_argument("--lora_r", type=int, default=64, help="lora rank of the model")
|
||||||
|
parser.add_argument("--lora_alpha", type=int, default=16, help="lora alpha")
|
||||||
|
parser.add_argument(
|
||||||
|
"--options",
|
||||||
|
nargs="+",
|
||||||
|
help="override some settings in the used config, the key-value pair "
|
||||||
|
"in xxx=yyy format will be merged into config file (deprecate), "
|
||||||
|
"change to --cfg-options instead.",
|
||||||
|
)
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_texts(texts, conv_temp):
|
||||||
|
convs = [conv_temp.copy() for _ in range(len(texts))]
|
||||||
|
[conv.append_message(
|
||||||
|
conv.roles[0], '<Img><ImageHere></Img> {}'.format(text)) for conv, text in zip(convs, texts)]
|
||||||
|
[conv.append_message(conv.roles[1], None) for conv in convs]
|
||||||
|
texts = [conv.get_prompt() for conv in convs]
|
||||||
|
return texts
|
||||||
|
|
||||||
|
|
||||||
|
def init_model(args):
|
||||||
|
print('Initialization Model')
|
||||||
|
cfg = Config(args)
|
||||||
|
# cfg.model_cfg.ckpt = args.ckpt
|
||||||
|
# cfg.model_cfg.lora_r = args.lora_r
|
||||||
|
# cfg.model_cfg.lora_alpha = args.lora_alpha
|
||||||
|
|
||||||
|
model_config = cfg.model_cfg
|
||||||
|
model_cls = registry.get_model_class(model_config.arch)
|
||||||
|
model = model_cls.from_config(model_config).to('cuda:0')
|
||||||
|
|
||||||
|
# import pudb; pudb.set_trace()
|
||||||
|
key = list(cfg.datasets_cfg.keys())[0]
|
||||||
|
vis_processor_cfg = cfg.datasets_cfg.get(key).vis_processor.train
|
||||||
|
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
|
||||||
|
print('Initialization Finished')
|
||||||
|
return model, vis_processor
|
||||||
|
|
||||||
|
def computeIoU(bbox1, bbox2):
|
||||||
|
x1, y1, x2, y2 = bbox1
|
||||||
|
x3, y3, x4, y4 = bbox2
|
||||||
|
intersection_x1 = max(x1, x3)
|
||||||
|
intersection_y1 = max(y1, y3)
|
||||||
|
intersection_x2 = min(x2, x4)
|
||||||
|
intersection_y2 = min(y2, y4)
|
||||||
|
intersection_area = max(0, intersection_x2 - intersection_x1 + 1) * max(0, intersection_y2 - intersection_y1 + 1)
|
||||||
|
bbox1_area = (x2 - x1 + 1) * (y2 - y1 + 1)
|
||||||
|
bbox2_area = (x4 - x3 + 1) * (y4 - y3 + 1)
|
||||||
|
union_area = bbox1_area + bbox2_area - intersection_area
|
||||||
|
iou = intersection_area / union_area
|
||||||
|
return iou
|
@ -2,13 +2,14 @@
|
|||||||
Copyright (c) 2022, salesforce.com, inc.
|
Copyright (c) 2022, salesforce.com, inc.
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
SPDX-License-Identifier: BSD-3-Clause
|
SPDX-License-Identifier: BSD-3-Clause
|
||||||
For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from collections import defaultdict, deque
|
from collections import defaultdict, deque
|
||||||
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
@ -80,9 +81,10 @@ class SmoothedValue(object):
|
|||||||
|
|
||||||
|
|
||||||
class MetricLogger(object):
|
class MetricLogger(object):
|
||||||
def __init__(self, delimiter="\t"):
|
def __init__(self, delimiter="\t",writer: SummaryWriter=None):
|
||||||
self.meters = defaultdict(SmoothedValue)
|
self.meters = defaultdict(SmoothedValue)
|
||||||
self.delimiter = delimiter
|
self.delimiter = delimiter
|
||||||
|
self.writer = writer
|
||||||
|
|
||||||
def update(self, **kwargs):
|
def update(self, **kwargs):
|
||||||
for k, v in kwargs.items():
|
for k, v in kwargs.items():
|
||||||
@ -91,6 +93,10 @@ class MetricLogger(object):
|
|||||||
assert isinstance(v, (float, int))
|
assert isinstance(v, (float, int))
|
||||||
self.meters[k].update(v)
|
self.meters[k].update(v)
|
||||||
|
|
||||||
|
def update_writer(self, it):
|
||||||
|
for name, meter in self.meters.items():
|
||||||
|
self.writer.add_scalar(name, meter, )
|
||||||
|
|
||||||
def __getattr__(self, attr):
|
def __getattr__(self, attr):
|
||||||
if attr in self.meters:
|
if attr in self.meters:
|
||||||
return self.meters[attr]
|
return self.meters[attr]
|
||||||
|
8
minigpt4/common/vqa_tools/__init__.py
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
"""
|
||||||
|
Copyright (c) 2022, salesforce.com, inc.
|
||||||
|
All rights reserved.
|
||||||
|
SPDX-License-Identifier: BSD-3-Clause
|
||||||
|
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||||
|
"""
|
||||||
|
|
||||||
|
__author__ = "aagrawal"
|
211
minigpt4/common/vqa_tools/vqa.py
Normal file
@ -0,0 +1,211 @@
|
|||||||
|
"""
|
||||||
|
Copyright (c) 2022, salesforce.com, inc.
|
||||||
|
All rights reserved.
|
||||||
|
SPDX-License-Identifier: BSD-3-Clause
|
||||||
|
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||||
|
"""
|
||||||
|
|
||||||
|
__author__ = "aagrawal"
|
||||||
|
__version__ = "0.9"
|
||||||
|
|
||||||
|
# Interface for accessing the VQA dataset.
|
||||||
|
|
||||||
|
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
|
||||||
|
# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).
|
||||||
|
|
||||||
|
# The following functions are defined:
|
||||||
|
# VQA - VQA class that loads VQA annotation file and prepares data structures.
|
||||||
|
# getQuesIds - Get question ids that satisfy given filter conditions.
|
||||||
|
# getImgIds - Get image ids that satisfy given filter conditions.
|
||||||
|
# loadQA - Load questions and answers with the specified question ids.
|
||||||
|
# showQA - Display the specified questions and answers.
|
||||||
|
# loadRes - Load result file and create result object.
|
||||||
|
|
||||||
|
# Help on each function can be accessed by: "help(COCO.function)"
|
||||||
|
|
||||||
|
import json
|
||||||
|
import datetime
|
||||||
|
import copy
|
||||||
|
|
||||||
|
|
||||||
|
class VQA:
|
||||||
|
def __init__(self, annotation_file=None, question_file=None):
|
||||||
|
"""
|
||||||
|
Constructor of VQA helper class for reading and visualizing questions and answers.
|
||||||
|
:param annotation_file (str): location of VQA annotation file
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
|
# load dataset
|
||||||
|
self.dataset = {}
|
||||||
|
self.questions = {}
|
||||||
|
self.qa = {}
|
||||||
|
self.qqa = {}
|
||||||
|
self.imgToQA = {}
|
||||||
|
if not annotation_file == None and not question_file == None:
|
||||||
|
print("loading VQA annotations and questions into memory...")
|
||||||
|
time_t = datetime.datetime.utcnow()
|
||||||
|
dataset = json.load(open(annotation_file, "r"))
|
||||||
|
questions = json.load(open(question_file, "r"))
|
||||||
|
self.dataset = dataset
|
||||||
|
self.questions = questions
|
||||||
|
self.createIndex()
|
||||||
|
|
||||||
|
def createIndex(self):
|
||||||
|
# create index
|
||||||
|
print("creating index...")
|
||||||
|
imgToQA = {ann["image_id"]: [] for ann in self.dataset["annotations"]}
|
||||||
|
qa = {ann["question_id"]: [] for ann in self.dataset["annotations"]}
|
||||||
|
qqa = {ann["question_id"]: [] for ann in self.dataset["annotations"]}
|
||||||
|
for ann in self.dataset["annotations"]:
|
||||||
|
imgToQA[ann["image_id"]] += [ann]
|
||||||
|
qa[ann["question_id"]] = ann
|
||||||
|
for ques in self.questions["questions"]:
|
||||||
|
qqa[ques["question_id"]] = ques
|
||||||
|
print("index created!")
|
||||||
|
|
||||||
|
# create class members
|
||||||
|
self.qa = qa
|
||||||
|
self.qqa = qqa
|
||||||
|
self.imgToQA = imgToQA
|
||||||
|
|
||||||
|
def info(self):
|
||||||
|
"""
|
||||||
|
Print information about the VQA annotation file.
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
|
for key, value in self.datset["info"].items():
|
||||||
|
print("%s: %s" % (key, value))
|
||||||
|
|
||||||
|
def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
|
||||||
|
"""
|
||||||
|
Get question ids that satisfy given filter conditions. default skips that filter
|
||||||
|
:param imgIds (int array) : get question ids for given imgs
|
||||||
|
quesTypes (str array) : get question ids for given question types
|
||||||
|
ansTypes (str array) : get question ids for given answer types
|
||||||
|
:return: ids (int array) : integer array of question ids
|
||||||
|
"""
|
||||||
|
imgIds = imgIds if type(imgIds) == list else [imgIds]
|
||||||
|
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
|
||||||
|
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
|
||||||
|
|
||||||
|
if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
|
||||||
|
anns = self.dataset["annotations"]
|
||||||
|
else:
|
||||||
|
if not len(imgIds) == 0:
|
||||||
|
anns = sum(
|
||||||
|
[self.imgToQA[imgId] for imgId in imgIds if imgId in self.imgToQA],
|
||||||
|
[],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
anns = self.dataset["annotations"]
|
||||||
|
anns = (
|
||||||
|
anns
|
||||||
|
if len(quesTypes) == 0
|
||||||
|
else [ann for ann in anns if ann["question_type"] in quesTypes]
|
||||||
|
)
|
||||||
|
anns = (
|
||||||
|
anns
|
||||||
|
if len(ansTypes) == 0
|
||||||
|
else [ann for ann in anns if ann["answer_type"] in ansTypes]
|
||||||
|
)
|
||||||
|
ids = [ann["question_id"] for ann in anns]
|
||||||
|
return ids
|
||||||
|
|
||||||
|
def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
|
||||||
|
"""
|
||||||
|
Get image ids that satisfy given filter conditions. default skips that filter
|
||||||
|
:param quesIds (int array) : get image ids for given question ids
|
||||||
|
quesTypes (str array) : get image ids for given question types
|
||||||
|
ansTypes (str array) : get image ids for given answer types
|
||||||
|
:return: ids (int array) : integer array of image ids
|
||||||
|
"""
|
||||||
|
quesIds = quesIds if type(quesIds) == list else [quesIds]
|
||||||
|
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
|
||||||
|
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
|
||||||
|
|
||||||
|
if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
|
||||||
|
anns = self.dataset["annotations"]
|
||||||
|
else:
|
||||||
|
if not len(quesIds) == 0:
|
||||||
|
anns = sum(
|
||||||
|
[self.qa[quesId] for quesId in quesIds if quesId in self.qa], []
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
anns = self.dataset["annotations"]
|
||||||
|
anns = (
|
||||||
|
anns
|
||||||
|
if len(quesTypes) == 0
|
||||||
|
else [ann for ann in anns if ann["question_type"] in quesTypes]
|
||||||
|
)
|
||||||
|
anns = (
|
||||||
|
anns
|
||||||
|
if len(ansTypes) == 0
|
||||||
|
else [ann for ann in anns if ann["answer_type"] in ansTypes]
|
||||||
|
)
|
||||||
|
ids = [ann["image_id"] for ann in anns]
|
||||||
|
return ids
|
||||||
|
|
||||||
|
def loadQA(self, ids=[]):
|
||||||
|
"""
|
||||||
|
Load questions and answers with the specified question ids.
|
||||||
|
:param ids (int array) : integer ids specifying question ids
|
||||||
|
:return: qa (object array) : loaded qa objects
|
||||||
|
"""
|
||||||
|
if type(ids) == list:
|
||||||
|
return [self.qa[id] for id in ids]
|
||||||
|
elif type(ids) == int:
|
||||||
|
return [self.qa[ids]]
|
||||||
|
|
||||||
|
def showQA(self, anns):
|
||||||
|
"""
|
||||||
|
Display the specified annotations.
|
||||||
|
:param anns (array of object): annotations to display
|
||||||
|
:return: None
|
||||||
|
"""
|
||||||
|
if len(anns) == 0:
|
||||||
|
return 0
|
||||||
|
for ann in anns:
|
||||||
|
quesId = ann["question_id"]
|
||||||
|
print("Question: %s" % (self.qqa[quesId]["question"]))
|
||||||
|
for ans in ann["answers"]:
|
||||||
|
print("Answer %d: %s" % (ans["answer_id"], ans["answer"]))
|
||||||
|
|
||||||
|
def loadRes(self, resFile, quesFile):
|
||||||
|
"""
|
||||||
|
Load result file and return a result object.
|
||||||
|
:param resFile (str) : file name of result file
|
||||||
|
:return: res (obj) : result api object
|
||||||
|
"""
|
||||||
|
res = VQA()
|
||||||
|
res.questions = json.load(open(quesFile))
|
||||||
|
res.dataset["info"] = copy.deepcopy(self.questions["info"])
|
||||||
|
res.dataset["task_type"] = copy.deepcopy(self.questions["task_type"])
|
||||||
|
res.dataset["data_type"] = copy.deepcopy(self.questions["data_type"])
|
||||||
|
res.dataset["data_subtype"] = copy.deepcopy(self.questions["data_subtype"])
|
||||||
|
res.dataset["license"] = copy.deepcopy(self.questions["license"])
|
||||||
|
|
||||||
|
print("Loading and preparing results... ")
|
||||||
|
time_t = datetime.datetime.utcnow()
|
||||||
|
anns = json.load(open(resFile))
|
||||||
|
assert type(anns) == list, "results is not an array of objects"
|
||||||
|
annsQuesIds = [ann["question_id"] for ann in anns]
|
||||||
|
assert set(annsQuesIds) == set(
|
||||||
|
self.getQuesIds()
|
||||||
|
), "Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file."
|
||||||
|
for ann in anns:
|
||||||
|
quesId = ann["question_id"]
|
||||||
|
if res.dataset["task_type"] == "Multiple Choice":
|
||||||
|
assert (
|
||||||
|
ann["answer"] in self.qqa[quesId]["multiple_choices"]
|
||||||
|
), "predicted answer is not one of the multiple choices"
|
||||||
|
qaAnn = self.qa[quesId]
|
||||||
|
ann["image_id"] = qaAnn["image_id"]
|
||||||
|
ann["question_type"] = qaAnn["question_type"]
|
||||||
|
ann["answer_type"] = qaAnn["answer_type"]
|
||||||
|
print(
|
||||||
|
"DONE (t=%0.2fs)" % ((datetime.datetime.utcnow() - time_t).total_seconds())
|
||||||
|
)
|
||||||
|
|
||||||
|
res.dataset["annotations"] = anns
|
||||||
|
res.createIndex()
|
||||||
|
return res
|
324
minigpt4/common/vqa_tools/vqa_eval.py
Normal file
@ -0,0 +1,324 @@
|
|||||||
|
"""
|
||||||
|
Copyright (c) 2022, salesforce.com, inc.
|
||||||
|
All rights reserved.
|
||||||
|
SPDX-License-Identifier: BSD-3-Clause
|
||||||
|
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||||
|
"""
|
||||||
|
|
||||||
|
# coding=utf-8
|
||||||
|
|
||||||
|
__author__ = "aagrawal"
|
||||||
|
|
||||||
|
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
|
||||||
|
# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class VQAEval:
|
||||||
|
def __init__(self, vqa=None, vqaRes=None, n=2):
|
||||||
|
self.n = n
|
||||||
|
self.accuracy = {}
|
||||||
|
self.evalQA = {}
|
||||||
|
self.evalQuesType = {}
|
||||||
|
self.evalAnsType = {}
|
||||||
|
self.vqa = vqa # annotation
|
||||||
|
self.vqaRes = vqaRes # predict answers
|
||||||
|
if vqa is not None:
|
||||||
|
self.params = {"question_id": vqa.getQuesIds()}
|
||||||
|
self.contractions = {
|
||||||
|
"aint": "ain't",
|
||||||
|
"arent": "aren't",
|
||||||
|
"cant": "can't",
|
||||||
|
"couldve": "could've",
|
||||||
|
"couldnt": "couldn't",
|
||||||
|
"couldn'tve": "couldn't've",
|
||||||
|
"couldnt've": "couldn't've",
|
||||||
|
"didnt": "didn't",
|
||||||
|
"doesnt": "doesn't",
|
||||||
|
"dont": "don't",
|
||||||
|
"hadnt": "hadn't",
|
||||||
|
"hadnt've": "hadn't've",
|
||||||
|
"hadn'tve": "hadn't've",
|
||||||
|
"hasnt": "hasn't",
|
||||||
|
"havent": "haven't",
|
||||||
|
"hed": "he'd",
|
||||||
|
"hed've": "he'd've",
|
||||||
|
"he'dve": "he'd've",
|
||||||
|
"hes": "he's",
|
||||||
|
"howd": "how'd",
|
||||||
|
"howll": "how'll",
|
||||||
|
"hows": "how's",
|
||||||
|
"Id've": "I'd've",
|
||||||
|
"I'dve": "I'd've",
|
||||||
|
"Im": "I'm",
|
||||||
|
"Ive": "I've",
|
||||||
|
"isnt": "isn't",
|
||||||
|
"itd": "it'd",
|
||||||
|
"itd've": "it'd've",
|
||||||
|
"it'dve": "it'd've",
|
||||||
|
"itll": "it'll",
|
||||||
|
"let's": "let's",
|
||||||
|
"maam": "ma'am",
|
||||||
|
"mightnt": "mightn't",
|
||||||
|
"mightnt've": "mightn't've",
|
||||||
|
"mightn'tve": "mightn't've",
|
||||||
|
"mightve": "might've",
|
||||||
|
"mustnt": "mustn't",
|
||||||
|
"mustve": "must've",
|
||||||
|
"neednt": "needn't",
|
||||||
|
"notve": "not've",
|
||||||
|
"oclock": "o'clock",
|
||||||
|
"oughtnt": "oughtn't",
|
||||||
|
"ow's'at": "'ow's'at",
|
||||||
|
"'ows'at": "'ow's'at",
|
||||||
|
"'ow'sat": "'ow's'at",
|
||||||
|
"shant": "shan't",
|
||||||
|
"shed've": "she'd've",
|
||||||
|
"she'dve": "she'd've",
|
||||||
|
"she's": "she's",
|
||||||
|
"shouldve": "should've",
|
||||||
|
"shouldnt": "shouldn't",
|
||||||
|
"shouldnt've": "shouldn't've",
|
||||||
|
"shouldn'tve": "shouldn't've",
|
||||||
|
"somebody'd": "somebodyd",
|
||||||
|
"somebodyd've": "somebody'd've",
|
||||||
|
"somebody'dve": "somebody'd've",
|
||||||
|
"somebodyll": "somebody'll",
|
||||||
|
"somebodys": "somebody's",
|
||||||
|
"someoned": "someone'd",
|
||||||
|
"someoned've": "someone'd've",
|
||||||
|
"someone'dve": "someone'd've",
|
||||||
|
"someonell": "someone'll",
|
||||||
|
"someones": "someone's",
|
||||||
|
"somethingd": "something'd",
|
||||||
|
"somethingd've": "something'd've",
|
||||||
|
"something'dve": "something'd've",
|
||||||
|
"somethingll": "something'll",
|
||||||
|
"thats": "that's",
|
||||||
|
"thered": "there'd",
|
||||||
|
"thered've": "there'd've",
|
||||||
|
"there'dve": "there'd've",
|
||||||
|
"therere": "there're",
|
||||||
|
"theres": "there's",
|
||||||
|
"theyd": "they'd",
|
||||||
|
"theyd've": "they'd've",
|
||||||
|
"they'dve": "they'd've",
|
||||||
|
"theyll": "they'll",
|
||||||
|
"theyre": "they're",
|
||||||
|
"theyve": "they've",
|
||||||
|
"twas": "'twas",
|
||||||
|
"wasnt": "wasn't",
|
||||||
|
"wed've": "we'd've",
|
||||||
|
"we'dve": "we'd've",
|
||||||
|
"weve": "we've",
|
||||||
|
"werent": "weren't",
|
||||||
|
"whatll": "what'll",
|
||||||
|
"whatre": "what're",
|
||||||
|
"whats": "what's",
|
||||||
|
"whatve": "what've",
|
||||||
|
"whens": "when's",
|
||||||
|
"whered": "where'd",
|
||||||
|
"wheres": "where's",
|
||||||
|
"whereve": "where've",
|
||||||
|
"whod": "who'd",
|
||||||
|
"whod've": "who'd've",
|
||||||
|
"who'dve": "who'd've",
|
||||||
|
"wholl": "who'll",
|
||||||
|
"whos": "who's",
|
||||||
|
"whove": "who've",
|
||||||
|
"whyll": "why'll",
|
||||||
|
"whyre": "why're",
|
||||||
|
"whys": "why's",
|
||||||
|
"wont": "won't",
|
||||||
|
"wouldve": "would've",
|
||||||
|
"wouldnt": "wouldn't",
|
||||||
|
"wouldnt've": "wouldn't've",
|
||||||
|
"wouldn'tve": "wouldn't've",
|
||||||
|
"yall": "y'all",
|
||||||
|
"yall'll": "y'all'll",
|
||||||
|
"y'allll": "y'all'll",
|
||||||
|
"yall'd've": "y'all'd've",
|
||||||
|
"y'alld've": "y'all'd've",
|
||||||
|
"y'all'dve": "y'all'd've",
|
||||||
|
"youd": "you'd",
|
||||||
|
"youd've": "you'd've",
|
||||||
|
"you'dve": "you'd've",
|
||||||
|
"youll": "you'll",
|
||||||
|
"youre": "you're",
|
||||||
|
"youve": "you've",
|
||||||
|
}
|
||||||
|
self.manualMap = {
|
||||||
|
"none": "0",
|
||||||
|
"zero": "0",
|
||||||
|
"one": "1",
|
||||||
|
"two": "2",
|
||||||
|
"three": "3",
|
||||||
|
"four": "4",
|
||||||
|
"five": "5",
|
||||||
|
"six": "6",
|
||||||
|
"seven": "7",
|
||||||
|
"eight": "8",
|
||||||
|
"nine": "9",
|
||||||
|
"ten": "10",
|
||||||
|
}
|
||||||
|
self.articles = ["a", "an", "the"]
|
||||||
|
|
||||||
|
self.periodStrip = re.compile("(?!<=\d)(\.)(?!\d)")
|
||||||
|
self.commaStrip = re.compile("(\d)(,)(\d)")
|
||||||
|
self.punct = [
|
||||||
|
";",
|
||||||
|
r"/",
|
||||||
|
"[",
|
||||||
|
"]",
|
||||||
|
'"',
|
||||||
|
"{",
|
||||||
|
"}",
|
||||||
|
"(",
|
||||||
|
")",
|
||||||
|
"=",
|
||||||
|
"+",
|
||||||
|
"\\",
|
||||||
|
"_",
|
||||||
|
"-",
|
||||||
|
">",
|
||||||
|
"<",
|
||||||
|
"@",
|
||||||
|
"`",
|
||||||
|
",",
|
||||||
|
"?",
|
||||||
|
"!",
|
||||||
|
]
|
||||||
|
|
||||||
|
def evaluate(self, quesIds=None):
|
||||||
|
if quesIds == None:
|
||||||
|
quesIds = [quesId for quesId in self.params["question_id"]]
|
||||||
|
gts = {}
|
||||||
|
res = {}
|
||||||
|
for quesId in quesIds:
|
||||||
|
gts[quesId] = self.vqa.qa[quesId]
|
||||||
|
res[quesId] = self.vqaRes.qa[quesId]
|
||||||
|
|
||||||
|
# =================================================
|
||||||
|
# Compute accuracy
|
||||||
|
# =================================================
|
||||||
|
accQA = []
|
||||||
|
accQuesType = {}
|
||||||
|
accAnsType = {}
|
||||||
|
print("computing accuracy")
|
||||||
|
step = 0
|
||||||
|
for quesId in quesIds:
|
||||||
|
resAns = res[quesId]["answer"]
|
||||||
|
resAns = resAns.replace("\n", " ")
|
||||||
|
resAns = resAns.replace("\t", " ")
|
||||||
|
resAns = resAns.strip()
|
||||||
|
resAns = self.processPunctuation(resAns)
|
||||||
|
resAns = self.processDigitArticle(resAns)
|
||||||
|
gtAcc = []
|
||||||
|
gtAnswers = [ans["answer"] for ans in gts[quesId]["answers"]]
|
||||||
|
if len(set(gtAnswers)) > 1:
|
||||||
|
for ansDic in gts[quesId]["answers"]:
|
||||||
|
ansDic["answer"] = self.processPunctuation(ansDic["answer"])
|
||||||
|
for gtAnsDatum in gts[quesId]["answers"]:
|
||||||
|
otherGTAns = [
|
||||||
|
item for item in gts[quesId]["answers"] if item != gtAnsDatum
|
||||||
|
]
|
||||||
|
matchingAns = [item for item in otherGTAns if item["answer"] == resAns]
|
||||||
|
acc = min(1, float(len(matchingAns)) / 3)
|
||||||
|
gtAcc.append(acc)
|
||||||
|
quesType = gts[quesId]["question_type"]
|
||||||
|
ansType = gts[quesId]["answer_type"]
|
||||||
|
avgGTAcc = float(sum(gtAcc)) / len(gtAcc)
|
||||||
|
accQA.append(avgGTAcc)
|
||||||
|
if quesType not in accQuesType:
|
||||||
|
accQuesType[quesType] = []
|
||||||
|
accQuesType[quesType].append(avgGTAcc)
|
||||||
|
if ansType not in accAnsType:
|
||||||
|
accAnsType[ansType] = []
|
||||||
|
accAnsType[ansType].append(avgGTAcc)
|
||||||
|
self.setEvalQA(quesId, avgGTAcc)
|
||||||
|
self.setEvalQuesType(quesId, quesType, avgGTAcc)
|
||||||
|
self.setEvalAnsType(quesId, ansType, avgGTAcc)
|
||||||
|
if step % 100 == 0:
|
||||||
|
self.updateProgress(step / float(len(quesIds)))
|
||||||
|
step = step + 1
|
||||||
|
|
||||||
|
self.setAccuracy(accQA, accQuesType, accAnsType)
|
||||||
|
print("Done computing accuracy")
|
||||||
|
|
||||||
|
def processPunctuation(self, inText):
|
||||||
|
outText = inText
|
||||||
|
for p in self.punct:
|
||||||
|
if (p + " " in inText or " " + p in inText) or (
|
||||||
|
re.search(self.commaStrip, inText) != None
|
||||||
|
):
|
||||||
|
outText = outText.replace(p, "")
|
||||||
|
else:
|
||||||
|
outText = outText.replace(p, " ")
|
||||||
|
outText = self.periodStrip.sub("", outText, re.UNICODE)
|
||||||
|
return outText
|
||||||
|
|
||||||
|
def processDigitArticle(self, inText):
|
||||||
|
outText = []
|
||||||
|
tempText = inText.lower().split()
|
||||||
|
for word in tempText:
|
||||||
|
word = self.manualMap.setdefault(word, word)
|
||||||
|
if word not in self.articles:
|
||||||
|
outText.append(word)
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
for wordId, word in enumerate(outText):
|
||||||
|
if word in self.contractions:
|
||||||
|
outText[wordId] = self.contractions[word]
|
||||||
|
outText = " ".join(outText)
|
||||||
|
return outText
|
||||||
|
|
||||||
|
def setAccuracy(self, accQA, accQuesType, accAnsType):
|
||||||
|
self.accuracy["overall"] = round(100 * float(sum(accQA)) / len(accQA), self.n)
|
||||||
|
self.accuracy["perQuestionType"] = {
|
||||||
|
quesType: round(
|
||||||
|
100 * float(sum(accQuesType[quesType])) / len(accQuesType[quesType]),
|
||||||
|
self.n,
|
||||||
|
)
|
||||||
|
for quesType in accQuesType
|
||||||
|
}
|
||||||
|
self.accuracy["perAnswerType"] = {
|
||||||
|
ansType: round(
|
||||||
|
100 * float(sum(accAnsType[ansType])) / len(accAnsType[ansType]), self.n
|
||||||
|
)
|
||||||
|
for ansType in accAnsType
|
||||||
|
}
|
||||||
|
|
||||||
|
def setEvalQA(self, quesId, acc):
|
||||||
|
self.evalQA[quesId] = round(100 * acc, self.n)
|
||||||
|
|
||||||
|
def setEvalQuesType(self, quesId, quesType, acc):
|
||||||
|
if quesType not in self.evalQuesType:
|
||||||
|
self.evalQuesType[quesType] = {}
|
||||||
|
self.evalQuesType[quesType][quesId] = round(100 * acc, self.n)
|
||||||
|
|
||||||
|
def setEvalAnsType(self, quesId, ansType, acc):
|
||||||
|
if ansType not in self.evalAnsType:
|
||||||
|
self.evalAnsType[ansType] = {}
|
||||||
|
self.evalAnsType[ansType][quesId] = round(100 * acc, self.n)
|
||||||
|
|
||||||
|
def updateProgress(self, progress):
|
||||||
|
barLength = 20
|
||||||
|
status = ""
|
||||||
|
if isinstance(progress, int):
|
||||||
|
progress = float(progress)
|
||||||
|
if not isinstance(progress, float):
|
||||||
|
progress = 0
|
||||||
|
status = "error: progress var must be float\r\n"
|
||||||
|
if progress < 0:
|
||||||
|
progress = 0
|
||||||
|
status = "Halt...\r\n"
|
||||||
|
if progress >= 1:
|
||||||
|
progress = 1
|
||||||
|
status = "Done...\r\n"
|
||||||
|
block = int(round(barLength * progress))
|
||||||
|
text = "\rFinshed Percent: [{0}] {1}% {2}".format(
|
||||||
|
"#" * block + "-" * (barLength - block), int(progress * 100), status
|
||||||
|
)
|
||||||
|
sys.stdout.write(text)
|
||||||
|
sys.stdout.flush()
|
@ -15,6 +15,16 @@ datasets:
|
|||||||
url:
|
url:
|
||||||
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
|
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
|
||||||
storage:
|
storage:
|
||||||
- /path/to/aokvqa_v1p0_train.json
|
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_train.json
|
||||||
|
val:
|
||||||
|
url:
|
||||||
|
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
|
||||||
|
storage:
|
||||||
|
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
|
||||||
|
test:
|
||||||
|
url:
|
||||||
|
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
|
||||||
|
storage:
|
||||||
|
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
|
||||||
images:
|
images:
|
||||||
storage: /path/to/coco/images
|
storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
|
@ -14,8 +14,18 @@ datasets:
|
|||||||
annotations:
|
annotations:
|
||||||
train:
|
train:
|
||||||
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
|
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
|
||||||
md5: aa31ac474cf6250ebb81d18348a07ed8
|
# md5: aa31ac474cf6250ebb81d18348a07ed8
|
||||||
storage: /path/to/coco_caption/coco_karpathy_train.json
|
storage:
|
||||||
images:
|
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_train.json
|
||||||
storage: /path/to/coco/images
|
val:
|
||||||
|
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
|
||||||
|
storage:
|
||||||
|
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
|
||||||
|
test:
|
||||||
|
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
|
||||||
|
storage:
|
||||||
|
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
|
||||||
|
|
||||||
|
images:
|
||||||
|
storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
|
||||||
|
|
||||||
|
minigpt4/configs/datasets/coco/caption_eval.yaml (new file, 26 lines)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_caption: # name of the dataset builder
    # dataset_card: dataset_card/coco_caption.md
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json

      images:
        storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
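As a quick orientation aid, a config in this shape can be loaded and inspected with OmegaConf; this snippet is only illustrative, the repository actually consumes these files through the dataset builders:

```python
from omegaconf import OmegaConf

# Minimal sketch: inspect a LAVIS/MiniGPT-4 style dataset config.
cfg = OmegaConf.load("minigpt4/configs/datasets/coco/caption_eval.yaml")
build_info = cfg.datasets.coco_caption.build_info
for split, ann in build_info.annotations.items():
    # Each split lists remote URLs plus the local storage paths used at load time.
    print(split, list(ann.get("storage", [])))
print("images root:", build_info.images.storage)
```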
@ -13,12 +13,36 @@ datasets:
     annotations:
       train:
         url:
-          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
-          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json # 443752
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json # 214352
         storage:
-          - /path/to/vqav2/vqa_train.json
-          - /path/to/vqav2/vqa_val.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val.json
+          # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_train_part100.json
+          # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_val_part100.json
+      val:
+        url:
+          # TODO make this order insensitive
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json
+        storage:
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val_eval.json
+          # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_val_eval_part100.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_OpenEnded_mscoco_val2014_questions.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_mscoco_val2014_annotations.json
+      test:
+        url:
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_test.json
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
+        storage:
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_test.json
+          # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_test_part100.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
 
     images:
-      storage: /path/to/coco/images
+      storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
minigpt4/configs/datasets/coco/defaults_vqa_eval.yaml (new executable file, 39 lines)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:

      annotations:
        val:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val_eval.json
            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_OpenEnded_mscoco_val2014_questions.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_mscoco_val2014_annotations.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_test.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_test.json
            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json

      images:
        storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
minigpt4/configs/datasets/coco/defaults_vqa_part.yaml (new executable file, 48 lines)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:

      annotations:
        train:
          url:
            # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json # 443752
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json # 214352
          storage:
            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val.json
            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_train_part100.json
            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_val_part100.json
        val:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val_eval.json
            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_val_eval_part100.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_OpenEnded_mscoco_val2014_questions.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_mscoco_val2014_annotations.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_test.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_test.json
            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/test_part/vqa_test_part100.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json

      images:
        storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
@ -2,7 +2,7 @@ datasets:
   invrefcoco:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: invrefcoco
       splitBy: unc
@ -2,7 +2,7 @@ datasets:
   invrefcocog:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: invrefcocog
       splitBy: umd
@ -2,7 +2,7 @@ datasets:
   invrefcocop:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: invrefcoco+
       splitBy: unc
@ -2,7 +2,7 @@ datasets:
   refcoco:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: refcoco
       splitBy: unc
@ -2,7 +2,7 @@ datasets:
   refcocog:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: refcocog
       splitBy: umd
@ -2,7 +2,7 @@ datasets:
   refcocop:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
       ann_path: /path/to/refcoco_annotations
       dataset: refcoco+
       splitBy: unc
minigpt4/configs/datasets/gqa/balanced_sft_raw.yaml (new file, 30 lines)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/train_balanced_questions.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/testdev_balanced_questions.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/test_balanced_questions.json
      images:
        storage: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images/
@ -11,11 +11,15 @@ datasets:
     build_info:
       # Be careful not to append minus sign (-) before split to avoid itemizing
       annotations:
-        train:
+        val:
           url:
-            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
           storage:
-            - /path/to/gqa/train_balanced_questions.json
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/testdev_balanced_questions.json
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/test_balanced_questions.json
       images:
-        storage: /path/to/gqa/images
+        storage: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images/
minigpt4/configs/datasets/gqa/balanced_sft_raw_part.yaml (new file, 30 lines)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  gqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/train_balanced_questions_90k.json
        val:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/testdev_balanced_questions.json
        test:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/test_balanced_questions.json
      images:
        storage: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images/
@ -3,5 +3,6 @@ datasets:
   llava_conversation:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
-      ann_path: /path/to/llava/conversation_58k.json
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
+      # ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/multimodal-sft/llava_150k/en/conversation_58k.json
+      ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/multimodal-sft/llava_150k/en/llava_conver_single_turn_257k_clean_v2.json
@ -2,5 +2,5 @@ datasets:
   llava_detail:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
-      ann_path: /path/to/llava/detail_23k.json
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
+      ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/multimodal-sft/llava_150k/en/detail_23k.json
minigpt4/configs/datasets/llava/mix.yaml (new executable file, 12 lines)
datasets:

  llava_mix:
    data_type: images
    build_info:
      ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/llava_v1_5_mix665k/llava_v1_5_mix665k.json
      image_path_coco: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
      image_path_gqa: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images
      image_path_ocr: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/images
      image_path_text: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/train_images
      # image_path_vg: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VG
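The `image_path_*` keys give one image root per source; a mixed dataset can then resolve each sample against the right root. A rough sketch of that resolution (the `image` field format and the source prefixes are assumptions, not taken from `LlavaMixDataset` itself):

```python
import os

# Hypothetical resolution of a per-sample image path against per-source roots.
def resolve_image_path(sample, vis_roots):
    # sample["image"] is assumed to look like "coco/xxx.jpg" or "gqa/xxx.jpg".
    source, rel_path = sample["image"].split("/", 1)
    return os.path.join(vis_roots[source], rel_path)

vis_roots = {
    "coco": "/mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014",
    "gqa": "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images",
    "ocr": "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/images",
    "text": "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/train_images",
}
```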
minigpt4/configs/datasets/llava/pretrain_cap.yaml (new executable file, 6 lines)
datasets:
  llava_pretrain:
    data_type: images
    build_info:
      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/llava-cc3m-595k/images
      ann_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/llava-cc3m-595k/chat.json
@ -3,5 +3,5 @@ datasets:
   llava_reason:
     data_type: images
     build_info:
-      image_path: /path/to/coco/images
-      ann_path: /path/to/llava/complex_reasoning_77k.json
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
+      ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/multimodal-sft/llava_150k/en/complex_reasoning_77k.json
minigpt4/configs/datasets/mix_vqa/mix_vqa.yaml (new executable file, 10 lines)
datasets:

  llava_mix:
    data_type: images
    build_info:
      ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/llava_v1_5_mix665k/mix_coco_gqa_162k.json
      image_path_coco: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
      image_path_gqa: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images
      image_path_ocr: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/images
      image_path_text: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/train_images
@ -3,5 +3,5 @@ datasets:
     data_type: images
     build_info:
 
-      image_path: /path/to/coco/images
-      ann_path: /path/to/multitask_conversation/multi_task_conversation.json
+      image_path: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014
+      ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/minigptv2_llava_multitask_conv/multitask_conversation.json
@ -2,4 +2,4 @@ datasets:
   unnatural_instruction:
     data_type: text
     build_info:
-      ann_path: /path/to/unnatural_instructions/filtered_unnatural_instruction.json
+      ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/unnatural_instructions/filtered_unnatural_instruction.json
@ -2,5 +2,5 @@ datasets:
   ocrvqa:
     data_type: images
     build_info:
-      image_path: /path/to/ocrvqa/images
-      ann_path: /path/to/ocrvqa/dataset.json
+      image_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/image # 207572
+      ann_path: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/dataset.json
@ -15,7 +15,38 @@ datasets:
         url:
           # TODO make this order insensitive
           - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
+          # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
+          # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
         storage:
-          - /path/to/okvqa/okvqa_train.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train.json
+          # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train_part100.json
+          # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_train2014_questions.json
+          # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_train2014_annotations.json
+      val:
+        url:
+          # TODO make this order insensitive
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
+        storage:
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json
+          # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval_part100.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_val2014_annotations.json
+      test:
+        url:
+          # TODO make this order insensitive
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
+          - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
+        storage:
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json
+          # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval_part100.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json
+          - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_val2014_annotations.json
       images:
-        storage: /path/to/coco/images
+        storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
minigpt4/configs/datasets/okvqa/eval.yaml (new executable file, 42 lines)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  ok_vqa:
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json
            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_val2014_annotations.json
        test:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json
            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train.json
            # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval_part100.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_val2014_annotations.json
      images:
        storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
@ -3,7 +3,15 @@ datasets:
     data_type: images
 
     build_info:
-      image_path: /path/to/textcaps/train_images
-      ann_path: /path/to/textcaps/TextCaps_0.1_train.json
+      annotations:
+        train:
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextCap/TextCaps_0.1_train.json
+        val:
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextCap/TextCaps_0.1_val.json
+        test:
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextCap/TextCaps_0.1_test.json
+      images:
+        storage: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA
minigpt4/configs/datasets/textvqa/vqa.yaml (new executable file, 17 lines)
datasets:
  text_vqa:
    data_type: images

    build_info:
      annotations:
        train:
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/TextVQA_0.5.1_train.json
        val:
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/TextVQA_0.5.1_val.json
        test:
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/TextVQA_0.5.1_test.json
      images:
        storage: /mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA
@ -0,0 +1,59 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_t5_instruct_pro_moe
  model_type: flant5xxl
  load_finetuned: False
  load_pretrained: True
  vit_model: eva_clip_g
  pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2-flant5-xxl/blip2_pretrained_flant5xxl.pth"
  finetuned: ""

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"

  # Q-Former
  num_query_token: 32
  qformer_text_input: True

  # T5
  t5_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/google-flan-t5-xxl"
  prompt: ""
  max_txt_len: 256
  max_output_txt_len: 256

  # freeze
  freeze_vit: True
  freeze_llm: True
  freeze_qformer: False
  freeze_t5_proj: False

  # moe
  moe_position: "pre" # post (position to insert PromptMoE Part)
  embed_extract: "blip2_pretrain" # t5, random (way to extract embeddings of task instruction if moe_position is pre)
  repeat_to_init_qt_candidates: True
  num_qt_candidates: 20
  moe_topk: 2
  eval_gate_save: False
  train_gate_save: False
  gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/flant5xxl/prompt_moe/llava_st_257k_raw_train_qf_train_qt_linear_gate_textt5_20ex_3loss_textinqf_epo3_1012/"

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 224
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
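The `# moe` block above is the part that configures the PromptMoE routing: a gate scores `num_qt_candidates` candidate query-token prompts from an instruction embedding and keeps the `moe_topk` best. The following is only a minimal sketch of that kind of top-k prompt routing under these settings; the class and tensor names are illustrative, not the repository's actual module:

```python
import torch
import torch.nn as nn

class PromptGate(nn.Module):
    """Sketch of top-k prompt routing: score candidate query-token sets from an
    instruction embedding and mix the top-k of them (assumed linear gate)."""

    def __init__(self, embed_dim=768, num_qt_candidates=20, topk=2):
        super().__init__()
        self.gate = nn.Linear(embed_dim, num_qt_candidates)
        self.topk = topk

    def forward(self, instruction_embed, qt_candidates):
        # instruction_embed: (batch, embed_dim); qt_candidates: (num_candidates, num_query_token, hidden)
        logits = self.gate(instruction_embed)                          # (batch, num_candidates)
        weights, idx = torch.topk(logits.softmax(dim=-1), self.topk, dim=-1)
        weights = weights / weights.sum(dim=-1, keepdim=True)          # renormalize over kept candidates
        picked = qt_candidates[idx]                                    # (batch, topk, num_query_token, hidden)
        return (weights[..., None, None] * picked).sum(dim=1)          # mixed query tokens per sample
```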
@ -0,0 +1,56 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_t5_qformer_moe
  model_type: flant5xxl
  load_finetuned: False
  load_pretrained: True
  vit_model: eva_clip_g
  pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2-flant5-xxl/blip2_pretrained_flant5xxl.pth"
  finetuned: ""
  q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2-flant5-xxl/blip2_pretrained_flant5xxl.pth"

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"

  # Q-Former
  num_query_token: 32
  qformer_text_input: True

  # T5
  t5_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/google-flan-t5-xxl"
  prompt: ""
  max_txt_len: 256
  max_output_txt_len: 256

  # freeze
  freeze_vit: True
  freeze_llm: True
  freeze_qformer: False
  freeze_t5_proj: False

  # moe
  moebert_expert_num: 5
  moebert_route_method: "gate-sentence"
  moebert_load_balance: 0.1
  moe_topk: 2

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 224
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
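`moebert_load_balance: 0.1` weights an auxiliary balancing term that keeps the `gate-sentence` router from collapsing onto a few of the 5 experts. A generic Switch-Transformer-style formulation of such a loss is sketched below (an assumption for illustration, not necessarily the exact loss in this repo):

```python
import torch
import torch.nn.functional as F

def load_balance_loss(gate_logits, assigned_expert, num_experts):
    """Generic balance loss: product of each expert's routed fraction and mean gate prob.
    gate_logits: (tokens, num_experts); assigned_expert: (tokens,) hard assignments."""
    probs = F.softmax(gate_logits, dim=-1)                                   # soft gate probabilities
    mean_prob = probs.mean(dim=0)                                            # average prob per expert
    frac = torch.bincount(assigned_expert, minlength=num_experts).float() / assigned_expert.numel()
    return num_experts * torch.sum(frac * mean_prob)                         # equals 1 when perfectly balanced
```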
minigpt4/configs/models/blip2/blip2_instruct_vicuna7b.yaml (new file, 44 lines)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: instruct_vicuna7b
  load_finetuned: False
  load_pretrained: True

  pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/instruct_blip_vicuna7b_trimmed/instruct_blip_vicuna7b_trimmed.pth"
  finetuned: ""

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"
  freeze_vit: True

  # Q-Former
  num_query_token: 32

  # path to Vicuna checkpoint
  llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"

  # generation configs
  prompt: ""


preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 224
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
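Model configs like this one are resolved through the registry: the `arch` key selects a registered model class, which is then built with `from_config`. A rough sketch of that flow (helper names follow the LAVIS-style API used here, but the exact call sites in the repo's runner may differ):

```python
from omegaconf import OmegaConf
from minigpt4.common.registry import registry

# Illustrative only: load a model config and build the registered architecture.
cfg = OmegaConf.load("minigpt4/configs/models/blip2/blip2_instruct_vicuna7b.yaml")
model_cls = registry.get_model_class(cfg.model.arch)   # "instruct_vicuna7b" -> registered class
model = model_cls.from_config(cfg.model)
model = model.to("cuda").eval()
```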
minigpt4/configs/models/blip2/blip2_pretrain.yaml (new file, 42 lines)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: pretrain
  load_finetuned: False

  # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth"
  pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_pretrained/blip2_pretrained.pth"
  finetuned: ""

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"
  freeze_vit: True

  # Q-Former
  num_query_token: 32

  # moe
  use_moeqformer: True
  use_route_moe: True
  moebert_expert_num: 5
  moebert_num_beams: 2

preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 224
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
minigpt4/configs/models/blip2/blip2_pretrain_vicuna7b.yaml (new file, 43 lines)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: instruct_vicuna7b
  load_finetuned: False
  load_pretrained: True

  pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
  finetuned: ""

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"
  freeze_vit: True

  # Q-Former
  num_query_token: 32

  # path to Vicuna checkpoint
  llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"

  # generation configs
  prompt: ""


preprocess:
  vis_processor:
    train:
      name: "blip2_image_train"
      image_size: 224
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
@ -0,0 +1,59 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  load_finetuned: False
  load_pretrained: True

  pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
  finetuned: ""
  q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"

  # Q-Former
  num_query_token: 32
  qformer_text_input: True

  # path to Vicuna checkpoint
  llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
  prompt: ""
  max_txt_len: 256
  max_output_txt_len: 256

  # freeze
  freeze_vit: True
  freeze_llm: True
  freeze_qformer: False
  freeze_t5_proj: False

  # moe
  general_version: 'route_moe'
  moebert_route_method: "post-route"
  moebert_load_balance: 0.05
  moebert_expert_num: 3
  moebert_num_beams: 3
  moe_weight_type: 'ffn_prob'
  use_balance_loss: False
  ln_position: "out"

preprocess:
  vis_processor:
    train:
      name: "blip2_image_train"
      image_size: 224
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
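Here `moebert_route_method: "post-route"` and `moe_weight_type: 'ffn_prob'` suggest that the expert FFN outputs are computed first and then mixed by gate probabilities. The sketch below shows one plausible reading of that post-hoc weighting; the module name, the sentence-level pooling, and the single-score gate are all assumptions rather than the repository's implementation:

```python
import torch
import torch.nn as nn

class PostRouteMixer(nn.Module):
    """Sketch of post-routing: run every expert FFN, then weight outputs by gate probs."""

    def __init__(self, hidden, num_experts=3):
        super().__init__()
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(hidden, 4 * hidden), nn.GELU(), nn.Linear(4 * hidden, hidden))
            for _ in range(num_experts)
        )
        self.gate = nn.Linear(hidden, 1)  # scores each expert's own output

    def forward(self, x):
        # x: (batch, seq, hidden)
        outs = torch.stack([e(x) for e in self.experts], dim=1)   # (batch, experts, seq, hidden)
        scores = self.gate(outs.mean(dim=2)).squeeze(-1)          # sentence-level score per expert
        probs = scores.softmax(dim=-1)[..., None, None]           # (batch, experts, 1, 1)
        return (probs * outs).sum(dim=1)                          # 'ffn_prob'-style mixing
```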
@ -0,0 +1,59 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  load_finetuned: False
  load_pretrained: True

  pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
  finetuned: ""
  q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"

  # Q-Former
  num_query_token: 32
  qformer_text_input: True

  # path to Vicuna checkpoint
  llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
  prompt: ""
  max_txt_len: 256
  max_output_txt_len: 256

  # freeze
  freeze_vit: True
  freeze_llm: True
  freeze_qformer: False
  freeze_t5_proj: False

  # moe
  general_version: 'uni_route_moe'
  moebert_route_method: "post-route-uni"
  moebert_load_balance: 0.05
  moebert_expert_num: 3
  moebert_num_beams: 3
  moe_weight_type: 'ffn_prob'
  use_balance_loss: False
  ln_position: "in"

preprocess:
  vis_processor:
    train:
      name: "blip2_image_train"
      image_size: 224
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
@ -0,0 +1,60 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  load_finetuned: False
  load_pretrained: True

  pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/instruct_blip_vicuna7b_trimmed/instruct_blip_vicuna7b_trimmed.pth"
  finetuned: ""
  q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"

  # Q-Former
  num_query_token: 32
  qformer_text_input: True

  # path to Vicuna checkpoint
  llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
  prompt: ""
  max_txt_len: 256
  max_output_txt_len: 256

  # freeze
  freeze_vit: True
  freeze_llm: True
  freeze_qformer: False
  freeze_t5_proj: False

  # moe
  general_version: "naive_moe"
  moebert_expert_num: 5
  moebert_route_method: "gate-sentence-post"
  moebert_load_balance: 0
  moe_topk: 1
  use_balance_loss: False
  moe_weight_type: 'average'


preprocess:
  vis_processor:
    train:
      name: "blip_image_train"
      image_size: 224
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
@ -15,7 +15,7 @@ model:
   # generation configs
   prompt: ""
 
-  llama_model: "please set this value to the path of vicuna model"
+  llama_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
 
 preprocess:
   vis_processor:
@ -11,7 +11,7 @@ model:
   # generation configs
   prompt: ""
 
-  llama_model: "please set this value to the path of llama2-chat-7b"
+  llama_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/llama_2_7b_chat"
   lora_r: 64
   lora_alpha: 16
 
@ -208,7 +208,8 @@ class BaseDatasetBuilder:
             ann_paths = abs_ann_paths
 
             # visual data storage path
-            vis_path = os.path.join(vis_info.storage, split)
+            # vis_path = os.path.join(vis_info.storage, split)
+            vis_path = os.path.join(vis_info.storage)
 
             if not os.path.isabs(vis_path):
                 # vis_path = os.path.join(utils.get_cache_path(), vis_path)
@ -219,12 +220,14 @@ class BaseDatasetBuilder:
 
             # create datasets
             dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls
+            print(dataset_cls)
             datasets[split] = dataset_cls(
                 vis_processor=vis_processor,
                 text_processor=text_processor,
                 ann_paths=ann_paths,
                 vis_root=vis_path,
             )
+            print("{} Length {} : {}".format(dataset_cls.__name__, split, len(datasets[split]))) # print class name
 
         return datasets
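A note on the `vis_path` change above: the dataset configs earlier in this diff point at a single image root per dataset (the `images: storage:` entries), so the builder stops appending the split name to that root. The difference, with hypothetical values:

```python
import os

vis_storage = "/mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO"  # an images.storage value from a config above
split = "val"

old_vis_path = os.path.join(vis_storage, split)  # ".../COCO/val"  -- previous behaviour
new_vis_path = os.path.join(vis_storage)         # ".../COCO"      -- single shared image root
```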
@ -6,19 +6,18 @@ from minigpt4.common.registry import registry
 from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
 from minigpt4.datasets.datasets.laion_dataset import LaionDataset
 from minigpt4.datasets.datasets.cc_sbu_dataset import CCSBUDataset, CCSBUAlignDataset
-from minigpt4.datasets.datasets.text_caps import TextCapDataset
-from minigpt4.datasets.datasets.llava_dataset import LlavaDetailDataset, LlavaReasonDataset, LlavaConversationDataset
+from minigpt4.datasets.datasets.text_caps import TextCapDataset, TextCapEvalDataset
+from minigpt4.datasets.datasets.text_vqa_dataset import TextVQADataset, TextVQAEvalDataset
+from minigpt4.datasets.datasets.llava_dataset import LlavaDetailDataset, LlavaReasonDataset, LlavaConversationDataset, LlavaMixDataset, LlavaPretrainDataset
 from minigpt4.datasets.datasets.unnatural_instruction import UnnaturalDataset
 from minigpt4.datasets.datasets.multitask_conversation import MultiTaskConversationDataset
 from minigpt4.datasets.datasets.flickr import GroundedDetailDataset,CaptionToObjectDataset,PhraseToObjectDataset
-from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset
-from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset
-from minigpt4.datasets.datasets.gqa_datasets import GQADataset
-from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset
-from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset
+from minigpt4.datasets.datasets.gqa_datasets import GQADataset, GQAEvalDataset
+from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset
+from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset
+from minigpt4.datasets.datasets.ok_vqa_datasets import OKVQADataset, OKVQAEvalDataset
 from minigpt4.datasets.datasets.ocrvqa_dataset import OCRVQADataset
-from minigpt4.datasets.datasets.coco_caption import COCOCapDataset
+from minigpt4.datasets.datasets.coco_caption import COCOCapDataset, COCOCapEvalDataset
 
 
 @registry.register_builder("multitask_conversation")
 class MultitaskConversationBuilder(BaseDatasetBuilder):
@ -29,7 +28,7 @@ class MultitaskConversationBuilder(BaseDatasetBuilder):
 
     def build_datasets(self):
         # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
-        logging.info("Building datasets...")
+        logging.info("[multitask_conversation]: Building datasets...")
         self.build_processors()
         build_info = self.config.build_info
         datasets = dict()
@ -55,7 +54,7 @@ class UnnaturalInstructionBuilder(BaseDatasetBuilder):
 
     def build_datasets(self):
         # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
-        logging.info("Building datasets...")
+        logging.info("[unnatural_instruction]: Building datasets...")
         self.build_processors()
         build_info = self.config.build_info
         datasets = dict()
@ -66,6 +65,7 @@ class UnnaturalInstructionBuilder(BaseDatasetBuilder):
             text_processor=self.text_processors["train"],
             ann_path=build_info.ann_path,
         )
+        print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
 
         return datasets
 
@ -80,7 +80,7 @@ class LlavaDetailBuilder(BaseDatasetBuilder):
 
     def build_datasets(self):
         # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
-        logging.info("Building datasets...")
+        logging.info("[llava_detail]: Building datasets...")
         self.build_processors()
         build_info = self.config.build_info
         datasets = dict()
@ -93,11 +93,10 @@ class LlavaDetailBuilder(BaseDatasetBuilder):
             ann_path=build_info.ann_path,
             vis_root=build_info.image_path,
         )
+        print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
 
         return datasets
 
-
 
 @registry.register_builder("llava_reason")
 class LlavaReasonBuilder(BaseDatasetBuilder):
     train_dataset_cls = LlavaReasonDataset
@ -107,7 +106,7 @@ class LlavaReasonBuilder(BaseDatasetBuilder):
 
     def build_datasets(self):
         # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
-        logging.info("Building datasets...")
+        logging.info("[llava_reason]: Building datasets...")
         self.build_processors()
         build_info = self.config.build_info
         datasets = dict()
@ -120,9 +119,37 @@ class LlavaReasonBuilder(BaseDatasetBuilder):
             ann_path=build_info.ann_path,
             vis_root=build_info.image_path,
         )
+        print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
 
         return datasets
 
+
+@registry.register_builder("llava_pretrain")
+class LlavaPretrainBuilder(BaseDatasetBuilder):
+    train_dataset_cls = LlavaPretrainDataset
+    DATASET_CONFIG_DICT = {
+        "default": "configs/datasets/llava/pretrain_cap.yaml",
+    }
+
+    def build_datasets(self):
+        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+        logging.info("[llava_pretrain]: Building datasets...")
+        self.build_processors()
+        build_info = self.config.build_info
+        datasets = dict()
+
+        # create datasets
+        dataset_cls = self.train_dataset_cls
+        datasets['train'] = dataset_cls(
+            vis_processor=self.vis_processors["train"],
+            text_processor=self.text_processors["train"],
+            ann_path=build_info.ann_path,
+            vis_root=build_info.image_path,
+        )
+        print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
+
+        return datasets
+
+
 @registry.register_builder("llava_conversation")
 class LlavaReasonBuilder(BaseDatasetBuilder):
     train_dataset_cls = LlavaConversationDataset
@ -132,7 +159,7 @@ class LlavaReasonBuilder(BaseDatasetBuilder):
 
     def build_datasets(self):
         # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
-        logging.info("Building datasets...")
+        logging.info("[llava_conversation]: Building datasets...")
         self.build_processors()
         build_info = self.config.build_info
         datasets = dict()
@ -145,6 +172,49 @@ class LlavaReasonBuilder(BaseDatasetBuilder):
             ann_path=build_info.ann_path,
             vis_root=build_info.image_path,
         )
+        print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
+
+        return datasets
+
+
+@registry.register_builder("llava_mix")
+class LlavaMixBuilder(BaseDatasetBuilder):
+    train_dataset_cls = LlavaMixDataset
+    DATASET_CONFIG_DICT = {
+        "default": "configs/datasets/llava/mix.yaml",
+        "mix_coco_gqa": "configs/datasets/mix_vqa/mix_vqa.yaml",
+    }
+
+    def build_datasets(self):
+        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+        logging.info("[llava_mix]: Building datasets...")
+        self.build_processors()
+        build_info = self.config.build_info
+        datasets = dict()
+
+        # create datasets
+        dataset_cls = self.train_dataset_cls
+        vis_roots = {
+            'coco': build_info.image_path_coco,
+            'gqa': build_info.image_path_gqa,
+            'ocr': build_info.image_path_ocr,
+            'text': build_info.image_path_text,
+            # 'vg': build_info.image_path_vg,
+        }
+        datasets['train'] = dataset_cls(
+            vis_processor=self.vis_processors["train"],
+            text_processor=self.text_processors["train"],
+            ann_path=build_info.ann_path,
+            vis_root=vis_roots,
+        )
+        print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name
+
+        # vis_roots = {
+        #     'coco': '/mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO/train2014',
+        #     'gqa': '/mnt/pfs-guan-ssai/nlu/wanghanzi/data/GQA/images',
+        #     'ocr': '/mnt/pfs-guan-ssai/nlu/wanghanzi/data/OCRVQA/images',
+        #     'text': '/mnt/pfs-guan-ssai/nlu/wanghanzi/data/TextVQA/train_images',
+        #     # 'vg': build_info.image_path_vg,
+        # }
+
         return datasets
@ -153,7 +223,7 @@ class AllRefCOCOBuilder(BaseDatasetBuilder):
 
     def build_datasets(self):
         # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
-        logging.info("Building datasets...")
+        logging.info("[AllRefCOCOBuilder]: Building datasets...")
         self.build_processors()
 
         build_info = self.config.build_info
@ -181,81 +251,10 @@ class AllRefCOCOBuilder(BaseDatasetBuilder):
         return datasets
 
 
-@registry.register_builder("refcoco")
-class RefCOCOBuilder(AllRefCOCOBuilder):
-    train_dataset_cls = ReferCOCODataset
-    DATASET_CONFIG_DICT = {
-        "default": "configs/datasets/coco_bbox/refcoco.yaml",
-    }
-
-@registry.register_builder("refcocop")
-class RefCOCOPBuilder(AllRefCOCOBuilder):
-    train_dataset_cls = ReferCOCODataset
-    DATASET_CONFIG_DICT = {
-        "default": "configs/datasets/coco_bbox/refcocop.yaml",
-    }
-
-
-@registry.register_builder("refcocog")
-class RefCOCOGBuilder(AllRefCOCOBuilder):
-    train_dataset_cls = ReferCOCODataset
-    DATASET_CONFIG_DICT = {
-        "default": "configs/datasets/coco_bbox/refcocog.yaml",
-    }
-
-@registry.register_builder("invrefcoco")
-class RefCOCOBuilder(AllRefCOCOBuilder):
-    train_dataset_cls = InvReferCOCODataset
-    DATASET_CONFIG_DICT = {
-        "default": "configs/datasets/coco_bbox/invrefcoco.yaml",
-    }
-
-
-@registry.register_builder("invrefcocop")
-class RefCOCOPBuilder(AllRefCOCOBuilder):
-    train_dataset_cls = InvReferCOCODataset
-    DATASET_CONFIG_DICT = {
-        "default": "configs/datasets/coco_bbox/invrefcocop.yaml",
-    }
-
-
-@registry.register_builder("invrefcocog")
-class RefCOCOGBuilder(AllRefCOCOBuilder):
-    train_dataset_cls = InvReferCOCODataset
-    DATASET_CONFIG_DICT = {
-        "default": "configs/datasets/coco_bbox/invrefcocog.yaml",
-    }
-
-@registry.register_builder("refvg")
-class RefVisualGenomeBuilder(BaseDatasetBuilder):
-    train_dataset_cls = ReferVisualGenomeDataset
-    DATASET_CONFIG_DICT = {
-        "default": "configs/datasets/vg/ref.yaml",
-    }
-
-    def build_datasets(self):
-        # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
-        logging.info("Building datasets...")
-        self.build_processors()
-
-        build_info = self.config.build_info
-        data_dir = build_info.data_dir
-        datasets = dict()
-
-        # create datasets
-        dataset_cls = self.train_dataset_cls
-        datasets['train'] = dataset_cls(
-            vis_processor=self.vis_processors["train"],
-            text_processor=self.text_processors["train"],
-            data_dir=data_dir,
-        )
-
-        return datasets
-
-
 @registry.register_builder("textcaps_caption")
 class TextcapCaptionBuilder(BaseDatasetBuilder):
     train_dataset_cls = TextCapDataset
+    eval_dataset_cls = TextCapEvalDataset
 
     DATASET_CONFIG_DICT = {"default": "configs/datasets/textcaps/caption.yaml"}
@@ -265,44 +264,45 @@ class TextcapCaptionBuilder(BaseDatasetBuilder):
     def _download_vis(self):
         pass

-    def build(self):
+@registry.register_builder("text_vqa")
-        self.build_processors()
+class TextVQABuilder(BaseDatasetBuilder):
+    train_dataset_cls = TextVQADataset
+    eval_dataset_cls = TextVQAEvalDataset

-        build_info = self.config.build_info
+    DATASET_CONFIG_DICT = {"default": "configs/datasets/textvqa/vqa.yaml"}

-        datasets = dict()
+    def _download_ann(self):
-        split = "train"
+        pass

-        # create datasets
+    def _download_vis(self):
-        # [NOTE] return inner_datasets (wds.DataPipeline)
+        pass
-        dataset_cls = self.train_dataset_cls
-        datasets[split] = dataset_cls(
-            vis_processor=self.vis_processors[split],
-            text_processor=self.text_processors[split],
-            ann_path=build_info.ann_path,
-            vis_root=build_info.image_path,
-        )

-        return datasets

 @registry.register_builder("coco_vqa")
 class COCOVQABuilder(BaseDatasetBuilder):
     train_dataset_cls = COCOVQADataset
+    eval_dataset_cls = COCOVQAEvalDataset

     DATASET_CONFIG_DICT = {
         "default": "configs/datasets/coco/defaults_vqa.yaml",
+        "vqa_v2_eval": "configs/datasets/coco/defaults_vqa_eval.yaml",
+        "vqa_v2_part": "configs/datasets/coco/defaults_vqa_part.yaml",
     }

 @registry.register_builder("ok_vqa")
 class OKVQABuilder(COCOVQABuilder):
+    train_dataset_cls = OKVQADataset
+    eval_dataset_cls = OKVQAEvalDataset

     DATASET_CONFIG_DICT = {
         "default": "configs/datasets/okvqa/defaults.yaml",
+        "ok_vqa_eval": "configs/datasets/okvqa/eval.yaml",
     }


 @registry.register_builder("aok_vqa")
 class AOKVQABuilder(BaseDatasetBuilder):
     train_dataset_cls = AOKVQADataset
+    eval_dataset_cls = AOKVQAEvalDataset

     DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"}

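The additions in this hunk all follow one pattern: each VQA builder now declares an `eval_dataset_cls` next to its `train_dataset_cls`, and gains extra `DATASET_CONFIG_DICT` entries pointing at evaluation or partial-split YAML files. A stripped-down, self-contained sketch of that train/eval dispatch pattern is shown below; the toy classes are illustrative stand-ins, not the repo's `BaseDatasetBuilder`.

```python
# Toy illustration of the train/eval class dispatch that the added
# eval_dataset_cls attributes rely on. The real BaseDatasetBuilder also wires
# up processors, configs and downloads; this sketch deliberately does not.
class ToyDatasetBuilder:
    train_dataset_cls = None
    eval_dataset_cls = None

    def build_datasets(self, splits=("train", "eval")):
        datasets = {}
        for split in splits:
            dataset_cls = self.train_dataset_cls if split == "train" else self.eval_dataset_cls
            if dataset_cls is not None:
                datasets[split] = dataset_cls()
        return datasets


class ToyTrainSet:
    pass


class ToyEvalSet:
    pass


class ToyVQABuilder(ToyDatasetBuilder):
    train_dataset_cls = ToyTrainSet
    eval_dataset_cls = ToyEvalSet


print(ToyVQABuilder().build_datasets())  # {'train': <ToyTrainSet ...>, 'eval': <ToyEvalSet ...>}
```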
@@ -310,13 +310,15 @@ class AOKVQABuilder(BaseDatasetBuilder):
 @registry.register_builder("gqa")
 class GQABuilder(BaseDatasetBuilder):
     train_dataset_cls = GQADataset
+    eval_dataset_cls = GQAEvalDataset

     DATASET_CONFIG_DICT = {
-        "default": "configs/datasets/gqa/balanced_val.yaml",
+        "balanced_sft_raw": "configs/datasets/gqa/balanced_sft_raw.yaml",
+        "balanced_sft_raw_eval":"configs/datasets/gqa/balanced_sft_raw_eval.yaml",
+        "balanced_sft_raw_part":"configs/datasets/gqa/balanced_sft_raw_part.yaml",
     }




 @registry.register_builder("flickr_grounded_caption")
 class GroundedCaptionBuilder(BaseDatasetBuilder):
     train_dataset_cls = GroundedDetailDataset
@@ -326,7 +328,7 @@ class GroundedCaptionBuilder(BaseDatasetBuilder):

     def build_datasets(self):
         # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
-        logging.info("Building datasets...")
+        logging.info("[flickr_grounded_caption]: Building datasets...")
         self.build_processors()
         build_info = self.config.build_info
         datasets = dict()
@@ -352,7 +354,7 @@ class CaptionToPhraseBuilder(BaseDatasetBuilder):

     def build_datasets(self):
         # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
-        logging.info("Building datasets...")
+        logging.info("[flickr_CaptionToPhrase]: Building datasets...")
         self.build_processors()
         build_info = self.config.build_info
         datasets = dict()
@@ -377,7 +379,7 @@ class CaptionToPhraseBuilder(BaseDatasetBuilder):

     def build_datasets(self):
         # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
-        logging.info("Building datasets...")
+        logging.info("[flickr_ObjectToPhrase]: Building datasets...")
         self.build_processors()
         build_info = self.config.build_info
         datasets = dict()
@@ -394,8 +396,6 @@ class CaptionToPhraseBuilder(BaseDatasetBuilder):
         return datasets




 class DocumentVQABuilder(BaseDatasetBuilder):
     def _download_ann(self):
         pass
@@ -417,6 +417,7 @@ class DocumentVQABuilder(BaseDatasetBuilder):
             vis_root=build_info.image_path,
             ann_path=build_info.ann_path
         )
+        print("{} Length: {}".format(dataset_cls.__name__, len(datasets['train']))) # print class name

         return datasets

@@ -495,9 +496,11 @@ class LaionBuilder(BaseDatasetBuilder):
 @registry.register_builder("coco_caption")
 class COCOCapBuilder(BaseDatasetBuilder):
     train_dataset_cls = COCOCapDataset
+    eval_dataset_cls = COCOCapEvalDataset

     DATASET_CONFIG_DICT = {
         "default": "configs/datasets/coco/caption.yaml",
+        "coco_cap_eval": "configs/datasets/coco/caption_eval.yaml",
     }


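The new `*_eval` and `*_part` keys follow the builders' existing convention: the dataset `type` selected in a run config picks which defaults YAML from `DATASET_CONFIG_DICT` gets loaded. A hedged sketch of that lookup is below; the keys are copied from the `coco_caption` builder in the diff, while `default_config_path` is only a stand-in for the repo's config resolution.

```python
# Stand-in for the builders' config lookup: a dataset "type" key selects the
# defaults YAML. Keys copied from the coco_caption builder in the diff above.
DATASET_CONFIG_DICT = {
    "default": "configs/datasets/coco/caption.yaml",
    "coco_cap_eval": "configs/datasets/coco/caption_eval.yaml",
}


def default_config_path(dataset_type="default"):
    return DATASET_CONFIG_DICT[dataset_type]


print(default_config_path("coco_cap_eval"))  # configs/datasets/coco/caption_eval.yaml
```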
@@ -13,7 +13,7 @@ import torch

 from PIL import Image

-from minigpt4.datasets.datasets.vqa_datasets import VQADataset #, VQAEvalDataset
+from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset


 class __DisplMixin:
@@ -37,80 +37,191 @@ class AOKVQADataset(VQADataset, __DisplMixin):
         super().__init__(vis_processor, text_processor, vis_root, ann_paths)

         self.instruction_pool =[
-            "[vqa] {}",
+            '{} Choose from {}.',
-            "[vqa] Based on the image, respond to this question with a short answer: {}"
+            'Q: {} Multi Choices: {} A: ',
+            'Question: {} Multi Choices: {} Answer: ',
+            "{} Choose one from the following possible answers: {}. ",
+            '{} Choose from {}. The answer is',
         ]

         exist_annotation = []
         for ann in self.annotation:
-            image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
+            # image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
+            image_path = os.path.join(self.vis_root, ann["image"])
             if os.path.exists(image_path):
                 exist_annotation.append(ann)
         self.annotation = exist_annotation
+        self.source = 'aokvqa'

     def get_data(self, index):
         ann = self.annotation[index]

-        image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
+        # image_path = os.path.join(self.vis_root, ann["image"].split('/')[-1])
+        image_path = os.path.join(self.vis_root, ann["image"])
         image = Image.open(image_path).convert("RGB")

         image = self.vis_processor(image)
         question = self.text_processor(ann["question"])

-        answer_key = "direct_answers"
+        answer_lst = ann["choices"]
+        direct_answers = ann["direct_answers"]
-        answer_weight = {}
+        final_answer = random.choices(direct_answers, k=1)[0]
-        for answer in ann[answer_key]:
+        for answer in answer_lst:
-            if answer in answer_weight.keys():
+            if answer in direct_answers:
-                answer_weight[answer] += 1 / len(ann[answer_key])
+                final_answer = answer
-            else:
-                answer_weight[answer] = 1 / len(ann[answer_key])

-        answers = list(answer_weight.keys())
-        weights = list(answer_weight.values())

-        answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights

         return {
             "image": image,
+            "image_id": ann["image"],
             "question": question,
-            "answer": answer,
+            "answer": final_answer,
+            "choices": ", ".join(answer_lst)
         }

     def __getitem__(self, index):
         data = self.get_data(index)
         question = self.text_processor(data["question"])
-        instruction = random.choice(self.instruction_pool).format(question)

-        instruction = "<Img><ImageHere></Img> {} ".format(instruction)
         answer = self.text_processor(data['answer'])
+        q_input = question
+        llm_input = random.choice(self.instruction_pool).format(question, data["choices"])

         return {
             "image": data['image'],
-            "instruction_input": instruction,
+            "image_id": data["image_id"],
+            # "q_input": q_input,
+            "q_input": llm_input,
+            "llm_input": llm_input,
+            "text_input": question,
+            "text_output": answer,
             "answer": answer,
+            "source": 'aokvqa',
         }


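The change to `get_data` above swaps the old frequency-weighted sampling over `direct_answers` for a choice-driven selection: a direct answer is sampled first, then overwritten by any multiple-choice option that also appears among the direct answers (the loop does not break, so the last matching choice wins). A self-contained comparison of the two strategies, using a fabricated annotation, is sketched below.

```python
# Fabricated annotation for illustration; the keys mirror the AOK-VQA fields
# used in the diff above.
import random

ann = {
    "choices": ["blue", "red", "green", "yellow"],
    "direct_answers": ["red", "red", "crimson", "red", "maroon"],
}

# Old behaviour (removed): sample a direct answer in proportion to its frequency.
answer_weight = {}
for a in ann["direct_answers"]:
    answer_weight[a] = answer_weight.get(a, 0) + 1 / len(ann["direct_answers"])
old_answer = random.choices(list(answer_weight), weights=list(answer_weight.values()), k=1)[0]

# New behaviour (added): start from a random direct answer, then let every
# multiple-choice option found among the direct answers overwrite it, so the
# last matching choice ("red" here) becomes the training target.
final_answer = random.choices(ann["direct_answers"], k=1)[0]
for choice in ann["choices"]:
    if choice in ann["direct_answers"]:
        final_answer = choice

print(old_answer, final_answer)
```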
-class AOKVQGDataset(AOKVQADataset):
+class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin):

     def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
-        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+        """
-        self.instruction_pool = [
+        vis_root (string): Root directory of images (e.g. coco/images/)
-            'Given the image, generate a question whose answer is: {}',
+        ann_root (string): directory to store the annotation file
-            'Based on the image, provide a question with the answer: {}',
+        """
-            'Given the visual representation, create a question for which the answer is "{}"',
-            'From the image provided, craft a question that leads to the reply: {}',
+        self.vis_root = vis_root
-            'Considering the picture, come up with a question where the answer is: {}',
-            'Taking the image into account, generate an question that has the answer: {}'
+        self.annotation = json.load(open(ann_paths[0]))

+        self.instruction_pool =[
+            '{} Choose from {}.',
+            'Q: {} Multi Choices: {} A: ',
+            'Question: {} Multi Choices: {} Answer: ',
+            "{} Choose one from the following possible answers: {}. ",
+            '{} Choose from {}. The answer is',
         ]

-    def __getitem__(self, index):
+        try:
-        data = self.get_data(index)
+            self.coco_fmt_qust_file = ann_paths[2]
-        instruction = random.choice(self.instruction_pool).format(data['answer'])
+            self.coco_fmt_anno_file = ann_paths[3]
+        except IndexError:
+            self.coco_fmt_qust_file = None
+            self.coco_fmt_anno_file = None

+        self.vis_processor = vis_processor
+        self.text_processor = text_processor
+        self.source = 'aokvqa'
+        self.annotation_add = self.get_data()

+    def collater(self, samples):
+        (
+            image_list,
+            question_list,
+            question_id_list,
+            choices_list,
+            correct_choice_idx_list,
+            direct_answers_list,
+            llm_input_list,
+            q_input_list,
+            gt_answers_list,
+            source_list,
+        ) = ([], [], [], [], [], [], [], [], [], [])

+        for sample in samples:
+            image_list.append(sample["image"])
+            question_list.append(sample["text_input"])
+            question_id_list.append(sample["question_id"])
+            choices_list.append(sample["choices"])
+            correct_choice_idx_list.append(sample["correct_choice_idx"])
+            direct_answers_list.append(sample["direct_answers"])
+            llm_input_list.append(sample["llm_input"])
+            q_input_list.append(sample["q_input"])
+            gt_answers_list.append(sample["gt_answers"])
+            source_list.append(sample["source"])

         return {
-            "image": data['image'],
+            "image": torch.stack(image_list, dim=0),
-            "instruction_input": instruction,
+            "text_input": question_list,
-            "answer": data['question'],
+            "question_id": question_id_list,
+            "choices": choices_list,
+            "correct_choice_idx": correct_choice_idx_list,
+            "direct_answers": direct_answers_list,
+            "llm_input": llm_input_list,
+            "q_input": llm_input_list,
+            # "q_input": q_input_list,
+            "gt_answers": gt_answers_list,
+            "source": source_list,
         }

+    def get_data(self):
+        import numpy as np
+        ann_instruct = list()
+        for i in range(len(self.annotation)):
+            ann = self.annotation[i].copy()
+            j = i % len(self.instruction_pool)
+            question = self.text_processor(ann["question"])
+            choices = ann["choices"]
+            llm_input = self.instruction_pool[j].format(question, ", ".join(choices))
+            ann['llm_input'] = llm_input
+            ann_instruct.append(ann)
+        np.random.seed(10)
+        np.random.shuffle(ann_instruct)
+        return ann_instruct

+    def __getitem__(self, index):
+        # ann = self.annotation[index]
+        ann = self.annotation_add[index]

+        image_path = os.path.join(self.vis_root, ann["image"])
+        image = Image.open(image_path).convert("RGB")

+        image = self.vis_processor(image)
+        question = self.text_processor(ann["question"])

+        if "direct_answers" in ann:
+            direct_answers = ann["direct_answers"]
+        else:
+            direct_answers = None

+        choices = ann["choices"]
+        if "correct_choice_idx" in ann:
+            correct_choice_idx = ann["correct_choice_idx"]
+            correct_answer = choices[correct_choice_idx]
+        else:
+            correct_choice_idx = None
+            correct_answer = direct_answers

+        llm_input = ann.get("llm_input",random.choice(self.instruction_pool).format(question))
+        # llm_input = random.choice(self.instruction_pool).format(question, ", ".join(choices))

+        return {
+            "image": image,
+            # "q_input": question,
+            "q_input": llm_input,
+            "llm_input": llm_input,
+            "text_input": question,
+            "question_id": ann["question_id"],
+            "choices": choices,
+            "correct_choice_idx": correct_choice_idx,
+            "gt_answers": correct_answer,
+            "direct_answers": direct_answers,
+            "source": 'aokvqa',
+        }
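Because the new `AOKVQAEvalDataset.collater` batches the samples itself (stacking image tensors and keeping text fields as lists), the dataset is presumably meant to be handed to a `DataLoader` via `collate_fn`. A minimal runnable sketch of that wiring is below; `ToyEvalSet` is a stand-in, not a repo class.

```python
# Minimal sketch of the collate_fn wiring the collater above is written for.
# ToyEvalSet mimics only the batching pattern: tensors are stacked, strings
# are collected into lists.
import torch
from torch.utils.data import DataLoader, Dataset


class ToyEvalSet(Dataset):
    def __len__(self):
        return 4

    def __getitem__(self, idx):
        return {"image": torch.zeros(3, 2, 2), "llm_input": f"question {idx}"}

    def collater(self, samples):
        return {
            "image": torch.stack([s["image"] for s in samples], dim=0),
            "llm_input": [s["llm_input"] for s in samples],
        }


dataset = ToyEvalSet()
loader = DataLoader(dataset, batch_size=2, collate_fn=dataset.collater)
for batch in loader:
    print(batch["image"].shape, batch["llm_input"])
```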