Merge pull request #406 from junchen14/main

add evaluation script
ZhuDeyao 2023-11-01 15:49:35 +03:00 committed by GitHub
commit ff0322c35b
27 changed files with 2051 additions and 88 deletions


@ -15,7 +15,7 @@ RefCOCOg | <a href="https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog
OKVQA | <a href="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json"> annotations </a>
AOK-VQA | <a href="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json"> annotations </a>
OCR-VQA | <a href="https://drive.google.com/drive/folders/1_GYPY5UkUy7HIcR0zq3ZCFgeZN7BAfm_?usp=sharing"> annotations </a>
GQA | <a href="https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip">images</a> &nbsp;&nbsp; <a href="/ibex/project/c2133/minigpt4_v2_dataset/gqa/annotations/train_balanced_questions.json"> annotations </a>
GQA | <a href="https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip">images</a> &nbsp;&nbsp; <a href="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json"> annotations </a>
Filtered flickr-30k | <a href="https://drive.google.com/drive/folders/19c_ggBI77AvdtYlPbuI0ZpnPz73T5teX?usp=sharing"> annotations </a>
Multi-task conversation | <a href="https://drive.google.com/file/d/11HHqB2c29hbSk-WLxdta-nG8UCUrcCN1/view?usp=sharing"> annotations </a>
Filtered unnatural instruction | <a href="https://drive.google.com/file/d/1lXNnBcb5WU-sc8Fe2T2N8J0NRw4sBLev/view?usp=sharing"> annotations </a>


@ -0,0 +1,79 @@
model:
  arch: minigpt_v2
  model_type: pretrain
  max_txt_len: 500
  end_sym: "</s>"
  low_resource: False
  prompt_template: '[INST] {} [/INST]'
  llama_model: ""
  ckpt: ""
  lora_r: 64
  lora_alpha: 16

datasets:
  cc_sbu_align:
    vis_processor:
      train:
        name: "blip2_image_eval"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"

evaluation_datasets:
  refcoco:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  refcocog:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  refcoco+:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  gqa:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  okvqa:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  vizwiz:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  iconvqa:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  vsr:
    eval_file_path: cambridgeltl/vsr_zeroshot
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  hm:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 100

run:
  task: image_text_pretrain
  name: minigptv2_evaluation
  save_path: /path/to/save/folder_path


@ -1,11 +1,11 @@
model:
  arch: minigpt_v2
  model_type: pretrain
  max_txt_len: 160
  max_txt_len: 500
  end_sym: "</s>"
  low_resource: True
  prompt_template: '[INST] {} [/INST]'
  ckpt: 'please set this value to the path of pretrained checkpoint'
  ckpt: "please set this value to the path of pretrained checkpoint"
  lora_r: 64
  lora_alpha: 16

eval_scripts/EVAL_README.md (new file, 104 lines)

@ -0,0 +1,104 @@
## Evaluation Instructions for MiniGPT-v2

### Data preparation

Download the evaluation images and annotations from the links below.

Image source | Download path
--- | :---:
OKVQA | <a href="https://drive.google.com/drive/folders/1jxIgAhtaLu_YqnZEl8Ym11f7LhX3nptN?usp=sharing">annotations</a> &nbsp;&nbsp; <a href="http://images.cocodataset.org/zips/train2017.zip"> images</a>
GQA | <a href="https://drive.google.com/drive/folders/1-dF-cgFwstutS4qq2D9CFQTDS0UTmIft?usp=drive_link">annotations</a> &nbsp;&nbsp; <a href="https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip">images</a>
Hateful Memes | <a href="https://github.com/faizanahemad/facebook-hateful-memes">images and annotations</a>
IconQA | <a href="https://iconqa.github.io/#download">images and annotations</a>
VizWiz | <a href="https://vizwiz.org/tasks-and-datasets/vqa/">images and annotations</a>
RefCOCO | <a href="https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip"> annotations </a>
RefCOCO+ | <a href="https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip"> annotations </a>
RefCOCOg | <a href="https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip"> annotations </a>
### Evaluation dataset structure

```
${MINIGPTv2_EVALUATION_DATASET}
├── gqa
│   ├── test_balanced_questions.json
│   ├── testdev_balanced_questions.json
│   └── gqa_images
├── hateful_meme
│   ├── dev.jsonl
│   └── hm_images
├── iconvqa
│   ├── choose_text_val.json
│   └── iconvqa_images
├── vizwiz
│   ├── val.json
│   └── vizwiz_images
├── vsr
│   └── vsr_images
├── okvqa
│   ├── okvqa_test_split.json
│   ├── mscoco_val2014_annotations_clean.json
│   └── OpenEnded_mscoco_val2014_questions_clean.json
├── refcoco
│   ├── instances.json
│   ├── refs(google).p
│   └── refs(unc).p
├── refcoco+
│   ├── instances.json
│   └── refs(unc).p
├── refcocog
│   ├── instances.json
│   ├── refs(google).p
│   └── refs(umd).p
...
```
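
The tree above maps onto the `evaluation_datasets` section of the benchmark config. As a rough, hedged illustration of which path each key should point to (the root variable is a placeholder; the exact files each script opens can be checked in `eval_vqa.py` and `eval_ref.py` below):

```
import os

# Placeholder root matching the tree above; set it to your own download location.
root = "/path/to/MINIGPTv2_EVALUATION_DATASET"

# Illustrative only: okvqa takes a directory, gqa/vizwiz/iconvqa take a single json,
# hm takes a jsonl, and eval_ref.py appends f"{dataset}/{dataset}_{split}.json" itself.
example_eval_file_paths = {
    "okvqa":   os.path.join(root, "okvqa"),
    "gqa":     os.path.join(root, "gqa", "testdev_balanced_questions.json"),
    "vizwiz":  os.path.join(root, "vizwiz", "val.json"),
    "iconvqa": os.path.join(root, "iconvqa", "choose_text_val.json"),
    "hm":      os.path.join(root, "hateful_meme", "dev.jsonl"),
    "refcoco": root,
}
print(example_eval_file_paths["gqa"])
```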
### Environment setup
```
export PYTHONPATH=$PYTHONPATH:/path/to/directory/of/MiniGPT-4
```
### Config file setup

In [minigpt4/eval_configs/minigptv2_benchmark_evaluation.yaml](../minigpt4/eval_configs/minigptv2_benchmark_evaluation.yaml):

Set **llama_model** to the path of the LLaMA model.
Set **ckpt** to the path of our pretrained model checkpoint.
Set **eval_file_path** to the path of the annotation file(s) for each evaluation dataset.
Set **img_path** to the image folder path for each evaluation dataset.
Set **save_path** to the folder where evaluation results should be saved.
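
For orientation, the evaluation scripts below resolve these fields through the `Config` object extended in this PR. A minimal sketch of how the values are consumed (the config path is a placeholder; running this standalone assumes the YAML above exists at that path):

```
from minigpt4.common.config import Config
from minigpt4.common.eval_utils import eval_parser

# Sketch: eval_ref.py / eval_vqa.py read the per-dataset settings like this.
args = eval_parser().parse_args(["--cfg-path", "eval_configs/minigptv2_benchmark_evaluation.yaml"])
cfg = Config(args)

ds_cfg = cfg.evaluation_datasets_cfg["okvqa"]   # one block from evaluation_datasets
eval_file_path = ds_cfg["eval_file_path"]       # annotation path set above
img_path = ds_cfg["img_path"]                   # image folder set above
max_new_tokens = ds_cfg["max_new_tokens"]
batch_size = ds_cfg["batch_size"]
save_path = cfg.run_cfg.save_path               # run.save_path from the config
```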
### Start evaluating RefCOCO, RefCOCO+, RefCOCOg

dataset names:
| refcoco | refcoco+ | refcocog |
| ------- | -------- | -------- |

```
port=port_number
cfg_path=/path/to/eval_configs/minigptv2_benchmark_evaluation.yaml

torchrun --master-port ${port} --nproc_per_node 1 eval_ref.py \
 --cfg-path ${cfg_path} --dataset refcoco,refcoco+,refcocog --resample
```
### Start evaluating visual question answering

dataset names:
| okvqa | vizwiz | iconvqa | gqa | vsr | hm |
| ----- | ------ | ------- | --- | --- | -- |

```
port=port_number
cfg_path=/path/to/eval_configs/minigptv2_benchmark_evaluation.yaml

torchrun --master-port ${port} --nproc_per_node 1 eval_vqa.py \
 --cfg-path ${cfg_path} --dataset okvqa,vizwiz,iconvqa,gqa,vsr,hm
```
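
Each script writes its raw predictions under **save_path** before printing the metric: `okvqa.json`, `vizwiz.json`, `gqa.json`, `vsr.json`, `hm.json`, and `<dataset>_<split>.json` for the referring-expression datasets (file names taken from `eval_vqa.py` and `eval_ref.py` below). A minimal sketch for inspecting one of them, assuming the save folder set in the config:

```
import json
import os

save_path = "/path/to/save/folder_path"   # run.save_path from the config
with open(os.path.join(save_path, "okvqa.json")) as f:
    preds = json.load(f)                  # list of {"answer": ..., "question_id": ...}

print(len(preds), "predictions")
print(preds[0]["question_id"], "->", preds[0]["answer"])
```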

eval_scripts/eval.sh (new file, 25 lines)

@ -0,0 +1,25 @@
#!/bin/bash --login
export PYTHONPATH=$PYTHONPATH:$(pwd)
cfg_path=eval_configs/minigpt4_llama2_eval.yaml
CKPT=YOUR_CKPT_PATH
NAME=EXP_NAME
IMG_PATH=YOUR_IMG_PATH
EVAL_FILE_PATH=YOUR_EVAL_FILE_PATH
# torchrun --nproc_per_node 1 eval_scripts/eval_ref.py --name ${NAME} \
# --cfg-path ${cfg_path} \
# --ckpt ${CKPT} --dataset refcoco,refcoco+,refcocog --lora_r 64 --lora_alpha 16 \
# --batch_size 64 --max_new_tokens 20 --resample --img_path ${IMG_PATH} --eval_file_path ${EVAL_FILE_PATH}
# torchrun --nproc_per_node 1 eval_scripts/eval_vqa.py --name ${NAME} \
# --cfg-path ${cfg_path} \
# --ckpt ${CKPT} --split val,test --dataset okvqa,vizwiz,iconqa,gqa,vsr,hm --lora_r 64 --lora_alpha 16 \
# --batch_size 32 --max_new_tokens 20 --resample
torchrun --nproc_per_node 1 eval_scripts/eval_vqa.py --name ${NAME} \
--cfg-path ${cfg_path} \
--ckpt ${CKPT} --split val,test --dataset okvqa --lora_r 64 --lora_alpha 16 \
--batch_size 32 --max_new_tokens 20

eval_scripts/eval_ref.py (new file, 128 lines)

@ -0,0 +1,128 @@
import os
import re
import json
import argparse
from collections import defaultdict
import random
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from minigpt4.common.config import Config
from minigpt4.common.eval_utils import prepare_texts, init_model, eval_parser, computeIoU
from minigpt4.conversation.conversation import CONV_VISION_minigptv2
from minigpt4.datasets.datasets.coco_caption import RefCOCOEvalData
def list_of_str(arg):
return list(map(str, arg.split(',')))
parser = eval_parser()
parser.add_argument("--dataset", type=list_of_str, default='refcoco', help="dataset to evaluate")
parser.add_argument("--res", type=float, default=100.0, help="resolution used in refcoco")
parser.add_argument("--resample", action='store_true', help="resolution used in refcoco")
args = parser.parse_args()
cfg = Config(args)
eval_dict = {'refcoco': ['val','testA','testB'],
'refcoco+': ['val','testA','testB'],
'refcocog': ['val','test']}
model, vis_processor = init_model(args)
model.eval()
CONV_VISION = CONV_VISION_minigptv2
conv_temp = CONV_VISION.copy()
conv_temp.system = ""
#
model.eval()
save_path = cfg.run_cfg.save_path
for dataset in args.dataset:
for split in eval_dict[dataset]:
eval_file_path = cfg.evaluation_datasets_cfg[dataset]["eval_file_path"]
img_path = cfg.evaluation_datasets_cfg[dataset]["img_path"]
batch_size = cfg.evaluation_datasets_cfg[dataset]["batch_size"]
max_new_tokens = cfg.evaluation_datasets_cfg[dataset]["max_new_tokens"]
with open(os.path.join(eval_file_path,f"{dataset}/{dataset}_{split}.json"), 'r') as f:
refcoco = json.load(f)
data = RefCOCOEvalData(refcoco, vis_processor, img_path)
eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
minigpt4_predict = defaultdict(list)
resamples = []
for images, questions, img_ids in tqdm(eval_dataloader):
            texts = prepare_texts(questions, conv_temp)  # wrap the texts with conversation template
answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
for answer, img_id, question in zip(answers, img_ids, questions):
answer = answer.replace("<unk>","").replace(" ","").strip()
pattern = r'\{<\d{1,3}><\d{1,3}><\d{1,3}><\d{1,3}>\}'
if re.match(pattern, answer):
minigpt4_predict[img_id].append(answer)
else:
resamples.append({'img_id': img_id, 'sents': [question.replace('[refer] give me the location of','').strip()]})
if args.resample:
for i in range(20):
data = RefCOCOEvalData(resamples, vis_processor, img_path)
resamples = []
eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
for images, questions, img_ids in tqdm(eval_dataloader):
                    texts = prepare_texts(questions, conv_temp)  # wrap the texts with conversation template
answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
for answer, img_id, question in zip(answers, img_ids, questions):
answer = answer.replace("<unk>","").replace(" ","").strip()
pattern = r'\{<\d{1,3}><\d{1,3}><\d{1,3}><\d{1,3}>\}'
if re.match(pattern, answer) or i == 4:
minigpt4_predict[img_id].append(answer)
else:
resamples.append({'img_id': img_id, 'sents': [question.replace('[refer] give me the location of','').strip()]})
if len(resamples) == 0:
break
        file_save_path = os.path.join(save_path, f"{dataset}_{split}.json")
with open(file_save_path,'w') as f:
json.dump(minigpt4_predict, f)
count=0
total=len(refcoco)
res=args.res
refcoco_dict = defaultdict()
for item in refcoco:
refcoco_dict[item['img_id']] = item
for img_id in refcoco_dict:
item = refcoco_dict[img_id]
bbox = item['bbox']
outputs = minigpt4_predict[img_id]
for output in outputs:
try:
integers = re.findall(r'\d+', output)
pred_bbox = [int(num) for num in integers]
height = item['height']
width = item['width']
pred_bbox[0] = pred_bbox[0] / res * width
pred_bbox[1] = pred_bbox[1] / res * height
pred_bbox[2] = pred_bbox[2] / res * width
pred_bbox[3] = pred_bbox[3] / res * height
gt_bbox = [0,0,0,0]
gt_bbox[0] = bbox[0]
gt_bbox[1] = bbox[1]
gt_bbox[2] = bbox[0] + bbox[2]
gt_bbox[3] = bbox[1] + bbox[3]
iou_score = computeIoU(pred_bbox, gt_bbox)
if iou_score > 0.5:
count+=1
except:
continue
print(f'{dataset} {split}:', count / total * 100, flush=True)

eval_scripts/eval_vqa.py (new file, 252 lines)

@ -0,0 +1,252 @@
import os
import re
import json
import argparse
from collections import defaultdict
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from minigpt4.datasets.datasets.vqa_datasets import OKVQAEvalData,VizWizEvalData,IconQAEvalData,GQAEvalData,VSREvalData,HMEvalData
from minigpt4.common.vqa_tools.VQA.PythonHelperTools.vqaTools.vqa import VQA
from minigpt4.common.vqa_tools.VQA.PythonEvaluationTools.vqaEvaluation.vqaEval import VQAEval
from minigpt4.common.eval_utils import prepare_texts, init_model, eval_parser
from minigpt4.conversation.conversation import CONV_VISION_minigptv2
from minigpt4.common.config import Config
def list_of_str(arg):
return list(map(str, arg.split(',')))
parser = eval_parser()
parser.add_argument("--dataset", type=list_of_str, default='refcoco', help="dataset to evaluate")
args = parser.parse_args()
cfg = Config(args)
model, vis_processor = init_model(args)
conv_temp = CONV_VISION_minigptv2.copy()
conv_temp.system = ""
model.eval()
save_path = cfg.run_cfg.save_path
if 'okvqa' in args.dataset:
eval_file_path = cfg.evaluation_datasets_cfg["okvqa"]["eval_file_path"]
img_path = cfg.evaluation_datasets_cfg["okvqa"]["img_path"]
batch_size = cfg.evaluation_datasets_cfg["okvqa"]["batch_size"]
max_new_tokens = cfg.evaluation_datasets_cfg["okvqa"]["max_new_tokens"]
    evaluation_annotation_path = os.path.join(eval_file_path, "okvqa_test_split.json")
    with open(evaluation_annotation_path) as f:
ok_vqa_test_split = json.load(f)
data = OKVQAEvalData(ok_vqa_test_split, vis_processor, img_path)
eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
minigpt4_predict = []
for images, questions, question_ids, img_ids in eval_dataloader:
        texts = prepare_texts(questions, conv_temp)  # wrap the texts with conversation template
answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
for answer, question_id, question, img_id in zip(answers, question_ids, questions, img_ids):
result = dict()
answer = answer.lower().replace('<unk>','').strip()
result['answer'] = answer
result['question_id'] = int(question_id)
minigpt4_predict.append(result)
file_save_path= os.path.join(save_path,"okvqa.json")
with open(file_save_path,'w') as f:
json.dump(minigpt4_predict, f)
annFile = os.path.join(eval_file_path,"mscoco_val2014_annotations_clean.json")
quesFile = os.path.join(eval_file_path,"OpenEnded_mscoco_val2014_questions_clean.json" )
vqa = VQA(annFile, quesFile)
vqaRes = vqa.loadRes(file_save_path, quesFile)
vqaEval = VQAEval(vqa, vqaRes, n=2)
vqaEval.evaluate()
print ("Overall OKVQA Accuracy is: %.02f\n" %(vqaEval.accuracy['overall']), flush=True)
if 'vizwiz' in args.dataset:
eval_file_path = cfg.evaluation_datasets_cfg["vizwiz"]["eval_file_path"]
img_path = cfg.evaluation_datasets_cfg["vizwiz"]["img_path"]
batch_size = cfg.evaluation_datasets_cfg["vizwiz"]["batch_size"]
max_new_tokens = cfg.evaluation_datasets_cfg["vizwiz"]["max_new_tokens"]
vizwiz = json.load(open(eval_file_path, 'r'))
data = VizWizEvalData(vizwiz, vis_processor, img_path)
eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
minigpt4_predict = []
total_acc = []
for images, texts, gt_answers in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
with torch.no_grad():
answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False,repetition_penalty=1.0)
for answer, gt_answer in zip(answers, gt_answers):
result = dict()
result['answer'] = answer.replace('<unk>','').strip()
minigpt4_predict.append(result)
count=0
gt_answer = gt_answer.split('_')
for gt in gt_answer:
if gt.lower() == answer.lower():
count += 1
acc = min(count/3.0, 1.0)
total_acc.append(acc)
file_save_path = os.path.join(save_path, "vizwiz.json")
with open(file_save_path,'w') as f:
json.dump(minigpt4_predict, f)
print('vizwiz Acc: ', np.average(total_acc)* 100.0, flush=True)
if 'iconvqa' in args.dataset:
eval_file_path = cfg.evaluation_datasets_cfg["iconvqa"]["eval_file_path"]
img_path = cfg.evaluation_datasets_cfg["iconvqa"]["img_path"]
batch_size = cfg.evaluation_datasets_cfg["iconvqa"]["batch_size"]
max_new_tokens = cfg.evaluation_datasets_cfg["iconvqa"]["max_new_tokens"]
iconqa_text_val = json.load(open(eval_file_path,"r"))
data = IconQAEvalData(iconqa_text_val, vis_processor, img_path)
eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
count = 0
for images, texts, candidates, answers in tqdm(eval_dataloader):
candidates = [candidate.split('_') for candidate in candidates]
num_cand = [len(candidate) for candidate in candidates]
for candidate in candidates:
candidate.extend(['none'] * (max(num_cand) - len(candidate)))
candidates = [list(x) for x in zip(*candidates)]
instructions = ["<s>[INST] <Img><ImageHere></Img> {} [/INST]".format(text) for text in texts]
answer_ranks = model.multi_select(images, instructions, candidates, num_cand=num_cand)
for idx, answer in enumerate(answers):
if answer_ranks[idx][0] == answer:
count += 1
print('iconqa Acc: ', count / len(iconqa_text_val) * 100.0, flush=True)
if 'gqa' in args.dataset:
eval_file_path = cfg.evaluation_datasets_cfg["gqa"]["eval_file_path"]
img_path = cfg.evaluation_datasets_cfg["gqa"]["img_path"]
batch_size = cfg.evaluation_datasets_cfg["gqa"]["batch_size"]
max_new_tokens = cfg.evaluation_datasets_cfg["gqa"]["max_new_tokens"]
gqa = json.load(open(eval_file_path))
data = GQAEvalData(gqa, vis_processor, img_path)
eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
count=0
total=0
minigpt4_predict = []
for images, texts, labels in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
for answer, label in zip(answers, labels):
result = dict()
result['pred'] = answer.lower().replace('<unk>','').strip()
result['gt'] = label
minigpt4_predict.append(result)
if answer.lower() == label:
count+=1
total+=1
print('gqa val:', count / total * 100, flush=True)
file_save_path = os.path.join(save_path, "gqa.json")
with open(file_save_path,'w') as f:
json.dump(minigpt4_predict, f)
if 'vsr' in args.dataset:
img_path = cfg.evaluation_datasets_cfg["vsr"]["img_path"]
batch_size = cfg.evaluation_datasets_cfg["vsr"]["batch_size"]
max_new_tokens = cfg.evaluation_datasets_cfg["vsr"]["max_new_tokens"]
annotation = load_dataset("cambridgeltl/vsr_zeroshot", split='test')
data = VSREvalData(annotation, vis_processor, img_path)
eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
count=0
total=0
minigpt4_predict = []
for images, texts, labels in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
for answer, label in zip(answers, labels):
result = dict()
result['pred'] = answer.replace('<unk>','').strip()
result['gt'] = label
minigpt4_predict.append(result)
if answer.lower() == label.lower():
count+=1
total+=1
print('vsr test:', count / total * 100, flush=True)
file_save_path = os.path.join(save_path,"vsr.json")
with open(file_save_path,'w') as f:
json.dump(minigpt4_predict, f)
if 'hm' in args.dataset:
eval_file_path = cfg.evaluation_datasets_cfg["hm"]["eval_file_path"]
img_path = cfg.evaluation_datasets_cfg["hm"]["img_path"]
batch_size = cfg.evaluation_datasets_cfg["hm"]["batch_size"]
max_new_tokens = cfg.evaluation_datasets_cfg["hm"]["max_new_tokens"]
annotation = []
with open(eval_file_path, 'r') as jsonl_file:
for line in jsonl_file:
json_obj = json.loads(line)
annotation.append(json_obj)
data = HMEvalData(annotation, vis_processor, img_path)
eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
count=0
total=0
minigpt4_predict = []
for images, texts, labels in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
for answer, label in zip(answers, labels):
result = dict()
if answer.lower().strip() =="yes":
answer=1
elif answer.lower().strip()=="no":
answer=0
else:
print("non-matching answer",answer)
result['pred'] = answer
result['gt'] = int(label)
minigpt4_predict.append(result)
if answer == label:
count+=1
total+=1
print('hm val:', count / total * 100, flush=True)
file_save_path = os.path.join(save_path, "hm.json")
with open(file_save_path,'w') as f:
json.dump(minigpt4_predict, f)


@ -29,6 +29,7 @@ class Config:
runner_config = self.build_runner_config(config)
model_config = self.build_model_config(config, **user_config)
dataset_config = self.build_dataset_config(config)
evaluation_dataset_config = self.build_evaluation_dataset_config(config)
# Validate the user-provided runner configuration
# model and dataset configuration are supposed to be validated by the respective classes
@ -37,7 +38,7 @@ class Config:
# Override the default configuration with user options.
self.config = OmegaConf.merge(
runner_config, model_config, dataset_config, user_config
            runner_config, model_config, dataset_config, evaluation_dataset_config, user_config
)
def _validate_runner_config(self, runner_config):
@ -111,6 +112,28 @@ class Config:
return dataset_config
@staticmethod
def build_evaluation_dataset_config(config):
datasets = config.get("evaluation_datasets", None)
if datasets is None:
raise KeyError(
"Expecting 'datasets' as the root key for dataset configuration."
)
dataset_config = OmegaConf.create()
for dataset_name in datasets:
builder_cls = registry.get_builder_class(dataset_name)
# hierarchy override, customized config > default config
dataset_config = OmegaConf.merge(
dataset_config,
{"evaluation_datasets": {dataset_name: config["evaluation_datasets"][dataset_name]}},
)
return dataset_config
def _convert_to_dot_list(self, opts):
if opts is None:
opts = []
@ -136,6 +159,10 @@ class Config:
def datasets_cfg(self):
return self.config.datasets
@property
def evaluation_datasets_cfg(self):
return self.config.evaluation_datasets
@property
def model_cfg(self):
return self.config.model


@ -0,0 +1,76 @@
import argparse
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from minigpt4.common.registry import registry
from minigpt4.common.config import Config
# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *
def eval_parser():
parser = argparse.ArgumentParser(description="Demo")
parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
parser.add_argument("--name", type=str, default='A2', help="evaluation name")
parser.add_argument("--ckpt", type=str, help="path to configuration file.")
parser.add_argument("--eval_opt", type=str, default='all', help="path to configuration file.")
parser.add_argument("--max_new_tokens", type=int, default=10, help="max number of generated tokens")
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--lora_r", type=int, default=64, help="lora rank of the model")
parser.add_argument("--lora_alpha", type=int, default=16, help="lora alpha")
parser.add_argument(
"--options",
nargs="+",
help="override some settings in the used config, the key-value pair "
"in xxx=yyy format will be merged into config file (deprecate), "
"change to --cfg-options instead.",
)
return parser
def prepare_texts(texts, conv_temp):
convs = [conv_temp.copy() for _ in range(len(texts))]
[conv.append_message(
conv.roles[0], '<Img><ImageHere></Img> {}'.format(text)) for conv, text in zip(convs, texts)]
[conv.append_message(conv.roles[1], None) for conv in convs]
texts = [conv.get_prompt() for conv in convs]
return texts
def init_model(args):
print('Initialization Model')
cfg = Config(args)
# cfg.model_cfg.ckpt = args.ckpt
# cfg.model_cfg.lora_r = args.lora_r
# cfg.model_cfg.lora_alpha = args.lora_alpha
model_config = cfg.model_cfg
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to('cuda:0')
# import pudb; pudb.set_trace()
key = list(cfg.datasets_cfg.keys())[0]
vis_processor_cfg = cfg.datasets_cfg.get(key).vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
print('Initialization Finished')
return model, vis_processor
def computeIoU(bbox1, bbox2):
x1, y1, x2, y2 = bbox1
x3, y3, x4, y4 = bbox2
intersection_x1 = max(x1, x3)
intersection_y1 = max(y1, y3)
intersection_x2 = min(x2, x4)
intersection_y2 = min(y2, y4)
intersection_area = max(0, intersection_x2 - intersection_x1 + 1) * max(0, intersection_y2 - intersection_y1 + 1)
bbox1_area = (x2 - x1 + 1) * (y2 - y1 + 1)
bbox2_area = (x4 - x3 + 1) * (y4 - y3 + 1)
union_area = bbox1_area + bbox2_area - intersection_area
iou = intersection_area / union_area
return iou


@ -0,0 +1,89 @@
# coding: utf-8
import sys
dataDir = '../../VQA'
sys.path.insert(0, '%s/PythonHelperTools/vqaTools' %(dataDir))
from vqa import VQA
from vqaEvaluation.vqaEval import VQAEval
import matplotlib.pyplot as plt
import skimage.io as io
import json
import random
import os
# set up file names and paths
versionType ='v2_' # this should be '' when using VQA v2.0 dataset
taskType ='OpenEnded' # 'OpenEnded' only for v2.0. 'OpenEnded' or 'MultipleChoice' for v1.0
dataType ='mscoco' # 'mscoco' only for v1.0. 'mscoco' for real and 'abstract_v002' for abstract for v1.0.
dataSubType ='train2014'
annFile ='%s/Annotations/%s%s_%s_annotations.json'%(dataDir, versionType, dataType, dataSubType)
quesFile ='%s/Questions/%s%s_%s_%s_questions.json'%(dataDir, versionType, taskType, dataType, dataSubType)
imgDir ='%s/Images/%s/%s/' %(dataDir, dataType, dataSubType)
resultType ='fake'
fileTypes = ['results', 'accuracy', 'evalQA', 'evalQuesType', 'evalAnsType']
# An example result json file has been provided in './Results' folder.
[resFile, accuracyFile, evalQAFile, evalQuesTypeFile, evalAnsTypeFile] = ['%s/Results/%s%s_%s_%s_%s_%s.json'%(dataDir, versionType, taskType, dataType, dataSubType, \
resultType, fileType) for fileType in fileTypes]
# create vqa object and vqaRes object
vqa = VQA(annFile, quesFile)
vqaRes = vqa.loadRes(resFile, quesFile)
# create vqaEval object by taking vqa and vqaRes
vqaEval = VQAEval(vqa, vqaRes, n=2) #n is precision of accuracy (number of places after decimal), default is 2
# evaluate results
"""
If you have a list of question ids on which you would like to evaluate your results, pass it as a list to below function
By default it uses all the question ids in annotation file
"""
vqaEval.evaluate()
# print accuracies
print "\n"
print "Overall Accuracy is: %.02f\n" %(vqaEval.accuracy['overall'])
print "Per Question Type Accuracy is the following:"
for quesType in vqaEval.accuracy['perQuestionType']:
print "%s : %.02f" %(quesType, vqaEval.accuracy['perQuestionType'][quesType])
print "\n"
print "Per Answer Type Accuracy is the following:"
for ansType in vqaEval.accuracy['perAnswerType']:
print "%s : %.02f" %(ansType, vqaEval.accuracy['perAnswerType'][ansType])
print "\n"
# demo how to use evalQA to retrieve low score result
evals = [quesId for quesId in vqaEval.evalQA if vqaEval.evalQA[quesId]<35] #35 is per question percentage accuracy
if len(evals) > 0:
print 'ground truth answers'
randomEval = random.choice(evals)
randomAnn = vqa.loadQA(randomEval)
vqa.showQA(randomAnn)
print '\n'
print 'generated answer (accuracy %.02f)'%(vqaEval.evalQA[randomEval])
ann = vqaRes.loadQA(randomEval)[0]
print "Answer: %s\n" %(ann['answer'])
imgId = randomAnn[0]['image_id']
imgFilename = 'COCO_' + dataSubType + '_'+ str(imgId).zfill(12) + '.jpg'
if os.path.isfile(imgDir + imgFilename):
I = io.imread(imgDir + imgFilename)
plt.imshow(I)
plt.axis('off')
plt.show()
# plot accuracy for various question types
plt.bar(range(len(vqaEval.accuracy['perQuestionType'])), vqaEval.accuracy['perQuestionType'].values(), align='center')
plt.xticks(range(len(vqaEval.accuracy['perQuestionType'])), vqaEval.accuracy['perQuestionType'].keys(), rotation='0',fontsize=10)
plt.title('Per Question Type Accuracy', fontsize=10)
plt.xlabel('Question Types', fontsize=10)
plt.ylabel('Accuracy', fontsize=10)
plt.show()
# save evaluation results to ./Results folder
json.dump(vqaEval.accuracy, open(accuracyFile, 'w'))
json.dump(vqaEval.evalQA, open(evalQAFile, 'w'))
json.dump(vqaEval.evalQuesType, open(evalQuesTypeFile, 'w'))
json.dump(vqaEval.evalAnsType, open(evalAnsTypeFile, 'w'))


@ -0,0 +1 @@
author='aagrawal'


@ -0,0 +1,192 @@
# coding=utf-8
__author__='aagrawal'
import re
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).
import sys
class VQAEval:
def __init__(self, vqa, vqaRes, n=2):
self.n = n
self.accuracy = {}
self.evalQA = {}
self.evalQuesType = {}
self.evalAnsType = {}
self.vqa = vqa
self.vqaRes = vqaRes
self.params = {'question_id': vqa.getQuesIds()}
self.contractions = {"aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't", \
"couldn'tve": "couldn't've", "couldnt've": "couldn't've", "didnt": "didn't", "doesnt": "doesn't", "dont": "don't", "hadnt": "hadn't", \
"hadnt've": "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent": "haven't", "hed": "he'd", "hed've": "he'd've", \
"he'dve": "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll", "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've", \
"Im": "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've": "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's", \
"maam": "ma'am", "mightnt": "mightn't", "mightnt've": "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've", \
"mustnt": "mustn't", "mustve": "must've", "neednt": "needn't", "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't", \
"ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat": "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve": "she'd've", \
"she's": "she's", "shouldve": "should've", "shouldnt": "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve": "shouldn't've", \
"somebody'd": "somebodyd", "somebodyd've": "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll": "somebody'll", \
"somebodys": "somebody's", "someoned": "someone'd", "someoned've": "someone'd've", "someone'dve": "someone'd've", \
"someonell": "someone'll", "someones": "someone's", "somethingd": "something'd", "somethingd've": "something'd've", \
"something'dve": "something'd've", "somethingll": "something'll", "thats": "that's", "thered": "there'd", "thered've": "there'd've", \
"there'dve": "there'd've", "therere": "there're", "theres": "there's", "theyd": "they'd", "theyd've": "they'd've", \
"they'dve": "they'd've", "theyll": "they'll", "theyre": "they're", "theyve": "they've", "twas": "'twas", "wasnt": "wasn't", \
"wed've": "we'd've", "we'dve": "we'd've", "weve": "we've", "werent": "weren't", "whatll": "what'll", "whatre": "what're", \
"whats": "what's", "whatve": "what've", "whens": "when's", "whered": "where'd", "wheres": "where's", "whereve": "where've", \
"whod": "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl": "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll", \
"whyre": "why're", "whys": "why's", "wont": "won't", "wouldve": "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've", \
"wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've", \
"y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd": "you'd", "youd've": "you'd've", "you'dve": "you'd've", \
"youll": "you'll", "youre": "you're", "youve": "you've"}
self.manualMap = { 'none': '0',
'zero': '0',
'one': '1',
'two': '2',
'three': '3',
'four': '4',
'five': '5',
'six': '6',
'seven': '7',
'eight': '8',
'nine': '9',
'ten': '10'
}
self.articles = ['a',
'an',
'the'
]
self.periodStrip = re.compile("(?!<=\d)(\.)(?!\d)")
self.commaStrip = re.compile("(\d)(\,)(\d)")
self.punct = [';', r"/", '[', ']', '"', '{', '}',
'(', ')', '=', '+', '\\', '_', '-',
'>', '<', '@', '`', ',', '?', '!']
def evaluate(self, quesIds=None):
if quesIds == None:
quesIds = [quesId for quesId in self.params['question_id']]
gts = {}
res = {}
for quesId in quesIds:
gts[quesId] = self.vqa.qa[quesId]
res[quesId] = self.vqaRes.qa[quesId]
# =================================================
# Compute accuracy
# =================================================
accQA = []
accQuesType = {}
accAnsType = {}
# print "computing accuracy"
step = 0
for quesId in quesIds:
for ansDic in gts[quesId]['answers']:
ansDic['answer'] = ansDic['answer'].replace('\n', ' ')
ansDic['answer'] = ansDic['answer'].replace('\t', ' ')
ansDic['answer'] = ansDic['answer'].strip()
resAns = res[quesId]['answer']
resAns = resAns.replace('\n', ' ')
resAns = resAns.replace('\t', ' ')
resAns = resAns.strip()
gtAcc = []
gtAnswers = [ans['answer'] for ans in gts[quesId]['answers']]
if len(set(gtAnswers)) > 1:
for ansDic in gts[quesId]['answers']:
ansDic['answer'] = self.processPunctuation(ansDic['answer'])
ansDic['answer'] = self.processDigitArticle(ansDic['answer'])
resAns = self.processPunctuation(resAns)
resAns = self.processDigitArticle(resAns)
for gtAnsDatum in gts[quesId]['answers']:
otherGTAns = [item for item in gts[quesId]['answers'] if item!=gtAnsDatum]
matchingAns = [item for item in otherGTAns if item['answer'].lower()==resAns.lower()]
acc = min(1, float(len(matchingAns))/3)
gtAcc.append(acc)
quesType = gts[quesId]['question_type']
ansType = gts[quesId]['answer_type']
avgGTAcc = float(sum(gtAcc))/len(gtAcc)
accQA.append(avgGTAcc)
if quesType not in accQuesType:
accQuesType[quesType] = []
accQuesType[quesType].append(avgGTAcc)
if ansType not in accAnsType:
accAnsType[ansType] = []
accAnsType[ansType].append(avgGTAcc)
self.setEvalQA(quesId, avgGTAcc)
self.setEvalQuesType(quesId, quesType, avgGTAcc)
self.setEvalAnsType(quesId, ansType, avgGTAcc)
if step%100 == 0:
self.updateProgress(step/float(len(quesIds)))
step = step + 1
self.setAccuracy(accQA, accQuesType, accAnsType)
# print "Done computing accuracy"
def processPunctuation(self, inText):
outText = inText
for p in self.punct:
if (p + ' ' in inText or ' ' + p in inText) or (re.search(self.commaStrip, inText) != None):
outText = outText.replace(p, '')
else:
outText = outText.replace(p, ' ')
outText = self.periodStrip.sub("",
outText,
re.UNICODE)
return outText
def processDigitArticle(self, inText):
outText = []
tempText = inText.lower().split()
for word in tempText:
word = self.manualMap.setdefault(word, word)
if word not in self.articles:
outText.append(word)
else:
pass
for wordId, word in enumerate(outText):
if word in self.contractions:
outText[wordId] = self.contractions[word]
outText = ' '.join(outText)
return outText
def setAccuracy(self, accQA, accQuesType, accAnsType):
self.accuracy['overall'] = round(100*float(sum(accQA))/len(accQA), self.n)
self.accuracy['perQuestionType'] = {quesType: round(100*float(sum(accQuesType[quesType]))/len(accQuesType[quesType]), self.n) for quesType in accQuesType}
self.accuracy['perAnswerType'] = {ansType: round(100*float(sum(accAnsType[ansType]))/len(accAnsType[ansType]), self.n) for ansType in accAnsType}
def setEvalQA(self, quesId, acc):
self.evalQA[quesId] = round(100*acc, self.n)
def setEvalQuesType(self, quesId, quesType, acc):
if quesType not in self.evalQuesType:
self.evalQuesType[quesType] = {}
self.evalQuesType[quesType][quesId] = round(100*acc, self.n)
def setEvalAnsType(self, quesId, ansType, acc):
if ansType not in self.evalAnsType:
self.evalAnsType[ansType] = {}
self.evalAnsType[ansType][quesId] = round(100*acc, self.n)
def updateProgress(self, progress):
barLength = 20
status = ""
if isinstance(progress, int):
progress = float(progress)
if not isinstance(progress, float):
progress = 0
status = "error: progress var must be float\r\n"
if progress < 0:
progress = 0
status = "Halt...\r\n"
if progress >= 1:
progress = 1
status = "Done...\r\n"
block = int(round(barLength*progress))
text = "\rFinshed Percent: [{0}] {1}% {2}".format( "#"*block + "-"*(barLength-block), int(progress*100), status)
sys.stdout.write(text)
sys.stdout.flush()


@ -0,0 +1,73 @@
# coding: utf-8
from vqaTools.vqa import VQA
import random
import skimage.io as io
import matplotlib.pyplot as plt
import os
dataDir ='../../VQA'
versionType ='v2_' # this should be '' when using VQA v2.0 dataset
taskType ='OpenEnded' # 'OpenEnded' only for v2.0. 'OpenEnded' or 'MultipleChoice' for v1.0
dataType ='mscoco' # 'mscoco' only for v1.0. 'mscoco' for real and 'abstract_v002' for abstract for v1.0.
dataSubType ='train2014'
annFile ='%s/Annotations/%s%s_%s_annotations.json'%(dataDir, versionType, dataType, dataSubType)
quesFile ='%s/Questions/%s%s_%s_%s_questions.json'%(dataDir, versionType, taskType, dataType, dataSubType)
imgDir = '%s/Images/%s/%s/' %(dataDir, dataType, dataSubType)
# initialize VQA api for QA annotations
vqa=VQA(annFile, quesFile)
# load and display QA annotations for given question types
"""
All possible quesTypes for abstract and mscoco has been provided in respective text files in ../QuestionTypes/ folder.
"""
annIds = vqa.getQuesIds(quesTypes='how many');
anns = vqa.loadQA(annIds)
randomAnn = random.choice(anns)
vqa.showQA([randomAnn])
imgId = randomAnn['image_id']
imgFilename = 'COCO_' + dataSubType + '_'+ str(imgId).zfill(12) + '.jpg'
if os.path.isfile(imgDir + imgFilename):
I = io.imread(imgDir + imgFilename)
plt.imshow(I)
plt.axis('off')
plt.show()
# load and display QA annotations for given answer types
"""
ansTypes can be one of the following
yes/no
number
other
"""
annIds = vqa.getQuesIds(ansTypes='yes/no');
anns = vqa.loadQA(annIds)
randomAnn = random.choice(anns)
vqa.showQA([randomAnn])
imgId = randomAnn['image_id']
imgFilename = 'COCO_' + dataSubType + '_'+ str(imgId).zfill(12) + '.jpg'
if os.path.isfile(imgDir + imgFilename):
I = io.imread(imgDir + imgFilename)
plt.imshow(I)
plt.axis('off')
plt.show()
# load and display QA annotations for given images
"""
Usage: vqa.getImgIds(quesIds=[], quesTypes=[], ansTypes=[])
Above method can be used to retrieve imageIds for given question Ids or given question types or given answer types.
"""
ids = vqa.getImgIds()
annIds = vqa.getQuesIds(imgIds=random.sample(ids,5));
anns = vqa.loadQA(annIds)
randomAnn = random.choice(anns)
vqa.showQA([randomAnn])
imgId = randomAnn['image_id']
imgFilename = 'COCO_' + dataSubType + '_'+ str(imgId).zfill(12) + '.jpg'
if os.path.isfile(imgDir + imgFilename):
I = io.imread(imgDir + imgFilename)
plt.imshow(I)
plt.axis('off')
plt.show()


@ -0,0 +1 @@
__author__ = 'aagrawal'


@ -0,0 +1,179 @@
__author__ = 'aagrawal'
__version__ = '0.9'
# Interface for accessing the VQA dataset.
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).
# The following functions are defined:
# VQA - VQA class that loads VQA annotation file and prepares data structures.
# getQuesIds - Get question ids that satisfy given filter conditions.
# getImgIds - Get image ids that satisfy given filter conditions.
# loadQA - Load questions and answers with the specified question ids.
# showQA - Display the specified questions and answers.
# loadRes - Load result file and create result object.
# Help on each function can be accessed by: "help(COCO.function)"
import json
import datetime
import copy
class VQA:
def __init__(self, annotation_file=None, question_file=None):
"""
Constructor of VQA helper class for reading and visualizing questions and answers.
:param annotation_file (str): location of VQA annotation file
:return:
"""
# load dataset
self.dataset = {}
self.questions = {}
self.qa = {}
self.qqa = {}
self.imgToQA = {}
if not annotation_file == None and not question_file == None:
# print 'loading VQA annotations and questions into memory...'
time_t = datetime.datetime.utcnow()
dataset = json.load(open(annotation_file, 'r'))
questions = json.load(open(question_file, 'r'))
# print datetime.datetime.utcnow() - time_t
self.dataset = dataset
self.questions = questions
self.createIndex()
def createIndex(self):
imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
for ann in self.dataset['annotations']:
imgToQA[ann['image_id']] += [ann]
qa[ann['question_id']] = ann
for ques in self.questions['questions']:
qqa[ques['question_id']] = ques
# print 'index created!'
# create class members
self.qa = qa
self.qqa = qqa
self.imgToQA = imgToQA
def info(self):
"""
Print information about the VQA annotation file.
:return:
"""
# for key, value in self.datset['info'].items():
# print '%s: %s'%(key, value)
def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
"""
Get question ids that satisfy given filter conditions. default skips that filter
:param imgIds (int array) : get question ids for given imgs
quesTypes (str array) : get question ids for given question types
ansTypes (str array) : get question ids for given answer types
:return: ids (int array) : integer array of question ids
"""
imgIds = imgIds if type(imgIds) == list else [imgIds]
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
anns = self.dataset['annotations']
else:
if not len(imgIds) == 0:
anns = sum([self.imgToQA[imgId] for imgId in imgIds if imgId in self.imgToQA], [])
else:
anns = self.dataset['annotations']
anns = anns if len(quesTypes) == 0 else [ann for ann in anns if ann['question_type'] in quesTypes]
anns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann['answer_type'] in ansTypes]
ids = [ann['question_id'] for ann in anns]
return ids
def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
"""
Get image ids that satisfy given filter conditions. default skips that filter
:param quesIds (int array) : get image ids for given question ids
quesTypes (str array) : get image ids for given question types
ansTypes (str array) : get image ids for given answer types
:return: ids (int array) : integer array of image ids
"""
quesIds = quesIds if type(quesIds) == list else [quesIds]
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
anns = self.dataset['annotations']
else:
if not len(quesIds) == 0:
anns = sum([self.qa[quesId] for quesId in quesIds if quesId in self.qa], [])
else:
anns = self.dataset['annotations']
anns = anns if len(quesTypes) == 0 else [ann for ann in anns if ann['question_type'] in quesTypes]
anns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann['answer_type'] in ansTypes]
ids = [ann['image_id'] for ann in anns]
return ids
def loadQA(self, ids=[]):
"""
Load questions and answers with the specified question ids.
:param ids (int array) : integer ids specifying question ids
:return: qa (object array) : loaded qa objects
"""
if type(ids) == list:
return [self.qa[id] for id in ids]
elif type(ids) == int:
return [self.qa[ids]]
def showQA(self, anns):
"""
Display the specified annotations.
:param anns (array of object): annotations to display
:return: None
"""
if len(anns) == 0:
return 0
for ann in anns:
quesId = ann['question_id']
print("Question: %s" % (self.qqa[quesId]['question']))
for ans in ann['answers']:
print("Answer %d: %s" % (ans['answer_id'], ans['answer']))
def loadRes(self, resFile, quesFile):
"""
Load result file and return a result object.
:param resFile (str) : file name of result file
:return: res (obj) : result api object
"""
res = VQA()
res.questions = json.load(open(quesFile))
res.dataset['info'] = copy.deepcopy(self.questions['info'])
res.dataset['task_type'] = copy.deepcopy(self.questions['task_type'])
res.dataset['data_type'] = copy.deepcopy(self.questions['data_type'])
res.dataset['data_subtype'] = copy.deepcopy(self.questions['data_subtype'])
res.dataset['license'] = copy.deepcopy(self.questions['license'])
# print 'Loading and preparing results... '
time_t = datetime.datetime.utcnow()
anns = json.load(open(resFile))
assert type(anns) == list, 'results is not an array of objects'
annsQuesIds = [ann['question_id'] for ann in anns]
assert set(annsQuesIds) == set(self.getQuesIds()), \
'Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file.'
for ann in anns:
quesId = ann['question_id']
if res.dataset['task_type'] == 'Multiple Choice':
assert ann['answer'] in self.qqa[quesId][
'multiple_choices'], 'predicted answer is not one of the multiple choices'
qaAnn = self.qa[quesId]
ann['image_id'] = qaAnn['image_id']
ann['question_type'] = qaAnn['question_type']
ann['answer_type'] = qaAnn['answer_type']
# print 'DONE (t=%0.2fs)'%((datetime.datetime.utcnow() - time_t).total_seconds())
res.dataset['annotations'] = anns
res.createIndex()
return res


@ -0,0 +1,81 @@
how many
what color is the
is the
where is the
what
what is
are the
what is the
is there a
does the
is the woman
is the man
what is on the
is it
is the girl
is the boy
is the dog
are they
who is
what kind of
what color are the
what is in the
what is the man
is there
what is the woman
what are the
what is the boy
are there
what is the girl
is this
how
which
how many people are
is the cat
why is the
are
will the
what type of
what is the dog
do
is she
does
do the
is
is the baby
are there any
is the lady
can
what animal is
where are the
is the sun
what are they
did the
what is the cat
what is the lady
how many clouds are
is that
is the little girl
is he
are these
how many trees are
how many pillows
are the people
why
is the young
how many windows are
is this a
what is the little
is the tv
how many animals are
who
how many pictures
how many plants are
how many birds are
what color is
what is the baby
is anyone
what color
how many bushes
is the old man
none of the above


@ -0,0 +1,65 @@
how many
is the
what
what color is the
what is the
is this
is this a
what is
are the
what kind of
is there a
what type of
is it
what are the
where is the
is there
does the
what color are the
are these
are there
which
is
what is the man
is the man
are
how
does this
what is on the
what does the
how many people are
what is in the
what is this
do
what are
are they
what time
what sport is
are there any
is he
what color is
why
where are the
what color
who is
what animal is
is the woman
is this an
do you
how many people are in
what room is
has
is this person
what is the woman
can you
why is the
is the person
what is the color of the
what is the person
could
was
is that a
what number is
what is the name
what brand
none of the above


@ -0,0 +1,80 @@
Python API and Evaluation Code for v2.0 and v1.0 releases of the VQA dataset.
===================
## VQA v2.0 release ##
This release consists of
- Real
    - 82,783 MS COCO training images, 40,504 MS COCO validation images and 81,434 MS COCO testing images (images are obtained from [MS COCO website](http://mscoco.org/dataset/#download))
- 443,757 questions for training, 214,354 questions for validation and 447,793 questions for testing
- 4,437,570 answers for training and 2,143,540 answers for validation (10 per question)
There is only one type of task
- Open-ended task
## VQA v1.0 release ##
This release consists of
- Real
    - 82,783 MS COCO training images, 40,504 MS COCO validation images and 81,434 MS COCO testing images (images are obtained from [MS COCO website](http://mscoco.org/dataset/#download))
- 248,349 questions for training, 121,512 questions for validation and 244,302 questions for testing (3 per image)
- 2,483,490 answers for training and 1,215,120 answers for validation (10 per question)
- Abstract
- 20,000 training images, 10,000 validation images and 20,000 MS COCO testing images
- 60,000 questions for training, 30,000 questions for validation and 60,000 questions for testing (3 per image)
- 600,000 answers for training and 300,000 answers for validation (10 per question)
There are two types of tasks
- Open-ended task
- Multiple-choice task (18 choices per question)
## Requirements ##
- python 2.7
- scikit-image (visit [this page](http://scikit-image.org/docs/dev/install.html) for installation)
- matplotlib (visit [this page](http://matplotlib.org/users/installing.html) for installation)
## Files ##
./Questions
- For v2.0, download the question files from the [VQA download page](http://www.visualqa.org/download.html), extract them and place in this folder.
- For v1.0, both real and abstract, question files can be found on the [VQA v1 download page](http://www.visualqa.org/vqa_v1_download.html).
- Question files from Beta v0.9 release (123,287 MSCOCO train and val images, 369,861 questions, 3,698,610 answers) can be found below
- [training question files](http://visualqa.org/data/mscoco/prev_rel/Beta_v0.9/Questions_Train_mscoco.zip)
- [validation question files](http://visualqa.org/data/mscoco/prev_rel/Beta_v0.9/Questions_Val_mscoco.zip)
- Question files from Beta v0.1 release (10k MSCOCO images, 30k questions, 300k answers) can be found [here](http://visualqa.org/data/mscoco/prev_rel/Beta_v0.1/Questions_Train_mscoco.zip).
./Annotations
- For v2.0, download the annotations files from the [VQA download page](http://www.visualqa.org/download.html), extract them and place in this folder.
- For v1.0, for both real and abstract, annotation files can be found on the [VQA v1 download page](http://www.visualqa.org/vqa_v1_download.html).
- Annotation files from Beta v0.9 release (123,287 MSCOCO train and val images, 369,861 questions, 3,698,610 answers) can be found below
- [training annotation files](http://visualqa.org/data/mscoco/prev_rel/Beta_v0.9/Annotations_Train_mscoco.zip)
- [validation annotation files](http://visualqa.org/data/mscoco/prev_rel/Beta_v0.9/Annotations_Val_mscoco.zip)
- Annotation files from Beta v0.1 release (10k MSCOCO images, 30k questions, 300k answers) can be found [here](http://visualqa.org/data/mscoco/prev_rel/Beta_v0.1/Annotations_Train_mscoco.zip).
./Images
- For real, create a directory with name mscoco inside this directory. For each of train, val and test, create directories with names train2014, val2014 and test2015 respectively inside mscoco directory, download respective images from [MS COCO website](http://mscoco.org/dataset/#download) and place them in respective folders.
- For abstract, create a directory with name abstract_v002 inside this directory. For each of train, val and test, create directories with names train2015, val2015 and test2015 respectively inside abstract_v002 directory, download respective images from [VQA download page](http://www.visualqa.org/download.html) and place them in respective folders.
./PythonHelperTools
- This directory contains the Python API to read and visualize the VQA dataset
- vqaDemo.py (demo script)
- vqaTools (API to read and visualize data)
./PythonEvaluationTools
- This directory contains the Python evaluation code
- vqaEvalDemo.py (evaluation demo script)
- vqaEvaluation (evaluation code)
./Results
- OpenEnded_mscoco_train2014_fake_results.json (an example of a fake results file for v1.0 to run the demo)
- Visit [VQA evaluation page](http://visualqa.org/evaluation) for more details.
./QuestionTypes
- This directory contains the following lists of question types for both real and abstract questions (question types are unchanged from v1.0 to v2.0). In a list, if there are question types of length n+k and length n with the same first n words, then the question type of length n does not include questions that belong to the question type of length n+k. For example, a question beginning with "what color is the" is counted under that longer type rather than under the shorter "what color" type.
- mscoco_question_types.txt
- abstract_v002_question_types.txt
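
As a quick sanity check that files are placed as described above, the v2.0 file names follow the pattern used by the demo scripts earlier in this commit (a sketch only; `dataDir` is wherever the downloads were extracted):

```
import os

dataDir = "../../VQA"   # root containing Annotations/, Questions/ and Images/
versionType, taskType, dataType, dataSubType = "v2_", "OpenEnded", "mscoco", "train2014"

annFile = "%s/Annotations/%s%s_%s_annotations.json" % (dataDir, versionType, dataType, dataSubType)
quesFile = "%s/Questions/%s%s_%s_%s_questions.json" % (dataDir, versionType, taskType, dataType, dataSubType)
imgDir = "%s/Images/%s/%s/" % (dataDir, dataType, dataSubType)

for p in (annFile, quesFile, imgDir):
    print(p, "exists:", os.path.exists(p))
```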
## References ##
- [VQA: Visual Question Answering](http://visualqa.org/)
- [Microsoft COCO](http://mscoco.org/)
## Developers ##
- Aishwarya Agrawal (Virginia Tech)
- Code for API is based on [MSCOCO API code](https://github.com/pdollar/coco).
- The format of the code for evaluation is based on [MSCOCO evaluation code](https://github.com/tylin/coco-caption).


@ -0,0 +1,30 @@
Copyright (c) 2014, Aishwarya Agrawal
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The views and conclusions contained in the software and documentation are
those
of the authors and should not be interpreted as representing official
policies,
either expressed or implied, of the FreeBSD Project.


@ -0,0 +1,8 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
__author__ = "aagrawal"


@ -0,0 +1,211 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
__author__ = "aagrawal"
__version__ = "0.9"
# Interface for accessing the VQA dataset.
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).
# The following functions are defined:
# VQA - VQA class that loads VQA annotation file and prepares data structures.
# getQuesIds - Get question ids that satisfy given filter conditions.
# getImgIds - Get image ids that satisfy given filter conditions.
# loadQA - Load questions and answers with the specified question ids.
# showQA - Display the specified questions and answers.
# loadRes - Load result file and create result object.
# Help on each function can be accessed by: "help(COCO.function)"
import json
import datetime
import copy
class VQA:
def __init__(self, annotation_file=None, question_file=None):
"""
Constructor of VQA helper class for reading and visualizing questions and answers.
:param annotation_file (str): location of VQA annotation file
:return:
"""
# load dataset
self.dataset = {}
self.questions = {}
self.qa = {}
self.qqa = {}
self.imgToQA = {}
if not annotation_file == None and not question_file == None:
print("loading VQA annotations and questions into memory...")
time_t = datetime.datetime.utcnow()
dataset = json.load(open(annotation_file, "r"))
questions = json.load(open(question_file, "r"))
self.dataset = dataset
self.questions = questions
self.createIndex()
def createIndex(self):
# create index
print("creating index...")
imgToQA = {ann["image_id"]: [] for ann in self.dataset["annotations"]}
qa = {ann["question_id"]: [] for ann in self.dataset["annotations"]}
qqa = {ann["question_id"]: [] for ann in self.dataset["annotations"]}
for ann in self.dataset["annotations"]:
imgToQA[ann["image_id"]] += [ann]
qa[ann["question_id"]] = ann
for ques in self.questions["questions"]:
qqa[ques["question_id"]] = ques
print("index created!")
# create class members
self.qa = qa
self.qqa = qqa
self.imgToQA = imgToQA
def info(self):
"""
Print information about the VQA annotation file.
:return:
"""
for key, value in self.dataset["info"].items():
print("%s: %s" % (key, value))
def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
"""
Get question ids that satisfy given filter conditions. default skips that filter
:param imgIds (int array) : get question ids for given imgs
quesTypes (str array) : get question ids for given question types
ansTypes (str array) : get question ids for given answer types
:return: ids (int array) : integer array of question ids
"""
imgIds = imgIds if type(imgIds) == list else [imgIds]
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
anns = self.dataset["annotations"]
else:
if not len(imgIds) == 0:
anns = sum(
[self.imgToQA[imgId] for imgId in imgIds if imgId in self.imgToQA],
[],
)
else:
anns = self.dataset["annotations"]
anns = (
anns
if len(quesTypes) == 0
else [ann for ann in anns if ann["question_type"] in quesTypes]
)
anns = (
anns
if len(ansTypes) == 0
else [ann for ann in anns if ann["answer_type"] in ansTypes]
)
ids = [ann["question_id"] for ann in anns]
return ids
def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
"""
Get image ids that satisfy given filter conditions. default skips that filter
:param quesIds (int array) : get image ids for given question ids
quesTypes (str array) : get image ids for given question types
ansTypes (str array) : get image ids for given answer types
:return: ids (int array) : integer array of image ids
"""
quesIds = quesIds if type(quesIds) == list else [quesIds]
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
anns = self.dataset["annotations"]
else:
if not len(quesIds) == 0:
anns = sum(
[self.qa[quesId] for quesId in quesIds if quesId in self.qa], []
)
else:
anns = self.dataset["annotations"]
anns = (
anns
if len(quesTypes) == 0
else [ann for ann in anns if ann["question_type"] in quesTypes]
)
anns = (
anns
if len(ansTypes) == 0
else [ann for ann in anns if ann["answer_type"] in ansTypes]
)
ids = [ann["image_id"] for ann in anns]
return ids
def loadQA(self, ids=[]):
"""
Load questions and answers with the specified question ids.
:param ids (int array) : integer ids specifying question ids
:return: qa (object array) : loaded qa objects
"""
if type(ids) == list:
return [self.qa[id] for id in ids]
elif type(ids) == int:
return [self.qa[ids]]
def showQA(self, anns):
"""
Display the specified annotations.
:param anns (array of object): annotations to display
:return: None
"""
if len(anns) == 0:
return 0
for ann in anns:
quesId = ann["question_id"]
print("Question: %s" % (self.qqa[quesId]["question"]))
for ans in ann["answers"]:
print("Answer %d: %s" % (ans["answer_id"], ans["answer"]))
def loadRes(self, resFile, quesFile):
"""
Load result file and return a result object.
:param resFile (str) : file name of result file
:return: res (obj) : result api object
"""
res = VQA()
res.questions = json.load(open(quesFile))
res.dataset["info"] = copy.deepcopy(self.questions["info"])
res.dataset["task_type"] = copy.deepcopy(self.questions["task_type"])
res.dataset["data_type"] = copy.deepcopy(self.questions["data_type"])
res.dataset["data_subtype"] = copy.deepcopy(self.questions["data_subtype"])
res.dataset["license"] = copy.deepcopy(self.questions["license"])
print("Loading and preparing results... ")
time_t = datetime.datetime.utcnow()
anns = json.load(open(resFile))
assert type(anns) == list, "results is not an array of objects"
annsQuesIds = [ann["question_id"] for ann in anns]
assert set(annsQuesIds) == set(
self.getQuesIds()
), "Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file."
for ann in anns:
quesId = ann["question_id"]
if res.dataset["task_type"] == "Multiple Choice":
assert (
ann["answer"] in self.qqa[quesId]["multiple_choices"]
), "predicted answer is not one of the multiple choices"
qaAnn = self.qa[quesId]
ann["image_id"] = qaAnn["image_id"]
ann["question_type"] = qaAnn["question_type"]
ann["answer_type"] = qaAnn["answer_type"]
print(
"DONE (t=%0.2fs)" % ((datetime.datetime.utcnow() - time_t).total_seconds())
)
res.dataset["annotations"] = anns
res.createIndex()
return res
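For orientation, here is a minimal usage sketch of this helper; the file names are hypothetical placeholders, not files added by this commit:
# Hypothetical annotation/question/result paths -- adjust to your dataset layout.
vqa = VQA("val_annotations.json", "val_questions.json")
ques_ids = vqa.getQuesIds(ansTypes=["yes/no"])      # filter questions by answer type
vqa.showQA(vqa.loadQA(ques_ids[:5]))                # print a few questions with their answers
vqa_res = vqa.loadRes("results.json", "val_questions.json")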

View File

@ -0,0 +1,324 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
# coding=utf-8
__author__ = "aagrawal"
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).
import sys
import re
class VQAEval:
def __init__(self, vqa=None, vqaRes=None, n=2):
self.n = n
self.accuracy = {}
self.evalQA = {}
self.evalQuesType = {}
self.evalAnsType = {}
self.vqa = vqa
self.vqaRes = vqaRes
if vqa is not None:
self.params = {"question_id": vqa.getQuesIds()}
self.contractions = {
"aint": "ain't",
"arent": "aren't",
"cant": "can't",
"couldve": "could've",
"couldnt": "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
"didnt": "didn't",
"doesnt": "doesn't",
"dont": "don't",
"hadnt": "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
"hasnt": "hasn't",
"havent": "haven't",
"hed": "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
"hes": "he's",
"howd": "how'd",
"howll": "how'll",
"hows": "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
"Im": "I'm",
"Ive": "I've",
"isnt": "isn't",
"itd": "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
"itll": "it'll",
"let's": "let's",
"maam": "ma'am",
"mightnt": "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
"mightve": "might've",
"mustnt": "mustn't",
"mustve": "must've",
"neednt": "needn't",
"notve": "not've",
"oclock": "o'clock",
"oughtnt": "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
"shant": "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
"shouldve": "should've",
"shouldnt": "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": "somebodyd",
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
"somebodyll": "somebody'll",
"somebodys": "somebody's",
"someoned": "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
"someonell": "someone'll",
"someones": "someone's",
"somethingd": "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
"somethingll": "something'll",
"thats": "that's",
"thered": "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
"therere": "there're",
"theres": "there's",
"theyd": "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
"theyll": "they'll",
"theyre": "they're",
"theyve": "they've",
"twas": "'twas",
"wasnt": "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
"weve": "we've",
"werent": "weren't",
"whatll": "what'll",
"whatre": "what're",
"whats": "what's",
"whatve": "what've",
"whens": "when's",
"whered": "where'd",
"wheres": "where's",
"whereve": "where've",
"whod": "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
"wholl": "who'll",
"whos": "who's",
"whove": "who've",
"whyll": "why'll",
"whyre": "why're",
"whys": "why's",
"wont": "won't",
"wouldve": "would've",
"wouldnt": "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
"yall": "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
"youd": "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
"youll": "you'll",
"youre": "you're",
"youve": "you've",
}
self.manualMap = {
"none": "0",
"zero": "0",
"one": "1",
"two": "2",
"three": "3",
"four": "4",
"five": "5",
"six": "6",
"seven": "7",
"eight": "8",
"nine": "9",
"ten": "10",
}
self.articles = ["a", "an", "the"]
self.periodStrip = re.compile(r"(?<!\d)(\.)(?!\d)")
self.commaStrip = re.compile(r"(\d)(,)(\d)")
self.punct = [
";",
r"/",
"[",
"]",
'"',
"{",
"}",
"(",
")",
"=",
"+",
"\\",
"_",
"-",
">",
"<",
"@",
"`",
",",
"?",
"!",
]
def evaluate(self, quesIds=None):
if quesIds is None:
quesIds = [quesId for quesId in self.params["question_id"]]
gts = {}
res = {}
for quesId in quesIds:
gts[quesId] = self.vqa.qa[quesId]
res[quesId] = self.vqaRes.qa[quesId]
# =================================================
# Compute accuracy
# =================================================
accQA = []
accQuesType = {}
accAnsType = {}
print("computing accuracy")
step = 0
for quesId in quesIds:
resAns = res[quesId]["answer"]
resAns = resAns.replace("\n", " ")
resAns = resAns.replace("\t", " ")
resAns = resAns.strip()
resAns = self.processPunctuation(resAns)
resAns = self.processDigitArticle(resAns)
gtAcc = []
gtAnswers = [ans["answer"] for ans in gts[quesId]["answers"]]
if len(set(gtAnswers)) > 1:
for ansDic in gts[quesId]["answers"]:
ansDic["answer"] = self.processPunctuation(ansDic["answer"])
for gtAnsDatum in gts[quesId]["answers"]:
otherGTAns = [
item for item in gts[quesId]["answers"] if item != gtAnsDatum
]
matchingAns = [item for item in otherGTAns if item["answer"] == resAns]
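# VQA accuracy: the prediction is compared against the 9 answers from the other annotators;
# min(#matches / 3, 1) gives full credit once at least 3 humans agree, and the 10 leave-one-out scores are averaged below.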
acc = min(1, float(len(matchingAns)) / 3)
gtAcc.append(acc)
quesType = gts[quesId]["question_type"]
ansType = gts[quesId]["answer_type"]
avgGTAcc = float(sum(gtAcc)) / len(gtAcc)
accQA.append(avgGTAcc)
if quesType not in accQuesType:
accQuesType[quesType] = []
accQuesType[quesType].append(avgGTAcc)
if ansType not in accAnsType:
accAnsType[ansType] = []
accAnsType[ansType].append(avgGTAcc)
self.setEvalQA(quesId, avgGTAcc)
self.setEvalQuesType(quesId, quesType, avgGTAcc)
self.setEvalAnsType(quesId, ansType, avgGTAcc)
if step % 100 == 0:
self.updateProgress(step / float(len(quesIds)))
step = step + 1
self.setAccuracy(accQA, accQuesType, accAnsType)
print("Done computing accuracy")
def processPunctuation(self, inText):
outText = inText
for p in self.punct:
if (p + " " in inText or " " + p in inText) or (
re.search(self.commaStrip, inText) != None
):
outText = outText.replace(p, "")
else:
outText = outText.replace(p, " ")
outText = self.periodStrip.sub("", outText)
return outText
def processDigitArticle(self, inText):
outText = []
tempText = inText.lower().split()
for word in tempText:
word = self.manualMap.setdefault(word, word)
if word not in self.articles:
outText.append(word)
else:
pass
for wordId, word in enumerate(outText):
if word in self.contractions:
outText[wordId] = self.contractions[word]
outText = " ".join(outText)
return outText
def setAccuracy(self, accQA, accQuesType, accAnsType):
self.accuracy["overall"] = round(100 * float(sum(accQA)) / len(accQA), self.n)
self.accuracy["perQuestionType"] = {
quesType: round(
100 * float(sum(accQuesType[quesType])) / len(accQuesType[quesType]),
self.n,
)
for quesType in accQuesType
}
self.accuracy["perAnswerType"] = {
ansType: round(
100 * float(sum(accAnsType[ansType])) / len(accAnsType[ansType]), self.n
)
for ansType in accAnsType
}
def setEvalQA(self, quesId, acc):
self.evalQA[quesId] = round(100 * acc, self.n)
def setEvalQuesType(self, quesId, quesType, acc):
if quesType not in self.evalQuesType:
self.evalQuesType[quesType] = {}
self.evalQuesType[quesType][quesId] = round(100 * acc, self.n)
def setEvalAnsType(self, quesId, ansType, acc):
if ansType not in self.evalAnsType:
self.evalAnsType[ansType] = {}
self.evalAnsType[ansType][quesId] = round(100 * acc, self.n)
def updateProgress(self, progress):
barLength = 20
status = ""
if isinstance(progress, int):
progress = float(progress)
if not isinstance(progress, float):
progress = 0
status = "error: progress var must be float\r\n"
if progress < 0:
progress = 0
status = "Halt...\r\n"
if progress >= 1:
progress = 1
status = "Done...\r\n"
block = int(round(barLength * progress))
text = "\rFinshed Percent: [{0}] {1}% {2}".format(
"#" * block + "-" * (barLength - block), int(progress * 100), status
)
sys.stdout.write(text)
sys.stdout.flush()
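Putting the two helpers together, an evaluation run looks roughly like the sketch below; annotation_file, question_file and result_file are hypothetical placeholders:
vqa = VQA(annotation_file, question_file)
vqa_res = vqa.loadRes(result_file, question_file)
vqa_eval = VQAEval(vqa, vqa_res, n=2)            # n = decimal places in reported accuracy
vqa_eval.evaluate()
print("Overall accuracy: %s" % vqa_eval.accuracy["overall"])
print("Per answer type: %s" % vqa_eval.accuracy["perAnswerType"])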

View File

@ -127,7 +127,14 @@ CONV_VISION_LLama2 = Conversation(
sep="",
)
CONV_VISION_minigptv2 = Conversation(
system="",
roles=("<s>[INST] ", " [/INST]"),
messages=[],
offset=2,
sep_style=SeparatorStyle.SINGLE,
sep="",
)
class Chat:
def __init__(self, model, vis_processor, device='cuda:0', stopping_criteria=None):
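The CONV_VISION_minigptv2 template added above follows the Llama-2 instruction format: no system prompt, and each user turn wrapped as <s>[INST] ... [/INST]. A rough sketch of the resulting prompt string, assuming the copy/append_message/get_prompt helpers that the other templates in this file rely on:
conv = CONV_VISION_minigptv2.copy()
# "<Img><ImageHere></Img>" is the image placeholder convention used in this repo;
# the [vqa] prefix is one of the task identifiers used by the evaluation datasets.
conv.append_message(conv.roles[0], "<Img><ImageHere></Img> [vqa] What color is the bus?")
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
# -> "<s>[INST] <Img><ImageHere></Img> [vqa] What color is the bus? [/INST]"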

View File

@ -91,7 +91,7 @@ class RefCOCOEvalData(torch.utils.data.Dataset):
image_path = os.path.join(self.root_path, f'{img_id[:27]}.jpg')
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
question = f"[refer] tell me the location of {sent}?"
question = f"[refer] give me the location of {sent}"
return image, question, img_id
class EvalCaptionData(torch.utils.data.Dataset):

View File

@ -3,7 +3,6 @@ import json
import pickle
import random
import time
# import iterto
import numpy as np
from PIL import Image
import skimage.io as io

View File

@ -16,30 +16,6 @@ class VQADataset(BaseDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
# def collater(self, samples):
# image_list, question_list, answer_list, weight_list = [], [], [], []
# num_answers = []
# for sample in samples:
# image_list.append(sample["image"])
# question_list.append(sample["question"])
# weight_list.extend(sample["weights"])
# answers = sample["answer"]
# answer_list.extend(answers)
# num_answers.append(len(answers))
# return {
# "image": torch.stack(image_list, dim=0),
# "text_input": question_list,
# "answer": answer_list,
# "weight": torch.Tensor(weight_list),
# "n_answers": torch.LongTensor(num_answers),
# }
class VQAEvalDataset(BaseDataset):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
@ -65,7 +41,6 @@ class OKVQAEvalData(torch.utils.data.Dataset):
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, question_id, img_id
class VizWizEvalData(torch.utils.data.Dataset):
@ -86,54 +61,9 @@ class VizWizEvalData(torch.utils.data.Dataset):
image_path = os.path.join(self.root_path, img_id)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
# question = f"[vqa] Based on the image, respond to this question with a short answer: {question} "
question = f"[vqa] Based on the image, respond to this question with a short answer: {question} and reply 'unanswerable' if you could not answer it"
question = f"[vqa] The question is '{question}' Based on the image, answer the question with a single word or phrase. and reply 'unanswerable' when the provided information is insufficient"
return image, question, answers
class AOKVQADAEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_file = data['image']
question = data['question']
question_id = data['question_id']
image_path = os.path.join(self.root_path, img_file)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, question_id
class AOKVQAMCEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
self.root_path = root_path
self.vis_processor = vis_processor
def __len__(self):
return len(self.loaded_data)
def __getitem__(self, idx):
data = self.loaded_data[idx]
img_file = data['image']
question = data['question']
question_id = data['question_id']
image_path = os.path.join(self.root_path, img_file)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image).half().cuda()
candidates=data['choices']
# question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, question_id, candidates
class IconQAEvalData(torch.utils.data.Dataset):
def __init__(self, loaded_data, vis_processor, root_path):
self.loaded_data = loaded_data
@ -152,9 +82,7 @@ class IconQAEvalData(torch.utils.data.Dataset):
image = self.vis_processor(image).half().cuda()
candidates = '_'.join(data['choices'])
answer = data['answer']
# question = f"Given this image, choose one correct answer from {candidates} for this question: {question} "
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
return image, question, candidates, answer
class GQAEvalData(torch.utils.data.Dataset):
@ -173,9 +101,7 @@ class GQAEvalData(torch.utils.data.Dataset):
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = ann["question"]
# question = f'Question: {question} Short answer: '
question = f"[vqa] Based on the image, respond to this question with a short answer: {question}"
# question = f"[vqa] {question} "
labels = ann["answer"]
return image, question, labels
@ -196,7 +122,7 @@ class HMEvalData(torch.utils.data.Dataset):
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = ann["text"]
question = f"[vqa] This is an image with: \"{question}\" written on it. Is it hateful? Answer:"
question = f"This is an image writting '{question}'. Is this image hateful? Answer yes or no. Answer:"
labels = ann["label"]
return image, question, labels
@ -217,7 +143,6 @@ class VSREvalData(torch.utils.data.Dataset):
image = self.vis_processor(image)
question = ann["caption"]
question = f'[vqa] Based on the image, is this statement true or false? {question}'
question_id = ann["image"].split('.')[0]
labels = 'true' if ann["label"] == 1 else 'false'
return image, question, labels
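These evaluation classes are plain torch Datasets, so the evaluation loop can batch them directly with a DataLoader; a minimal sketch, assuming loaded_data, vis_processor and root_path come from the evaluation config:
from torch.utils.data import DataLoader

dataset = GQAEvalData(loaded_data, vis_processor, root_path)
loader = DataLoader(dataset, batch_size=10, shuffle=False)
for images, questions, labels in loader:
    # questions is a tuple of "[vqa] ..." prompts; generate answers and compare to labels
    ...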

View File

@ -11,8 +11,6 @@ from transformers import StoppingCriteria, StoppingCriteriaList
from minigpt4.conversation.conversation import StoppingCriteriaSub
class MiniGPTBase(BaseModel):
"""
Base class for MiniGPT-4 and MiniGPT-v2
@ -368,9 +366,18 @@ class MiniGPTBase(BaseModel):
min_length=min_length,
top_p=top_p,
repetition_penalty=repetition_penalty,
stopping_criteria=stopping_criteria,
# stopping_criteria=stopping_criteria,
)
# with self.maybe_autocast():
# outputs = self.llama_model.generate(
# inputs_embeds=embs,
# attention_mask=attn_mask,
# max_new_tokens=max_new_tokens,
# num_beams=num_beams,
# do_sample=do_sample,
# # stopping_criteria=stopping_criteria,
# )
answers = []
for output_token in outputs:
if output_token[0] == 0:
@ -400,4 +407,4 @@ class MiniGPTBase(BaseModel):
for i in range(all_losses.shape[0]):
all_losses[i, num_cand[i]:] = 9999
output_class_ranks = torch.argsort(all_losses, dim=-1)
return output_class_ranks.tolist()
return output_class_ranks.tolist()