# MiniGPT-4/minigpt4/eval_scripts/eval_vqa.py

# python eval_vqa.py --dataset vizwiz
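# Usage sketch (the config path below is illustrative, not fixed; flags match eval_parser() further down):
#   python eval_vqa.py --cfg-path /path/to/vqa_benchmark_evaluation.yaml --dataset vizwiz,hm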
import os
import re
import json
import argparse
from collections import defaultdict
import random
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn
import sys
sys.path.append("/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE")
from minigpt4.common.logger import setup_logger
from minigpt4.common.dist_utils import get_rank
from minigpt4.common.registry import registry
from minigpt4.datasets.datasets.vqa_datasets import OKVQAEvalData, VizWizEvalData, IconQAEvalData, GQAEvalData, VSREvalData, HMEvalData
from minigpt4.common.vqa_tools.vqa import VQA
from minigpt4.common.vqa_tools.vqa_eval import VQAEval
from minigpt4.common.config import Config
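
# NOTE (assumption): the okvqa and gqa branches below call prepare_texts() and use a
# conversation template conv_temp, neither of which is defined in this script.  In the
# upstream MiniGPT-4 eval scripts they are typically provided by something like:
#   from minigpt4.common.eval_utils import prepare_texts
#   from minigpt4.conversation.conversation import CONV_VISION_minigptv2
#   conv_temp = CONV_VISION_minigptv2.copy()
# They are left commented out here because this fork may supply its own versions.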
def list_of_str(arg):
    return list(map(str, arg.split(',')))
def eval_parser():
    parser = argparse.ArgumentParser(description="Demo")
    parser.add_argument(
        "--device",
        default=0,
        help="device to run the model on",
    )
    parser.add_argument(
        "--cfg-path",
        default="/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/minigpt4/projects/qformer_moe_vicuna/eval/vqa_benchmark_evaluation.yaml",
        help="path to configuration file.",
    )
    parser.add_argument(
        "--dataset",
        default=['vizwiz', 'hm'],
        type=list_of_str,
        help="datasets to evaluate",
    )
    parser.add_argument(
        "--options",
        nargs="+",
        help="override some settings in the used config; key-value pairs "
        "in xxx=yyy format will be merged into the config file (deprecated, "
        "use --cfg-options instead).",
    )
    return parser
def setup_seeds(config):
    seed = config.run_cfg.seed + get_rank()
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    cudnn.benchmark = False
    cudnn.deterministic = True
def init_model(cfg, device):
    print('Initializing model')
    model_config = cfg.model_cfg
    model_cls = registry.get_model_class(model_config.arch)
    model = model_cls.from_config(model_config).to(device)

    # Build the vision / text processors from the first dataset entry in the config.
    key = list(cfg.datasets_cfg.keys())[0]
    vis_processor_cfg = cfg.datasets_cfg.get(key).vis_processor.train
    vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
    txt_processor_cfg = cfg.datasets_cfg.get(key).text_processor.train
    text_processor = registry.get_processor_class(txt_processor_cfg.name).from_config(txt_processor_cfg)
    print('Initialization finished')
    return model, vis_processor, text_processor
parser = eval_parser()
args = parser.parse_args()
cfg = Config(args)
setup_seeds(cfg)
print(cfg._convert_node_to_json(cfg.config))
setup_logger()
device = torch.device("cuda:{}".format(args.device) if torch.cuda.is_available() else "cpu")
model, vis_processor, _ = init_model(cfg, device)
model.eval()
run_cfg = cfg.run_cfg
save_path = cfg.run_cfg.save_path
num_beams = run_cfg.get("num_beams", 3)
max_len = run_cfg.get("max_len", 20)
min_len = run_cfg.get("min_len", 1)
inference_method = run_cfg.get("inference_method", "rank")
num_ans_candidates = run_cfg.get("num_ans_candidates", 128)
prompt = run_cfg.get("prompt", "")
if not os.path.exists(save_path):
    os.mkdir(save_path)
if 'vizwiz' in args.dataset:
    eval_file_path = cfg.evaluation_datasets_cfg["vizwiz"]["eval_file_path"]
    img_path = cfg.evaluation_datasets_cfg["vizwiz"]["img_path"]
    batch_size = cfg.evaluation_datasets_cfg["vizwiz"]["batch_size"]
    max_new_tokens = cfg.evaluation_datasets_cfg["vizwiz"]["max_new_tokens"]

    vizwiz = json.load(open(eval_file_path, 'r'))
    data = VizWizEvalData(vizwiz, vis_processor, img_path)
    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)

    predicts = []
    total_acc = []
    for samples in tqdm(eval_dataloader):
        samples['image'] = samples['image'].half().to(device)
        texts = samples['q_input']
        gt_answers = samples['gt_ans']
        image_ids = samples['image_id']
        answers = model.predict_answers(
            samples=samples,
            inference_method=inference_method,
            num_beams=num_beams,
            max_len=max_len,
            min_len=min_len,
            num_ans_candidates=num_ans_candidates,
            prompt=prompt,
        )
        for i in range(len(answers)):
            question, answer, gt_answer, img_id = texts[i], answers[i], gt_answers[i], image_ids[i]
            result = {'img_id': img_id, 'question': question}
            result['answer'] = answer.replace('<unk>', '').strip()
            # VQA-style accuracy: min(#matching ground-truth answers / 3, 1)
            count = 0
            gt_answer = gt_answer.split('_')
            for gt in gt_answer:
                if gt.lower() == answer.lower():
                    count += 1
            acc = min(count / 3.0, 1.0)
            total_acc.append(acc)
            result['gt_ans'] = gt_answer
            predicts.append(result)

    vizwiz_acc = np.average(total_acc) * 100.0
    print('vizwiz Acc: ', vizwiz_acc, flush=True)
    file_save_path = os.path.join(save_path, "vizwiz.json")
    with open(file_save_path, 'a+') as f:
        json.dump(predicts, f)
    with open(os.path.join(save_path, "evaluate_vizwiz.txt"), "a") as f:
        f.write(json.dumps({'agg_metrics': vizwiz_acc}) + "\n")
if 'okvqa' in args.dataset:
    eval_file_path = cfg.evaluation_datasets_cfg["okvqa"]["eval_file_path"]
    img_path = cfg.evaluation_datasets_cfg["okvqa"]["img_path"]
    batch_size = cfg.evaluation_datasets_cfg["okvqa"]["batch_size"]
    max_new_tokens = cfg.evaluation_datasets_cfg["okvqa"]["max_new_tokens"]

    evaluation_annotation_path = os.path.join(eval_file_path, "okvqa_test_split.json")
    with open(evaluation_annotation_path) as f:
        ok_vqa_test_split = json.load(f)

    data = OKVQAEvalData(ok_vqa_test_split, vis_processor, img_path)
    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
    minigpt4_predict = []
    for images, questions, question_ids, img_ids in eval_dataloader:
        texts = prepare_texts(questions, conv_temp)  # wrap the questions with the conversation template
        answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
        for answer, question_id, question, img_id in zip(answers, question_ids, questions, img_ids):
            result = dict()
            answer = answer.lower().replace('<unk>', '').strip()
            result['answer'] = answer
            result['question_id'] = int(question_id)
            minigpt4_predict.append(result)

    file_save_path = os.path.join(save_path, "okvqa.json")
    with open(file_save_path, 'w') as f:
        json.dump(minigpt4_predict, f)

    annFile = os.path.join(eval_file_path, "mscoco_val2014_annotations_clean.json")
    quesFile = os.path.join(eval_file_path, "OpenEnded_mscoco_val2014_questions_clean.json")
    vqa = VQA(annFile, quesFile)
    vqaRes = vqa.loadRes(file_save_path, quesFile)
    vqaEval = VQAEval(vqa, vqaRes, n=2)  # n = decimal precision of the reported accuracy
    vqaEval.evaluate()
    print("Overall OKVQA Accuracy is: %.02f\n" % (vqaEval.accuracy['overall']), flush=True)
if 'iconvqa' in args.dataset:
    eval_file_path = cfg.evaluation_datasets_cfg["iconvqa"]["eval_file_path"]
    img_path = cfg.evaluation_datasets_cfg["iconvqa"]["img_path"]
    batch_size = cfg.evaluation_datasets_cfg["iconvqa"]["batch_size"]
    max_new_tokens = cfg.evaluation_datasets_cfg["iconvqa"]["max_new_tokens"]

    iconqa_text_val = json.load(open(eval_file_path, "r"))
    data = IconQAEvalData(iconqa_text_val, vis_processor, img_path)
    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)

    count = 0
    for images, texts, candidates, answers in tqdm(eval_dataloader):
        # Pad every candidate list to the same length, then transpose so each entry
        # holds one candidate per sample in the batch.
        candidates = [candidate.split('_') for candidate in candidates]
        num_cand = [len(candidate) for candidate in candidates]
        for candidate in candidates:
            candidate.extend(['none'] * (max(num_cand) - len(candidate)))
        candidates = [list(x) for x in zip(*candidates)]
        instructions = ["<s>[INST] <Img><ImageHere></Img> {} [/INST]".format(text) for text in texts]
        answer_ranks = model.multi_select(images, instructions, candidates, num_cand=num_cand)
        for idx, answer in enumerate(answers):
            if answer_ranks[idx][0] == answer:
                count += 1
    print('iconqa Acc: ', count / len(iconqa_text_val) * 100.0, flush=True)
if 'gqa' in args.dataset:
    eval_file_path = cfg.evaluation_datasets_cfg["gqa"]["eval_file_path"]
    img_path = cfg.evaluation_datasets_cfg["gqa"]["img_path"]
    batch_size = cfg.evaluation_datasets_cfg["gqa"]["batch_size"]
    max_new_tokens = cfg.evaluation_datasets_cfg["gqa"]["max_new_tokens"]

    gqa = json.load(open(eval_file_path))
    data = GQAEvalData(gqa, vis_processor, img_path)
    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)

    count = 0
    total = 0
    minigpt4_predict = []
    for images, texts, labels in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the questions with the conversation template
        answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
        for answer, label in zip(answers, labels):
            result = dict()
            result['pred'] = answer.lower().replace('<unk>', '').strip()
            result['gt'] = label
            minigpt4_predict.append(result)
            if answer.lower() == label:
                count += 1
            total += 1
    print('gqa val:', count / total * 100, flush=True)

    file_save_path = os.path.join(save_path, "gqa.json")
    with open(file_save_path, 'w') as f:
        json.dump(minigpt4_predict, f)
if 'vsr' in args.dataset:
    img_path = cfg.evaluation_datasets_cfg["vsr"]["img_path"]
    batch_size = cfg.evaluation_datasets_cfg["vsr"]["batch_size"]
    max_new_tokens = cfg.evaluation_datasets_cfg["vsr"]["max_new_tokens"]

    from datasets import load_dataset
    annotation = load_dataset("cambridgeltl/vsr_zeroshot", split='test')
    data = VSREvalData(annotation, vis_processor, img_path)
    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)

    count = 0
    total = 0
    minigpt4_predict = []
    for samples in tqdm(eval_dataloader):
        # As in the vizwiz / hm branches, cast the image batch to half precision and move it to the model device.
        samples['image'] = samples['image'].half().to(device)
        texts = samples['q_input']
        labels = samples['gt_ans']
        image_ids = samples['image_id']
        answers = model.predict_answers(
            samples=samples,
            inference_method=inference_method,
            num_beams=num_beams,
            max_len=max_len,
            min_len=min_len,
            num_ans_candidates=num_ans_candidates,
            prompt=prompt,
        )
        # answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
        for answer, label in zip(answers, labels):
            result = dict()
            result['pred'] = answer.replace('<unk>', '').strip()
            result['gt'] = label
            minigpt4_predict.append(result)
            if answer.lower() == label.lower():
                count += 1
            total += 1
    print('vsr test:', count / total * 100, flush=True)

    # file_save_path = os.path.join(save_path, "vsr.json")
    # with open(file_save_path, 'w') as f:
    #     json.dump(minigpt4_predict, f)
if 'hm' in args.dataset:
    eval_file_path = cfg.evaluation_datasets_cfg["hm"]["eval_file_path"]
    img_path = cfg.evaluation_datasets_cfg["hm"]["img_path"]
    batch_size = cfg.evaluation_datasets_cfg["hm"]["batch_size"]
    max_new_tokens = cfg.evaluation_datasets_cfg["hm"]["max_new_tokens"]

    # The Hateful Memes annotations are stored as JSON Lines (one JSON object per line).
    annotation = []
    with open(eval_file_path, 'r') as jsonl_file:
        for line in jsonl_file:
            annotation.append(json.loads(line))

    data = HMEvalData(annotation, vis_processor, img_path)
    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)

    count = 0
    total = 0
    predict = []
    for samples in tqdm(eval_dataloader):
        samples['image'] = samples['image'].half().to(device)
        texts = samples['q_input']
        labels = samples['gt_ans']
        answers = model.predict_answers(
            samples=samples,
            inference_method=inference_method,
            num_beams=num_beams,
            max_len=max_len,
            min_len=min_len,
            num_ans_candidates=num_ans_candidates,
            prompt=prompt,
        )
        for answer, label in zip(answers, labels):
            result = dict()
            # Map the generated yes/no answer onto the binary hateful-meme label.
            if answer.lower().strip() == "yes":
                answer = 1
            elif answer.lower().strip() == "no":
                answer = 0
            else:
                print("non-matching answer:", answer)
            result['pred'] = answer
            result['gt'] = int(label)
            predict.append(result)
            if answer == label:
                count += 1
            total += 1
        print(answers)
    print('hm val:', count / total * 100, flush=True)

    file_save_path = os.path.join(save_path, "hm.json")
    with open(file_save_path, 'w') as f:
        json.dump(predict, f)