"""Evaluate a vision-language model on one or more VQA-style benchmarks.

The datasets to run are chosen with ``--dataset`` (comma-separated). The
sections handled here are ``okvqa``, ``vizwiz``, ``iconvqa``, ``gqa``,
``vsr`` and ``hm``. Each section builds its evaluation dataset from
``--eval_file_path`` / ``--img_path``, runs the model over it batch by batch
and prints an accuracy figure; all sections except ``iconvqa`` also dump
their predictions as JSON to ``--save_path``.

NOTE(review): every section writes to the same ``--save_path``, so when
several datasets are requested in one run the later sections overwrite the
predictions saved by the earlier ones.
"""
import os
import re
import json
import argparse
from collections import defaultdict

import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset

from minigpt4.datasets.datasets.vqa_datasets import OKVQAEvalData,VizWizEvalData,IconQAEvalData,GQAEvalData,VSREvalData,HMEvalData
from minigpt4.common.vqa_tools.VQA.PythonHelperTools.vqaTools.vqa import VQA
from minigpt4.common.vqa_tools.VQA.PythonEvaluationTools.vqaEvaluation.vqaEval import VQAEval

from minigpt4.common.eval_utils import prepare_texts, init_model, eval_parser
from minigpt4.conversation.conversation import CONV_VISION_minigptv2

# Placeholder stripped from generated answers before they are stored.
# NOTE(review): the previous code called ``answer.replace('', '')``, which is
# a no-op; the token was presumably the tokenizer's ``<unk>`` marker and was
# lost at some point -- confirm against the tokenizer in use.
UNK_TOKEN = '<unk>'


def list_of_str(arg):
    """Split a comma-separated command-line value into a list of strings."""
    return arg.split(',')


def _load_json(path):
    """Read and return the JSON document stored at *path*, closing the file."""
    with open(path, 'r') as f:
        return json.load(f)


def _save_predictions(predictions, path):
    """Write *predictions* as JSON to *path*."""
    with open(path, 'w') as f:
        json.dump(predictions, f)


def _clean_answer(answer):
    """Remove the unknown-token placeholder and surrounding whitespace."""
    return answer.replace(UNK_TOKEN, '').strip()


def _percent(correct, total):
    """Return ``correct / total`` as a percentage, or 0.0 for an empty set."""
    if not total:
        return 0.0
    return correct / total * 100


parser = eval_parser()
# argparse runs ``type`` on string defaults too, so the defaults below end up
# as one-element lists, the same shape as values given on the command line.
parser.add_argument("--dataset", type=list_of_str, default='refcoco', help="dataset to evaluate")
parser.add_argument("--split", type=list_of_str, default='testB', help="dataset split to evaluate")
parser.add_argument("--resample", action='store_true', help="resolution used in refcoco")
parser.add_argument("--img_path", type=str)
parser.add_argument("--eval_file_path", type=str)
parser.add_argument("--save_path", type=str)
args = parser.parse_args()

print(args.ckpt)
print(args.name)

model, vis_processor = init_model(args)
conv_temp = CONV_VISION_minigptv2.copy()
conv_temp.system = ""
model.eval()

os.makedirs('results', exist_ok=True)

if 'okvqa' in args.dataset:
    # Here --eval_file_path is a directory holding the test split and the
    # annotation / question files consumed by the VQA evaluation tools.
    evaluation_annotation_path = os.path.join(args.eval_file_path, "okvqa_test_split.json")
    ok_vqa_test_split = _load_json(evaluation_annotation_path)

    data = OKVQAEvalData(ok_vqa_test_split, vis_processor, args.img_path)
    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
    minigpt4_predict = []

    for images, questions, question_ids, img_ids in eval_dataloader:
        texts = prepare_texts(questions, conv_temp)  # wrap the texts with conversation template
        with torch.no_grad():
            answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)

        for answer, question_id in zip(answers, question_ids):
            minigpt4_predict.append({
                'answer': _clean_answer(answer.lower()),
                'question_id': int(question_id),
            })

    # The predictions are written first because the VQA tools score them by
    # loading the saved file back in.
    _save_predictions(minigpt4_predict, args.save_path)

    annFile = os.path.join(args.eval_file_path, "mscoco_val2014_annotations_clean.json")
    quesFile = os.path.join(args.eval_file_path, "OpenEnded_mscoco_val2014_questions_clean.json")

    vqa = VQA(annFile, quesFile)
    vqaRes = vqa.loadRes(args.save_path, quesFile)

    vqaEval = VQAEval(vqa, vqaRes, n=2)
    vqaEval.evaluate()
    print ("Overall OKVQA Accuracy is: %.02f\n" %(vqaEval.accuracy['overall']), flush=True)

if 'vizwiz' in args.dataset:
    img_path = args.img_path
    vizwiz = _load_json(args.eval_file_path)

    data = VizWizEvalData(vizwiz, vis_processor, img_path)
    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
    minigpt4_predict = []
    total_acc = []

    for images, texts, gt_answers in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
        with torch.no_grad():
            answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False, repetition_penalty=1.0)

        for answer, gt_answer in zip(answers, gt_answers):
            minigpt4_predict.append({'answer': _clean_answer(answer)})

            # Ground-truth answers arrive joined by '_'; the score is the
            # number of exact (case-insensitive) matches over 3, capped at 1.
            predicted = answer.lower()
            count = sum(gt.lower() == predicted for gt in gt_answer.split('_'))
            total_acc.append(min(count / 3.0, 1.0))

    _save_predictions(minigpt4_predict, args.save_path)
    print('vizwiz Acc: ', np.average(total_acc)* 100.0, flush=True)

if 'iconvqa' in args.dataset:
    iconqa_text_val = _load_json(args.eval_file_path)
    img_path = args.img_path

    data = IconQAEvalData(iconqa_text_val, vis_processor, img_path)
    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)

    count = 0
    for images, texts, candidates, answers in tqdm(eval_dataloader):
        candidates = [candidate.split('_') for candidate in candidates]
        num_cand = [len(candidate) for candidate in candidates]
        # Pad every candidate list with 'none' up to the longest one in the
        # batch, then transpose so each inner list holds the i-th candidate
        # of every sample.
        longest = max(num_cand)
        for candidate in candidates:
            candidate.extend(['none'] * (longest - len(candidate)))
        candidates = [list(x) for x in zip(*candidates)]

        instructions = ["[INST] {} [/INST]".format(text) for text in texts]
        with torch.no_grad():
            answer_ranks = model.multi_select(images, instructions, candidates, num_cand=num_cand)

        for idx, answer in enumerate(answers):
            # Only the first entry of each sample's ranking is scored.
            if answer_ranks[idx][0] == answer:
                count += 1

    print('iconqa Acc: ', _percent(count, len(iconqa_text_val)), flush=True)

if 'gqa' in args.dataset:
    img_path = args.img_path
    gqa = _load_json(args.eval_file_path)

    data = GQAEvalData(gqa, vis_processor, img_path)
    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
    count = 0
    total = 0
    minigpt4_predict = []

    for images, texts, labels in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
        with torch.no_grad():
            answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)

        for answer, label in zip(answers, labels):
            minigpt4_predict.append({
                'pred': _clean_answer(answer.lower()),
                'gt': label,
            })
            # NOTE(review): scoring uses the raw lower-cased output rather
            # than the cleaned prediction saved above; kept as-is so that
            # reported accuracies stay comparable.
            if answer.lower() == label:
                count += 1
            total += 1

    print('gqa val:', _percent(count, total), flush=True)
    _save_predictions(minigpt4_predict, args.save_path)

if 'vsr' in args.dataset:
    annotation = load_dataset(args.eval_file_path, split='test')
    img_path = args.img_path

    data = VSREvalData(annotation, vis_processor, img_path)
    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
    count = 0
    total = 0
    minigpt4_predict = []

    for images, texts, labels in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
        with torch.no_grad():
            answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)

        for answer, label in zip(answers, labels):
            minigpt4_predict.append({
                'pred': _clean_answer(answer),
                'gt': label,
            })
            # NOTE(review): scored on the raw output, as in the gqa section.
            if answer.lower() == label.lower():
                count += 1
            total += 1

    print('vsr test:', _percent(count, total), flush=True)
    _save_predictions(minigpt4_predict, args.save_path)

if 'hm' in args.dataset:
    img_path = args.img_path
    # The annotation file is JSON Lines: one JSON object per line.
    with open(args.eval_file_path, 'r') as jsonl_file:
        annotation = [json.loads(line) for line in jsonl_file]

    data = HMEvalData(annotation, vis_processor, img_path)
    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
    count = 0
    total = 0
    minigpt4_predict = []

    for images, texts, labels in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
        with torch.no_grad():
            answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)

        for answer, label in zip(answers, labels):
            # Map yes/no onto 1/0; any other output is logged and kept
            # verbatim, which can never equal the integer label.
            normalized = answer.lower().strip()
            if normalized == "yes":
                answer = 1
            elif normalized == "no":
                answer = 0
            else:
                print("answer",answer)

            # Compare against the same integer that is stored as 'gt', so the
            # check does not depend on how the DataLoader collated the label.
            gt = int(label)
            minigpt4_predict.append({'pred': answer, 'gt': gt})
            if answer == gt:
                count += 1
            total += 1

    print('hm val:', _percent(count, total), flush=True)
    _save_predictions(minigpt4_predict, args.save_path)