diff --git a/eval_configs/minigptv2_benchmark_evaluation.yaml b/eval_configs/minigptv2_benchmark_evaluation.yaml
new file mode 100644
index 0000000..0977f82
--- /dev/null
+++ b/eval_configs/minigptv2_benchmark_evaluation.yaml
@@ -0,0 +1,35 @@
+model:
+  arch: minigpt_v2
+  model_type: pretrain
+  max_txt_len: 500
+  end_sym: "</s>"
+  low_resource: False
+  prompt_template: '[INST] {} [/INST]'
+  llama_model: ""
+  ckpt: ""
+  lora_r: 64
+  lora_alpha: 16
+
+
+datasets:
+  cc_sbu_align:
+    vis_processor:
+      train:
+        name: "blip2_image_eval"
+        image_size: 448
+    text_processor:
+      train:
+        name: "blip_caption"
+
+run:
+  task: image_text_pretrain
+  max_new_tokens: 20
+  name: minigptv2_evaluation
+  batch_size: 10
+  eval_file_path: /path/to/eval/annotation/path  # annotation file
+  img_path: /path/to/eval/image/path  # image file path
+  save_path: /path/to/save/path  # saved result
+
+
+
+
diff --git a/eval_scripts/EVAL_README.md b/eval_scripts/EVAL_README.md
index 9b4078e..659c55f 100644
--- a/eval_scripts/EVAL_README.md
+++ b/eval_scripts/EVAL_README.md
@@ -57,32 +57,39 @@ ${MINIGPTv2_EVALUATION_DATASET}
 export PYTHONPATH=$PYTHONPATH:/path/to/directory/of/MiniGPT-4
 ```
 
+### evaluation config files
+Set **llama_model** to the path of the LLaMA model.
+Set **ckpt** to the path of our pretrained model checkpoint.
+Set **eval_file_path** to the path of the annotation files for the evaluation data.
+Set **img_path** to the path of the evaluation images.
+Set **save_path** to the path where the evaluation output will be saved.
+
+- [eval_configs/minigptv2_benchmark_evaluation.yaml](../eval_configs/minigptv2_benchmark_evaluation.yaml)
+
+
+
+
 ### start evaluating RefCOCO, RefCOCO+, RefCOCOg
 port=port_number
-cfg_path=/path/to/eval_configs/minigptv2_eval.yaml
-save_path=/path/to/save/path
-ckpt=/path/to/evaluation/checkpoint
-split=data_evaluation_split
-dataset=dataset_name
+cfg_path=/path/to/eval_configs/minigptv2_benchmark_evaluation.yaml
 
-dataset | split
---- | :---:
-refcoco | val, testA, testB
-refcoco+ | val, testA, testB
-refcocog | val, test
+
+dataset |
+--- |
+refcoco |
+refcoco+ |
+refcocog |
 
 ```
 torchrun --master-port ${port} --nproc_per_node 1 eval_ref.py \
- --cfg-path ${cfg_path} --eval_file_path ${eval_file_path} --save_path ${save_path} \
- --ckpt ${ckpt} --split ${split} --dataset ${dataset} --lora_r 64 --lora_alpha 16 \
- --batch_size 10 --max_new_tokens 20 --resample
+ --cfg-path ${cfg_path} --dataset dataset_name
 ```
 
 ### start evaluating visual question answering
 
 port=port_number
-cfg_path=/path/to/eval_configs/minigptv2_eval.yaml
+cfg_path=/path/to/eval_configs/minigptv2_benchmark_evaluation.yaml
 eval_file_path=/path/to/eval/annotation/path
 image_path=/path/to/eval/image/path
 save_path=/path/to/save/path
@@ -91,14 +98,14 @@ split=evaluation_data_split
 dataset=dataset_type
 
-dataset | image_path | eval_file_path
---- | :---:| :---:
-okvqa | coco_2017 | /path/to/okvqa/folder
-vizwiz | vizwiz_images | /path/to/vizwiz/folder
-iconvqa | iconvqa_images | /path/to/iconvqa/folder
-gqa | gqa_images | /path/to/gqa/folder
-vsr | vsr_images | None
-hateful meme | hm_images | /path/to/hateful_mem/folder
+dataset_names |
+--- |
+okvqa |
+vizwiz |
+iconvqa |
+gqa |
+vsr |
+hm |
 
 ```
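The run-level settings above (eval_file_path, img_path, save_path, batch_size, max_new_tokens) are what the evaluation scripts changed below now read from the YAML instead of taking as command-line flags. As a minimal sketch of that pattern, not part of the diff itself (it assumes eval_parser from minigpt4.common.eval_utils registers --cfg-path, as the scripts in this diff rely on):

```python
# Minimal sketch: how the evaluation scripts below consume the run-level settings
# of minigptv2_benchmark_evaluation.yaml through Config. Not part of the diff.
from minigpt4.common.config import Config
from minigpt4.common.eval_utils import eval_parser

parser = eval_parser()  # assumed to provide --cfg-path, as used by the eval scripts
parser.add_argument("--dataset", type=str, default="refcoco", help="dataset to evaluate")
args = parser.parse_args()

cfg = Config(args)                 # loads the YAML passed via --cfg-path
run_cfg = cfg.run_cfg              # the `run:` block of the YAML
eval_file_path = run_cfg.eval_file_path
img_path = run_cfg.img_path
save_path = run_cfg.save_path
batch_size = run_cfg.batch_size
max_new_tokens = run_cfg.max_new_tokens
```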
diff --git a/eval_scripts/eval_ref.py b/eval_scripts/eval_ref.py
index 5f8e60f..a29856f 100644
--- a/eval_scripts/eval_ref.py
+++ b/eval_scripts/eval_ref.py
@@ -9,7 +9,7 @@ from PIL import Image
 from tqdm import tqdm
 import torch
 from torch.utils.data import DataLoader
-
+from minigpt4.common.config import Config
 from minigpt4.common.eval_utils import prepare_texts, init_model, eval_parser, computeIoU
 from minigpt4.conversation.conversation import CONV_VISION_minigptv2
 
@@ -20,43 +20,44 @@ def list_of_str(arg):
 parser = eval_parser()
 parser.add_argument("--dataset", type=list_of_str, default='refcoco', help="dataset to evaluate")
-parser.add_argument("--split", type=list_of_str, default='test', help="dataset to evaluate")
 parser.add_argument("--res", type=float, default=100.0, help="resolution used in refcoco")
 parser.add_argument("--resample", action='store_true', help="resolution used in refcoco")
-parser.add_argument("--img_path", type=str)
-parser.add_argument("--eval_file_path", type=str)
-parser.add_argument("--save_path", type=str)
 args = parser.parse_args()
+cfg = Config(args)
 
-print(args.ckpt)
-print(args.name)
-
-eval_dict = {'refcoco': args.split,
-             'refcoco+': args.split,
-             'refcocog': args.split}
+eval_dict = {'refcoco': ['val','testA','testB'],
+             'refcoco+': ['val','testA','testB'],
+             'refcocog': ['val','test']}
 
 model, vis_processor = init_model(args)
 model.eval()
 CONV_VISION = CONV_VISION_minigptv2
 conv_temp = CONV_VISION.copy()
 conv_temp.system = ""
+
 # model.eval()
+eval_file_path = cfg.run_cfg.eval_file_path
+img_path = cfg.run_cfg.img_path
+batch_size = cfg.run_cfg.batch_size
+max_new_tokens = cfg.run_cfg.max_new_tokens
+save_path = cfg.run_cfg.save_path
+
 for dataset in args.dataset:
     for split in eval_dict[dataset]:
-        with open(os.path.join(args.eval_file_path,f"{dataset}/{dataset}_{split}.json"), 'r') as f:
+        with open(os.path.join(eval_file_path,f"{dataset}/{dataset}_{split}.json"), 'r') as f:
             refcoco = json.load(f)
-        data = RefCOCOEvalData(refcoco, vis_processor, args.img_path)
-        eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
+        data = RefCOCOEvalData(refcoco, vis_processor, img_path)
+        eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
         minigpt4_predict = defaultdict(list)
         resamples = []
 
         for images, questions, img_ids in tqdm(eval_dataloader):
            texts = prepare_texts(questions, conv_temp)  # wrap the texts with conversation template
-            answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)
+            answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
            for answer, img_id, question in zip(answers, img_ids, questions):
                answer = answer.replace("<unk>","").replace(" ","").strip()
                pattern = r'\{<\d{1,3}><\d{1,3}><\d{1,3}><\d{1,3}>\}'
@@ -66,12 +67,12 @@ for dataset in args.dataset:
                    resamples.append({'img_id': img_id, 'sents': [question.replace('[refer] give me the location of','').strip()]})
         if args.resample:
            for i in range(20):
-                data = RefCOCOEvalData(resamples, vis_processor, args.img_path)
+                data = RefCOCOEvalData(resamples, vis_processor, img_path)
                resamples = []
-                eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
+                eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
                for images, questions, img_ids in tqdm(eval_dataloader):
                    texts = prepare_texts(questions, conv_temp)  # wrap the texts with conversation template
-                    answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)
+                    answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
                    for answer, img_id, question in zip(answers, img_ids, questions):
                        answer = answer.replace("<unk>","").replace(" ","").strip()
                        pattern = r'\{<\d{1,3}><\d{1,3}><\d{1,3}><\d{1,3}>\}'
@@ -83,7 +84,7 @@ for dataset in args.dataset:
                if len(resamples) == 0:
                    break
 
-        with open(args.save_path,'w') as f:
+        with open(save_path,'w') as f:
            json.dump(minigpt4_predict, f)
 
        count=0
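eval_ref.py accepts answers that match the box pattern `{<x1><y1><x2><y2>}` on a 0-100 grid (the `--res` default) and imports computeIoU from minigpt4.common.eval_utils for the scoring that follows `count=0`. The sketch below is only an illustration of that scoring criterion, with a stand-in IoU helper and an assumed rescaling of the prediction to pixel coordinates; it is not the repository's implementation.

```python
# Illustration only: turning a MiniGPT-v2 grounding answer such as "{<21><39><84><91>}"
# into a box and checking it against a ground-truth box. The iou() helper is a stand-in
# for computeIoU; an IoU above 0.5 is the usual RefCOCO hit criterion.
import re

BOX_PATTERN = r'\{<(\d{1,3})><(\d{1,3})><(\d{1,3})><(\d{1,3})>\}'

def parse_box(answer, width, height, res=100.0):
    """Map the 0..res coordinates in the answer back to pixel coordinates."""
    m = re.search(BOX_PATTERN, answer)
    if m is None:
        return None
    x1, y1, x2, y2 = (int(v) for v in m.groups())
    return [x1 / res * width, y1 / res * height, x2 / res * width, y2 / res * height]

def iou(a, b):
    """Intersection-over-union of two [x1, y1, x2, y2] boxes."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (area_a + area_b - inter + 1e-6)

pred = parse_box("{<21><39><84><91>}", width=640, height=480)
gt = [120.0, 180.0, 560.0, 430.0]
correct = pred is not None and iou(pred, gt) > 0.5
```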
diff --git a/eval_scripts/eval_vqa.py b/eval_scripts/eval_vqa.py
index 0ec4cf6..14c2e1f 100644
--- a/eval_scripts/eval_vqa.py
+++ b/eval_scripts/eval_vqa.py
@@ -18,6 +18,7 @@ from minigpt4.common.vqa_tools.VQA.PythonEvaluationTools.vqaEvaluation.vqaEval import VQAEval
 from minigpt4.common.eval_utils import prepare_texts, init_model, eval_parser
 from minigpt4.conversation.conversation import CONV_VISION_minigptv2
+from minigpt4.common.config import Config
 
 
 def list_of_str(arg):
@@ -25,41 +26,34 @@ def list_of_str(arg):
 parser = eval_parser()
 parser.add_argument("--dataset", type=list_of_str, default='refcoco', help="dataset to evaluate")
-parser.add_argument("--split", type=list_of_str, default='testB', help="dataset split to evaluate")
-parser.add_argument("--resample", action='store_true', help="resolution used in refcoco")
-parser.add_argument("--img_path", type=str)
-parser.add_argument("--eval_file_path", type=str)
-parser.add_argument("--save_path", type=str)
 args = parser.parse_args()
+cfg = Config(args)
 
-print(args.ckpt)
-print(args.name)
 
 model, vis_processor = init_model(args)
 conv_temp = CONV_VISION_minigptv2.copy()
 conv_temp.system = ""
-
-
-
 model.eval()
-os.makedirs('results', exist_ok=True)
+eval_file_path = cfg.run_cfg.eval_file_path
+img_path = cfg.run_cfg.img_path
+save_path = cfg.run_cfg.save_path
+batch_size = cfg.run_cfg.batch_size
+max_new_tokens = cfg.run_cfg.max_new_tokens
 
 if 'okvqa' in args.dataset:
-    evaluation_annntation_path = os.path.join(args.eval_file_path, "okvqa_test_split.json")
+    evaluation_annntation_path = os.path.join(eval_file_path, "okvqa_test_split.json")
     with open(evaluation_annntation_path) as f:
         ok_vqa_test_split = json.load(f)
 
-    data = OKVQAEvalData(ok_vqa_test_split, vis_processor, args.img_path)
-    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
+    data = OKVQAEvalData(ok_vqa_test_split, vis_processor, img_path)
+    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
     minigpt4_predict = []
-    resamples = []
-
     for images, questions, question_ids, img_ids in eval_dataloader:
        texts = prepare_texts(questions, conv_temp)  # wrap the texts with conversation template
-        answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)
+        answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
 
        for answer, question_id, question, img_id in zip(answers, question_ids, questions, img_ids):
            result = dict()
@@ -68,31 +62,30 @@ if 'okvqa' in args.dataset:
            result['question_id'] = int(question_id)
            minigpt4_predict.append(result)
 
-    with open(args.save_path,'w') as f:
+    with open(save_path,'w') as f:
        json.dump(minigpt4_predict, f)
 
-    annFile = os.path.join(args.eval_file_path,"mscoco_val2014_annotations_clean.json")
-    quesFile = os.path.join(args.eval_file_path,"OpenEnded_mscoco_val2014_questions_clean.json" )
+    annFile = os.path.join(eval_file_path,"mscoco_val2014_annotations_clean.json")
+    quesFile = os.path.join(eval_file_path,"OpenEnded_mscoco_val2014_questions_clean.json" )
 
     vqa = VQA(annFile, quesFile)
-    vqaRes = vqa.loadRes(args.save_path, quesFile)
+    vqaRes = vqa.loadRes(save_path, quesFile)
 
     vqaEval = VQAEval(vqa, vqaRes, n=2)
     vqaEval.evaluate()
     print ("Overall OKVQA Accuracy is: %.02f\n" %(vqaEval.accuracy['overall']), flush=True)
 
 if 'vizwiz' in args.dataset:
-    img_path= args.img_path
-    vizwiz = json.load(open(args.eval_file_path, 'r'))
+    vizwiz = json.load(open(eval_file_path, 'r'))
 
     data = VizWizEvalData(vizwiz, vis_processor, img_path)
-    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
+    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
     minigpt4_predict = []
     total_acc = []
     for images, texts, gt_answers in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
        with torch.no_grad():
-            answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False,repetition_penalty=1.0)
+            answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False,repetition_penalty=1.0)
 
        for answer, gt_answer in zip(answers, gt_answers):
            result = dict()
@@ -106,18 +99,16 @@ if 'vizwiz' in args.dataset:
            acc = min(count/3.0, 1.0)
            total_acc.append(acc)
 
-    save_path=args.save_path
     with open(save_path,'w') as f:
        json.dump(minigpt4_predict, f)
     print('vizwiz Acc: ', np.average(total_acc)* 100.0, flush=True)
 
 if 'iconvqa' in args.dataset:
-    iconqa_text_val = json.load(open(args.eval_file_path,"r"))
-    img_path = args.img_path
+    iconqa_text_val = json.load(open(eval_file_path,"r"))
 
     data = IconQAEvalData(iconqa_text_val, vis_processor, img_path)
-    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
+    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
 
     count = 0
     for images, texts, candidates, answers in tqdm(eval_dataloader):
@@ -126,7 +117,7 @@ if 'iconvqa' in args.dataset:
        for candidate in candidates:
            candidate.extend(['none'] * (max(num_cand) - len(candidate)))
        candidates = [list(x) for x in zip(*candidates)]
-        instructions = ["[INST] {} [/INST]".format(text) for text in texts]
+        instructions = ["[INST] <Img><ImageHere></Img> {} [/INST]".format(text) for text in texts]
        answer_ranks = model.multi_select(images, instructions, candidates, num_cand=num_cand)
        for idx, answer in enumerate(answers):
            if answer_ranks[idx][0] == answer:
@@ -136,16 +127,15 @@ if 'gqa' in args.dataset:
-    img_path = args.img_path
-    gqa = json.load(open(args.eval_file_path))
+    gqa = json.load(open(eval_file_path))
     data = GQAEvalData(gqa, vis_processor, img_path)
-    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
+    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
     count=0
     total=0
     minigpt4_predict = []
     for images, texts, labels in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
-        answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)
+        answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
 
        for answer, label in zip(answers, labels):
            result = dict()
@@ -157,15 +147,13 @@ if 'gqa' in args.dataset:
            total+=1
     print('gqa val:', count / total * 100, flush=True)
 
-    save_path=args.save_path
     with open(save_path,'w') as f:
        json.dump(minigpt4_predict, f)
 
 if 'vsr' in args.dataset:
     annotation = load_dataset("cambridgeltl/vsr_zeroshot", split='test')
-    img_path = args.img_path
     data = VSREvalData(annotation, vis_processor, img_path)
-    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
+    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
 
     count=0
     total=0
@@ -173,7 +161,7 @@ if 'vsr' in args.dataset:
     for images, texts, labels in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
-        answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)
+        answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
 
        for answer, label in zip(answers, labels):
            result = dict()
@@ -184,19 +172,18 @@ if 'vsr' in args.dataset:
                count+=1
            total+=1
     print('vsr test:', count / total * 100, flush=True)
-    with open(args.save_path,'w') as f:
+    with open(save_path,'w') as f:
        json.dump(minigpt4_predict, f)
 
 if 'hm' in args.dataset:
-    img_path = args.img_path
     annotation = []
-    with open(args.eval_file_path, 'r') as jsonl_file:
+    with open(eval_file_path, 'r') as jsonl_file:
        for line in jsonl_file:
            json_obj = json.loads(line)
            annotation.append(json_obj)
 
     data = HMEvalData(annotation, vis_processor, img_path)
-    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
+    eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
 
     count=0
     total=0
@@ -205,7 +192,7 @@ if 'hm' in args.dataset:
     for images, texts, labels in tqdm(eval_dataloader):
        texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
-        answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)
+        answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
 
        for answer, label in zip(answers, labels):
            result = dict()
@@ -214,10 +201,9 @@ if 'hm' in args.dataset:
            elif answer.lower().strip()=="no":
                answer=0
            else:
-                print("answer",answer)
+                print("non-matching answer", answer)
 
            result['pred'] = answer
-            result['gt'] = int(label)
            minigpt4_predict.append(result)
 
            if answer == label:
@@ -226,5 +212,5 @@ if 'hm' in args.dataset:
     print('hm val:', count / total * 100, flush=True)
 
-    with open(args.save_path,'w') as f:
+    with open(save_path,'w') as f:
        json.dump(minigpt4_predict, f)
diff --git a/minigpt4/common/eval_utils.py b/minigpt4/common/eval_utils.py
index 0a317e9..3087d2a 100644
--- a/minigpt4/common/eval_utils.py
+++ b/minigpt4/common/eval_utils.py
@@ -46,9 +46,9 @@ def prepare_texts(texts, conv_temp):
 def init_model(args):
     print('Initialization Model')
     cfg = Config(args)
-    cfg.model_cfg.ckpt = args.ckpt
-    cfg.model_cfg.lora_r = args.lora_r
-    cfg.model_cfg.lora_alpha = args.lora_alpha
+    # cfg.model_cfg.ckpt = args.ckpt
+    # cfg.model_cfg.lora_r = args.lora_r
+    # cfg.model_cfg.lora_alpha = args.lora_alpha
 
     model_config = cfg.model_cfg
     model_cls = registry.get_model_class(model_config.arch)
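With the overrides above commented out, init_model no longer patches the model config from the command line, so ckpt, lora_r, and lora_alpha now have to be filled in the model block of minigptv2_benchmark_evaluation.yaml. A rough sketch of what the config-only path boils down to is below; the from_config call and the cuda:0 placement are assumptions about the rest of init_model (which also builds the visual processor), not part of this diff.

```python
# Rough sketch of the config-only model loading that the change above relies on.
# The real init_model in minigpt4/common/eval_utils.py also builds and returns the
# vis_processor; from_config and the device placement here are assumed.
from minigpt4.common.config import Config
from minigpt4.common.registry import registry

def init_model_from_cfg(args):
    cfg = Config(args)                        # parses the YAML given by --cfg-path
    model_config = cfg.model_cfg              # model: block, now the only source of
                                              # ckpt, lora_r, and lora_alpha
    model_cls = registry.get_model_class(model_config.arch)
    model = model_cls.from_config(model_config).to('cuda:0')  # assumed loading pattern
    return model
```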