diff --git a/eval_configs/minigptv2_eval.yaml b/eval_configs/minigptv2_eval.yaml
index dedcde0..00c21ed 100644
--- a/eval_configs/minigptv2_eval.yaml
+++ b/eval_configs/minigptv2_eval.yaml
@@ -3,10 +3,10 @@ model:
   model_type: pretrain
   max_txt_len: 500
   end_sym: "</s>"
-  low_resource: True
+  low_resource: False
   prompt_template: '[INST] {} [/INST]'
   llama_model: "/ibex/project/c2133/llama_v2/llama-2-7b-chat-pytorch_update"
-  ckpt: "/ibex/ai/project/c2090/minigpt4_ckpt/448_conversation_correct_best_v7_ablation1_v5_v6/20231007035/checkpoint_35.pth"
+  ckpt: ""
   lora_r: 64
   lora_alpha: 16
diff --git a/eval_scripts/EVAL_README.md b/eval_scripts/EVAL_README.md
new file mode 100644
index 0000000..9285f31
--- /dev/null
+++ b/eval_scripts/EVAL_README.md
@@ -0,0 +1,53 @@
+## Evaluation Instructions for MiniGPT-v2
+
+### Data preparation
+
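+Download the evaluation annotations and images for the benchmarks you want to run, then point the scripts at them with `--eval_file_path` and `--img_path`. The layout below is only a sketch (the directory names are placeholders, not enforced); what matters is the file names the scripts read, which are visible in `eval_ref.py` and `eval_vqa.py`:
+
+```
+annotations/
+  refcoco/refcoco_val.json                       # eval_ref.py reads {dataset}/{dataset}_{split}.json
+  refcoco+/refcoco+_val.json
+  refcocog/refcocog_val.json
+  test_split.json                                # okvqa questions
+  mscoco_val2014_annotations_clean.json          # okvqa references
+  OpenEnded_mscoco_val2014_questions_clean.json
+```
+
+For vizwiz, iconvqa, gqa, and hm, `--eval_file_path` is the annotation file itself (e.g. the hateful-memes `dev.jsonl`); for vsr it is the dataset name or path handed to `load_dataset`.
+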
+### environment setup
+
+```
+export PYTHONPATH=$PYTHONPATH:/path/to/directory/of/MiniGPT-4
+```
+
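+The evaluation config also needs local model weights. A minimal sketch of the fields to edit in `eval_configs/minigptv2_eval.yaml` (both paths are placeholders for your local copies):
+
+```
+llama_model: "/path/to/llama-2-7b-chat"
+ckpt: "/path/to/minigptv2_checkpoint.pth"
+```
+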
+### start evaluating RefCOCO, RefCOCO+, RefCOCOg
+
+```
+port=port_number
+cfg_path=/path/to/eval_configs/minigptv2_eval.yaml
+img_path=/path/to/eval/image/path
+eval_file_path=/path/to/eval/annotation/path
+save_path=/path/to/save/path
+ckpt=/path/to/evaluation/checkpoint
+split=/evaluation/data/split/type  # e.g. val, testA, testB, test
+dataset=/data/type  # refcoco, refcoco+, refcocog
+```
+
+```
+torchrun --master-port ${port} --nproc_per_node 1 eval_ref.py \
+ --cfg-path ${cfg_path} --img_path ${img_path} --eval_file_path ${eval_file_path} --save_path ${save_path} \
+ --ckpt ${ckpt} --split ${split} --dataset ${dataset} --lora_r 64 --lora_alpha 16 \
+ --batch_size 10 --max_new_tokens 20 --resample
+```
+
+### start evaluating visual question answering
+
+```
+port=port_number
+cfg_path=/path/to/eval_configs/minigptv2_eval.yaml
+img_path=/path/to/eval/image/path
+eval_file_path=/path/to/eval/annotation/path
+save_path=/path/to/save/path
+ckpt=/path/to/evaluation/checkpoint
+split=/evaluation/data/split/type  # e.g. val, test
+dataset=/data/type  # vqa data types: okvqa, vizwiz, iconvqa, gqa, vsr, hm
+```
+
+```
+torchrun --master-port ${port} --nproc_per_node 1 eval_vqa.py \
+ --cfg-path ${cfg_path} --img_path ${img_path} --eval_file_path ${eval_file_path} --save_path ${save_path} \
+ --ckpt ${ckpt} --split ${split} --dataset ${dataset} --lora_r 64 --lora_alpha 16 \
+ --batch_size 10 --max_new_tokens 20 --resample
+```
+
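+### expected outputs
+
+Both scripts dump the raw predictions as JSON to `${save_path}` and print the metric to stdout: `eval_ref.py` reports IoU-based grounding accuracy for each RefCOCO split, and `eval_vqa.py` reports per-dataset accuracy (e.g. "Overall OKVQA Accuracy", "vizwiz Acc", "gqa val", "vsr test", "hm val").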
diff --git a/eval_scripts/eval_ref.py b/eval_scripts/eval_ref.py
index 9ef700e..5f8e60f 100644
--- a/eval_scripts/eval_ref.py
+++ b/eval_scripts/eval_ref.py
@@ -11,7 +11,7 @@ import torch
 from torch.utils.data import DataLoader
 
 from minigpt4.common.eval_utils import prepare_texts, init_model, eval_parser, computeIoU
-from minigpt4.conversation.conversation import CONV_VISION_LLama2
+from minigpt4.conversation.conversation import CONV_VISION_minigptv2
 from minigpt4.datasets.datasets.coco_caption import RefCOCOEvalData
 
@@ -25,8 +25,10 @@ parser.add_argument("--res", type=float, default=100.0, help="resolution used in refcoco")
 parser.add_argument("--resample", action='store_true', help="resolution used in refcoco")
 parser.add_argument("--img_path", type=str)
 parser.add_argument("--eval_file_path", type=str)
+parser.add_argument("--save_path", type=str)
 args = parser.parse_args()
+
 
 print(args.ckpt)
 print(args.name)
@@ -36,23 +38,20 @@ eval_dict = {'refcoco': args.split,
 model, vis_processor = init_model(args)
 model.eval()
-CONV_VISION = CONV_VISION_LLama2
+CONV_VISION = CONV_VISION_minigptv2
 conv_temp = CONV_VISION.copy()
 conv_temp.system = ""
-
+# model.eval()
 
-img_path=f'{args.img_path}/COCO/cocoapi/data/2017/images/jpeg/train'
-
+
 for dataset in args.dataset:
     for split in eval_dict[dataset]:
-        with open(f'{args.eval_file_path}/{dataset}/{dataset}_{split}.json', 'r') as f:
+        with open(os.path.join(args.eval_file_path, f"{dataset}/{dataset}_{split}.json"), 'r') as f:
             refcoco = json.load(f)
 
-        data = RefCOCOEvalData(refcoco, vis_processor, img_path)
+        data = RefCOCOEvalData(refcoco, vis_processor, args.img_path)
         eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
-
         minigpt4_predict = defaultdict(list)
-
         resamples = []
 
         for images, questions, img_ids in tqdm(eval_dataloader):
@@ -64,11 +63,10 @@ for dataset in args.dataset:
             if re.match(pattern, answer):
                 minigpt4_predict[img_id].append(answer)
             else:
-                resamples.append({'img_id': img_id, 'sents': [question.replace('[refer] where is','').replace('?','').strip()]})
-
+                resamples.append({'img_id': img_id, 'sents': [question.replace('[refer] give me the location of','').strip()]})
         if args.resample:
             for i in range(20):
-                data = RefCOCOEvalData(resamples, vis_processor, img_path)
+                data = RefCOCOEvalData(resamples, vis_processor, args.img_path)
                 resamples = []
                 eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
                 for images, questions, img_ids in tqdm(eval_dataloader):
@@ -80,12 +78,12 @@ for dataset in args.dataset:
                     if re.match(pattern, answer) or i == 4:
                         minigpt4_predict[img_id].append(answer)
                     else:
-                        resamples.append({'img_id': img_id, 'sents': [question.replace('[refer] where is','').replace('?','').strip()]})
+                        resamples.append({'img_id': img_id, 'sents': [question.replace('[refer] give me the location of','').strip()]})
 
                 if len(resamples) == 0:
                     break
 
-        with open(f'results/{args.name}_{dataset}_{split}.json','w') as f:
+        with open(args.save_path, 'w') as f:
             json.dump(minigpt4_predict, f)
 
         count=0
diff --git a/eval_vqa.py b/eval_scripts/eval_vqa.py
similarity index 57%
rename from eval_vqa.py
rename to eval_scripts/eval_vqa.py
index b3dd07a..3d5cdfc 100644
--- a/eval_vqa.py
+++ b/eval_scripts/eval_vqa.py
@@ -13,44 +13,45 @@ from datasets import load_dataset
 
 from minigpt4.datasets.datasets.vqa_datasets import OKVQAEvalData,VizWizEvalData,IconQAEvalData,GQAEvalData,VSREvalData,HMEvalData
-
 from minigpt4.common.vqa_tools.VQA.PythonHelperTools.vqaTools.vqa import VQA
 from minigpt4.common.vqa_tools.VQA.PythonEvaluationTools.vqaEvaluation.vqaEval import VQAEval
 
 from minigpt4.common.eval_utils import prepare_texts, init_model, eval_parser
 from minigpt4.conversation.conversation import CONV_VISION_minigptv2
-import random
 
 def list_of_str(arg):
     return list(map(str, arg.split(',')))
 
-
 parser = eval_parser()
 parser.add_argument("--dataset", type=list_of_str, default='refcoco', help="dataset to evaluate")
 parser.add_argument("--split", type=list_of_str, default='testB', help="dataset split to evaluate")
 parser.add_argument("--resample", action='store_true', help="resolution used in refcoco")
 parser.add_argument("--img_path", type=str)
 parser.add_argument("--eval_file_path", type=str)
+parser.add_argument("--save_path", type=str)
 args = parser.parse_args()
 
+
 print(args.ckpt)
 print(args.name)
 
 model, vis_processor = init_model(args)
-conv_temp = CONV_VISION_LLama2.copy()
+conv_temp = CONV_VISION_minigptv2.copy()
 conv_temp.system = ""
+
+
 model.eval()
 
 os.makedirs('results', exist_ok=True)
 
 if 'okvqa' in args.dataset:
-    img_path=os.path.join(args.img_path,"train")
-    with open(os.path.join(args.eval_file_path,"ok_vqa/test_split.json")) as f:
+    evaluation_annotation_path = os.path.join(args.eval_file_path, "test_split.json")
+    with open(evaluation_annotation_path) as f:
         ok_vqa_test_split = json.load(f)
 
-    data = OKVQAEvalData(ok_vqa_test_split, vis_processor, img_path)
+    data = OKVQAEvalData(ok_vqa_test_split, vis_processor, args.img_path)
    eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
    minigpt4_predict = []
@@ -62,56 +63,27 @@ if 'okvqa' in args.dataset:
 
         for answer, question_id, question, img_id in zip(answers, question_ids, questions, img_ids):
             result = dict()
-            if "</s>" in answer.lower():
-                print("answer: ", answer)
             answer = answer.lower().replace('</s>','').strip()
             result['answer'] = answer
             result['question_id'] = int(question_id)
-            if answer == "":
-                resamples.append({'image_id': img_id, 'question_id':question_id, 'question': [question.replace('[vqa] Based on the image, respond to this question with a short answer:','').strip()]})
-            else:
-                minigpt4_predict.append(result)
+            minigpt4_predict.append(result)
 
-    if args.resample:
-        for i in range(20):
-            data = OKVQAEvalData(resamples, vis_processor, img_path)
-            resamples = []
-            eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
-            for images, questions, question_ids, img_ids in eval_dataloader:
-                texts = prepare_texts(questions, conv_temp)  # wrap the texts with conversation template
-                answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)
-                for answer, question_id, question in zip(answers, question_ids, questions):
-                    result = dict()
-                    answer = answer.lower().replace('</s>','').strip()
-                    result['answer'] = answer
-                    result['question_id'] = int(question_id)
-                    minigpt4_predict.append(result)
-                    if answer == "":
-                        resamples.append({'image_id': img_id, 'question_id':question_id, 'question': [question.replace('[vqa] Based on the image, respond to this question with a short answer:','').strip()]})
-                    else:
-                        minigpt4_predict.append(result)
-            if len(resamples) == 0:
-                break
-
-    save_path=f'results/{args.name}_okvqa.json'
-    with open(save_path,'w') as f:
+    with open(args.save_path,'w') as f:
         json.dump(minigpt4_predict, f)
 
-    annFile =f'{args.eval_file_path}/ok_vqa/mscoco_val2014_annotations_clean.json'
-    quesFile =f'{args.eval_file_path}/ok_vqa/OpenEnded_mscoco_val2014_questions_clean.json'
+    annFile = os.path.join(args.eval_file_path, "mscoco_val2014_annotations_clean.json")
+    quesFile = os.path.join(args.eval_file_path, "OpenEnded_mscoco_val2014_questions_clean.json")
 
     vqa = VQA(annFile, quesFile)
-    vqaRes = vqa.loadRes(save_path, quesFile)
+    vqaRes = vqa.loadRes(args.save_path, quesFile)
 
     vqaEval = VQAEval(vqa, vqaRes, n=2)
-
     vqaEval.evaluate()
-
     print ("Overall OKVQA Accuracy is: %.02f\n" %(vqaEval.accuracy['overall']), flush=True)
 if 'vizwiz' in args.dataset:
-    img_path=f'{args.img_path}/vizwiz/val'
-    vizwiz = json.load(open(f'{args.eval_file_path}/vizwiz/val.json', 'r'))
+    img_path = args.img_path
+    vizwiz = json.load(open(args.eval_file_path, 'r'))
 
     data = VizWizEvalData(vizwiz, vis_processor, img_path)
     eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
@@ -120,7 +92,7 @@ if 'vizwiz' in args.dataset:
     for images, texts, gt_answers in tqdm(eval_dataloader):
         texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
         with torch.no_grad():
-            answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)
+            answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False, repetition_penalty=1.0)
 
     for answer, gt_answer in zip(answers, gt_answers):
         result = dict()
@@ -134,52 +106,16 @@ if 'vizwiz' in args.dataset:
         acc = min(count/3.0, 1.0)
         total_acc.append(acc)
 
-    save_path=f'results/{args.name}_vizwiz.json'
+    save_path=args.save_path
     with open(save_path,'w') as f:
         json.dump(minigpt4_predict, f)
-
     print('vizwiz Acc: ', np.average(total_acc)* 100.0, flush=True)
 
-if 'aokvqa' in args.dataset:
-    img_path=f'{args.img_path}/aokvqa/images'
-    for split in args.split:
-        with open(f'{args.eval_file_path}/aokvqa/annotations/aokvqa_v1p0_{split}.json','r') as f:
-            aokvqa_v1p0 = json.load(f)
-
-        data = AOKVQADAEvalData(aokvqa_v1p0, vis_processor, img_path)
-        eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
-
-        minigpt4_predict = defaultdict(dict)
-
-        for images, texts, question_ids in tqdm(eval_dataloader):
-            texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
-            answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)
-
-            for answer, question_id in zip(answers, question_ids):
-                minigpt4_predict[question_id]['direct_answer'] = answer.lower().replace('</s>','').strip()
-
-        data = AOKVQAMCEvalData(aokvqa_v1p0, vis_processor, img_path)
-        eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
-
-        for images, texts, question_ids, answers in tqdm(eval_dataloader):
-            instructions = ["[INST] <Img><ImageHere></Img> {} [/INST]".format(text) for text in texts]
-            answer_ranks = model.multi_select(images, instructions, answers)
-            candidates = [list(x) for x in zip(*answers)]
-            for idx, question_id in enumerate(question_ids):
-                minigpt4_predict[question_id]['multiple_choice'] = candidates[idx][answer_ranks[idx][0]]
-
-        save_path=f'results/{args.name}_a_okvqa_{split}.json'
-        with open(save_path,'w') as f:
-            json.dump(minigpt4_predict, f)
-
-        os.chdir('minigpt4/common/vqa_tools/aokvqa')
-        print(os.system(f'python evaluation/eval_predictions.py --aokvqa-dir {args.eval_file_path}/aokvqa/annotations --split {split} --preds ../../../../{save_path}'), flush=True)
-        os.chdir('../../../../')
-
-if 'iconqa' in args.dataset:
-    iconqa_text_val = json.load(open(f'{eval_file_path}/iconqa/choose_text_val.json','r'))
-    img_path = f'{args.img_path}/iconqa/val/choose_txt'
+if 'iconvqa' in args.dataset:
+    iconqa_text_val = json.load(open(args.eval_file_path,"r"))
+    img_path = args.img_path
+
     data = IconQAEvalData(iconqa_text_val, vis_processor, img_path)
     eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
@@ -200,8 +136,8 @@
 
 if 'gqa' in args.dataset:
-    img_path = f'{args.img_path}/gqa/images/val'
-    gqa = json.load(open(f'{args.eval_file_path}/gqa/annotations/testdev_balanced_questions.json', 'r'))
+    img_path = args.img_path
+    gqa = json.load(open(args.eval_file_path))
     data = GQAEvalData(gqa, vis_processor, img_path)
     eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
     count=0
@@ -221,13 +157,13 @@ if 'gqa' in args.dataset:
             total+=1
     print('gqa val:', count / total * 100, flush=True)
 
-    save_path=f'results/{args.name}_gqa.json'
+    save_path=args.save_path
     with open(save_path,'w') as f:
         json.dump(minigpt4_predict, f)
 
 if 'vsr' in args.dataset:
-    annotation = load_dataset("cambridgeltl/vsr_zeroshot", split='test')
-    img_path = f'{args.img_path}/vsr/images'
+    annotation = load_dataset(args.eval_file_path, split='test')
+    img_path = args.img_path
     data = VSREvalData(annotation, vis_processor, img_path)
     eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)
     count=0
@@ -237,11 +173,9 @@ if 'vsr' in args.dataset:
 
     for images, texts, labels in tqdm(eval_dataloader):
         texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
-        # print("texts",texts)
         answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)
 
         for answer, label in zip(answers, labels):
-            print(answer)
             result = dict()
             result['pred'] = answer.replace('</s>','').strip()
             result['gt'] = label
@@ -250,14 +184,13 @@ if 'vsr' in args.dataset:
                 count+=1
             total+=1
     print('vsr test:', count / total * 100, flush=True)
-    save_path=f'results/{args.name}_vsr.json'
-    with open(save_path,'w') as f:
+    with open(args.save_path,'w') as f:
         json.dump(minigpt4_predict, f)
 
 if 'hm' in args.dataset:
-    img_path = f'{args.img_path}/hateful_meme'
+    img_path = args.img_path
     annotation = []
-    with open(f'{args.eval_file_path}/hateful_meme/dev.jsonl', 'r') as jsonl_file:
+    with open(args.eval_file_path, 'r') as jsonl_file:
         for line in jsonl_file:
             json_obj = json.loads(line)
             annotation.append(json_obj)
@@ -271,19 +204,27 @@ if 'hm' in args.dataset:
 
     for images, texts, labels in tqdm(eval_dataloader):
         texts = prepare_texts(texts, conv_temp)  # wrap the texts with conversation template
+
         answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False)
 
         for answer, label in zip(answers, labels):
             result = dict()
-            answer = 1 if answer.lower().__contains__('yes') else 0
-            result['pred'] = int(str(answer).replace('</s>','').strip())
+            if answer.lower().strip() == "yes":
+                answer = 1
+            elif answer.lower().strip() == "no":
+                answer = 0
+            else:
+                print("answer", answer)
+
+            result['pred'] = answer
+
             result['gt'] = int(label)
             minigpt4_predict.append(result)
             if answer == label:
                 count+=1
             total+=1
+
     print('hm val:', count / total * 100, flush=True)
 
-    save_path=f'results/{args.name}_hm.json'
-    with open(save_path,'w') as f:
+    with open(args.save_path,'w') as f:
         json.dump(minigpt4_predict, f)
diff --git a/minigpt4/datasets/datasets/coco_caption.py b/minigpt4/datasets/datasets/coco_caption.py
index 76f86e4..2cafd93 100755
--- a/minigpt4/datasets/datasets/coco_caption.py
+++ b/minigpt4/datasets/datasets/coco_caption.py
@@ -91,7 +91,7 @@ class RefCOCOEvalData(torch.utils.data.Dataset):
         image_path = os.path.join(self.root_path, f'{img_id[:27]}.jpg')
         image = Image.open(image_path).convert('RGB')
         image = self.vis_processor(image)
-        question = f"[refer] tell me the location of {sent}?"
+        question = f"[refer] give me the location of {sent}"
         return image, question, img_id
 
 class EvalCaptionData(torch.utils.data.Dataset):
diff --git a/minigpt4/datasets/datasets/vqa_datasets.py b/minigpt4/datasets/datasets/vqa_datasets.py
index 486b5f2..a8df3cc 100755
--- a/minigpt4/datasets/datasets/vqa_datasets.py
+++ b/minigpt4/datasets/datasets/vqa_datasets.py
@@ -16,30 +16,6 @@ class VQADataset(BaseDataset):
     def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
         super().__init__(vis_processor, text_processor, vis_root, ann_paths)
 
-    # def collater(self, samples):
-    #     image_list, question_list, answer_list, weight_list = [], [], [], []
-
-    #     num_answers = []
-
-    #     for sample in samples:
-    #         image_list.append(sample["image"])
-    #         question_list.append(sample["question"])
-
-    #         weight_list.extend(sample["weights"])
-
-    #         answers = sample["answer"]
-
-    #         answer_list.extend(answers)
-    #         num_answers.append(len(answers))
-
-    #     return {
-    #         "image": torch.stack(image_list, dim=0),
-    #         "text_input": question_list,
-    #         "answer": answer_list,
-    #         "weight": torch.Tensor(weight_list),
-    #         "n_answers": torch.LongTensor(num_answers),
-    #     }
-
 
 class VQAEvalDataset(BaseDataset):
     def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
@@ -85,7 +61,7 @@ class VizWizEvalData(torch.utils.data.Dataset):
         image_path = os.path.join(self.root_path, img_id)
         image = Image.open(image_path).convert('RGB')
         image = self.vis_processor(image)
-        question = f"[vqa] The question is'{question}' Based on the image, answer the question with a single word or phrase. and reply 'unanswerable' when the provided information is insufficient"  # 52.0
+        question = f"[vqa] The question is '{question}' Based on the image, answer the question with a single word or phrase. and reply 'unanswerable' when the provided information is insufficient"
         return image, question, answers
 
 class IconQAEvalData(torch.utils.data.Dataset):
diff --git a/minigpt4/models/base_model.py b/minigpt4/models/base_model.py
index d70ca18..b01273e 100644
--- a/minigpt4/models/base_model.py
+++ b/minigpt4/models/base_model.py
@@ -184,7 +184,7 @@ class BaseModel(nn.Module):
         else:
             llama_model = LlamaForCausalLM.from_pretrained(
                 llama_model_path,
-                torch_dtype=torch.float16,
+                torch_dtype=torch.float32,
             )
 
         if lora_r > 0:
diff --git a/minigpt4/models/minigpt_base.py b/minigpt4/models/minigpt_base.py
index 22c4251..fa82187 100644
--- a/minigpt4/models/minigpt_base.py
+++ b/minigpt4/models/minigpt_base.py
@@ -367,9 +367,18 @@ class MiniGPTBase(BaseModel):
                 min_length=min_length,
                 top_p=top_p,
                 repetition_penalty=repetition_penalty,
-                stopping_criteria=stopping_criteria,
+                # stopping_criteria=stopping_criteria,
             )
 
+        # with self.maybe_autocast():
+        #     outputs = self.llama_model.generate(
+        #         inputs_embeds=embs,
+        #         attention_mask=attn_mask,
+        #         max_new_tokens=max_new_tokens,
+        #         num_beams=num_beams,
+        #         do_sample=do_sample,
+        #         # stopping_criteria=stopping_criteria,
+        #     )
         answers = []
         for output_token in outputs:
             if output_token[0] == 0: