update 0304

wanghanzi 2024-03-04 10:41:24 +08:00
parent eb022668a3
commit 802c7d9565
63 changed files with 4222 additions and 5844 deletions

View File

@ -0,0 +1,94 @@
import os
import json
import pandas as pd
from tqdm import tqdm
from pycocoevalcap.eval import COCOEvalCap
from collections import defaultdict


class COCO_Annotation:
    def __init__(self, annotation_file):
        self.coco_cn_file = annotation_file
        self.imgToAnns = self.build_imgToAnns()

    def build_imgToAnns(self):
        imgToAnns = defaultdict(list)
        with open(self.coco_cn_file, "r", encoding="UTF-8") as fin:
            for line in fin:
                line = line.strip()
                temp = eval(line)
                annotations = temp['annotations']
                for ann in annotations:
                    image_id = str(ann['image_id']).zfill(6)
                    imgToAnns[image_id].append({'image_id': image_id, 'caption': ann['caption'], 'image': ann['image_id']})
        return imgToAnns

    def getImgIds(self):
        return self.imgToAnns.keys()


class COCO_Result:
    def __init__(self, result_file):
        self.coco_cn_file = result_file
        self.imgToAnns = self.build_imgToAnns()

    def build_imgToAnns(self):
        imgToAnns = dict()
        data = json.load(open(self.coco_cn_file, "r"))
        for d in data:
            tmp = {
                'image_id': d['question_id'][-6:],
                'caption': d['answer']
            }
            imgToAnns[d['question_id'][-6:]] = [tmp]
        return imgToAnns


def coco_caption_eval(results_file, split_name):
    files = {
        "val": "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json",
        "test": "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json"
    }

    # create coco object and coco_result object
    annotation_file = files[split_name]
    coco = COCO_Annotation(annotation_file)
    coco_result = COCO_Result(results_file)

    # create coco_eval object by taking coco and coco_result
    coco_eval = COCOEvalCap(coco, coco_result)

    # evaluate on a subset of images by setting
    # coco_eval.params['image_id'] = coco_result.getImgIds()
    # please remove this line when evaluating the full validation set
    # coco_eval.params['image_id'] = coco_result.getImgIds()

    # evaluate results
    # SPICE will take a few minutes the first time, but speeds up due to caching
    coco_eval.evaluate()

    # print output evaluation scores
    for metric, score in coco_eval.eval.items():
        print(f"{metric}: {score:.3f}")

    return coco_eval


def main():
    result_file = "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/mix_coco_gqa_cap_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0302/20240302231/result/val_vqa_result_coco_cap.json"
    split_name = "val"
    coco_val = coco_caption_eval(result_file, split_name)

    agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"]
    # log_stats = {split_name: {k: v for k, v in coco_val.eval.items()}}
    # with open(
    #     os.path.join(registry.get_path("output_dir"), "evaluate.txt"), "a"
    # ) as f:
    #     f.write(json.dumps(log_stats) + "\n")

    coco_res = {k: v for k, v in coco_val.eval.items()}
    coco_res["agg_metrics"] = agg_metrics
    print(coco_res)


main()
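
For reference, a minimal sketch (not part of this commit; file name and image ids are made up) of the prediction-file format that COCO_Result expects: a JSON list whose entries carry a question_id ending in the zero-padded COCO image id and an answer caption.

import json

# hypothetical predictions; COCO_Result keys each image by question_id[-6:]
predictions = [
    {"question_id": "coco_cap_397133", "answer": "a man riding a wave on a surfboard"},
    {"question_id": "coco_cap_037777", "answer": "a kitchen with a stove and a sink"},
]
with open("val_vqa_result_coco_cap_example.json", "w") as f:
    json.dump(predictions, f)
# coco_caption_eval("val_vqa_result_coco_cap_example.json", "val") would then score
# these captions against the Karpathy val ground truth listed in `files` above.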

(32 binary image files deleted, 25 KiB to 1.5 MiB each; previews not shown.)

View File

@ -16,11 +16,16 @@ datasets:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_train.json
# val:
# url:
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
# storage:
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
val:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
test:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
# test:
# url:
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json

View File

@ -17,14 +17,14 @@ datasets:
# md5: aa31ac474cf6250ebb81d18348a07ed8
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_train.json
# val:
# url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
# storage:
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
# test:
# url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
# storage:
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
val:
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
test:
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
images:
storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO

View File

@ -0,0 +1,26 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_caption: # name of the dataset builder
    # dataset_card: dataset_card/coco_caption.md
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
      images:
        storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO

View File

@ -14,7 +14,7 @@ from minigpt4.datasets.datasets.flickr import GroundedDetailDataset,CaptionToObj
from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset
from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset
from minigpt4.datasets.datasets.gqa_datasets import GQADataset, GQAEvalDataset
from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset
from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset
from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset
from minigpt4.datasets.datasets.ok_vqa_datasets import OKVQADataset, OKVQAEvalDataset
from minigpt4.datasets.datasets.ocrvqa_dataset import OCRVQADataset
@ -384,7 +384,7 @@ class OKVQABuilder(COCOVQABuilder):
@registry.register_builder("aok_vqa")
class AOKVQABuilder(BaseDatasetBuilder):
train_dataset_cls = AOKVQADataset
eval_dataset_cls = AOKVQADataset
eval_dataset_cls = AOKVQAEvalDataset
DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"}
@ -584,6 +584,7 @@ class COCOCapBuilder(BaseDatasetBuilder):
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco/caption.yaml",
"coco_cap_eval": "configs/datasets/coco/caption_eval.yaml",
}

View File

@ -13,7 +13,7 @@ import torch
from PIL import Image
from minigpt4.datasets.datasets.vqa_datasets import VQADataset #, VQAEvalDataset
from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset
class __DisplMixin:
@ -37,11 +37,11 @@ class AOKVQADataset(VQADataset, __DisplMixin):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool =[
'{}',
'Q: {} A: ',
'Based on the image, respond to this question with a short answer: {}',
'{} A short answer to the question is ',
'Question: {} Short answer:',
'{} Choose from {}.',
'Q: {} Multi Choices: {} A: ',
'Question: {} Multi Choices: {} Answer: ',
"{} Choose one from the following possible answers: {}. ",
'{} Choose from {}. The answer is',
]
exist_annotation = []
@ -63,25 +63,19 @@ class AOKVQADataset(VQADataset, __DisplMixin):
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
answer_key = "direct_answers"
answer_weight = {}
for answer in ann[answer_key]:
if answer in answer_weight.keys():
answer_weight[answer] += 1 / len(ann[answer_key])
else:
answer_weight[answer] = 1 / len(ann[answer_key])
answers = list(answer_weight.keys())
weights = list(answer_weight.values())
answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights
answer_lst = ann["choices"]
direct_answers = ann["direct_answers"]
final_answer = random.choices(direct_answers, k=1)[0]
for answer in answer_lst:
if answer in direct_answers:
final_answer = answer
return {
"image": image,
"image_id": ann["image"],
"question": question,
"answer": answer,
"answer": final_answer,
"choices": ", ".join(answer_lst)
}
def __getitem__(self, index):
@ -90,7 +84,7 @@ class AOKVQADataset(VQADataset, __DisplMixin):
answer = self.text_processor(data['answer'])
q_input = question
llm_input = random.choice(self.instruction_pool).format(question)
llm_input = random.choice(self.instruction_pool).format(question, data["choices"])
return {
"image": data['image'],
@ -104,25 +98,103 @@ class AOKVQADataset(VQADataset, __DisplMixin):
}
class AOKVQGDataset(AOKVQADataset):
class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool = [
'Given the image, generate a question whose answer is: {}',
'Based on the image, provide a question with the answer: {}',
'Given the visual representation, create a question for which the answer is "{}"',
'From the image provided, craft a question that leads to the reply: {}',
'Considering the picture, come up with a question where the answer is: {}',
'Taking the image into account, generate an question that has the answer: {}'
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
self.vis_root = vis_root
self.annotation = json.load(open(ann_paths[0]))
self.instruction_pool =[
'{} Choose from {}.',
'Q: {} Multi Choices: {} A: ',
'Question: {} Multi Choices: {} Answer: ',
"{} Choose one from the following possible answers: {}. ",
'{} Choose from {}. The answer is',
]
def __getitem__(self, index):
data = self.get_data(index)
instruction = random.choice(self.instruction_pool).format(data['answer'])
try:
self.coco_fmt_qust_file = ann_paths[2]
self.coco_fmt_anno_file = ann_paths[3]
except IndexError:
self.coco_fmt_qust_file = None
self.coco_fmt_anno_file = None
self.vis_processor = vis_processor
self.text_processor = text_processor
self.source = 'aokvqa'
def collater(self, samples):
(
image_list,
question_list,
question_id_list,
choices_list,
correct_choice_idx_list,
direct_answers_list,
llm_input_list,
q_input_list,
source_list,
) = ([], [], [], [], [], [], [], [], [])
for sample in samples:
image_list.append(sample["image"])
question_list.append(sample["text_input"])
question_id_list.append(sample["question_id"])
choices_list.append(sample["choices"])
correct_choice_idx_list.append(sample["correct_choice_idx"])
direct_answers_list.append(sample["direct_answers"])
llm_input_list.append(sample["llm_input"])
q_input_list.append(sample["q_input"])
source_list.append(sample["source"])
return {
"image": data['image'],
"instruction_input": instruction,
"answer": data['question'],
"image": torch.stack(image_list, dim=0),
"text_input": question_list,
"question_id": question_id_list,
"choices": choices_list,
"correct_choice_idx": correct_choice_idx_list,
"direct_answers": direct_answers_list,
"llm_input": llm_input_list,
"q_input": q_input_list,
"source": source_list,
}
def __getitem__(self, index):
ann = self.annotation[index]
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
choices = ann["choices"]
if "correct_choice_idx" in ann:
correct_choice_idx = ann["correct_choice_idx"]
else:
correct_choice_idx = None
if "direct_answers" in ann:
direct_answers = ann["direct_answers"]
else:
direct_answers = None
llm_input = random.choice(self.instruction_pool).format(question, ", ".join(choices))
return {
"image": image,
"q_input": question,
"llm_input": llm_input,
"text_input": question,
"question_id": ann["question_id"],
"choices": choices,
"correct_choice_idx": correct_choice_idx,
"direct_answers": direct_answers,
"source": 'aokvqa',
}
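
A quick sketch (illustrative only; the question and choices are made up) of how the eval-time templates above are filled in: the question plus the comma-joined candidate answers.

import random

instruction_pool = [
    '{} Choose from {}.',
    'Q: {} Multi Choices: {} A: ',
    'Question: {} Multi Choices: {} Answer: ',
    "{} Choose one from the following possible answers: {}. ",
    '{} Choose from {}. The answer is',
]
question = "What is the man holding?"                       # hypothetical sample
choices = ["surfboard", "kite", "umbrella", "skateboard"]   # hypothetical choices
llm_input = random.choice(instruction_pool).format(question, ", ".join(choices))
print(llm_input)
# e.g. "What is the man holding? Choose from surfboard, kite, umbrella, skateboard."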

View File

@ -60,82 +60,6 @@ class CaptionDataset(BaseDataset, __DisplMixin):
"image_id": self.img_ids[ann["image_id"]],
}
class COCOCaptionDataset(BaseDataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.img_ids = {}
n = 0
self.filter_anntation = []
for ann in self.annotation:
if "train" in ann["image"]:
self.filter_anntation.append(ann)
self.annotation = self.filter_anntation
for ann in self.annotation:
img_id = ann["image_id"]
if img_id not in self.img_ids.keys():
self.img_ids[img_id] = n
n += 1
self.instruction_pool = [
'Briefly describe this image.',
'Provide a concise depiction of this image.',
'Present a short description of this image.',
'Summarize this image in a few words.',
'A short image caption:',
'A short image description:',
'A photo of ',
'An image that shows ',
'Write a short description for the image. ',
'Write a description for the photo.',
'Provide a description of what is presented in the photo.',
'Briefly describe the content of the image.',
'Can you briefly explain what you see in the image?',
'Could you use a few words to describe what you perceive in the photo?',
'Please provide a short depiction of the picture.',
'Using language, provide a short account of the image.',
'Use a few words to illustrate what is happening in the picture.',
]
self.source = 'coco_cap'
def __getitem__(self, index):
# TODO this assumes image input, not general enough
ann = self.annotation[index]
# img_file = ann["image"].split("/")[-1]
img_file = ann["image"]
image_path = os.path.join(self.vis_root, img_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
caption = self.text_processor(ann["caption"])
# instruction = random.choice(self.instruction_pool)
# instruction = "<Img><ImageHere></Img> [caption] {} ".format(instruction)
q_input = ""
llm_input = random.choice(self.instruction_pool)
return {
"image": image,
"image_id": ann["image"],
"answer": caption,
"q_input": q_input,
"llm_input": llm_input,
"text_input": llm_input,
"text_output": caption,
"source": 'coco_cap',
}
class CaptionEvalDataset(BaseDataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
@ -159,3 +83,4 @@ class CaptionEvalDataset(BaseDataset, __DisplMixin):
"image_id": ann["image_id"],
"instance_id": ann["instance_id"],
}

View File

@ -9,18 +9,102 @@ import os
import json
import torch
import numpy as np
import random
from PIL import Image
from PIL import ImageFile
from collections import OrderedDict
ImageFile.LOAD_TRUNCATED_IMAGES = True
from minigpt4.datasets.datasets.caption_datasets import COCOCaptionDataset, CaptionEvalDataset
from minigpt4.datasets.datasets.base_dataset import BaseDataset
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset, CaptionEvalDataset
COCOCapDataset = COCOCaptionDataset
class __DisplMixin:
def displ_item(self, index):
sample, ann = self.__getitem__(index), self.annotation[index]
return OrderedDict(
{
"file": ann["image"],
"caption": ann["caption"],
"image": sample["image"],
}
)
class COCOCapDataset(BaseDataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
vis_root (string): Root directory of images (e.g. coco/images/)
ann_root (string): directory to store the annotation file
"""
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.img_ids = {}
n = 0
self.filter_anntation = []
for ann in self.annotation:
if "train" in ann["image"]:
self.filter_anntation.append(ann)
self.annotation = self.filter_anntation
for ann in self.annotation:
img_id = ann["image_id"]
if img_id not in self.img_ids.keys():
self.img_ids[img_id] = n
n += 1
self.instruction_pool = [
'Briefly describe this image.',
'Provide a concise depiction of this image.',
'Present a short description of this image.',
'Summarize this image in a few words.',
'A short image caption:',
'A short image description:',
'A photo of ',
'An image that shows ',
'Write a short description for the image. ',
'Write a description for the photo.',
'Provide a description of what is presented in the photo.',
'Briefly describe the content of the image.',
'Can you briefly explain what you see in the image?',
'Could you use a few words to describe what you perceive in the photo?',
'Please provide a short depiction of the picture.',
'Using language, provide a short account of the image.',
'Use a few words to illustrate what is happening in the picture.',
]
self.source = 'coco_cap'
def __getitem__(self, index):
# TODO this assumes image input, not general enough
ann = self.annotation[index]
# img_file = ann["image"].split("/")[-1]
img_file = ann["image"]
image_path = os.path.join(self.vis_root, img_file)
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
caption = self.text_processor(ann["caption"])
instruction = random.choice(self.instruction_pool)
# q_input = ""
q_input = instruction
llm_input = instruction
return {
"image": image,
"image_id": ann["image"],
"answer": caption,
"q_input": q_input,
"llm_input": llm_input,
"text_input": llm_input,
"text_output": caption,
"source": 'coco_cap',
}
class COCOCapEvalDataset(CaptionEvalDataset):
@ -31,6 +115,26 @@ class COCOCapEvalDataset(CaptionEvalDataset):
split (string): val or test
"""
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool = [
'Briefly describe this image.',
'Provide a concise depiction of this image.',
'Present a short description of this image.',
'Summarize this image in a few words.',
'A short image caption:',
'A short image description:',
'A photo of ',
'An image that shows ',
'Write a short description for the image. ',
'Write a description for the photo.',
'Provide a description of what is presented in the photo.',
'Briefly describe the content of the image.',
'Can you briefly explain what you see in the image?',
'Could you use a few words to describe what you perceive in the photo?',
'Please provide a short depiction of the picture.',
'Using language, provide a short account of the image.',
'Use a few words to illustrate what is happening in the picture.',
]
self.source = 'coco_cap'
def __getitem__(self, index):
@ -38,15 +142,25 @@ class COCOCapEvalDataset(CaptionEvalDataset):
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
image = self.vis_processor(image)
try:
image = self.vis_processor(image)
except Exception as e:
print(e)
print(image_path)
img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]
instruction = random.choice(self.instruction_pool)
# q_input = ""
q_input = instruction
llm_input = instruction
return {
"image": image,
"image_id": img_id,
"instance_id": ann["instance_id"],
"text_input":llm_input,
"q_input": q_input,
"llm_input": llm_input,
"source": self.source,
}

View File

@ -149,7 +149,6 @@ class OKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
self.source = 'okvqa'
self.annotation_add = self.get_data()
self._add_instance_ids()
def get_data(self):
ann_instruct = list()
@ -180,7 +179,6 @@ class OKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
"image_id": ann["image"],
'image_path': image_path,
"question_id": ann["question_id"],
# "instance_id": ann["instance_id"],
"question": question,
"q_input": q_input,
"llm_input": llm_input,

View File

@ -45,7 +45,6 @@ from transformers.utils import logging
from transformers.models.bert.configuration_bert import BertConfig
from minigpt4.models.moe.utils import (
FeedForward,
MoEModelOutput,
MoEModelOutputWithPooling,
use_experts,

File diff suppressed because it is too large

View File

@ -389,17 +389,23 @@ class BertOutput(nn.Module): # Add & Norm
class FeedForward(nn.Module):
# remove LayerNorm
def __init__(self, config):
nn.Module.__init__(self)
# first layer
self.intermediate_query = BertIntermediate(config)
# second layer
self.output_query = BertOutput(config)
super().__init__()
self.dense1 = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.dense2 = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob) # adjust dropout ratio 0.1->0.2
# self.dropout = nn.Dropout(0.2) # adjust dropout ratio 0.1->0.2
def forward(self, hidden_states: Tensor):
input_tensor = hidden_states
intermediate_output = self.intermediate_query(hidden_states)
hidden_states = self.output_query(intermediate_output, input_tensor)
hidden_states = self.dense1(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dense2(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
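
A standalone sketch (assumptions: bert-base sizes 768/3072 and GELU standing in for ACT2FN; not the repo code) of the reworked expert FFN: the LayerNorm is no longer inside FeedForward, and BertLayer now applies a shared expert_ln over the residual sum of the expert output and the attention output.

import torch
import torch.nn as nn

class FeedForwardSketch(nn.Module):
    def __init__(self, hidden_size=768, intermediate_size=3072, dropout=0.1):
        super().__init__()
        self.dense1 = nn.Linear(hidden_size, intermediate_size)
        self.act = nn.GELU()                      # stands in for ACT2FN[config.hidden_act]
        self.dense2 = nn.Linear(intermediate_size, hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, hidden_states):
        return self.dropout(self.dense2(self.act(self.dense1(hidden_states))))

expert = FeedForwardSketch()
expert_ln = nn.LayerNorm(768)                     # the post-expert LayerNorm added to BertLayer
attention_output = torch.randn(4, 32, 768)        # [bz, num_query_tokens, hidden]
layer_output = expert_ln(expert(attention_output) + attention_output)
print(layer_output.shape)                         # torch.Size([4, 32, 768])
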
@ -433,7 +439,6 @@ class BertLayer(nn.Module):
self.layer_judge = moe_layer_judge(layer_num)
self.num_beams = config.moebert_num_beams
ffn = FeedForward(config)
if self.use_experts:
self.experts = RouteMoELayer(
hidden_size=config.hidden_size,
@ -446,8 +451,7 @@ class BertLayer(nn.Module):
)
else:
self.experts = ffn
# self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.expert_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
@ -538,7 +542,7 @@ class BertLayer(nn.Module):
if self.layer_judge == 'first' and self.num_beams>1:
# if layer_output[0].shape[0] == layer_output_text.shape[0]*self.num_beams and self.num_beams>1:
# adjust the dimension of layer_output_text to bz*num_beams
layer_output_text = self.adjust_layer_output_text(layer_output_text)
layer_output_text = self.adjust_hidden_states_by_num_beams(layer_output_text)
if self.layer_judge == 'mid' and self.num_beams > 1:
# layer_output_text [bz*num_beams, len, hidden_size]
@ -575,11 +579,11 @@ class BertLayer(nn.Module):
attention_mask = tmp.contiguous().view(batch_size* self.num_beams, 1, 1, attention_mask.shape[3]) # torch.Size([bz*num_beams, 1, 1, 32+input_len])
return attention_mask
def adjust_layer_output_text(self, layer_output_text):
batch_size, text_length, hidden_size = layer_output_text.shape
tmp_text = layer_output_text.unsqueeze(1).expand(batch_size, self.num_beams, text_length, hidden_size)
layer_output_text = tmp_text.contiguous().view(-1, text_length, hidden_size) # [bz*num_beams, text_length ,768]
return layer_output_text
def adjust_hidden_states_by_num_beams(self, hidden_states):
batch_size, text_length, hidden_size = hidden_states.shape
tmp_text = hidden_states.unsqueeze(1).expand(batch_size, self.num_beams, text_length, hidden_size)
hidden_states = tmp_text.contiguous().view(-1, text_length, hidden_size) # [bz*num_beams, text_length ,768]
return hidden_states
def route_moe_last_layer_top1(self, layer_output, layer_output_text):
batch_size = layer_output[0].shape[0]
@ -602,20 +606,21 @@ class BertLayer(nn.Module):
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
# layer_output = self.LayerNorm(layer_output + attention_output)
return layer_output
def feed_forward_query_moe(self, attention_output, expert_attention_mask, beam_scores, expert_route):
if not self.use_experts:
layer_output = self.experts(attention_output)
# layer_output = self.LayerNorm(layer_output + attention_output)
hidden_states = self.experts(attention_output)
layer_output = self.expert_ln(hidden_states + attention_output)
return layer_output, None, None, None, 0.0
layer_output, beam_scores, expert_route, beam_idx, importance_loss = self.experts(
hidden_states, beam_scores, expert_route, beam_idx, importance_loss = self.experts(
attention_output, expert_attention_mask, beam_scores, expert_route
)
if hidden_states.shape[0]==attention_output.shape[0]*self.num_beams and self.num_beams>1:
attention_output = self.adjust_hidden_states_by_num_beams(attention_output)
layer_output = self.expert_ln(hidden_states + attention_output)
# layer_output = self.LayerNorm(layer_output + attention_output)
return layer_output, beam_scores, expert_route, beam_idx, importance_loss
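
A toy sketch (shapes assumed: 32 query tokens, hidden size 768) of what adjust_hidden_states_by_num_beams does above: each sample's hidden states are repeated num_beams times along the batch dimension so the residual matches the per-beam expert candidates.

import torch

num_beams, bz, length, hidden = 2, 4, 32, 768
hidden_states = torch.randn(bz, length, hidden)
expanded = hidden_states.unsqueeze(1).expand(bz, num_beams, length, hidden)
expanded = expanded.contiguous().view(-1, length, hidden)
print(expanded.shape)   # torch.Size([8, 32, 768]) == [bz*num_beams, len, hidden]
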
class BertEncoder(nn.Module):

File diff suppressed because it is too large

View File

@ -22,6 +22,7 @@ from minigpt4.common.logger import MetricLogger
from minigpt4.models.base_model import BaseModel
from minigpt4.models.Qformer import BertConfig, BertLMHeadModel
from minigpt4.models.QformerMoE import BertMoELMHeadModel
from minigpt4.models.QformerMoELN import BertMoELMHeadModelLNIn
from minigpt4.models.QformerRouteMoE import BertMoERouteLMHeadModel
from minigpt4.models.eva_vit import create_eva_vit_g
from transformers import BertTokenizer
@ -88,7 +89,7 @@ class Blip2Base(BaseModel):
@classmethod
def init_QformerMoE(cls, num_query_token, vision_width, moebert_expert_num, moebert_route_method, moebert_load_balance, moe_topk=1, use_balance_loss=True, moe_weight_type='l2_norm', cross_attention_freq=2):
def init_QformerMoE(cls, num_query_token, vision_width, moebert_expert_num, moebert_route_method, moebert_load_balance, moe_topk=1, use_balance_loss=True, moe_weight_type='l2_norm', cross_attention_freq=2,ln_position="out"):
moe_encoder_config = BertConfig.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased")
moe_encoder_config.encoder_width = vision_width
@ -104,9 +105,14 @@ class Blip2Base(BaseModel):
moe_encoder_config.use_balance_loss = use_balance_loss
moe_encoder_config.moe_weight_type = moe_weight_type
MoEQformer = BertMoELMHeadModel.from_pretrained(
"/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config
)
if ln_position == "out":
MoEQformer = BertMoELMHeadModel.from_pretrained(
"/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config
)
elif ln_position == "in":
MoEQformer = BertMoELMHeadModelLNIn.from_pretrained(
"/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config
)
query_tokens = nn.Parameter(
torch.zeros(1, num_query_token, moe_encoder_config.hidden_size)
)

View File

@ -65,6 +65,8 @@ class Blip2VicunaInstruct(Blip2Base):
use_balance_loss = True,
moe_weight_type = "l2_norm",
gate_save_path = None,
bal_loss_decay_epoch = 3,
ln_position = "out",
):
super().__init__()
transformers_version = version.parse(transformers.__version__)
@ -112,7 +114,8 @@ class Blip2VicunaInstruct(Blip2Base):
moe_topk=moe_topk,
use_balance_loss=use_balance_loss,
moe_weight_type=moe_weight_type,
cross_attention_freq=2
cross_attention_freq=2,
ln_position=ln_position,
)
else:
self.Qformer, self.query_tokens = self.init_Qformer(
@ -221,6 +224,7 @@ class Blip2VicunaInstruct(Blip2Base):
self.moebert_num_beams = moebert_num_beams
self.gate_save_path = gate_save_path
self.bal_loss_decay_epoch = bal_loss_decay_epoch
# if self.gate_save_path != None:
# import os
# if not os.path.exists(self.gate_save_path):
@ -394,7 +398,10 @@ class Blip2VicunaInstruct(Blip2Base):
)
if self.use_moeqformer:
loss = outputs.loss + self.moebert_load_balance * gate_loss
if samples['epoch'] > self.bal_loss_decay_epoch:
loss = outputs.loss
else:
loss = outputs.loss + self.moebert_load_balance * gate_loss
else:
loss = outputs.loss
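
A small sketch (numbers are made up) of the new balance-loss schedule: the weighted gate loss is only added while samples['epoch'] is at most bal_loss_decay_epoch, after which training falls back to the plain LM loss.

def total_loss(lm_loss, gate_loss, epoch, load_balance=0.05, bal_loss_decay_epoch=3):
    # mirrors the branch above, as a plain function for illustration
    if epoch > bal_loss_decay_epoch:
        return lm_loss
    return lm_loss + load_balance * gate_loss

print(total_loss(2.0, 0.4, epoch=2))   # 2.02 -> balance term still applied
print(total_loss(2.0, 0.4, epoch=5))   # 2.0  -> balance term dropped
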
@ -512,6 +519,16 @@ class Blip2VicunaInstruct(Blip2Base):
with self.maybe_autocast():
inputs_embeds = self.llm_model.get_input_embeddings()(llm_tokens.input_ids)
# path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/embedding/"
# np.save(os.join(path, "inputs_llm.npy"), inputs_llm.cpu().numpy)
# np.save(os.join(path, "inputs_llm.npy"), self.llm_model.get_input_embeddings().weight.cpu().numpy)
# samples_copy = samples.copy()
# samples_copy.pop('image', None)
# with open(os.path.join(path, 'test_samples.json'),'a+') as f:
# f.write(f"{json.dumps(samples_copy)}\n")
inputs_embeds = torch.cat([inputs_llm, inputs_embeds], dim=1)
attention_mask = torch.cat([atts_llm, llm_tokens.attention_mask], dim=1)
@ -654,6 +671,8 @@ class Blip2VicunaInstruct(Blip2Base):
use_balance_loss = cfg.get("use_balance_loss", True)
moe_weight_type = cfg.get("moe_weight_type",'l2_norm')
gate_save_path = cfg.get("gate_save_path", None)
bal_loss_decay_epoch = cfg.get("bal_loss_decay_epoch", 3)
ln_position = cfg.get("ln_position","out")
model = cls(
vit_model=vit_model,
@ -683,6 +702,8 @@ class Blip2VicunaInstruct(Blip2Base):
use_balance_loss=use_balance_loss,
moe_weight_type=moe_weight_type,
gate_save_path=gate_save_path,
bal_loss_decay_epoch=bal_loss_decay_epoch,
ln_position=ln_position,
)
# if qformer_text_input:

View File

@ -165,7 +165,7 @@ class RouteMoELayer(nn.Module):
self.route_method = route_method
if self.route_method == "pre-route":
self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
elif self.route_method == "post-route":
elif self.route_method in ["post-route", "post-route-dp"]:
gate = nn.Linear(hidden_size, 1, bias=False).float()
self.gate = gate
# self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
@ -252,6 +252,53 @@ class RouteMoELayer(nn.Module):
return beam_scores, expert_route, beam_idx
def dp_search(self, current_scores_log, beam_scores, expert_route, batch_size):
if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route', 'post-route-dp']:
# current_scores_log torch.Size([bz, num_experts])
assert beam_scores==None and expert_route==None
current_scores = torch.exp(current_scores_log)
topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate: the expert assigned to each sample, torch.Size([bz, topk])
beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
beam_idx = torch.tensor(range(self.num_beams * batch_size))
else:
batch_size = int(batch_size // self.num_beams)
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # after taking logs, probabilities can be added directly
next_scores_exp = torch.exp(next_scores_raw)
import pdb;pdb.set_trace()
next_scores_raw, next_experts_raw = torch.topk(next_scores_exp, 1, dim=1, largest=True, sorted=True)
next_scores = next_scores_raw.view(batch_size, self.num_beams)
next_experts = next_experts_raw.view(batch_size, self.num_beams)
# next_scores, next_experts = torch.topk(current_scores_log, 1, dim=1, largest=True, sorted=True) # equivalent
# next_scores torch.Size([bz * num_beams, 1])
# next_tokens torch.Size([bz * num_beams, 1])
next_batch_beam = list()
for batch_idx in range(batch_size):
next_sent_beam = list()
expert_id = next_experts[batch_idx]
expert_score = next_scores[batch_idx]
values, index = torch.topk(expert_score, self.num_beams, dim=0, largest=True, sorted=True)
for i in range(self.num_beams):
beam_id = index[i].item()
ex_id = expert_id[beam_id].item()
effective_beam_id = batch_idx*self.num_beams + beam_id
next_sent_beam.append((values[i], ex_id, effective_beam_id))
next_batch_beam.extend(next_sent_beam)
import pdb;pdb.set_trace()
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
pre_route = expert_route[beam_idx,:]
expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
return beam_scores, expert_route, beam_idx
def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route']:
# current_scores_log torch.Size([bz, num_experts])
@ -267,6 +314,8 @@ class RouteMoELayer(nn.Module):
batch_size = int(batch_size // self.num_beams)
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # after taking logs, probabilities can be added directly
next_scores_exp = torch.exp(next_scores_raw)
import pdb;pdb.set_trace()
next_scores_raw1 = next_scores_exp.view(
batch_size, self.num_beams * self.num_experts
) # torch.Size([bz, num_beams*num_experts])
@ -289,7 +338,7 @@ class RouteMoELayer(nn.Module):
next_sent_beam.append((expert_score, ex_id, effective_beam_id))
next_batch_beam.extend(next_sent_beam)
# import pdb;pdb.set_trace()
import pdb;pdb.set_trace()
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
@ -301,8 +350,6 @@ class RouteMoELayer(nn.Module):
return beam_scores, expert_route, beam_idx
def forward_expert_ffn(self, x, expert_select, current_scores):
"""
x_repeat : [bz*num_beams, 32,768]
@ -343,6 +390,7 @@ class RouteMoELayer(nn.Module):
batch_size, num_tokens = x.shape[0], x.shape[1]
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
current_expert_select = expert_route[:,-1]
import pdb;pdb.set_trace()
@ -368,7 +416,6 @@ class RouteMoELayer(nn.Module):
output_x = self.experts[expert_idx].forward(input_x)
return output_x
import pdb; pdb.set_trace()
outputs = list()
logits_gate_lst = list()
for expert_idx in range(self.num_experts):
@ -392,10 +439,14 @@ class RouteMoELayer(nn.Module):
# importance loss
importance_loss = self._importance_auxiliary_loss(current_scores)
# import pdb; pdb.set_trace()
batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
import pdb; pdb.set_trace()
if self.route_method == 'post-route':
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
elif self.route_method == 'post-route-dp':
beam_scores, expert_route, beam_idx = self.dp_search(current_scores_log, beam_scores, expert_route, batch_size)
# beam_scores torch.Size([bz*num_beam])
# expert_route torch.Size([bz*num_beam, layer_n])
current_select_expert = expert_route[:,-1]
@ -431,7 +482,7 @@ class RouteMoELayer(nn.Module):
"""
if self.route_method == 'pre-route':
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
elif self.route_method == "post-route":
elif self.route_method in ['post-route', 'post-route-dp']:
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True)
return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
@ -467,10 +518,11 @@ if __name__ == '__main__':
batch_size = 4
x = torch.randn(batch_size, 32, 768)
beam_scores, expert_route = None, None
x1 = x
x2 = x
x3 = x
beam_scores1, expert_route1 = None, None
beam_scores2, expert_route2 = None, None
for layer_num in [6, 8, 10]:
layer_judge = moe_layer_judge(layer_num)
@ -494,25 +546,41 @@ if __name__ == '__main__':
# print(importance_loss)
# x = hidden_states1
gate1 = nn.Linear(768, 1, bias=False).float()
# experts_post = RouteMoELayer(
# hidden_size=768,
# expert=ffn,
# num_experts=config.moebert_expert_num,
# num_beams=config.moebert_num_beams,
# layer_judge = layer_judge,
# route_method = "post-route",
# weight_type="ffn_prob"
# )
# layer_output = experts_post(x1, None, beam_scores1, expert_route1, False)
# hidden_states2, beam_scores1, expert_route1, beam_idx, importance_loss = layer_output
# print(beam_scores1)
# print(expert_route1)
# print(beam_idx)
# print(importance_loss)
# x1 = hidden_states2
experts_post = RouteMoELayer(
hidden_size=768,
expert=ffn,
num_experts=config.moebert_expert_num,
num_beams=config.moebert_num_beams,
layer_judge = layer_judge,
route_method = "post-route",
route_method = "post-route-dp",
weight_type="ffn_prob"
)
layer_output = experts_post(x1, None, beam_scores1, expert_route1, False)
hidden_states2, beam_scores1, expert_route1, beam_idx, importance_loss = layer_output
print(beam_scores1)
print(expert_route1)
print(beam_idx)
print(importance_loss)
x1 = hidden_states2
layer_output = experts_post(x2, None, beam_scores2, expert_route2, False)
hidden_states3, beam_scores2, expert_route2, beam_idx2, importance_loss2 = layer_output
print(beam_scores2)
print(expert_route2)
print(beam_idx2)
print(importance_loss2)
x2 = hidden_states3
# gate = nn.Linear(768, config.moebert_expert_num, bias=False).float()
# experts_moe = MoELayer(
@ -526,12 +594,12 @@ if __name__ == '__main__':
# weight_type=config.moe_weight_type,
# )
# attn_mask = torch.ones([batch_size, 32])
# layer_output = experts_moe(x2, attn_mask)
# hidden_states3, select_prob_gate, gate_load,_ = layer_output
# layer_output = experts_moe(x3, attn_mask)
# hidden_states4, select_prob_gate, gate_load,_ = layer_output
# print(select_prob_gate)
# print(gate_load)
# x2 = hidden_states3
# x3 = hidden_states4
print("------------------------------------")
import pdb; pdb.set_trace()

View File

@ -18,7 +18,7 @@ class RouteMoELayer(nn.Module):
self.route_method = route_method
if self.route_method == "pre-route":
self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
elif self.route_method == "post-route":
elif self.route_method in ["post-route", "post-route-dp"]:
gate = nn.Linear(hidden_size, 1, bias=False).float()
self.gate = gate
# self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
@ -47,26 +47,67 @@ class RouteMoELayer(nn.Module):
prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts])
return prob_gate
def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
if self.layer_judge=='first' and self.route_method=='pre-route':
def dp_search(self, current_scores_log, beam_scores, expert_route, batch_size):
if self.layer_judge=='first' and self.route_method in ['post-route-dp']:
# current_scores_log torch.Size([bz, num_experts])
assert beam_scores==None and expert_route==None
current_scores = torch.exp(current_scores_log)
topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate: the expert assigned to each sample, torch.Size([bz, topk])
beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
beam_idx = torch.tensor(range(self.num_beams * batch_size))
else:
if self.layer_judge=='first' and self.route_method == 'post-route':
batch_size = batch_size
next_scores_raw1 = torch.exp(current_scores_log) # torch.Size([bz, num_experts])
else:
batch_size = int(batch_size // self.num_beams)
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # after taking logs, probabilities can be added directly
next_scores_exp = torch.exp(next_scores_raw)
next_scores_raw1 = next_scores_exp.view(
batch_size, self.num_beams * self.num_experts
) # torch.Size([bz, num_beams*num_experts])
batch_size = int(batch_size // self.num_beams)
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # after taking logs, probabilities can be added directly
next_scores_exp = torch.exp(next_scores_raw)
next_scores_raw, next_experts_raw = torch.topk(next_scores_exp, 1, dim=1, largest=True, sorted=True)
next_scores = next_scores_raw.view(batch_size, self.num_beams)
next_experts = next_experts_raw.view(batch_size, self.num_beams)
# next_scores, next_experts = torch.topk(current_scores_log, 1, dim=1, largest=True, sorted=True) # equivalent
# next_scores torch.Size([bz * num_beams, 1])
# next_tokens torch.Size([bz * num_beams, 1])
next_batch_beam = list()
for batch_idx in range(batch_size):
next_sent_beam = list()
expert_id = next_experts[batch_idx]
expert_score = next_scores[batch_idx]
values, index = torch.topk(expert_score, self.num_beams, dim=0, largest=True, sorted=True)
for i in range(self.num_beams):
beam_id = index[i].item()
ex_id = expert_id[beam_id].item()
effective_beam_id = batch_idx*self.num_beams + beam_id
next_sent_beam.append((values[i], ex_id, effective_beam_id))
next_batch_beam.extend(next_sent_beam)
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
pre_route = expert_route[beam_idx,:]
expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
return beam_scores, expert_route, beam_idx
def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route']:
# current_scores_log torch.Size([bz, num_experts])
assert beam_scores==None and expert_route==None
current_scores = torch.exp(current_scores_log)
topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate: the expert assigned to each sample, torch.Size([bz, topk])
beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
beam_idx = torch.tensor(range(self.num_beams * batch_size))
else:
batch_size = int(batch_size // self.num_beams)
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # after taking logs, probabilities can be added directly
next_scores_exp = torch.exp(next_scores_raw)
next_scores_raw1 = next_scores_exp.view(
batch_size, self.num_beams * self.num_experts
) # torch.Size([bz, num_beams*num_experts])
next_scores, next_experts = torch.topk(next_scores_raw1, self.num_beams, dim=1, largest=True, sorted=True)
# next_scores torch.Size([bz, num_beams])
@ -86,19 +127,11 @@ class RouteMoELayer(nn.Module):
next_sent_beam.append((expert_score, ex_id, effective_beam_id))
next_batch_beam.extend(next_sent_beam)
if self.layer_judge=='first' and self.route_method == 'post-route':
beam_scores = next_scores.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
expert_route = next_experts.view(self.num_beams * batch_size)
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
beam_experts = expert_route.new([x[1] for x in next_batch_beam]).unsqueeze(-1)
beam_idx = expert_route.new([int(x[2]/self.num_beams) for x in next_batch_beam])
expert_route = beam_experts
else:
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
pre_route = expert_route[beam_idx,:]
expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
pre_route = expert_route[beam_idx,:]
expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
return beam_scores, expert_route, beam_idx
@ -153,7 +186,6 @@ class RouteMoELayer(nn.Module):
# import pdb;pdb.set_trace()
return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
def forward_post_route(self, x, beam_scores, expert_route, use_log=True):
attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device)
@ -187,7 +219,12 @@ class RouteMoELayer(nn.Module):
importance_loss = self._importance_auxiliary_loss(current_scores)
batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
if self.route_method == 'post-route':
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
elif self.route_method == 'post-route-dp':
beam_scores, expert_route, beam_idx = self.dp_search(current_scores_log, beam_scores, expert_route, batch_size)
# beam_scores torch.Size([bz*num_beam])
# expert_route torch.Size([bz*num_beam, layer_n])
current_select_expert = expert_route[:,-1]
@ -218,7 +255,7 @@ class RouteMoELayer(nn.Module):
"""
if self.route_method == 'pre-route':
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
elif self.route_method == "post-route":
elif self.route_method in ['post-route', 'post-route-dp']:
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True)
return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
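
A toy comparison (made-up scores, 2 beams x 3 experts; not repo code) of the two searches wired in above: beam_search keeps the global top-num_beams over all beam-expert extensions, while the new dp_search extends every beam with only its own best expert, so no beam is ever dropped.

import torch

num_beams, num_experts = 2, 3
beam_scores = torch.tensor([0.6, 0.4])                        # one sample, two live beams
expert_probs = torch.tensor([[0.60, 0.35, 0.05],              # beam 0
                             [0.50, 0.30, 0.20]])             # beam 1
joint = beam_scores.unsqueeze(1) * expert_probs               # exp(log p_beam + log p_expert)

# beam_search ('post-route'): flatten and keep the global top-num_beams candidates
top_vals, top_idx = joint.view(-1).topk(num_beams)
print([(int(i) // num_experts, int(i) % num_experts) for i in top_idx])  # [(0, 0), (0, 1)] -> beam 1 dropped

# dp_search ('post-route-dp'): each beam keeps exactly its own best expert
dp_vals, dp_experts = joint.topk(1, dim=1)
print(list(enumerate(dp_experts.squeeze(1).tolist())))                   # [(0, 0), (1, 0)] -> both beams survive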

View File

@ -13,7 +13,7 @@ from typing import Optional, Tuple, List
def use_experts(layer_idx):
# if layer_idx % 2 == 0:
# use moe_ffn after cross_attns
if int(layer_idx) in [6,8,10]:
if int(layer_idx) in [6,7,8,9,10,11]:
# layer 6/8/10
return True
else:

View File

@ -0,0 +1,114 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: blip2_vicuna_instruct
  model_type: vicuna7b_pretrain
  load_pretrained: True
  load_finetuned: True
  vit_model: eva_clip_g
  pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
  finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/mix_coco_gqa_1610k_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0301/20240301223/checkpoint_best.pth"
  q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"

  # Q-Former
  num_query_token: 32
  qformer_text_input: True

  # T5
  llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
  prompt: ""
  max_txt_len: 256
  max_output_txt_len: 256

  # freeze
  freeze_vit: True
  freeze_llm: True
  freeze_qformer: False
  freeze_t5_proj: False

  # moe
  use_moeqformer: True
  use_route_moe: False
  moebert_expert_num: 3
  moebert_route_method: "gate-sentence-post"
  moe_weight_type: "raw_prob"
  moebert_load_balance: 0.05
  moe_topk: 1
  use_balance_loss: False
  ln_position: "out"

datasets:
  gqa:
    type: balanced_sft_raw_eval
    batch_size: 4
    vis_processor:
      eval:
        name: "blip2_image_eval"
        image_size: 224
    text_processor:
      eval:
        name: "blip_caption"

  ok_vqa: # train, valid (9009, 5046)
    type: ok_vqa_eval
    batch_size: 4
    vis_processor:
      eval:
        name: "blip2_image_eval"
        image_size: 224
    text_processor:
      eval:
        name: "blip_caption"

  coco_vqa: # 658104
    type: vqa_v2_eval
    batch_size: 4
    vis_processor:
      eval:
        name: "blip2_image_eval"
        image_size: 224
    text_processor:
      eval:
        name: "blip_caption"

  aok_vqa: # train: 17056, val: 1145
    batch_size: 4
    vis_processor:
      eval:
        name: "blip2_image_train"
        image_size: 224
    text_processor:
      eval:
        name: "blip_caption"

run:
  task: instruction_tuning
  seed: 42
  output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/eval/mix_coco_gqa_1610k_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0301/"

  num_workers: 4
  amp: True
  resume_ckpt_path: null

  evaluate: True
  test_splits: ["val"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True

View File

@ -10,7 +10,7 @@ model:
load_finetuned: True
vit_model: eva_clip_g
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/20240112212/checkpoint_best.pth"
finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/20240128142/checkpoint_best.pth"
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# vit encoder
@ -39,15 +39,39 @@ model:
use_moeqformer: True
use_route_moe: True
moebert_route_method: "post-route"
moebert_load_balance: 0
moebert_load_balance: 0.01
moebert_expert_num: 2
moebert_num_beams: 2
moe_weight_type: 'ffn_prob'
gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/"
use_balance_loss: False
bal_loss_decay_epoch: 8
gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/"
datasets:
gqa:
type: balanced_sft_raw_eval
batch_size: 64
vis_processor:
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
eval:
name: "blip_caption"
coco_vqa: # 658104
type: vqa_v2_eval
batch_size: 64
vis_processor:
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
eval:
name: "blip_caption"
coco_caption: # 414113 train
type: coco_cap_eval
batch_size: 32
vis_processor:
eval:
@ -59,18 +83,7 @@ datasets:
ok_vqa: # train, valid (9009, 5046)
type: ok_vqa_eval
batch_size: 32
vis_processor:
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
eval:
name: "blip_caption"
coco_vqa: # 658104
type: vqa_v2_eval
batch_size: 32
batch_size: 64
vis_processor:
eval:
name: "blip2_image_eval"
@ -96,7 +109,7 @@ run:
iters_per_epoch: 3000
seed: 42
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/"
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/"
amp: True
resume_ckpt_path: null

View File

@ -38,17 +38,17 @@ model:
# moe
use_moeqformer: True
use_route_moe: True
moebert_route_method: "post-route"
moebert_load_balance: 0
moebert_expert_num: 3
moebert_num_beams: 3
moebert_route_method: "post-route-dp"
moebert_load_balance: 0.05
moebert_expert_num: 2
moebert_num_beams: 2
moe_weight_type: 'ffn_prob'
use_balance_loss: False
datasets:
gqa: # train: 943000, 12578, 12578)
type: balanced_sft_raw
batch_size: 16
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@ -64,7 +64,7 @@ datasets:
sample_ratio: 10
ok_vqa: # train, valid (9009, 5046)
batch_size: 16
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@ -80,7 +80,7 @@ datasets:
sample_ratio: 1
coco_vqa: # 658104
batch_size: 16
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@ -112,7 +112,7 @@ run:
iters_per_epoch: 5000
seed: 42
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_1gate_3ex_3beam_1loss_5e5lr_top6layer_textinqf_epo8_0117/"
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_DP_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_005_5e5lr_top6layer_textinqf_epo8_0121/"
amp: True
resume_ckpt_path: null

View File

@ -38,14 +38,17 @@ model:
# moe
use_moeqformer: True
use_route_moe: True
moebert_expert_num: 5
moebert_num_beams: 1
# gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/route_save/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/"
moebert_route_method: "post-route"
moebert_load_balance: 0
moebert_expert_num: 2
moebert_num_beams: 2
moe_weight_type: 'ffn_prob'
use_balance_loss: False
datasets:
gqa: # train: 943000, 12578, 12578)
type: balanced_sft_raw
batch_size: 4
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@ -61,7 +64,7 @@ datasets:
sample_ratio: 10
ok_vqa: # train, valid (9009, 5046)
batch_size: 4
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@ -77,7 +80,7 @@ datasets:
sample_ratio: 1
coco_vqa: # 658104
batch_size: 4
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@ -96,20 +99,20 @@ run:
task: instruction_tuning
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 2e-5
init_lr: 5e-5
min_lr: 1e-6
warmup_lr: 1e-6
log_freq: 5
save_freq: 1500
weight_decay: 0.05
max_epoch: 6
max_epoch: 8
num_workers: 4
warmup_steps: 600
iters_per_epoch: 5000
seed: 42
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1212_Test/"
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_1loss_5e5lr_top6layer_textinqf_epo8_0123/"
amp: True
resume_ckpt_path: null

View File

@ -0,0 +1,145 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: blip2_vicuna_instruct
model_type: vicuna7b_pretrain
load_pretrained: True
load_finetuned: False
vit_model: eva_clip_g
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# finetuned: ""
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
# Q-Former
num_query_token: 32
qformer_text_input: True
# vicuna
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
prompt: ""
max_txt_len: 256
max_output_txt_len: 256
# freeze
freeze_vit: True
freeze_llm: True
freeze_qformer: False
freeze_t5_proj: False
# moe
use_moeqformer: False
use_route_moe: False
moebert_route_method: "post-route"
moebert_load_balance: 0
moebert_expert_num: 2
moebert_num_beams: 2
moe_weight_type: 'ffn_prob'
use_balance_loss: False
datasets:
gqa: # train: 943000, 12578, 12578)
type: balanced_sft_raw
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 10
ok_vqa: # train, valid (9009, 5046)
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 1
coco_vqa: # 658104
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 9
coco_caption: # 414113 train
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 7
run:
task: instruction_tuning
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 5e-5
min_lr: 1e-6
warmup_lr: 1e-6
log_freq: 5
save_freq: 1500
weight_decay: 0.05
max_epoch: 8
num_workers: 4
warmup_steps: 600
iters_per_epoch: 5000
seed: 42
# output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_005_5e5lr_top6layer_textinqf_epo8_0122/"
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Base_top6layer_textinqf_epo8_0124/"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: ["train"]
valid_splits: ["val"]
# test_splits: ["val"]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
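
As a rough reference for the optimizer settings above (lr_sched: "linear_warmup_cosine_lr", init_lr: 5e-5, min_lr: 1e-6, warmup_lr: 1e-6, warmup_steps: 600, max_epoch: 8, iters_per_epoch: 5000), a linear-warmup plus cosine-decay schedule is commonly computed as below. This is a generic sketch; the scheduler registered under that name in the codebase may differ in detail, for example by decaying per epoch rather than per step.

import math

def linear_warmup_cosine_lr(step, total_steps=8 * 5000,
                            init_lr=5e-5, min_lr=1e-6,
                            warmup_lr=1e-6, warmup_steps=600):
    """Learning rate at a given optimizer step (total_steps = max_epoch * iters_per_epoch)."""
    if step < warmup_steps:
        # Linear warmup from warmup_lr up to init_lr.
        return warmup_lr + (init_lr - warmup_lr) * step / max(1, warmup_steps)
    # Cosine decay from init_lr down to min_lr over the remaining steps.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (init_lr - min_lr) * (1 + math.cos(math.pi * progress))

# linear_warmup_cosine_lr(0)     -> 1e-6
# linear_warmup_cosine_lr(600)   -> 5e-5
# linear_warmup_cosine_lr(40000) -> 1e-6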

View File

@@ -0,0 +1,145 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: blip2_vicuna_instruct
model_type: vicuna7b_pretrain
load_pretrained: True
load_finetuned: False
vit_model: eva_clip_g
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# finetuned: ""
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
# Q-Former
num_query_token: 32
qformer_text_input: True
# vicuna
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
prompt: ""
max_txt_len: 256
max_output_txt_len: 256
# freeze
freeze_vit: True
freeze_llm: True
freeze_qformer: False
freeze_t5_proj: False
# moe
use_moeqformer: True
use_route_moe: True
moebert_route_method: "post-route"
moebert_load_balance: 0.01
moebert_expert_num: 2
moebert_num_beams: 2
moe_weight_type: 'ffn_prob'
use_balance_loss: False
bal_loss_decay_epoch: 3
datasets:
gqa: # train, val, test (943000, 12578, 12578)
type: balanced_sft_raw
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 10
ok_vqa: # train, valid (9009, 5046)
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 1
coco_vqa: # 658104
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 9
coco_caption: # 414113 train
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 7
run:
task: instruction_tuning
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 5e-5
min_lr: 1e-6
warmup_lr: 1e-6
log_freq: 5
save_freq: 1500
weight_decay: 0.05
max_epoch: 8
num_workers: 4
warmup_steps: 600
iters_per_epoch: 5000
seed: 42
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_loss_decay_5e5lr_top6layer_textinqf_epo8_0129/"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: ["train"]
valid_splits: ["val"]
# test_splits: ["val"]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
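
The moebert_load_balance: 0.01 weight together with bal_loss_decay_epoch: 3 (and the "2loss_001_loss_decay" tag in the output directory) suggests a task loss combined with a weighted load-balancing auxiliary loss whose weight is reduced later in training. A hedged sketch of that combination; the decay rule and function name are assumptions, not this repository's exact code:

def total_loss(task_loss, balance_loss, epoch,
               load_balance=0.01, bal_loss_decay_epoch=3, decay_factor=0.1):
    """Task loss plus a decayed load-balance penalty on the MoE router."""
    # After `bal_loss_decay_epoch`, shrink the auxiliary weight so the router is
    # no longer pushed as hard toward uniform expert usage.
    weight = load_balance if epoch < bal_loss_decay_epoch else load_balance * decay_factor
    return task_loss + weight * balance_loss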

View File

@@ -0,0 +1,188 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: blip2_vicuna_instruct
model_type: vicuna7b_pretrain
load_pretrained: True
load_finetuned: False
vit_model: eva_clip_g
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# finetuned: ""
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
# Q-Former
num_query_token: 32
qformer_text_input: True
# vicuna
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
prompt: ""
max_txt_len: 256
max_output_txt_len: 256
# freeze
freeze_vit: True
freeze_llm: True
freeze_qformer: False
freeze_t5_proj: False
# moe
use_moeqformer: True
use_route_moe: True
moebert_route_method: "post-route"
moebert_load_balance: 0.05
moebert_expert_num: 2
moebert_num_beams: 2
moe_weight_type: 'ffn_prob'
use_balance_loss: False
datasets:
gqa:
type: balanced_sft_raw_eval
batch_size: 16
vis_processor:
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
eval:
name: "blip_caption"
ok_vqa: # train, valid (9009, 5046)
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 8
coco_vqa: # 658104
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 15
aok_vqa: # train: 17056, val: 1145
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
text_processor:
train:
name: "blip_caption"
sample_ratio: 12
ocrvqa: # train 207572
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
text_processor:
train:
name: "blip_caption"
sample_ratio: 30
llava_reason: # 76643
batch_size: 16
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
text_processor:
train:
name: "blip_caption"
sample_ratio: 80
llava_conversation: # 56681
batch_size: 16
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
text_processor:
train:
name: "blip_caption"
sample_ratio: 30
llava_detail: # 23240
batch_size: 16
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
text_processor:
train:
name: "blip_caption"
sample_ratio: 20
coco_caption: # 414113 train
batch_size: 16
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
text_processor:
train:
name: "blip_caption"
sample_ratio: 10
run:
task: instruction_tuning
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 5e-5
min_lr: 1e-6
warmup_lr: 1e-6
log_freq: 5
save_freq: 1500
weight_decay: 0.05
max_epoch: 8
num_workers: 4
warmup_steps: 600
iters_per_epoch: 5000
seed: 42
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_1048k_raw_QformerMoE_Route_Post_ffn_prob_linear_1gate_2ex_2beam_2loss_5e5lr_top6layer_textinqf_epo8_0118/"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: ["train"]
valid_splits: ["val"]
# test_splits: ["val"]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
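
The sample_ratio fields in the datasets section above (ok_vqa 8, coco_vqa 15, aok_vqa 12, ocrvqa 30, llava_reason 80, llava_conversation 30, llava_detail 20, coco_caption 10; the gqa entry is eval-only) are most naturally read as relative sampling weights when training batches are drawn from the mixed loaders. A small illustrative sketch of that interpretation, not the repository's actual data loader:

import random

# Relative weights copied from the datasets section above (sum = 205).
sample_ratios = {
    "ok_vqa": 8, "coco_vqa": 15, "aok_vqa": 12, "ocrvqa": 30,
    "llava_reason": 80, "llava_conversation": 30, "llava_detail": 20,
    "coco_caption": 10,
}

def pick_source(ratios=sample_ratios):
    """Choose which dataset the next training batch is drawn from."""
    names = list(ratios)
    return random.choices(names, weights=[ratios[n] for n in names], k=1)[0]

# Over many draws, roughly 80/205 of the batches come from llava_reason,
# 30/205 from ocrvqa, and so on.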

View File

@@ -0,0 +1,128 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: blip2_vicuna_instruct
model_type: vicuna7b_pretrain
load_pretrained: True
load_finetuned: False
vit_model: eva_clip_g
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# finetuned: ""
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
# Q-Former
num_query_token: 32
qformer_text_input: True
# vicuna
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
prompt: ""
max_txt_len: 256
max_output_txt_len: 256
# freeze
freeze_vit: True
freeze_llm: True
freeze_qformer: False
freeze_t5_proj: False
# moe
use_moeqformer: True
use_route_moe: True
moebert_route_method: "post-route-dp"
moebert_load_balance: 0.05
moebert_expert_num: 2
moebert_num_beams: 2
moe_weight_type: 'ffn_prob'
use_balance_loss: False
datasets:
gqa: # train, val, test (943000, 12578, 12578)
type: balanced_sft_raw
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 10
ok_vqa: # train, valid (9009, 5046)
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 1
coco_vqa: # 658104
batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
sample_ratio: 9
run:
task: instruction_tuning
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 5e-5
min_lr: 1e-6
warmup_lr: 1e-6
log_freq: 5
save_freq: 1500
weight_decay: 0.05
max_epoch: 8
num_workers: 4
warmup_steps: 600
iters_per_epoch: 5000
seed: 42
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_DP_Route_Post_ffn_prob_linear_1gate_2ex_2beam_2loss_5e5lr_top6layer_textinqf_epo8_0118/"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: ["train"]
valid_splits: ["val"]
# test_splits: ["val"]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True

View File

@@ -53,7 +53,7 @@ class InstructionTask(BaseTask):
run_cfg = cfg.run_cfg
num_beams = run_cfg.get("num_beams", 3)
max_len = run_cfg.get("max_len", 20)
max_len = run_cfg.get("max_len", 30)
min_len = run_cfg.get("min_len", 1)
evaluate = run_cfg.get("evaluate", False)
@@ -112,22 +112,33 @@ class InstructionTask(BaseTask):
)
pred_qa_pairs = []
question_id = samples["question_id"]
question = samples["text_input"]
text_inputs = samples["text_input"]
sources = samples["source"]
source = samples["source"][0]
if source in ['vqav2','okvqa','gqa']:
sample_ids = [int(sample_id.item()) for sample_id in samples["question_id"]]
elif source in ['aokvqa']:
sample_ids = [sample_id for sample_id in samples["question_id"]]
elif source in ['coco_cap']:
sample_ids = samples["image_id"]
# For GQA
full_answers = samples.get("fullAnswer", ["" for i in range(len(question_id))])
gt_answers = samples.get("gt_answers", ["" for i in range(len(question_id))])
full_answers = samples.get("fullAnswer", ["" for i in range(len(sample_ids))])
gt_answers = samples.get("gt_answers", ["" for i in range(len(sample_ids))])
for answer, ques_id, ques, full_answer, gt_answer, source in zip(answers, question_id, question, full_answers, gt_answers, sources):
ques_id = int(ques_id.item())
# For AOKVQA
choices = samples.get("choices", ["" for i in range(len(sample_ids))])
for answer, sample_id, text_input, full_answer, gt_answer, choice, source in zip(answers, sample_ids, text_inputs, full_answers, gt_answers, choices, sources):
pred_qa_pairs.append({
"question_id": ques_id,
"question": ques,
"question_id": sample_id,
"question": text_input,
"full_answer": full_answer,
"answer": answer,
"gt_ans": gt_answer,
"choice": choice,
"source": source})
return pred_qa_pairs
@@ -140,9 +151,7 @@ class InstructionTask(BaseTask):
total_results = list()
for sub_data_loader in data_loader.loaders:
results = []
ques_ids = []
for samples in metric_logger.log_every(sub_data_loader, print_freq, header):
ques_ids.extend(samples['question_id'].tolist())
samples = prepare_sample(samples, cuda_enabled=cuda_enabled)
eval_output = self.valid_step(model=model, samples=samples)
@@ -168,6 +177,7 @@ class InstructionTask(BaseTask):
filename=f"{split_name}_vqa_result_{source}",
remove_duplicate="question_id",
)
if source in ['vqav2','okvqa']:
try:
metrics = self._report_metrics_coco_vqa(result_file=result_file, split=split_name, source=source)
@@ -180,7 +190,18 @@ class InstructionTask(BaseTask):
except Exception as e:
metrics = None
print(f"Report Metrics {source} Error: {e}")
elif source in ['aokvqa']:
try:
metrics = self._report_metrics_aokvqa(result_file=result_file, source=source)
except Exception as e:
metrics = None
print(f"Report Metrics {source} Error: {e}")
elif source in ['coco_cap']:
try:
metrics = self._report_metrics_caption(result_file=result_file, split_name=split_name, source=source)
except Exception as e:
metrics = None
print(f"Report Metrics {source} Error: {e}")
else:
metrics = None
final_metrics[source] = metrics
@@ -234,10 +255,46 @@ class InstructionTask(BaseTask):
return metrics
@dist_utils.main_process
def _report_metrics_aokvqa(self, result_file, source='aokvqa'):
"""
Validation of aokvqa
"""
# measuring accuracy compared to answer
results = json.load(open(result_file, "r"))
acc = []
vqa_tool = VQAEval()
for res in results:
gt_ans = res["choice"]
pred = res["answer"]
pred = vqa_tool.processPunctuation(pred)
pred = vqa_tool.processDigitArticle(pred)
# vqa_acc = 1 if pred == gt_ans else 0
vqa_acc = 1 if pred in gt_ans else 0
acc.append(vqa_acc)
accuracy = sum(acc) / len(acc) * 100
metrics = {"agg_metrics": accuracy, "acc": accuracy}
with open(
os.path.join(registry.get_path("output_dir"), f"evaluate_{source}.txt"), "a"
) as f:
f.write(json.dumps(metrics) + "\n")
logging.info(metrics)
return metrics
@dist_utils.main_process
def _report_metrics_gqa(self, result_file, source='gqa'):
"""
Validation of GQA/VQAv2
Validation of GQA
"""
# measuring accuracy compared to answer
results = json.load(open(result_file, "r"))
@@ -274,3 +331,90 @@ class InstructionTask(BaseTask):
return metrics
@dist_utils.main_process
def _report_metrics_caption(self, result_file, split_name, source='coco_cap'):
"""
Use official COCO Cap evaluation script to report metrics.
"""
coco_gt_root = os.path.join(registry.get_path("cache_root"), "coco_gt")
coco_val = coco_caption_eval(coco_gt_root, result_file, split_name)
agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"]
log_stats = {split_name: {k: v for k, v in coco_val.eval.items()}}
with open(
os.path.join(registry.get_path("output_dir"), "evaluate.txt"), "a"
) as f:
f.write(json.dumps(log_stats) + "\n")
coco_res = {k: v for k, v in coco_val.eval.items()}
coco_res["agg_metrics"] = agg_metrics
return coco_res
from collections import defaultdict
from pycocoevalcap.eval import COCOEvalCap
class COCO_Annotation:
def __init__(self, annotation_file):
self.coco_cn_file = annotation_file
self.imgToAnns = self.build_imgToAnns()
def build_imgToAnns(self):
imgToAnns = defaultdict(list)
with open(self.coco_cn_file, "r", encoding="UTF-8") as fin:
for line in fin:
line = line.strip()
temp = eval(line)
annotations = temp['annotations']
for ann in annotations:
image_id = str(ann['image_id']).zfill(6)
imgToAnns[image_id].append({'image_id':image_id,'caption':ann['caption'],'image': ann['image_id']})
return imgToAnns
def getImgIds(self):
return self.imgToAnns.keys()
class COCO_Result:
def __init__(self,result_file):
self.coco_cn_file = result_file
self.imgToAnns = self.build_imgToAnns()
def build_imgToAnns(self):
imgToAnns = dict()
data = json.load(open(self.coco_cn_file, "r"))
for d in data:
tmp = {
'image_id':d['question_id'][-6:],
'caption':d['answer']
}
imgToAnns[d['question_id'][-6:]] = [tmp]
return imgToAnns
def coco_caption_eval(coco_gt_root, results_file, split_name):
files = {
"val":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json",
"test":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json"
}
# create coco object and coco_result object
annotation_file = files[split_name]
coco = COCO_Annotation(annotation_file)
coco_result = COCO_Result(results_file)
# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)
# evaluate on a subset of images by setting
# coco_eval.params['image_id'] = coco_result.getImgIds()
# please remove this line when evaluating the full validation set
# coco_eval.params['image_id'] = coco_result.getImgIds()
# evaluate results
# SPICE will take a few minutes the first time, but speeds up due to caching
coco_eval.evaluate()
# print output evaluation scores
for metric, score in coco_eval.eval.items():
print(f"{metric}: {score:.3f}")
return coco_eval
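
To make the new AOKVQA metric above concrete: after the VQAEval punctuation and article normalization, a prediction is counted as correct when it appears inside the stored "choice" string. A toy walk-through with made-up records (only the fields used by the metric are shown, field names match what valid_step writes, values are illustrative, and the normalization step is skipped here):

results = [
    {"question_id": "aok_0001", "answer": "surfboard",  "choice": "surfboard"},
    {"question_id": "aok_0002", "answer": "a red kite", "choice": "kite"},
]
acc = [1 if res["answer"] in res["choice"] else 0 for res in results]  # substring check, as above
accuracy = sum(acc) / len(acc) * 100                                   # -> 50.0
metrics = {"agg_metrics": accuracy, "acc": accuracy}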

View File

@@ -1,4 +0,0 @@
<Img><ImageHere></Img> Describe this image in detail.
<Img><ImageHere></Img> Take a look at this image and describe what you notice.
<Img><ImageHere></Img> Please provide a detailed description of the picture.
<Img><ImageHere></Img> Could you describe the contents of this image for me?

File diff suppressed because it is too large

Binary file not shown.

View File

@@ -0,0 +1,58 @@
import datasets
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import random
from tqdm import tqdm
# path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/minigpt4/models/cmrc2018_trial.json"
# dataset = load_dataset("json", data_files=[path], field="data", split="train")
# tokenizer = AutoTokenizer.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased")
# def preprocess_function(example):
# import pdb; pdb.set_trace()
# model_inputs = tokenizer(example["content"], max_length=512, truncation=True)
# labels = tokenizer(example["title"], max_length=32, truncation=True)
# # label就是title编码的结果
# model_inputs["labels"] = labels["input_ids"]
# return model_inputs
# processed_datasets = dataset.map(preprocess_function)
dataset = load_dataset("/mnt/pfs-guan-ssai/nlu/wanghanzi/data/alpaca_20k")
train_dataset = dataset['train']

# Prepend one randomly sampled earlier example ("memory") to each sample's prompt.
# Note: rows of a HuggingFace Dataset are returned as copies, so in-place item
# assignment (train_dataset[i]['text'] = ...) is silently discarded; collect the
# texts in a list and attach them as a new column instead. Index 0 has no memory,
# so it keeps only its own instruction (an assumption made to keep column lengths equal).
texts = [f"Instruction:{train_dataset[0]['instruction']}"]
for i in tqdm(range(1, len(train_dataset))):
    idx = random.randint(0, i)
    memory = train_dataset[idx]
    memory_text = f"Instruction: {memory['instruction']}\n Answer: {memory['output']} \n"
    texts.append(f"{memory_text} Instruction:{train_dataset[i]['instruction']}")
train_dataset = train_dataset.add_column("text", texts)

model_path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/opt_350m"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

def formatting_prompts_func(example):
    # Build one "### Question ... ### Answer ..." prompt per row in the batch.
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

# Mask everything before the response template so the loss is computed only on
# the answer tokens.
response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Note: when formatting_func is passed, SFTTrainer builds the prompts with it and
# the "text" column added above is not used.
trainer = SFTTrainer(
    model,
    train_dataset=train_dataset,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)
trainer.train()
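
As a quick sanity check of what DataCollatorForCompletionOnlyLM does with the response template used above (an aside, not part of the training script): tokens before " ### Answer:" receive the ignore label -100, so the loss is computed only on the answer span. The exact token boundaries depend on how the tokenizer splits the template in context.

example = tokenizer("### Question: What is 2 + 2?\n ### Answer: 4")
batch = collator([example])
# batch["labels"] is -100 (ignored) for the prompt tokens and copies
# batch["input_ids"] for the tokens that follow " ### Answer:".
print(batch["labels"])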