update 0304

evaluation/coco_caption.py (new file, 94 lines)
@@ -0,0 +1,94 @@
import os
import json
import pandas as pd
from tqdm import tqdm

from pycocoevalcap.eval import COCOEvalCap
from collections import defaultdict


class COCO_Annotation:
    def __init__(self, annotation_file):
        self.coco_cn_file = annotation_file
        self.imgToAnns = self.build_imgToAnns()

    def build_imgToAnns(self):
        imgToAnns = defaultdict(list)
        with open(self.coco_cn_file, "r", encoding="UTF-8") as fin:
            for line in fin:
                line = line.strip()
                temp = eval(line)
                annotations = temp['annotations']
                for ann in annotations:
                    image_id = str(ann['image_id']).zfill(6)
                    imgToAnns[image_id].append({'image_id': image_id, 'caption': ann['caption'], 'image': ann['image_id']})
        return imgToAnns

    def getImgIds(self):
        return self.imgToAnns.keys()


class COCO_Result:
    def __init__(self, result_file):
        self.coco_cn_file = result_file
        self.imgToAnns = self.build_imgToAnns()

    def build_imgToAnns(self):
        imgToAnns = dict()
        data = json.load(open(self.coco_cn_file, "r"))
        for d in data:
            tmp = {
                'image_id': d['question_id'][-6:],
                'caption': d['answer']
            }
            imgToAnns[d['question_id'][-6:]] = [tmp]
        return imgToAnns


def coco_caption_eval(results_file, split_name):
    files = {
        "val": "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json",
        "test": "/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json",
    }

    # create coco object and coco_result object
    annotation_file = files[split_name]
    coco = COCO_Annotation(annotation_file)
    coco_result = COCO_Result(results_file)

    # create coco_eval object by taking coco and coco_result
    coco_eval = COCOEvalCap(coco, coco_result)

    # evaluate on a subset of images by setting
    # coco_eval.params['image_id'] = coco_result.getImgIds()
    # please remove this line when evaluating the full validation set
    # coco_eval.params['image_id'] = coco_result.getImgIds()

    # evaluate results
    # SPICE will take a few minutes the first time, but speeds up due to caching
    coco_eval.evaluate()

    # print output evaluation scores
    for metric, score in coco_eval.eval.items():
        print(f"{metric}: {score:.3f}")

    return coco_eval


def main():
    result_file = "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/mix_coco_gqa_cap_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0302/20240302231/result/val_vqa_result_coco_cap.json"
    split_name = "val"
    coco_val = coco_caption_eval(result_file, split_name)

    agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"]

    # log_stats = {split_name: {k: v for k, v in coco_val.eval.items()}}
    # with open(
    #     os.path.join(registry.get_path("output_dir"), "evaluate.txt"), "a"
    # ) as f:
    #     f.write(json.dumps(log_stats) + "\n")

    coco_res = {k: v for k, v in coco_val.eval.items()}
    coco_res["agg_metrics"] = agg_metrics

    print(coco_res)


main()
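Reviewer note: a minimal sketch of the two inputs this script parses, inferred from the code above; the ids, captions, and file names below are made up for illustration only. The result file is a JSON list of {question_id, answer} records whose last six characters of question_id give the zero-padded COCO image id, and each line of the ground-truth file is a Python dict literal (read with eval) carrying an 'annotations' list.

import json

# Prediction file: a JSON list; COCO_Result keys each entry by question_id[-6:].
predictions = [{"question_id": "coco_cap_391895", "answer": "a man riding a motorbike"}]
with open("toy_result.json", "w") as f:
    json.dump(predictions, f)

# Ground-truth file: one dict literal per line; COCO_Annotation reads the 'annotations'
# list and zero-pads image_id to six digits so it lines up with the result keys.
gt_line = {"annotations": [{"image_id": 391895, "caption": "A man with a red helmet rides a motorbike."}]}
with open("toy_gt.json", "w") as f:
    f.write(str(gt_line) + "\n")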
[Binary change: 32 image files deleted (sizes 25 KiB to 1.5 MiB); the diff viewer rendered only their "Before Width/Height/Size" metadata.]
@@ -16,11 +16,16 @@ datasets:
             - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
           storage:
             - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_train.json
-        # val:
-        #   url:
-        #     - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
-        #   storage:
-        #     - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
+        val:
+          url:
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
+        test:
+          url:
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
+          storage:
+            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
         # test:
         #   url:
         #     - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
@@ -17,14 +17,14 @@ datasets:
       # md5: aa31ac474cf6250ebb81d18348a07ed8
       storage:
         - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_train.json
-    # val:
-    #   url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
-    #   storage:
-    #     - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
-    # test:
-    #   url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
-    #   storage:
-    #     - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
+    val:
+      url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
+      storage:
+        - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
+    test:
+      url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
+      storage:
+        - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json

   images:
     storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
minigpt4/configs/datasets/coco/caption_eval.yaml (new file, 26 lines)
@@ -0,0 +1,26 @@
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  coco_caption: # name of the dataset builder
    # dataset_card: dataset_card/coco_caption.md
    # data_dir: ${env.data_dir}/datasets
    data_type: images # [images|videos|features]

    build_info:
      # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
        val:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
        test:
          url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
          storage:
            - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json

      images:
        storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
@@ -14,7 +14,7 @@ from minigpt4.datasets.datasets.flickr import GroundedDetailDataset,CaptionToObj
 from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset
 from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset
 from minigpt4.datasets.datasets.gqa_datasets import GQADataset, GQAEvalDataset
-from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset
+from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset
 from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset
 from minigpt4.datasets.datasets.ok_vqa_datasets import OKVQADataset, OKVQAEvalDataset
 from minigpt4.datasets.datasets.ocrvqa_dataset import OCRVQADataset

@@ -384,7 +384,7 @@ class OKVQABuilder(COCOVQABuilder):
 @registry.register_builder("aok_vqa")
 class AOKVQABuilder(BaseDatasetBuilder):
     train_dataset_cls = AOKVQADataset
-    eval_dataset_cls = AOKVQADataset
+    eval_dataset_cls = AOKVQAEvalDataset

     DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"}

@@ -584,6 +584,7 @@ class COCOCapBuilder(BaseDatasetBuilder):

     DATASET_CONFIG_DICT = {
         "default": "configs/datasets/coco/caption.yaml",
+        "coco_cap_eval": "configs/datasets/coco/caption_eval.yaml",
     }
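Reviewer note: the extra DATASET_CONFIG_DICT entry lets a run config select the held-out caption annotations by dataset type. A rough, hypothetical illustration of the lookup the builder performs (the actual resolution logic lives in BaseDatasetBuilder and is not shown in this diff):

DATASET_CONFIG_DICT = {
    "default": "configs/datasets/coco/caption.yaml",
    "coco_cap_eval": "configs/datasets/coco/caption_eval.yaml",
}

def resolve_dataset_config(dataset_type="default"):
    # Fall back to the training config when no explicit type is given.
    return DATASET_CONFIG_DICT.get(dataset_type, DATASET_CONFIG_DICT["default"])

print(resolve_dataset_config("coco_cap_eval"))  # configs/datasets/coco/caption_eval.yaml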
@@ -13,7 +13,7 @@ import torch

 from PIL import Image

-from minigpt4.datasets.datasets.vqa_datasets import VQADataset #, VQAEvalDataset
+from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset


 class __DisplMixin:

@@ -37,11 +37,11 @@ class AOKVQADataset(VQADataset, __DisplMixin):
         super().__init__(vis_processor, text_processor, vis_root, ann_paths)

         self.instruction_pool =[
-            '{}',
-            'Q: {} A: ',
-            'Based on the image, respond to this question with a short answer: {}',
-            '{} A short answer to the question is ',
-            'Question: {} Short answer:',
+            '{} Choose from {}.',
+            'Q: {} Multi Choices: {} A: ',
+            'Question: {} Multi Choices: {} Answer: ',
+            "{} Choose one from the following possible answers: {}. ",
+            '{} Choose from {}. The answer is',
         ]

         exist_annotation = []

@@ -63,25 +63,19 @@ class AOKVQADataset(VQADataset, __DisplMixin):
         image = self.vis_processor(image)
         question = self.text_processor(ann["question"])

-        answer_key = "direct_answers"
-
-        answer_weight = {}
-        for answer in ann[answer_key]:
-            if answer in answer_weight.keys():
-                answer_weight[answer] += 1 / len(ann[answer_key])
-            else:
-                answer_weight[answer] = 1 / len(ann[answer_key])
-
-        answers = list(answer_weight.keys())
-        weights = list(answer_weight.values())
-
-        answer = random.choices(answers, weights=weights, k=1)[0]  # random sample an answer according to weights
+        answer_lst = ann["choices"]
+        direct_answers = ann["direct_answers"]
+        final_answer = random.choices(direct_answers, k=1)[0]
+        for answer in answer_lst:
+            if answer in direct_answers:
+                final_answer = answer

         return {
             "image": image,
             "image_id": ann["image"],
             "question": question,
-            "answer": answer,
+            "answer": final_answer,
+            "choices": ", ".join(answer_lst)
         }

     def __getitem__(self, index):

@@ -90,7 +84,7 @@ class AOKVQADataset(VQADataset, __DisplMixin):

         answer = self.text_processor(data['answer'])
         q_input = question
-        llm_input = random.choice(self.instruction_pool).format(question)
+        llm_input = random.choice(self.instruction_pool).format(question, data["choices"])

         return {
             "image": data['image'],

@@ -104,25 +98,103 @@ class AOKVQADataset(VQADataset, __DisplMixin):
         }

-class AOKVQGDataset(AOKVQADataset):
-
-    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
-        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
-        self.instruction_pool = [
-            'Given the image, generate a question whose answer is: {}',
-            'Based on the image, provide a question with the answer: {}',
-            'Given the visual representation, create a question for which the answer is "{}"',
-            'From the image provided, craft a question that leads to the reply: {}',
-            'Considering the picture, come up with a question where the answer is: {}',
-            'Taking the image into account, generate an question that has the answer: {}'
-        ]
-
-    def __getitem__(self, index):
-        data = self.get_data(index)
-        instruction = random.choice(self.instruction_pool).format(data['answer'])
-
-        return {
-            "image": data['image'],
-            "instruction_input": instruction,
-            "answer": data['question'],
-        }
+class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
+    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+        """
+        vis_root (string): Root directory of images (e.g. coco/images/)
+        ann_root (string): directory to store the annotation file
+        """
+
+        self.vis_root = vis_root
+
+        self.annotation = json.load(open(ann_paths[0]))
+
+        self.instruction_pool =[
+            '{} Choose from {}.',
+            'Q: {} Multi Choices: {} A: ',
+            'Question: {} Multi Choices: {} Answer: ',
+            "{} Choose one from the following possible answers: {}. ",
+            '{} Choose from {}. The answer is',
+        ]
+
+        try:
+            self.coco_fmt_qust_file = ann_paths[2]
+            self.coco_fmt_anno_file = ann_paths[3]
+        except IndexError:
+            self.coco_fmt_qust_file = None
+            self.coco_fmt_anno_file = None
+
+        self.vis_processor = vis_processor
+        self.text_processor = text_processor
+        self.source = 'aokvqa'
+
+    def collater(self, samples):
+        (
+            image_list,
+            question_list,
+            question_id_list,
+            choices_list,
+            correct_choice_idx_list,
+            direct_answers_list,
+            llm_input_list,
+            q_input_list,
+            source_list,
+        ) = ([], [], [], [], [], [], [], [], [])
+
+        for sample in samples:
+            image_list.append(sample["image"])
+            question_list.append(sample["text_input"])
+            question_id_list.append(sample["question_id"])
+            choices_list.append(sample["choices"])
+            correct_choice_idx_list.append(sample["correct_choice_idx"])
+            direct_answers_list.append(sample["direct_answers"])
+            llm_input_list.append(sample["llm_input"])
+            q_input_list.append(sample["q_input"])
+            source_list.append(sample["source"])
+
+        return {
+            "image": torch.stack(image_list, dim=0),
+            "text_input": question_list,
+            "question_id": question_id_list,
+            "choices": choices_list,
+            "correct_choice_idx": correct_choice_idx_list,
+            "direct_answers": direct_answers_list,
+            "llm_input": llm_input_list,
+            "q_input": q_input_list,
+            "source": source_list,
+        }
+
+    def __getitem__(self, index):
+        ann = self.annotation[index]
+
+        image_path = os.path.join(self.vis_root, ann["image"])
+        image = Image.open(image_path).convert("RGB")
+
+        image = self.vis_processor(image)
+        question = self.text_processor(ann["question"])
+
+        choices = ann["choices"]
+        if "correct_choice_idx" in ann:
+            correct_choice_idx = ann["correct_choice_idx"]
+        else:
+            correct_choice_idx = None
+
+        if "direct_answers" in ann:
+            direct_answers = ann["direct_answers"]
+        else:
+            direct_answers = None
+
+        llm_input = random.choice(self.instruction_pool).format(question, ", ".join(choices))
+
+        return {
+            "image": image,
+            "q_input": question,
+            "llm_input": llm_input,
+            "text_input": question,
+            "question_id": ann["question_id"],
+            "choices": choices,
+            "correct_choice_idx": correct_choice_idx,
+            "direct_answers": direct_answers,
+            "source": 'aokvqa',
+        }
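Reviewer note: a small, self-contained illustration of how the new multiple-choice prompt is assembled in __getitem__ above; the question and choices are invented.

import random

instruction_pool = [
    '{} Choose from {}.',
    'Q: {} Multi Choices: {} A: ',
    'Question: {} Multi Choices: {} Answer: ',
    "{} Choose one from the following possible answers: {}. ",
    '{} Choose from {}. The answer is',
]

question = "What is the man holding?"           # made-up sample
choices = ["umbrella", "kite", "surfboard"]     # made-up choices

llm_input = random.choice(instruction_pool).format(question, ", ".join(choices))
print(llm_input)  # e.g. "What is the man holding? Choose from umbrella, kite, surfboard."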
@ -59,83 +59,7 @@ class CaptionDataset(BaseDataset, __DisplMixin):
|
||||
"text_input": caption,
|
||||
"image_id": self.img_ids[ann["image_id"]],
|
||||
}
|
||||
|
||||
|
||||
|
||||
class COCOCaptionDataset(BaseDataset, __DisplMixin):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
|
||||
|
||||
self.img_ids = {}
|
||||
n = 0
|
||||
|
||||
self.filter_anntation = []
|
||||
|
||||
for ann in self.annotation:
|
||||
if "train" in ann["image"]:
|
||||
self.filter_anntation.append(ann)
|
||||
self.annotation = self.filter_anntation
|
||||
|
||||
for ann in self.annotation:
|
||||
img_id = ann["image_id"]
|
||||
if img_id not in self.img_ids.keys():
|
||||
self.img_ids[img_id] = n
|
||||
n += 1
|
||||
|
||||
self.instruction_pool = [
|
||||
'Briefly describe this image.',
|
||||
'Provide a concise depiction of this image.',
|
||||
'Present a short description of this image.',
|
||||
'Summarize this image in a few words.',
|
||||
'A short image caption:',
|
||||
'A short image description:',
|
||||
'A photo of ',
|
||||
'An image that shows ',
|
||||
'Write a short description for the image. ',
|
||||
'Write a description for the photo.',
|
||||
'Provide a description of what is presented in the photo.',
|
||||
'Briefly describe the content of the image.',
|
||||
'Can you briefly explain what you see in the image?',
|
||||
'Could you use a few words to describe what you perceive in the photo?',
|
||||
'Please provide a short depiction of the picture.',
|
||||
'Using language, provide a short account of the image.',
|
||||
'Use a few words to illustrate what is happening in the picture.',
|
||||
]
|
||||
self.source = 'coco_cap'
|
||||
|
||||
def __getitem__(self, index):
|
||||
|
||||
# TODO this assumes image input, not general enough
|
||||
ann = self.annotation[index]
|
||||
|
||||
# img_file = ann["image"].split("/")[-1]
|
||||
img_file = ann["image"]
|
||||
image_path = os.path.join(self.vis_root, img_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
|
||||
image = self.vis_processor(image)
|
||||
caption = self.text_processor(ann["caption"])
|
||||
|
||||
# instruction = random.choice(self.instruction_pool)
|
||||
# instruction = "<Img><ImageHere></Img> [caption] {} ".format(instruction)
|
||||
q_input = ""
|
||||
llm_input = random.choice(self.instruction_pool)
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"image_id": ann["image"],
|
||||
"answer": caption,
|
||||
"q_input": q_input,
|
||||
"llm_input": llm_input,
|
||||
"text_input": llm_input,
|
||||
"text_output": caption,
|
||||
"source": 'coco_cap',
|
||||
}
|
||||
|
||||
|
||||
class CaptionEvalDataset(BaseDataset, __DisplMixin):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
|
||||
"""
|
||||
@ -151,7 +75,7 @@ class CaptionEvalDataset(BaseDataset, __DisplMixin):
|
||||
|
||||
image_path = os.path.join(self.vis_root, ann["image"])
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
|
||||
|
||||
image = self.vis_processor(image)
|
||||
|
||||
return {
|
||||
@ -159,3 +83,4 @@ class CaptionEvalDataset(BaseDataset, __DisplMixin):
|
||||
"image_id": ann["image_id"],
|
||||
"instance_id": ann["instance_id"],
|
||||
}
|
||||
|
||||
|
@ -9,18 +9,102 @@ import os
|
||||
import json
|
||||
import torch
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
from PIL import Image
|
||||
from PIL import ImageFile
|
||||
from collections import OrderedDict
|
||||
|
||||
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
||||
|
||||
from minigpt4.datasets.datasets.caption_datasets import COCOCaptionDataset, CaptionEvalDataset
|
||||
from minigpt4.datasets.datasets.base_dataset import BaseDataset
|
||||
from minigpt4.datasets.datasets.caption_datasets import CaptionDataset, CaptionEvalDataset
|
||||
|
||||
COCOCapDataset = COCOCaptionDataset
|
||||
class __DisplMixin:
|
||||
def displ_item(self, index):
|
||||
sample, ann = self.__getitem__(index), self.annotation[index]
|
||||
|
||||
return OrderedDict(
|
||||
{
|
||||
"file": ann["image"],
|
||||
"caption": ann["caption"],
|
||||
"image": sample["image"],
|
||||
}
|
||||
)
|
||||
|
||||
class COCOCapDataset(BaseDataset, __DisplMixin):
|
||||
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
|
||||
"""
|
||||
vis_root (string): Root directory of images (e.g. coco/images/)
|
||||
ann_root (string): directory to store the annotation file
|
||||
"""
|
||||
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
|
||||
|
||||
self.img_ids = {}
|
||||
n = 0
|
||||
|
||||
self.filter_anntation = []
|
||||
|
||||
for ann in self.annotation:
|
||||
if "train" in ann["image"]:
|
||||
self.filter_anntation.append(ann)
|
||||
self.annotation = self.filter_anntation
|
||||
|
||||
for ann in self.annotation:
|
||||
img_id = ann["image_id"]
|
||||
if img_id not in self.img_ids.keys():
|
||||
self.img_ids[img_id] = n
|
||||
n += 1
|
||||
|
||||
self.instruction_pool = [
|
||||
'Briefly describe this image.',
|
||||
'Provide a concise depiction of this image.',
|
||||
'Present a short description of this image.',
|
||||
'Summarize this image in a few words.',
|
||||
'A short image caption:',
|
||||
'A short image description:',
|
||||
'A photo of ',
|
||||
'An image that shows ',
|
||||
'Write a short description for the image. ',
|
||||
'Write a description for the photo.',
|
||||
'Provide a description of what is presented in the photo.',
|
||||
'Briefly describe the content of the image.',
|
||||
'Can you briefly explain what you see in the image?',
|
||||
'Could you use a few words to describe what you perceive in the photo?',
|
||||
'Please provide a short depiction of the picture.',
|
||||
'Using language, provide a short account of the image.',
|
||||
'Use a few words to illustrate what is happening in the picture.',
|
||||
]
|
||||
self.source = 'coco_cap'
|
||||
|
||||
def __getitem__(self, index):
|
||||
|
||||
# TODO this assumes image input, not general enough
|
||||
ann = self.annotation[index]
|
||||
|
||||
# img_file = ann["image"].split("/")[-1]
|
||||
img_file = ann["image"]
|
||||
image_path = os.path.join(self.vis_root, img_file)
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
|
||||
image = self.vis_processor(image)
|
||||
caption = self.text_processor(ann["caption"])
|
||||
|
||||
instruction = random.choice(self.instruction_pool)
|
||||
# q_input = ""
|
||||
q_input = instruction
|
||||
llm_input = instruction
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"image_id": ann["image"],
|
||||
"answer": caption,
|
||||
"q_input": q_input,
|
||||
"llm_input": llm_input,
|
||||
"text_input": llm_input,
|
||||
"text_output": caption,
|
||||
"source": 'coco_cap',
|
||||
}
|
||||
|
||||
|
||||
class COCOCapEvalDataset(CaptionEvalDataset):
|
||||
@ -31,6 +115,26 @@ class COCOCapEvalDataset(CaptionEvalDataset):
|
||||
split (string): val or test
|
||||
"""
|
||||
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
|
||||
|
||||
self.instruction_pool = [
|
||||
'Briefly describe this image.',
|
||||
'Provide a concise depiction of this image.',
|
||||
'Present a short description of this image.',
|
||||
'Summarize this image in a few words.',
|
||||
'A short image caption:',
|
||||
'A short image description:',
|
||||
'A photo of ',
|
||||
'An image that shows ',
|
||||
'Write a short description for the image. ',
|
||||
'Write a description for the photo.',
|
||||
'Provide a description of what is presented in the photo.',
|
||||
'Briefly describe the content of the image.',
|
||||
'Can you briefly explain what you see in the image?',
|
||||
'Could you use a few words to describe what you perceive in the photo?',
|
||||
'Please provide a short depiction of the picture.',
|
||||
'Using language, provide a short account of the image.',
|
||||
'Use a few words to illustrate what is happening in the picture.',
|
||||
]
|
||||
self.source = 'coco_cap'
|
||||
|
||||
def __getitem__(self, index):
|
||||
@ -38,15 +142,25 @@ class COCOCapEvalDataset(CaptionEvalDataset):
|
||||
|
||||
image_path = os.path.join(self.vis_root, ann["image"])
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
|
||||
image = self.vis_processor(image)
|
||||
try:
|
||||
image = self.vis_processor(image)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
print(image_path)
|
||||
|
||||
img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]
|
||||
instruction = random.choice(self.instruction_pool)
|
||||
# q_input = ""
|
||||
q_input = instruction
|
||||
llm_input = instruction
|
||||
|
||||
return {
|
||||
"image": image,
|
||||
"image_id": img_id,
|
||||
"instance_id": ann["instance_id"],
|
||||
"text_input":llm_input,
|
||||
"q_input": q_input,
|
||||
"llm_input": llm_input,
|
||||
"source": self.source,
|
||||
}
|
||||
|
||||
|
||||
|
@@ -149,7 +149,6 @@ class OKVQAEvalDataset(VQAEvalDataset, __DisplMixin):

         self.source = 'okvqa'
         self.annotation_add = self.get_data()
         self._add_instance_ids()

     def get_data(self):
         ann_instruct = list()

@@ -180,7 +179,6 @@ class OKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
             "image_id": ann["image"],
             'image_path': image_path,
             "question_id": ann["question_id"],
-            # "instance_id": ann["instance_id"],
             "question": question,
             "q_input": q_input,
             "llm_input": llm_input,
@@ -45,7 +45,6 @@ from transformers.utils import logging
 from transformers.models.bert.configuration_bert import BertConfig

 from minigpt4.models.moe.utils import (
     FeedForward,
     MoEModelOutput,
     MoEModelOutputWithPooling,
     use_experts,
minigpt4/models/QformerMoELN.py (new file, 1276 lines; diff not rendered in this view)
@@ -389,17 +389,23 @@ class BertOutput(nn.Module): # Add & Norm


 class FeedForward(nn.Module):
+    # remove LayerNorm
     def __init__(self, config):
-        nn.Module.__init__(self)
-        # first layer
-        self.intermediate_query = BertIntermediate(config)
-        # second layer
-        self.output_query = BertOutput(config)
+        super().__init__()
+        self.dense1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+        self.dense2 = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # adjust dropout ratio 0.1->0.2
+        # self.dropout = nn.Dropout(0.2)  # adjust dropout ratio 0.1->0.2

     def forward(self, hidden_states: Tensor):
-        input_tensor = hidden_states
-        intermediate_output = self.intermediate_query(hidden_states)
-        hidden_states = self.output_query(intermediate_output, input_tensor)
+        hidden_states = self.dense1(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.dense2(hidden_states)
+        hidden_states = self.dropout(hidden_states)
         return hidden_states


@@ -433,7 +439,6 @@ class BertLayer(nn.Module):
         self.layer_judge = moe_layer_judge(layer_num)
         self.num_beams = config.moebert_num_beams
         ffn = FeedForward(config)

         if self.use_experts:
             self.experts = RouteMoELayer(
                 hidden_size=config.hidden_size,

@@ -446,8 +451,7 @@
             )
         else:
             self.experts = ffn

-        # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.expert_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

     def forward(
         self,

@@ -538,7 +542,7 @@
         if self.layer_judge == 'first' and self.num_beams>1:
             # if layer_output[0].shape[0] == layer_output_text.shape[0]*self.num_beams and self.num_beams>1:
             # adjust the dimension of layer_output_text to bz*num_beams
-            layer_output_text = self.adjust_layer_output_text(layer_output_text)
+            layer_output_text = self.adjust_hidden_states_by_num_beams(layer_output_text)

         if self.layer_judge == 'mid' and self.num_beams > 1:
             # layer_output_text [bz*num_beams, len, hidden_size]

@@ -575,11 +579,11 @@
         attention_mask = tmp.contiguous().view(batch_size * self.num_beams, 1, 1, attention_mask.shape[3])  # torch.Size([bz*num_beams, 1, 1, 32+input_len])
         return attention_mask

-    def adjust_layer_output_text(self, layer_output_text):
-        batch_size, text_length, hidden_size = layer_output_text.shape
-        tmp_text = layer_output_text.unsqueeze(1).expand(batch_size, self.num_beams, text_length, hidden_size)
-        layer_output_text = tmp_text.contiguous().view(-1, text_length, hidden_size)  # [bz*num_beams, text_length, 768]
-        return layer_output_text
+    def adjust_hidden_states_by_num_beams(self, hidden_states):
+        batch_size, text_length, hidden_size = hidden_states.shape
+        tmp_text = hidden_states.unsqueeze(1).expand(batch_size, self.num_beams, text_length, hidden_size)
+        hidden_states = tmp_text.contiguous().view(-1, text_length, hidden_size)  # [bz*num_beams, text_length, 768]
+        return hidden_states

     def route_moe_last_layer_top1(self, layer_output, layer_output_text):
         batch_size = layer_output[0].shape[0]

@@ -602,20 +606,21 @@
     def feed_forward_chunk(self, attention_output):
         intermediate_output = self.intermediate(attention_output)
         layer_output = self.output(intermediate_output, attention_output)
         # layer_output = self.LayerNorm(layer_output + attention_output)
         return layer_output

     def feed_forward_query_moe(self, attention_output, expert_attention_mask, beam_scores, expert_route):
         if not self.use_experts:
-            layer_output = self.experts(attention_output)
-            # layer_output = self.LayerNorm(layer_output + attention_output)
+            hidden_states = self.experts(attention_output)
+            layer_output = self.expert_ln(hidden_states + attention_output)
             return layer_output, None, None, None, 0.0

-        layer_output, beam_scores, expert_route, beam_idx, importance_loss = self.experts(
+        hidden_states, beam_scores, expert_route, beam_idx, importance_loss = self.experts(
             attention_output, expert_attention_mask, beam_scores, expert_route
         )
+        if hidden_states.shape[0] == attention_output.shape[0]*self.num_beams and self.num_beams>1:
+            attention_output = self.adjust_hidden_states_by_num_beams(attention_output)
+        layer_output = self.expert_ln(hidden_states + attention_output)

-        # layer_output = self.LayerNorm(layer_output + attention_output)
         return layer_output, beam_scores, expert_route, beam_idx, importance_loss

 class BertEncoder(nn.Module):

@@ -722,7 +727,7 @@
                 ]
                 if v is not None
             )

         return MoEModelOutput(
             last_hidden_state=hidden_states,
             past_key_values=next_decoder_cache,
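Reviewer note: a compact sketch of the residual/LayerNorm rewiring these hunks implement. The expert FFN no longer normalizes internally (the BertOutput-style Add & Norm is gone) and a single post-expert LayerNorm is applied to the residual sum instead. The sizes and activation below are assumptions chosen for illustration.

import torch
import torch.nn as nn

hidden_size, intermediate_size = 768, 3072  # assumed Q-Former dimensions

# New-style FeedForward: two linear layers + activation + dropout, no internal LayerNorm.
ffn = nn.Sequential(
    nn.Linear(hidden_size, intermediate_size),
    nn.GELU(),
    nn.Linear(intermediate_size, hidden_size),
    nn.Dropout(0.1),
)
expert_ln = nn.LayerNorm(hidden_size, eps=1e-12)

attention_output = torch.randn(4, 32, hidden_size)                   # [bz, num_query_tokens, hidden]
layer_output = expert_ln(ffn(attention_output) + attention_output)   # post-expert Add & Norm
print(layer_output.shape)  # torch.Size([4, 32, 768])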
minigpt4/models/QformerRouteMoELN.py (new file, 1367 lines; diff not rendered in this view)
@@ -22,6 +22,7 @@ from minigpt4.common.logger import MetricLogger
 from minigpt4.models.base_model import BaseModel
 from minigpt4.models.Qformer import BertConfig, BertLMHeadModel
 from minigpt4.models.QformerMoE import BertMoELMHeadModel
+from minigpt4.models.QformerMoELN import BertMoELMHeadModelLNIn
 from minigpt4.models.QformerRouteMoE import BertMoERouteLMHeadModel
 from minigpt4.models.eva_vit import create_eva_vit_g
 from transformers import BertTokenizer

@@ -88,7 +89,7 @@

     @classmethod
-    def init_QformerMoE(cls, num_query_token, vision_width, moebert_expert_num, moebert_route_method, moebert_load_balance, moe_topk=1, use_balance_loss=True, moe_weight_type='l2_norm', cross_attention_freq=2):
+    def init_QformerMoE(cls, num_query_token, vision_width, moebert_expert_num, moebert_route_method, moebert_load_balance, moe_topk=1, use_balance_loss=True, moe_weight_type='l2_norm', cross_attention_freq=2, ln_position="out"):
         moe_encoder_config = BertConfig.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased")

         moe_encoder_config.encoder_width = vision_width

@@ -104,9 +105,14 @@
         moe_encoder_config.use_balance_loss = use_balance_loss
         moe_encoder_config.moe_weight_type = moe_weight_type

-        MoEQformer = BertMoELMHeadModel.from_pretrained(
-            "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config
-        )
+        if ln_position == "out":
+            MoEQformer = BertMoELMHeadModel.from_pretrained(
+                "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config
+            )
+        elif ln_position == "in":
+            MoEQformer = BertMoELMHeadModelLNIn.from_pretrained(
+                "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config
+            )
         query_tokens = nn.Parameter(
             torch.zeros(1, num_query_token, moe_encoder_config.hidden_size)
         )
@@ -65,6 +65,8 @@ class Blip2VicunaInstruct(Blip2Base):
         use_balance_loss = True,
         moe_weight_type = "l2_norm",
         gate_save_path = None,
+        bal_loss_decay_epoch = 3,
+        ln_position = "out",
     ):
         super().__init__()
         transformers_version = version.parse(transformers.__version__)

@@ -112,7 +114,8 @@
                 moe_topk=moe_topk,
                 use_balance_loss=use_balance_loss,
                 moe_weight_type=moe_weight_type,
-                cross_attention_freq=2
+                cross_attention_freq=2,
+                ln_position=ln_position,
             )
         else:
             self.Qformer, self.query_tokens = self.init_Qformer(

@@ -221,6 +224,7 @@
         self.moebert_num_beams = moebert_num_beams

         self.gate_save_path = gate_save_path
+        self.bal_loss_decay_epoch = bal_loss_decay_epoch
         # if self.gate_save_path != None:
         #     import os
         #     if not os.path.exists(self.gate_save_path):

@@ -392,9 +396,12 @@
             return_dict=True,
             labels=targets,
         )

         if self.use_moeqformer:
-            loss = outputs.loss + self.moebert_load_balance * gate_loss
+            if samples['epoch'] > self.bal_loss_decay_epoch:
+                loss = outputs.loss
+            else:
+                loss = outputs.loss + self.moebert_load_balance * gate_loss
         else:
             loss = outputs.loss

@@ -512,6 +519,16 @@

         with self.maybe_autocast():
             inputs_embeds = self.llm_model.get_input_embeddings()(llm_tokens.input_ids)
+
+            # path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/embedding/"
+            # np.save(os.join(path, "inputs_llm.npy"), inputs_llm.cpu().numpy)
+            # np.save(os.join(path, "inputs_llm.npy"), self.llm_model.get_input_embeddings().weight.cpu().numpy)
+            # samples_copy = samples.copy()
+            # samples_copy.pop('image', None)
+            # with open(os.path.join(path, 'test_samples.json'),'a+') as f:
+            #     f.write(f"{json.dumps(samples_copy)}\n")
+
             inputs_embeds = torch.cat([inputs_llm, inputs_embeds], dim=1)
             attention_mask = torch.cat([atts_llm, llm_tokens.attention_mask], dim=1)

@@ -654,6 +671,8 @@
         use_balance_loss = cfg.get("use_balance_loss", True)
         moe_weight_type = cfg.get("moe_weight_type", 'l2_norm')
         gate_save_path = cfg.get("gate_save_path", None)
+        bal_loss_decay_epoch = cfg.get("bal_loss_decay_epoch", 3)
+        ln_position = cfg.get("ln_position", "out")

         model = cls(
             vit_model=vit_model,

@@ -683,6 +702,8 @@
             use_balance_loss=use_balance_loss,
             moe_weight_type=moe_weight_type,
             gate_save_path=gate_save_path,
+            bal_loss_decay_epoch=bal_loss_decay_epoch,
+            ln_position=ln_position,
         )

         # if qformer_text_input:
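Reviewer note: the new model options surface in from_config with the defaults visible above. A hedged sketch of how they are consumed, with example values; only the defaults and the loss schedule are taken from the diff, everything else is illustrative.

cfg = {
    "gate_save_path": None,        # where per-sample gate decisions would be dumped, if set
    "bal_loss_decay_epoch": 3,     # after this epoch the load-balance term is dropped
    "ln_position": "out",          # "out" -> BertMoELMHeadModel, "in" -> BertMoELMHeadModelLNIn
}

bal_loss_decay_epoch = cfg.get("bal_loss_decay_epoch", 3)
ln_position = cfg.get("ln_position", "out")

# Loss schedule used in forward(): keep the balance loss only during early epochs.
def total_loss(lm_loss, gate_loss, epoch, load_balance=0.05):
    if epoch > bal_loss_decay_epoch:
        return lm_loss
    return lm_loss + load_balance * gate_loss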
@ -165,7 +165,7 @@ class RouteMoELayer(nn.Module):
|
||||
self.route_method = route_method
|
||||
if self.route_method == "pre-route":
|
||||
self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
|
||||
elif self.route_method == "post-route":
|
||||
elif self.route_method in ["post-route", "post-route-dp"]:
|
||||
gate = nn.Linear(hidden_size, 1, bias=False).float()
|
||||
self.gate = gate
|
||||
# self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
|
||||
@ -252,6 +252,53 @@ class RouteMoELayer(nn.Module):
|
||||
|
||||
return beam_scores, expert_route, beam_idx
|
||||
|
||||
def dp_search(self, current_scores_log, beam_scores, expert_route, batch_size):
|
||||
if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route', 'post-route-dp']:
|
||||
# current_scores_log torch.Size([bz, num_experts])
|
||||
assert beam_scores==None and expert_route==None
|
||||
current_scores = torch.exp(current_scores_log)
|
||||
topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate: the expert assigned to each sample, torch.Size([bz, topk])
|
||||
beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
|
||||
expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
|
||||
beam_idx = torch.tensor(range(self.num_beams * batch_size))
|
||||
|
||||
else:
|
||||
batch_size = int(batch_size // self.num_beams)
|
||||
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]); after taking the log, probabilities can simply be added
|
||||
next_scores_exp = torch.exp(next_scores_raw)
|
||||
import pdb;pdb.set_trace()
|
||||
|
||||
next_scores_raw, next_experts_raw = torch.topk(next_scores_exp, 1, dim=1, largest=True, sorted=True)
|
||||
next_scores = next_scores_raw.view(batch_size, self.num_beams)
|
||||
next_experts = next_experts_raw.view(batch_size, self.num_beams)
|
||||
# next_scores, next_experts = torch.topk(current_scores_log, 1, dim=1, largest=True, sorted=True) # equivalent
|
||||
# next_scores torch.Size([bz * num_beams, 1])
|
||||
# next_tokens torch.Size([bz * num_beams, 1])
|
||||
|
||||
next_batch_beam = list()
|
||||
for batch_idx in range(batch_size):
|
||||
next_sent_beam = list()
|
||||
expert_id = next_experts[batch_idx]
|
||||
expert_score = next_scores[batch_idx]
|
||||
values, index = torch.topk(expert_score, self.num_beams, dim=0, largest=True, sorted=True)
|
||||
for i in range(self.num_beams):
|
||||
beam_id = index[i].item()
|
||||
ex_id = expert_id[beam_id].item()
|
||||
effective_beam_id = batch_idx*self.num_beams + beam_id
|
||||
next_sent_beam.append((values[i], ex_id, effective_beam_id))
|
||||
next_batch_beam.extend(next_sent_beam)
|
||||
|
||||
import pdb;pdb.set_trace()
|
||||
|
||||
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
|
||||
beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
|
||||
beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
|
||||
pre_route = expert_route[beam_idx,:]
|
||||
expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
|
||||
|
||||
return beam_scores, expert_route, beam_idx
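Reviewer note: conceptually, beam_search ranks all num_beams * num_experts continuations jointly and keeps the global top-num_beams paths, while the new dp_search ("post-route-dp") extends each beam greedily with its own best expert. A toy, framework-free sketch of that contrast; the scores are invented.

import math

beam_logps = [math.log(0.7), math.log(0.3)]          # current score of each beam
expert_logps = [                                      # per-beam expert scores at this layer
    [math.log(0.5), math.log(0.4), math.log(0.1)],
    [math.log(0.4), math.log(0.3), math.log(0.3)],
]

# beam_search: rank all beam x expert continuations jointly, keep the global top-2.
joint = [(b_lp + e_lp, b, e)
         for b, b_lp in enumerate(beam_logps)
         for e, e_lp in enumerate(expert_logps[b])]
print(sorted(joint, reverse=True)[:2])   # here both survivors extend beam 0 (experts 0 and 1)

# dp_search: every beam keeps only its own best expert, so beam 1 always survives.
greedy = [(b_lp + max(expert_logps[b]), b, expert_logps[b].index(max(expert_logps[b])))
          for b, b_lp in enumerate(beam_logps)]
print(greedy)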
|
||||
|
||||
|
||||
def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
|
||||
if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route']:
|
||||
# current_scores_log torch.Size([bz, num_experts])
|
||||
@ -267,6 +314,8 @@ class RouteMoELayer(nn.Module):
|
||||
batch_size = int(batch_size // self.num_beams)
|
||||
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]); after taking the log, probabilities can simply be added
|
||||
next_scores_exp = torch.exp(next_scores_raw)
|
||||
import pdb;pdb.set_trace()
|
||||
|
||||
next_scores_raw1 = next_scores_exp.view(
|
||||
batch_size, self.num_beams * self.num_experts
|
||||
) # torch.Size([bz, num_beams*num_experts])
|
||||
@ -289,7 +338,7 @@ class RouteMoELayer(nn.Module):
|
||||
next_sent_beam.append((expert_score, ex_id, effective_beam_id))
|
||||
next_batch_beam.extend(next_sent_beam)
|
||||
|
||||
# import pdb;pdb.set_trace()
|
||||
import pdb;pdb.set_trace()
|
||||
|
||||
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
|
||||
beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
|
||||
@ -301,8 +350,6 @@ class RouteMoELayer(nn.Module):
|
||||
|
||||
return beam_scores, expert_route, beam_idx
|
||||
|
||||
|
||||
|
||||
def forward_expert_ffn(self, x, expert_select, current_scores):
|
||||
"""
|
||||
x_repeat : [bz*num_beams, 32,768]
|
||||
@ -343,6 +390,7 @@ class RouteMoELayer(nn.Module):
|
||||
|
||||
batch_size, num_tokens = x.shape[0], x.shape[1]
|
||||
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
|
||||
|
||||
current_expert_select = expert_route[:,-1]
|
||||
|
||||
import pdb;pdb.set_trace()
|
||||
@ -368,7 +416,6 @@ class RouteMoELayer(nn.Module):
|
||||
output_x = self.experts[expert_idx].forward(input_x)
|
||||
return output_x
|
||||
|
||||
import pdb; pdb.set_trace()
|
||||
outputs = list()
|
||||
logits_gate_lst = list()
|
||||
for expert_idx in range(self.num_experts):
|
||||
@ -392,10 +439,14 @@ class RouteMoELayer(nn.Module):
|
||||
# importance loss
|
||||
importance_loss = self._importance_auxiliary_loss(current_scores)
|
||||
|
||||
# import pdb; pdb.set_trace()
|
||||
|
||||
batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam
|
||||
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
|
||||
import pdb; pdb.set_trace()
|
||||
|
||||
if self.route_method == 'post-route':
|
||||
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
|
||||
elif self.route_method == 'post-route-dp':
|
||||
beam_scores, expert_route, beam_idx = self.dp_search(current_scores_log, beam_scores, expert_route, batch_size)
|
||||
|
||||
# beam_scores torch.Size([bz*num_beam])
|
||||
# expert_route torch.Size([bz*num_beam, layer_n])
|
||||
current_select_expert = expert_route[:,-1]
|
||||
@ -431,7 +482,7 @@ class RouteMoELayer(nn.Module):
|
||||
"""
|
||||
if self.route_method == 'pre-route':
|
||||
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
|
||||
elif self.route_method == "post-route":
|
||||
elif self.route_method in ['post-route', 'post-route-dp']:
|
||||
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True)
|
||||
|
||||
return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
|
||||
@ -467,10 +518,11 @@ if __name__ == '__main__':
|
||||
batch_size = 4
|
||||
x = torch.randn(batch_size, 32, 768)
|
||||
beam_scores, expert_route = None, None
|
||||
|
||||
x1 = x
|
||||
x2 = x
|
||||
x3 = x
|
||||
beam_scores1, expert_route1 = None, None
|
||||
beam_scores2, expert_route2 = None, None
|
||||
|
||||
for layer_num in [6, 8, 10]:
|
||||
layer_judge = moe_layer_judge(layer_num)
|
||||
@ -494,25 +546,41 @@ if __name__ == '__main__':
|
||||
# print(importance_loss)
|
||||
# x = hidden_states1
|
||||
|
||||
gate1 = nn.Linear(768, 1, bias=False).float()
|
||||
# experts_post = RouteMoELayer(
|
||||
# hidden_size=768,
|
||||
# expert=ffn,
|
||||
# num_experts=config.moebert_expert_num,
|
||||
# num_beams=config.moebert_num_beams,
|
||||
# layer_judge = layer_judge,
|
||||
# route_method = "post-route",
|
||||
# weight_type="ffn_prob"
|
||||
# )
|
||||
# layer_output = experts_post(x1, None, beam_scores1, expert_route1, False)
|
||||
# hidden_states2, beam_scores1, expert_route1, beam_idx, importance_loss = layer_output
|
||||
|
||||
# print(beam_scores1)
|
||||
# print(expert_route1)
|
||||
# print(beam_idx)
|
||||
# print(importance_loss)
|
||||
# x1 = hidden_states2
|
||||
|
||||
experts_post = RouteMoELayer(
|
||||
hidden_size=768,
|
||||
expert=ffn,
|
||||
num_experts=config.moebert_expert_num,
|
||||
num_beams=config.moebert_num_beams,
|
||||
layer_judge = layer_judge,
|
||||
route_method = "post-route",
|
||||
route_method = "post-route-dp",
|
||||
weight_type="ffn_prob"
|
||||
)
|
||||
layer_output = experts_post(x1, None, beam_scores1, expert_route1, False)
|
||||
hidden_states2, beam_scores1, expert_route1, beam_idx, importance_loss = layer_output
|
||||
|
||||
print(beam_scores1)
|
||||
print(expert_route1)
|
||||
print(beam_idx)
|
||||
print(importance_loss)
|
||||
x1 = hidden_states2
|
||||
layer_output = experts_post(x2, None, beam_scores2, expert_route2, False)
|
||||
hidden_states3, beam_scores2, expert_route2, beam_idx2, importance_loss2 = layer_output
|
||||
|
||||
print(beam_scores2)
|
||||
print(expert_route2)
|
||||
print(beam_idx2)
|
||||
print(importance_loss2)
|
||||
x2 = hidden_states3
|
||||
|
||||
# gate = nn.Linear(768, config.moebert_expert_num, bias=False).float()
|
||||
# experts_moe = MoELayer(
|
||||
@ -526,12 +594,12 @@ if __name__ == '__main__':
|
||||
# weight_type=config.moe_weight_type,
|
||||
# )
|
||||
# attn_mask = torch.ones([batch_size, 32])
|
||||
# layer_output = experts_moe(x2, attn_mask)
|
||||
# hidden_states3, select_prob_gate, gate_load,_ = layer_output
|
||||
# layer_output = experts_moe(x3, attn_mask)
|
||||
# hidden_states4, select_prob_gate, gate_load,_ = layer_output
|
||||
|
||||
# print(select_prob_gate)
|
||||
# print(gate_load)
|
||||
# x2 = hidden_states3
|
||||
# x3 = hidden_states4
|
||||
|
||||
print("------------------------------------")
|
||||
import pdb; pdb.set_trace()
|
||||
|
@ -18,7 +18,7 @@ class RouteMoELayer(nn.Module):
|
||||
self.route_method = route_method
|
||||
if self.route_method == "pre-route":
|
||||
self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
|
||||
elif self.route_method == "post-route":
|
||||
elif self.route_method in ["post-route", "post-route-dp"]:
|
||||
gate = nn.Linear(hidden_size, 1, bias=False).float()
|
||||
self.gate = gate
|
||||
# self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
|
||||
@ -47,26 +47,67 @@ class RouteMoELayer(nn.Module):
|
||||
prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts])
|
||||
return prob_gate
|
||||
|
||||
|
||||
def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
|
||||
if self.layer_judge=='first' and self.route_method=='pre-route':
|
||||
def dp_search(self, current_scores_log, beam_scores, expert_route, batch_size):
|
||||
if self.layer_judge=='first' and self.route_method in ['post-route-dp']:
|
||||
# current_scores_log torch.Size([bz, num_experts])
|
||||
assert beam_scores==None and expert_route==None
|
||||
current_scores = torch.exp(current_scores_log)
|
||||
topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate: the expert assigned to each sample, torch.Size([bz, topk])
|
||||
beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
|
||||
expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
|
||||
beam_idx = torch.tensor(range(self.num_beams * batch_size))
|
||||
|
||||
else:
|
||||
if self.layer_judge=='first' and self.route_method == 'post-route':
|
||||
batch_size = batch_size
|
||||
next_scores_raw1 = torch.exp(current_scores_log) # torch.Size([bz, num_experts])
|
||||
else:
|
||||
batch_size = int(batch_size // self.num_beams)
|
||||
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]); after taking the log, probabilities can simply be added
|
||||
next_scores_exp = torch.exp(next_scores_raw)
|
||||
next_scores_raw1 = next_scores_exp.view(
|
||||
batch_size, self.num_beams * self.num_experts
|
||||
) # torch.Size([bz, num_beams*num_experts])
|
||||
batch_size = int(batch_size // self.num_beams)
|
||||
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]); after taking the log, probabilities can simply be added
|
||||
next_scores_exp = torch.exp(next_scores_raw)
|
||||
|
||||
next_scores_raw, next_experts_raw = torch.topk(next_scores_exp, 1, dim=1, largest=True, sorted=True)
|
||||
next_scores = next_scores_raw.view(batch_size, self.num_beams)
|
||||
next_experts = next_experts_raw.view(batch_size, self.num_beams)
|
||||
# next_scores, next_experts = torch.topk(current_scores_log, 1, dim=1, largest=True, sorted=True) # equivalent
|
||||
# next_scores torch.Size([bz * num_beams, 1])
|
||||
# next_tokens torch.Size([bz * num_beams, 1])
|
||||
|
||||
next_batch_beam = list()
|
||||
for batch_idx in range(batch_size):
|
||||
next_sent_beam = list()
|
||||
expert_id = next_experts[batch_idx]
|
||||
expert_score = next_scores[batch_idx]
|
||||
values, index = torch.topk(expert_score, self.num_beams, dim=0, largest=True, sorted=True)
|
||||
for i in range(self.num_beams):
|
||||
beam_id = index[i].item()
|
||||
ex_id = expert_id[beam_id].item()
|
||||
effective_beam_id = batch_idx*self.num_beams + beam_id
|
||||
next_sent_beam.append((values[i], ex_id, effective_beam_id))
|
||||
next_batch_beam.extend(next_sent_beam)
|
||||
|
||||
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
|
||||
beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
|
||||
beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
|
||||
pre_route = expert_route[beam_idx,:]
|
||||
expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
|
||||
|
||||
return beam_scores, expert_route, beam_idx
|
||||
|
||||
def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
|
||||
if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route']:
|
||||
# current_scores_log torch.Size([bz, num_experts])
|
||||
assert beam_scores==None and expert_route==None
|
||||
current_scores = torch.exp(current_scores_log)
|
||||
topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate: the expert assigned to each sample, torch.Size([bz, topk])
|
||||
beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
|
||||
expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
|
||||
beam_idx = torch.tensor(range(self.num_beams * batch_size))
|
||||
|
||||
else:
|
||||
batch_size = int(batch_size // self.num_beams)
|
||||
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]); after taking the log, probabilities can simply be added
|
||||
next_scores_exp = torch.exp(next_scores_raw)
|
||||
|
||||
next_scores_raw1 = next_scores_exp.view(
|
||||
batch_size, self.num_beams * self.num_experts
|
||||
) # torch.Size([bz, num_beams*num_experts])
|
||||
|
||||
next_scores, next_experts = torch.topk(next_scores_raw1, self.num_beams, dim=1, largest=True, sorted=True)
|
||||
# next_scores torch.Size([bz, num_beams])
|
||||
@ -86,19 +127,11 @@ class RouteMoELayer(nn.Module):
|
||||
next_sent_beam.append((expert_score, ex_id, effective_beam_id))
|
||||
next_batch_beam.extend(next_sent_beam)
|
||||
|
||||
if self.layer_judge=='first' and self.route_method == 'post-route':
|
||||
beam_scores = next_scores.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
|
||||
expert_route = next_experts.view(self.num_beams * batch_size)
|
||||
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
|
||||
beam_experts = expert_route.new([x[1] for x in next_batch_beam]).unsqueeze(-1)
|
||||
beam_idx = expert_route.new([int(x[2]/self.num_beams) for x in next_batch_beam])
|
||||
expert_route = beam_experts
|
||||
else:
|
||||
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
|
||||
beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
|
||||
beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
|
||||
pre_route = expert_route[beam_idx,:]
|
||||
expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
|
||||
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
|
||||
beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
|
||||
beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
|
||||
pre_route = expert_route[beam_idx,:]
|
||||
expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
|
||||
|
||||
return beam_scores, expert_route, beam_idx
|
||||
|
||||
@ -153,7 +186,6 @@ class RouteMoELayer(nn.Module):
|
||||
# import pdb;pdb.set_trace()
|
||||
return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
|
||||
|
||||
|
||||
def forward_post_route(self, x, beam_scores, expert_route, use_log=True):
|
||||
|
||||
attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device)
|
||||
@ -187,7 +219,12 @@ class RouteMoELayer(nn.Module):
|
||||
importance_loss = self._importance_auxiliary_loss(current_scores)
|
||||
|
||||
batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam
|
||||
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
|
||||
|
||||
if self.route_method == 'post-route':
|
||||
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
|
||||
elif self.route_method == 'post-route-dp':
|
||||
beam_scores, expert_route, beam_idx = self.dp_search(current_scores_log, beam_scores, expert_route, batch_size)
|
||||
|
||||
# beam_scores torch.Size([bz*num_beam])
|
||||
# expert_route torch.Size([bz*num_beam, layer_n])
|
||||
current_select_expert = expert_route[:,-1]
|
||||
@ -218,7 +255,7 @@ class RouteMoELayer(nn.Module):
|
||||
"""
|
||||
if self.route_method == 'pre-route':
|
||||
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
|
||||
elif self.route_method == "post-route":
|
||||
elif self.route_method in ['post-route', 'post-route-dp']:
|
||||
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True)
|
||||
|
||||
return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
|
||||
|
@@ -13,7 +13,7 @@ from typing import Optional, Tuple, List
 def use_experts(layer_idx):
     # if layer_idx % 2 == 0:
     # use moe_ffn after cross_attns
-    if int(layer_idx) in [6,8,10]:
+    if int(layer_idx) in [6,7,8,9,10,11]:
         # layer 6/8/10
         return True
     else:
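Reviewer note: with this change every layer from 6 through 11 gets an MoE FFN rather than only layers 6, 8, 10 (the trailing "# layer 6/8/10" comment is now stale). A quick check of the new behaviour, copied from the diff:

def use_experts(layer_idx):
    if int(layer_idx) in [6, 7, 8, 9, 10, 11]:
        return True
    else:
        return False

print([l for l in range(12) if use_experts(l)])  # [6, 7, 8, 9, 10, 11]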
@ -0,0 +1,114 @@
|
||||
# Copyright (c) 2022, salesforce.com, inc.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||
|
||||
model:
|
||||
arch: blip2_vicuna_instruct
|
||||
model_type: vicuna7b_pretrain
|
||||
load_pretrained: True
|
||||
load_finetuned: True
|
||||
vit_model: eva_clip_g
|
||||
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/mix_coco_gqa_1610k_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0301/20240301223/checkpoint_best.pth"
|
||||
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
|
||||
# vit encoder
|
||||
image_size: 224
|
||||
drop_path_rate: 0
|
||||
use_grad_checkpoint: False
|
||||
vit_precision: "fp16"
|
||||
|
||||
# Q-Former
|
||||
num_query_token: 32
|
||||
qformer_text_input: True
|
||||
|
||||
# T5
|
||||
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
|
||||
prompt: ""
|
||||
max_txt_len: 256
|
||||
max_output_txt_len: 256
|
||||
|
||||
# freeze
|
||||
freeze_vit: True
|
||||
freeze_llm: True
|
||||
freeze_qformer: False
|
||||
freeze_t5_proj: False
|
||||
|
||||
# moe
|
||||
use_moeqformer: True
|
||||
use_route_moe: False
|
||||
moebert_expert_num: 3
|
||||
moebert_route_method: "gate-sentence-post"
|
||||
moe_weight_type: "raw_prob"
|
||||
moebert_load_balance: 0.05
|
||||
moe_topk: 1
|
||||
use_balance_loss: False
|
||||
ln_position: "out"
|
||||
|
||||
datasets:
|
||||
gqa:
|
||||
type: balanced_sft_raw_eval
|
||||
batch_size: 4
|
||||
vis_processor:
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
|
||||
ok_vqa: # train, valid (9009, 5046)
|
||||
type: ok_vqa_eval
|
||||
batch_size: 4
|
||||
vis_processor:
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
|
||||
coco_vqa: # 658104
|
||||
type: vqa_v2_eval
|
||||
batch_size: 4
|
||||
vis_processor:
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
|
||||
aok_vqa: # train: 17056, val: 1145
|
||||
batch_size: 4
|
||||
vis_processor:
|
||||
eval:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
|
||||
run:
|
||||
task: instruction_tuning
|
||||
seed: 42
|
||||
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/eval/mix_coco_gqa_1610k_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0301/"
|
||||
num_workers: 4
|
||||
|
||||
amp: True
|
||||
resume_ckpt_path: null
|
||||
|
||||
evaluate: True
|
||||
test_splits: ["val"]
|
||||
|
||||
device: "cuda"
|
||||
world_size: 1
|
||||
dist_url: "env://"
|
||||
distributed: True
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
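Before launching an evaluation with a config like the one above, the MoE block can be sanity-checked with PyYAML. A minimal sketch, assuming a local copy of the file (the name qformer_moe_post_eval.yaml is made up for illustration):

import yaml  # PyYAML

with open("qformer_moe_post_eval.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

moe_keys = ["use_moeqformer", "use_route_moe", "moebert_expert_num",
            "moebert_route_method", "moe_topk", "moebert_load_balance", "ln_position"]
print({k: cfg["model"].get(k) for k in moe_keys})
print("evaluate:", cfg["run"]["evaluate"], "splits:", cfg["run"]["test_splits"])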
@ -10,7 +10,7 @@ model:
|
||||
load_finetuned: True
|
||||
vit_model: eva_clip_g
|
||||
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/20240112212/checkpoint_best.pth"
|
||||
finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/20240128142/checkpoint_best.pth"
|
||||
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
|
||||
# vit encoder
|
||||
@ -39,27 +39,18 @@ model:
|
||||
use_moeqformer: True
|
||||
use_route_moe: True
|
||||
moebert_route_method: "post-route"
|
||||
moebert_load_balance: 0
|
||||
moebert_load_balance: 0.01
|
||||
moebert_expert_num: 2
|
||||
moebert_num_beams: 2
|
||||
moe_weight_type: 'ffn_prob'
|
||||
gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/"
|
||||
use_balance_loss: False
|
||||
bal_loss_decay_epoch: 8
|
||||
gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/"
|
||||
|
||||
datasets:
|
||||
gqa:
|
||||
type: balanced_sft_raw_eval
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
|
||||
ok_vqa: # train, valid (9009, 5046)
|
||||
type: ok_vqa_eval
|
||||
batch_size: 32
|
||||
batch_size: 64
|
||||
vis_processor:
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
@ -70,6 +61,17 @@ datasets:
|
||||
|
||||
coco_vqa: # 658104
|
||||
type: vqa_v2_eval
|
||||
batch_size: 64
|
||||
vis_processor:
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
|
||||
coco_caption: # 414113 train
|
||||
type: coco_cap_eval
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
eval:
|
||||
@ -78,7 +80,18 @@ datasets:
|
||||
text_processor:
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
|
||||
|
||||
ok_vqa: # train, valid (9009, 5046)
|
||||
type: ok_vqa_eval
|
||||
batch_size: 64
|
||||
vis_processor:
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
|
||||
run:
|
||||
task: instruction_tuning
|
||||
# optimizer
|
||||
@ -96,7 +109,7 @@ run:
|
||||
iters_per_epoch: 3000
|
||||
|
||||
seed: 42
|
||||
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/"
|
||||
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/"
|
||||
|
||||
amp: True
|
||||
resume_ckpt_path: null
|
||||
|
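This eval config switches the checkpoint and raises moebert_load_balance from 0 to 0.01. A minimal sketch of how such a coefficient is typically folded into the objective (variable names are illustrative, not the repo's); setting it to 0 disables the auxiliary term without touching the routing code:

def total_loss(lm_loss, moe_aux_loss, moebert_load_balance=0.01):
    # Language-modeling loss plus the weighted MoE auxiliary (balance/importance) loss.
    return lm_loss + moebert_load_balance * moe_aux_loss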
@ -38,17 +38,17 @@ model:
|
||||
# moe
|
||||
use_moeqformer: True
|
||||
use_route_moe: True
|
||||
moebert_route_method: "post-route"
|
||||
moebert_load_balance: 0
|
||||
moebert_expert_num: 3
|
||||
moebert_num_beams: 3
|
||||
moebert_route_method: "post-route-dp"
|
||||
moebert_load_balance: 0.05
|
||||
moebert_expert_num: 2
|
||||
moebert_num_beams: 2
|
||||
moe_weight_type: 'ffn_prob'
|
||||
use_balance_loss: False
|
||||
|
||||
datasets:
|
||||
gqa: # train/val/test: (943000, 12578, 12578)
|
||||
type: balanced_sft_raw
|
||||
batch_size: 16
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
@ -64,7 +64,7 @@ datasets:
|
||||
sample_ratio: 10
|
||||
|
||||
ok_vqa: # train, valid (9009, 5046)
|
||||
batch_size: 16
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
@ -80,7 +80,7 @@ datasets:
|
||||
sample_ratio: 1
|
||||
|
||||
coco_vqa: # 658104
|
||||
batch_size: 16
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
@ -112,7 +112,7 @@ run:
|
||||
iters_per_epoch: 5000
|
||||
|
||||
seed: 42
|
||||
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_1gate_3ex_3beam_1loss_5e5lr_top6layer_textinqf_epo8_0117/"
|
||||
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_DP_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_005_5e5lr_top6layer_textinqf_epo8_0121/"
|
||||
|
||||
amp: True
|
||||
resume_ckpt_path: null
|
||||
|
@ -38,14 +38,17 @@ model:
|
||||
# moe
|
||||
use_moeqformer: True
|
||||
use_route_moe: True
|
||||
moebert_expert_num: 5
|
||||
moebert_num_beams: 1
|
||||
# gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/route_save/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/"
|
||||
moebert_route_method: "post-route"
|
||||
moebert_load_balance: 0
|
||||
moebert_expert_num: 2
|
||||
moebert_num_beams: 2
|
||||
moe_weight_type: 'ffn_prob'
|
||||
use_balance_loss: False
|
||||
|
||||
datasets:
|
||||
gqa: # train/val/test: (943000, 12578, 12578)
|
||||
type: balanced_sft_raw
|
||||
batch_size: 4
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
@ -61,7 +64,7 @@ datasets:
|
||||
sample_ratio: 10
|
||||
|
||||
ok_vqa: # train, valid (9009, 5046)
|
||||
batch_size: 4
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
@ -77,7 +80,7 @@ datasets:
|
||||
sample_ratio: 1
|
||||
|
||||
coco_vqa: # 658104
|
||||
batch_size: 4
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
@ -96,20 +99,20 @@ run:
|
||||
task: instruction_tuning
|
||||
# optimizer
|
||||
lr_sched: "linear_warmup_cosine_lr"
|
||||
init_lr: 2e-5
|
||||
init_lr: 5e-5
|
||||
min_lr: 1e-6
|
||||
warmup_lr: 1e-6
|
||||
log_freq: 5
|
||||
save_freq: 1500
|
||||
|
||||
weight_decay: 0.05
|
||||
max_epoch: 6
|
||||
max_epoch: 8
|
||||
num_workers: 4
|
||||
warmup_steps: 600
|
||||
iters_per_epoch: 5000
|
||||
|
||||
seed: 42
|
||||
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1212_Test/"
|
||||
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_1loss_5e5lr_top6layer_textinqf_epo8_0123/"
|
||||
|
||||
amp: True
|
||||
resume_ckpt_path: null
|
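The run block above moves init_lr from 2e-5 to 5e-5 and max_epoch from 6 to 8 under lr_sched: "linear_warmup_cosine_lr". A minimal sketch of the schedule shape implied by init_lr/min_lr/warmup_lr/warmup_steps; the repo's scheduler may differ in whether it steps per iteration or per epoch:

import math

def lr_at(step, epoch, max_epoch=8, warmup_steps=600,
          init_lr=5e-5, min_lr=1e-6, warmup_lr=1e-6):
    if epoch == 0 and step < warmup_steps:
        # Linear warmup from warmup_lr to init_lr over the first warmup_steps.
        return warmup_lr + (init_lr - warmup_lr) * step / max(warmup_steps, 1)
    # Cosine decay from init_lr down to min_lr over the remaining epochs.
    progress = epoch / max_epoch
    return min_lr + 0.5 * (init_lr - min_lr) * (1 + math.cos(math.pi * progress))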
@ -0,0 +1,145 @@
|
||||
# Copyright (c) 2022, salesforce.com, inc.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||
|
||||
model:
|
||||
arch: blip2_vicuna_instruct
|
||||
model_type: vicuna7b_pretrain
|
||||
load_pretrained: True
|
||||
load_finetuned: False
|
||||
vit_model: eva_clip_g
|
||||
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
# finetuned: ""
|
||||
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
|
||||
# vit encoder
|
||||
image_size: 224
|
||||
drop_path_rate: 0
|
||||
use_grad_checkpoint: False
|
||||
vit_precision: "fp16"
|
||||
|
||||
# Q-Former
|
||||
num_query_token: 32
|
||||
qformer_text_input: True
|
||||
|
||||
# vicuna
|
||||
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
|
||||
prompt: ""
|
||||
max_txt_len: 256
|
||||
max_output_txt_len: 256
|
||||
|
||||
# freeze
|
||||
freeze_vit: True
|
||||
freeze_llm: True
|
||||
freeze_qformer: False
|
||||
freeze_t5_proj: False
|
||||
|
||||
# moe
|
||||
use_moeqformer: False
|
||||
use_route_moe: False
|
||||
moebert_route_method: "post-route"
|
||||
moebert_load_balance: 0
|
||||
moebert_expert_num: 2
|
||||
moebert_num_beams: 2
|
||||
moe_weight_type: 'ffn_prob'
|
||||
use_balance_loss: False
|
||||
|
||||
datasets:
|
||||
gqa: # train/val/test: (943000, 12578, 12578)
|
||||
type: balanced_sft_raw
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 10
|
||||
|
||||
ok_vqa: # train, valid (9009, 5046)
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 1
|
||||
|
||||
coco_vqa: # 658104
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 9
|
||||
|
||||
coco_caption: # 414113 train
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 7
|
||||
|
||||
run:
|
||||
task: instruction_tuning
|
||||
# optimizer
|
||||
lr_sched: "linear_warmup_cosine_lr"
|
||||
init_lr: 5e-5
|
||||
min_lr: 1e-6
|
||||
warmup_lr: 1e-6
|
||||
log_freq: 5
|
||||
save_freq: 1500
|
||||
|
||||
weight_decay: 0.05
|
||||
max_epoch: 8
|
||||
num_workers: 4
|
||||
warmup_steps: 600
|
||||
iters_per_epoch: 5000
|
||||
|
||||
seed: 42
|
||||
# output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_005_5e5lr_top6layer_textinqf_epo8_0122/"
|
||||
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Base_top6layer_textinqf_epo8_0124/"
|
||||
|
||||
amp: True
|
||||
resume_ckpt_path: null
|
||||
|
||||
evaluate: False
|
||||
train_splits: ["train"]
|
||||
valid_splits: ["val"]
|
||||
# test_splits: ["val"]
|
||||
|
||||
device: "cuda"
|
||||
world_size: 1
|
||||
dist_url: "env://"
|
||||
distributed: True
|
@ -0,0 +1,145 @@
|
||||
# Copyright (c) 2022, salesforce.com, inc.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||
|
||||
model:
|
||||
arch: blip2_vicuna_instruct
|
||||
model_type: vicuna7b_pretrain
|
||||
load_pretrained: True
|
||||
load_finetuned: False
|
||||
vit_model: eva_clip_g
|
||||
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
# finetuned: ""
|
||||
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
|
||||
# vit encoder
|
||||
image_size: 224
|
||||
drop_path_rate: 0
|
||||
use_grad_checkpoint: False
|
||||
vit_precision: "fp16"
|
||||
|
||||
# Q-Former
|
||||
num_query_token: 32
|
||||
qformer_text_input: True
|
||||
|
||||
# vicuna
|
||||
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
|
||||
prompt: ""
|
||||
max_txt_len: 256
|
||||
max_output_txt_len: 256
|
||||
|
||||
# freeze
|
||||
freeze_vit: True
|
||||
freeze_llm: True
|
||||
freeze_qformer: False
|
||||
freeze_t5_proj: False
|
||||
|
||||
# moe
|
||||
use_moeqformer: True
|
||||
use_route_moe: True
|
||||
moebert_route_method: "post-route"
|
||||
moebert_load_balance: 0.01
|
||||
moebert_expert_num: 2
|
||||
moebert_num_beams: 2
|
||||
moe_weight_type: 'ffn_prob'
|
||||
use_balance_loss: False
|
||||
bal_loss_decay_epoch: 3
|
||||
|
||||
datasets:
|
||||
gqa: # train/val/test: (943000, 12578, 12578)
|
||||
type: balanced_sft_raw
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 10
|
||||
|
||||
ok_vqa: # train, valid (9009, 5046)
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 1
|
||||
|
||||
coco_vqa: # 658104
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 9
|
||||
|
||||
coco_caption: # 414113 train
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 7
|
||||
|
||||
run:
|
||||
task: instruction_tuning
|
||||
# optimizer
|
||||
lr_sched: "linear_warmup_cosine_lr"
|
||||
init_lr: 5e-5
|
||||
min_lr: 1e-6
|
||||
warmup_lr: 1e-6
|
||||
log_freq: 5
|
||||
save_freq: 1500
|
||||
|
||||
weight_decay: 0.05
|
||||
max_epoch: 8
|
||||
num_workers: 4
|
||||
warmup_steps: 600
|
||||
iters_per_epoch: 5000
|
||||
|
||||
seed: 42
|
||||
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_loss_decay_5e5lr_top6layer_textinqf_epo8_0129/"
|
||||
|
||||
amp: True
|
||||
resume_ckpt_path: null
|
||||
|
||||
evaluate: False
|
||||
train_splits: ["train"]
|
||||
valid_splits: ["val"]
|
||||
# test_splits: ["val"]
|
||||
|
||||
device: "cuda"
|
||||
world_size: 1
|
||||
dist_url: "env://"
|
||||
distributed: True
|
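The config above pairs moebert_load_balance: 0.01 with bal_loss_decay_epoch: 3. One plausible reading, sketched below, is that the balance weight is kept for the first few epochs and then dropped once routing has stabilized; the repo's exact decay rule may differ:

def balance_weight(epoch, base=0.01, decay_epoch=3):
    # Keep the full load-balance weight early on, then switch it off.
    return base if epoch < decay_epoch else 0.0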
@ -0,0 +1,188 @@
|
||||
# Copyright (c) 2022, salesforce.com, inc.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||
|
||||
model:
|
||||
arch: blip2_vicuna_instruct
|
||||
model_type: vicuna7b_pretrain
|
||||
load_pretrained: True
|
||||
load_finetuned: False
|
||||
vit_model: eva_clip_g
|
||||
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
# finetuned: ""
|
||||
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
|
||||
# vit encoder
|
||||
image_size: 224
|
||||
drop_path_rate: 0
|
||||
use_grad_checkpoint: False
|
||||
vit_precision: "fp16"
|
||||
|
||||
# Q-Former
|
||||
num_query_token: 32
|
||||
qformer_text_input: True
|
||||
|
||||
# vicuna
|
||||
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
|
||||
prompt: ""
|
||||
max_txt_len: 256
|
||||
max_output_txt_len: 256
|
||||
|
||||
# freeze
|
||||
freeze_vit: True
|
||||
freeze_llm: True
|
||||
freeze_qformer: False
|
||||
freeze_t5_proj: False
|
||||
|
||||
# moe
|
||||
use_moeqformer: True
|
||||
use_route_moe: True
|
||||
moebert_route_method: "post-route"
|
||||
moebert_load_balance: 0.05
|
||||
moebert_expert_num: 2
|
||||
moebert_num_beams: 2
|
||||
moe_weight_type: 'ffn_prob'
|
||||
use_balance_loss: False
|
||||
|
||||
datasets:
|
||||
gqa:
|
||||
type: balanced_sft_raw_eval
|
||||
batch_size: 16
|
||||
vis_processor:
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
|
||||
ok_vqa: # train, valid (9009, 5046)
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 8
|
||||
|
||||
coco_vqa: # 658104
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 15
|
||||
|
||||
aok_vqa: # train: 17056, val: 1145
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 12
|
||||
|
||||
ocrvqa: # train 207572
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 30
|
||||
|
||||
llava_reason: # 76643
|
||||
batch_size: 16
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 80
|
||||
|
||||
llava_conversation: # 56681
|
||||
batch_size: 16
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 30
|
||||
|
||||
llava_detail: # 23240
|
||||
batch_size: 16
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 20
|
||||
|
||||
coco_caption: # 414113 train
|
||||
batch_size: 16
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 10
|
||||
|
||||
run:
|
||||
task: instruction_tuning
|
||||
# optimizer
|
||||
lr_sched: "linear_warmup_cosine_lr"
|
||||
init_lr: 5e-5
|
||||
min_lr: 1e-6
|
||||
warmup_lr: 1e-6
|
||||
log_freq: 5
|
||||
save_freq: 1500
|
||||
|
||||
weight_decay: 0.05
|
||||
max_epoch: 8
|
||||
num_workers: 4
|
||||
warmup_steps: 600
|
||||
iters_per_epoch: 5000
|
||||
|
||||
seed: 42
|
||||
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_1048k_raw_QformerMoE_Route_Post_ffn_prob_linear_1gate_2ex_2beam_2loss_5e5lr_top6layer_textinqf_epo8_0118/"
|
||||
|
||||
amp: True
|
||||
resume_ckpt_path: null
|
||||
|
||||
evaluate: False
|
||||
train_splits: ["train"]
|
||||
valid_splits: ["val"]
|
||||
# test_splits: ["val"]
|
||||
|
||||
device: "cuda"
|
||||
world_size: 1
|
||||
dist_url: "env://"
|
||||
distributed: True
|
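The 188-line run above mixes GQA, OK-VQA, VQAv2, A-OKVQA, OCR-VQA, the LLaVA subsets and COCO Caption, each with its own sample_ratio. A minimal sketch of ratio-weighted sampling over per-dataset loaders (names and the dict layout are illustrative, not the repo's actual dataloader):

import random

def pick_dataset(loaders: dict, ratios: dict):
    # Draw a dataset with probability proportional to its sample_ratio,
    # e.g. ratios = {"gqa": 10, "coco_vqa": 15, "ocrvqa": 30, "llava_reason": 80, ...}.
    names = list(loaders)
    weights = [ratios[n] for n in names]
    name = random.choices(names, weights=weights, k=1)[0]
    return name, loaders[name]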
@ -0,0 +1,128 @@
|
||||
# Copyright (c) 2022, salesforce.com, inc.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
||||
|
||||
model:
|
||||
arch: blip2_vicuna_instruct
|
||||
model_type: vicuna7b_pretrain
|
||||
load_pretrained: True
|
||||
load_finetuned: False
|
||||
vit_model: eva_clip_g
|
||||
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
# finetuned: ""
|
||||
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
|
||||
|
||||
# vit encoder
|
||||
image_size: 224
|
||||
drop_path_rate: 0
|
||||
use_grad_checkpoint: False
|
||||
vit_precision: "fp16"
|
||||
|
||||
# Q-Former
|
||||
num_query_token: 32
|
||||
qformer_text_input: True
|
||||
|
||||
# vicuna
|
||||
llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
|
||||
prompt: ""
|
||||
max_txt_len: 256
|
||||
max_output_txt_len: 256
|
||||
|
||||
# freeze
|
||||
freeze_vit: True
|
||||
freeze_llm: True
|
||||
freeze_qformer: False
|
||||
freeze_t5_proj: False
|
||||
|
||||
# moe
|
||||
use_moeqformer: True
|
||||
use_route_moe: True
|
||||
moebert_route_method: "post-route-dp"
|
||||
moebert_load_balance: 0.05
|
||||
moebert_expert_num: 2
|
||||
moebert_num_beams: 2
|
||||
moe_weight_type: 'ffn_prob'
|
||||
use_balance_loss: False
|
||||
|
||||
datasets:
|
||||
gqa: # train/val/test: (943000, 12578, 12578)
|
||||
type: balanced_sft_raw
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 10
|
||||
|
||||
ok_vqa: # train, valid (9009, 5046)
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 1
|
||||
|
||||
coco_vqa: # 658104
|
||||
batch_size: 32
|
||||
vis_processor:
|
||||
train:
|
||||
name: "blip2_image_train"
|
||||
image_size: 224
|
||||
eval:
|
||||
name: "blip2_image_eval"
|
||||
image_size: 224
|
||||
text_processor:
|
||||
train:
|
||||
name: "blip_caption"
|
||||
eval:
|
||||
name: "blip_caption"
|
||||
sample_ratio: 9
|
||||
|
||||
run:
|
||||
task: instruction_tuning
|
||||
# optimizer
|
||||
lr_sched: "linear_warmup_cosine_lr"
|
||||
init_lr: 5e-5
|
||||
min_lr: 1e-6
|
||||
warmup_lr: 1e-6
|
||||
log_freq: 5
|
||||
save_freq: 1500
|
||||
|
||||
weight_decay: 0.05
|
||||
max_epoch: 8
|
||||
num_workers: 4
|
||||
warmup_steps: 600
|
||||
iters_per_epoch: 5000
|
||||
|
||||
seed: 42
|
||||
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_DP_Route_Post_ffn_prob_linear_1gate_2ex_2beam_2loss_5e5lr_top6layer_textinqf_epo8_0118/"
|
||||
|
||||
amp: True
|
||||
resume_ckpt_path: null
|
||||
|
||||
evaluate: False
|
||||
train_splits: ["train"]
|
||||
valid_splits: ["val"]
|
||||
# test_splits: ["val"]
|
||||
|
||||
device: "cuda"
|
||||
world_size: 1
|
||||
dist_url: "env://"
|
||||
distributed: True
|
@ -53,7 +53,7 @@ class InstructionTask(BaseTask):
|
||||
run_cfg = cfg.run_cfg
|
||||
|
||||
num_beams = run_cfg.get("num_beams", 3)
|
||||
max_len = run_cfg.get("max_len", 20)
|
||||
max_len = run_cfg.get("max_len", 30)
|
||||
min_len = run_cfg.get("min_len", 1)
|
||||
|
||||
evaluate = run_cfg.get("evaluate", False)
|
||||
@ -112,22 +112,33 @@ class InstructionTask(BaseTask):
|
||||
)
|
||||
pred_qa_pairs = []
|
||||
|
||||
question_id = samples["question_id"]
|
||||
question = samples["text_input"]
|
||||
text_inputs = samples["text_input"]
|
||||
|
||||
sources = samples["source"]
|
||||
source = samples["source"][0]
|
||||
|
||||
if source in ['vqav2','okvqa','gqa']:
|
||||
sample_ids = [int(sample_id.item()) for sample_id in samples["question_id"]]
|
||||
elif source in ['aokvqa']:
|
||||
sample_ids = [sample_id for sample_id in samples["question_id"]]
|
||||
elif source in ['coco_cap']:
|
||||
sample_ids = samples["image_id"]
|
||||
|
||||
# For GQA
|
||||
full_answers = samples.get("fullAnswer", ["" for i in range(len(question_id))])
|
||||
gt_answers = samples.get("gt_answers", ["" for i in range(len(question_id))])
|
||||
full_answers = samples.get("fullAnswer", ["" for i in range(len(sample_ids))])
|
||||
gt_answers = samples.get("gt_answers", ["" for i in range(len(sample_ids))])
|
||||
|
||||
for answer, ques_id, ques, full_answer, gt_answer, source in zip(answers, question_id, question, full_answers, gt_answers, sources):
|
||||
ques_id = int(ques_id.item())
|
||||
# For AOKVQA
|
||||
choices = samples.get("choices", ["" for i in range(len(sample_ids))])
|
||||
|
||||
for answer, sample_id, text_input, full_answer, gt_answer, choice, source in zip(answers, sample_ids, text_inputs, full_answers, gt_answers, choices, sources):
|
||||
pred_qa_pairs.append({
|
||||
"question_id": ques_id,
|
||||
"question": ques,
|
||||
"question_id": sample_id,
|
||||
"question": text_input,
|
||||
"full_answer": full_answer,
|
||||
"answer": answer,
|
||||
"gt_ans": gt_answer,
|
||||
"choice": choice,
|
||||
"source": source})
|
||||
return pred_qa_pairs
|
||||
|
||||
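With the change above, the id comes from question_id for the VQA-style sources but from image_id for coco_cap, and the A-OKVQA choices are carried along in "choice". An illustrative pred_qa_pairs record (the values are invented):

# Illustrative only; field values are made up.
example_record = {
    "question_id": 262148,        # int for vqav2/okvqa/gqa, string id for aokvqa, image_id for coco_cap
    "question": "What is the man holding?",
    "full_answer": "",            # populated for GQA
    "answer": "a surfboard",
    "gt_ans": "surfboard",
    "choice": "",                 # populated for AOKVQA
    "source": "vqav2",
}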
@ -140,9 +151,7 @@ class InstructionTask(BaseTask):
|
||||
total_results = list()
|
||||
for sub_data_loader in data_loader.loaders:
|
||||
results = []
|
||||
ques_ids = []
|
||||
for samples in metric_logger.log_every(sub_data_loader, print_freq, header):
|
||||
ques_ids.extend(samples['question_id'].tolist())
|
||||
|
||||
samples = prepare_sample(samples, cuda_enabled=cuda_enabled)
|
||||
eval_output = self.valid_step(model=model, samples=samples)
|
||||
@ -168,6 +177,7 @@ class InstructionTask(BaseTask):
|
||||
filename=f"{split_name}_vqa_result_{source}",
|
||||
remove_duplicate="question_id",
|
||||
)
|
||||
|
||||
if source in ['vqav2','okvqa']:
|
||||
try:
|
||||
metrics = self._report_metrics_coco_vqa(result_file=result_file, split=split_name, source=source)
|
||||
@ -180,7 +190,18 @@ class InstructionTask(BaseTask):
|
||||
except Exception as e:
|
||||
metrics = None
|
||||
print(f"Report Metrics {source} Error: {e}")
|
||||
|
||||
elif source in ['aokvqa']:
|
||||
try:
|
||||
metrics = self._report_metrics_aokvqa(result_file=result_file, source=source)
|
||||
except Exception as e:
|
||||
metrics = None
|
||||
print(f"Report Metrics {source} Error: {e}")
|
||||
elif source in ['coco_cap']:
|
||||
try:
|
||||
metrics = self._report_metrics_caption(result_file=result_file, split_name=split_name, source=source)
|
||||
except Exception as e:
|
||||
metrics = None
|
||||
print(f"Report Metrics {source} Error: {e}")
|
||||
else:
|
||||
metrics = None
|
||||
final_metrics[source] = metrics
|
||||
@ -234,10 +255,46 @@ class InstructionTask(BaseTask):
|
||||
|
||||
return metrics
|
||||
|
||||
@dist_utils.main_process
|
||||
def _report_metrics_aokvqa(self, result_file, source='aokvqa'):
|
||||
"""
|
||||
Validation of aokvqa
|
||||
"""
|
||||
# measuring accuracy compared to answer
|
||||
results = json.load(open(result_file, "r"))
|
||||
acc = []
|
||||
vqa_tool = VQAEval()
|
||||
|
||||
for res in results:
|
||||
|
||||
gt_ans = res["choice"]
|
||||
pred = res["answer"]
|
||||
|
||||
pred = vqa_tool.processPunctuation(pred)
|
||||
pred = vqa_tool.processDigitArticle(pred)
|
||||
|
||||
# vqa_acc = 1 if pred == gt_ans else 0
|
||||
vqa_acc = 1 if pred in gt_ans else 0
|
||||
|
||||
acc.append(vqa_acc)
|
||||
|
||||
accuracy = sum(acc) / len(acc) * 100
|
||||
metrics = {"agg_metrics": accuracy, "acc": accuracy}
|
||||
|
||||
with open(
|
||||
os.path.join(registry.get_path("output_dir"), f"evaluate_{source}.txt"), "a"
|
||||
) as f:
|
||||
f.write(json.dumps(metrics) + "\n")
|
||||
|
||||
logging.info(metrics)
|
||||
|
||||
return metrics
|
||||
|
||||
|
||||
@dist_utils.main_process
|
||||
def _report_metrics_gqa(self, result_file, source='gqa'):
|
||||
"""
|
||||
Validation of GQA/VQAv2
|
||||
Validation of GQA
|
||||
"""
|
||||
# measuring accuracy compared to answer
|
||||
results = json.load(open(result_file, "r"))
|
||||
@ -274,3 +331,90 @@ class InstructionTask(BaseTask):
|
||||
|
||||
return metrics
|
||||
|
||||
@dist_utils.main_process
|
||||
def _report_metrics_caption(self, result_file, split_name, source='coco_cap'):
|
||||
"""
|
||||
Use official COCO Cap evaluation script to report metrics.
|
||||
"""
|
||||
coco_gt_root = os.path.join(registry.get_path("cache_root"), "coco_gt")
|
||||
coco_val = coco_caption_eval(coco_gt_root, result_file, split_name)
|
||||
|
||||
agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"]
|
||||
log_stats = {split_name: {k: v for k, v in coco_val.eval.items()}}
|
||||
|
||||
with open(
|
||||
os.path.join(registry.get_path("output_dir"), "evaluate.txt"), "a"
|
||||
) as f:
|
||||
f.write(json.dumps(log_stats) + "\n")
|
||||
|
||||
coco_res = {k: v for k, v in coco_val.eval.items()}
|
||||
coco_res["agg_metrics"] = agg_metrics
|
||||
|
||||
return coco_res
|
||||
|
||||
from collections import defaultdict
|
||||
from pycocoevalcap.eval import COCOEvalCap
|
||||
class COCO_Annotation:
|
||||
def __init__(self, annotation_file):
|
||||
self.coco_cn_file = annotation_file
|
||||
self.imgToAnns = self.build_imgToAnns()
|
||||
|
||||
def build_imgToAnns(self):
|
||||
imgToAnns = defaultdict(list)
|
||||
with open(self.coco_cn_file, "r", encoding="UTF-8") as fin:
|
||||
for line in fin:
|
||||
line = line.strip()
|
||||
temp = eval(line)
|
||||
annotations = temp['annotations']
|
||||
for ann in annotations:
|
||||
image_id = str(ann['image_id']).zfill(6)
|
||||
imgToAnns[image_id].append({'image_id':image_id,'caption':ann['caption'],'image': ann['image_id']})
|
||||
return imgToAnns
|
||||
|
||||
def getImgIds(self):
|
||||
return self.imgToAnns.keys()
|
||||
|
||||
class COCO_Result:
|
||||
def __init__(self,result_file):
|
||||
self.coco_cn_file = result_file
|
||||
self.imgToAnns = self.build_imgToAnns()
|
||||
|
||||
def build_imgToAnns(self):
|
||||
imgToAnns = dict()
|
||||
data = json.load(open(self.coco_cn_file, "r"))
|
||||
for d in data:
|
||||
tmp = {
|
||||
'image_id':d['question_id'][-6:],
|
||||
'caption':d['answer']
|
||||
}
|
||||
imgToAnns[d['question_id'][-6:]] = [tmp]
|
||||
return imgToAnns
|
||||
|
||||
def coco_caption_eval(coco_gt_root, results_file, split_name):
|
||||
files = {
|
||||
"val":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json",
|
||||
"test":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json"
|
||||
}
|
||||
|
||||
# create coco object and coco_result object
|
||||
annotation_file = files[split_name]
|
||||
coco = COCO_Annotation(annotation_file)
|
||||
coco_result = COCO_Result(results_file)
|
||||
|
||||
# create coco_eval object by taking coco and coco_result
|
||||
coco_eval = COCOEvalCap(coco, coco_result)
|
||||
|
||||
# evaluate on a subset of images by setting
|
||||
# coco_eval.params['image_id'] = coco_result.getImgIds()
|
||||
# please remove this line when evaluating the full validation set
|
||||
# coco_eval.params['image_id'] = coco_result.getImgIds()
|
||||
|
||||
# evaluate results
|
||||
# SPICE will take a few minutes the first time, but speeds up due to caching
|
||||
coco_eval.evaluate()
|
||||
|
||||
# print output evaluation scores
|
||||
for metric, score in coco_eval.eval.items():
|
||||
print(f"{metric}: {score:.3f}")
|
||||
|
||||
return coco_eval
|
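build_imgToAnns above parses every line of the ground-truth file with eval. If each line is a plain JSON object (an assumption about the *_gt.json format), json.loads is a safer drop-in that does not execute arbitrary expressions:

import json

def parse_gt_line(line: str) -> dict:
    # Drop-in replacement for eval(line), assuming one JSON object per line.
    return json.loads(line.strip())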
@ -1,4 +0,0 @@
|
||||
<Img><ImageHere></Img> Describe this image in detail.
|
||||
<Img><ImageHere></Img> Take a look at this image and describe what you notice.
|
||||
<Img><ImageHere></Img> Please provide a detailed description of the picture.
|
||||
<Img><ImageHere></Img> Could you describe the contents of this image for me?
|
58
test/datasets/test_dataset.py
Normal file
@ -0,0 +1,58 @@
|
||||
import datasets
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
|
||||
import random
|
||||
from tqdm import tqdm
|
||||
|
||||
# path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/minigpt4/models/cmrc2018_trial.json"
|
||||
# dataset = load_dataset("json", data_files=[path], field="data", split="train")
|
||||
# tokenizer = AutoTokenizer.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased")
|
||||
# def preprocess_function(example):
|
||||
# import pdb; pdb.set_trace()
|
||||
# model_inputs = tokenizer(example["content"], max_length=512, truncation=True)
|
||||
# labels = tokenizer(example["title"], max_length=32, truncation=True)
|
||||
# the labels are the tokenized title ids
|
||||
# model_inputs["labels"] = labels["input_ids"]
|
||||
# return model_inputs
|
||||
# processed_datasets = dataset.map(preprocess_function)
|
||||
|
||||
dataset = load_dataset("/mnt/pfs-guan-ssai/nlu/wanghanzi/data/alpaca_20k")
|
||||
train_dataset = dataset['train']
|
||||
|
||||
|
||||
for i in tqdm(range(1, len(train_dataset))):
|
||||
import pdb; pdb.set_trace()
|
||||
|
||||
idx = random.randint(0,i)
|
||||
memory = train_dataset[idx]
|
||||
memory_text = f"Instruction: {memory['instruction']}\n Answer: {memory['output']} \n"
|
||||
train_dataset[i]['text'] = f"{memory_text} Instruction:{train_dataset[i]['instruction']}"
|
||||
|
||||
|
||||
import pdb; pdb.set_trace()
|
||||
|
||||
|
||||
model_path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/opt_350m"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_path)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
|
||||
|
||||
def formatting_prompts_func(example):
|
||||
import pdb; pdb.set_trace()
|
||||
output_texts = []
|
||||
for i in range(len(example['instruction'])):
|
||||
text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
|
||||
output_texts.append(text)
|
||||
return output_texts
|
||||
|
||||
response_template = " ### Answer:"
|
||||
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
|
||||
|
||||
trainer = SFTTrainer(
|
||||
model,
|
||||
train_dataset=train_dataset,
|
||||
formatting_func=formatting_prompts_func,
|
||||
data_collator=collator,
|
||||
)
|
||||
trainer.train()
|
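Two notes on the test script above. First, item assignment on a Hugging Face Dataset (train_dataset[i]['text'] = ...) does not persist, since indexing returns a copy; a map transform is needed to actually attach the memory text. Second, DataCollatorForCompletionOnlyLM masks the labels of every token up to the " ### Answer:" template, so the loss is computed only on the completion. A minimal sketch of the map-based variant, following the names used in the script:

import random

def attach_memory(example, idx, dataset=None):
    # Prepend a randomly chosen earlier example as "memory", mirroring what the
    # in-place assignment above appears to intend.
    j = random.randint(0, max(idx - 1, 0))
    memory = dataset[j]
    memory_text = f"Instruction: {memory['instruction']}\n Answer: {memory['output']} \n"
    example["text"] = f"{memory_text} Instruction: {example['instruction']}"
    return example

# train_dataset = train_dataset.map(attach_memory, with_indices=True,
#                                   fn_kwargs={"dataset": train_dataset})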