diff --git a/evaluation/coco_caption.py b/evaluation/coco_caption.py
new file mode 100644
index 0000000..b6179b8
--- /dev/null
+++ b/evaluation/coco_caption.py
@@ -0,0 +1,94 @@
+import os
+import json
+import ast
+import pandas as pd
+from tqdm import tqdm
+
+from pycocoevalcap.eval import COCOEvalCap
+from collections import defaultdict
+
+class COCO_Annotation:
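+    # Lightweight stand-in for pycocotools' COCO: COCOEvalCap only reads .getImgIds() and .imgToAnns from it.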
+ def __init__(self, annotation_file):
+ self.coco_cn_file = annotation_file
+ self.imgToAnns = self.build_imgToAnns()
+
+ def build_imgToAnns(self):
+ imgToAnns = defaultdict(list)
+ with open(self.coco_cn_file, "r", encoding="UTF-8") as fin:
+ for line in fin:
+ line = line.strip()
+                temp = ast.literal_eval(line)  # each line holds a Python dict literal; parse it safely instead of eval()
+ annotations = temp['annotations']
+ for ann in annotations:
+ image_id = str(ann['image_id']).zfill(6)
+ imgToAnns[image_id].append({'image_id':image_id,'caption':ann['caption'],'image': ann['image_id']})
+ return imgToAnns
+
+ def getImgIds(self):
+ return self.imgToAnns.keys()
+
+class COCO_Result:
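+    # Wraps the prediction file with the same imgToAnns interface; the last 6 characters of each question_id recover the zero-padded COCO image id.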
+ def __init__(self,result_file):
+ self.coco_cn_file = result_file
+ self.imgToAnns = self.build_imgToAnns()
+
+ def build_imgToAnns(self):
+ imgToAnns = dict()
+        with open(self.coco_cn_file, "r") as fin:
+            data = json.load(fin)
+ for d in data:
+            image_id = str(d['question_id'])[-6:]  # last 6 digits of the question_id encode the COCO image id
+            imgToAnns[image_id] = [{'image_id': image_id, 'caption': d['answer']}]
+ return imgToAnns
+
+def coco_caption_eval(results_file, split_name):
+ files = {
+ "val":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json",
+ "test":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json"
+ }
+
+ # create coco object and coco_result object
+ annotation_file = files[split_name]
+ coco = COCO_Annotation(annotation_file)
+ coco_result = COCO_Result(results_file)
+
+ # create coco_eval object by taking coco and coco_result
+ coco_eval = COCOEvalCap(coco, coco_result)
+
+    # to evaluate only a subset of images, set:
+    # coco_eval.params['image_id'] = coco_result.getImgIds()
+    # leave it unset to evaluate the full split
+
+ # evaluate results
+ # SPICE will take a few minutes the first time, but speeds up due to caching
+ coco_eval.evaluate()
+
+ # print output evaluation scores
+ for metric, score in coco_eval.eval.items():
+ print(f"{metric}: {score:.3f}")
+
+ return coco_eval
+
+
+def main():
+ result_file = "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/mix_coco_gqa_cap_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0302/20240302231/result/val_vqa_result_coco_cap.json"
+ split_name = "val"
+ coco_val = coco_caption_eval(result_file, split_name)
+
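+    # aggregate metric: CIDEr + BLEU-4, the combination LAVIS caption tasks use for model selection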
+ agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"]
+
+ # log_stats = {split_name: {k: v for k, v in coco_val.eval.items()}}
+ # with open(
+ # os.path.join(registry.get_path("output_dir"), "evaluate.txt"), "a"
+ # ) as f:
+ # f.write(json.dumps(log_stats) + "\n")
+
+ coco_res = {k: v for k, v in coco_val.eval.items()}
+ coco_res["agg_metrics"] = agg_metrics
+
+ print(coco_res)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/examples/ad_1.png b/examples/ad_1.png
deleted file mode 100644
index d0378e4..0000000
Binary files a/examples/ad_1.png and /dev/null differ
diff --git a/examples/ad_2.png b/examples/ad_2.png
deleted file mode 100644
index 674248b..0000000
Binary files a/examples/ad_2.png and /dev/null differ
diff --git a/examples/cook_1.png b/examples/cook_1.png
deleted file mode 100644
index d8cdb45..0000000
Binary files a/examples/cook_1.png and /dev/null differ
diff --git a/examples/cook_2.png b/examples/cook_2.png
deleted file mode 100644
index d08272b..0000000
Binary files a/examples/cook_2.png and /dev/null differ
diff --git a/examples/describe_1.png b/examples/describe_1.png
deleted file mode 100644
index 02f3c92..0000000
Binary files a/examples/describe_1.png and /dev/null differ
diff --git a/examples/describe_2.png b/examples/describe_2.png
deleted file mode 100644
index 20bf8c7..0000000
Binary files a/examples/describe_2.png and /dev/null differ
diff --git a/examples/fact_1.png b/examples/fact_1.png
deleted file mode 100644
index 1f75228..0000000
Binary files a/examples/fact_1.png and /dev/null differ
diff --git a/examples/fact_2.png b/examples/fact_2.png
deleted file mode 100644
index de6ef53..0000000
Binary files a/examples/fact_2.png and /dev/null differ
diff --git a/examples/fix_1.png b/examples/fix_1.png
deleted file mode 100644
index 023cfe6..0000000
Binary files a/examples/fix_1.png and /dev/null differ
diff --git a/examples/fix_2.png b/examples/fix_2.png
deleted file mode 100644
index f60da5f..0000000
Binary files a/examples/fix_2.png and /dev/null differ
diff --git a/examples/fun_1.png b/examples/fun_1.png
deleted file mode 100644
index f720ea6..0000000
Binary files a/examples/fun_1.png and /dev/null differ
diff --git a/examples/fun_2.png b/examples/fun_2.png
deleted file mode 100644
index 1d37a80..0000000
Binary files a/examples/fun_2.png and /dev/null differ
diff --git a/examples/logo_1.png b/examples/logo_1.png
deleted file mode 100644
index 8bbe438..0000000
Binary files a/examples/logo_1.png and /dev/null differ
diff --git a/examples/op_1.png b/examples/op_1.png
deleted file mode 100644
index 3dbb2ff..0000000
Binary files a/examples/op_1.png and /dev/null differ
diff --git a/examples/op_2.png b/examples/op_2.png
deleted file mode 100644
index 2cd3e1f..0000000
Binary files a/examples/op_2.png and /dev/null differ
diff --git a/examples/people_1.png b/examples/people_1.png
deleted file mode 100644
index 7e95c42..0000000
Binary files a/examples/people_1.png and /dev/null differ
diff --git a/examples/people_2.png b/examples/people_2.png
deleted file mode 100644
index aec6c83..0000000
Binary files a/examples/people_2.png and /dev/null differ
diff --git a/examples/rhyme_1.png b/examples/rhyme_1.png
deleted file mode 100644
index 7d13387..0000000
Binary files a/examples/rhyme_1.png and /dev/null differ
diff --git a/examples/rhyme_2.png b/examples/rhyme_2.png
deleted file mode 100644
index 6cf9bf8..0000000
Binary files a/examples/rhyme_2.png and /dev/null differ
diff --git a/examples/story_1.png b/examples/story_1.png
deleted file mode 100644
index 3eb6ccb..0000000
Binary files a/examples/story_1.png and /dev/null differ
diff --git a/examples/story_2.png b/examples/story_2.png
deleted file mode 100644
index 9d37142..0000000
Binary files a/examples/story_2.png and /dev/null differ
diff --git a/examples/web_1.png b/examples/web_1.png
deleted file mode 100644
index 8943842..0000000
Binary files a/examples/web_1.png and /dev/null differ
diff --git a/examples/wop_1.png b/examples/wop_1.png
deleted file mode 100644
index 88f37d6..0000000
Binary files a/examples/wop_1.png and /dev/null differ
diff --git a/examples/wop_2.png b/examples/wop_2.png
deleted file mode 100644
index 8255974..0000000
Binary files a/examples/wop_2.png and /dev/null differ
diff --git a/examples_v2/2000x1372_wmkn_0012149409555.jpg b/examples_v2/2000x1372_wmkn_0012149409555.jpg
deleted file mode 100755
index 1250f7f..0000000
Binary files a/examples_v2/2000x1372_wmkn_0012149409555.jpg and /dev/null differ
diff --git a/examples_v2/KFC-20-for-20-Nuggets.jpg b/examples_v2/KFC-20-for-20-Nuggets.jpg
deleted file mode 100755
index 0ec641c..0000000
Binary files a/examples_v2/KFC-20-for-20-Nuggets.jpg and /dev/null differ
diff --git a/examples_v2/cockdial.png b/examples_v2/cockdial.png
deleted file mode 100755
index 935f98e..0000000
Binary files a/examples_v2/cockdial.png and /dev/null differ
diff --git a/examples_v2/float.png b/examples_v2/float.png
deleted file mode 100755
index 900dcb0..0000000
Binary files a/examples_v2/float.png and /dev/null differ
diff --git a/examples_v2/glip_test.jpg b/examples_v2/glip_test.jpg
deleted file mode 100755
index f9198f2..0000000
Binary files a/examples_v2/glip_test.jpg and /dev/null differ
diff --git a/examples_v2/office.jpg b/examples_v2/office.jpg
deleted file mode 100755
index e35bdc2..0000000
Binary files a/examples_v2/office.jpg and /dev/null differ
diff --git a/examples_v2/sofa.jpg b/examples_v2/sofa.jpg
deleted file mode 100755
index 8610591..0000000
Binary files a/examples_v2/sofa.jpg and /dev/null differ
diff --git a/examples_v2/thief.png b/examples_v2/thief.png
deleted file mode 100755
index 579ee52..0000000
Binary files a/examples_v2/thief.png and /dev/null differ
diff --git a/minigpt4/configs/datasets/aokvqa/defaults.yaml b/minigpt4/configs/datasets/aokvqa/defaults.yaml
index 7bbd26b..bfbd821 100755
--- a/minigpt4/configs/datasets/aokvqa/defaults.yaml
+++ b/minigpt4/configs/datasets/aokvqa/defaults.yaml
@@ -16,11 +16,16 @@ datasets:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_train.json
- # val:
- # url:
- # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
- # storage:
- # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
+ val:
+ url:
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
+ storage:
+ - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
+ test:
+ url:
+ - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
+ storage:
+ - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json
# test:
# url:
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
diff --git a/minigpt4/configs/datasets/coco/caption.yaml b/minigpt4/configs/datasets/coco/caption.yaml
index 8e96a13..8d62c89 100644
--- a/minigpt4/configs/datasets/coco/caption.yaml
+++ b/minigpt4/configs/datasets/coco/caption.yaml
@@ -17,14 +17,14 @@ datasets:
# md5: aa31ac474cf6250ebb81d18348a07ed8
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_train.json
- # val:
- # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
- # storage:
- # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
- # test:
- # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
- # storage:
- # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
+ val:
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
+ storage:
+ - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
+ test:
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
+ storage:
+ - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
images:
storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
diff --git a/minigpt4/configs/datasets/coco/caption_eval.yaml b/minigpt4/configs/datasets/coco/caption_eval.yaml
new file mode 100644
index 0000000..5a2a17f
--- /dev/null
+++ b/minigpt4/configs/datasets/coco/caption_eval.yaml
@@ -0,0 +1,26 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+datasets:
+ coco_caption: # name of the dataset builder
+ # dataset_card: dataset_card/coco_caption.md
+ # data_dir: ${env.data_dir}/datasets
+ data_type: images # [images|videos|features]
+
+ build_info:
+ # Be careful not to append minus sign (-) before split to avoid itemizing
+ annotations:
+ val:
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
+ storage:
+ - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
+ test:
+ url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
+ storage:
+ - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
+
+ images:
+ storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
+
diff --git a/minigpt4/datasets/builders/image_text_pair_builder.py b/minigpt4/datasets/builders/image_text_pair_builder.py
index 4f6db0c..5317354 100644
--- a/minigpt4/datasets/builders/image_text_pair_builder.py
+++ b/minigpt4/datasets/builders/image_text_pair_builder.py
@@ -14,7 +14,7 @@ from minigpt4.datasets.datasets.flickr import GroundedDetailDataset,CaptionToObj
from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset
from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset
from minigpt4.datasets.datasets.gqa_datasets import GQADataset, GQAEvalDataset
-from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset
+from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset
from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset
from minigpt4.datasets.datasets.ok_vqa_datasets import OKVQADataset, OKVQAEvalDataset
from minigpt4.datasets.datasets.ocrvqa_dataset import OCRVQADataset
@@ -384,7 +384,7 @@ class OKVQABuilder(COCOVQABuilder):
@registry.register_builder("aok_vqa")
class AOKVQABuilder(BaseDatasetBuilder):
train_dataset_cls = AOKVQADataset
- eval_dataset_cls = AOKVQADataset
+ eval_dataset_cls = AOKVQAEvalDataset
DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"}
@@ -584,6 +584,7 @@ class COCOCapBuilder(BaseDatasetBuilder):
DATASET_CONFIG_DICT = {
"default": "configs/datasets/coco/caption.yaml",
+ "coco_cap_eval": "configs/datasets/coco/caption_eval.yaml",
}
diff --git a/minigpt4/datasets/datasets/aok_vqa_datasets.py b/minigpt4/datasets/datasets/aok_vqa_datasets.py
index 3768c93..d4b83e5 100755
--- a/minigpt4/datasets/datasets/aok_vqa_datasets.py
+++ b/minigpt4/datasets/datasets/aok_vqa_datasets.py
@@ -13,7 +13,7 @@ import torch
from PIL import Image
-from minigpt4.datasets.datasets.vqa_datasets import VQADataset #, VQAEvalDataset
+from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset
class __DisplMixin:
@@ -37,11 +37,11 @@ class AOKVQADataset(VQADataset, __DisplMixin):
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
self.instruction_pool =[
- '{}',
- 'Q: {} A: ',
- 'Based on the image, respond to this question with a short answer: {}',
- '{} A short answer to the question is ',
- 'Question: {} Short answer:',
+ '{} Choose from {}.',
+ 'Q: {} Multi Choices: {} A: ',
+ 'Question: {} Multi Choices: {} Answer: ',
+ "{} Choose one from the following possible answers: {}. ",
+ '{} Choose from {}. The answer is',
]
exist_annotation = []
@@ -63,25 +63,19 @@ class AOKVQADataset(VQADataset, __DisplMixin):
image = self.vis_processor(image)
question = self.text_processor(ann["question"])
- answer_key = "direct_answers"
-
- answer_weight = {}
- for answer in ann[answer_key]:
- if answer in answer_weight.keys():
- answer_weight[answer] += 1 / len(ann[answer_key])
- else:
- answer_weight[answer] = 1 / len(ann[answer_key])
-
- answers = list(answer_weight.keys())
- weights = list(answer_weight.values())
-
- answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights
+ answer_lst = ann["choices"]
+ direct_answers = ann["direct_answers"]
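+        # default to a randomly sampled direct answer, then prefer any multiple-choice option that also appears in direct_answers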
+ final_answer = random.choices(direct_answers, k=1)[0]
+ for answer in answer_lst:
+ if answer in direct_answers:
+ final_answer = answer
return {
"image": image,
"image_id": ann["image"],
"question": question,
- "answer": answer,
+ "answer": final_answer,
+ "choices": ", ".join(answer_lst)
}
def __getitem__(self, index):
@@ -90,7 +84,7 @@ class AOKVQADataset(VQADataset, __DisplMixin):
answer = self.text_processor(data['answer'])
q_input = question
- llm_input = random.choice(self.instruction_pool).format(question)
+ llm_input = random.choice(self.instruction_pool).format(question, data["choices"])
return {
"image": data['image'],
@@ -104,25 +98,103 @@ class AOKVQADataset(VQADataset, __DisplMixin):
}
-class AOKVQGDataset(AOKVQADataset):
-
+class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
- super().__init__(vis_processor, text_processor, vis_root, ann_paths)
- self.instruction_pool = [
- 'Given the image, generate a question whose answer is: {}',
- 'Based on the image, provide a question with the answer: {}',
- 'Given the visual representation, create a question for which the answer is "{}"',
- 'From the image provided, craft a question that leads to the reply: {}',
- 'Considering the picture, come up with a question where the answer is: {}',
- 'Taking the image into account, generate an question that has the answer: {}'
- ]
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
- def __getitem__(self, index):
- data = self.get_data(index)
- instruction = random.choice(self.instruction_pool).format(data['answer'])
+ self.vis_root = vis_root
+
+ self.annotation = json.load(open(ann_paths[0]))
+
+ self.instruction_pool =[
+ '{} Choose from {}.',
+ 'Q: {} Multi Choices: {} A: ',
+ 'Question: {} Multi Choices: {} Answer: ',
+ "{} Choose one from the following possible answers: {}. ",
+ '{} Choose from {}. The answer is',
+ ]
+
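+        # ann_paths may optionally carry COCO-format question/annotation files as its 3rd and 4th entries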
+ try:
+ self.coco_fmt_qust_file = ann_paths[2]
+ self.coco_fmt_anno_file = ann_paths[3]
+ except IndexError:
+ self.coco_fmt_qust_file = None
+ self.coco_fmt_anno_file = None
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+ self.source = 'aokvqa'
+
+ def collater(self, samples):
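+        # batch the samples: stack image tensors, keep the remaining per-sample fields as plain lists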
+ (
+ image_list,
+ question_list,
+ question_id_list,
+ choices_list,
+ correct_choice_idx_list,
+ direct_answers_list,
+ llm_input_list,
+ q_input_list,
+ source_list,
+ ) = ([], [], [], [], [], [], [], [], [])
+
+ for sample in samples:
+ image_list.append(sample["image"])
+ question_list.append(sample["text_input"])
+ question_id_list.append(sample["question_id"])
+ choices_list.append(sample["choices"])
+ correct_choice_idx_list.append(sample["correct_choice_idx"])
+ direct_answers_list.append(sample["direct_answers"])
+ llm_input_list.append(sample["llm_input"])
+ q_input_list.append(sample["q_input"])
+ source_list.append(sample["source"])
return {
- "image": data['image'],
- "instruction_input": instruction,
- "answer": data['question'],
+ "image": torch.stack(image_list, dim=0),
+ "text_input": question_list,
+ "question_id": question_id_list,
+ "choices": choices_list,
+ "correct_choice_idx": correct_choice_idx_list,
+ "direct_answers": direct_answers_list,
+ "llm_input": llm_input_list,
+ "q_input": q_input_list,
+ "source": source_list,
}
+
+ def __getitem__(self, index):
+ ann = self.annotation[index]
+
+ image_path = os.path.join(self.vis_root, ann["image"])
+ image = Image.open(image_path).convert("RGB")
+
+ image = self.vis_processor(image)
+ question = self.text_processor(ann["question"])
+
+ choices = ann["choices"]
+ if "correct_choice_idx" in ann:
+ correct_choice_idx = ann["correct_choice_idx"]
+ else:
+ correct_choice_idx = None
+
+ if "direct_answers" in ann:
+ direct_answers = ann["direct_answers"]
+ else:
+ direct_answers = None
+
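+        # build the LLM prompt from a random instruction template, filling in the question and the comma-joined choices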
+ llm_input = random.choice(self.instruction_pool).format(question, ", ".join(choices))
+
+ return {
+ "image": image,
+ "q_input": question,
+ "llm_input": llm_input,
+ "text_input": question,
+ "question_id": ann["question_id"],
+ "choices": choices,
+ "correct_choice_idx": correct_choice_idx,
+ "direct_answers": direct_answers,
+ "source": 'aokvqa',
+ }
+
\ No newline at end of file
diff --git a/minigpt4/datasets/datasets/caption_datasets.py b/minigpt4/datasets/datasets/caption_datasets.py
index 6b74cb5..9354adc 100644
--- a/minigpt4/datasets/datasets/caption_datasets.py
+++ b/minigpt4/datasets/datasets/caption_datasets.py
@@ -59,83 +59,7 @@ class CaptionDataset(BaseDataset, __DisplMixin):
"text_input": caption,
"image_id": self.img_ids[ann["image_id"]],
}
-
-
-
-class COCOCaptionDataset(BaseDataset, __DisplMixin):
- def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
- """
- vis_root (string): Root directory of images (e.g. coco/images/)
- ann_root (string): directory to store the annotation file
- """
- super().__init__(vis_processor, text_processor, vis_root, ann_paths)
-
- self.img_ids = {}
- n = 0
-
- self.filter_anntation = []
-
- for ann in self.annotation:
- if "train" in ann["image"]:
- self.filter_anntation.append(ann)
- self.annotation = self.filter_anntation
-
- for ann in self.annotation:
- img_id = ann["image_id"]
- if img_id not in self.img_ids.keys():
- self.img_ids[img_id] = n
- n += 1
-
- self.instruction_pool = [
- 'Briefly describe this image.',
- 'Provide a concise depiction of this image.',
- 'Present a short description of this image.',
- 'Summarize this image in a few words.',
- 'A short image caption:',
- 'A short image description:',
- 'A photo of ',
- 'An image that shows ',
- 'Write a short description for the image. ',
- 'Write a description for the photo.',
- 'Provide a description of what is presented in the photo.',
- 'Briefly describe the content of the image.',
- 'Can you briefly explain what you see in the image?',
- 'Could you use a few words to describe what you perceive in the photo?',
- 'Please provide a short depiction of the picture.',
- 'Using language, provide a short account of the image.',
- 'Use a few words to illustrate what is happening in the picture.',
- ]
- self.source = 'coco_cap'
-
- def __getitem__(self, index):
-
- # TODO this assumes image input, not general enough
- ann = self.annotation[index]
-
- # img_file = ann["image"].split("/")[-1]
- img_file = ann["image"]
- image_path = os.path.join(self.vis_root, img_file)
- image = Image.open(image_path).convert("RGB")
-
- image = self.vis_processor(image)
- caption = self.text_processor(ann["caption"])
-
- # instruction = random.choice(self.instruction_pool)
-        # instruction = "<Img><ImageHere></Img> [caption] {} ".format(instruction)
- q_input = ""
- llm_input = random.choice(self.instruction_pool)
-
- return {
- "image": image,
- "image_id": ann["image"],
- "answer": caption,
- "q_input": q_input,
- "llm_input": llm_input,
- "text_input": llm_input,
- "text_output": caption,
- "source": 'coco_cap',
- }
-
+
class CaptionEvalDataset(BaseDataset, __DisplMixin):
def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
"""
@@ -151,7 +75,7 @@ class CaptionEvalDataset(BaseDataset, __DisplMixin):
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
-
+
image = self.vis_processor(image)
return {
@@ -159,3 +83,4 @@ class CaptionEvalDataset(BaseDataset, __DisplMixin):
"image_id": ann["image_id"],
"instance_id": ann["instance_id"],
}
+
diff --git a/minigpt4/datasets/datasets/coco_caption.py b/minigpt4/datasets/datasets/coco_caption.py
index e388956..a33dda4 100755
--- a/minigpt4/datasets/datasets/coco_caption.py
+++ b/minigpt4/datasets/datasets/coco_caption.py
@@ -9,18 +9,102 @@ import os
import json
import torch
import numpy as np
+import random
from PIL import Image
from PIL import ImageFile
+from collections import OrderedDict
ImageFile.LOAD_TRUNCATED_IMAGES = True
-from minigpt4.datasets.datasets.caption_datasets import COCOCaptionDataset, CaptionEvalDataset
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from minigpt4.datasets.datasets.caption_datasets import CaptionDataset, CaptionEvalDataset
-COCOCapDataset = COCOCaptionDataset
+class __DisplMixin:
+ def displ_item(self, index):
+ sample, ann = self.__getitem__(index), self.annotation[index]
+ return OrderedDict(
+ {
+ "file": ann["image"],
+ "caption": ann["caption"],
+ "image": sample["image"],
+ }
+ )
+
+class COCOCapDataset(BaseDataset, __DisplMixin):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+ ann_root (string): directory to store the annotation file
+ """
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
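+        # keep only train-split annotations and build a contiguous index over image ids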
+ self.img_ids = {}
+ n = 0
+        self.filter_annotation = []
+
+        for ann in self.annotation:
+            if "train" in ann["image"]:
+                self.filter_annotation.append(ann)
+        self.annotation = self.filter_annotation
+
+ for ann in self.annotation:
+ img_id = ann["image_id"]
+ if img_id not in self.img_ids.keys():
+ self.img_ids[img_id] = n
+ n += 1
+
+ self.instruction_pool = [
+ 'Briefly describe this image.',
+ 'Provide a concise depiction of this image.',
+ 'Present a short description of this image.',
+ 'Summarize this image in a few words.',
+ 'A short image caption:',
+ 'A short image description:',
+ 'A photo of ',
+ 'An image that shows ',
+ 'Write a short description for the image. ',
+ 'Write a description for the photo.',
+ 'Provide a description of what is presented in the photo.',
+ 'Briefly describe the content of the image.',
+ 'Can you briefly explain what you see in the image?',
+ 'Could you use a few words to describe what you perceive in the photo?',
+ 'Please provide a short depiction of the picture.',
+ 'Using language, provide a short account of the image.',
+ 'Use a few words to illustrate what is happening in the picture.',
+ ]
+ self.source = 'coco_cap'
+
+ def __getitem__(self, index):
+
+ # TODO this assumes image input, not general enough
+ ann = self.annotation[index]
+
+ # img_file = ann["image"].split("/")[-1]
+ img_file = ann["image"]
+ image_path = os.path.join(self.vis_root, img_file)
+ image = Image.open(image_path).convert("RGB")
+
+ image = self.vis_processor(image)
+ caption = self.text_processor(ann["caption"])
+
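+        # the same randomly chosen instruction serves as both the Q-Former text input (q_input) and the LLM prompt (llm_input)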
+ instruction = random.choice(self.instruction_pool)
+ # q_input = ""
+ q_input = instruction
+ llm_input = instruction
+
+ return {
+ "image": image,
+ "image_id": ann["image"],
+ "answer": caption,
+ "q_input": q_input,
+ "llm_input": llm_input,
+ "text_input": llm_input,
+ "text_output": caption,
+ "source": 'coco_cap',
+ }
class COCOCapEvalDataset(CaptionEvalDataset):
@@ -31,6 +115,26 @@ class COCOCapEvalDataset(CaptionEvalDataset):
split (string): val or test
"""
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+ self.instruction_pool = [
+ 'Briefly describe this image.',
+ 'Provide a concise depiction of this image.',
+ 'Present a short description of this image.',
+ 'Summarize this image in a few words.',
+ 'A short image caption:',
+ 'A short image description:',
+ 'A photo of ',
+ 'An image that shows ',
+ 'Write a short description for the image. ',
+ 'Write a description for the photo.',
+ 'Provide a description of what is presented in the photo.',
+ 'Briefly describe the content of the image.',
+ 'Can you briefly explain what you see in the image?',
+ 'Could you use a few words to describe what you perceive in the photo?',
+ 'Please provide a short depiction of the picture.',
+ 'Using language, provide a short account of the image.',
+ 'Use a few words to illustrate what is happening in the picture.',
+ ]
self.source = 'coco_cap'
def __getitem__(self, index):
@@ -38,15 +142,25 @@ class COCOCapEvalDataset(CaptionEvalDataset):
image_path = os.path.join(self.vis_root, ann["image"])
image = Image.open(image_path).convert("RGB")
-
- image = self.vis_processor(image)
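+        # guard against unreadable/corrupt images: log the offending path instead of raising here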
+ try:
+ image = self.vis_processor(image)
+ except Exception as e:
+ print(e)
+ print(image_path)
img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]
+ instruction = random.choice(self.instruction_pool)
+ # q_input = ""
+ q_input = instruction
+ llm_input = instruction
return {
"image": image,
"image_id": img_id,
- "instance_id": ann["instance_id"],
+ "text_input":llm_input,
+ "q_input": q_input,
+ "llm_input": llm_input,
+ "source": self.source,
}
diff --git a/minigpt4/datasets/datasets/ok_vqa_datasets.py b/minigpt4/datasets/datasets/ok_vqa_datasets.py
index 20b4494..c0bf799 100755
--- a/minigpt4/datasets/datasets/ok_vqa_datasets.py
+++ b/minigpt4/datasets/datasets/ok_vqa_datasets.py
@@ -149,7 +149,6 @@ class OKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
self.source = 'okvqa'
self.annotation_add = self.get_data()
- self._add_instance_ids()
def get_data(self):
ann_instruct = list()
@@ -180,7 +179,6 @@ class OKVQAEvalDataset(VQAEvalDataset, __DisplMixin):
"image_id": ann["image"],
'image_path': image_path,
"question_id": ann["question_id"],
- # "instance_id": ann["instance_id"],
"question": question,
"q_input": q_input,
"llm_input": llm_input,
diff --git a/minigpt4/models/QformerMoE.py b/minigpt4/models/QformerMoE.py
index 5cc8c1f..addacc5 100644
--- a/minigpt4/models/QformerMoE.py
+++ b/minigpt4/models/QformerMoE.py
@@ -45,7 +45,6 @@ from transformers.utils import logging
from transformers.models.bert.configuration_bert import BertConfig
from minigpt4.models.moe.utils import (
- FeedForward,
MoEModelOutput,
MoEModelOutputWithPooling,
use_experts,
diff --git a/minigpt4/models/QformerMoELN.py b/minigpt4/models/QformerMoELN.py
new file mode 100644
index 0000000..9ef1f6b
--- /dev/null
+++ b/minigpt4/models/QformerMoELN.py
@@ -0,0 +1,1276 @@
+"""
+ * Copyright (c) 2023, salesforce.com, inc.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+ * By Junnan Li
+ * Based on huggingface code base
+ * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
+"""
+
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Tuple, Dict, Any
+
+import torch
+from torch import Tensor, device, dtype, nn
+import torch.utils.checkpoint
+from torch.nn import CrossEntropyLoss
+import torch.nn.functional as F
+
+from transformers.activations import ACT2FN
+from transformers.file_utils import (
+ ModelOutput,
+)
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ CausalLMOutputWithCrossAttentions,
+ MaskedLMOutput,
+ MultipleChoiceModelOutput,
+ NextSentencePredictorOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
+from transformers.modeling_utils import (
+ PreTrainedModel,
+ apply_chunking_to_forward,
+ find_pruneable_heads_and_indices,
+ prune_linear_layer,
+)
+from transformers.utils import logging
+from transformers.models.bert.configuration_bert import BertConfig
+
+from minigpt4.models.moe.utils import (
+ MoEModelOutput,
+ MoEModelOutputWithPooling,
+ use_experts,
+)
+from minigpt4.models.moe.moe_layer import MoELayer
+
+logging.set_verbosity_error() # ignore warning : Some weights of BertLMHeadModel were not initialized from the model checkpoint...
+logger = logging.get_logger(__name__)
+
+# from visualizer import get_local
+
+class BertEmbeddings(nn.Module):
+ """Construct the embeddings from word and position embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(
+ config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
+ )
+ self.position_embeddings = nn.Embedding(
+ config.max_position_embeddings, config.hidden_size
+ )
+
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+ # any TensorFlow checkpoint file
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.register_buffer(
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))
+ )
+ self.position_embedding_type = getattr(
+ config, "position_embedding_type", "absolute"
+ )
+
+ self.config = config
+
+ def forward(
+ self,
+ input_ids=None,
+ position_ids=None,
+ query_embeds=None,
+ past_key_values_length=0,
+ ):
+ if input_ids is not None:
+ seq_length = input_ids.size()[1]
+ else:
+ seq_length = 0
+
+ if position_ids is None:
+ position_ids = self.position_ids[
+ :, past_key_values_length : seq_length + past_key_values_length
+ ].clone()
+
+ if input_ids is not None:
+ embeddings = self.word_embeddings(input_ids)
+ if self.position_embedding_type == "absolute":
+ position_embeddings = self.position_embeddings(position_ids)
+ embeddings = embeddings + position_embeddings
+
+ if query_embeds is not None:
+ embeddings = torch.cat((query_embeds, embeddings), dim=1)
+ else:
+ embeddings = query_embeds
+
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class BertSelfAttention(nn.Module):
+ def __init__(self, config, is_cross_attention):
+ super().__init__()
+ self.config = config
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
+ config, "embedding_size"
+ ):
+ raise ValueError(
+ "The hidden size (%d) is not a multiple of the number of attention "
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+ )
+
+ self.num_attention_heads = config.num_attention_heads # 12
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads) # 64
+ self.all_head_size = self.num_attention_heads * self.attention_head_size # 768
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768)
+ if is_cross_attention:
+ self.key = nn.Linear(config.encoder_width, self.all_head_size) # nn.Linear(1408, 768)
+ self.value = nn.Linear(config.encoder_width, self.all_head_size) # nn.Linear(1408, 768)
+ else:
+ self.key = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+ self.position_embedding_type = getattr(
+ config, "position_embedding_type", "absolute"
+ )
+ if (
+ self.position_embedding_type == "relative_key"
+ or self.position_embedding_type == "relative_key_query"
+ ):
+ self.max_position_embeddings = config.max_position_embeddings
+ self.distance_embedding = nn.Embedding(
+ 2 * config.max_position_embeddings - 1, self.attention_head_size
+ )
+ self.save_attention = False
+
+ def save_attn_gradients(self, attn_gradients):
+ self.attn_gradients = attn_gradients
+
+ def get_attn_gradients(self):
+ return self.attn_gradients
+
+ def save_attention_map(self, attention_map):
+ self.attention_map = attention_map
+
+ def get_attention_map(self):
+ return self.attention_map
+
+ def transpose_for_scores(self, x):
+ new_x_shape = x.size()[:-1] + (
+ self.num_attention_heads,
+ self.attention_head_size,
+ ) # torch.Size([1, 257, 12, 64])
+ x = x.view(*new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ ):
+
+ # If this is instantiated as a cross-attention module, the keys
+ # and values come from an encoder; the attention mask needs to be
+ # such that the encoder's padding tokens are not attended to.
+ is_cross_attention = encoder_hidden_states is not None
+
+ if is_cross_attention:
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) # encoder_hidden_states:[bz,257,1408], torch.Size([1, 12, 257, 64])
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) # torch.Size([1, 12, 257, 64])
+ attention_mask = encoder_attention_mask
+ elif past_key_value is not None:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+ else:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+ mixed_query_layer = self.query(hidden_states)
+
+ query_layer = self.transpose_for_scores(mixed_query_layer) # torch.Size([1, 12, 41, 64])
+
+ past_key_value = (key_layer, value_layer) # torch.Size([1, 12, 41, 257])
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ if (
+ self.position_embedding_type == "relative_key"
+ or self.position_embedding_type == "relative_key_query"
+ ):
+ seq_length = hidden_states.size()[1]
+ position_ids_l = torch.arange(
+ seq_length, dtype=torch.long, device=hidden_states.device
+ ).view(-1, 1)
+ position_ids_r = torch.arange(
+ seq_length, dtype=torch.long, device=hidden_states.device
+ ).view(1, -1)
+ distance = position_ids_l - position_ids_r
+ positional_embedding = self.distance_embedding(
+ distance + self.max_position_embeddings - 1
+ )
+ positional_embedding = positional_embedding.to(
+ dtype=query_layer.dtype
+ ) # fp16 compatibility
+
+ if self.position_embedding_type == "relative_key":
+ relative_position_scores = torch.einsum(
+ "bhld,lrd->bhlr", query_layer, positional_embedding
+ )
+ attention_scores = attention_scores + relative_position_scores
+ elif self.position_embedding_type == "relative_key_query":
+ relative_position_scores_query = torch.einsum(
+ "bhld,lrd->bhlr", query_layer, positional_embedding
+ )
+ relative_position_scores_key = torch.einsum(
+ "bhrd,lrd->bhlr", key_layer, positional_embedding
+ )
+ attention_scores = (
+ attention_scores
+ + relative_position_scores_query
+ + relative_position_scores_key
+ )
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+ # extended_attention_mask
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+ if is_cross_attention and self.save_attention:
+ self.save_attention_map(attention_probs)
+ attention_probs.register_hook(self.save_attn_gradients)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs_dropped = self.dropout(attention_probs) # torch.Size([1, 12, 41, 257])
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs_dropped = attention_probs_dropped * head_mask
+
+ context_layer = torch.matmul(attention_probs_dropped, value_layer) # torch.Size([1, 12, 41, 64])
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(*new_context_layer_shape) # torch.Size([1, 41, 768])
+
+ outputs = (
+ (context_layer, attention_probs) if output_attentions else (context_layer,)
+ )
+
+ outputs = outputs + (past_key_value,)
+ return outputs
+
+
+class BertSelfOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class BertAttention(nn.Module):
+ def __init__(self, config, is_cross_attention=False):
+ super().__init__()
+ self.self = BertSelfAttention(config, is_cross_attention)
+ self.output = BertSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads,
+ self.self.num_attention_heads,
+ self.self.attention_head_size,
+ self.pruned_heads,
+ )
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = (
+ self.self.attention_head_size * self.self.num_attention_heads
+ )
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ ):
+ self_outputs = self.self(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ )
+ attention_output = self.output(self_outputs[0], hidden_states)
+
+ outputs = (attention_output,) + self_outputs[
+ 1:
+ ] # add attentions if we output them
+ return outputs
+
+
+class BertIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class BertOutput(nn.Module): # Add & Norm
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class FeedForward(nn.Module):
+ # Add LayerNorm
+ def __init__(self, config):
+ super().__init__()
+ self.intermediate = BertIntermediate(config)
+ self.output = BertOutput(config)
+
+ def forward(self, hidden_states: Tensor):
+ intermediate_output = self.intermediate(hidden_states)
+ layer_output = self.output(intermediate_output, hidden_states)
+ return layer_output
+
+
+class BertLayer(nn.Module):
+ def __init__(self, config, layer_num):
+ super().__init__()
+ self.config = config
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = BertAttention(config)
+ self.layer_num = layer_num
+ if (
+ self.config.add_cross_attention
+ and layer_num % self.config.cross_attention_freq == 0
+ ):
+ self.crossattention = BertAttention(
+ config, is_cross_attention=self.config.add_cross_attention
+ )
+ self.has_cross_attention = True
+ else:
+ self.has_cross_attention = False
+
+ self.intermediate = BertIntermediate(config)
+ self.output = BertOutput(config)
+
+ self.intermediate_query = BertIntermediate(config)
+ self.output_query = BertOutput(config)
+
+ # Add MoE FFN
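+        # query tokens go through either a per-layer MoE over FeedForward experts (routing method and top-k taken from the config) or a single shared FFN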
+ self.use_experts = use_experts(layer_num)
+ ffn = FeedForward(config)
+ if self.use_experts:
+ self.experts = MoELayer(
+ hidden_size=config.hidden_size,
+ expert=ffn,
+ num_experts=config.moebert_expert_num,
+ route_method=config.moebert_route_method,
+ topk=config.moe_topk,
+ use_balance_loss=config.use_balance_loss,
+ weight_type=config.moe_weight_type,
+ )
+ else:
+ self.experts = ffn
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ query_length=0,
+ ):
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = (
+ past_key_value[:2] if past_key_value is not None else None
+ )
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ past_key_value=self_attn_past_key_value,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:-1]
+
+ present_key_value = self_attention_outputs[-1]
+
+ if query_length > 0:
+ query_attention_output = attention_output[:, :query_length, :]
+
+ if self.has_cross_attention:
+ assert (
+ encoder_hidden_states is not None
+ ), "encoder_hidden_states must be given for cross-attention layers"
+ cross_attention_outputs = self.crossattention(
+ query_attention_output,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+ query_attention_output = cross_attention_outputs[0]
+ outputs = (
+ outputs + cross_attention_outputs[1:-1]
+ ) # add cross attentions if we output attention weights
+
+ # add moe query ffn
+ # query_attention_output size: [bz, query_length+seq_len, 768]
+ # attention_mask size: [bz, 1, 1, query_length+seq_len]
+ moe_ffn_attention_input = query_attention_output[:, :query_length, :]
+ moe_ffn_attention_mask = attention_mask.squeeze(dim=1).squeeze(dim=1)[:, :query_length]
+ layer_output = self.feed_forward_query_moe(moe_ffn_attention_input, moe_ffn_attention_mask) # layer_output, gate_loss, gate_load
+ # import pdb; pdb.set_trace() # test0107
+
+ if attention_output.shape[1] > query_length: # have text input in Qformer
+ layer_output_text = apply_chunking_to_forward(
+ self.feed_forward_chunk,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ attention_output[:, query_length:, :],
+ )
+ layer_output = (torch.cat([layer_output[0], layer_output_text], dim=1), layer_output[1], layer_output[2])
+
+ else:
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ attention_output,
+ )
+ layer_output = (layer_output, 0.0, [])
+ outputs = (layer_output,) + outputs
+
+ outputs = outputs + (present_key_value,)
+
+ return outputs
+
+ def feed_forward_chunk(self, attention_output):
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output
+
+ def feed_forward_query_moe(self, attention_output, expert_attention_mask):
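+        # returns (hidden_states, gate_loss, gate_load); non-MoE layers report a zero balance loss and an empty load list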
+ if not self.use_experts:
+ hidden_states = self.experts(attention_output)
+ return hidden_states, 0.0, []
+
+ hidden_states, gate_loss, gate_load = self.experts(
+ attention_output, expert_attention_mask
+ )
+ return hidden_states, gate_loss, gate_load
+
+class BertEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList(
+ [BertLayer(config, i) for i in range(config.num_hidden_layers)]
+ )
+
+ # @get_local('all_cross_attentions')
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ use_cache=None,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ query_length=0,
+ ):
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ all_cross_attentions = (
+ () if output_attentions and self.config.add_cross_attention else None
+ )
+
+ next_decoder_cache = () if use_cache else None
+ gate_loss = 0.0
+ gate_loads = list()
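+        # accumulate the MoE balance (gate) loss and per-layer expert load statistics across all layers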
+ for i in range(self.config.num_hidden_layers):
+ layer_module = self.layer[i]
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+ past_key_value = past_key_values[i] if past_key_values is not None else None
+
+ if getattr(self.config, "gradient_checkpointing", False) and self.training:
+
+ if use_cache:
+ logger.warn(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(
+ *inputs, past_key_value, output_attentions, query_length
+ )
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(layer_module),
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ )
+ else:
+ layer_outputs = layer_module(
+ hidden_states, #torch.Size([bz, 32+input_len, 768])
+ attention_mask, # torch.Size([bz, 1, 1, 32+input_len])
+ layer_head_mask, # None
+ encoder_hidden_states, # torch.Size([bz, 257, 1408])
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions, # False
+ query_length, # 32
+ )
+ hidden_states = layer_outputs[0][0]
+ gate_loss = gate_loss + layer_outputs[0][1]
+ gate_loads.append(layer_outputs[0][2])
+ if use_cache:
+ next_decoder_cache += (layer_outputs[-1],)
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ next_decoder_cache,
+ all_hidden_states,
+ all_self_attentions,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+
+ return MoEModelOutput(
+ last_hidden_state=hidden_states,
+ past_key_values=next_decoder_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ cross_attentions=all_cross_attentions,
+ gate_loss=gate_loss,
+ gate_loads=gate_loads,
+ )
+
+
+class BertPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+class BertPredictionHeadTransform(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.transform_act_fn = config.hidden_act
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ return hidden_states
+
+
+class BertLMPredictionHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.transform = BertPredictionHeadTransform(config)
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+ self.decoder.bias = self.bias
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = self.decoder(hidden_states)
+ return hidden_states
+
+
+class BertOnlyMLMHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.predictions = BertLMPredictionHead(config)
+
+ def forward(self, sequence_output):
+ prediction_scores = self.predictions(sequence_output)
+ return prediction_scores
+
+
+class BertPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = BertConfig
+ base_model_prefix = "bert"
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+
+class BertModel(BertPreTrainedModel):
+ """
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+ cross-attention is added between the self-attention layers, following the architecture described in `Attention is
+    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as a decoder the model needs to be
+    initialized with the :obj:`is_decoder` argument and :obj:`add_cross_attention` set to :obj:`True`; an
+    :obj:`encoder_hidden_states` is then expected as an input to the forward pass.
+ """
+
+ def __init__(self, config, add_pooling_layer=False):
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = BertEmbeddings(config)
+
+ self.encoder = BertEncoder(config)
+
+ self.pooler = BertPooler(config) if add_pooling_layer else None
+
+ self.init_weights()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ def get_extended_attention_mask(
+ self,
+ attention_mask: Tensor,
+ input_shape: Tuple[int],
+ device: device,
+ is_decoder: bool,
+ has_query: bool = False,
+ ) -> Tensor:
+ """
+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
+
+ Arguments:
+ attention_mask (:obj:`torch.Tensor`):
+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
+ input_shape (:obj:`Tuple[int]`):
+ The shape of the input to the model.
+ device: (:obj:`torch.device`):
+ The device of the input to the model.
+
+ Returns:
+            :obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
+ """
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+
+ if attention_mask.dim() == 3:
+ extended_attention_mask = attention_mask[:, None, :, :]
+ elif attention_mask.dim() == 2:
+ # Provided a padding mask of dimensions [batch_size, seq_length]
+ # - if the model is a decoder, apply a causal mask in addition to the padding mask
+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if is_decoder:
+ batch_size, seq_length = input_shape
+
+ seq_ids = torch.arange(seq_length, device=device)
+ causal_mask = (
+ seq_ids[None, None, :].repeat(batch_size, seq_length, 1)
+ <= seq_ids[None, :, None]
+ )
+
+ # add a prefix ones mask to the causal mask
+ # causal and attention masks must have same type with pytorch version < 1.3
+ causal_mask = causal_mask.to(attention_mask.dtype)
+
+ if causal_mask.shape[1] < attention_mask.shape[1]:
+ prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
+ if has_query: # UniLM style attention mask
+ causal_mask = torch.cat(
+ [
+ torch.zeros(
+ (batch_size, prefix_seq_len, seq_length),
+ device=device,
+ dtype=causal_mask.dtype,
+ ),
+ causal_mask,
+ ],
+ axis=1,
+ )
+ causal_mask = torch.cat(
+ [
+ torch.ones(
+ (batch_size, causal_mask.shape[1], prefix_seq_len),
+ device=device,
+ dtype=causal_mask.dtype,
+ ),
+ causal_mask,
+ ],
+ axis=-1,
+ )
+ extended_attention_mask = (
+ causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
+ )
+ else:
+ extended_attention_mask = attention_mask[:, None, None, :]
+ else:
+ raise ValueError(
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
+ input_shape, attention_mask.shape
+ )
+ )
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and -10000.0 for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+ extended_attention_mask = extended_attention_mask.to(
+ dtype=self.dtype
+ ) # fp16 compatibility
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+ return extended_attention_mask
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ head_mask=None,
+ query_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ is_decoder=False,
+ ):
+ r"""
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+ use_cache (:obj:`bool`, `optional`):
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+ decoding (see :obj:`past_key_values`).
+ """
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ # use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ if input_ids is None:
+ assert (
+ query_embeds is not None
+ ), "You have to specify query_embeds when input_ids is None"
+
+ # past_key_values_length: number of text tokens already cached (cached query tokens are excluded so position ids keep counting text positions only)
+ past_key_values_length = (
+ past_key_values[0][0].shape[2] - self.config.query_length
+ if past_key_values is not None
+ else 0
+ )
+
+ query_length = query_embeds.shape[1] if query_embeds is not None else 0
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ query_embeds=query_embeds,
+ past_key_values_length=past_key_values_length,
+ )
+
+ input_shape = embedding_output.size()[:-1]
+ batch_size, seq_length = input_shape
+ device = embedding_output.device
+
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ ((batch_size, seq_length + past_key_values_length)), device=device
+ )
+
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ if is_decoder:
+ extended_attention_mask = self.get_extended_attention_mask(
+ attention_mask,
+ input_ids.shape,
+ device,
+ is_decoder,
+ has_query=(query_embeds is not None),
+ )
+ else:
+ extended_attention_mask = self.get_extended_attention_mask(
+ attention_mask, input_shape, device, is_decoder
+ )
+
+ # If a 2D or 3D attention mask is provided for the cross-attention
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if encoder_hidden_states is not None:
+ if type(encoder_hidden_states) == list:
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[
+ 0
+ ].size()
+ else:
+ (
+ encoder_batch_size,
+ encoder_sequence_length,
+ _,
+ ) = encoder_hidden_states.size()
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+
+ if type(encoder_attention_mask) == list:
+ encoder_extended_attention_mask = [
+ self.invert_attention_mask(mask) for mask in encoder_attention_mask
+ ]
+ elif encoder_attention_mask is None:
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+ encoder_extended_attention_mask = self.invert_attention_mask(
+ encoder_attention_mask
+ )
+ else:
+ encoder_extended_attention_mask = self.invert_attention_mask(
+ encoder_attention_mask
+ )
+ else:
+ encoder_extended_attention_mask = None
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_extended_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ query_length=query_length,
+ )
+ sequence_output = encoder_outputs[0]
+ pooled_output = (
+ self.pooler(sequence_output) if self.pooler is not None else None
+ )
+
+ if not return_dict:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return MoEModelOutputWithPooling(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ past_key_values=encoder_outputs.past_key_values,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ cross_attentions=encoder_outputs.cross_attentions,
+ gate_loss=encoder_outputs.gate_loss,
+ gate_loads=encoder_outputs.gate_loads,
+ )
+
+
+class BertMoELMHeadModelLNIn(BertPreTrainedModel):
+
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.load_balance_alpha = config.moebert_load_balance
+ self.bert = BertModel(config, add_pooling_layer=False)
+ self.cls = BertOnlyMLMHead(config)
+
+ self.init_weights()
+
+ def get_output_embeddings(self):
+ return self.cls.predictions.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.cls.predictions.decoder = new_embeddings
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ head_mask=None,
+ query_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ labels=None,
+ past_key_values=None,
+ use_cache=True,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ return_logits=False,
+ is_decoder=True,
+ reduction="mean",
+ ):
+ r"""
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+ ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
+ ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+ use_cache (:obj:`bool`, `optional`):
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+ decoding (see :obj:`past_key_values`).
+ Returns:
+ Example::
+ >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
+ >>> import torch
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+ >>> config = BertConfig.from_pretrained("bert-base-cased")
+ >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> prediction_logits = outputs.logits
+ """
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+ if labels is not None:
+ use_cache = False
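+ # during cached decoding the query tokens already live in past_key_values, so query_embeds are dropped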
+ if past_key_values is not None:
+ query_embeds = None
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ query_embeds=query_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ is_decoder=is_decoder,
+ )
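+ # gate_loss: MoE load-balancing loss accumulated across the Q-Former layers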
+ gate_loss = outputs.gate_loss
+
+ sequence_output = outputs[0]
+ if query_embeds is not None:
+ sequence_output = outputs[0][:, query_embeds.shape[1] :, :]
+
+ prediction_scores = self.cls(sequence_output)
+
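+ # next-token prediction: the last position has no label, so its logits are dropped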
+ if return_logits:
+ return prediction_scores[:, :-1, :].contiguous()
+
+ lm_loss, total_loss = None, None
+ if labels is not None:
+ # we are doing next-token prediction; shift prediction scores and input ids by one
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
+ labels = labels[:, 1:].contiguous()
+ loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
+ lm_loss = loss_fct(
+ shifted_prediction_scores.view(-1, self.config.vocab_size),
+ labels.view(-1),
+ )
+ if reduction == "none":
+ lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
+
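+ # add the gate (load-balancing) loss to the LM loss, weighted by config.moebert_load_balance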
+ total_loss = lm_loss + gate_loss * self.load_balance_alpha
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=total_loss,
+ logits=prediction_scores,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ cross_attentions=outputs.cross_attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs
+ ):
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+ if attention_mask is None:
+ attention_mask = input_ids.new_ones(input_ids.shape)
+ query_mask = input_ids.new_ones(query_embeds.shape[:-1])
+ attention_mask = torch.cat([query_mask, attention_mask], dim=-1)
+
+ # cut decoder_input_ids if past is used
+ if past is not None:
+ input_ids = input_ids[:, -1:]
+
+ return {
+ "input_ids": input_ids,
+ "query_embeds": query_embeds,
+ "attention_mask": attention_mask,
+ "past_key_values": past,
+ "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
+ "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
+ "is_decoder": True,
+ }
+
+ def _reorder_cache(self, past, beam_idx):
+ reordered_past = ()
+ for layer_past in past:
+ reordered_past += (
+ tuple(
+ past_state.index_select(0, beam_idx) for past_state in layer_past
+ ),
+ )
+ return reordered_past
+
+
+class BertForMaskedLM(BertPreTrainedModel):
+
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.bert = BertModel(config, add_pooling_layer=False)
+ self.cls = BertOnlyMLMHead(config)
+
+ self.init_weights()
+
+ def get_output_embeddings(self):
+ return self.cls.predictions.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.cls.predictions.decoder = new_embeddings
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ head_mask=None,
+ query_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ return_logits=False,
+ is_decoder=False,
+ ):
+ r"""
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
+ config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
+ (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ """
+
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ query_embeds=query_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ is_decoder=is_decoder,
+ )
+
+ # drop the query positions so the MLM head scores only text tokens; fall back to the full sequence when no query_embeds are given
+ sequence_output = outputs[0]
+ if query_embeds is not None:
+ sequence_output = sequence_output[:, query_embeds.shape[1] :, :]
+ prediction_scores = self.cls(sequence_output)
+
+ if return_logits:
+ return prediction_scores
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
+ masked_lm_loss = loss_fct(
+ prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)
+ )
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return (
+ ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+ )
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
diff --git a/minigpt4/models/QformerRouteMoE.py b/minigpt4/models/QformerRouteMoE.py
index 8595dc6..5cdd983 100644
--- a/minigpt4/models/QformerRouteMoE.py
+++ b/minigpt4/models/QformerRouteMoE.py
@@ -389,17 +389,23 @@ class BertOutput(nn.Module): # Add & Norm
class FeedForward(nn.Module):
+ # remove LayerNorm
def __init__(self, config):
- nn.Module.__init__(self)
- # first layer
- self.intermediate_query = BertIntermediate(config)
- # second layer
- self.output_query = BertOutput(config)
+ super().__init__()
+ self.dense1 = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+ self.dense2 = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob) # dropout ratio from the config; a fixed 0.2 variant is kept commented out below
+ # self.dropout = nn.Dropout(0.2) # adjust dropout ratio 0.1->0.2
def forward(self, hidden_states: Tensor):
- input_tensor = hidden_states
- intermediate_output = self.intermediate_query(hidden_states)
- hidden_states = self.output_query(intermediate_output, input_tensor)
+ hidden_states = self.dense1(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ hidden_states = self.dense2(hidden_states)
+ hidden_states = self.dropout(hidden_states)
return hidden_states
@@ -433,7 +439,6 @@ class BertLayer(nn.Module):
self.layer_judge = moe_layer_judge(layer_num)
self.num_beams = config.moebert_num_beams
ffn = FeedForward(config)
-
if self.use_experts:
self.experts = RouteMoELayer(
hidden_size=config.hidden_size,
@@ -446,8 +451,7 @@ class BertLayer(nn.Module):
)
else:
self.experts = ffn
-
- # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.expert_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
@@ -538,7 +542,7 @@ class BertLayer(nn.Module):
if self.layer_judge == 'first' and self.num_beams>1:
# if layer_output[0].shape[0] == layer_output_text.shape[0]*self.num_beams and self.num_beams>1:
# adjust the dimension of layer_output_text to bz*num_beams
- layer_output_text = self.adjust_layer_output_text(layer_output_text)
+ layer_output_text = self.adjust_hidden_states_by_num_beams(layer_output_text)
if self.layer_judge == 'mid' and self.num_beams > 1:
# layer_output_text [bz*num_beams, len, hidden_size]
@@ -575,11 +579,11 @@ class BertLayer(nn.Module):
attention_mask = tmp.contiguous().view(batch_size* self.num_beams, 1, 1, attention_mask.shape[3]) # torch.Size([bz*num_beams, 1, 1, 32+input_len])
return attention_mask
- def adjust_layer_output_text(self, layer_output_text):
- batch_size, text_length, hidden_size = layer_output_text.shape
- tmp_text = layer_output_text.unsqueeze(1).expand(batch_size, self.num_beams, text_length, hidden_size)
- layer_output_text = tmp_text.contiguous().view(-1, text_length, hidden_size) # [bz*num_beams, text_length ,768]
- return layer_output_text
+ def adjust_hidden_states_by_num_beams(self, hidden_states):
+ batch_size, text_length, hidden_size = hidden_states.shape
+ tmp_text = hidden_states.unsqueeze(1).expand(batch_size, self.num_beams, text_length, hidden_size)
+ hidden_states = tmp_text.contiguous().view(-1, text_length, hidden_size) # [bz*num_beams, text_length ,768]
+ return hidden_states
def route_moe_last_layer_top1(self, layer_output, layer_output_text):
batch_size = layer_output[0].shape[0]
@@ -602,20 +606,21 @@ class BertLayer(nn.Module):
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
- # layer_output = self.LayerNorm(layer_output + attention_output)
return layer_output
def feed_forward_query_moe(self, attention_output, expert_attention_mask, beam_scores, expert_route):
if not self.use_experts:
- layer_output = self.experts(attention_output)
- # layer_output = self.LayerNorm(layer_output + attention_output)
+ hidden_states = self.experts(attention_output)
+ layer_output = self.expert_ln(hidden_states + attention_output)
return layer_output, None, None, None, 0.0
- layer_output, beam_scores, expert_route, beam_idx, importance_loss = self.experts(
+ hidden_states, beam_scores, expert_route, beam_idx, importance_loss = self.experts(
attention_output, expert_attention_mask, beam_scores, expert_route
)
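+ # the router may expand the batch to bz*num_beams; tile the residual input to match before the post-LN residual add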
+ if hidden_states.shape[0]==attention_output.shape[0]*self.num_beams and self.num_beams>1:
+ attention_output = self.adjust_hidden_states_by_num_beams(attention_output)
+ layer_output = self.expert_ln(hidden_states + attention_output)
- # layer_output = self.LayerNorm(layer_output + attention_output)
return layer_output, beam_scores, expert_route, beam_idx, importance_loss
class BertEncoder(nn.Module):
@@ -722,7 +727,7 @@ class BertEncoder(nn.Module):
]
if v is not None
)
-
+
return MoEModelOutput(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
diff --git a/minigpt4/models/QformerRouteMoELN.py b/minigpt4/models/QformerRouteMoELN.py
new file mode 100644
index 0000000..1f1f289
--- /dev/null
+++ b/minigpt4/models/QformerRouteMoELN.py
@@ -0,0 +1,1367 @@
+"""
+ * Copyright (c) 2023, salesforce.com, inc.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+ * By Junnan Li
+ * Based on huggingface code base
+ * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
+"""
+
+import math
+import os
+import warnings
+import copy
+from dataclasses import dataclass
+from typing import Optional, Tuple, Dict, Any
+
+import torch
+from torch import Tensor, device, dtype, nn
+import torch.utils.checkpoint
+from torch.nn import CrossEntropyLoss
+import torch.nn.functional as F
+
+from transformers.activations import ACT2FN
+from transformers.file_utils import (
+ ModelOutput,
+)
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ CausalLMOutputWithCrossAttentions,
+ MaskedLMOutput,
+ MultipleChoiceModelOutput,
+ NextSentencePredictorOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
+from transformers.modeling_utils import (
+ PreTrainedModel,
+ apply_chunking_to_forward,
+ find_pruneable_heads_and_indices,
+ prune_linear_layer,
+)
+from transformers.utils import logging
+from transformers.models.bert.configuration_bert import BertConfig
+
+from minigpt4.models.moe.utils import (
+ MoEModelOutput,
+ MoEModelOutputWithPooling,
+ use_experts_route,
+ moe_layer_judge,
+)
+from minigpt4.models.moe.route_moe_layer import RouteMoELayer
+
+logging.set_verbosity_error() # ignore warning : Some weights of BertLMHeadModel were not initialized from the model checkpoint...
+logger = logging.get_logger(__name__)
+
+# from visualizer import get_local
+
+class BertEmbeddings(nn.Module):
+ """Construct the embeddings from word and position embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(
+ config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
+ )
+ self.position_embeddings = nn.Embedding(
+ config.max_position_embeddings, config.hidden_size
+ )
+
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+ # any TensorFlow checkpoint file
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.register_buffer(
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))
+ )
+ self.position_embedding_type = getattr(
+ config, "position_embedding_type", "absolute"
+ )
+
+ self.config = config
+
+ def forward(
+ self,
+ input_ids=None,
+ position_ids=None,
+ query_embeds=None,
+ past_key_values_length=0,
+ ):
+ if input_ids is not None:
+ seq_length = input_ids.size()[1]
+ else:
+ seq_length = 0
+
+ if position_ids is None:
+ position_ids = self.position_ids[
+ :, past_key_values_length : seq_length + past_key_values_length
+ ].clone()
+
+ if input_ids is not None:
+ embeddings = self.word_embeddings(input_ids)
+ if self.position_embedding_type == "absolute":
+ position_embeddings = self.position_embeddings(position_ids)
+ embeddings = embeddings + position_embeddings
+
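+ # learned query embeddings are prepended in front of the text embeddings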
+ if query_embeds is not None:
+ embeddings = torch.cat((query_embeds, embeddings), dim=1)
+ else:
+ embeddings = query_embeds
+
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class BertSelfAttention(nn.Module):
+ def __init__(self, config, is_cross_attention):
+ super().__init__()
+ self.config = config
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
+ config, "embedding_size"
+ ):
+ raise ValueError(
+ "The hidden size (%d) is not a multiple of the number of attention "
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+ )
+
+ self.num_attention_heads = config.num_attention_heads # 12
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads) # 64
+ self.all_head_size = self.num_attention_heads * self.attention_head_size # 768
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768)
+ if is_cross_attention:
+ self.key = nn.Linear(config.encoder_width, self.all_head_size) # nn.Linear(1408, 768)
+ self.value = nn.Linear(config.encoder_width, self.all_head_size) # nn.Linear(1408, 768)
+ else:
+ self.key = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+ self.position_embedding_type = getattr(
+ config, "position_embedding_type", "absolute"
+ )
+ if (
+ self.position_embedding_type == "relative_key"
+ or self.position_embedding_type == "relative_key_query"
+ ):
+ self.max_position_embeddings = config.max_position_embeddings
+ self.distance_embedding = nn.Embedding(
+ 2 * config.max_position_embeddings - 1, self.attention_head_size
+ )
+ self.save_attention = False
+
+ def save_attn_gradients(self, attn_gradients):
+ self.attn_gradients = attn_gradients
+
+ def get_attn_gradients(self):
+ return self.attn_gradients
+
+ def save_attention_map(self, attention_map):
+ self.attention_map = attention_map
+
+ def get_attention_map(self):
+ return self.attention_map
+
+ def transpose_for_scores(self, x):
+ new_x_shape = x.size()[:-1] + (
+ self.num_attention_heads,
+ self.attention_head_size,
+ ) # torch.Size([1, 257, 12, 64])
+ x = x.view(*new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ ):
+
+ # If this is instantiated as a cross-attention module, the keys
+ # and values come from an encoder; the attention mask needs to be
+ # such that the encoder's padding tokens are not attended to.
+ is_cross_attention = encoder_hidden_states is not None
+
+ if is_cross_attention:
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) # encoder_hidden_states:[bz,257,1408], torch.Size([1, 12, 257, 64])
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) # torch.Size([1, 12, 257, 64])
+ attention_mask = encoder_attention_mask
+ elif past_key_value is not None:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+ else:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+ mixed_query_layer = self.query(hidden_states)
+
+ query_layer = self.transpose_for_scores(mixed_query_layer) # torch.Size([1, 12, 41, 64])
+
+ past_key_value = (key_layer, value_layer) # torch.Size([1, 12, 41, 257])
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ if (
+ self.position_embedding_type == "relative_key"
+ or self.position_embedding_type == "relative_key_query"
+ ):
+ seq_length = hidden_states.size()[1]
+ position_ids_l = torch.arange(
+ seq_length, dtype=torch.long, device=hidden_states.device
+ ).view(-1, 1)
+ position_ids_r = torch.arange(
+ seq_length, dtype=torch.long, device=hidden_states.device
+ ).view(1, -1)
+ distance = position_ids_l - position_ids_r
+ positional_embedding = self.distance_embedding(
+ distance + self.max_position_embeddings - 1
+ )
+ positional_embedding = positional_embedding.to(
+ dtype=query_layer.dtype
+ ) # fp16 compatibility
+
+ if self.position_embedding_type == "relative_key":
+ relative_position_scores = torch.einsum(
+ "bhld,lrd->bhlr", query_layer, positional_embedding
+ )
+ attention_scores = attention_scores + relative_position_scores
+ elif self.position_embedding_type == "relative_key_query":
+ relative_position_scores_query = torch.einsum(
+ "bhld,lrd->bhlr", query_layer, positional_embedding
+ )
+ relative_position_scores_key = torch.einsum(
+ "bhrd,lrd->bhlr", key_layer, positional_embedding
+ )
+ attention_scores = (
+ attention_scores
+ + relative_position_scores_query
+ + relative_position_scores_key
+ )
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+ # extended_attention_mask
+
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+ if is_cross_attention and self.save_attention:
+ self.save_attention_map(attention_probs)
+ attention_probs.register_hook(self.save_attn_gradients)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs_dropped = self.dropout(attention_probs) # torch.Size([1, 12, 41, 257])
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs_dropped = attention_probs_dropped * head_mask
+
+ context_layer = torch.matmul(attention_probs_dropped, value_layer) # torch.Size([1, 12, 41, 64])
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(*new_context_layer_shape) # torch.Size([1, 41, 768])
+
+ outputs = (
+ (context_layer, attention_probs) if output_attentions else (context_layer,)
+ )
+
+ outputs = outputs + (past_key_value,)
+ return outputs
+
+
+class BertSelfOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class BertAttention(nn.Module):
+ def __init__(self, config, is_cross_attention=False):
+ super().__init__()
+ self.self = BertSelfAttention(config, is_cross_attention)
+ self.output = BertSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads,
+ self.self.num_attention_heads,
+ self.self.attention_head_size,
+ self.pruned_heads,
+ )
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = (
+ self.self.attention_head_size * self.self.num_attention_heads
+ )
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ ):
+ self_outputs = self.self(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ )
+ attention_output = self.output(self_outputs[0], hidden_states)
+
+ outputs = (attention_output,) + self_outputs[
+ 1:
+ ] # add attentions if we output them
+ return outputs
+
+
+class BertIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class BertOutput(nn.Module): # Add & Norm
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) # 1
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ # LayerNorm & residual connection kept here (not moved out after the MoE FFN)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor) # 1
+ return hidden_states
+
+
+class FeedForward(nn.Module):
+ def __init__(self, config):
+ nn.Module.__init__(self)
+ # first layer
+ self.intermediate_query = BertIntermediate(config)
+ # second layer
+ self.output_query = BertOutput(config)
+
+ def forward(self, hidden_states: Tensor):
+ input_tensor = hidden_states
+ intermediate_output = self.intermediate_query(hidden_states)
+ hidden_states = self.output_query(intermediate_output, input_tensor)
+ return hidden_states
+
+
+class BertLayer(nn.Module):
+ def __init__(self, config, layer_num):
+ super().__init__()
+ self.config = config
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = BertAttention(config)
+ self.layer_num = layer_num
+ if (
+ self.config.add_cross_attention
+ and layer_num % self.config.cross_attention_freq == 0
+ ):
+ self.crossattention = BertAttention(
+ config, is_cross_attention=self.config.add_cross_attention
+ )
+ self.has_cross_attention = True
+ else:
+ self.has_cross_attention = False
+
+ self.intermediate = BertIntermediate(config)
+ self.output = BertOutput(config)
+
+ self.intermediate_query = BertIntermediate(config)
+ self.output_query = BertOutput(config)
+
+ # Add MoE FFN
+ self.use_experts = use_experts_route(layer_num)
+ self.layer_judge = moe_layer_judge(layer_num)
+ self.num_beams = config.moebert_num_beams
+ ffn = FeedForward(config)
+
+ if self.use_experts:
+ self.experts = RouteMoELayer(
+ hidden_size=config.hidden_size,
+ expert=ffn,
+ num_experts=config.moebert_expert_num,
+ num_beams=config.moebert_num_beams,
+ layer_judge = self.layer_judge,
+ route_method=config.route_method,
+ weight_type=config.moe_weight_type,
+ )
+ else:
+ self.experts = ffn
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ query_length=0,
+ beam_scores=None,
+ expert_route=None,
+ ):
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = (
+ past_key_value[:2] if past_key_value is not None else None
+ )
+ # import pdb; pdb.set_trace() # 0107test
+
+ # adjust hidden_states, attention_mask, encoder_attention_mask and encoder_hidden_states so their batch dimensions match
+ if self.num_beams > 1:
+ if hidden_states.shape[0]== attention_mask.shape[0]*self.num_beams:
+ # attention_mask dimension to be bz*num_beams
+ attention_mask = self.adjust_attention_mask(attention_mask)
+ encoder_attention_mask = self.adjust_attention_mask(encoder_attention_mask)
+
+ if hidden_states.shape[0]*self.num_beams == attention_mask.shape[0]:
+ # attention_mask dimension back to bz
+ batch_size = attention_mask.shape[0]
+ attention_mask = attention_mask[[ i for i in range(0, batch_size, self.num_beams)]]
+
+ if hidden_states.shape[0] == encoder_hidden_states.shape[0]*self.num_beams:
+ batch_size, visual_tokens, vision_dim = encoder_hidden_states.shape
+ tmp = encoder_hidden_states.unsqueeze(1).expand(batch_size, self.num_beams, visual_tokens, vision_dim )
+ encoder_hidden_states = tmp.contiguous().view(batch_size* self.num_beams, visual_tokens, vision_dim) # torch.Size([bz, 257, 1408])
+
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ past_key_value=self_attn_past_key_value,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:-1]
+
+ present_key_value = self_attention_outputs[-1]
+
+ if query_length > 0:
+ query_attention_output = attention_output[:, :query_length, :]
+
+ if self.has_cross_attention:
+
+ assert (
+ encoder_hidden_states is not None
+ ), "encoder_hidden_states must be given for cross-attention layers"
+
+ cross_attention_outputs = self.crossattention(
+ query_attention_output,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+ query_attention_output = cross_attention_outputs[0]
+ outputs = (
+ outputs + cross_attention_outputs[1:-1]
+ ) # add cross attentions if we output attention weights
+
+ # add moe query ffn
+ # query_attention_output size: [bz, query_length+seq_len, 768]
+ # attention_mask size: [bz, 1, 1, query_length+seq_len]
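+ # only the query tokens pass through the MoE FFN; the broadcast mask is collapsed back to [bz, query_length] for the router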
+ moe_ffn_attention_input = query_attention_output[:, :query_length, :]
+ moe_ffn_attention_mask = attention_mask.squeeze(dim=1).squeeze(dim=1)[:, :query_length]
+ layer_output = self.feed_forward_query_moe(moe_ffn_attention_input, moe_ffn_attention_mask, beam_scores, expert_route)
+ # layer_output = (layer_output, beam_scores, expert_route, beam_idx, importance_loss)
+ # import pdb; pdb.set_trace() # 0107test
+
+ if attention_output.shape[1] > query_length: # text tokens follow the query tokens in the Q-Former input
+ layer_output_text = apply_chunking_to_forward(
+ self.feed_forward_chunk,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ attention_output[:, query_length:, :],
+ )
+ if self.layer_judge == 'first' and self.num_beams>1:
+ # if layer_output[0].shape[0] == layer_output_text.shape[0]*self.num_beams and self.num_beams>1:
+ # adjust the dimension of layer_output_text to bz*num_beams
+ layer_output_text = self.adjust_layer_output_text(layer_output_text)
+
+ if self.layer_judge == 'mid' and self.num_beams > 1:
+ # layer_output_text [bz*num_beams, len, hidden_size]
+ beam_idx = layer_output[3]
+ layer_output_text = layer_output_text[beam_idx]
+
+ if self.layer_judge == 'last' and self.num_beams>1:
+ # select top1 for each sample among beams
+ # layer_output = (hidden_states, beam_scores, expert_route)
+ # layer_output & layer_output_text dimen_0 from bz*num_beams to bz
+ layer_output, layer_output_text = self.route_moe_last_layer_top1(layer_output, layer_output_text)
+
+ layer_output = (torch.cat([layer_output[0], layer_output_text], dim=1), layer_output[1], layer_output[2], layer_output[3],layer_output[4])
+ # import pdb; pdb.set_trace() # 0107test
+
+ else:
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ attention_output,
+ )
+ layer_output = (layer_output, None, None, None, 0.0)
+
+ outputs = (layer_output,) + outputs
+
+ outputs = outputs + (present_key_value,)
+
+ return outputs
+
+ def adjust_attention_mask(self, attention_mask):
+ batch_size = attention_mask.shape[0]
+ tmp = attention_mask.unsqueeze(1).expand(batch_size, self.num_beams, 1, 1, attention_mask.shape[3])
+ attention_mask = tmp.contiguous().view(batch_size* self.num_beams, 1, 1, attention_mask.shape[3]) # torch.Size([bz*num_beams, 1, 1, 32+input_len])
+ return attention_mask
+
+ def adjust_layer_output_text(self, layer_output_text):
+ batch_size, text_length, hidden_size = layer_output_text.shape
+ tmp_text = layer_output_text.unsqueeze(1).expand(batch_size, self.num_beams, text_length, hidden_size)
+ layer_output_text = tmp_text.contiguous().view(-1, text_length, hidden_size) # [bz*num_beams, text_length ,768]
+ return layer_output_text
+
+ def route_moe_last_layer_top1(self, layer_output, layer_output_text):
+ batch_size = layer_output[0].shape[0]
+ raw_batch_size = int(batch_size / self.num_beams)
+ hidden_states, beam_scores, expert_route, beam_idx = layer_output[0], layer_output[1], layer_output[2], layer_output[3]
+ layer_output_text = layer_output_text[beam_idx]
+
+ scores = beam_scores.view(raw_batch_size, self.num_beams)
+ _, gate = torch.topk(scores, 1, dim=1)
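+ # flat indices of each sample's best-scoring beam in the [bz*num_beams] layout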
+ selects = [ (bz_idx * self.num_beams + gate[bz_idx].item()) for bz_idx in range(raw_batch_size)]
+
+ layer_output_text = layer_output_text[selects]
+ hidden_states_new = hidden_states[selects]
+ beam_scores_new = beam_scores[selects]
+ expert_route_new = expert_route[selects]
+
+ return (hidden_states_new, beam_scores_new, expert_route_new, layer_output[3], layer_output[4]), layer_output_text
+
+
+ def feed_forward_chunk(self, attention_output):
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ # layer_output = self.LayerNorm(layer_output + attention_output)
+ return layer_output
+
+ def feed_forward_query_moe(self, attention_output, expert_attention_mask, beam_scores, expert_route):
+ if not self.use_experts:
+ layer_output = self.experts(attention_output)
+ # layer_output = self.LayerNorm(layer_output + attention_output)
+ return layer_output, None, None, None, 0.0
+
+ layer_output, beam_scores, expert_route, beam_idx, importance_loss = self.experts(
+ attention_output, expert_attention_mask, beam_scores, expert_route
+ )
+
+ # layer_output = self.LayerNorm(layer_output + attention_output)
+ return layer_output, beam_scores, expert_route, beam_idx, importance_loss
+
+class BertEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList(
+ [BertLayer(config, i) for i in range(config.num_hidden_layers)]
+ )
+
+ # @get_local('all_cross_attentions')
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ use_cache=None,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ query_length=0,
+ ):
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ all_cross_attentions = (
+ () if output_attentions and self.config.add_cross_attention else None
+ )
+
+ next_decoder_cache = () if use_cache else None
+ beam_scores=None
+ expert_route=None
+ importance_loss = 0
+ for i in range(self.config.num_hidden_layers):
+
+ layer_module = self.layer[i]
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+ past_key_value = past_key_values[i] if past_key_values is not None else None
+
+ if getattr(self.config, "gradient_checkpointing", False) and self.training:
+ if use_cache:
+ logger.warn(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(
+ *inputs, past_key_value, output_attentions, query_length, beam_scores, expert_route
+ )
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(layer_module),
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ )
+ else:
+ layer_outputs = layer_module(
+ hidden_states, #torch.Size([bz, 32+input_len, 768])
+ attention_mask, # torch.Size([bz, 1, 1, 32+input_len])
+ layer_head_mask, # None
+ encoder_hidden_states, # torch.Size([bz, 257, 1408])
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions, # False
+ query_length, # 32
+ beam_scores, # None
+ expert_route, # None
+ )
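+ # layer_outputs[0] = (hidden_states, beam_scores, expert_route, beam_idx, importance_loss)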
+ hidden_states = layer_outputs[0][0]
+ beam_scores = beam_scores if layer_outputs[0][1] is None else layer_outputs[0][1]
+ expert_route = expert_route if layer_outputs[0][2] is None else layer_outputs[0][2]
+ importance_loss += layer_outputs[0][4]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[-1],)
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ next_decoder_cache,
+ all_hidden_states,
+ all_self_attentions,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+
+ return MoEModelOutput(
+ last_hidden_state=hidden_states,
+ past_key_values=next_decoder_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ cross_attentions=all_cross_attentions,
+ beam_scores=beam_scores,
+ expert_route=expert_route,
+ gate_loss=importance_loss,
+ )
+
+
+class BertPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+class BertPredictionHeadTransform(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.transform_act_fn = config.hidden_act
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ return hidden_states
+
+
+class BertLMPredictionHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.transform = BertPredictionHeadTransform(config)
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+ self.decoder.bias = self.bias
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = self.decoder(hidden_states)
+ return hidden_states
+
+
+class BertOnlyMLMHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.predictions = BertLMPredictionHead(config)
+
+ def forward(self, sequence_output):
+ prediction_scores = self.predictions(sequence_output)
+ return prediction_scores
+
+
+class BertPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = BertConfig
+ base_model_prefix = "bert"
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+
+class BertModel(BertPreTrainedModel):
+ """
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+ cross-attention is added between the self-attention layers, following the architecture described in `Attention is
+ all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as a decoder the model needs to be
+ initialized with the :obj:`is_decoder` argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+ input to the forward pass.
+ """
+
+ def __init__(self, config, add_pooling_layer=False):
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = BertEmbeddings(config)
+
+ self.encoder = BertEncoder(config)
+
+ self.pooler = BertPooler(config) if add_pooling_layer else None
+
+ self.init_weights()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ def get_extended_attention_mask(
+ self,
+ attention_mask: Tensor,
+ input_shape: Tuple[int],
+ device: device,
+ is_decoder: bool,
+ has_query: bool = False,
+ ) -> Tensor:
+ """
+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
+
+ Arguments:
+ attention_mask (:obj:`torch.Tensor`):
+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
+ input_shape (:obj:`Tuple[int]`):
+ The shape of the input to the model.
+ device: (:obj:`torch.device`):
+ The device of the input to the model.
+
+ Returns:
+ :obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
+ """
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+
+ if attention_mask.dim() == 3:
+ extended_attention_mask = attention_mask[:, None, :, :]
+ elif attention_mask.dim() == 2:
+ # Provided a padding mask of dimensions [batch_size, seq_length]
+ # - if the model is a decoder, apply a causal mask in addition to the padding mask
+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if is_decoder:
+ batch_size, seq_length = input_shape
+
+ seq_ids = torch.arange(seq_length, device=device)
+ causal_mask = (
+ seq_ids[None, None, :].repeat(batch_size, seq_length, 1)
+ <= seq_ids[None, :, None]
+ )
+
+ # add a prefix ones mask to the causal mask
+ # causal and attention masks must have same type with pytorch version < 1.3
+ causal_mask = causal_mask.to(attention_mask.dtype)
+
+ if causal_mask.shape[1] < attention_mask.shape[1]:
+ prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
+ if has_query: # UniLM style attention mask
+ causal_mask = torch.cat(
+ [
+ torch.zeros(
+ (batch_size, prefix_seq_len, seq_length),
+ device=device,
+ dtype=causal_mask.dtype,
+ ),
+ causal_mask,
+ ],
+ axis=1,
+ )
+ causal_mask = torch.cat(
+ [
+ torch.ones(
+ (batch_size, causal_mask.shape[1], prefix_seq_len),
+ device=device,
+ dtype=causal_mask.dtype,
+ ),
+ causal_mask,
+ ],
+ axis=-1,
+ )
+ extended_attention_mask = (
+ causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
+ )
+ else:
+ extended_attention_mask = attention_mask[:, None, None, :]
+ else:
+ raise ValueError(
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
+ input_shape, attention_mask.shape
+ )
+ )
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and -10000.0 for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+ extended_attention_mask = extended_attention_mask.to(
+ dtype=self.dtype
+ ) # fp16 compatibility
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+ return extended_attention_mask
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ head_mask=None,
+ query_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ is_decoder=False,
+ ):
+ r"""
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+ use_cache (:obj:`bool`, `optional`):
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+ decoding (see :obj:`past_key_values`).
+ """
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ # use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ if input_ids is None:
+ assert (
+ query_embeds is not None
+ ), "You have to specify query_embeds when input_ids is None"
+
+ # past_key_values_length
+ past_key_values_length = (
+ past_key_values[0][0].shape[2] - self.config.query_length
+ if past_key_values is not None
+ else 0
+ )
+
+ query_length = query_embeds.shape[1] if query_embeds is not None else 0
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ query_embeds=query_embeds,
+ past_key_values_length=past_key_values_length,
+ )
+
+ input_shape = embedding_output.size()[:-1]
+ batch_size, seq_length = input_shape
+ device = embedding_output.device
+
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ ((batch_size, seq_length + past_key_values_length)), device=device
+ )
+
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ if is_decoder:
+ extended_attention_mask = self.get_extended_attention_mask(
+ attention_mask,
+ input_ids.shape,
+ device,
+ is_decoder,
+ has_query=(query_embeds is not None),
+ )
+ else:
+ extended_attention_mask = self.get_extended_attention_mask(
+ attention_mask, input_shape, device, is_decoder
+ )
+
+ # If a 2D or 3D attention mask is provided for the cross-attention
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if encoder_hidden_states is not None:
+ if type(encoder_hidden_states) == list:
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[
+ 0
+ ].size()
+ else:
+ (
+ encoder_batch_size,
+ encoder_sequence_length,
+ _,
+ ) = encoder_hidden_states.size()
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+
+ if type(encoder_attention_mask) == list:
+ encoder_extended_attention_mask = [
+ self.invert_attention_mask(mask) for mask in encoder_attention_mask
+ ]
+ elif encoder_attention_mask is None:
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+ encoder_extended_attention_mask = self.invert_attention_mask(
+ encoder_attention_mask
+ )
+ else:
+ encoder_extended_attention_mask = self.invert_attention_mask(
+ encoder_attention_mask
+ )
+ else:
+ encoder_extended_attention_mask = None
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_extended_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ query_length=query_length,
+ )
+ sequence_output = encoder_outputs[0]
+ pooled_output = (
+ self.pooler(sequence_output) if self.pooler is not None else None
+ )
+
+ if not return_dict:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return MoEModelOutputWithPooling(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ past_key_values=encoder_outputs.past_key_values,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ cross_attentions=encoder_outputs.cross_attentions,
+ beam_scores=encoder_outputs.beam_scores,
+ expert_route=encoder_outputs.expert_route,
+ gate_loss=encoder_outputs.gate_loss
+ )
+
+
+class BertMoERouteLMHeadModel(BertPreTrainedModel):
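+ # Causal LM head on top of the route-MoE BertModel defined above, whose encoder
+ # output (MoEModelOutputWithPooling) also carries beam_scores, expert_route and gate_loss.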
+
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.bert = BertModel(config, add_pooling_layer=False)
+ self.cls = BertOnlyMLMHead(config)
+
+ self.init_weights()
+
+ def get_output_embeddings(self):
+ return self.cls.predictions.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.cls.predictions.decoder = new_embeddings
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ head_mask=None,
+ query_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ labels=None,
+ past_key_values=None,
+ use_cache=True,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ return_logits=False,
+ is_decoder=True,
+ reduction="mean",
+ ):
+ r"""
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+ ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
+ ignored (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+ use_cache (:obj:`bool`, `optional`):
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+ decoding (see :obj:`past_key_values`).
+ Returns:
+ Example::
+ >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
+ >>> import torch
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+ >>> config = BertConfig.from_pretrained("bert-base-cased")
+ >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> prediction_logits = outputs.logits
+ """
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+ if labels is not None:
+ use_cache = False
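+ # during cached decoding the query tokens were already encoded in the first step,
+ # so query_embeds must not be fed again once past_key_values are present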
+ if past_key_values is not None:
+ query_embeds = None
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ query_embeds=query_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ is_decoder=is_decoder,
+ )
+ # gate_loss = outputs.gate_loss
+
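+ # drop the query-token positions so the LM head only scores the text tokens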
+ sequence_output = outputs[0]
+ if query_embeds is not None:
+ sequence_output = outputs[0][:, query_embeds.shape[1] :, :]
+
+ prediction_scores = self.cls(sequence_output)
+
+ if return_logits:
+ return prediction_scores[:, :-1, :].contiguous()
+
+ lm_loss, total_loss = None, None
+ if labels is not None:
+ # we are doing next-token prediction; shift prediction scores and input ids by one
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
+ labels = labels[:, 1:].contiguous()
+ loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
+ lm_loss = loss_fct(
+ shifted_prediction_scores.view(-1, self.config.vocab_size),
+ labels.view(-1),
+ )
+ if reduction == "none":
+ lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
+
+ total_loss = lm_loss
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=total_loss,
+ logits=prediction_scores,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ cross_attentions=outputs.cross_attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs
+ ):
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+ if attention_mask is None:
+ attention_mask = input_ids.new_ones(input_ids.shape)
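+ # the learned query tokens precede the text tokens, so a mask for them is prepended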
+ query_mask = input_ids.new_ones(query_embeds.shape[:-1])
+ attention_mask = torch.cat([query_mask, attention_mask], dim=-1)
+
+ # cut decoder_input_ids if past is used
+ if past is not None:
+ input_ids = input_ids[:, -1:]
+
+ return {
+ "input_ids": input_ids,
+ "query_embeds": query_embeds,
+ "attention_mask": attention_mask,
+ "past_key_values": past,
+ "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
+ "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
+ "is_decoder": True,
+ }
+
+ def _reorder_cache(self, past, beam_idx):
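+ # reorder the cached key/value states so they follow the beams selected at this decoding step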
+ reordered_past = ()
+ for layer_past in past:
+ reordered_past += (
+ tuple(
+ past_state.index_select(0, beam_idx) for past_state in layer_past
+ ),
+ )
+ return reordered_past
+
+
+class BertForMaskedLM(BertPreTrainedModel):
+
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.bert = BertModel(config, add_pooling_layer=False)
+ self.cls = BertOnlyMLMHead(config)
+
+ self.init_weights()
+
+ def get_output_embeddings(self):
+ return self.cls.predictions.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.cls.predictions.decoder = new_embeddings
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ head_mask=None,
+ query_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ return_logits=False,
+ is_decoder=False,
+ ):
+ r"""
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
+ config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
+ (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ """
+
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ query_embeds=query_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ is_decoder=is_decoder,
+ )
+
+ sequence_output = outputs[0]
+ if query_embeds is not None:
+ sequence_output = sequence_output[:, query_embeds.shape[1] :, :]
+ prediction_scores = self.cls(sequence_output)
+
+ if return_logits:
+ return prediction_scores
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
+ masked_lm_loss = loss_fct(
+ prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)
+ )
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return (
+ ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+ )
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
diff --git a/minigpt4/models/blip2.py b/minigpt4/models/blip2.py
index a6bf474..593d829 100644
--- a/minigpt4/models/blip2.py
+++ b/minigpt4/models/blip2.py
@@ -22,6 +22,7 @@ from minigpt4.common.logger import MetricLogger
from minigpt4.models.base_model import BaseModel
from minigpt4.models.Qformer import BertConfig, BertLMHeadModel
from minigpt4.models.QformerMoE import BertMoELMHeadModel
+from minigpt4.models.QformerMoELN import BertMoELMHeadModelLNIn
from minigpt4.models.QformerRouteMoE import BertMoERouteLMHeadModel
from minigpt4.models.eva_vit import create_eva_vit_g
from transformers import BertTokenizer
@@ -88,7 +89,7 @@ class Blip2Base(BaseModel):
@classmethod
- def init_QformerMoE(cls, num_query_token, vision_width, moebert_expert_num, moebert_route_method, moebert_load_balance, moe_topk=1, use_balance_loss=True, moe_weight_type='l2_norm', cross_attention_freq=2):
+ def init_QformerMoE(cls, num_query_token, vision_width, moebert_expert_num, moebert_route_method, moebert_load_balance, moe_topk=1, use_balance_loss=True, moe_weight_type='l2_norm', cross_attention_freq=2, ln_position="out"):
moe_encoder_config = BertConfig.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased")
moe_encoder_config.encoder_width = vision_width
@@ -104,9 +105,14 @@ class Blip2Base(BaseModel):
moe_encoder_config.use_balance_loss = use_balance_loss
moe_encoder_config.moe_weight_type = moe_weight_type
- MoEQformer = BertMoELMHeadModel.from_pretrained(
- "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config
- )
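+ # ln_position selects the MoE Q-Former variant: "out" keeps BertMoELMHeadModel,
+ # "in" switches to BertMoELMHeadModelLNIn (presumably moving the LayerNorm inside
+ # rather than after the expert FFN)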
+ if ln_position == "out":
+ MoEQformer = BertMoELMHeadModel.from_pretrained(
+ "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config
+ )
+ elif ln_position == "in":
+ MoEQformer = BertMoELMHeadModelLNIn.from_pretrained(
+ "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config
+ )
query_tokens = nn.Parameter(
torch.zeros(1, num_query_token, moe_encoder_config.hidden_size)
)
diff --git a/minigpt4/models/blip2_vicuna_instruct.py b/minigpt4/models/blip2_vicuna_instruct.py
index 13421ab..d7e26a5 100644
--- a/minigpt4/models/blip2_vicuna_instruct.py
+++ b/minigpt4/models/blip2_vicuna_instruct.py
@@ -65,6 +65,8 @@ class Blip2VicunaInstruct(Blip2Base):
use_balance_loss = True,
moe_weight_type = "l2_norm",
gate_save_path = None,
+ bal_loss_decay_epoch = 3,
+ ln_position = "out",
):
super().__init__()
transformers_version = version.parse(transformers.__version__)
@@ -112,7 +114,8 @@ class Blip2VicunaInstruct(Blip2Base):
moe_topk=moe_topk,
use_balance_loss=use_balance_loss,
moe_weight_type=moe_weight_type,
- cross_attention_freq=2
+ cross_attention_freq=2,
+ ln_position=ln_position,
)
else:
self.Qformer, self.query_tokens = self.init_Qformer(
@@ -221,6 +224,7 @@ class Blip2VicunaInstruct(Blip2Base):
self.moebert_num_beams = moebert_num_beams
self.gate_save_path = gate_save_path
+ self.bal_loss_decay_epoch = bal_loss_decay_epoch
# if self.gate_save_path != None:
# import os
# if not os.path.exists(self.gate_save_path):
@@ -392,9 +396,12 @@ class Blip2VicunaInstruct(Blip2Base):
return_dict=True,
labels=targets,
)
-
+
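+ # the weighted gate/balance loss is only added up to bal_loss_decay_epoch;
+ # later epochs are trained on the LM loss alone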
if self.use_moeqformer:
- loss = outputs.loss + self.moebert_load_balance * gate_loss
+ if samples['epoch'] > self.bal_loss_decay_epoch:
+ loss = outputs.loss
+ else:
+ loss = outputs.loss + self.moebert_load_balance * gate_loss
else:
loss = outputs.loss
@@ -512,6 +519,16 @@ class Blip2VicunaInstruct(Blip2Base):
with self.maybe_autocast():
inputs_embeds = self.llm_model.get_input_embeddings()(llm_tokens.input_ids)
+
+ # path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/embedding/"
+ # np.save(os.path.join(path, "inputs_llm.npy"), inputs_llm.cpu().numpy())
+ # np.save(os.path.join(path, "inputs_llm.npy"), self.llm_model.get_input_embeddings().weight.cpu().numpy())
+ # samples_copy = samples.copy()
+ # samples_copy.pop('image', None)
+ # with open(os.path.join(path, 'test_samples.json'),'a+') as f:
+ # f.write(f"{json.dumps(samples_copy)}\n")
+
+
inputs_embeds = torch.cat([inputs_llm, inputs_embeds], dim=1)
attention_mask = torch.cat([atts_llm, llm_tokens.attention_mask], dim=1)
@@ -654,6 +671,8 @@ class Blip2VicunaInstruct(Blip2Base):
use_balance_loss = cfg.get("use_balance_loss", True)
moe_weight_type = cfg.get("moe_weight_type",'l2_norm')
gate_save_path = cfg.get("gate_save_path", None)
+ bal_loss_decay_epoch = cfg.get("bal_loss_decay_epoch", 3)
+ ln_position = cfg.get("ln_position","out")
model = cls(
vit_model=vit_model,
@@ -683,6 +702,8 @@ class Blip2VicunaInstruct(Blip2Base):
use_balance_loss=use_balance_loss,
moe_weight_type=moe_weight_type,
gate_save_path=gate_save_path,
+ bal_loss_decay_epoch=bal_loss_decay_epoch,
+ ln_position=ln_position,
)
# if qformer_text_input:
diff --git a/minigpt4/models/moe/beam_search.py b/minigpt4/models/moe/beam_search.py
index c4b3c5b..c5c3a5a 100644
--- a/minigpt4/models/moe/beam_search.py
+++ b/minigpt4/models/moe/beam_search.py
@@ -165,7 +165,7 @@ class RouteMoELayer(nn.Module):
self.route_method = route_method
if self.route_method == "pre-route":
self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
- elif self.route_method == "post-route":
+ elif self.route_method in ["post-route", "post-route-dp"]:
gate = nn.Linear(hidden_size, 1, bias=False).float()
self.gate = gate
# self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
@@ -252,6 +252,53 @@ class RouteMoELayer(nn.Module):
return beam_scores, expert_route, beam_idx
+ def dp_search(self, current_scores_log, beam_scores, expert_route, batch_size):
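+ # Illustrative shapes (assuming bz=4, num_beams=2, num_experts=3): after the first
+ # layer current_scores_log is [8, 3]; each of the 8 beams keeps only its top-1 expert,
+ # giving next_scores/next_experts of shape [4, 2], so every beam continues greedily
+ # with its own best expert instead of re-ranking all num_beams*num_experts candidates.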
+ if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route', 'post-route-dp']:
+ # current_scores_log torch.Size([bz, num_experts])
+ assert beam_scores==None and expert_route==None
+ current_scores = torch.exp(current_scores_log)
+ topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate: the experts assigned to each sample, torch.Size([bz, topk])
+ beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
+ expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
+ beam_idx = torch.tensor(range(self.num_beams * batch_size))
+
+ else:
+ batch_size = int(batch_size // self.num_beams)
+ next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # in log space the probabilities can simply be added
+ next_scores_exp = torch.exp(next_scores_raw)
+ import pdb;pdb.set_trace()
+
+ next_scores_raw, next_experts_raw = torch.topk(next_scores_exp, 1, dim=1, largest=True, sorted=True)
+ next_scores = next_scores_raw.view(batch_size, self.num_beams)
+ next_experts = next_experts_raw.view(batch_size, self.num_beams)
+ # next_scores, next_experts = torch.topk(current_scores_log, 1, dim=1, largest=True, sorted=True) # equivalent
+ # next_scores torch.Size([bz * num_beams, 1])
+ # next_tokens torch.Size([bz * num_beams, 1])
+
+ next_batch_beam = list()
+ for batch_idx in range(batch_size):
+ next_sent_beam = list()
+ expert_id = next_experts[batch_idx]
+ expert_score = next_scores[batch_idx]
+ values, index = torch.topk(expert_score, self.num_beams, dim=0, largest=True, sorted=True)
+ for i in range(self.num_beams):
+ beam_id = index[i].item()
+ ex_id = expert_id[beam_id].item()
+ effective_beam_id = batch_idx*self.num_beams + beam_id
+ next_sent_beam.append((values[i], ex_id, effective_beam_id))
+ next_batch_beam.extend(next_sent_beam)
+
+ import pdb;pdb.set_trace()
+
+ beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
+ beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
+ beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
+ pre_route = expert_route[beam_idx,:]
+ expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
+
+ return beam_scores, expert_route, beam_idx
+
+
def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route']:
# current_scores_log torch.Size([bz, num_experts])
@@ -267,6 +314,8 @@ class RouteMoELayer(nn.Module):
batch_size = int(batch_size // self.num_beams)
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # 取log 之后,可以直接相加概率
next_scores_exp = torch.exp(next_scores_raw)
+ import pdb;pdb.set_trace()
+
next_scores_raw1 = next_scores_exp.view(
batch_size, self.num_beams * self.num_experts
) # torch.Size([bz, num_beams*num_experts])
@@ -289,7 +338,7 @@ class RouteMoELayer(nn.Module):
next_sent_beam.append((expert_score, ex_id, effective_beam_id))
next_batch_beam.extend(next_sent_beam)
- # import pdb;pdb.set_trace()
+ import pdb;pdb.set_trace()
beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
@@ -301,8 +350,6 @@ class RouteMoELayer(nn.Module):
return beam_scores, expert_route, beam_idx
-
-
def forward_expert_ffn(self, x, expert_select, current_scores):
"""
x_repeat : [bz*num_beams, 32,768]
@@ -343,6 +390,7 @@ class RouteMoELayer(nn.Module):
batch_size, num_tokens = x.shape[0], x.shape[1]
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
+
current_expert_select = expert_route[:,-1]
import pdb;pdb.set_trace()
@@ -368,7 +416,6 @@ class RouteMoELayer(nn.Module):
output_x = self.experts[expert_idx].forward(input_x)
return output_x
- import pdb; pdb.set_trace()
outputs = list()
logits_gate_lst = list()
for expert_idx in range(self.num_experts):
@@ -392,10 +439,14 @@ class RouteMoELayer(nn.Module):
# importance loss
importance_loss = self._importance_auxiliary_loss(current_scores)
- # import pdb; pdb.set_trace()
-
batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam
- beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
+ import pdb; pdb.set_trace()
+
+ if self.route_method == 'post-route':
+ beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
+ elif self.route_method == 'post-route-dp':
+ beam_scores, expert_route, beam_idx = self.dp_search(current_scores_log, beam_scores, expert_route, batch_size)
+
# beam_scores torch.Size([bz*num_beam])
# expert_route torch.Size([bz*num_beam, layer_n])
current_select_expert = expert_route[:,-1]
@@ -431,7 +482,7 @@ class RouteMoELayer(nn.Module):
"""
if self.route_method == 'pre-route':
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
- elif self.route_method == "post-route":
+ elif self.route_method in ['post-route', 'post-route-dp']:
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True)
return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
@@ -467,10 +518,11 @@ if __name__ == '__main__':
batch_size = 4
x = torch.randn(batch_size, 32, 768)
beam_scores, expert_route = None, None
-
x1 = x
x2 = x
+ x3 = x
beam_scores1, expert_route1 = None, None
+ beam_scores2, expert_route2 = None, None
for layer_num in [6, 8, 10]:
layer_judge = moe_layer_judge(layer_num)
@@ -494,25 +546,41 @@ if __name__ == '__main__':
# print(importance_loss)
# x = hidden_states1
- gate1 = nn.Linear(768, 1, bias=False).float()
+ # experts_post = RouteMoELayer(
+ # hidden_size=768,
+ # expert=ffn,
+ # num_experts=config.moebert_expert_num,
+ # num_beams=config.moebert_num_beams,
+ # layer_judge = layer_judge,
+ # route_method = "post-route",
+ # weight_type="ffn_prob"
+ # )
+ # layer_output = experts_post(x1, None, beam_scores1, expert_route1, False)
+ # hidden_states2, beam_scores1, expert_route1, beam_idx, importance_loss = layer_output
+
+ # print(beam_scores1)
+ # print(expert_route1)
+ # print(beam_idx)
+ # print(importance_loss)
+ # x1 = hidden_states2
+
experts_post = RouteMoELayer(
hidden_size=768,
expert=ffn,
num_experts=config.moebert_expert_num,
num_beams=config.moebert_num_beams,
layer_judge = layer_judge,
- route_method = "post-route",
+ route_method = "post-route-dp",
weight_type="ffn_prob"
)
- layer_output = experts_post(x1, None, beam_scores1, expert_route1, False)
- hidden_states2, beam_scores1, expert_route1, beam_idx, importance_loss = layer_output
-
- print(beam_scores1)
- print(expert_route1)
- print(beam_idx)
- print(importance_loss)
- x1 = hidden_states2
+ layer_output = experts_post(x2, None, beam_scores2, expert_route2, False)
+ hidden_states3, beam_scores2, expert_route2, beam_idx2, importance_loss2 = layer_output
+ print(beam_scores2)
+ print(expert_route2)
+ print(beam_idx2)
+ print(importance_loss2)
+ x2 = hidden_states3
# gate = nn.Linear(768, config.moebert_expert_num, bias=False).float()
# experts_moe = MoELayer(
@@ -526,12 +594,12 @@ if __name__ == '__main__':
# weight_type=config.moe_weight_type,
# )
# attn_mask = torch.ones([batch_size, 32])
- # layer_output = experts_moe(x2, attn_mask)
- # hidden_states3, select_prob_gate, gate_load,_ = layer_output
+ # layer_output = experts_moe(x3, attn_mask)
+ # hidden_states4, select_prob_gate, gate_load,_ = layer_output
# print(select_prob_gate)
# print(gate_load)
- # x2 = hidden_states3
+ # x3 = hidden_states4
print("------------------------------------")
import pdb; pdb.set_trace()
diff --git a/minigpt4/models/moe/route_moe_layer.py b/minigpt4/models/moe/route_moe_layer.py
index 6012dd2..69fac18 100644
--- a/minigpt4/models/moe/route_moe_layer.py
+++ b/minigpt4/models/moe/route_moe_layer.py
@@ -18,7 +18,7 @@ class RouteMoELayer(nn.Module):
self.route_method = route_method
if self.route_method == "pre-route":
self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
- elif self.route_method == "post-route":
+ elif self.route_method in ["post-route", "post-route-dp"]:
gate = nn.Linear(hidden_size, 1, bias=False).float()
self.gate = gate
# self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
@@ -47,26 +47,67 @@ class RouteMoELayer(nn.Module):
prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts])
return prob_gate
-
- def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
- if self.layer_judge=='first' and self.route_method=='pre-route':
+ def dp_search(self, current_scores_log, beam_scores, expert_route, batch_size):
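+ # Variant of beam_search used for route_method "post-route-dp": after the first layer
+ # each beam extends only with its own best-scoring expert (top-1 per beam) rather than
+ # re-ranking all num_beams * num_experts continuations per sample, so the num_beams
+ # routes evolve independently and are merely re-sorted within each sample.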
+ if self.layer_judge=='first' and self.route_method in ['post-route-dp']:
+ # current_scores_log torch.Size([bz, num_experts])
assert beam_scores==None and expert_route==None
current_scores = torch.exp(current_scores_log)
topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate, 每个样本被分配的expert: torch.Size([bz, topk])
beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
beam_idx = torch.tensor(range(self.num_beams * batch_size))
+
else:
- if self.layer_judge=='first' and self.route_method == 'post-route':
- batch_size = batch_size
- next_scores_raw1 = torch.exp(current_scores_log) # torch.Size([bz, num_experts])
- else:
- batch_size = int(batch_size // self.num_beams)
- next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # 取log 之后,可以直接相加概率
- next_scores_exp = torch.exp(next_scores_raw)
- next_scores_raw1 = next_scores_exp.view(
- batch_size, self.num_beams * self.num_experts
- ) # torch.Size([bz, num_beams*num_experts])
+ batch_size = int(batch_size // self.num_beams)
+ next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # in log space the probabilities can simply be added
+ next_scores_exp = torch.exp(next_scores_raw)
+
+ next_scores_raw, next_experts_raw = torch.topk(next_scores_exp, 1, dim=1, largest=True, sorted=True)
+ next_scores = next_scores_raw.view(batch_size, self.num_beams)
+ next_experts = next_experts_raw.view(batch_size, self.num_beams)
+ # next_scores, next_experts = torch.topk(current_scores_log, 1, dim=1, largest=True, sorted=True) # equivalent
+ # next_scores torch.Size([bz * num_beams, 1])
+ # next_tokens torch.Size([bz * num_beams, 1])
+
+ next_batch_beam = list()
+ for batch_idx in range(batch_size):
+ next_sent_beam = list()
+ expert_id = next_experts[batch_idx]
+ expert_score = next_scores[batch_idx]
+ values, index = torch.topk(expert_score, self.num_beams, dim=0, largest=True, sorted=True)
+ for i in range(self.num_beams):
+ beam_id = index[i].item()
+ ex_id = expert_id[beam_id].item()
+ effective_beam_id = batch_idx*self.num_beams + beam_id
+ next_sent_beam.append((values[i], ex_id, effective_beam_id))
+ next_batch_beam.extend(next_sent_beam)
+
+ beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
+ beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
+ beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
+ pre_route = expert_route[beam_idx,:]
+ expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
+
+ return beam_scores, expert_route, beam_idx
+
+ def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
+ if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route']:
+ # current_scores_log torch.Size([bz, num_experts])
+ assert beam_scores==None and expert_route==None
+ current_scores = torch.exp(current_scores_log)
+ topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate: the experts assigned to each sample, torch.Size([bz, topk])
+ beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
+ expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
+ beam_idx = torch.tensor(range(self.num_beams * batch_size))
+
+ else:
+ batch_size = int(batch_size // self.num_beams)
+ next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # in log space the probabilities can simply be added
+ next_scores_exp = torch.exp(next_scores_raw)
+
+ next_scores_raw1 = next_scores_exp.view(
+ batch_size, self.num_beams * self.num_experts
+ ) # torch.Size([bz, num_beams*num_experts])
next_scores, next_experts = torch.topk(next_scores_raw1, self.num_beams, dim=1, largest=True, sorted=True)
# next_scores torch.Size([bz, num_beams])
@@ -86,19 +127,11 @@ class RouteMoELayer(nn.Module):
next_sent_beam.append((expert_score, ex_id, effective_beam_id))
next_batch_beam.extend(next_sent_beam)
- if self.layer_judge=='first' and self.route_method == 'post-route':
- beam_scores = next_scores.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
- expert_route = next_experts.view(self.num_beams * batch_size)
- beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
- beam_experts = expert_route.new([x[1] for x in next_batch_beam]).unsqueeze(-1)
- beam_idx = expert_route.new([int(x[2]/self.num_beams) for x in next_batch_beam])
- expert_route = beam_experts
- else:
- beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
- beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
- beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
- pre_route = expert_route[beam_idx,:]
- expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
+ beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
+ beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
+ beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
+ pre_route = expert_route[beam_idx,:]
+ expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
return beam_scores, expert_route, beam_idx
@@ -153,7 +186,6 @@ class RouteMoELayer(nn.Module):
# import pdb;pdb.set_trace()
return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
-
def forward_post_route(self, x, beam_scores, expert_route, use_log=True):
attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device)
@@ -187,7 +219,12 @@ class RouteMoELayer(nn.Module):
importance_loss = self._importance_auxiliary_loss(current_scores)
batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam
- beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
+
+ if self.route_method == 'post-route':
+ beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
+ elif self.route_method == 'post-route-dp':
+ beam_scores, expert_route, beam_idx = self.dp_search(current_scores_log, beam_scores, expert_route, batch_size)
+
# beam_scores torch.Size([bz*num_beam])
# expert_route torch.Size([bz*num_beam, layer_n])
current_select_expert = expert_route[:,-1]
@@ -218,7 +255,7 @@ class RouteMoELayer(nn.Module):
"""
if self.route_method == 'pre-route':
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
- elif self.route_method == "post-route":
+ elif self.route_method in ['post-route', 'post-route-dp']:
candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True)
return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
diff --git a/minigpt4/models/moe/utils.py b/minigpt4/models/moe/utils.py
index 52f78b8..1489f60 100644
--- a/minigpt4/models/moe/utils.py
+++ b/minigpt4/models/moe/utils.py
@@ -13,7 +13,7 @@ from typing import Optional, Tuple, List
def use_experts(layer_idx):
# if layer_idx % 2 == 0:
# use moe_ffn after cross_attns
- if int(layer_idx) in [6,8,10]:
- # layer 6/8/10
+ if int(layer_idx) in [6,7,8,9,10,11]:
+ # layers 6-11
return True
else:
diff --git a/minigpt4/projects/qformer_moe_post_vicuna/eval/vqav2_okvqa_gqa_evaluation.yaml b/minigpt4/projects/qformer_moe_post_vicuna/eval/vqav2_okvqa_gqa_evaluation.yaml
new file mode 100644
index 0000000..1617e28
--- /dev/null
+++ b/minigpt4/projects/qformer_moe_post_vicuna/eval/vqav2_okvqa_gqa_evaluation.yaml
@@ -0,0 +1,114 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+ arch: blip2_vicuna_instruct
+ model_type: vicuna7b_pretrain
+ load_pretrained: True
+ load_finetuned: True
+ vit_model: eva_clip_g
+ pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+ finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/mix_coco_gqa_1610k_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0301/20240301223/checkpoint_best.pth"
+ q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+
+ # vit encoder
+ image_size: 224
+ drop_path_rate: 0
+ use_grad_checkpoint: False
+ vit_precision: "fp16"
+
+ # Q-Former
+ num_query_token: 32
+ qformer_text_input: True
+
+ # vicuna
+ llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
+ prompt: ""
+ max_txt_len: 256
+ max_output_txt_len: 256
+
+ # freeze
+ freeze_vit: True
+ freeze_llm: True
+ freeze_qformer: False
+ freeze_t5_proj: False
+
+ # moe
+ use_moeqformer: True
+ use_route_moe: False
+ moebert_expert_num: 3
+ moebert_route_method: "gate-sentence-post"
+ moe_weight_type: "raw_prob"
+ moebert_load_balance: 0.05
+ moe_topk: 1
+ use_balance_loss: False
+ ln_position: "out"
+
+datasets:
+ gqa:
+ type: balanced_sft_raw_eval
+ batch_size: 4
+ vis_processor:
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ eval:
+ name: "blip_caption"
+
+ ok_vqa: # train, valid (9009, 5046)
+ type: ok_vqa_eval
+ batch_size: 4
+ vis_processor:
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ eval:
+ name: "blip_caption"
+
+ coco_vqa: # 658104
+ type: vqa_v2_eval
+ batch_size: 4
+ vis_processor:
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ eval:
+ name: "blip_caption"
+
+ aok_vqa: # train: 17056, val: 1145
+ batch_size: 4
+ vis_processor:
+ eval:
+ name: "blip2_image_train"
+ image_size: 224
+ text_processor:
+ eval:
+ name: "blip_caption"
+
+run:
+ task: instruction_tuning
+ seed: 42
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/eval/mix_coco_gqa_1610k_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0301/"
+ num_workers: 4
+
+ amp: True
+ resume_ckpt_path: null
+
+ evaluate: True
+ test_splits: ["val"]
+
+ device: "cuda"
+ world_size: 1
+ dist_url: "env://"
+ distributed: True
+
+
+
+
+
+
diff --git a/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml b/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml
index 74f4ab0..991ad2a 100644
--- a/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml
+++ b/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml
@@ -10,7 +10,7 @@ model:
load_finetuned: True
vit_model: eva_clip_g
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
- finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/20240112212/checkpoint_best.pth"
+ finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/20240128142/checkpoint_best.pth"
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# vit encoder
@@ -39,27 +39,18 @@ model:
use_moeqformer: True
use_route_moe: True
moebert_route_method: "post-route"
- moebert_load_balance: 0
+ moebert_load_balance: 0.01
moebert_expert_num: 2
moebert_num_beams: 2
moe_weight_type: 'ffn_prob'
- gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/"
+ use_balance_loss: False
+ bal_loss_decay_epoch: 8
+ gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/"
datasets:
gqa:
type: balanced_sft_raw_eval
- batch_size: 32
- vis_processor:
- eval:
- name: "blip2_image_eval"
- image_size: 224
- text_processor:
- eval:
- name: "blip_caption"
-
- ok_vqa: # train, valid (9009, 5046)
- type: ok_vqa_eval
- batch_size: 32
+ batch_size: 64
vis_processor:
eval:
name: "blip2_image_eval"
@@ -70,6 +61,17 @@ datasets:
coco_vqa: # 658104
type: vqa_v2_eval
+ batch_size: 64
+ vis_processor:
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ eval:
+ name: "blip_caption"
+
+ coco_caption: # 414113 train
+ type: coco_cap_eval
batch_size: 32
vis_processor:
eval:
@@ -78,7 +80,18 @@ datasets:
text_processor:
eval:
name: "blip_caption"
-
+
+ ok_vqa: # train, valid (9009, 5046)
+ type: ok_vqa_eval
+ batch_size: 64
+ vis_processor:
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ eval:
+ name: "blip_caption"
+
run:
task: instruction_tuning
# optimizer
@@ -96,7 +109,7 @@ run:
iters_per_epoch: 3000
seed: 42
- output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/"
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/"
amp: True
resume_ckpt_path: null
diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml
index 16440dc..7ae5cbc 100644
--- a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml
+++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml
@@ -38,17 +38,17 @@ model:
# moe
use_moeqformer: True
use_route_moe: True
- moebert_route_method: "post-route"
- moebert_load_balance: 0
- moebert_expert_num: 3
- moebert_num_beams: 3
+ moebert_route_method: "post-route-dp"
+ moebert_load_balance: 0.05
+ moebert_expert_num: 2
+ moebert_num_beams: 2
moe_weight_type: 'ffn_prob'
use_balance_loss: False
datasets:
gqa: # train: 943000, 12578, 12578)
type: balanced_sft_raw
- batch_size: 16
+ batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@@ -64,7 +64,7 @@ datasets:
sample_ratio: 10
ok_vqa: # train, valid (9009, 5046)
- batch_size: 16
+ batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@@ -80,7 +80,7 @@ datasets:
sample_ratio: 1
coco_vqa: # 658104
- batch_size: 16
+ batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@@ -112,7 +112,7 @@ run:
iters_per_epoch: 5000
seed: 42
- output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_1gate_3ex_3beam_1loss_5e5lr_top6layer_textinqf_epo8_0117/"
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_DP_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_005_5e5lr_top6layer_textinqf_epo8_0121/"
amp: True
resume_ckpt_path: null
diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_5000.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_0122.yaml
similarity index 84%
rename from minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_5000.yaml
rename to minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_0122.yaml
index 7124efc..b2cf35b 100644
--- a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_5000.yaml
+++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_0122.yaml
@@ -38,14 +38,17 @@ model:
# moe
use_moeqformer: True
use_route_moe: True
- moebert_expert_num: 5
- moebert_num_beams: 1
- # gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/route_save/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/"
+ moebert_route_method: "post-route"
+ moebert_load_balance: 0
+ moebert_expert_num: 2
+ moebert_num_beams: 2
+ moe_weight_type: 'ffn_prob'
+ use_balance_loss: False
datasets:
gqa: # train: 943000, 12578, 12578)
type: balanced_sft_raw
- batch_size: 4
+ batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@@ -61,7 +64,7 @@ datasets:
sample_ratio: 10
ok_vqa: # train, valid (9009, 5046)
- batch_size: 4
+ batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@@ -77,7 +80,7 @@ datasets:
sample_ratio: 1
coco_vqa: # 658104
- batch_size: 4
+ batch_size: 32
vis_processor:
train:
name: "blip2_image_train"
@@ -96,20 +99,20 @@ run:
task: instruction_tuning
# optimizer
lr_sched: "linear_warmup_cosine_lr"
- init_lr: 2e-5
+ init_lr: 5e-5
min_lr: 1e-6
warmup_lr: 1e-6
log_freq: 5
save_freq: 1500
weight_decay: 0.05
- max_epoch: 6
+ max_epoch: 8
num_workers: 4
warmup_steps: 600
iters_per_epoch: 5000
seed: 42
- output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1212_Test/"
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_1loss_5e5lr_top6layer_textinqf_epo8_0123/"
amp: True
resume_ckpt_path: null
diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco.yaml
new file mode 100644
index 0000000..15117b3
--- /dev/null
+++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco.yaml
@@ -0,0 +1,145 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+ arch: blip2_vicuna_instruct
+ model_type: vicuna7b_pretrain
+ load_pretrained: True
+ load_finetuned: False
+ vit_model: eva_clip_g
+ pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+ # finetuned: ""
+ q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+
+ # vit encoder
+ image_size: 224
+ drop_path_rate: 0
+ use_grad_checkpoint: False
+ vit_precision: "fp16"
+
+ # Q-Former
+ num_query_token: 32
+ qformer_text_input: True
+
+ # vicuna
+ llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
+ prompt: ""
+ max_txt_len: 256
+ max_output_txt_len: 256
+
+ # freeze
+ freeze_vit: True
+ freeze_llm: True
+ freeze_qformer: False
+ freeze_t5_proj: False
+
+ # moe
+ use_moeqformer: False
+ use_route_moe: False
+ moebert_route_method: "post-route"
+ moebert_load_balance: 0
+ moebert_expert_num: 2
+ moebert_num_beams: 2
+ moe_weight_type: 'ffn_prob'
+ use_balance_loss: False
+
+datasets:
+ gqa: # train: 943000, 12578, 12578
+ type: balanced_sft_raw
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 10
+
+ ok_vqa: # train, valid (9009, 5046)
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 1
+
+ coco_vqa: # 658104
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 9
+
+ coco_caption: # 414113 train
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 7
+
+run:
+ task: instruction_tuning
+ # optimizer
+ lr_sched: "linear_warmup_cosine_lr"
+ init_lr: 5e-5
+ min_lr: 1e-6
+ warmup_lr: 1e-6
+ log_freq: 5
+ save_freq: 1500
+
+ weight_decay: 0.05
+ max_epoch: 8
+ num_workers: 4
+ warmup_steps: 600
+ iters_per_epoch: 5000
+
+ seed: 42
+ # output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_005_5e5lr_top6layer_textinqf_epo8_0122/"
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Base_top6layer_textinqf_epo8_0124/"
+
+ amp: True
+ resume_ckpt_path: null
+
+ evaluate: False
+ train_splits: ["train"]
+ valid_splits: ["val"]
+ # test_splits: ["val"]
+
+ device: "cuda"
+ world_size: 1
+ dist_url: "env://"
+ distributed: True
\ No newline at end of file
diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco_0128.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco_0128.yaml
new file mode 100644
index 0000000..e1fbc8f
--- /dev/null
+++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco_0128.yaml
@@ -0,0 +1,145 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+ arch: blip2_vicuna_instruct
+ model_type: vicuna7b_pretrain
+ load_pretrained: True
+ load_finetuned: False
+ vit_model: eva_clip_g
+ pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+ # finetuned: ""
+ q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+
+ # vit encoder
+ image_size: 224
+ drop_path_rate: 0
+ use_grad_checkpoint: False
+ vit_precision: "fp16"
+
+ # Q-Former
+ num_query_token: 32
+ qformer_text_input: True
+
+ # vicuna
+ llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
+ prompt: ""
+ max_txt_len: 256
+ max_output_txt_len: 256
+
+ # freeze
+ freeze_vit: True
+ freeze_llm: True
+ freeze_qformer: False
+ freeze_t5_proj: False
+
+ # moe
+ use_moeqformer: True
+ use_route_moe: True
+ moebert_route_method: "post-route"
+ moebert_load_balance: 0.01
+ moebert_expert_num: 2
+ moebert_num_beams: 2
+ moe_weight_type: 'ffn_prob'
+ use_balance_loss: False
+ bal_loss_decay_epoch: 3
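+ # the gate/balance loss term is dropped after epoch 3 (bal_loss_decay_epoch, see Blip2VicunaInstruct.forward)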
+
+datasets:
+ gqa: # train: 943000, 12578, 12578
+ type: balanced_sft_raw
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 10
+
+ ok_vqa: # train, valid (9009, 5046)
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 1
+
+ coco_vqa: # 658104
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 9
+
+ coco_caption: # 414113 train
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 7
+
+run:
+ task: instruction_tuning
+ # optimizer
+ lr_sched: "linear_warmup_cosine_lr"
+ init_lr: 5e-5
+ min_lr: 1e-6
+ warmup_lr: 1e-6
+ log_freq: 5
+ save_freq: 1500
+
+ weight_decay: 0.05
+ max_epoch: 8
+ num_workers: 4
+ warmup_steps: 600
+ iters_per_epoch: 5000
+
+ seed: 42
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_loss_decay_5e5lr_top6layer_textinqf_epo8_0129/"
+
+ amp: True
+ resume_ckpt_path: null
+
+ evaluate: False
+ train_splits: ["train"]
+ valid_splits: ["val"]
+ # test_splits: ["val"]
+
+ device: "cuda"
+ world_size: 1
+ dist_url: "env://"
+ distributed: True
\ No newline at end of file
diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_mix.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_mix.yaml
new file mode 100644
index 0000000..16b3ef5
--- /dev/null
+++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_mix.yaml
@@ -0,0 +1,188 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+ arch: blip2_vicuna_instruct
+ model_type: vicuna7b_pretrain
+ load_pretrained: True
+ load_finetuned: False
+ vit_model: eva_clip_g
+ pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+ # finetuned: ""
+ q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+
+ # vit encoder
+ image_size: 224
+ drop_path_rate: 0
+ use_grad_checkpoint: False
+ vit_precision: "fp16"
+
+ # Q-Former
+ num_query_token: 32
+ qformer_text_input: True
+
+ # vicuna
+ llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
+ prompt: ""
+ max_txt_len: 256
+ max_output_txt_len: 256
+
+ # freeze
+ freeze_vit: True
+ freeze_llm: True
+ freeze_qformer: False
+ freeze_t5_proj: False
+
+ # moe
+ use_moeqformer: True
+ use_route_moe: True
+ moebert_route_method: "post-route"
+ moebert_load_balance: 0.05
+ moebert_expert_num: 2
+ moebert_num_beams: 2
+ moe_weight_type: 'ffn_prob'
+ use_balance_loss: False
+
+datasets:
+ gqa:
+ type: balanced_sft_raw_eval
+ batch_size: 16
+ vis_processor:
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ eval:
+ name: "blip_caption"
+
+ ok_vqa: # train, valid (9009, 5046)
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 8
+
+ coco_vqa: # 658104
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 15
+
+ aok_vqa: # train: 17056, val: 1145
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ sample_ratio: 12
+
+ ocrvqa: # train 207572
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ sample_ratio: 30
+
+ llava_reason: # 76643
+ batch_size: 16
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ sample_ratio: 80
+
+ llava_conversation: # 56681
+ batch_size: 16
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ sample_ratio: 30
+
+ llava_detail: # 23240
+ batch_size: 16
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ sample_ratio: 20
+
+ coco_caption: # 414113 train
+ batch_size: 16
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ sample_ratio: 10
+
+run:
+ task: instruction_tuning
+ # optimizer
+ lr_sched: "linear_warmup_cosine_lr"
+ init_lr: 5e-5
+ min_lr: 1e-6
+ warmup_lr: 1e-6
+ log_freq: 5
+ save_freq: 1500
+
+ weight_decay: 0.05
+ max_epoch: 8
+ num_workers: 4
+ warmup_steps: 600
+ iters_per_epoch: 5000
+
+ seed: 42
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_1048k_raw_QformerMoE_Route_Post_ffn_prob_linear_1gate_2ex_2beam_2loss_5e5lr_top6layer_textinqf_epo8_0118/"
+
+ amp: True
+ resume_ckpt_path: null
+
+ evaluate: False
+ train_splits: ["train"]
+ valid_splits: ["val"]
+ # test_splits: ["val"]
+
+ device: "cuda"
+ world_size: 1
+ dist_url: "env://"
+ distributed: True
\ No newline at end of file
diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_dp_blip2_vicuna7b_data_balance.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_dp_blip2_vicuna7b_data_balance.yaml
new file mode 100644
index 0000000..be65c17
--- /dev/null
+++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_dp_blip2_vicuna7b_data_balance.yaml
@@ -0,0 +1,128 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+ arch: blip2_vicuna_instruct
+ model_type: vicuna7b_pretrain
+ load_pretrained: True
+ load_finetuned: False
+ vit_model: eva_clip_g
+ pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+ # finetuned: ""
+ q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+
+ # vit encoder
+ image_size: 224
+ drop_path_rate: 0
+ use_grad_checkpoint: False
+ vit_precision: "fp16"
+
+ # Q-Former
+ num_query_token: 32
+ qformer_text_input: True
+
+ # vicuna
+ llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
+ prompt: ""
+ max_txt_len: 256
+ max_output_txt_len: 256
+
+ # freeze
+ freeze_vit: True
+ freeze_llm: True
+ freeze_qformer: False
+ freeze_t5_proj: False
+
+ # moe
+ use_moeqformer: True
+ use_route_moe: True
+ moebert_route_method: "post-route-dp"
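+ # "post-route-dp": per-beam greedy expert selection (dp_search) instead of the full post-route beam search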
+ moebert_load_balance: 0.05
+ moebert_expert_num: 2
+ moebert_num_beams: 2
+ moe_weight_type: 'ffn_prob'
+ use_balance_loss: False
+
+datasets:
+ gqa: # train: 943000, 12578, 12578
+ type: balanced_sft_raw
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 10
+
+ ok_vqa: # train, valid (9009, 5046)
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 1
+
+ coco_vqa: # 658104
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 9
+
+run:
+ task: instruction_tuning
+ # optimizer
+ lr_sched: "linear_warmup_cosine_lr"
+ init_lr: 5e-5
+ min_lr: 1e-6
+ warmup_lr: 1e-6
+ log_freq: 5
+ save_freq: 1500
+
+ weight_decay: 0.05
+ max_epoch: 8
+ num_workers: 4
+ warmup_steps: 600
+ iters_per_epoch: 5000
+
+ seed: 42
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_DP_Route_Post_ffn_prob_linear_1gate_2ex_2beam_2loss_5e5lr_top6layer_textinqf_epo8_0118/"
+
+ amp: True
+ resume_ckpt_path: null
+
+ evaluate: False
+ train_splits: ["train"]
+ valid_splits: ["val"]
+ # test_splits: ["val"]
+
+ device: "cuda"
+ world_size: 1
+ dist_url: "env://"
+ distributed: True
\ No newline at end of file
diff --git a/minigpt4/tasks/instruction_tuning.py b/minigpt4/tasks/instruction_tuning.py
index 759d8bd..341d601 100644
--- a/minigpt4/tasks/instruction_tuning.py
+++ b/minigpt4/tasks/instruction_tuning.py
@@ -53,7 +53,7 @@ class InstructionTask(BaseTask):
run_cfg = cfg.run_cfg
num_beams = run_cfg.get("num_beams", 3)
- max_len = run_cfg.get("max_len", 20)
+ max_len = run_cfg.get("max_len", 30)
min_len = run_cfg.get("min_len", 1)
evaluate = run_cfg.get("evaluate", False)
@@ -112,22 +112,33 @@ class InstructionTask(BaseTask):
)
pred_qa_pairs = []
- question_id = samples["question_id"]
- question = samples["text_input"]
+ text_inputs = samples["text_input"]
+
sources = samples["source"]
+ source = samples["source"][0]
+
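+ # question_id is a tensor of ids for VQAv2/OK-VQA/GQA (converted to int), is kept as-is
+ # for A-OKVQA, and captioning batches are keyed by "image_id" instead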
+ if source in ['vqav2','okvqa','gqa']:
+ sample_ids = [int(sample_id.item()) for sample_id in samples["question_id"]]
+ elif source in ['aokvqa']:
+ sample_ids = [sample_id for sample_id in samples["question_id"]]
+ elif source in ['coco_cap']:
+ sample_ids = samples["image_id"]
# For GQA
- full_answers = samples.get("fullAnswer", ["" for i in range(len(question_id))])
- gt_answers = samples.get("gt_answers", ["" for i in range(len(question_id))])
+ full_answers = samples.get("fullAnswer", ["" for i in range(len(sample_ids))])
+ gt_answers = samples.get("gt_answers", ["" for i in range(len(sample_ids))])
- for answer, ques_id, ques, full_answer, gt_answer, source in zip(answers, question_id, question, full_answers, gt_answers, sources):
- ques_id = int(ques_id.item())
+ # For AOKVQA
+ choices = samples.get("choices", ["" for i in range(len(sample_ids))])
+
+ for answer, sample_id, text_input, full_answer, gt_answer, choice, source in zip(answers, sample_ids, text_inputs, full_answers, gt_answers, choices, sources):
pred_qa_pairs.append({
- "question_id": ques_id,
- "question": ques,
+ "question_id": sample_id,
+ "question": text_input,
"full_answer": full_answer,
"answer": answer,
"gt_ans": gt_answer,
+ "choice": choice,
"source": source})
return pred_qa_pairs
@@ -140,9 +151,7 @@ class InstructionTask(BaseTask):
total_results = list()
for sub_data_loader in data_loader.loaders:
results = []
- ques_ids = []
for samples in metric_logger.log_every(sub_data_loader, print_freq, header):
- ques_ids.extend(samples['question_id'].tolist())
samples = prepare_sample(samples, cuda_enabled=cuda_enabled)
eval_output = self.valid_step(model=model, samples=samples)
@@ -168,6 +177,7 @@ class InstructionTask(BaseTask):
filename=f"{split_name}_vqa_result_{source}",
remove_duplicate="question_id",
)
+
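+ # dispatch metric computation by data source: COCO-VQA accuracy, GQA accuracy,
+ # A-OKVQA choice accuracy, or COCO caption scores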
if source in ['vqav2','okvqa']:
try:
metrics = self._report_metrics_coco_vqa(result_file=result_file, split=split_name, source=source)
@@ -180,7 +190,18 @@ class InstructionTask(BaseTask):
except Exception as e:
metrics = None
print(f"Report Metrics {source} Error: {e}")
-
+ elif source in ['aokvqa']:
+ try:
+ metrics = self._report_metrics_aokvqa(result_file=result_file, source=source)
+ except Exception as e:
+ metrics = None
+ print(f"Report Metrics {source} Error: {e}")
+ elif source in ['coco_cap']:
+ try:
+ metrics = self._report_metrics_caption(result_file=result_file, split_name=split_name, source=source)
+ except Exception as e:
+ metrics = None
+ print(f"Report Metrics {source} Error: {e}")
else:
metrics = None
final_metrics[source] = metrics
@@ -234,10 +255,46 @@ class InstructionTask(BaseTask):
return metrics
+ @dist_utils.main_process
+ def _report_metrics_aokvqa(self, result_file, source='aokvqa'):
+ """
+ Validation of aokvqa
+ """
+ # measuring accuracy compared to answer
+ results = json.load(open(result_file, "r"))
+ acc = []
+ vqa_tool = VQAEval()
+
+ for res in results:
+
+ gt_ans = res["choice"]
+ pred = res["answer"]
+
+ pred = vqa_tool.processPunctuation(pred)
+ pred = vqa_tool.processDigitArticle(pred)
+
+ # vqa_acc = 1 if pred == gt_ans else 0
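+ # membership check: the normalized prediction counts as correct if it appears in the stored "choice" field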
+ vqa_acc = 1 if pred in gt_ans else 0
+
+ acc.append(vqa_acc)
+
+ accuracy = sum(acc) / len(acc) * 100
+ metrics = {"agg_metrics": accuracy, "acc": accuracy}
+
+ with open(
+ os.path.join(registry.get_path("output_dir"), f"evaluate_{source}.txt"), "a"
+ ) as f:
+ f.write(json.dumps(metrics) + "\n")
+
+ logging.info(metrics)
+
+ return metrics
+
+
@dist_utils.main_process
def _report_metrics_gqa(self, result_file, source='gqa'):
"""
- Validation of GQA/VQAv2
+ Validation of GQA
"""
# measuring accuracy compared to answer
results = json.load(open(result_file, "r"))
@@ -274,3 +331,90 @@ class InstructionTask(BaseTask):
return metrics
+ @dist_utils.main_process
+ def _report_metrics_caption(self, result_file, split_name, source='coco_cap'):
+ """
+ Use official COCO Cap evaluation script to report metrics.
+ """
+ coco_gt_root = os.path.join(registry.get_path("cache_root"), "coco_gt")
+ coco_val = coco_caption_eval(coco_gt_root, result_file, split_name)
+
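+ # aggregate score reported as agg_metrics: CIDEr + BLEU-4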
+ agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"]
+ log_stats = {split_name: {k: v for k, v in coco_val.eval.items()}}
+
+ with open(
+ os.path.join(registry.get_path("output_dir"), "evaluate.txt"), "a"
+ ) as f:
+ f.write(json.dumps(log_stats) + "\n")
+
+ coco_res = {k: v for k, v in coco_val.eval.items()}
+ coco_res["agg_metrics"] = agg_metrics
+
+ return coco_res
+
+from collections import defaultdict
+from pycocoevalcap.eval import COCOEvalCap
+class COCO_Annotation:
+ def __init__(self, annotation_file):
+ self.coco_cn_file = annotation_file
+ self.imgToAnns = self.build_imgToAnns()
+
+ def build_imgToAnns(self):
+ imgToAnns = defaultdict(list)
+ with open(self.coco_cn_file, "r", encoding="UTF-8") as fin:
+ for line in fin:
+ line = line.strip()
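+ # each line is a dict-like record; parsed with eval rather than json.loads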
+ temp = eval(line)
+ annotations = temp['annotations']
+ for ann in annotations:
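+ # zero-pad image ids to 6 characters so they match the last-6-character keys built in COCO_Result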
+ image_id = str(ann['image_id']).zfill(6)
+ imgToAnns[image_id].append({'image_id':image_id,'caption':ann['caption'],'image': ann['image_id']})
+ return imgToAnns
+
+ def getImgIds(self):
+ return self.imgToAnns.keys()
+
+class COCO_Result:
+ def __init__(self,result_file):
+ self.coco_cn_file = result_file
+ self.imgToAnns = self.build_imgToAnns()
+
+ def build_imgToAnns(self):
+ imgToAnns = dict()
+ data = json.load(open(self.coco_cn_file, "r"))
+ for d in data:
+ tmp = {
+ 'image_id':d['question_id'][-6:],
+ 'caption':d['answer']
+ }
+ imgToAnns[d['question_id'][-6:]] = [tmp]
+ return imgToAnns
+
+def coco_caption_eval(coco_gt_root, results_file, split_name):
+ files = {
+ "val":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json",
+ "test":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json"
+ }
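+ # note: coco_gt_root is not used here; the ground-truth annotation paths above are hard-coded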
+
+ # create coco object and coco_result object
+ annotation_file = files[split_name]
+ coco = COCO_Annotation(annotation_file)
+ coco_result = COCO_Result(results_file)
+
+ # create coco_eval object by taking coco and coco_result
+ coco_eval = COCOEvalCap(coco, coco_result)
+
+ # evaluate on a subset of images by setting
+ # coco_eval.params['image_id'] = coco_result.getImgIds()
+ # please remove this line when evaluating the full validation set
+ # coco_eval.params['image_id'] = coco_result.getImgIds()
+
+ # evaluate results
+ # SPICE will take a few minutes the first time, but speeds up due to caching
+ coco_eval.evaluate()
+
+ # print output evaluation scores
+ for metric, score in coco_eval.eval.items():
+ print(f"{metric}: {score:.3f}")
+
+ return coco_eval
\ No newline at end of file
diff --git a/prompts/alignment.txt b/prompts/alignment.txt
deleted file mode 100644
index 38ae75a..0000000
--- a/prompts/alignment.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-Describe this image in detail.
-Take a look at this image and describe what you notice.
-Please provide a detailed description of the picture.
-Could you describe the contents of this image for me?
\ No newline at end of file
diff --git a/test.pdf/backward_graph b/test.pdf/backward_graph
deleted file mode 100644
index 7867fb1..0000000
--- a/test.pdf/backward_graph
+++ /dev/null
@@ -1,5570 +0,0 @@
-digraph {
- graph [size="778.8,778.8"]
- node [align=left fontname=monospace fontsize=10 height=0.2 ranksep=0.1 shape=box style=filled]
- ... (auto-generated torchviz autograd-graph dump of AccumulateGrad/*Backward0 nodes and edges, 5,570 lines in total, truncated) ...
- 140509587610784 -> 140509587610640
- 140509587610784 [label=AddBackward0]
- 140509587610976 -> 140509587610784
- 140509587610976 [label=NativeDropoutBackward0]
- 140509587611120 -> 140509587610976
- 140509587611120 [label=ViewBackward0]
- 140509587611216 -> 140509587611120
- 140509587611216 [label=AddmmBackward0]
- 140509587611312 -> 140509587611216
- 140509587611312 [label=ToCopyBackward0]
- 140509587611504 -> 140509587611312
- 140509590894048 [label="encoder.layer.3.experts.output_query.dense.bias
- (768)" fillcolor=lightblue]
- 140509590894048 -> 140509587611504
- 140509587611504 [label=AccumulateGrad]
- 140509587611264 -> 140509587611216
- 140509587611264 [label=ViewBackward0]
- 140509587611552 -> 140509587611264
- 140509587611552 [label=GeluBackward0]
- 140509587611408 -> 140509587611552
- 140509587611408 [label=ViewBackward0]
- 140509587624096 -> 140509587611408
- 140509587624096 [label=AddmmBackward0]
- 140509587624192 -> 140509587624096
- 140509587624192 [label=ToCopyBackward0]
- 140509587624384 -> 140509587624192
- 140509590894288 [label="encoder.layer.3.experts.intermediate_query.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590894288 -> 140509587624384
- 140509587624384 [label=AccumulateGrad]
- 140509587624144 -> 140509587624096
- 140509587624144 [label=ViewBackward0]
- 140509587624432 -> 140509587624144
- 140509587624432 [label=ToCopyBackward0]
- 140509587610928 -> 140509587624432
- 140509587610928 [label=SliceBackward0]
- 140509587624576 -> 140509587610928
- 140509587624576 [label=SliceBackward0]
- 140509587624672 -> 140509587624576
- 140509587624672 [label=SliceBackward0]
- 140509587624768 -> 140509587624672
- 140509587624768 [label=SliceBackward0]
- 140509587624864 -> 140509587624768
- 140509587624864 [label=SliceBackward0]
- 140509587624960 -> 140509587624864
- 140509587624960 [label=NativeLayerNormBackward0]
- 140509587625056 -> 140509587624960
- 140509587625056 [label=AddBackward0]
- 140509587625248 -> 140509587625056
- 140509587625248 [label=NativeDropoutBackward0]
- 140509587625392 -> 140509587625248
- 140509587625392 [label=ViewBackward0]
- 140509587625488 -> 140509587625392
- 140509587625488 [label=AddmmBackward0]
- 140509587625584 -> 140509587625488
- 140509587625584 [label=ToCopyBackward0]
- 140509587625776 -> 140509587625584
- 140509590896208 [label="encoder.layer.3.attention.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590896208 -> 140509587625776
- 140509587625776 [label=AccumulateGrad]
- 140509587625536 -> 140509587625488
- 140509587625536 [label=ViewBackward0]
- 140509587625824 -> 140509587625536
- 140509587625824 [label=ViewBackward0]
- 140509587625920 -> 140509587625824
- 140509587625920 [label=CloneBackward0]
- 140509587626016 -> 140509587625920
- 140509587626016 [label=PermuteBackward0]
- 140509587626112 -> 140509587626016
- 140509587626112 [label=UnsafeViewBackward0]
- 140509587626208 -> 140509587626112
- 140509587626208 [label=BmmBackward0]
- 140509587626304 -> 140509587626208
- 140509587626304 [label=ReshapeAliasBackward0]
- 140509587626448 -> 140509587626304
- 140509587626448 [label=ExpandBackward0]
- 140509587626544 -> 140509587626448
- 140509587626544 [label=ToCopyBackward0]
- 140509587626640 -> 140509587626544
- 140509587626640 [label=NativeDropoutBackward0]
- 140509587626736 -> 140509587626640
- 140509587626736 [label=SoftmaxBackward0]
- 140509587626832 -> 140509587626736
- 140509587626832 [label=AddBackward0]
- 140509587626928 -> 140509587626832
- 140509587626928 [label=DivBackward0]
- 140509587627024 -> 140509587626928
- 140509587627024 [label=UnsafeViewBackward0]
- 140509587627120 -> 140509587627024
- 140509587627120 [label=BmmBackward0]
- 140509587627216 -> 140509587627120
- 140509587627216 [label=ReshapeAliasBackward0]
- 140509587627360 -> 140509587627216
- 140509587627360 [label=ExpandBackward0]
- 140509587627456 -> 140509587627360
- 140509587627456 [label=PermuteBackward0]
- 140509587627552 -> 140509587627456
- 140509587627552 [label=ViewBackward0]
- 140509587627648 -> 140509587627552
- 140509587627648 [label=ViewBackward0]
- 140509587627744 -> 140509587627648
- 140509587627744 [label=AddmmBackward0]
- 140509587627840 -> 140509587627744
- 140509587627840 [label=ToCopyBackward0]
- 140509587627984 -> 140509587627840
- 140509590901120 [label="encoder.layer.3.attention.self.query.bias
- (768)" fillcolor=lightblue]
- 140509590901120 -> 140509587627984
- 140509587627984 [label=AccumulateGrad]
- 140509587627792 -> 140509587627744
- 140509587627792 [label=ViewBackward0]
- 140509587627936 -> 140509587627792
- 140509587627936 [label=ToCopyBackward0]
- 140509587625200 -> 140509587627936
- 140509587625200 [label=CatBackward0]
- 140509587640576 -> 140509587625200
- 140509587640576 [label=NativeLayerNormBackward0]
- 140509587640720 -> 140509587640576
- 140509587640720 [label=AddBackward0]
- 140509587640912 -> 140509587640720
- 140509587640912 [label=NativeDropoutBackward0]
- 140509587641056 -> 140509587640912
- 140509587641056 [label=ViewBackward0]
- 140509587641152 -> 140509587641056
- 140509587641152 [label=AddmmBackward0]
- 140509587641248 -> 140509587641152
- 140509587641248 [label=ToCopyBackward0]
- 140509587641440 -> 140509587641248
- 140509590901600 [label="encoder.layer.2.experts.output_query.dense.bias
- (768)" fillcolor=lightblue]
- 140509590901600 -> 140509587641440
- 140509587641440 [label=AccumulateGrad]
- 140509587641200 -> 140509587641152
- 140509587641200 [label=ViewBackward0]
- 140509587641488 -> 140509587641200
- 140509587641488 [label=GeluBackward0]
- 140509587641584 -> 140509587641488
- 140509587641584 [label=ViewBackward0]
- 140509587641680 -> 140509587641584
- 140509587641680 [label=AddmmBackward0]
- 140509587641776 -> 140509587641680
- 140509587641776 [label=ToCopyBackward0]
- 140509587641968 -> 140509587641776
- 140509590901840 [label="encoder.layer.2.experts.intermediate_query.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590901840 -> 140509587641968
- 140509587641968 [label=AccumulateGrad]
- 140509587641728 -> 140509587641680
- 140509587641728 [label=ViewBackward0]
- 140509587642016 -> 140509587641728
- 140509587642016 [label=ToCopyBackward0]
- 140509587640864 -> 140509587642016
- 140509587640864 [label=SliceBackward0]
- 140509587642160 -> 140509587640864
- 140509587642160 [label=SliceBackward0]
- 140509587642256 -> 140509587642160
- 140509587642256 [label=NativeLayerNormBackward0]
- 140509587642352 -> 140509587642256
- 140509587642352 [label=AddBackward0]
- 140509587642544 -> 140509587642352
- 140509587642544 [label=NativeDropoutBackward0]
- 140509587642688 -> 140509587642544
- 140509587642688 [label=ViewBackward0]
- 140509587642784 -> 140509587642688
- 140509587642784 [label=AddmmBackward0]
- 140509587642880 -> 140509587642784
- 140509587642880 [label=ToCopyBackward0]
- 140509587643072 -> 140509587642880
- 140509590903760 [label="encoder.layer.2.crossattention.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590903760 -> 140509587643072
- 140509587643072 [label=AccumulateGrad]
- 140509587642832 -> 140509587642784
- 140509587642832 [label=ViewBackward0]
- 140509587643120 -> 140509587642832
- 140509587643120 [label=ViewBackward0]
- 140509587643216 -> 140509587643120
- 140509587643216 [label=CloneBackward0]
- 140509587643312 -> 140509587643216
- 140509587643312 [label=PermuteBackward0]
- 140509587643408 -> 140509587643312
- 140509587643408 [label=UnsafeViewBackward0]
- 140509587643504 -> 140509587643408
- 140509587643504 [label=BmmBackward0]
- 140509587643600 -> 140509587643504
- 140509587643600 [label=ReshapeAliasBackward0]
- 140509587643744 -> 140509587643600
- 140509587643744 [label=ExpandBackward0]
- 140509587643840 -> 140509587643744
- 140509587643840 [label=ToCopyBackward0]
- 140509587643936 -> 140509587643840
- 140509587643936 [label=NativeDropoutBackward0]
- 140509587644032 -> 140509587643936
- 140509587644032 [label=SoftmaxBackward0]
- 140509587644128 -> 140509587644032
- 140509587644128 [label=AddBackward0]
- 140509587644224 -> 140509587644128
- 140509587644224 [label=DivBackward0]
- 140509587644320 -> 140509587644224
- 140509587644320 [label=UnsafeViewBackward0]
- 140509587644368 -> 140509587644320
- 140509587644368 [label=BmmBackward0]
- 140509587656864 -> 140509587644368
- 140509587656864 [label=ReshapeAliasBackward0]
- 140509587657008 -> 140509587656864
- 140509587657008 [label=ExpandBackward0]
- 140509587657104 -> 140509587657008
- 140509587657104 [label=PermuteBackward0]
- 140509587657200 -> 140509587657104
- 140509587657200 [label=ViewBackward0]
- 140509587657296 -> 140509587657200
- 140509587657296 [label=ViewBackward0]
- 140509587657392 -> 140509587657296
- 140509587657392 [label=AddmmBackward0]
- 140509587657488 -> 140509587657392
- 140509587657488 [label=ToCopyBackward0]
- 140509587657680 -> 140509587657488
- 140509590904480 [label="encoder.layer.2.crossattention.self.query.bias
- (768)" fillcolor=lightblue]
- 140509590904480 -> 140509587657680
- 140509587657680 [label=AccumulateGrad]
- 140509587657440 -> 140509587657392
- 140509587657440 [label=ViewBackward0]
- 140509587657728 -> 140509587657440
- 140509587657728 [label=ToCopyBackward0]
- 140509587642496 -> 140509587657728
- 140509587642496 [label=SliceBackward0]
- 140509587657872 -> 140509587642496
- 140509587657872 [label=SliceBackward0]
- 140509587657968 -> 140509587657872
- 140509587657968 [label=SliceBackward0]
- 140509587658064 -> 140509587657968
- 140509587658064 [label=NativeLayerNormBackward0]
- 140509587658160 -> 140509587658064
- 140509587658160 [label=AddBackward0]
- 140509587658352 -> 140509587658160
- 140509587658352 [label=NativeDropoutBackward0]
- 140509587658496 -> 140509587658352
- 140509587658496 [label=ViewBackward0]
- 140509587658592 -> 140509587658496
- 140509587658592 [label=AddmmBackward0]
- 140509587658688 -> 140509587658592
- 140509587658688 [label=ToCopyBackward0]
- 140509587658880 -> 140509587658688
- 140509590913248 [label="encoder.layer.2.attention.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590913248 -> 140509587658880
- 140509587658880 [label=AccumulateGrad]
- 140509587658640 -> 140509587658592
- 140509587658640 [label=ViewBackward0]
- 140509587658928 -> 140509587658640
- 140509587658928 [label=ViewBackward0]
- 140509587659024 -> 140509587658928
- 140509587659024 [label=CloneBackward0]
- 140509587659120 -> 140509587659024
- 140509587659120 [label=PermuteBackward0]
- 140509587659216 -> 140509587659120
- 140509587659216 [label=UnsafeViewBackward0]
- 140509587659312 -> 140509587659216
- 140509587659312 [label=BmmBackward0]
- 140509587659408 -> 140509587659312
- 140509587659408 [label=ReshapeAliasBackward0]
- 140509587659552 -> 140509587659408
- 140509587659552 [label=ExpandBackward0]
- 140509587659648 -> 140509587659552
- 140509587659648 [label=ToCopyBackward0]
- 140509587659744 -> 140509587659648
- 140509587659744 [label=NativeDropoutBackward0]
- 140509587659840 -> 140509587659744
- 140509587659840 [label=SoftmaxBackward0]
- 140509587659936 -> 140509587659840
- 140509587659936 [label=AddBackward0]
- 140509587660032 -> 140509587659936
- 140509587660032 [label=DivBackward0]
- 140509587660128 -> 140509587660032
- 140509587660128 [label=UnsafeViewBackward0]
- 140509587660224 -> 140509587660128
- 140509587660224 [label=BmmBackward0]
- 140509587660320 -> 140509587660224
- 140509587660320 [label=ReshapeAliasBackward0]
- 140509587660464 -> 140509587660320
- 140509587660464 [label=ExpandBackward0]
- 140509587660560 -> 140509587660464
- 140509587660560 [label=PermuteBackward0]
- 140509587660656 -> 140509587660560
- 140509587660656 [label=ViewBackward0]
- 140509587660752 -> 140509587660656
- 140509587660752 [label=ViewBackward0]
- 140509587660368 -> 140509587660752
- 140509587660368 [label=AddmmBackward0]
- 140509587673296 -> 140509587660368
- 140509587673296 [label=ToCopyBackward0]
- 140509587673488 -> 140509587673296
- 140509590913968 [label="encoder.layer.2.attention.self.query.bias
- (768)" fillcolor=lightblue]
- 140509590913968 -> 140509587673488
- 140509587673488 [label=AccumulateGrad]
- 140509587673248 -> 140509587660368
- 140509587673248 [label=ViewBackward0]
- 140509587673536 -> 140509587673248
- 140509587673536 [label=ToCopyBackward0]
- 140509587658304 -> 140509587673536
- 140509587658304 [label=CatBackward0]
- 140509587673680 -> 140509587658304
- 140509587673680 [label=NativeLayerNormBackward0]
- 140509587673824 -> 140509587673680
- 140509587673824 [label=AddBackward0]
- 140509587674016 -> 140509587673824
- 140509587674016 [label=NativeDropoutBackward0]
- 140509587674160 -> 140509587674016
- 140509587674160 [label=ViewBackward0]
- 140509587674256 -> 140509587674160
- 140509587674256 [label=AddmmBackward0]
- 140509587674352 -> 140509587674256
- 140509587674352 [label=ToCopyBackward0]
- 140509587674544 -> 140509587674352
- 140509590914448 [label="encoder.layer.1.experts.output_query.dense.bias
- (768)" fillcolor=lightblue]
- 140509590914448 -> 140509587674544
- 140509587674544 [label=AccumulateGrad]
- 140509587674304 -> 140509587674256
- 140509587674304 [label=ViewBackward0]
- 140509587674592 -> 140509587674304
- 140509587674592 [label=GeluBackward0]
- 140509587674688 -> 140509587674592
- 140509587674688 [label=ViewBackward0]
- 140509587674784 -> 140509587674688
- 140509587674784 [label=AddmmBackward0]
- 140509587674880 -> 140509587674784
- 140509587674880 [label=ToCopyBackward0]
- 140509587675072 -> 140509587674880
- 140509590914688 [label="encoder.layer.1.experts.intermediate_query.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590914688 -> 140509587675072
- 140509587675072 [label=AccumulateGrad]
- 140509587674832 -> 140509587674784
- 140509587674832 [label=ViewBackward0]
- 140509587675120 -> 140509587674832
- 140509587675120 [label=ToCopyBackward0]
- 140509587673968 -> 140509587675120
- 140509587673968 [label=SliceBackward0]
- 140509587675264 -> 140509587673968
- 140509587675264 [label=SliceBackward0]
- 140509587675360 -> 140509587675264
- 140509587675360 [label=SliceBackward0]
- 140509587675456 -> 140509587675360
- 140509587675456 [label=SliceBackward0]
- 140509587675552 -> 140509587675456
- 140509587675552 [label=SliceBackward0]
- 140509587675648 -> 140509587675552
- 140509587675648 [label=NativeLayerNormBackward0]
- 140509587675744 -> 140509587675648
- 140509587675744 [label=AddBackward0]
- 140509587675936 -> 140509587675744
- 140509587675936 [label=NativeDropoutBackward0]
- 140509587676080 -> 140509587675936
- 140509587676080 [label=ViewBackward0]
- 140509587676176 -> 140509587676080
- 140509587676176 [label=AddmmBackward0]
- 140509587676272 -> 140509587676176
- 140509587676272 [label=ToCopyBackward0]
- 140509587676464 -> 140509587676272
- 140509590916608 [label="encoder.layer.1.attention.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590916608 -> 140509587676464
- 140509587676464 [label=AccumulateGrad]
- 140509587676224 -> 140509587676176
- 140509587676224 [label=ViewBackward0]
- 140509587676512 -> 140509587676224
- 140509587676512 [label=ViewBackward0]
- 140509587676608 -> 140509587676512
- 140509587676608 [label=CloneBackward0]
- 140509587676704 -> 140509587676608
- 140509587676704 [label=PermuteBackward0]
- 140509587676800 -> 140509587676704
- 140509587676800 [label=UnsafeViewBackward0]
- 140509587676896 -> 140509587676800
- 140509587676896 [label=BmmBackward0]
- 140509587676992 -> 140509587676896
- 140509587676992 [label=ReshapeAliasBackward0]
- 140509587677136 -> 140509587676992
- 140509587677136 [label=ExpandBackward0]
- 140509587677040 -> 140509587677136
- 140509587677040 [label=ToCopyBackward0]
- 140517615505616 -> 140509587677040
- 140517615505616 [label=NativeDropoutBackward0]
- 140517615505712 -> 140517615505616
- 140517615505712 [label=SoftmaxBackward0]
- 140517615505808 -> 140517615505712
- 140517615505808 [label=AddBackward0]
- 140517615505904 -> 140517615505808
- 140517615505904 [label=DivBackward0]
- 140517615506000 -> 140517615505904
- 140517615506000 [label=UnsafeViewBackward0]
- 140517615506096 -> 140517615506000
- 140517615506096 [label=BmmBackward0]
- 140517615506192 -> 140517615506096
- 140517615506192 [label=ReshapeAliasBackward0]
- 140517615506336 -> 140517615506192
- 140517615506336 [label=ExpandBackward0]
- 140517615506432 -> 140517615506336
- 140517615506432 [label=PermuteBackward0]
- 140517615506528 -> 140517615506432
- 140517615506528 [label=ViewBackward0]
- 140517615506624 -> 140517615506528
- 140517615506624 [label=ViewBackward0]
- 140517615506720 -> 140517615506624
- 140517615506720 [label=AddmmBackward0]
- 140517615506816 -> 140517615506720
- 140517615506816 [label=ToCopyBackward0]
- 140517615507008 -> 140517615506816
- 140509590933808 [label="encoder.layer.1.attention.self.query.bias
- (768)" fillcolor=lightblue]
- 140509590933808 -> 140517615507008
- 140517615507008 [label=AccumulateGrad]
- 140517615506768 -> 140517615506720
- 140517615506768 [label=ViewBackward0]
- 140517615507056 -> 140517615506768
- 140517615507056 [label=ToCopyBackward0]
- 140509587675888 -> 140517615507056
- 140509587675888 [label=CatBackward0]
- 140517615507200 -> 140509587675888
- 140517615507200 [label=NativeLayerNormBackward0]
- 140517615507344 -> 140517615507200
- 140517615507344 [label=AddBackward0]
- 140517615507536 -> 140517615507344
- 140517615507536 [label=NativeDropoutBackward0]
- 140517615507680 -> 140517615507536
- 140517615507680 [label=ViewBackward0]
- 140517615507776 -> 140517615507680
- 140517615507776 [label=AddmmBackward0]
- 140517615507872 -> 140517615507776
- 140517615507872 [label=ToCopyBackward0]
- 140517615508064 -> 140517615507872
- 140509590934288 [label="encoder.layer.0.experts.output_query.dense.bias
- (768)" fillcolor=lightblue]
- 140509590934288 -> 140517615508064
- 140517615508064 [label=AccumulateGrad]
- 140517615507824 -> 140517615507776
- 140517615507824 [label=ViewBackward0]
- 140517615508112 -> 140517615507824
- 140517615508112 [label=GeluBackward0]
- 140517615508208 -> 140517615508112
- 140517615508208 [label=ViewBackward0]
- 140517615508304 -> 140517615508208
- 140517615508304 [label=AddmmBackward0]
- 140517615508400 -> 140517615508304
- 140517615508400 [label=ToCopyBackward0]
- 140517615508592 -> 140517615508400
- 140509590934528 [label="encoder.layer.0.experts.intermediate_query.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590934528 -> 140517615508592
- 140517615508592 [label=AccumulateGrad]
- 140517615508352 -> 140517615508304
- 140517615508352 [label=ViewBackward0]
- 140517615508640 -> 140517615508352
- 140517615508640 [label=ToCopyBackward0]
- 140517615507488 -> 140517615508640
- 140517615507488 [label=SliceBackward0]
- 140517615508784 -> 140517615507488
- 140517615508784 [label=SliceBackward0]
- 140517615508880 -> 140517615508784
- 140517615508880 [label=NativeLayerNormBackward0]
- 140517615508976 -> 140517615508880
- 140517615508976 [label=AddBackward0]
- 140517615509168 -> 140517615508976
- 140517615509168 [label=NativeDropoutBackward0]
- 140517615509312 -> 140517615509168
- 140517615509312 [label=ViewBackward0]
- 140517615509408 -> 140517615509312
- 140517615509408 [label=AddmmBackward0]
- 140517615509456 -> 140517615509408
- 140517615509456 [label=ToCopyBackward0]
- 140517615522048 -> 140517615509456
- 140509590936448 [label="encoder.layer.0.crossattention.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590936448 -> 140517615522048
- 140517615522048 [label=AccumulateGrad]
- 140517615509216 -> 140517615509408
- 140517615509216 [label=ViewBackward0]
- 140517615522096 -> 140517615509216
- 140517615522096 [label=ViewBackward0]
- 140517615522192 -> 140517615522096
- 140517615522192 [label=CloneBackward0]
- 140517615522288 -> 140517615522192
- 140517615522288 [label=PermuteBackward0]
- 140517615522384 -> 140517615522288
- 140517615522384 [label=UnsafeViewBackward0]
- 140517615522480 -> 140517615522384
- 140517615522480 [label=BmmBackward0]
- 140517615522576 -> 140517615522480
- 140517615522576 [label=ReshapeAliasBackward0]
- 140517615522720 -> 140517615522576
- 140517615522720 [label=ExpandBackward0]
- 140517615522816 -> 140517615522720
- 140517615522816 [label=ToCopyBackward0]
- 140517615522912 -> 140517615522816
- 140517615522912 [label=NativeDropoutBackward0]
- 140517615523008 -> 140517615522912
- 140517615523008 [label=SoftmaxBackward0]
- 140517615523104 -> 140517615523008
- 140517615523104 [label=AddBackward0]
- 140517615523200 -> 140517615523104
- 140517615523200 [label=DivBackward0]
- 140517615523296 -> 140517615523200
- 140517615523296 [label=UnsafeViewBackward0]
- 140517615523392 -> 140517615523296
- 140517615523392 [label=BmmBackward0]
- 140517615523488 -> 140517615523392
- 140517615523488 [label=ReshapeAliasBackward0]
- 140517615523632 -> 140517615523488
- 140517615523632 [label=ExpandBackward0]
- 140517615523728 -> 140517615523632
- 140517615523728 [label=PermuteBackward0]
- 140517615523824 -> 140517615523728
- 140517615523824 [label=ViewBackward0]
- 140517615523920 -> 140517615523824
- 140517615523920 [label=ViewBackward0]
- 140517615524016 -> 140517615523920
- 140517615524016 [label=AddmmBackward0]
- 140517615524112 -> 140517615524016
- 140517615524112 [label=ToCopyBackward0]
- 140517615524304 -> 140517615524112
- 140509590937168 [label="encoder.layer.0.crossattention.self.query.bias
- (768)" fillcolor=lightblue]
- 140509590937168 -> 140517615524304
- 140517615524304 [label=AccumulateGrad]
- 140517615524064 -> 140517615524016
- 140517615524064 [label=ViewBackward0]
- 140517615524352 -> 140517615524064
- 140517615524352 [label=ToCopyBackward0]
- 140517615509120 -> 140517615524352
- 140517615509120 [label=SliceBackward0]
- 140517615524496 -> 140517615509120
- 140517615524496 [label=SliceBackward0]
- 140517615524592 -> 140517615524496
- 140517615524592 [label=SliceBackward0]
- 140517615524688 -> 140517615524592
- 140517615524688 [label=NativeLayerNormBackward0]
- 140517615524784 -> 140517615524688
- 140517615524784 [label=AddBackward0]
- 140517615524976 -> 140517615524784
- 140517615524976 [label=NativeDropoutBackward0]
- 140517615525120 -> 140517615524976
- 140517615525120 [label=ViewBackward0]
- 140517615525216 -> 140517615525120
- 140517615525216 [label=AddmmBackward0]
- 140517615525312 -> 140517615525216
- 140517615525312 [label=ToCopyBackward0]
- 140517615525504 -> 140517615525312
- 140509590945936 [label="encoder.layer.0.attention.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590945936 -> 140517615525504
- 140517615525504 [label=AccumulateGrad]
- 140517615525264 -> 140517615525216
- 140517615525264 [label=ViewBackward0]
- 140517615525552 -> 140517615525264
- 140517615525552 [label=ViewBackward0]
- 140517615525648 -> 140517615525552
- 140517615525648 [label=CloneBackward0]
- 140517615525744 -> 140517615525648
- 140517615525744 [label=PermuteBackward0]
- 140517615525840 -> 140517615525744
- 140517615525840 [label=UnsafeViewBackward0]
- 140517615525456 -> 140517615525840
- 140517615525456 [label=BmmBackward0]
- 140517615538384 -> 140517615525456
- 140517615538384 [label=ReshapeAliasBackward0]
- 140517615538528 -> 140517615538384
- 140517615538528 [label=ExpandBackward0]
- 140517615538624 -> 140517615538528
- 140517615538624 [label=ToCopyBackward0]
- 140517615538720 -> 140517615538624
- 140517615538720 [label=NativeDropoutBackward0]
- 140517615538816 -> 140517615538720
- 140517615538816 [label=SoftmaxBackward0]
- 140517615538912 -> 140517615538816
- 140517615538912 [label=AddBackward0]
- 140517615539008 -> 140517615538912
- 140517615539008 [label=DivBackward0]
- 140517615539104 -> 140517615539008
- 140517615539104 [label=UnsafeViewBackward0]
- 140517615539200 -> 140517615539104
- 140517615539200 [label=BmmBackward0]
- 140517615539296 -> 140517615539200
- 140517615539296 [label=ReshapeAliasBackward0]
- 140517615539440 -> 140517615539296
- 140517615539440 [label=ExpandBackward0]
- 140517615539536 -> 140517615539440
- 140517615539536 [label=PermuteBackward0]
- 140517615539632 -> 140517615539536
- 140517615539632 [label=ViewBackward0]
- 140517615539728 -> 140517615539632
- 140517615539728 [label=ViewBackward0]
- 140517615539824 -> 140517615539728
- 140517615539824 [label=AddmmBackward0]
- 140517615539920 -> 140517615539824
- 140517615539920 [label=ToCopyBackward0]
- 140517615540112 -> 140517615539920
- 140509590600896 [label="encoder.layer.0.attention.self.query.bias
- (768)" fillcolor=lightblue]
- 140509590600896 -> 140517615540112
- 140517615540112 [label=AccumulateGrad]
- 140517615539872 -> 140517615539824
- 140517615539872 [label=ViewBackward0]
- 140517615540160 -> 140517615539872
- 140517615540160 [label=ToCopyBackward0]
- 140517615524928 -> 140517615540160
- 140517615524928 [label=NativeDropoutBackward0]
- 140517615540304 -> 140517615524928
- 140517615540304 [label=NativeLayerNormBackward0]
- 140517615540400 -> 140517615540304
- 140517615540400 [label=CatBackward0]
- 140517615540592 -> 140517615540400
- 140517615540592 [label=ExpandBackward0]
- 140517615540736 -> 140517615540592
- 140509590947296 [label="
- (1, 32, 768)" fillcolor=lightblue]
- 140509590947296 -> 140517615540736
- 140517615540736 [label=AccumulateGrad]
- 140517615540544 -> 140517615540400
- 140517615540544 [label=AddBackward0]
- 140517615540784 -> 140517615540544
- 140517615540784 [label=EmbeddingBackward0]
- 140517615540928 -> 140517615540784
- 140509590947856 [label="embeddings.word_embeddings.weight
- (30523, 768)" fillcolor=lightblue]
- 140509590947856 -> 140517615540928
- 140517615540928 [label=AccumulateGrad]
- 140517615540832 -> 140517615540544
- 140517615540832 [label=EmbeddingBackward0]
- 140517615540976 -> 140517615540832
- 140509939919504 [label="embeddings.position_embeddings.weight
- (512, 768)" fillcolor=lightblue]
- 140509939919504 -> 140517615540976
- 140517615540976 [label=AccumulateGrad]
- 140517615540352 -> 140517615540304
- 140509590958304 [label="embeddings.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590958304 -> 140517615540352
- 140517615540352 [label=AccumulateGrad]
- 140517615540016 -> 140517615540304
- 140509590946656 [label="embeddings.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590946656 -> 140517615540016
- 140517615540016 [label=AccumulateGrad]
- 140517615539344 -> 140517615539824
- 140517615539344 [label=TBackward0]
- 140517615540064 -> 140517615539344
- 140517615540064 [label=ToCopyBackward0]
- 140517615540496 -> 140517615540064
- 140509986890912 [label="encoder.layer.0.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509986890912 -> 140517615540496
- 140517615540496 [label=AccumulateGrad]
- 140517615539248 -> 140517615539200
- 140517615539248 [label=ReshapeAliasBackward0]
- 140517615539584 -> 140517615539248
- 140517615539584 [label=ExpandBackward0]
- 140517615539776 -> 140517615539584
- 140517615539776 [label=TransposeBackward0]
- 140517615540256 -> 140517615539776
- 140517615540256 [label=PermuteBackward0]
- 140517615541024 -> 140517615540256
- 140517615541024 [label=ViewBackward0]
- 140517615540208 -> 140517615541024
- 140517615540208 [label=ViewBackward0]
- 140517615540640 -> 140517615540208
- 140517615540640 [label=AddmmBackward0]
- 140517615541120 -> 140517615540640
- 140517615541120 [label=ToCopyBackward0]
- 140517615541312 -> 140517615541120
- 140509590946096 [label="encoder.layer.0.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590946096 -> 140517615541312
- 140517615541312 [label=AccumulateGrad]
- 140517615540880 -> 140517615540640
- 140517615540880 [label=ViewBackward0]
- 140517615541360 -> 140517615540880
- 140517615541360 [label=ToCopyBackward0]
- 140517615524928 -> 140517615541360
- 140517615539392 -> 140517615540640
- 140517615539392 [label=TBackward0]
- 140517615541216 -> 140517615539392
- 140517615541216 [label=ToCopyBackward0]
- 140517615541504 -> 140517615541216
- 140509590600816 [label="encoder.layer.0.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509590600816 -> 140517615541504
- 140517615541504 [label=AccumulateGrad]
- 140517615538336 -> 140517615525456
- 140517615538336 [label=ReshapeAliasBackward0]
- 140517615538672 -> 140517615538336
- 140517615538672 [label=ExpandBackward0]
- 140517615538864 -> 140517615538672
- 140517615538864 [label=PermuteBackward0]
- 140517615539056 -> 140517615538864
- 140517615539056 [label=ViewBackward0]
- 140517615538432 -> 140517615539056
- 140517615538432 [label=ViewBackward0]
- 140517615539680 -> 140517615538432
- 140517615539680 [label=AddmmBackward0]
- 140517615540448 -> 140517615539680
- 140517615540448 [label=ToCopyBackward0]
- 140517615541456 -> 140517615540448
- 140509590945856 [label="encoder.layer.0.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590945856 -> 140517615541456
- 140517615541456 [label=AccumulateGrad]
- 140517615539968 -> 140517615539680
- 140517615539968 [label=ViewBackward0]
- 140517615541264 -> 140517615539968
- 140517615541264 [label=ToCopyBackward0]
- 140517615524928 -> 140517615541264
- 140517615538480 -> 140517615539680
- 140517615538480 [label=TBackward0]
- 140517615541072 -> 140517615538480
- 140517615541072 [label=ToCopyBackward0]
- 140517615541408 -> 140517615541072
- 140509590946176 [label="encoder.layer.0.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509590946176 -> 140517615541408
- 140517615541408 [label=AccumulateGrad]
- 140517615525024 -> 140517615525216
- 140517615525024 [label=TBackward0]
- 140517615525696 -> 140517615525024
- 140517615525696 [label=ToCopyBackward0]
- 140517615525792 -> 140517615525696
- 140509987117712 [label="encoder.layer.0.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509987117712 -> 140517615525792
- 140517615525792 [label=AccumulateGrad]
- 140517615524928 -> 140517615524784
- 140517615524736 -> 140517615524688
- 140509590937328 [label="encoder.layer.0.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590937328 -> 140517615524736
- 140517615524736 [label=AccumulateGrad]
- 140517615524208 -> 140517615524688
- 140509590937408 [label="encoder.layer.0.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590937408 -> 140517615524208
- 140517615524208 [label=AccumulateGrad]
- 140517615523536 -> 140517615524016
- 140517615523536 [label=TBackward0]
- 140517615524256 -> 140517615523536
- 140517615524256 [label=ToCopyBackward0]
- 140517615524640 -> 140517615524256
- 140509590937088 [label="encoder.layer.0.crossattention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590937088 -> 140517615524640
- 140517615524640 [label=AccumulateGrad]
- 140517615523440 -> 140517615523392
- 140517615523440 [label=ReshapeAliasBackward0]
- 140517615523776 -> 140517615523440
- 140517615523776 [label=ExpandBackward0]
- 140517615523968 -> 140517615523776
- 140517615523968 [label=TransposeBackward0]
- 140517615524448 -> 140517615523968
- 140517615524448 [label=PermuteBackward0]
- 140517615524880 -> 140517615524448
- 140517615524880 [label=ViewBackward0]
- 140517615524400 -> 140517615524880
- 140517615524400 [label=ViewBackward0]
- 140517615525168 -> 140517615524400
- 140517615525168 [label=AddmmBackward0]
- 140517615525408 -> 140517615525168
- 140517615525408 [label=ToCopyBackward0]
- 140517615538288 -> 140517615525408
- 140509590936928 [label="encoder.layer.0.crossattention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590936928 -> 140517615538288
- 140517615538288 [label=AccumulateGrad]
- 140517615525360 -> 140517615525168
- 140517615525360 [label=ViewBackward0]
- 140517615538768 -> 140517615525360
- 140517615538768 [label=ToCopyBackward0]
- 140517615539152 -> 140517615538768
- 140517615539152 [label=NativeLayerNormBackward0]
- 140517615540688 -> 140517615539152
- 140509590598736 [label="
- (1408)" fillcolor=lightblue]
- 140509590598736 -> 140517615540688
- 140517615540688 [label=AccumulateGrad]
- 140517615539488 -> 140517615539152
- 140509590598976 [label="
- (1408)" fillcolor=lightblue]
- 140509590598976 -> 140517615539488
- 140517615539488 [label=AccumulateGrad]
- 140517615523584 -> 140517615525168
- 140517615523584 [label=TBackward0]
- 140517615538240 -> 140517615523584
- 140517615538240 [label=ToCopyBackward0]
- 140517615541168 -> 140517615538240
- 140509590936848 [label="encoder.layer.0.crossattention.self.key.weight
- (768, 1408)" fillcolor=lightblue]
- 140509590936848 -> 140517615541168
- 140517615541168 [label=AccumulateGrad]
- 140517615522528 -> 140517615522480
- 140517615522528 [label=ReshapeAliasBackward0]
- 140517615522864 -> 140517615522528
- 140517615522864 [label=ExpandBackward0]
- 140517615523056 -> 140517615522864
- 140517615523056 [label=PermuteBackward0]
- 140517615523248 -> 140517615523056
- 140517615523248 [label=ViewBackward0]
- 140517615522624 -> 140517615523248
- 140517615522624 [label=ViewBackward0]
- 140517615523872 -> 140517615522624
- 140517615523872 [label=AddmmBackward0]
- 140517615524544 -> 140517615523872
- 140517615524544 [label=ToCopyBackward0]
- 140517615525600 -> 140517615524544
- 140509590936688 [label="encoder.layer.0.crossattention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590936688 -> 140517615525600
- 140517615525600 [label=AccumulateGrad]
- 140517615524160 -> 140517615523872
- 140517615524160 [label=ViewBackward0]
- 140517615525072 -> 140517615524160
- 140517615525072 [label=ToCopyBackward0]
- 140517615539152 -> 140517615525072
- 140517615522672 -> 140517615523872
- 140517615522672 [label=TBackward0]
- 140517615538576 -> 140517615522672
- 140517615538576 [label=ToCopyBackward0]
- 140517615538960 -> 140517615538576
- 140509590936608 [label="encoder.layer.0.crossattention.self.value.weight
- (768, 1408)" fillcolor=lightblue]
- 140509590936608 -> 140517615538960
- 140517615538960 [label=AccumulateGrad]
- 140517615521856 -> 140517615509408
- 140517615521856 [label=TBackward0]
- 140517615522240 -> 140517615521856
- 140517615522240 [label=ToCopyBackward0]
- 140517615522432 -> 140517615522240
- 140509590936368 [label="encoder.layer.0.crossattention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590936368 -> 140517615522432
- 140517615522432 [label=AccumulateGrad]
- 140517615509120 -> 140517615508976
- 140517615508928 -> 140517615508880
- 140509590936128 [label="encoder.layer.0.crossattention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590936128 -> 140517615508928
- 140517615508928 [label=AccumulateGrad]
- 140517615508496 -> 140517615508880
- 140509590936208 [label="encoder.layer.0.crossattention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590936208 -> 140517615508496
- 140517615508496 [label=AccumulateGrad]
- 140517615508016 -> 140517615508304
- 140517615508016 [label=TBackward0]
- 140517615508544 -> 140517615508016
- 140517615508544 [label=ToCopyBackward0]
- 140517615509024 -> 140517615508544
- 140509590934448 [label="encoder.layer.0.experts.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590934448 -> 140517615509024
- 140517615509024 [label=AccumulateGrad]
- 140517615507584 -> 140517615507776
- 140517615507584 [label=TBackward0]
- 140517615508256 -> 140517615507584
- 140517615508256 [label=ToCopyBackward0]
- 140517615508736 -> 140517615508256
- 140509590934208 [label="encoder.layer.0.experts.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590934208 -> 140517615508736
- 140517615508736 [label=AccumulateGrad]
- 140517615507488 -> 140517615507344
- 140517615507296 -> 140517615507200
- 140509590933968 [label="encoder.layer.0.experts.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590933968 -> 140517615507296
- 140517615507296 [label=AccumulateGrad]
- 140517615507248 -> 140517615507200
- 140509590934048 [label="encoder.layer.0.experts.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590934048 -> 140517615507248
- 140517615507248 [label=AccumulateGrad]
- 140517615506960 -> 140509587675888
- 140517615506960 [label=NativeLayerNormBackward0]
- 140517615507632 -> 140517615506960
- 140517615507632 [label=AddBackward0]
- 140517615508448 -> 140517615507632
- 140517615508448 [label=NativeDropoutBackward0]
- 140517615508160 -> 140517615508448
- 140517615508160 [label=ViewBackward0]
- 140517615508688 -> 140517615508160
- 140517615508688 [label=AddmmBackward0]
- 140517615509360 -> 140517615508688
- 140517615509360 [label=ToCopyBackward0]
- 140517615522000 -> 140517615509360
- 140509590935728 [label="encoder.layer.0.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590935728 -> 140517615522000
- 140517615522000 [label=AccumulateGrad]
- 140517615509264 -> 140517615508688
- 140517615509264 [label=ViewBackward0]
- 140517615522144 -> 140517615509264
- 140517615522144 [label=GeluBackward0]
- 140517615523152 -> 140517615522144
- 140517615523152 [label=ViewBackward0]
- 140517615523680 -> 140517615523152
- 140517615523680 [label=AddmmBackward0]
- 140517615524832 -> 140517615523680
- 140517615524832 [label=ToCopyBackward0]
- 140517615541552 -> 140517615524832
- 140509590935968 [label="encoder.layer.0.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590935968 -> 140517615541552
- 140517615541552 [label=AccumulateGrad]
- 140517615522768 -> 140517615523680
- 140517615522768 [label=ViewBackward0]
- 140517615541792 -> 140517615522768
- 140517615541792 [label=ToCopyBackward0]
- 140517615507968 -> 140517615541792
- 140517615507968 [label=SliceBackward0]
- 140517615541936 -> 140517615507968
- 140517615541936 [label=SliceBackward0]
- 140517615542032 -> 140517615541936
- 140517615542032 [label=SliceBackward0]
- 140517615524688 -> 140517615542032
- 140517615541696 -> 140517615523680
- 140517615541696 [label=TBackward0]
- 140517615541600 -> 140517615541696
- 140517615541600 [label=ToCopyBackward0]
- 140517615542128 -> 140517615541600
- 140509590935888 [label="encoder.layer.0.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590935888 -> 140517615542128
- 140517615542128 [label=AccumulateGrad]
- 140517615521904 -> 140517615508688
- 140517615521904 [label=TBackward0]
- 140517615523344 -> 140517615521904
- 140517615523344 [label=ToCopyBackward0]
- 140517615522960 -> 140517615523344
- 140509590935648 [label="encoder.layer.0.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590935648 -> 140517615522960
- 140517615522960 [label=AccumulateGrad]
- 140517615507968 -> 140517615507632
- 140517615507440 -> 140517615506960
- 140509590935408 [label="encoder.layer.0.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590935408 -> 140517615507440
- 140517615507440 [label=AccumulateGrad]
- 140517615507392 -> 140517615506960
- 140509590935488 [label="encoder.layer.0.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590935488 -> 140517615507392
- 140517615507392 [label=AccumulateGrad]
- 140517615506240 -> 140517615506720
- 140517615506240 [label=TBackward0]
- 140517615506912 -> 140517615506240
- 140517615506912 [label=ToCopyBackward0]
- 140517615507920 -> 140517615506912
- 140509590933728 [label="encoder.layer.1.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590933728 -> 140517615507920
- 140517615507920 [label=AccumulateGrad]
- 140517615506144 -> 140517615506096
- 140517615506144 [label=ReshapeAliasBackward0]
- 140517615506480 -> 140517615506144
- 140517615506480 [label=ExpandBackward0]
- 140517615506672 -> 140517615506480
- 140517615506672 [label=TransposeBackward0]
- 140517615507152 -> 140517615506672
- 140517615507152 [label=PermuteBackward0]
- 140517615509072 -> 140517615507152
- 140517615509072 [label=ViewBackward0]
- 140517615507104 -> 140517615509072
- 140517615507104 [label=ViewBackward0]
- 140517615522336 -> 140517615507104
- 140517615522336 [label=AddmmBackward0]
- 140517615506288 -> 140517615522336
- 140517615506288 [label=ToCopyBackward0]
- 140517615541840 -> 140517615506288
- 140509590917008 [label="encoder.layer.1.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590917008 -> 140517615541840
- 140517615541840 [label=AccumulateGrad]
- 140517615541744 -> 140517615522336
- 140517615541744 [label=ViewBackward0]
- 140517615542176 -> 140517615541744
- 140517615542176 [label=ToCopyBackward0]
- 140509587675888 -> 140517615542176
- 140517615541888 -> 140517615522336
- 140517615541888 [label=TBackward0]
- 140517615542080 -> 140517615541888
- 140517615542080 [label=ToCopyBackward0]
- 140517615542224 -> 140517615542080
- 140509590933568 [label="encoder.layer.1.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509590933568 -> 140517615542224
- 140517615542224 [label=AccumulateGrad]
- 140509587676944 -> 140509587676896
- 140509587676944 [label=ReshapeAliasBackward0]
- 140509587677088 -> 140509587676944
- 140509587677088 [label=ExpandBackward0]
- 140517615505760 -> 140509587677088
- 140517615505760 [label=PermuteBackward0]
- 140517615505952 -> 140517615505760
- 140517615505952 [label=ViewBackward0]
- 140517615505472 -> 140517615505952
- 140517615505472 [label=ViewBackward0]
- 140517615506576 -> 140517615505472
- 140517615506576 [label=AddmmBackward0]
- 140517615507728 -> 140517615506576
- 140517615507728 [label=ToCopyBackward0]
- 140517615541648 -> 140517615507728
- 140509590916848 [label="encoder.layer.1.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590916848 -> 140517615541648
- 140517615541648 [label=AccumulateGrad]
- 140517615506864 -> 140517615506576
- 140517615506864 [label=ViewBackward0]
- 140517615521952 -> 140517615506864
- 140517615521952 [label=ToCopyBackward0]
- 140509587675888 -> 140517615521952
- 140517615505520 -> 140517615506576
- 140517615505520 [label=TBackward0]
- 140517615541984 -> 140517615505520
- 140517615541984 [label=ToCopyBackward0]
- 140517615591632 -> 140517615541984
- 140509590916768 [label="encoder.layer.1.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509590916768 -> 140517615591632
- 140517615591632 [label=AccumulateGrad]
- 140509587675984 -> 140509587676176
- 140509587675984 [label=TBackward0]
- 140509587676656 -> 140509587675984
- 140509587676656 [label=ToCopyBackward0]
- 140509587676848 -> 140509587676656
- 140509590916528 [label="encoder.layer.1.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590916528 -> 140509587676848
- 140509587676848 [label=AccumulateGrad]
- 140509587675888 -> 140509587675744
- 140509587675696 -> 140509587675648
- 140509590916288 [label="encoder.layer.1.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590916288 -> 140509587675696
- 140509587675696 [label=AccumulateGrad]
- 140509587674976 -> 140509587675648
- 140509590916368 [label="encoder.layer.1.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590916368 -> 140509587674976
- 140509587674976 [label=AccumulateGrad]
- 140509587674496 -> 140509587674784
- 140509587674496 [label=TBackward0]
- 140509587675024 -> 140509587674496
- 140509587675024 [label=ToCopyBackward0]
- 140509587675408 -> 140509587675024
- 140509590914608 [label="encoder.layer.1.experts.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590914608 -> 140509587675408
- 140509587675408 [label=AccumulateGrad]
- 140509587674064 -> 140509587674256
- 140509587674064 [label=TBackward0]
- 140509587674736 -> 140509587674064
- 140509587674736 [label=ToCopyBackward0]
- 140509587675216 -> 140509587674736
- 140509590914368 [label="encoder.layer.1.experts.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590914368 -> 140509587675216
- 140509587675216 [label=AccumulateGrad]
- 140509587673968 -> 140509587673824
- 140509587673776 -> 140509587673680
- 140509590914128 [label="encoder.layer.1.experts.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590914128 -> 140509587673776
- 140509587673776 [label=AccumulateGrad]
- 140509587673728 -> 140509587673680
- 140509590914208 [label="encoder.layer.1.experts.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590914208 -> 140509587673728
- 140509587673728 [label=AccumulateGrad]
- 140509587673440 -> 140509587658304
- 140509587673440 [label=NativeLayerNormBackward0]
- 140509587674112 -> 140509587673440
- 140509587674112 [label=AddBackward0]
- 140509587674928 -> 140509587674112
- 140509587674928 [label=NativeDropoutBackward0]
- 140509587674640 -> 140509587674928
- 140509587674640 [label=ViewBackward0]
- 140509587675168 -> 140509587674640
- 140509587675168 [label=AddmmBackward0]
- 140509587675840 -> 140509587675168
- 140509587675840 [label=ToCopyBackward0]
- 140509587676368 -> 140509587675840
- 140509590915888 [label="encoder.layer.1.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590915888 -> 140509587676368
- 140509587676368 [label=AccumulateGrad]
- 140509587675792 -> 140509587675168
- 140509587675792 [label=ViewBackward0]
- 140509587676752 -> 140509587675792
- 140509587676752 [label=GeluBackward0]
- 140509587676560 -> 140509587676752
- 140509587676560 [label=ViewBackward0]
- 140509587676320 -> 140509587676560
- 140509587676320 [label=AddmmBackward0]
- 140517615506048 -> 140509587676320
- 140517615506048 [label=ToCopyBackward0]
- 140517615508832 -> 140517615506048
- 140509590916128 [label="encoder.layer.1.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590916128 -> 140517615508832
- 140517615508832 [label=AccumulateGrad]
- 140517615505856 -> 140509587676320
- 140517615505856 [label=ViewBackward0]
- 140517615591728 -> 140517615505856
- 140517615591728 [label=ToCopyBackward0]
- 140509587674448 -> 140517615591728
- 140509587674448 [label=SliceBackward0]
- 140517615591776 -> 140509587674448
- 140517615591776 [label=SliceBackward0]
- 140517615591872 -> 140517615591776
- 140517615591872 [label=SliceBackward0]
- 140509587675648 -> 140517615591872
- 140517615505568 -> 140509587676320
- 140517615505568 [label=TBackward0]
- 140517615591536 -> 140517615505568
- 140517615591536 [label=ToCopyBackward0]
- 140517615591968 -> 140517615591536
- 140509590916048 [label="encoder.layer.1.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590916048 -> 140517615591968
- 140517615591968 [label=AccumulateGrad]
- 140509587675600 -> 140509587675168
- 140509587675600 [label=TBackward0]
- 140509587676128 -> 140509587675600
- 140509587676128 [label=ToCopyBackward0]
- 140517615506384 -> 140509587676128
- 140509590915808 [label="encoder.layer.1.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590915808 -> 140517615506384
- 140517615506384 [label=AccumulateGrad]
- 140509587674448 -> 140509587674112
- 140509587673920 -> 140509587673440
- 140509590915568 [label="encoder.layer.1.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590915568 -> 140509587673920
- 140509587673920 [label=AccumulateGrad]
- 140509587673872 -> 140509587673440
- 140509590915648 [label="encoder.layer.1.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590915648 -> 140509587673872
- 140509587673872 [label=AccumulateGrad]
- 140509587673152 -> 140509587660368
- 140509587673152 [label=TBackward0]
- 140509587673392 -> 140509587673152
- 140509587673392 [label=ToCopyBackward0]
- 140509587674400 -> 140509587673392
- 140509590913888 [label="encoder.layer.2.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590913888 -> 140509587674400
- 140509587674400 [label=AccumulateGrad]
- 140509587660272 -> 140509587660224
- 140509587660272 [label=ReshapeAliasBackward0]
- 140509587660608 -> 140509587660272
- 140509587660608 [label=ExpandBackward0]
- 140509587660704 -> 140509587660608
- 140509587660704 [label=TransposeBackward0]
- 140509587673632 -> 140509587660704
- 140509587673632 [label=PermuteBackward0]
- 140509587675504 -> 140509587673632
- 140509587675504 [label=ViewBackward0]
- 140509587673584 -> 140509587675504
- 140509587673584 [label=ViewBackward0]
- 140509587676416 -> 140509587673584
- 140509587676416 [label=AddmmBackward0]
- 140517615505664 -> 140509587676416
- 140517615505664 [label=ToCopyBackward0]
- 140517615591680 -> 140517615505664
- 140509590913728 [label="encoder.layer.2.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590913728 -> 140517615591680
- 140517615591680 [label=AccumulateGrad]
- 140509587673200 -> 140509587676416
- 140509587673200 [label=ViewBackward0]
- 140517615592016 -> 140509587673200
- 140517615592016 [label=ToCopyBackward0]
- 140509587658304 -> 140517615592016
- 140517615591488 -> 140509587676416
- 140517615591488 [label=TBackward0]
- 140517615591584 -> 140517615591488
- 140517615591584 [label=ToCopyBackward0]
- 140517615592160 -> 140517615591584
- 140509590913648 [label="encoder.layer.2.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509590913648 -> 140517615592160
- 140517615592160 [label=AccumulateGrad]
- 140509587659360 -> 140509587659312
- 140509587659360 [label=ReshapeAliasBackward0]
- 140509587659696 -> 140509587659360
- 140509587659696 [label=ExpandBackward0]
- 140509587659888 -> 140509587659696
- 140509587659888 [label=PermuteBackward0]
- 140509587660080 -> 140509587659888
- 140509587660080 [label=ViewBackward0]
- 140509587659456 -> 140509587660080
- 140509587659456 [label=ViewBackward0]
- 140509587660416 -> 140509587659456
- 140509587660416 [label=AddmmBackward0]
- 140509587659504 -> 140509587660416
- 140509587659504 [label=ToCopyBackward0]
- 140509587676032 -> 140509587659504
- 140509590913488 [label="encoder.layer.2.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590913488 -> 140509587676032
- 140509587676032 [label=AccumulateGrad]
- 140509587674208 -> 140509587660416
- 140509587674208 [label=ViewBackward0]
- 140517615591920 -> 140509587674208
- 140517615591920 [label=ToCopyBackward0]
- 140509587658304 -> 140517615591920
- 140509587673344 -> 140509587660416
- 140509587673344 [label=TBackward0]
- 140517615591824 -> 140509587673344
- 140517615591824 [label=ToCopyBackward0]
- 140517615592064 -> 140517615591824
- 140509590913408 [label="encoder.layer.2.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509590913408 -> 140517615592064
- 140517615592064 [label=AccumulateGrad]
- 140509587658400 -> 140509587658592
- 140509587658400 [label=TBackward0]
- 140509587659072 -> 140509587658400
- 140509587659072 [label=ToCopyBackward0]
- 140509587659264 -> 140509587659072
- 140509590913168 [label="encoder.layer.2.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590913168 -> 140509587659264
- 140509587659264 [label=AccumulateGrad]
- 140509587658304 -> 140509587658160
- 140509587658112 -> 140509587658064
- 140509590904640 [label="encoder.layer.2.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590904640 -> 140509587658112
- 140509587658112 [label=AccumulateGrad]
- 140509587657584 -> 140509587658064
- 140509590904720 [label="encoder.layer.2.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590904720 -> 140509587657584
- 140509587657584 [label=AccumulateGrad]
- 140509587656912 -> 140509587657392
- 140509587656912 [label=TBackward0]
- 140509587657632 -> 140509587656912
- 140509587657632 [label=ToCopyBackward0]
- 140509587658016 -> 140509587657632
- 140509590904400 [label="encoder.layer.2.crossattention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590904400 -> 140509587658016
- 140509587658016 [label=AccumulateGrad]
- 140509587656816 -> 140509587644368
- 140509587656816 [label=ReshapeAliasBackward0]
- 140509587657152 -> 140509587656816
- 140509587657152 [label=ExpandBackward0]
- 140509587657344 -> 140509587657152
- 140509587657344 [label=TransposeBackward0]
- 140509587657824 -> 140509587657344
- 140509587657824 [label=PermuteBackward0]
- 140509587658256 -> 140509587657824
- 140509587658256 [label=ViewBackward0]
- 140509587657776 -> 140509587658256
- 140509587657776 [label=ViewBackward0]
- 140509587658544 -> 140509587657776
- 140509587658544 [label=AddmmBackward0]
- 140509587658784 -> 140509587658544
- 140509587658784 [label=ToCopyBackward0]
- 140509587658976 -> 140509587658784
- 140509590904240 [label="encoder.layer.2.crossattention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590904240 -> 140509587658976
- 140509587658976 [label=AccumulateGrad]
- 140509587658736 -> 140509587658544
- 140509587658736 [label=ViewBackward0]
- 140509587659792 -> 140509587658736
- 140509587659792 [label=ToCopyBackward0]
- 140517615539152 -> 140509587659792
- 140509587656960 -> 140509587658544
- 140509587656960 [label=TBackward0]
- 140509587659600 -> 140509587656960
- 140509587659600 [label=ToCopyBackward0]
- 140509587660512 -> 140509587659600
- 140509590904160 [label="encoder.layer.2.crossattention.self.key.weight
- (768, 1408)" fillcolor=lightblue]
- 140509590904160 -> 140509587660512
- 140509587660512 [label=AccumulateGrad]
- 140509587643552 -> 140509587643504
- 140509587643552 [label=ReshapeAliasBackward0]
- 140509587643888 -> 140509587643552
- 140509587643888 [label=ExpandBackward0]
- 140509587644080 -> 140509587643888
- 140509587644080 [label=PermuteBackward0]
- 140509587644272 -> 140509587644080
- 140509587644272 [label=ViewBackward0]
- 140509587675312 -> 140509587644272
- 140509587675312 [label=ViewBackward0]
- 140509587643696 -> 140509587675312
- 140509587643696 [label=AddmmBackward0]
- 140509587657536 -> 140509587643696
- 140509587657536 [label=ToCopyBackward0]
- 140509587659168 -> 140509587657536
- 140509590904000 [label="encoder.layer.2.crossattention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590904000 -> 140509587659168
- 140509587659168 [label=AccumulateGrad]
- 140509587657248 -> 140509587643696
- 140509587657248 [label=ViewBackward0]
- 140509587660176 -> 140509587657248
- 140509587660176 [label=ToCopyBackward0]
- 140517615539152 -> 140509587660176
- 140509587656768 -> 140509587643696
- 140509587656768 [label=TBackward0]
- 140509587658208 -> 140509587656768
- 140509587658208 [label=ToCopyBackward0]
- 140509587658448 -> 140509587658208
- 140509590903920 [label="encoder.layer.2.crossattention.self.value.weight
- (768, 1408)" fillcolor=lightblue]
- 140509590903920 -> 140509587658448
- 140509587658448 [label=AccumulateGrad]
- 140509587642592 -> 140509587642784
- 140509587642592 [label=TBackward0]
- 140509587643264 -> 140509587642592
- 140509587643264 [label=ToCopyBackward0]
- 140509587643456 -> 140509587643264
- 140509590903680 [label="encoder.layer.2.crossattention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590903680 -> 140509587643456
- 140509587643456 [label=AccumulateGrad]
- 140509587642496 -> 140509587642352
- 140509587642304 -> 140509587642256
- 140509590903440 [label="encoder.layer.2.crossattention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590903440 -> 140509587642304
- 140509587642304 [label=AccumulateGrad]
- 140509587641872 -> 140509587642256
- 140509590903520 [label="encoder.layer.2.crossattention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590903520 -> 140509587641872
- 140509587641872 [label=AccumulateGrad]
- 140509587641392 -> 140509587641680
- 140509587641392 [label=TBackward0]
- 140509587641920 -> 140509587641392
- 140509587641920 [label=ToCopyBackward0]
- 140509587642400 -> 140509587641920
- 140509590901760 [label="encoder.layer.2.experts.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590901760 -> 140509587642400
- 140509587642400 [label=AccumulateGrad]
- 140509587640960 -> 140509587641152
- 140509587640960 [label=TBackward0]
- 140509587641632 -> 140509587640960
- 140509587641632 [label=ToCopyBackward0]
- 140509587642112 -> 140509587641632
- 140509590901520 [label="encoder.layer.2.experts.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590901520 -> 140509587642112
- 140509587642112 [label=AccumulateGrad]
- 140509587640864 -> 140509587640720
- 140509587640672 -> 140509587640576
- 140509590901280 [label="encoder.layer.2.experts.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590901280 -> 140509587640672
- 140509587640672 [label=AccumulateGrad]
- 140509587640624 -> 140509587640576
- 140509590901360 [label="encoder.layer.2.experts.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590901360 -> 140509587640624
- 140509587640624 [label=AccumulateGrad]
- 140509587640480 -> 140509587625200
- 140509587640480 [label=NativeLayerNormBackward0]
- 140509587641008 -> 140509587640480
- 140509587641008 [label=AddBackward0]
- 140509587641824 -> 140509587641008
- 140509587641824 [label=NativeDropoutBackward0]
- 140509587641536 -> 140509587641824
- 140509587641536 [label=ViewBackward0]
- 140509587642064 -> 140509587641536
- 140509587642064 [label=AddmmBackward0]
- 140509587642928 -> 140509587642064
- 140509587642928 [label=ToCopyBackward0]
- 140509587643024 -> 140509587642928
- 140509590903040 [label="encoder.layer.2.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590903040 -> 140509587643024
- 140509587643024 [label=AccumulateGrad]
- 140509587642736 -> 140509587642064
- 140509587642736 [label=ViewBackward0]
- 140509587643168 -> 140509587642736
- 140509587643168 [label=GeluBackward0]
- 140509587644176 -> 140509587643168
- 140509587644176 [label=ViewBackward0]
- 140509587643648 -> 140509587644176
- 140509587643648 [label=AddmmBackward0]
- 140509587659984 -> 140509587643648
- 140509587659984 [label=ToCopyBackward0]
- 140517615592208 -> 140509587659984
- 140509590903280 [label="encoder.layer.2.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590903280 -> 140517615592208
- 140517615592208 [label=AccumulateGrad]
- 140509587657920 -> 140509587643648
- 140509587657920 [label=ViewBackward0]
- 140517615592304 -> 140509587657920
- 140517615592304 [label=ToCopyBackward0]
- 140509587641344 -> 140517615592304
- 140509587641344 [label=SliceBackward0]
- 140517615592448 -> 140509587641344
- 140517615592448 [label=SliceBackward0]
- 140517615592544 -> 140517615592448
- 140517615592544 [label=SliceBackward0]
- 140509587658064 -> 140517615592544
- 140509587657056 -> 140509587643648
- 140509587657056 [label=TBackward0]
- 140517615592112 -> 140509587657056
- 140517615592112 [label=ToCopyBackward0]
- 140517615592640 -> 140517615592112
- 140509590903200 [label="encoder.layer.2.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590903200 -> 140517615592640
- 140517615592640 [label=AccumulateGrad]
- 140509587642640 -> 140509587642064
- 140509587642640 [label=TBackward0]
- 140509587643792 -> 140509587642640
- 140509587643792 [label=ToCopyBackward0]
- 140509587658832 -> 140509587643792
- 140509590902960 [label="encoder.layer.2.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590902960 -> 140509587658832
- 140509587658832 [label=AccumulateGrad]
- 140509587641344 -> 140509587641008
- 140509587640816 -> 140509587640480
- 140509590902720 [label="encoder.layer.2.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590902720 -> 140509587640816
- 140509587640816 [label=AccumulateGrad]
- 140509587640768 -> 140509587640480
- 140509590902800 [label="encoder.layer.2.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590902800 -> 140509587640768
- 140509587640768 [label=AccumulateGrad]
- 140509587627264 -> 140509587627744
- 140509587627264 [label=TBackward0]
- 140509587640384 -> 140509587627264
- 140509587640384 [label=ToCopyBackward0]
- 140509587641296 -> 140509587640384
- 140509590901040 [label="encoder.layer.3.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590901040 -> 140509587641296
- 140509587641296 [label=AccumulateGrad]
- 140509587627168 -> 140509587627120
- 140509587627168 [label=ReshapeAliasBackward0]
- 140509587627504 -> 140509587627168
- 140509587627504 [label=ExpandBackward0]
- 140509587627696 -> 140509587627504
- 140509587627696 [label=TransposeBackward0]
- 140509587627888 -> 140509587627696
- 140509587627888 [label=PermuteBackward0]
- 140509587642448 -> 140509587627888
- 140509587642448 [label=ViewBackward0]
- 140509587640432 -> 140509587642448
- 140509587640432 [label=ViewBackward0]
- 140509587643360 -> 140509587640432
- 140509587643360 [label=AddmmBackward0]
- 140509587643984 -> 140509587643360
- 140509587643984 [label=ToCopyBackward0]
- 140517615592256 -> 140509587643984
- 140509590900880 [label="encoder.layer.3.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590900880 -> 140517615592256
- 140517615592256 [label=AccumulateGrad]
- 140509587640528 -> 140509587643360
- 140509587640528 [label=ViewBackward0]
- 140517615592688 -> 140509587640528
- 140517615592688 [label=ToCopyBackward0]
- 140509587625200 -> 140517615592688
- 140517615592352 -> 140509587643360
- 140517615592352 [label=TBackward0]
- 140517615592400 -> 140517615592352
- 140517615592400 [label=ToCopyBackward0]
- 140517615592832 -> 140517615592400
- 140509590900800 [label="encoder.layer.3.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509590900800 -> 140517615592832
- 140517615592832 [label=AccumulateGrad]
- 140509587626256 -> 140509587626208
- 140509587626256 [label=ReshapeAliasBackward0]
- 140509587626592 -> 140509587626256
- 140509587626592 [label=ExpandBackward0]
- 140509587626784 -> 140509587626592
- 140509587626784 [label=PermuteBackward0]
- 140509587626976 -> 140509587626784
- 140509587626976 [label=ViewBackward0]
- 140509587626352 -> 140509587626976
- 140509587626352 [label=ViewBackward0]
- 140509587627600 -> 140509587626352
- 140509587627600 [label=AddmmBackward0]
- 140509587627312 -> 140509587627600
- 140509587627312 [label=ToCopyBackward0]
- 140509587642976 -> 140509587627312
- 140509590896448 [label="encoder.layer.3.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590896448 -> 140509587642976
- 140509587642976 [label=AccumulateGrad]
- 140509587626400 -> 140509587627600
- 140509587626400 [label=ViewBackward0]
- 140517615592592 -> 140509587626400
- 140517615592592 [label=ToCopyBackward0]
- 140509587625200 -> 140517615592592
- 140509587641104 -> 140509587627600
- 140509587641104 [label=TBackward0]
- 140517615592496 -> 140509587641104
- 140517615592496 [label=ToCopyBackward0]
- 140517615592736 -> 140517615592496
- 140509590896368 [label="encoder.layer.3.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509590896368 -> 140517615592736
- 140517615592736 [label=AccumulateGrad]
- 140509587625296 -> 140509587625488
- 140509587625296 [label=TBackward0]
- 140509587625968 -> 140509587625296
- 140509587625968 [label=ToCopyBackward0]
- 140509587626160 -> 140509587625968
- 140509590896128 [label="encoder.layer.3.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590896128 -> 140509587626160
- 140509587626160 [label=AccumulateGrad]
- 140509587625200 -> 140509587625056
- 140509587625008 -> 140509587624960
- 140509590895888 [label="encoder.layer.3.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590895888 -> 140509587625008
- 140509587625008 [label=AccumulateGrad]
- 140509587624288 -> 140509587624960
- 140509590895968 [label="encoder.layer.3.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590895968 -> 140509587624288
- 140509587624288 [label=AccumulateGrad]
- 140509587624000 -> 140509587624096
- 140509587624000 [label=TBackward0]
- 140509587624336 -> 140509587624000
- 140509587624336 [label=ToCopyBackward0]
- 140509587624720 -> 140509587624336
- 140509590894208 [label="encoder.layer.3.experts.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590894208 -> 140509587624720
- 140509587624720 [label=AccumulateGrad]
- 140509587611024 -> 140509587611216
- 140509587611024 [label=TBackward0]
- 140509587611456 -> 140509587611024
- 140509587611456 [label=ToCopyBackward0]
- 140509587624528 -> 140509587611456
- 140509590893968 [label="encoder.layer.3.experts.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590893968 -> 140509587624528
- 140509587624528 [label=AccumulateGrad]
- 140509587610928 -> 140509587610784
- 140509587610736 -> 140509587610640
- 140509590893728 [label="encoder.layer.3.experts.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590893728 -> 140509587610736
- 140509587610736 [label=AccumulateGrad]
- 140509587610688 -> 140509587610640
- 140509590893808 [label="encoder.layer.3.experts.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590893808 -> 140509587610688
- 140509587610688 [label=AccumulateGrad]
- 140509587610400 -> 140509587607664
- 140509587610400 [label=NativeLayerNormBackward0]
- 140509587611072 -> 140509587610400
- 140509587611072 [label=AddBackward0]
- 140509587611600 -> 140509587611072
- 140509587611600 [label=NativeDropoutBackward0]
- 140509587624048 -> 140509587611600
- 140509587624048 [label=ViewBackward0]
- 140509587624480 -> 140509587624048
- 140509587624480 [label=AddmmBackward0]
- 140509587625152 -> 140509587624480
- 140509587625152 [label=ToCopyBackward0]
- 140509587625680 -> 140509587625152
- 140509590895488 [label="encoder.layer.3.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590895488 -> 140509587625680
- 140509587625680 [label=AccumulateGrad]
- 140509587625104 -> 140509587624480
- 140509587625104 [label=ViewBackward0]
- 140509587626064 -> 140509587625104
- 140509587626064 [label=GeluBackward0]
- 140509587625728 -> 140509587626064
- 140509587625728 [label=ViewBackward0]
- 140509587626688 -> 140509587625728
- 140509587626688 [label=AddmmBackward0]
- 140509587627072 -> 140509587626688
- 140509587627072 [label=ToCopyBackward0]
- 140509587642208 -> 140509587627072
- 140509590895728 [label="encoder.layer.3.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590895728 -> 140509587642208
- 140509587642208 [label=AccumulateGrad]
- 140509587626880 -> 140509587626688
- 140509587626880 [label=ViewBackward0]
- 140517615593024 -> 140509587626880
- 140517615593024 [label=ToCopyBackward0]
- 140509587611360 -> 140517615593024
- 140509587611360 [label=SliceBackward0]
- 140517615593072 -> 140509587611360
- 140517615593072 [label=SliceBackward0]
- 140517615593168 -> 140517615593072
- 140517615593168 [label=SliceBackward0]
- 140509587624960 -> 140517615593168
- 140509587625632 -> 140509587626688
- 140509587625632 [label=TBackward0]
- 140517615592784 -> 140509587625632
- 140517615592784 [label=ToCopyBackward0]
- 140517615593264 -> 140517615592784
- 140509590895648 [label="encoder.layer.3.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590895648 -> 140517615593264
- 140517615593264 [label=AccumulateGrad]
- 140509587624912 -> 140509587624480
- 140509587624912 [label=TBackward0]
- 140509587625872 -> 140509587624912
- 140509587625872 [label=ToCopyBackward0]
- 140509587627408 -> 140509587625872
- 140509590895408 [label="encoder.layer.3.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590895408 -> 140509587627408
- 140509587627408 [label=AccumulateGrad]
- 140509587611360 -> 140509587611072
- 140509587610880 -> 140509587610400
- 140509590895168 [label="encoder.layer.3.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590895168 -> 140509587610880
- 140509587610880 [label=AccumulateGrad]
- 140509587610832 -> 140509587610400
- 140509590895248 [label="encoder.layer.3.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590895248 -> 140509587610832
- 140509587610832 [label=AccumulateGrad]
- 140509587609680 -> 140509587610160
- 140509587609680 [label=TBackward0]
- 140509587610352 -> 140509587609680
- 140509587610352 [label=ToCopyBackward0]
- 140509587611168 -> 140509587610352
- 140509590893488 [label="encoder.layer.4.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590893488 -> 140509587611168
- 140509587611168 [label=AccumulateGrad]
- 140509587609584 -> 140509587609536
- 140509587609584 [label=ReshapeAliasBackward0]
- 140509587609920 -> 140509587609584
- 140509587609920 [label=ExpandBackward0]
- 140509587610112 -> 140509587609920
- 140509587610112 [label=TransposeBackward0]
- 140509587610592 -> 140509587610112
- 140509587610592 [label=PermuteBackward0]
- 140509587610544 -> 140509587610592
- 140509587610544 [label=ViewBackward0]
- 140509587624240 -> 140509587610544
- 140509587624240 [label=ViewBackward0]
- 140509587625440 -> 140509587624240
- 140509587625440 [label=AddmmBackward0]
- 140509587626496 -> 140509587625440
- 140509587626496 [label=ToCopyBackward0]
- 140517615592976 -> 140509587626496
- 140509590893328 [label="encoder.layer.4.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590893328 -> 140517615592976
- 140517615592976 [label=AccumulateGrad]
- 140509587624816 -> 140509587625440
- 140509587624816 [label=ViewBackward0]
- 140517615593312 -> 140509587624816
- 140517615593312 [label=ToCopyBackward0]
- 140509587607664 -> 140517615593312
- 140517615592880 -> 140509587625440
- 140517615592880 [label=TBackward0]
- 140517615592928 -> 140517615592880
- 140517615592928 [label=ToCopyBackward0]
- 140517615593456 -> 140517615592928
- 140509590893248 [label="encoder.layer.4.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509590893248 -> 140517615593456
- 140517615593456 [label=AccumulateGrad]
- 140509587608672 -> 140509587608624
- 140509587608672 [label=ReshapeAliasBackward0]
- 140509587609008 -> 140509587608672
- 140509587609008 [label=ExpandBackward0]
- 140509587609200 -> 140509587609008
- 140509587609200 [label=PermuteBackward0]
- 140509587609392 -> 140509587609200
- 140509587609392 [label=ViewBackward0]
- 140509587608768 -> 140509587609392
- 140509587608768 [label=ViewBackward0]
- 140509587610016 -> 140509587608768
- 140509587610016 [label=AddmmBackward0]
- 140509587609728 -> 140509587610016
- 140509587609728 [label=ToCopyBackward0]
- 140509587625344 -> 140509587609728
- 140509590893088 [label="encoder.layer.4.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590893088 -> 140509587625344
- 140509587625344 [label=AccumulateGrad]
- 140509587610304 -> 140509587610016
- 140509587610304 [label=ViewBackward0]
- 140517615593216 -> 140509587610304
- 140517615593216 [label=ToCopyBackward0]
- 140509587607664 -> 140517615593216
- 140509587608816 -> 140509587610016
- 140509587608816 [label=TBackward0]
- 140517615593120 -> 140509587608816
- 140517615593120 [label=ToCopyBackward0]
- 140517615593360 -> 140517615593120
- 140509590893008 [label="encoder.layer.4.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509590893008 -> 140517615593360
- 140517615593360 [label=AccumulateGrad]
- 140509587607712 -> 140509587607904
- 140509587607712 [label=TBackward0]
- 140509587608384 -> 140509587607712
- 140509587608384 [label=ToCopyBackward0]
- 140509587608576 -> 140509587608384
- 140509590892768 [label="encoder.layer.4.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590892768 -> 140509587608576
- 140509587608576 [label=AccumulateGrad]
- 140509587607664 -> 140509587595120
- 140509587595072 -> 140509587595024
- 140509590892608 [label="encoder.layer.4.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590892608 -> 140509587595072
- 140509587595072 [label=AccumulateGrad]
- 140509587594544 -> 140509587595024
- 140509590876048 [label="encoder.layer.4.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590876048 -> 140509587594544
- 140509587594544 [label=AccumulateGrad]
- 140509587593872 -> 140509587594352
- 140509587593872 [label=TBackward0]
- 140509587594592 -> 140509587593872
- 140509587594592 [label=ToCopyBackward0]
- 140509587594976 -> 140509587594592
- 140509590875808 [label="encoder.layer.4.crossattention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590875808 -> 140509587594976
- 140509587594976 [label=AccumulateGrad]
- 140509587593776 -> 140509587593728
- 140509587593776 [label=ReshapeAliasBackward0]
- 140509587594112 -> 140509587593776
- 140509587594112 [label=ExpandBackward0]
- 140509587594304 -> 140509587594112
- 140509587594304 [label=TransposeBackward0]
- 140509587594784 -> 140509587594304
- 140509587594784 [label=PermuteBackward0]
- 140509587595168 -> 140509587594784
- 140509587595168 [label=ViewBackward0]
- 140509587594736 -> 140509587595168
- 140509587594736 [label=ViewBackward0]
- 140509587607856 -> 140509587594736
- 140509587607856 [label=AddmmBackward0]
- 140509587608096 -> 140509587607856
- 140509587608096 [label=ToCopyBackward0]
- 140509587608288 -> 140509587608096
- 140509590875648 [label="encoder.layer.4.crossattention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590875648 -> 140509587608288
- 140509587608288 [label=AccumulateGrad]
- 140509587608048 -> 140509587607856
- 140509587608048 [label=ViewBackward0]
- 140509587609104 -> 140509587608048
- 140509587609104 [label=ToCopyBackward0]
- 140517615539152 -> 140509587609104
- 140509587607616 -> 140509587607856
- 140509587607616 [label=TBackward0]
- 140509587608912 -> 140509587607616
- 140509587608912 [label=ToCopyBackward0]
- 140509587609824 -> 140509587608912
- 140509590875568 [label="encoder.layer.4.crossattention.self.key.weight
- (768, 1408)" fillcolor=lightblue]
- 140509590875568 -> 140509587609824
- 140509587609824 [label=AccumulateGrad]
- 140509587592864 -> 140509587592816
- 140509587592864 [label=ReshapeAliasBackward0]
- 140509587593200 -> 140509587592864
- 140509587593200 [label=ExpandBackward0]
- 140509587593392 -> 140509587593200
- 140509587593392 [label=PermuteBackward0]
- 140509587593584 -> 140509587593392
- 140509587593584 [label=ViewBackward0]
- 140509587592960 -> 140509587593584
- 140509587592960 [label=ViewBackward0]
- 140509587594208 -> 140509587592960
- 140509587594208 [label=AddmmBackward0]
- 140509587594880 -> 140509587594208
- 140509587594880 [label=ToCopyBackward0]
- 140509587624624 -> 140509587594880
- 140509590875408 [label="encoder.layer.4.crossattention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590875408 -> 140509587624624
- 140509587624624 [label=AccumulateGrad]
- 140509587594496 -> 140509587594208
- 140509587594496 [label=ViewBackward0]
- 140509587609488 -> 140509587594496
- 140509587609488 [label=ToCopyBackward0]
- 140517615539152 -> 140509587609488
- 140509587593008 -> 140509587594208
- 140509587593008 [label=TBackward0]
- 140509587607760 -> 140509587593008
- 140509587607760 [label=ToCopyBackward0]
- 140509587608480 -> 140509587607760
- 140509590875328 [label="encoder.layer.4.crossattention.self.value.weight
- (768, 1408)" fillcolor=lightblue]
- 140509590875328 -> 140509587608480
- 140509587608480 [label=AccumulateGrad]
- 140509587591904 -> 140509587592096
- 140509587591904 [label=TBackward0]
- 140509587592576 -> 140509587591904
- 140509587592576 [label=ToCopyBackward0]
- 140509587592768 -> 140509587592576
- 140509590875088 [label="encoder.layer.4.crossattention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590875088 -> 140509587592768
- 140509587592768 [label=AccumulateGrad]
- 140509587591808 -> 140509587591664
- 140509587591616 -> 140509587591568
- 140509590874848 [label="encoder.layer.4.crossattention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590874848 -> 140509587591616
- 140509587591616 [label=AccumulateGrad]
- 140509587591376 -> 140509587591568
- 140509590874928 [label="encoder.layer.4.crossattention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590874928 -> 140509587591376
- 140509587591376 [label=AccumulateGrad]
- 140509587574256 -> 140509587574544
- 140509587574256 [label=TBackward0]
- 140509587591328 -> 140509587574256
- 140509587591328 [label=ToCopyBackward0]
- 140509587591712 -> 140509587591328
- 140509590873168 [label="encoder.layer.4.experts.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590873168 -> 140509587591712
- 140509587591712 [label=AccumulateGrad]
- 140509587573824 -> 140509587574016
- 140509587573824 [label=TBackward0]
- 140509587574496 -> 140509587573824
- 140509587574496 [label=ToCopyBackward0]
- 140509587574688 -> 140509587574496
- 140509590872928 [label="encoder.layer.4.experts.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590872928 -> 140509587574688
- 140509587574688 [label=AccumulateGrad]
- 140509587573728 -> 140509587573584
- 140509587573536 -> 140509587573440
- 140509590872688 [label="encoder.layer.4.experts.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590872688 -> 140509587573536
- 140509587573536 [label=AccumulateGrad]
- 140509587573488 -> 140509587573440
- 140509590872768 [label="encoder.layer.4.experts.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590872768 -> 140509587573488
- 140509587573488 [label=AccumulateGrad]
- 140509587573200 -> 140509587562112
- 140509587573200 [label=NativeLayerNormBackward0]
- 140509587573872 -> 140509587573200
- 140509587573872 [label=AddBackward0]
- 140509587574400 -> 140509587573872
- 140509587574400 [label=NativeDropoutBackward0]
- 140509587591424 -> 140509587574400
- 140509587591424 [label=ViewBackward0]
- 140509587591280 -> 140509587591424
- 140509587591280 [label=AddmmBackward0]
- 140509587592240 -> 140509587591280
- 140509587592240 [label=ToCopyBackward0]
- 140509587592336 -> 140509587592240
- 140509590874448 [label="encoder.layer.4.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590874448 -> 140509587592336
- 140509587592336 [label=AccumulateGrad]
- 140509587592048 -> 140509587591280
- 140509587592048 [label=ViewBackward0]
- 140509587592480 -> 140509587592048
- 140509587592480 [label=GeluBackward0]
- 140509587593488 -> 140509587592480
- 140509587593488 [label=ViewBackward0]
- 140509587594016 -> 140509587593488
- 140509587594016 [label=AddmmBackward0]
- 140509587593920 -> 140509587594016
- 140509587593920 [label=ToCopyBackward0]
- 140517615593504 -> 140509587593920
- 140509590874688 [label="encoder.layer.4.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590874688 -> 140517615593504
- 140517615593504 [label=AccumulateGrad]
- 140509587593104 -> 140509587594016
- 140509587593104 [label=ViewBackward0]
- 140517615593600 -> 140509587593104
- 140517615593600 [label=ToCopyBackward0]
- 140509587574208 -> 140517615593600
- 140509587574208 [label=SliceBackward0]
- 140517615593744 -> 140509587574208
- 140517615593744 [label=SliceBackward0]
- 140517615593840 -> 140517615593744
- 140517615593840 [label=SliceBackward0]
- 140509587595024 -> 140517615593840
- 140509587609296 -> 140509587594016
- 140509587609296 [label=TBackward0]
- 140517615593408 -> 140509587609296
- 140517615593408 [label=ToCopyBackward0]
- 140517615593936 -> 140517615593408
- 140509590874608 [label="encoder.layer.4.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590874608 -> 140517615593936
- 140517615593936 [label=AccumulateGrad]
- 140509587591952 -> 140509587591280
- 140509587591952 [label=TBackward0]
- 140509587593680 -> 140509587591952
- 140509587593680 [label=ToCopyBackward0]
- 140509587608144 -> 140509587593680
- 140509590874368 [label="encoder.layer.4.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590874368 -> 140509587608144
- 140509587608144 [label=AccumulateGrad]
- 140509587574208 -> 140509587573872
- 140509587573680 -> 140509587573200
- 140509590874128 [label="encoder.layer.4.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590874128 -> 140509587573680
- 140509587573680 [label=AccumulateGrad]
- 140509587573632 -> 140509587573200
- 140509590874208 [label="encoder.layer.4.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590874208 -> 140509587573632
- 140509587573632 [label=AccumulateGrad]
- 140509587572480 -> 140509587572960
- 140509587572480 [label=TBackward0]
- 140509587573152 -> 140509587572480
- 140509587573152 [label=ToCopyBackward0]
- 140509587574160 -> 140509587573152
- 140509590872448 [label="encoder.layer.5.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590872448 -> 140509587574160
- 140509587574160 [label=AccumulateGrad]
- 140509587572384 -> 140509587572336
- 140509587572384 [label=ReshapeAliasBackward0]
- 140509587572720 -> 140509587572384
- 140509587572720 [label=ExpandBackward0]
- 140509587572912 -> 140509587572720
- 140509587572912 [label=TransposeBackward0]
- 140509587573392 -> 140509587572912
- 140509587573392 [label=PermuteBackward0]
- 140509587573344 -> 140509587573392
- 140509587573344 [label=ViewBackward0]
- 140509587572528 -> 140509587573344
- 140509587572528 [label=ViewBackward0]
- 140509587592672 -> 140509587572528
- 140509587592672 [label=AddmmBackward0]
- 140509587593296 -> 140509587592672
- 140509587593296 [label=ToCopyBackward0]
- 140517615593552 -> 140509587593296
- 140509590872288 [label="encoder.layer.5.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590872288 -> 140517615593552
- 140517615593552 [label=AccumulateGrad]
- 140509587591760 -> 140509587592672
- 140509587591760 [label=ViewBackward0]
- 140517615593984 -> 140509587591760
- 140517615593984 [label=ToCopyBackward0]
- 140509587562112 -> 140517615593984
- 140517615593648 -> 140509587592672
- 140517615593648 [label=TBackward0]
- 140517615593696 -> 140517615593648
- 140517615593696 [label=ToCopyBackward0]
- 140517615594128 -> 140517615593696
- 140509590872208 [label="encoder.layer.5.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509590872208 -> 140517615594128
- 140517615594128 [label=AccumulateGrad]
- 140509587571472 -> 140509587571424
- 140509587571472 [label=ReshapeAliasBackward0]
- 140509587571808 -> 140509587571472
- 140509587571808 [label=ExpandBackward0]
- 140509587572000 -> 140509587571808
- 140509587572000 [label=PermuteBackward0]
- 140509587572192 -> 140509587572000
- 140509587572192 [label=ViewBackward0]
- 140509587571568 -> 140509587572192
- 140509587571568 [label=ViewBackward0]
- 140509587572816 -> 140509587571568
- 140509587572816 [label=AddmmBackward0]
- 140509587573968 -> 140509587572816
- 140509587573968 [label=ToCopyBackward0]
- 140509587592288 -> 140509587573968
- 140509590859664 [label="encoder.layer.5.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590859664 -> 140509587592288
- 140509587592288 [label=AccumulateGrad]
- 140509587573104 -> 140509587572816
- 140509587573104 [label=ViewBackward0]
- 140517615593888 -> 140509587573104
- 140517615593888 [label=ToCopyBackward0]
- 140509587562112 -> 140517615593888
- 140509587571616 -> 140509587572816
- 140509587571616 [label=TBackward0]
- 140517615593792 -> 140509587571616
- 140517615593792 [label=ToCopyBackward0]
- 140517615594032 -> 140517615593792
- 140509590859584 [label="encoder.layer.5.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509590859584 -> 140517615594032
- 140517615594032 [label=AccumulateGrad]
- 140509587570752 -> 140509587562400
- 140509587570752 [label=TBackward0]
- 140509587571184 -> 140509587570752
- 140509587571184 [label=ToCopyBackward0]
- 140509587571376 -> 140509587571184
- 140509590859344 [label="encoder.layer.5.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590859344 -> 140509587571376
- 140509587571376 [label=AccumulateGrad]
- 140509587562112 -> 140509587561968
- 140509587561920 -> 140509587561872
- 140509590859104 [label="encoder.layer.5.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590859104 -> 140509587561920
- 140509587561920 [label=AccumulateGrad]
- 140509587561200 -> 140509587561872
- 140509590859184 [label="encoder.layer.5.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590859184 -> 140509587561200
- 140509587561200 [label=AccumulateGrad]
- 140509587560720 -> 140509587561008
- 140509587560720 [label=TBackward0]
- 140509587561248 -> 140509587560720
- 140509587561248 [label=ToCopyBackward0]
- 140509587561632 -> 140509587561248
- 140509590857424 [label="encoder.layer.5.experts.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590857424 -> 140509587561632
- 140509587561632 [label=AccumulateGrad]
- 140509587560288 -> 140509587560480
- 140509587560288 [label=TBackward0]
- 140509587560960 -> 140509587560288
- 140509587560960 [label=ToCopyBackward0]
- 140509587561440 -> 140509587560960
- 140509590857184 [label="encoder.layer.5.experts.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590857184 -> 140509587561440
- 140509587561440 [label=AccumulateGrad]
- 140509587560192 -> 140509587560048
- 140509587560000 -> 140509587559904
- 140509590856944 [label="encoder.layer.5.experts.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590856944 -> 140509587560000
- 140509587560000 [label=AccumulateGrad]
- 140509587559952 -> 140509587559904
- 140509590857024 [label="encoder.layer.5.experts.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590857024 -> 140509587559952
- 140509587559952 [label=AccumulateGrad]
- 140509587559664 -> 140509587850432
- 140509587559664 [label=NativeLayerNormBackward0]
- 140509587560336 -> 140509587559664
- 140509587560336 [label=AddBackward0]
- 140509587561152 -> 140509587560336
- 140509587561152 [label=NativeDropoutBackward0]
- 140509587560864 -> 140509587561152
- 140509587560864 [label=ViewBackward0]
- 140509587561392 -> 140509587560864
- 140509587561392 [label=AddmmBackward0]
- 140509587562064 -> 140509587561392
- 140509587562064 [label=ToCopyBackward0]
- 140509587562352 -> 140509587562064
- 140509590858704 [label="encoder.layer.5.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590858704 -> 140509587562352
- 140509587562352 [label=AccumulateGrad]
- 140509587562016 -> 140509587561392
- 140509587562016 [label=ViewBackward0]
- 140509587571280 -> 140509587562016
- 140509587571280 [label=GeluBackward0]
- 140509587570848 -> 140509587571280
- 140509587570848 [label=ViewBackward0]
- 140509587571904 -> 140509587570848
- 140509587571904 [label=AddmmBackward0]
- 140509587572288 -> 140509587571904
- 140509587572288 [label=ToCopyBackward0]
- 140509587591520 -> 140509587572288
- 140509590858944 [label="encoder.layer.5.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590858944 -> 140509587591520
- 140509587591520 [label=AccumulateGrad]
- 140509587572096 -> 140509587571904
- 140509587572096 [label=ViewBackward0]
- 140517615594320 -> 140509587572096
- 140517615594320 [label=ToCopyBackward0]
- 140509587560672 -> 140517615594320
- 140509587560672 [label=SliceBackward0]
- 140517615594368 -> 140509587560672
- 140517615594368 [label=SliceBackward0]
- 140517615594464 -> 140517615594368
- 140517615594464 [label=SliceBackward0]
- 140509587561872 -> 140517615594464
- 140509587571088 -> 140509587571904
- 140509587571088 [label=TBackward0]
- 140517615594080 -> 140509587571088
- 140517615594080 [label=ToCopyBackward0]
- 140517615594560 -> 140517615594080
- 140509590858864 [label="encoder.layer.5.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590858864 -> 140517615594560
- 140517615594560 [label=AccumulateGrad]
- 140509587561824 -> 140509587561392
- 140509587561824 [label=TBackward0]
- 140509587571040 -> 140509587561824
- 140509587571040 [label=ToCopyBackward0]
- 140509587572624 -> 140509587571040
- 140509590858624 [label="encoder.layer.5.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590858624 -> 140509587572624
- 140509587572624 [label=AccumulateGrad]
- 140509587560672 -> 140509587560336
- 140509587560144 -> 140509587559664
- 140509590858384 [label="encoder.layer.5.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590858384 -> 140509587560144
- 140509587560144 [label=AccumulateGrad]
- 140509587560096 -> 140509587559664
- 140509590858464 [label="encoder.layer.5.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590858464 -> 140509587560096
- 140509587560096 [label=AccumulateGrad]
- 140509587558944 -> 140509587559424
- 140509587558944 [label=TBackward0]
- 140509587559616 -> 140509587558944
- 140509587559616 [label=ToCopyBackward0]
- 140509587560624 -> 140509587559616
- 140509590856704 [label="encoder.layer.6.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590856704 -> 140509587560624
- 140509587560624 [label=AccumulateGrad]
- 140509587558848 -> 140509587558800
- 140509587558848 [label=ReshapeAliasBackward0]
- 140509587559184 -> 140509587558848
- 140509587559184 [label=ExpandBackward0]
- 140509587559376 -> 140509587559184
- 140509587559376 [label=TransposeBackward0]
- 140509587559856 -> 140509587559376
- 140509587559856 [label=PermuteBackward0]
- 140509587561728 -> 140509587559856
- 140509587561728 [label=ViewBackward0]
- 140509587559808 -> 140509587561728
- 140509587559808 [label=ViewBackward0]
- 140509587562256 -> 140509587559808
- 140509587562256 [label=AddmmBackward0]
- 140509587571712 -> 140509587562256
- 140509587571712 [label=ToCopyBackward0]
- 140517615594272 -> 140509587571712
- 140509590856544 [label="encoder.layer.6.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590856544 -> 140517615594272
- 140517615594272 [label=AccumulateGrad]
- 140509587570800 -> 140509587562256
- 140509587570800 [label=ViewBackward0]
- 140517615594608 -> 140509587570800
- 140517615594608 [label=ToCopyBackward0]
- 140509587850432 -> 140517615594608
- 140517615594176 -> 140509587562256
- 140517615594176 [label=TBackward0]
- 140517615594224 -> 140517615594176
- 140517615594224 [label=ToCopyBackward0]
- 140517615594752 -> 140517615594224
- 140509590856464 [label="encoder.layer.6.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509590856464 -> 140517615594752
- 140517615594752 [label=AccumulateGrad]
- 140509587849376 -> 140509587849520
- 140509587849376 [label=ReshapeAliasBackward0]
- 140509587853120 -> 140509587849376
- 140509587853120 [label=ExpandBackward0]
- 140509587853216 -> 140509587853120
- 140509587853216 [label=PermuteBackward0]
- 140509587558656 -> 140509587853216
- 140509587558656 [label=ViewBackward0]
- 140509587558464 -> 140509587558656
- 140509587558464 [label=ViewBackward0]
- 140509587559280 -> 140509587558464
- 140509587559280 [label=AddmmBackward0]
- 140509587560432 -> 140509587559280
- 140509587560432 [label=ToCopyBackward0]
- 140509587558992 -> 140509587560432
- 140509590856304 [label="encoder.layer.6.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590856304 -> 140509587558992
- 140509587558992 [label=AccumulateGrad]
- 140509587559568 -> 140509587559280
- 140509587559568 [label=ViewBackward0]
- 140517615594512 -> 140509587559568
- 140517615594512 [label=ToCopyBackward0]
- 140509587850432 -> 140517615594512
- 140509587558512 -> 140509587559280
- 140509587558512 [label=TBackward0]
- 140517615594416 -> 140509587558512
- 140517615594416 [label=ToCopyBackward0]
- 140517615594656 -> 140517615594416
- 140509590856224 [label="encoder.layer.6.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509590856224 -> 140517615594656
- 140517615594656 [label=AccumulateGrad]
- 140509587850336 -> 140509587850144
- 140509587850336 [label=TBackward0]
- 140509587849664 -> 140509587850336
- 140509587849664 [label=ToCopyBackward0]
- 140509587849472 -> 140509587849664
- 140509590855984 [label="encoder.layer.6.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590855984 -> 140509587849472
- 140509587849472 [label=AccumulateGrad]
- 140509587850432 -> 140509587850672
- 140509587850624 -> 140509587850768
- 140509590855744 [label="encoder.layer.6.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590855744 -> 140509587850624
- 140509587850624 [label=AccumulateGrad]
- 140509587851248 -> 140509587850768
- 140509590855824 [label="encoder.layer.6.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590855824 -> 140509587851248
- 140509587851248 [label=AccumulateGrad]
- 140509587851920 -> 140509587851440
- 140509587851920 [label=TBackward0]
- 140509587851104 -> 140509587851920
- 140509587851104 [label=ToCopyBackward0]
- 140509587850720 -> 140509587851104
- 140509590843120 [label="encoder.layer.6.crossattention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590843120 -> 140509587850720
- 140509587850720 [label=AccumulateGrad]
- 140509587852016 -> 140509587851968
- 140509587852016 [label=ReshapeAliasBackward0]
- 140509587851584 -> 140509587852016
- 140509587851584 [label=ExpandBackward0]
- 140509587851392 -> 140509587851584
- 140509587851392 [label=TransposeBackward0]
- 140509587850912 -> 140509587851392
- 140509587850912 [label=PermuteBackward0]
- 140509587850576 -> 140509587850912
- 140509587850576 [label=ViewBackward0]
- 140509587851056 -> 140509587850576
- 140509587851056 [label=ViewBackward0]
- 140509587850288 -> 140509587851056
- 140509587850288 [label=AddmmBackward0]
- 140509587849952 -> 140509587850288
- 140509587849952 [label=ToCopyBackward0]
- 140509587849760 -> 140509587849952
- 140509590842960 [label="encoder.layer.6.crossattention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590842960 -> 140509587849760
- 140509587849760 [label=AccumulateGrad]
- 140509587850096 -> 140509587850288
- 140509587850096 [label=ViewBackward0]
- 140509587853024 -> 140509587850096
- 140509587853024 [label=ToCopyBackward0]
- 140517615539152 -> 140509587853024
- 140509587851776 -> 140509587850288
- 140509587851776 [label=TBackward0]
- 140509587850000 -> 140509587851776
- 140509587850000 [label=ToCopyBackward0]
- 140509587559088 -> 140509587850000
- 140509590842880 [label="encoder.layer.6.crossattention.self.key.weight
- (768, 1408)" fillcolor=lightblue]
- 140509590842880 -> 140509587559088
- 140509587559088 [label=AccumulateGrad]
- 140509587852880 -> 140509587695984
- 140509587852880 [label=ReshapeAliasBackward0]
- 140509587852592 -> 140509587852880
- 140509587852592 [label=ExpandBackward0]
- 140509587852400 -> 140509587852592
- 140509587852400 [label=PermuteBackward0]
- 140509587852208 -> 140509587852400
- 140509587852208 [label=ViewBackward0]
- 140509587852736 -> 140509587852208
- 140509587852736 [label=ViewBackward0]
- 140509587851488 -> 140509587852736
- 140509587851488 [label=AddmmBackward0]
- 140509587850816 -> 140509587851488
- 140509587850816 [label=ToCopyBackward0]
- 140509587849328 -> 140509587850816
- 140509590842720 [label="encoder.layer.6.crossattention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590842720 -> 140509587849328
- 140509587849328 [label=AccumulateGrad]
- 140509587851200 -> 140509587851488
- 140509587851200 [label=ViewBackward0]
- 140509587850384 -> 140509587851200
- 140509587850384 [label=ToCopyBackward0]
- 140517615539152 -> 140509587850384
- 140509587852784 -> 140509587851488
- 140509587852784 [label=TBackward0]
- 140509587849568 -> 140509587852784
- 140509587849568 [label=ToCopyBackward0]
- 140509587561536 -> 140509587849568
- 140509590842640 [label="encoder.layer.6.crossattention.self.value.weight
- (768, 1408)" fillcolor=lightblue]
- 140509590842640 -> 140509587561536
- 140509587561536 [label=AccumulateGrad]
- 140509587695216 -> 140509587695600
- 140509587695216 [label=TBackward0]
- 140509587697520 -> 140509587695216
- 140509587697520 [label=ToCopyBackward0]
- 140509587695552 -> 140509587697520
- 140509590842400 [label="encoder.layer.6.crossattention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590842400 -> 140509587695552
- 140509587695552 [label=AccumulateGrad]
- 140509587695120 -> 140509587694832
- 140509587696080 -> 140509587694592
- 140509590842160 [label="encoder.layer.6.crossattention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590842160 -> 140509587696080
- 140509587696080 [label=AccumulateGrad]
- 140509587697040 -> 140509587694592
- 140509590842240 [label="encoder.layer.6.crossattention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590842240 -> 140509587697040
- 140509587697040 [label=AccumulateGrad]
- 140509587697328 -> 140509587696464
- 140509587697328 [label=TBackward0]
- 140509587693632 -> 140509587697328
- 140509587693632 [label=ToCopyBackward0]
- 140509587694256 -> 140509587693632
- 140509590826016 [label="encoder.layer.6.experts.experts.0.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590826016 -> 140509587694256
- 140509587694256 [label=AccumulateGrad]
- 140509588196464 -> 140509588196752
- 140509588196464 [label=TBackward0]
- 140509588197136 -> 140509588196464
- 140509588197136 [label=ToCopyBackward0]
- 140509587693968 -> 140509588197136
- 140509590826176 [label="encoder.layer.6.experts.experts.0.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590826176 -> 140509587693968
- 140509587693968 [label=AccumulateGrad]
- 140509588196272 -> 140509588195888
- 140509588195984 -> 140509588195696
- 140509590825696 [label="encoder.layer.6.experts.experts.0.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590825696 -> 140509588195984
- 140509588195984 [label=AccumulateGrad]
- 140509588195456 -> 140509588195696
- 140509590826496 [label="encoder.layer.6.experts.experts.0.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590826496 -> 140509588195456
- 140509588195456 [label=AccumulateGrad]
- 140509588195408 -> 140509588195216
- 140509588195408 [label=UnsqueezeBackward0]
- 140509588195936 -> 140509588195408
- 140509588195936 [label=NativeLayerNormBackward0]
- 140509588196416 -> 140509588195936
- 140509588196416 [label=AddBackward0]
- 140509587694640 -> 140509588196416
- 140509587694640 [label=NativeDropoutBackward0]
- 140509587697424 -> 140509587694640
- 140509587697424 [label=ViewBackward0]
- 140509587693776 -> 140509587697424
- 140509587693776 [label=AddmmBackward0]
- 140509587694928 -> 140509587693776
- 140509587694928 [label=ToCopyBackward0]
- 140509587696848 -> 140509587694928
- 140509590825936 [label="encoder.layer.6.experts.experts.1.output_query.dense.bias
- (768)" fillcolor=lightblue]
- 140509590825936 -> 140509587696848
- 140509587696848 [label=AccumulateGrad]
- 140509587694736 -> 140509587693776
- 140509587694736 [label=ViewBackward0]
- 140509587695888 -> 140509587694736
- 140509587695888 [label=GeluBackward0]
- 140509587696176 -> 140509587695888
- 140509587696176 [label=ViewBackward0]
- 140509587695504 -> 140509587696176
- 140509587695504 [label=AddmmBackward0]
- 140509587852304 -> 140509587695504
- 140509587852304 [label=ToCopyBackward0]
- 140509587850528 -> 140509587852304
- 140509590825456 [label="encoder.layer.6.experts.experts.1.intermediate_query.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590825456 -> 140509587850528
- 140509587850528 [label=AccumulateGrad]
- 140509587852496 -> 140509587695504
- 140509587852496 [label=ViewBackward0]
- 140509587558560 -> 140509587852496
- 140509587558560 [label=ToCopyBackward0]
- 140509588196272 -> 140509587558560
- 140509587852688 -> 140509587695504
- 140509587852688 [label=TBackward0]
- 140509587851680 -> 140509587852688
- 140509587851680 [label=ToCopyBackward0]
- 140517615594800 -> 140509587851680
- 140509590825536 [label="encoder.layer.6.experts.experts.1.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590825536 -> 140517615594800
- 140517615594800 [label=AccumulateGrad]
- 140509587697136 -> 140509587693776
- 140509587697136 [label=TBackward0]
- 140509587695312 -> 140509587697136
- 140509587695312 [label=ToCopyBackward0]
- 140509587558752 -> 140509587695312
- 140509590825296 [label="encoder.layer.6.experts.experts.1.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590825296 -> 140509587558752
- 140509587558752 [label=AccumulateGrad]
- 140509588196272 -> 140509588196416
- 140509588196368 -> 140509588195936
- 140509590825056 [label="encoder.layer.6.experts.experts.1.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590825056 -> 140509588196368
- 140509588196368 [label=AccumulateGrad]
- 140509588195792 -> 140509588195936
- 140509590824976 [label="encoder.layer.6.experts.experts.1.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590824976 -> 140509588195792
- 140509588195792 [label=AccumulateGrad]
- 140509588195312 -> 140509588194976
- 140509588195312 [label=UnsqueezeBackward0]
- 140509588196848 -> 140509588195312
- 140509588196848 [label=UnsqueezeBackward0]
- 140509588195504 -> 140509588196848
- 140509588195504 [label=MulBackward0]
- 140509587695024 -> 140509588195504
- 140509587695024 [label=ViewBackward0]
- 140509587696656 -> 140509587695024
- 140509587696656 [label=CloneBackward0]
- 140509587852832 -> 140509587696656
- 140509587852832 [label=ExpandBackward0]
- 140517615594896 -> 140509587852832
- 140517615594896 [label=UnsqueezeBackward0]
- 140517615594992 -> 140517615594896
- 140517615594992 [label=SoftmaxBackward0]
- 140517615595088 -> 140517615594992
- 140517615595088 [label=MmBackward0]
- 140517615595184 -> 140517615595088
- 140517615595184 [label=ToCopyBackward0]
- 140517615595328 -> 140517615595184
- 140517615595328 [label=DivBackward0]
- 140517615595424 -> 140517615595328
- 140517615595424 [label=SumBackward1]
- 140517615595472 -> 140517615595424
- 140517615595472 [label=MulBackward0]
- 140509587694352 -> 140517615595472
- 140517615595136 -> 140517615595088
- 140517615595136 [label=TBackward0]
- 140517615595232 -> 140517615595136
- 140517615595232 [label=ToCopyBackward0]
- 140517615595280 -> 140517615595232
- 140509590839840 [label="encoder.layer.6.experts.gate.weight
- (2, 768)" fillcolor=lightblue]
- 140509590839840 -> 140517615595280
- 140517615595280 [label=AccumulateGrad]
- 140509588194448 -> 140509588165008
- 140509588194448 [label=ViewBackward0]
- 140509588196080 -> 140509588194448
- 140509588196080 [label=CloneBackward0]
- 140509588195120 -> 140509588196080
- 140509588195120 [label=ExpandBackward0]
- 140509587852112 -> 140509588195120
- 140509587852112 [label=UnsqueezeBackward0]
- 140509587694160 -> 140509587852112
- 140509587694160 [label=NativeLayerNormBackward0]
- 140517615594848 -> 140509587694160
- 140517615594848 [label=AddBackward0]
- 140517615726656 -> 140517615594848
- 140517615726656 [label=NativeDropoutBackward0]
- 140517615726896 -> 140517615726656
- 140517615726896 [label=ViewBackward0]
- 140517615726992 -> 140517615726896
- 140517615726992 [label=AddmmBackward0]
- 140517615727088 -> 140517615726992
- 140517615727088 [label=ToCopyBackward0]
- 140517615727280 -> 140517615727088
- 140509590841760 [label="encoder.layer.6.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590841760 -> 140517615727280
- 140517615727280 [label=AccumulateGrad]
- 140517615727040 -> 140517615726992
- 140517615727040 [label=ViewBackward0]
- 140517615727328 -> 140517615727040
- 140517615727328 [label=GeluBackward0]
- 140517615727424 -> 140517615727328
- 140517615727424 [label=ViewBackward0]
- 140517615727520 -> 140517615727424
- 140517615727520 [label=AddmmBackward0]
- 140517615727616 -> 140517615727520
- 140517615727616 [label=ToCopyBackward0]
- 140517615727808 -> 140517615727616
- 140509590842000 [label="encoder.layer.6.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590842000 -> 140517615727808
- 140517615727808 [label=AccumulateGrad]
- 140517615727568 -> 140517615727520
- 140517615727568 [label=ViewBackward0]
- 140517615727856 -> 140517615727568
- 140517615727856 [label=ToCopyBackward0]
- 140517615726800 -> 140517615727856
- 140517615726800 [label=SliceBackward0]
- 140517615728000 -> 140517615726800
- 140517615728000 [label=SliceBackward0]
- 140517615728096 -> 140517615728000
- 140517615728096 [label=SliceBackward0]
- 140509587850768 -> 140517615728096
- 140517615727232 -> 140517615727520
- 140517615727232 [label=TBackward0]
- 140517615727760 -> 140517615727232
- 140517615727760 [label=ToCopyBackward0]
- 140517615728192 -> 140517615727760
- 140509590841920 [label="encoder.layer.6.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509590841920 -> 140517615728192
- 140517615728192 [label=AccumulateGrad]
- 140517615726752 -> 140517615726992
- 140517615726752 [label=TBackward0]
- 140517615727472 -> 140517615726752
- 140517615727472 [label=ToCopyBackward0]
- 140517615727952 -> 140517615727472
- 140509590841680 [label="encoder.layer.6.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590841680 -> 140517615727952
- 140517615727952 [label=AccumulateGrad]
- 140517615726800 -> 140517615594848
- 140517615595040 -> 140509587694160
- 140509590841440 [label="encoder.layer.6.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590841440 -> 140517615595040
- 140517615595040 [label=AccumulateGrad]
- 140517615594944 -> 140509587694160
- 140509590841520 [label="encoder.layer.6.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590841520 -> 140517615594944
- 140517615594944 [label=AccumulateGrad]
- 140509588193344 -> 140509588194160
- 140509588193344 [label=TBackward0]
- 140509588194544 -> 140509588193344
- 140509588194544 [label=ToCopyBackward0]
- 140509588194928 -> 140509588194544
- 140509590840000 [label="encoder.layer.7.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590840000 -> 140509588194928
- 140509588194928 [label=AccumulateGrad]
- 140509588168464 -> 140509588168176
- 140509588168464 [label=UnsafeViewBackward0]
- 140509588168560 -> 140509588168464
- 140509588168560 [label=CloneBackward0]
- 140509588193776 -> 140509588168560
- 140509588193776 [label=ExpandBackward0]
- 140509588194256 -> 140509588193776
- 140509588194256 [label=TransposeBackward0]
- 140509588194832 -> 140509588194256
- 140509588194832 [label=PermuteBackward0]
- 140509587694448 -> 140509588194832
- 140509587694448 [label=ViewBackward0]
- 140517615595376 -> 140509587694448
- 140517615595376 [label=ViewBackward0]
- 140509588193392 -> 140517615595376
- 140509588193392 [label=AddmmBackward0]
- 140517615727136 -> 140509588193392
- 140517615727136 [label=ToCopyBackward0]
- 140517615728048 -> 140517615727136
- 140509590840560 [label="encoder.layer.7.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590840560 -> 140517615728048
- 140517615728048 [label=AccumulateGrad]
- 140517615726944 -> 140509588193392
- 140517615726944 [label=ViewBackward0]
- 140517615727376 -> 140517615726944
- 140517615727376 [label=ToCopyBackward0]
- 140509588165008 -> 140517615727376
- 140517615726704 -> 140509588193392
- 140517615726704 [label=TBackward0]
- 140517615727664 -> 140517615726704
- 140517615727664 [label=ToCopyBackward0]
- 140517615728240 -> 140517615727664
- 140509590840240 [label="encoder.layer.7.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509590840240 -> 140517615728240
- 140517615728240 [label=AccumulateGrad]
- 140509588166736 -> 140509588166832
- 140509588166736 [label=UnsafeViewBackward0]
- 140509588167504 -> 140509588166736
- 140509588167504 [label=CloneBackward0]
- 140509588167792 -> 140509588167504
- 140509588167792 [label=ExpandBackward0]
- 140509588168080 -> 140509588167792
- 140509588168080 [label=PermuteBackward0]
- 140509588166928 -> 140509588168080
- 140509588166928 [label=ViewBackward0]
- 140509588167120 -> 140509588166928
- 140509588167120 [label=ViewBackward0]
- 140509588194736 -> 140509588167120
- 140509588194736 [label=AddmmBackward0]
- 140517615594704 -> 140509588194736
- 140517615594704 [label=ToCopyBackward0]
- 140517615727712 -> 140517615594704
- 140509590839760 [label="encoder.layer.7.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590839760 -> 140517615727712
- 140517615727712 [label=AccumulateGrad]
- 140509587695792 -> 140509588194736
- 140509587695792 [label=ViewBackward0]
- 140517615728336 -> 140509587695792
- 140517615728336 [label=ToCopyBackward0]
- 140509588165008 -> 140517615728336
- 140509588193488 -> 140509588194736
- 140509588193488 [label=TBackward0]
- 140517615727184 -> 140509588193488
- 140517615727184 [label=ToCopyBackward0]
- 140517615728384 -> 140517615727184
- 140509590840480 [label="encoder.layer.7.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509590840480 -> 140517615728384
- 140517615728384 [label=AccumulateGrad]
- 140509588165056 -> 140509588165488
- 140509588165056 [label=TBackward0]
- 140509588166256 -> 140509588165056
- 140509588166256 [label=ToCopyBackward0]
- 140509588166496 -> 140509588166256
- 140509590839600 [label="encoder.layer.7.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590839600 -> 140509588166496
- 140509588166496 [label=AccumulateGrad]
- 140509588165008 -> 140509588164912
- 140509588164720 -> 140509588139888
- 140509590839520 [label="encoder.layer.7.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590839520 -> 140509588164720
- 140509588164720 [label=AccumulateGrad]
- 140509588164672 -> 140509588139888
- 140509985419152 [label="encoder.layer.7.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509985419152 -> 140509588164672
- 140509588164672 [label=AccumulateGrad]
- 140509588138160 -> 140509588138640
- 140509588138160 [label=TBackward0]
- 140509588138928 -> 140509588138160
- 140509588138928 [label=ToCopyBackward0]
- 140509588139456 -> 140509588138928
- 140509591342032 [label="encoder.layer.7.experts.experts.0.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591342032 -> 140509588139456
- 140509588139456 [label=AccumulateGrad]
- 140509588137296 -> 140509588137536
- 140509588137296 [label=TBackward0]
- 140509588138448 -> 140509588137296
- 140509588138448 [label=ToCopyBackward0]
- 140509588139216 -> 140509588138448
- 140509591341712 [label="encoder.layer.7.experts.experts.0.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591341712 -> 140509588139216
- 140509588139216 [label=AccumulateGrad]
- 140509588137056 -> 140509588137104
- 140509588136816 -> 140509588136912
- 140509591341472 [label="encoder.layer.7.experts.experts.0.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591341472 -> 140509588136816
- 140509588136816 [label=AccumulateGrad]
- 140509588136720 -> 140509588136912
- 140509591341792 [label="encoder.layer.7.experts.experts.0.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591341792 -> 140509588136720
- 140509588136720 [label=AccumulateGrad]
- 140509588136624 -> 140509588136432
- 140509588136624 [label=UnsqueezeBackward0]
- 140509588137200 -> 140509588136624
- 140509588137200 [label=NativeLayerNormBackward0]
- 140509588137680 -> 140509588137200
- 140509588137680 [label=AddBackward0]
- 140509588139024 -> 140509588137680
- 140509588139024 [label=NativeDropoutBackward0]
- 140509588138256 -> 140509588139024
- 140509588138256 [label=ViewBackward0]
- 140509588139312 -> 140509588138256
- 140509588139312 [label=AddmmBackward0]
- 140509588137968 -> 140509588139312
- 140509588137968 [label=ToCopyBackward0]
- 140509588165776 -> 140509588137968
- 140509591342192 [label="encoder.layer.7.experts.experts.1.output_query.dense.bias
- (768)" fillcolor=lightblue]
- 140509591342192 -> 140509588165776
- 140509588165776 [label=AccumulateGrad]
- 140509588165104 -> 140509588139312
- 140509588165104 [label=ViewBackward0]
- 140509588166448 -> 140509588165104
- 140509588166448 [label=GeluBackward0]
- 140509588166064 -> 140509588166448
- 140509588166064 [label=ViewBackward0]
- 140509588167600 -> 140509588166064
- 140509588167600 [label=AddmmBackward0]
- 140509588168272 -> 140509588167600
- 140509588168272 [label=ToCopyBackward0]
- 140509588193968 -> 140509588168272
- 140509591341552 [label="encoder.layer.7.experts.experts.1.intermediate_query.dense.bias
- (3072)" fillcolor=lightblue]
- 140509591341552 -> 140509588193968
- 140509588193968 [label=AccumulateGrad]
- 140509588167984 -> 140509588167600
- 140509588167984 [label=ViewBackward0]
- 140517615727904 -> 140509588167984
- 140517615727904 [label=ToCopyBackward0]
- 140509588137056 -> 140517615727904
- 140509588165872 -> 140509588167600
- 140509588165872 [label=TBackward0]
- 140517615726848 -> 140509588165872
- 140517615726848 [label=ToCopyBackward0]
- 140517615728288 -> 140517615726848
- 140509591341232 [label="encoder.layer.7.experts.experts.1.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591341232 -> 140517615728288
- 140517615728288 [label=AccumulateGrad]
- 140509588164816 -> 140509588139312
- 140509588164816 [label=TBackward0]
- 140509588166016 -> 140509588164816
- 140509588166016 [label=ToCopyBackward0]
- 140509588193536 -> 140509588166016
- 140509591340992 [label="encoder.layer.7.experts.experts.1.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591340992 -> 140509588193536
- 140509588193536 [label=AccumulateGrad]
- 140509588137056 -> 140509588137680
- 140509588137584 -> 140509588137200
- 140509591340752 [label="encoder.layer.7.experts.experts.1.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591340752 -> 140509588137584
- 140509588137584 [label=AccumulateGrad]
- 140509588136576 -> 140509588137200
- 140509591341072 [label="encoder.layer.7.experts.experts.1.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591341072 -> 140509588136576
- 140509588136576 [label=AccumulateGrad]
- 140509588136096 -> 140509588136240
- 140509588136096 [label=UnsqueezeBackward0]
- 140509588138064 -> 140509588136096
- 140509588138064 [label=UnsqueezeBackward0]
- 140509588139408 -> 140509588138064
- 140509588139408 [label=MulBackward0]
- 140509588139696 -> 140509588139408
- 140509588139696 [label=SoftmaxBackward0]
- 140509588167312 -> 140509588139696
- 140509588167312 [label=MmBackward0]
- 140509588165392 -> 140509588167312
- 140509588165392 [label=ToCopyBackward0]
- 140517615728480 -> 140509588165392
- 140517615728480 [label=DivBackward0]
- 140517615728672 -> 140517615728480
- 140517615728672 [label=SumBackward1]
- 140517615728768 -> 140517615728672
- 140517615728768 [label=MulBackward0]
- 140509588137056 -> 140517615728768
- 140517615728144 -> 140509588167312
- 140517615728144 [label=TBackward0]
- 140517615728720 -> 140517615728144
- 140517615728720 [label=ToCopyBackward0]
- 140517615728816 -> 140517615728720
- 140509590823376 [label="encoder.layer.7.experts.gate.weight
- (2, 768)" fillcolor=lightblue]
- 140509590823376 -> 140517615728816
- 140517615728816 [label=AccumulateGrad]
- 140509588106928 -> 140509588077488
- 140509588106928 [label=IndexBackward0]
- 140509588137008 -> 140509588106928
- 140509588137008 [label=NativeLayerNormBackward0]
- 140509588136336 -> 140509588137008
- 140509588136336 [label=AddBackward0]
- 140517615728864 -> 140509588136336
- 140517615728864 [label=NativeDropoutBackward0]
- 140517615728528 -> 140517615728864
- 140517615728528 [label=ViewBackward0]
- 140517615729008 -> 140517615728528
- 140517615729008 [label=AddmmBackward0]
- 140517615729104 -> 140517615729008
- 140517615729104 [label=ToCopyBackward0]
- 140517615729296 -> 140517615729104
- 140509590826656 [label="encoder.layer.7.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509590826656 -> 140517615729296
- 140517615729296 [label=AccumulateGrad]
- 140517615729056 -> 140517615729008
- 140517615729056 [label=ViewBackward0]
- 140517615729344 -> 140517615729056
- 140517615729344 [label=GeluBackward0]
- 140517615729440 -> 140517615729344
- 140517615729440 [label=ViewBackward0]
- 140517615729536 -> 140517615729440
- 140517615729536 [label=AddmmBackward0]
- 140517615729632 -> 140517615729536
- 140517615729632 [label=ToCopyBackward0]
- 140517615729824 -> 140517615729632
- 140509590826896 [label="encoder.layer.7.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509590826896 -> 140517615729824
- 140517615729824 [label=AccumulateGrad]
- 140517615729584 -> 140517615729536
- 140517615729584 [label=ViewBackward0]
- 140517615729872 -> 140517615729584
- 140517615729872 [label=ToCopyBackward0]
- 140517615728624 -> 140517615729872
- 140517615728624 [label=SliceBackward0]
- 140517615730016 -> 140517615728624
- 140517615730016 [label=SliceBackward0]
- 140517615730112 -> 140517615730016
- 140517615730112 [label=SliceBackward0]
- 140509588139888 -> 140517615730112
- 140517615729248 -> 140517615729536
- 140517615729248 [label=TBackward0]
- 140517615729776 -> 140517615729248
- 140517615729776 [label=ToCopyBackward0]
- 140517615730208 -> 140517615729776
- 140509985417872 [label="encoder.layer.7.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509985417872 -> 140517615730208
- 140517615730208 [label=AccumulateGrad]
- 140517615728912 -> 140517615729008
- 140517615728912 [label=TBackward0]
- 140517615729488 -> 140517615728912
- 140517615729488 [label=ToCopyBackward0]
- 140517615729968 -> 140517615729488
- 140509590826416 [label="encoder.layer.7.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509590826416 -> 140517615729968
- 140517615729968 [label=AccumulateGrad]
- 140517615728624 -> 140509588136336
- 140509588138736 -> 140509588137008
- 140509590826736 [label="encoder.layer.7.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590826736 -> 140509588138736
- 140509588138736 [label=AccumulateGrad]
- 140509588136048 -> 140509588137008
- 140509590824496 [label="encoder.layer.7.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509590824496 -> 140509588136048
- 140509588136048 [label=AccumulateGrad]
- 140509588105392 -> 140509588106352
- 140509588105392 [label=TBackward0]
- 140509588106640 -> 140509588105392
- 140509588106640 [label=ToCopyBackward0]
- 140509588165584 -> 140509588106640
- 140509590823616 [label="encoder.layer.8.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509590823616 -> 140509588165584
- 140509588165584 [label=AccumulateGrad]
- 140509588105200 -> 140509588105296
- 140509588105200 [label=UnsafeViewBackward0]
- 140509588136144 -> 140509588105200
- 140509588136144 [label=CloneBackward0]
- 140509588106064 -> 140509588136144
- 140509588106064 [label=ExpandBackward0]
- 140509588106448 -> 140509588106064
- 140509588106448 [label=TransposeBackward0]
- 140509588107216 -> 140509588106448
- 140509588107216 [label=PermuteBackward0]
- 140509588106880 -> 140509588107216
- 140509588106880 [label=ViewBackward0]
- 140517615728960 -> 140509588106880
- 140517615728960 [label=ViewBackward0]
- 140517615729200 -> 140517615728960
- 140517615729200 [label=AddmmBackward0]
- 140517615729728 -> 140517615729200
- 140517615729728 [label=ToCopyBackward0]
- 140517615729920 -> 140517615729728
- 140509590823776 [label="encoder.layer.8.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509590823776 -> 140517615729920
- 140517615729920 [label=AccumulateGrad]
- 140517615729680 -> 140517615729200
- 140517615729680 [label=ViewBackward0]
- 140517615730256 -> 140517615729680
- 140517615730256 [label=ToCopyBackward0]
- 140509588077488 -> 140517615730256
- 140517615728432 -> 140517615729200
- 140517615728432 [label=TBackward0]
- 140517615729392 -> 140517615728432
- 140517615729392 [label=ToCopyBackward0]
- 140517615730400 -> 140517615729392
- 140509590823856 [label="encoder.layer.8.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509590823856 -> 140517615730400
- 140517615730400 [label=AccumulateGrad]
- 140509588103856 -> 140509588103520
- 140509588103856 [label=UnsafeViewBackward0]
- 140509588104240 -> 140509588103856
- 140509588104240 [label=CloneBackward0]
- 140509588104480 -> 140509588104240
- 140509588104480 [label=ExpandBackward0]
- 140509588104912 -> 140509588104480
- 140509588104912 [label=PermuteBackward0]
- 140509588104048 -> 140509588104912
- 140509588104048 [label=ViewBackward0]
- 140509588105968 -> 140509588104048
- 140509588105968 [label=ViewBackward0]
- 140509588106736 -> 140509588105968
- 140509588106736 [label=AddmmBackward0]
- 140509588105584 -> 140509588106736
- 140509588105584 [label=ToCopyBackward0]
- 140517615730160 -> 140509588105584
- 140509590824016 [label="encoder.layer.8.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509590824016 -> 140517615730160
- 140517615730160 [label=AccumulateGrad]
- 140509588103952 -> 140509588106736
- 140509588103952 [label=ViewBackward0]
- 140517615730496 -> 140509588103952
- 140517615730496 [label=ToCopyBackward0]
- 140509588077488 -> 140517615730496
- 140517615728576 -> 140509588106736
- 140517615728576 [label=TBackward0]
- 140517615730064 -> 140517615728576
- 140517615730064 [label=ToCopyBackward0]
- 140517615730544 -> 140517615730064
- 140509590824096 [label="encoder.layer.8.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509590824096 -> 140517615730544
- 140517615730544 [label=AccumulateGrad]
- 140509588077584 -> 140509588077968
- 140509588077584 [label=TBackward0]
- 140509588078256 -> 140509588077584
- 140509588078256 [label=ToCopyBackward0]
- 140509588103664 -> 140509588078256
- 140509590823296 [label="encoder.layer.8.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509590823296 -> 140509588103664
- 140509588103664 [label=AccumulateGrad]
- 140509588077488 -> 140509588076960
- 140509588077104 -> 140509588076912
- 140509590823136 [label="encoder.layer.8.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509590823136 -> 140509588077104
- 140509588077104 [label=AccumulateGrad]
- 140509588076000 -> 140509588076912
- 140509591342912 [label="encoder.layer.8.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591342912 -> 140509588076000
- 140509588076000 [label=AccumulateGrad]
- 140509588074800 -> 140509588075760
- 140509588074800 [label=TBackward0]
- 140509588076336 -> 140509588074800
- 140509588076336 [label=ToCopyBackward0]
- 140509588077008 -> 140509588076336
- 140509591342992 [label="encoder.layer.8.crossattention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509591342992 -> 140509588077008
- 140509588077008 [label=AccumulateGrad]
- 140509588074704 -> 140509588074656
- 140509588074704 [label=UnsafeViewBackward0]
- 140509588075376 -> 140509588074704
- 140509588075376 [label=CloneBackward0]
- 140509588075664 -> 140509588075376
- 140509588075664 [label=ExpandBackward0]
- 140509588076144 -> 140509588075664
- 140509588076144 [label=TransposeBackward0]
- 140509588076816 -> 140509588076144
- 140509588076816 [label=PermuteBackward0]
- 140509588077296 -> 140509588076816
- 140509588077296 [label=ViewBackward0]
- 140509588077440 -> 140509588077296
- 140509588077440 [label=ViewBackward0]
- 140509588077920 -> 140509588077440
- 140509588077920 [label=AddmmBackward0]
- 140509588078544 -> 140509588077920
- 140509588078544 [label=ToCopyBackward0]
- 140509588104432 -> 140509588078544
- 140509591342752 [label="encoder.layer.8.crossattention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509591342752 -> 140509588104432
- 140509588104432 [label=AccumulateGrad]
- 140509588075088 -> 140509588077920
- 140509588075088 [label=ViewBackward0]
- 140509588104720 -> 140509588075088
- 140509588104720 [label=ToCopyBackward0]
- 140509588105776 -> 140509588104720
- 140509588105776 [label=ViewBackward0]
- 140509588106256 -> 140509588105776
- 140509588106256 [label=CloneBackward0]
- 140517615730352 -> 140509588106256
- 140517615730352 [label=ExpandBackward0]
- 140517615730592 -> 140517615730352
- 140517615730592 [label=UnsqueezeBackward0]
- 140517615539152 -> 140517615730592
- 140509588103568 -> 140509588077920
- 140509588103568 [label=TBackward0]
- 140509588103280 -> 140509588103568
- 140509588103280 [label=ToCopyBackward0]
- 140509588104960 -> 140509588103280
- 140509591342672 [label="encoder.layer.8.crossattention.self.key.weight
- (768, 1408)" fillcolor=lightblue]
- 140509591342672 -> 140509588104960
- 140509588104960 [label=AccumulateGrad]
- 140509588048624 -> 140509588048432
- 140509588048624 [label=UnsafeViewBackward0]
- 140509588048960 -> 140509588048624
- 140509588048960 [label=CloneBackward0]
- 140509588049392 -> 140509588048960
- 140509588049392 [label=ExpandBackward0]
- 140509588048816 -> 140509588049392
- 140509588048816 [label=PermuteBackward0]
- 140509588048720 -> 140509588048816
- 140509588048720 [label=ViewBackward0]
- 140509588075568 -> 140509588048720
- 140509588075568 [label=ViewBackward0]
- 140509588076624 -> 140509588075568
- 140509588076624 [label=AddmmBackward0]
- 140509588076432 -> 140509588076624
- 140509588076432 [label=ToCopyBackward0]
- 140509588103376 -> 140509588076432
- 140509591340592 [label="encoder.layer.8.crossattention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509591340592 -> 140509588103376
- 140509588103376 [label=AccumulateGrad]
- 140509588077200 -> 140509588076624
- 140509588077200 [label=ViewBackward0]
- 140509588104000 -> 140509588077200
- 140509588104000 [label=ToCopyBackward0]
- 140509588105776 -> 140509588104000
- 140509588074608 -> 140509588076624
- 140509588074608 [label=TBackward0]
- 140517615730640 -> 140509588074608
- 140517615730640 [label=ToCopyBackward0]
- 140517615730448 -> 140517615730640
- 140509591342512 [label="encoder.layer.8.crossattention.self.value.weight
- (768, 1408)" fillcolor=lightblue]
- 140509591342512 -> 140517615730448
- 140517615730448 [label=AccumulateGrad]
- 140509588047088 -> 140509588047376
- 140509588047088 [label=TBackward0]
- 140509588048144 -> 140509588047088
- 140509588048144 [label=ToCopyBackward0]
- 140509588048528 -> 140509588048144
- 140509591340832 [label="encoder.layer.8.crossattention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509591340832 -> 140509588048528
- 140509588048528 [label=AccumulateGrad]
- 140509588046896 -> 140509588046608
- 140509588046320 -> 140509588046416
- 140509591340512 [label="encoder.layer.8.crossattention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591340512 -> 140509588046320
- 140509588046320 [label=AccumulateGrad]
- 140509588045888 -> 140509588046416
- 140509591340272 [label="encoder.layer.8.crossattention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591340272 -> 140509588045888
- 140509588045888 [label=AccumulateGrad]
- 140509588024432 -> 140509588024912
- 140509588024432 [label=TBackward0]
- 140509588046128 -> 140509588024432
- 140509588046128 [label=ToCopyBackward0]
- 140509588046512 -> 140509588046128
- 140509591319952 [label="encoder.layer.8.experts.experts.0.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591319952 -> 140509588046512
- 140509588046512 [label=AccumulateGrad]
- 140509588023568 -> 140509588023856
- 140509588023568 [label=TBackward0]
- 140509588024576 -> 140509588023568
- 140509588024576 [label=ToCopyBackward0]
- 140509588025008 -> 140509588024576
- 140509591320032 [label="encoder.layer.8.experts.experts.0.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591320032 -> 140509588025008
- 140509588025008 [label=AccumulateGrad]
- 140509588023376 -> 140509588023280
- 140509588023088 -> 140509588023184
- 140509591319792 [label="encoder.layer.8.experts.experts.0.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591319792 -> 140509588023088
- 140509588023088 [label=AccumulateGrad]
- 140509588022992 -> 140509588023184
- 140509591319712 [label="encoder.layer.8.experts.experts.0.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591319712 -> 140509588022992
- 140509588022992 [label=AccumulateGrad]
- 140509588022800 -> 140509588022704
- 140509588022800 [label=UnsqueezeBackward0]
- 140509588023472 -> 140509588022800
- 140509588023472 [label=NativeLayerNormBackward0]
- 140509588023952 -> 140509588023472
- 140509588023952 [label=AddBackward0]
- 140509588024528 -> 140509588023952
- 140509588024528 [label=NativeDropoutBackward0]
- 140509588046032 -> 140509588024528
- 140509588046032 [label=ViewBackward0]
- 140509588045936 -> 140509588046032
- 140509588045936 [label=AddmmBackward0]
- 140509588047472 -> 140509588045936
- 140509588047472 [label=ToCopyBackward0]
- 140509588047520 -> 140509588047472
- 140509591320512 [label="encoder.layer.8.experts.experts.1.output_query.dense.bias
- (768)" fillcolor=lightblue]
- 140509591320512 -> 140509588047520
- 140509588047520 [label=AccumulateGrad]
- 140509588047040 -> 140509588045936
- 140509588047040 [label=ViewBackward0]
- 140509588048048 -> 140509588047040
- 140509588048048 [label=GeluBackward0]
- 140509588049440 -> 140509588048048
- 140509588049440 [label=ViewBackward0]
- 140509588048912 -> 140509588049440
- 140509588048912 [label=AddmmBackward0]
- 140509588077680 -> 140509588048912
- 140509588077680 [label=ToCopyBackward0]
- 140517615729152 -> 140509588077680
- 140509591319472 [label="encoder.layer.8.experts.experts.1.intermediate_query.dense.bias
- (3072)" fillcolor=lightblue]
- 140509591319472 -> 140517615729152
- 140517615729152 [label=AccumulateGrad]
- 140509588075856 -> 140509588048912
- 140509588075856 [label=ViewBackward0]
- 140517615268000 -> 140509588075856
- 140517615268000 [label=ToCopyBackward0]
- 140509588023376 -> 140517615268000
- 140509588074560 -> 140509588048912
- 140509588074560 [label=TBackward0]
- 140517615267904 -> 140509588074560
- 140517615267904 [label=ToCopyBackward0]
- 140517615268144 -> 140517615267904
- 140509591319552 [label="encoder.layer.8.experts.experts.1.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591319552 -> 140517615268144
- 140517615268144 [label=AccumulateGrad]
- 140509588046992 -> 140509588045936
- 140509588046992 [label=TBackward0]
- 140509588075184 -> 140509588046992
- 140509588075184 [label=ToCopyBackward0]
- 140517615730304 -> 140509588075184
- 140509591319312 [label="encoder.layer.8.experts.experts.1.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591319312 -> 140517615730304
- 140517615730304 [label=AccumulateGrad]
- 140509588023376 -> 140509588023952
- 140509588023760 -> 140509588023472
- 140509591319072 [label="encoder.layer.8.experts.experts.1.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591319072 -> 140509588023760
- 140509588023760 [label=AccumulateGrad]
- 140509588022896 -> 140509588023472
- 140509591318992 [label="encoder.layer.8.experts.experts.1.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591318992 -> 140509588022896
- 140509588022896 [label=AccumulateGrad]
- 140509588022416 -> 140509588022512
- 140509588022416 [label=UnsqueezeBackward0]
- 140509588024240 -> 140509588022416
- 140509588024240 [label=UnsqueezeBackward0]
- 140509588024096 -> 140509588024240
- 140509588024096 [label=MulBackward0]
- 140509588047664 -> 140509588024096
- 140509588047664 [label=SoftmaxBackward0]
- 140509588049200 -> 140509588047664
- 140509588049200 [label=MmBackward0]
- 140509588046080 -> 140509588049200
- 140509588046080 [label=ToCopyBackward0]
- 140517615268048 -> 140509588046080
- 140517615268048 [label=DivBackward0]
- 140517615268336 -> 140517615268048
- 140517615268336 [label=SumBackward1]
- 140517615268432 -> 140517615268336
- 140517615268432 [label=MulBackward0]
- 140509588023376 -> 140517615268432
- 140517615267952 -> 140509588049200
- 140517615267952 [label=TBackward0]
- 140517615268384 -> 140517615267952
- 140517615268384 [label=ToCopyBackward0]
- 140517615268480 -> 140517615268384
- 140509591321392 [label="encoder.layer.8.experts.gate.weight
- (2, 768)" fillcolor=lightblue]
- 140509591321392 -> 140517615268480
- 140517615268480 [label=AccumulateGrad]
- 140509588021840 -> 140509587963664
- 140509588021840 [label=IndexBackward0]
- 140509588023136 -> 140509588021840
- 140509588023136 [label=NativeLayerNormBackward0]
- 140509588022608 -> 140509588023136
- 140509588022608 [label=AddBackward0]
- 140517615268528 -> 140509588022608
- 140517615268528 [label=NativeDropoutBackward0]
- 140517615268192 -> 140517615268528
- 140517615268192 [label=ViewBackward0]
- 140517615268672 -> 140517615268192
- 140517615268672 [label=AddmmBackward0]
- 140517615268768 -> 140517615268672
- 140517615268768 [label=ToCopyBackward0]
- 140517615268960 -> 140517615268768
- 140509591339792 [label="encoder.layer.8.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509591339792 -> 140517615268960
- 140517615268960 [label=AccumulateGrad]
- 140517615268720 -> 140517615268672
- 140517615268720 [label=ViewBackward0]
- 140517615269008 -> 140517615268720
- 140517615269008 [label=GeluBackward0]
- 140517615269104 -> 140517615269008
- 140517615269104 [label=ViewBackward0]
- 140517615269200 -> 140517615269104
- 140517615269200 [label=AddmmBackward0]
- 140517615269296 -> 140517615269200
- 140517615269296 [label=ToCopyBackward0]
- 140517615269488 -> 140517615269296
- 140509591340032 [label="encoder.layer.8.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509591340032 -> 140517615269488
- 140517615269488 [label=AccumulateGrad]
- 140517615269248 -> 140517615269200
- 140517615269248 [label=ViewBackward0]
- 140517615269536 -> 140517615269248
- 140517615269536 [label=ToCopyBackward0]
- 140517615268288 -> 140517615269536
- 140517615268288 [label=SliceBackward0]
- 140517615269680 -> 140517615268288
- 140517615269680 [label=SliceBackward0]
- 140517615269776 -> 140517615269680
- 140517615269776 [label=SliceBackward0]
- 140509588076912 -> 140517615269776
- 140517615268912 -> 140517615269200
- 140517615268912 [label=TBackward0]
- 140517615269440 -> 140517615268912
- 140517615269440 [label=ToCopyBackward0]
- 140517615269872 -> 140517615269440
- 140509591340352 [label="encoder.layer.8.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591340352 -> 140517615269872
- 140517615269872 [label=AccumulateGrad]
- 140517615268576 -> 140517615268672
- 140517615268576 [label=TBackward0]
- 140517615269152 -> 140517615268576
- 140517615269152 [label=ToCopyBackward0]
- 140517615269632 -> 140517615269152
- 140509591340112 [label="encoder.layer.8.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591340112 -> 140517615269632
- 140517615269632 [label=AccumulateGrad]
- 140517615268288 -> 140509588022608
- 140509588022176 -> 140509588023136
- 140509591339872 [label="encoder.layer.8.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591339872 -> 140509588022176
- 140509588022176 [label=AccumulateGrad]
- 140509588046560 -> 140509588023136
- 140509591339552 [label="encoder.layer.8.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591339552 -> 140509588046560
- 140509588046560 [label=AccumulateGrad]
- 140509588021312 -> 140509587991520
- 140509588021312 [label=TBackward0]
- 140509588021648 -> 140509588021312
- 140509588021648 [label=ToCopyBackward0]
- 140509588048336 -> 140509588021648
- 140509591321632 [label="encoder.layer.9.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509591321632 -> 140509588048336
- 140509588048336 [label=AccumulateGrad]
- 140509587991472 -> 140509587991568
- 140509587991472 [label=UnsafeViewBackward0]
- 140509587992144 -> 140509587991472
- 140509587992144 [label=CloneBackward0]
- 140509587992528 -> 140509587992144
- 140509587992528 [label=ExpandBackward0]
- 140509587991856 -> 140509587992528
- 140509587991856 [label=TransposeBackward0]
- 140509588022320 -> 140509587991856
- 140509588022320 [label=PermuteBackward0]
- 140509588021936 -> 140509588022320
- 140509588021936 [label=ViewBackward0]
- 140517615268624 -> 140509588021936
- 140517615268624 [label=ViewBackward0]
- 140517615268864 -> 140517615268624
- 140517615268864 [label=AddmmBackward0]
- 140517615269392 -> 140517615268864
- 140517615269392 [label=ToCopyBackward0]
- 140517615269584 -> 140517615269392
- 140509591322192 [label="encoder.layer.9.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509591322192 -> 140517615269584
- 140517615269584 [label=AccumulateGrad]
- 140517615269344 -> 140517615268864
- 140517615269344 [label=ViewBackward0]
- 140517615269920 -> 140517615269344
- 140517615269920 [label=ToCopyBackward0]
- 140509587963664 -> 140517615269920
- 140517615268096 -> 140517615268864
- 140517615268096 [label=TBackward0]
- 140517615269056 -> 140517615268096
- 140517615269056 [label=ToCopyBackward0]
- 140517615270064 -> 140517615269056
- 140509591321872 [label="encoder.layer.9.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509591321872 -> 140517615270064
- 140517615270064 [label=AccumulateGrad]
- 140509587990128 -> 140509587989840
- 140509587990128 [label=UnsafeViewBackward0]
- 140509587990512 -> 140509587990128
- 140509587990512 [label=CloneBackward0]
- 140509587990800 -> 140509587990512
- 140509587990800 [label=ExpandBackward0]
- 140509587991040 -> 140509587990800
- 140509587991040 [label=PermuteBackward0]
- 140509587990224 -> 140509587991040
- 140509587990224 [label=ViewBackward0]
- 140509587992336 -> 140509587990224
- 140509587992336 [label=ViewBackward0]
- 140509587990080 -> 140509587992336
- 140509587990080 [label=AddmmBackward0]
- 140509588021360 -> 140509587990080
- 140509588021360 [label=ToCopyBackward0]
- 140517615269824 -> 140509588021360
- 140509591322432 [label="encoder.layer.9.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509591322432 -> 140517615269824
- 140517615269824 [label=AccumulateGrad]
- 140509588021744 -> 140509587990080
- 140509588021744 [label=ViewBackward0]
- 140517615270160 -> 140509588021744
- 140517615270160 [label=ToCopyBackward0]
- 140509587963664 -> 140517615270160
- 140517615268240 -> 140509587990080
- 140517615268240 [label=TBackward0]
- 140517615269728 -> 140517615268240
- 140517615269728 [label=ToCopyBackward0]
- 140517615270208 -> 140517615269728
- 140509591322112 [label="encoder.layer.9.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509591322112 -> 140517615270208
- 140517615270208 [label=AccumulateGrad]
- 140509587988688 -> 140509587988784
- 140509587988688 [label=TBackward0]
- 140509587989648 -> 140509587988688
- 140509587989648 [label=ToCopyBackward0]
- 140509587989936 -> 140509587989648
- 140509591321712 [label="encoder.layer.9.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509591321712 -> 140509587989936
- 140509587989936 [label=AccumulateGrad]
- 140509587963664 -> 140509587963280
- 140509587963376 -> 140509587963040
- 140509591321232 [label="encoder.layer.9.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591321232 -> 140509587963376
- 140509587963376 [label=AccumulateGrad]
- 140509587962032 -> 140509587963040
- 140509591321472 [label="encoder.layer.9.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591321472 -> 140509587962032
- 140509587962032 [label=AccumulateGrad]
- 140509587961120 -> 140509587961600
- 140509587961120 [label=TBackward0]
- 140509587962224 -> 140509587961120
- 140509587962224 [label=ToCopyBackward0]
- 140509587962896 -> 140509587962224
- 140509591311760 [label="encoder.layer.9.experts.experts.0.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591311760 -> 140509587962896
- 140509587962896 [label=AccumulateGrad]
- 140509587960688 -> 140509587960976
- 140509587960688 [label=TBackward0]
- 140509587961744 -> 140509587960688
- 140509587961744 [label=ToCopyBackward0]
- 140509587962608 -> 140509587961744
- 140509591311440 [label="encoder.layer.9.experts.experts.0.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591311440 -> 140509587962608
- 140509587962608 [label=AccumulateGrad]
- 140509587960496 -> 140509587960112
- 140509587960208 -> 140509588463424
- 140509591311200 [label="encoder.layer.9.experts.experts.0.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591311200 -> 140509587960208
- 140509587960208 [label=AccumulateGrad]
- 140509587960016 -> 140509588463424
- 140509591311520 [label="encoder.layer.9.experts.experts.0.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591311520 -> 140509587960016
- 140509587960016 [label=AccumulateGrad]
- 140509588463376 -> 140509588463184
- 140509588463376 [label=UnsqueezeBackward0]
- 140509587960160 -> 140509588463376
- 140509587960160 [label=NativeLayerNormBackward0]
- 140509587960640 -> 140509587960160
- 140509587960640 [label=AddBackward0]
- 140509587963184 -> 140509587960640
- 140509587963184 [label=NativeDropoutBackward0]
- 140509587961648 -> 140509587963184
- 140509587961648 [label=ViewBackward0]
- 140509587962320 -> 140509587961648
- 140509587962320 [label=AddmmBackward0]
- 140509587963472 -> 140509587962320
- 140509587963472 [label=ToCopyBackward0]
- 140509587989168 -> 140509587963472
- 140509591311920 [label="encoder.layer.9.experts.experts.1.output_query.dense.bias
- (768)" fillcolor=lightblue]
- 140509591311920 -> 140509587989168
- 140509587989168 [label=AccumulateGrad]
- 140509587963568 -> 140509587962320
- 140509587963568 [label=ViewBackward0]
- 140509587989744 -> 140509587963568
- 140509587989744 [label=GeluBackward0]
- 140509587989072 -> 140509587989744
- 140509587989072 [label=ViewBackward0]
- 140509587990560 -> 140509587989072
- 140509587990560 [label=AddmmBackward0]
- 140509587991280 -> 140509587990560
- 140509587991280 [label=ToCopyBackward0]
- 140509588022224 -> 140509587991280
- 140509591311280 [label="encoder.layer.9.experts.experts.1.intermediate_query.dense.bias
- (3072)" fillcolor=lightblue]
- 140509591311280 -> 140509588022224
- 140509588022224 [label=AccumulateGrad]
- 140509587990992 -> 140509587990560
- 140509587990992 [label=ViewBackward0]
- 140517615270016 -> 140509587990992
- 140517615270016 [label=ToCopyBackward0]
- 140509587960496 -> 140517615270016
- 140509587988880 -> 140509587990560
- 140509587988880 [label=TBackward0]
- 140517615268816 -> 140509587988880
- 140517615268816 [label=ToCopyBackward0]
- 140517615270112 -> 140517615268816
- 140509591310960 [label="encoder.layer.9.experts.experts.1.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591310960 -> 140517615270112
- 140517615270112 [label=AccumulateGrad]
- 140509587961264 -> 140509587962320
- 140509587961264 [label=TBackward0]
- 140509587989456 -> 140509587961264
- 140509587989456 [label=ToCopyBackward0]
- 140509587992048 -> 140509587989456
- 140509591310720 [label="encoder.layer.9.experts.experts.1.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591310720 -> 140509587992048
- 140509587992048 [label=AccumulateGrad]
- 140509587960496 -> 140509587960640
- 140509587960592 -> 140509587960160
- 140509591310480 [label="encoder.layer.9.experts.experts.1.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591310480 -> 140509587960592
- 140509587960592 [label=AccumulateGrad]
- 140509587959920 -> 140509587960160
- 140509591310800 [label="encoder.layer.9.experts.experts.1.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591310800 -> 140509587959920
- 140509587959920 [label=AccumulateGrad]
- 140509588463280 -> 140509588462944
- 140509588463280 [label=UnsqueezeBackward0]
- 140509588463472 -> 140509588463280
- 140509588463472 [label=UnsqueezeBackward0]
- 140509587962704 -> 140509588463472
- 140509587962704 [label=MulBackward0]
- 140509587963856 -> 140509587962704
- 140509587963856 [label=SoftmaxBackward0]
- 140509587990320 -> 140509587963856
- 140509587990320 [label=MmBackward0]
- 140509587960304 -> 140509587990320
- 140509587960304 [label=ToCopyBackward0]
- 140517615270304 -> 140509587960304
- 140517615270304 [label=DivBackward0]
- 140517615270496 -> 140517615270304
- 140517615270496 [label=SumBackward1]
- 140517615270592 -> 140517615270496
- 140517615270592 [label=MulBackward0]
- 140509587960496 -> 140517615270592
- 140517615269968 -> 140509587990320
- 140517615269968 [label=TBackward0]
- 140517615270544 -> 140517615269968
- 140517615270544 [label=ToCopyBackward0]
- 140517615270640 -> 140517615270544
- 140509591313200 [label="encoder.layer.9.experts.gate.weight
- (2, 768)" fillcolor=lightblue]
- 140509591313200 -> 140517615270640
- 140517615270640 [label=AccumulateGrad]
- 140509588462416 -> 140509588428880
- 140509588462416 [label=IndexBackward0]
- 140509588462896 -> 140509588462416
- 140509588462896 [label=NativeLayerNormBackward0]
- 140509587963088 -> 140509588462896
- 140509587963088 [label=AddBackward0]
- 140517615270688 -> 140509587963088
- 140517615270688 [label=NativeDropoutBackward0]
- 140517615270352 -> 140517615270688
- 140517615270352 [label=ViewBackward0]
- 140517615270832 -> 140517615270352
- 140517615270832 [label=AddmmBackward0]
- 140517615270928 -> 140517615270832
- 140517615270928 [label=ToCopyBackward0]
- 140517615271120 -> 140517615270928
- 140509591320672 [label="encoder.layer.9.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509591320672 -> 140517615271120
- 140517615271120 [label=AccumulateGrad]
- 140517615270880 -> 140517615270832
- 140517615270880 [label=ViewBackward0]
- 140517615271168 -> 140517615270880
- 140517615271168 [label=GeluBackward0]
- 140517615271264 -> 140517615271168
- 140517615271264 [label=ViewBackward0]
- 140517615271360 -> 140517615271264
- 140517615271360 [label=AddmmBackward0]
- 140517615271456 -> 140517615271360
- 140517615271456 [label=ToCopyBackward0]
- 140517615271648 -> 140517615271456
- 140509591320752 [label="encoder.layer.9.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509591320752 -> 140517615271648
- 140517615271648 [label=AccumulateGrad]
- 140517615271408 -> 140517615271360
- 140517615271408 [label=ViewBackward0]
- 140517615271696 -> 140517615271408
- 140517615271696 [label=ToCopyBackward0]
- 140517615270448 -> 140517615271696
- 140517615270448 [label=SliceBackward0]
- 140517615271840 -> 140517615270448
- 140517615271840 [label=SliceBackward0]
- 140517615271888 -> 140517615271840
- 140517615271888 [label=SliceBackward0]
- 140509587963040 -> 140517615271888
- 140517615271072 -> 140517615271360
- 140517615271072 [label=TBackward0]
- 140517615271600 -> 140517615271072
- 140517615271600 [label=ToCopyBackward0]
- 140517615271552 -> 140517615271600
- 140509591320912 [label="encoder.layer.9.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591320912 -> 140517615271552
- 140517615271552 [label=AccumulateGrad]
- 140517615270736 -> 140517615270832
- 140517615270736 [label=TBackward0]
- 140517615271312 -> 140517615270736
- 140517615271312 [label=ToCopyBackward0]
- 140517615271792 -> 140517615271312
- 140509591320992 [label="encoder.layer.9.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591320992 -> 140517615271792
- 140517615271792 [label=AccumulateGrad]
- 140517615270448 -> 140509587963088
- 140509587962128 -> 140509588462896
- 140509591320432 [label="encoder.layer.9.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591320432 -> 140509587962128
- 140509587962128 [label=AccumulateGrad]
- 140509587961072 -> 140509588462896
- 140509591318592 [label="encoder.layer.9.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591318592 -> 140509587961072
- 140509587961072 [label=AccumulateGrad]
- 140509588461168 -> 140509588462128
- 140509588461168 [label=TBackward0]
- 140509588462512 -> 140509588461168
- 140509588462512 [label=ToCopyBackward0]
- 140509587988592 -> 140509588462512
- 140509591313440 [label="encoder.layer.10.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509591313440 -> 140509587988592
- 140509587988592 [label=AccumulateGrad]
- 140509588461072 -> 140509588460784
- 140509588461072 [label=UnsafeViewBackward0]
- 140509588461456 -> 140509588461072
- 140509588461456 [label=CloneBackward0]
- 140509588461744 -> 140509588461456
- 140509588461744 [label=ExpandBackward0]
- 140509588462224 -> 140509588461744
- 140509588462224 [label=TransposeBackward0]
- 140509588463088 -> 140509588462224
- 140509588463088 [label=PermuteBackward0]
- 140509588462800 -> 140509588463088
- 140509588462800 [label=ViewBackward0]
- 140517615270784 -> 140509588462800
- 140517615270784 [label=ViewBackward0]
- 140517615271024 -> 140517615270784
- 140517615271024 [label=AddmmBackward0]
- 140517615271744 -> 140517615271024
- 140517615271744 [label=ToCopyBackward0]
- 140517615321248 -> 140517615271744
- 140509591313600 [label="encoder.layer.10.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509591313600 -> 140517615321248
- 140517615321248 [label=AccumulateGrad]
- 140517615271504 -> 140517615271024
- 140517615271504 [label=ViewBackward0]
- 140517615321296 -> 140517615271504
- 140517615321296 [label=ToCopyBackward0]
- 140509588428880 -> 140517615321296
- 140517615270256 -> 140517615271024
- 140517615270256 [label=TBackward0]
- 140517615321152 -> 140517615270256
- 140517615321152 [label=ToCopyBackward0]
- 140517615321440 -> 140517615321152
- 140509591313680 [label="encoder.layer.10.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509591313680 -> 140517615321440
- 140517615321440 [label=AccumulateGrad]
- 140509588429936 -> 140509588430704
- 140509588429936 [label=UnsafeViewBackward0]
- 140509588460112 -> 140509588429936
- 140509588460112 [label=CloneBackward0]
- 140509588460400 -> 140509588460112
- 140509588460400 [label=ExpandBackward0]
- 140509588460688 -> 140509588460400
- 140509588460688 [label=PermuteBackward0]
- 140509588459632 -> 140509588460688
- 140509588459632 [label=ViewBackward0]
- 140509588461504 -> 140509588459632
- 140509588461504 [label=ViewBackward0]
- 140509588462704 -> 140509588461504
- 140509588462704 [label=AddmmBackward0]
- 140509588461024 -> 140509588462704
- 140509588461024 [label=ToCopyBackward0]
- 140517615271216 -> 140509588461024
- 140509591313840 [label="encoder.layer.10.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509591313840 -> 140517615271216
- 140517615271216 [label=AccumulateGrad]
- 140509588459680 -> 140509588462704
- 140509588459680 [label=ViewBackward0]
- 140517615321536 -> 140509588459680
- 140517615321536 [label=ToCopyBackward0]
- 140509588428880 -> 140517615321536
- 140517615270400 -> 140509588462704
- 140517615270400 [label=TBackward0]
- 140517615321392 -> 140517615270400
- 140517615321392 [label=ToCopyBackward0]
- 140517615321584 -> 140517615321392
- 140509591313920 [label="encoder.layer.10.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509591313920 -> 140517615321584
- 140517615321584 [label=AccumulateGrad]
- 140509588428928 -> 140509588429360
- 140509588428928 [label=TBackward0]
- 140509588430128 -> 140509588428928
- 140509588430128 [label=ToCopyBackward0]
- 140509588430368 -> 140509588430128
- 140509591313120 [label="encoder.layer.10.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509591313120 -> 140509588430368
- 140509588430368 [label=AccumulateGrad]
- 140509588428880 -> 140509588428784
- 140509588428448 -> 140509588428592
- 140509591312640 [label="encoder.layer.10.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591312640 -> 140509588428448
- 140509588428448 [label=AccumulateGrad]
- 140509588427824 -> 140509588428592
- 140509591312880 [label="encoder.layer.10.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591312880 -> 140509588427824
- 140509588427824 [label=AccumulateGrad]
- 140509588426816 -> 140509588427536
- 140509588426816 [label=TBackward0]
- 140509588427728 -> 140509588426816
- 140509588427728 [label=ToCopyBackward0]
- 140509588428400 -> 140509588427728
- 140509591312720 [label="encoder.layer.10.crossattention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509591312720 -> 140509588428400
- 140509588428400 [label=AccumulateGrad]
- 140509588405840 -> 140509588405504
- 140509588405840 [label=UnsafeViewBackward0]
- 140509588406032 -> 140509588405840
- 140509588406032 [label=CloneBackward0]
- 140509588427008 -> 140509588406032
- 140509588427008 [label=ExpandBackward0]
- 140509588427488 -> 140509588427008
- 140509588427488 [label=TransposeBackward0]
- 140509588428208 -> 140509588427488
- 140509588428208 [label=PermuteBackward0]
- 140509588428688 -> 140509588428208
- 140509588428688 [label=ViewBackward0]
- 140509588429264 -> 140509588428688
- 140509588429264 [label=ViewBackward0]
- 140509588429744 -> 140509588429264
- 140509588429744 [label=AddmmBackward0]
- 140509588430320 -> 140509588429744
- 140509588430320 [label=ToCopyBackward0]
- 140509588460208 -> 140509588430320
- 140509591312480 [label="encoder.layer.10.crossattention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509591312480 -> 140509588460208
- 140509588460208 [label=AccumulateGrad]
- 140509588429648 -> 140509588429744
- 140509588429648 [label=ViewBackward0]
- 140509588460592 -> 140509588429648
- 140509588460592 [label=ToCopyBackward0]
- 140509588461264 -> 140509588460592
- 140509588461264 [label=ViewBackward0]
- 140517615270976 -> 140509588461264
- 140517615270976 [label=CloneBackward0]
- 140509588459584 -> 140517615270976
- 140509588459584 [label=ExpandBackward0]
- 140517615321632 -> 140509588459584
- 140517615321632 [label=UnsqueezeBackward0]
- 140517615539152 -> 140517615321632
- 140509588426864 -> 140509588429744
- 140509588426864 [label=TBackward0]
- 140509588461936 -> 140509588426864
- 140509588461936 [label=ToCopyBackward0]
- 140509588460880 -> 140509588461936
- 140509591312400 [label="encoder.layer.10.crossattention.self.key.weight
- (768, 1408)" fillcolor=lightblue]
- 140509591312400 -> 140509588460880
- 140509588460880 [label=AccumulateGrad]
- 140509588404064 -> 140509588404208
- 140509588404064 [label=UnsafeViewBackward0]
- 140509588404880 -> 140509588404064
- 140509588404880 [label=CloneBackward0]
- 140509588405168 -> 140509588404880
- 140509588405168 [label=ExpandBackward0]
- 140509588405552 -> 140509588405168
- 140509588405552 [label=PermuteBackward0]
- 140509588404304 -> 140509588405552
- 140509588404304 [label=ViewBackward0]
- 140509588405936 -> 140509588404304
- 140509588405936 [label=ViewBackward0]
- 140509588427968 -> 140509588405936
- 140509588427968 [label=AddmmBackward0]
- 140509588428112 -> 140509588427968
- 140509588428112 [label=ToCopyBackward0]
- 140509588459920 -> 140509588428112
- 140509591310560 [label="encoder.layer.10.crossattention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509591310560 -> 140509588459920
- 140509588459920 [label=AccumulateGrad]
- 140509588428976 -> 140509588427968
- 140509588428976 [label=ViewBackward0]
- 140509588429888 -> 140509588428976
- 140509588429888 [label=ToCopyBackward0]
- 140509588461264 -> 140509588429888
- 140509588426960 -> 140509588427968
- 140509588426960 [label=TBackward0]
- 140517615321680 -> 140509588426960
- 140517615321680 [label=ToCopyBackward0]
- 140517615321344 -> 140517615321680
- 140509591312240 [label="encoder.layer.10.crossattention.self.value.weight
- (768, 1408)" fillcolor=lightblue]
- 140509591312240 -> 140517615321344
- 140517615321344 [label=AccumulateGrad]
- 140509588402576 -> 140509588402864
- 140509588402576 [label=TBackward0]
- 140509588403584 -> 140509588402576
- 140509588403584 [label=ToCopyBackward0]
- 140509588404016 -> 140509588403584
- 140509591311040 [label="encoder.layer.10.crossattention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509591311040 -> 140509588404016
- 140509588404016 [label=AccumulateGrad]
- 140509588402384 -> 140509588373360
- 140509588372784 -> 140509588373456
- 140509591293760 [label="encoder.layer.10.crossattention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591293760 -> 140509588372784
- 140509588372784 [label=AccumulateGrad]
- 140509588402240 -> 140509588373456
- 140509591293520 [label="encoder.layer.10.crossattention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591293520 -> 140509588402240
- 140509588402240 [label=AccumulateGrad]
- 140509588372016 -> 140509588372496
- 140509588372016 [label=TBackward0]
- 140509588372688 -> 140509588372016
- 140509588372688 [label=ToCopyBackward0]
- 140509588373168 -> 140509588372688
- 140509591289920 [label="encoder.layer.10.experts.experts.0.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591289920 -> 140509588373168
- 140509588373168 [label=AccumulateGrad]
- 140509588371008 -> 140509588371440
- 140509588371008 [label=TBackward0]
- 140509588372208 -> 140509588371008
- 140509588372208 [label=ToCopyBackward0]
- 140509588372928 -> 140509588372208
- 140509591290240 [label="encoder.layer.10.experts.experts.0.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591290240 -> 140509588372928
- 140509588372928 [label=AccumulateGrad]
- 140509588370960 -> 140509588370864
- 140509588370528 -> 140509588370672
- 140509591285328 [label="encoder.layer.10.experts.experts.0.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591285328 -> 140509588370528
- 140509588370528 [label=AccumulateGrad]
- 140509588370576 -> 140509588370672
- 140509591285248 [label="encoder.layer.10.experts.experts.0.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591285248 -> 140509588370576
- 140509588370576 [label=AccumulateGrad]
- 140509588370384 -> 140509588370192
- 140509588370384 [label=UnsqueezeBackward0]
- 140509588371056 -> 140509588370384
- 140509588371056 [label=NativeLayerNormBackward0]
- 140509588371536 -> 140509588371056
- 140509588371536 [label=AddBackward0]
- 140509588373072 -> 140509588371536
- 140509588373072 [label=NativeDropoutBackward0]
- 140509588371968 -> 140509588373072
- 140509588371968 [label=ViewBackward0]
- 140509588402288 -> 140509588371968
- 140509588402288 [label=AddmmBackward0]
- 140509588403248 -> 140509588402288
- 140509588403248 [label=ToCopyBackward0]
- 140509588403440 -> 140509588403248
- 140509591284528 [label="encoder.layer.10.experts.experts.1.output_query.dense.bias
- (768)" fillcolor=lightblue]
- 140509591284528 -> 140509588403440
- 140509588403440 [label=AccumulateGrad]
- 140509588402960 -> 140509588402288
- 140509588402960 [label=ViewBackward0]
- 140509588403536 -> 140509588402960
- 140509588403536 [label=GeluBackward0]
- 140509588405360 -> 140509588403536
- 140509588405360 [label=ViewBackward0]
- 140509588404592 -> 140509588405360
- 140509588404592 [label=AddmmBackward0]
- 140509588429456 -> 140509588404592
- 140509588429456 [label=ToCopyBackward0]
- 140517615321776 -> 140509588429456
- 140509591284768 [label="encoder.layer.10.experts.experts.1.intermediate_query.dense.bias
- (3072)" fillcolor=lightblue]
- 140509591284768 -> 140517615321776
- 140517615321776 [label=AccumulateGrad]
- 140509588427248 -> 140509588404592
- 140509588427248 [label=ViewBackward0]
- 140517615321488 -> 140509588427248
- 140517615321488 [label=ToCopyBackward0]
- 140509588370960 -> 140517615321488
- 140509588404688 -> 140509588404592
- 140509588404688 [label=TBackward0]
- 140517615321728 -> 140509588404688
- 140517615321728 [label=ToCopyBackward0]
- 140517615321968 -> 140517615321728
- 140509591285088 [label="encoder.layer.10.experts.experts.1.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591285088 -> 140517615321968
- 140517615321968 [label=AccumulateGrad]
- 140509588402768 -> 140509588402288
- 140509588402768 [label=TBackward0]
- 140509588405648 -> 140509588402768
- 140509588405648 [label=ToCopyBackward0]
- 140509588405072 -> 140509588405648
- 140509591284848 [label="encoder.layer.10.experts.experts.1.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591284848 -> 140509588405072
- 140509588405072 [label=AccumulateGrad]
- 140509588370960 -> 140509588371536
- 140509588371344 -> 140509588371056
- 140509591284608 [label="encoder.layer.10.experts.experts.1.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591284608 -> 140509588371344
- 140509588371344 [label=AccumulateGrad]
- 140509588370480 -> 140509588371056
- 140509591285008 [label="encoder.layer.10.experts.experts.1.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591285008 -> 140509588370480
- 140509588370480 [label=AccumulateGrad]
- 140509588370000 -> 140509588370096
- 140509588370000 [label=UnsqueezeBackward0]
- 140509588371824 -> 140509588370000
- 140509588371824 [label=UnsqueezeBackward0]
- 140509588371728 -> 140509588371824
- 140509588371728 [label=MulBackward0]
- 140509588370048 -> 140509588371728
- 140509588370048 [label=SoftmaxBackward0]
- 140509588403824 -> 140509588370048
- 140509588403824 [label=MmBackward0]
- 140517615321824 -> 140509588403824
- 140517615321824 [label=ToCopyBackward0]
- 140517615321872 -> 140517615321824
- 140517615321872 [label=DivBackward0]
- 140517615322160 -> 140517615321872
- 140517615322160 [label=SumBackward1]
- 140517615322256 -> 140517615322160
- 140517615322256 [label=MulBackward0]
- 140509588370960 -> 140517615322256
- 140517615322064 -> 140509588403824
- 140517615322064 [label=TBackward0]
- 140517615322208 -> 140517615322064
- 140517615322208 [label=ToCopyBackward0]
- 140517615322304 -> 140517615322208
- 140509591291120 [label="encoder.layer.10.experts.gate.weight
- (2, 768)" fillcolor=lightblue]
- 140509591291120 -> 140517615322304
- 140517615322304 [label=AccumulateGrad]
- 140509588369520 -> 140509588315344
- 140509588369520 [label=IndexBackward0]
- 140509588370768 -> 140509588369520
- 140509588370768 [label=NativeLayerNormBackward0]
- 140509588372448 -> 140509588370768
- 140509588372448 [label=AddBackward0]
- 140517615322352 -> 140509588372448
- 140517615322352 [label=NativeDropoutBackward0]
- 140517615322016 -> 140517615322352
- 140517615322016 [label=ViewBackward0]
- 140517615322496 -> 140517615322016
- 140517615322496 [label=AddmmBackward0]
- 140517615322592 -> 140517615322496
- 140517615322592 [label=ToCopyBackward0]
- 140517615322784 -> 140517615322592
- 140509591293040 [label="encoder.layer.10.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509591293040 -> 140517615322784
- 140517615322784 [label=AccumulateGrad]
- 140517615322544 -> 140517615322496
- 140517615322544 [label=ViewBackward0]
- 140517615322832 -> 140517615322544
- 140517615322832 [label=GeluBackward0]
- 140517615322928 -> 140517615322832
- 140517615322928 [label=ViewBackward0]
- 140517615323024 -> 140517615322928
- 140517615323024 [label=AddmmBackward0]
- 140517615323120 -> 140517615323024
- 140517615323120 [label=ToCopyBackward0]
- 140517615323312 -> 140517615323120
- 140509591293280 [label="encoder.layer.10.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509591293280 -> 140517615323312
- 140517615323312 [label=AccumulateGrad]
- 140517615323072 -> 140517615323024
- 140517615323072 [label=ViewBackward0]
- 140517615323360 -> 140517615323072
- 140517615323360 [label=ToCopyBackward0]
- 140517615322112 -> 140517615323360
- 140517615322112 [label=SliceBackward0]
- 140517615323504 -> 140517615322112
- 140517615323504 [label=SliceBackward0]
- 140517615323600 -> 140517615323504
- 140517615323600 [label=SliceBackward0]
- 140509588428592 -> 140517615323600
- 140517615322736 -> 140517615323024
- 140517615322736 [label=TBackward0]
- 140517615323264 -> 140517615322736
- 140517615323264 [label=ToCopyBackward0]
- 140517615323696 -> 140517615323264
- 140509591293600 [label="encoder.layer.10.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591293600 -> 140517615323696
- 140517615323696 [label=AccumulateGrad]
- 140517615322400 -> 140517615322496
- 140517615322400 [label=TBackward0]
- 140517615322976 -> 140517615322400
- 140517615322976 [label=ToCopyBackward0]
- 140517615323456 -> 140517615322976
- 140509591293360 [label="encoder.layer.10.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591293360 -> 140517615323456
- 140517615323456 [label=AccumulateGrad]
- 140517615322112 -> 140509588372448
- 140509588369808 -> 140509588370768
- 140509591293120 [label="encoder.layer.10.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591293120 -> 140509588369808
- 140509588369808 [label=AccumulateGrad]
- 140509588403104 -> 140509588370768
- 140509591292800 [label="encoder.layer.10.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591292800 -> 140509588403104
- 140509588403104 [label=AccumulateGrad]
- 140509588347344 -> 140509588348304
- 140509588347344 [label=TBackward0]
- 140509588348880 -> 140509588347344
- 140509588348880 [label=ToCopyBackward0]
- 140509588402480 -> 140509588348880
- 140509591291360 [label="encoder.layer.11.attention.self.query.weight
- (768, 768)" fillcolor=lightblue]
- 140509591291360 -> 140509588402480
- 140509588402480 [label=AccumulateGrad]
- 140509588347104 -> 140509588347248
- 140509588347104 [label=UnsafeViewBackward0]
- 140509588347920 -> 140509588347104
- 140509588347920 [label=CloneBackward0]
- 140509588348208 -> 140509588347920
- 140509588348208 [label=ExpandBackward0]
- 140509588348688 -> 140509588348208
- 140509588348688 [label=TransposeBackward0]
- 140509588347632 -> 140509588348688
- 140509588347632 [label=PermuteBackward0]
- 140509588369712 -> 140509588347632
- 140509588369712 [label=ViewBackward0]
- 140517615322448 -> 140509588369712
- 140517615322448 [label=ViewBackward0]
- 140517615322688 -> 140517615322448
- 140517615322688 [label=AddmmBackward0]
- 140517615323216 -> 140517615322688
- 140517615323216 [label=ToCopyBackward0]
- 140517615323408 -> 140517615323216
- 140509591291920 [label="encoder.layer.11.attention.self.key.bias
- (768)" fillcolor=lightblue]
- 140509591291920 -> 140517615323408
- 140517615323408 [label=AccumulateGrad]
- 140517615323168 -> 140517615322688
- 140517615323168 [label=ViewBackward0]
- 140517615323744 -> 140517615323168
- 140517615323744 [label=ToCopyBackward0]
- 140509588315344 -> 140517615323744
- 140517615321200 -> 140517615322688
- 140517615321200 [label=TBackward0]
- 140517615322880 -> 140517615321200
- 140517615322880 [label=ToCopyBackward0]
- 140517615323888 -> 140517615322880
- 140509591291600 [label="encoder.layer.11.attention.self.key.weight
- (768, 768)" fillcolor=lightblue]
- 140509591291600 -> 140517615323888
- 140517615323888 [label=AccumulateGrad]
- 140509588345808 -> 140509588345616
- 140509588345808 [label=UnsafeViewBackward0]
- 140509588346144 -> 140509588345808
- 140509588346144 [label=CloneBackward0]
- 140509588346576 -> 140509588346144
- 140509588346576 [label=ExpandBackward0]
- 140509588346864 -> 140509588346576
- 140509588346864 [label=PermuteBackward0]
- 140509588346000 -> 140509588346864
- 140509588346000 [label=ViewBackward0]
- 140509588348112 -> 140509588346000
- 140509588348112 [label=ViewBackward0]
- 140509588348400 -> 140509588348112
- 140509588348400 [label=AddmmBackward0]
- 140509588369616 -> 140509588348400
- 140509588369616 [label=ToCopyBackward0]
- 140517615323648 -> 140509588369616
- 140509591292160 [label="encoder.layer.11.attention.self.value.bias
- (768)" fillcolor=lightblue]
- 140509591292160 -> 140517615323648
- 140517615323648 [label=AccumulateGrad]
- 140509588369904 -> 140509588348400
- 140509588369904 [label=ViewBackward0]
- 140517615323984 -> 140509588369904
- 140517615323984 [label=ToCopyBackward0]
- 140509588315344 -> 140517615323984
- 140517615321920 -> 140509588348400
- 140517615321920 [label=TBackward0]
- 140517615323552 -> 140517615321920
- 140517615323552 [label=ToCopyBackward0]
- 140517615324032 -> 140517615323552
- 140509591291840 [label="encoder.layer.11.attention.self.value.weight
- (768, 768)" fillcolor=lightblue]
- 140509591291840 -> 140517615324032
- 140517615324032 [label=AccumulateGrad]
- 140509588315536 -> 140509588315824
- 140509588315536 [label=TBackward0]
- 140509588345328 -> 140509588315536
- 140509588345328 [label=ToCopyBackward0]
- 140509588345712 -> 140509588345328
- 140509591291440 [label="encoder.layer.11.attention.output.dense.weight
- (768, 768)" fillcolor=lightblue]
- 140509591291440 -> 140509588345712
- 140509588345712 [label=AccumulateGrad]
- 140509588315344 -> 140509588314960
- 140509588315056 -> 140509588314768
- 140509591290960 [label="encoder.layer.11.attention.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591290960 -> 140509588315056
- 140509588315056 [label=AccumulateGrad]
- 140509588313568 -> 140509588314768
- 140509591291200 [label="encoder.layer.11.attention.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591291200 -> 140509588313568
- 140509588313568 [label=AccumulateGrad]
- 140509588312272 -> 140509588313328
- 140509588312272 [label=TBackward0]
- 140509588313904 -> 140509588312272
- 140509588313904 [label=ToCopyBackward0]
- 140509588314576 -> 140509588313904
- 140509591260912 [label="encoder.layer.11.experts.experts.0.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591260912 -> 140509588314576
- 140509588314576 [label=AccumulateGrad]
- 140509588312848 -> 140509588313232
- 140509588312848 [label=TBackward0]
- 140509588312128 -> 140509588312848
- 140509588312128 [label=ToCopyBackward0]
- 140509588314192 -> 140509588312128
- 140509591260592 [label="encoder.layer.11.experts.experts.0.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591260592 -> 140509588314192
- 140509588314192 [label=AccumulateGrad]
- 140509588312608 -> 140509591317376
- 140509591314832 -> 140509591314640
- 140509591260352 [label="encoder.layer.11.experts.experts.0.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591260352 -> 140509591314832
- 140509591314832 [label=AccumulateGrad]
- 140509591317568 -> 140509591314640
- 140509591260832 [label="encoder.layer.11.experts.experts.0.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591260832 -> 140509591317568
- 140509591317568 [label=AccumulateGrad]
- 140509591315408 -> 140509588282864
- 140509591315408 [label=UnsqueezeBackward0]
- 140509591268800 -> 140509591315408
- 140509591268800 [label=NativeLayerNormBackward0]
- 140509588313088 -> 140509591268800
- 140509588313088 [label=AddBackward0]
- 140509588314864 -> 140509588313088
- 140509588314864 [label=NativeDropoutBackward0]
- 140509588312224 -> 140509588314864
- 140509588312224 [label=ViewBackward0]
- 140509588314000 -> 140509588312224
- 140509588314000 [label=AddmmBackward0]
- 140509588315008 -> 140509588314000
- 140509588315008 [label=ToCopyBackward0]
- 140509588315920 -> 140509588315008
- 140509591259952 [label="encoder.layer.11.experts.experts.1.output_query.dense.bias
- (768)" fillcolor=lightblue]
- 140509591259952 -> 140509588315920
- 140509588315920 [label=AccumulateGrad]
- 140509588315152 -> 140509588314000
- 140509588315152 [label=ViewBackward0]
- 140509588315488 -> 140509588315152
- 140509588315488 [label=GeluBackward0]
- 140509588345232 -> 140509588315488
- 140509588345232 [label=ViewBackward0]
- 140509588346384 -> 140509588345232
- 140509588346384 [label=AddmmBackward0]
- 140509588347056 -> 140509588346384
- 140509588347056 [label=ToCopyBackward0]
- 140509588345904 -> 140509588347056
- 140509591260192 [label="encoder.layer.11.experts.experts.1.intermediate_query.dense.bias
- (3072)" fillcolor=lightblue]
- 140509591260192 -> 140509588345904
- 140509588345904 [label=AccumulateGrad]
- 140509588346624 -> 140509588346384
- 140509588346624 [label=ViewBackward0]
- 140517615323840 -> 140509588346624
- 140517615323840 [label=ToCopyBackward0]
- 140509588312608 -> 140517615323840
- 140509588346096 -> 140509588346384
- 140509588346096 [label=TBackward0]
- 140517615322640 -> 140509588346096
- 140517615322640 [label=ToCopyBackward0]
- 140517615323936 -> 140517615322640
- 140509591260112 [label="encoder.layer.11.experts.experts.1.intermediate_query.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591260112 -> 140517615323936
- 140517615323936 [label=AccumulateGrad]
- 140509588312464 -> 140509588314000
- 140509588312464 [label=TBackward0]
- 140509588344944 -> 140509588312464
- 140509588344944 [label=ToCopyBackward0]
- 140509588347728 -> 140509588344944
- 140509591259872 [label="encoder.layer.11.experts.experts.1.output_query.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591259872 -> 140509588347728
- 140509588347728 [label=AccumulateGrad]
- 140509588312608 -> 140509588313088
- 140509588313136 -> 140509591268800
- 140509591259632 [label="encoder.layer.11.experts.experts.1.output_query.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591259632 -> 140509588313136
- 140509588313136 [label=AccumulateGrad]
- 140509588312752 -> 140509591268800
- 140509591260432 [label="encoder.layer.11.experts.experts.1.output_query.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591260432 -> 140509588312752
- 140509588312752 [label=AccumulateGrad]
- 140509588282672 -> 140509588283152
- 140509588282672 [label=UnsqueezeBackward0]
- 140509591318432 -> 140509588282672
- 140509591318432 [label=UnsqueezeBackward0]
- 140509588314384 -> 140509591318432
- 140509588314384 [label=MulBackward0]
- 140509588315440 -> 140509588314384
- 140509588315440 [label=SoftmaxBackward0]
- 140509588345520 -> 140509588315440
- 140509588345520 [label=MmBackward0]
- 140509588312656 -> 140509588345520
- 140509588312656 [label=ToCopyBackward0]
- 140517615324128 -> 140509588312656
- 140517615324128 [label=DivBackward0]
- 140517615324320 -> 140517615324128
- 140517615324320 [label=SumBackward1]
- 140517615324416 -> 140517615324320
- 140517615324416 [label=MulBackward0]
- 140509588312608 -> 140517615324416
- 140517615323792 -> 140509588345520
- 140517615323792 [label=TBackward0]
- 140517615324368 -> 140517615323792
- 140517615324368 [label=ToCopyBackward0]
- 140517615324464 -> 140517615324368
- 140509591282928 [label="encoder.layer.11.experts.gate.weight
- (2, 768)" fillcolor=lightblue]
- 140509591282928 -> 140517615324464
- 140517615324464 [label=AccumulateGrad]
- 140509588282432 -> 140509588281712
- 140509588282432 [label=IndexBackward0]
- 140509588283248 -> 140509588282432
- 140509588283248 [label=IndexBackward0]
- 140509591317952 -> 140509588283248
- 140509591317952 [label=NativeLayerNormBackward0]
- 140509588345040 -> 140509591317952
- 140509588345040 [label=AddBackward0]
- 140517615324560 -> 140509588345040
- 140517615324560 [label=NativeDropoutBackward0]
- 140517615324608 -> 140517615324560
- 140517615324608 [label=ViewBackward0]
- 140517615324704 -> 140517615324608
- 140517615324704 [label=AddmmBackward0]
- 140517615324800 -> 140517615324704
- 140517615324800 [label=ToCopyBackward0]
- 140517615324992 -> 140517615324800
- 140509591290400 [label="encoder.layer.11.output.dense.bias
- (768)" fillcolor=lightblue]
- 140509591290400 -> 140517615324992
- 140517615324992 [label=AccumulateGrad]
- 140517615324752 -> 140517615324704
- 140517615324752 [label=ViewBackward0]
- 140517615325040 -> 140517615324752
- 140517615325040 [label=GeluBackward0]
- 140517615325136 -> 140517615325040
- 140517615325136 [label=ViewBackward0]
- 140517615324944 -> 140517615325136
- 140517615324944 [label=AddmmBackward0]
- 140517615382736 -> 140517615324944
- 140517615382736 [label=ToCopyBackward0]
- 140517615382928 -> 140517615382736
- 140509591290480 [label="encoder.layer.11.intermediate.dense.bias
- (3072)" fillcolor=lightblue]
- 140509591290480 -> 140517615382928
- 140517615382928 [label=AccumulateGrad]
- 140517615382688 -> 140517615324944
- 140517615382688 [label=ViewBackward0]
- 140517615382976 -> 140517615382688
- 140517615382976 [label=ToCopyBackward0]
- 140517615324512 -> 140517615382976
- 140517615324512 [label=SliceBackward0]
- 140517615383120 -> 140517615324512
- 140517615383120 [label=SliceBackward0]
- 140517615383216 -> 140517615383120
- 140517615383216 [label=SliceBackward0]
- 140509588314768 -> 140517615383216
- 140517615382592 -> 140517615324944
- 140517615382592 [label=TBackward0]
- 140517615382880 -> 140517615382592
- 140517615382880 [label=ToCopyBackward0]
- 140517615383312 -> 140517615382880
- 140509591290640 [label="encoder.layer.11.intermediate.dense.weight
- (3072, 768)" fillcolor=lightblue]
- 140509591290640 -> 140517615383312
- 140517615383312 [label=AccumulateGrad]
- 140517615324080 -> 140517615324704
- 140517615324080 [label=TBackward0]
- 140517615324896 -> 140517615324080
- 140517615324896 [label=ToCopyBackward0]
- 140517615383072 -> 140517615324896
- 140509591290720 [label="encoder.layer.11.output.dense.weight
- (768, 3072)" fillcolor=lightblue]
- 140509591290720 -> 140517615383072
- 140517615383072 [label=AccumulateGrad]
- 140517615324512 -> 140509588345040
- 140509588314672 -> 140509591317952
- 140509591290160 [label="encoder.layer.11.output.LayerNorm.weight
- (768)" fillcolor=lightblue]
- 140509591290160 -> 140509588314672
- 140509588314672 [label=AccumulateGrad]
- 140509588313712 -> 140509591317952
- 140509591290000 [label="encoder.layer.11.output.LayerNorm.bias
- (768)" fillcolor=lightblue]
- 140509591290000 -> 140509588313712
- 140509588313712 [label=AccumulateGrad]
- 140509588281712 -> 140509988778688
-}
diff --git a/test.pdf/backward_graph.pdf b/test.pdf/backward_graph.pdf
deleted file mode 100644
index 7f162b0..0000000
Binary files a/test.pdf/backward_graph.pdf and /dev/null differ
diff --git a/test/datasets/test_dataset.py b/test/datasets/test_dataset.py
new file mode 100644
index 0000000..c4f64a8
--- /dev/null
+++ b/test/datasets/test_dataset.py
@@ -0,0 +1,64 @@
+import datasets
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
+import random
+from tqdm import tqdm
+
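+# Earlier, commented-out experiment: seq2seq-style preprocessing of CMRC2018 with a BERT tokenizer.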
+# path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/minigpt4/models/cmrc2018_trial.json"
+# dataset = load_dataset("json", data_files=[path], field="data", split="train")
+# tokenizer = AutoTokenizer.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased")
+# def preprocess_function(example):
+# import pdb; pdb.set_trace()
+# model_inputs = tokenizer(example["content"], max_length=512, truncation=True)
+# labels = tokenizer(example["title"], max_length=32, truncation=True)
+# # the labels are simply the tokenized title
+# model_inputs["labels"] = labels["input_ids"]
+# return model_inputs
+# processed_datasets = dataset.map(preprocess_function)
+
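+# Load the local Alpaca-20k instruction dataset and train on its "train" split.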
+dataset = load_dataset("/mnt/pfs-guan-ssai/nlu/wanghanzi/data/alpaca_20k")
+train_dataset = dataset['train']
+
+
+# Build a memory-augmented prompt for each example by prepending a randomly
+# chosen example (index 0..i) as in-context "memory". Note: indexing a HF
+# Dataset returns a copy, so `train_dataset[i]['text'] = ...` is silently
+# discarded; collect the prompts and attach them as a new column instead.
+texts = [f"Instruction:{train_dataset[0]['instruction']}"]  # row 0 gets no memory prefix
+for i in tqdm(range(1, len(train_dataset))):
+ idx = random.randint(0, i)
+ memory = train_dataset[idx]
+ memory_text = f"Instruction: {memory['instruction']}\n Answer: {memory['output']} \n"
+ texts.append(f"{memory_text} Instruction:{train_dataset[i]['instruction']}")
+train_dataset = train_dataset.add_column("text", texts)
+
+
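+# Load a small local causal LM (OPT-350M) and its tokenizer for the SFT run.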
+model_path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/opt_350m"
+model = AutoModelForCausalLM.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+
+def formatting_prompts_func(example):
+ # Turn each (instruction, output) pair in the batch into a single prompt string.
+ output_texts = []
+ for i in range(len(example['instruction'])):
+ text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
+ output_texts.append(text)
+ return output_texts
+
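+# DataCollatorForCompletionOnlyLM masks everything before the response template, so the loss is computed only on the answer tokens.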
+response_template = " ### Answer:"
+collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
+
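+# SFTTrainer tokenizes the strings returned by formatting_prompts_func and trains with the completion-only collator.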
+trainer = SFTTrainer(
+ model,
+ train_dataset=train_dataset,
+ formatting_func=formatting_prompts_func,
+ data_collator=collator,
+)
+trainer.train()
\ No newline at end of file