diff --git a/evaluation/coco_caption.py b/evaluation/coco_caption.py new file mode 100644 index 0000000..b6179b8 --- /dev/null +++ b/evaluation/coco_caption.py @@ -0,0 +1,94 @@ +import os +import json +import ast + +from pycocoevalcap.eval import COCOEvalCap +from collections import defaultdict + +# Lightweight stand-ins for the pycocotools COCO objects that COCOEvalCap reads (imgToAnns / getImgIds). +class COCO_Annotation: + def __init__(self, annotation_file): + self.coco_cn_file = annotation_file + self.imgToAnns = self.build_imgToAnns() + + def build_imgToAnns(self): + imgToAnns = defaultdict(list) + with open(self.coco_cn_file, "r", encoding="UTF-8") as fin: + for line in fin: + line = line.strip() + temp = ast.literal_eval(line) # safer than eval(); each line is a dict literal + annotations = temp['annotations'] + for ann in annotations: + image_id = str(ann['image_id']).zfill(6) + imgToAnns[image_id].append({'image_id':image_id,'caption':ann['caption'],'image': ann['image_id']}) + return imgToAnns + + def getImgIds(self): + return self.imgToAnns.keys() + +class COCO_Result: + def __init__(self,result_file): + self.coco_cn_file = result_file + self.imgToAnns = self.build_imgToAnns() + + def build_imgToAnns(self): + imgToAnns = dict() + data = json.load(open(self.coco_cn_file, "r")) + for d in data: + tmp = { + 'image_id':d['question_id'][-6:], + 'caption':d['answer'] + } + imgToAnns[d['question_id'][-6:]] = [tmp] + return imgToAnns + +def coco_caption_eval(results_file, split_name): + files = { + "val":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json", + "test":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json" + } + + # create coco object and coco_result object + annotation_file = files[split_name] + coco = COCO_Annotation(annotation_file) + coco_result = COCO_Result(results_file) + + # create coco_eval object by taking coco and coco_result + coco_eval = COCOEvalCap(coco, coco_result) + + # To evaluate on a subset of images, uncomment the following line; + # leave it commented out when evaluating the full split. + # coco_eval.params['image_id'] = coco_result.getImgIds() + + # evaluate results + # SPICE will take a few minutes the first time, but speeds up due to caching + coco_eval.evaluate() + + # print output evaluation scores + for metric, score in coco_eval.eval.items(): + print(f"{metric}: {score:.3f}") + + return coco_eval + + +def main(): + result_file = "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/mix_coco_gqa_cap_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0302/20240302231/result/val_vqa_result_coco_cap.json" + split_name = "val" + coco_val = coco_caption_eval(result_file, split_name) + + agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"] + + # log_stats = {split_name: {k: v for k, v in coco_val.eval.items()}} + # with open( + # os.path.join(registry.get_path("output_dir"), "evaluate.txt"), "a" + # ) as f: + # f.write(json.dumps(log_stats) + "\n") + + coco_res = {k: v for k, v in coco_val.eval.items()} + coco_res["agg_metrics"] = agg_metrics + + print(coco_res) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/ad_1.png b/examples/ad_1.png deleted file mode 100644 index d0378e4..0000000 Binary files a/examples/ad_1.png and /dev/null differ diff --git a/examples/ad_2.png b/examples/ad_2.png deleted file mode 100644 index 674248b..0000000 Binary files a/examples/ad_2.png and /dev/null differ diff --git a/examples/cook_1.png b/examples/cook_1.png deleted file mode 100644 index d8cdb45..0000000 Binary files 
a/examples/cook_1.png and /dev/null differ diff --git a/examples/cook_2.png b/examples/cook_2.png deleted file mode 100644 index d08272b..0000000 Binary files a/examples/cook_2.png and /dev/null differ diff --git a/examples/describe_1.png b/examples/describe_1.png deleted file mode 100644 index 02f3c92..0000000 Binary files a/examples/describe_1.png and /dev/null differ diff --git a/examples/describe_2.png b/examples/describe_2.png deleted file mode 100644 index 20bf8c7..0000000 Binary files a/examples/describe_2.png and /dev/null differ diff --git a/examples/fact_1.png b/examples/fact_1.png deleted file mode 100644 index 1f75228..0000000 Binary files a/examples/fact_1.png and /dev/null differ diff --git a/examples/fact_2.png b/examples/fact_2.png deleted file mode 100644 index de6ef53..0000000 Binary files a/examples/fact_2.png and /dev/null differ diff --git a/examples/fix_1.png b/examples/fix_1.png deleted file mode 100644 index 023cfe6..0000000 Binary files a/examples/fix_1.png and /dev/null differ diff --git a/examples/fix_2.png b/examples/fix_2.png deleted file mode 100644 index f60da5f..0000000 Binary files a/examples/fix_2.png and /dev/null differ diff --git a/examples/fun_1.png b/examples/fun_1.png deleted file mode 100644 index f720ea6..0000000 Binary files a/examples/fun_1.png and /dev/null differ diff --git a/examples/fun_2.png b/examples/fun_2.png deleted file mode 100644 index 1d37a80..0000000 Binary files a/examples/fun_2.png and /dev/null differ diff --git a/examples/logo_1.png b/examples/logo_1.png deleted file mode 100644 index 8bbe438..0000000 Binary files a/examples/logo_1.png and /dev/null differ diff --git a/examples/op_1.png b/examples/op_1.png deleted file mode 100644 index 3dbb2ff..0000000 Binary files a/examples/op_1.png and /dev/null differ diff --git a/examples/op_2.png b/examples/op_2.png deleted file mode 100644 index 2cd3e1f..0000000 Binary files a/examples/op_2.png and /dev/null differ diff --git a/examples/people_1.png b/examples/people_1.png deleted file mode 100644 index 7e95c42..0000000 Binary files a/examples/people_1.png and /dev/null differ diff --git a/examples/people_2.png b/examples/people_2.png deleted file mode 100644 index aec6c83..0000000 Binary files a/examples/people_2.png and /dev/null differ diff --git a/examples/rhyme_1.png b/examples/rhyme_1.png deleted file mode 100644 index 7d13387..0000000 Binary files a/examples/rhyme_1.png and /dev/null differ diff --git a/examples/rhyme_2.png b/examples/rhyme_2.png deleted file mode 100644 index 6cf9bf8..0000000 Binary files a/examples/rhyme_2.png and /dev/null differ diff --git a/examples/story_1.png b/examples/story_1.png deleted file mode 100644 index 3eb6ccb..0000000 Binary files a/examples/story_1.png and /dev/null differ diff --git a/examples/story_2.png b/examples/story_2.png deleted file mode 100644 index 9d37142..0000000 Binary files a/examples/story_2.png and /dev/null differ diff --git a/examples/web_1.png b/examples/web_1.png deleted file mode 100644 index 8943842..0000000 Binary files a/examples/web_1.png and /dev/null differ diff --git a/examples/wop_1.png b/examples/wop_1.png deleted file mode 100644 index 88f37d6..0000000 Binary files a/examples/wop_1.png and /dev/null differ diff --git a/examples/wop_2.png b/examples/wop_2.png deleted file mode 100644 index 8255974..0000000 Binary files a/examples/wop_2.png and /dev/null differ diff --git a/examples_v2/2000x1372_wmkn_0012149409555.jpg b/examples_v2/2000x1372_wmkn_0012149409555.jpg deleted file mode 100755 index 1250f7f..0000000 
Binary files a/examples_v2/2000x1372_wmkn_0012149409555.jpg and /dev/null differ diff --git a/examples_v2/KFC-20-for-20-Nuggets.jpg b/examples_v2/KFC-20-for-20-Nuggets.jpg deleted file mode 100755 index 0ec641c..0000000 Binary files a/examples_v2/KFC-20-for-20-Nuggets.jpg and /dev/null differ diff --git a/examples_v2/cockdial.png b/examples_v2/cockdial.png deleted file mode 100755 index 935f98e..0000000 Binary files a/examples_v2/cockdial.png and /dev/null differ diff --git a/examples_v2/float.png b/examples_v2/float.png deleted file mode 100755 index 900dcb0..0000000 Binary files a/examples_v2/float.png and /dev/null differ diff --git a/examples_v2/glip_test.jpg b/examples_v2/glip_test.jpg deleted file mode 100755 index f9198f2..0000000 Binary files a/examples_v2/glip_test.jpg and /dev/null differ diff --git a/examples_v2/office.jpg b/examples_v2/office.jpg deleted file mode 100755 index e35bdc2..0000000 Binary files a/examples_v2/office.jpg and /dev/null differ diff --git a/examples_v2/sofa.jpg b/examples_v2/sofa.jpg deleted file mode 100755 index 8610591..0000000 Binary files a/examples_v2/sofa.jpg and /dev/null differ diff --git a/examples_v2/thief.png b/examples_v2/thief.png deleted file mode 100755 index 579ee52..0000000 Binary files a/examples_v2/thief.png and /dev/null differ diff --git a/minigpt4/configs/datasets/aokvqa/defaults.yaml b/minigpt4/configs/datasets/aokvqa/defaults.yaml index 7bbd26b..bfbd821 100755 --- a/minigpt4/configs/datasets/aokvqa/defaults.yaml +++ b/minigpt4/configs/datasets/aokvqa/defaults.yaml @@ -16,11 +16,16 @@ datasets: - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json storage: - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_train.json - # val: - # url: - # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json - # storage: - # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json + val: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json + storage: + - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json + test: + url: + - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json + storage: + - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/AOKVQA/aokvqa_v1p0_val.json # test: # url: # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json diff --git a/minigpt4/configs/datasets/coco/caption.yaml b/minigpt4/configs/datasets/coco/caption.yaml index 8e96a13..8d62c89 100644 --- a/minigpt4/configs/datasets/coco/caption.yaml +++ b/minigpt4/configs/datasets/coco/caption.yaml @@ -17,14 +17,14 @@ datasets: # md5: aa31ac474cf6250ebb81d18348a07ed8 storage: - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_train.json - # val: - # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json - # storage: - # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json - # test: - # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json - # storage: - # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json + val: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json + storage: + - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json + test: + url: 
https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json + storage: + - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json images: storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO diff --git a/minigpt4/configs/datasets/coco/caption_eval.yaml b/minigpt4/configs/datasets/coco/caption_eval.yaml new file mode 100644 index 0000000..5a2a17f --- /dev/null +++ b/minigpt4/configs/datasets/coco/caption_eval.yaml @@ -0,0 +1,26 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +datasets: + coco_caption: # name of the dataset builder + # dataset_card: dataset_card/coco_caption.md + # data_dir: ${env.data_dir}/datasets + data_type: images # [images|videos|features] + + build_info: + # Be careful not to append minus sign (-) before split to avoid itemizing + annotations: + val: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json + storage: + - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json + test: + url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json + storage: + - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json + + images: + storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO + diff --git a/minigpt4/datasets/builders/image_text_pair_builder.py b/minigpt4/datasets/builders/image_text_pair_builder.py index 4f6db0c..5317354 100644 --- a/minigpt4/datasets/builders/image_text_pair_builder.py +++ b/minigpt4/datasets/builders/image_text_pair_builder.py @@ -14,7 +14,7 @@ from minigpt4.datasets.datasets.flickr import GroundedDetailDataset,CaptionToObj from minigpt4.datasets.datasets.vg_dataset import ReferVisualGenomeDataset from minigpt4.datasets.datasets.coco_dataset import ReferCOCODataset, InvReferCOCODataset from minigpt4.datasets.datasets.gqa_datasets import GQADataset, GQAEvalDataset -from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset +from minigpt4.datasets.datasets.aok_vqa_datasets import AOKVQADataset, AOKVQAEvalDataset from minigpt4.datasets.datasets.coco_vqa_datasets import COCOVQADataset, COCOVQAEvalDataset from minigpt4.datasets.datasets.ok_vqa_datasets import OKVQADataset, OKVQAEvalDataset from minigpt4.datasets.datasets.ocrvqa_dataset import OCRVQADataset @@ -384,7 +384,7 @@ class OKVQABuilder(COCOVQABuilder): @registry.register_builder("aok_vqa") class AOKVQABuilder(BaseDatasetBuilder): train_dataset_cls = AOKVQADataset - eval_dataset_cls = AOKVQADataset + eval_dataset_cls = AOKVQAEvalDataset DATASET_CONFIG_DICT = {"default": "configs/datasets/aokvqa/defaults.yaml"} @@ -584,6 +584,7 @@ class COCOCapBuilder(BaseDatasetBuilder): DATASET_CONFIG_DICT = { "default": "configs/datasets/coco/caption.yaml", + "coco_cap_eval": "configs/datasets/coco/caption_eval.yaml", } diff --git a/minigpt4/datasets/datasets/aok_vqa_datasets.py b/minigpt4/datasets/datasets/aok_vqa_datasets.py index 3768c93..d4b83e5 100755 --- a/minigpt4/datasets/datasets/aok_vqa_datasets.py +++ b/minigpt4/datasets/datasets/aok_vqa_datasets.py @@ -13,7 +13,7 @@ import torch from PIL import Image -from minigpt4.datasets.datasets.vqa_datasets import VQADataset #, VQAEvalDataset +from minigpt4.datasets.datasets.vqa_datasets import VQADataset, VQAEvalDataset class __DisplMixin: @@ -37,11 +37,11 @@ class AOKVQADataset(VQADataset, __DisplMixin): 
super().__init__(vis_processor, text_processor, vis_root, ann_paths) self.instruction_pool =[ - '{}', - 'Q: {} A: ', - 'Based on the image, respond to this question with a short answer: {}', - '{} A short answer to the question is ', - 'Question: {} Short answer:', + '{} Choose from {}.', + 'Q: {} Multi Choices: {} A: ', + 'Question: {} Multi Choices: {} Answer: ', + "{} Choose one from the following possible answers: {}. ", + '{} Choose from {}. The answer is', ] exist_annotation = [] @@ -63,25 +63,19 @@ class AOKVQADataset(VQADataset, __DisplMixin): image = self.vis_processor(image) question = self.text_processor(ann["question"]) - answer_key = "direct_answers" - - answer_weight = {} - for answer in ann[answer_key]: - if answer in answer_weight.keys(): - answer_weight[answer] += 1 / len(ann[answer_key]) - else: - answer_weight[answer] = 1 / len(ann[answer_key]) - - answers = list(answer_weight.keys()) - weights = list(answer_weight.values()) - - answer = random.choices(answers, weights=weights, k=1)[0] # random sample an answer according to weights + answer_lst = ann["choices"] + direct_answers = ann["direct_answers"] + final_answer = random.choices(direct_answers, k=1)[0] + for answer in answer_lst: + if answer in direct_answers: + final_answer = answer return { "image": image, "image_id": ann["image"], "question": question, - "answer": answer, + "answer": final_answer, + "choices": ", ".join(answer_lst) } def __getitem__(self, index): @@ -90,7 +84,7 @@ class AOKVQADataset(VQADataset, __DisplMixin): answer = self.text_processor(data['answer']) q_input = question - llm_input = random.choice(self.instruction_pool).format(question) + llm_input = random.choice(self.instruction_pool).format(question, data["choices"]) return { "image": data['image'], @@ -104,25 +98,103 @@ class AOKVQADataset(VQADataset, __DisplMixin): } -class AOKVQGDataset(AOKVQADataset): - +class AOKVQAEvalDataset(VQAEvalDataset, __DisplMixin): def __init__(self, vis_processor, text_processor, vis_root, ann_paths): - super().__init__(vis_processor, text_processor, vis_root, ann_paths) - self.instruction_pool = [ - 'Given the image, generate a question whose answer is: {}', - 'Based on the image, provide a question with the answer: {}', - 'Given the visual representation, create a question for which the answer is "{}"', - 'From the image provided, craft a question that leads to the reply: {}', - 'Considering the picture, come up with a question where the answer is: {}', - 'Taking the image into account, generate an question that has the answer: {}' - ] + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ - def __getitem__(self, index): - data = self.get_data(index) - instruction = random.choice(self.instruction_pool).format(data['answer']) + self.vis_root = vis_root + + self.annotation = json.load(open(ann_paths[0])) + + self.instruction_pool =[ + '{} Choose from {}.', + 'Q: {} Multi Choices: {} A: ', + 'Question: {} Multi Choices: {} Answer: ', + "{} Choose one from the following possible answers: {}. ", + '{} Choose from {}. 
The answer is', + ] + + try: + self.coco_fmt_qust_file = ann_paths[2] + self.coco_fmt_anno_file = ann_paths[3] + except IndexError: + self.coco_fmt_qust_file = None + self.coco_fmt_anno_file = None + + self.vis_processor = vis_processor + self.text_processor = text_processor + self.source = 'aokvqa' + + def collater(self, samples): + ( + image_list, + question_list, + question_id_list, + choices_list, + correct_choice_idx_list, + direct_answers_list, + llm_input_list, + q_input_list, + source_list, + ) = ([], [], [], [], [], [], [], [], []) + + for sample in samples: + image_list.append(sample["image"]) + question_list.append(sample["text_input"]) + question_id_list.append(sample["question_id"]) + choices_list.append(sample["choices"]) + correct_choice_idx_list.append(sample["correct_choice_idx"]) + direct_answers_list.append(sample["direct_answers"]) + llm_input_list.append(sample["llm_input"]) + q_input_list.append(sample["q_input"]) + source_list.append(sample["source"]) return { - "image": data['image'], - "instruction_input": instruction, - "answer": data['question'], + "image": torch.stack(image_list, dim=0), + "text_input": question_list, + "question_id": question_id_list, + "choices": choices_list, + "correct_choice_idx": correct_choice_idx_list, + "direct_answers": direct_answers_list, + "llm_input": llm_input_list, + "q_input": q_input_list, + "source": source_list, } + + def __getitem__(self, index): + ann = self.annotation[index] + + image_path = os.path.join(self.vis_root, ann["image"]) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + question = self.text_processor(ann["question"]) + + choices = ann["choices"] + if "correct_choice_idx" in ann: + correct_choice_idx = ann["correct_choice_idx"] + else: + correct_choice_idx = None + + if "direct_answers" in ann: + direct_answers = ann["direct_answers"] + else: + direct_answers = None + + llm_input = random.choice(self.instruction_pool).format(question, ", ".join(choices)) + + return { + "image": image, + "q_input": question, + "llm_input": llm_input, + "text_input": question, + "question_id": ann["question_id"], + "choices": choices, + "correct_choice_idx": correct_choice_idx, + "direct_answers": direct_answers, + "source": 'aokvqa', + } + \ No newline at end of file diff --git a/minigpt4/datasets/datasets/caption_datasets.py b/minigpt4/datasets/datasets/caption_datasets.py index 6b74cb5..9354adc 100644 --- a/minigpt4/datasets/datasets/caption_datasets.py +++ b/minigpt4/datasets/datasets/caption_datasets.py @@ -59,83 +59,7 @@ class CaptionDataset(BaseDataset, __DisplMixin): "text_input": caption, "image_id": self.img_ids[ann["image_id"]], } - - - -class COCOCaptionDataset(BaseDataset, __DisplMixin): - def __init__(self, vis_processor, text_processor, vis_root, ann_paths): - """ - vis_root (string): Root directory of images (e.g. 
coco/images/) - ann_root (string): directory to store the annotation file - """ - super().__init__(vis_processor, text_processor, vis_root, ann_paths) - - self.img_ids = {} - n = 0 - - self.filter_anntation = [] - - for ann in self.annotation: - if "train" in ann["image"]: - self.filter_anntation.append(ann) - self.annotation = self.filter_anntation - - for ann in self.annotation: - img_id = ann["image_id"] - if img_id not in self.img_ids.keys(): - self.img_ids[img_id] = n - n += 1 - - self.instruction_pool = [ - 'Briefly describe this image.', - 'Provide a concise depiction of this image.', - 'Present a short description of this image.', - 'Summarize this image in a few words.', - 'A short image caption:', - 'A short image description:', - 'A photo of ', - 'An image that shows ', - 'Write a short description for the image. ', - 'Write a description for the photo.', - 'Provide a description of what is presented in the photo.', - 'Briefly describe the content of the image.', - 'Can you briefly explain what you see in the image?', - 'Could you use a few words to describe what you perceive in the photo?', - 'Please provide a short depiction of the picture.', - 'Using language, provide a short account of the image.', - 'Use a few words to illustrate what is happening in the picture.', - ] - self.source = 'coco_cap' - - def __getitem__(self, index): - - # TODO this assumes image input, not general enough - ann = self.annotation[index] - - # img_file = ann["image"].split("/")[-1] - img_file = ann["image"] - image_path = os.path.join(self.vis_root, img_file) - image = Image.open(image_path).convert("RGB") - - image = self.vis_processor(image) - caption = self.text_processor(ann["caption"]) - - # instruction = random.choice(self.instruction_pool) - # instruction = " [caption] {} ".format(instruction) - q_input = "" - llm_input = random.choice(self.instruction_pool) - - return { - "image": image, - "image_id": ann["image"], - "answer": caption, - "q_input": q_input, - "llm_input": llm_input, - "text_input": llm_input, - "text_output": caption, - "source": 'coco_cap', - } - + class CaptionEvalDataset(BaseDataset, __DisplMixin): def __init__(self, vis_processor, text_processor, vis_root, ann_paths): """ @@ -151,7 +75,7 @@ class CaptionEvalDataset(BaseDataset, __DisplMixin): image_path = os.path.join(self.vis_root, ann["image"]) image = Image.open(image_path).convert("RGB") - + image = self.vis_processor(image) return { @@ -159,3 +83,4 @@ class CaptionEvalDataset(BaseDataset, __DisplMixin): "image_id": ann["image_id"], "instance_id": ann["instance_id"], } + diff --git a/minigpt4/datasets/datasets/coco_caption.py b/minigpt4/datasets/datasets/coco_caption.py index e388956..a33dda4 100755 --- a/minigpt4/datasets/datasets/coco_caption.py +++ b/minigpt4/datasets/datasets/coco_caption.py @@ -9,18 +9,102 @@ import os import json import torch import numpy as np +import random from PIL import Image from PIL import ImageFile +from collections import OrderedDict ImageFile.LOAD_TRUNCATED_IMAGES = True -from minigpt4.datasets.datasets.caption_datasets import COCOCaptionDataset, CaptionEvalDataset +from minigpt4.datasets.datasets.base_dataset import BaseDataset +from minigpt4.datasets.datasets.caption_datasets import CaptionDataset, CaptionEvalDataset -COCOCapDataset = COCOCaptionDataset +class __DisplMixin: + def displ_item(self, index): + sample, ann = self.__getitem__(index), self.annotation[index] + return OrderedDict( + { + "file": ann["image"], + "caption": ann["caption"], + "image": sample["image"], + } + ) 
+ +class COCOCapDataset(BaseDataset, __DisplMixin): + def __init__(self, vis_processor, text_processor, vis_root, ann_paths): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + super().__init__(vis_processor, text_processor, vis_root, ann_paths) + self.img_ids = {} + n = 0 + self.filter_anntation = [] + + for ann in self.annotation: + if "train" in ann["image"]: + self.filter_anntation.append(ann) + self.annotation = self.filter_anntation + + for ann in self.annotation: + img_id = ann["image_id"] + if img_id not in self.img_ids.keys(): + self.img_ids[img_id] = n + n += 1 + + self.instruction_pool = [ + 'Briefly describe this image.', + 'Provide a concise depiction of this image.', + 'Present a short description of this image.', + 'Summarize this image in a few words.', + 'A short image caption:', + 'A short image description:', + 'A photo of ', + 'An image that shows ', + 'Write a short description for the image. ', + 'Write a description for the photo.', + 'Provide a description of what is presented in the photo.', + 'Briefly describe the content of the image.', + 'Can you briefly explain what you see in the image?', + 'Could you use a few words to describe what you perceive in the photo?', + 'Please provide a short depiction of the picture.', + 'Using language, provide a short account of the image.', + 'Use a few words to illustrate what is happening in the picture.', + ] + self.source = 'coco_cap' + + def __getitem__(self, index): + + # TODO this assumes image input, not general enough + ann = self.annotation[index] + + # img_file = ann["image"].split("/")[-1] + img_file = ann["image"] + image_path = os.path.join(self.vis_root, img_file) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + caption = self.text_processor(ann["caption"]) + + instruction = random.choice(self.instruction_pool) + # q_input = "" + q_input = instruction + llm_input = instruction + + return { + "image": image, + "image_id": ann["image"], + "answer": caption, + "q_input": q_input, + "llm_input": llm_input, + "text_input": llm_input, + "text_output": caption, + "source": 'coco_cap', + } class COCOCapEvalDataset(CaptionEvalDataset): @@ -31,6 +115,26 @@ class COCOCapEvalDataset(CaptionEvalDataset): split (string): val or test """ super().__init__(vis_processor, text_processor, vis_root, ann_paths) + + self.instruction_pool = [ + 'Briefly describe this image.', + 'Provide a concise depiction of this image.', + 'Present a short description of this image.', + 'Summarize this image in a few words.', + 'A short image caption:', + 'A short image description:', + 'A photo of ', + 'An image that shows ', + 'Write a short description for the image. 
', + 'Write a description for the photo.', + 'Provide a description of what is presented in the photo.', + 'Briefly describe the content of the image.', + 'Can you briefly explain what you see in the image?', + 'Could you use a few words to describe what you perceive in the photo?', + 'Please provide a short depiction of the picture.', + 'Using language, provide a short account of the image.', + 'Use a few words to illustrate what is happening in the picture.', + ] self.source = 'coco_cap' def __getitem__(self, index): @@ -38,15 +142,25 @@ class COCOCapEvalDataset(CaptionEvalDataset): image_path = os.path.join(self.vis_root, ann["image"]) image = Image.open(image_path).convert("RGB") - - image = self.vis_processor(image) + try: + image = self.vis_processor(image) + except Exception as e: + print(e) + print(image_path) img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1] + instruction = random.choice(self.instruction_pool) + # q_input = "" + q_input = instruction + llm_input = instruction return { "image": image, "image_id": img_id, - "instance_id": ann["instance_id"], + "text_input":llm_input, + "q_input": q_input, + "llm_input": llm_input, + "source": self.source, } diff --git a/minigpt4/datasets/datasets/ok_vqa_datasets.py b/minigpt4/datasets/datasets/ok_vqa_datasets.py index 20b4494..c0bf799 100755 --- a/minigpt4/datasets/datasets/ok_vqa_datasets.py +++ b/minigpt4/datasets/datasets/ok_vqa_datasets.py @@ -149,7 +149,6 @@ class OKVQAEvalDataset(VQAEvalDataset, __DisplMixin): self.source = 'okvqa' self.annotation_add = self.get_data() - self._add_instance_ids() def get_data(self): ann_instruct = list() @@ -180,7 +179,6 @@ class OKVQAEvalDataset(VQAEvalDataset, __DisplMixin): "image_id": ann["image"], 'image_path': image_path, "question_id": ann["question_id"], - # "instance_id": ann["instance_id"], "question": question, "q_input": q_input, "llm_input": llm_input, diff --git a/minigpt4/models/QformerMoE.py b/minigpt4/models/QformerMoE.py index 5cc8c1f..addacc5 100644 --- a/minigpt4/models/QformerMoE.py +++ b/minigpt4/models/QformerMoE.py @@ -45,7 +45,6 @@ from transformers.utils import logging from transformers.models.bert.configuration_bert import BertConfig from minigpt4.models.moe.utils import ( - FeedForward, MoEModelOutput, MoEModelOutputWithPooling, use_experts, diff --git a/minigpt4/models/QformerMoELN.py b/minigpt4/models/QformerMoELN.py new file mode 100644 index 0000000..9ef1f6b --- /dev/null +++ b/minigpt4/models/QformerMoELN.py @@ -0,0 +1,1276 @@ +""" + * Copyright (c) 2023, salesforce.com, inc. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause + * By Junnan Li + * Based on huggingface code base + * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert +""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Dict, Any + +import torch +from torch import Tensor, device, dtype, nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss +import torch.nn.functional as F + +from transformers.activations import ACT2FN +from transformers.file_utils import ( + ModelOutput, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from transformers.utils import logging +from transformers.models.bert.configuration_bert import BertConfig + +from minigpt4.models.moe.utils import ( + MoEModelOutput, + MoEModelOutputWithPooling, + use_experts, +) +from minigpt4.models.moe.moe_layer import MoELayer + +logging.set_verbosity_error() # ignore warning : Some weights of BertLMHeadModel were not initialized from the model checkpoint... +logger = logging.get_logger(__name__) + +# from visualizer import get_local + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id + ) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size + ) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + ) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + seq_length = input_ids.size()[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[ + :, past_key_values_length : seq_length + past_key_values_length + ].clone() + + if input_ids is not None: + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + + if query_embeds is not None: + embeddings = torch.cat((query_embeds, embeddings), dim=1) + else: + embeddings = query_embeds + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, is_cross_attention): + 
super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, "embedding_size" + ): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads # 12 + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) # 64 + self.all_head_size = self.num_attention_heads * self.attention_head_size # 768 + + self.query = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768) + if is_cross_attention: + self.key = nn.Linear(config.encoder_width, self.all_head_size) # nn.Linear(1408, 768) + self.value = nn.Linear(config.encoder_width, self.all_head_size) # nn.Linear(1408, 768) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768) + self.value = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size + ) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.attention_head_size, + ) # torch.Size([1, 257, 12, 64]) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) # encoder_hidden_states:[bz,257,1408], torch.Size([1, 12, 257, 64]) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) # torch.Size([1, 12, 257, 64]) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) # torch.Size([1, 12, 41, 64]) + + past_key_value = (key_layer, value_layer) # torch.Size([1, 12, 41, 257]) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1 + ) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype + ) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + relative_position_scores_key = torch.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding + ) + attention_scores = ( + attention_scores + + relative_position_scores_query + + relative_position_scores_key + ) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + # extended_attention_mask + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) # torch.Size([1, 12, 41, 257]) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) # torch.Size([1, 12, 41, 64]) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) # torch.Size([1, 41, 768]) + + outputs = ( + (context_layer, attention_probs) if output_attentions else (context_layer,) + ) + + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.self = BertSelfAttention(config, is_cross_attention) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, + self.self.num_attention_heads, + self.self.attention_head_size, + self.pruned_heads, + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = ( + self.self.attention_head_size * self.self.num_attention_heads + ) + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[ + 1: + ] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): # Add & Norm + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = 
nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class FeedForward(nn.Module): + # Add LayerNorm + def __init__(self, config): + super().__init__() + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states: Tensor): + intermediate_output = self.intermediate(hidden_states) + layer_output = self.output(intermediate_output, hidden_states) + return layer_output + + +class BertLayer(nn.Module): + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.layer_num = layer_num + if ( + self.config.add_cross_attention + and layer_num % self.config.cross_attention_freq == 0 + ): + self.crossattention = BertAttention( + config, is_cross_attention=self.config.add_cross_attention + ) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + self.intermediate_query = BertIntermediate(config) + self.output_query = BertOutput(config) + + # Add MoE FFN + self.use_experts = use_experts(layer_num) + ffn = FeedForward(config) + if self.use_experts: + self.experts = MoELayer( + hidden_size=config.hidden_size, + expert=ffn, + num_experts=config.moebert_expert_num, + route_method=config.moebert_route_method, + topk=config.moe_topk, + use_balance_loss=config.use_balance_loss, + weight_type=config.moe_weight_type, + ) + else: + self.experts = ffn + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = ( + past_key_value[:2] if past_key_value is not None else None + ) + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + assert ( + encoder_hidden_states is not None + ), "encoder_hidden_states must be given for cross-attention layers" + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + outputs = ( + outputs + cross_attention_outputs[1:-1] + ) # add cross attentions if we output attention weights + + # add moe query ffn + # query_attention_output size: [bz, query_length+seq_len, 768] + # attention_mask size: [bz, 1, 1, query_length+seq_len] + moe_ffn_attention_input = query_attention_output[:, :query_length, :] + moe_ffn_attention_mask = attention_mask.squeeze(dim=1).squeeze(dim=1)[:, :query_length] + layer_output = self.feed_forward_query_moe(moe_ffn_attention_input, moe_ffn_attention_mask) # layer_output, gate_loss, 
gate_load + # import pdb; pdb.set_trace() # test0107 + + if attention_output.shape[1] > query_length: # have text input in Qformer + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = (torch.cat([layer_output[0], layer_output_text], dim=1), layer_output[1], layer_output[2]) + + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + layer_output = (layer_output, 0.0, []) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_query_moe(self, attention_output, expert_attention_mask): + if not self.use_experts: + hidden_states = self.experts(attention_output) + return hidden_states, 0.0, [] + + hidden_states, gate_loss, gate_load = self.experts( + attention_output, expert_attention_mask + ) + return hidden_states, gate_loss, gate_load + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config, i) for i in range(config.num_hidden_layers)] + ) + + # @get_local('all_cross_attentions') + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + () if output_attentions and self.config.add_cross_attention else None + ) + + next_decoder_cache = () if use_cache else None + gate_loss = 0.0 + gate_loads = list() + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module( + *inputs, past_key_value, output_attentions, query_length + ) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, #torch.Size([bz, 32+input_len, 768]) + attention_mask, # torch.Size([bz, 1, 1, 32+input_len]) + layer_head_mask, # None + encoder_hidden_states, # torch.Size([bz, 257, 1408]) + encoder_attention_mask, + past_key_value, + output_attentions, # False + query_length, # 32 + ) + hidden_states = layer_outputs[0][0] + gate_loss = gate_loss + layer_outputs[0][1] + gate_loads.append(layer_outputs[0][2]) + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + + return MoEModelOutput( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + gate_loss=gate_loss, + gate_loads=gate_loads, + ) + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BertConfig + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BertModel(BertPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=False): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: Tensor, + input_shape: Tuple[int], + device: device, + is_decoder: bool, + has_query: bool = False, + ) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. 
+ """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) + <= seq_ids[None, :, None] + ) + + # add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + if has_query: # UniLM style attention mask + causal_mask = torch.cat( + [ + torch.zeros( + (batch_size, prefix_seq_len, seq_length), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=1, + ) + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, causal_mask.shape[1], prefix_seq_len), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + extended_attention_mask = ( + causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype + ) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # use_cache = use_cache if use_cache is not None else self.config.use_cache + + if input_ids is None: + assert ( + query_embeds is not None + ), "You have to specify query_embeds when input_ids is None" + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length + if past_key_values is not None + else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + query_embeds=query_embeds, + past_key_values_length=past_key_values_length, + ) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
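# --- Illustrative sketch (not part of this patch): shape bookkeeping for the
# Q-Former style input built above. The learned query tokens (`query_embeds`)
# are prepended to the text embeddings, and when a KV cache is present its
# length already includes the query tokens, which is presumably why
# `config.query_length` is subtracted. All sizes below are made-up examples.
import torch

batch, num_query, seq_text, hidden = 2, 32, 8, 768
query_embeds_demo = torch.zeros(batch, num_query, hidden)
text_embeds_demo = torch.zeros(batch, seq_text, hidden)

embedding_output_demo = torch.cat([query_embeds_demo, text_embeds_demo], dim=1)
assert embedding_output_demo.shape == (batch, num_query + seq_text, hidden)

# With a cache, past_key_values[0][0] is [batch, heads, num_query + generated, head_dim];
# subtracting the query length recovers how many text positions already exist.
cached_len = num_query + 5          # pretend 5 text tokens were generated so far
past_key_values_length_demo = cached_len - num_query
assert past_key_values_length_demo == 5
# --------------------------------------------------------------------------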
+ if is_decoder: + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, + input_ids.shape, + device, + is_decoder, + has_query=(query_embeds is not None), + ) + else: + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ + 0 + ].size() + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) for mask in encoder_attention_mask + ] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = ( + self.pooler(sequence_output) if self.pooler is not None else None + ) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return MoEModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + gate_loss=encoder_outputs.gate_loss, + gate_loads=encoder_outputs.gate_loads, + ) + + +class BertMoELMHeadModelLNIn(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.load_balance_alpha = config.moebert_load_balance + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + 
past_key_values=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + reduction="mean", + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`).
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if labels is not None: + use_cache = False + if past_key_values is not None: + query_embeds = None + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + gate_loss = outputs.gate_loss + + sequence_output = outputs[0] + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss, total_loss = None, None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1), + ) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + total_loss = lm_loss + gate_loss * self.load_balance_alpha + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=total_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs + ): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + query_mask = input_ids.new_ones(query_embeds.shape[:-1]) + attention_mask = torch.cat([query_mask, attention_mask], dim=-1) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "query_embeds": query_embeds, + "attention_mask": attention_mask, + "past_key_values": past, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx) for past_state in layer_past + ), + ) + return reordered_past + + +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected 
= [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=False, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), labels.view(-1) + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ( + ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + ) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/minigpt4/models/QformerRouteMoE.py b/minigpt4/models/QformerRouteMoE.py index 8595dc6..5cdd983 100644 --- a/minigpt4/models/QformerRouteMoE.py +++ b/minigpt4/models/QformerRouteMoE.py @@ -389,17 +389,23 @@ class BertOutput(nn.Module): # Add & Norm class FeedForward(nn.Module): + # remove LayerNorm def __init__(self, config): - nn.Module.__init__(self) - # first layer - self.intermediate_query = BertIntermediate(config) - # second layer - self.output_query = BertOutput(config) + super().__init__() + self.dense1 = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + self.dense2 = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) # adjust dropout ratio 0.1->0.2 + # self.dropout = nn.Dropout(0.2) # adjust dropout ratio 0.1->0.2 def forward(self, hidden_states: Tensor): - input_tensor = hidden_states - intermediate_output = self.intermediate_query(hidden_states) - hidden_states = self.output_query(intermediate_output, 
input_tensor) + hidden_states = self.dense1(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dense2(hidden_states) + hidden_states = self.dropout(hidden_states) return hidden_states @@ -433,7 +439,6 @@ class BertLayer(nn.Module): self.layer_judge = moe_layer_judge(layer_num) self.num_beams = config.moebert_num_beams ffn = FeedForward(config) - if self.use_experts: self.experts = RouteMoELayer( hidden_size=config.hidden_size, @@ -446,8 +451,7 @@ class BertLayer(nn.Module): ) else: self.experts = ffn - - # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.expert_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward( self, @@ -538,7 +542,7 @@ class BertLayer(nn.Module): if self.layer_judge == 'first' and self.num_beams>1: # if layer_output[0].shape[0] == layer_output_text.shape[0]*self.num_beams and self.num_beams>1: # adjust the dimension of layer_output_text to bz*num_beams - layer_output_text = self.adjust_layer_output_text(layer_output_text) + layer_output_text = self.adjust_hidden_states_by_num_beams(layer_output_text) if self.layer_judge == 'mid' and self.num_beams > 1: # layer_output_text [bz*num_beams, len, hidden_size] @@ -575,11 +579,11 @@ class BertLayer(nn.Module): attention_mask = tmp.contiguous().view(batch_size* self.num_beams, 1, 1, attention_mask.shape[3]) # torch.Size([bz*num_beams, 1, 1, 32+input_len]) return attention_mask - def adjust_layer_output_text(self, layer_output_text): - batch_size, text_length, hidden_size = layer_output_text.shape - tmp_text = layer_output_text.unsqueeze(1).expand(batch_size, self.num_beams, text_length, hidden_size) - layer_output_text = tmp_text.contiguous().view(-1, text_length, hidden_size) # [bz*num_beams, text_length ,768] - return layer_output_text + def adjust_hidden_states_by_num_beams(self, hidden_states): + batch_size, text_length, hidden_size = hidden_states.shape + tmp_text = hidden_states.unsqueeze(1).expand(batch_size, self.num_beams, text_length, hidden_size) + hidden_states = tmp_text.contiguous().view(-1, text_length, hidden_size) # [bz*num_beams, text_length ,768] + return hidden_states def route_moe_last_layer_top1(self, layer_output, layer_output_text): batch_size = layer_output[0].shape[0] @@ -602,20 +606,21 @@ class BertLayer(nn.Module): def feed_forward_chunk(self, attention_output): intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) - # layer_output = self.LayerNorm(layer_output + attention_output) return layer_output def feed_forward_query_moe(self, attention_output, expert_attention_mask, beam_scores, expert_route): if not self.use_experts: - layer_output = self.experts(attention_output) - # layer_output = self.LayerNorm(layer_output + attention_output) + hidden_states = self.experts(attention_output) + layer_output = self.expert_ln(hidden_states + attention_output) return layer_output, None, None, None, 0.0 - layer_output, beam_scores, expert_route, beam_idx, importance_loss = self.experts( + hidden_states, beam_scores, expert_route, beam_idx, importance_loss = self.experts( attention_output, expert_attention_mask, beam_scores, expert_route ) + if hidden_states.shape[0]==attention_output.shape[0]*self.num_beams and self.num_beams>1: + attention_output = self.adjust_hidden_states_by_num_beams(attention_output) + layer_output = self.expert_ln(hidden_states + attention_output) - # layer_output = self.LayerNorm(layer_output + attention_output) 
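# --- Illustrative sketch (not part of this patch): the new `expert_ln` applies
# the Add&Norm *after* the routed expert FFN. Because RouteMoELayer can return
# num_beams candidates per sample, the pre-FFN activations are repeated
# num_beams times so the residual lines up with the beam-expanded expert
# output. Toy re-implementation with made-up sizes:
import torch
import torch.nn as nn

bz, num_beams, qlen, hidden = 2, 3, 32, 768
attention_output = torch.randn(bz, qlen, hidden)
expert_out = torch.randn(bz * num_beams, qlen, hidden)   # stand-in for the MoE output
expert_ln = nn.LayerNorm(hidden, eps=1e-12)

residual = (
    attention_output.unsqueeze(1)
    .expand(bz, num_beams, qlen, hidden)
    .reshape(bz * num_beams, qlen, hidden)
)
layer_output = expert_ln(expert_out + residual)
assert layer_output.shape == (bz * num_beams, qlen, hidden)
# --------------------------------------------------------------------------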
return layer_output, beam_scores, expert_route, beam_idx, importance_loss class BertEncoder(nn.Module): @@ -722,7 +727,7 @@ class BertEncoder(nn.Module): ] if v is not None ) - + return MoEModelOutput( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, diff --git a/minigpt4/models/QformerRouteMoELN.py b/minigpt4/models/QformerRouteMoELN.py new file mode 100644 index 0000000..1f1f289 --- /dev/null +++ b/minigpt4/models/QformerRouteMoELN.py @@ -0,0 +1,1367 @@ +""" + * Copyright (c) 2023, salesforce.com, inc. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause + * By Junnan Li + * Based on huggingface code base + * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert +""" + +import math +import os +import warnings +import copy +from dataclasses import dataclass +from typing import Optional, Tuple, Dict, Any + +import torch +from torch import Tensor, device, dtype, nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss +import torch.nn.functional as F + +from transformers.activations import ACT2FN +from transformers.file_utils import ( + ModelOutput, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from transformers.utils import logging +from transformers.models.bert.configuration_bert import BertConfig + +from minigpt4.models.moe.utils import ( + MoEModelOutput, + MoEModelOutputWithPooling, + use_experts_route, + moe_layer_judge, +) +from minigpt4.models.moe.route_moe_layer import RouteMoELayer + +logging.set_verbosity_error() # ignore warning : Some weights of BertLMHeadModel were not initialized from the model checkpoint... 
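# --- Illustrative sketch (not part of this patch): `MoEModelOutput` and
# `MoEModelOutputWithPooling` come from minigpt4.models.moe.utils and are not
# shown in this diff. Judging from how MoEModelOutput is constructed at the end
# of BertEncoder.forward below, it is presumably a ModelOutput-style dataclass
# roughly like this hypothetical stand-in:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput

@dataclass
class MoEModelOutputSketch(ModelOutput):   # hypothetical, for illustration only
    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    beam_scores: Optional[torch.FloatTensor] = None
    expert_route: Optional[torch.LongTensor] = None
    gate_loss: Optional[torch.FloatTensor] = None
# --------------------------------------------------------------------------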
+logger = logging.get_logger(__name__) + +# from visualizer import get_local + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id + ) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size + ) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + ) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + seq_length = input_ids.size()[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[ + :, past_key_values_length : seq_length + past_key_values_length + ].clone() + + if input_ids is not None: + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + + if query_embeds is not None: + embeddings = torch.cat((query_embeds, embeddings), dim=1) + else: + embeddings = query_embeds + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, is_cross_attention): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, "embedding_size" + ): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads # 12 + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) # 64 + self.all_head_size = self.num_attention_heads * self.attention_head_size # 768 + + self.query = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768) + if is_cross_attention: + self.key = nn.Linear(config.encoder_width, self.all_head_size) # nn.Linear(1408, 768) + self.value = nn.Linear(config.encoder_width, self.all_head_size) # nn.Linear(1408, 768) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768) + self.value = nn.Linear(config.hidden_size, self.all_head_size) # nn.Linear(768, 768) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size + ) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def 
get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.attention_head_size, + ) # torch.Size([1, 257, 12, 64]) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) # encoder_hidden_states:[bz,257,1408], torch.Size([1, 12, 257, 64]) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) # torch.Size([1, 12, 257, 64]) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) # torch.Size([1, 12, 41, 64]) + + past_key_value = (key_layer, value_layer) # torch.Size([1, 12, 41, 257]) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
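# --- Illustrative sketch (not part of this patch): how the decoder KV cache
# grows in the past_key_value branch above. New key/value heads are appended on
# the sequence dimension (dim=2), so the cache goes from
# [bz, heads, cached_len, head_dim] to [bz, heads, cached_len + new_len, head_dim].
# Sizes are made up:
import torch

bz, heads, head_dim = 2, 12, 64
past_k = torch.randn(bz, heads, 37, head_dim)   # e.g. 32 query tokens + 5 generated
new_k = torch.randn(bz, heads, 1, head_dim)     # one freshly generated position
k = torch.cat([past_k, new_k], dim=2)
assert k.shape == (bz, heads, 38, head_dim)
# --------------------------------------------------------------------------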
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if ( + self.position_embedding_type == "relative_key" + or self.position_embedding_type == "relative_key_query" + ): + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, device=hidden_states.device + ).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1 + ) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype + ) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding + ) + relative_position_scores_key = torch.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding + ) + attention_scores = ( + attention_scores + + relative_position_scores_query + + relative_position_scores_key + ) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + # extended_attention_mask + + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
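# --- Illustrative sketch (not part of this patch): the additive mask built by
# get_extended_attention_mask is added to the scaled scores before the softmax,
# so positions carrying a -10000 bias collapse to ~0 probability. Tiny check:
import torch

scores = torch.zeros(1, 1, 1, 4)                                    # 4 key positions, equal logits
additive_mask = torch.tensor([[[[0.0, 0.0, -10000.0, -10000.0]]]])  # last two positions masked
probs = torch.softmax(scores / (64 ** 0.5) + additive_mask, dim=-1)
assert torch.allclose(probs[..., :2], torch.tensor([0.5, 0.5]), atol=1e-4)
assert probs[..., 2:].max() < 1e-4
# --------------------------------------------------------------------------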
+ attention_probs_dropped = self.dropout(attention_probs) # torch.Size([1, 12, 41, 257]) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) # torch.Size([1, 12, 41, 64]) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) # torch.Size([1, 41, 768]) + + outputs = ( + (context_layer, attention_probs) if output_attentions else (context_layer,) + ) + + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.self = BertSelfAttention(config, is_cross_attention) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, + self.self.num_attention_heads, + self.self.attention_head_size, + self.pruned_heads, + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = ( + self.self.attention_head_size * self.self.num_attention_heads + ) + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[ + 1: + ] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): # Add & Norm + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) # 1 + self.dropout = 
nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + # Move LayerNorm & ResNet out of FFN After MoEFFN + hidden_states = self.LayerNorm(hidden_states + input_tensor) # 1 + return hidden_states + + +class FeedForward(nn.Module): + def __init__(self, config): + nn.Module.__init__(self) + # first layer + self.intermediate_query = BertIntermediate(config) + # second layer + self.output_query = BertOutput(config) + + def forward(self, hidden_states: Tensor): + input_tensor = hidden_states + intermediate_output = self.intermediate_query(hidden_states) + hidden_states = self.output_query(intermediate_output, input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.layer_num = layer_num + if ( + self.config.add_cross_attention + and layer_num % self.config.cross_attention_freq == 0 + ): + self.crossattention = BertAttention( + config, is_cross_attention=self.config.add_cross_attention + ) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + self.intermediate_query = BertIntermediate(config) + self.output_query = BertOutput(config) + + # Add MoE FFN + self.use_experts = use_experts_route(layer_num) + self.layer_judge = moe_layer_judge(layer_num) + self.num_beams = config.moebert_num_beams + ffn = FeedForward(config) + + if self.use_experts: + self.experts = RouteMoELayer( + hidden_size=config.hidden_size, + expert=ffn, + num_experts=config.moebert_expert_num, + num_beams=config.moebert_num_beams, + layer_judge = self.layer_judge, + route_method=config.route_method, + weight_type=config.moe_weight_type, + ) + else: + self.experts = ffn + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + beam_scores=None, + expert_route=None, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = ( + past_key_value[:2] if past_key_value is not None else None + ) + # import pdb; pdb.set_trace() # 0107test + + # adjust the dimension of hidden_states, attention_mask, encoder_attention_mask and encoder_hidden_states to be the same + if self.num_beams > 1: + if hidden_states.shape[0]== attention_mask.shape[0]*self.num_beams: + # attention_mask dimension to be bz*num_beams + attention_mask = self.adjust_attention_mask(attention_mask) + encoder_attention_mask = self.adjust_attention_mask(encoder_attention_mask) + + if hidden_states.shape[0]*self.num_beams == attention_mask.shape[0]: + # attention_mask dimension back to bz + batch_size = attention_mask.shape[0] + attention_mask = attention_mask[[ i for i in range(0, batch_size, self.num_beams)]] + + if hidden_states.shape[0] == encoder_hidden_states.shape[0]*self.num_beams: + batch_size, visual_tokens, vision_dim = encoder_hidden_states.shape + tmp = encoder_hidden_states.unsqueeze(1).expand(batch_size, self.num_beams, visual_tokens, vision_dim ) + encoder_hidden_states = tmp.contiguous().view(batch_size* self.num_beams, visual_tokens, vision_dim) # torch.Size([bz, 257, 1408]) + + 
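# --- Illustrative sketch (not part of this patch): the expansion pattern used
# above for encoder_hidden_states / attention_mask when routing with beams.
# A per-sample tensor is repeated num_beams times along a new dim and flattened
# back, so sample i occupies rows [i*num_beams, (i+1)*num_beams):
import torch

bz, num_beams, tokens, dim = 2, 3, 257, 1408
enc = torch.randn(bz, tokens, dim)
enc_beams = (
    enc.unsqueeze(1)
    .expand(bz, num_beams, tokens, dim)
    .contiguous()
    .view(bz * num_beams, tokens, dim)
)
assert torch.equal(enc_beams[0], enc[0]) and torch.equal(enc_beams[1], enc[0])
# --------------------------------------------------------------------------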
self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + + assert ( + encoder_hidden_states is not None + ), "encoder_hidden_states must be given for cross-attention layers" + + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + outputs = ( + outputs + cross_attention_outputs[1:-1] + ) # add cross attentions if we output attention weights + + # add moe query ffn + # query_attention_output size: [bz, query_length+seq_len, 768] + # attention_mask size: [bz, 1, 1, query_length+seq_len] + moe_ffn_attention_input = query_attention_output[:, :query_length, :] + moe_ffn_attention_mask = attention_mask.squeeze(dim=1).squeeze(dim=1)[:, :query_length] + layer_output = self.feed_forward_query_moe(moe_ffn_attention_input, moe_ffn_attention_mask, beam_scores, expert_route) + # layer_output = (layer_output, beam_scores, expert_route, beam_idx, importance_loss) + # import pdb; pdb.set_trace() # 0107test + + if attention_output.shape[1] > query_length: # have text input in Qformer + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + if self.layer_judge == 'first' and self.num_beams>1: + # if layer_output[0].shape[0] == layer_output_text.shape[0]*self.num_beams and self.num_beams>1: + # adjust the dimension of layer_output_text to bz*num_beams + layer_output_text = self.adjust_layer_output_text(layer_output_text) + + if self.layer_judge == 'mid' and self.num_beams > 1: + # layer_output_text [bz*num_beams, len, hidden_size] + beam_idx = layer_output[3] + layer_output_text = layer_output_text[beam_idx] + + if self.layer_judge == 'last' and self.num_beams>1: + # select top1 for each sample among beams + # layer_output = (hidden_states, beam_scores, expert_route) + # layer_output & layer_output_text dimen_0 from bz*num_beams to bz + layer_output, layer_output_text = self.route_moe_last_layer_top1(layer_output, layer_output_text) + + layer_output = (torch.cat([layer_output[0], layer_output_text], dim=1), layer_output[1], layer_output[2], layer_output[3],layer_output[4]) + # import pdb; pdb.set_trace() # 0107test + + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + layer_output = (layer_output, None, None, None, 0.0) + + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def adjust_attention_mask(self, attention_mask): + batch_size = attention_mask.shape[0] + tmp = attention_mask.unsqueeze(1).expand(batch_size, self.num_beams, 1, 1, attention_mask.shape[3]) + attention_mask = tmp.contiguous().view(batch_size* self.num_beams, 1, 1, attention_mask.shape[3]) # torch.Size([bz*num_beams, 1, 1, 32+input_len]) + return attention_mask + + def adjust_layer_output_text(self, layer_output_text): + batch_size, text_length, hidden_size = layer_output_text.shape + tmp_text = 
layer_output_text.unsqueeze(1).expand(batch_size, self.num_beams, text_length, hidden_size) + layer_output_text = tmp_text.contiguous().view(-1, text_length, hidden_size) # [bz*num_beams, text_length ,768] + return layer_output_text + + def route_moe_last_layer_top1(self, layer_output, layer_output_text): + batch_size = layer_output[0].shape[0] + raw_batch_size = int(batch_size / self.num_beams) + hidden_states, beam_scores, expert_route, beam_idx = layer_output[0], layer_output[1], layer_output[2], layer_output[3] + layer_output_text = layer_output_text[beam_idx] + + scores = beam_scores.view(raw_batch_size, self.num_beams) + _, gate = torch.topk(scores, 1, dim=1) + selects = [ (bz_idx * self.num_beams + gate[bz_idx].item()) for bz_idx in range(raw_batch_size)] + + layer_output_text = layer_output_text[selects] + hidden_states_new = hidden_states[selects] + beam_scores_new = beam_scores[selects] + expert_route_new = expert_route[selects] + + return (hidden_states_new, beam_scores_new, expert_route_new, layer_output[3], layer_output[4]), layer_output_text + + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + # layer_output = self.LayerNorm(layer_output + attention_output) + return layer_output + + def feed_forward_query_moe(self, attention_output, expert_attention_mask, beam_scores, expert_route): + if not self.use_experts: + layer_output = self.experts(attention_output) + # layer_output = self.LayerNorm(layer_output + attention_output) + return layer_output, None, None, None, 0.0 + + layer_output, beam_scores, expert_route, beam_idx, importance_loss = self.experts( + attention_output, expert_attention_mask, beam_scores, expert_route + ) + + # layer_output = self.LayerNorm(layer_output + attention_output) + return layer_output, beam_scores, expert_route, beam_idx, importance_loss + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config, i) for i in range(config.num_hidden_layers)] + ) + + # @get_local('all_cross_attentions') + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + () if output_attentions and self.config.add_cross_attention else None + ) + + next_decoder_cache = () if use_cache else None + beam_scores=None + expert_route=None + importance_loss = 0 + for i in range(self.config.num_hidden_layers): + + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module( + *inputs, past_key_value, output_attentions, query_length, beam_scores, expert_route + ) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, #torch.Size([bz, 32+input_len, 768]) + attention_mask, # torch.Size([bz, 1, 1, 32+input_len]) + layer_head_mask, # None + encoder_hidden_states, # torch.Size([bz, 257, 1408]) + encoder_attention_mask, + past_key_value, + output_attentions, # False + query_length, # 32 + beam_scores, # None + expert_route, # None + ) + hidden_states = layer_outputs[0][0] + beam_scores = beam_scores if layer_outputs[0][1] == None else layer_outputs[0][1] + expert_route = expert_route if layer_outputs[0][2] == None else layer_outputs[0][2] + importance_loss += layer_outputs[0][4] + + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + + return MoEModelOutput( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + beam_scores=beam_scores, + expert_route=expert_route, + gate_loss=importance_loss, + ) + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
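# --- Illustrative sketch (not part of this patch): why the prediction head
# below creates the decoder Linear without a bias and then assigns a shared
# Parameter to it. Re-assigning the same Parameter object (not a copy) lets
# utilities such as resize_token_embeddings keep weight and bias in sync:
import torch
import torch.nn as nn

vocab, hidden = 100, 16                       # made-up toy sizes
decoder = nn.Linear(hidden, vocab, bias=False)
bias = nn.Parameter(torch.zeros(vocab))
decoder.bias = bias                           # shared Parameter, not a copy
assert decoder.bias is bias
# --------------------------------------------------------------------------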
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BertConfig + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BertModel(BertPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=False): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: Tensor, + input_shape: Tuple[int], + device: device, + is_decoder: bool, + has_query: bool = False, + ) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. 
+ """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) + <= seq_ids[None, :, None] + ) + + # add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + if has_query: # UniLM style attention mask + causal_mask = torch.cat( + [ + torch.zeros( + (batch_size, prefix_seq_len, seq_length), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=1, + ) + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, causal_mask.shape[1], prefix_seq_len), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + extended_attention_mask = ( + causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype + ) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # use_cache = use_cache if use_cache is not None else self.config.use_cache + + if input_ids is None: + assert ( + query_embeds is not None + ), "You have to specify query_embeds when input_ids is None" + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length + if past_key_values is not None + else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + query_embeds=query_embeds, + past_key_values_length=past_key_values_length, + ) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
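# --- Illustrative sketch (not part of this patch): what the decoder branch of
# get_extended_attention_mask builds when query tokens are present
# (has_query=True, UniLM-style). Query rows attend only to query columns, text
# rows attend to all query columns plus a causal text prefix, and the 0/1 mask
# becomes an additive 0 / -10000 bias. Toy sizes:
import torch

bz, n_query, n_text = 1, 3, 4
seq = n_query + n_text
attention_mask = torch.ones(bz, seq)                       # no padding

seq_ids = torch.arange(n_text)
causal = (seq_ids[None, None, :].repeat(bz, n_text, 1) <= seq_ids[None, :, None]).float()
causal = torch.cat([torch.zeros(bz, n_query, n_text), causal], dim=1)            # query rows
causal = torch.cat([torch.ones(bz, n_query + n_text, n_query), causal], dim=-1)  # query cols

extended = causal[:, None, :, :] * attention_mask[:, None, None, :]
extended = (1.0 - extended) * -10000.0                     # additive bias, fp16-friendly
assert extended.shape == (bz, 1, seq, seq)
# --------------------------------------------------------------------------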
+ if is_decoder: + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, + input_ids.shape, + device, + is_decoder, + has_query=(query_embeds is not None), + ) + else: + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ + 0 + ].size() + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) for mask in encoder_attention_mask + ] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = ( + self.pooler(sequence_output) if self.pooler is not None else None + ) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return MoEModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + beam_scores=encoder_outputs.beam_scores, + expert_route=encoder_outputs.expert_route, + gate_loss=encoder_outputs.gate_loss + ) + + +class BertMoERouteLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + 
past_key_values=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + reduction="mean", + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if labels is not None: + use_cache = False + if past_key_values is not None: + query_embeds = None + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + # gate_loss = outputs.gate_loss + + sequence_output = outputs[0] + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss, total_loss = None, None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1), + ) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + total_loss = lm_loss + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=total_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs + ): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + query_mask = input_ids.new_ones(query_embeds.shape[:-1]) + attention_mask = torch.cat([query_mask, attention_mask], dim=-1) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "query_embeds": query_embeds, + "attention_mask": attention_mask, + "past_key_values": past, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx) for past_state in layer_past + ), + ) + return reordered_past + + +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + 
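The LM head above trains with a shifted next-token objective: logits at position t are scored against the token at t+1, with label smoothing 0.1, and with `reduction="none"` the per-token losses are summed per sample. A minimal sketch with toy sizes (vocabulary and lengths are made up):

```python
import torch
from torch.nn import CrossEntropyLoss

# Sketch of the shifted LM loss used above (toy sizes, not the real vocab).
batch_size, seq_len, vocab_size = 2, 6, 11
prediction_scores = torch.randn(batch_size, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (batch_size, seq_len))

# Position t predicts token t+1: drop the last logit and the first label.
shifted_scores = prediction_scores[:, :-1, :].contiguous()
shifted_labels = labels[:, 1:].contiguous()

loss_fct = CrossEntropyLoss(reduction="none", label_smoothing=0.1)
lm_loss = loss_fct(shifted_scores.view(-1, vocab_size), shifted_labels.view(-1))

# With reduction="none" the code above sums per sample, giving one scalar loss
# per sequence in the batch.
per_sample_loss = lm_loss.view(batch_size, -1).sum(1)
print(per_sample_loss.shape)  # torch.Size([2])
```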
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=False, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), labels.view(-1) + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ( + ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + ) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/minigpt4/models/blip2.py b/minigpt4/models/blip2.py index a6bf474..593d829 100644 --- a/minigpt4/models/blip2.py +++ b/minigpt4/models/blip2.py @@ -22,6 +22,7 @@ from minigpt4.common.logger import MetricLogger from minigpt4.models.base_model import BaseModel from minigpt4.models.Qformer import BertConfig, BertLMHeadModel from minigpt4.models.QformerMoE import BertMoELMHeadModel +from minigpt4.models.QformerMoELN import BertMoELMHeadModelLNIn from minigpt4.models.QformerRouteMoE import BertMoERouteLMHeadModel from minigpt4.models.eva_vit import create_eva_vit_g from transformers import BertTokenizer @@ -88,7 +89,7 @@ class Blip2Base(BaseModel): @classmethod - def init_QformerMoE(cls, num_query_token, vision_width, moebert_expert_num, moebert_route_method, moebert_load_balance, moe_topk=1, use_balance_loss=True, moe_weight_type='l2_norm', cross_attention_freq=2): + def init_QformerMoE(cls, num_query_token, vision_width, moebert_expert_num, moebert_route_method, moebert_load_balance, moe_topk=1, use_balance_loss=True, moe_weight_type='l2_norm', cross_attention_freq=2,ln_position="out"): moe_encoder_config = 
BertConfig.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased") moe_encoder_config.encoder_width = vision_width @@ -104,9 +105,14 @@ class Blip2Base(BaseModel): moe_encoder_config.use_balance_loss = use_balance_loss moe_encoder_config.moe_weight_type = moe_weight_type - MoEQformer = BertMoELMHeadModel.from_pretrained( - "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config - ) + if ln_position == "out": + MoEQformer = BertMoELMHeadModel.from_pretrained( + "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config + ) + elif ln_position == "in": + MoEQformer = BertMoELMHeadModelLNIn.from_pretrained( + "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config + ) query_tokens = nn.Parameter( torch.zeros(1, num_query_token, moe_encoder_config.hidden_size) ) diff --git a/minigpt4/models/blip2_vicuna_instruct.py b/minigpt4/models/blip2_vicuna_instruct.py index 13421ab..d7e26a5 100644 --- a/minigpt4/models/blip2_vicuna_instruct.py +++ b/minigpt4/models/blip2_vicuna_instruct.py @@ -65,6 +65,8 @@ class Blip2VicunaInstruct(Blip2Base): use_balance_loss = True, moe_weight_type = "l2_norm", gate_save_path = None, + bal_loss_decay_epoch = 3, + ln_position = "out", ): super().__init__() transformers_version = version.parse(transformers.__version__) @@ -112,7 +114,8 @@ class Blip2VicunaInstruct(Blip2Base): moe_topk=moe_topk, use_balance_loss=use_balance_loss, moe_weight_type=moe_weight_type, - cross_attention_freq=2 + cross_attention_freq=2, + ln_position=ln_position, ) else: self.Qformer, self.query_tokens = self.init_Qformer( @@ -221,6 +224,7 @@ class Blip2VicunaInstruct(Blip2Base): self.moebert_num_beams = moebert_num_beams self.gate_save_path = gate_save_path + self.bal_loss_decay_epoch = bal_loss_decay_epoch # if self.gate_save_path != None: # import os # if not os.path.exists(self.gate_save_path): @@ -392,9 +396,12 @@ class Blip2VicunaInstruct(Blip2Base): return_dict=True, labels=targets, ) - + if self.use_moeqformer: - loss = outputs.loss + self.moebert_load_balance * gate_loss + if samples['epoch'] > self.bal_loss_decay_epoch: + loss = outputs.loss + else: + loss = outputs.loss + self.moebert_load_balance * gate_loss else: loss = outputs.loss @@ -512,6 +519,16 @@ class Blip2VicunaInstruct(Blip2Base): with self.maybe_autocast(): inputs_embeds = self.llm_model.get_input_embeddings()(llm_tokens.input_ids) + + # path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/embedding/" + # np.save(os.join(path, "inputs_llm.npy"), inputs_llm.cpu().numpy) + # np.save(os.join(path, "inputs_llm.npy"), self.llm_model.get_input_embeddings().weight.cpu().numpy) + # samples_copy = samples.copy() + # samples_copy.pop('image', None) + # with open(os.path.join(path, 'test_samples.json'),'a+') as f: + # f.write(f"{json.dumps(samples_copy)}\n") + + inputs_embeds = torch.cat([inputs_llm, inputs_embeds], dim=1) attention_mask = torch.cat([atts_llm, llm_tokens.attention_mask], dim=1) @@ -654,6 +671,8 @@ class Blip2VicunaInstruct(Blip2Base): use_balance_loss = cfg.get("use_balance_loss", True) moe_weight_type = cfg.get("moe_weight_type",'l2_norm') gate_save_path = cfg.get("gate_save_path", None) + bal_loss_decay_epoch = cfg.get("bal_loss_decay_epoch", 3) + ln_position = cfg.get("ln_position","out") model = cls( vit_model=vit_model, @@ -683,6 +702,8 @@ class Blip2VicunaInstruct(Blip2Base): use_balance_loss=use_balance_loss, moe_weight_type=moe_weight_type, gate_save_path=gate_save_path, + 
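The loss assembly changed above keeps the weighted gate/balance term only for the first `bal_loss_decay_epoch` epochs and then drops it. A minimal sketch of that schedule; `assemble_loss` is a hypothetical helper, and it assumes `samples['epoch']` is an integer epoch counter supplied by the training runner, as the code implies.

```python
def assemble_loss(lm_loss, gate_loss, epoch,
                  load_balance=0.05, bal_loss_decay_epoch=3):
    """Sketch of the change above: keep the weighted gate/balance loss only
    for the first bal_loss_decay_epoch epochs, then train on the language
    modelling loss alone."""
    if epoch > bal_loss_decay_epoch:
        return lm_loss
    return lm_loss + load_balance * gate_loss

# e.g. epoch 2 keeps the auxiliary term, epoch 5 drops it
print(assemble_loss(2.0, 0.4, epoch=2))  # 2.02
print(assemble_loss(2.0, 0.4, epoch=5))  # 2.0
```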
bal_loss_decay_epoch=bal_loss_decay_epoch, + ln_position=ln_position, ) # if qformer_text_input: diff --git a/minigpt4/models/moe/beam_search.py b/minigpt4/models/moe/beam_search.py index c4b3c5b..c5c3a5a 100644 --- a/minigpt4/models/moe/beam_search.py +++ b/minigpt4/models/moe/beam_search.py @@ -165,7 +165,7 @@ class RouteMoELayer(nn.Module): self.route_method = route_method if self.route_method == "pre-route": self.gate = nn.Linear(hidden_size, num_experts, bias=False).float() - elif self.route_method == "post-route": + elif self.route_method in ["post-route", "post-route-dp"]: gate = nn.Linear(hidden_size, 1, bias=False).float() self.gate = gate # self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)]) @@ -252,6 +252,53 @@ class RouteMoELayer(nn.Module): return beam_scores, expert_route, beam_idx + def dp_search(self, current_scores_log, beam_scores, expert_route, batch_size): + if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route', 'post-route-dp']: + # current_scores_log torch.Size([bz, num_experts]) + assert beam_scores==None and expert_route==None + current_scores = torch.exp(current_scores_log) + topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate, 每个样本被分配的expert: torch.Size([bz, topk]) + beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams]) + expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1]) + beam_idx = torch.tensor(range(self.num_beams * batch_size)) + + else: + batch_size = int(batch_size // self.num_beams) + next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # 取log 之后,可以直接相加概率 + next_scores_exp = torch.exp(next_scores_raw) + import pdb;pdb.set_trace() + + next_scores_raw, next_experts_raw = torch.topk(next_scores_exp, 1, dim=1, largest=True, sorted=True) + next_scores = next_scores_raw.view(batch_size, self.num_beams) + next_experts = next_experts_raw.view(batch_size, self.num_beams) + # next_scores, next_experts = torch.topk(current_scores_log, 1, dim=1, largest=True, sorted=True) # equal 等价 + # next_scores torch.Size([bz * num_beams, 1]) + # next_tokens torch.Size([bz * num_beams, 1]) + + next_batch_beam = list() + for batch_idx in range(batch_size): + next_sent_beam = list() + expert_id = next_experts[batch_idx] + expert_score = next_scores[batch_idx] + values, index = torch.topk(expert_score, self.num_beams, dim=0, largest=True, sorted=True) + for i in range(self.num_beams): + beam_id = index[i].item() + ex_id = expert_id[beam_id].item() + effective_beam_id = batch_idx*self.num_beams + beam_id + next_sent_beam.append((values[i], ex_id, effective_beam_id)) + next_batch_beam.extend(next_sent_beam) + + import pdb;pdb.set_trace() + + beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) + beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam]) + beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam]) + pre_route = expert_route[beam_idx,:] + expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1) + + return beam_scores, expert_route, beam_idx + + def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size): if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route']: # current_scores_log torch.Size([bz, num_experts]) @@ -267,6 +314,8 @@ class RouteMoELayer(nn.Module): batch_size = int(batch_size // self.num_beams) next_scores_raw = current_scores_log + 
torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # 取log 之后,可以直接相加概率 next_scores_exp = torch.exp(next_scores_raw) + import pdb;pdb.set_trace() + next_scores_raw1 = next_scores_exp.view( batch_size, self.num_beams * self.num_experts ) # torch.Size([bz, num_beams*num_experts]) @@ -289,7 +338,7 @@ class RouteMoELayer(nn.Module): next_sent_beam.append((expert_score, ex_id, effective_beam_id)) next_batch_beam.extend(next_sent_beam) - # import pdb;pdb.set_trace() + import pdb;pdb.set_trace() beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam]) @@ -301,8 +350,6 @@ class RouteMoELayer(nn.Module): return beam_scores, expert_route, beam_idx - - def forward_expert_ffn(self, x, expert_select, current_scores): """ x_repeat : [bz*num_beams, 32,768] @@ -343,6 +390,7 @@ class RouteMoELayer(nn.Module): batch_size, num_tokens = x.shape[0], x.shape[1] beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size) + current_expert_select = expert_route[:,-1] import pdb;pdb.set_trace() @@ -368,7 +416,6 @@ class RouteMoELayer(nn.Module): output_x = self.experts[expert_idx].forward(input_x) return output_x - import pdb; pdb.set_trace() outputs = list() logits_gate_lst = list() for expert_idx in range(self.num_experts): @@ -392,10 +439,14 @@ class RouteMoELayer(nn.Module): # importance loss importance_loss = self._importance_auxiliary_loss(current_scores) - # import pdb; pdb.set_trace() - batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam - beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size) + import pdb; pdb.set_trace() + + if self.route_method == 'post-route': + beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size) + elif self.route_method == 'post-route-dp': + beam_scores, expert_route, beam_idx = self.dp_search(current_scores_log, beam_scores, expert_route, batch_size) + # beam_scores torch.Size([bz*num_beam]) # expert_route torch.Size([bz*num_beam, layer_n]) current_select_expert = expert_route[:,-1] @@ -431,7 +482,7 @@ class RouteMoELayer(nn.Module): """ if self.route_method == 'pre-route': candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True) - elif self.route_method == "post-route": + elif self.route_method in ['post-route', 'post-route-dp']: candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True) return candidate_output, beam_scores, expert_route, beam_idx, importance_loss @@ -467,10 +518,11 @@ if __name__ == '__main__': batch_size = 4 x = torch.randn(batch_size, 32, 768) beam_scores, expert_route = None, None - x1 = x x2 = x + x3 = x beam_scores1, expert_route1 = None, None + beam_scores2, expert_route2 = None, None for layer_num in [6, 8, 10]: layer_judge = moe_layer_judge(layer_num) @@ -494,25 +546,41 @@ if __name__ == '__main__': # print(importance_loss) # x = hidden_states1 - gate1 = nn.Linear(768, 1, bias=False).float() + # experts_post = RouteMoELayer( + # hidden_size=768, + # expert=ffn, + # num_experts=config.moebert_expert_num, + # num_beams=config.moebert_num_beams, + # layer_judge = layer_judge, + # route_method = "post-route", + # weight_type="ffn_prob" + # ) + # layer_output = experts_post(x1, None, beam_scores1, expert_route1, False) + # hidden_states2, 
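The two routing searches dispatched above differ in how candidates survive a layer: `beam_search` flattens all `num_beams * num_experts` extensions per sample and keeps the global top-`num_beams`, so a strong beam can occupy every slot, while the new `dp_search` keeps only the single best expert for each existing beam, so every beam continues with exactly one extension. A toy comparison with made-up scores (not the repo's code):

```python
import torch

num_beams, num_experts, bz = 2, 3, 1
# Probability of extending each existing beam with each expert,
# shape [bz*num_beams, num_experts]; in the real code this already
# folds in the accumulated beam score.
next_scores = torch.tensor([[0.40, 0.35, 0.05],   # beam 0 dominates
                            [0.10, 0.07, 0.03]])  # beam 1 is weak

# beam_search: global top-num_beams over all beam/expert pairs.
flat = next_scores.view(bz, num_beams * num_experts)
_, idx = flat.topk(num_beams, dim=1)
print([(i // num_experts, i % num_experts) for i in idx[0].tolist()])
# -> [(0, 0), (0, 1)]  both survivors come from beam 0; beam 1 is dropped

# dp_search: each beam keeps its own best expert, so all beams continue.
_, best_expert = next_scores.topk(1, dim=1)
print(list(enumerate(best_expert.squeeze(1).tolist())))
# -> [(0, 0), (1, 0)]  both beams continue
```

By construction the dp variant keeps one route per beam alive, which preserves route diversity across beams at the cost of never reallocating slots to a dominant path.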
beam_scores1, expert_route1, beam_idx, importance_loss = layer_output + + # print(beam_scores1) + # print(expert_route1) + # print(beam_idx) + # print(importance_loss) + # x1 = hidden_states2 + experts_post = RouteMoELayer( hidden_size=768, expert=ffn, num_experts=config.moebert_expert_num, num_beams=config.moebert_num_beams, layer_judge = layer_judge, - route_method = "post-route", + route_method = "post-route-dp", weight_type="ffn_prob" ) - layer_output = experts_post(x1, None, beam_scores1, expert_route1, False) - hidden_states2, beam_scores1, expert_route1, beam_idx, importance_loss = layer_output - - print(beam_scores1) - print(expert_route1) - print(beam_idx) - print(importance_loss) - x1 = hidden_states2 + layer_output = experts_post(x2, None, beam_scores2, expert_route2, False) + hidden_states3, beam_scores2, expert_route2, beam_idx2, importance_loss2 = layer_output + print(beam_scores2) + print(expert_route2) + print(beam_idx2) + print(importance_loss2) + x2 = hidden_states3 # gate = nn.Linear(768, config.moebert_expert_num, bias=False).float() # experts_moe = MoELayer( @@ -526,12 +594,12 @@ if __name__ == '__main__': # weight_type=config.moe_weight_type, # ) # attn_mask = torch.ones([batch_size, 32]) - # layer_output = experts_moe(x2, attn_mask) - # hidden_states3, select_prob_gate, gate_load,_ = layer_output + # layer_output = experts_moe(x3, attn_mask) + # hidden_states4, select_prob_gate, gate_load,_ = layer_output # print(select_prob_gate) # print(gate_load) - # x2 = hidden_states3 + # x3 = hidden_states4 print("------------------------------------") import pdb; pdb.set_trace() diff --git a/minigpt4/models/moe/route_moe_layer.py b/minigpt4/models/moe/route_moe_layer.py index 6012dd2..69fac18 100644 --- a/minigpt4/models/moe/route_moe_layer.py +++ b/minigpt4/models/moe/route_moe_layer.py @@ -18,7 +18,7 @@ class RouteMoELayer(nn.Module): self.route_method = route_method if self.route_method == "pre-route": self.gate = nn.Linear(hidden_size, num_experts, bias=False).float() - elif self.route_method == "post-route": + elif self.route_method in ["post-route", "post-route-dp"]: gate = nn.Linear(hidden_size, 1, bias=False).float() self.gate = gate # self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)]) @@ -47,26 +47,67 @@ class RouteMoELayer(nn.Module): prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts]) return prob_gate - - def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size): - if self.layer_judge=='first' and self.route_method=='pre-route': + def dp_search(self, current_scores_log, beam_scores, expert_route, batch_size): + if self.layer_judge=='first' and self.route_method in ['post-route-dp']: + # current_scores_log torch.Size([bz, num_experts]) assert beam_scores==None and expert_route==None current_scores = torch.exp(current_scores_log) topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate, 每个样本被分配的expert: torch.Size([bz, topk]) beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams]) expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1]) beam_idx = torch.tensor(range(self.num_beams * batch_size)) + else: - if self.layer_judge=='first' and self.route_method == 'post-route': - batch_size = batch_size - next_scores_raw1 = torch.exp(current_scores_log) # torch.Size([bz, num_experts]) - else: - batch_size = int(batch_size // self.num_beams) - next_scores_raw = current_scores_log + 
torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # 取log 之后,可以直接相加概率 - next_scores_exp = torch.exp(next_scores_raw) - next_scores_raw1 = next_scores_exp.view( - batch_size, self.num_beams * self.num_experts - ) # torch.Size([bz, num_beams*num_experts]) + batch_size = int(batch_size // self.num_beams) + next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # 取log 之后,可以直接相加概率 + next_scores_exp = torch.exp(next_scores_raw) + + next_scores_raw, next_experts_raw = torch.topk(next_scores_exp, 1, dim=1, largest=True, sorted=True) + next_scores = next_scores_raw.view(batch_size, self.num_beams) + next_experts = next_experts_raw.view(batch_size, self.num_beams) + # next_scores, next_experts = torch.topk(current_scores_log, 1, dim=1, largest=True, sorted=True) # equal 等价 + # next_scores torch.Size([bz * num_beams, 1]) + # next_tokens torch.Size([bz * num_beams, 1]) + + next_batch_beam = list() + for batch_idx in range(batch_size): + next_sent_beam = list() + expert_id = next_experts[batch_idx] + expert_score = next_scores[batch_idx] + values, index = torch.topk(expert_score, self.num_beams, dim=0, largest=True, sorted=True) + for i in range(self.num_beams): + beam_id = index[i].item() + ex_id = expert_id[beam_id].item() + effective_beam_id = batch_idx*self.num_beams + beam_id + next_sent_beam.append((values[i], ex_id, effective_beam_id)) + next_batch_beam.extend(next_sent_beam) + + beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) + beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam]) + beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam]) + pre_route = expert_route[beam_idx,:] + expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1) + + return beam_scores, expert_route, beam_idx + + def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size): + if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route']: + # current_scores_log torch.Size([bz, num_experts]) + assert beam_scores==None and expert_route==None + current_scores = torch.exp(current_scores_log) + topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate, 每个样本被分配的expert: torch.Size([bz, topk]) + beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams]) + expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1]) + beam_idx = torch.tensor(range(self.num_beams * batch_size)) + + else: + batch_size = int(batch_size // self.num_beams) + next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # 取log 之后,可以直接相加概率 + next_scores_exp = torch.exp(next_scores_raw) + + next_scores_raw1 = next_scores_exp.view( + batch_size, self.num_beams * self.num_experts + ) # torch.Size([bz, num_beams*num_experts]) next_scores, next_experts = torch.topk(next_scores_raw1, self.num_beams, dim=1, largest=True, sorted=True) # next_scores torch.Size([bz, num_beams]) @@ -86,19 +127,11 @@ class RouteMoELayer(nn.Module): next_sent_beam.append((expert_score, ex_id, effective_beam_id)) next_batch_beam.extend(next_sent_beam) - if self.layer_judge=='first' and self.route_method == 'post-route': - beam_scores = next_scores.view(self.num_beams * batch_size) # torch.Size([bz * num_beams]) - expert_route = next_experts.view(self.num_beams * batch_size) - beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) - beam_experts = expert_route.new([x[1] for x in next_batch_beam]).unsqueeze(-1) 
- beam_idx = expert_route.new([int(x[2]/self.num_beams) for x in next_batch_beam]) - expert_route = beam_experts - else: - beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) - beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam]) - beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam]) - pre_route = expert_route[beam_idx,:] - expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1) + beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) + beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam]) + beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam]) + pre_route = expert_route[beam_idx,:] + expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1) return beam_scores, expert_route, beam_idx @@ -153,7 +186,6 @@ class RouteMoELayer(nn.Module): # import pdb;pdb.set_trace() return candidate_output, beam_scores, expert_route, beam_idx, importance_loss - def forward_post_route(self, x, beam_scores, expert_route, use_log=True): attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device) @@ -187,7 +219,12 @@ class RouteMoELayer(nn.Module): importance_loss = self._importance_auxiliary_loss(current_scores) batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam - beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size) + + if self.route_method == 'post-route': + beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size) + elif self.route_method == 'post-route-dp': + beam_scores, expert_route, beam_idx = self.dp_search(current_scores_log, beam_scores, expert_route, batch_size) + # beam_scores torch.Size([bz*num_beam]) # expert_route torch.Size([bz*num_beam, layer_n]) current_select_expert = expert_route[:,-1] @@ -218,7 +255,7 @@ class RouteMoELayer(nn.Module): """ if self.route_method == 'pre-route': candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True) - elif self.route_method == "post-route": + elif self.route_method in ['post-route', 'post-route-dp']: candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True) return candidate_output, beam_scores, expert_route, beam_idx, importance_loss diff --git a/minigpt4/models/moe/utils.py b/minigpt4/models/moe/utils.py index 52f78b8..1489f60 100644 --- a/minigpt4/models/moe/utils.py +++ b/minigpt4/models/moe/utils.py @@ -13,7 +13,7 @@ from typing import Optional, Tuple, List def use_experts(layer_idx): # if layer_idx % 2 == 0: # use moe_ffn after cross_attns - if int(layer_idx) in [6,8,10]: + if int(layer_idx) in [6,7,8,9,10,11]: # layer 6/8/10 return True else: diff --git a/minigpt4/projects/qformer_moe_post_vicuna/eval/vqav2_okvqa_gqa_evaluation.yaml b/minigpt4/projects/qformer_moe_post_vicuna/eval/vqav2_okvqa_gqa_evaluation.yaml new file mode 100644 index 0000000..1617e28 --- /dev/null +++ b/minigpt4/projects/qformer_moe_post_vicuna/eval/vqav2_okvqa_gqa_evaluation.yaml @@ -0,0 +1,114 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
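The `use_experts` change in `minigpt4/models/moe/utils.py` above widens where MoE FFNs are inserted: assuming the usual 12-layer bert-base Q-Former, experts now sit in all of the last six layers rather than every other one. A one-line-per-variant sketch:

```python
def use_experts_old(layer_idx):
    return int(layer_idx) in [6, 8, 10]              # every other top layer

def use_experts_new(layer_idx):
    return int(layer_idx) in [6, 7, 8, 9, 10, 11]    # all of the top six layers

print([i for i in range(12) if use_experts_old(i)])  # [6, 8, 10]
print([i for i in range(12) if use_experts_new(i)])  # [6, 7, 8, 9, 10, 11]
```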
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_vicuna_instruct + model_type: vicuna7b_pretrain + load_pretrained: True + load_finetuned: True + vit_model: eva_clip_g + pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/mix_coco_gqa_1610k_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0301/20240301223/checkpoint_best.pth" + q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + + # Q-Former + num_query_token: 32 + qformer_text_input: True + + # T5 + llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1" + prompt: "" + max_txt_len: 256 + max_output_txt_len: 256 + + # freeze + freeze_vit: True + freeze_llm: True + freeze_qformer: False + freeze_t5_proj: False + + # moe + use_moeqformer: True + use_route_moe: False + moebert_expert_num: 3 + moebert_route_method: "gate-sentence-post" + moe_weight_type: "raw_prob" + moebert_load_balance: 0.05 + moe_topk: 1 + use_balance_loss: False + ln_position: "out" + +datasets: + gqa: + type: balanced_sft_raw_eval + batch_size: 4 + vis_processor: + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + eval: + name: "blip_caption" + + ok_vqa: # train, valid (9009, 5046) + type: ok_vqa_eval + batch_size: 4 + vis_processor: + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + eval: + name: "blip_caption" + + coco_vqa: # 658104 + type: vqa_v2_eval + batch_size: 4 + vis_processor: + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + eval: + name: "blip_caption" + + aok_vqa: # train: 17056, val: 1145 + batch_size: 4 + vis_processor: + eval: + name: "blip2_image_train" + image_size: 224 + text_processor: + eval: + name: "blip_caption" + +run: + task: instruction_tuning + seed: 42 + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_post/eval/mix_coco_gqa_1610k_raw_QformerMoE_Post_linear_gate_lnout_lr5e5_3ex_top1_2loss_005_top6layer_textinqf_6epo_0301/" + num_workers: 4 + + amp: True + resume_ckpt_path: null + + evaluate: True + test_splits: ["val"] + + device: "cuda" + world_size: 1 + dist_url: "env://" + distributed: True + + + + + + diff --git a/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml b/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml index 74f4ab0..991ad2a 100644 --- a/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml +++ b/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml @@ -10,7 +10,7 @@ model: load_finetuned: True vit_model: eva_clip_g pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" - finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/20240112212/checkpoint_best.pth" + finetuned: 
"/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/20240128142/checkpoint_best.pth" q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" # vit encoder @@ -39,27 +39,18 @@ model: use_moeqformer: True use_route_moe: True moebert_route_method: "post-route" - moebert_load_balance: 0 + moebert_load_balance: 0.01 moebert_expert_num: 2 moebert_num_beams: 2 moe_weight_type: 'ffn_prob' - gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/" + use_balance_loss: False + bal_loss_decay_epoch: 8 + gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/" datasets: gqa: type: balanced_sft_raw_eval - batch_size: 32 - vis_processor: - eval: - name: "blip2_image_eval" - image_size: 224 - text_processor: - eval: - name: "blip_caption" - - ok_vqa: # train, valid (9009, 5046) - type: ok_vqa_eval - batch_size: 32 + batch_size: 64 vis_processor: eval: name: "blip2_image_eval" @@ -70,6 +61,17 @@ datasets: coco_vqa: # 658104 type: vqa_v2_eval + batch_size: 64 + vis_processor: + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + eval: + name: "blip_caption" + + coco_caption: # 414113 train + type: coco_cap_eval batch_size: 32 vis_processor: eval: @@ -78,7 +80,18 @@ datasets: text_processor: eval: name: "blip_caption" - + + ok_vqa: # train, valid (9009, 5046) + type: ok_vqa_eval + batch_size: 64 + vis_processor: + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + eval: + name: "blip_caption" + run: task: instruction_tuning # optimizer @@ -96,7 +109,7 @@ run: iters_per_epoch: 3000 seed: 42 - output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/" + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_5e5lr_top6layer_textinqf_epo8_0128/" amp: True resume_ckpt_path: null diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml index 16440dc..7ae5cbc 100644 --- a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml +++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml @@ -38,17 +38,17 @@ model: # moe use_moeqformer: True use_route_moe: True - moebert_route_method: "post-route" - moebert_load_balance: 0 - moebert_expert_num: 3 - moebert_num_beams: 3 + moebert_route_method: "post-route-dp" + moebert_load_balance: 0.05 + moebert_expert_num: 2 + moebert_num_beams: 2 moe_weight_type: 'ffn_prob' use_balance_loss: False datasets: gqa: # train: 943000, 12578, 12578) type: balanced_sft_raw - batch_size: 16 + batch_size: 32 vis_processor: train: name: 
"blip2_image_train" @@ -64,7 +64,7 @@ datasets: sample_ratio: 10 ok_vqa: # train, valid (9009, 5046) - batch_size: 16 + batch_size: 32 vis_processor: train: name: "blip2_image_train" @@ -80,7 +80,7 @@ datasets: sample_ratio: 1 coco_vqa: # 658104 - batch_size: 16 + batch_size: 32 vis_processor: train: name: "blip2_image_train" @@ -112,7 +112,7 @@ run: iters_per_epoch: 5000 seed: 42 - output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_1gate_3ex_3beam_1loss_5e5lr_top6layer_textinqf_epo8_0117/" + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_DP_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_005_5e5lr_top6layer_textinqf_epo8_0121/" amp: True resume_ckpt_path: null diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_5000.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_0122.yaml similarity index 84% rename from minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_5000.yaml rename to minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_0122.yaml index 7124efc..b2cf35b 100644 --- a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_5000.yaml +++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_0122.yaml @@ -38,14 +38,17 @@ model: # moe use_moeqformer: True use_route_moe: True - moebert_expert_num: 5 - moebert_num_beams: 1 - # gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/route_save/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/" + moebert_route_method: "post-route" + moebert_load_balance: 0 + moebert_expert_num: 2 + moebert_num_beams: 2 + moe_weight_type: 'ffn_prob' + use_balance_loss: False datasets: gqa: # train: 943000, 12578, 12578) type: balanced_sft_raw - batch_size: 4 + batch_size: 32 vis_processor: train: name: "blip2_image_train" @@ -61,7 +64,7 @@ datasets: sample_ratio: 10 ok_vqa: # train, valid (9009, 5046) - batch_size: 4 + batch_size: 32 vis_processor: train: name: "blip2_image_train" @@ -77,7 +80,7 @@ datasets: sample_ratio: 1 coco_vqa: # 658104 - batch_size: 4 + batch_size: 32 vis_processor: train: name: "blip2_image_train" @@ -96,20 +99,20 @@ run: task: instruction_tuning # optimizer lr_sched: "linear_warmup_cosine_lr" - init_lr: 2e-5 + init_lr: 5e-5 min_lr: 1e-6 warmup_lr: 1e-6 log_freq: 5 save_freq: 1500 weight_decay: 0.05 - max_epoch: 6 + max_epoch: 8 num_workers: 4 warmup_steps: 600 iters_per_epoch: 5000 seed: 42 - output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1212_Test/" + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_1loss_5e5lr_top6layer_textinqf_epo8_0123/" amp: True resume_ckpt_path: null diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco.yaml new file mode 100644 index 0000000..15117b3 --- /dev/null +++ 
b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco.yaml @@ -0,0 +1,145 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_vicuna_instruct + model_type: vicuna7b_pretrain + load_pretrained: True + load_finetuned: False + vit_model: eva_clip_g + pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + # finetuned: "" + q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + + # Q-Former + num_query_token: 32 + qformer_text_input: True + + # vicuna + llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1" + prompt: "" + max_txt_len: 256 + max_output_txt_len: 256 + + # freeze + freeze_vit: True + freeze_llm: True + freeze_qformer: False + freeze_t5_proj: False + + # moe + use_moeqformer: False + use_route_moe: False + moebert_route_method: "post-route" + moebert_load_balance: 0 + moebert_expert_num: 2 + moebert_num_beams: 2 + moe_weight_type: 'ffn_prob' + use_balance_loss: False + +datasets: + gqa: # train: 943000, 12578, 12578) + type: balanced_sft_raw + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 10 + + ok_vqa: # train, valid (9009, 5046) + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 1 + + coco_vqa: # 658104 + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 9 + + coco_caption: # 414113 train + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 7 + +run: + task: instruction_tuning + # optimizer + lr_sched: "linear_warmup_cosine_lr" + init_lr: 5e-5 + min_lr: 1e-6 + warmup_lr: 1e-6 + log_freq: 5 + save_freq: 1500 + + weight_decay: 0.05 + max_epoch: 8 + num_workers: 4 + warmup_steps: 600 + iters_per_epoch: 5000 + + seed: 42 + # output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_005_5e5lr_top6layer_textinqf_epo8_0122/" + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Base_top6layer_textinqf_epo8_0124/" + + amp: True + resume_ckpt_path: null + + evaluate: False + train_splits: ["train"] + valid_splits: ["val"] + # test_splits: ["val"] + + device: "cuda" + world_size: 1 + dist_url: "env://" + distributed: True \ No newline at end of file diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco_0128.yaml 
b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco_0128.yaml new file mode 100644 index 0000000..e1fbc8f --- /dev/null +++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_coco_0128.yaml @@ -0,0 +1,145 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_vicuna_instruct + model_type: vicuna7b_pretrain + load_pretrained: True + load_finetuned: False + vit_model: eva_clip_g + pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + # finetuned: "" + q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + + # Q-Former + num_query_token: 32 + qformer_text_input: True + + # vicuna + llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1" + prompt: "" + max_txt_len: 256 + max_output_txt_len: 256 + + # freeze + freeze_vit: True + freeze_llm: True + freeze_qformer: False + freeze_t5_proj: False + + # moe + use_moeqformer: True + use_route_moe: True + moebert_route_method: "post-route" + moebert_load_balance: 0.01 + moebert_expert_num: 2 + moebert_num_beams: 2 + moe_weight_type: 'ffn_prob' + use_balance_loss: False + bal_loss_decay_epoch: 3 + +datasets: + gqa: # train: 943000, 12578, 12578) + type: balanced_sft_raw + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 10 + + ok_vqa: # train, valid (9009, 5046) + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 1 + + coco_vqa: # 658104 + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 9 + + coco_caption: # 414113 train + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 7 + +run: + task: instruction_tuning + # optimizer + lr_sched: "linear_warmup_cosine_lr" + init_lr: 5e-5 + min_lr: 1e-6 + warmup_lr: 1e-6 + log_freq: 5 + save_freq: 1500 + + weight_decay: 0.05 + max_epoch: 8 + num_workers: 4 + warmup_steps: 600 + iters_per_epoch: 5000 + + seed: 42 + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_vqa_cococap_2024k_raw_QformerMoE_Route_Post_ffn_prob_lnout_linear_1gate_2ex_2beam_2loss_001_loss_decay_5e5lr_top6layer_textinqf_epo8_0129/" + + amp: True + resume_ckpt_path: null + + evaluate: False + train_splits: ["train"] + valid_splits: ["val"] + # test_splits: ["val"] + + device: "cuda" + world_size: 1 + dist_url: "env://" + distributed: True \ No newline at end of file diff --git 
a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_mix.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_mix.yaml new file mode 100644 index 0000000..16b3ef5 --- /dev/null +++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_mix.yaml @@ -0,0 +1,188 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_vicuna_instruct + model_type: vicuna7b_pretrain + load_pretrained: True + load_finetuned: False + vit_model: eva_clip_g + pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + # finetuned: "" + q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + + # Q-Former + num_query_token: 32 + qformer_text_input: True + + # vicuna + llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1" + prompt: "" + max_txt_len: 256 + max_output_txt_len: 256 + + # freeze + freeze_vit: True + freeze_llm: True + freeze_qformer: False + freeze_t5_proj: False + + # moe + use_moeqformer: True + use_route_moe: True + moebert_route_method: "post-route" + moebert_load_balance: 0.05 + moebert_expert_num: 2 + moebert_num_beams: 2 + moe_weight_type: 'ffn_prob' + use_balance_loss: False + +datasets: + gqa: + type: balanced_sft_raw_eval + batch_size: 16 + vis_processor: + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + eval: + name: "blip_caption" + + ok_vqa: # train, valid (9009, 5046) + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 8 + + coco_vqa: # 658104 + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 15 + + aok_vqa: # train: 17056, val: 1145 + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + text_processor: + train: + name: "blip_caption" + sample_ratio: 12 + + ocrvqa: # train 207572 + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + text_processor: + train: + name: "blip_caption" + sample_ratio: 30 + + llava_reason: # 76643 + batch_size: 16 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + text_processor: + train: + name: "blip_caption" + sample_ratio: 80 + + llava_conversation: # 56681 + batch_size: 16 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + text_processor: + train: + name: "blip_caption" + sample_ratio: 30 + + llava_detail: # 23240 + batch_size: 16 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + text_processor: + train: + name: "blip_caption" + sample_ratio: 20 + + coco_caption: # 414113 train + batch_size: 16 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + text_processor: + train: + name: "blip_caption" + sample_ratio: 10 + +run: + task: instruction_tuning + # optimizer + lr_sched: 
"linear_warmup_cosine_lr" + init_lr: 5e-5 + min_lr: 1e-6 + warmup_lr: 1e-6 + log_freq: 5 + save_freq: 1500 + + weight_decay: 0.05 + max_epoch: 8 + num_workers: 4 + warmup_steps: 600 + iters_per_epoch: 5000 + + seed: 42 + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_1048k_raw_QformerMoE_Route_Post_ffn_prob_linear_1gate_2ex_2beam_2loss_5e5lr_top6layer_textinqf_epo8_0118/" + + amp: True + resume_ckpt_path: null + + evaluate: False + train_splits: ["train"] + valid_splits: ["val"] + # test_splits: ["val"] + + device: "cuda" + world_size: 1 + dist_url: "env://" + distributed: True \ No newline at end of file diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_dp_blip2_vicuna7b_data_balance.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_dp_blip2_vicuna7b_data_balance.yaml new file mode 100644 index 0000000..be65c17 --- /dev/null +++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_dp_blip2_vicuna7b_data_balance.yaml @@ -0,0 +1,128 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_vicuna_instruct + model_type: vicuna7b_pretrain + load_pretrained: True + load_finetuned: False + vit_model: eva_clip_g + pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + # finetuned: "" + q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + + # Q-Former + num_query_token: 32 + qformer_text_input: True + + # vicuna + llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1" + prompt: "" + max_txt_len: 256 + max_output_txt_len: 256 + + # freeze + freeze_vit: True + freeze_llm: True + freeze_qformer: False + freeze_t5_proj: False + + # moe + use_moeqformer: True + use_route_moe: True + moebert_route_method: "post-route-dp" + moebert_load_balance: 0.05 + moebert_expert_num: 2 + moebert_num_beams: 2 + moe_weight_type: 'ffn_prob' + use_balance_loss: False + +datasets: + gqa: # train: 943000, 12578, 12578) + type: balanced_sft_raw + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 10 + + ok_vqa: # train, valid (9009, 5046) + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 1 + + coco_vqa: # 658104 + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 9 + +run: + task: instruction_tuning + # optimizer + lr_sched: "linear_warmup_cosine_lr" + init_lr: 5e-5 + min_lr: 1e-6 + warmup_lr: 1e-6 + log_freq: 5 + save_freq: 1500 + + weight_decay: 0.05 + max_epoch: 8 + num_workers: 4 + warmup_steps: 600 + iters_per_epoch: 5000 + + seed: 42 + output_dir: 
"/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_DP_Route_Post_ffn_prob_linear_1gate_2ex_2beam_2loss_5e5lr_top6layer_textinqf_epo8_0118/" + + amp: True + resume_ckpt_path: null + + evaluate: False + train_splits: ["train"] + valid_splits: ["val"] + # test_splits: ["val"] + + device: "cuda" + world_size: 1 + dist_url: "env://" + distributed: True \ No newline at end of file diff --git a/minigpt4/tasks/instruction_tuning.py b/minigpt4/tasks/instruction_tuning.py index 759d8bd..341d601 100644 --- a/minigpt4/tasks/instruction_tuning.py +++ b/minigpt4/tasks/instruction_tuning.py @@ -53,7 +53,7 @@ class InstructionTask(BaseTask): run_cfg = cfg.run_cfg num_beams = run_cfg.get("num_beams", 3) - max_len = run_cfg.get("max_len", 20) + max_len = run_cfg.get("max_len", 30) min_len = run_cfg.get("min_len", 1) evaluate = run_cfg.get("evaluate", False) @@ -112,22 +112,33 @@ class InstructionTask(BaseTask): ) pred_qa_pairs = [] - question_id = samples["question_id"] - question = samples["text_input"] + text_inputs = samples["text_input"] + sources = samples["source"] + source = samples["source"][0] + + if source in ['vqav2','okvqa','gqa']: + sample_ids = [int(sample_id.item()) for sample_id in samples["question_id"]] + elif source in ['aokvqa']: + sample_ids = [sample_id for sample_id in samples["question_id"]] + elif source in ['coco_cap']: + sample_ids = samples["image_id"] # For GQA - full_answers = samples.get("fullAnswer", ["" for i in range(len(question_id))]) - gt_answers = samples.get("gt_answers", ["" for i in range(len(question_id))]) + full_answers = samples.get("fullAnswer", ["" for i in range(len(sample_ids))]) + gt_answers = samples.get("gt_answers", ["" for i in range(len(sample_ids))]) - for answer, ques_id, ques, full_answer, gt_answer, source in zip(answers, question_id, question, full_answers, gt_answers, sources): - ques_id = int(ques_id.item()) + # For AOKVQA + choices = samples.get("choices", ["" for i in range(len(sample_ids))]) + + for answer, sample_id, text_input, full_answer, gt_answer, choice, source in zip(answers, sample_ids, text_inputs, full_answers, gt_answers, choices, sources): pred_qa_pairs.append({ - "question_id": ques_id, - "question": ques, + "question_id": sample_id, + "question": text_input, "full_answer": full_answer, "answer": answer, "gt_ans": gt_answer, + "choice": choice, "source": source}) return pred_qa_pairs @@ -140,9 +151,7 @@ class InstructionTask(BaseTask): total_results = list() for sub_data_loader in data_loader.loaders: results = [] - ques_ids = [] for samples in metric_logger.log_every(sub_data_loader, print_freq, header): - ques_ids.extend(samples['question_id'].tolist()) samples = prepare_sample(samples, cuda_enabled=cuda_enabled) eval_output = self.valid_step(model=model, samples=samples) @@ -168,6 +177,7 @@ class InstructionTask(BaseTask): filename=f"{split_name}_vqa_result_{source}", remove_duplicate="question_id", ) + if source in ['vqav2','okvqa']: try: metrics = self._report_metrics_coco_vqa(result_file=result_file, split=split_name, source=source) @@ -180,7 +190,18 @@ class InstructionTask(BaseTask): except Exception as e: metrics = None print(f"Report Metrics {source} Error: {e}") - + elif source in ['aokvqa']: + try: + metrics = self._report_metrics_aokvqa(result_file=result_file, source=source) + except Exception as e: + metrics = None + print(f"Report Metrics {source} Error: {e}") + elif source in ['coco_cap']: + try: + metrics = 
self._report_metrics_caption(result_file=result_file, split_name=split_name, source=source) + except Exception as e: + metrics = None + print(f"Report Metrics {source} Error: {e}") else: metrics = None final_metrics[source] = metrics @@ -234,10 +255,46 @@ class InstructionTask(BaseTask): return metrics + @dist_utils.main_process + def _report_metrics_aokvqa(self, result_file, source='aokvqa'): + """ + Validation of aokvqa + """ + # measuring accuracy compared to answer + results = json.load(open(result_file, "r")) + acc = [] + vqa_tool = VQAEval() + + for res in results: + + gt_ans = res["choice"] + pred = res["answer"] + + pred = vqa_tool.processPunctuation(pred) + pred = vqa_tool.processDigitArticle(pred) + + # vqa_acc = 1 if pred == gt_ans else 0 + vqa_acc = 1 if pred in gt_ans else 0 + + acc.append(vqa_acc) + + accuracy = sum(acc) / len(acc) * 100 + metrics = {"agg_metrics": accuracy, "acc": accuracy} + + with open( + os.path.join(registry.get_path("output_dir"), f"evaluate_{source}.txt"), "a" + ) as f: + f.write(json.dumps(metrics) + "\n") + + logging.info(metrics) + + return metrics + + @dist_utils.main_process def _report_metrics_gqa(self, result_file, source='gqa'): """ - Validation of GQA/VQAv2 + Validation of GQA """ # measuring accuracy compared to answer results = json.load(open(result_file, "r")) @@ -274,3 +331,90 @@ class InstructionTask(BaseTask): return metrics + @dist_utils.main_process + def _report_metrics_caption(self, result_file, split_name, source='coco_cap'): + """ + Use official COCO Cap evaluation script to report metrics. + """ + coco_gt_root = os.path.join(registry.get_path("cache_root"), "coco_gt") + coco_val = coco_caption_eval(coco_gt_root, result_file, split_name) + + agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"] + log_stats = {split_name: {k: v for k, v in coco_val.eval.items()}} + + with open( + os.path.join(registry.get_path("output_dir"), "evaluate.txt"), "a" + ) as f: + f.write(json.dumps(log_stats) + "\n") + + coco_res = {k: v for k, v in coco_val.eval.items()} + coco_res["agg_metrics"] = agg_metrics + + return coco_res + +from collections import defaultdict +from pycocoevalcap.eval import COCOEvalCap +class COCO_Annotation: + def __init__(self, annotation_file): + self.coco_cn_file = annotation_file + self.imgToAnns = self.build_imgToAnns() + + def build_imgToAnns(self): + imgToAnns = defaultdict(list) + with open(self.coco_cn_file, "r", encoding="UTF-8") as fin: + for line in fin: + line = line.strip() + temp = eval(line) + annotations = temp['annotations'] + for ann in annotations: + image_id = str(ann['image_id']).zfill(6) + imgToAnns[image_id].append({'image_id':image_id,'caption':ann['caption'],'image': ann['image_id']}) + return imgToAnns + + def getImgIds(self): + return self.imgToAnns.keys() + +class COCO_Result: + def __init__(self,result_file): + self.coco_cn_file = result_file + self.imgToAnns = self.build_imgToAnns() + + def build_imgToAnns(self): + imgToAnns = dict() + data = json.load(open(self.coco_cn_file, "r")) + for d in data: + tmp = { + 'image_id':d['question_id'][-6:], + 'caption':d['answer'] + } + imgToAnns[d['question_id'][-6:]] = [tmp] + return imgToAnns + +def coco_caption_eval(coco_gt_root, results_file, split_name): + files = { + "val":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val_gt.json", + "test":"/mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test_gt.json" + } + + # create coco object and coco_result object + annotation_file = files[split_name] + coco = 
COCO_Annotation(annotation_file) + coco_result = COCO_Result(results_file) + + # create coco_eval object by taking coco and coco_result + coco_eval = COCOEvalCap(coco, coco_result) + + # evaluate on a subset of images by setting + # coco_eval.params['image_id'] = coco_result.getImgIds() + # please remove this line when evaluating the full validation set + # coco_eval.params['image_id'] = coco_result.getImgIds() + + # evaluate results + # SPICE will take a few minutes the first time, but speeds up due to caching + coco_eval.evaluate() + + # print output evaluation scores + for metric, score in coco_eval.eval.items(): + print(f"{metric}: {score:.3f}") + + return coco_eval \ No newline at end of file diff --git a/prompts/alignment.txt b/prompts/alignment.txt deleted file mode 100644 index 38ae75a..0000000 --- a/prompts/alignment.txt +++ /dev/null @@ -1,4 +0,0 @@ - Describe this image in detail. - Take a look at this image and describe what you notice. - Please provide a detailed description of the picture. - Could you describe the contents of this image for me? \ No newline at end of file diff --git a/test.pdf/backward_graph b/test.pdf/backward_graph deleted file mode 100644 index 7867fb1..0000000 --- a/test.pdf/backward_graph +++ /dev/null @@ -1,5570 +0,0 @@ -digraph { - graph [size="778.8,778.8"] - node [align=left fontname=monospace fontsize=10 height=0.2 ranksep=0.1 shape=box style=filled] - 140509988778688 [label=" - (1, 49, 768)" fillcolor=darkolivegreen1] - 140509588281712 [label=CatBackward0] - 140509588282912 -> 140509588281712 - 140509588282912 [label=IndexBackward0] - 140509588281808 -> 140509588282912 - 140509588281808 [label=SumBackward1] - 140509588283152 -> 140509588281808 - 140509588283152 [label=MulBackward0] - 140509588282864 -> 140509588283152 - 140509588282864 [label=CatBackward0] - 140509591316848 -> 140509588282864 - 140509591316848 [label=UnsqueezeBackward0] - 140509591314640 -> 140509591316848 - 140509591314640 [label=NativeLayerNormBackward0] - 140509591317376 -> 140509591314640 - 140509591317376 [label=AddBackward0] - 140509588312944 -> 140509591317376 - 140509588312944 [label=NativeDropoutBackward0] - 140509588313424 -> 140509588312944 - 140509588313424 [label=ViewBackward0] - 140509588313232 -> 140509588313424 - 140509588313232 [label=AddmmBackward0] - 140509588312560 -> 140509588313232 - 140509588312560 [label=ToCopyBackward0] - 140509591318384 -> 140509588312560 - 140509591260672 [label="encoder.layer.11.experts.experts.0.output_query.dense.bias - (768)" fillcolor=lightblue] - 140509591260672 -> 140509591318384 - 140509591318384 [label=AccumulateGrad] - 140509588313040 -> 140509588313232 - 140509588313040 [label=ViewBackward0] - 140509588312368 -> 140509588313040 - 140509588312368 [label=GeluBackward0] - 140509588312176 -> 140509588312368 - 140509588312176 [label=ViewBackward0] - 140509588313328 -> 140509588312176 - 140509588313328 [label=AddmmBackward0] - 140509588313520 -> 140509588313328 - 140509588313520 [label=ToCopyBackward0] - 140509588313808 -> 140509588313520 - 140509591261072 [label="encoder.layer.11.experts.experts.0.intermediate_query.dense.bias - (3072)" fillcolor=lightblue] - 140509591261072 -> 140509588313808 - 140509588313808 [label=AccumulateGrad] - 140509588313616 -> 140509588313328 - 140509588313616 [label=ViewBackward0] - 140509588314096 -> 140509588313616 - 140509588314096 [label=ToCopyBackward0] - 140509588312608 -> 140509588314096 - 140509588312608 [label=SliceBackward0] - 140509588314048 -> 140509588312608 - 140509588314048 
- ... (remainder of the 5,570-line torchviz autograd-graph DOT dump omitted; the entire test.pdf/backward_graph file is deleted in this diff)
140509587658880 [label=AccumulateGrad] - 140509587658640 -> 140509587658592 - 140509587658640 [label=ViewBackward0] - 140509587658928 -> 140509587658640 - 140509587658928 [label=ViewBackward0] - 140509587659024 -> 140509587658928 - 140509587659024 [label=CloneBackward0] - 140509587659120 -> 140509587659024 - 140509587659120 [label=PermuteBackward0] - 140509587659216 -> 140509587659120 - 140509587659216 [label=UnsafeViewBackward0] - 140509587659312 -> 140509587659216 - 140509587659312 [label=BmmBackward0] - 140509587659408 -> 140509587659312 - 140509587659408 [label=ReshapeAliasBackward0] - 140509587659552 -> 140509587659408 - 140509587659552 [label=ExpandBackward0] - 140509587659648 -> 140509587659552 - 140509587659648 [label=ToCopyBackward0] - 140509587659744 -> 140509587659648 - 140509587659744 [label=NativeDropoutBackward0] - 140509587659840 -> 140509587659744 - 140509587659840 [label=SoftmaxBackward0] - 140509587659936 -> 140509587659840 - 140509587659936 [label=AddBackward0] - 140509587660032 -> 140509587659936 - 140509587660032 [label=DivBackward0] - 140509587660128 -> 140509587660032 - 140509587660128 [label=UnsafeViewBackward0] - 140509587660224 -> 140509587660128 - 140509587660224 [label=BmmBackward0] - 140509587660320 -> 140509587660224 - 140509587660320 [label=ReshapeAliasBackward0] - 140509587660464 -> 140509587660320 - 140509587660464 [label=ExpandBackward0] - 140509587660560 -> 140509587660464 - 140509587660560 [label=PermuteBackward0] - 140509587660656 -> 140509587660560 - 140509587660656 [label=ViewBackward0] - 140509587660752 -> 140509587660656 - 140509587660752 [label=ViewBackward0] - 140509587660368 -> 140509587660752 - 140509587660368 [label=AddmmBackward0] - 140509587673296 -> 140509587660368 - 140509587673296 [label=ToCopyBackward0] - 140509587673488 -> 140509587673296 - 140509590913968 [label="encoder.layer.2.attention.self.query.bias - (768)" fillcolor=lightblue] - 140509590913968 -> 140509587673488 - 140509587673488 [label=AccumulateGrad] - 140509587673248 -> 140509587660368 - 140509587673248 [label=ViewBackward0] - 140509587673536 -> 140509587673248 - 140509587673536 [label=ToCopyBackward0] - 140509587658304 -> 140509587673536 - 140509587658304 [label=CatBackward0] - 140509587673680 -> 140509587658304 - 140509587673680 [label=NativeLayerNormBackward0] - 140509587673824 -> 140509587673680 - 140509587673824 [label=AddBackward0] - 140509587674016 -> 140509587673824 - 140509587674016 [label=NativeDropoutBackward0] - 140509587674160 -> 140509587674016 - 140509587674160 [label=ViewBackward0] - 140509587674256 -> 140509587674160 - 140509587674256 [label=AddmmBackward0] - 140509587674352 -> 140509587674256 - 140509587674352 [label=ToCopyBackward0] - 140509587674544 -> 140509587674352 - 140509590914448 [label="encoder.layer.1.experts.output_query.dense.bias - (768)" fillcolor=lightblue] - 140509590914448 -> 140509587674544 - 140509587674544 [label=AccumulateGrad] - 140509587674304 -> 140509587674256 - 140509587674304 [label=ViewBackward0] - 140509587674592 -> 140509587674304 - 140509587674592 [label=GeluBackward0] - 140509587674688 -> 140509587674592 - 140509587674688 [label=ViewBackward0] - 140509587674784 -> 140509587674688 - 140509587674784 [label=AddmmBackward0] - 140509587674880 -> 140509587674784 - 140509587674880 [label=ToCopyBackward0] - 140509587675072 -> 140509587674880 - 140509590914688 [label="encoder.layer.1.experts.intermediate_query.dense.bias - (3072)" fillcolor=lightblue] - 140509590914688 -> 140509587675072 - 140509587675072 [label=AccumulateGrad] - 
140509587674832 -> 140509587674784 - 140509587674832 [label=ViewBackward0] - 140509587675120 -> 140509587674832 - 140509587675120 [label=ToCopyBackward0] - 140509587673968 -> 140509587675120 - 140509587673968 [label=SliceBackward0] - 140509587675264 -> 140509587673968 - 140509587675264 [label=SliceBackward0] - 140509587675360 -> 140509587675264 - 140509587675360 [label=SliceBackward0] - 140509587675456 -> 140509587675360 - 140509587675456 [label=SliceBackward0] - 140509587675552 -> 140509587675456 - 140509587675552 [label=SliceBackward0] - 140509587675648 -> 140509587675552 - 140509587675648 [label=NativeLayerNormBackward0] - 140509587675744 -> 140509587675648 - 140509587675744 [label=AddBackward0] - 140509587675936 -> 140509587675744 - 140509587675936 [label=NativeDropoutBackward0] - 140509587676080 -> 140509587675936 - 140509587676080 [label=ViewBackward0] - 140509587676176 -> 140509587676080 - 140509587676176 [label=AddmmBackward0] - 140509587676272 -> 140509587676176 - 140509587676272 [label=ToCopyBackward0] - 140509587676464 -> 140509587676272 - 140509590916608 [label="encoder.layer.1.attention.output.dense.bias - (768)" fillcolor=lightblue] - 140509590916608 -> 140509587676464 - 140509587676464 [label=AccumulateGrad] - 140509587676224 -> 140509587676176 - 140509587676224 [label=ViewBackward0] - 140509587676512 -> 140509587676224 - 140509587676512 [label=ViewBackward0] - 140509587676608 -> 140509587676512 - 140509587676608 [label=CloneBackward0] - 140509587676704 -> 140509587676608 - 140509587676704 [label=PermuteBackward0] - 140509587676800 -> 140509587676704 - 140509587676800 [label=UnsafeViewBackward0] - 140509587676896 -> 140509587676800 - 140509587676896 [label=BmmBackward0] - 140509587676992 -> 140509587676896 - 140509587676992 [label=ReshapeAliasBackward0] - 140509587677136 -> 140509587676992 - 140509587677136 [label=ExpandBackward0] - 140509587677040 -> 140509587677136 - 140509587677040 [label=ToCopyBackward0] - 140517615505616 -> 140509587677040 - 140517615505616 [label=NativeDropoutBackward0] - 140517615505712 -> 140517615505616 - 140517615505712 [label=SoftmaxBackward0] - 140517615505808 -> 140517615505712 - 140517615505808 [label=AddBackward0] - 140517615505904 -> 140517615505808 - 140517615505904 [label=DivBackward0] - 140517615506000 -> 140517615505904 - 140517615506000 [label=UnsafeViewBackward0] - 140517615506096 -> 140517615506000 - 140517615506096 [label=BmmBackward0] - 140517615506192 -> 140517615506096 - 140517615506192 [label=ReshapeAliasBackward0] - 140517615506336 -> 140517615506192 - 140517615506336 [label=ExpandBackward0] - 140517615506432 -> 140517615506336 - 140517615506432 [label=PermuteBackward0] - 140517615506528 -> 140517615506432 - 140517615506528 [label=ViewBackward0] - 140517615506624 -> 140517615506528 - 140517615506624 [label=ViewBackward0] - 140517615506720 -> 140517615506624 - 140517615506720 [label=AddmmBackward0] - 140517615506816 -> 140517615506720 - 140517615506816 [label=ToCopyBackward0] - 140517615507008 -> 140517615506816 - 140509590933808 [label="encoder.layer.1.attention.self.query.bias - (768)" fillcolor=lightblue] - 140509590933808 -> 140517615507008 - 140517615507008 [label=AccumulateGrad] - 140517615506768 -> 140517615506720 - 140517615506768 [label=ViewBackward0] - 140517615507056 -> 140517615506768 - 140517615507056 [label=ToCopyBackward0] - 140509587675888 -> 140517615507056 - 140509587675888 [label=CatBackward0] - 140517615507200 -> 140509587675888 - 140517615507200 [label=NativeLayerNormBackward0] - 140517615507344 -> 
140517615507200 - 140517615507344 [label=AddBackward0] - 140517615507536 -> 140517615507344 - 140517615507536 [label=NativeDropoutBackward0] - 140517615507680 -> 140517615507536 - 140517615507680 [label=ViewBackward0] - 140517615507776 -> 140517615507680 - 140517615507776 [label=AddmmBackward0] - 140517615507872 -> 140517615507776 - 140517615507872 [label=ToCopyBackward0] - 140517615508064 -> 140517615507872 - 140509590934288 [label="encoder.layer.0.experts.output_query.dense.bias - (768)" fillcolor=lightblue] - 140509590934288 -> 140517615508064 - 140517615508064 [label=AccumulateGrad] - 140517615507824 -> 140517615507776 - 140517615507824 [label=ViewBackward0] - 140517615508112 -> 140517615507824 - 140517615508112 [label=GeluBackward0] - 140517615508208 -> 140517615508112 - 140517615508208 [label=ViewBackward0] - 140517615508304 -> 140517615508208 - 140517615508304 [label=AddmmBackward0] - 140517615508400 -> 140517615508304 - 140517615508400 [label=ToCopyBackward0] - 140517615508592 -> 140517615508400 - 140509590934528 [label="encoder.layer.0.experts.intermediate_query.dense.bias - (3072)" fillcolor=lightblue] - 140509590934528 -> 140517615508592 - 140517615508592 [label=AccumulateGrad] - 140517615508352 -> 140517615508304 - 140517615508352 [label=ViewBackward0] - 140517615508640 -> 140517615508352 - 140517615508640 [label=ToCopyBackward0] - 140517615507488 -> 140517615508640 - 140517615507488 [label=SliceBackward0] - 140517615508784 -> 140517615507488 - 140517615508784 [label=SliceBackward0] - 140517615508880 -> 140517615508784 - 140517615508880 [label=NativeLayerNormBackward0] - 140517615508976 -> 140517615508880 - 140517615508976 [label=AddBackward0] - 140517615509168 -> 140517615508976 - 140517615509168 [label=NativeDropoutBackward0] - 140517615509312 -> 140517615509168 - 140517615509312 [label=ViewBackward0] - 140517615509408 -> 140517615509312 - 140517615509408 [label=AddmmBackward0] - 140517615509456 -> 140517615509408 - 140517615509456 [label=ToCopyBackward0] - 140517615522048 -> 140517615509456 - 140509590936448 [label="encoder.layer.0.crossattention.output.dense.bias - (768)" fillcolor=lightblue] - 140509590936448 -> 140517615522048 - 140517615522048 [label=AccumulateGrad] - 140517615509216 -> 140517615509408 - 140517615509216 [label=ViewBackward0] - 140517615522096 -> 140517615509216 - 140517615522096 [label=ViewBackward0] - 140517615522192 -> 140517615522096 - 140517615522192 [label=CloneBackward0] - 140517615522288 -> 140517615522192 - 140517615522288 [label=PermuteBackward0] - 140517615522384 -> 140517615522288 - 140517615522384 [label=UnsafeViewBackward0] - 140517615522480 -> 140517615522384 - 140517615522480 [label=BmmBackward0] - 140517615522576 -> 140517615522480 - 140517615522576 [label=ReshapeAliasBackward0] - 140517615522720 -> 140517615522576 - 140517615522720 [label=ExpandBackward0] - 140517615522816 -> 140517615522720 - 140517615522816 [label=ToCopyBackward0] - 140517615522912 -> 140517615522816 - 140517615522912 [label=NativeDropoutBackward0] - 140517615523008 -> 140517615522912 - 140517615523008 [label=SoftmaxBackward0] - 140517615523104 -> 140517615523008 - 140517615523104 [label=AddBackward0] - 140517615523200 -> 140517615523104 - 140517615523200 [label=DivBackward0] - 140517615523296 -> 140517615523200 - 140517615523296 [label=UnsafeViewBackward0] - 140517615523392 -> 140517615523296 - 140517615523392 [label=BmmBackward0] - 140517615523488 -> 140517615523392 - 140517615523488 [label=ReshapeAliasBackward0] - 140517615523632 -> 140517615523488 - 140517615523632 
[label=ExpandBackward0] - 140517615523728 -> 140517615523632 - 140517615523728 [label=PermuteBackward0] - 140517615523824 -> 140517615523728 - 140517615523824 [label=ViewBackward0] - 140517615523920 -> 140517615523824 - 140517615523920 [label=ViewBackward0] - 140517615524016 -> 140517615523920 - 140517615524016 [label=AddmmBackward0] - 140517615524112 -> 140517615524016 - 140517615524112 [label=ToCopyBackward0] - 140517615524304 -> 140517615524112 - 140509590937168 [label="encoder.layer.0.crossattention.self.query.bias - (768)" fillcolor=lightblue] - 140509590937168 -> 140517615524304 - 140517615524304 [label=AccumulateGrad] - 140517615524064 -> 140517615524016 - 140517615524064 [label=ViewBackward0] - 140517615524352 -> 140517615524064 - 140517615524352 [label=ToCopyBackward0] - 140517615509120 -> 140517615524352 - 140517615509120 [label=SliceBackward0] - 140517615524496 -> 140517615509120 - 140517615524496 [label=SliceBackward0] - 140517615524592 -> 140517615524496 - 140517615524592 [label=SliceBackward0] - 140517615524688 -> 140517615524592 - 140517615524688 [label=NativeLayerNormBackward0] - 140517615524784 -> 140517615524688 - 140517615524784 [label=AddBackward0] - 140517615524976 -> 140517615524784 - 140517615524976 [label=NativeDropoutBackward0] - 140517615525120 -> 140517615524976 - 140517615525120 [label=ViewBackward0] - 140517615525216 -> 140517615525120 - 140517615525216 [label=AddmmBackward0] - 140517615525312 -> 140517615525216 - 140517615525312 [label=ToCopyBackward0] - 140517615525504 -> 140517615525312 - 140509590945936 [label="encoder.layer.0.attention.output.dense.bias - (768)" fillcolor=lightblue] - 140509590945936 -> 140517615525504 - 140517615525504 [label=AccumulateGrad] - 140517615525264 -> 140517615525216 - 140517615525264 [label=ViewBackward0] - 140517615525552 -> 140517615525264 - 140517615525552 [label=ViewBackward0] - 140517615525648 -> 140517615525552 - 140517615525648 [label=CloneBackward0] - 140517615525744 -> 140517615525648 - 140517615525744 [label=PermuteBackward0] - 140517615525840 -> 140517615525744 - 140517615525840 [label=UnsafeViewBackward0] - 140517615525456 -> 140517615525840 - 140517615525456 [label=BmmBackward0] - 140517615538384 -> 140517615525456 - 140517615538384 [label=ReshapeAliasBackward0] - 140517615538528 -> 140517615538384 - 140517615538528 [label=ExpandBackward0] - 140517615538624 -> 140517615538528 - 140517615538624 [label=ToCopyBackward0] - 140517615538720 -> 140517615538624 - 140517615538720 [label=NativeDropoutBackward0] - 140517615538816 -> 140517615538720 - 140517615538816 [label=SoftmaxBackward0] - 140517615538912 -> 140517615538816 - 140517615538912 [label=AddBackward0] - 140517615539008 -> 140517615538912 - 140517615539008 [label=DivBackward0] - 140517615539104 -> 140517615539008 - 140517615539104 [label=UnsafeViewBackward0] - 140517615539200 -> 140517615539104 - 140517615539200 [label=BmmBackward0] - 140517615539296 -> 140517615539200 - 140517615539296 [label=ReshapeAliasBackward0] - 140517615539440 -> 140517615539296 - 140517615539440 [label=ExpandBackward0] - 140517615539536 -> 140517615539440 - 140517615539536 [label=PermuteBackward0] - 140517615539632 -> 140517615539536 - 140517615539632 [label=ViewBackward0] - 140517615539728 -> 140517615539632 - 140517615539728 [label=ViewBackward0] - 140517615539824 -> 140517615539728 - 140517615539824 [label=AddmmBackward0] - 140517615539920 -> 140517615539824 - 140517615539920 [label=ToCopyBackward0] - 140517615540112 -> 140517615539920 - 140509590600896 
[label="encoder.layer.0.attention.self.query.bias - (768)" fillcolor=lightblue] - 140509590600896 -> 140517615540112 - 140517615540112 [label=AccumulateGrad] - 140517615539872 -> 140517615539824 - 140517615539872 [label=ViewBackward0] - 140517615540160 -> 140517615539872 - 140517615540160 [label=ToCopyBackward0] - 140517615524928 -> 140517615540160 - 140517615524928 [label=NativeDropoutBackward0] - 140517615540304 -> 140517615524928 - 140517615540304 [label=NativeLayerNormBackward0] - 140517615540400 -> 140517615540304 - 140517615540400 [label=CatBackward0] - 140517615540592 -> 140517615540400 - 140517615540592 [label=ExpandBackward0] - 140517615540736 -> 140517615540592 - 140509590947296 [label=" - (1, 32, 768)" fillcolor=lightblue] - 140509590947296 -> 140517615540736 - 140517615540736 [label=AccumulateGrad] - 140517615540544 -> 140517615540400 - 140517615540544 [label=AddBackward0] - 140517615540784 -> 140517615540544 - 140517615540784 [label=EmbeddingBackward0] - 140517615540928 -> 140517615540784 - 140509590947856 [label="embeddings.word_embeddings.weight - (30523, 768)" fillcolor=lightblue] - 140509590947856 -> 140517615540928 - 140517615540928 [label=AccumulateGrad] - 140517615540832 -> 140517615540544 - 140517615540832 [label=EmbeddingBackward0] - 140517615540976 -> 140517615540832 - 140509939919504 [label="embeddings.position_embeddings.weight - (512, 768)" fillcolor=lightblue] - 140509939919504 -> 140517615540976 - 140517615540976 [label=AccumulateGrad] - 140517615540352 -> 140517615540304 - 140509590958304 [label="embeddings.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509590958304 -> 140517615540352 - 140517615540352 [label=AccumulateGrad] - 140517615540016 -> 140517615540304 - 140509590946656 [label="embeddings.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509590946656 -> 140517615540016 - 140517615540016 [label=AccumulateGrad] - 140517615539344 -> 140517615539824 - 140517615539344 [label=TBackward0] - 140517615540064 -> 140517615539344 - 140517615540064 [label=ToCopyBackward0] - 140517615540496 -> 140517615540064 - 140509986890912 [label="encoder.layer.0.attention.self.query.weight - (768, 768)" fillcolor=lightblue] - 140509986890912 -> 140517615540496 - 140517615540496 [label=AccumulateGrad] - 140517615539248 -> 140517615539200 - 140517615539248 [label=ReshapeAliasBackward0] - 140517615539584 -> 140517615539248 - 140517615539584 [label=ExpandBackward0] - 140517615539776 -> 140517615539584 - 140517615539776 [label=TransposeBackward0] - 140517615540256 -> 140517615539776 - 140517615540256 [label=PermuteBackward0] - 140517615541024 -> 140517615540256 - 140517615541024 [label=ViewBackward0] - 140517615540208 -> 140517615541024 - 140517615540208 [label=ViewBackward0] - 140517615540640 -> 140517615540208 - 140517615540640 [label=AddmmBackward0] - 140517615541120 -> 140517615540640 - 140517615541120 [label=ToCopyBackward0] - 140517615541312 -> 140517615541120 - 140509590946096 [label="encoder.layer.0.attention.self.key.bias - (768)" fillcolor=lightblue] - 140509590946096 -> 140517615541312 - 140517615541312 [label=AccumulateGrad] - 140517615540880 -> 140517615540640 - 140517615540880 [label=ViewBackward0] - 140517615541360 -> 140517615540880 - 140517615541360 [label=ToCopyBackward0] - 140517615524928 -> 140517615541360 - 140517615539392 -> 140517615540640 - 140517615539392 [label=TBackward0] - 140517615541216 -> 140517615539392 - 140517615541216 [label=ToCopyBackward0] - 140517615541504 -> 140517615541216 - 140509590600816 
[label="encoder.layer.0.attention.self.key.weight - (768, 768)" fillcolor=lightblue] - 140509590600816 -> 140517615541504 - 140517615541504 [label=AccumulateGrad] - 140517615538336 -> 140517615525456 - 140517615538336 [label=ReshapeAliasBackward0] - 140517615538672 -> 140517615538336 - 140517615538672 [label=ExpandBackward0] - 140517615538864 -> 140517615538672 - 140517615538864 [label=PermuteBackward0] - 140517615539056 -> 140517615538864 - 140517615539056 [label=ViewBackward0] - 140517615538432 -> 140517615539056 - 140517615538432 [label=ViewBackward0] - 140517615539680 -> 140517615538432 - 140517615539680 [label=AddmmBackward0] - 140517615540448 -> 140517615539680 - 140517615540448 [label=ToCopyBackward0] - 140517615541456 -> 140517615540448 - 140509590945856 [label="encoder.layer.0.attention.self.value.bias - (768)" fillcolor=lightblue] - 140509590945856 -> 140517615541456 - 140517615541456 [label=AccumulateGrad] - 140517615539968 -> 140517615539680 - 140517615539968 [label=ViewBackward0] - 140517615541264 -> 140517615539968 - 140517615541264 [label=ToCopyBackward0] - 140517615524928 -> 140517615541264 - 140517615538480 -> 140517615539680 - 140517615538480 [label=TBackward0] - 140517615541072 -> 140517615538480 - 140517615541072 [label=ToCopyBackward0] - 140517615541408 -> 140517615541072 - 140509590946176 [label="encoder.layer.0.attention.self.value.weight - (768, 768)" fillcolor=lightblue] - 140509590946176 -> 140517615541408 - 140517615541408 [label=AccumulateGrad] - 140517615525024 -> 140517615525216 - 140517615525024 [label=TBackward0] - 140517615525696 -> 140517615525024 - 140517615525696 [label=ToCopyBackward0] - 140517615525792 -> 140517615525696 - 140509987117712 [label="encoder.layer.0.attention.output.dense.weight - (768, 768)" fillcolor=lightblue] - 140509987117712 -> 140517615525792 - 140517615525792 [label=AccumulateGrad] - 140517615524928 -> 140517615524784 - 140517615524736 -> 140517615524688 - 140509590937328 [label="encoder.layer.0.attention.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509590937328 -> 140517615524736 - 140517615524736 [label=AccumulateGrad] - 140517615524208 -> 140517615524688 - 140509590937408 [label="encoder.layer.0.attention.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509590937408 -> 140517615524208 - 140517615524208 [label=AccumulateGrad] - 140517615523536 -> 140517615524016 - 140517615523536 [label=TBackward0] - 140517615524256 -> 140517615523536 - 140517615524256 [label=ToCopyBackward0] - 140517615524640 -> 140517615524256 - 140509590937088 [label="encoder.layer.0.crossattention.self.query.weight - (768, 768)" fillcolor=lightblue] - 140509590937088 -> 140517615524640 - 140517615524640 [label=AccumulateGrad] - 140517615523440 -> 140517615523392 - 140517615523440 [label=ReshapeAliasBackward0] - 140517615523776 -> 140517615523440 - 140517615523776 [label=ExpandBackward0] - 140517615523968 -> 140517615523776 - 140517615523968 [label=TransposeBackward0] - 140517615524448 -> 140517615523968 - 140517615524448 [label=PermuteBackward0] - 140517615524880 -> 140517615524448 - 140517615524880 [label=ViewBackward0] - 140517615524400 -> 140517615524880 - 140517615524400 [label=ViewBackward0] - 140517615525168 -> 140517615524400 - 140517615525168 [label=AddmmBackward0] - 140517615525408 -> 140517615525168 - 140517615525408 [label=ToCopyBackward0] - 140517615538288 -> 140517615525408 - 140509590936928 [label="encoder.layer.0.crossattention.self.key.bias - (768)" fillcolor=lightblue] - 140509590936928 -> 140517615538288 - 
140517615538288 [label=AccumulateGrad] - 140517615525360 -> 140517615525168 - 140517615525360 [label=ViewBackward0] - 140517615538768 -> 140517615525360 - 140517615538768 [label=ToCopyBackward0] - 140517615539152 -> 140517615538768 - 140517615539152 [label=NativeLayerNormBackward0] - 140517615540688 -> 140517615539152 - 140509590598736 [label=" - (1408)" fillcolor=lightblue] - 140509590598736 -> 140517615540688 - 140517615540688 [label=AccumulateGrad] - 140517615539488 -> 140517615539152 - 140509590598976 [label=" - (1408)" fillcolor=lightblue] - 140509590598976 -> 140517615539488 - 140517615539488 [label=AccumulateGrad] - 140517615523584 -> 140517615525168 - 140517615523584 [label=TBackward0] - 140517615538240 -> 140517615523584 - 140517615538240 [label=ToCopyBackward0] - 140517615541168 -> 140517615538240 - 140509590936848 [label="encoder.layer.0.crossattention.self.key.weight - (768, 1408)" fillcolor=lightblue] - 140509590936848 -> 140517615541168 - 140517615541168 [label=AccumulateGrad] - 140517615522528 -> 140517615522480 - 140517615522528 [label=ReshapeAliasBackward0] - 140517615522864 -> 140517615522528 - 140517615522864 [label=ExpandBackward0] - 140517615523056 -> 140517615522864 - 140517615523056 [label=PermuteBackward0] - 140517615523248 -> 140517615523056 - 140517615523248 [label=ViewBackward0] - 140517615522624 -> 140517615523248 - 140517615522624 [label=ViewBackward0] - 140517615523872 -> 140517615522624 - 140517615523872 [label=AddmmBackward0] - 140517615524544 -> 140517615523872 - 140517615524544 [label=ToCopyBackward0] - 140517615525600 -> 140517615524544 - 140509590936688 [label="encoder.layer.0.crossattention.self.value.bias - (768)" fillcolor=lightblue] - 140509590936688 -> 140517615525600 - 140517615525600 [label=AccumulateGrad] - 140517615524160 -> 140517615523872 - 140517615524160 [label=ViewBackward0] - 140517615525072 -> 140517615524160 - 140517615525072 [label=ToCopyBackward0] - 140517615539152 -> 140517615525072 - 140517615522672 -> 140517615523872 - 140517615522672 [label=TBackward0] - 140517615538576 -> 140517615522672 - 140517615538576 [label=ToCopyBackward0] - 140517615538960 -> 140517615538576 - 140509590936608 [label="encoder.layer.0.crossattention.self.value.weight - (768, 1408)" fillcolor=lightblue] - 140509590936608 -> 140517615538960 - 140517615538960 [label=AccumulateGrad] - 140517615521856 -> 140517615509408 - 140517615521856 [label=TBackward0] - 140517615522240 -> 140517615521856 - 140517615522240 [label=ToCopyBackward0] - 140517615522432 -> 140517615522240 - 140509590936368 [label="encoder.layer.0.crossattention.output.dense.weight - (768, 768)" fillcolor=lightblue] - 140509590936368 -> 140517615522432 - 140517615522432 [label=AccumulateGrad] - 140517615509120 -> 140517615508976 - 140517615508928 -> 140517615508880 - 140509590936128 [label="encoder.layer.0.crossattention.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509590936128 -> 140517615508928 - 140517615508928 [label=AccumulateGrad] - 140517615508496 -> 140517615508880 - 140509590936208 [label="encoder.layer.0.crossattention.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509590936208 -> 140517615508496 - 140517615508496 [label=AccumulateGrad] - 140517615508016 -> 140517615508304 - 140517615508016 [label=TBackward0] - 140517615508544 -> 140517615508016 - 140517615508544 [label=ToCopyBackward0] - 140517615509024 -> 140517615508544 - 140509590934448 [label="encoder.layer.0.experts.intermediate_query.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509590934448 -> 
140517615509024 - 140517615509024 [label=AccumulateGrad] - 140517615507584 -> 140517615507776 - 140517615507584 [label=TBackward0] - 140517615508256 -> 140517615507584 - 140517615508256 [label=ToCopyBackward0] - 140517615508736 -> 140517615508256 - 140509590934208 [label="encoder.layer.0.experts.output_query.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509590934208 -> 140517615508736 - 140517615508736 [label=AccumulateGrad] - 140517615507488 -> 140517615507344 - 140517615507296 -> 140517615507200 - 140509590933968 [label="encoder.layer.0.experts.output_query.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509590933968 -> 140517615507296 - 140517615507296 [label=AccumulateGrad] - 140517615507248 -> 140517615507200 - 140509590934048 [label="encoder.layer.0.experts.output_query.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509590934048 -> 140517615507248 - 140517615507248 [label=AccumulateGrad] - 140517615506960 -> 140509587675888 - 140517615506960 [label=NativeLayerNormBackward0] - 140517615507632 -> 140517615506960 - 140517615507632 [label=AddBackward0] - 140517615508448 -> 140517615507632 - 140517615508448 [label=NativeDropoutBackward0] - 140517615508160 -> 140517615508448 - 140517615508160 [label=ViewBackward0] - 140517615508688 -> 140517615508160 - 140517615508688 [label=AddmmBackward0] - 140517615509360 -> 140517615508688 - 140517615509360 [label=ToCopyBackward0] - 140517615522000 -> 140517615509360 - 140509590935728 [label="encoder.layer.0.output.dense.bias - (768)" fillcolor=lightblue] - 140509590935728 -> 140517615522000 - 140517615522000 [label=AccumulateGrad] - 140517615509264 -> 140517615508688 - 140517615509264 [label=ViewBackward0] - 140517615522144 -> 140517615509264 - 140517615522144 [label=GeluBackward0] - 140517615523152 -> 140517615522144 - 140517615523152 [label=ViewBackward0] - 140517615523680 -> 140517615523152 - 140517615523680 [label=AddmmBackward0] - 140517615524832 -> 140517615523680 - 140517615524832 [label=ToCopyBackward0] - 140517615541552 -> 140517615524832 - 140509590935968 [label="encoder.layer.0.intermediate.dense.bias - (3072)" fillcolor=lightblue] - 140509590935968 -> 140517615541552 - 140517615541552 [label=AccumulateGrad] - 140517615522768 -> 140517615523680 - 140517615522768 [label=ViewBackward0] - 140517615541792 -> 140517615522768 - 140517615541792 [label=ToCopyBackward0] - 140517615507968 -> 140517615541792 - 140517615507968 [label=SliceBackward0] - 140517615541936 -> 140517615507968 - 140517615541936 [label=SliceBackward0] - 140517615542032 -> 140517615541936 - 140517615542032 [label=SliceBackward0] - 140517615524688 -> 140517615542032 - 140517615541696 -> 140517615523680 - 140517615541696 [label=TBackward0] - 140517615541600 -> 140517615541696 - 140517615541600 [label=ToCopyBackward0] - 140517615542128 -> 140517615541600 - 140509590935888 [label="encoder.layer.0.intermediate.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509590935888 -> 140517615542128 - 140517615542128 [label=AccumulateGrad] - 140517615521904 -> 140517615508688 - 140517615521904 [label=TBackward0] - 140517615523344 -> 140517615521904 - 140517615523344 [label=ToCopyBackward0] - 140517615522960 -> 140517615523344 - 140509590935648 [label="encoder.layer.0.output.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509590935648 -> 140517615522960 - 140517615522960 [label=AccumulateGrad] - 140517615507968 -> 140517615507632 - 140517615507440 -> 140517615506960 - 140509590935408 [label="encoder.layer.0.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 
140509590935408 -> 140517615507440 - 140517615507440 [label=AccumulateGrad] - 140517615507392 -> 140517615506960 - 140509590935488 [label="encoder.layer.0.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509590935488 -> 140517615507392 - 140517615507392 [label=AccumulateGrad] - 140517615506240 -> 140517615506720 - 140517615506240 [label=TBackward0] - 140517615506912 -> 140517615506240 - 140517615506912 [label=ToCopyBackward0] - 140517615507920 -> 140517615506912 - 140509590933728 [label="encoder.layer.1.attention.self.query.weight - (768, 768)" fillcolor=lightblue] - 140509590933728 -> 140517615507920 - 140517615507920 [label=AccumulateGrad] - 140517615506144 -> 140517615506096 - 140517615506144 [label=ReshapeAliasBackward0] - 140517615506480 -> 140517615506144 - 140517615506480 [label=ExpandBackward0] - 140517615506672 -> 140517615506480 - 140517615506672 [label=TransposeBackward0] - 140517615507152 -> 140517615506672 - 140517615507152 [label=PermuteBackward0] - 140517615509072 -> 140517615507152 - 140517615509072 [label=ViewBackward0] - 140517615507104 -> 140517615509072 - 140517615507104 [label=ViewBackward0] - 140517615522336 -> 140517615507104 - 140517615522336 [label=AddmmBackward0] - 140517615506288 -> 140517615522336 - 140517615506288 [label=ToCopyBackward0] - 140517615541840 -> 140517615506288 - 140509590917008 [label="encoder.layer.1.attention.self.key.bias - (768)" fillcolor=lightblue] - 140509590917008 -> 140517615541840 - 140517615541840 [label=AccumulateGrad] - 140517615541744 -> 140517615522336 - 140517615541744 [label=ViewBackward0] - 140517615542176 -> 140517615541744 - 140517615542176 [label=ToCopyBackward0] - 140509587675888 -> 140517615542176 - 140517615541888 -> 140517615522336 - 140517615541888 [label=TBackward0] - 140517615542080 -> 140517615541888 - 140517615542080 [label=ToCopyBackward0] - 140517615542224 -> 140517615542080 - 140509590933568 [label="encoder.layer.1.attention.self.key.weight - (768, 768)" fillcolor=lightblue] - 140509590933568 -> 140517615542224 - 140517615542224 [label=AccumulateGrad] - 140509587676944 -> 140509587676896 - 140509587676944 [label=ReshapeAliasBackward0] - 140509587677088 -> 140509587676944 - 140509587677088 [label=ExpandBackward0] - 140517615505760 -> 140509587677088 - 140517615505760 [label=PermuteBackward0] - 140517615505952 -> 140517615505760 - 140517615505952 [label=ViewBackward0] - 140517615505472 -> 140517615505952 - 140517615505472 [label=ViewBackward0] - 140517615506576 -> 140517615505472 - 140517615506576 [label=AddmmBackward0] - 140517615507728 -> 140517615506576 - 140517615507728 [label=ToCopyBackward0] - 140517615541648 -> 140517615507728 - 140509590916848 [label="encoder.layer.1.attention.self.value.bias - (768)" fillcolor=lightblue] - 140509590916848 -> 140517615541648 - 140517615541648 [label=AccumulateGrad] - 140517615506864 -> 140517615506576 - 140517615506864 [label=ViewBackward0] - 140517615521952 -> 140517615506864 - 140517615521952 [label=ToCopyBackward0] - 140509587675888 -> 140517615521952 - 140517615505520 -> 140517615506576 - 140517615505520 [label=TBackward0] - 140517615541984 -> 140517615505520 - 140517615541984 [label=ToCopyBackward0] - 140517615591632 -> 140517615541984 - 140509590916768 [label="encoder.layer.1.attention.self.value.weight - (768, 768)" fillcolor=lightblue] - 140509590916768 -> 140517615591632 - 140517615591632 [label=AccumulateGrad] - 140509587675984 -> 140509587676176 - 140509587675984 [label=TBackward0] - 140509587676656 -> 140509587675984 - 140509587676656 
[label=ToCopyBackward0] - 140509587676848 -> 140509587676656 - 140509590916528 [label="encoder.layer.1.attention.output.dense.weight - (768, 768)" fillcolor=lightblue] - 140509590916528 -> 140509587676848 - 140509587676848 [label=AccumulateGrad] - 140509587675888 -> 140509587675744 - 140509587675696 -> 140509587675648 - 140509590916288 [label="encoder.layer.1.attention.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509590916288 -> 140509587675696 - 140509587675696 [label=AccumulateGrad] - 140509587674976 -> 140509587675648 - 140509590916368 [label="encoder.layer.1.attention.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509590916368 -> 140509587674976 - 140509587674976 [label=AccumulateGrad] - 140509587674496 -> 140509587674784 - 140509587674496 [label=TBackward0] - 140509587675024 -> 140509587674496 - 140509587675024 [label=ToCopyBackward0] - 140509587675408 -> 140509587675024 - 140509590914608 [label="encoder.layer.1.experts.intermediate_query.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509590914608 -> 140509587675408 - 140509587675408 [label=AccumulateGrad] - 140509587674064 -> 140509587674256 - 140509587674064 [label=TBackward0] - 140509587674736 -> 140509587674064 - 140509587674736 [label=ToCopyBackward0] - 140509587675216 -> 140509587674736 - 140509590914368 [label="encoder.layer.1.experts.output_query.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509590914368 -> 140509587675216 - 140509587675216 [label=AccumulateGrad] - 140509587673968 -> 140509587673824 - 140509587673776 -> 140509587673680 - 140509590914128 [label="encoder.layer.1.experts.output_query.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509590914128 -> 140509587673776 - 140509587673776 [label=AccumulateGrad] - 140509587673728 -> 140509587673680 - 140509590914208 [label="encoder.layer.1.experts.output_query.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509590914208 -> 140509587673728 - 140509587673728 [label=AccumulateGrad] - 140509587673440 -> 140509587658304 - 140509587673440 [label=NativeLayerNormBackward0] - 140509587674112 -> 140509587673440 - 140509587674112 [label=AddBackward0] - 140509587674928 -> 140509587674112 - 140509587674928 [label=NativeDropoutBackward0] - 140509587674640 -> 140509587674928 - 140509587674640 [label=ViewBackward0] - 140509587675168 -> 140509587674640 - 140509587675168 [label=AddmmBackward0] - 140509587675840 -> 140509587675168 - 140509587675840 [label=ToCopyBackward0] - 140509587676368 -> 140509587675840 - 140509590915888 [label="encoder.layer.1.output.dense.bias - (768)" fillcolor=lightblue] - 140509590915888 -> 140509587676368 - 140509587676368 [label=AccumulateGrad] - 140509587675792 -> 140509587675168 - 140509587675792 [label=ViewBackward0] - 140509587676752 -> 140509587675792 - 140509587676752 [label=GeluBackward0] - 140509587676560 -> 140509587676752 - 140509587676560 [label=ViewBackward0] - 140509587676320 -> 140509587676560 - 140509587676320 [label=AddmmBackward0] - 140517615506048 -> 140509587676320 - 140517615506048 [label=ToCopyBackward0] - 140517615508832 -> 140517615506048 - 140509590916128 [label="encoder.layer.1.intermediate.dense.bias - (3072)" fillcolor=lightblue] - 140509590916128 -> 140517615508832 - 140517615508832 [label=AccumulateGrad] - 140517615505856 -> 140509587676320 - 140517615505856 [label=ViewBackward0] - 140517615591728 -> 140517615505856 - 140517615591728 [label=ToCopyBackward0] - 140509587674448 -> 140517615591728 - 140509587674448 [label=SliceBackward0] - 140517615591776 -> 140509587674448 - 140517615591776 
[label=SliceBackward0] - 140517615591872 -> 140517615591776 - 140517615591872 [label=SliceBackward0] - 140509587675648 -> 140517615591872 - 140517615505568 -> 140509587676320 - 140517615505568 [label=TBackward0] - 140517615591536 -> 140517615505568 - 140517615591536 [label=ToCopyBackward0] - 140517615591968 -> 140517615591536 - 140509590916048 [label="encoder.layer.1.intermediate.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509590916048 -> 140517615591968 - 140517615591968 [label=AccumulateGrad] - 140509587675600 -> 140509587675168 - 140509587675600 [label=TBackward0] - 140509587676128 -> 140509587675600 - 140509587676128 [label=ToCopyBackward0] - 140517615506384 -> 140509587676128 - 140509590915808 [label="encoder.layer.1.output.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509590915808 -> 140517615506384 - 140517615506384 [label=AccumulateGrad] - 140509587674448 -> 140509587674112 - 140509587673920 -> 140509587673440 - 140509590915568 [label="encoder.layer.1.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509590915568 -> 140509587673920 - 140509587673920 [label=AccumulateGrad] - 140509587673872 -> 140509587673440 - 140509590915648 [label="encoder.layer.1.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509590915648 -> 140509587673872 - 140509587673872 [label=AccumulateGrad] - 140509587673152 -> 140509587660368 - 140509587673152 [label=TBackward0] - 140509587673392 -> 140509587673152 - 140509587673392 [label=ToCopyBackward0] - 140509587674400 -> 140509587673392 - 140509590913888 [label="encoder.layer.2.attention.self.query.weight - (768, 768)" fillcolor=lightblue] - 140509590913888 -> 140509587674400 - 140509587674400 [label=AccumulateGrad] - 140509587660272 -> 140509587660224 - 140509587660272 [label=ReshapeAliasBackward0] - 140509587660608 -> 140509587660272 - 140509587660608 [label=ExpandBackward0] - 140509587660704 -> 140509587660608 - 140509587660704 [label=TransposeBackward0] - 140509587673632 -> 140509587660704 - 140509587673632 [label=PermuteBackward0] - 140509587675504 -> 140509587673632 - 140509587675504 [label=ViewBackward0] - 140509587673584 -> 140509587675504 - 140509587673584 [label=ViewBackward0] - 140509587676416 -> 140509587673584 - 140509587676416 [label=AddmmBackward0] - 140517615505664 -> 140509587676416 - 140517615505664 [label=ToCopyBackward0] - 140517615591680 -> 140517615505664 - 140509590913728 [label="encoder.layer.2.attention.self.key.bias - (768)" fillcolor=lightblue] - 140509590913728 -> 140517615591680 - 140517615591680 [label=AccumulateGrad] - 140509587673200 -> 140509587676416 - 140509587673200 [label=ViewBackward0] - 140517615592016 -> 140509587673200 - 140517615592016 [label=ToCopyBackward0] - 140509587658304 -> 140517615592016 - 140517615591488 -> 140509587676416 - 140517615591488 [label=TBackward0] - 140517615591584 -> 140517615591488 - 140517615591584 [label=ToCopyBackward0] - 140517615592160 -> 140517615591584 - 140509590913648 [label="encoder.layer.2.attention.self.key.weight - (768, 768)" fillcolor=lightblue] - 140509590913648 -> 140517615592160 - 140517615592160 [label=AccumulateGrad] - 140509587659360 -> 140509587659312 - 140509587659360 [label=ReshapeAliasBackward0] - 140509587659696 -> 140509587659360 - 140509587659696 [label=ExpandBackward0] - 140509587659888 -> 140509587659696 - 140509587659888 [label=PermuteBackward0] - 140509587660080 -> 140509587659888 - 140509587660080 [label=ViewBackward0] - 140509587659456 -> 140509587660080 - 140509587659456 [label=ViewBackward0] - 140509587660416 -> 140509587659456 - 
140509587660416 [label=AddmmBackward0] - 140509587659504 -> 140509587660416 - 140509587659504 [label=ToCopyBackward0] - 140509587676032 -> 140509587659504 - 140509590913488 [label="encoder.layer.2.attention.self.value.bias - (768)" fillcolor=lightblue] - 140509590913488 -> 140509587676032 - 140509587676032 [label=AccumulateGrad] - 140509587674208 -> 140509587660416 - 140509587674208 [label=ViewBackward0] - 140517615591920 -> 140509587674208 - 140517615591920 [label=ToCopyBackward0] - 140509587658304 -> 140517615591920 - 140509587673344 -> 140509587660416 - 140509587673344 [label=TBackward0] - 140517615591824 -> 140509587673344 - 140517615591824 [label=ToCopyBackward0] - 140517615592064 -> 140517615591824 - 140509590913408 [label="encoder.layer.2.attention.self.value.weight - (768, 768)" fillcolor=lightblue] - 140509590913408 -> 140517615592064 - 140517615592064 [label=AccumulateGrad] - 140509587658400 -> 140509587658592 - 140509587658400 [label=TBackward0] - 140509587659072 -> 140509587658400 - 140509587659072 [label=ToCopyBackward0] - 140509587659264 -> 140509587659072 - 140509590913168 [label="encoder.layer.2.attention.output.dense.weight - (768, 768)" fillcolor=lightblue] - 140509590913168 -> 140509587659264 - 140509587659264 [label=AccumulateGrad] - 140509587658304 -> 140509587658160 - 140509587658112 -> 140509587658064 - 140509590904640 [label="encoder.layer.2.attention.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509590904640 -> 140509587658112 - 140509587658112 [label=AccumulateGrad] - 140509587657584 -> 140509587658064 - 140509590904720 [label="encoder.layer.2.attention.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509590904720 -> 140509587657584 - 140509587657584 [label=AccumulateGrad] - 140509587656912 -> 140509587657392 - 140509587656912 [label=TBackward0] - 140509587657632 -> 140509587656912 - 140509587657632 [label=ToCopyBackward0] - 140509587658016 -> 140509587657632 - 140509590904400 [label="encoder.layer.2.crossattention.self.query.weight - (768, 768)" fillcolor=lightblue] - 140509590904400 -> 140509587658016 - 140509587658016 [label=AccumulateGrad] - 140509587656816 -> 140509587644368 - 140509587656816 [label=ReshapeAliasBackward0] - 140509587657152 -> 140509587656816 - 140509587657152 [label=ExpandBackward0] - 140509587657344 -> 140509587657152 - 140509587657344 [label=TransposeBackward0] - 140509587657824 -> 140509587657344 - 140509587657824 [label=PermuteBackward0] - 140509587658256 -> 140509587657824 - 140509587658256 [label=ViewBackward0] - 140509587657776 -> 140509587658256 - 140509587657776 [label=ViewBackward0] - 140509587658544 -> 140509587657776 - 140509587658544 [label=AddmmBackward0] - 140509587658784 -> 140509587658544 - 140509587658784 [label=ToCopyBackward0] - 140509587658976 -> 140509587658784 - 140509590904240 [label="encoder.layer.2.crossattention.self.key.bias - (768)" fillcolor=lightblue] - 140509590904240 -> 140509587658976 - 140509587658976 [label=AccumulateGrad] - 140509587658736 -> 140509587658544 - 140509587658736 [label=ViewBackward0] - 140509587659792 -> 140509587658736 - 140509587659792 [label=ToCopyBackward0] - 140517615539152 -> 140509587659792 - 140509587656960 -> 140509587658544 - 140509587656960 [label=TBackward0] - 140509587659600 -> 140509587656960 - 140509587659600 [label=ToCopyBackward0] - 140509587660512 -> 140509587659600 - 140509590904160 [label="encoder.layer.2.crossattention.self.key.weight - (768, 1408)" fillcolor=lightblue] - 140509590904160 -> 140509587660512 - 140509587660512 [label=AccumulateGrad] - 
140509587643552 -> 140509587643504 - 140509587643552 [label=ReshapeAliasBackward0] - 140509587643888 -> 140509587643552 - 140509587643888 [label=ExpandBackward0] - 140509587644080 -> 140509587643888 - 140509587644080 [label=PermuteBackward0] - 140509587644272 -> 140509587644080 - 140509587644272 [label=ViewBackward0] - 140509587675312 -> 140509587644272 - 140509587675312 [label=ViewBackward0] - 140509587643696 -> 140509587675312 - 140509587643696 [label=AddmmBackward0] - 140509587657536 -> 140509587643696 - 140509587657536 [label=ToCopyBackward0] - 140509587659168 -> 140509587657536 - 140509590904000 [label="encoder.layer.2.crossattention.self.value.bias - (768)" fillcolor=lightblue] - 140509590904000 -> 140509587659168 - 140509587659168 [label=AccumulateGrad] - 140509587657248 -> 140509587643696 - 140509587657248 [label=ViewBackward0] - 140509587660176 -> 140509587657248 - 140509587660176 [label=ToCopyBackward0] - 140517615539152 -> 140509587660176 - 140509587656768 -> 140509587643696 - 140509587656768 [label=TBackward0] - 140509587658208 -> 140509587656768 - 140509587658208 [label=ToCopyBackward0] - 140509587658448 -> 140509587658208 - 140509590903920 [label="encoder.layer.2.crossattention.self.value.weight - (768, 1408)" fillcolor=lightblue] - 140509590903920 -> 140509587658448 - 140509587658448 [label=AccumulateGrad] - 140509587642592 -> 140509587642784 - 140509587642592 [label=TBackward0] - 140509587643264 -> 140509587642592 - 140509587643264 [label=ToCopyBackward0] - 140509587643456 -> 140509587643264 - 140509590903680 [label="encoder.layer.2.crossattention.output.dense.weight - (768, 768)" fillcolor=lightblue] - 140509590903680 -> 140509587643456 - 140509587643456 [label=AccumulateGrad] - 140509587642496 -> 140509587642352 - 140509587642304 -> 140509587642256 - 140509590903440 [label="encoder.layer.2.crossattention.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509590903440 -> 140509587642304 - 140509587642304 [label=AccumulateGrad] - 140509587641872 -> 140509587642256 - 140509590903520 [label="encoder.layer.2.crossattention.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509590903520 -> 140509587641872 - 140509587641872 [label=AccumulateGrad] - 140509587641392 -> 140509587641680 - 140509587641392 [label=TBackward0] - 140509587641920 -> 140509587641392 - 140509587641920 [label=ToCopyBackward0] - 140509587642400 -> 140509587641920 - 140509590901760 [label="encoder.layer.2.experts.intermediate_query.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509590901760 -> 140509587642400 - 140509587642400 [label=AccumulateGrad] - 140509587640960 -> 140509587641152 - 140509587640960 [label=TBackward0] - 140509587641632 -> 140509587640960 - 140509587641632 [label=ToCopyBackward0] - 140509587642112 -> 140509587641632 - 140509590901520 [label="encoder.layer.2.experts.output_query.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509590901520 -> 140509587642112 - 140509587642112 [label=AccumulateGrad] - 140509587640864 -> 140509587640720 - 140509587640672 -> 140509587640576 - 140509590901280 [label="encoder.layer.2.experts.output_query.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509590901280 -> 140509587640672 - 140509587640672 [label=AccumulateGrad] - 140509587640624 -> 140509587640576 - 140509590901360 [label="encoder.layer.2.experts.output_query.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509590901360 -> 140509587640624 - 140509587640624 [label=AccumulateGrad] - 140509587640480 -> 140509587625200 - 140509587640480 [label=NativeLayerNormBackward0] - 
[Graphviz DOT source (torchviz-style autograd graph) of the Q-Former MoE backward pass, covering encoder layers 2 through 8. Leaf parameters appear as lightblue boxes labeled with name and shape (e.g. "encoder.layer.2.output.dense.bias - (768)"), each feeding an AccumulateGrad node; intermediate nodes are grad_fn types (AddmmBackward0, ViewBackward0, GeluBackward0, NativeLayerNormBackward0, NativeDropoutBackward0, SoftmaxBackward0, ...). Per layer the graph includes the self-attention query/key/value weights (768, 768) and biases (768), the attention output dense + LayerNorm, the feed-forward intermediate/output dense of shapes (3072, 768) / (768, 3072) + LayerNorm, and the query-expert branches (experts.intermediate_query / experts.output_query dense + LayerNorm). The cross-attention layers (4, 6, 8) add key/value projections of shape (768, 1408) over the visual features. Layers 6 through 8 additionally hold per-expert modules (experts.experts.0 / experts.experts.1), and layers 6 and 7 show a two-way gate weight of shape (2, 768) whose Mm/Softmax/Div/Sum/Mul backward nodes implement the expert routing.]
-> 140509588045936 - 140509588047472 [label=ToCopyBackward0] - 140509588047520 -> 140509588047472 - 140509591320512 [label="encoder.layer.8.experts.experts.1.output_query.dense.bias - (768)" fillcolor=lightblue] - 140509591320512 -> 140509588047520 - 140509588047520 [label=AccumulateGrad] - 140509588047040 -> 140509588045936 - 140509588047040 [label=ViewBackward0] - 140509588048048 -> 140509588047040 - 140509588048048 [label=GeluBackward0] - 140509588049440 -> 140509588048048 - 140509588049440 [label=ViewBackward0] - 140509588048912 -> 140509588049440 - 140509588048912 [label=AddmmBackward0] - 140509588077680 -> 140509588048912 - 140509588077680 [label=ToCopyBackward0] - 140517615729152 -> 140509588077680 - 140509591319472 [label="encoder.layer.8.experts.experts.1.intermediate_query.dense.bias - (3072)" fillcolor=lightblue] - 140509591319472 -> 140517615729152 - 140517615729152 [label=AccumulateGrad] - 140509588075856 -> 140509588048912 - 140509588075856 [label=ViewBackward0] - 140517615268000 -> 140509588075856 - 140517615268000 [label=ToCopyBackward0] - 140509588023376 -> 140517615268000 - 140509588074560 -> 140509588048912 - 140509588074560 [label=TBackward0] - 140517615267904 -> 140509588074560 - 140517615267904 [label=ToCopyBackward0] - 140517615268144 -> 140517615267904 - 140509591319552 [label="encoder.layer.8.experts.experts.1.intermediate_query.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509591319552 -> 140517615268144 - 140517615268144 [label=AccumulateGrad] - 140509588046992 -> 140509588045936 - 140509588046992 [label=TBackward0] - 140509588075184 -> 140509588046992 - 140509588075184 [label=ToCopyBackward0] - 140517615730304 -> 140509588075184 - 140509591319312 [label="encoder.layer.8.experts.experts.1.output_query.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509591319312 -> 140517615730304 - 140517615730304 [label=AccumulateGrad] - 140509588023376 -> 140509588023952 - 140509588023760 -> 140509588023472 - 140509591319072 [label="encoder.layer.8.experts.experts.1.output_query.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591319072 -> 140509588023760 - 140509588023760 [label=AccumulateGrad] - 140509588022896 -> 140509588023472 - 140509591318992 [label="encoder.layer.8.experts.experts.1.output_query.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591318992 -> 140509588022896 - 140509588022896 [label=AccumulateGrad] - 140509588022416 -> 140509588022512 - 140509588022416 [label=UnsqueezeBackward0] - 140509588024240 -> 140509588022416 - 140509588024240 [label=UnsqueezeBackward0] - 140509588024096 -> 140509588024240 - 140509588024096 [label=MulBackward0] - 140509588047664 -> 140509588024096 - 140509588047664 [label=SoftmaxBackward0] - 140509588049200 -> 140509588047664 - 140509588049200 [label=MmBackward0] - 140509588046080 -> 140509588049200 - 140509588046080 [label=ToCopyBackward0] - 140517615268048 -> 140509588046080 - 140517615268048 [label=DivBackward0] - 140517615268336 -> 140517615268048 - 140517615268336 [label=SumBackward1] - 140517615268432 -> 140517615268336 - 140517615268432 [label=MulBackward0] - 140509588023376 -> 140517615268432 - 140517615267952 -> 140509588049200 - 140517615267952 [label=TBackward0] - 140517615268384 -> 140517615267952 - 140517615268384 [label=ToCopyBackward0] - 140517615268480 -> 140517615268384 - 140509591321392 [label="encoder.layer.8.experts.gate.weight - (2, 768)" fillcolor=lightblue] - 140509591321392 -> 140517615268480 - 140517615268480 [label=AccumulateGrad] - 140509588021840 -> 140509587963664 - 140509588021840 
[label=IndexBackward0] - 140509588023136 -> 140509588021840 - 140509588023136 [label=NativeLayerNormBackward0] - 140509588022608 -> 140509588023136 - 140509588022608 [label=AddBackward0] - 140517615268528 -> 140509588022608 - 140517615268528 [label=NativeDropoutBackward0] - 140517615268192 -> 140517615268528 - 140517615268192 [label=ViewBackward0] - 140517615268672 -> 140517615268192 - 140517615268672 [label=AddmmBackward0] - 140517615268768 -> 140517615268672 - 140517615268768 [label=ToCopyBackward0] - 140517615268960 -> 140517615268768 - 140509591339792 [label="encoder.layer.8.output.dense.bias - (768)" fillcolor=lightblue] - 140509591339792 -> 140517615268960 - 140517615268960 [label=AccumulateGrad] - 140517615268720 -> 140517615268672 - 140517615268720 [label=ViewBackward0] - 140517615269008 -> 140517615268720 - 140517615269008 [label=GeluBackward0] - 140517615269104 -> 140517615269008 - 140517615269104 [label=ViewBackward0] - 140517615269200 -> 140517615269104 - 140517615269200 [label=AddmmBackward0] - 140517615269296 -> 140517615269200 - 140517615269296 [label=ToCopyBackward0] - 140517615269488 -> 140517615269296 - 140509591340032 [label="encoder.layer.8.intermediate.dense.bias - (3072)" fillcolor=lightblue] - 140509591340032 -> 140517615269488 - 140517615269488 [label=AccumulateGrad] - 140517615269248 -> 140517615269200 - 140517615269248 [label=ViewBackward0] - 140517615269536 -> 140517615269248 - 140517615269536 [label=ToCopyBackward0] - 140517615268288 -> 140517615269536 - 140517615268288 [label=SliceBackward0] - 140517615269680 -> 140517615268288 - 140517615269680 [label=SliceBackward0] - 140517615269776 -> 140517615269680 - 140517615269776 [label=SliceBackward0] - 140509588076912 -> 140517615269776 - 140517615268912 -> 140517615269200 - 140517615268912 [label=TBackward0] - 140517615269440 -> 140517615268912 - 140517615269440 [label=ToCopyBackward0] - 140517615269872 -> 140517615269440 - 140509591340352 [label="encoder.layer.8.intermediate.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509591340352 -> 140517615269872 - 140517615269872 [label=AccumulateGrad] - 140517615268576 -> 140517615268672 - 140517615268576 [label=TBackward0] - 140517615269152 -> 140517615268576 - 140517615269152 [label=ToCopyBackward0] - 140517615269632 -> 140517615269152 - 140509591340112 [label="encoder.layer.8.output.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509591340112 -> 140517615269632 - 140517615269632 [label=AccumulateGrad] - 140517615268288 -> 140509588022608 - 140509588022176 -> 140509588023136 - 140509591339872 [label="encoder.layer.8.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591339872 -> 140509588022176 - 140509588022176 [label=AccumulateGrad] - 140509588046560 -> 140509588023136 - 140509591339552 [label="encoder.layer.8.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591339552 -> 140509588046560 - 140509588046560 [label=AccumulateGrad] - 140509588021312 -> 140509587991520 - 140509588021312 [label=TBackward0] - 140509588021648 -> 140509588021312 - 140509588021648 [label=ToCopyBackward0] - 140509588048336 -> 140509588021648 - 140509591321632 [label="encoder.layer.9.attention.self.query.weight - (768, 768)" fillcolor=lightblue] - 140509591321632 -> 140509588048336 - 140509588048336 [label=AccumulateGrad] - 140509587991472 -> 140509587991568 - 140509587991472 [label=UnsafeViewBackward0] - 140509587992144 -> 140509587991472 - 140509587992144 [label=CloneBackward0] - 140509587992528 -> 140509587992144 - 140509587992528 [label=ExpandBackward0] - 
140509587991856 -> 140509587992528 - 140509587991856 [label=TransposeBackward0] - 140509588022320 -> 140509587991856 - 140509588022320 [label=PermuteBackward0] - 140509588021936 -> 140509588022320 - 140509588021936 [label=ViewBackward0] - 140517615268624 -> 140509588021936 - 140517615268624 [label=ViewBackward0] - 140517615268864 -> 140517615268624 - 140517615268864 [label=AddmmBackward0] - 140517615269392 -> 140517615268864 - 140517615269392 [label=ToCopyBackward0] - 140517615269584 -> 140517615269392 - 140509591322192 [label="encoder.layer.9.attention.self.key.bias - (768)" fillcolor=lightblue] - 140509591322192 -> 140517615269584 - 140517615269584 [label=AccumulateGrad] - 140517615269344 -> 140517615268864 - 140517615269344 [label=ViewBackward0] - 140517615269920 -> 140517615269344 - 140517615269920 [label=ToCopyBackward0] - 140509587963664 -> 140517615269920 - 140517615268096 -> 140517615268864 - 140517615268096 [label=TBackward0] - 140517615269056 -> 140517615268096 - 140517615269056 [label=ToCopyBackward0] - 140517615270064 -> 140517615269056 - 140509591321872 [label="encoder.layer.9.attention.self.key.weight - (768, 768)" fillcolor=lightblue] - 140509591321872 -> 140517615270064 - 140517615270064 [label=AccumulateGrad] - 140509587990128 -> 140509587989840 - 140509587990128 [label=UnsafeViewBackward0] - 140509587990512 -> 140509587990128 - 140509587990512 [label=CloneBackward0] - 140509587990800 -> 140509587990512 - 140509587990800 [label=ExpandBackward0] - 140509587991040 -> 140509587990800 - 140509587991040 [label=PermuteBackward0] - 140509587990224 -> 140509587991040 - 140509587990224 [label=ViewBackward0] - 140509587992336 -> 140509587990224 - 140509587992336 [label=ViewBackward0] - 140509587990080 -> 140509587992336 - 140509587990080 [label=AddmmBackward0] - 140509588021360 -> 140509587990080 - 140509588021360 [label=ToCopyBackward0] - 140517615269824 -> 140509588021360 - 140509591322432 [label="encoder.layer.9.attention.self.value.bias - (768)" fillcolor=lightblue] - 140509591322432 -> 140517615269824 - 140517615269824 [label=AccumulateGrad] - 140509588021744 -> 140509587990080 - 140509588021744 [label=ViewBackward0] - 140517615270160 -> 140509588021744 - 140517615270160 [label=ToCopyBackward0] - 140509587963664 -> 140517615270160 - 140517615268240 -> 140509587990080 - 140517615268240 [label=TBackward0] - 140517615269728 -> 140517615268240 - 140517615269728 [label=ToCopyBackward0] - 140517615270208 -> 140517615269728 - 140509591322112 [label="encoder.layer.9.attention.self.value.weight - (768, 768)" fillcolor=lightblue] - 140509591322112 -> 140517615270208 - 140517615270208 [label=AccumulateGrad] - 140509587988688 -> 140509587988784 - 140509587988688 [label=TBackward0] - 140509587989648 -> 140509587988688 - 140509587989648 [label=ToCopyBackward0] - 140509587989936 -> 140509587989648 - 140509591321712 [label="encoder.layer.9.attention.output.dense.weight - (768, 768)" fillcolor=lightblue] - 140509591321712 -> 140509587989936 - 140509587989936 [label=AccumulateGrad] - 140509587963664 -> 140509587963280 - 140509587963376 -> 140509587963040 - 140509591321232 [label="encoder.layer.9.attention.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591321232 -> 140509587963376 - 140509587963376 [label=AccumulateGrad] - 140509587962032 -> 140509587963040 - 140509591321472 [label="encoder.layer.9.attention.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591321472 -> 140509587962032 - 140509587962032 [label=AccumulateGrad] - 140509587961120 -> 140509587961600 - 
140509587961120 [label=TBackward0] - 140509587962224 -> 140509587961120 - 140509587962224 [label=ToCopyBackward0] - 140509587962896 -> 140509587962224 - 140509591311760 [label="encoder.layer.9.experts.experts.0.intermediate_query.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509591311760 -> 140509587962896 - 140509587962896 [label=AccumulateGrad] - 140509587960688 -> 140509587960976 - 140509587960688 [label=TBackward0] - 140509587961744 -> 140509587960688 - 140509587961744 [label=ToCopyBackward0] - 140509587962608 -> 140509587961744 - 140509591311440 [label="encoder.layer.9.experts.experts.0.output_query.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509591311440 -> 140509587962608 - 140509587962608 [label=AccumulateGrad] - 140509587960496 -> 140509587960112 - 140509587960208 -> 140509588463424 - 140509591311200 [label="encoder.layer.9.experts.experts.0.output_query.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591311200 -> 140509587960208 - 140509587960208 [label=AccumulateGrad] - 140509587960016 -> 140509588463424 - 140509591311520 [label="encoder.layer.9.experts.experts.0.output_query.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591311520 -> 140509587960016 - 140509587960016 [label=AccumulateGrad] - 140509588463376 -> 140509588463184 - 140509588463376 [label=UnsqueezeBackward0] - 140509587960160 -> 140509588463376 - 140509587960160 [label=NativeLayerNormBackward0] - 140509587960640 -> 140509587960160 - 140509587960640 [label=AddBackward0] - 140509587963184 -> 140509587960640 - 140509587963184 [label=NativeDropoutBackward0] - 140509587961648 -> 140509587963184 - 140509587961648 [label=ViewBackward0] - 140509587962320 -> 140509587961648 - 140509587962320 [label=AddmmBackward0] - 140509587963472 -> 140509587962320 - 140509587963472 [label=ToCopyBackward0] - 140509587989168 -> 140509587963472 - 140509591311920 [label="encoder.layer.9.experts.experts.1.output_query.dense.bias - (768)" fillcolor=lightblue] - 140509591311920 -> 140509587989168 - 140509587989168 [label=AccumulateGrad] - 140509587963568 -> 140509587962320 - 140509587963568 [label=ViewBackward0] - 140509587989744 -> 140509587963568 - 140509587989744 [label=GeluBackward0] - 140509587989072 -> 140509587989744 - 140509587989072 [label=ViewBackward0] - 140509587990560 -> 140509587989072 - 140509587990560 [label=AddmmBackward0] - 140509587991280 -> 140509587990560 - 140509587991280 [label=ToCopyBackward0] - 140509588022224 -> 140509587991280 - 140509591311280 [label="encoder.layer.9.experts.experts.1.intermediate_query.dense.bias - (3072)" fillcolor=lightblue] - 140509591311280 -> 140509588022224 - 140509588022224 [label=AccumulateGrad] - 140509587990992 -> 140509587990560 - 140509587990992 [label=ViewBackward0] - 140517615270016 -> 140509587990992 - 140517615270016 [label=ToCopyBackward0] - 140509587960496 -> 140517615270016 - 140509587988880 -> 140509587990560 - 140509587988880 [label=TBackward0] - 140517615268816 -> 140509587988880 - 140517615268816 [label=ToCopyBackward0] - 140517615270112 -> 140517615268816 - 140509591310960 [label="encoder.layer.9.experts.experts.1.intermediate_query.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509591310960 -> 140517615270112 - 140517615270112 [label=AccumulateGrad] - 140509587961264 -> 140509587962320 - 140509587961264 [label=TBackward0] - 140509587989456 -> 140509587961264 - 140509587989456 [label=ToCopyBackward0] - 140509587992048 -> 140509587989456 - 140509591310720 [label="encoder.layer.9.experts.experts.1.output_query.dense.weight - (768, 3072)" 
fillcolor=lightblue] - 140509591310720 -> 140509587992048 - 140509587992048 [label=AccumulateGrad] - 140509587960496 -> 140509587960640 - 140509587960592 -> 140509587960160 - 140509591310480 [label="encoder.layer.9.experts.experts.1.output_query.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591310480 -> 140509587960592 - 140509587960592 [label=AccumulateGrad] - 140509587959920 -> 140509587960160 - 140509591310800 [label="encoder.layer.9.experts.experts.1.output_query.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591310800 -> 140509587959920 - 140509587959920 [label=AccumulateGrad] - 140509588463280 -> 140509588462944 - 140509588463280 [label=UnsqueezeBackward0] - 140509588463472 -> 140509588463280 - 140509588463472 [label=UnsqueezeBackward0] - 140509587962704 -> 140509588463472 - 140509587962704 [label=MulBackward0] - 140509587963856 -> 140509587962704 - 140509587963856 [label=SoftmaxBackward0] - 140509587990320 -> 140509587963856 - 140509587990320 [label=MmBackward0] - 140509587960304 -> 140509587990320 - 140509587960304 [label=ToCopyBackward0] - 140517615270304 -> 140509587960304 - 140517615270304 [label=DivBackward0] - 140517615270496 -> 140517615270304 - 140517615270496 [label=SumBackward1] - 140517615270592 -> 140517615270496 - 140517615270592 [label=MulBackward0] - 140509587960496 -> 140517615270592 - 140517615269968 -> 140509587990320 - 140517615269968 [label=TBackward0] - 140517615270544 -> 140517615269968 - 140517615270544 [label=ToCopyBackward0] - 140517615270640 -> 140517615270544 - 140509591313200 [label="encoder.layer.9.experts.gate.weight - (2, 768)" fillcolor=lightblue] - 140509591313200 -> 140517615270640 - 140517615270640 [label=AccumulateGrad] - 140509588462416 -> 140509588428880 - 140509588462416 [label=IndexBackward0] - 140509588462896 -> 140509588462416 - 140509588462896 [label=NativeLayerNormBackward0] - 140509587963088 -> 140509588462896 - 140509587963088 [label=AddBackward0] - 140517615270688 -> 140509587963088 - 140517615270688 [label=NativeDropoutBackward0] - 140517615270352 -> 140517615270688 - 140517615270352 [label=ViewBackward0] - 140517615270832 -> 140517615270352 - 140517615270832 [label=AddmmBackward0] - 140517615270928 -> 140517615270832 - 140517615270928 [label=ToCopyBackward0] - 140517615271120 -> 140517615270928 - 140509591320672 [label="encoder.layer.9.output.dense.bias - (768)" fillcolor=lightblue] - 140509591320672 -> 140517615271120 - 140517615271120 [label=AccumulateGrad] - 140517615270880 -> 140517615270832 - 140517615270880 [label=ViewBackward0] - 140517615271168 -> 140517615270880 - 140517615271168 [label=GeluBackward0] - 140517615271264 -> 140517615271168 - 140517615271264 [label=ViewBackward0] - 140517615271360 -> 140517615271264 - 140517615271360 [label=AddmmBackward0] - 140517615271456 -> 140517615271360 - 140517615271456 [label=ToCopyBackward0] - 140517615271648 -> 140517615271456 - 140509591320752 [label="encoder.layer.9.intermediate.dense.bias - (3072)" fillcolor=lightblue] - 140509591320752 -> 140517615271648 - 140517615271648 [label=AccumulateGrad] - 140517615271408 -> 140517615271360 - 140517615271408 [label=ViewBackward0] - 140517615271696 -> 140517615271408 - 140517615271696 [label=ToCopyBackward0] - 140517615270448 -> 140517615271696 - 140517615270448 [label=SliceBackward0] - 140517615271840 -> 140517615270448 - 140517615271840 [label=SliceBackward0] - 140517615271888 -> 140517615271840 - 140517615271888 [label=SliceBackward0] - 140509587963040 -> 140517615271888 - 140517615271072 -> 140517615271360 - 140517615271072 
[label=TBackward0] - 140517615271600 -> 140517615271072 - 140517615271600 [label=ToCopyBackward0] - 140517615271552 -> 140517615271600 - 140509591320912 [label="encoder.layer.9.intermediate.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509591320912 -> 140517615271552 - 140517615271552 [label=AccumulateGrad] - 140517615270736 -> 140517615270832 - 140517615270736 [label=TBackward0] - 140517615271312 -> 140517615270736 - 140517615271312 [label=ToCopyBackward0] - 140517615271792 -> 140517615271312 - 140509591320992 [label="encoder.layer.9.output.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509591320992 -> 140517615271792 - 140517615271792 [label=AccumulateGrad] - 140517615270448 -> 140509587963088 - 140509587962128 -> 140509588462896 - 140509591320432 [label="encoder.layer.9.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591320432 -> 140509587962128 - 140509587962128 [label=AccumulateGrad] - 140509587961072 -> 140509588462896 - 140509591318592 [label="encoder.layer.9.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591318592 -> 140509587961072 - 140509587961072 [label=AccumulateGrad] - 140509588461168 -> 140509588462128 - 140509588461168 [label=TBackward0] - 140509588462512 -> 140509588461168 - 140509588462512 [label=ToCopyBackward0] - 140509587988592 -> 140509588462512 - 140509591313440 [label="encoder.layer.10.attention.self.query.weight - (768, 768)" fillcolor=lightblue] - 140509591313440 -> 140509587988592 - 140509587988592 [label=AccumulateGrad] - 140509588461072 -> 140509588460784 - 140509588461072 [label=UnsafeViewBackward0] - 140509588461456 -> 140509588461072 - 140509588461456 [label=CloneBackward0] - 140509588461744 -> 140509588461456 - 140509588461744 [label=ExpandBackward0] - 140509588462224 -> 140509588461744 - 140509588462224 [label=TransposeBackward0] - 140509588463088 -> 140509588462224 - 140509588463088 [label=PermuteBackward0] - 140509588462800 -> 140509588463088 - 140509588462800 [label=ViewBackward0] - 140517615270784 -> 140509588462800 - 140517615270784 [label=ViewBackward0] - 140517615271024 -> 140517615270784 - 140517615271024 [label=AddmmBackward0] - 140517615271744 -> 140517615271024 - 140517615271744 [label=ToCopyBackward0] - 140517615321248 -> 140517615271744 - 140509591313600 [label="encoder.layer.10.attention.self.key.bias - (768)" fillcolor=lightblue] - 140509591313600 -> 140517615321248 - 140517615321248 [label=AccumulateGrad] - 140517615271504 -> 140517615271024 - 140517615271504 [label=ViewBackward0] - 140517615321296 -> 140517615271504 - 140517615321296 [label=ToCopyBackward0] - 140509588428880 -> 140517615321296 - 140517615270256 -> 140517615271024 - 140517615270256 [label=TBackward0] - 140517615321152 -> 140517615270256 - 140517615321152 [label=ToCopyBackward0] - 140517615321440 -> 140517615321152 - 140509591313680 [label="encoder.layer.10.attention.self.key.weight - (768, 768)" fillcolor=lightblue] - 140509591313680 -> 140517615321440 - 140517615321440 [label=AccumulateGrad] - 140509588429936 -> 140509588430704 - 140509588429936 [label=UnsafeViewBackward0] - 140509588460112 -> 140509588429936 - 140509588460112 [label=CloneBackward0] - 140509588460400 -> 140509588460112 - 140509588460400 [label=ExpandBackward0] - 140509588460688 -> 140509588460400 - 140509588460688 [label=PermuteBackward0] - 140509588459632 -> 140509588460688 - 140509588459632 [label=ViewBackward0] - 140509588461504 -> 140509588459632 - 140509588461504 [label=ViewBackward0] - 140509588462704 -> 140509588461504 - 140509588462704 
[label=AddmmBackward0] - 140509588461024 -> 140509588462704 - 140509588461024 [label=ToCopyBackward0] - 140517615271216 -> 140509588461024 - 140509591313840 [label="encoder.layer.10.attention.self.value.bias - (768)" fillcolor=lightblue] - 140509591313840 -> 140517615271216 - 140517615271216 [label=AccumulateGrad] - 140509588459680 -> 140509588462704 - 140509588459680 [label=ViewBackward0] - 140517615321536 -> 140509588459680 - 140517615321536 [label=ToCopyBackward0] - 140509588428880 -> 140517615321536 - 140517615270400 -> 140509588462704 - 140517615270400 [label=TBackward0] - 140517615321392 -> 140517615270400 - 140517615321392 [label=ToCopyBackward0] - 140517615321584 -> 140517615321392 - 140509591313920 [label="encoder.layer.10.attention.self.value.weight - (768, 768)" fillcolor=lightblue] - 140509591313920 -> 140517615321584 - 140517615321584 [label=AccumulateGrad] - 140509588428928 -> 140509588429360 - 140509588428928 [label=TBackward0] - 140509588430128 -> 140509588428928 - 140509588430128 [label=ToCopyBackward0] - 140509588430368 -> 140509588430128 - 140509591313120 [label="encoder.layer.10.attention.output.dense.weight - (768, 768)" fillcolor=lightblue] - 140509591313120 -> 140509588430368 - 140509588430368 [label=AccumulateGrad] - 140509588428880 -> 140509588428784 - 140509588428448 -> 140509588428592 - 140509591312640 [label="encoder.layer.10.attention.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591312640 -> 140509588428448 - 140509588428448 [label=AccumulateGrad] - 140509588427824 -> 140509588428592 - 140509591312880 [label="encoder.layer.10.attention.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591312880 -> 140509588427824 - 140509588427824 [label=AccumulateGrad] - 140509588426816 -> 140509588427536 - 140509588426816 [label=TBackward0] - 140509588427728 -> 140509588426816 - 140509588427728 [label=ToCopyBackward0] - 140509588428400 -> 140509588427728 - 140509591312720 [label="encoder.layer.10.crossattention.self.query.weight - (768, 768)" fillcolor=lightblue] - 140509591312720 -> 140509588428400 - 140509588428400 [label=AccumulateGrad] - 140509588405840 -> 140509588405504 - 140509588405840 [label=UnsafeViewBackward0] - 140509588406032 -> 140509588405840 - 140509588406032 [label=CloneBackward0] - 140509588427008 -> 140509588406032 - 140509588427008 [label=ExpandBackward0] - 140509588427488 -> 140509588427008 - 140509588427488 [label=TransposeBackward0] - 140509588428208 -> 140509588427488 - 140509588428208 [label=PermuteBackward0] - 140509588428688 -> 140509588428208 - 140509588428688 [label=ViewBackward0] - 140509588429264 -> 140509588428688 - 140509588429264 [label=ViewBackward0] - 140509588429744 -> 140509588429264 - 140509588429744 [label=AddmmBackward0] - 140509588430320 -> 140509588429744 - 140509588430320 [label=ToCopyBackward0] - 140509588460208 -> 140509588430320 - 140509591312480 [label="encoder.layer.10.crossattention.self.key.bias - (768)" fillcolor=lightblue] - 140509591312480 -> 140509588460208 - 140509588460208 [label=AccumulateGrad] - 140509588429648 -> 140509588429744 - 140509588429648 [label=ViewBackward0] - 140509588460592 -> 140509588429648 - 140509588460592 [label=ToCopyBackward0] - 140509588461264 -> 140509588460592 - 140509588461264 [label=ViewBackward0] - 140517615270976 -> 140509588461264 - 140517615270976 [label=CloneBackward0] - 140509588459584 -> 140517615270976 - 140509588459584 [label=ExpandBackward0] - 140517615321632 -> 140509588459584 - 140517615321632 [label=UnsqueezeBackward0] - 140517615539152 -> 
140517615321632 - 140509588426864 -> 140509588429744 - 140509588426864 [label=TBackward0] - 140509588461936 -> 140509588426864 - 140509588461936 [label=ToCopyBackward0] - 140509588460880 -> 140509588461936 - 140509591312400 [label="encoder.layer.10.crossattention.self.key.weight - (768, 1408)" fillcolor=lightblue] - 140509591312400 -> 140509588460880 - 140509588460880 [label=AccumulateGrad] - 140509588404064 -> 140509588404208 - 140509588404064 [label=UnsafeViewBackward0] - 140509588404880 -> 140509588404064 - 140509588404880 [label=CloneBackward0] - 140509588405168 -> 140509588404880 - 140509588405168 [label=ExpandBackward0] - 140509588405552 -> 140509588405168 - 140509588405552 [label=PermuteBackward0] - 140509588404304 -> 140509588405552 - 140509588404304 [label=ViewBackward0] - 140509588405936 -> 140509588404304 - 140509588405936 [label=ViewBackward0] - 140509588427968 -> 140509588405936 - 140509588427968 [label=AddmmBackward0] - 140509588428112 -> 140509588427968 - 140509588428112 [label=ToCopyBackward0] - 140509588459920 -> 140509588428112 - 140509591310560 [label="encoder.layer.10.crossattention.self.value.bias - (768)" fillcolor=lightblue] - 140509591310560 -> 140509588459920 - 140509588459920 [label=AccumulateGrad] - 140509588428976 -> 140509588427968 - 140509588428976 [label=ViewBackward0] - 140509588429888 -> 140509588428976 - 140509588429888 [label=ToCopyBackward0] - 140509588461264 -> 140509588429888 - 140509588426960 -> 140509588427968 - 140509588426960 [label=TBackward0] - 140517615321680 -> 140509588426960 - 140517615321680 [label=ToCopyBackward0] - 140517615321344 -> 140517615321680 - 140509591312240 [label="encoder.layer.10.crossattention.self.value.weight - (768, 1408)" fillcolor=lightblue] - 140509591312240 -> 140517615321344 - 140517615321344 [label=AccumulateGrad] - 140509588402576 -> 140509588402864 - 140509588402576 [label=TBackward0] - 140509588403584 -> 140509588402576 - 140509588403584 [label=ToCopyBackward0] - 140509588404016 -> 140509588403584 - 140509591311040 [label="encoder.layer.10.crossattention.output.dense.weight - (768, 768)" fillcolor=lightblue] - 140509591311040 -> 140509588404016 - 140509588404016 [label=AccumulateGrad] - 140509588402384 -> 140509588373360 - 140509588372784 -> 140509588373456 - 140509591293760 [label="encoder.layer.10.crossattention.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591293760 -> 140509588372784 - 140509588372784 [label=AccumulateGrad] - 140509588402240 -> 140509588373456 - 140509591293520 [label="encoder.layer.10.crossattention.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591293520 -> 140509588402240 - 140509588402240 [label=AccumulateGrad] - 140509588372016 -> 140509588372496 - 140509588372016 [label=TBackward0] - 140509588372688 -> 140509588372016 - 140509588372688 [label=ToCopyBackward0] - 140509588373168 -> 140509588372688 - 140509591289920 [label="encoder.layer.10.experts.experts.0.intermediate_query.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509591289920 -> 140509588373168 - 140509588373168 [label=AccumulateGrad] - 140509588371008 -> 140509588371440 - 140509588371008 [label=TBackward0] - 140509588372208 -> 140509588371008 - 140509588372208 [label=ToCopyBackward0] - 140509588372928 -> 140509588372208 - 140509591290240 [label="encoder.layer.10.experts.experts.0.output_query.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509591290240 -> 140509588372928 - 140509588372928 [label=AccumulateGrad] - 140509588370960 -> 140509588370864 - 140509588370528 -> 140509588370672 - 
140509591285328 [label="encoder.layer.10.experts.experts.0.output_query.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591285328 -> 140509588370528 - 140509588370528 [label=AccumulateGrad] - 140509588370576 -> 140509588370672 - 140509591285248 [label="encoder.layer.10.experts.experts.0.output_query.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591285248 -> 140509588370576 - 140509588370576 [label=AccumulateGrad] - 140509588370384 -> 140509588370192 - 140509588370384 [label=UnsqueezeBackward0] - 140509588371056 -> 140509588370384 - 140509588371056 [label=NativeLayerNormBackward0] - 140509588371536 -> 140509588371056 - 140509588371536 [label=AddBackward0] - 140509588373072 -> 140509588371536 - 140509588373072 [label=NativeDropoutBackward0] - 140509588371968 -> 140509588373072 - 140509588371968 [label=ViewBackward0] - 140509588402288 -> 140509588371968 - 140509588402288 [label=AddmmBackward0] - 140509588403248 -> 140509588402288 - 140509588403248 [label=ToCopyBackward0] - 140509588403440 -> 140509588403248 - 140509591284528 [label="encoder.layer.10.experts.experts.1.output_query.dense.bias - (768)" fillcolor=lightblue] - 140509591284528 -> 140509588403440 - 140509588403440 [label=AccumulateGrad] - 140509588402960 -> 140509588402288 - 140509588402960 [label=ViewBackward0] - 140509588403536 -> 140509588402960 - 140509588403536 [label=GeluBackward0] - 140509588405360 -> 140509588403536 - 140509588405360 [label=ViewBackward0] - 140509588404592 -> 140509588405360 - 140509588404592 [label=AddmmBackward0] - 140509588429456 -> 140509588404592 - 140509588429456 [label=ToCopyBackward0] - 140517615321776 -> 140509588429456 - 140509591284768 [label="encoder.layer.10.experts.experts.1.intermediate_query.dense.bias - (3072)" fillcolor=lightblue] - 140509591284768 -> 140517615321776 - 140517615321776 [label=AccumulateGrad] - 140509588427248 -> 140509588404592 - 140509588427248 [label=ViewBackward0] - 140517615321488 -> 140509588427248 - 140517615321488 [label=ToCopyBackward0] - 140509588370960 -> 140517615321488 - 140509588404688 -> 140509588404592 - 140509588404688 [label=TBackward0] - 140517615321728 -> 140509588404688 - 140517615321728 [label=ToCopyBackward0] - 140517615321968 -> 140517615321728 - 140509591285088 [label="encoder.layer.10.experts.experts.1.intermediate_query.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509591285088 -> 140517615321968 - 140517615321968 [label=AccumulateGrad] - 140509588402768 -> 140509588402288 - 140509588402768 [label=TBackward0] - 140509588405648 -> 140509588402768 - 140509588405648 [label=ToCopyBackward0] - 140509588405072 -> 140509588405648 - 140509591284848 [label="encoder.layer.10.experts.experts.1.output_query.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509591284848 -> 140509588405072 - 140509588405072 [label=AccumulateGrad] - 140509588370960 -> 140509588371536 - 140509588371344 -> 140509588371056 - 140509591284608 [label="encoder.layer.10.experts.experts.1.output_query.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591284608 -> 140509588371344 - 140509588371344 [label=AccumulateGrad] - 140509588370480 -> 140509588371056 - 140509591285008 [label="encoder.layer.10.experts.experts.1.output_query.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591285008 -> 140509588370480 - 140509588370480 [label=AccumulateGrad] - 140509588370000 -> 140509588370096 - 140509588370000 [label=UnsqueezeBackward0] - 140509588371824 -> 140509588370000 - 140509588371824 [label=UnsqueezeBackward0] - 140509588371728 -> 140509588371824 - 
140509588371728 [label=MulBackward0] - 140509588370048 -> 140509588371728 - 140509588370048 [label=SoftmaxBackward0] - 140509588403824 -> 140509588370048 - 140509588403824 [label=MmBackward0] - 140517615321824 -> 140509588403824 - 140517615321824 [label=ToCopyBackward0] - 140517615321872 -> 140517615321824 - 140517615321872 [label=DivBackward0] - 140517615322160 -> 140517615321872 - 140517615322160 [label=SumBackward1] - 140517615322256 -> 140517615322160 - 140517615322256 [label=MulBackward0] - 140509588370960 -> 140517615322256 - 140517615322064 -> 140509588403824 - 140517615322064 [label=TBackward0] - 140517615322208 -> 140517615322064 - 140517615322208 [label=ToCopyBackward0] - 140517615322304 -> 140517615322208 - 140509591291120 [label="encoder.layer.10.experts.gate.weight - (2, 768)" fillcolor=lightblue] - 140509591291120 -> 140517615322304 - 140517615322304 [label=AccumulateGrad] - 140509588369520 -> 140509588315344 - 140509588369520 [label=IndexBackward0] - 140509588370768 -> 140509588369520 - 140509588370768 [label=NativeLayerNormBackward0] - 140509588372448 -> 140509588370768 - 140509588372448 [label=AddBackward0] - 140517615322352 -> 140509588372448 - 140517615322352 [label=NativeDropoutBackward0] - 140517615322016 -> 140517615322352 - 140517615322016 [label=ViewBackward0] - 140517615322496 -> 140517615322016 - 140517615322496 [label=AddmmBackward0] - 140517615322592 -> 140517615322496 - 140517615322592 [label=ToCopyBackward0] - 140517615322784 -> 140517615322592 - 140509591293040 [label="encoder.layer.10.output.dense.bias - (768)" fillcolor=lightblue] - 140509591293040 -> 140517615322784 - 140517615322784 [label=AccumulateGrad] - 140517615322544 -> 140517615322496 - 140517615322544 [label=ViewBackward0] - 140517615322832 -> 140517615322544 - 140517615322832 [label=GeluBackward0] - 140517615322928 -> 140517615322832 - 140517615322928 [label=ViewBackward0] - 140517615323024 -> 140517615322928 - 140517615323024 [label=AddmmBackward0] - 140517615323120 -> 140517615323024 - 140517615323120 [label=ToCopyBackward0] - 140517615323312 -> 140517615323120 - 140509591293280 [label="encoder.layer.10.intermediate.dense.bias - (3072)" fillcolor=lightblue] - 140509591293280 -> 140517615323312 - 140517615323312 [label=AccumulateGrad] - 140517615323072 -> 140517615323024 - 140517615323072 [label=ViewBackward0] - 140517615323360 -> 140517615323072 - 140517615323360 [label=ToCopyBackward0] - 140517615322112 -> 140517615323360 - 140517615322112 [label=SliceBackward0] - 140517615323504 -> 140517615322112 - 140517615323504 [label=SliceBackward0] - 140517615323600 -> 140517615323504 - 140517615323600 [label=SliceBackward0] - 140509588428592 -> 140517615323600 - 140517615322736 -> 140517615323024 - 140517615322736 [label=TBackward0] - 140517615323264 -> 140517615322736 - 140517615323264 [label=ToCopyBackward0] - 140517615323696 -> 140517615323264 - 140509591293600 [label="encoder.layer.10.intermediate.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509591293600 -> 140517615323696 - 140517615323696 [label=AccumulateGrad] - 140517615322400 -> 140517615322496 - 140517615322400 [label=TBackward0] - 140517615322976 -> 140517615322400 - 140517615322976 [label=ToCopyBackward0] - 140517615323456 -> 140517615322976 - 140509591293360 [label="encoder.layer.10.output.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509591293360 -> 140517615323456 - 140517615323456 [label=AccumulateGrad] - 140517615322112 -> 140509588372448 - 140509588369808 -> 140509588370768 - 140509591293120 
[label="encoder.layer.10.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591293120 -> 140509588369808 - 140509588369808 [label=AccumulateGrad] - 140509588403104 -> 140509588370768 - 140509591292800 [label="encoder.layer.10.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591292800 -> 140509588403104 - 140509588403104 [label=AccumulateGrad] - 140509588347344 -> 140509588348304 - 140509588347344 [label=TBackward0] - 140509588348880 -> 140509588347344 - 140509588348880 [label=ToCopyBackward0] - 140509588402480 -> 140509588348880 - 140509591291360 [label="encoder.layer.11.attention.self.query.weight - (768, 768)" fillcolor=lightblue] - 140509591291360 -> 140509588402480 - 140509588402480 [label=AccumulateGrad] - 140509588347104 -> 140509588347248 - 140509588347104 [label=UnsafeViewBackward0] - 140509588347920 -> 140509588347104 - 140509588347920 [label=CloneBackward0] - 140509588348208 -> 140509588347920 - 140509588348208 [label=ExpandBackward0] - 140509588348688 -> 140509588348208 - 140509588348688 [label=TransposeBackward0] - 140509588347632 -> 140509588348688 - 140509588347632 [label=PermuteBackward0] - 140509588369712 -> 140509588347632 - 140509588369712 [label=ViewBackward0] - 140517615322448 -> 140509588369712 - 140517615322448 [label=ViewBackward0] - 140517615322688 -> 140517615322448 - 140517615322688 [label=AddmmBackward0] - 140517615323216 -> 140517615322688 - 140517615323216 [label=ToCopyBackward0] - 140517615323408 -> 140517615323216 - 140509591291920 [label="encoder.layer.11.attention.self.key.bias - (768)" fillcolor=lightblue] - 140509591291920 -> 140517615323408 - 140517615323408 [label=AccumulateGrad] - 140517615323168 -> 140517615322688 - 140517615323168 [label=ViewBackward0] - 140517615323744 -> 140517615323168 - 140517615323744 [label=ToCopyBackward0] - 140509588315344 -> 140517615323744 - 140517615321200 -> 140517615322688 - 140517615321200 [label=TBackward0] - 140517615322880 -> 140517615321200 - 140517615322880 [label=ToCopyBackward0] - 140517615323888 -> 140517615322880 - 140509591291600 [label="encoder.layer.11.attention.self.key.weight - (768, 768)" fillcolor=lightblue] - 140509591291600 -> 140517615323888 - 140517615323888 [label=AccumulateGrad] - 140509588345808 -> 140509588345616 - 140509588345808 [label=UnsafeViewBackward0] - 140509588346144 -> 140509588345808 - 140509588346144 [label=CloneBackward0] - 140509588346576 -> 140509588346144 - 140509588346576 [label=ExpandBackward0] - 140509588346864 -> 140509588346576 - 140509588346864 [label=PermuteBackward0] - 140509588346000 -> 140509588346864 - 140509588346000 [label=ViewBackward0] - 140509588348112 -> 140509588346000 - 140509588348112 [label=ViewBackward0] - 140509588348400 -> 140509588348112 - 140509588348400 [label=AddmmBackward0] - 140509588369616 -> 140509588348400 - 140509588369616 [label=ToCopyBackward0] - 140517615323648 -> 140509588369616 - 140509591292160 [label="encoder.layer.11.attention.self.value.bias - (768)" fillcolor=lightblue] - 140509591292160 -> 140517615323648 - 140517615323648 [label=AccumulateGrad] - 140509588369904 -> 140509588348400 - 140509588369904 [label=ViewBackward0] - 140517615323984 -> 140509588369904 - 140517615323984 [label=ToCopyBackward0] - 140509588315344 -> 140517615323984 - 140517615321920 -> 140509588348400 - 140517615321920 [label=TBackward0] - 140517615323552 -> 140517615321920 - 140517615323552 [label=ToCopyBackward0] - 140517615324032 -> 140517615323552 - 140509591291840 [label="encoder.layer.11.attention.self.value.weight - (768, 768)" 
fillcolor=lightblue] - 140509591291840 -> 140517615324032 - 140517615324032 [label=AccumulateGrad] - 140509588315536 -> 140509588315824 - 140509588315536 [label=TBackward0] - 140509588345328 -> 140509588315536 - 140509588345328 [label=ToCopyBackward0] - 140509588345712 -> 140509588345328 - 140509591291440 [label="encoder.layer.11.attention.output.dense.weight - (768, 768)" fillcolor=lightblue] - 140509591291440 -> 140509588345712 - 140509588345712 [label=AccumulateGrad] - 140509588315344 -> 140509588314960 - 140509588315056 -> 140509588314768 - 140509591290960 [label="encoder.layer.11.attention.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591290960 -> 140509588315056 - 140509588315056 [label=AccumulateGrad] - 140509588313568 -> 140509588314768 - 140509591291200 [label="encoder.layer.11.attention.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591291200 -> 140509588313568 - 140509588313568 [label=AccumulateGrad] - 140509588312272 -> 140509588313328 - 140509588312272 [label=TBackward0] - 140509588313904 -> 140509588312272 - 140509588313904 [label=ToCopyBackward0] - 140509588314576 -> 140509588313904 - 140509591260912 [label="encoder.layer.11.experts.experts.0.intermediate_query.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509591260912 -> 140509588314576 - 140509588314576 [label=AccumulateGrad] - 140509588312848 -> 140509588313232 - 140509588312848 [label=TBackward0] - 140509588312128 -> 140509588312848 - 140509588312128 [label=ToCopyBackward0] - 140509588314192 -> 140509588312128 - 140509591260592 [label="encoder.layer.11.experts.experts.0.output_query.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509591260592 -> 140509588314192 - 140509588314192 [label=AccumulateGrad] - 140509588312608 -> 140509591317376 - 140509591314832 -> 140509591314640 - 140509591260352 [label="encoder.layer.11.experts.experts.0.output_query.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591260352 -> 140509591314832 - 140509591314832 [label=AccumulateGrad] - 140509591317568 -> 140509591314640 - 140509591260832 [label="encoder.layer.11.experts.experts.0.output_query.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591260832 -> 140509591317568 - 140509591317568 [label=AccumulateGrad] - 140509591315408 -> 140509588282864 - 140509591315408 [label=UnsqueezeBackward0] - 140509591268800 -> 140509591315408 - 140509591268800 [label=NativeLayerNormBackward0] - 140509588313088 -> 140509591268800 - 140509588313088 [label=AddBackward0] - 140509588314864 -> 140509588313088 - 140509588314864 [label=NativeDropoutBackward0] - 140509588312224 -> 140509588314864 - 140509588312224 [label=ViewBackward0] - 140509588314000 -> 140509588312224 - 140509588314000 [label=AddmmBackward0] - 140509588315008 -> 140509588314000 - 140509588315008 [label=ToCopyBackward0] - 140509588315920 -> 140509588315008 - 140509591259952 [label="encoder.layer.11.experts.experts.1.output_query.dense.bias - (768)" fillcolor=lightblue] - 140509591259952 -> 140509588315920 - 140509588315920 [label=AccumulateGrad] - 140509588315152 -> 140509588314000 - 140509588315152 [label=ViewBackward0] - 140509588315488 -> 140509588315152 - 140509588315488 [label=GeluBackward0] - 140509588345232 -> 140509588315488 - 140509588345232 [label=ViewBackward0] - 140509588346384 -> 140509588345232 - 140509588346384 [label=AddmmBackward0] - 140509588347056 -> 140509588346384 - 140509588347056 [label=ToCopyBackward0] - 140509588345904 -> 140509588347056 - 140509591260192 
[label="encoder.layer.11.experts.experts.1.intermediate_query.dense.bias - (3072)" fillcolor=lightblue] - 140509591260192 -> 140509588345904 - 140509588345904 [label=AccumulateGrad] - 140509588346624 -> 140509588346384 - 140509588346624 [label=ViewBackward0] - 140517615323840 -> 140509588346624 - 140517615323840 [label=ToCopyBackward0] - 140509588312608 -> 140517615323840 - 140509588346096 -> 140509588346384 - 140509588346096 [label=TBackward0] - 140517615322640 -> 140509588346096 - 140517615322640 [label=ToCopyBackward0] - 140517615323936 -> 140517615322640 - 140509591260112 [label="encoder.layer.11.experts.experts.1.intermediate_query.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509591260112 -> 140517615323936 - 140517615323936 [label=AccumulateGrad] - 140509588312464 -> 140509588314000 - 140509588312464 [label=TBackward0] - 140509588344944 -> 140509588312464 - 140509588344944 [label=ToCopyBackward0] - 140509588347728 -> 140509588344944 - 140509591259872 [label="encoder.layer.11.experts.experts.1.output_query.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509591259872 -> 140509588347728 - 140509588347728 [label=AccumulateGrad] - 140509588312608 -> 140509588313088 - 140509588313136 -> 140509591268800 - 140509591259632 [label="encoder.layer.11.experts.experts.1.output_query.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591259632 -> 140509588313136 - 140509588313136 [label=AccumulateGrad] - 140509588312752 -> 140509591268800 - 140509591260432 [label="encoder.layer.11.experts.experts.1.output_query.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591260432 -> 140509588312752 - 140509588312752 [label=AccumulateGrad] - 140509588282672 -> 140509588283152 - 140509588282672 [label=UnsqueezeBackward0] - 140509591318432 -> 140509588282672 - 140509591318432 [label=UnsqueezeBackward0] - 140509588314384 -> 140509591318432 - 140509588314384 [label=MulBackward0] - 140509588315440 -> 140509588314384 - 140509588315440 [label=SoftmaxBackward0] - 140509588345520 -> 140509588315440 - 140509588345520 [label=MmBackward0] - 140509588312656 -> 140509588345520 - 140509588312656 [label=ToCopyBackward0] - 140517615324128 -> 140509588312656 - 140517615324128 [label=DivBackward0] - 140517615324320 -> 140517615324128 - 140517615324320 [label=SumBackward1] - 140517615324416 -> 140517615324320 - 140517615324416 [label=MulBackward0] - 140509588312608 -> 140517615324416 - 140517615323792 -> 140509588345520 - 140517615323792 [label=TBackward0] - 140517615324368 -> 140517615323792 - 140517615324368 [label=ToCopyBackward0] - 140517615324464 -> 140517615324368 - 140509591282928 [label="encoder.layer.11.experts.gate.weight - (2, 768)" fillcolor=lightblue] - 140509591282928 -> 140517615324464 - 140517615324464 [label=AccumulateGrad] - 140509588282432 -> 140509588281712 - 140509588282432 [label=IndexBackward0] - 140509588283248 -> 140509588282432 - 140509588283248 [label=IndexBackward0] - 140509591317952 -> 140509588283248 - 140509591317952 [label=NativeLayerNormBackward0] - 140509588345040 -> 140509591317952 - 140509588345040 [label=AddBackward0] - 140517615324560 -> 140509588345040 - 140517615324560 [label=NativeDropoutBackward0] - 140517615324608 -> 140517615324560 - 140517615324608 [label=ViewBackward0] - 140517615324704 -> 140517615324608 - 140517615324704 [label=AddmmBackward0] - 140517615324800 -> 140517615324704 - 140517615324800 [label=ToCopyBackward0] - 140517615324992 -> 140517615324800 - 140509591290400 [label="encoder.layer.11.output.dense.bias - (768)" fillcolor=lightblue] - 
140509591290400 -> 140517615324992 - 140517615324992 [label=AccumulateGrad] - 140517615324752 -> 140517615324704 - 140517615324752 [label=ViewBackward0] - 140517615325040 -> 140517615324752 - 140517615325040 [label=GeluBackward0] - 140517615325136 -> 140517615325040 - 140517615325136 [label=ViewBackward0] - 140517615324944 -> 140517615325136 - 140517615324944 [label=AddmmBackward0] - 140517615382736 -> 140517615324944 - 140517615382736 [label=ToCopyBackward0] - 140517615382928 -> 140517615382736 - 140509591290480 [label="encoder.layer.11.intermediate.dense.bias - (3072)" fillcolor=lightblue] - 140509591290480 -> 140517615382928 - 140517615382928 [label=AccumulateGrad] - 140517615382688 -> 140517615324944 - 140517615382688 [label=ViewBackward0] - 140517615382976 -> 140517615382688 - 140517615382976 [label=ToCopyBackward0] - 140517615324512 -> 140517615382976 - 140517615324512 [label=SliceBackward0] - 140517615383120 -> 140517615324512 - 140517615383120 [label=SliceBackward0] - 140517615383216 -> 140517615383120 - 140517615383216 [label=SliceBackward0] - 140509588314768 -> 140517615383216 - 140517615382592 -> 140517615324944 - 140517615382592 [label=TBackward0] - 140517615382880 -> 140517615382592 - 140517615382880 [label=ToCopyBackward0] - 140517615383312 -> 140517615382880 - 140509591290640 [label="encoder.layer.11.intermediate.dense.weight - (3072, 768)" fillcolor=lightblue] - 140509591290640 -> 140517615383312 - 140517615383312 [label=AccumulateGrad] - 140517615324080 -> 140517615324704 - 140517615324080 [label=TBackward0] - 140517615324896 -> 140517615324080 - 140517615324896 [label=ToCopyBackward0] - 140517615383072 -> 140517615324896 - 140509591290720 [label="encoder.layer.11.output.dense.weight - (768, 3072)" fillcolor=lightblue] - 140509591290720 -> 140517615383072 - 140517615383072 [label=AccumulateGrad] - 140517615324512 -> 140509588345040 - 140509588314672 -> 140509591317952 - 140509591290160 [label="encoder.layer.11.output.LayerNorm.weight - (768)" fillcolor=lightblue] - 140509591290160 -> 140509588314672 - 140509588314672 [label=AccumulateGrad] - 140509588313712 -> 140509591317952 - 140509591290000 [label="encoder.layer.11.output.LayerNorm.bias - (768)" fillcolor=lightblue] - 140509591290000 -> 140509588313712 - 140509588313712 [label=AccumulateGrad] - 140509588281712 -> 140509988778688 -} diff --git a/test.pdf/backward_graph.pdf b/test.pdf/backward_graph.pdf deleted file mode 100644 index 7f162b0..0000000 Binary files a/test.pdf/backward_graph.pdf and /dev/null differ diff --git a/test/datasets/test_dataset.py b/test/datasets/test_dataset.py new file mode 100644 index 0000000..c4f64a8 --- /dev/null +++ b/test/datasets/test_dataset.py @@ -0,0 +1,58 @@ +import datasets +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer +from trl import SFTTrainer, DataCollatorForCompletionOnlyLM +import random +from tqdm import tqdm + +# path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/minigpt4/models/cmrc2018_trial.json" +# dataset = load_dataset("json", data_files=[path], field="data", split="train") +# tokenizer = AutoTokenizer.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased") +# def preprocess_function(example): +# import pdb; pdb.set_trace() +# model_inputs = tokenizer(example["content"], max_length=512, truncation=True) +# labels = tokenizer(example["title"], max_length=32, truncation=True) +# # label就是title编码的结果 +# model_inputs["labels"] = labels["input_ids"] +# return model_inputs +# processed_datasets 
= dataset.map(preprocess_function)
+
+dataset = load_dataset("/mnt/pfs-guan-ssai/nlu/wanghanzi/data/alpaca_20k")
+train_dataset = dataset['train']
+
+
+for i in tqdm(range(1, len(train_dataset))):
+    import pdb; pdb.set_trace()
+
+    idx = random.randint(0,i)
+    memory = train_dataset[idx]
+    memory_text = f"Instruction: {memory['instruction']}\n Answer: {memory['output']} \n"
+    train_dataset[i]['text'] = f"{memory_text} Instruction:{train_dataset[i]['instruction']}"
+
+
+import pdb; pdb.set_trace()
+
+
+model_path = "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/opt_350m"
+model = AutoModelForCausalLM.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+
+def formatting_prompts_func(example):
+    import pdb; pdb.set_trace()
+    output_texts = []
+    for i in range(len(example['instruction'])):
+        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
+        output_texts.append(text)
+    return output_texts
+
+response_template = " ### Answer:"
+collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
+
+trainer = SFTTrainer(
+    model,
+    train_dataset=train_dataset,
+    formatting_func=formatting_prompts_func,
+    data_collator=collator,
+)
+trainer.train()
\ No newline at end of file
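A note on the memory-building loop in test/datasets/test_dataset.py above: indexing a Hugging Face datasets.Dataset returns a copy of the row, so train_dataset[i]['text'] = ... never writes a persistent "text" column, and the SFTTrainer below would not see it. A minimal sketch of the same idea using Dataset.map, assuming the same local alpaca_20k path and the instruction/output fields used above; the add_memory name is illustrative, not from the diff:

    import random
    from datasets import load_dataset

    dataset = load_dataset("/mnt/pfs-guan-ssai/nlu/wanghanzi/data/alpaca_20k")
    source = dataset["train"]

    def add_memory(example, idx):
        # Sample a "memory" example, mirroring random.randint(0, i) in the loop above.
        memory = source[random.randint(0, idx)]
        memory_text = f"Instruction: {memory['instruction']}\n Answer: {memory['output']} \n"
        example["text"] = f"{memory_text} Instruction: {example['instruction']}"
        return example

    # map() materializes the new "text" column in a real dataset, unlike item assignment.
    train_dataset = source.map(add_memory, with_indices=True)

With the column materialized this way, the trainer could also consume it directly (for example via dataset_text_field="text", depending on the trl version) instead of rebuilding prompts in formatting_prompts_func.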
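For context on the large backward-graph text and test.pdf/backward_graph.pdf removed by this diff: a dump with AccumulateGrad nodes and lightblue parameter boxes like the one above is the kind of output torchviz produces. A minimal sketch of how such a file is typically generated (torchviz here is an assumption, not stated by the diff; the toy model is illustrative):

    import torch
    from torchviz import make_dot

    # make_dot walks loss.grad_fn and, given a params dict, renders named leaf
    # parameters as lightblue boxes, matching the "weight - (768, 768)" nodes above.
    model = torch.nn.Linear(768, 768)
    loss = model(torch.randn(1, 768)).sum()
    graph = make_dot(loss, params=dict(model.named_parameters()))
    graph.render("backward_graph", format="pdf")  # writes backward_graph.pdf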