diff --git a/Pre_PromptMoE_RawProb_backward_graph b/Pre_PromptMoE_RawProb_backward_graph
new file mode 100644
index 0000000..3a8d029
--- /dev/null
+++ b/Pre_PromptMoE_RawProb_backward_graph
@@ -0,0 +1,5294 @@
+digraph {
+	graph [size="739.65,739.65"]
+	node [align=left fontname=monospace fontsize=10 height=0.2 ranksep=0.1 shape=box style=filled]
+	140202223089520 [label="
+ (1, 46, 768)" fillcolor=darkolivegreen1]
+	140202228657312 [label=CatBackward0]
+	140202228615488 -> 140202228657312
+	140202228615488 [label=NativeLayerNormBackward0]
+	140202228614096 -> 140202228615488
+	140202228614096 [label=AddBackward0]
+	140202223538720 -> 140202228614096
+	140202223538720 [label=NativeDropoutBackward0]
+	140202223538912 -> 140202223538720
+	140202223538912 [label=ViewBackward0]
+	140202223539008 -> 140202223538912
+	140202223539008 [label=AddmmBackward0]
+	140202223539104 -> 140202223539008
+	140202223539104 [label=ToCopyBackward0]
+	140202223539296 -> 140202223539104
+	140202228893712 [label="encoder.layer.11.experts.dense2.bias
+ (768)" fillcolor=lightblue]
+	140202228893712 -> 140202223539296
+	140202223539296 [label=AccumulateGrad]
+	140202223538864 -> 140202223539008
+	140202223538864 [label=ViewBackward0]
+	140202223539152 -> 140202223538864
+	140202223539152 [label=GeluBackward0]
+	140202223539248 -> 140202223539152
+	140202223539248 [label=ViewBackward0]
+	140202223539680 -> 140202223539248
+	140202223539680 [label=AddmmBackward0]
+	140202223539584 -> 140202223539680
+	140202223539584 [label=ToCopyBackward0]
+	140202223538528 -> 140202223539584
+	140202228893952 [label="encoder.layer.11.experts.dense1.bias
+ (3072)" fillcolor=lightblue]
+	140202228893952 -> 140202223538528
+	140202223538528 [label=AccumulateGrad]
[... remaining added lines of the autogenerated torchviz dump omitted: the same Slice/View/Permute/Bmm/Div/Softmax/Addmm/Gelu backward chains repeat through encoder layers 11 down to 1, each with lightblue parameter leaves (attention.self.query.bias, attention.output.dense.bias, crossattention.self.query.bias and crossattention.output.dense.bias on alternating layers, and experts.dense1/dense2.bias or experts.experts.0.dense1/dense2.bias) feeding AccumulateGrad nodes; the hunk continues past the end of this section ...]
140210812008432 [label=GeluBackward0] + 140210812008528 -> 140210812008432 + 140210812008528 [label=ViewBackward0] + 140210812008624 -> 140210812008528 + 140210812008624 [label=AddmmBackward0] + 140210812008720 -> 140210812008624 + 140210812008720 [label=ToCopyBackward0] + 140210812008912 -> 140210812008720 + 140202229048240 [label="encoder.layer.1.experts.dense1.bias + (3072)" fillcolor=lightblue] + 140202229048240 -> 140210812008912 + 140210812008912 [label=AccumulateGrad] + 140210812008672 -> 140210812008624 + 140210812008672 [label=ViewBackward0] + 140210812008960 -> 140210812008672 + 140210812008960 [label=ToCopyBackward0] + 140210812007808 -> 140210812008960 + 140210812007808 [label=SliceBackward0] + 140210812009104 -> 140210812007808 + 140210812009104 [label=SliceBackward0] + 140210812009200 -> 140210812009104 + 140210812009200 [label=SliceBackward0] + 140210812009296 -> 140210812009200 + 140210812009296 [label=SliceBackward0] + 140210812009392 -> 140210812009296 + 140210812009392 [label=SliceBackward0] + 140210812009488 -> 140210812009392 + 140210812009488 [label=NativeLayerNormBackward0] + 140210812009584 -> 140210812009488 + 140210812009584 [label=AddBackward0] + 140210812009776 -> 140210812009584 + 140210812009776 [label=NativeDropoutBackward0] + 140210812009920 -> 140210812009776 + 140210812009920 [label=ViewBackward0] + 140210812010016 -> 140210812009920 + 140210812010016 [label=AddmmBackward0] + 140210812010112 -> 140210812010016 + 140210812010112 [label=ToCopyBackward0] + 140210812010304 -> 140210812010112 + 140202229050160 [label="encoder.layer.1.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140202229050160 -> 140210812010304 + 140210812010304 [label=AccumulateGrad] + 140210812010064 -> 140210812010016 + 140210812010064 [label=ViewBackward0] + 140210812010352 -> 140210812010064 + 140210812010352 [label=ViewBackward0] + 140210812010448 -> 140210812010352 + 140210812010448 [label=CloneBackward0] + 140210812010256 -> 140210812010448 + 140210812010256 [label=PermuteBackward0] + 140210812022992 -> 140210812010256 + 140210812022992 [label=UnsafeViewBackward0] + 140210812023088 -> 140210812022992 + 140210812023088 [label=BmmBackward0] + 140210812023184 -> 140210812023088 + 140210812023184 [label=ReshapeAliasBackward0] + 140210812023328 -> 140210812023184 + 140210812023328 [label=ExpandBackward0] + 140210812023424 -> 140210812023328 + 140210812023424 [label=ToCopyBackward0] + 140210812023520 -> 140210812023424 + 140210812023520 [label=NativeDropoutBackward0] + 140210812023616 -> 140210812023520 + 140210812023616 [label=SoftmaxBackward0] + 140210812023712 -> 140210812023616 + 140210812023712 [label=AddBackward0] + 140210812023808 -> 140210812023712 + 140210812023808 [label=DivBackward0] + 140210812023904 -> 140210812023808 + 140210812023904 [label=UnsafeViewBackward0] + 140210812024000 -> 140210812023904 + 140210812024000 [label=BmmBackward0] + 140210812024096 -> 140210812024000 + 140210812024096 [label=ReshapeAliasBackward0] + 140210812024240 -> 140210812024096 + 140210812024240 [label=ExpandBackward0] + 140210812024336 -> 140210812024240 + 140210812024336 [label=PermuteBackward0] + 140210812024432 -> 140210812024336 + 140210812024432 [label=ViewBackward0] + 140210812024528 -> 140210812024432 + 140210812024528 [label=ViewBackward0] + 140210812024624 -> 140210812024528 + 140210812024624 [label=AddmmBackward0] + 140210812024720 -> 140210812024624 + 140210812024720 [label=ToCopyBackward0] + 140210812024912 -> 140210812024720 + 140202229050880 
[label="encoder.layer.1.attention.self.query.bias + (768)" fillcolor=lightblue] + 140202229050880 -> 140210812024912 + 140210812024912 [label=AccumulateGrad] + 140210812024672 -> 140210812024624 + 140210812024672 [label=ViewBackward0] + 140210812024960 -> 140210812024672 + 140210812024960 [label=ToCopyBackward0] + 140210812009728 -> 140210812024960 + 140210812009728 [label=CatBackward0] + 140210812025104 -> 140210812009728 + 140210812025104 [label=NativeLayerNormBackward0] + 140210812025248 -> 140210812025104 + 140210812025248 [label=AddBackward0] + 140210812025440 -> 140210812025248 + 140210812025440 [label=NativeDropoutBackward0] + 140210812025584 -> 140210812025440 + 140210812025584 [label=ViewBackward0] + 140210812025680 -> 140210812025584 + 140210812025680 [label=AddmmBackward0] + 140210812025776 -> 140210812025680 + 140210812025776 [label=ToCopyBackward0] + 140210812025968 -> 140210812025776 + 140202229067840 [label="encoder.layer.0.experts.dense2.bias + (768)" fillcolor=lightblue] + 140202229067840 -> 140210812025968 + 140210812025968 [label=AccumulateGrad] + 140210812025728 -> 140210812025680 + 140210812025728 [label=ViewBackward0] + 140210812026016 -> 140210812025728 + 140210812026016 [label=GeluBackward0] + 140210812026112 -> 140210812026016 + 140210812026112 [label=ViewBackward0] + 140210812026208 -> 140210812026112 + 140210812026208 [label=AddmmBackward0] + 140210812026304 -> 140210812026208 + 140210812026304 [label=ToCopyBackward0] + 140210812026496 -> 140210812026304 + 140202229068080 [label="encoder.layer.0.experts.dense1.bias + (3072)" fillcolor=lightblue] + 140202229068080 -> 140210812026496 + 140210812026496 [label=AccumulateGrad] + 140210812026256 -> 140210812026208 + 140210812026256 [label=ViewBackward0] + 140210812026544 -> 140210812026256 + 140210812026544 [label=ToCopyBackward0] + 140210812025392 -> 140210812026544 + 140210812025392 [label=SliceBackward0] + 140210812026688 -> 140210812025392 + 140210812026688 [label=SliceBackward0] + 140210812026784 -> 140210812026688 + 140210812026784 [label=NativeLayerNormBackward0] + 140210812026832 -> 140210812026784 + 140210812026832 [label=AddBackward0] + 140210812039424 -> 140210812026832 + 140210812039424 [label=NativeDropoutBackward0] + 140210812039568 -> 140210812039424 + 140210812039568 [label=ViewBackward0] + 140210812039664 -> 140210812039568 + 140210812039664 [label=AddmmBackward0] + 140210812039760 -> 140210812039664 + 140210812039760 [label=ToCopyBackward0] + 140210812039952 -> 140210812039760 + 140202229070000 [label="encoder.layer.0.crossattention.output.dense.bias + (768)" fillcolor=lightblue] + 140202229070000 -> 140210812039952 + 140210812039952 [label=AccumulateGrad] + 140210812039712 -> 140210812039664 + 140210812039712 [label=ViewBackward0] + 140210812040000 -> 140210812039712 + 140210812040000 [label=ViewBackward0] + 140210812040096 -> 140210812040000 + 140210812040096 [label=CloneBackward0] + 140210812040192 -> 140210812040096 + 140210812040192 [label=PermuteBackward0] + 140210812040288 -> 140210812040192 + 140210812040288 [label=UnsafeViewBackward0] + 140210812040384 -> 140210812040288 + 140210812040384 [label=BmmBackward0] + 140210812040480 -> 140210812040384 + 140210812040480 [label=ReshapeAliasBackward0] + 140210812040624 -> 140210812040480 + 140210812040624 [label=ExpandBackward0] + 140210812040720 -> 140210812040624 + 140210812040720 [label=ToCopyBackward0] + 140210812040816 -> 140210812040720 + 140210812040816 [label=NativeDropoutBackward0] + 140210812040912 -> 140210812040816 + 140210812040912 
[label=SoftmaxBackward0] + 140210812041008 -> 140210812040912 + 140210812041008 [label=AddBackward0] + 140210812041104 -> 140210812041008 + 140210812041104 [label=DivBackward0] + 140210812041200 -> 140210812041104 + 140210812041200 [label=UnsafeViewBackward0] + 140210812041296 -> 140210812041200 + 140210812041296 [label=BmmBackward0] + 140210812041392 -> 140210812041296 + 140210812041392 [label=ReshapeAliasBackward0] + 140210812041536 -> 140210812041392 + 140210812041536 [label=ExpandBackward0] + 140210812041632 -> 140210812041536 + 140210812041632 [label=PermuteBackward0] + 140210812041728 -> 140210812041632 + 140210812041728 [label=ViewBackward0] + 140210812041824 -> 140210812041728 + 140210812041824 [label=ViewBackward0] + 140210812041920 -> 140210812041824 + 140210812041920 [label=AddmmBackward0] + 140210812042016 -> 140210812041920 + 140210812042016 [label=ToCopyBackward0] + 140210812042208 -> 140210812042016 + 140202229070720 [label="encoder.layer.0.crossattention.self.query.bias + (768)" fillcolor=lightblue] + 140202229070720 -> 140210812042208 + 140210812042208 [label=AccumulateGrad] + 140210812041968 -> 140210812041920 + 140210812041968 [label=ViewBackward0] + 140210812042256 -> 140210812041968 + 140210812042256 [label=ToCopyBackward0] + 140210812039376 -> 140210812042256 + 140210812039376 [label=SliceBackward0] + 140210812042400 -> 140210812039376 + 140210812042400 [label=SliceBackward0] + 140210812042496 -> 140210812042400 + 140210812042496 [label=SliceBackward0] + 140210812042592 -> 140210812042496 + 140210812042592 [label=NativeLayerNormBackward0] + 140210812042688 -> 140210812042592 + 140210812042688 [label=AddBackward0] + 140210812042880 -> 140210812042688 + 140210812042880 [label=NativeDropoutBackward0] + 140210812043024 -> 140210812042880 + 140210812043024 [label=ViewBackward0] + 140210812043120 -> 140210812043024 + 140210812043120 [label=AddmmBackward0] + 140210812043216 -> 140210812043120 + 140210812043216 [label=ToCopyBackward0] + 140210812051664 -> 140210812043216 + 140202229071200 [label="encoder.layer.0.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140202229071200 -> 140210812051664 + 140210812051664 [label=AccumulateGrad] + 140210812043168 -> 140210812043120 + 140210812043168 [label=ViewBackward0] + 140210812051712 -> 140210812043168 + 140210812051712 [label=ViewBackward0] + 140210812051808 -> 140210812051712 + 140210812051808 [label=CloneBackward0] + 140210812051904 -> 140210812051808 + 140210812051904 [label=PermuteBackward0] + 140210812052000 -> 140210812051904 + 140210812052000 [label=UnsafeViewBackward0] + 140210812052096 -> 140210812052000 + 140210812052096 [label=BmmBackward0] + 140210812052192 -> 140210812052096 + 140210812052192 [label=ReshapeAliasBackward0] + 140210812052336 -> 140210812052192 + 140210812052336 [label=ExpandBackward0] + 140210812052432 -> 140210812052336 + 140210812052432 [label=ToCopyBackward0] + 140210812052528 -> 140210812052432 + 140210812052528 [label=NativeDropoutBackward0] + 140210812052624 -> 140210812052528 + 140210812052624 [label=SoftmaxBackward0] + 140210812052720 -> 140210812052624 + 140210812052720 [label=AddBackward0] + 140210812052816 -> 140210812052720 + 140210812052816 [label=DivBackward0] + 140210812052912 -> 140210812052816 + 140210812052912 [label=UnsafeViewBackward0] + 140210812053008 -> 140210812052912 + 140210812053008 [label=BmmBackward0] + 140210812053104 -> 140210812053008 + 140210812053104 [label=ReshapeAliasBackward0] + 140210812053248 -> 140210812053104 + 140210812053248 [label=ExpandBackward0] + 
140210812053344 -> 140210812053248 + 140210812053344 [label=PermuteBackward0] + 140210812053440 -> 140210812053344 + 140210812053440 [label=ViewBackward0] + 140210812053536 -> 140210812053440 + 140210812053536 [label=ViewBackward0] + 140210812053632 -> 140210812053536 + 140210812053632 [label=AddmmBackward0] + 140210812053728 -> 140210812053632 + 140210812053728 [label=ToCopyBackward0] + 140210812053920 -> 140210812053728 + 140202228734688 [label="encoder.layer.0.attention.self.query.bias + (768)" fillcolor=lightblue] + 140202228734688 -> 140210812053920 + 140210812053920 [label=AccumulateGrad] + 140210812053680 -> 140210812053632 + 140210812053680 [label=ViewBackward0] + 140210812053968 -> 140210812053680 + 140210812053968 [label=ToCopyBackward0] + 140210812042832 -> 140210812053968 + 140210812042832 [label=NativeDropoutBackward0] + 140210812054112 -> 140210812042832 + 140210812054112 [label=NativeLayerNormBackward0] + 140210812054208 -> 140210812054112 + 140210812054208 [label=CatBackward0] + 140210812054400 -> 140210812054208 + 140210812054400 [label=ExpandBackward0] + 140210812054544 -> 140210812054400 + 140202228561216 [label=" + (1, 32, 768)" fillcolor=lightblue] + 140202228561216 -> 140210812054544 + 140210812054544 [label=AccumulateGrad] + 140210812054352 -> 140210812054208 + 140210812054352 [label=AddBackward0] + 140210812054592 -> 140210812054352 + 140210812054592 [label=EmbeddingBackward0] + 140210812054736 -> 140210812054592 + 140202228561776 [label="embeddings.word_embeddings.weight + (30523, 768)" fillcolor=lightblue] + 140202228561776 -> 140210812054736 + 140210812054736 [label=AccumulateGrad] + 140210812054640 -> 140210812054352 + 140210812054640 [label=EmbeddingBackward0] + 140210812054784 -> 140210812054640 + 140202228735888 [label="embeddings.position_embeddings.weight + (512, 768)" fillcolor=lightblue] + 140202228735888 -> 140210812054784 + 140210812054784 [label=AccumulateGrad] + 140210812054160 -> 140210812054112 + 140202228560576 [label="embeddings.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228560576 -> 140210812054160 + 140210812054160 [label=AccumulateGrad] + 140210812053824 -> 140210812054112 + 140202228560336 [label="embeddings.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228560336 -> 140210812053824 + 140210812053824 [label=AccumulateGrad] + 140210812053152 -> 140210812053632 + 140210812053152 [label=TBackward0] + 140210812053872 -> 140210812053152 + 140210812053872 [label=ToCopyBackward0] + 140210812054304 -> 140210812053872 + 140202228560096 [label="encoder.layer.0.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202228560096 -> 140210812054304 + 140210812054304 [label=AccumulateGrad] + 140210812053056 -> 140210812053008 + 140210812053056 [label=ReshapeAliasBackward0] + 140210812053392 -> 140210812053056 + 140210812053392 [label=ExpandBackward0] + 140210812053584 -> 140210812053392 + 140210812053584 [label=TransposeBackward0] + 140210812054064 -> 140210812053584 + 140210812054064 [label=PermuteBackward0] + 140210812054832 -> 140210812054064 + 140210812054832 [label=ViewBackward0] + 140210812054016 -> 140210812054832 + 140210812054016 [label=ViewBackward0] + 140210812054448 -> 140210812054016 + 140210812054448 [label=AddmmBackward0] + 140210812054928 -> 140210812054448 + 140210812054928 [label=ToCopyBackward0] + 140210812055120 -> 140210812054928 + 140202229071680 [label="encoder.layer.0.attention.self.key.bias + (768)" fillcolor=lightblue] + 140202229071680 -> 140210812055120 + 140210812055120 [label=AccumulateGrad] + 
140210812054688 -> 140210812054448 + 140210812054688 [label=ViewBackward0] + 140210812055168 -> 140210812054688 + 140210812055168 [label=ToCopyBackward0] + 140210812042832 -> 140210812055168 + 140210812053200 -> 140210812054448 + 140210812053200 [label=TBackward0] + 140210812055024 -> 140210812053200 + 140210812055024 [label=ToCopyBackward0] + 140210812055312 -> 140210812055024 + 140202228734048 [label="encoder.layer.0.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140202228734048 -> 140210812055312 + 140210812055312 [label=AccumulateGrad] + 140210812052144 -> 140210812052096 + 140210812052144 [label=ReshapeAliasBackward0] + 140210812052480 -> 140210812052144 + 140210812052480 [label=ExpandBackward0] + 140210812052672 -> 140210812052480 + 140210812052672 [label=PermuteBackward0] + 140210812052864 -> 140210812052672 + 140210812052864 [label=ViewBackward0] + 140210812052240 -> 140210812052864 + 140210812052240 [label=ViewBackward0] + 140210812053488 -> 140210812052240 + 140210812053488 [label=AddmmBackward0] + 140210812054256 -> 140210812053488 + 140210812054256 [label=ToCopyBackward0] + 140210812055264 -> 140210812054256 + 140202229071440 [label="encoder.layer.0.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202229071440 -> 140210812055264 + 140210812055264 [label=AccumulateGrad] + 140210812053776 -> 140210812053488 + 140210812053776 [label=ViewBackward0] + 140210812055072 -> 140210812053776 + 140210812055072 [label=ToCopyBackward0] + 140210812042832 -> 140210812055072 + 140210812052288 -> 140210812053488 + 140210812052288 [label=TBackward0] + 140210812054880 -> 140210812052288 + 140210812054880 [label=ToCopyBackward0] + 140210812055216 -> 140210812054880 + 140202229071760 [label="encoder.layer.0.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202229071760 -> 140210812055216 + 140210812055216 [label=AccumulateGrad] + 140210812042928 -> 140210812043120 + 140210812042928 [label=TBackward0] + 140210812051856 -> 140210812042928 + 140210812051856 [label=ToCopyBackward0] + 140210812052048 -> 140210812051856 + 140202229071520 [label="encoder.layer.0.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202229071520 -> 140210812052048 + 140210812052048 [label=AccumulateGrad] + 140210812042832 -> 140210812042688 + 140210812042640 -> 140210812042592 + 140202229071280 [label="encoder.layer.0.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229071280 -> 140210812042640 + 140210812042640 [label=AccumulateGrad] + 140210812042112 -> 140210812042592 + 140202229070960 [label="encoder.layer.0.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229070960 -> 140210812042112 + 140210812042112 [label=AccumulateGrad] + 140210812041440 -> 140210812041920 + 140210812041440 [label=TBackward0] + 140210812042160 -> 140210812041440 + 140210812042160 [label=ToCopyBackward0] + 140210812042544 -> 140210812042160 + 140202229071040 [label="encoder.layer.0.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202229071040 -> 140210812042544 + 140210812042544 [label=AccumulateGrad] + 140210812041344 -> 140210812041296 + 140210812041344 [label=ReshapeAliasBackward0] + 140210812041680 -> 140210812041344 + 140210812041680 [label=ExpandBackward0] + 140210812041872 -> 140210812041680 + 140210812041872 [label=TransposeBackward0] + 140210812042352 -> 140210812041872 + 140210812042352 [label=PermuteBackward0] + 140210812042784 -> 140210812042352 + 140210812042784 [label=ViewBackward0] + 140210812042304 -> 
140210812042784 + 140210812042304 [label=ViewBackward0] + 140210812043072 -> 140210812042304 + 140210812043072 [label=AddmmBackward0] + 140210812041488 -> 140210812043072 + 140210812041488 [label=ToCopyBackward0] + 140210812051760 -> 140210812041488 + 140202229070480 [label="encoder.layer.0.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140202229070480 -> 140210812051760 + 140210812051760 [label=AccumulateGrad] + 140210812051568 -> 140210812043072 + 140210812051568 [label=ViewBackward0] + 140210812052576 -> 140210812051568 + 140210812052576 [label=ToCopyBackward0] + 140210812052960 -> 140210812052576 + 140210812052960 [label=NativeLayerNormBackward0] + 140210812054496 -> 140210812052960 + 140202228735248 [label=" + (1408)" fillcolor=lightblue] + 140202228735248 -> 140210812054496 + 140210812054496 [label=AccumulateGrad] + 140210812053296 -> 140210812052960 + 140202228735488 [label=" + (1408)" fillcolor=lightblue] + 140202228735488 -> 140210812053296 + 140210812053296 [label=AccumulateGrad] + 140210812051520 -> 140210812043072 + 140210812051520 [label=TBackward0] + 140210812051616 -> 140210812051520 + 140210812051616 [label=ToCopyBackward0] + 140210812054976 -> 140210812051616 + 140202229070800 [label="encoder.layer.0.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140202229070800 -> 140210812054976 + 140210812054976 [label=AccumulateGrad] + 140210812040432 -> 140210812040384 + 140210812040432 [label=ReshapeAliasBackward0] + 140210812040768 -> 140210812040432 + 140210812040768 [label=ExpandBackward0] + 140210812040960 -> 140210812040768 + 140210812040960 [label=PermuteBackward0] + 140210812041152 -> 140210812040960 + 140210812041152 [label=ViewBackward0] + 140210812040528 -> 140210812041152 + 140210812040528 [label=ViewBackward0] + 140210812041776 -> 140210812040528 + 140210812041776 [label=AddmmBackward0] + 140210812042448 -> 140210812041776 + 140210812042448 [label=ToCopyBackward0] + 140210812042976 -> 140210812042448 + 140202229070240 [label="encoder.layer.0.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140202229070240 -> 140210812042976 + 140210812042976 [label=AccumulateGrad] + 140210812042064 -> 140210812041776 + 140210812042064 [label=ViewBackward0] + 140210812055360 -> 140210812042064 + 140210812055360 [label=ToCopyBackward0] + 140210812052960 -> 140210812055360 + 140210812040576 -> 140210812041776 + 140210812040576 [label=TBackward0] + 140210812051952 -> 140210812040576 + 140210812051952 [label=ToCopyBackward0] + 140210812052768 -> 140210812051952 + 140202229070560 [label="encoder.layer.0.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140202229070560 -> 140210812052768 + 140210812052768 [label=AccumulateGrad] + 140210812039472 -> 140210812039664 + 140210812039472 [label=TBackward0] + 140210812040144 -> 140210812039472 + 140210812040144 [label=ToCopyBackward0] + 140210812040336 -> 140210812040144 + 140202229070320 [label="encoder.layer.0.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202229070320 -> 140210812040336 + 140210812040336 [label=AccumulateGrad] + 140210812039376 -> 140210812026832 + 140210812026400 -> 140210812026784 + 140202229070080 [label="encoder.layer.0.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229070080 -> 140210812026400 + 140210812026400 [label=AccumulateGrad] + 140210812039232 -> 140210812026784 + 140202229069760 [label="encoder.layer.0.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229069760 -> 
140210812039232 + 140210812039232 [label=AccumulateGrad] + 140210812025920 -> 140210812026208 + 140210812025920 [label=TBackward0] + 140210812026448 -> 140210812025920 + 140210812026448 [label=ToCopyBackward0] + 140210812026736 -> 140210812026448 + 140202229068400 [label="encoder.layer.0.experts.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202229068400 -> 140210812026736 + 140210812026736 [label=AccumulateGrad] + 140210812025488 -> 140210812025680 + 140210812025488 [label=TBackward0] + 140210812026160 -> 140210812025488 + 140210812026160 [label=ToCopyBackward0] + 140210812026640 -> 140210812026160 + 140202229068160 [label="encoder.layer.0.experts.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202229068160 -> 140210812026640 + 140210812026640 [label=AccumulateGrad] + 140210812025392 -> 140210812025248 + 140210812025200 -> 140210812025104 + 140202229067920 [label="encoder.layer.0.expert_ln.weight + (768)" fillcolor=lightblue] + 140202229067920 -> 140210812025200 + 140210812025200 [label=AccumulateGrad] + 140210812025152 -> 140210812025104 + 140202229051120 [label="encoder.layer.0.expert_ln.bias + (768)" fillcolor=lightblue] + 140202229051120 -> 140210812025152 + 140210812025152 [label=AccumulateGrad] + 140210812024864 -> 140210812009728 + 140210812024864 [label=NativeLayerNormBackward0] + 140210812025536 -> 140210812024864 + 140210812025536 [label=AddBackward0] + 140210812026352 -> 140210812025536 + 140210812026352 [label=NativeDropoutBackward0] + 140210812026064 -> 140210812026352 + 140210812026064 [label=ViewBackward0] + 140210812039280 -> 140210812026064 + 140210812039280 [label=AddmmBackward0] + 140210812039808 -> 140210812039280 + 140210812039808 [label=ToCopyBackward0] + 140210812039904 -> 140210812039808 + 140202229069280 [label="encoder.layer.0.output.dense.bias + (768)" fillcolor=lightblue] + 140202229069280 -> 140210812039904 + 140210812039904 [label=AccumulateGrad] + 140210812039616 -> 140210812039280 + 140210812039616 [label=ViewBackward0] + 140210812040048 -> 140210812039616 + 140210812040048 [label=GeluBackward0] + 140210812041056 -> 140210812040048 + 140210812041056 [label=ViewBackward0] + 140210812041584 -> 140210812041056 + 140210812041584 [label=AddmmBackward0] + 140210812042736 -> 140210812041584 + 140210812042736 [label=ToCopyBackward0] + 140210812055456 -> 140210812042736 + 140202229069520 [label="encoder.layer.0.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202229069520 -> 140210812055456 + 140210812055456 [label=AccumulateGrad] + 140210812040672 -> 140210812041584 + 140210812040672 [label=ViewBackward0] + 140210812055408 -> 140210812040672 + 140210812055408 [label=ToCopyBackward0] + 140210812025872 -> 140210812055408 + 140210812025872 [label=SliceBackward0] + 140210812092672 -> 140210812025872 + 140210812092672 [label=SliceBackward0] + 140210812092768 -> 140210812092672 + 140210812092768 [label=SliceBackward0] + 140210812042592 -> 140210812092768 + 140210812055504 -> 140210812041584 + 140210812055504 [label=TBackward0] + 140210812092576 -> 140210812055504 + 140210812092576 [label=ToCopyBackward0] + 140210812092864 -> 140210812092576 + 140202229069840 [label="encoder.layer.0.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202229069840 -> 140210812092864 + 140210812092864 [label=AccumulateGrad] + 140210812039520 -> 140210812039280 + 140210812039520 [label=TBackward0] + 140210812041248 -> 140210812039520 + 140210812041248 [label=ToCopyBackward0] + 140210812052384 -> 140210812041248 + 140202229069600 
[label="encoder.layer.0.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202229069600 -> 140210812052384 + 140210812052384 [label=AccumulateGrad] + 140210812025872 -> 140210812025536 + 140210812025344 -> 140210812024864 + 140202229069360 [label="encoder.layer.0.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229069360 -> 140210812025344 + 140210812025344 [label=AccumulateGrad] + 140210812025296 -> 140210812024864 + 140202229069040 [label="encoder.layer.0.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229069040 -> 140210812025296 + 140210812025296 [label=AccumulateGrad] + 140210812024144 -> 140210812024624 + 140210812024144 [label=TBackward0] + 140210812024816 -> 140210812024144 + 140210812024816 [label=ToCopyBackward0] + 140210812025824 -> 140210812024816 + 140202229051200 [label="encoder.layer.1.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202229051200 -> 140210812025824 + 140210812025824 [label=AccumulateGrad] + 140210812024048 -> 140210812024000 + 140210812024048 [label=ReshapeAliasBackward0] + 140210812024384 -> 140210812024048 + 140210812024384 [label=ExpandBackward0] + 140210812024576 -> 140210812024384 + 140210812024576 [label=TransposeBackward0] + 140210812025056 -> 140210812024576 + 140210812025056 [label=PermuteBackward0] + 140210812026592 -> 140210812025056 + 140210812026592 [label=ViewBackward0] + 140210812025008 -> 140210812026592 + 140210812025008 [label=ViewBackward0] + 140210812040240 -> 140210812025008 + 140210812040240 [label=AddmmBackward0] + 140210812040864 -> 140210812040240 + 140210812040864 [label=ToCopyBackward0] + 140210812092528 -> 140210812040864 + 140202229050640 [label="encoder.layer.1.attention.self.key.bias + (768)" fillcolor=lightblue] + 140202229050640 -> 140210812092528 + 140210812092528 [label=AccumulateGrad] + 140210812039328 -> 140210812040240 + 140210812039328 [label=ViewBackward0] + 140210812092912 -> 140210812039328 + 140210812092912 [label=ToCopyBackward0] + 140210812009728 -> 140210812092912 + 140210812092480 -> 140210812040240 + 140210812092480 [label=TBackward0] + 140210812092624 -> 140210812092480 + 140210812092624 [label=ToCopyBackward0] + 140210812093056 -> 140210812092624 + 140202229050960 [label="encoder.layer.1.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140202229050960 -> 140210812093056 + 140210812093056 [label=AccumulateGrad] + 140210812023136 -> 140210812023088 + 140210812023136 [label=ReshapeAliasBackward0] + 140210812023472 -> 140210812023136 + 140210812023472 [label=ExpandBackward0] + 140210812023664 -> 140210812023472 + 140210812023664 [label=PermuteBackward0] + 140210812023856 -> 140210812023664 + 140210812023856 [label=ViewBackward0] + 140210812023232 -> 140210812023856 + 140210812023232 [label=ViewBackward0] + 140210812024480 -> 140210812023232 + 140210812024480 [label=AddmmBackward0] + 140210812025632 -> 140210812024480 + 140210812025632 [label=ToCopyBackward0] + 140210812039856 -> 140210812025632 + 140202229050400 [label="encoder.layer.1.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202229050400 -> 140210812039856 + 140210812039856 [label=AccumulateGrad] + 140210812024768 -> 140210812024480 + 140210812024768 [label=ViewBackward0] + 140210812092816 -> 140210812024768 + 140210812092816 [label=ToCopyBackward0] + 140210812009728 -> 140210812092816 + 140210812023280 -> 140210812024480 + 140210812023280 [label=TBackward0] + 140210812092720 -> 140210812023280 + 140210812092720 [label=ToCopyBackward0] + 140210812092960 -> 140210812092720 + 
140202229050720 [label="encoder.layer.1.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202229050720 -> 140210812092960 + 140210812092960 [label=AccumulateGrad] + 140210812009824 -> 140210812010016 + 140210812009824 [label=TBackward0] + 140210812010208 -> 140210812009824 + 140210812010208 [label=ToCopyBackward0] + 140210812023040 -> 140210812010208 + 140202229050480 [label="encoder.layer.1.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202229050480 -> 140210812023040 + 140210812023040 [label=AccumulateGrad] + 140210812009728 -> 140210812009584 + 140210812009536 -> 140210812009488 + 140202229050240 [label="encoder.layer.1.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229050240 -> 140210812009536 + 140210812009536 [label=AccumulateGrad] + 140210812008816 -> 140210812009488 + 140202229049920 [label="encoder.layer.1.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229049920 -> 140210812008816 + 140210812008816 [label=AccumulateGrad] + 140210812008336 -> 140210812008624 + 140210812008336 [label=TBackward0] + 140210812008864 -> 140210812008336 + 140210812008864 [label=ToCopyBackward0] + 140210812009248 -> 140210812008864 + 140202229048560 [label="encoder.layer.1.experts.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202229048560 -> 140210812009248 + 140210812009248 [label=AccumulateGrad] + 140210812007904 -> 140210812008096 + 140210812007904 [label=TBackward0] + 140210812008576 -> 140210812007904 + 140210812008576 [label=ToCopyBackward0] + 140210812009056 -> 140210812008576 + 140202229048320 [label="encoder.layer.1.experts.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202229048320 -> 140210812009056 + 140210812009056 [label=AccumulateGrad] + 140210812007808 -> 140210812007664 + 140210812007616 -> 140210812007520 + 140202229048080 [label="encoder.layer.1.expert_ln.weight + (768)" fillcolor=lightblue] + 140202229048080 -> 140210812007616 + 140210812007616 [label=AccumulateGrad] + 140210812007568 -> 140210812007520 + 140202229047760 [label="encoder.layer.1.expert_ln.bias + (768)" fillcolor=lightblue] + 140202229047760 -> 140210812007568 + 140210812007568 [label=AccumulateGrad] + 140210812007280 -> 140210811996240 + 140210812007280 [label=NativeLayerNormBackward0] + 140210812007952 -> 140210812007280 + 140210812007952 [label=AddBackward0] + 140210812008768 -> 140210812007952 + 140210812008768 [label=NativeDropoutBackward0] + 140210812008480 -> 140210812008768 + 140210812008480 [label=ViewBackward0] + 140210812009008 -> 140210812008480 + 140210812009008 [label=AddmmBackward0] + 140210812009680 -> 140210812009008 + 140210812009680 [label=ToCopyBackward0] + 140210812010400 -> 140210812009680 + 140202229049440 [label="encoder.layer.1.output.dense.bias + (768)" fillcolor=lightblue] + 140202229049440 -> 140210812010400 + 140210812010400 [label=AccumulateGrad] + 140210812009632 -> 140210812009008 + 140210812009632 [label=ViewBackward0] + 140210812009968 -> 140210812009632 + 140210812009968 [label=GeluBackward0] + 140210812022848 -> 140210812009968 + 140210812022848 [label=ViewBackward0] + 140210812023568 -> 140210812022848 + 140210812023568 [label=AddmmBackward0] + 140210812023952 -> 140210812023568 + 140210812023952 [label=ToCopyBackward0] + 140210812024192 -> 140210812023952 + 140202229049680 [label="encoder.layer.1.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202229049680 -> 140210812024192 + 140210812024192 [label=AccumulateGrad] + 140210812023760 -> 140210812023568 + 
140210812023760 [label=ViewBackward0] + 140210812093248 -> 140210812023760 + 140210812093248 [label=ToCopyBackward0] + 140210812008288 -> 140210812093248 + 140210812008288 [label=SliceBackward0] + 140210812093296 -> 140210812008288 + 140210812093296 [label=SliceBackward0] + 140210812093392 -> 140210812093296 + 140210812093392 [label=SliceBackward0] + 140210812009488 -> 140210812093392 + 140210812023376 -> 140210812023568 + 140210812023376 [label=TBackward0] + 140210812093008 -> 140210812023376 + 140210812093008 [label=ToCopyBackward0] + 140210812093488 -> 140210812093008 + 140202229050000 [label="encoder.layer.1.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202229050000 -> 140210812093488 + 140210812093488 [label=AccumulateGrad] + 140210812009440 -> 140210812009008 + 140210812009440 [label=TBackward0] + 140210812010160 -> 140210812009440 + 140210812010160 [label=ToCopyBackward0] + 140210812024288 -> 140210812010160 + 140202229049760 [label="encoder.layer.1.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202229049760 -> 140210812024288 + 140210812024288 [label=AccumulateGrad] + 140210812008288 -> 140210812007952 + 140210812007760 -> 140210812007280 + 140202229049520 [label="encoder.layer.1.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229049520 -> 140210812007760 + 140210812007760 [label=AccumulateGrad] + 140210812007712 -> 140210812007280 + 140202229049200 [label="encoder.layer.1.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229049200 -> 140210812007712 + 140210812007712 [label=AccumulateGrad] + 140210812006560 -> 140210812007040 + 140210812006560 [label=TBackward0] + 140210812007232 -> 140210812006560 + 140210812007232 [label=ToCopyBackward0] + 140210812008240 -> 140210812007232 + 140202229047840 [label="encoder.layer.2.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202229047840 -> 140210812008240 + 140210812008240 [label=AccumulateGrad] + 140210812006464 -> 140210811998160 + 140210812006464 [label=ReshapeAliasBackward0] + 140210812006800 -> 140210812006464 + 140210812006800 [label=ExpandBackward0] + 140210812006992 -> 140210812006800 + 140210812006992 [label=TransposeBackward0] + 140210812007472 -> 140210812006992 + 140210812007472 [label=PermuteBackward0] + 140210812009344 -> 140210812007472 + 140210812009344 [label=ViewBackward0] + 140210812007424 -> 140210812009344 + 140210812007424 [label=ViewBackward0] + 140210812009872 -> 140210812007424 + 140210812009872 [label=AddmmBackward0] + 140210812022944 -> 140210812009872 + 140210812022944 [label=ToCopyBackward0] + 140210812093200 -> 140210812022944 + 140202229047360 [label="encoder.layer.2.attention.self.key.bias + (768)" fillcolor=lightblue] + 140202229047360 -> 140210812093200 + 140210812093200 [label=AccumulateGrad] + 140210812022896 -> 140210812009872 + 140210812022896 [label=ViewBackward0] + 140210812093536 -> 140210812022896 + 140210812093536 [label=ToCopyBackward0] + 140210811996240 -> 140210812093536 + 140210812093104 -> 140210812009872 + 140210812093104 [label=TBackward0] + 140210812093152 -> 140210812093104 + 140210812093152 [label=ToCopyBackward0] + 140210812093680 -> 140210812093152 + 140202229047600 [label="encoder.layer.2.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140202229047600 -> 140210812093680 + 140210812093680 [label=AccumulateGrad] + 140210811997296 -> 140210811997248 + 140210811997296 [label=ReshapeAliasBackward0] + 140210811997632 -> 140210811997296 + 140210811997632 [label=ExpandBackward0] + 140210811997824 -> 
140210811997632 + 140210811997824 [label=PermuteBackward0] + 140210811998016 -> 140210811997824 + 140210811998016 [label=ViewBackward0] + 140210811998112 -> 140210811998016 + 140210811998112 [label=ViewBackward0] + 140210812006896 -> 140210811998112 + 140210812006896 [label=AddmmBackward0] + 140210812008048 -> 140210812006896 + 140210812008048 [label=ToCopyBackward0] + 140210812006608 -> 140210812008048 + 140202229042848 [label="encoder.layer.2.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202229042848 -> 140210812006608 + 140210812006608 [label=AccumulateGrad] + 140210812007184 -> 140210812006896 + 140210812007184 [label=ViewBackward0] + 140210812093440 -> 140210812007184 + 140210812093440 [label=ToCopyBackward0] + 140210811996240 -> 140210812093440 + 140210812006512 -> 140210812006896 + 140210812006512 [label=TBackward0] + 140210812093344 -> 140210812006512 + 140210812093344 [label=ToCopyBackward0] + 140210812093584 -> 140210812093344 + 140202229043088 [label="encoder.layer.2.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202229043088 -> 140210812093584 + 140210812093584 [label=AccumulateGrad] + 140210811996336 -> 140210811996528 + 140210811996336 [label=TBackward0] + 140210811997008 -> 140210811996336 + 140210811997008 [label=ToCopyBackward0] + 140210811997200 -> 140210811997008 + 140202229042928 [label="encoder.layer.2.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202229042928 -> 140210811997200 + 140210811997200 [label=AccumulateGrad] + 140210811996240 -> 140210811996096 + 140210811996048 -> 140210811996000 + 140202229042688 [label="encoder.layer.2.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229042688 -> 140210811996048 + 140210811996048 [label=AccumulateGrad] + 140210811995520 -> 140210811996000 + 140202229042368 [label="encoder.layer.2.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229042368 -> 140210811995520 + 140210811995520 [label=AccumulateGrad] + 140210811994848 -> 140210811995328 + 140210811994848 [label=TBackward0] + 140210811995568 -> 140210811994848 + 140210811995568 [label=ToCopyBackward0] + 140210811995952 -> 140210811995568 + 140202229042448 [label="encoder.layer.2.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202229042448 -> 140210811995952 + 140210811995952 [label=AccumulateGrad] + 140210811994752 -> 140210811994704 + 140210811994752 [label=ReshapeAliasBackward0] + 140210811995088 -> 140210811994752 + 140210811995088 [label=ExpandBackward0] + 140210811995280 -> 140210811995088 + 140210811995280 [label=TransposeBackward0] + 140210811995760 -> 140210811995280 + 140210811995760 [label=PermuteBackward0] + 140210811996192 -> 140210811995760 + 140210811996192 [label=ViewBackward0] + 140210811995712 -> 140210811996192 + 140210811995712 [label=ViewBackward0] + 140210811996480 -> 140210811995712 + 140210811996480 [label=AddmmBackward0] + 140210811996720 -> 140210811996480 + 140210811996720 [label=ToCopyBackward0] + 140210811996912 -> 140210811996720 + 140202229041888 [label="encoder.layer.2.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140202229041888 -> 140210811996912 + 140210811996912 [label=AccumulateGrad] + 140210811996672 -> 140210811996480 + 140210811996672 [label=ViewBackward0] + 140210811997728 -> 140210811996672 + 140210811997728 [label=ToCopyBackward0] + 140210812052960 -> 140210811997728 + 140210811994896 -> 140210811996480 + 140210811994896 [label=TBackward0] + 140210811997536 -> 140210811994896 + 140210811997536 
[label=ToCopyBackward0] + 140210811996768 -> 140210811997536 + 140202229042208 [label="encoder.layer.2.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140202229042208 -> 140210811996768 + 140210811996768 [label=AccumulateGrad] + 140210811977392 -> 140210811977344 + 140210811977392 [label=ReshapeAliasBackward0] + 140210811977632 -> 140210811977392 + 140210811977632 [label=ExpandBackward0] + 140210811994368 -> 140210811977632 + 140210811994368 [label=PermuteBackward0] + 140210811994560 -> 140210811994368 + 140210811994560 [label=ViewBackward0] + 140210811994176 -> 140210811994560 + 140210811994176 [label=ViewBackward0] + 140210811995184 -> 140210811994176 + 140210811995184 [label=AddmmBackward0] + 140210811995856 -> 140210811995184 + 140210811995856 [label=ToCopyBackward0] + 140210811997440 -> 140210811995856 + 140202229041648 [label="encoder.layer.2.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140202229041648 -> 140210811997440 + 140210811997440 [label=AccumulateGrad] + 140210811995472 -> 140210811995184 + 140210811995472 [label=ViewBackward0] + 140210811996384 -> 140210811995472 + 140210811996384 [label=ToCopyBackward0] + 140210812052960 -> 140210811996384 + 140210811994224 -> 140210811995184 + 140210811994224 [label=TBackward0] + 140210812009152 -> 140210811994224 + 140210812009152 [label=ToCopyBackward0] + 140210811997104 -> 140210812009152 + 140202229041968 [label="encoder.layer.2.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140202229041968 -> 140210811997104 + 140210811997104 [label=AccumulateGrad] + 140210811976432 -> 140210811976624 + 140210811976432 [label=TBackward0] + 140210811977104 -> 140210811976432 + 140210811977104 [label=ToCopyBackward0] + 140210811977296 -> 140210811977104 + 140202229041728 [label="encoder.layer.2.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202229041728 -> 140210811977296 + 140210811977296 [label=AccumulateGrad] + 140210811976336 -> 140210811976192 + 140210811976144 -> 140210811976096 + 140202229041488 [label="encoder.layer.2.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229041488 -> 140210811976144 + 140210811976144 [label=AccumulateGrad] + 140210811975712 -> 140210811976096 + 140202229041168 [label="encoder.layer.2.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229041168 -> 140210811975712 + 140210811975712 [label=AccumulateGrad] + 140210811975232 -> 140210811975520 + 140210811975232 [label=TBackward0] + 140210811975760 -> 140210811975232 + 140210811975760 [label=ToCopyBackward0] + 140210811976240 -> 140210811975760 + 140202229039808 [label="encoder.layer.2.experts.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202229039808 -> 140210811976240 + 140210811976240 [label=AccumulateGrad] + 140210811974800 -> 140210811974992 + 140210811974800 [label=TBackward0] + 140210811975472 -> 140210811974800 + 140210811975472 [label=ToCopyBackward0] + 140210811975952 -> 140210811975472 + 140202229039568 [label="encoder.layer.2.experts.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202229039568 -> 140210811975952 + 140210811975952 [label=AccumulateGrad] + 140210811974704 -> 140210811974560 + 140210811974512 -> 140210811974416 + 140202229039328 [label="encoder.layer.2.expert_ln.weight + (768)" fillcolor=lightblue] + 140202229039328 -> 140210811974512 + 140210811974512 [label=AccumulateGrad] + 140210811974464 -> 140210811974416 + 140202229026624 [label="encoder.layer.2.expert_ln.bias + (768)" 
fillcolor=lightblue] + 140202229026624 -> 140210811974464 + 140210811974464 [label=AccumulateGrad] + 140210811974176 -> 140210811959040 + 140210811974176 [label=NativeLayerNormBackward0] + 140210811974848 -> 140210811974176 + 140210811974848 [label=AddBackward0] + 140210811975664 -> 140210811974848 + 140210811975664 [label=NativeDropoutBackward0] + 140210811975376 -> 140210811975664 + 140210811975376 [label=ViewBackward0] + 140210811975904 -> 140210811975376 + 140210811975904 [label=AddmmBackward0] + 140210811976768 -> 140210811975904 + 140210811976768 [label=ToCopyBackward0] + 140210811976864 -> 140210811976768 + 140202229040688 [label="encoder.layer.2.output.dense.bias + (768)" fillcolor=lightblue] + 140202229040688 -> 140210811976864 + 140210811976864 [label=AccumulateGrad] + 140210811976576 -> 140210811975904 + 140210811976576 [label=ViewBackward0] + 140210812006704 -> 140210811976576 + 140210812006704 [label=GeluBackward0] + 140210811977200 -> 140210812006704 + 140210811977200 [label=ViewBackward0] + 140210811994656 -> 140210811977200 + 140210811994656 [label=AddmmBackward0] + 140210811996144 -> 140210811994656 + 140210811996144 [label=ToCopyBackward0] + 140210812093728 -> 140210811996144 + 140202229040928 [label="encoder.layer.2.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202229040928 -> 140210812093728 + 140210812093728 [label=AccumulateGrad] + 140210811994992 -> 140210811994656 + 140210811994992 [label=ViewBackward0] + 140210812093824 -> 140210811994992 + 140210812093824 [label=ToCopyBackward0] + 140210811975184 -> 140210812093824 + 140210811975184 [label=SliceBackward0] + 140210812093968 -> 140210811975184 + 140210812093968 [label=SliceBackward0] + 140210812094064 -> 140210812093968 + 140210812094064 [label=SliceBackward0] + 140210811996000 -> 140210812094064 + 140210811994272 -> 140210811994656 + 140210811994272 [label=TBackward0] + 140210812093632 -> 140210811994272 + 140210812093632 [label=ToCopyBackward0] + 140210812094160 -> 140210812093632 + 140202229041248 [label="encoder.layer.2.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202229041248 -> 140210812094160 + 140210812094160 [label=AccumulateGrad] + 140210811976480 -> 140210811975904 + 140210811976480 [label=TBackward0] + 140210811977536 -> 140210811976480 + 140210811977536 [label=ToCopyBackward0] + 140210811997920 -> 140210811977536 + 140202229041008 [label="encoder.layer.2.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202229041008 -> 140210811997920 + 140210811997920 [label=AccumulateGrad] + 140210811975184 -> 140210811974848 + 140210811974656 -> 140210811974176 + 140202229040768 [label="encoder.layer.2.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229040768 -> 140210811974656 + 140210811974656 [label=AccumulateGrad] + 140210811974608 -> 140210811974176 + 140202229040448 [label="encoder.layer.2.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229040448 -> 140210811974608 + 140210811974608 [label=AccumulateGrad] + 140210811973696 -> 140210811973936 + 140210811973696 [label=TBackward0] + 140210811974128 -> 140210811973696 + 140210811974128 [label=ToCopyBackward0] + 140210811975136 -> 140210811974128 + 140202229026704 [label="encoder.layer.3.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202229026704 -> 140210811975136 + 140210811975136 [label=AccumulateGrad] + 140210811961008 -> 140210811960960 + 140210811961008 [label=ReshapeAliasBackward0] + 140210811961248 -> 140210811961008 + 140210811961248 [label=ExpandBackward0] + 
140210811973888 -> 140210811961248 + 140210811973888 [label=TransposeBackward0] + 140210811974368 -> 140210811973888 + 140210811974368 [label=PermuteBackward0] + 140210811976288 -> 140210811974368 + 140210811976288 [label=ViewBackward0] + 140210811974320 -> 140210811976288 + 140210811974320 [label=ViewBackward0] + 140210811977008 -> 140210811974320 + 140210811977008 [label=AddmmBackward0] + 140210811994464 -> 140210811977008 + 140210811994464 [label=ToCopyBackward0] + 140210812093776 -> 140210811994464 + 140202229026144 [label="encoder.layer.3.attention.self.key.bias + (768)" fillcolor=lightblue] + 140202229026144 -> 140210812093776 + 140210812093776 [label=AccumulateGrad] + 140210811973744 -> 140210811977008 + 140210811973744 [label=ViewBackward0] + 140210812094208 -> 140210811973744 + 140210812094208 [label=ToCopyBackward0] + 140210811959040 -> 140210812094208 + 140210812093872 -> 140210811977008 + 140210812093872 [label=TBackward0] + 140210812093920 -> 140210812093872 + 140210812093920 [label=ToCopyBackward0] + 140210812094352 -> 140210812093920 + 140202229026464 [label="encoder.layer.3.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140202229026464 -> 140210812094352 + 140210812094352 [label=AccumulateGrad] + 140210811960096 -> 140210811960048 + 140210811960096 [label=ReshapeAliasBackward0] + 140210811960432 -> 140210811960096 + 140210811960432 [label=ExpandBackward0] + 140210811960624 -> 140210811960432 + 140210811960624 [label=PermuteBackward0] + 140210811960816 -> 140210811960624 + 140210811960816 [label=ViewBackward0] + 140210811960192 -> 140210811960816 + 140210811960192 [label=ViewBackward0] + 140210811961152 -> 140210811960192 + 140210811961152 [label=AddmmBackward0] + 140210811974944 -> 140210811961152 + 140210811974944 [label=ToCopyBackward0] + 140210811976816 -> 140210811974944 + 140202229025904 [label="encoder.layer.3.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202229025904 -> 140210811976816 + 140210811976816 [label=AccumulateGrad] + 140210811974080 -> 140210811961152 + 140210811974080 [label=ViewBackward0] + 140210812094112 -> 140210811974080 + 140210812094112 [label=ToCopyBackward0] + 140210811959040 -> 140210812094112 + 140210811973792 -> 140210811961152 + 140210811973792 [label=TBackward0] + 140210812094016 -> 140210811973792 + 140210812094016 [label=ToCopyBackward0] + 140210812094256 -> 140210812094016 + 140202229026224 [label="encoder.layer.3.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202229026224 -> 140210812094256 + 140210812094256 [label=AccumulateGrad] + 140210811959136 -> 140210811959328 + 140210811959136 [label=TBackward0] + 140210811959808 -> 140210811959136 + 140210811959808 [label=ToCopyBackward0] + 140210811960000 -> 140210811959808 + 140202229025984 [label="encoder.layer.3.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202229025984 -> 140210811960000 + 140210811960000 [label=AccumulateGrad] + 140210811959040 -> 140210811958896 + 140210811958848 -> 140210811958800 + 140202229025744 [label="encoder.layer.3.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229025744 -> 140210811958848 + 140210811958848 [label=AccumulateGrad] + 140210811958128 -> 140210811958800 + 140202229025424 [label="encoder.layer.3.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229025424 -> 140210811958128 + 140210811958128 [label=AccumulateGrad] + 140210811957648 -> 140210811957936 + 140210811957648 [label=TBackward0] + 140210811958176 -> 140210811957648 + 
140210811958176 [label=ToCopyBackward0] + 140210811958560 -> 140210811958176 + 140202229024064 [label="encoder.layer.3.experts.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202229024064 -> 140210811958560 + 140210811958560 [label=AccumulateGrad] + 140210811957312 -> 140210811957408 + 140210811957312 [label=TBackward0] + 140210811957888 -> 140210811957312 + 140210811957888 [label=ToCopyBackward0] + 140210811958368 -> 140210811957888 + 140202229023824 [label="encoder.layer.3.experts.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202229023824 -> 140210811958368 + 140210811958368 [label=AccumulateGrad] + 140210811944768 -> 140210811944624 + 140210811944576 -> 140210811944480 + 140202229023584 [label="encoder.layer.3.expert_ln.weight + (768)" fillcolor=lightblue] + 140202229023584 -> 140210811944576 + 140210811944576 [label=AccumulateGrad] + 140210811944528 -> 140210811944480 + 140202229023264 [label="encoder.layer.3.expert_ln.bias + (768)" fillcolor=lightblue] + 140202229023264 -> 140210811944528 + 140210811944528 [label=AccumulateGrad] + 140210811944240 -> 140210811941456 + 140210811944240 [label=NativeLayerNormBackward0] + 140210811944864 -> 140210811944240 + 140210811944864 [label=AddBackward0] + 140210811958080 -> 140210811944864 + 140210811958080 [label=NativeDropoutBackward0] + 140210811957792 -> 140210811958080 + 140210811957792 [label=ViewBackward0] + 140210811958320 -> 140210811957792 + 140210811958320 [label=AddmmBackward0] + 140210811958992 -> 140210811958320 + 140210811958992 [label=ToCopyBackward0] + 140210811959520 -> 140210811958992 + 140202229024944 [label="encoder.layer.3.output.dense.bias + (768)" fillcolor=lightblue] + 140202229024944 -> 140210811959520 + 140210811959520 [label=AccumulateGrad] + 140210811958944 -> 140210811958320 + 140210811958944 [label=ViewBackward0] + 140210811959904 -> 140210811958944 + 140210811959904 [label=GeluBackward0] + 140210811959568 -> 140210811959904 + 140210811959568 [label=ViewBackward0] + 140210811960528 -> 140210811959568 + 140210811960528 [label=AddmmBackward0] + 140210811960912 -> 140210811960528 + 140210811960912 [label=ToCopyBackward0] + 140210811976048 -> 140210811960912 + 140202229025184 [label="encoder.layer.3.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202229025184 -> 140210811976048 + 140210811976048 [label=AccumulateGrad] + 140210811960720 -> 140210811960528 + 140210811960720 [label=ViewBackward0] + 140210812094544 -> 140210811960720 + 140210812094544 [label=ToCopyBackward0] + 140210811957600 -> 140210812094544 + 140210811957600 [label=SliceBackward0] + 140210812094592 -> 140210811957600 + 140210812094592 [label=SliceBackward0] + 140210812094688 -> 140210812094592 + 140210812094688 [label=SliceBackward0] + 140210811958800 -> 140210812094688 + 140210811959472 -> 140210811960528 + 140210811959472 [label=TBackward0] + 140210812094304 -> 140210811959472 + 140210812094304 [label=ToCopyBackward0] + 140210812094784 -> 140210812094304 + 140202229025504 [label="encoder.layer.3.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202229025504 -> 140210812094784 + 140210812094784 [label=AccumulateGrad] + 140210811958752 -> 140210811958320 + 140210811958752 [label=TBackward0] + 140210811959712 -> 140210811958752 + 140210811959712 [label=ToCopyBackward0] + 140210811960240 -> 140210811959712 + 140202229025264 [label="encoder.layer.3.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202229025264 -> 140210811960240 + 140210811960240 [label=AccumulateGrad] + 140210811957600 -> 
140210811944864 + 140210811944720 -> 140210811944240 + 140202229025024 [label="encoder.layer.3.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229025024 -> 140210811944720 + 140210811944720 [label=AccumulateGrad] + 140210811944672 -> 140210811944240 + 140202229024704 [label="encoder.layer.3.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229024704 -> 140210811944672 + 140210811944672 [label=AccumulateGrad] + 140210811943520 -> 140210811944000 + 140210811943520 [label=TBackward0] + 140210811944192 -> 140210811943520 + 140210811944192 [label=ToCopyBackward0] + 140210811944384 -> 140210811944192 + 140202229023344 [label="encoder.layer.4.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202229023344 -> 140210811944384 + 140210811944384 [label=AccumulateGrad] + 140210811943424 -> 140210811943376 + 140210811943424 [label=ReshapeAliasBackward0] + 140210811943760 -> 140210811943424 + 140210811943760 [label=ExpandBackward0] + 140210811943952 -> 140210811943760 + 140210811943952 [label=TransposeBackward0] + 140210811944432 -> 140210811943952 + 140210811944432 [label=PermuteBackward0] + 140210811943568 -> 140210811944432 + 140210811943568 [label=ViewBackward0] + 140210811957360 -> 140210811943568 + 140210811957360 [label=ViewBackward0] + 140210811959280 -> 140210811957360 + 140210811959280 [label=AddmmBackward0] + 140210811960336 -> 140210811959280 + 140210811960336 [label=ToCopyBackward0] + 140210812094496 -> 140210811960336 + 140202229022784 [label="encoder.layer.4.attention.self.key.bias + (768)" fillcolor=lightblue] + 140202229022784 -> 140210812094496 + 140210812094496 [label=AccumulateGrad] + 140210811957552 -> 140210811959280 + 140210811957552 [label=ViewBackward0] + 140210812094832 -> 140210811957552 + 140210812094832 [label=ToCopyBackward0] + 140210811941456 -> 140210812094832 + 140210812094400 -> 140210811959280 + 140210812094400 [label=TBackward0] + 140210812094448 -> 140210812094400 + 140210812094448 [label=ToCopyBackward0] + 140210812094976 -> 140210812094448 + 140202229023104 [label="encoder.layer.4.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140202229023104 -> 140210812094976 + 140210812094976 [label=AccumulateGrad] + 140210811942512 -> 140210811942464 + 140210811942512 [label=ReshapeAliasBackward0] + 140210811942848 -> 140210811942512 + 140210811942848 [label=ExpandBackward0] + 140210811943040 -> 140210811942848 + 140210811943040 [label=PermuteBackward0] + 140210811943232 -> 140210811943040 + 140210811943232 [label=ViewBackward0] + 140210811942608 -> 140210811943232 + 140210811942608 [label=ViewBackward0] + 140210811943856 -> 140210811942608 + 140210811943856 [label=AddmmBackward0] + 140210811944144 -> 140210811943856 + 140210811944144 [label=ToCopyBackward0] + 140210811959184 -> 140210811944144 + 140202229014256 [label="encoder.layer.4.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202229014256 -> 140210811959184 + 140210811959184 [label=AccumulateGrad] + 140210811942656 -> 140210811943856 + 140210811942656 [label=ViewBackward0] + 140210812094736 -> 140210811942656 + 140210812094736 [label=ToCopyBackward0] + 140210811941456 -> 140210812094736 + 140210811958656 -> 140210811943856 + 140210811958656 [label=TBackward0] + 140210812094640 -> 140210811958656 + 140210812094640 [label=ToCopyBackward0] + 140210812094880 -> 140210812094640 + 140202229022864 [label="encoder.layer.4.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202229022864 -> 140210812094880 + 140210812094880 [label=AccumulateGrad] 
+ 140210811941552 -> 140210811941744 + 140210811941552 [label=TBackward0] + 140210811942224 -> 140210811941552 + 140210811942224 [label=ToCopyBackward0] + 140210811942416 -> 140210811942224 + 140202229014336 [label="encoder.layer.4.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202229014336 -> 140210811942416 + 140210811942416 [label=AccumulateGrad] + 140210811941456 -> 140210811941312 + 140210811941264 -> 140210811941216 + 140202229014096 [label="encoder.layer.4.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229014096 -> 140210811941264 + 140210811941264 [label=AccumulateGrad] + 140210811940976 -> 140210811941216 + 140202229013776 [label="encoder.layer.4.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229013776 -> 140210811940976 + 140210811940976 [label=AccumulateGrad] + 140210811927712 -> 140210811928192 + 140210811927712 [label=TBackward0] + 140210811928432 -> 140210811927712 + 140210811928432 [label=ToCopyBackward0] + 140210811941168 -> 140210811928432 + 140202229013856 [label="encoder.layer.4.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202229013856 -> 140210811941168 + 140210811941168 [label=AccumulateGrad] + 140210811927616 -> 140210811927568 + 140210811927616 [label=ReshapeAliasBackward0] + 140210811927952 -> 140210811927616 + 140210811927952 [label=ExpandBackward0] + 140210811928144 -> 140210811927952 + 140210811928144 [label=TransposeBackward0] + 140210811928528 -> 140210811928144 + 140210811928528 [label=PermuteBackward0] + 140210811927760 -> 140210811928528 + 140210811927760 [label=ViewBackward0] + 140210811940928 -> 140210811927760 + 140210811940928 [label=ViewBackward0] + 140210811941696 -> 140210811940928 + 140210811941696 [label=AddmmBackward0] + 140210811941936 -> 140210811941696 + 140210811941936 [label=ToCopyBackward0] + 140210811942128 -> 140210811941936 + 140202229013296 [label="encoder.layer.4.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140202229013296 -> 140210811942128 + 140210811942128 [label=AccumulateGrad] + 140210811941888 -> 140210811941696 + 140210811941888 [label=ViewBackward0] + 140210811942944 -> 140210811941888 + 140210811942944 [label=ToCopyBackward0] + 140210812052960 -> 140210811942944 + 140210811941072 -> 140210811941696 + 140210811941072 [label=TBackward0] + 140210811942752 -> 140210811941072 + 140210811942752 [label=ToCopyBackward0] + 140210811943664 -> 140210811942752 + 140202229013616 [label="encoder.layer.4.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140202229013616 -> 140210811943664 + 140210811943664 [label=AccumulateGrad] + 140210811926704 -> 140210811926656 + 140210811926704 [label=ReshapeAliasBackward0] + 140210811927040 -> 140210811926704 + 140210811927040 [label=ExpandBackward0] + 140210811927232 -> 140210811927040 + 140210811927232 [label=PermuteBackward0] + 140210811927424 -> 140210811927232 + 140210811927424 [label=ViewBackward0] + 140210811926800 -> 140210811927424 + 140210811926800 [label=ViewBackward0] + 140210811928048 -> 140210811926800 + 140210811928048 [label=AddmmBackward0] + 140210811958464 -> 140210811928048 + 140210811958464 [label=ToCopyBackward0] + 140210811942320 -> 140210811958464 + 140202229013056 [label="encoder.layer.4.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140202229013056 -> 140210811942320 + 140210811942320 [label=AccumulateGrad] + 140210811928336 -> 140210811928048 + 140210811928336 [label=ViewBackward0] + 140210811943328 -> 140210811928336 + 
140210811943328 [label=ToCopyBackward0] + 140210812052960 -> 140210811943328 + 140210811926848 -> 140210811928048 + 140210811926848 [label=TBackward0] + 140210811941360 -> 140210811926848 + 140210811941360 [label=ToCopyBackward0] + 140210811941600 -> 140210811941360 + 140202229013376 [label="encoder.layer.4.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140202229013376 -> 140210811941600 + 140210811941600 [label=AccumulateGrad] + 140210811925744 -> 140210811925936 + 140210811925744 [label=TBackward0] + 140210811926416 -> 140210811925744 + 140210811926416 [label=ToCopyBackward0] + 140210811926608 -> 140210811926416 + 140202229013136 [label="encoder.layer.4.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202229013136 -> 140210811926608 + 140210811926608 [label=AccumulateGrad] + 140210811925648 -> 140210811925504 + 140210811925456 -> 140210811925408 + 140202229012896 [label="encoder.layer.4.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229012896 -> 140210811925456 + 140210811925456 [label=AccumulateGrad] + 140210811925024 -> 140210811925408 + 140202229012576 [label="encoder.layer.4.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229012576 -> 140210811925024 + 140210811925024 [label=AccumulateGrad] + 140210811924592 -> 140210811924832 + 140210811924592 [label=TBackward0] + 140210811925072 -> 140210811924592 + 140210811925072 [label=ToCopyBackward0] + 140210811925552 -> 140210811925072 + 140202229011216 [label="encoder.layer.4.experts.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202229011216 -> 140210811925552 + 140210811925552 [label=AccumulateGrad] + 140202224193104 -> 140202224193296 + 140202224193104 [label=TBackward0] + 140210811924784 -> 140202224193104 + 140210811924784 [label=ToCopyBackward0] + 140210811925264 -> 140210811924784 + 140202229010976 [label="encoder.layer.4.experts.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202229010976 -> 140210811925264 + 140210811925264 [label=AccumulateGrad] + 140202224193008 -> 140202224192864 + 140202224192816 -> 140202224192720 + 140202229010736 [label="encoder.layer.4.expert_ln.weight + (768)" fillcolor=lightblue] + 140202229010736 -> 140202224192816 + 140202224192816 [label=AccumulateGrad] + 140202224192768 -> 140202224192720 + 140202229010496 [label="encoder.layer.4.expert_ln.bias + (768)" fillcolor=lightblue] + 140202229010496 -> 140202224192768 + 140202224192768 [label=AccumulateGrad] + 140202224192432 -> 140202224191472 + 140202224192432 [label=NativeLayerNormBackward0] + 140202224193152 -> 140202224192432 + 140202224193152 [label=AddBackward0] + 140202224193440 -> 140202224193152 + 140202224193440 [label=NativeDropoutBackward0] + 140210811924688 -> 140202224193440 + 140210811924688 [label=ViewBackward0] + 140210811925216 -> 140210811924688 + 140210811925216 [label=AddmmBackward0] + 140210811926080 -> 140210811925216 + 140210811926080 [label=ToCopyBackward0] + 140210811926176 -> 140210811926080 + 140202229012096 [label="encoder.layer.4.output.dense.bias + (768)" fillcolor=lightblue] + 140202229012096 -> 140210811926176 + 140210811926176 [label=AccumulateGrad] + 140210811925888 -> 140210811925216 + 140210811925888 [label=ViewBackward0] + 140210811926320 -> 140210811925888 + 140210811926320 [label=GeluBackward0] + 140210811927328 -> 140210811926320 + 140210811927328 [label=ViewBackward0] + 140210811927856 -> 140210811927328 + 140210811927856 [label=AddmmBackward0] + 140210811926944 -> 140210811927856 + 140210811926944 
[label=ToCopyBackward0] + 140210812095024 -> 140210811926944 + 140202229012336 [label="encoder.layer.4.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202229012336 -> 140210812095024 + 140210812095024 [label=AccumulateGrad] + 140210811943136 -> 140210811927856 + 140210811943136 [label=ViewBackward0] + 140210812095120 -> 140210811943136 + 140210812095120 [label=ToCopyBackward0] + 140210811924976 -> 140210812095120 + 140210811924976 [label=SliceBackward0] + 140210812095264 -> 140210811924976 + 140210812095264 [label=SliceBackward0] + 140210812095360 -> 140210812095264 + 140210812095360 [label=SliceBackward0] + 140210811941216 -> 140210812095360 + 140210811941408 -> 140210811927856 + 140210811941408 [label=TBackward0] + 140210812094928 -> 140210811941408 + 140210812094928 [label=ToCopyBackward0] + 140210812095456 -> 140210812094928 + 140202229012656 [label="encoder.layer.4.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202229012656 -> 140210812095456 + 140210812095456 [label=AccumulateGrad] + 140210811925792 -> 140210811925216 + 140210811925792 [label=TBackward0] + 140210811927520 -> 140210811925792 + 140210811927520 [label=ToCopyBackward0] + 140210811941984 -> 140210811927520 + 140202229012416 [label="encoder.layer.4.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202229012416 -> 140210811941984 + 140210811941984 [label=AccumulateGrad] + 140210811924976 -> 140202224193152 + 140202224192960 -> 140202224192432 + 140202229012176 [label="encoder.layer.4.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202229012176 -> 140202224192960 + 140202224192960 [label=AccumulateGrad] + 140202224192912 -> 140202224192432 + 140202229011856 [label="encoder.layer.4.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202229011856 -> 140202224192912 + 140202224192912 [label=AccumulateGrad] + 140202224191712 -> 140202224192192 + 140202224191712 [label=TBackward0] + 140202224192528 -> 140202224191712 + 140202224192528 [label=ToCopyBackward0] + 140202224193248 -> 140202224192528 + 140202228989840 [label="encoder.layer.5.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202228989840 -> 140202224193248 + 140202224193248 [label=AccumulateGrad] + 140202224189600 -> 140202224189552 + 140202224189600 [label=ReshapeAliasBackward0] + 140202224191952 -> 140202224189600 + 140202224191952 [label=ExpandBackward0] + 140202224192144 -> 140202224191952 + 140202224192144 [label=TransposeBackward0] + 140202224192672 -> 140202224192144 + 140202224192672 [label=PermuteBackward0] + 140202224192624 -> 140202224192672 + 140202224192624 [label=ViewBackward0] + 140210811924544 -> 140202224192624 + 140210811924544 [label=ViewBackward0] + 140210811926512 -> 140210811924544 + 140210811926512 [label=AddmmBackward0] + 140210811927136 -> 140210811926512 + 140210811927136 [label=ToCopyBackward0] + 140210812095072 -> 140210811927136 + 140202228989360 [label="encoder.layer.5.attention.self.key.bias + (768)" fillcolor=lightblue] + 140202228989360 -> 140210812095072 + 140210812095072 [label=AccumulateGrad] + 140210811925600 -> 140210811926512 + 140210811925600 [label=ViewBackward0] + 140210812095504 -> 140210811925600 + 140210812095504 [label=ToCopyBackward0] + 140202224191472 -> 140210812095504 + 140210812095168 -> 140210811926512 + 140210812095168 [label=TBackward0] + 140210812095216 -> 140210812095168 + 140210812095216 [label=ToCopyBackward0] + 140210812095648 -> 140210812095216 + 140202228989680 [label="encoder.layer.5.attention.self.key.weight + (768, 768)" 
fillcolor=lightblue] + 140202228989680 -> 140210812095648 + 140210812095648 [label=AccumulateGrad] + 140202224190416 -> 140202224190560 + 140202224190416 [label=ReshapeAliasBackward0] + 140202224190176 -> 140202224190416 + 140202224190176 [label=ExpandBackward0] + 140202224189984 -> 140202224190176 + 140202224189984 [label=PermuteBackward0] + 140202224189792 -> 140202224189984 + 140202224189792 [label=ViewBackward0] + 140202224190320 -> 140202224189792 + 140202224190320 [label=ViewBackward0] + 140202224192048 -> 140202224190320 + 140202224192048 [label=AddmmBackward0] + 140202224191760 -> 140202224192048 + 140202224191760 [label=ToCopyBackward0] + 140210811926128 -> 140202224191760 + 140202228989120 [label="encoder.layer.5.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202228989120 -> 140210811926128 + 140210811926128 [label=AccumulateGrad] + 140202224192336 -> 140202224192048 + 140202224192336 [label=ViewBackward0] + 140210812095408 -> 140202224192336 + 140210812095408 [label=ToCopyBackward0] + 140202224191472 -> 140210812095408 + 140202224190368 -> 140202224192048 + 140202224190368 [label=TBackward0] + 140210812095312 -> 140202224190368 + 140210812095312 [label=ToCopyBackward0] + 140210812095552 -> 140210812095312 + 140202228989440 [label="encoder.layer.5.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202228989440 -> 140210812095552 + 140210812095552 [label=AccumulateGrad] + 140202224191376 -> 140202224191184 + 140202224191376 [label=TBackward0] + 140202224190704 -> 140202224191376 + 140202224190704 [label=ToCopyBackward0] + 140202224190512 -> 140202224190704 + 140202228989200 [label="encoder.layer.5.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202228989200 -> 140202224190512 + 140202224190512 [label=AccumulateGrad] + 140202224191472 -> 140202222987584 + 140202222987152 -> 140202222988352 + 140202228988960 [label="encoder.layer.5.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228988960 -> 140202222987152 + 140202222987152 [label=AccumulateGrad] + 140202222987104 -> 140202222988352 + 140202228988640 [label="encoder.layer.5.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228988640 -> 140202222987104 + 140202222987104 [label=AccumulateGrad] + 140202222986192 -> 140202222986672 + 140202222986192 [label=TBackward0] + 140202222987200 -> 140202222986192 + 140202222987200 [label=ToCopyBackward0] + 140202222988544 -> 140202222987200 + 140202228987280 [label="encoder.layer.5.experts.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228987280 -> 140202222988544 + 140202222988544 [label=AccumulateGrad] + 140202222985760 -> 140202222986048 + 140202222985760 [label=TBackward0] + 140202222986720 -> 140202222985760 + 140202222986720 [label=ToCopyBackward0] + 140202222987488 -> 140202222986720 + 140202228987040 [label="encoder.layer.5.experts.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228987040 -> 140202222987488 + 140202222987488 [label=AccumulateGrad] + 140202222985568 -> 140202222985280 + 140202222988592 -> 140202222988736 + 140202228986800 [label="encoder.layer.5.expert_ln.weight + (768)" fillcolor=lightblue] + 140202228986800 -> 140202222988592 + 140202222988592 [label=AccumulateGrad] + 140202222987392 -> 140202222988736 + 140202228986480 [label="encoder.layer.5.expert_ln.bias + (768)" fillcolor=lightblue] + 140202228986480 -> 140202222987392 + 140202222987392 [label=AccumulateGrad] + 140202222988160 -> 140202222935248 + 140202222988160 [label=NativeLayerNormBackward0] + 
140202222985664 -> 140202222988160 + 140202222985664 [label=AddBackward0] + 140202222987008 -> 140202222985664 + 140202222987008 [label=NativeDropoutBackward0] + 140202222986528 -> 140202222987008 + 140202222986528 [label=ViewBackward0] + 140202222988448 -> 140202222986528 + 140202222988448 [label=AddmmBackward0] + 140202222988256 -> 140202222988448 + 140202222988256 [label=ToCopyBackward0] + 140202224190992 -> 140202222988256 + 140202228988160 [label="encoder.layer.5.output.dense.bias + (768)" fillcolor=lightblue] + 140202228988160 -> 140202224190992 + 140202224190992 [label=AccumulateGrad] + 140202224191616 -> 140202222988448 + 140202224191616 [label=ViewBackward0] + 140202224190608 -> 140202224191616 + 140202224190608 [label=GeluBackward0] + 140202224191040 -> 140202224190608 + 140202224191040 [label=ViewBackward0] + 140202224190080 -> 140202224191040 + 140202224190080 [label=AddmmBackward0] + 140202224189696 -> 140202224190080 + 140202224189696 [label=ToCopyBackward0] + 140210811925360 -> 140202224189696 + 140202228988400 [label="encoder.layer.5.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202228988400 -> 140210811925360 + 140210811925360 [label=AccumulateGrad] + 140202224189888 -> 140202224190080 + 140202224189888 [label=ViewBackward0] + 140210812095840 -> 140202224189888 + 140210812095840 [label=ToCopyBackward0] + 140202222986336 -> 140210812095840 + 140202222986336 [label=SliceBackward0] + 140210812095888 -> 140202222986336 + 140210812095888 [label=SliceBackward0] + 140210812095984 -> 140210812095888 + 140210812095984 [label=SliceBackward0] + 140202222988352 -> 140210812095984 + 140202224191136 -> 140202224190080 + 140202224191136 [label=TBackward0] + 140210812095600 -> 140202224191136 + 140210812095600 [label=ToCopyBackward0] + 140210812096080 -> 140210812095600 + 140202228988720 [label="encoder.layer.5.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202228988720 -> 140210812096080 + 140210812096080 [label=AccumulateGrad] + 140202224191568 -> 140202222988448 + 140202224191568 [label=TBackward0] + 140202224190800 -> 140202224191568 + 140202224190800 [label=ToCopyBackward0] + 140202224191856 -> 140202224190800 + 140202228988480 [label="encoder.layer.5.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202228988480 -> 140202224191856 + 140202224191856 [label=AccumulateGrad] + 140202222986336 -> 140202222985664 + 140202222985376 -> 140202222988160 + 140202228988240 [label="encoder.layer.5.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228988240 -> 140202222985376 + 140202222985376 [label=AccumulateGrad] + 140202222985328 -> 140202222988160 + 140202228987920 [label="encoder.layer.5.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228987920 -> 140202222985328 + 140202222985328 [label=AccumulateGrad] + 140202222963584 -> 140202222964352 + 140202222963584 [label=TBackward0] + 140202222988928 -> 140202222963584 + 140202222988928 [label=ToCopyBackward0] + 140202222986144 -> 140202222988928 + 140202228986560 [label="encoder.layer.6.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202228986560 -> 140202222986144 + 140202222986144 [label=AccumulateGrad] + 140202222963392 -> 140202222963200 + 140202222963392 [label=ReshapeAliasBackward0] + 140202222963728 -> 140202222963392 + 140202222963728 [label=ExpandBackward0] + 140202222964160 -> 140202222963728 + 140202222964160 [label=TransposeBackward0] + 140202222964448 -> 140202222964160 + 140202222964448 [label=PermuteBackward0] + 140202222989120 -> 
140202222964448 + 140202222989120 [label=ViewBackward0] + 140202222988112 -> 140202222989120 + 140202222988112 [label=ViewBackward0] + 140202222988640 -> 140202222988112 + 140202222988640 [label=AddmmBackward0] + 140202224190272 -> 140202222988640 + 140202224190272 [label=ToCopyBackward0] + 140210812095792 -> 140202224190272 + 140202228986000 [label="encoder.layer.6.attention.self.key.bias + (768)" fillcolor=lightblue] + 140202228986000 -> 140210812095792 + 140210812095792 [label=AccumulateGrad] + 140202224191424 -> 140202222988640 + 140202224191424 [label=ViewBackward0] + 140210812096128 -> 140202224191424 + 140210812096128 [label=ToCopyBackward0] + 140202222935248 -> 140210812096128 + 140210812095696 -> 140202222988640 + 140210812095696 [label=TBackward0] + 140210812095744 -> 140210812095696 + 140210812095744 [label=ToCopyBackward0] + 140210812096272 -> 140210812095744 + 140202228986320 [label="encoder.layer.6.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140202228986320 -> 140210812096272 + 140210812096272 [label=AccumulateGrad] + 140202222961760 -> 140202222961856 + 140202222961760 [label=ReshapeAliasBackward0] + 140202222962432 -> 140202222961760 + 140202222962432 [label=ExpandBackward0] + 140202222962816 -> 140202222962432 + 140202222962816 [label=PermuteBackward0] + 140202222963104 -> 140202222962816 + 140202222963104 [label=ViewBackward0] + 140202222961808 -> 140202222963104 + 140202222961808 [label=ViewBackward0] + 140202222963968 -> 140202222961808 + 140202222963968 [label=AddmmBackward0] + 140202222963488 -> 140202222963968 + 140202222963488 [label=ToCopyBackward0] + 140202224191328 -> 140202222963488 + 140202228985664 [label="encoder.layer.6.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202228985664 -> 140202224191328 + 140202224191328 [label=AccumulateGrad] + 140202222962144 -> 140202222963968 + 140202222962144 [label=ViewBackward0] + 140210812096032 -> 140202222962144 + 140210812096032 [label=ToCopyBackward0] + 140202222935248 -> 140210812096032 + 140202222985712 -> 140202222963968 + 140202222985712 [label=TBackward0] + 140210812095936 -> 140202222985712 + 140210812095936 [label=ToCopyBackward0] + 140210812096176 -> 140210812095936 + 140202228986080 [label="encoder.layer.6.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202228986080 -> 140210812096176 + 140210812096176 [label=AccumulateGrad] + 140202222960704 -> 140202222935728 + 140202222960704 [label=TBackward0] + 140202222961280 -> 140202222960704 + 140202222961280 [label=ToCopyBackward0] + 140202222961568 -> 140202222961280 + 140202228985744 [label="encoder.layer.6.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202228985744 -> 140202222961568 + 140202222961568 [label=AccumulateGrad] + 140202222935248 -> 140202222935296 + 140202222935008 -> 140202222935104 + 140202228985504 [label="encoder.layer.6.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228985504 -> 140202222935008 + 140202222935008 [label=AccumulateGrad] + 140202222934336 -> 140202222935104 + 140202228985184 [label="encoder.layer.6.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228985184 -> 140202222934336 + 140202222934336 [label=AccumulateGrad] + 140202222933184 -> 140202222933952 + 140202222933184 [label=TBackward0] + 140202222934240 -> 140202222933184 + 140202222934240 [label=ToCopyBackward0] + 140202222934768 -> 140202222934240 + 140202228985264 [label="encoder.layer.6.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 
140202228985264 -> 140202222934768 + 140202222934768 [label=AccumulateGrad] + 140202222932992 -> 140202222932800 + 140202222932992 [label=ReshapeAliasBackward0] + 140202222933328 -> 140202222932992 + 140202222933328 [label=ExpandBackward0] + 140202222933760 -> 140202222933328 + 140202222933760 [label=TransposeBackward0] + 140202222934528 -> 140202222933760 + 140202222934528 [label=PermuteBackward0] + 140202222935392 -> 140202222934528 + 140202222935392 [label=ViewBackward0] + 140202222934624 -> 140202222935392 + 140202222934624 [label=ViewBackward0] + 140202222935872 -> 140202222934624 + 140202222935872 [label=AddmmBackward0] + 140202222933088 -> 140202222935872 + 140202222933088 [label=ToCopyBackward0] + 140202222961088 -> 140202222933088 + 140202228984704 [label="encoder.layer.6.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140202228984704 -> 140202222961088 + 140202222961088 [label=AccumulateGrad] + 140202222960800 -> 140202222935872 + 140202222960800 [label=ViewBackward0] + 140202222962624 -> 140202222960800 + 140202222962624 [label=ToCopyBackward0] + 140210812052960 -> 140202222962624 + 140202222960896 -> 140202222935872 + 140202222960896 [label=TBackward0] + 140202222962336 -> 140202222960896 + 140202222962336 [label=ToCopyBackward0] + 140202222963680 -> 140202222962336 + 140202228985024 [label="encoder.layer.6.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140202228985024 -> 140202222963680 + 140202222963680 [label=AccumulateGrad] + 140202222906720 -> 140202222906816 + 140202222906720 [label=ReshapeAliasBackward0] + 140202222906768 -> 140202222906720 + 140202222906768 [label=ExpandBackward0] + 140202222907104 -> 140202222906768 + 140202222907104 [label=PermuteBackward0] + 140202222932704 -> 140202222907104 + 140202222932704 [label=ViewBackward0] + 140202222932032 -> 140202222932704 + 140202222932032 [label=ViewBackward0] + 140202222933568 -> 140202222932032 + 140202222933568 [label=AddmmBackward0] + 140202222934720 -> 140202222933568 + 140202222934720 [label=ToCopyBackward0] + 140202222987776 -> 140202222934720 + 140202228984464 [label="encoder.layer.6.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140202228984464 -> 140202222987776 + 140202222987776 [label=AccumulateGrad] + 140202222934048 -> 140202222933568 + 140202222934048 [label=ViewBackward0] + 140202222935776 -> 140202222934048 + 140202222935776 [label=ToCopyBackward0] + 140210812052960 -> 140202222935776 + 140202222932224 -> 140202222933568 + 140202222932224 [label=TBackward0] + 140202222961328 -> 140202222932224 + 140202222961328 [label=ToCopyBackward0] + 140202222960992 -> 140202222961328 + 140202228984784 [label="encoder.layer.6.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140202228984784 -> 140202222960992 + 140202222960992 [label=AccumulateGrad] + 140202222905088 -> 140202222905328 + 140202222905088 [label=TBackward0] + 140202222906240 -> 140202222905088 + 140202222906240 [label=ToCopyBackward0] + 140202222906528 -> 140202222906240 + 140202228984544 [label="encoder.layer.6.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202228984544 -> 140202222906528 + 140202222906528 [label=AccumulateGrad] + 140202222904848 -> 140202222904896 + 140202222904704 -> 140202222904512 + 140202228984304 [label="encoder.layer.6.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228984304 -> 140202222904704 + 140202222904704 [label=AccumulateGrad] + 140202222903936 -> 140202222904512 + 140202228983984 
[label="encoder.layer.6.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228983984 -> 140202222903936 + 140202222903936 [label=AccumulateGrad] + 140202222903408 -> 140202222903552 + 140202222903408 [label=TBackward0] + 140202222904128 -> 140202222903408 + 140202222904128 [label=ToCopyBackward0] + 140202222904800 -> 140202222904128 + 140202228968480 [label="encoder.layer.6.experts.experts.0.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228968480 -> 140202222904800 + 140202222904800 [label=AccumulateGrad] + 140202222873664 -> 140202222874144 + 140202222873664 [label=TBackward0] + 140202222874336 -> 140202222873664 + 140202222874336 [label=ToCopyBackward0] + 140202222904368 -> 140202222874336 + 140202228968560 [label="encoder.layer.6.experts.experts.0.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228968560 -> 140202222904368 + 140202222904368 [label=AccumulateGrad] + 140202222873280 -> 140202222873232 + 140202222873280 [label=UnsqueezeBackward0] + 140202222873856 -> 140202222873280 + 140202222873856 [label=NativeDropoutBackward0] + 140202222874240 -> 140202222873856 + 140202222874240 [label=ViewBackward0] + 140202222905376 -> 140202222874240 + 140202222905376 [label=AddmmBackward0] + 140202222903360 -> 140202222905376 + 140202222903360 [label=ToCopyBackward0] + 140202222905856 -> 140202222903360 + 140202228968240 [label="encoder.layer.6.experts.experts.1.dense2.bias + (768)" fillcolor=lightblue] + 140202228968240 -> 140202222905856 + 140202222905856 [label=AccumulateGrad] + 140202222904608 -> 140202222905376 + 140202222904608 [label=ViewBackward0] + 140202222905760 -> 140202222904608 + 140202222905760 [label=GeluBackward0] + 140202222907296 -> 140202222905760 + 140202222907296 [label=ViewBackward0] + 140202222906048 -> 140202222907296 + 140202222906048 [label=AddmmBackward0] + 140202222905472 -> 140202222906048 + 140202222905472 [label=ToCopyBackward0] + 140202222935200 -> 140202222905472 + 140202228969040 [label="encoder.layer.6.experts.experts.1.dense1.bias + (3072)" fillcolor=lightblue] + 140202228969040 -> 140202222935200 + 140202222935200 [label=AccumulateGrad] + 140202222932512 -> 140202222906048 + 140202222932512 [label=ViewBackward0] + 140202222962912 -> 140202222932512 + 140202222962912 [label=ToCopyBackward0] + 140202222872416 -> 140202222962912 + 140202222932416 -> 140202222906048 + 140202222932416 [label=TBackward0] + 140202222933280 -> 140202222932416 + 140202222933280 [label=ToCopyBackward0] + 140210812096320 -> 140202222933280 + 140202228968320 [label="encoder.layer.6.experts.experts.1.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228968320 -> 140210812096320 + 140210812096320 [label=AccumulateGrad] + 140202222903648 -> 140202222905376 + 140202222903648 [label=TBackward0] + 140202222905952 -> 140202222903648 + 140202222905952 [label=ToCopyBackward0] + 140202222963296 -> 140202222905952 + 140202228968080 [label="encoder.layer.6.experts.experts.1.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228968080 -> 140202222963296 + 140202222963296 [label=AccumulateGrad] + 140202222873184 -> 140202222873232 + 140202222873184 [label=UnsqueezeBackward0] + 140202222932896 -> 140202222873184 + 140202222932896 [label=NativeDropoutBackward0] + 140202222873760 -> 140202222932896 + 140202222873760 [label=ViewBackward0] + 140202222906288 -> 140202222873760 + 140202222906288 [label=AddmmBackward0] + 140202222903888 -> 140202222906288 + 140202222903888 [label=ToCopyBackward0] + 140210812096224 -> 140202222903888 + 
140202228967760 [label="encoder.layer.6.experts.experts.2.dense2.bias + (768)" fillcolor=lightblue] + 140202228967760 -> 140210812096224 + 140210812096224 [label=AccumulateGrad] + 140210812096464 -> 140202222906288 + 140210812096464 [label=ViewBackward0] + 140210811723936 -> 140210812096464 + 140210811723936 [label=GeluBackward0] + 140210811724032 -> 140210811723936 + 140210811724032 [label=ViewBackward0] + 140210811724128 -> 140210811724032 + 140210811724128 [label=AddmmBackward0] + 140210811724224 -> 140210811724128 + 140210811724224 [label=ToCopyBackward0] + 140210811724416 -> 140210811724224 + 140202228968000 [label="encoder.layer.6.experts.experts.2.dense1.bias + (3072)" fillcolor=lightblue] + 140202228968000 -> 140210811724416 + 140210811724416 [label=AccumulateGrad] + 140210811724176 -> 140210811724128 + 140210811724176 [label=ViewBackward0] + 140210811724464 -> 140210811724176 + 140210811724464 [label=ToCopyBackward0] + 140202222872416 -> 140210811724464 + 140210811723888 -> 140210811724128 + 140210811723888 [label=TBackward0] + 140210811724320 -> 140210811723888 + 140210811724320 [label=ToCopyBackward0] + 140210811724608 -> 140210811724320 + 140202228967840 [label="encoder.layer.6.experts.experts.2.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228967840 -> 140210811724608 + 140210811724608 [label=AccumulateGrad] + 140210812096368 -> 140202222906288 + 140210812096368 [label=TBackward0] + 140210811724080 -> 140210812096368 + 140210811724080 [label=ToCopyBackward0] + 140210811724560 -> 140210811724080 + 140202228967600 [label="encoder.layer.6.experts.experts.2.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228967600 -> 140210811724560 + 140210811724560 [label=AccumulateGrad] + 140202222872992 -> 140202222872800 + 140202222872992 [label=UnsqueezeBackward0] + 140202222873568 -> 140202222872992 + 140202222873568 [label=UnsqueezeBackward0] + 140202222904224 -> 140202222873568 + 140202222904224 [label=SumBackward1] + 140210812096416 -> 140202222904224 + 140210812096416 [label=MulBackward0] + 140210811724704 -> 140210812096416 + 140210811724704 [label=UnsqueezeBackward0] + 140210811723984 -> 140210811724704 + 140210811723984 [label=TopkBackward0] + 140210811724512 -> 140210811723984 + 140210811724512 [label=SoftmaxBackward0] + 140210811724800 -> 140210811724512 + 140210811724800 [label=MmBackward0] + 140210811724896 -> 140210811724800 + 140210811724896 [label=ToCopyBackward0] + 140210811725040 -> 140210811724896 + 140210811725040 [label=MeanBackward1] + 140210811725136 -> 140210811725040 + 140210811725136 [label=MulBackward0] + 140202222872416 -> 140210811725136 + 140210811724848 -> 140210811724800 + 140210811724848 [label=TBackward0] + 140210811725232 -> 140210811724848 + 140210811725232 [label=ToCopyBackward0] + 140210811724944 -> 140210811725232 + 140202228981824 [label="encoder.layer.6.experts.gate.weight + (3, 768)" fillcolor=lightblue] + 140202228981824 -> 140210811724944 + 140210811724944 [label=AccumulateGrad] + 140202222872416 -> 140202222872272 + 140202222872128 -> 140202222871936 + 140202228982144 [label="encoder.layer.6.expert_ln.weight + (768)" fillcolor=lightblue] + 140202228982144 -> 140202222872128 + 140202222872128 [label=AccumulateGrad] + 140202222872224 -> 140202222871936 + 140202228981904 [label="encoder.layer.6.expert_ln.bias + (768)" fillcolor=lightblue] + 140202228981904 -> 140202222872224 + 140202222872224 [label=AccumulateGrad] + 140202222871744 -> 140202222842352 + 140202222871744 [label=NativeLayerNormBackward0] + 140202222904992 -> 
140202222871744 + 140202222904992 [label=AddBackward0] + 140202222873088 -> 140202222904992 + 140202222873088 [label=NativeDropoutBackward0] + 140210811724656 -> 140202222873088 + 140210811724656 [label=ViewBackward0] + 140210811723840 -> 140210811724656 + 140210811723840 [label=AddmmBackward0] + 140210811725184 -> 140210811723840 + 140210811725184 [label=ToCopyBackward0] + 140210811725376 -> 140210811725184 + 140202228983504 [label="encoder.layer.6.output.dense.bias + (768)" fillcolor=lightblue] + 140202228983504 -> 140210811725376 + 140210811725376 [label=AccumulateGrad] + 140210811725088 -> 140210811723840 + 140210811725088 [label=ViewBackward0] + 140210811725424 -> 140210811725088 + 140210811725424 [label=GeluBackward0] + 140210811725520 -> 140210811725424 + 140210811725520 [label=ViewBackward0] + 140210811725616 -> 140210811725520 + 140210811725616 [label=AddmmBackward0] + 140210811725712 -> 140210811725616 + 140210811725712 [label=ToCopyBackward0] + 140210811725904 -> 140210811725712 + 140202228983744 [label="encoder.layer.6.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202228983744 -> 140210811725904 + 140210811725904 [label=AccumulateGrad] + 140210811725664 -> 140210811725616 + 140210811725664 [label=ViewBackward0] + 140210811725952 -> 140210811725664 + 140210811725952 [label=ToCopyBackward0] + 140202222873376 -> 140210811725952 + 140202222873376 [label=SliceBackward0] + 140210811726096 -> 140202222873376 + 140210811726096 [label=SliceBackward0] + 140210811726192 -> 140210811726096 + 140210811726192 [label=SliceBackward0] + 140202222935104 -> 140210811726192 + 140210811724992 -> 140210811725616 + 140210811724992 [label=TBackward0] + 140210811725856 -> 140210811724992 + 140210811725856 [label=ToCopyBackward0] + 140210811726288 -> 140210811725856 + 140202228984064 [label="encoder.layer.6.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202228984064 -> 140210811726288 + 140210811726288 [label=AccumulateGrad] + 140210811724272 -> 140210811723840 + 140210811724272 [label=TBackward0] + 140210811725568 -> 140210811724272 + 140210811725568 [label=ToCopyBackward0] + 140210811726048 -> 140210811725568 + 140202228983824 [label="encoder.layer.6.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202228983824 -> 140210811726048 + 140210811726048 [label=AccumulateGrad] + 140202222873376 -> 140202222904992 + 140202222872512 -> 140202222871744 + 140202228983584 [label="encoder.layer.6.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228983584 -> 140202222872512 + 140202222872512 [label=AccumulateGrad] + 140202222872320 -> 140202222871744 + 140202228983264 [label="encoder.layer.6.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228983264 -> 140202222872320 + 140202222872320 [label=AccumulateGrad] + 140202222870592 -> 140202222871168 + 140202222870592 [label=TBackward0] + 140202222871456 -> 140202222870592 + 140202222871456 [label=ToCopyBackward0] + 140202222872608 -> 140202222871456 + 140202228982384 [label="encoder.layer.7.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202228982384 -> 140202222872608 + 140202222872608 [label=AccumulateGrad] + 140202222845568 -> 140202222845664 + 140202222845568 [label=ReshapeAliasBackward0] + 140202222845760 -> 140202222845568 + 140202222845760 [label=ExpandBackward0] + 140202222871264 -> 140202222845760 + 140202222871264 [label=TransposeBackward0] + 140202222872032 -> 140202222871264 + 140202222872032 [label=PermuteBackward0] + 140202222871840 -> 140202222872032 + 
140202222871840 [label=ViewBackward0] + 140202222870784 -> 140202222871840 + 140202222870784 [label=ViewBackward0] + 140210811725280 -> 140202222870784 + 140210811725280 [label=AddmmBackward0] + 140210811725808 -> 140210811725280 + 140210811725808 [label=ToCopyBackward0] + 140210811726000 -> 140210811725808 + 140203184706720 [label="encoder.layer.7.attention.self.key.bias + (768)" fillcolor=lightblue] + 140203184706720 -> 140210811726000 + 140210811726000 [label=AccumulateGrad] + 140210811725760 -> 140210811725280 + 140210811725760 [label=ViewBackward0] + 140210811726336 -> 140210811725760 + 140210811726336 [label=ToCopyBackward0] + 140202222842352 -> 140210811726336 + 140210811724752 -> 140210811725280 + 140210811724752 [label=TBackward0] + 140210811725472 -> 140210811724752 + 140210811725472 [label=ToCopyBackward0] + 140210811726480 -> 140210811725472 + 140202228982624 [label="encoder.layer.7.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140202228982624 -> 140210811726480 + 140210811726480 [label=AccumulateGrad] + 140202222844224 -> 140202222843936 + 140202222844224 [label=ReshapeAliasBackward0] + 140202222844608 -> 140202222844224 + 140202222844608 [label=ExpandBackward0] + 140202222844896 -> 140202222844608 + 140202222844896 [label=PermuteBackward0] + 140202222845280 -> 140202222844896 + 140202222845280 [label=ViewBackward0] + 140202222844272 -> 140202222845280 + 140202222844272 [label=ViewBackward0] + 140202222844320 -> 140202222844272 + 140202222844320 [label=AddmmBackward0] + 140202222872752 -> 140202222844320 + 140202222872752 [label=ToCopyBackward0] + 140210811726432 -> 140202222872752 + 140202228969200 [label="encoder.layer.7.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202228969200 -> 140210811726432 + 140210811726432 [label=AccumulateGrad] + 140202222871552 -> 140202222844320 + 140202222871552 [label=ViewBackward0] + 140210811726240 -> 140202222871552 + 140210811726240 [label=ToCopyBackward0] + 140202222842352 -> 140210811726240 + 140202222870832 -> 140202222844320 + 140202222870832 [label=TBackward0] + 140210811725328 -> 140202222870832 + 140210811725328 [label=ToCopyBackward0] + 140210811726384 -> 140210811725328 + 140202228969280 [label="encoder.layer.7.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202228969280 -> 140210811726384 + 140210811726384 [label=AccumulateGrad] + 140202222842592 -> 140202222842832 + 140202222842592 [label=TBackward0] + 140202222843744 -> 140202222842592 + 140202222843744 [label=ToCopyBackward0] + 140202222844032 -> 140202222843744 + 140202228968960 [label="encoder.layer.7.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202228968960 -> 140202222844032 + 140202222844032 [label=AccumulateGrad] + 140202222842352 -> 140202222841968 + 140202222842112 -> 140202222820224 + 140202228967520 [label="encoder.layer.7.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228967520 -> 140202222842112 + 140202222842112 [label=AccumulateGrad] + 140202222842016 -> 140202222820224 + 140202228967280 [label="encoder.layer.7.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228967280 -> 140202222842016 + 140202222842016 [label=AccumulateGrad] + 140202222819456 -> 140202222819936 + 140202222819456 [label=TBackward0] + 140202222820368 -> 140202222819456 + 140202222820368 [label=ToCopyBackward0] + 140202222821088 -> 140202222820368 + 140202228965600 [label="encoder.layer.7.experts.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228965600 -> 
140202222821088 + 140202222821088 [label=AccumulateGrad] + 140202222818880 -> 140202222819168 + 140202222818880 [label=TBackward0] + 140202222819888 -> 140202222818880 + 140202222819888 [label=ToCopyBackward0] + 140202222820800 -> 140202222819888 + 140202228965440 [label="encoder.layer.7.experts.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228965440 -> 140202222820800 + 140202222820800 [label=AccumulateGrad] + 140202222818688 -> 140202222818304 + 140202222818400 -> 140202222818208 + 140202228952736 [label="encoder.layer.7.expert_ln.weight + (768)" fillcolor=lightblue] + 140202228952736 -> 140202222818400 + 140202222818400 [label=AccumulateGrad] + 140202222818112 -> 140202222818208 + 140202228952816 [label="encoder.layer.7.expert_ln.bias + (768)" fillcolor=lightblue] + 140202228952816 -> 140202222818112 + 140202222818112 [label=AccumulateGrad] + 140202222817632 -> 140202223288032 + 140202222817632 [label=NativeLayerNormBackward0] + 140202222818784 -> 140202222817632 + 140202222818784 [label=AddBackward0] + 140202222820320 -> 140202222818784 + 140202222820320 [label=NativeDropoutBackward0] + 140202222819840 -> 140202222820320 + 140202222819840 [label=ViewBackward0] + 140202222820512 -> 140202222819840 + 140202222820512 [label=AddmmBackward0] + 140202222842208 -> 140202222820512 + 140202222842208 [label=ToCopyBackward0] + 140202222843264 -> 140202222842208 + 140202228966880 [label="encoder.layer.7.output.dense.bias + (768)" fillcolor=lightblue] + 140202228966880 -> 140202222843264 + 140202222843264 [label=AccumulateGrad] + 140202222842304 -> 140202222820512 + 140202222842304 [label=ViewBackward0] + 140202222843792 -> 140202222842304 + 140202222843792 [label=GeluBackward0] + 140202222843168 -> 140202222843792 + 140202222843168 [label=ViewBackward0] + 140202222844800 -> 140202222843168 + 140202222844800 [label=AddmmBackward0] + 140202222845376 -> 140202222844800 + 140202222845376 [label=ToCopyBackward0] + 140210811726144 -> 140202222845376 + 140202228967120 [label="encoder.layer.7.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202228967120 -> 140210811726144 + 140210811726144 [label=AccumulateGrad] + 140202222845088 -> 140202222844800 + 140202222845088 [label=ViewBackward0] + 140210811726672 -> 140202222845088 + 140210811726672 [label=ToCopyBackward0] + 140202222819408 -> 140210811726672 + 140202222819408 [label=SliceBackward0] + 140210811726720 -> 140202222819408 + 140210811726720 [label=SliceBackward0] + 140210811726816 -> 140210811726720 + 140210811726816 [label=SliceBackward0] + 140202222820224 -> 140210811726816 + 140202222842976 -> 140202222844800 + 140202222842976 [label=TBackward0] + 140210811726528 -> 140202222842976 + 140210811726528 [label=ToCopyBackward0] + 140210811726912 -> 140210811726528 + 140202228967360 [label="encoder.layer.7.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202228967360 -> 140210811726912 + 140210811726912 [label=AccumulateGrad] + 140202222841920 -> 140202222820512 + 140202222841920 [label=TBackward0] + 140202222843552 -> 140202222841920 + 140202222843552 [label=ToCopyBackward0] + 140202222871072 -> 140202222843552 + 140202228966800 [label="encoder.layer.7.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202228966800 -> 140202222871072 + 140202222871072 [label=AccumulateGrad] + 140202222819408 -> 140202222818784 + 140202222818496 -> 140202222817632 + 140202228966560 [label="encoder.layer.7.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228966560 -> 140202222818496 + 140202222818496 
[label=AccumulateGrad] + 140202222818448 -> 140202222817632 + 140202228966640 [label="encoder.layer.7.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228966640 -> 140202222818448 + 140202222818448 [label=AccumulateGrad] + 140202222817440 -> 140202223316800 + 140202222817440 [label=TBackward0] + 140202222817728 -> 140202222817440 + 140202222817728 [label=ToCopyBackward0] + 140202222819264 -> 140202222817728 + 140202228952496 [label="encoder.layer.8.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202228952496 -> 140202222819264 + 140202222819264 [label=AccumulateGrad] + 140202223316128 -> 140202223315840 + 140202223316128 [label=ReshapeAliasBackward0] + 140202223316512 -> 140202223316128 + 140202223316512 [label=ExpandBackward0] + 140202223316176 -> 140202223316512 + 140202223316176 [label=TransposeBackward0] + 140202223316224 -> 140202223316176 + 140202223316224 [label=PermuteBackward0] + 140202222821280 -> 140202223316224 + 140202222821280 [label=ViewBackward0] + 140202222817968 -> 140202222821280 + 140202222817968 [label=ViewBackward0] + 140202222817536 -> 140202222817968 + 140202222817536 [label=AddmmBackward0] + 140202222844416 -> 140202222817536 + 140202222844416 [label=ToCopyBackward0] + 140210811726624 -> 140202222844416 + 140202228952336 [label="encoder.layer.8.attention.self.key.bias + (768)" fillcolor=lightblue] + 140202228952336 -> 140210811726624 + 140210811726624 [label=AccumulateGrad] + 140202222842496 -> 140202222817536 + 140202222842496 [label=ViewBackward0] + 140210811726960 -> 140202222842496 + 140210811726960 [label=ToCopyBackward0] + 140202223288032 -> 140210811726960 + 140210811724368 -> 140202222817536 + 140210811724368 [label=TBackward0] + 140210811726576 -> 140210811724368 + 140210811726576 [label=ToCopyBackward0] + 140210811727104 -> 140210811726576 + 140202228952256 [label="encoder.layer.8.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140202228952256 -> 140210811727104 + 140210811727104 [label=AccumulateGrad] + 140202223314400 -> 140202223314496 + 140202223314400 [label=ReshapeAliasBackward0] + 140202223315168 -> 140202223314400 + 140202223315168 [label=ExpandBackward0] + 140202223315456 -> 140202223315168 + 140202223315456 [label=PermuteBackward0] + 140202223315696 -> 140202223315456 + 140202223315696 [label=ViewBackward0] + 140202223314592 -> 140202223315696 + 140202223314592 [label=ViewBackward0] + 140202223316704 -> 140202223314592 + 140202223316704 [label=AddmmBackward0] + 140202223314736 -> 140202223316704 + 140202223314736 [label=ToCopyBackward0] + 140202222842688 -> 140202223314736 + 140202228952096 [label="encoder.layer.8.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202228952096 -> 140202222842688 + 140202222842688 [label=AccumulateGrad] + 140202222818976 -> 140202223316704 + 140202222818976 [label=ViewBackward0] + 140210811726864 -> 140202222818976 + 140210811726864 [label=ToCopyBackward0] + 140202223288032 -> 140210811726864 + 140202222818016 -> 140202223316704 + 140202222818016 [label=TBackward0] + 140210811726768 -> 140202222818016 + 140210811726768 [label=ToCopyBackward0] + 140210811727008 -> 140210811726768 + 140202228952016 [label="encoder.layer.8.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202228952016 -> 140210811727008 + 140210811727008 [label=AccumulateGrad] + 140202223313056 -> 140202223313152 + 140202223313056 [label=TBackward0] + 140202223313920 -> 140202223313056 + 140202223313920 [label=ToCopyBackward0] + 140202223314304 -> 140202223313920 + 
140202228951776 [label="encoder.layer.8.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202228951776 -> 140202223314304 + 140202223314304 [label=AccumulateGrad] + 140202223288032 -> 140202223287936 + 140202223287744 -> 140202223287696 + 140202228951536 [label="encoder.layer.8.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228951536 -> 140202223287744 + 140202223287744 [label=AccumulateGrad] + 140202223286976 -> 140202223287696 + 140202228951616 [label="encoder.layer.8.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228951616 -> 140202223286976 + 140202223286976 [label=AccumulateGrad] + 140202223285776 -> 140202223286688 + 140202223285776 [label=TBackward0] + 140202223286880 -> 140202223285776 + 140202223286880 [label=ToCopyBackward0] + 140202223287552 -> 140202223286880 + 140202228951296 [label="encoder.layer.8.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202228951296 -> 140202223287552 + 140202223287552 [label=AccumulateGrad] + 140202223285728 -> 140202223285440 + 140202223285728 [label=ReshapeAliasBackward0] + 140202223286112 -> 140202223285728 + 140202223286112 [label=ExpandBackward0] + 140202223286400 -> 140202223286112 + 140202223286400 [label=TransposeBackward0] + 140202223287264 -> 140202223286400 + 140202223287264 [label=PermuteBackward0] + 140202223288128 -> 140202223287264 + 140202223288128 [label=ViewBackward0] + 140202223287216 -> 140202223288128 + 140202223287216 [label=ViewBackward0] + 140202223285824 -> 140202223287216 + 140202223285824 [label=AddmmBackward0] + 140202223313440 -> 140202223285824 + 140202223313440 [label=ToCopyBackward0] + 140202223313824 -> 140202223313440 + 140202228951136 [label="encoder.layer.8.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140202228951136 -> 140202223313824 + 140202223313824 [label=AccumulateGrad] + 140202223313536 -> 140202223285824 + 140202223313536 [label=ViewBackward0] + 140202223315216 -> 140202223313536 + 140202223315216 [label=ToCopyBackward0] + 140210812052960 -> 140202223315216 + 140202223312960 -> 140202223285824 + 140202223312960 [label=TBackward0] + 140202223314976 -> 140202223312960 + 140202223314976 [label=ToCopyBackward0] + 140202223316320 -> 140202223314976 + 140202228951056 [label="encoder.layer.8.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140202228951056 -> 140202223316320 + 140202223316320 [label=AccumulateGrad] + 140202223251168 -> 140202223251264 + 140202223251168 [label=ReshapeAliasBackward0] + 140202223284768 -> 140202223251168 + 140202223284768 [label=ExpandBackward0] + 140202223285056 -> 140202223284768 + 140202223285056 [label=PermuteBackward0] + 140202223285296 -> 140202223285056 + 140202223285296 [label=ViewBackward0] + 140202223284288 -> 140202223285296 + 140202223284288 [label=ViewBackward0] + 140202223286304 -> 140202223284288 + 140202223286304 [label=AddmmBackward0] + 140202223287360 -> 140202223286304 + 140202223287360 [label=ToCopyBackward0] + 140202222820848 -> 140202223287360 + 140202228950896 [label="encoder.layer.8.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140202228950896 -> 140202222820848 + 140202222820848 [label=AccumulateGrad] + 140202223286784 -> 140202223286304 + 140202223286784 [label=ViewBackward0] + 140202223315936 -> 140202223286784 + 140202223315936 [label=ToCopyBackward0] + 140210812052960 -> 140202223315936 + 140202223284336 -> 140202223286304 + 140202223284336 [label=TBackward0] + 140202223313248 -> 140202223284336 + 
140202223313248 [label=ToCopyBackward0] + 140202223314112 -> 140202223313248 + 140202228950816 [label="encoder.layer.8.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140202228950816 -> 140202223314112 + 140202223314112 [label=AccumulateGrad] + 140202223249584 -> 140202223250016 + 140202223249584 [label=TBackward0] + 140202223250784 -> 140202223249584 + 140202223250784 [label=ToCopyBackward0] + 140202223251072 -> 140202223250784 + 140202228950576 [label="encoder.layer.8.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202228950576 -> 140202223251072 + 140202223251072 [label=AccumulateGrad] + 140202223249536 -> 140202223249152 + 140202223249104 -> 140202223248960 + 140202228950336 [label="encoder.layer.8.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228950336 -> 140202223249104 + 140202223249104 [label=AccumulateGrad] + 140202223248288 -> 140202223248960 + 140202228950416 [label="encoder.layer.8.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228950416 -> 140202223248288 + 140202223248288 [label=AccumulateGrad] + 140202223247520 -> 140202223248000 + 140202223247520 [label=TBackward0] + 140202223248576 -> 140202223247520 + 140202223248576 [label=ToCopyBackward0] + 140202223249344 -> 140202223248576 + 140202228934912 [label="encoder.layer.8.experts.experts.0.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228934912 -> 140202223249344 + 140202223249344 [label=AccumulateGrad] + 140202223230256 -> 140202223230736 + 140202223230256 [label=TBackward0] + 140202223248096 -> 140202223230256 + 140202223248096 [label=ToCopyBackward0] + 140202223248864 -> 140202223248096 + 140202228934592 [label="encoder.layer.8.experts.experts.0.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228934592 -> 140202223248864 + 140202223248864 [label=AccumulateGrad] + 140202223229920 -> 140202223230016 + 140202223229920 [label=UnsqueezeBackward0] + 140202223230592 -> 140202223229920 + 140202223230592 [label=NativeDropoutBackward0] + 140202223230304 -> 140202223230592 + 140202223230304 [label=ViewBackward0] + 140202223249632 -> 140202223230304 + 140202223249632 [label=AddmmBackward0] + 140202223247904 -> 140202223249632 + 140202223247904 [label=ToCopyBackward0] + 140202223250112 -> 140202223247904 + 140202228934672 [label="encoder.layer.8.experts.experts.1.dense2.bias + (768)" fillcolor=lightblue] + 140202228934672 -> 140202223250112 + 140202223250112 [label=AccumulateGrad] + 140202223249056 -> 140202223249632 + 140202223249056 [label=ViewBackward0] + 140202223250304 -> 140202223249056 + 140202223250304 [label=GeluBackward0] + 140202223251024 -> 140202223250304 + 140202223251024 [label=ViewBackward0] + 140202223250544 -> 140202223251024 + 140202223250544 [label=AddmmBackward0] + 140202223285248 -> 140202223250544 + 140202223285248 [label=ToCopyBackward0] + 140202223287840 -> 140202223285248 + 140202228935072 [label="encoder.layer.8.experts.experts.1.dense1.bias + (3072)" fillcolor=lightblue] + 140202228935072 -> 140202223287840 + 140202223287840 [label=AccumulateGrad] + 140202223284816 -> 140202223250544 + 140202223284816 [label=ViewBackward0] + 140202223315648 -> 140202223284816 + 140202223315648 [label=ToCopyBackward0] + 140202223229152 -> 140202223315648 + 140202223284576 -> 140202223250544 + 140202223284576 [label=TBackward0] + 140202223285920 -> 140202223284576 + 140202223285920 [label=ToCopyBackward0] + 140210811727152 -> 140202223285920 + 140202228934352 
[label="encoder.layer.8.experts.experts.1.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228934352 -> 140210811727152 + 140210811727152 [label=AccumulateGrad] + 140202223247424 -> 140202223249632 + 140202223247424 [label=TBackward0] + 140202223249728 -> 140202223247424 + 140202223249728 [label=ToCopyBackward0] + 140202223313728 -> 140202223249728 + 140202228934112 [label="encoder.layer.8.experts.experts.1.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228934112 -> 140202223313728 + 140202223313728 [label=AccumulateGrad] + 140202223229776 -> 140202223230016 + 140202223229776 [label=UnsqueezeBackward0] + 140202223285536 -> 140202223229776 + 140202223285536 [label=NativeDropoutBackward0] + 140202223249248 -> 140202223285536 + 140202223249248 [label=ViewBackward0] + 140202223250208 -> 140202223249248 + 140202223250208 [label=AddmmBackward0] + 140202223247616 -> 140202223250208 + 140202223247616 [label=ToCopyBackward0] + 140210811727392 -> 140202223247616 + 140202228934192 [label="encoder.layer.8.experts.experts.2.dense2.bias + (768)" fillcolor=lightblue] + 140202228934192 -> 140210811727392 + 140210811727392 [label=AccumulateGrad] + 140210811727296 -> 140202223250208 + 140210811727296 [label=ViewBackward0] + 140210811727440 -> 140210811727296 + 140210811727440 [label=GeluBackward0] + 140210811727536 -> 140210811727440 + 140210811727536 [label=ViewBackward0] + 140210811727632 -> 140210811727536 + 140210811727632 [label=AddmmBackward0] + 140210811727728 -> 140210811727632 + 140210811727728 [label=ToCopyBackward0] + 140210811727824 -> 140210811727728 + 140202228934432 [label="encoder.layer.8.experts.experts.2.dense1.bias + (3072)" fillcolor=lightblue] + 140202228934432 -> 140210811727824 + 140210811727824 [label=AccumulateGrad] + 140210811727680 -> 140210811727632 + 140210811727680 [label=ViewBackward0] + 140210811781280 -> 140210811727680 + 140210811781280 [label=ToCopyBackward0] + 140202223229152 -> 140210811781280 + 140210811727344 -> 140210811727632 + 140210811727344 [label=TBackward0] + 140210811781232 -> 140210811727344 + 140210811781232 [label=ToCopyBackward0] + 140210811781424 -> 140210811781232 + 140202228933872 [label="encoder.layer.8.experts.experts.2.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228933872 -> 140210811781424 + 140210811781424 [label=AccumulateGrad] + 140210811727200 -> 140202223250208 + 140210811727200 [label=TBackward0] + 140210811727584 -> 140210811727200 + 140210811727584 [label=ToCopyBackward0] + 140210811727776 -> 140210811727584 + 140202228933632 [label="encoder.layer.8.experts.experts.2.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228933632 -> 140210811727776 + 140210811727776 [label=AccumulateGrad] + 140202223229728 -> 140202223229440 + 140202223229728 [label=UnsqueezeBackward0] + 140202223230400 -> 140202223229728 + 140202223230400 [label=UnsqueezeBackward0] + 140202223248672 -> 140202223230400 + 140202223248672 [label=SumBackward1] + 140202223229824 -> 140202223248672 + 140202223229824 [label=MulBackward0] + 140210811727488 -> 140202223229824 + 140210811727488 [label=UnsqueezeBackward0] + 140210811781376 -> 140210811727488 + 140210811781376 [label=TopkBackward0] + 140210811781328 -> 140210811781376 + 140210811781328 [label=SoftmaxBackward0] + 140210811781616 -> 140210811781328 + 140210811781616 [label=MmBackward0] + 140210811781712 -> 140210811781616 + 140210811781712 [label=ToCopyBackward0] + 140210811781856 -> 140210811781712 + 140210811781856 [label=MeanBackward1] + 140210811781952 -> 140210811781856 + 
140210811781952 [label=MulBackward0] + 140202223229152 -> 140210811781952 + 140210811781664 -> 140210811781616 + 140210811781664 [label=TBackward0] + 140210811782048 -> 140210811781664 + 140210811782048 [label=ToCopyBackward0] + 140210811781760 -> 140210811782048 + 140202228935872 [label="encoder.layer.8.experts.gate.weight + (3, 768)" fillcolor=lightblue] + 140202228935872 -> 140210811781760 + 140210811781760 [label=AccumulateGrad] + 140202223229152 -> 140202223229056 + 140202223228864 -> 140202223228672 + 140202228935792 [label="encoder.layer.8.expert_ln.weight + (768)" fillcolor=lightblue] + 140202228935792 -> 140202223228864 + 140202223228864 [label=AccumulateGrad] + 140202223228816 -> 140202223228672 + 140202228935552 [label="encoder.layer.8.expert_ln.bias + (768)" fillcolor=lightblue] + 140202228935552 -> 140202223228816 + 140202223228816 [label=AccumulateGrad] + 140202223228336 -> 140202223195040 + 140202223228336 [label=NativeLayerNormBackward0] + 140202223229536 -> 140202223228336 + 140202223229536 [label=AddBackward0] + 140202223248384 -> 140202223229536 + 140202223248384 [label=NativeDropoutBackward0] + 140210811727248 -> 140202223248384 + 140210811727248 [label=ViewBackward0] + 140210811781520 -> 140210811727248 + 140210811781520 [label=AddmmBackward0] + 140210811782000 -> 140210811781520 + 140210811782000 [label=ToCopyBackward0] + 140210811782192 -> 140210811782000 + 140202228949936 [label="encoder.layer.8.output.dense.bias + (768)" fillcolor=lightblue] + 140202228949936 -> 140210811782192 + 140210811782192 [label=AccumulateGrad] + 140210811781904 -> 140210811781520 + 140210811781904 [label=ViewBackward0] + 140210811782240 -> 140210811781904 + 140210811782240 [label=GeluBackward0] + 140210811782336 -> 140210811782240 + 140210811782336 [label=ViewBackward0] + 140210811782432 -> 140210811782336 + 140210811782432 [label=AddmmBackward0] + 140210811782528 -> 140210811782432 + 140210811782528 [label=ToCopyBackward0] + 140210811782720 -> 140210811782528 + 140202228950176 [label="encoder.layer.8.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202228950176 -> 140210811782720 + 140210811782720 [label=AccumulateGrad] + 140210811782480 -> 140210811782432 + 140210811782480 [label=ViewBackward0] + 140210811782768 -> 140210811782480 + 140210811782768 [label=ToCopyBackward0] + 140202223230112 -> 140210811782768 + 140202223230112 [label=SliceBackward0] + 140210811782912 -> 140202223230112 + 140210811782912 [label=SliceBackward0] + 140210811783008 -> 140210811782912 + 140210811783008 [label=SliceBackward0] + 140202223287696 -> 140210811783008 + 140210811781808 -> 140210811782432 + 140210811781808 [label=TBackward0] + 140210811782672 -> 140210811781808 + 140210811782672 [label=ToCopyBackward0] + 140210811783104 -> 140210811782672 + 140202228950096 [label="encoder.layer.8.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202228950096 -> 140210811783104 + 140210811783104 [label=AccumulateGrad] + 140210811781184 -> 140210811781520 + 140210811781184 [label=TBackward0] + 140210811782384 -> 140210811781184 + 140210811782384 [label=ToCopyBackward0] + 140210811782864 -> 140210811782384 + 140202228949856 [label="encoder.layer.8.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202228949856 -> 140210811782864 + 140210811782864 [label=AccumulateGrad] + 140202223230112 -> 140202223229536 + 140202223229248 -> 140202223228336 + 140202228949616 [label="encoder.layer.8.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228949616 -> 140202223229248 + 
140202223229248 [label=AccumulateGrad] + 140202223228960 -> 140202223228336 + 140202228949696 [label="encoder.layer.8.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228949696 -> 140202223228960 + 140202223228960 [label=AccumulateGrad] + 140202223226992 -> 140202223227904 + 140202223226992 [label=TBackward0] + 140202223228192 -> 140202223226992 + 140202223228192 [label=ToCopyBackward0] + 140210811727056 -> 140202223228192 + 140202228936112 [label="encoder.layer.9.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202228936112 -> 140210811727056 + 140210811727056 [label=AccumulateGrad] + 140202223226944 -> 140202223198016 + 140202223226944 [label=ReshapeAliasBackward0] + 140202223227616 -> 140202223226944 + 140202223227616 [label=ExpandBackward0] + 140202223227856 -> 140202223227616 + 140202223227856 [label=TransposeBackward0] + 140202223228768 -> 140202223227856 + 140202223228768 [label=PermuteBackward0] + 140202223228480 -> 140202223228768 + 140202223228480 [label=ViewBackward0] + 140202223227328 -> 140202223228480 + 140202223227328 [label=ViewBackward0] + 140210811782096 -> 140202223227328 + 140210811782096 [label=AddmmBackward0] + 140210811782624 -> 140210811782096 + 140210811782624 [label=ToCopyBackward0] + 140210811782816 -> 140210811782624 + 140202228936272 [label="encoder.layer.9.attention.self.key.bias + (768)" fillcolor=lightblue] + 140202228936272 -> 140210811782816 + 140210811782816 [label=AccumulateGrad] + 140210811782576 -> 140210811782096 + 140210811782576 [label=ViewBackward0] + 140210811783152 -> 140210811782576 + 140210811783152 [label=ToCopyBackward0] + 140202223195040 -> 140210811783152 + 140210811781568 -> 140210811782096 + 140210811781568 [label=TBackward0] + 140210811782288 -> 140210811781568 + 140210811782288 [label=ToCopyBackward0] + 140210811783296 -> 140210811782288 + 140202228936352 [label="encoder.layer.9.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140202228936352 -> 140210811783296 + 140210811783296 [label=AccumulateGrad] + 140202223196720 -> 140202223196576 + 140202223196720 [label=ReshapeAliasBackward0] + 140202223197248 -> 140202223196720 + 140202223197248 [label=ExpandBackward0] + 140202223197536 -> 140202223197248 + 140202223197536 [label=PermuteBackward0] + 140202223197824 -> 140202223197536 + 140202223197824 [label=ViewBackward0] + 140202223196960 -> 140202223197824 + 140202223196960 [label=ViewBackward0] + 140202223227808 -> 140202223196960 + 140202223227808 [label=AddmmBackward0] + 140202223229344 -> 140202223227808 + 140202223229344 [label=ToCopyBackward0] + 140210811783248 -> 140202223229344 + 140202228935312 [label="encoder.layer.9.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202228935312 -> 140210811783248 + 140210811783248 [label=AccumulateGrad] + 140202223228288 -> 140202223227808 + 140202223228288 [label=ViewBackward0] + 140210811783056 -> 140202223228288 + 140210811783056 [label=ToCopyBackward0] + 140202223195040 -> 140210811783056 + 140202223227040 -> 140202223227808 + 140202223227040 [label=TBackward0] + 140210811782144 -> 140202223227040 + 140210811782144 [label=ToCopyBackward0] + 140210811783200 -> 140210811782144 + 140202228935632 [label="encoder.layer.9.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202228935632 -> 140210811783200 + 140210811783200 [label=AccumulateGrad] + 140202223195232 -> 140202223195520 + 140202223195232 [label=TBackward0] + 140202223196240 -> 140202223195232 + 140202223196240 [label=ToCopyBackward0] + 140202223196672 -> 
140202223196240 + 140202228935392 [label="encoder.layer.9.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202228935392 -> 140202223196672 + 140202223196672 [label=AccumulateGrad] + 140202223195040 -> 140202223194656 + 140202223194752 -> 140202223194464 + 140202228933952 [label="encoder.layer.9.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228933952 -> 140202223194752 + 140202223194752 [label=AccumulateGrad] + 140202223194272 -> 140202223194464 + 140202228933712 [label="encoder.layer.9.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228933712 -> 140202223194272 + 140202223194272 [label=AccumulateGrad] + 140202223172000 -> 140202223172480 + 140202223172000 [label=TBackward0] + 140202223173056 -> 140202223172000 + 140202223173056 [label=ToCopyBackward0] + 140202223173536 -> 140202223173056 + 140202228927840 [label="encoder.layer.9.experts.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228927840 -> 140202223173536 + 140202223173536 [label=AccumulateGrad] + 140202223171376 -> 140202223171808 + 140202223171376 [label=TBackward0] + 140202223172576 -> 140202223171376 + 140202223172576 [label=ToCopyBackward0] + 140202223173296 -> 140202223172576 + 140202228927600 [label="encoder.layer.9.experts.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228927600 -> 140202223173296 + 140202223173296 [label=AccumulateGrad] + 140202223171328 -> 140202223170944 + 140202223170896 -> 140202223170848 + 140202228927360 [label="encoder.layer.9.expert_ln.weight + (768)" fillcolor=lightblue] + 140202228927360 -> 140202223170896 + 140202223170896 [label=AccumulateGrad] + 140202223170752 -> 140202223170848 + 140202228927040 [label="encoder.layer.9.expert_ln.bias + (768)" fillcolor=lightblue] + 140202228927040 -> 140202223170752 + 140202223170752 [label=AccumulateGrad] + 140202223170272 -> 140202223136928 + 140202223170272 [label=NativeLayerNormBackward0] + 140202223171424 -> 140202223170272 + 140202223171424 [label=AddBackward0] + 140202223172816 -> 140202223171424 + 140202223172816 [label=NativeDropoutBackward0] + 140202223172336 -> 140202223172816 + 140202223172336 [label=ViewBackward0] + 140202223194176 -> 140202223172336 + 140202223194176 [label=AddmmBackward0] + 140202223194848 -> 140202223194176 + 140202223194848 [label=ToCopyBackward0] + 140202223195760 -> 140202223194848 + 140202228932912 [label="encoder.layer.9.output.dense.bias + (768)" fillcolor=lightblue] + 140202228932912 -> 140202223195760 + 140202223195760 [label=AccumulateGrad] + 140202223194800 -> 140202223194176 + 140202223194800 [label=ViewBackward0] + 140202223196480 -> 140202223194800 + 140202223196480 [label=GeluBackward0] + 140202223195808 -> 140202223196480 + 140202223195808 [label=ViewBackward0] + 140202223197344 -> 140202223195808 + 140202223197344 [label=AddmmBackward0] + 140202223196864 -> 140202223197344 + 140202223196864 [label=ToCopyBackward0] + 140210811782960 -> 140202223196864 + 140202228933152 [label="encoder.layer.9.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202228933152 -> 140210811782960 + 140210811782960 [label=AccumulateGrad] + 140202223197728 -> 140202223197344 + 140202223197728 [label=ViewBackward0] + 140210811783488 -> 140202223197728 + 140210811783488 [label=ToCopyBackward0] + 140202223172096 -> 140210811783488 + 140202223172096 [label=SliceBackward0] + 140210811783536 -> 140202223172096 + 140210811783536 [label=SliceBackward0] + 140210811783632 -> 140210811783536 + 140210811783632 [label=SliceBackward0] + 140202223194464 
-> 140210811783632 + 140202223195616 -> 140202223197344 + 140202223195616 [label=TBackward0] + 140210811783344 -> 140202223195616 + 140210811783344 [label=ToCopyBackward0] + 140210811783728 -> 140210811783344 + 140202228933392 [label="encoder.layer.9.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202228933392 -> 140210811783728 + 140210811783728 [label=AccumulateGrad] + 140202223194560 -> 140202223194176 + 140202223194560 [label=TBackward0] + 140202223196192 -> 140202223194560 + 140202223196192 [label=ToCopyBackward0] + 140202223227376 -> 140202223196192 + 140202228933232 [label="encoder.layer.9.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202228933232 -> 140202223227376 + 140202223227376 [label=AccumulateGrad] + 140202223172096 -> 140202223171424 + 140202223171040 -> 140202223170272 + 140202228932992 [label="encoder.layer.9.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228932992 -> 140202223171040 + 140202223171040 [label=AccumulateGrad] + 140202223171136 -> 140202223170272 + 140202228932672 [label="encoder.layer.9.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228932672 -> 140202223171136 + 140202223171136 [label=AccumulateGrad] + 140202223169696 -> 140202223169936 + 140202223169696 [label=TBackward0] + 140202223170368 -> 140202223169696 + 140202223170368 [label=ToCopyBackward0] + 140202223171904 -> 140202223170368 + 140202228927120 [label="encoder.layer.10.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202228927120 -> 140202223171904 + 140202223171904 [label=AccumulateGrad] + 140202223140240 -> 140202223140096 + 140202223140240 [label=ReshapeAliasBackward0] + 140202223140480 -> 140202223140240 + 140202223140480 [label=ExpandBackward0] + 140202223140384 -> 140202223140480 + 140202223140384 [label=TransposeBackward0] + 140202223170560 -> 140202223140384 + 140202223170560 [label=PermuteBackward0] + 140202223173152 -> 140202223170560 + 140202223173152 [label=ViewBackward0] + 140202223170656 -> 140202223173152 + 140202223170656 [label=ViewBackward0] + 140202223195328 -> 140202223170656 + 140202223195328 [label=AddmmBackward0] + 140202223197056 -> 140202223195328 + 140202223197056 [label=ToCopyBackward0] + 140210811783440 -> 140202223197056 + 140202228926560 [label="encoder.layer.10.attention.self.key.bias + (768)" fillcolor=lightblue] + 140202228926560 -> 140210811783440 + 140210811783440 [label=AccumulateGrad] + 140202223194320 -> 140202223195328 + 140202223194320 [label=ViewBackward0] + 140210811783776 -> 140202223194320 + 140210811783776 [label=ToCopyBackward0] + 140202223136928 -> 140210811783776 + 140210811781472 -> 140202223195328 + 140210811781472 [label=TBackward0] + 140210811783392 -> 140210811781472 + 140210811783392 [label=ToCopyBackward0] + 140210811783920 -> 140210811783392 + 140202228926880 [label="encoder.layer.10.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140202228926880 -> 140210811783920 + 140210811783920 [label=AccumulateGrad] + 140202223138656 -> 140202223138752 + 140202223138656 [label=ReshapeAliasBackward0] + 140202223139280 -> 140202223138656 + 140202223139280 [label=ExpandBackward0] + 140202223139712 -> 140202223139280 + 140202223139712 [label=PermuteBackward0] + 140202223140000 -> 140202223139712 + 140202223140000 [label=ViewBackward0] + 140202223138848 -> 140202223140000 + 140202223138848 [label=ViewBackward0] + 140202223140576 -> 140202223138848 + 140202223140576 [label=AddmmBackward0] + 140202223171520 -> 140202223140576 + 140202223171520 
[label=ToCopyBackward0] + 140202223195136 -> 140202223171520 + 140202228926320 [label="encoder.layer.10.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202228926320 -> 140202223195136 + 140202223195136 [label=AccumulateGrad] + 140202223170080 -> 140202223140576 + 140202223170080 [label=ViewBackward0] + 140210811783680 -> 140202223170080 + 140210811783680 [label=ToCopyBackward0] + 140202223136928 -> 140210811783680 + 140202223169792 -> 140202223140576 + 140202223169792 [label=TBackward0] + 140210811783584 -> 140202223169792 + 140210811783584 [label=ToCopyBackward0] + 140210811783824 -> 140210811783584 + 140202228926640 [label="encoder.layer.10.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202228926640 -> 140210811783824 + 140210811783824 [label=AccumulateGrad] + 140202223137024 -> 140202223137408 + 140202223137024 [label=TBackward0] + 140202223138176 -> 140202223137024 + 140202223138176 [label=ToCopyBackward0] + 140202223138464 -> 140202223138176 + 140202228926400 [label="encoder.layer.10.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202228926400 -> 140202223138464 + 140202223138464 [label=AccumulateGrad] + 140202223136928 -> 140202223112096 + 140202223111904 -> 140202223112000 + 140202228926160 [label="encoder.layer.10.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228926160 -> 140202223111904 + 140202223111904 [label=AccumulateGrad] + 140202223111232 -> 140202223112000 + 140202228925840 [label="encoder.layer.10.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228925840 -> 140202223111232 + 140202223111232 [label=AccumulateGrad] + 140202223110080 -> 140202223110800 + 140202223110080 [label=TBackward0] + 140202223111136 -> 140202223110080 + 140202223111136 [label=ToCopyBackward0] + 140202223111808 -> 140202223111136 + 140202228925920 [label="encoder.layer.10.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202228925920 -> 140202223111808 + 140202223111808 [label=AccumulateGrad] + 140202223109840 -> 140202223109696 + 140202223109840 [label=ReshapeAliasBackward0] + 140202223110368 -> 140202223109840 + 140202223110368 [label=ExpandBackward0] + 140202223110656 -> 140202223110368 + 140202223110656 [label=TransposeBackward0] + 140202223111424 -> 140202223110656 + 140202223111424 [label=PermuteBackward0] + 140202223111520 -> 140202223111424 + 140202223111520 [label=ViewBackward0] + 140202223109984 -> 140202223111520 + 140202223109984 [label=ViewBackward0] + 140202223137360 -> 140202223109984 + 140202223137360 [label=AddmmBackward0] + 140202223137696 -> 140202223137360 + 140202223137696 [label=ToCopyBackward0] + 140202223137984 -> 140202223137696 + 140202228925360 [label="encoder.layer.10.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140202228925360 -> 140202223137984 + 140202223137984 [label=AccumulateGrad] + 140202223137792 -> 140202223137360 + 140202223137792 [label=ViewBackward0] + 140202223139520 -> 140202223137792 + 140202223139520 [label=ToCopyBackward0] + 140210812052960 -> 140202223139520 + 140202223136880 -> 140202223137360 + 140202223136880 [label=TBackward0] + 140202223139232 -> 140202223136880 + 140202223139232 [label=ToCopyBackward0] + 140202223139040 -> 140202223139232 + 140202228925680 [label="encoder.layer.10.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140202228925680 -> 140202223139040 + 140202223139040 [label=AccumulateGrad] + 140202223108256 -> 140202223082800 + 140202223108256 [label=ReshapeAliasBackward0] + 
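/* editor note: the (768, 1408) key/value weights in this region belong to encoder.layer.10.crossattention; the 1408-wide input suggests these project the frozen vision encoder features, while the 768-d self-attention weights above operate on Q-Former states. */ +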
140202223108880 -> 140202223108256 + 140202223108880 [label=ExpandBackward0] + 140202223109312 -> 140202223108880 + 140202223109312 [label=PermuteBackward0] + 140202223109600 -> 140202223109312 + 140202223109600 [label=ViewBackward0] + 140202223108448 -> 140202223109600 + 140202223108448 [label=ViewBackward0] + 140202223110464 -> 140202223108448 + 140202223110464 [label=AddmmBackward0] + 140202223111616 -> 140202223110464 + 140202223111616 [label=ToCopyBackward0] + 140202223138368 -> 140202223111616 + 140202228925120 [label="encoder.layer.10.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140202228925120 -> 140202223138368 + 140202223138368 [label=AccumulateGrad] + 140202223110944 -> 140202223110464 + 140202223110944 [label=ViewBackward0] + 140202223140192 -> 140202223110944 + 140202223140192 [label=ToCopyBackward0] + 140210812052960 -> 140202223140192 + 140202223108640 -> 140202223110464 + 140202223108640 [label=TBackward0] + 140202223136832 -> 140202223108640 + 140202223136832 [label=ToCopyBackward0] + 140202223137312 -> 140202223136832 + 140202228925440 [label="encoder.layer.10.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140202228925440 -> 140202223137312 + 140202223137312 [label=AccumulateGrad] + 140202223081984 -> 140202223082368 + 140202223081984 [label=TBackward0] + 140202223083136 -> 140202223081984 + 140202223083136 [label=ToCopyBackward0] + 140202223083328 -> 140202223083136 + 140202228925200 [label="encoder.layer.10.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202228925200 -> 140202223083328 + 140202223083328 [label=AccumulateGrad] + 140202223081888 -> 140202223081792 + 140202223081504 -> 140202223081600 + 140202228924960 [label="encoder.layer.10.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228924960 -> 140202223081504 + 140202223081504 [label=AccumulateGrad] + 140202223080880 -> 140202223081600 + 140202228924640 [label="encoder.layer.10.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228924640 -> 140202223080880 + 140202223080880 [label=AccumulateGrad] + 140202223080160 -> 140202223080640 + 140202223080160 [label=TBackward0] + 140202223080928 -> 140202223080160 + 140202223080928 [label=ToCopyBackward0] + 140202223081696 -> 140202223080928 + 140202228905040 [label="encoder.layer.10.experts.experts.0.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228905040 -> 140202223081696 + 140202223081696 [label=AccumulateGrad] + 140202223079584 -> 140202223079536 + 140202223079584 [label=TBackward0] + 140202223080448 -> 140202223079584 + 140202223080448 [label=ToCopyBackward0] + 140202223081216 -> 140202223080448 + 140202228905120 [label="encoder.layer.10.experts.experts.0.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228905120 -> 140202223081216 + 140202223081216 [label=AccumulateGrad] + 140202223578608 -> 140202223578464 + 140202223578608 [label=UnsqueezeBackward0] + 140202223579040 -> 140202223578608 + 140202223579040 [label=NativeDropoutBackward0] + 140202223079968 -> 140202223579040 + 140202223079968 [label=ViewBackward0] + 140202223082272 -> 140202223079968 + 140202223082272 [label=AddmmBackward0] + 140202223080256 -> 140202223082272 + 140202223080256 [label=ToCopyBackward0] + 140202223082752 -> 140202223080256 + 140202228904800 [label="encoder.layer.10.experts.experts.1.dense2.bias + (768)" fillcolor=lightblue] + 140202228904800 -> 140202223082752 + 140202223082752 [label=AccumulateGrad] + 140202223081408 -> 140202223082272 + 
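/* editor note: layer 10 carries three parallel expert FFNs (experts.experts.0/1/2, each with dense1 (3072, 768) and dense2 (768, 3072)); they consume the same layer-10 hidden state and are recombined by the gate-weighted sum that appears further below. */ +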
140202223081408 [label=ViewBackward0] + 140202223082656 -> 140202223081408 + 140202223082656 [label=GeluBackward0] + 140202223081312 -> 140202223082656 + 140202223081312 [label=ViewBackward0] + 140202223108352 -> 140202223081312 + 140202223108352 [label=AddmmBackward0] + 140202223109360 -> 140202223108352 + 140202223109360 [label=ToCopyBackward0] + 140202223169600 -> 140202223109360 + 140202228905600 [label="encoder.layer.10.experts.experts.1.dense1.bias + (3072)" fillcolor=lightblue] + 140202228905600 -> 140202223169600 + 140202223169600 [label=AccumulateGrad] + 140202223109120 -> 140202223108352 + 140202223109120 [label=ViewBackward0] + 140202223139760 -> 140202223109120 + 140202223139760 [label=ToCopyBackward0] + 140202223577888 -> 140202223139760 + 140202223108832 -> 140202223108352 + 140202223108832 [label=TBackward0] + 140202223110176 -> 140202223108832 + 140202223110176 [label=ToCopyBackward0] + 140210811783968 -> 140202223110176 + 140202228904880 [label="encoder.layer.10.experts.experts.1.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228904880 -> 140210811783968 + 140210811783968 [label=AccumulateGrad] + 140202223079488 -> 140202223082272 + 140202223079488 [label=TBackward0] + 140202223082320 -> 140202223079488 + 140202223082320 [label=ToCopyBackward0] + 140202223137840 -> 140202223082320 + 140202228904640 [label="encoder.layer.10.experts.experts.1.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228904640 -> 140202223137840 + 140202223137840 [label=AccumulateGrad] + 140202223578272 -> 140202223578464 + 140202223578272 [label=UnsqueezeBackward0] + 140202223080736 -> 140202223578272 + 140202223080736 [label=NativeDropoutBackward0] + 140202223082944 -> 140202223080736 + 140202223082944 [label=ViewBackward0] + 140202223108160 -> 140202223082944 + 140202223108160 [label=AddmmBackward0] + 140202223079680 -> 140202223108160 + 140202223079680 [label=ToCopyBackward0] + 140210811784208 -> 140202223079680 + 140202228895552 [label="encoder.layer.10.experts.experts.2.dense2.bias + (768)" fillcolor=lightblue] + 140202228895552 -> 140210811784208 + 140210811784208 [label=AccumulateGrad] + 140210811784112 -> 140202223108160 + 140210811784112 [label=ViewBackward0] + 140210811784256 -> 140210811784112 + 140210811784256 [label=GeluBackward0] + 140210811784352 -> 140210811784256 + 140210811784352 [label=ViewBackward0] + 140210811784448 -> 140210811784352 + 140210811784448 [label=AddmmBackward0] + 140210811784544 -> 140210811784448 + 140210811784544 [label=ToCopyBackward0] + 140210811784736 -> 140210811784544 + 140202228904560 [label="encoder.layer.10.experts.experts.2.dense1.bias + (3072)" fillcolor=lightblue] + 140202228904560 -> 140210811784736 + 140210811784736 [label=AccumulateGrad] + 140210811784496 -> 140210811784448 + 140210811784496 [label=ViewBackward0] + 140210811784784 -> 140210811784496 + 140210811784784 [label=ToCopyBackward0] + 140202223577888 -> 140210811784784 + 140210811784160 -> 140210811784448 + 140210811784160 [label=TBackward0] + 140210811784640 -> 140210811784160 + 140210811784640 [label=ToCopyBackward0] + 140210811784928 -> 140210811784640 + 140202228904400 [label="encoder.layer.10.experts.experts.2.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228904400 -> 140210811784928 + 140210811784928 [label=AccumulateGrad] + 140210811784016 -> 140202223108160 + 140210811784016 [label=TBackward0] + 140210811784400 -> 140210811784016 + 140210811784400 [label=ToCopyBackward0] + 140210811784880 -> 140210811784400 + 140202228904320 
[label="encoder.layer.10.experts.experts.2.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228904320 -> 140210811784880 + 140210811784880 [label=AccumulateGrad] + 140202223578176 -> 140202223578128 + 140202223578176 [label=UnsqueezeBackward0] + 140202223578848 -> 140202223578176 + 140202223578848 [label=UnsqueezeBackward0] + 140202223109792 -> 140202223578848 + 140202223109792 [label=SumBackward1] + 140202223079920 -> 140202223109792 + 140202223079920 [label=MulBackward0] + 140210811785024 -> 140202223079920 + 140210811785024 [label=UnsqueezeBackward0] + 140210811784304 -> 140210811785024 + 140210811784304 [label=TopkBackward0] + 140210811784832 -> 140210811784304 + 140210811784832 [label=SoftmaxBackward0] + 140210811785120 -> 140210811784832 + 140210811785120 [label=MmBackward0] + 140210811785168 -> 140210811785120 + 140210811785168 [label=ToCopyBackward0] + 140210811850960 -> 140210811785168 + 140210811850960 [label=MeanBackward1] + 140210811851056 -> 140210811850960 + 140210811851056 [label=MulBackward0] + 140202223577888 -> 140210811851056 + 140210811784064 -> 140210811785120 + 140210811784064 [label=TBackward0] + 140210811851152 -> 140210811784064 + 140210811851152 [label=ToCopyBackward0] + 140210811850864 -> 140210811851152 + 140202228906000 [label="encoder.layer.10.experts.gate.weight + (3, 768)" fillcolor=lightblue] + 140202228906000 -> 140210811850864 + 140210811850864 [label=AccumulateGrad] + 140202223577888 -> 140202223577504 + 140202223577600 -> 140202223577408 + 140202228906320 [label="encoder.layer.10.expert_ln.weight + (768)" fillcolor=lightblue] + 140202228906320 -> 140202223577600 + 140202223577600 [label=AccumulateGrad] + 140202223577312 -> 140202223577408 + 140202228906080 [label="encoder.layer.10.expert_ln.bias + (768)" fillcolor=lightblue] + 140202228906080 -> 140202223577312 + 140202223577312 [label=AccumulateGrad] + 140202223576832 -> 140202223540112 + 140202223576832 [label=NativeLayerNormBackward0] + 140202223577984 -> 140202223576832 + 140202223577984 [label=AddBackward0] + 140202223081840 -> 140202223577984 + 140202223081840 [label=NativeDropoutBackward0] + 140210811784976 -> 140202223081840 + 140210811784976 [label=ViewBackward0] + 140210811785072 -> 140210811784976 + 140210811785072 [label=AddmmBackward0] + 140210811851104 -> 140210811785072 + 140210811851104 [label=ToCopyBackward0] + 140210811851296 -> 140210811851104 + 140202228907680 [label="encoder.layer.10.output.dense.bias + (768)" fillcolor=lightblue] + 140202228907680 -> 140210811851296 + 140210811851296 [label=AccumulateGrad] + 140210811851008 -> 140210811785072 + 140210811851008 [label=ViewBackward0] + 140210811851344 -> 140210811851008 + 140210811851344 [label=GeluBackward0] + 140210811851440 -> 140210811851344 + 140210811851440 [label=ViewBackward0] + 140210811851536 -> 140210811851440 + 140210811851536 [label=AddmmBackward0] + 140210811851632 -> 140210811851536 + 140210811851632 [label=ToCopyBackward0] + 140210811851824 -> 140210811851632 + 140202228924480 [label="encoder.layer.10.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202228924480 -> 140210811851824 + 140210811851824 [label=AccumulateGrad] + 140210811851584 -> 140210811851536 + 140210811851584 [label=ViewBackward0] + 140210811851872 -> 140210811851584 + 140210811851872 [label=ToCopyBackward0] + 140202223578560 -> 140210811851872 + 140202223578560 [label=SliceBackward0] + 140210811852016 -> 140202223578560 + 140210811852016 [label=SliceBackward0] + 140210811852112 -> 140210811852016 + 140210811852112 
[label=SliceBackward0] + 140202223112000 -> 140210811852112 + 140210811850912 -> 140210811851536 + 140210811850912 [label=TBackward0] + 140210811851776 -> 140210811850912 + 140210811851776 [label=ToCopyBackward0] + 140210811852208 -> 140210811851776 + 140202228924720 [label="encoder.layer.10.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202228924720 -> 140210811852208 + 140210811852208 [label=AccumulateGrad] + 140210811850816 -> 140210811785072 + 140210811850816 [label=TBackward0] + 140210811851488 -> 140210811850816 + 140210811851488 [label=ToCopyBackward0] + 140210811851968 -> 140210811851488 + 140202228907920 [label="encoder.layer.10.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202228907920 -> 140210811851968 + 140210811851968 [label=AccumulateGrad] + 140202223578560 -> 140202223577984 + 140202223577696 -> 140202223576832 + 140202228907760 [label="encoder.layer.10.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228907760 -> 140202223577696 + 140202223577696 [label=AccumulateGrad] + 140202223577648 -> 140202223576832 + 140202228907440 [label="encoder.layer.10.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228907440 -> 140202223577648 + 140202223577648 [label=AccumulateGrad] + 140202223575728 -> 140202223576640 + 140202223575728 [label=TBackward0] + 140202223576928 -> 140202223575728 + 140202223576928 [label=ToCopyBackward0] + 140202223578080 -> 140202223576928 + 140202228906240 [label="encoder.layer.11.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140202228906240 -> 140202223578080 + 140202223578080 [label=AccumulateGrad] + 140202223575680 -> 140202223575392 + 140202223575680 [label=ReshapeAliasBackward0] + 140202223576064 -> 140202223575680 + 140202223576064 [label=ExpandBackward0] + 140202223576352 -> 140202223576064 + 140202223576352 [label=TransposeBackward0] + 140202223577216 -> 140202223576352 + 140202223577216 [label=PermuteBackward0] + 140202223577168 -> 140202223577216 + 140202223577168 [label=ViewBackward0] + 140210811783872 -> 140202223577168 + 140210811783872 [label=ViewBackward0] + 140210811784592 -> 140210811783872 + 140210811784592 [label=AddmmBackward0] + 140210811851728 -> 140210811784592 + 140210811851728 [label=ToCopyBackward0] + 140210811851920 -> 140210811851728 + 140202228906800 [label="encoder.layer.11.attention.self.key.bias + (768)" fillcolor=lightblue] + 140202228906800 -> 140210811851920 + 140210811851920 [label=AccumulateGrad] + 140210811851680 -> 140210811784592 + 140210811851680 [label=ViewBackward0] + 140210811852256 -> 140210811851680 + 140210811852256 [label=ToCopyBackward0] + 140202223540112 -> 140210811852256 + 140210811851248 -> 140210811784592 + 140210811851248 [label=TBackward0] + 140210811851392 -> 140210811851248 + 140210811851392 [label=ToCopyBackward0] + 140210811852400 -> 140210811851392 + 140202228906480 [label="encoder.layer.11.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140202228906480 -> 140210811852400 + 140210811852400 [label=AccumulateGrad] + 140202223541168 -> 140202223541312 + 140202223541168 [label=ReshapeAliasBackward0] + 140202223541888 -> 140202223541168 + 140202223541888 [label=ExpandBackward0] + 140202223542176 -> 140202223541888 + 140202223542176 [label=PermuteBackward0] + 140202223541456 -> 140202223542176 + 140202223541456 [label=ViewBackward0] + 140202223575200 -> 140202223541456 + 140202223575200 [label=ViewBackward0] + 140202223576256 -> 140202223575200 + 140202223576256 [label=AddmmBackward0] + 140202223575776 -> 
140202223576256 + 140202223575776 [label=ToCopyBackward0] + 140210811852352 -> 140202223575776 + 140202228905840 [label="encoder.layer.11.attention.self.value.bias + (768)" fillcolor=lightblue] + 140202228905840 -> 140210811852352 + 140210811852352 [label=AccumulateGrad] + 140202223576736 -> 140202223576256 + 140202223576736 [label=ViewBackward0] + 140210811852160 -> 140202223576736 + 140210811852160 [label=ToCopyBackward0] + 140202223540112 -> 140210811852160 + 140202223575104 -> 140202223576256 + 140202223575104 [label=TBackward0] + 140210811851200 -> 140202223575104 + 140210811851200 [label=ToCopyBackward0] + 140210811852304 -> 140210811851200 + 140202228905760 [label="encoder.layer.11.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140202228905760 -> 140210811852304 + 140210811852304 [label=AccumulateGrad] + 140202223540208 -> 140202223540400 + 140202223540208 [label=TBackward0] + 140202223540880 -> 140202223540208 + 140202223540880 [label=ToCopyBackward0] + 140202223541072 -> 140202223540880 + 140202228905520 [label="encoder.layer.11.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140202228905520 -> 140202223541072 + 140202223541072 [label=AccumulateGrad] + 140202223540112 -> 140202223540160 + 140202223539920 -> 140202223540064 + 140202228904160 [label="encoder.layer.11.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228904160 -> 140202223539920 + 140202223539920 [label=AccumulateGrad] + 140202223538576 -> 140202223540064 + 140202228895312 [label="encoder.layer.11.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228895312 -> 140202223538576 + 140202223538576 [label=AccumulateGrad] + 140202223539056 -> 140202223539680 + 140202223539056 [label=TBackward0] + 140202223538384 -> 140202223539056 + 140202223538384 [label=ToCopyBackward0] + 140202223539632 -> 140202223538384 + 140202228893872 [label="encoder.layer.11.experts.dense1.weight + (3072, 768)" fillcolor=lightblue] + 140202228893872 -> 140202223539632 + 140202223539632 [label=AccumulateGrad] + 140202223539344 -> 140202223539008 + 140202223539344 [label=TBackward0] + 140202223539488 -> 140202223539344 + 140202223539488 [label=ToCopyBackward0] + 140202223538240 -> 140202223539488 + 140202228893632 [label="encoder.layer.11.experts.dense2.weight + (768, 3072)" fillcolor=lightblue] + 140202228893632 -> 140202223538240 + 140202223538240 [label=AccumulateGrad] + 140202223538480 -> 140202228614096 + 140202228614192 -> 140202228615488 + 140202228893392 [label="encoder.layer.11.expert_ln.weight + (768)" fillcolor=lightblue] + 140202228893392 -> 140202228614192 + 140202228614192 [label=AccumulateGrad] + 140202228614336 -> 140202228615488 + 140202228893472 [label="encoder.layer.11.expert_ln.bias + (768)" fillcolor=lightblue] + 140202228893472 -> 140202228614336 + 140202228614336 [label=AccumulateGrad] + 140202228614480 -> 140202228657312 + 140202228614480 [label=NativeLayerNormBackward0] + 140202228614432 -> 140202228614480 + 140202228614432 [label=AddBackward0] + 140202223538816 -> 140202228614432 + 140202223538816 [label=NativeDropoutBackward0] + 140202223539392 -> 140202223538816 + 140202223539392 [label=ViewBackward0] + 140202223538432 -> 140202223539392 + 140202223538432 [label=AddmmBackward0] + 140202223540256 -> 140202223538432 + 140202223540256 [label=ToCopyBackward0] + 140202223540592 -> 140202223540256 + 140202228895152 [label="encoder.layer.11.output.dense.bias + (768)" fillcolor=lightblue] + 140202228895152 -> 140202223540592 + 140202223540592 
[label=AccumulateGrad] + 140202223540016 -> 140202223538432 + 140202223540016 [label=ViewBackward0] + 140202223540976 -> 140202223540016 + 140202223540976 [label=GeluBackward0] + 140202223540832 -> 140202223540976 + 140202223540832 [label=ViewBackward0] + 140202223541936 -> 140202223540832 + 140202223541936 [label=AddmmBackward0] + 140202223540736 -> 140202223541936 + 140202223540736 [label=ToCopyBackward0] + 140210811784688 -> 140202223540736 + 140202228895392 [label="encoder.layer.11.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140202228895392 -> 140210811784688 + 140210811784688 [label=AccumulateGrad] + 140202223575488 -> 140202223541936 + 140202223575488 [label=ViewBackward0] + 140210811852592 -> 140202223575488 + 140210811852592 [label=ToCopyBackward0] + 140202223539200 -> 140210811852592 + 140202223539200 [label=SliceBackward0] + 140210811852640 -> 140202223539200 + 140210811852640 [label=SliceBackward0] + 140210811852736 -> 140210811852640 + 140210811852736 [label=SliceBackward0] + 140202223540064 -> 140210811852736 + 140202223575248 -> 140202223541936 + 140202223575248 [label=TBackward0] + 140210811852064 -> 140202223575248 + 140210811852064 [label=ToCopyBackward0] + 140210811852832 -> 140210811852064 + 140202228895632 [label="encoder.layer.11.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140202228895632 -> 140210811852832 + 140210811852832 [label=AccumulateGrad] + 140202223539824 -> 140202223538432 + 140202223539824 [label=TBackward0] + 140202223540784 -> 140202223539824 + 140202223540784 [label=ToCopyBackward0] + 140202223575872 -> 140202223540784 + 140202228895072 [label="encoder.layer.11.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140202228895072 -> 140202223575872 + 140202223575872 [label=AccumulateGrad] + 140202223539200 -> 140202228614432 + 140202223538672 -> 140202228614480 + 140202228894832 [label="encoder.layer.11.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140202228894832 -> 140202223538672 + 140202223538672 [label=AccumulateGrad] + 140202223538624 -> 140202228614480 + 140202228894912 [label="encoder.layer.11.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140202228894912 -> 140202223538624 + 140202223538624 [label=AccumulateGrad] + 140202228657312 -> 140202223089520 +} diff --git a/Pre_PromptMoE_RawProb_backward_graph.pdf b/Pre_PromptMoE_RawProb_backward_graph.pdf new file mode 100644 index 0000000..54f7e67 Binary files /dev/null and b/Pre_PromptMoE_RawProb_backward_graph.pdf differ diff --git a/environment.yml b/environment.yml index 51561c7..5230311 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: minigptv +name: promptmoe channels: - pytorch - defaults diff --git a/minigpt4/configs/datasets/coco/caption.yaml b/minigpt4/configs/datasets/coco/caption.yaml index 8d62c89..8e96a13 100644 --- a/minigpt4/configs/datasets/coco/caption.yaml +++ b/minigpt4/configs/datasets/coco/caption.yaml @@ -17,14 +17,14 @@ datasets: # md5: aa31ac474cf6250ebb81d18348a07ed8 storage: - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_train.json - val: - url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json - storage: - - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json - test: - url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json - storage: - - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json + # val: + # url: 
https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json + # storage: + # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json + # test: + # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json + # storage: + # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json images: storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO diff --git a/minigpt4/configs/datasets/coco/defaults_vqa_eval.yaml b/minigpt4/configs/datasets/coco/defaults_vqa_eval.yaml index f281d88..7943d6a 100755 --- a/minigpt4/configs/datasets/coco/defaults_vqa_eval.yaml +++ b/minigpt4/configs/datasets/coco/defaults_vqa_eval.yaml @@ -20,6 +20,7 @@ datasets: - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json storage: - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val_eval.json + # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_OpenEnded_mscoco_val2014_questions.json - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_mscoco_val2014_annotations.json @@ -29,6 +30,7 @@ datasets: - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json storage: - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_test.json + # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json images: diff --git a/minigpt4/configs/datasets/okvqa/eval.yaml b/minigpt4/configs/datasets/okvqa/eval.yaml index 244398c..d58c446 100755 --- a/minigpt4/configs/datasets/okvqa/eval.yaml +++ b/minigpt4/configs/datasets/okvqa/eval.yaml @@ -20,6 +20,7 @@ datasets: - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json storage: - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json + # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train.json - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_val2014_annotations.json @@ -32,6 +33,7 @@ datasets: - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json storage: - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json + # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train.json # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval_part100.json - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json diff --git a/minigpt4/datasets/datasets/caption_datasets.py b/minigpt4/datasets/datasets/caption_datasets.py index e412dd4..6b74cb5 100644 --- a/minigpt4/datasets/datasets/caption_datasets.py +++ b/minigpt4/datasets/datasets/caption_datasets.py @@ -105,6 +105,8 @@ class COCOCaptionDataset(BaseDataset, __DisplMixin): 'Using language, provide a short account of the image.', 'Use a few words to illustrate what is happening in the picture.', ] + self.source = 'coco_cap' + def __getitem__(self, index): # TODO this assumes image input, not general enough @@ -118,13 +120,20 @@ class COCOCaptionDataset(BaseDataset, __DisplMixin): image = self.vis_processor(image) caption = self.text_processor(ann["caption"]) - 
instruction = random.choice(self.instruction_pool) - instruction = " [caption] {} ".format(instruction) + # instruction = random.choice(self.instruction_pool) + # instruction = " [caption] {} ".format(instruction) + q_input = "" + llm_input = random.choice(self.instruction_pool) return { "image": image, + "image_id": ann["image"], "answer": caption, - "instruction_input": instruction, + "q_input": q_input, + "llm_input": llm_input, + "text_input": llm_input, + "text_output": caption, + "source": 'coco_cap', } class CaptionEvalDataset(BaseDataset, __DisplMixin): diff --git a/minigpt4/datasets/datasets/coco_caption.py b/minigpt4/datasets/datasets/coco_caption.py index 76f86e4..e388956 100755 --- a/minigpt4/datasets/datasets/coco_caption.py +++ b/minigpt4/datasets/datasets/coco_caption.py @@ -31,6 +31,7 @@ class COCOCapEvalDataset(CaptionEvalDataset): split (string): val or test """ super().__init__(vis_processor, text_processor, vis_root, ann_paths) + self.source = 'coco_cap' def __getitem__(self, index): ann = self.annotation[index] diff --git a/minigpt4/datasets/datasets/dataloader_utils.py b/minigpt4/datasets/datasets/dataloader_utils.py index c827643..08f64da 100644 --- a/minigpt4/datasets/datasets/dataloader_utils.py +++ b/minigpt4/datasets/datasets/dataloader_utils.py @@ -31,7 +31,6 @@ class MultiIterLoader: if ratios is None: ratios = [1.0] * len(loaders) else: - # import pdb; pdb.set_trace() assert len(ratios) == len(loaders) ratios = [float(ratio) / sum(ratios) for ratio in ratios] diff --git a/minigpt4/eval_scripts/eval_vqa.py b/minigpt4/eval_scripts/eval_vqa.py index e8aa39d..6d92b11 100644 --- a/minigpt4/eval_scripts/eval_vqa.py +++ b/minigpt4/eval_scripts/eval_vqa.py @@ -12,7 +12,6 @@ from tqdm import tqdm import torch from torch.utils.data import DataLoader import torch.backends.cudnn as cudnn -from datasets import load_dataset import sys sys.path.append("/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE") @@ -248,6 +247,7 @@ if 'vsr' in args.dataset: img_path = cfg.evaluation_datasets_cfg["vsr"]["img_path"] batch_size = cfg.evaluation_datasets_cfg["vsr"]["batch_size"] max_new_tokens = cfg.evaluation_datasets_cfg["vsr"]["max_new_tokens"] + from datasets import load_dataset annotation = load_dataset("cambridgeltl/vsr_zeroshot", split='test') data = VSREvalData(annotation, vis_processor, img_path) diff --git a/minigpt4/models/QformerMoE.py b/minigpt4/models/QformerMoE.py index 5002448..5cc8c1f 100644 --- a/minigpt4/models/QformerMoE.py +++ b/minigpt4/models/QformerMoE.py @@ -386,17 +386,23 @@ class BertOutput(nn.Module): # Add & Norm class FeedForward(nn.Module): + # remove LayerNorm def __init__(self, config): - nn.Module.__init__(self) - # first layer - self.intermediate_query = BertIntermediate(config) - # second layer - self.output_query = BertOutput(config) + super().__init__() + self.dense1 = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + self.dense2 = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # self.dropout = nn.Dropout(0.2) # adjust dropout ratio 0.1->0.2 def forward(self, hidden_states: Tensor): - input_tensor = hidden_states - intermediate_output = self.intermediate_query(hidden_states) - hidden_states = self.output_query(intermediate_output, input_tensor) + hidden_states = self.dense1(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dense2(hidden_states) + hidden_states = self.dropout(hidden_states) return hidden_states
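Annotation (not part of the diff): the rewritten FeedForward above is just dense1 -> activation -> dense2 -> dropout; the Add & Norm that BertOutput used to apply inside the FFN now lives in BertLayer as the shared expert_ln introduced below, so one residual + LayerNorm is applied after the (MoE) expert output. A minimal self-contained sketch of that composition, with BERT-base sizes and GELU assumed:

import torch
import torch.nn as nn

hidden_size, intermediate_size, dropout_p = 768, 3072, 0.1  # assumed config values

ffn = nn.Sequential(            # mirrors the new FeedForward: dense1 -> act -> dense2 -> dropout
    nn.Linear(hidden_size, intermediate_size),
    nn.GELU(),
    nn.Linear(intermediate_size, hidden_size),
    nn.Dropout(dropout_p),
)
expert_ln = nn.LayerNorm(hidden_size, eps=1e-12)  # plays the role of the new expert_ln

x = torch.randn(4, 32, hidden_size)  # [bz, num_query_tokens, hidden_size]
out = expert_ln(ffn(x) + x)          # residual + LayerNorm applied outside the expert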
@@ -440,6 +446,7 @@ class BertLayer(nn.Module): ) else: self.experts = ffn + self.expert_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward( self, @@ -494,7 +501,8 @@ class BertLayer(nn.Module): moe_ffn_attention_input = query_attention_output[:, :query_length, :] moe_ffn_attention_mask = attention_mask.squeeze(dim=1).squeeze(dim=1)[:, :query_length] layer_output = self.feed_forward_query_moe(moe_ffn_attention_input, moe_ffn_attention_mask) # layer_output, gate_loss, gate_load - + # import pdb; pdb.set_trace() # test0107 + if attention_output.shape[1] > query_length: # have text input in Qformer layer_output_text = apply_chunking_to_forward( self.feed_forward_chunk, @@ -503,6 +511,7 @@ attention_output[:, query_length:, :], ) layer_output = (torch.cat([layer_output[0], layer_output_text], dim=1), layer_output[1], layer_output[2]) + else: layer_output = apply_chunking_to_forward( self.feed_forward_chunk, @@ -524,15 +533,14 @@ class BertLayer(nn.Module): def feed_forward_query_moe(self, attention_output, expert_attention_mask): if not self.use_experts: - layer_output = self.experts(attention_output) + hidden_states = self.experts(attention_output) + layer_output = self.expert_ln(hidden_states + attention_output) return layer_output, 0.0, [] - # if not self.importance_processor.is_moe: - # raise RuntimeError("Need to turn the model to a MoE first.") - + hidden_states, gate_loss, gate_load = self.experts( attention_output, expert_attention_mask ) + layer_output = self.expert_ln(hidden_states + attention_output) return layer_output, gate_loss, gate_load class BertEncoder(nn.Module): diff --git a/minigpt4/models/QformerRouteMoE.py b/minigpt4/models/QformerRouteMoE.py index 910a7d0..8595dc6 100644 --- a/minigpt4/models/QformerRouteMoE.py +++ b/minigpt4/models/QformerRouteMoE.py @@ -46,10 +46,9 @@ from transformers.utils import logging from transformers.models.bert.configuration_bert import BertConfig from minigpt4.models.moe.utils import ( - FeedForward, MoEModelOutput, MoEModelOutputWithPooling, - use_experts, + use_experts_route, moe_layer_judge, ) from minigpt4.models.moe.route_moe_layer import RouteMoELayer @@ -378,13 +377,14 @@ class BertOutput(nn.Module): # Add & Norm def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) # 1 self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) + # Move LayerNorm & residual connection out of the FFN; applied after the MoE FFN + hidden_states = self.LayerNorm(hidden_states + input_tensor) # 1 return hidden_states @@ -429,7 +429,7 @@ class BertLayer(nn.Module): self.output_query = BertOutput(config) # Add MoE FFN - self.use_experts = use_experts(layer_num) + self.use_experts = use_experts_route(layer_num) self.layer_judge = moe_layer_judge(layer_num) self.num_beams = config.moebert_num_beams ffn = FeedForward(config) @@ -442,10 +442,13 @@ num_beams=config.moebert_num_beams, 
layer_judge = self.layer_judge, route_method=config.route_method, + weight_type=config.moe_weight_type, ) else: self.experts = ffn + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + def forward( self, hidden_states, @@ -463,8 +466,8 @@ class BertLayer(nn.Module): self_attn_past_key_value = ( past_key_value[:2] if past_key_value is not None else None ) - # import pdb;pdb.set_trace() - + # import pdb; pdb.set_trace() # 0107test + # adjust the dimension of hidden_states, attention_mask, encoder_attention_mask and encoder_hidden_states to be the same if self.num_beams > 1: if hidden_states.shape[0]== attention_mask.shape[0]*self.num_beams: @@ -494,10 +497,6 @@ class BertLayer(nn.Module): present_key_value = self_attention_outputs[-1] - # import pdb;pdb.set_trace() - # print(self.layer_num, hidden_states.shape, attention_mask.shape) - - if query_length > 0: query_attention_output = attention_output[:, :query_length, :] @@ -526,7 +525,8 @@ class BertLayer(nn.Module): moe_ffn_attention_input = query_attention_output[:, :query_length, :] moe_ffn_attention_mask = attention_mask.squeeze(dim=1).squeeze(dim=1)[:, :query_length] layer_output = self.feed_forward_query_moe(moe_ffn_attention_input, moe_ffn_attention_mask, beam_scores, expert_route) - # layer_output = (layer_output, beam_scores, expert_route, beam_idx) + # layer_output = (layer_output, beam_scores, expert_route, beam_idx, importance_loss) + # import pdb; pdb.set_trace() # 0107test if attention_output.shape[1] > query_length: # have text input in Qformer layer_output_text = apply_chunking_to_forward( @@ -535,7 +535,8 @@ class BertLayer(nn.Module): self.seq_len_dim, attention_output[:, query_length:, :], ) - if layer_output[0].shape[0] == layer_output_text.shape[0]*self.num_beams and self.num_beams>1: + if self.layer_judge == 'first' and self.num_beams>1: + # if layer_output[0].shape[0] == layer_output_text.shape[0]*self.num_beams and self.num_beams>1: # adjust the dimension of layer_output_text to bz*num_beams layer_output_text = self.adjust_layer_output_text(layer_output_text) @@ -550,7 +551,8 @@ class BertLayer(nn.Module): # layer_output & layer_output_text dimen_0 from bz*num_beams to bz layer_output, layer_output_text = self.route_moe_last_layer_top1(layer_output, layer_output_text) - layer_output = (torch.cat([layer_output[0], layer_output_text], dim=1), layer_output[1], layer_output[2]) + layer_output = (torch.cat([layer_output[0], layer_output_text], dim=1), layer_output[1], layer_output[2], layer_output[3],layer_output[4]) + # import pdb; pdb.set_trace() # 0107test else: layer_output = apply_chunking_to_forward( @@ -559,7 +561,7 @@ class BertLayer(nn.Module): self.seq_len_dim, attention_output, ) - layer_output = (layer_output, None, None) + layer_output = (layer_output, None, None, None, 0.0) outputs = (layer_output,) + outputs @@ -594,24 +596,27 @@ class BertLayer(nn.Module): beam_scores_new = beam_scores[selects] expert_route_new = expert_route[selects] - return (hidden_states_new, beam_scores_new, expert_route_new), layer_output_text + return (hidden_states_new, beam_scores_new, expert_route_new, layer_output[3], layer_output[4]), layer_output_text def feed_forward_chunk(self, attention_output): intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) + # layer_output = self.LayerNorm(layer_output + attention_output) return layer_output def feed_forward_query_moe(self, attention_output, expert_attention_mask, beam_scores, 
expert_route): - if not self.use_experts: layer_output = self.experts(attention_output) - return layer_output, None, None, None + # layer_output = self.LayerNorm(layer_output + attention_output) + return layer_output, None, None, None, 0.0 - layer_output, beam_scores, expert_route, beam_idx = self.experts( + layer_output, beam_scores, expert_route, beam_idx, importance_loss = self.experts( attention_output, expert_attention_mask, beam_scores, expert_route ) - return layer_output, beam_scores, expert_route, beam_idx + + # layer_output = self.LayerNorm(layer_output + attention_output) + return layer_output, beam_scores, expert_route, beam_idx, importance_loss class BertEncoder(nn.Module): def __init__(self, config): @@ -645,6 +650,7 @@ class BertEncoder(nn.Module): next_decoder_cache = () if use_cache else None beam_scores=None expert_route=None + importance_loss = 0 for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] @@ -693,6 +699,7 @@ class BertEncoder(nn.Module): hidden_states = layer_outputs[0][0] beam_scores = beam_scores if layer_outputs[0][1] == None else layer_outputs[0][1] expert_route = expert_route if layer_outputs[0][2] == None else layer_outputs[0][2] + importance_loss += layer_outputs[0][4] if use_cache: next_decoder_cache += (layer_outputs[-1],) @@ -724,6 +731,7 @@ class BertEncoder(nn.Module): cross_attentions=all_cross_attentions, beam_scores=beam_scores, expert_route=expert_route, + gate_loss=importance_loss, ) @@ -1103,6 +1111,7 @@ class BertModel(BertPreTrainedModel): cross_attentions=encoder_outputs.cross_attentions, beam_scores=encoder_outputs.beam_scores, expert_route=encoder_outputs.expert_route, + gate_loss=encoder_outputs.gate_loss ) diff --git a/minigpt4/models/blip2.py b/minigpt4/models/blip2.py index d79f31d..a6bf474 100644 --- a/minigpt4/models/blip2.py +++ b/minigpt4/models/blip2.py @@ -62,7 +62,7 @@ class Blip2Base(BaseModel): return Qformer, query_tokens @classmethod - def init_RouteMoEQformer(cls, num_query_token, vision_width, moebert_expert_num, moebert_num_beams, route_method, cross_attention_freq=2): + def init_RouteMoEQformer(cls, num_query_token, vision_width, moebert_expert_num, moebert_num_beams, route_method, moe_weight_type, cross_attention_freq=2): moe_encoder_config = BertConfig.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased") moe_encoder_config.encoder_width = vision_width @@ -74,6 +74,7 @@ class Blip2Base(BaseModel): moe_encoder_config.moebert_expert_num = moebert_expert_num moe_encoder_config.moebert_num_beams = moebert_num_beams moe_encoder_config.route_method = route_method + moe_encoder_config.moe_weight_type = moe_weight_type RouteMoEQformer = BertMoERouteLMHeadModel.from_pretrained( "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config diff --git a/minigpt4/models/blip2_vicuna_instruct.py b/minigpt4/models/blip2_vicuna_instruct.py index 34acf28..13421ab 100644 --- a/minigpt4/models/blip2_vicuna_instruct.py +++ b/minigpt4/models/blip2_vicuna_instruct.py @@ -99,6 +99,7 @@ class Blip2VicunaInstruct(Blip2Base): moebert_expert_num=moebert_expert_num, moebert_num_beams=moebert_num_beams, route_method=moebert_route_method, + moe_weight_type=moe_weight_type, cross_attention_freq=2 ) else: @@ -118,7 +119,6 @@ class Blip2VicunaInstruct(Blip2Base): num_query_token, self.visual_encoder.num_features ) - # import pdb;pdb.set_trace() if not qformer_text_input: self.Qformer.bert.embeddings.word_embeddings = None self.Qformer.bert.embeddings.position_embeddings = None @@ 
-178,6 +178,19 @@ class Blip2VicunaInstruct(Blip2Base): if "_query" in name and "experts" not in name: # raw ffn_query not updated param.requires_grad = False + ln_pattern = r"bert\.encoder\.layer\.\d+\.expert_ln\.(weight|bias)" + if re.match(ln_pattern, name): + key_orig = re.sub('expert_ln', 'output_query.LayerNorm', name) + param.data.copy_(state_dict[key_orig]) + d1_pattern = r"bert\.encoder\.layer\.(\d+)\.experts(\.|\.experts\.\d+\.)dense1\.(weight|bias)" + if re.match(d1_pattern, name): + key_orig = re.sub(r'experts(\.|\.experts\.\d+\.)dense1', 'intermediate_query.dense', name) + param.data.copy_(state_dict[key_orig]) + d2_pattern = r"bert\.encoder\.layer\.(\d+)\.experts(\.|\.experts\.\d+\.)dense2\.(weight|bias)" + if re.match(d2_pattern, name): + key_orig = re.sub(r'experts(\.|\.experts\.\d+\.)dense2', 'output_query.dense', name) + param.data.copy_(state_dict[key_orig])
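Annotation (not part of the diff): the three regex blocks above initialize the new MoE parameters from the pretrained Q-Former checkpoint: expert_ln from output_query.LayerNorm, and every expert's dense1/dense2 from intermediate_query.dense/output_query.dense, so all experts start as identical copies of the pretrained query FFN. A sketch of the key mapping they implement, run on an illustrative parameter name:

import re

def pretrained_key_for(name):
    # expert_ln.{weight,bias}          <- output_query.LayerNorm.{weight,bias}
    if re.match(r"bert\.encoder\.layer\.\d+\.expert_ln\.(weight|bias)", name):
        return re.sub("expert_ln", "output_query.LayerNorm", name)
    # experts[.experts.k.]dense1.*     <- intermediate_query.dense.*
    if re.match(r"bert\.encoder\.layer\.\d+\.experts(\.|\.experts\.\d+\.)dense1\.(weight|bias)", name):
        return re.sub(r"experts(\.|\.experts\.\d+\.)dense1", "intermediate_query.dense", name)
    # experts[.experts.k.]dense2.*     <- output_query.dense.*
    if re.match(r"bert\.encoder\.layer\.\d+\.experts(\.|\.experts\.\d+\.)dense2\.(weight|bias)", name):
        return re.sub(r"experts(\.|\.experts\.\d+\.)dense2", "output_query.dense", name)
    return None

print(pretrained_key_for("bert.encoder.layer.8.experts.experts.2.dense1.weight"))
# -> bert.encoder.layer.8.intermediate_query.dense.weight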
+ + # freeze qformer if freeze_qformer: for name, param in self.Qformer.named_parameters(): @@ -205,6 +218,7 @@ class Blip2VicunaInstruct(Blip2Base): self.use_moeqformer = use_moeqformer self.use_route_moe = use_route_moe self.moebert_load_balance = moebert_load_balance + self.moebert_num_beams = moebert_num_beams self.gate_save_path = gate_save_path # if self.gate_save_path != None: @@ -242,7 +256,7 @@ class Blip2VicunaInstruct(Blip2Base): # print(samples["text_input"]) # print(samples["text_output"]) # print('-----------------') - # import pdb;pdb.set_trace() + # import pdb;pdb.set_trace() # 0107test image = samples["image"] with self.maybe_autocast(): image_embeds = self.ln_vision(self.visual_encoder(image)) @@ -278,10 +292,10 @@ class Blip2VicunaInstruct(Blip2Base): return_dict=True, output_hidden_states=True, ) - + # import pdb; pdb.set_trace()# 0107test query_output_to_linear = query_output.last_hidden_state[:,:query_tokens.size(1),:] - if self.use_moeqformer and not self.use_route_moe: + if self.use_moeqformer: gate_loss = query_output.gate_loss # only available in QformerMoE if self.gate_save_path != None: @@ -312,7 +326,7 @@ class Blip2VicunaInstruct(Blip2Base): # 'gate_route_1': prob_gate_normalized[0][i].tolist(), }) # for layer in [6,8,10]: - # layer_data = all_hidden_states[layer] + # layer_data = all_hidden_states[layer] # file_path = os.path.join(self.gate_save_path, f'{image_id}_{str(layer)}.npy') # x = layer_data.data.cpu().numpy() # np.save(file_path,x) @@ -323,7 +337,6 @@ class Blip2VicunaInstruct(Blip2Base): print("Gate Save Error....") print(e) - inputs_llm = self.llm_proj(query_output_to_linear) atts_llm = torch.ones(inputs_llm.size()[:-1], dtype=torch.long).to(image.device) @@ -380,7 +393,7 @@ class Blip2VicunaInstruct(Blip2Base): labels=targets, ) - if self.use_moeqformer and not self.use_route_moe: + if self.use_moeqformer: loss = outputs.loss + self.moebert_load_balance * gate_loss else: loss = outputs.loss @@ -441,6 +454,8 @@ class Blip2VicunaInstruct(Blip2Base): output_hidden_states=True, ) + # import pdb; pdb.set_trace() + if self.gate_save_path != None: all_hidden_states = query_output.hidden_states # prob_gate_normalized = query_output.gate_loads @@ -471,11 +486,11 @@ class Blip2VicunaInstruct(Blip2Base): # 'gate_route_3': prob_gate_normalized[2][i].tolist(), # 'gate_route_1': prob_gate_normalized[0][i].tolist(), }) - for layer in [6,8,10]: - if layer == 6: - layer_data = all_hidden_states[layer][i, :32, :] + for layer in [6,7,8,9,10,11]: + if layer in [6,11]: + layer_data = all_hidden_states[layer][i, :, :] else: - layer_data = all_hidden_states[layer][i*3, :32, :] + layer_data = all_hidden_states[layer][i*self.moebert_num_beams, :, :] file_path = os.path.join(self.gate_save_path, f'{image_id}_{str(layer)}.npy') x = layer_data.data.cpu().numpy() np.save(file_path,x) # done @@ -683,5 +698,6 @@ class Blip2VicunaInstruct(Blip2Base): for name, param in model.named_parameters(): if param.requires_grad == True: print(name) - + # [name for name, param in model.named_parameters() if (param.requires_grad == False and 'Qformer' in name and 'intermediate_query' in name)] + # import pdb; pdb.set_trace()# 0107test return model diff --git a/minigpt4/models/moe/beam_search.py b/minigpt4/models/moe/beam_search.py index 676d707..c4b3c5b 100644 --- a/minigpt4/models/moe/beam_search.py +++ b/minigpt4/models/moe/beam_search.py @@ -21,7 +21,6 @@ class MoELayer(nn.Module): else: raise KeyError("Routing method not supported.") - def _forward_gate_sentence(self, x, attention_mask): """ x: query_attention_output , torch.Size([bz, 32, 768]) @@ -77,7 +76,65 @@ class MoELayer(nn.Module): print('Layer Qformer MoE: \n',prob_gate) return moe_result, select_prob_gate, gate + def _forward_gate_sentence_post(self, x, attention_mask): + """ + x: query_attention_output; torch.Size([bz, 32, 768]) + attention_mask: torch.ones([bz, 32]) + bz = 4 + x = torch.randn(bz,32,768) + attention_mask = torch.ones([bz, 32]) + """ + attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device) + x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768]) + + def forward_expert(input_x, expert_idx): + # input_x += torch.randn(4,32,768) + # return input_x + output_x = self.experts[expert_idx].forward(input_x) + return output_x + + outputs = list() + logits_gate_lst = list() + for expert_idx in range(self.num_experts): + output_x = forward_expert(x_masked, expert_idx) + outputs.append(output_x.unsqueeze(0)) + + output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768]) + # gate_score = self.gates[expert_idx](output_x_aver) + gate_score = self.gate(output_x_aver) + logits_gate_lst.append(gate_score) + + candidate_output = torch.cat(outputs) # torch.Size([num_expert, bz, 32, 768]) + logits_gate = torch.cat(logits_gate_lst,dim=1) # torch.Size([bz, num_expert]) + prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts]) + topk_values, gate = torch.topk(prob_gate, self.topk, dim=1) # gate: experts assigned to each sample, torch.Size([bz, topk]) + num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # number of samples routed to each expert, torch.Size([num_expert]) + gate_load = num_sentences.clone() + + # load balancing loss + if self.use_balance_loss: + balance_loss = self._balancing_loss(prob_gate, num_sentences) + else: + balance_loss = 0.0 + + # importance loss + importance_loss = self._importance_auxiliary_loss(prob_gate) + + # output_average = candidate_output.sum(2) / candidate_attn_mask.unsqueeze(-1).sum(2) # torch.Size([num_expert, bz, 768]) + # output_average = torch.permute(output_average, (1, 0, 2)) # torch.Size([bz, num_expert, 768]) + # logits_gate = self.gate(output_average) # torch.Size([bz, num_experts, 1]) + + prob_gate_topk = torch.zeros_like(prob_gate) + prob_gate_topk.scatter_(1, gate, topk_values) + prob_gate_normalized = prob_gate_topk / prob_gate_topk.sum(dim=1, keepdim=True) # torch.Size([bz, num_expert]) + candidate_output_ad = torch.permute(candidate_output, (1, 0, 2, 3)) # torch.Size([bz, num_expert, 32, 768]) + results = prob_gate_normalized.unsqueeze(-1).unsqueeze(-1) * candidate_output_ad # torch.Size([bz, num_expert, 32, 768]) + moe_result = torch.sum(results, dim=1) # torch.Size([bz, 32, 768]) + # import pdb;pdb.set_trace() + + return moe_result, (balance_loss+importance_loss), prob_gate_normalized
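Annotation (not part of the diff): _forward_gate_sentence_post gates after running every expert: each expert's output is mean-pooled into a single scalar gate logit, the softmax over experts is pruned to the top-k, and the kept probabilities are renormalized before the weighted sum over expert outputs. A toy check of that renormalization step with made-up numbers:

import torch

prob_gate = torch.tensor([[0.5, 0.3, 0.2]])           # softmax over 3 experts, bz = 1
topk_values, gate = torch.topk(prob_gate, 2, dim=1)   # keep top-2: values [[0.5, 0.3]], experts [[0, 1]]
prob_gate_topk = torch.zeros_like(prob_gate).scatter_(1, gate, topk_values)
prob_gate_normalized = prob_gate_topk / prob_gate_topk.sum(dim=1, keepdim=True)
print(prob_gate_normalized)                           # tensor([[0.6250, 0.3750, 0.0000]])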
768])
+        moe_result = torch.sum(results, dim=1) # torch.Size([bz, 32, 768])
+        # import pdb;pdb.set_trace()
+
+        return moe_result, (balance_loss+importance_loss), prob_gate_normalized
+
    def forward(self, x, attention_mask):
        if self.route_method == "gate-token":
            x, balance_loss, gate_load = self._forward_gate_token(x)
@@ -95,7 +152,7 @@ class MoELayer(nn.Module):

class RouteMoELayer(nn.Module):
-    def __init__(self, hidden_size, expert, gate, num_experts, num_beams=2, layer_judge=None, route_method="pre-route"):
+    def __init__(self, hidden_size, expert, num_experts, num_beams=2, layer_judge=None, route_method="pre-route", weight_type="ffn_prob"):
        # remove hash list
        nn.Module.__init__(self)
        self.num_experts = num_experts
@@ -103,13 +160,26 @@ class RouteMoELayer(nn.Module):
        self.num_beams = num_beams
        self.hidden_size = hidden_size
        self.layer_judge = layer_judge
+        self.weight_type = weight_type
        self.route_method = route_method
        if self.route_method == "pre-route":
            self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
        elif self.route_method == "post-route":
-            # gate = nn.Linear(hidden_size, 1, bias=False).float()
-            self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
+            gate = nn.Linear(hidden_size, 1, bias=False).float()
+            self.gate = gate
+            # self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
+
+    def _importance_auxiliary_loss(self, prob_gate):
+        # From VMOE
+        # _importance_auxiliary_loss
+        axis = tuple(range(prob_gate.ndim - 1))  # All except last.
+        importance_per_expert = torch.sum(prob_gate, dim=axis)
+        std_importance_per_expert = torch.std(importance_per_expert)
+        mean_importance_per_expert = torch.mean(importance_per_expert)
+        # Compute coefficient of variation (i.e. std/mean) squared.
+        return (std_importance_per_expert / mean_importance_per_expert)**2
+
    def forward_gate(self, x):
        """
@@ -123,19 +193,21 @@ class RouteMoELayer(nn.Module):
        prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts])
        return prob_gate

-    def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
-        import pdb;pdb.set_trace()
+
+    def beam_search_backup(self, current_scores_log, beam_scores, expert_route, batch_size):
        if self.layer_judge=='first' and self.route_method=='pre-route':
+            # current_scores_log torch.Size([bz, num_experts])
            assert beam_scores==None and expert_route==None
            current_scores = torch.exp(current_scores_log)
            topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate, the expert assigned to each sample: torch.Size([bz, topk])
            beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
            expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
-            beam_idx = None
+            beam_idx = torch.tensor(range(self.num_beams * batch_size))
+
        else:
            if self.layer_judge=='first' and self.route_method == 'post-route':
                batch_size = batch_size
-                next_scores_raw1 = torch.exp(current_scores_log) # torch.Size([bz, num_experts])
+                next_scores_raw1 = torch.exp(current_scores_log) # torch.Size([bz, num_beams*num_experts])
            else:
                batch_size = int(batch_size // self.num_beams)
                next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # in log space, probabilities can be summed directly
@@ -147,9 +219,6 @@ class RouteMoELayer(nn.Module):
            next_scores, next_experts = torch.topk(next_scores_raw1, self.num_beams, dim=1, largest=True, sorted=True)
            # next_scores torch.Size([bz, num_beams])
            # next_tokens torch.Size([bz, num_beams])
-            print(next_scores_raw1)
-            print(next_scores)
-            print(next_experts)

            next_batch_beam = list()
            for batch_idx in range(batch_size):
@@ -166,7 +235,7 @@ class RouteMoELayer(nn.Module):
                next_batch_beam.extend(next_sent_beam)
            import pdb;pdb.set_trace()
-
+
            if self.layer_judge=='first' and self.route_method == 'post-route':
                beam_scores = next_scores.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
                expert_route = next_experts.view(self.num_beams * batch_size)
@@ -181,33 +250,91 @@ class RouteMoELayer(nn.Module):
            pre_route = expert_route[beam_idx,:]
            expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)

-        import pdb;pdb.set_trace()
+
        return beam_scores, expert_route, beam_idx
+
+    def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
+        if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route']:
+            # current_scores_log torch.Size([bz, num_experts])
+            assert beam_scores==None and expert_route==None
+            current_scores = torch.exp(current_scores_log)
+            topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate, the expert assigned to each sample: torch.Size([bz, topk])
+            beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
+            expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
+            beam_idx = torch.tensor(range(self.num_beams * batch_size))
+            # import pdb;pdb.set_trace()
+
+        else:
+            batch_size = int(batch_size // self.num_beams)
+            next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # in log space, probabilities can be summed directly
+            next_scores_exp = torch.exp(next_scores_raw)
+            next_scores_raw1 = next_scores_exp.view(
+                batch_size, self.num_beams * self.num_experts
+            ) # torch.Size([bz, num_beams*num_experts])
+
+            next_scores, next_experts = torch.topk(next_scores_raw1, self.num_beams, dim=1, largest=True, sorted=True)
+            # next_scores torch.Size([bz, num_beams])
+            # next_tokens torch.Size([bz, num_beams])
+
+            next_batch_beam = list()
+            for batch_idx in range(batch_size):
+                next_sent_beam = list()
+                for rank, (expert_id, expert_score) in enumerate(
+                    zip(next_experts[batch_idx], next_scores[batch_idx])
+                ):
+                    expert_id = expert_id.item()
+                    beam_id = expert_id // self.num_experts
+                    ex_id = expert_id % self.num_experts
+                    effective_beam_id = batch_idx*self.num_beams + beam_id
+
+                    next_sent_beam.append((expert_score, ex_id, effective_beam_id))
+                next_batch_beam.extend(next_sent_beam)
+
+            # import pdb;pdb.set_trace()
+
+            beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
+            beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
+            beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
+            pre_route = expert_route[beam_idx,:]
+            expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)

+        # print("next_scores_raw1:\n",next_scores_raw1)
        return beam_scores, expert_route, beam_idx
-
-    def forward_expert_ffn(self, x, expert_select, beam_scores):
+
+
+    def forward_expert_ffn(self, x, expert_select, current_scores):
        """
        x_repeat : [bz*num_beams, 32,768]
        expert_select : [bz*num_beams]
+        current_scores : [bz*num_beams, num_experts] / [bz, num_experts]
        """
-        # add_1212 l2_normalization
-        # normalized_tensor = torch.nn.functional.normalize(beam_scores, p=2, dim=0) # L2 Normalization torch.Size([bz, topk])
+        # add_1228 l2_normalization
+        # normalized_tensor = torch.nn.functional.normalize(current_scores, p=2, dim=0) # L2 Normalization torch.Size([bz, topk])
        # tmp_prob = 
normalized_tensor.unsqueeze(-1).unsqueeze(-1)
-
+        # import pdb;pdb.set_trace()
        outputs = list()
-        for i in range(x.shape[0]):
-            output_x = self.experts[expert_select[i]].forward(x[i])
-            outputs.append(output_x.unsqueeze(0))
-        candidate_output = torch.cat(outputs)
+        for i in range(self.num_experts):
+            output_x = self.experts[i].forward(x)
+            outputs.append(output_x.unsqueeze(1))
+        candidate_output = torch.cat(outputs, dim=1)
+        expert_select_matrix = F.one_hot(expert_select, self.num_experts)
-        # candidate_output = candidate_output * tmp_prob
-        return candidate_output # torch.Size([bz*num_beams, 32, 768])
+        if self.weight_type == 'ffn_prob':
+            tmp_prob = current_scores * expert_select_matrix
+            candidate_output = candidate_output * tmp_prob.unsqueeze(-1).unsqueeze(-1)
+        else:
+            candidate_output = candidate_output * expert_select_matrix.unsqueeze(-1).unsqueeze(-1)
+        # import pdb;pdb.set_trace()
+        output = torch.sum(candidate_output, dim=1)
+        return output # torch.Size([bz*num_beams, 32, 768])

    def forward_pre_route(self, x, beam_scores, expert_route, use_log=True):
-
-        current_scores = self.forward_gate(x) # [bz*num_beams, 5]
+        # import pdb;pdb.set_trace()
+        current_scores = self.forward_gate(x) # [bz, num_experts] / [bz*num_beams, num_experts]
+
+        importance_loss = self._importance_auxiliary_loss(current_scores)

        if use_log:
            current_scores_log = torch.log(current_scores) # in log space, scores can be summed directly
@@ -215,42 +342,45 @@ class RouteMoELayer(nn.Module):
            current_scores_log = current_scores

        batch_size, num_tokens = x.shape[0], x.shape[1]
-        beam_scores, expert_route, _ = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
-
+        beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
        current_expert_select = expert_route[:,-1]

+        # import pdb;pdb.set_trace()
+
        if self.layer_judge=='first': # expand first dim to batch_size * num_beams
            replicated_tensor = x.unsqueeze(1).expand(batch_size, self.num_beams, num_tokens, self.hidden_size)
            x = replicated_tensor.contiguous().view(-1, num_tokens, self.hidden_size) # [bz*num_beams, 32,768]
+            current_scores_t = current_scores.unsqueeze(1).expand(batch_size, self.num_beams, self.num_experts)
+            current_scores = current_scores_t.contiguous().view(-1, self.num_experts) # [bz*num_beams, num_experts]

-        candidate_output = self.forward_expert_ffn(x, current_expert_select, beam_scores) # [bz*num_beams, 32,768]
-
-        return candidate_output, beam_scores, expert_route
+        input_x = x[beam_idx]
+        candidate_output = self.forward_expert_ffn(input_x, current_expert_select, current_scores) # [bz*num_beams, 32,768]
+        # import pdb;pdb.set_trace()
+        return candidate_output, beam_scores, expert_route, beam_idx, importance_loss

    def forward_post_route(self, x, beam_scores, expert_route, use_log=True):
-        # if self.layer_judge=='first': # expand first dim to batch_size * num_beams
-        #     batch_size, num_tokens = x.shape[0], x.shape[1]
-        #     replicated_tensor = x.unsqueeze(1).expand(batch_size, self.num_beams, num_tokens, self.hidden_size)
-        #     x = replicated_tensor.contiguous().view(-1, num_tokens, self.hidden_size) # [bz*num_beams, 32,768]
-
        attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device)
        x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
-
+
        def forward_expert(input_x, expert_idx):
            output_x = self.experts[expert_idx].forward(input_x)
            return output_x

+        # import pdb; pdb.set_trace()
        outputs = list()
        logits_gate_lst = list()
        for expert_idx in range(self.num_experts):
            output_x = forward_expert(x_masked, expert_idx)
+            # output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beam, 768])
+            output_x_aver = torch.mean(output_x, dim=1)
+            # gate_score = self.gates[expert_idx](output_x_aver)
+            gate_score = self.gate(output_x_aver)
+            logits_gate_lst.append(gate_score)
            outputs.append(output_x.unsqueeze(0))
-            output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beam, 768])
-            gate_acore = self.gates[expert_idx](output_x_aver)
-            logits_gate_lst.append(gate_acore)
-        candidate_output = torch.cat(outputs) # torch.Size([num_expert, bz*num_beam, 32, 768])
+
+        candidate_output_raw = torch.cat(outputs) # torch.Size([num_expert, bz*num_beam, 32, 768])
        logits_gate = torch.cat(logits_gate_lst,dim=1)# torch.Size([bz*num_beam, num_expert])
        current_scores = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beam, num_experts])
@@ -259,25 +389,39 @@ class RouteMoELayer(nn.Module):
        else:
            current_scores_log = current_scores

-        import pdb;pdb.set_trace()
+        # importance loss
+        importance_loss = self._importance_auxiliary_loss(current_scores)
+
+        # import pdb; pdb.set_trace()

-        batch_size = x.shape[0] # bz*num_beam
+        batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam
        beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
        # beam_scores torch.Size([bz*num_beam])
        # expert_route torch.Size([bz*num_beam, layer_n])
        current_select_expert = expert_route[:,-1]
+        # current_select_expert torch.Size([bz*num_beam])

-        output = list()
-        for i in range(beam_idx.shape[0]):
-            b_idx = beam_idx[i]
-            ex_idx = current_select_expert[i]
-            ex_out = candidate_output[ex_idx, b_idx, :,:]
-            output.append(ex_out.unsqueeze(0))
-
-        final_output = torch.concat(output, dim=0)
-
-        return final_output, beam_scores, expert_route, beam_idx
+        # import pdb; pdb.set_trace()
+
+        if self.layer_judge == 'first':
+            replicated_tensor = candidate_output_raw.unsqueeze(2).expand(self.num_experts, batch_size, self.num_beams, num_tokens, self.hidden_size)
+            candidate_output_raw = replicated_tensor.contiguous().view(self.num_experts, -1, num_tokens, self.hidden_size) # [num_experts, bz*num_beams, 32, 768]
+            current_scores_t = current_scores.unsqueeze(1).expand(batch_size, self.num_beams, self.num_experts)
+            current_scores = current_scores_t.contiguous().view(-1, self.num_experts) # [bz*num_beams, num_experts]
+
+        candidate_output = candidate_output_raw.permute(1, 0, 2, 3)[beam_idx] # torch.Size([8, 2, 32, 768])
+        expert_select_matrix = F.one_hot(current_select_expert, self.num_experts)
+        if self.weight_type == 'ffn_prob':
+            tmp_prob = current_scores[beam_idx] * expert_select_matrix
+            output = candidate_output * tmp_prob.unsqueeze(-1).unsqueeze(-1)
+        else:
+            output = candidate_output * expert_select_matrix.unsqueeze(-1).unsqueeze(-1)
+        final_output = torch.sum(output, dim=1)
+
+        # import pdb; pdb.set_trace()
+        # print("current_scores:\n",current_scores)
+        return final_output, beam_scores, expert_route, beam_idx, importance_loss

    def forward(self, x, attention_mask, beam_scores, expert_route, use_log=True):
        """
@@ -286,13 +430,12 @@ class RouteMoELayer(nn.Module):
        """
        if self.route_method == 'pre-route':
-            candidate_output, beam_scores, expert_route, _ = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
+            candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
        elif self.route_method == "post-route":
-            candidate_output, beam_scores, expert_route, 
beam_idx = self.forward_post_route(x, beam_scores, expert_route, use_log=True) + candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True) - return candidate_output, beam_scores, expert_route, beam_idx + return candidate_output, beam_scores, expert_route, beam_idx, importance_loss - if __name__ == '__main__': import sys @@ -314,8 +457,8 @@ if __name__ == '__main__': config.add_cross_attention = True config.cross_attention_freq = cross_attention_freq config.query_length = num_query_token - config.moebert_expert_num = 3 - config.moebert_num_beams = 3 + config.moebert_expert_num = 2 + config.moebert_num_beams = 2 config.moebert_route_method = 'gate-sentence' config.moe_topk = 2 config.use_balance_loss = False @@ -332,40 +475,46 @@ if __name__ == '__main__': for layer_num in [6, 8, 10]: layer_judge = moe_layer_judge(layer_num) ffn = FeedForward(config) - gate = nn.Linear(768, config.moebert_expert_num, bias=False).float() # experts = RouteMoELayer( # hidden_size=768, # expert=ffn, - # gate = gate, # num_experts=config.moebert_expert_num, # num_beams=config.moebert_num_beams, # layer_judge = layer_judge, - # route_method = "pre-route" + # route_method = "pre-route", + # weight_type="no_ffn_prob" # ) # layer_output = experts(x, None, beam_scores, expert_route) - # hidden_states1, beam_scores, expert_route,_ = layer_output + # hidden_states1, beam_scores, expert_route, beam_idx, importance_loss = layer_output # print(beam_scores) # print(expert_route) + # print(beam_idx) + # print(importance_loss) + # x = hidden_states1 gate1 = nn.Linear(768, 1, bias=False).float() experts_post = RouteMoELayer( hidden_size=768, expert=ffn, - gate = gate1, num_experts=config.moebert_expert_num, num_beams=config.moebert_num_beams, layer_judge = layer_judge, - route_method = "post-route" + route_method = "post-route", + weight_type="ffn_prob" ) layer_output = experts_post(x1, None, beam_scores1, expert_route1, False) - hidden_states2, beam_scores1, expert_route1, beam_idx = layer_output + hidden_states2, beam_scores1, expert_route1, beam_idx, importance_loss = layer_output print(beam_scores1) print(expert_route1) print(beam_idx) + print(importance_loss) + x1 = hidden_states2 + + # gate = nn.Linear(768, config.moebert_expert_num, bias=False).float() # experts_moe = MoELayer( # hidden_size=config.hidden_size, # expert=ffn, @@ -382,11 +531,62 @@ if __name__ == '__main__': # print(select_prob_gate) # print(gate_load) - - - # x = hidden_states1 - x1 = hidden_states2 # x2 = hidden_states3 print("------------------------------------") + import pdb; pdb.set_trace() + + + + def forward_post_route_backup(self, x, beam_scores, expert_route, use_log=True): + + attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device) + x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768]) + + def forward_expert(input_x, expert_idx): + output_x = self.experts[expert_idx].forward(input_x) + return output_x + + outputs = list() + logits_gate_lst = list() + for expert_idx in range(self.num_experts): + output_x = forward_expert(x_masked, expert_idx) + outputs.append(output_x.unsqueeze(0)) + # output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beam, 768]) + # gate_score = self.gates[expert_idx](output_x_aver) + output_x_aver = torch.mean(output_x, dim=1) + gate_score = self.gate(output_x_aver) + logits_gate_lst.append(gate_score) + candidate_output = torch.cat(outputs) # torch.Size([num_expert, 
bz*num_beam, 32, 768]) + logits_gate = torch.cat(logits_gate_lst,dim=1)# torch.Size([bz*num_beam, num_expert]) + current_scores = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beam, num_experts]) + + if use_log: + current_scores_log = torch.log(current_scores) # 取log之后可以直接相加 + else: + current_scores_log = current_scores + + # importance loss + importance_loss = self._importance_auxiliary_loss(current_scores) + + batch_size = x.shape[0] # bz*num_beam + beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size) + # beam_scores torch.Size([bz*num_beam]) + # expert_route torch.Size([bz*num_beam, layer_n]) + current_select_expert = expert_route[:,-1] + # current_select_expert torch.Size([bz*num_beam, 1]) + + output = list() + for i in range(beam_idx.shape[0]): + b_idx = beam_idx[i] + ex_idx = current_select_expert[i] + ex_out = candidate_output[ex_idx, b_idx, :,:] + if self.weight_type == 'ffn_prob': + prob = current_scores[b_idx, ex_idx] + ex_out = ex_out*prob + output.append(ex_out.unsqueeze(0)) + + final_output = torch.concat(output, dim=0) + # import pdb;pdb.set_trace() + return final_output, beam_scores, expert_route, beam_idx, importance_loss diff --git a/minigpt4/models/moe/beam_search_test.py b/minigpt4/models/moe/beam_search_test.py deleted file mode 100644 index 8a8f128..0000000 --- a/minigpt4/models/moe/beam_search_test.py +++ /dev/null @@ -1,155 +0,0 @@ -import torch -import copy -import pickle -import torch -import torch.nn as nn -import torch.nn.functional as F - -device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu") - - -def forward_expert(input_x, expert_idx): - input_x += torch.randn(32,768) - return input_x - # output_x = self.experts[expert_idx].forward(input_x) - # return output_x - - -def forward_ffn(x_repeat, expert_select): - """ - x_repeat : [bz*num_beams, 32,768] - expert_select : [bz*num_beams] - """ - outputs = list() - num_beams_bz = x_repeat.shape[0] - for i in range(num_beams_bz): - output_x = forward_expert(x_repeat[i], expert_select[i]) # (32,768) - outputs.append(output_x.unsqueeze(0)) - candidate_output = torch.cat(outputs) - return candidate_output # torch.Size([bz*num_beams, 32, 768]) - -def forward_gate(x, num_expert): - """ - x : torch.Size([bz*num_beams, 32, 768]) or torch.Size([bz, 32, 768]) - prob_gate : torch.Size([bz*num_beams, num_experts]) or torch.Size([bz, num_experts]) - """ - # attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device) - # x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz*num_beams, 32, 768]) - # x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beams, 768]) - # logits_gate = gate(x_average) # torch.Size([bz, num_experts]) - logits_gate = torch.randn(x.shape[0], num_expert) - prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts]) - return prob_gate - -def beam_search(layer, current_scores, beam_scores, expert_route, num_beams): - if layer == 0 and beam_scores==None and expert_route==None: - topk_values, gate = torch.topk(current_scores, num_beams, dim=1) # gate, 每个样本被分配的expert: torch.Size([bz, topk]) - beam_scores = topk_values.view(num_beams*batch_size) # torch.Size([bz * num_beams]) - expert_route = gate.view(num_beams*batch_size).unsqueeze(1) # torch.Size([bz * num_beams]) - - else: - next_scores_raw = current_scores + beam_scores.unsqueeze(1) # torch.Size([4*3, 5]) # 取log 之后,可以直接相加概率 - next_scores_raw1 = next_scores_raw.view( - batch_size, num_beams * num_expert - ) 
# torch.Size([4, 3*5]) - next_scores, next_experts = torch.topk(next_scores_raw1, num_beams, dim=1, largest=True, sorted=True) - # next_scores torch.Size([4, 3*num_beams]) - # next_tokens torch.Size([4, 3*num_beams]) - - next_batch_beam = list() - for batch_idx in range(batch_size): - next_sent_beam = list() - print(batch_idx) - for rank, (expert_id, expert_score) in enumerate( - zip(next_experts[batch_idx], next_scores[batch_idx]) - ): - expert_id = expert_id.item() - beam_id = expert_id // num_expert - ex_id = expert_id % num_expert - effective_beam_id = batch_idx*num_beams + beam_id - - # print(expert_id, beam_id, ex_id, effective_beam_id, expert_score) - - next_sent_beam.append((expert_score, ex_id, effective_beam_id)) - next_batch_beam.extend(next_sent_beam) - - # print() - - import pdb;pdb.set_trace() - - beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) - beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam]) - beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam]) - - pre_route = expert_route[beam_idx,:] - expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1) - - return beam_scores, expert_route - - -if __name__ == '__main__': - - batch_size = 3 - num_beams = 2 - num_expert = 5 - x = torch.randn(batch_size, 32, 768) - beam_scores, expert_route = None, None - - for layer in range(0,3): - # import pdb;pdb.set_trace() - - current_scores = forward_gate(x, num_expert) - import pdb;pdb.set_trace() - - beam_scores, expert_route = beam_search(layer, current_scores, beam_scores, expert_route, num_beams) - current_expert_select = expert_route[:,-1] - - if layer == 0: - replicated_tensor = x.unsqueeze(1).expand(batch_size, num_beams, 32, 768) - x = replicated_tensor.contiguous().view(-1, 32, 768) # [12,32,768] [bz*num_beams, 32,768] - else: - x = candidate_output - - candidate_output = forward_ffn(x, current_expert_select) # torch.Size([4*3, 5]) - - x = candidate_output - - - scores = beam_scores.view(batch_size, num_beams) - topk_values, gate = torch.topk(scores, 1, dim=1) - # gate [batch_size, 1] - # topk_values [batch_size, 1] - selects = [ (bz_idx * num_beams + gate[bz_idx].item()) for bz_idx in range(batch_size)] - final_scores = beam_scores[selects] - final_expert_route = expert_route[selects] - final_output = candidate_output[selects] - - - - - - - -# def forward_ffn_post(x_repeat, expert_select): -# """ -# x_repeat : [bz*num_beams, 32,768] -# expert_select : [bz*num_beams] -# prob_gate : torch.Size([bz*num_beams, num_experts]) -# """ -# outputs = list() -# logits_gate_lst = list() -# # attention_mask = torch.ones([batch_size, 32]) -# for i in range(num_beams*batch_size): -# output_x = forward_expert(x_repeat[i], expert_select[i]) # (32,768) -# outputs.append(output_x.unsqueeze(0)) -# # output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768]) -# # gate_acore = self.gates[expert_idx](output_x_aver) -# # gate_score = self.gate(output_x_aver) -# num_expert = 5 -# gate_score = torch.randn(1,num_expert) -# logits_gate_lst.append(gate_score) - -# candidate_output = torch.cat(outputs) # torch.Size([bz*num_beams, 32, 768]) -# logits_gate = torch.cat(logits_gate_lst,dim=0)# torch.Size([bz*num_beams, num_expert]) -# prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts]) -# return prob_gate, candidate_output \ No newline at end of file diff --git a/minigpt4/models/moe/moe_layer.py b/minigpt4/models/moe/moe_layer.py index 303862c..abd24b9 100644 --- 
a/minigpt4/models/moe/moe_layer.py +++ b/minigpt4/models/moe/moe_layer.py @@ -5,7 +5,7 @@ import torch.nn as nn import torch.nn.functional as F class MoELayer(nn.Module): - def __init__(self, hidden_size, expert, num_experts, route_method, topk=1, use_balance_loss=True, weight_type='l2_norm'): + def __init__(self, hidden_size, expert, num_experts, route_method, topk=1, use_balance_loss=True, weight_type='raw_prob'): # remove hash list nn.Module.__init__(self) self.num_experts = num_experts @@ -81,54 +81,6 @@ class MoELayer(nn.Module): return x, balance_loss, gate_load - def _forward_gate_sentence_top1_raw(self, x, attention_mask): - """ - x: query_attention_output , torch.Size([bz, 32, 768]) - attention_mask: torch.ones([bz, 32]) - - ### Notice: - the raw version of expert_attention_mask is the extended_attention_mask, - which will be add to attention_score directly - the values of extended_attention_mask are -0.0 or -10000 - it should be adjust to 1/0 version to be processed by experts - """ - attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device) - x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768]) - x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768]) - logits_gate = self.gate(x_average) # torch.Size([bz, num_experts]) - prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts]) - gate = torch.argmax(prob_gate, dim=-1) # torch.Size([bz]) - - order = gate.argsort(0) - num_sentences = F.one_hot(gate, self.num_experts).gt(0).sum(0) - gate_load = num_sentences.clone() - x = x[order] # reorder according to expert number - x = x.split(num_sentences.tolist(), dim=0) # a list of length self.num_experts - - # compute the load balancing loss - P = prob_gate.mean(0) - temp = num_sentences.float() - f = temp / temp.sum(0, keepdim=True) - balance_loss = self.num_experts * torch.sum(P * f) - - prob_gate = prob_gate.gather(dim=1, index=gate.unsqueeze(1)) - prob_gate = prob_gate[order] - prob_gate = prob_gate.split(num_sentences.tolist(), dim=0) - - def forward_expert(input_x, prob_x, expert_idx): - input_x = self.experts[expert_idx].forward(input_x) - input_x = input_x * prob_x.unsqueeze(-1) - return input_x - - result = [] - for i in range(self.num_experts): - if x[i].size(0) > 0: - result.append(forward_expert(x[i], prob_gate[i], i)) - result = torch.vstack(result) - result = result[order.argsort(0)] # restore original order - - return result, balance_loss, gate_load - def _forward_gate_sentence_post(self, x, attention_mask): """ x: query_attention_output; torch.Size([bz, 32, 768]) @@ -174,13 +126,17 @@ class MoELayer(nn.Module): # importance loss importance_loss = self._importance_auxiliary_loss(prob_gate) - # output_average = candidate_output.sum(2) / candidate_attn_mask.unsqueeze(-1).sum(2) # torch.Size([num_expert, bz, 768]) - # output_average = torch.permute(output_average, (1, 0, 2)) # torch.Size([bz, num_expert, 768]) - # logits_gate = self.gate(output_average) # torch.Size([bz, num_experts, 1]) - prob_gate_topk = torch.zeros_like(prob_gate) prob_gate_topk.scatter_(1, gate, topk_values) - prob_gate_normalized = prob_gate_topk / prob_gate_topk.sum(dim=1, keepdim=True) # torch.Size([bz, num_expert]) + + if self.weight_type == 'average': + # torch.Size([bz, num_expert]) 未选中的expert prob_gate_norm为0 + prob_gate_normalized = prob_gate_topk / prob_gate_topk.sum(dim=1, keepdim=True) + elif self.weight_type == 'raw_prob': + prob_gate_normalized = prob_gate_topk + elif self.weight_type == 
'softmax_norm':
+            prob_gate_normalized = F.softmax(prob_gate_topk, dim=-1) # torch.Size([bz, num_expert])
+
        candidate_output_ad = torch.permute(candidate_output, (1, 0, 2, 3)) # torch.Size([bz, num_expert, 32, 768])
        results = prob_gate_normalized.unsqueeze(-1).unsqueeze(-1) * candidate_output_ad # torch.Size([bz, num_expert, 32, 768])
        moe_result = torch.sum(results, dim=1) # torch.Size([bz, 32, 768])
@@ -188,6 +144,46 @@ class MoELayer(nn.Module):

        return moe_result, (balance_loss+importance_loss), prob_gate_normalized

+    def router(self, x, attention_mask):
+        # Prepare input x
+        attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
+        x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
+        x_average = torch.mean(x_masked, dim=1) # torch.Size([bz, 768])
+
+        # Forward Gate
+        # logits_gate: [bz, num_experts]
+        logits_gate = self.gate(x_average)
+
+        # Probabilities for each sample of what expert it should be sent to.
+        # prob_gate: [bz, num_experts]
+        prob_gate = F.softmax(logits_gate, dim=-1)
+
+        # Get Top-K experts for each sample
+        # gate: [bz, topk]
+        # select_prob_gate: [bz, topk]
+        select_prob_gate, gate = torch.topk(prob_gate, self.topk, dim=1)
+
+        # Reshape Prob_gate & Gate
+        # expert_mask: [batch_size, topk, num_experts]
+        # expert_gate: [batch_size, topk, num_experts]
+        # combine_tensor: [batch_size, num_experts]
+        expert_mask = F.one_hot(gate, self.num_experts)
+        expert_gate = select_prob_gate.unsqueeze(-1) * expert_mask
+        combine_tensor = torch.sum(expert_gate, dim=1)
+
+        # Calculate Balancing Loss
+        if self.use_balance_loss:
+            num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # number of samples assigned to each expert, torch.Size([num_expert])
+            balance_loss = self._balancing_loss(prob_gate, num_sentences)
+        else:
+            balance_loss = 0.0
+
+        # Calculate Importance Loss
+        importance_loss = self._importance_auxiliary_loss(prob_gate)
+
+        # import pdb; pdb.set_trace()
+
+        return expert_mask, combine_tensor, balance_loss, importance_loss

    def _forward_gate_sentence(self, x, attention_mask):
        """
@@ -200,81 +196,37 @@ class MoELayer(nn.Module):
        the values of extended_attention_mask are -0.0 or -10000
        it should be adjusted to a 1/0 version to be processed by experts
        """
-        attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
-        x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
-        x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768])
-        logits_gate = self.gate(x_average) # torch.Size([bz, num_experts])
-        prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts])
-        select_prob_gate, gate = torch.topk(prob_gate, self.topk, dim=1) # gate, the expert assigned to each sample: torch.Size([bz, topk])
+        # Forward Router
+        expert_mask, combine_tensor, balance_loss, importance_loss = self.router(x, attention_mask)
+
+        # Forward Expert FFN
+        result = []
+        for expert_idx in range(self.num_experts):
+            output_x = self.experts[expert_idx].forward(x)
+            result.append(output_x.unsqueeze(0))
+        expert_output = torch.cat(result).permute(1,0,2,3) # torch.Size([batch_size, num_expert, num_tokens, hidden_states])

-        # weight with the l2 norm here
-        if self.weight_type == 'l2_norm':
-            normalized_tensor = torch.nn.functional.normalize(select_prob_gate, p=2, dim=0) # L2 Normalization torch.Size([bz, topk])
-        elif self.weight_type == 'average':
-            normalized_tensor = select_prob_gate / select_prob_gate.sum(dim=1, keepdim=True)
+        # multiply outputs of experts by the routing probability
+        if self.weight_type == 
'raw_prob': + expert_outputs_combined = expert_output * combine_tensor.unsqueeze(-1).unsqueeze(-1) # torch.Size([batch_size, num_expert, num_tokens, hidden_states]) + elif self.weight_type == 'no_prob': + combine_index = torch.sum(expert_mask, dim=1) + expert_outputs_combined = expert_output * combine_index.unsqueeze(-1).unsqueeze(-1) # torch.Size([batch_size, num_expert, num_tokens, hidden_states]) - num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # 每个expert被分配的样本数 torch.Size([num_expert]) - gate_load = num_sentences.clone() + outputs = torch.sum(expert_outputs_combined, dim=1) # torch.Size([batch_size, num_tokens, hidden_states]) - # load balancing loss - if self.use_balance_loss: - balance_loss = self._balancing_loss(prob_gate, num_sentences) - else: - balance_loss = 0.0 + # import pdb; pdb.set_trace() - # importance loss - importance_loss = self._importance_auxiliary_loss(prob_gate) - - # forward experts - def forward_expert(input_x, expert_idx): - input_x = self.experts[expert_idx].forward(input_x) - return input_x - - result_lst = list() - for i in range(self.topk): - # top1、top2... 分别为一组,进行gate分组之后过expert,然后乘以概率后相加 - tmp_gate = gate[:,i] - tmp_prob = normalized_tensor[:,i].unsqueeze(-1).unsqueeze(-1) - order = tmp_gate.argsort(0) - num_sentences_t = F.one_hot(tmp_gate, self.num_experts).gt(0).sum(0) - x1 = x[order] # reorder according to expert number - x1 = x1.split(num_sentences_t.tolist(), dim=0) # a list of length self.num_experts - - result = [] - for i in range(self.num_experts): - if x1[i].size(0) > 0: - result.append(forward_expert(x1[i], i)) - result = torch.vstack(result) - result = result[order.argsort(0)] # restore original order - # result_lst.append(result * tmp_prob) # result * prob - result_lst.append(result) # result * prob # add_1212 - - moe_result = sum(result_lst) - # import pdb;pdb.set_trace() - return moe_result, (balance_loss+importance_loss), gate - - def _forward_sentence_single_expert(self, x, attention_mask): - x_masked = x * attention_mask.unsqueeze(-1) - x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) - logits_gate = self.gate(x_average) - prob_gate = F.softmax(logits_gate, dim=-1) - gate = torch.argmax(prob_gate, dim=-1) - - gate_load = F.one_hot(gate, self.num_experts).gt(0).sum(0) - x = self.experts[gate.cpu().item()].forward(x) - return x, 0.0, gate_load + return outputs, (balance_loss+importance_loss), combine_tensor def forward(self, x, attention_mask): if self.route_method == "gate-token": x, balance_loss, gate_load = self._forward_gate_token(x) elif self.route_method == "gate-sentence": - if x.size(0) == 1: - x, balance_loss, gate_load = self._forward_sentence_single_expert(x, attention_mask) - else: - x, balance_loss, gate_load = self._forward_gate_sentence(x, attention_mask) + x, balance_loss, gate_load = self._forward_gate_sentence(x, attention_mask) elif self.route_method == "gate-sentence-post": x, balance_loss, gate_load = self._forward_gate_sentence_post(x, attention_mask) else: raise KeyError("Routing method not supported.") - + # import pdb; pdb.set_trace() return x, balance_loss, gate_load diff --git a/minigpt4/models/moe/moe_layer_backup.py b/minigpt4/models/moe/moe_layer_backup.py new file mode 100644 index 0000000..25f2e59 --- /dev/null +++ b/minigpt4/models/moe/moe_layer_backup.py @@ -0,0 +1,330 @@ +import copy +import pickle +import torch +import torch.nn as nn +import torch.nn.functional as F + +class MoELayer(nn.Module): + def __init__(self, hidden_size, expert, num_experts, route_method, 
topk=1, use_balance_loss=True, weight_type='l2_norm'): + # remove hash list + nn.Module.__init__(self) + self.num_experts = num_experts + self.experts = nn.ModuleList([copy.deepcopy(expert) for i in range(num_experts)]) + self.route_method = route_method + self.topk = topk + self.use_balance_loss = use_balance_loss + self.weight_type = weight_type + + if route_method in ["gate-token", "gate-sentence"]: + self.gate = nn.Linear(hidden_size, num_experts, bias=False).float() + elif route_method in ["gate-sentence-post"]: + gate = nn.Linear(hidden_size, 1, bias=False).float() + # self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)]) + self.gate = gate + else: + raise KeyError("Routing method not supported.") + + def _balancing_loss(self, prob_gate, num_tokens): + # From MOEBERT + # compute the load balancing loss + # prob_gate,是 [bz, num_expert],每个样本被分配给每个expert的概率 + # 等价于 VMOE 中 _gshard_auxiliary_loss + P = prob_gate.mean(0) # torch.Size([num_expert]) 每个expert被分配到样本的平均概率 + temp = num_tokens.float() + f = temp / temp.sum(0, keepdim=True) # 每个expert被分配的sample比例 + balance_loss = self.num_experts * torch.sum(P * f) + return balance_loss + + def _importance_auxiliary_loss(self, prob_gate): + # From VMOE + # _importance_auxiliary_loss + axis = tuple(range(prob_gate.ndim - 1)) # All except last. + importance_per_expert = torch.sum(prob_gate, dim=axis) + std_importance_per_expert = torch.std(importance_per_expert) + mean_importance_per_expert = torch.mean(importance_per_expert) + # Compute coefficient of variation (i.e. std/mean) squared. + return (std_importance_per_expert / mean_importance_per_expert)**2 + + def _forward_gate_token(self, x): + bsz, seq_len, dim = x.size() + + x = x.view(-1, dim) + logits_gate = self.gate(x) + prob_gate = F.softmax(logits_gate, dim=-1) + gate = torch.argmax(prob_gate, dim=-1) + + order = gate.argsort(0) + num_tokens = F.one_hot(gate, self.num_experts).gt(0).sum(0) + gate_load = num_tokens.clone() + x = x[order] # reorder according to expert number + x = x.split(num_tokens.tolist(), dim=0) # a list of length self.num_experts + + # compute the load balancing loss + P = prob_gate.mean(0) + temp = num_tokens.float() + f = temp / temp.sum(0, keepdim=True) + balance_loss = self.num_experts * torch.sum(P * f) + + prob_gate = prob_gate.gather(dim=1, index=gate.unsqueeze(1)) + prob_gate = prob_gate[order] + prob_gate = prob_gate.split(num_tokens.tolist(), dim=0) + + def forward_expert(input_x, prob_x, expert_idx): + input_x = self.experts[expert_idx].forward(input_x) + input_x = input_x * prob_x + return input_x + + x = [forward_expert(x[i], prob_gate[i], i) for i in range(self.num_experts)] + x = torch.vstack(x) + x = x[order.argsort(0)] # restore original order + x = x.view(bsz, seq_len, dim) + + return x, balance_loss, gate_load + + def _forward_gate_sentence_top1_raw(self, x, attention_mask): + """ + x: query_attention_output , torch.Size([bz, 32, 768]) + attention_mask: torch.ones([bz, 32]) + + ### Notice: + the raw version of expert_attention_mask is the extended_attention_mask, + which will be add to attention_score directly + the values of extended_attention_mask are -0.0 or -10000 + it should be adjust to 1/0 version to be processed by experts + """ + attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device) + x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768]) + x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768]) + logits_gate = self.gate(x_average) # 
torch.Size([bz, num_experts]) + prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts]) + gate = torch.argmax(prob_gate, dim=-1) # torch.Size([bz]) + + order = gate.argsort(0) + num_sentences = F.one_hot(gate, self.num_experts).gt(0).sum(0) + gate_load = num_sentences.clone() + x = x[order] # reorder according to expert number + x = x.split(num_sentences.tolist(), dim=0) # a list of length self.num_experts + + # compute the load balancing loss + P = prob_gate.mean(0) + temp = num_sentences.float() + f = temp / temp.sum(0, keepdim=True) + balance_loss = self.num_experts * torch.sum(P * f) + + prob_gate = prob_gate.gather(dim=1, index=gate.unsqueeze(1)) + prob_gate = prob_gate[order] + prob_gate = prob_gate.split(num_sentences.tolist(), dim=0) + + def forward_expert(input_x, prob_x, expert_idx): + input_x = self.experts[expert_idx].forward(input_x) + input_x = input_x * prob_x.unsqueeze(-1) + return input_x + + result = [] + for i in range(self.num_experts): + if x[i].size(0) > 0: + result.append(forward_expert(x[i], prob_gate[i], i)) + result = torch.vstack(result) + result = result[order.argsort(0)] # restore original order + + return result, balance_loss, gate_load + + def _forward_gate_sentence_post(self, x, attention_mask): + """ + x: query_attention_output; torch.Size([bz, 32, 768]) + attention_mask: torch.ones([bz, 32]) + bz = 4 + x = torch.randn(bz,32,768) + attention_mask = torch.ones([bz, 32]) + + """ + attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device) + x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768]) + + def forward_expert(input_x, expert_idx): + # input_x += torch.randn(4,32,768) + # return input_x + output_x = self.experts[expert_idx].forward(input_x) + return output_x + + outputs = list() + logits_gate_lst = list() + for expert_idx in range(self.num_experts): + output_x = forward_expert(x_masked, expert_idx) + outputs.append(output_x.unsqueeze(0)) + + output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768]) + # gate_acore = self.gates[expert_idx](output_x_aver) + gate_score = self.gate(output_x_aver) + logits_gate_lst.append(gate_score) + + candidate_output = torch.cat(outputs) # torch.Size([num_expert, bz, 32, 768]) + logits_gate = torch.cat(logits_gate_lst,dim=1)# torch.Size([bz, num_expert]) + prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts]) + topk_values, gate = torch.topk(prob_gate, self.topk, dim=1) # gate, 每个样本被分配的expert: torch.Size([bz, topk]) + num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # 每个expert被分配的样本数 torch.Size([num_expert]) + gate_load = num_sentences.clone() + + # load balancing loss + if self.use_balance_loss: + balance_loss = self._balancing_loss(prob_gate, num_sentences) + else: + balance_loss = 0.0 + + # importance loss + importance_loss = self._importance_auxiliary_loss(prob_gate) + + # output_average = candidate_output.sum(2) / candidate_attn_mask.unsqueeze(-1).sum(2) # torch.Size([num_expert, bz, 768]) + # output_average = torch.permute(output_average, (1, 0, 2)) # torch.Size([bz, num_expert, 768]) + # logits_gate = self.gate(output_average) # torch.Size([bz, num_experts, 1]) + + prob_gate_topk = torch.zeros_like(prob_gate) + prob_gate_topk.scatter_(1, gate, topk_values) + + if self.weight_type == 'average': + # torch.Size([bz, num_expert]) 未选中的expert prob_gate_norm为0 + prob_gate_normalized = prob_gate_topk / prob_gate_topk.sum(dim=1, keepdim=True) + elif self.weight_type == 
'raw_prob': + prob_gate_normalized = prob_gate_topk + elif self.weight_type == 'softmax_norm': + prob_gate_normalized = F.softmax(prob_gate_topk, dim=-1) # torch.Size([bz, num_expert]) + + candidate_output_ad = torch.permute(candidate_output, (1, 0, 2, 3)) # torch.Size([bz, num_expert, 32, 768]) + results = prob_gate_normalized.unsqueeze(-1).unsqueeze(-1) * candidate_output_ad # torch.Size([bz, num_expert, 32, 768]) + moe_result = torch.sum(results, dim=1) # torch.Size([bz, 32, 768]) + # import pdb;pdb.set_trace() + + return moe_result, (balance_loss+importance_loss), prob_gate_normalized + + # def _forward_gate_sentence(self, x, attention_mask): + + # attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device) + # x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768]) + # x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) + # logits_gate = self.gate(x_average) + # prob_gate = F.softmax(logits_gate, dim=-1) + # gate = torch.argmax(prob_gate, dim=-1) + + # order = gate.argsort(0) + # num_sentences = F.one_hot(gate, self.num_experts).gt(0).sum(0) + # gate_load = num_sentences.clone() + # x = x[order] # reorder according to expert number + # x = x.split(num_sentences.tolist(), dim=0) # a list of length self.num_experts + + # # compute the load balancing loss + # P = prob_gate.mean(0) + # temp = num_sentences.float() + # f = temp / temp.sum(0, keepdim=True) + # balance_loss = self.num_experts * torch.sum(P * f) + + # prob_gate = prob_gate.gather(dim=1, index=gate.unsqueeze(1)) + # prob_gate = prob_gate[order] + # prob_gate = prob_gate.split(num_sentences.tolist(), dim=0) + + # def forward_expert(input_x, prob_x, expert_idx): + # input_x = self.experts[expert_idx].forward(input_x) + # input_x = input_x * prob_x.unsqueeze(-1) + # return input_x + + # result = [] + # for i in range(self.num_experts): + # if x[i].size(0) > 0: + # result.append(forward_expert(x[i], prob_gate[i], i)) + # result = torch.vstack(result) + # result = result[order.argsort(0)] # restore original order + + # return result, balance_loss, gate_load + + def _forward_gate_sentence(self, x, attention_mask): + """ + x: query_attention_output , torch.Size([bz, 32, 768]) + attention_mask: torch.ones([bz, 32]) + + ### Notice: + the raw version of expert_attention_mask is the extended_attention_mask, + which will be add to attention_score directly + the values of extended_attention_mask are -0.0 or -10000 + it should be adjust to 1/0 version to be processed by experts + """ + attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device) + x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768]) + x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768]) + logits_gate = self.gate(x_average) # torch.Size([bz, num_experts]) + prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts]) + select_prob_gate, gate = torch.topk(prob_gate, self.topk, dim=1) # gate, 每个样本被分配的expert: torch.Size([bz, topk]) + + # 这里用l2 norm 去加权 + if self.weight_type == 'l2_norm': + # actually neigther dim=0 nor dim=1 is right + normalized_tensor = torch.nn.functional.normalize(select_prob_gate, p=2, dim=1) # L2 Normalization torch.Size([bz, topk]) + elif self.weight_type == 'l2_norm_0': + normalized_tensor = torch.nn.functional.normalize(select_prob_gate, p=2, dim=0) # L2 Normalization torch.Size([bz, topk]) + elif self.weight_type == 'average': + normalized_tensor = select_prob_gate / 
select_prob_gate.sum(dim=1, keepdim=True) + elif self.weight_type == 'raw_prob': + normalized_tensor = select_prob_gate + + num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # 每个expert被分配的样本数 torch.Size([num_expert]) + gate_load = num_sentences.clone() + + # load balancing loss + if self.use_balance_loss: + balance_loss = self._balancing_loss(prob_gate, num_sentences) + else: + balance_loss = 0.0 + + # importance loss + importance_loss = self._importance_auxiliary_loss(prob_gate) + + # forward experts + def forward_expert(input_x, expert_idx): + input_x = self.experts[expert_idx].forward(input_x) + return input_x + + result_lst = list() + for i in range(self.topk): + # top1、top2... 分别为一组,进行gate分组之后过expert,然后乘以概率后相加 + tmp_gate = gate[:,i] + tmp_prob = normalized_tensor[:,i].unsqueeze(-1).unsqueeze(-1) + order = tmp_gate.argsort(0) + num_sentences_t = F.one_hot(tmp_gate, self.num_experts).gt(0).sum(0) + x1 = x[order] # reorder according to expert number + x1 = x1.split(num_sentences_t.tolist(), dim=0) # a list of length self.num_experts + + result = [] + for i in range(self.num_experts): + if x1[i].size(0) > 0: + result.append(forward_expert(x1[i], i)) + result = torch.vstack(result) + result = result[order.argsort(0)] # restore original order + result_lst.append(result * tmp_prob) # result * prob + # result_lst.append(result) # result * prob # add_1212 + + moe_result = sum(result_lst) + return moe_result, (balance_loss+importance_loss), gate + + def _forward_sentence_single_expert(self, x, attention_mask): + x_masked = x * attention_mask.unsqueeze(-1) + x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) + logits_gate = self.gate(x_average) + prob_gate = F.softmax(logits_gate, dim=-1) + gate = torch.argmax(prob_gate, dim=-1) + + gate_load = F.one_hot(gate, self.num_experts).gt(0).sum(0) + x = self.experts[gate.cpu().item()].forward(x) + return x, 0.0, gate_load + + def forward(self, x, attention_mask): + if self.route_method == "gate-token": + x, balance_loss, gate_load = self._forward_gate_token(x) + elif self.route_method == "gate-sentence": + if x.size(0) == 1: + x, balance_loss, gate_load = self._forward_sentence_single_expert(x, attention_mask) + else: + x, balance_loss, gate_load = self._forward_gate_sentence(x, attention_mask) + elif self.route_method == "gate-sentence-post": + x, balance_loss, gate_load = self._forward_gate_sentence_post(x, attention_mask) + else: + raise KeyError("Routing method not supported.") + # import pdb; pdb.set_trace() + return x, balance_loss, gate_load diff --git a/minigpt4/models/moe/prompt_moe.py b/minigpt4/models/moe/prompt_moe.py index 8b5e2d2..8ea4cea 100644 --- a/minigpt4/models/moe/prompt_moe.py +++ b/minigpt4/models/moe/prompt_moe.py @@ -92,7 +92,6 @@ class PrePromptMoE(PromptMoEBase): self.topk = topk if route_method in ["gate-token", "gate-single-token", "gate-sentence"]: self.gate = nn.Linear(hidden_size, num_experts, bias=False).float() - print(self.gate) else: raise KeyError("Routing method not supported.") diff --git a/minigpt4/models/moe/route_moe_layer.py b/minigpt4/models/moe/route_moe_layer.py index 31b75c2..6012dd2 100644 --- a/minigpt4/models/moe/route_moe_layer.py +++ b/minigpt4/models/moe/route_moe_layer.py @@ -5,7 +5,7 @@ import torch.nn as nn import torch.nn.functional as F class RouteMoELayer(nn.Module): - def __init__(self, hidden_size, expert, num_experts, num_beams=2, layer_judge=None, route_method="pre-route"): + def __init__(self, hidden_size, expert, num_experts, num_beams=2, 
layer_judge=None, route_method="pre-route", weight_type="ffn_prob"): # remove hash list nn.Module.__init__(self) self.num_experts = num_experts @@ -13,6 +13,7 @@ class RouteMoELayer(nn.Module): self.num_beams = num_beams self.hidden_size = hidden_size self.layer_judge = layer_judge + self.weight_type = weight_type self.route_method = route_method if self.route_method == "pre-route": @@ -22,6 +23,17 @@ class RouteMoELayer(nn.Module): self.gate = gate # self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)]) + def _importance_auxiliary_loss(self, prob_gate): + # From VMOE + # _importance_auxiliary_loss + axis = tuple(range(prob_gate.ndim - 1)) # All except last. + importance_per_expert = torch.sum(prob_gate, dim=axis) + std_importance_per_expert = torch.std(importance_per_expert) + mean_importance_per_expert = torch.mean(importance_per_expert) + # Compute coefficient of variation (i.e. std/mean) squared. + return (std_importance_per_expert / mean_importance_per_expert)**2 + + def forward_gate(self, x): """ x : torch.Size([bz*num_beams, 32, 768]) or torch.Size([bz, 32, 768]) @@ -29,7 +41,8 @@ class RouteMoELayer(nn.Module): """ attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device) x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz*num_beams, 32, 768]) - x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beams, 768]) + # x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beams, 768]) + x_average = torch.mean(x_masked, dim=1) # torch.Size([bz*num_beams, 768]) logits_gate = self.gate(x_average) # torch.Size([bz*num_beams, num_experts]) prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts]) return prob_gate @@ -42,7 +55,7 @@ class RouteMoELayer(nn.Module): topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate, 每个样本被分配的expert: torch.Size([bz, topk]) beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams]) expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1]) - beam_idx = None + beam_idx = torch.tensor(range(self.num_beams * batch_size)) else: if self.layer_judge=='first' and self.route_method == 'post-route': batch_size = batch_size @@ -89,54 +102,63 @@ class RouteMoELayer(nn.Module): return beam_scores, expert_route, beam_idx - - def forward_expert_ffn(self, x, expert_select, beam_scores): + def forward_expert_ffn(self, x, expert_select, current_scores): """ x_repeat : [bz*num_beams, 32,768] expert_select : [bz*num_beams] + current_scores : [bz*num_beams, num_experts] / [bz, num_experts] """ - # add_1212 l2_normalization - # normalized_tensor = torch.nn.functional.normalize(beam_scores, p=2, dim=0) # L2 Normalization torch.Size([bz, topk]) + # add_1228 l2_normalization + # normalized_tensor = torch.nn.functional.normalize(current_scores, p=2, dim=0) # L2 Normalization torch.Size([bz, topk]) # tmp_prob = normalized_tensor.unsqueeze(-1).unsqueeze(-1) - + # import pdb;pdb.set_trace() outputs = list() - for i in range(x.shape[0]): - output_x = self.experts[expert_select[i]].forward(x[i]) - outputs.append(output_x.unsqueeze(0)) - candidate_output = torch.cat(outputs) - - # candidate_output = candidate_output * tmp_prob - return candidate_output # torch.Size([bz*num_beams, 32, 768]) - + for i in range(self.num_experts): + output_x = self.experts[i].forward(x) + outputs.append(output_x.unsqueeze(1)) + candidate_output = torch.cat(outputs, dim=1) + 
expert_select_matrix = F.one_hot(expert_select, self.num_experts) + if self.weight_type == 'ffn_prob': + tmp_prob = current_scores * expert_select_matrix + candidate_output = candidate_output * tmp_prob.unsqueeze(-1).unsqueeze(-1) + else: + candidate_output = candidate_output * expert_select_matrix.unsqueeze(-1).unsqueeze(-1) + output = torch.sum(candidate_output, dim=1) + # import pdb;pdb.set_trace() + return output # torch.Size([bz*num_beams, 32, 768]) def forward_pre_route(self, x, beam_scores, expert_route, use_log=True): - current_scores = self.forward_gate(x) # [bz*num_beams, 5] + current_scores = self.forward_gate(x) # [bz, num_beams] / [bz*num_beams, num_beams] + + importance_loss = self._importance_auxiliary_loss(current_scores) if use_log: current_scores_log = torch.log(current_scores) # 取log之后可以直接相加 else: current_scores_log = current_scores - + # import pdb;pdb.set_trace() batch_size, num_tokens = x.shape[0], x.shape[1] beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size) - current_expert_select = expert_route[:,-1] if self.layer_judge=='first': # expand first dim to batch_size * num_beams replicated_tensor = x.unsqueeze(1).expand(batch_size, self.num_beams, num_tokens, self.hidden_size) x = replicated_tensor.contiguous().view(-1, num_tokens, self.hidden_size) # [bz*num_beams, 32,768] + current_scores_t = current_scores.unsqueeze(1).expand(batch_size, self.num_beams, self.num_experts) + current_scores = current_scores_t.contiguous().view(-1, self.num_experts) # [bz*num_beams, num_experts] - candidate_output = self.forward_expert_ffn(x, current_expert_select, beam_scores) # [bz*num_beams, 32,768] - - return candidate_output, beam_scores, expert_route, beam_idx + input_x = x[beam_idx] + candidate_output = self.forward_expert_ffn(input_x, current_expert_select, current_scores) # [bz*num_beams, 32,768] + # import pdb;pdb.set_trace() + return candidate_output, beam_scores, expert_route, beam_idx, importance_loss def forward_post_route(self, x, beam_scores, expert_route, use_log=True): attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device) x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768]) - + def forward_expert(input_x, expert_idx): output_x = self.experts[expert_idx].forward(input_x) return output_x @@ -145,12 +167,14 @@ class RouteMoELayer(nn.Module): logits_gate_lst = list() for expert_idx in range(self.num_experts): output_x = forward_expert(x_masked, expert_idx) - outputs.append(output_x.unsqueeze(0)) - output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beam, 768]) + # output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beam, 768]) + output_x_aver = torch.mean(output_x, dim=1) # gate_score = self.gates[expert_idx](output_x_aver) gate_score = self.gate(output_x_aver) logits_gate_lst.append(gate_score) - candidate_output = torch.cat(outputs) # torch.Size([num_expert, bz*num_beam, 32, 768]) + outputs.append(output_x.unsqueeze(0)) + + candidate_output_raw = torch.cat(outputs) # torch.Size([num_expert, bz*num_beam, 32, 768]) logits_gate = torch.cat(logits_gate_lst,dim=1)# torch.Size([bz*num_beam, num_expert]) current_scores = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beam, num_experts]) @@ -159,24 +183,33 @@ class RouteMoELayer(nn.Module): else: current_scores_log = current_scores - batch_size = x.shape[0] # bz*num_beam + # importance loss + importance_loss = self._importance_auxiliary_loss(current_scores) 
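+        # e.g. with current_scores = [[0.9, 0.1], [0.8, 0.2]], per-expert importance
+        # is [1.7, 0.3]: mean 1.0, unbiased std ~0.99, so the squared coefficient
+        # of variation (the loss) is ~0.98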
+ + batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size) # beam_scores torch.Size([bz*num_beam]) # expert_route torch.Size([bz*num_beam, layer_n]) current_select_expert = expert_route[:,-1] + # current_select_expert torch.Size([bz*num_beam, 1]) - output = list() - for i in range(beam_idx.shape[0]): - b_idx = beam_idx[i] - ex_idx = current_select_expert[i] - ex_out = candidate_output[ex_idx, b_idx, :,:] - output.append(ex_out.unsqueeze(0)) - - final_output = torch.concat(output, dim=0) - - return final_output, beam_scores, expert_route, beam_idx - - + if self.layer_judge == 'first': + replicated_tensor = candidate_output_raw.unsqueeze(2).expand(self.num_experts, batch_size, self.num_beams, num_tokens, self.hidden_size) + candidate_output_raw = replicated_tensor.contiguous().view(self.num_experts, -1, num_tokens, self.hidden_size) # [bz*num_beams, 32,768] + current_scores_t = current_scores.unsqueeze(1).expand(batch_size, self.num_beams, self.num_experts) + current_scores = current_scores_t.contiguous().view(-1, self.num_experts) # [bz*num_beams, num_experts] + + candidate_output = candidate_output_raw.permute(1, 0, 2, 3)[beam_idx] # torch.Size([8, 2, 32, 768]) + expert_select_matrix = F.one_hot(current_select_expert, self.num_experts) + if self.weight_type == 'ffn_prob': + tmp_prob = current_scores[beam_idx] * expert_select_matrix + output = candidate_output * tmp_prob.unsqueeze(-1).unsqueeze(-1) + else: + output = candidate_output * expert_select_matrix.unsqueeze(-1).unsqueeze(-1) + final_output = torch.sum(output, dim=1) + + return final_output, beam_scores, expert_route, beam_idx, importance_loss + def forward(self, x, attention_mask, beam_scores, expert_route, use_log=True): """ if first_layer: x [bz, 32, 768] @@ -184,11 +217,11 @@ class RouteMoELayer(nn.Module): """ if self.route_method == 'pre-route': - candidate_output, beam_scores, expert_route, beam_idx = self.forward_pre_route(x, beam_scores, expert_route, use_log=True) + candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True) elif self.route_method == "post-route": - candidate_output, beam_scores, expert_route, beam_idx = self.forward_post_route(x, beam_scores, expert_route, use_log=True) + candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True) - return candidate_output, beam_scores, expert_route, beam_idx + return candidate_output, beam_scores, expert_route, beam_idx, importance_loss diff --git a/minigpt4/models/moe/test_moe_layer.py b/minigpt4/models/moe/test_moe_layer.py new file mode 100644 index 0000000..5253340 --- /dev/null +++ b/minigpt4/models/moe/test_moe_layer.py @@ -0,0 +1,294 @@ +import copy +import pickle +import torch +import torch.nn as nn +import torch.nn.functional as F + +import copy +import pickle +import torch +import torch.nn as nn +import torch.nn.functional as F + +class MoELayer(nn.Module): + def __init__(self, hidden_size, expert, num_experts, route_method, topk=1, use_balance_loss=True, weight_type='raw_prob, topk(softmax)'): + # remove hash list + nn.Module.__init__(self) + self.num_experts = num_experts + self.experts = nn.ModuleList([copy.deepcopy(expert) for i in range(num_experts)]) + self.route_method = route_method + self.topk = topk + self.use_balance_loss = use_balance_loss + self.weight_type = weight_type 
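+        # weight_type selects how the top-k gate scores weight the expert outputs
+        # in _forward_gate_sentence_post: a value containing 'softmax(topk)'
+        # re-normalizes the top-k logits with a softmax, any other value uses the
+        # raw softmax probabilities. Note the default 'raw_prob, topk(softmax)'
+        # does not contain the substring 'softmax(topk)', so it falls through to
+        # the raw-probability branch.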
+
+        if route_method in ["gate-token", "gate-sentence"]:
+            self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
+        elif route_method in ["gate-sentence-post"]:
+            gate = nn.Linear(hidden_size, 1, bias=False).float()
+            # self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
+            self.gate = gate
+        else:
+            raise KeyError("Routing method not supported.")
+
+    def _balancing_loss(self, prob_gate, num_tokens):
+        # From MOEBERT
+        # compute the load balancing loss
+        # prob_gate is [bz, num_expert]: the probability of each sample being assigned to each expert
+        # equivalent to _gshard_auxiliary_loss in VMOE
+        P = prob_gate.mean(0)  # torch.Size([num_expert]); mean probability of each expert being assigned a sample
+        temp = num_tokens.float()
+        f = temp / temp.sum(0, keepdim=True)  # fraction of samples assigned to each expert
+        balance_loss = self.num_experts * torch.sum(P * f)
+        return balance_loss
+
+    def _importance_auxiliary_loss(self, prob_gate):
+        # From VMOE
+        # _importance_auxiliary_loss
+        axis = tuple(range(prob_gate.ndim - 1))  # All except last.
+        importance_per_expert = torch.sum(prob_gate, dim=axis)
+        std_importance_per_expert = torch.std(importance_per_expert)
+        mean_importance_per_expert = torch.mean(importance_per_expert)
+        # Compute coefficient of variation (i.e. std/mean) squared.
+        return (std_importance_per_expert / mean_importance_per_expert)**2
+
+    def _forward_gate_token(self, x):
+        bsz, seq_len, dim = x.size()
+
+        x = x.view(-1, dim)
+        logits_gate = self.gate(x)
+        prob_gate = F.softmax(logits_gate, dim=-1)
+        gate = torch.argmax(prob_gate, dim=-1)
+
+        order = gate.argsort(0)
+        num_tokens = F.one_hot(gate, self.num_experts).gt(0).sum(0)
+        gate_load = num_tokens.clone()
+        x = x[order]  # reorder according to expert number
+        x = x.split(num_tokens.tolist(), dim=0)  # a list of length self.num_experts
+
+        # compute the load balancing loss
+        P = prob_gate.mean(0)
+        temp = num_tokens.float()
+        f = temp / temp.sum(0, keepdim=True)
+        balance_loss = self.num_experts * torch.sum(P * f)
+
+        prob_gate = prob_gate.gather(dim=1, index=gate.unsqueeze(1))
+        prob_gate = prob_gate[order]
+        prob_gate = prob_gate.split(num_tokens.tolist(), dim=0)
+
+        def forward_expert(input_x, prob_x, expert_idx):
+            input_x = self.experts[expert_idx].forward(input_x)
+            input_x = input_x * prob_x
+            return input_x
+
+        x = [forward_expert(x[i], prob_gate[i], i) for i in range(self.num_experts)]
+        x = torch.vstack(x)
+        x = x[order.argsort(0)]  # restore original order
+        x = x.view(bsz, seq_len, dim)
+
+        return x, balance_loss, gate_load
+
+    def _forward_gate_sentence_post(self, x, attention_mask):
+        """
+        x: query_attention_output; torch.Size([bz, 32, 768])
+        attention_mask: torch.ones([bz, 32])
+            bz = 4
+            x = torch.randn(bz,32,768)
+            attention_mask = torch.ones([bz, 32])
+
+        """
+        # Prepare Input x
+        attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
+        x_masked = x * attention_mask.unsqueeze(-1)  # torch.Size([bz, 32, 768])
+
+        # FeedForward(x) & Forward Gate
+        outputs = list()
+        logits_gate_lst = list()
+        for expert_idx in range(self.num_experts):
+            output_x = self.experts[expert_idx].forward(x_masked)
+            outputs.append(output_x.unsqueeze(0))
+
+            output_x_aver = torch.mean(output_x, dim=1)
+            # gate_score = self.gates[expert_idx](output_x_aver)
+            gate_score = self.gate(output_x_aver)
+            logits_gate_lst.append(gate_score)
+        candidate_output = torch.cat(outputs)  # torch.Size([num_expert, bz, 32, 768])
+        logits_gate = torch.cat(logits_gate_lst, dim=1)  # torch.Size([bz, num_expert])
+
+        # Probabilities for each sample of what expert it should be sent to.
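+        # Editor's sketch (added comment with illustrative values; assumes bz=2,
+        # num_experts=3, topk=1):
+        #   logits_gate = [[2.0, 1.0, 0.1], [0.2, 0.2, 3.0]]
+        #   prob_gate   = softmax(logits_gate, -1) ~= [[0.66, 0.24, 0.10], [0.05, 0.05, 0.90]]
+        # so topk picks expert 0 for sample 0 and expert 2 for sample 1.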
+        prob_gate = F.softmax(logits_gate, dim=-1)  # torch.Size([bz, num_experts])
+        if 'softmax(topk)' in self.weight_type:
+            prob_gate1, gate = torch.topk(logits_gate, self.topk, dim=1)
+            select_prob_gate = F.softmax(prob_gate1, dim=-1)
+        else:
+            select_prob_gate, gate = torch.topk(prob_gate, self.topk, dim=1)  # gate: the expert assigned to each sample, torch.Size([bz, topk])
+
+        # Calculate Balancing Loss
+        if self.use_balance_loss:
+            num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0)  # number of samples assigned to each expert, torch.Size([num_expert])
+            balance_loss = self._balancing_loss(prob_gate, num_sentences)
+        else:
+            balance_loss = 0.0
+        # Calculate Importance Loss
+        importance_loss = self._importance_auxiliary_loss(prob_gate)
+
+        # Reshape Prob_gate & Gate
+        # expert_mask: [batch_size, topk, num_experts]
+        # expert_gate: [batch_size, topk, num_experts]
+        # combine_tensor: [batch_size, num_experts]
+        expert_mask = F.one_hot(gate, self.num_experts)
+        expert_gate = select_prob_gate.unsqueeze(-1) * expert_mask
+        combine_tensor = torch.sum(expert_gate, dim=1)
+        # combine_tensor = torch.zeros_like(prob_gate)
+        # combine_tensor.scatter_(1, gate, select_prob_gate)  # equivalent operation, but may not be differentiable
+
+        candidate_output_ad = torch.permute(candidate_output, (1, 0, 2, 3))  # torch.Size([bz, num_expert, 32, 768])
+        results = candidate_output_ad * combine_tensor.unsqueeze(-1).unsqueeze(-1)  # torch.Size([bz, num_expert, 32, 768])
+        outputs = torch.sum(results, dim=1)  # torch.Size([bz, 32, 768])
+        import pdb; pdb.set_trace()
+
+        return outputs, (balance_loss+importance_loss), combine_tensor
+
+    def pre_router(self, x, attention_mask):
+        # Prepare input x
+        attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
+        x_masked = x * attention_mask.unsqueeze(-1)  # torch.Size([bz, 32, 768])
+        x_average = torch.mean(x_masked, dim=1)  # torch.Size([bz, 768])
+
+        # Forward Gate
+        # logits_gate: [bz, num_experts]
+        logits_gate = self.gate(x_average)
+
+        # Probabilities for each sample of what expert it should be sent to.
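+        # Editor's note (added comment; illustrative values, assumes topk=1,
+        # num_experts=3): the block below mirrors _forward_gate_sentence_post.
+        # With gate = [[0], [2]] and select_prob_gate = [[0.66], [0.90]]:
+        #   expert_mask    = one_hot(gate)    -> [[[1, 0, 0]], [[0, 0, 1]]]
+        #   expert_gate    = prob * mask      -> [[[0.66, 0, 0]], [[0, 0, 0.90]]]
+        #   combine_tensor = sum over topk    -> [[0.66, 0, 0], [0, 0, 0.90]]
+        # i.e. a dense [bz, num_experts] routing weight, zero for unselected experts.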
+        # prob_gate: [bz, num_experts]
+        prob_gate = F.softmax(logits_gate, dim=-1)
+
+        if 'softmax(topk)' in self.weight_type:
+            prob_gate1, gate = torch.topk(logits_gate, self.topk, dim=1)
+            select_prob_gate = F.softmax(prob_gate1, dim=-1)
+        else:
+            # topk(softmax)
+            # Get Top-K experts for each sample
+            # gate: [bz, topk]
+            # select_prob_gate: [bz, topk]
+            select_prob_gate, gate = torch.topk(prob_gate, self.topk, dim=1)
+
+        # Reshape Prob_gate & Gate
+        # expert_mask: [batch_size, topk, num_experts]
+        # expert_gate: [batch_size, topk, num_experts]
+        # combine_tensor: [batch_size, num_experts]
+        expert_mask = F.one_hot(gate, self.num_experts)
+        expert_gate = select_prob_gate.unsqueeze(-1) * expert_mask
+        combine_tensor = torch.sum(expert_gate, dim=1)
+
+        # Calculate Balancing Loss
+        if self.use_balance_loss:
+            num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0)  # number of samples assigned to each expert, torch.Size([num_expert])
+            balance_loss = self._balancing_loss(prob_gate, num_sentences)
+        else:
+            balance_loss = 0.0
+
+        # Calculate Importance Loss
+        importance_loss = self._importance_auxiliary_loss(prob_gate)
+
+        import pdb; pdb.set_trace()
+
+        return expert_mask, combine_tensor, balance_loss, importance_loss
+
+    def _forward_gate_sentence(self, x, attention_mask):
+        """
+        x: query_attention_output, torch.Size([bz, 32, 768])
+        attention_mask: torch.ones([bz, 32])
+
+        ### Notice:
+        the raw version of expert_attention_mask is the extended_attention_mask,
+        which will be added to attention_score directly
+        the values of extended_attention_mask are -0.0 or -10000
+        it should be adjusted to a 1/0 version to be processed by experts
+        """
+        # Forward Router
+        expert_mask, combine_tensor, balance_loss, importance_loss = self.pre_router(x, attention_mask)
+
+        # Forward Expert FFN
+        result = []
+        for expert_idx in range(self.num_experts):
+            output_x = self.experts[expert_idx].forward(x)
+            result.append(output_x.unsqueeze(0))
+        expert_output = torch.cat(result).permute(1, 0, 2, 3)  # torch.Size([batch_size, num_expert, num_tokens, hidden_states])
+
+        # multiply outputs of experts by the routing probability
+        expert_outputs_combined = expert_output * combine_tensor.unsqueeze(-1).unsqueeze(-1)  # torch.Size([batch_size, num_expert, num_tokens, hidden_states])
+        outputs = torch.sum(expert_outputs_combined, dim=1)  # torch.Size([batch_size, num_tokens, hidden_states])
+
+        import pdb; pdb.set_trace()
+
+        return outputs, (balance_loss+importance_loss), combine_tensor
+
+
+    def forward(self, x, attention_mask):
+        if self.route_method == "gate-token":
+            x, balance_loss, gate_load = self._forward_gate_token(x)
+        elif self.route_method == "gate-sentence":
+            x, balance_loss, gate_load = self._forward_gate_sentence(x, attention_mask)
+        elif self.route_method == "gate-sentence-post":
+            x, balance_loss, gate_load = self._forward_gate_sentence_post(x, attention_mask)
+        else:
+            raise KeyError("Routing method not supported.")
+        # import pdb; pdb.set_trace()
+        return x, balance_loss, gate_load
+
+if __name__ == '__main__':
+
+    import sys
+    sys.path.append("/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE")
+    from minigpt4.models.QformerRouteMoE import BertConfig
+    from minigpt4.models.QformerRouteMoE import FeedForward
+    from minigpt4.models.moe.utils import (
+        moe_layer_judge,
+    )
+
+    vision_width = 1408
+    cross_attention_freq = 2
+    num_query_token = 32
+    # init_QformerMoE
+    config = BertConfig.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased")
+    config.encoder_width = vision_width
+    # insert
cross-attention layer every other block + config.add_cross_attention = True + config.cross_attention_freq = cross_attention_freq + config.query_length = num_query_token + config.moebert_expert_num = 3 + config.moebert_num_beams = 2 + config.moebert_route_method = 'gate-sentence-post' + config.moe_topk = 1 + config.use_balance_loss = False + # config.moe_weight_type = 'raw_prob, softmax(topk)' + config.moe_weight_type = 'raw_prob, topk(softmax)' + + batch_size = 4 + x2 = torch.randn(batch_size, 32, 768) + beam_scores, expert_route = None, None + + for layer_num in [6, 8, 10]: + layer_judge = moe_layer_judge(layer_num) + ffn = FeedForward(config) + gate = nn.Linear(768, config.moebert_expert_num, bias=False).float() + + experts_moe = MoELayer( + hidden_size=config.hidden_size, + expert=ffn, + num_experts=config.moebert_expert_num, + route_method=config.moebert_route_method, + topk=config.moe_topk, + use_balance_loss=config.use_balance_loss, + weight_type=config.moe_weight_type, + ) + attn_mask = torch.ones([batch_size, 32]) + layer_output = experts_moe(x2, attn_mask) + hidden_states3, aux_loss, combine_tensor = layer_output + + print(combine_tensor) + print(aux_loss) + x2 = hidden_states3 + + print("------------------------------------") + import pdb; pdb.set_trace() \ No newline at end of file diff --git a/minigpt4/models/moe/utils.py b/minigpt4/models/moe/utils.py index 6f5858d..52f78b8 100644 --- a/minigpt4/models/moe/utils.py +++ b/minigpt4/models/moe/utils.py @@ -19,15 +19,33 @@ def use_experts(layer_idx): else: return False +def use_experts_route(layer_idx): + # if layer_idx % 2 == 0: + # use moe_ffn after cross_attns + # if int(layer_idx) in [0,2,4,6,8,10]: + if int(layer_idx) in [6,7,8,9,10,11]: + return True + else: + return False + def moe_layer_judge(layer_idx): if layer_idx == 6: return 'first' - elif layer_idx == 8: + elif layer_idx in [7,8,9,10]: return 'mid' - elif layer_idx == 10: + elif layer_idx == 11: return 'last' else: return None + + # if layer_idx == 0: + # return 'first' + # elif layer_idx in [2,4,6,8]: + # return 'mid' + # elif layer_idx == 10: + # return 'last' + # else: + # return None def process_ffn(model): if model.config.model_type == "bert": diff --git a/minigpt4/projects/qformer_moe_post_vicuna/train/mix_qformer_moe_post_blip2_vicuna7b_data_balance.yaml b/minigpt4/projects/qformer_moe_post_vicuna/train/mix_qformer_moe_post_blip2_vicuna7b_data_balance.yaml index b74d7aa..8c5e050 100644 --- a/minigpt4/projects/qformer_moe_post_vicuna/train/mix_qformer_moe_post_blip2_vicuna7b_data_balance.yaml +++ b/minigpt4/projects/qformer_moe_post_vicuna/train/mix_qformer_moe_post_blip2_vicuna7b_data_balance.yaml @@ -10,7 +10,6 @@ model: load_finetuned: False vit_model: eva_clip_g pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" - # finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/mix_coco_gqa_balance_raw_QformerMoE_Post_train_qf_train_qt_aver_weight_5ex_top1_1loss_textinqf_epo3_s42_1201/20231201184/checkpoint_best.pth" finetuned: "" q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" @@ -38,7 +37,7 @@ model: # moe use_moeqformer: True - moebert_expert_num: 5 + moebert_expert_num: 3 moebert_route_method: "gate-sentence-post" moebert_load_balance: 0 moe_topk: 1 @@ -110,6 +109,7 @@ run: max_epoch: 1 num_workers: 4 warmup_steps: 600 + iters_per_epoch: 1000 seed: 42 output_dir: 
"/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/mix_coco_gqa_balance_raw_QformerMoE_Post_train_qf_train_qt_aver_weight_5ex_top1_1loss_textinqf_epo3_s42_1201/" diff --git a/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml b/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml index c5a4d5a..74f4ab0 100644 --- a/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml +++ b/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml @@ -10,7 +10,7 @@ model: load_finetuned: True vit_model: eva_clip_g pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" - finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_linear_gate_3ex_3beam_1loss_top3layer_log_textinqf_epo3_1216/20231216155/checkpoint_best.pth" + finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/20240112212/checkpoint_best.pth" q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" # vit encoder @@ -38,10 +38,12 @@ model: # moe use_moeqformer: True use_route_moe: True - moebert_expert_num: 3 - moebert_num_beams: 3 moebert_route_method: "post-route" - gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209_eval_latest1/" + moebert_load_balance: 0 + moebert_expert_num: 2 + moebert_num_beams: 2 + moe_weight_type: 'ffn_prob' + gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/" datasets: gqa: @@ -81,19 +83,20 @@ run: task: instruction_tuning # optimizer lr_sched: "linear_warmup_cosine_lr" - init_lr: 2e-5 + init_lr: 5e-5 min_lr: 1e-6 warmup_lr: 1e-6 log_freq: 5 save_freq: 1500 weight_decay: 0.05 - max_epoch: 5 + max_epoch: 10 num_workers: 4 warmup_steps: 600 + iters_per_epoch: 3000 seed: 42 - output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/" + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/" amp: True resume_ckpt_path: null diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml index 5ec25e0..16440dc 100644 --- a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml +++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml @@ -38,10 +38,12 @@ model: # moe use_moeqformer: True use_route_moe: True + moebert_route_method: "post-route" + moebert_load_balance: 0 moebert_expert_num: 3 moebert_num_beams: 3 - moebert_route_method: "post-route" - gate_save_path: 
"/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/route_save/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/" + moe_weight_type: 'ffn_prob' + use_balance_loss: False datasets: gqa: # train: 943000, 12578, 12578) @@ -97,19 +99,20 @@ run: task: instruction_tuning # optimizer lr_sched: "linear_warmup_cosine_lr" - init_lr: 2e-5 + init_lr: 5e-5 min_lr: 1e-6 warmup_lr: 1e-6 log_freq: 5 save_freq: 1500 weight_decay: 0.05 - max_epoch: 5 + max_epoch: 8 num_workers: 4 warmup_steps: 600 + iters_per_epoch: 5000 seed: 42 - output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/" + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_1gate_3ex_3beam_1loss_5e5lr_top6layer_textinqf_epo8_0117/" amp: True resume_ckpt_path: null diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_1220.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_1220.yaml new file mode 100644 index 0000000..8818143 --- /dev/null +++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_1220.yaml @@ -0,0 +1,129 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. + # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +# 0107test + +model: + arch: blip2_vicuna_instruct + model_type: vicuna7b_pretrain + load_pretrained: True + load_finetuned: False + vit_model: eva_clip_g + pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + # finetuned: "" + q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + + # Q-Former + num_query_token: 32 + qformer_text_input: True + + # vicuna + llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1" + prompt: "" + max_txt_len: 256 + max_output_txt_len: 256 + + # freeze + freeze_vit: True + freeze_llm: True + freeze_qformer: False + freeze_t5_proj: False + + # moe + use_moeqformer: True + use_route_moe: True + moebert_route_method: "post-route" + moebert_load_balance: 0 + moebert_expert_num: 2 + moebert_num_beams: 2 + moe_weight_type: 'ffn_prob' + use_balance_loss: False + # gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/route_save/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/" + +datasets: + # gqa: # train: 943000, 12578, 12578) + # type: balanced_sft_raw + # batch_size: 1 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 224 + # eval: + # name: "blip2_image_eval" + # image_size: 224 + # text_processor: + # train: + # name: "blip_caption" + # eval: + # name: "blip_caption" + # sample_ratio: 10 + + ok_vqa: # train, valid (9009, 5046) + batch_size: 1 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 1 + + # coco_vqa: 
# 658104 + # batch_size: 1 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 224 + # eval: + # name: "blip2_image_eval" + # image_size: 224 + # text_processor: + # train: + # name: "blip_caption" + # eval: + # name: "blip_caption" + # sample_ratio: 9 + +run: + task: instruction_tuning + # optimizer + lr_sched: "linear_warmup_cosine_lr" + init_lr: 2e-5 + min_lr: 1e-6 + warmup_lr: 1e-6 + log_freq: 5 + save_freq: 1500 + + weight_decay: 0.05 + max_epoch: 5 + num_workers: 4 + warmup_steps: 600 + + seed: 42 + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/" + + amp: True + resume_ckpt_path: null + + evaluate: False + train_splits: ["train"] + valid_splits: ["val"] + # test_splits: ["val"] + + device: "cuda" + world_size: 1 + dist_url: "env://" + distributed: True \ No newline at end of file diff --git a/minigpt4/projects/qformer_moe_vicuna/eval/vqa_benchmark_evaluation.yaml b/minigpt4/projects/qformer_moe_vicuna/eval/vqa_benchmark_evaluation.yaml index 3e02942..98de298 100644 --- a/minigpt4/projects/qformer_moe_vicuna/eval/vqa_benchmark_evaluation.yaml +++ b/minigpt4/projects/qformer_moe_vicuna/eval/vqa_benchmark_evaluation.yaml @@ -10,7 +10,7 @@ model: load_finetuned: True vit_model: eva_clip_g pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" - finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_1048k_raw_QformerMoE_Route_Post_NoNorm_5ex_2beam_1loss_top3layer_textinqf_epo6_1215/20231216161/checkpoint_best.pth" + finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_1loss_5e5lr_top6layer_textinqf_epo8_0111/20240111145/checkpoint_best.pth" q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" # vit encoder @@ -39,8 +39,11 @@ model: use_moeqformer: True use_route_moe: True moebert_route_method: "post-route" - moebert_expert_num: 5 + moebert_load_balance: 0 + moebert_expert_num: 2 moebert_num_beams: 2 + moe_weight_type: 'ffn_prob' + use_balance_loss: False datasets: ok_vqa: # train, valid (9009, 5046) @@ -78,7 +81,7 @@ evaluation_datasets: run: task: instruction_tuning name: vqa_benchmark_evaluation - save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/eval/benchmarks/mix_1048k_raw_QformerMoE_Route_Post_NoNorm_5ex_2beam_1loss_top3layer_textinqf_epo6_1215/" + save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/benchmarks/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_1loss_5e5lr_top6layer_textinqf_epo8_0111/" seed: 42 diff --git a/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_3ex3beam_0112.yaml b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_3ex3beam_0112.yaml new file mode 100644 index 0000000..979e0a1 --- /dev/null +++ b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_3ex3beam_0112.yaml @@ -0,0 +1,131 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause + # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + +model: + arch: blip2_vicuna_instruct + model_type: vicuna7b_pretrain + load_pretrained: True + load_finetuned: False + vit_model: eva_clip_g + pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + # finetuned: "" + q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth" + + # vit encoder + image_size: 224 + drop_path_rate: 0 + use_grad_checkpoint: False + vit_precision: "fp16" + + # Q-Former + num_query_token: 32 + qformer_text_input: True + + # vicuna7b + llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1" + prompt: "" + max_txt_len: 256 + max_output_txt_len: 256 + + # freeze + freeze_vit: True + freeze_llm: True + freeze_qformer: False + freeze_t5_proj: False + + # moe + use_moeqformer: True + use_route_moe: True + moebert_route_method: "post-route" + moebert_load_balance: 0.05 + moebert_expert_num: 3 + moebert_num_beams: 3 + moe_weight_type: 'ffn_prob' + use_balance_loss: False + +datasets: + gqa: # train: 943000, 12578, 12578) + type: balanced_sft_raw + # batch_size: 16 + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 50 + + ok_vqa: # train, valid (9009, 5046) + # batch_size: 16 + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 8 + + coco_vqa: # 658104 + # batch_size: 16 + batch_size: 32 + vis_processor: + train: + name: "blip2_image_train" + image_size: 224 + eval: + name: "blip2_image_eval" + image_size: 224 + text_processor: + train: + name: "blip_caption" + eval: + name: "blip_caption" + sample_ratio: 15 + +run: + task: instruction_tuning + # optimizer + lr_sched: "linear_warmup_cosine_lr" + # init_lr: 2e-5 + init_lr: 5e-5 + min_lr: 1e-6 + warmup_lr: 1e-6 + log_freq: 5 + save_freq: 1500 + + weight_decay: 0.05 + max_epoch: 8 + num_workers: 4 + warmup_steps: 600 + iters_per_epoch: 5000 + + seed: 42 + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_3ex_3beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/" + + amp: True + resume_ckpt_path: null + + evaluate: False + train_splits: ["train"] + valid_splits: ["val"] + + device: "cuda" + world_size: 1 + dist_url: "env://" + distributed: True \ No newline at end of file diff --git a/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_balance.yaml b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_balance.yaml index 2eccb6b..d3f21ec 100644 --- a/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_balance.yaml +++ b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_balance.yaml @@ -37,19 +37,19 @@ model: # moe use_moeqformer: True - moebert_expert_num: 5 + moebert_expert_num: 3 moebert_route_method: "gate-sentence" moebert_load_balance: 0 moe_topk: 1 use_balance_loss: False - moe_weight_type: 'l2_norm' - gate_save_path: 
"/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/gate_save/mix_coco_gqa_balance_raw_QformerMoE_train_qf_train_qt_linear_gate_5ex_top1_1loss_textinqf_training_epo5_toplayer3_1206/" + moe_weight_type: 'raw_prob' + # gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/gate_save/mix_coco_gqa_balance_raw_QformerMoE_train_qf_train_qt_linear_gate_5ex_top1_1loss_textinqf_training_epo5_toplayer3_1206/" datasets: gqa: # train: 94254 type: balanced_sft_raw_part - batch_size: 32 + batch_size: 1 vis_processor: train: name: "blip2_image_train" @@ -65,7 +65,7 @@ datasets: sample_ratio: 50 ok_vqa: # train, valid (9009, 5046 - batch_size: 32 + batch_size: 1 vis_processor: train: name: "blip2_image_train" @@ -80,22 +80,22 @@ datasets: name: "blip_caption" sample_ratio: 8 - coco_vqa: # 214352 vqa_val - type: vqa_v2_part - batch_size: 32 - vis_processor: - train: - name: "blip2_image_train" - image_size: 224 - eval: - name: "blip2_image_eval" - image_size: 224 - text_processor: - train: - name: "blip_caption" - eval: - name: "blip_caption" - sample_ratio: 15 + # coco_vqa: # 214352 vqa_val + # type: vqa_v2_part + # batch_size: 1 + # vis_processor: + # train: + # name: "blip2_image_train" + # image_size: 224 + # eval: + # name: "blip2_image_eval" + # image_size: 224 + # text_processor: + # train: + # name: "blip_caption" + # eval: + # name: "blip_caption" + # sample_ratio: 15 run: task: instruction_tuning @@ -108,12 +108,13 @@ run: save_freq: 1500 weight_decay: 0.05 - max_epoch: 5 + max_epoch: 1 num_workers: 4 warmup_steps: 600 + iters_per_epoch: 1000 seed: 42 - output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/mix_coco_gqa_balance_raw_QformerMoE_train_qf_train_qt_linear_gate_5ex_top1_1loss_textinqf_training_epo5_toplayer3_1206/" + output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/mix_coco_gqa_balance_raw_QformerMoE_train_qf_train_qt_linear_gate_5ex_top1_1loss_textinqf_training_epo5_toplayer3_1220_test/" amp: True resume_ckpt_path: null diff --git a/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_raw_0112.yaml b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_raw_0112.yaml new file mode 100644 index 0000000..afdb4eb --- /dev/null +++ b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_raw_0112.yaml @@ -0,0 +1,125 @@ + # Copyright (c) 2022, salesforce.com, inc. + # All rights reserved. 
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: blip2_vicuna_instruct
+  model_type: vicuna7b_pretrain
+  load_pretrained: True
+  load_finetuned: False
+  vit_model: eva_clip_g
+  pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+  # finetuned: ""
+  q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+
+  # vit encoder
+  image_size: 224
+  drop_path_rate: 0
+  use_grad_checkpoint: False
+  vit_precision: "fp16"
+
+  # Q-Former
+  num_query_token: 32
+  qformer_text_input: True
+
+  # vicuna7b
+  llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
+  prompt: ""
+  max_txt_len: 256
+  max_output_txt_len: 256
+
+  # freeze
+  freeze_vit: True
+  freeze_llm: True
+  freeze_qformer: False
+  freeze_t5_proj: False
+
+  # moe
+  use_moeqformer: False
+  moebert_expert_num: 1
+  moebert_route_method: "gate-sentence"
+  moebert_load_balance: 0.05
+  moe_topk: 1
+
+datasets:
+  gqa: # train, valid, test (943000, 12578, 12578)
+    type: balanced_sft_raw
+    batch_size: 16
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 224
+      eval:
+        name: "blip2_image_eval"
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+      eval:
+        name: "blip_caption"
+    sample_ratio: 50
+
+  ok_vqa: # train, valid (9009, 5046)
+    batch_size: 16
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 224
+      eval:
+        name: "blip2_image_eval"
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+      eval:
+        name: "blip_caption"
+    sample_ratio: 8
+
+  coco_vqa: # 658104
+    batch_size: 16
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 224
+      eval:
+        name: "blip2_image_eval"
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+      eval:
+        name: "blip_caption"
+    sample_ratio: 15
+
+run:
+  task: instruction_tuning
+  # optimizer
+  lr_sched: "linear_warmup_cosine_lr"
+  # init_lr: 2e-5
+  init_lr: 5e-5
+  min_lr: 1e-6
+  warmup_lr: 1e-6
+  log_freq: 5
+  save_freq: 1500
+
+  weight_decay: 0.05
+  max_epoch: 8
+  num_workers: 4
+  warmup_steps: 600
+  iters_per_epoch: 5000
+
+  seed: 42
+  output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/mix_coco_gqa_1610k_raw_QformerMoE_train_qf_train_qt_1ex_top1_textinqf_epo8_lr5e5_seed42_0112/"
+
+  amp: True
+  resume_ckpt_path: null
+
+  evaluate: False
+  train_splits: ["train"]
+  valid_splits: ["val"]
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
\ No newline at end of file
diff --git a/minigpt4/runners/runner_base.py b/minigpt4/runners/runner_base.py
index 89413a3..8bc071b 100644
--- a/minigpt4/runners/runner_base.py
+++ b/minigpt4/runners/runner_base.py
@@ -110,6 +110,7 @@ class RunnerBase:
             else:
                 p_wd.append(p)
             num_parameters += p.data.nelement()
+        # import pdb; pdb.set_trace() # 0107test
         logging.info("number of trainable parameters: %d" % num_parameters)
         optim_params = [
             {
diff --git a/minigpt4/tasks/base_task.py b/minigpt4/tasks/base_task.py
index 3a39fc8..f0993ce 100644
--- a/minigpt4/tasks/base_task.py
+++ b/minigpt4/tasks/base_task.py
@@ -238,13 +238,17 @@ class BaseTask:
             with torch.cuda.amp.autocast(enabled=use_amp):
                 loss = self.train_step(model=model, samples=samples)
-
+            # after_train_step()
             if use_amp:
+                # torch.autograd.set_detect_anomaly(True)
+                # check for anomalous values during the backward pass, to locate the offending code
+                # with torch.autograd.detect_anomaly():
                 scaler.scale(loss).backward()
             else:
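+                # Editor's note (added comment, not part of the original change): in the
+                # AMP branch above, scaler.scale(loss) multiplies the loss before backward()
+                # so fp16 gradients do not underflow; scaler.step() later unscales the
+                # gradients before the optimizer update and scaler.update() adjusts the
+                # scale factor. In this non-AMP branch the raw loss is backpropagated: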
loss.backward() + # import pdb; pdb.set_trace() # 0107test # update gradients every accum_grad_iters iterations if (i + 1) % accum_grad_iters == 0: if use_amp: @@ -252,6 +256,9 @@ class BaseTask: scaler.update() else: optimizer.step() + + # import pdb; pdb.set_trace()# 0107test + optimizer.zero_grad() # if self.cfg.wandb_log: # if self.cfg.run_cfg.wandb_log: diff --git a/requirements.txt b/requirements.txt index cbfa260..0d7634c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -44,4 +44,6 @@ wheel visualizer tensorboard kmeans_pytorch -visual_genome \ No newline at end of file +visual_genome +gpustat +torchviz \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..6a67455 --- /dev/null +++ b/setup.py @@ -0,0 +1,36 @@ +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +from setuptools import setup, find_namespace_packages +import platform + +DEPENDENCY_LINKS = [] +if platform.system() == "Windows": + DEPENDENCY_LINKS.append("https://download.pytorch.org/whl/torch_stable.html") + + +def fetch_requirements(filename): + with open(filename) as f: + return [ln.strip() for ln in f.read().split("\n")] + + +setup( + name="PromptMoE", + version="1.0.1", + author="Hanzi Wang", + description="PromptMoE & QformerMoE Based on LAVIS", + long_description=open("README.md", "r", encoding="utf-8").read(), + long_description_content_type="text/markdown", + keywords="Vision-Language, Multimodal, Image Captioning, Generative AI, Deep Learning, Library, PyTorch", + license="3-Clause BSD", + packages=find_namespace_packages(include="lavis.*"), + install_requires=fetch_requirements("requirements.txt"), + python_requires=">=3.7.0", + include_package_data=True, + dependency_links=DEPENDENCY_LINKS, + zip_safe=False, +) \ No newline at end of file diff --git a/test.pdf/backward_graph b/test.pdf/backward_graph new file mode 100644 index 0000000..7867fb1 --- /dev/null +++ b/test.pdf/backward_graph @@ -0,0 +1,5570 @@ +digraph { + graph [size="778.8,778.8"] + node [align=left fontname=monospace fontsize=10 height=0.2 ranksep=0.1 shape=box style=filled] + 140509988778688 [label=" + (1, 49, 768)" fillcolor=darkolivegreen1] + 140509588281712 [label=CatBackward0] + 140509588282912 -> 140509588281712 + 140509588282912 [label=IndexBackward0] + 140509588281808 -> 140509588282912 + 140509588281808 [label=SumBackward1] + 140509588283152 -> 140509588281808 + 140509588283152 [label=MulBackward0] + 140509588282864 -> 140509588283152 + 140509588282864 [label=CatBackward0] + 140509591316848 -> 140509588282864 + 140509591316848 [label=UnsqueezeBackward0] + 140509591314640 -> 140509591316848 + 140509591314640 [label=NativeLayerNormBackward0] + 140509591317376 -> 140509591314640 + 140509591317376 [label=AddBackward0] + 140509588312944 -> 140509591317376 + 140509588312944 [label=NativeDropoutBackward0] + 140509588313424 -> 140509588312944 + 140509588313424 [label=ViewBackward0] + 140509588313232 -> 140509588313424 + 140509588313232 [label=AddmmBackward0] + 140509588312560 -> 140509588313232 + 140509588312560 [label=ToCopyBackward0] + 140509591318384 -> 140509588312560 + 140509591260672 [label="encoder.layer.11.experts.experts.0.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509591260672 -> 140509591318384 + 140509591318384 [label=AccumulateGrad] + 140509588313040 -> 140509588313232 + 140509588313040 
[label=ViewBackward0] + 140509588312368 -> 140509588313040 + 140509588312368 [label=GeluBackward0] + 140509588312176 -> 140509588312368 + 140509588312176 [label=ViewBackward0] + 140509588313328 -> 140509588312176 + 140509588313328 [label=AddmmBackward0] + 140509588313520 -> 140509588313328 + 140509588313520 [label=ToCopyBackward0] + 140509588313808 -> 140509588313520 + 140509591261072 [label="encoder.layer.11.experts.experts.0.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509591261072 -> 140509588313808 + 140509588313808 [label=AccumulateGrad] + 140509588313616 -> 140509588313328 + 140509588313616 [label=ViewBackward0] + 140509588314096 -> 140509588313616 + 140509588314096 [label=ToCopyBackward0] + 140509588312608 -> 140509588314096 + 140509588312608 [label=SliceBackward0] + 140509588314048 -> 140509588312608 + 140509588314048 [label=SliceBackward0] + 140509588314288 -> 140509588314048 + 140509588314288 [label=SliceBackward0] + 140509588314480 -> 140509588314288 + 140509588314480 [label=SliceBackward0] + 140509588314528 -> 140509588314480 + 140509588314528 [label=SliceBackward0] + 140509588314768 -> 140509588314528 + 140509588314768 [label=NativeLayerNormBackward0] + 140509588314960 -> 140509588314768 + 140509588314960 [label=AddBackward0] + 140509588315248 -> 140509588314960 + 140509588315248 [label=NativeDropoutBackward0] + 140509588315632 -> 140509588315248 + 140509588315632 [label=ViewBackward0] + 140509588315824 -> 140509588315632 + 140509588315824 [label=AddmmBackward0] + 140509588316016 -> 140509588315824 + 140509588316016 [label=ToCopyBackward0] + 140509588315968 -> 140509588316016 + 140509591290880 [label="encoder.layer.11.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509591290880 -> 140509588315968 + 140509588315968 [label=AccumulateGrad] + 140509588315728 -> 140509588315824 + 140509588315728 [label=ViewBackward0] + 140509588316112 -> 140509588315728 + 140509588316112 [label=ViewBackward0] + 140509588345136 -> 140509588316112 + 140509588345136 [label=CloneBackward0] + 140509588345184 -> 140509588345136 + 140509588345184 [label=PermuteBackward0] + 140509588345424 -> 140509588345184 + 140509588345424 [label=UnsafeViewBackward0] + 140509588345616 -> 140509588345424 + 140509588345616 [label=BmmBackward0] + 140509588345664 -> 140509588345616 + 140509588345664 [label=ReshapeAliasBackward0] + 140509588346192 -> 140509588345664 + 140509588346192 [label=ExpandBackward0] + 140509588346288 -> 140509588346192 + 140509588346288 [label=ToCopyBackward0] + 140509588346480 -> 140509588346288 + 140509588346480 [label=NativeDropoutBackward0] + 140509588346672 -> 140509588346480 + 140509588346672 [label=SoftmaxBackward0] + 140509588346768 -> 140509588346672 + 140509588346768 [label=AddBackward0] + 140509588346960 -> 140509588346768 + 140509588346960 [label=DivBackward0] + 140509588347152 -> 140509588346960 + 140509588347152 [label=UnsafeViewBackward0] + 140509588347248 -> 140509588347152 + 140509588347248 [label=BmmBackward0] + 140509588347440 -> 140509588347248 + 140509588347440 [label=UnsafeViewBackward0] + 140509588347536 -> 140509588347440 + 140509588347536 [label=CloneBackward0] + 140509588347584 -> 140509588347536 + 140509588347584 [label=ExpandBackward0] + 140509588347824 -> 140509588347584 + 140509588347824 [label=PermuteBackward0] + 140509588348016 -> 140509588347824 + 140509588348016 [label=ViewBackward0] + 140509588348064 -> 140509588348016 + 140509588348064 [label=ViewBackward0] + 140509588348304 -> 140509588348064 + 140509588348304 
[label=AddmmBackward0] + 140509588348496 -> 140509588348304 + 140509588348496 [label=ToCopyBackward0] + 140509588348784 -> 140509588348496 + 140509591291680 [label="encoder.layer.11.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509591291680 -> 140509588348784 + 140509588348784 [label=AccumulateGrad] + 140509588348592 -> 140509588348304 + 140509588348592 [label=ViewBackward0] + 140509588348544 -> 140509588348592 + 140509588348544 [label=ToCopyBackward0] + 140509588315344 -> 140509588348544 + 140509588315344 [label=CatBackward0] + 140509588369568 -> 140509588315344 + 140509588369568 [label=SumBackward1] + 140509588370096 -> 140509588369568 + 140509588370096 [label=MulBackward0] + 140509588370192 -> 140509588370096 + 140509588370192 [label=CatBackward0] + 140509588370288 -> 140509588370192 + 140509588370288 [label=UnsqueezeBackward0] + 140509588370672 -> 140509588370288 + 140509588370672 [label=NativeLayerNormBackward0] + 140509588370864 -> 140509588370672 + 140509588370864 [label=AddBackward0] + 140509588371152 -> 140509588370864 + 140509588371152 [label=NativeDropoutBackward0] + 140509588371248 -> 140509588371152 + 140509588371248 [label=ViewBackward0] + 140509588371440 -> 140509588371248 + 140509588371440 [label=AddmmBackward0] + 140509588371488 -> 140509588371440 + 140509588371488 [label=ToCopyBackward0] + 140509588371920 -> 140509588371488 + 140509591285568 [label="encoder.layer.10.experts.experts.0.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509591285568 -> 140509588371920 + 140509588371920 [label=AccumulateGrad] + 140509588371632 -> 140509588371440 + 140509588371632 [label=ViewBackward0] + 140509588372112 -> 140509588371632 + 140509588372112 [label=GeluBackward0] + 140509588372304 -> 140509588372112 + 140509588372304 [label=ViewBackward0] + 140509588372496 -> 140509588372304 + 140509588372496 [label=AddmmBackward0] + 140509588372592 -> 140509588372496 + 140509588372592 [label=ToCopyBackward0] + 140509588372976 -> 140509588372592 + 140509591285488 [label="encoder.layer.10.experts.experts.0.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509591285488 -> 140509588372976 + 140509588372976 [label=AccumulateGrad] + 140509588372400 -> 140509588372496 + 140509588372400 [label=ViewBackward0] + 140509588372880 -> 140509588372400 + 140509588372880 [label=ToCopyBackward0] + 140509588370960 -> 140509588372880 + 140509588370960 [label=SliceBackward0] + 140509588373264 -> 140509588370960 + 140509588373264 [label=SliceBackward0] + 140509588373456 -> 140509588373264 + 140509588373456 [label=NativeLayerNormBackward0] + 140509588373360 -> 140509588373456 + 140509588373360 [label=AddBackward0] + 140509588402672 -> 140509588373360 + 140509588402672 [label=NativeDropoutBackward0] + 140509588402624 -> 140509588402672 + 140509588402624 [label=ViewBackward0] + 140509588402864 -> 140509588402624 + 140509588402864 [label=AddmmBackward0] + 140509588403056 -> 140509588402864 + 140509588403056 [label=ToCopyBackward0] + 140509588403344 -> 140509588403056 + 140509591293840 [label="encoder.layer.10.crossattention.output.dense.bias + (768)" fillcolor=lightblue] + 140509591293840 -> 140509588403344 + 140509588403344 [label=AccumulateGrad] + 140509588403152 -> 140509588402864 + 140509588403152 [label=ViewBackward0] + 140509588403632 -> 140509588403152 + 140509588403632 [label=ViewBackward0] + 140509588403728 -> 140509588403632 + 140509588403728 [label=CloneBackward0] + 140509588403920 -> 140509588403728 + 140509588403920 [label=PermuteBackward0] + 140509588404112 -> 
140509588403920 + 140509588404112 [label=UnsafeViewBackward0] + 140509588404208 -> 140509588404112 + 140509588404208 [label=BmmBackward0] + 140509588404400 -> 140509588404208 + 140509588404400 [label=ReshapeAliasBackward0] + 140509588404496 -> 140509588404400 + 140509588404496 [label=ExpandBackward0] + 140509588404544 -> 140509588404496 + 140509588404544 [label=ToCopyBackward0] + 140509588404784 -> 140509588404544 + 140509588404784 [label=NativeDropoutBackward0] + 140509588404976 -> 140509588404784 + 140509588404976 [label=SoftmaxBackward0] + 140509588405024 -> 140509588404976 + 140509588405024 [label=AddBackward0] + 140509588405264 -> 140509588405024 + 140509588405264 [label=DivBackward0] + 140509588405456 -> 140509588405264 + 140509588405456 [label=UnsafeViewBackward0] + 140509588405504 -> 140509588405456 + 140509588405504 [label=BmmBackward0] + 140509588405744 -> 140509588405504 + 140509588405744 [label=UnsafeViewBackward0] + 140509588406128 -> 140509588405744 + 140509588406128 [label=CloneBackward0] + 140509588405984 -> 140509588406128 + 140509588405984 [label=ExpandBackward0] + 140509588427056 -> 140509588405984 + 140509588427056 [label=PermuteBackward0] + 140509588427152 -> 140509588427056 + 140509588427152 [label=ViewBackward0] + 140509588427344 -> 140509588427152 + 140509588427344 [label=ViewBackward0] + 140509588427536 -> 140509588427344 + 140509588427536 [label=AddmmBackward0] + 140509588427632 -> 140509588427536 + 140509588427632 [label=ToCopyBackward0] + 140509588428016 -> 140509588427632 + 140509591312160 [label="encoder.layer.10.crossattention.self.query.bias + (768)" fillcolor=lightblue] + 140509591312160 -> 140509588428016 + 140509588428016 [label=AccumulateGrad] + 140509588427440 -> 140509588427536 + 140509588427440 [label=ViewBackward0] + 140509588427920 -> 140509588427440 + 140509588427920 [label=ToCopyBackward0] + 140509588402384 -> 140509588427920 + 140509588402384 [label=SliceBackward0] + 140509588428304 -> 140509588402384 + 140509588428304 [label=SliceBackward0] + 140509588428496 -> 140509588428304 + 140509588428496 [label=SliceBackward0] + 140509588428592 -> 140509588428496 + 140509588428592 [label=NativeLayerNormBackward0] + 140509588428784 -> 140509588428592 + 140509588428784 [label=AddBackward0] + 140509588429072 -> 140509588428784 + 140509588429072 [label=NativeDropoutBackward0] + 140509588429168 -> 140509588429072 + 140509588429168 [label=ViewBackward0] + 140509588429360 -> 140509588429168 + 140509588429360 [label=AddmmBackward0] + 140509588429408 -> 140509588429360 + 140509588429408 [label=ToCopyBackward0] + 140509588429840 -> 140509588429408 + 140509591312960 [label="encoder.layer.10.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509591312960 -> 140509588429840 + 140509588429840 [label=AccumulateGrad] + 140509588429552 -> 140509588429360 + 140509588429552 [label=ViewBackward0] + 140509588430032 -> 140509588429552 + 140509588430032 [label=ViewBackward0] + 140509588430224 -> 140509588430032 + 140509588430224 [label=CloneBackward0] + 140509588430416 -> 140509588430224 + 140509588430416 [label=PermuteBackward0] + 140509588430512 -> 140509588430416 + 140509588430512 [label=UnsafeViewBackward0] + 140509588430704 -> 140509588430512 + 140509588430704 [label=BmmBackward0] + 140509588430608 -> 140509588430704 + 140509588430608 [label=ReshapeAliasBackward0] + 140509588459728 -> 140509588430608 + 140509588459728 [label=ExpandBackward0] + 140509588459824 -> 140509588459728 + 140509588459824 [label=ToCopyBackward0] + 140509588460016 -> 140509588459824 + 
140509588460016 [label=NativeDropoutBackward0] + 140509588460064 -> 140509588460016 + 140509588460064 [label=SoftmaxBackward0] + 140509588460304 -> 140509588460064 + 140509588460304 [label=AddBackward0] + 140509588460496 -> 140509588460304 + 140509588460496 [label=DivBackward0] + 140509588460544 -> 140509588460496 + 140509588460544 [label=UnsafeViewBackward0] + 140509588460784 -> 140509588460544 + 140509588460784 [label=BmmBackward0] + 140509588460976 -> 140509588460784 + 140509588460976 [label=UnsafeViewBackward0] + 140509588461360 -> 140509588460976 + 140509588461360 [label=CloneBackward0] + 140509588461552 -> 140509588461360 + 140509588461552 [label=ExpandBackward0] + 140509588461648 -> 140509588461552 + 140509588461648 [label=PermuteBackward0] + 140509588461840 -> 140509588461648 + 140509588461840 [label=ViewBackward0] + 140509588462032 -> 140509588461840 + 140509588462032 [label=ViewBackward0] + 140509588462128 -> 140509588462032 + 140509588462128 [label=AddmmBackward0] + 140509588462320 -> 140509588462128 + 140509588462320 [label=ToCopyBackward0] + 140509588462608 -> 140509588462320 + 140509591313360 [label="encoder.layer.10.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509591313360 -> 140509588462608 + 140509588462608 [label=AccumulateGrad] + 140509588461984 -> 140509588462128 + 140509588461984 [label=ViewBackward0] + 140509588462464 -> 140509588461984 + 140509588462464 [label=ToCopyBackward0] + 140509588428880 -> 140509588462464 + 140509588428880 [label=CatBackward0] + 140509588462992 -> 140509588428880 + 140509588462992 [label=SumBackward1] + 140509588462944 -> 140509588462992 + 140509588462944 [label=MulBackward0] + 140509588463184 -> 140509588462944 + 140509588463184 [label=CatBackward0] + 140509588463568 -> 140509588463184 + 140509588463568 [label=UnsqueezeBackward0] + 140509588463424 -> 140509588463568 + 140509588463424 [label=NativeLayerNormBackward0] + 140509587960112 -> 140509588463424 + 140509587960112 [label=AddBackward0] + 140509587960400 -> 140509587960112 + 140509587960400 [label=NativeDropoutBackward0] + 140509587960784 -> 140509587960400 + 140509587960784 [label=ViewBackward0] + 140509587960976 -> 140509587960784 + 140509587960976 [label=AddmmBackward0] + 140509587961168 -> 140509587960976 + 140509587961168 [label=ToCopyBackward0] + 140509587961456 -> 140509587961168 + 140509591311680 [label="encoder.layer.9.experts.experts.0.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509591311680 -> 140509587961456 + 140509587961456 [label=AccumulateGrad] + 140509587960880 -> 140509587960976 + 140509587960880 [label=ViewBackward0] + 140509587961360 -> 140509587960880 + 140509587961360 [label=GeluBackward0] + 140509587961552 -> 140509587961360 + 140509587961552 [label=ViewBackward0] + 140509587961600 -> 140509587961552 + 140509587961600 [label=AddmmBackward0] + 140509587961840 -> 140509587961600 + 140509587961840 [label=ToCopyBackward0] + 140509587962080 -> 140509587961840 + 140509591312000 [label="encoder.layer.9.experts.experts.0.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509591312000 -> 140509587962080 + 140509587962080 [label=AccumulateGrad] + 140509587961936 -> 140509587961600 + 140509587961936 [label=ViewBackward0] + 140509587962416 -> 140509587961936 + 140509587962416 [label=ToCopyBackward0] + 140509587960496 -> 140509587962416 + 140509587960496 [label=SliceBackward0] + 140509587962512 -> 140509587960496 + 140509587962512 [label=SliceBackward0] + 140509587962560 -> 140509587962512 + 140509587962560 [label=SliceBackward0] + 
140509587962800 -> 140509587962560 + 140509587962800 [label=SliceBackward0] + 140509587962992 -> 140509587962800 + 140509587962992 [label=SliceBackward0] + 140509587963040 -> 140509587962992 + 140509587963040 [label=NativeLayerNormBackward0] + 140509587963280 -> 140509587963040 + 140509587963280 [label=AddBackward0] + 140509587963520 -> 140509587963280 + 140509587963520 [label=NativeDropoutBackward0] + 140509587963760 -> 140509587963520 + 140509587963760 [label=ViewBackward0] + 140509587988784 -> 140509587963760 + 140509587988784 [label=AddmmBackward0] + 140509587988976 -> 140509587988784 + 140509587988976 [label=ToCopyBackward0] + 140509587989264 -> 140509587988976 + 140509591321152 [label="encoder.layer.9.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509591321152 -> 140509587989264 + 140509587989264 [label=AccumulateGrad] + 140509587988640 -> 140509587988784 + 140509587988640 [label=ViewBackward0] + 140509587989120 -> 140509587988640 + 140509587989120 [label=ViewBackward0] + 140509587989360 -> 140509587989120 + 140509587989360 [label=CloneBackward0] + 140509587989552 -> 140509587989360 + 140509587989552 [label=PermuteBackward0] + 140509587989600 -> 140509587989552 + 140509587989600 [label=UnsafeViewBackward0] + 140509587989840 -> 140509587989600 + 140509587989840 [label=BmmBackward0] + 140509587990032 -> 140509587989840 + 140509587990032 [label=ReshapeAliasBackward0] + 140509587990416 -> 140509587990032 + 140509587990416 [label=ExpandBackward0] + 140509587990608 -> 140509587990416 + 140509587990608 [label=ToCopyBackward0] + 140509587990704 -> 140509587990608 + 140509587990704 [label=NativeDropoutBackward0] + 140509587990896 -> 140509587990704 + 140509587990896 [label=SoftmaxBackward0] + 140509587991088 -> 140509587990896 + 140509587991088 [label=AddBackward0] + 140509587991184 -> 140509587991088 + 140509587991184 [label=DivBackward0] + 140509587991376 -> 140509587991184 + 140509587991376 [label=UnsafeViewBackward0] + 140509587991568 -> 140509587991376 + 140509587991568 [label=BmmBackward0] + 140509587991664 -> 140509587991568 + 140509587991664 [label=UnsafeViewBackward0] + 140509587991760 -> 140509587991664 + 140509587991760 [label=CloneBackward0] + 140509587991952 -> 140509587991760 + 140509587991952 [label=ExpandBackward0] + 140509587992000 -> 140509587991952 + 140509587992000 [label=PermuteBackward0] + 140509587992240 -> 140509587992000 + 140509587992240 [label=ViewBackward0] + 140509587992432 -> 140509587992240 + 140509587992432 [label=ViewBackward0] + 140509587991520 -> 140509587992432 + 140509587991520 [label=AddmmBackward0] + 140509588021456 -> 140509587991520 + 140509588021456 [label=ToCopyBackward0] + 140509588021696 -> 140509588021456 + 140509591321952 [label="encoder.layer.9.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509591321952 -> 140509588021696 + 140509588021696 [label=AccumulateGrad] + 140509588021552 -> 140509587991520 + 140509588021552 [label=ViewBackward0] + 140509588022032 -> 140509588021552 + 140509588022032 [label=ToCopyBackward0] + 140509587963664 -> 140509588022032 + 140509587963664 [label=CatBackward0] + 140509588022128 -> 140509587963664 + 140509588022128 [label=SumBackward1] + 140509588022512 -> 140509588022128 + 140509588022512 [label=MulBackward0] + 140509588022704 -> 140509588022512 + 140509588022704 [label=CatBackward0] + 140509588022656 -> 140509588022704 + 140509588022656 [label=UnsqueezeBackward0] + 140509588023184 -> 140509588022656 + 140509588023184 [label=NativeLayerNormBackward0] + 140509588023280 -> 140509588023184 
+ 140509588023280 [label=AddBackward0] + 140509588023664 -> 140509588023280 + 140509588023664 [label=NativeDropoutBackward0] + 140509588023616 -> 140509588023664 + 140509588023616 [label=ViewBackward0] + 140509588023856 -> 140509588023616 + 140509588023856 [label=AddmmBackward0] + 140509588024048 -> 140509588023856 + 140509588024048 [label=ToCopyBackward0] + 140509588024336 -> 140509588024048 + 140509591320272 [label="encoder.layer.8.experts.experts.0.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509591320272 -> 140509588024336 + 140509588024336 [label=AccumulateGrad] + 140509588024144 -> 140509588023856 + 140509588024144 [label=ViewBackward0] + 140509588024624 -> 140509588024144 + 140509588024624 [label=GeluBackward0] + 140509588024720 -> 140509588024624 + 140509588024720 [label=ViewBackward0] + 140509588024912 -> 140509588024720 + 140509588024912 [label=AddmmBackward0] + 140509588025104 -> 140509588024912 + 140509588025104 [label=ToCopyBackward0] + 140509588025056 -> 140509588025104 + 140509591320192 [label="encoder.layer.8.experts.experts.0.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509591320192 -> 140509588025056 + 140509588025056 [label=AccumulateGrad] + 140509588024816 -> 140509588024912 + 140509588024816 [label=ViewBackward0] + 140509588025200 -> 140509588024816 + 140509588025200 [label=ToCopyBackward0] + 140509588023376 -> 140509588025200 + 140509588023376 [label=SliceBackward0] + 140509588046224 -> 140509588023376 + 140509588046224 [label=SliceBackward0] + 140509588046416 -> 140509588046224 + 140509588046416 [label=NativeLayerNormBackward0] + 140509588046608 -> 140509588046416 + 140509588046608 [label=AddBackward0] + 140509588046800 -> 140509588046608 + 140509588046800 [label=NativeDropoutBackward0] + 140509588047184 -> 140509588046800 + 140509588047184 [label=ViewBackward0] + 140509588047376 -> 140509588047184 + 140509588047376 [label=AddmmBackward0] + 140509588047568 -> 140509588047376 + 140509588047568 [label=ToCopyBackward0] + 140509588047856 -> 140509588047568 + 140509591341312 [label="encoder.layer.8.crossattention.output.dense.bias + (768)" fillcolor=lightblue] + 140509591341312 -> 140509588047856 + 140509588047856 [label=AccumulateGrad] + 140509588047280 -> 140509588047376 + 140509588047280 [label=ViewBackward0] + 140509588047760 -> 140509588047280 + 140509588047760 [label=ViewBackward0] + 140509588047952 -> 140509588047760 + 140509588047952 [label=CloneBackward0] + 140509588048000 -> 140509588047952 + 140509588048000 [label=PermuteBackward0] + 140509588048240 -> 140509588048000 + 140509588048240 [label=UnsafeViewBackward0] + 140509588048432 -> 140509588048240 + 140509588048432 [label=BmmBackward0] + 140509588048480 -> 140509588048432 + 140509588048480 [label=ReshapeAliasBackward0] + 140509588049008 -> 140509588048480 + 140509588049008 [label=ExpandBackward0] + 140509588049104 -> 140509588049008 + 140509588049104 [label=ToCopyBackward0] + 140509588049296 -> 140509588049104 + 140509588049296 [label=NativeDropoutBackward0] + 140509588049488 -> 140509588049296 + 140509588049488 [label=SoftmaxBackward0] + 140509588049584 -> 140509588049488 + 140509588049584 [label=AddBackward0] + 140509588049776 -> 140509588049584 + 140509588049776 [label=DivBackward0] + 140509588049680 -> 140509588049776 + 140509588049680 [label=UnsafeViewBackward0] + 140509588074656 -> 140509588049680 + 140509588074656 [label=BmmBackward0] + 140509588074896 -> 140509588074656 + 140509588074896 [label=UnsafeViewBackward0] + 140509588074992 -> 140509588074896 + 
140509588074992 [label=CloneBackward0] + 140509588075040 -> 140509588074992 + 140509588075040 [label=ExpandBackward0] + 140509588075280 -> 140509588075040 + 140509588075280 [label=PermuteBackward0] + 140509588075472 -> 140509588075280 + 140509588075472 [label=ViewBackward0] + 140509588075520 -> 140509588075472 + 140509588075520 [label=ViewBackward0] + 140509588075760 -> 140509588075520 + 140509588075760 [label=AddmmBackward0] + 140509588075952 -> 140509588075760 + 140509588075952 [label=ToCopyBackward0] + 140509588076240 -> 140509588075952 + 140509591342432 [label="encoder.layer.8.crossattention.self.query.bias + (768)" fillcolor=lightblue] + 140509591342432 -> 140509588076240 + 140509588076240 [label=AccumulateGrad] + 140509588076048 -> 140509588075760 + 140509588076048 [label=ViewBackward0] + 140509588076528 -> 140509588076048 + 140509588076528 [label=ToCopyBackward0] + 140509588046896 -> 140509588076528 + 140509588046896 [label=SliceBackward0] + 140509588076480 -> 140509588046896 + 140509588076480 [label=SliceBackward0] + 140509588076720 -> 140509588076480 + 140509588076720 [label=SliceBackward0] + 140509588076912 -> 140509588076720 + 140509588076912 [label=NativeLayerNormBackward0] + 140509588076960 -> 140509588076912 + 140509588076960 [label=AddBackward0] + 140509588077392 -> 140509588076960 + 140509588077392 [label=NativeDropoutBackward0] + 140509588077776 -> 140509588077392 + 140509588077776 [label=ViewBackward0] + 140509588077968 -> 140509588077776 + 140509588077968 [label=AddmmBackward0] + 140509588078064 -> 140509588077968 + 140509588078064 [label=ToCopyBackward0] + 140509588078448 -> 140509588078064 + 140509590823056 [label="encoder.layer.8.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590823056 -> 140509588078448 + 140509588078448 [label=AccumulateGrad] + 140509588077872 -> 140509588077968 + 140509588077872 [label=ViewBackward0] + 140509588078352 -> 140509588077872 + 140509588078352 [label=ViewBackward0] + 140509588078400 -> 140509588078352 + 140509588078400 [label=CloneBackward0] + 140509588078160 -> 140509588078400 + 140509588078160 [label=PermuteBackward0] + 140509588103472 -> 140509588078160 + 140509588103472 [label=UnsafeViewBackward0] + 140509588103520 -> 140509588103472 + 140509588103520 [label=BmmBackward0] + 140509588103760 -> 140509588103520 + 140509588103760 [label=ReshapeAliasBackward0] + 140509588104144 -> 140509588103760 + 140509588104144 [label=ExpandBackward0] + 140509588104336 -> 140509588104144 + 140509588104336 [label=ToCopyBackward0] + 140509588104528 -> 140509588104336 + 140509588104528 [label=NativeDropoutBackward0] + 140509588104624 -> 140509588104528 + 140509588104624 [label=SoftmaxBackward0] + 140509588104816 -> 140509588104624 + 140509588104816 [label=AddBackward0] + 140509588105008 -> 140509588104816 + 140509588105008 [label=DivBackward0] + 140509588105104 -> 140509588105008 + 140509588105104 [label=UnsafeViewBackward0] + 140509588105296 -> 140509588105104 + 140509588105296 [label=BmmBackward0] + 140509588105488 -> 140509588105296 + 140509588105488 [label=UnsafeViewBackward0] + 140509588105440 -> 140509588105488 + 140509588105440 [label=CloneBackward0] + 140509588105680 -> 140509588105440 + 140509588105680 [label=ExpandBackward0] + 140509588105872 -> 140509588105680 + 140509588105872 [label=PermuteBackward0] + 140509588105920 -> 140509588105872 + 140509588105920 [label=ViewBackward0] + 140509588106160 -> 140509588105920 + 140509588106160 [label=ViewBackward0] + 140509588106352 -> 140509588106160 + 140509588106352 
[label=AddmmBackward0] + 140509588106400 -> 140509588106352 + 140509588106400 [label=ToCopyBackward0] + 140509588106832 -> 140509588106400 + 140509590823536 [label="encoder.layer.8.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509590823536 -> 140509588106832 + 140509588106832 [label=AccumulateGrad] + 140509588106544 -> 140509588106352 + 140509588106544 [label=ViewBackward0] + 140509588107024 -> 140509588106544 + 140509588107024 [label=ToCopyBackward0] + 140509588077488 -> 140509588107024 + 140509588077488 [label=CatBackward0] + 140509588107120 -> 140509588077488 + 140509588107120 [label=SumBackward1] + 140509588136240 -> 140509588107120 + 140509588136240 [label=MulBackward0] + 140509588136432 -> 140509588136240 + 140509588136432 [label=CatBackward0] + 140509588136528 -> 140509588136432 + 140509588136528 [label=UnsqueezeBackward0] + 140509588136912 -> 140509588136528 + 140509588136912 [label=NativeLayerNormBackward0] + 140509588137104 -> 140509588136912 + 140509588137104 [label=AddBackward0] + 140509588137392 -> 140509588137104 + 140509588137392 [label=NativeDropoutBackward0] + 140509588137488 -> 140509588137392 + 140509588137488 [label=ViewBackward0] + 140509588137536 -> 140509588137488 + 140509588137536 [label=AddmmBackward0] + 140509588137776 -> 140509588137536 + 140509588137776 [label=ToCopyBackward0] + 140509588138016 -> 140509588137776 + 140509591341952 [label="encoder.layer.7.experts.experts.0.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509591341952 -> 140509588138016 + 140509588138016 [label=AccumulateGrad] + 140509588137872 -> 140509588137536 + 140509588137872 [label=ViewBackward0] + 140509588138352 -> 140509588137872 + 140509588138352 [label=GeluBackward0] + 140509588138544 -> 140509588138352 + 140509588138544 [label=ViewBackward0] + 140509588138640 -> 140509588138544 + 140509588138640 [label=AddmmBackward0] + 140509588138832 -> 140509588138640 + 140509588138832 [label=ToCopyBackward0] + 140509588139120 -> 140509588138832 + 140509591342272 [label="encoder.layer.7.experts.experts.0.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509591342272 -> 140509588139120 + 140509588139120 [label=AccumulateGrad] + 140509588138496 -> 140509588138640 + 140509588138496 [label=ViewBackward0] + 140509588138976 -> 140509588138496 + 140509588138976 [label=ToCopyBackward0] + 140509588137056 -> 140509588138976 + 140509588137056 [label=SliceBackward0] + 140509588139504 -> 140509588137056 + 140509588139504 [label=SliceBackward0] + 140509588139600 -> 140509588139504 + 140509588139600 [label=SliceBackward0] + 140509588139792 -> 140509588139600 + 140509588139792 [label=SliceBackward0] + 140509588139984 -> 140509588139792 + 140509588139984 [label=SliceBackward0] + 140509588139888 -> 140509588139984 + 140509588139888 [label=NativeLayerNormBackward0] + 140509588164912 -> 140509588139888 + 140509588164912 [label=AddBackward0] + 140509588165200 -> 140509588164912 + 140509588165200 [label=NativeDropoutBackward0] + 140509588165296 -> 140509588165200 + 140509588165296 [label=ViewBackward0] + 140509588165488 -> 140509588165296 + 140509588165488 [label=AddmmBackward0] + 140509588165536 -> 140509588165488 + 140509588165536 [label=ToCopyBackward0] + 140509588165968 -> 140509588165536 + 140509590839360 [label="encoder.layer.7.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590839360 -> 140509588165968 + 140509588165968 [label=AccumulateGrad] + 140509588165680 -> 140509588165488 + 140509588165680 [label=ViewBackward0] + 140509588166160 -> 140509588165680 + 
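// NOTE: the runs of stacked SliceBackward0 nodes above (five in a row) are the
// backward of chained tensor slicing on the layer input — e.g. selecting the
// query-token span of the hidden states, roughly hidden_states[:, :query_len, :],
// before it enters the expert FFN. The exact indices are not recoverable from
// the graph; this reading is inferred from the shapes and module names.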
140509588166160 [label=ViewBackward0] + 140509588166352 -> 140509588166160 + 140509588166352 [label=CloneBackward0] + 140509588166544 -> 140509588166352 + 140509588166544 [label=PermuteBackward0] + 140509588166640 -> 140509588166544 + 140509588166640 [label=UnsafeViewBackward0] + 140509588166832 -> 140509588166640 + 140509588166832 [label=BmmBackward0] + 140509588167024 -> 140509588166832 + 140509588167024 [label=ReshapeAliasBackward0] + 140509588166976 -> 140509588167024 + 140509588166976 [label=ExpandBackward0] + 140509588167216 -> 140509588166976 + 140509588167216 [label=ToCopyBackward0] + 140509588167408 -> 140509588167216 + 140509588167408 [label=NativeDropoutBackward0] + 140509588167456 -> 140509588167408 + 140509588167456 [label=SoftmaxBackward0] + 140509588167696 -> 140509588167456 + 140509588167696 [label=AddBackward0] + 140509588167888 -> 140509588167696 + 140509588167888 [label=DivBackward0] + 140509588167936 -> 140509588167888 + 140509588167936 [label=UnsafeViewBackward0] + 140509588168176 -> 140509588167936 + 140509588168176 [label=BmmBackward0] + 140509588168368 -> 140509588168176 + 140509588168368 [label=UnsafeViewBackward0] + 140509588168416 -> 140509588168368 + 140509588168416 [label=CloneBackward0] + 140509588193584 -> 140509588168416 + 140509588193584 [label=ExpandBackward0] + 140509588193680 -> 140509588193584 + 140509588193680 [label=PermuteBackward0] + 140509588193872 -> 140509588193680 + 140509588193872 [label=ViewBackward0] + 140509588194064 -> 140509588193872 + 140509588194064 [label=ViewBackward0] + 140509588194160 -> 140509588194064 + 140509588194160 [label=AddmmBackward0] + 140509588194352 -> 140509588194160 + 140509588194352 [label=ToCopyBackward0] + 140509588194640 -> 140509588194352 + 140509590840320 [label="encoder.layer.7.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509590840320 -> 140509588194640 + 140509588194640 [label=AccumulateGrad] + 140509588194016 -> 140509588194160 + 140509588194016 [label=ViewBackward0] + 140509588194496 -> 140509588194016 + 140509588194496 [label=ToCopyBackward0] + 140509588165008 -> 140509588194496 + 140509588165008 [label=CatBackward0] + 140509588195024 -> 140509588165008 + 140509588195024 [label=SumBackward1] + 140509588194976 -> 140509588195024 + 140509588194976 [label=MulBackward0] + 140509588195216 -> 140509588194976 + 140509588195216 [label=CatBackward0] + 140509588195600 -> 140509588195216 + 140509588195600 [label=UnsqueezeBackward0] + 140509588195696 -> 140509588195600 + 140509588195696 [label=NativeLayerNormBackward0] + 140509588195888 -> 140509588195696 + 140509588195888 [label=AddBackward0] + 140509588196176 -> 140509588195888 + 140509588196176 [label=NativeDropoutBackward0] + 140509588196560 -> 140509588196176 + 140509588196560 [label=ViewBackward0] + 140509588196752 -> 140509588196560 + 140509588196752 [label=AddmmBackward0] + 140509588196944 -> 140509588196752 + 140509588196944 [label=ToCopyBackward0] + 140509588197232 -> 140509588196944 + 140509590825776 [label="encoder.layer.6.experts.experts.0.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509590825776 -> 140509588197232 + 140509588197232 [label=AccumulateGrad] + 140509588196656 -> 140509588196752 + 140509588196656 [label=ViewBackward0] + 140509588197040 -> 140509588196656 + 140509588197040 [label=GeluBackward0] + 140509588196896 -> 140509588197040 + 140509588196896 [label=ViewBackward0] + 140509587696464 -> 140509588196896 + 140509587696464 [label=AddmmBackward0] + 140509587696368 -> 140509587696464 + 140509587696368 
[label=ToCopyBackward0] + 140509587693680 -> 140509587696368 + 140509590826256 [label="encoder.layer.6.experts.experts.0.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509590826256 -> 140509587693680 + 140509587693680 [label=AccumulateGrad] + 140509587696752 -> 140509587696464 + 140509587696752 [label=ViewBackward0] + 140509587693728 -> 140509587696752 + 140509587693728 [label=ToCopyBackward0] + 140509588196272 -> 140509587693728 + 140509588196272 [label=ViewBackward0] + 140509587693872 -> 140509588196272 + 140509587693872 [label=CloneBackward0] + 140509587694064 -> 140509587693872 + 140509587694064 [label=ExpandBackward0] + 140509587694112 -> 140509587694064 + 140509587694112 [label=UnsqueezeBackward0] + 140509587694352 -> 140509587694112 + 140509587694352 [label=SliceBackward0] + 140509587694544 -> 140509587694352 + 140509587694544 [label=SliceBackward0] + 140509587694592 -> 140509587694544 + 140509587694592 [label=NativeLayerNormBackward0] + 140509587694832 -> 140509587694592 + 140509587694832 [label=AddBackward0] + 140509587695072 -> 140509587694832 + 140509587695072 [label=NativeDropoutBackward0] + 140509587695408 -> 140509587695072 + 140509587695408 [label=ViewBackward0] + 140509587695600 -> 140509587695408 + 140509587695600 [label=AddmmBackward0] + 140509587697232 -> 140509587695600 + 140509587697232 [label=ToCopyBackward0] + 140509587696992 -> 140509587697232 + 140509590842480 [label="encoder.layer.6.crossattention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590842480 -> 140509587696992 + 140509587696992 [label=AccumulateGrad] + 140509587697472 -> 140509587695600 + 140509587697472 [label=ViewBackward0] + 140509587697616 -> 140509587697472 + 140509587697616 [label=ViewBackward0] + 140509587696272 -> 140509587697616 + 140509587696272 [label=CloneBackward0] + 140509587696944 -> 140509587696272 + 140509587696944 [label=PermuteBackward0] + 140509587696512 -> 140509587696944 + 140509587696512 [label=UnsafeViewBackward0] + 140509587695984 -> 140509587696512 + 140509587695984 [label=BmmBackward0] + 140509587696032 -> 140509587695984 + 140509587696032 [label=ReshapeAliasBackward0] + 140509587852640 -> 140509587696032 + 140509587852640 [label=ExpandBackward0] + 140509587852544 -> 140509587852640 + 140509587852544 [label=ToCopyBackward0] + 140509587852448 -> 140509587852544 + 140509587852448 [label=NativeDropoutBackward0] + 140509587852352 -> 140509587852448 + 140509587852352 [label=SoftmaxBackward0] + 140509587852256 -> 140509587852352 + 140509587852256 [label=AddBackward0] + 140509587852160 -> 140509587852256 + 140509587852160 [label=DivBackward0] + 140509587852064 -> 140509587852160 + 140509587852064 [label=UnsafeViewBackward0] + 140509587851968 -> 140509587852064 + 140509587851968 [label=BmmBackward0] + 140509587851872 -> 140509587851968 + 140509587851872 [label=ReshapeAliasBackward0] + 140509587851824 -> 140509587851872 + 140509587851824 [label=ExpandBackward0] + 140509587851728 -> 140509587851824 + 140509587851728 [label=PermuteBackward0] + 140509587851632 -> 140509587851728 + 140509587851632 [label=ViewBackward0] + 140509587851536 -> 140509587851632 + 140509587851536 [label=ViewBackward0] + 140509587851440 -> 140509587851536 + 140509587851440 [label=AddmmBackward0] + 140509587851344 -> 140509587851440 + 140509587851344 [label=ToCopyBackward0] + 140509587851152 -> 140509587851344 + 140509590843200 [label="encoder.layer.6.crossattention.self.query.bias + (768)" fillcolor=lightblue] + 140509590843200 -> 140509587851152 + 140509587851152 
[label=AccumulateGrad] + 140509587851296 -> 140509587851440 + 140509587851296 [label=ViewBackward0] + 140509587851008 -> 140509587851296 + 140509587851008 [label=ToCopyBackward0] + 140509587695120 -> 140509587851008 + 140509587695120 [label=SliceBackward0] + 140509587850960 -> 140509587695120 + 140509587850960 [label=SliceBackward0] + 140509587850864 -> 140509587850960 + 140509587850864 [label=SliceBackward0] + 140509587850768 -> 140509587850864 + 140509587850768 [label=NativeLayerNormBackward0] + 140509587850672 -> 140509587850768 + 140509587850672 [label=AddBackward0] + 140509587850480 -> 140509587850672 + 140509587850480 [label=NativeDropoutBackward0] + 140509587850240 -> 140509587850480 + 140509587850240 [label=ViewBackward0] + 140509587850144 -> 140509587850240 + 140509587850144 [label=AddmmBackward0] + 140509587850048 -> 140509587850144 + 140509587850048 [label=ToCopyBackward0] + 140509587849856 -> 140509587850048 + 140509590856064 [label="encoder.layer.6.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590856064 -> 140509587849856 + 140509587849856 [label=AccumulateGrad] + 140509587850192 -> 140509587850144 + 140509587850192 [label=ViewBackward0] + 140509587849904 -> 140509587850192 + 140509587849904 [label=ViewBackward0] + 140509587849808 -> 140509587849904 + 140509587849808 [label=CloneBackward0] + 140509587849712 -> 140509587849808 + 140509587849712 [label=PermuteBackward0] + 140509587849616 -> 140509587849712 + 140509587849616 [label=UnsafeViewBackward0] + 140509587849520 -> 140509587849616 + 140509587849520 [label=BmmBackward0] + 140509587849424 -> 140509587849520 + 140509587849424 [label=ReshapeAliasBackward0] + 140509587852976 -> 140509587849424 + 140509587852976 [label=ExpandBackward0] + 140509587853072 -> 140509587852976 + 140509587853072 [label=ToCopyBackward0] + 140509587853168 -> 140509587853072 + 140509587853168 [label=NativeDropoutBackward0] + 140509587853264 -> 140509587853168 + 140509587853264 [label=SoftmaxBackward0] + 140509587849280 -> 140509587853264 + 140509587849280 [label=AddBackward0] + 140509587558608 -> 140509587849280 + 140509587558608 [label=DivBackward0] + 140509587558704 -> 140509587558608 + 140509587558704 [label=UnsafeViewBackward0] + 140509587558800 -> 140509587558704 + 140509587558800 [label=BmmBackward0] + 140509587558896 -> 140509587558800 + 140509587558896 [label=ReshapeAliasBackward0] + 140509587559040 -> 140509587558896 + 140509587559040 [label=ExpandBackward0] + 140509587559136 -> 140509587559040 + 140509587559136 [label=PermuteBackward0] + 140509587559232 -> 140509587559136 + 140509587559232 [label=ViewBackward0] + 140509587559328 -> 140509587559232 + 140509587559328 [label=ViewBackward0] + 140509587559424 -> 140509587559328 + 140509587559424 [label=AddmmBackward0] + 140509587559520 -> 140509587559424 + 140509587559520 [label=ToCopyBackward0] + 140509587559712 -> 140509587559520 + 140509590856784 [label="encoder.layer.6.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509590856784 -> 140509587559712 + 140509587559712 [label=AccumulateGrad] + 140509587559472 -> 140509587559424 + 140509587559472 [label=ViewBackward0] + 140509587559760 -> 140509587559472 + 140509587559760 [label=ToCopyBackward0] + 140509587850432 -> 140509587559760 + 140509587850432 [label=CatBackward0] + 140509587559904 -> 140509587850432 + 140509587559904 [label=NativeLayerNormBackward0] + 140509587560048 -> 140509587559904 + 140509587560048 [label=AddBackward0] + 140509587560240 -> 140509587560048 + 140509587560240 [label=NativeDropoutBackward0] + 
140509587560384 -> 140509587560240 + 140509587560384 [label=ViewBackward0] + 140509587560480 -> 140509587560384 + 140509587560480 [label=AddmmBackward0] + 140509587560576 -> 140509587560480 + 140509587560576 [label=ToCopyBackward0] + 140509587560768 -> 140509587560576 + 140509590857264 [label="encoder.layer.5.experts.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509590857264 -> 140509587560768 + 140509587560768 [label=AccumulateGrad] + 140509587560528 -> 140509587560480 + 140509587560528 [label=ViewBackward0] + 140509587560816 -> 140509587560528 + 140509587560816 [label=GeluBackward0] + 140509587560912 -> 140509587560816 + 140509587560912 [label=ViewBackward0] + 140509587561008 -> 140509587560912 + 140509587561008 [label=AddmmBackward0] + 140509587561104 -> 140509587561008 + 140509587561104 [label=ToCopyBackward0] + 140509587561296 -> 140509587561104 + 140509590857504 [label="encoder.layer.5.experts.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509590857504 -> 140509587561296 + 140509587561296 [label=AccumulateGrad] + 140509587561056 -> 140509587561008 + 140509587561056 [label=ViewBackward0] + 140509587561344 -> 140509587561056 + 140509587561344 [label=ToCopyBackward0] + 140509587560192 -> 140509587561344 + 140509587560192 [label=SliceBackward0] + 140509587561488 -> 140509587560192 + 140509587561488 [label=SliceBackward0] + 140509587561584 -> 140509587561488 + 140509587561584 [label=SliceBackward0] + 140509587561680 -> 140509587561584 + 140509587561680 [label=SliceBackward0] + 140509587561776 -> 140509587561680 + 140509587561776 [label=SliceBackward0] + 140509587561872 -> 140509587561776 + 140509587561872 [label=NativeLayerNormBackward0] + 140509587561968 -> 140509587561872 + 140509587561968 [label=AddBackward0] + 140509587562160 -> 140509587561968 + 140509587562160 [label=NativeDropoutBackward0] + 140509587562304 -> 140509587562160 + 140509587562304 [label=ViewBackward0] + 140509587562400 -> 140509587562304 + 140509587562400 [label=AddmmBackward0] + 140509587562448 -> 140509587562400 + 140509587562448 [label=ToCopyBackward0] + 140509587570944 -> 140509587562448 + 140509590859424 [label="encoder.layer.5.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590859424 -> 140509587570944 + 140509587570944 [label=AccumulateGrad] + 140509587562208 -> 140509587562400 + 140509587562208 [label=ViewBackward0] + 140509587570992 -> 140509587562208 + 140509587570992 [label=ViewBackward0] + 140509587571136 -> 140509587570992 + 140509587571136 [label=CloneBackward0] + 140509587571232 -> 140509587571136 + 140509587571232 [label=PermuteBackward0] + 140509587571328 -> 140509587571232 + 140509587571328 [label=UnsafeViewBackward0] + 140509587571424 -> 140509587571328 + 140509587571424 [label=BmmBackward0] + 140509587571520 -> 140509587571424 + 140509587571520 [label=ReshapeAliasBackward0] + 140509587571664 -> 140509587571520 + 140509587571664 [label=ExpandBackward0] + 140509587571760 -> 140509587571664 + 140509587571760 [label=ToCopyBackward0] + 140509587571856 -> 140509587571760 + 140509587571856 [label=NativeDropoutBackward0] + 140509587571952 -> 140509587571856 + 140509587571952 [label=SoftmaxBackward0] + 140509587572048 -> 140509587571952 + 140509587572048 [label=AddBackward0] + 140509587572144 -> 140509587572048 + 140509587572144 [label=DivBackward0] + 140509587572240 -> 140509587572144 + 140509587572240 [label=UnsafeViewBackward0] + 140509587572336 -> 140509587572240 + 140509587572336 [label=BmmBackward0] + 140509587572432 -> 140509587572336 + 
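// NOTE: two parameter-naming patterns occur in this graph: an indexed
// "experts.experts.0.*" (one expert inside an expert list, as in layers 6-8
// above) and an unindexed "experts.*" (as in encoder.layer.5 here). The
// unindexed form suggests a single shared query FFN rather than a routed
// mixture in those layers — an inference from the names, not from code.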
140509587572432 [label=ReshapeAliasBackward0] + 140509587572576 -> 140509587572432 + 140509587572576 [label=ExpandBackward0] + 140509587572672 -> 140509587572576 + 140509587572672 [label=PermuteBackward0] + 140509587572768 -> 140509587572672 + 140509587572768 [label=ViewBackward0] + 140509587572864 -> 140509587572768 + 140509587572864 [label=ViewBackward0] + 140509587572960 -> 140509587572864 + 140509587572960 [label=AddmmBackward0] + 140509587573056 -> 140509587572960 + 140509587573056 [label=ToCopyBackward0] + 140509587573248 -> 140509587573056 + 140509590872528 [label="encoder.layer.5.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509590872528 -> 140509587573248 + 140509587573248 [label=AccumulateGrad] + 140509587573008 -> 140509587572960 + 140509587573008 [label=ViewBackward0] + 140509587573296 -> 140509587573008 + 140509587573296 [label=ToCopyBackward0] + 140509587562112 -> 140509587573296 + 140509587562112 [label=CatBackward0] + 140509587573440 -> 140509587562112 + 140509587573440 [label=NativeLayerNormBackward0] + 140509587573584 -> 140509587573440 + 140509587573584 [label=AddBackward0] + 140509587573776 -> 140509587573584 + 140509587573776 [label=NativeDropoutBackward0] + 140509587573920 -> 140509587573776 + 140509587573920 [label=ViewBackward0] + 140509587574016 -> 140509587573920 + 140509587574016 [label=AddmmBackward0] + 140509587574112 -> 140509587574016 + 140509587574112 [label=ToCopyBackward0] + 140509587574304 -> 140509587574112 + 140509590873008 [label="encoder.layer.4.experts.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509590873008 -> 140509587574304 + 140509587574304 [label=AccumulateGrad] + 140509587574064 -> 140509587574016 + 140509587574064 [label=ViewBackward0] + 140509587574352 -> 140509587574064 + 140509587574352 [label=GeluBackward0] + 140509587574448 -> 140509587574352 + 140509587574448 [label=ViewBackward0] + 140509587574544 -> 140509587574448 + 140509587574544 [label=AddmmBackward0] + 140509587574640 -> 140509587574544 + 140509587574640 [label=ToCopyBackward0] + 140509587574736 -> 140509587574640 + 140509590873248 [label="encoder.layer.4.experts.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509590873248 -> 140509587574736 + 140509587574736 [label=AccumulateGrad] + 140509587574592 -> 140509587574544 + 140509587574592 [label=ViewBackward0] + 140509587591232 -> 140509587574592 + 140509587591232 [label=ToCopyBackward0] + 140509587573728 -> 140509587591232 + 140509587573728 [label=SliceBackward0] + 140509587591472 -> 140509587573728 + 140509587591472 [label=SliceBackward0] + 140509587591568 -> 140509587591472 + 140509587591568 [label=NativeLayerNormBackward0] + 140509587591664 -> 140509587591568 + 140509587591664 [label=AddBackward0] + 140509587591856 -> 140509587591664 + 140509587591856 [label=NativeDropoutBackward0] + 140509587592000 -> 140509587591856 + 140509587592000 [label=ViewBackward0] + 140509587592096 -> 140509587592000 + 140509587592096 [label=AddmmBackward0] + 140509587592192 -> 140509587592096 + 140509587592192 [label=ToCopyBackward0] + 140509587592384 -> 140509587592192 + 140509590875168 [label="encoder.layer.4.crossattention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590875168 -> 140509587592384 + 140509587592384 [label=AccumulateGrad] + 140509587592144 -> 140509587592096 + 140509587592144 [label=ViewBackward0] + 140509587592432 -> 140509587592144 + 140509587592432 [label=ViewBackward0] + 140509587592528 -> 140509587592432 + 140509587592528 [label=CloneBackward0] + 140509587592624 -> 
140509587592528 + 140509587592624 [label=PermuteBackward0] + 140509587592720 -> 140509587592624 + 140509587592720 [label=UnsafeViewBackward0] + 140509587592816 -> 140509587592720 + 140509587592816 [label=BmmBackward0] + 140509587592912 -> 140509587592816 + 140509587592912 [label=ReshapeAliasBackward0] + 140509587593056 -> 140509587592912 + 140509587593056 [label=ExpandBackward0] + 140509587593152 -> 140509587593056 + 140509587593152 [label=ToCopyBackward0] + 140509587593248 -> 140509587593152 + 140509587593248 [label=NativeDropoutBackward0] + 140509587593344 -> 140509587593248 + 140509587593344 [label=SoftmaxBackward0] + 140509587593440 -> 140509587593344 + 140509587593440 [label=AddBackward0] + 140509587593536 -> 140509587593440 + 140509587593536 [label=DivBackward0] + 140509587593632 -> 140509587593536 + 140509587593632 [label=UnsafeViewBackward0] + 140509587593728 -> 140509587593632 + 140509587593728 [label=BmmBackward0] + 140509587593824 -> 140509587593728 + 140509587593824 [label=ReshapeAliasBackward0] + 140509587593968 -> 140509587593824 + 140509587593968 [label=ExpandBackward0] + 140509587594064 -> 140509587593968 + 140509587594064 [label=PermuteBackward0] + 140509587594160 -> 140509587594064 + 140509587594160 [label=ViewBackward0] + 140509587594256 -> 140509587594160 + 140509587594256 [label=ViewBackward0] + 140509587594352 -> 140509587594256 + 140509587594352 [label=AddmmBackward0] + 140509587594448 -> 140509587594352 + 140509587594448 [label=ToCopyBackward0] + 140509587594640 -> 140509587594448 + 140509590875888 [label="encoder.layer.4.crossattention.self.query.bias + (768)" fillcolor=lightblue] + 140509590875888 -> 140509587594640 + 140509587594640 [label=AccumulateGrad] + 140509587594400 -> 140509587594352 + 140509587594400 [label=ViewBackward0] + 140509587594688 -> 140509587594400 + 140509587594688 [label=ToCopyBackward0] + 140509587591808 -> 140509587594688 + 140509587591808 [label=SliceBackward0] + 140509587594832 -> 140509587591808 + 140509587594832 [label=SliceBackward0] + 140509587594928 -> 140509587594832 + 140509587594928 [label=SliceBackward0] + 140509587595024 -> 140509587594928 + 140509587595024 [label=NativeLayerNormBackward0] + 140509587595120 -> 140509587595024 + 140509587595120 [label=AddBackward0] + 140509587595216 -> 140509587595120 + 140509587595216 [label=NativeDropoutBackward0] + 140509587607808 -> 140509587595216 + 140509587607808 [label=ViewBackward0] + 140509587607904 -> 140509587607808 + 140509587607904 [label=AddmmBackward0] + 140509587608000 -> 140509587607904 + 140509587608000 [label=ToCopyBackward0] + 140509587608192 -> 140509587608000 + 140509590892848 [label="encoder.layer.4.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590892848 -> 140509587608192 + 140509587608192 [label=AccumulateGrad] + 140509587607952 -> 140509587607904 + 140509587607952 [label=ViewBackward0] + 140509587608240 -> 140509587607952 + 140509587608240 [label=ViewBackward0] + 140509587608336 -> 140509587608240 + 140509587608336 [label=CloneBackward0] + 140509587608432 -> 140509587608336 + 140509587608432 [label=PermuteBackward0] + 140509587608528 -> 140509587608432 + 140509587608528 [label=UnsafeViewBackward0] + 140509587608624 -> 140509587608528 + 140509587608624 [label=BmmBackward0] + 140509587608720 -> 140509587608624 + 140509587608720 [label=ReshapeAliasBackward0] + 140509587608864 -> 140509587608720 + 140509587608864 [label=ExpandBackward0] + 140509587608960 -> 140509587608864 + 140509587608960 [label=ToCopyBackward0] + 140509587609056 -> 140509587608960 + 
140509587609056 [label=NativeDropoutBackward0] + 140509587609152 -> 140509587609056 + 140509587609152 [label=SoftmaxBackward0] + 140509587609248 -> 140509587609152 + 140509587609248 [label=AddBackward0] + 140509587609344 -> 140509587609248 + 140509587609344 [label=DivBackward0] + 140509587609440 -> 140509587609344 + 140509587609440 [label=UnsafeViewBackward0] + 140509587609536 -> 140509587609440 + 140509587609536 [label=BmmBackward0] + 140509587609632 -> 140509587609536 + 140509587609632 [label=ReshapeAliasBackward0] + 140509587609776 -> 140509587609632 + 140509587609776 [label=ExpandBackward0] + 140509587609872 -> 140509587609776 + 140509587609872 [label=PermuteBackward0] + 140509587609968 -> 140509587609872 + 140509587609968 [label=ViewBackward0] + 140509587610064 -> 140509587609968 + 140509587610064 [label=ViewBackward0] + 140509587610160 -> 140509587610064 + 140509587610160 [label=AddmmBackward0] + 140509587610256 -> 140509587610160 + 140509587610256 [label=ToCopyBackward0] + 140509587610448 -> 140509587610256 + 140509590893568 [label="encoder.layer.4.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509590893568 -> 140509587610448 + 140509587610448 [label=AccumulateGrad] + 140509587610208 -> 140509587610160 + 140509587610208 [label=ViewBackward0] + 140509587610496 -> 140509587610208 + 140509587610496 [label=ToCopyBackward0] + 140509587607664 -> 140509587610496 + 140509587607664 [label=CatBackward0] + 140509587610640 -> 140509587607664 + 140509587610640 [label=NativeLayerNormBackward0] + 140509587610784 -> 140509587610640 + 140509587610784 [label=AddBackward0] + 140509587610976 -> 140509587610784 + 140509587610976 [label=NativeDropoutBackward0] + 140509587611120 -> 140509587610976 + 140509587611120 [label=ViewBackward0] + 140509587611216 -> 140509587611120 + 140509587611216 [label=AddmmBackward0] + 140509587611312 -> 140509587611216 + 140509587611312 [label=ToCopyBackward0] + 140509587611504 -> 140509587611312 + 140509590894048 [label="encoder.layer.3.experts.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509590894048 -> 140509587611504 + 140509587611504 [label=AccumulateGrad] + 140509587611264 -> 140509587611216 + 140509587611264 [label=ViewBackward0] + 140509587611552 -> 140509587611264 + 140509587611552 [label=GeluBackward0] + 140509587611408 -> 140509587611552 + 140509587611408 [label=ViewBackward0] + 140509587624096 -> 140509587611408 + 140509587624096 [label=AddmmBackward0] + 140509587624192 -> 140509587624096 + 140509587624192 [label=ToCopyBackward0] + 140509587624384 -> 140509587624192 + 140509590894288 [label="encoder.layer.3.experts.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509590894288 -> 140509587624384 + 140509587624384 [label=AccumulateGrad] + 140509587624144 -> 140509587624096 + 140509587624144 [label=ViewBackward0] + 140509587624432 -> 140509587624144 + 140509587624432 [label=ToCopyBackward0] + 140509587610928 -> 140509587624432 + 140509587610928 [label=SliceBackward0] + 140509587624576 -> 140509587610928 + 140509587624576 [label=SliceBackward0] + 140509587624672 -> 140509587624576 + 140509587624672 [label=SliceBackward0] + 140509587624768 -> 140509587624672 + 140509587624768 [label=SliceBackward0] + 140509587624864 -> 140509587624768 + 140509587624864 [label=SliceBackward0] + 140509587624960 -> 140509587624864 + 140509587624960 [label=NativeLayerNormBackward0] + 140509587625056 -> 140509587624960 + 140509587625056 [label=AddBackward0] + 140509587625248 -> 140509587625056 + 140509587625248 [label=NativeDropoutBackward0] + 
140509587625392 -> 140509587625248 + 140509587625392 [label=ViewBackward0] + 140509587625488 -> 140509587625392 + 140509587625488 [label=AddmmBackward0] + 140509587625584 -> 140509587625488 + 140509587625584 [label=ToCopyBackward0] + 140509587625776 -> 140509587625584 + 140509590896208 [label="encoder.layer.3.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590896208 -> 140509587625776 + 140509587625776 [label=AccumulateGrad] + 140509587625536 -> 140509587625488 + 140509587625536 [label=ViewBackward0] + 140509587625824 -> 140509587625536 + 140509587625824 [label=ViewBackward0] + 140509587625920 -> 140509587625824 + 140509587625920 [label=CloneBackward0] + 140509587626016 -> 140509587625920 + 140509587626016 [label=PermuteBackward0] + 140509587626112 -> 140509587626016 + 140509587626112 [label=UnsafeViewBackward0] + 140509587626208 -> 140509587626112 + 140509587626208 [label=BmmBackward0] + 140509587626304 -> 140509587626208 + 140509587626304 [label=ReshapeAliasBackward0] + 140509587626448 -> 140509587626304 + 140509587626448 [label=ExpandBackward0] + 140509587626544 -> 140509587626448 + 140509587626544 [label=ToCopyBackward0] + 140509587626640 -> 140509587626544 + 140509587626640 [label=NativeDropoutBackward0] + 140509587626736 -> 140509587626640 + 140509587626736 [label=SoftmaxBackward0] + 140509587626832 -> 140509587626736 + 140509587626832 [label=AddBackward0] + 140509587626928 -> 140509587626832 + 140509587626928 [label=DivBackward0] + 140509587627024 -> 140509587626928 + 140509587627024 [label=UnsafeViewBackward0] + 140509587627120 -> 140509587627024 + 140509587627120 [label=BmmBackward0] + 140509587627216 -> 140509587627120 + 140509587627216 [label=ReshapeAliasBackward0] + 140509587627360 -> 140509587627216 + 140509587627360 [label=ExpandBackward0] + 140509587627456 -> 140509587627360 + 140509587627456 [label=PermuteBackward0] + 140509587627552 -> 140509587627456 + 140509587627552 [label=ViewBackward0] + 140509587627648 -> 140509587627552 + 140509587627648 [label=ViewBackward0] + 140509587627744 -> 140509587627648 + 140509587627744 [label=AddmmBackward0] + 140509587627840 -> 140509587627744 + 140509587627840 [label=ToCopyBackward0] + 140509587627984 -> 140509587627840 + 140509590901120 [label="encoder.layer.3.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509590901120 -> 140509587627984 + 140509587627984 [label=AccumulateGrad] + 140509587627792 -> 140509587627744 + 140509587627792 [label=ViewBackward0] + 140509587627936 -> 140509587627792 + 140509587627936 [label=ToCopyBackward0] + 140509587625200 -> 140509587627936 + 140509587625200 [label=CatBackward0] + 140509587640576 -> 140509587625200 + 140509587640576 [label=NativeLayerNormBackward0] + 140509587640720 -> 140509587640576 + 140509587640720 [label=AddBackward0] + 140509587640912 -> 140509587640720 + 140509587640912 [label=NativeDropoutBackward0] + 140509587641056 -> 140509587640912 + 140509587641056 [label=ViewBackward0] + 140509587641152 -> 140509587641056 + 140509587641152 [label=AddmmBackward0] + 140509587641248 -> 140509587641152 + 140509587641248 [label=ToCopyBackward0] + 140509587641440 -> 140509587641248 + 140509590901600 [label="encoder.layer.2.experts.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509590901600 -> 140509587641440 + 140509587641440 [label=AccumulateGrad] + 140509587641200 -> 140509587641152 + 140509587641200 [label=ViewBackward0] + 140509587641488 -> 140509587641200 + 140509587641488 [label=GeluBackward0] + 140509587641584 -> 140509587641488 + 140509587641584 
[label=ViewBackward0] + 140509587641680 -> 140509587641584 + 140509587641680 [label=AddmmBackward0] + 140509587641776 -> 140509587641680 + 140509587641776 [label=ToCopyBackward0] + 140509587641968 -> 140509587641776 + 140509590901840 [label="encoder.layer.2.experts.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509590901840 -> 140509587641968 + 140509587641968 [label=AccumulateGrad] + 140509587641728 -> 140509587641680 + 140509587641728 [label=ViewBackward0] + 140509587642016 -> 140509587641728 + 140509587642016 [label=ToCopyBackward0] + 140509587640864 -> 140509587642016 + 140509587640864 [label=SliceBackward0] + 140509587642160 -> 140509587640864 + 140509587642160 [label=SliceBackward0] + 140509587642256 -> 140509587642160 + 140509587642256 [label=NativeLayerNormBackward0] + 140509587642352 -> 140509587642256 + 140509587642352 [label=AddBackward0] + 140509587642544 -> 140509587642352 + 140509587642544 [label=NativeDropoutBackward0] + 140509587642688 -> 140509587642544 + 140509587642688 [label=ViewBackward0] + 140509587642784 -> 140509587642688 + 140509587642784 [label=AddmmBackward0] + 140509587642880 -> 140509587642784 + 140509587642880 [label=ToCopyBackward0] + 140509587643072 -> 140509587642880 + 140509590903760 [label="encoder.layer.2.crossattention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590903760 -> 140509587643072 + 140509587643072 [label=AccumulateGrad] + 140509587642832 -> 140509587642784 + 140509587642832 [label=ViewBackward0] + 140509587643120 -> 140509587642832 + 140509587643120 [label=ViewBackward0] + 140509587643216 -> 140509587643120 + 140509587643216 [label=CloneBackward0] + 140509587643312 -> 140509587643216 + 140509587643312 [label=PermuteBackward0] + 140509587643408 -> 140509587643312 + 140509587643408 [label=UnsafeViewBackward0] + 140509587643504 -> 140509587643408 + 140509587643504 [label=BmmBackward0] + 140509587643600 -> 140509587643504 + 140509587643600 [label=ReshapeAliasBackward0] + 140509587643744 -> 140509587643600 + 140509587643744 [label=ExpandBackward0] + 140509587643840 -> 140509587643744 + 140509587643840 [label=ToCopyBackward0] + 140509587643936 -> 140509587643840 + 140509587643936 [label=NativeDropoutBackward0] + 140509587644032 -> 140509587643936 + 140509587644032 [label=SoftmaxBackward0] + 140509587644128 -> 140509587644032 + 140509587644128 [label=AddBackward0] + 140509587644224 -> 140509587644128 + 140509587644224 [label=DivBackward0] + 140509587644320 -> 140509587644224 + 140509587644320 [label=UnsafeViewBackward0] + 140509587644368 -> 140509587644320 + 140509587644368 [label=BmmBackward0] + 140509587656864 -> 140509587644368 + 140509587656864 [label=ReshapeAliasBackward0] + 140509587657008 -> 140509587656864 + 140509587657008 [label=ExpandBackward0] + 140509587657104 -> 140509587657008 + 140509587657104 [label=PermuteBackward0] + 140509587657200 -> 140509587657104 + 140509587657200 [label=ViewBackward0] + 140509587657296 -> 140509587657200 + 140509587657296 [label=ViewBackward0] + 140509587657392 -> 140509587657296 + 140509587657392 [label=AddmmBackward0] + 140509587657488 -> 140509587657392 + 140509587657488 [label=ToCopyBackward0] + 140509587657680 -> 140509587657488 + 140509590904480 [label="encoder.layer.2.crossattention.self.query.bias + (768)" fillcolor=lightblue] + 140509590904480 -> 140509587657680 + 140509587657680 [label=AccumulateGrad] + 140509587657440 -> 140509587657392 + 140509587657440 [label=ViewBackward0] + 140509587657728 -> 140509587657440 + 140509587657728 [label=ToCopyBackward0] + 
140509587642496 -> 140509587657728 + 140509587642496 [label=SliceBackward0] + 140509587657872 -> 140509587642496 + 140509587657872 [label=SliceBackward0] + 140509587657968 -> 140509587657872 + 140509587657968 [label=SliceBackward0] + 140509587658064 -> 140509587657968 + 140509587658064 [label=NativeLayerNormBackward0] + 140509587658160 -> 140509587658064 + 140509587658160 [label=AddBackward0] + 140509587658352 -> 140509587658160 + 140509587658352 [label=NativeDropoutBackward0] + 140509587658496 -> 140509587658352 + 140509587658496 [label=ViewBackward0] + 140509587658592 -> 140509587658496 + 140509587658592 [label=AddmmBackward0] + 140509587658688 -> 140509587658592 + 140509587658688 [label=ToCopyBackward0] + 140509587658880 -> 140509587658688 + 140509590913248 [label="encoder.layer.2.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590913248 -> 140509587658880 + 140509587658880 [label=AccumulateGrad] + 140509587658640 -> 140509587658592 + 140509587658640 [label=ViewBackward0] + 140509587658928 -> 140509587658640 + 140509587658928 [label=ViewBackward0] + 140509587659024 -> 140509587658928 + 140509587659024 [label=CloneBackward0] + 140509587659120 -> 140509587659024 + 140509587659120 [label=PermuteBackward0] + 140509587659216 -> 140509587659120 + 140509587659216 [label=UnsafeViewBackward0] + 140509587659312 -> 140509587659216 + 140509587659312 [label=BmmBackward0] + 140509587659408 -> 140509587659312 + 140509587659408 [label=ReshapeAliasBackward0] + 140509587659552 -> 140509587659408 + 140509587659552 [label=ExpandBackward0] + 140509587659648 -> 140509587659552 + 140509587659648 [label=ToCopyBackward0] + 140509587659744 -> 140509587659648 + 140509587659744 [label=NativeDropoutBackward0] + 140509587659840 -> 140509587659744 + 140509587659840 [label=SoftmaxBackward0] + 140509587659936 -> 140509587659840 + 140509587659936 [label=AddBackward0] + 140509587660032 -> 140509587659936 + 140509587660032 [label=DivBackward0] + 140509587660128 -> 140509587660032 + 140509587660128 [label=UnsafeViewBackward0] + 140509587660224 -> 140509587660128 + 140509587660224 [label=BmmBackward0] + 140509587660320 -> 140509587660224 + 140509587660320 [label=ReshapeAliasBackward0] + 140509587660464 -> 140509587660320 + 140509587660464 [label=ExpandBackward0] + 140509587660560 -> 140509587660464 + 140509587660560 [label=PermuteBackward0] + 140509587660656 -> 140509587660560 + 140509587660656 [label=ViewBackward0] + 140509587660752 -> 140509587660656 + 140509587660752 [label=ViewBackward0] + 140509587660368 -> 140509587660752 + 140509587660368 [label=AddmmBackward0] + 140509587673296 -> 140509587660368 + 140509587673296 [label=ToCopyBackward0] + 140509587673488 -> 140509587673296 + 140509590913968 [label="encoder.layer.2.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509590913968 -> 140509587673488 + 140509587673488 [label=AccumulateGrad] + 140509587673248 -> 140509587660368 + 140509587673248 [label=ViewBackward0] + 140509587673536 -> 140509587673248 + 140509587673536 [label=ToCopyBackward0] + 140509587658304 -> 140509587673536 + 140509587658304 [label=CatBackward0] + 140509587673680 -> 140509587658304 + 140509587673680 [label=NativeLayerNormBackward0] + 140509587673824 -> 140509587673680 + 140509587673824 [label=AddBackward0] + 140509587674016 -> 140509587673824 + 140509587674016 [label=NativeDropoutBackward0] + 140509587674160 -> 140509587674016 + 140509587674160 [label=ViewBackward0] + 140509587674256 -> 140509587674160 + 140509587674256 [label=AddmmBackward0] + 140509587674352 -> 
140509587674256 + 140509587674352 [label=ToCopyBackward0] + 140509587674544 -> 140509587674352 + 140509590914448 [label="encoder.layer.1.experts.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509590914448 -> 140509587674544 + 140509587674544 [label=AccumulateGrad] + 140509587674304 -> 140509587674256 + 140509587674304 [label=ViewBackward0] + 140509587674592 -> 140509587674304 + 140509587674592 [label=GeluBackward0] + 140509587674688 -> 140509587674592 + 140509587674688 [label=ViewBackward0] + 140509587674784 -> 140509587674688 + 140509587674784 [label=AddmmBackward0] + 140509587674880 -> 140509587674784 + 140509587674880 [label=ToCopyBackward0] + 140509587675072 -> 140509587674880 + 140509590914688 [label="encoder.layer.1.experts.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509590914688 -> 140509587675072 + 140509587675072 [label=AccumulateGrad] + 140509587674832 -> 140509587674784 + 140509587674832 [label=ViewBackward0] + 140509587675120 -> 140509587674832 + 140509587675120 [label=ToCopyBackward0] + 140509587673968 -> 140509587675120 + 140509587673968 [label=SliceBackward0] + 140509587675264 -> 140509587673968 + 140509587675264 [label=SliceBackward0] + 140509587675360 -> 140509587675264 + 140509587675360 [label=SliceBackward0] + 140509587675456 -> 140509587675360 + 140509587675456 [label=SliceBackward0] + 140509587675552 -> 140509587675456 + 140509587675552 [label=SliceBackward0] + 140509587675648 -> 140509587675552 + 140509587675648 [label=NativeLayerNormBackward0] + 140509587675744 -> 140509587675648 + 140509587675744 [label=AddBackward0] + 140509587675936 -> 140509587675744 + 140509587675936 [label=NativeDropoutBackward0] + 140509587676080 -> 140509587675936 + 140509587676080 [label=ViewBackward0] + 140509587676176 -> 140509587676080 + 140509587676176 [label=AddmmBackward0] + 140509587676272 -> 140509587676176 + 140509587676272 [label=ToCopyBackward0] + 140509587676464 -> 140509587676272 + 140509590916608 [label="encoder.layer.1.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590916608 -> 140509587676464 + 140509587676464 [label=AccumulateGrad] + 140509587676224 -> 140509587676176 + 140509587676224 [label=ViewBackward0] + 140509587676512 -> 140509587676224 + 140509587676512 [label=ViewBackward0] + 140509587676608 -> 140509587676512 + 140509587676608 [label=CloneBackward0] + 140509587676704 -> 140509587676608 + 140509587676704 [label=PermuteBackward0] + 140509587676800 -> 140509587676704 + 140509587676800 [label=UnsafeViewBackward0] + 140509587676896 -> 140509587676800 + 140509587676896 [label=BmmBackward0] + 140509587676992 -> 140509587676896 + 140509587676992 [label=ReshapeAliasBackward0] + 140509587677136 -> 140509587676992 + 140509587677136 [label=ExpandBackward0] + 140509587677040 -> 140509587677136 + 140509587677040 [label=ToCopyBackward0] + 140517615505616 -> 140509587677040 + 140517615505616 [label=NativeDropoutBackward0] + 140517615505712 -> 140517615505616 + 140517615505712 [label=SoftmaxBackward0] + 140517615505808 -> 140517615505712 + 140517615505808 [label=AddBackward0] + 140517615505904 -> 140517615505808 + 140517615505904 [label=DivBackward0] + 140517615506000 -> 140517615505904 + 140517615506000 [label=UnsafeViewBackward0] + 140517615506096 -> 140517615506000 + 140517615506096 [label=BmmBackward0] + 140517615506192 -> 140517615506096 + 140517615506192 [label=ReshapeAliasBackward0] + 140517615506336 -> 140517615506192 + 140517615506336 [label=ExpandBackward0] + 140517615506432 -> 140517615506336 + 140517615506432 
[label=PermuteBackward0] + 140517615506528 -> 140517615506432 + 140517615506528 [label=ViewBackward0] + 140517615506624 -> 140517615506528 + 140517615506624 [label=ViewBackward0] + 140517615506720 -> 140517615506624 + 140517615506720 [label=AddmmBackward0] + 140517615506816 -> 140517615506720 + 140517615506816 [label=ToCopyBackward0] + 140517615507008 -> 140517615506816 + 140509590933808 [label="encoder.layer.1.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509590933808 -> 140517615507008 + 140517615507008 [label=AccumulateGrad] + 140517615506768 -> 140517615506720 + 140517615506768 [label=ViewBackward0] + 140517615507056 -> 140517615506768 + 140517615507056 [label=ToCopyBackward0] + 140509587675888 -> 140517615507056 + 140509587675888 [label=CatBackward0] + 140517615507200 -> 140509587675888 + 140517615507200 [label=NativeLayerNormBackward0] + 140517615507344 -> 140517615507200 + 140517615507344 [label=AddBackward0] + 140517615507536 -> 140517615507344 + 140517615507536 [label=NativeDropoutBackward0] + 140517615507680 -> 140517615507536 + 140517615507680 [label=ViewBackward0] + 140517615507776 -> 140517615507680 + 140517615507776 [label=AddmmBackward0] + 140517615507872 -> 140517615507776 + 140517615507872 [label=ToCopyBackward0] + 140517615508064 -> 140517615507872 + 140509590934288 [label="encoder.layer.0.experts.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509590934288 -> 140517615508064 + 140517615508064 [label=AccumulateGrad] + 140517615507824 -> 140517615507776 + 140517615507824 [label=ViewBackward0] + 140517615508112 -> 140517615507824 + 140517615508112 [label=GeluBackward0] + 140517615508208 -> 140517615508112 + 140517615508208 [label=ViewBackward0] + 140517615508304 -> 140517615508208 + 140517615508304 [label=AddmmBackward0] + 140517615508400 -> 140517615508304 + 140517615508400 [label=ToCopyBackward0] + 140517615508592 -> 140517615508400 + 140509590934528 [label="encoder.layer.0.experts.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509590934528 -> 140517615508592 + 140517615508592 [label=AccumulateGrad] + 140517615508352 -> 140517615508304 + 140517615508352 [label=ViewBackward0] + 140517615508640 -> 140517615508352 + 140517615508640 [label=ToCopyBackward0] + 140517615507488 -> 140517615508640 + 140517615507488 [label=SliceBackward0] + 140517615508784 -> 140517615507488 + 140517615508784 [label=SliceBackward0] + 140517615508880 -> 140517615508784 + 140517615508880 [label=NativeLayerNormBackward0] + 140517615508976 -> 140517615508880 + 140517615508976 [label=AddBackward0] + 140517615509168 -> 140517615508976 + 140517615509168 [label=NativeDropoutBackward0] + 140517615509312 -> 140517615509168 + 140517615509312 [label=ViewBackward0] + 140517615509408 -> 140517615509312 + 140517615509408 [label=AddmmBackward0] + 140517615509456 -> 140517615509408 + 140517615509456 [label=ToCopyBackward0] + 140517615522048 -> 140517615509456 + 140509590936448 [label="encoder.layer.0.crossattention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590936448 -> 140517615522048 + 140517615522048 [label=AccumulateGrad] + 140517615509216 -> 140517615509408 + 140517615509216 [label=ViewBackward0] + 140517615522096 -> 140517615509216 + 140517615522096 [label=ViewBackward0] + 140517615522192 -> 140517615522096 + 140517615522192 [label=CloneBackward0] + 140517615522288 -> 140517615522192 + 140517615522288 [label=PermuteBackward0] + 140517615522384 -> 140517615522288 + 140517615522384 [label=UnsafeViewBackward0] + 140517615522480 -> 140517615522384 + 
140517615522480 [label=BmmBackward0] + 140517615522576 -> 140517615522480 + 140517615522576 [label=ReshapeAliasBackward0] + 140517615522720 -> 140517615522576 + 140517615522720 [label=ExpandBackward0] + 140517615522816 -> 140517615522720 + 140517615522816 [label=ToCopyBackward0] + 140517615522912 -> 140517615522816 + 140517615522912 [label=NativeDropoutBackward0] + 140517615523008 -> 140517615522912 + 140517615523008 [label=SoftmaxBackward0] + 140517615523104 -> 140517615523008 + 140517615523104 [label=AddBackward0] + 140517615523200 -> 140517615523104 + 140517615523200 [label=DivBackward0] + 140517615523296 -> 140517615523200 + 140517615523296 [label=UnsafeViewBackward0] + 140517615523392 -> 140517615523296 + 140517615523392 [label=BmmBackward0] + 140517615523488 -> 140517615523392 + 140517615523488 [label=ReshapeAliasBackward0] + 140517615523632 -> 140517615523488 + 140517615523632 [label=ExpandBackward0] + 140517615523728 -> 140517615523632 + 140517615523728 [label=PermuteBackward0] + 140517615523824 -> 140517615523728 + 140517615523824 [label=ViewBackward0] + 140517615523920 -> 140517615523824 + 140517615523920 [label=ViewBackward0] + 140517615524016 -> 140517615523920 + 140517615524016 [label=AddmmBackward0] + 140517615524112 -> 140517615524016 + 140517615524112 [label=ToCopyBackward0] + 140517615524304 -> 140517615524112 + 140509590937168 [label="encoder.layer.0.crossattention.self.query.bias + (768)" fillcolor=lightblue] + 140509590937168 -> 140517615524304 + 140517615524304 [label=AccumulateGrad] + 140517615524064 -> 140517615524016 + 140517615524064 [label=ViewBackward0] + 140517615524352 -> 140517615524064 + 140517615524352 [label=ToCopyBackward0] + 140517615509120 -> 140517615524352 + 140517615509120 [label=SliceBackward0] + 140517615524496 -> 140517615509120 + 140517615524496 [label=SliceBackward0] + 140517615524592 -> 140517615524496 + 140517615524592 [label=SliceBackward0] + 140517615524688 -> 140517615524592 + 140517615524688 [label=NativeLayerNormBackward0] + 140517615524784 -> 140517615524688 + 140517615524784 [label=AddBackward0] + 140517615524976 -> 140517615524784 + 140517615524976 [label=NativeDropoutBackward0] + 140517615525120 -> 140517615524976 + 140517615525120 [label=ViewBackward0] + 140517615525216 -> 140517615525120 + 140517615525216 [label=AddmmBackward0] + 140517615525312 -> 140517615525216 + 140517615525312 [label=ToCopyBackward0] + 140517615525504 -> 140517615525312 + 140509590945936 [label="encoder.layer.0.attention.output.dense.bias + (768)" fillcolor=lightblue] + 140509590945936 -> 140517615525504 + 140517615525504 [label=AccumulateGrad] + 140517615525264 -> 140517615525216 + 140517615525264 [label=ViewBackward0] + 140517615525552 -> 140517615525264 + 140517615525552 [label=ViewBackward0] + 140517615525648 -> 140517615525552 + 140517615525648 [label=CloneBackward0] + 140517615525744 -> 140517615525648 + 140517615525744 [label=PermuteBackward0] + 140517615525840 -> 140517615525744 + 140517615525840 [label=UnsafeViewBackward0] + 140517615525456 -> 140517615525840 + 140517615525456 [label=BmmBackward0] + 140517615538384 -> 140517615525456 + 140517615538384 [label=ReshapeAliasBackward0] + 140517615538528 -> 140517615538384 + 140517615538528 [label=ExpandBackward0] + 140517615538624 -> 140517615538528 + 140517615538624 [label=ToCopyBackward0] + 140517615538720 -> 140517615538624 + 140517615538720 [label=NativeDropoutBackward0] + 140517615538816 -> 140517615538720 + 140517615538816 [label=SoftmaxBackward0] + 140517615538912 -> 140517615538816 + 140517615538912 
[label=AddBackward0] + 140517615539008 -> 140517615538912 + 140517615539008 [label=DivBackward0] + 140517615539104 -> 140517615539008 + 140517615539104 [label=UnsafeViewBackward0] + 140517615539200 -> 140517615539104 + 140517615539200 [label=BmmBackward0] + 140517615539296 -> 140517615539200 + 140517615539296 [label=ReshapeAliasBackward0] + 140517615539440 -> 140517615539296 + 140517615539440 [label=ExpandBackward0] + 140517615539536 -> 140517615539440 + 140517615539536 [label=PermuteBackward0] + 140517615539632 -> 140517615539536 + 140517615539632 [label=ViewBackward0] + 140517615539728 -> 140517615539632 + 140517615539728 [label=ViewBackward0] + 140517615539824 -> 140517615539728 + 140517615539824 [label=AddmmBackward0] + 140517615539920 -> 140517615539824 + 140517615539920 [label=ToCopyBackward0] + 140517615540112 -> 140517615539920 + 140509590600896 [label="encoder.layer.0.attention.self.query.bias + (768)" fillcolor=lightblue] + 140509590600896 -> 140517615540112 + 140517615540112 [label=AccumulateGrad] + 140517615539872 -> 140517615539824 + 140517615539872 [label=ViewBackward0] + 140517615540160 -> 140517615539872 + 140517615540160 [label=ToCopyBackward0] + 140517615524928 -> 140517615540160 + 140517615524928 [label=NativeDropoutBackward0] + 140517615540304 -> 140517615524928 + 140517615540304 [label=NativeLayerNormBackward0] + 140517615540400 -> 140517615540304 + 140517615540400 [label=CatBackward0] + 140517615540592 -> 140517615540400 + 140517615540592 [label=ExpandBackward0] + 140517615540736 -> 140517615540592 + 140509590947296 [label=" + (1, 32, 768)" fillcolor=lightblue] + 140509590947296 -> 140517615540736 + 140517615540736 [label=AccumulateGrad] + 140517615540544 -> 140517615540400 + 140517615540544 [label=AddBackward0] + 140517615540784 -> 140517615540544 + 140517615540784 [label=EmbeddingBackward0] + 140517615540928 -> 140517615540784 + 140509590947856 [label="embeddings.word_embeddings.weight + (30523, 768)" fillcolor=lightblue] + 140509590947856 -> 140517615540928 + 140517615540928 [label=AccumulateGrad] + 140517615540832 -> 140517615540544 + 140517615540832 [label=EmbeddingBackward0] + 140517615540976 -> 140517615540832 + 140509939919504 [label="embeddings.position_embeddings.weight + (512, 768)" fillcolor=lightblue] + 140509939919504 -> 140517615540976 + 140517615540976 [label=AccumulateGrad] + 140517615540352 -> 140517615540304 + 140509590958304 [label="embeddings.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590958304 -> 140517615540352 + 140517615540352 [label=AccumulateGrad] + 140517615540016 -> 140517615540304 + 140509590946656 [label="embeddings.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590946656 -> 140517615540016 + 140517615540016 [label=AccumulateGrad] + 140517615539344 -> 140517615539824 + 140517615539344 [label=TBackward0] + 140517615540064 -> 140517615539344 + 140517615540064 [label=ToCopyBackward0] + 140517615540496 -> 140517615540064 + 140509986890912 [label="encoder.layer.0.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509986890912 -> 140517615540496 + 140517615540496 [label=AccumulateGrad] + 140517615539248 -> 140517615539200 + 140517615539248 [label=ReshapeAliasBackward0] + 140517615539584 -> 140517615539248 + 140517615539584 [label=ExpandBackward0] + 140517615539776 -> 140517615539584 + 140517615539776 [label=TransposeBackward0] + 140517615540256 -> 140517615539776 + 140517615540256 [label=PermuteBackward0] + 140517615541024 -> 140517615540256 + 140517615541024 [label=ViewBackward0] + 140517615540208 -> 
140517615541024 + 140517615540208 [label=ViewBackward0] + 140517615540640 -> 140517615540208 + 140517615540640 [label=AddmmBackward0] + 140517615541120 -> 140517615540640 + 140517615541120 [label=ToCopyBackward0] + 140517615541312 -> 140517615541120 + 140509590946096 [label="encoder.layer.0.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509590946096 -> 140517615541312 + 140517615541312 [label=AccumulateGrad] + 140517615540880 -> 140517615540640 + 140517615540880 [label=ViewBackward0] + 140517615541360 -> 140517615540880 + 140517615541360 [label=ToCopyBackward0] + 140517615524928 -> 140517615541360 + 140517615539392 -> 140517615540640 + 140517615539392 [label=TBackward0] + 140517615541216 -> 140517615539392 + 140517615541216 [label=ToCopyBackward0] + 140517615541504 -> 140517615541216 + 140509590600816 [label="encoder.layer.0.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509590600816 -> 140517615541504 + 140517615541504 [label=AccumulateGrad] + 140517615538336 -> 140517615525456 + 140517615538336 [label=ReshapeAliasBackward0] + 140517615538672 -> 140517615538336 + 140517615538672 [label=ExpandBackward0] + 140517615538864 -> 140517615538672 + 140517615538864 [label=PermuteBackward0] + 140517615539056 -> 140517615538864 + 140517615539056 [label=ViewBackward0] + 140517615538432 -> 140517615539056 + 140517615538432 [label=ViewBackward0] + 140517615539680 -> 140517615538432 + 140517615539680 [label=AddmmBackward0] + 140517615540448 -> 140517615539680 + 140517615540448 [label=ToCopyBackward0] + 140517615541456 -> 140517615540448 + 140509590945856 [label="encoder.layer.0.attention.self.value.bias + (768)" fillcolor=lightblue] + 140509590945856 -> 140517615541456 + 140517615541456 [label=AccumulateGrad] + 140517615539968 -> 140517615539680 + 140517615539968 [label=ViewBackward0] + 140517615541264 -> 140517615539968 + 140517615541264 [label=ToCopyBackward0] + 140517615524928 -> 140517615541264 + 140517615538480 -> 140517615539680 + 140517615538480 [label=TBackward0] + 140517615541072 -> 140517615538480 + 140517615541072 [label=ToCopyBackward0] + 140517615541408 -> 140517615541072 + 140509590946176 [label="encoder.layer.0.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509590946176 -> 140517615541408 + 140517615541408 [label=AccumulateGrad] + 140517615525024 -> 140517615525216 + 140517615525024 [label=TBackward0] + 140517615525696 -> 140517615525024 + 140517615525696 [label=ToCopyBackward0] + 140517615525792 -> 140517615525696 + 140509987117712 [label="encoder.layer.0.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509987117712 -> 140517615525792 + 140517615525792 [label=AccumulateGrad] + 140517615524928 -> 140517615524784 + 140517615524736 -> 140517615524688 + 140509590937328 [label="encoder.layer.0.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590937328 -> 140517615524736 + 140517615524736 [label=AccumulateGrad] + 140517615524208 -> 140517615524688 + 140509590937408 [label="encoder.layer.0.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590937408 -> 140517615524208 + 140517615524208 [label=AccumulateGrad] + 140517615523536 -> 140517615524016 + 140517615523536 [label=TBackward0] + 140517615524256 -> 140517615523536 + 140517615524256 [label=ToCopyBackward0] + 140517615524640 -> 140517615524256 + 140509590937088 [label="encoder.layer.0.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590937088 -> 140517615524640 + 140517615524640 [label=AccumulateGrad] + 
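// NOTE: the subgraph just above bottoms out at the model inputs: a learned
// (1, 32, 768) query/prompt parameter is expanded to the batch size and
// concatenated with word + position embeddings, then embeddings.LayerNorm and
// dropout are applied before encoder.layer.0. A minimal sketch (hypothetical
// names):
//
//   import torch
//   import torch.nn.functional as F
//   emb = word_embeddings(input_ids) + position_embeddings(position_ids)     # Embedding + Add
//   x   = torch.cat([query_tokens.expand(emb.size(0), -1, -1), emb], dim=1)  # Expand + Cat
//   x   = F.dropout(F.layer_norm(x, (768,), ln_w, ln_b))                     # LayerNorm + Dropout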
140517615523440 -> 140517615523392 + 140517615523440 [label=ReshapeAliasBackward0] + 140517615523776 -> 140517615523440 + 140517615523776 [label=ExpandBackward0] + 140517615523968 -> 140517615523776 + 140517615523968 [label=TransposeBackward0] + 140517615524448 -> 140517615523968 + 140517615524448 [label=PermuteBackward0] + 140517615524880 -> 140517615524448 + 140517615524880 [label=ViewBackward0] + 140517615524400 -> 140517615524880 + 140517615524400 [label=ViewBackward0] + 140517615525168 -> 140517615524400 + 140517615525168 [label=AddmmBackward0] + 140517615525408 -> 140517615525168 + 140517615525408 [label=ToCopyBackward0] + 140517615538288 -> 140517615525408 + 140509590936928 [label="encoder.layer.0.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140509590936928 -> 140517615538288 + 140517615538288 [label=AccumulateGrad] + 140517615525360 -> 140517615525168 + 140517615525360 [label=ViewBackward0] + 140517615538768 -> 140517615525360 + 140517615538768 [label=ToCopyBackward0] + 140517615539152 -> 140517615538768 + 140517615539152 [label=NativeLayerNormBackward0] + 140517615540688 -> 140517615539152 + 140509590598736 [label=" + (1408)" fillcolor=lightblue] + 140509590598736 -> 140517615540688 + 140517615540688 [label=AccumulateGrad] + 140517615539488 -> 140517615539152 + 140509590598976 [label=" + (1408)" fillcolor=lightblue] + 140509590598976 -> 140517615539488 + 140517615539488 [label=AccumulateGrad] + 140517615523584 -> 140517615525168 + 140517615523584 [label=TBackward0] + 140517615538240 -> 140517615523584 + 140517615538240 [label=ToCopyBackward0] + 140517615541168 -> 140517615538240 + 140509590936848 [label="encoder.layer.0.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140509590936848 -> 140517615541168 + 140517615541168 [label=AccumulateGrad] + 140517615522528 -> 140517615522480 + 140517615522528 [label=ReshapeAliasBackward0] + 140517615522864 -> 140517615522528 + 140517615522864 [label=ExpandBackward0] + 140517615523056 -> 140517615522864 + 140517615523056 [label=PermuteBackward0] + 140517615523248 -> 140517615523056 + 140517615523248 [label=ViewBackward0] + 140517615522624 -> 140517615523248 + 140517615522624 [label=ViewBackward0] + 140517615523872 -> 140517615522624 + 140517615523872 [label=AddmmBackward0] + 140517615524544 -> 140517615523872 + 140517615524544 [label=ToCopyBackward0] + 140517615525600 -> 140517615524544 + 140509590936688 [label="encoder.layer.0.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140509590936688 -> 140517615525600 + 140517615525600 [label=AccumulateGrad] + 140517615524160 -> 140517615523872 + 140517615524160 [label=ViewBackward0] + 140517615525072 -> 140517615524160 + 140517615525072 [label=ToCopyBackward0] + 140517615539152 -> 140517615525072 + 140517615522672 -> 140517615523872 + 140517615522672 [label=TBackward0] + 140517615538576 -> 140517615522672 + 140517615538576 [label=ToCopyBackward0] + 140517615538960 -> 140517615538576 + 140509590936608 [label="encoder.layer.0.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140509590936608 -> 140517615538960 + 140517615538960 [label=AccumulateGrad] + 140517615521856 -> 140517615509408 + 140517615521856 [label=TBackward0] + 140517615522240 -> 140517615521856 + 140517615522240 [label=ToCopyBackward0] + 140517615522432 -> 140517615522240 + 140509590936368 [label="encoder.layer.0.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509590936368 -> 140517615522432 + 140517615522432 [label=AccumulateGrad] + 
140517615509120 -> 140517615508976 + 140517615508928 -> 140517615508880 + 140509590936128 [label="encoder.layer.0.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590936128 -> 140517615508928 + 140517615508928 [label=AccumulateGrad] + 140517615508496 -> 140517615508880 + 140509590936208 [label="encoder.layer.0.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590936208 -> 140517615508496 + 140517615508496 [label=AccumulateGrad] + 140517615508016 -> 140517615508304 + 140517615508016 [label=TBackward0] + 140517615508544 -> 140517615508016 + 140517615508544 [label=ToCopyBackward0] + 140517615509024 -> 140517615508544 + 140509590934448 [label="encoder.layer.0.experts.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590934448 -> 140517615509024 + 140517615509024 [label=AccumulateGrad] + 140517615507584 -> 140517615507776 + 140517615507584 [label=TBackward0] + 140517615508256 -> 140517615507584 + 140517615508256 [label=ToCopyBackward0] + 140517615508736 -> 140517615508256 + 140509590934208 [label="encoder.layer.0.experts.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590934208 -> 140517615508736 + 140517615508736 [label=AccumulateGrad] + 140517615507488 -> 140517615507344 + 140517615507296 -> 140517615507200 + 140509590933968 [label="encoder.layer.0.experts.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590933968 -> 140517615507296 + 140517615507296 [label=AccumulateGrad] + 140517615507248 -> 140517615507200 + 140509590934048 [label="encoder.layer.0.experts.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590934048 -> 140517615507248 + 140517615507248 [label=AccumulateGrad] + 140517615506960 -> 140509587675888 + 140517615506960 [label=NativeLayerNormBackward0] + 140517615507632 -> 140517615506960 + 140517615507632 [label=AddBackward0] + 140517615508448 -> 140517615507632 + 140517615508448 [label=NativeDropoutBackward0] + 140517615508160 -> 140517615508448 + 140517615508160 [label=ViewBackward0] + 140517615508688 -> 140517615508160 + 140517615508688 [label=AddmmBackward0] + 140517615509360 -> 140517615508688 + 140517615509360 [label=ToCopyBackward0] + 140517615522000 -> 140517615509360 + 140509590935728 [label="encoder.layer.0.output.dense.bias + (768)" fillcolor=lightblue] + 140509590935728 -> 140517615522000 + 140517615522000 [label=AccumulateGrad] + 140517615509264 -> 140517615508688 + 140517615509264 [label=ViewBackward0] + 140517615522144 -> 140517615509264 + 140517615522144 [label=GeluBackward0] + 140517615523152 -> 140517615522144 + 140517615523152 [label=ViewBackward0] + 140517615523680 -> 140517615523152 + 140517615523680 [label=AddmmBackward0] + 140517615524832 -> 140517615523680 + 140517615524832 [label=ToCopyBackward0] + 140517615541552 -> 140517615524832 + 140509590935968 [label="encoder.layer.0.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509590935968 -> 140517615541552 + 140517615541552 [label=AccumulateGrad] + 140517615522768 -> 140517615523680 + 140517615522768 [label=ViewBackward0] + 140517615541792 -> 140517615522768 + 140517615541792 [label=ToCopyBackward0] + 140517615507968 -> 140517615541792 + 140517615507968 [label=SliceBackward0] + 140517615541936 -> 140517615507968 + 140517615541936 [label=SliceBackward0] + 140517615542032 -> 140517615541936 + 140517615542032 [label=SliceBackward0] + 140517615524688 -> 140517615542032 + 140517615541696 -> 140517615523680 + 140517615541696 [label=TBackward0] + 140517615541600 -> 
140517615541696 + 140517615541600 [label=ToCopyBackward0] + 140517615542128 -> 140517615541600 + 140509590935888 [label="encoder.layer.0.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590935888 -> 140517615542128 + 140517615542128 [label=AccumulateGrad] + 140517615521904 -> 140517615508688 + 140517615521904 [label=TBackward0] + 140517615523344 -> 140517615521904 + 140517615523344 [label=ToCopyBackward0] + 140517615522960 -> 140517615523344 + 140509590935648 [label="encoder.layer.0.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590935648 -> 140517615522960 + 140517615522960 [label=AccumulateGrad] + 140517615507968 -> 140517615507632 + 140517615507440 -> 140517615506960 + 140509590935408 [label="encoder.layer.0.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590935408 -> 140517615507440 + 140517615507440 [label=AccumulateGrad] + 140517615507392 -> 140517615506960 + 140509590935488 [label="encoder.layer.0.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590935488 -> 140517615507392 + 140517615507392 [label=AccumulateGrad] + 140517615506240 -> 140517615506720 + 140517615506240 [label=TBackward0] + 140517615506912 -> 140517615506240 + 140517615506912 [label=ToCopyBackward0] + 140517615507920 -> 140517615506912 + 140509590933728 [label="encoder.layer.1.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590933728 -> 140517615507920 + 140517615507920 [label=AccumulateGrad] + 140517615506144 -> 140517615506096 + 140517615506144 [label=ReshapeAliasBackward0] + 140517615506480 -> 140517615506144 + 140517615506480 [label=ExpandBackward0] + 140517615506672 -> 140517615506480 + 140517615506672 [label=TransposeBackward0] + 140517615507152 -> 140517615506672 + 140517615507152 [label=PermuteBackward0] + 140517615509072 -> 140517615507152 + 140517615509072 [label=ViewBackward0] + 140517615507104 -> 140517615509072 + 140517615507104 [label=ViewBackward0] + 140517615522336 -> 140517615507104 + 140517615522336 [label=AddmmBackward0] + 140517615506288 -> 140517615522336 + 140517615506288 [label=ToCopyBackward0] + 140517615541840 -> 140517615506288 + 140509590917008 [label="encoder.layer.1.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509590917008 -> 140517615541840 + 140517615541840 [label=AccumulateGrad] + 140517615541744 -> 140517615522336 + 140517615541744 [label=ViewBackward0] + 140517615542176 -> 140517615541744 + 140517615542176 [label=ToCopyBackward0] + 140509587675888 -> 140517615542176 + 140517615541888 -> 140517615522336 + 140517615541888 [label=TBackward0] + 140517615542080 -> 140517615541888 + 140517615542080 [label=ToCopyBackward0] + 140517615542224 -> 140517615542080 + 140509590933568 [label="encoder.layer.1.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509590933568 -> 140517615542224 + 140517615542224 [label=AccumulateGrad] + 140509587676944 -> 140509587676896 + 140509587676944 [label=ReshapeAliasBackward0] + 140509587677088 -> 140509587676944 + 140509587677088 [label=ExpandBackward0] + 140517615505760 -> 140509587677088 + 140517615505760 [label=PermuteBackward0] + 140517615505952 -> 140517615505760 + 140517615505952 [label=ViewBackward0] + 140517615505472 -> 140517615505952 + 140517615505472 [label=ViewBackward0] + 140517615506576 -> 140517615505472 + 140517615506576 [label=AddmmBackward0] + 140517615507728 -> 140517615506576 + 140517615507728 [label=ToCopyBackward0] + 140517615541648 -> 140517615507728 + 140509590916848 [label="encoder.layer.1.attention.self.value.bias + (768)" 
fillcolor=lightblue] + 140509590916848 -> 140517615541648 + 140517615541648 [label=AccumulateGrad] + 140517615506864 -> 140517615506576 + 140517615506864 [label=ViewBackward0] + 140517615521952 -> 140517615506864 + 140517615521952 [label=ToCopyBackward0] + 140509587675888 -> 140517615521952 + 140517615505520 -> 140517615506576 + 140517615505520 [label=TBackward0] + 140517615541984 -> 140517615505520 + 140517615541984 [label=ToCopyBackward0] + 140517615591632 -> 140517615541984 + 140509590916768 [label="encoder.layer.1.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509590916768 -> 140517615591632 + 140517615591632 [label=AccumulateGrad] + 140509587675984 -> 140509587676176 + 140509587675984 [label=TBackward0] + 140509587676656 -> 140509587675984 + 140509587676656 [label=ToCopyBackward0] + 140509587676848 -> 140509587676656 + 140509590916528 [label="encoder.layer.1.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509590916528 -> 140509587676848 + 140509587676848 [label=AccumulateGrad] + 140509587675888 -> 140509587675744 + 140509587675696 -> 140509587675648 + 140509590916288 [label="encoder.layer.1.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590916288 -> 140509587675696 + 140509587675696 [label=AccumulateGrad] + 140509587674976 -> 140509587675648 + 140509590916368 [label="encoder.layer.1.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590916368 -> 140509587674976 + 140509587674976 [label=AccumulateGrad] + 140509587674496 -> 140509587674784 + 140509587674496 [label=TBackward0] + 140509587675024 -> 140509587674496 + 140509587675024 [label=ToCopyBackward0] + 140509587675408 -> 140509587675024 + 140509590914608 [label="encoder.layer.1.experts.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590914608 -> 140509587675408 + 140509587675408 [label=AccumulateGrad] + 140509587674064 -> 140509587674256 + 140509587674064 [label=TBackward0] + 140509587674736 -> 140509587674064 + 140509587674736 [label=ToCopyBackward0] + 140509587675216 -> 140509587674736 + 140509590914368 [label="encoder.layer.1.experts.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590914368 -> 140509587675216 + 140509587675216 [label=AccumulateGrad] + 140509587673968 -> 140509587673824 + 140509587673776 -> 140509587673680 + 140509590914128 [label="encoder.layer.1.experts.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590914128 -> 140509587673776 + 140509587673776 [label=AccumulateGrad] + 140509587673728 -> 140509587673680 + 140509590914208 [label="encoder.layer.1.experts.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590914208 -> 140509587673728 + 140509587673728 [label=AccumulateGrad] + 140509587673440 -> 140509587658304 + 140509587673440 [label=NativeLayerNormBackward0] + 140509587674112 -> 140509587673440 + 140509587674112 [label=AddBackward0] + 140509587674928 -> 140509587674112 + 140509587674928 [label=NativeDropoutBackward0] + 140509587674640 -> 140509587674928 + 140509587674640 [label=ViewBackward0] + 140509587675168 -> 140509587674640 + 140509587675168 [label=AddmmBackward0] + 140509587675840 -> 140509587675168 + 140509587675840 [label=ToCopyBackward0] + 140509587676368 -> 140509587675840 + 140509590915888 [label="encoder.layer.1.output.dense.bias + (768)" fillcolor=lightblue] + 140509590915888 -> 140509587676368 + 140509587676368 [label=AccumulateGrad] + 140509587675792 -> 140509587675168 + 140509587675792 [label=ViewBackward0] + 140509587676752 -> 
140509587675792 + 140509587676752 [label=GeluBackward0] + 140509587676560 -> 140509587676752 + 140509587676560 [label=ViewBackward0] + 140509587676320 -> 140509587676560 + 140509587676320 [label=AddmmBackward0] + 140517615506048 -> 140509587676320 + 140517615506048 [label=ToCopyBackward0] + 140517615508832 -> 140517615506048 + 140509590916128 [label="encoder.layer.1.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509590916128 -> 140517615508832 + 140517615508832 [label=AccumulateGrad] + 140517615505856 -> 140509587676320 + 140517615505856 [label=ViewBackward0] + 140517615591728 -> 140517615505856 + 140517615591728 [label=ToCopyBackward0] + 140509587674448 -> 140517615591728 + 140509587674448 [label=SliceBackward0] + 140517615591776 -> 140509587674448 + 140517615591776 [label=SliceBackward0] + 140517615591872 -> 140517615591776 + 140517615591872 [label=SliceBackward0] + 140509587675648 -> 140517615591872 + 140517615505568 -> 140509587676320 + 140517615505568 [label=TBackward0] + 140517615591536 -> 140517615505568 + 140517615591536 [label=ToCopyBackward0] + 140517615591968 -> 140517615591536 + 140509590916048 [label="encoder.layer.1.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590916048 -> 140517615591968 + 140517615591968 [label=AccumulateGrad] + 140509587675600 -> 140509587675168 + 140509587675600 [label=TBackward0] + 140509587676128 -> 140509587675600 + 140509587676128 [label=ToCopyBackward0] + 140517615506384 -> 140509587676128 + 140509590915808 [label="encoder.layer.1.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590915808 -> 140517615506384 + 140517615506384 [label=AccumulateGrad] + 140509587674448 -> 140509587674112 + 140509587673920 -> 140509587673440 + 140509590915568 [label="encoder.layer.1.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590915568 -> 140509587673920 + 140509587673920 [label=AccumulateGrad] + 140509587673872 -> 140509587673440 + 140509590915648 [label="encoder.layer.1.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590915648 -> 140509587673872 + 140509587673872 [label=AccumulateGrad] + 140509587673152 -> 140509587660368 + 140509587673152 [label=TBackward0] + 140509587673392 -> 140509587673152 + 140509587673392 [label=ToCopyBackward0] + 140509587674400 -> 140509587673392 + 140509590913888 [label="encoder.layer.2.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590913888 -> 140509587674400 + 140509587674400 [label=AccumulateGrad] + 140509587660272 -> 140509587660224 + 140509587660272 [label=ReshapeAliasBackward0] + 140509587660608 -> 140509587660272 + 140509587660608 [label=ExpandBackward0] + 140509587660704 -> 140509587660608 + 140509587660704 [label=TransposeBackward0] + 140509587673632 -> 140509587660704 + 140509587673632 [label=PermuteBackward0] + 140509587675504 -> 140509587673632 + 140509587675504 [label=ViewBackward0] + 140509587673584 -> 140509587675504 + 140509587673584 [label=ViewBackward0] + 140509587676416 -> 140509587673584 + 140509587676416 [label=AddmmBackward0] + 140517615505664 -> 140509587676416 + 140517615505664 [label=ToCopyBackward0] + 140517615591680 -> 140517615505664 + 140509590913728 [label="encoder.layer.2.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509590913728 -> 140517615591680 + 140517615591680 [label=AccumulateGrad] + 140509587673200 -> 140509587676416 + 140509587673200 [label=ViewBackward0] + 140517615592016 -> 140509587673200 + 140517615592016 [label=ToCopyBackward0] + 140509587658304 -> 140517615592016 + 
140517615591488 -> 140509587676416 + 140517615591488 [label=TBackward0] + 140517615591584 -> 140517615591488 + 140517615591584 [label=ToCopyBackward0] + 140517615592160 -> 140517615591584 + 140509590913648 [label="encoder.layer.2.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509590913648 -> 140517615592160 + 140517615592160 [label=AccumulateGrad] + 140509587659360 -> 140509587659312 + 140509587659360 [label=ReshapeAliasBackward0] + 140509587659696 -> 140509587659360 + 140509587659696 [label=ExpandBackward0] + 140509587659888 -> 140509587659696 + 140509587659888 [label=PermuteBackward0] + 140509587660080 -> 140509587659888 + 140509587660080 [label=ViewBackward0] + 140509587659456 -> 140509587660080 + 140509587659456 [label=ViewBackward0] + 140509587660416 -> 140509587659456 + 140509587660416 [label=AddmmBackward0] + 140509587659504 -> 140509587660416 + 140509587659504 [label=ToCopyBackward0] + 140509587676032 -> 140509587659504 + 140509590913488 [label="encoder.layer.2.attention.self.value.bias + (768)" fillcolor=lightblue] + 140509590913488 -> 140509587676032 + 140509587676032 [label=AccumulateGrad] + 140509587674208 -> 140509587660416 + 140509587674208 [label=ViewBackward0] + 140517615591920 -> 140509587674208 + 140517615591920 [label=ToCopyBackward0] + 140509587658304 -> 140517615591920 + 140509587673344 -> 140509587660416 + 140509587673344 [label=TBackward0] + 140517615591824 -> 140509587673344 + 140517615591824 [label=ToCopyBackward0] + 140517615592064 -> 140517615591824 + 140509590913408 [label="encoder.layer.2.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509590913408 -> 140517615592064 + 140517615592064 [label=AccumulateGrad] + 140509587658400 -> 140509587658592 + 140509587658400 [label=TBackward0] + 140509587659072 -> 140509587658400 + 140509587659072 [label=ToCopyBackward0] + 140509587659264 -> 140509587659072 + 140509590913168 [label="encoder.layer.2.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509590913168 -> 140509587659264 + 140509587659264 [label=AccumulateGrad] + 140509587658304 -> 140509587658160 + 140509587658112 -> 140509587658064 + 140509590904640 [label="encoder.layer.2.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590904640 -> 140509587658112 + 140509587658112 [label=AccumulateGrad] + 140509587657584 -> 140509587658064 + 140509590904720 [label="encoder.layer.2.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590904720 -> 140509587657584 + 140509587657584 [label=AccumulateGrad] + 140509587656912 -> 140509587657392 + 140509587656912 [label=TBackward0] + 140509587657632 -> 140509587656912 + 140509587657632 [label=ToCopyBackward0] + 140509587658016 -> 140509587657632 + 140509590904400 [label="encoder.layer.2.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590904400 -> 140509587658016 + 140509587658016 [label=AccumulateGrad] + 140509587656816 -> 140509587644368 + 140509587656816 [label=ReshapeAliasBackward0] + 140509587657152 -> 140509587656816 + 140509587657152 [label=ExpandBackward0] + 140509587657344 -> 140509587657152 + 140509587657344 [label=TransposeBackward0] + 140509587657824 -> 140509587657344 + 140509587657824 [label=PermuteBackward0] + 140509587658256 -> 140509587657824 + 140509587658256 [label=ViewBackward0] + 140509587657776 -> 140509587658256 + 140509587657776 [label=ViewBackward0] + 140509587658544 -> 140509587657776 + 140509587658544 [label=AddmmBackward0] + 140509587658784 -> 140509587658544 + 140509587658784 
[label=ToCopyBackward0] + 140509587658976 -> 140509587658784 + 140509590904240 [label="encoder.layer.2.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140509590904240 -> 140509587658976 + 140509587658976 [label=AccumulateGrad] + 140509587658736 -> 140509587658544 + 140509587658736 [label=ViewBackward0] + 140509587659792 -> 140509587658736 + 140509587659792 [label=ToCopyBackward0] + 140517615539152 -> 140509587659792 + 140509587656960 -> 140509587658544 + 140509587656960 [label=TBackward0] + 140509587659600 -> 140509587656960 + 140509587659600 [label=ToCopyBackward0] + 140509587660512 -> 140509587659600 + 140509590904160 [label="encoder.layer.2.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140509590904160 -> 140509587660512 + 140509587660512 [label=AccumulateGrad] + 140509587643552 -> 140509587643504 + 140509587643552 [label=ReshapeAliasBackward0] + 140509587643888 -> 140509587643552 + 140509587643888 [label=ExpandBackward0] + 140509587644080 -> 140509587643888 + 140509587644080 [label=PermuteBackward0] + 140509587644272 -> 140509587644080 + 140509587644272 [label=ViewBackward0] + 140509587675312 -> 140509587644272 + 140509587675312 [label=ViewBackward0] + 140509587643696 -> 140509587675312 + 140509587643696 [label=AddmmBackward0] + 140509587657536 -> 140509587643696 + 140509587657536 [label=ToCopyBackward0] + 140509587659168 -> 140509587657536 + 140509590904000 [label="encoder.layer.2.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140509590904000 -> 140509587659168 + 140509587659168 [label=AccumulateGrad] + 140509587657248 -> 140509587643696 + 140509587657248 [label=ViewBackward0] + 140509587660176 -> 140509587657248 + 140509587660176 [label=ToCopyBackward0] + 140517615539152 -> 140509587660176 + 140509587656768 -> 140509587643696 + 140509587656768 [label=TBackward0] + 140509587658208 -> 140509587656768 + 140509587658208 [label=ToCopyBackward0] + 140509587658448 -> 140509587658208 + 140509590903920 [label="encoder.layer.2.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140509590903920 -> 140509587658448 + 140509587658448 [label=AccumulateGrad] + 140509587642592 -> 140509587642784 + 140509587642592 [label=TBackward0] + 140509587643264 -> 140509587642592 + 140509587643264 [label=ToCopyBackward0] + 140509587643456 -> 140509587643264 + 140509590903680 [label="encoder.layer.2.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509590903680 -> 140509587643456 + 140509587643456 [label=AccumulateGrad] + 140509587642496 -> 140509587642352 + 140509587642304 -> 140509587642256 + 140509590903440 [label="encoder.layer.2.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590903440 -> 140509587642304 + 140509587642304 [label=AccumulateGrad] + 140509587641872 -> 140509587642256 + 140509590903520 [label="encoder.layer.2.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590903520 -> 140509587641872 + 140509587641872 [label=AccumulateGrad] + 140509587641392 -> 140509587641680 + 140509587641392 [label=TBackward0] + 140509587641920 -> 140509587641392 + 140509587641920 [label=ToCopyBackward0] + 140509587642400 -> 140509587641920 + 140509590901760 [label="encoder.layer.2.experts.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590901760 -> 140509587642400 + 140509587642400 [label=AccumulateGrad] + 140509587640960 -> 140509587641152 + 140509587640960 [label=TBackward0] + 140509587641632 -> 140509587640960 + 140509587641632 
[label=ToCopyBackward0] + 140509587642112 -> 140509587641632 + 140509590901520 [label="encoder.layer.2.experts.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590901520 -> 140509587642112 + 140509587642112 [label=AccumulateGrad] + 140509587640864 -> 140509587640720 + 140509587640672 -> 140509587640576 + 140509590901280 [label="encoder.layer.2.experts.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590901280 -> 140509587640672 + 140509587640672 [label=AccumulateGrad] + 140509587640624 -> 140509587640576 + 140509590901360 [label="encoder.layer.2.experts.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590901360 -> 140509587640624 + 140509587640624 [label=AccumulateGrad] + 140509587640480 -> 140509587625200 + 140509587640480 [label=NativeLayerNormBackward0] + 140509587641008 -> 140509587640480 + 140509587641008 [label=AddBackward0] + 140509587641824 -> 140509587641008 + 140509587641824 [label=NativeDropoutBackward0] + 140509587641536 -> 140509587641824 + 140509587641536 [label=ViewBackward0] + 140509587642064 -> 140509587641536 + 140509587642064 [label=AddmmBackward0] + 140509587642928 -> 140509587642064 + 140509587642928 [label=ToCopyBackward0] + 140509587643024 -> 140509587642928 + 140509590903040 [label="encoder.layer.2.output.dense.bias + (768)" fillcolor=lightblue] + 140509590903040 -> 140509587643024 + 140509587643024 [label=AccumulateGrad] + 140509587642736 -> 140509587642064 + 140509587642736 [label=ViewBackward0] + 140509587643168 -> 140509587642736 + 140509587643168 [label=GeluBackward0] + 140509587644176 -> 140509587643168 + 140509587644176 [label=ViewBackward0] + 140509587643648 -> 140509587644176 + 140509587643648 [label=AddmmBackward0] + 140509587659984 -> 140509587643648 + 140509587659984 [label=ToCopyBackward0] + 140517615592208 -> 140509587659984 + 140509590903280 [label="encoder.layer.2.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509590903280 -> 140517615592208 + 140517615592208 [label=AccumulateGrad] + 140509587657920 -> 140509587643648 + 140509587657920 [label=ViewBackward0] + 140517615592304 -> 140509587657920 + 140517615592304 [label=ToCopyBackward0] + 140509587641344 -> 140517615592304 + 140509587641344 [label=SliceBackward0] + 140517615592448 -> 140509587641344 + 140517615592448 [label=SliceBackward0] + 140517615592544 -> 140517615592448 + 140517615592544 [label=SliceBackward0] + 140509587658064 -> 140517615592544 + 140509587657056 -> 140509587643648 + 140509587657056 [label=TBackward0] + 140517615592112 -> 140509587657056 + 140517615592112 [label=ToCopyBackward0] + 140517615592640 -> 140517615592112 + 140509590903200 [label="encoder.layer.2.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590903200 -> 140517615592640 + 140517615592640 [label=AccumulateGrad] + 140509587642640 -> 140509587642064 + 140509587642640 [label=TBackward0] + 140509587643792 -> 140509587642640 + 140509587643792 [label=ToCopyBackward0] + 140509587658832 -> 140509587643792 + 140509590902960 [label="encoder.layer.2.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590902960 -> 140509587658832 + 140509587658832 [label=AccumulateGrad] + 140509587641344 -> 140509587641008 + 140509587640816 -> 140509587640480 + 140509590902720 [label="encoder.layer.2.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590902720 -> 140509587640816 + 140509587640816 [label=AccumulateGrad] + 140509587640768 -> 140509587640480 + 140509590902800 [label="encoder.layer.2.output.LayerNorm.bias + (768)" 
fillcolor=lightblue] + 140509590902800 -> 140509587640768 + 140509587640768 [label=AccumulateGrad] + 140509587627264 -> 140509587627744 + 140509587627264 [label=TBackward0] + 140509587640384 -> 140509587627264 + 140509587640384 [label=ToCopyBackward0] + 140509587641296 -> 140509587640384 + 140509590901040 [label="encoder.layer.3.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590901040 -> 140509587641296 + 140509587641296 [label=AccumulateGrad] + 140509587627168 -> 140509587627120 + 140509587627168 [label=ReshapeAliasBackward0] + 140509587627504 -> 140509587627168 + 140509587627504 [label=ExpandBackward0] + 140509587627696 -> 140509587627504 + 140509587627696 [label=TransposeBackward0] + 140509587627888 -> 140509587627696 + 140509587627888 [label=PermuteBackward0] + 140509587642448 -> 140509587627888 + 140509587642448 [label=ViewBackward0] + 140509587640432 -> 140509587642448 + 140509587640432 [label=ViewBackward0] + 140509587643360 -> 140509587640432 + 140509587643360 [label=AddmmBackward0] + 140509587643984 -> 140509587643360 + 140509587643984 [label=ToCopyBackward0] + 140517615592256 -> 140509587643984 + 140509590900880 [label="encoder.layer.3.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509590900880 -> 140517615592256 + 140517615592256 [label=AccumulateGrad] + 140509587640528 -> 140509587643360 + 140509587640528 [label=ViewBackward0] + 140517615592688 -> 140509587640528 + 140517615592688 [label=ToCopyBackward0] + 140509587625200 -> 140517615592688 + 140517615592352 -> 140509587643360 + 140517615592352 [label=TBackward0] + 140517615592400 -> 140517615592352 + 140517615592400 [label=ToCopyBackward0] + 140517615592832 -> 140517615592400 + 140509590900800 [label="encoder.layer.3.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509590900800 -> 140517615592832 + 140517615592832 [label=AccumulateGrad] + 140509587626256 -> 140509587626208 + 140509587626256 [label=ReshapeAliasBackward0] + 140509587626592 -> 140509587626256 + 140509587626592 [label=ExpandBackward0] + 140509587626784 -> 140509587626592 + 140509587626784 [label=PermuteBackward0] + 140509587626976 -> 140509587626784 + 140509587626976 [label=ViewBackward0] + 140509587626352 -> 140509587626976 + 140509587626352 [label=ViewBackward0] + 140509587627600 -> 140509587626352 + 140509587627600 [label=AddmmBackward0] + 140509587627312 -> 140509587627600 + 140509587627312 [label=ToCopyBackward0] + 140509587642976 -> 140509587627312 + 140509590896448 [label="encoder.layer.3.attention.self.value.bias + (768)" fillcolor=lightblue] + 140509590896448 -> 140509587642976 + 140509587642976 [label=AccumulateGrad] + 140509587626400 -> 140509587627600 + 140509587626400 [label=ViewBackward0] + 140517615592592 -> 140509587626400 + 140517615592592 [label=ToCopyBackward0] + 140509587625200 -> 140517615592592 + 140509587641104 -> 140509587627600 + 140509587641104 [label=TBackward0] + 140517615592496 -> 140509587641104 + 140517615592496 [label=ToCopyBackward0] + 140517615592736 -> 140517615592496 + 140509590896368 [label="encoder.layer.3.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509590896368 -> 140517615592736 + 140517615592736 [label=AccumulateGrad] + 140509587625296 -> 140509587625488 + 140509587625296 [label=TBackward0] + 140509587625968 -> 140509587625296 + 140509587625968 [label=ToCopyBackward0] + 140509587626160 -> 140509587625968 + 140509590896128 [label="encoder.layer.3.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509590896128 -> 140509587626160 + 
140509587626160 [label=AccumulateGrad] + 140509587625200 -> 140509587625056 + 140509587625008 -> 140509587624960 + 140509590895888 [label="encoder.layer.3.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590895888 -> 140509587625008 + 140509587625008 [label=AccumulateGrad] + 140509587624288 -> 140509587624960 + 140509590895968 [label="encoder.layer.3.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590895968 -> 140509587624288 + 140509587624288 [label=AccumulateGrad] + 140509587624000 -> 140509587624096 + 140509587624000 [label=TBackward0] + 140509587624336 -> 140509587624000 + 140509587624336 [label=ToCopyBackward0] + 140509587624720 -> 140509587624336 + 140509590894208 [label="encoder.layer.3.experts.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590894208 -> 140509587624720 + 140509587624720 [label=AccumulateGrad] + 140509587611024 -> 140509587611216 + 140509587611024 [label=TBackward0] + 140509587611456 -> 140509587611024 + 140509587611456 [label=ToCopyBackward0] + 140509587624528 -> 140509587611456 + 140509590893968 [label="encoder.layer.3.experts.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590893968 -> 140509587624528 + 140509587624528 [label=AccumulateGrad] + 140509587610928 -> 140509587610784 + 140509587610736 -> 140509587610640 + 140509590893728 [label="encoder.layer.3.experts.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590893728 -> 140509587610736 + 140509587610736 [label=AccumulateGrad] + 140509587610688 -> 140509587610640 + 140509590893808 [label="encoder.layer.3.experts.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590893808 -> 140509587610688 + 140509587610688 [label=AccumulateGrad] + 140509587610400 -> 140509587607664 + 140509587610400 [label=NativeLayerNormBackward0] + 140509587611072 -> 140509587610400 + 140509587611072 [label=AddBackward0] + 140509587611600 -> 140509587611072 + 140509587611600 [label=NativeDropoutBackward0] + 140509587624048 -> 140509587611600 + 140509587624048 [label=ViewBackward0] + 140509587624480 -> 140509587624048 + 140509587624480 [label=AddmmBackward0] + 140509587625152 -> 140509587624480 + 140509587625152 [label=ToCopyBackward0] + 140509587625680 -> 140509587625152 + 140509590895488 [label="encoder.layer.3.output.dense.bias + (768)" fillcolor=lightblue] + 140509590895488 -> 140509587625680 + 140509587625680 [label=AccumulateGrad] + 140509587625104 -> 140509587624480 + 140509587625104 [label=ViewBackward0] + 140509587626064 -> 140509587625104 + 140509587626064 [label=GeluBackward0] + 140509587625728 -> 140509587626064 + 140509587625728 [label=ViewBackward0] + 140509587626688 -> 140509587625728 + 140509587626688 [label=AddmmBackward0] + 140509587627072 -> 140509587626688 + 140509587627072 [label=ToCopyBackward0] + 140509587642208 -> 140509587627072 + 140509590895728 [label="encoder.layer.3.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509590895728 -> 140509587642208 + 140509587642208 [label=AccumulateGrad] + 140509587626880 -> 140509587626688 + 140509587626880 [label=ViewBackward0] + 140517615593024 -> 140509587626880 + 140517615593024 [label=ToCopyBackward0] + 140509587611360 -> 140517615593024 + 140509587611360 [label=SliceBackward0] + 140517615593072 -> 140509587611360 + 140517615593072 [label=SliceBackward0] + 140517615593168 -> 140517615593072 + 140517615593168 [label=SliceBackward0] + 140509587624960 -> 140517615593168 + 140509587625632 -> 140509587626688 + 140509587625632 
[label=TBackward0] + 140517615592784 -> 140509587625632 + 140517615592784 [label=ToCopyBackward0] + 140517615593264 -> 140517615592784 + 140509590895648 [label="encoder.layer.3.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590895648 -> 140517615593264 + 140517615593264 [label=AccumulateGrad] + 140509587624912 -> 140509587624480 + 140509587624912 [label=TBackward0] + 140509587625872 -> 140509587624912 + 140509587625872 [label=ToCopyBackward0] + 140509587627408 -> 140509587625872 + 140509590895408 [label="encoder.layer.3.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590895408 -> 140509587627408 + 140509587627408 [label=AccumulateGrad] + 140509587611360 -> 140509587611072 + 140509587610880 -> 140509587610400 + 140509590895168 [label="encoder.layer.3.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590895168 -> 140509587610880 + 140509587610880 [label=AccumulateGrad] + 140509587610832 -> 140509587610400 + 140509590895248 [label="encoder.layer.3.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590895248 -> 140509587610832 + 140509587610832 [label=AccumulateGrad] + 140509587609680 -> 140509587610160 + 140509587609680 [label=TBackward0] + 140509587610352 -> 140509587609680 + 140509587610352 [label=ToCopyBackward0] + 140509587611168 -> 140509587610352 + 140509590893488 [label="encoder.layer.4.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590893488 -> 140509587611168 + 140509587611168 [label=AccumulateGrad] + 140509587609584 -> 140509587609536 + 140509587609584 [label=ReshapeAliasBackward0] + 140509587609920 -> 140509587609584 + 140509587609920 [label=ExpandBackward0] + 140509587610112 -> 140509587609920 + 140509587610112 [label=TransposeBackward0] + 140509587610592 -> 140509587610112 + 140509587610592 [label=PermuteBackward0] + 140509587610544 -> 140509587610592 + 140509587610544 [label=ViewBackward0] + 140509587624240 -> 140509587610544 + 140509587624240 [label=ViewBackward0] + 140509587625440 -> 140509587624240 + 140509587625440 [label=AddmmBackward0] + 140509587626496 -> 140509587625440 + 140509587626496 [label=ToCopyBackward0] + 140517615592976 -> 140509587626496 + 140509590893328 [label="encoder.layer.4.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509590893328 -> 140517615592976 + 140517615592976 [label=AccumulateGrad] + 140509587624816 -> 140509587625440 + 140509587624816 [label=ViewBackward0] + 140517615593312 -> 140509587624816 + 140517615593312 [label=ToCopyBackward0] + 140509587607664 -> 140517615593312 + 140517615592880 -> 140509587625440 + 140517615592880 [label=TBackward0] + 140517615592928 -> 140517615592880 + 140517615592928 [label=ToCopyBackward0] + 140517615593456 -> 140517615592928 + 140509590893248 [label="encoder.layer.4.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509590893248 -> 140517615593456 + 140517615593456 [label=AccumulateGrad] + 140509587608672 -> 140509587608624 + 140509587608672 [label=ReshapeAliasBackward0] + 140509587609008 -> 140509587608672 + 140509587609008 [label=ExpandBackward0] + 140509587609200 -> 140509587609008 + 140509587609200 [label=PermuteBackward0] + 140509587609392 -> 140509587609200 + 140509587609392 [label=ViewBackward0] + 140509587608768 -> 140509587609392 + 140509587608768 [label=ViewBackward0] + 140509587610016 -> 140509587608768 + 140509587610016 [label=AddmmBackward0] + 140509587609728 -> 140509587610016 + 140509587609728 [label=ToCopyBackward0] + 140509587625344 -> 140509587609728 + 140509590893088 
[label="encoder.layer.4.attention.self.value.bias + (768)" fillcolor=lightblue] + 140509590893088 -> 140509587625344 + 140509587625344 [label=AccumulateGrad] + 140509587610304 -> 140509587610016 + 140509587610304 [label=ViewBackward0] + 140517615593216 -> 140509587610304 + 140517615593216 [label=ToCopyBackward0] + 140509587607664 -> 140517615593216 + 140509587608816 -> 140509587610016 + 140509587608816 [label=TBackward0] + 140517615593120 -> 140509587608816 + 140517615593120 [label=ToCopyBackward0] + 140517615593360 -> 140517615593120 + 140509590893008 [label="encoder.layer.4.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509590893008 -> 140517615593360 + 140517615593360 [label=AccumulateGrad] + 140509587607712 -> 140509587607904 + 140509587607712 [label=TBackward0] + 140509587608384 -> 140509587607712 + 140509587608384 [label=ToCopyBackward0] + 140509587608576 -> 140509587608384 + 140509590892768 [label="encoder.layer.4.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509590892768 -> 140509587608576 + 140509587608576 [label=AccumulateGrad] + 140509587607664 -> 140509587595120 + 140509587595072 -> 140509587595024 + 140509590892608 [label="encoder.layer.4.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590892608 -> 140509587595072 + 140509587595072 [label=AccumulateGrad] + 140509587594544 -> 140509587595024 + 140509590876048 [label="encoder.layer.4.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590876048 -> 140509587594544 + 140509587594544 [label=AccumulateGrad] + 140509587593872 -> 140509587594352 + 140509587593872 [label=TBackward0] + 140509587594592 -> 140509587593872 + 140509587594592 [label=ToCopyBackward0] + 140509587594976 -> 140509587594592 + 140509590875808 [label="encoder.layer.4.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590875808 -> 140509587594976 + 140509587594976 [label=AccumulateGrad] + 140509587593776 -> 140509587593728 + 140509587593776 [label=ReshapeAliasBackward0] + 140509587594112 -> 140509587593776 + 140509587594112 [label=ExpandBackward0] + 140509587594304 -> 140509587594112 + 140509587594304 [label=TransposeBackward0] + 140509587594784 -> 140509587594304 + 140509587594784 [label=PermuteBackward0] + 140509587595168 -> 140509587594784 + 140509587595168 [label=ViewBackward0] + 140509587594736 -> 140509587595168 + 140509587594736 [label=ViewBackward0] + 140509587607856 -> 140509587594736 + 140509587607856 [label=AddmmBackward0] + 140509587608096 -> 140509587607856 + 140509587608096 [label=ToCopyBackward0] + 140509587608288 -> 140509587608096 + 140509590875648 [label="encoder.layer.4.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140509590875648 -> 140509587608288 + 140509587608288 [label=AccumulateGrad] + 140509587608048 -> 140509587607856 + 140509587608048 [label=ViewBackward0] + 140509587609104 -> 140509587608048 + 140509587609104 [label=ToCopyBackward0] + 140517615539152 -> 140509587609104 + 140509587607616 -> 140509587607856 + 140509587607616 [label=TBackward0] + 140509587608912 -> 140509587607616 + 140509587608912 [label=ToCopyBackward0] + 140509587609824 -> 140509587608912 + 140509590875568 [label="encoder.layer.4.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140509590875568 -> 140509587609824 + 140509587609824 [label=AccumulateGrad] + 140509587592864 -> 140509587592816 + 140509587592864 [label=ReshapeAliasBackward0] + 140509587593200 -> 140509587592864 + 140509587593200 [label=ExpandBackward0] + 
140509587593392 -> 140509587593200 + 140509587593392 [label=PermuteBackward0] + 140509587593584 -> 140509587593392 + 140509587593584 [label=ViewBackward0] + 140509587592960 -> 140509587593584 + 140509587592960 [label=ViewBackward0] + 140509587594208 -> 140509587592960 + 140509587594208 [label=AddmmBackward0] + 140509587594880 -> 140509587594208 + 140509587594880 [label=ToCopyBackward0] + 140509587624624 -> 140509587594880 + 140509590875408 [label="encoder.layer.4.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140509590875408 -> 140509587624624 + 140509587624624 [label=AccumulateGrad] + 140509587594496 -> 140509587594208 + 140509587594496 [label=ViewBackward0] + 140509587609488 -> 140509587594496 + 140509587609488 [label=ToCopyBackward0] + 140517615539152 -> 140509587609488 + 140509587593008 -> 140509587594208 + 140509587593008 [label=TBackward0] + 140509587607760 -> 140509587593008 + 140509587607760 [label=ToCopyBackward0] + 140509587608480 -> 140509587607760 + 140509590875328 [label="encoder.layer.4.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140509590875328 -> 140509587608480 + 140509587608480 [label=AccumulateGrad] + 140509587591904 -> 140509587592096 + 140509587591904 [label=TBackward0] + 140509587592576 -> 140509587591904 + 140509587592576 [label=ToCopyBackward0] + 140509587592768 -> 140509587592576 + 140509590875088 [label="encoder.layer.4.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509590875088 -> 140509587592768 + 140509587592768 [label=AccumulateGrad] + 140509587591808 -> 140509587591664 + 140509587591616 -> 140509587591568 + 140509590874848 [label="encoder.layer.4.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590874848 -> 140509587591616 + 140509587591616 [label=AccumulateGrad] + 140509587591376 -> 140509587591568 + 140509590874928 [label="encoder.layer.4.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590874928 -> 140509587591376 + 140509587591376 [label=AccumulateGrad] + 140509587574256 -> 140509587574544 + 140509587574256 [label=TBackward0] + 140509587591328 -> 140509587574256 + 140509587591328 [label=ToCopyBackward0] + 140509587591712 -> 140509587591328 + 140509590873168 [label="encoder.layer.4.experts.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590873168 -> 140509587591712 + 140509587591712 [label=AccumulateGrad] + 140509587573824 -> 140509587574016 + 140509587573824 [label=TBackward0] + 140509587574496 -> 140509587573824 + 140509587574496 [label=ToCopyBackward0] + 140509587574688 -> 140509587574496 + 140509590872928 [label="encoder.layer.4.experts.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590872928 -> 140509587574688 + 140509587574688 [label=AccumulateGrad] + 140509587573728 -> 140509587573584 + 140509587573536 -> 140509587573440 + 140509590872688 [label="encoder.layer.4.experts.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590872688 -> 140509587573536 + 140509587573536 [label=AccumulateGrad] + 140509587573488 -> 140509587573440 + 140509590872768 [label="encoder.layer.4.experts.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590872768 -> 140509587573488 + 140509587573488 [label=AccumulateGrad] + 140509587573200 -> 140509587562112 + 140509587573200 [label=NativeLayerNormBackward0] + 140509587573872 -> 140509587573200 + 140509587573872 [label=AddBackward0] + 140509587574400 -> 140509587573872 + 140509587574400 [label=NativeDropoutBackward0] + 
140509587591424 -> 140509587574400 + 140509587591424 [label=ViewBackward0] + 140509587591280 -> 140509587591424 + 140509587591280 [label=AddmmBackward0] + 140509587592240 -> 140509587591280 + 140509587592240 [label=ToCopyBackward0] + 140509587592336 -> 140509587592240 + 140509590874448 [label="encoder.layer.4.output.dense.bias + (768)" fillcolor=lightblue] + 140509590874448 -> 140509587592336 + 140509587592336 [label=AccumulateGrad] + 140509587592048 -> 140509587591280 + 140509587592048 [label=ViewBackward0] + 140509587592480 -> 140509587592048 + 140509587592480 [label=GeluBackward0] + 140509587593488 -> 140509587592480 + 140509587593488 [label=ViewBackward0] + 140509587594016 -> 140509587593488 + 140509587594016 [label=AddmmBackward0] + 140509587593920 -> 140509587594016 + 140509587593920 [label=ToCopyBackward0] + 140517615593504 -> 140509587593920 + 140509590874688 [label="encoder.layer.4.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509590874688 -> 140517615593504 + 140517615593504 [label=AccumulateGrad] + 140509587593104 -> 140509587594016 + 140509587593104 [label=ViewBackward0] + 140517615593600 -> 140509587593104 + 140517615593600 [label=ToCopyBackward0] + 140509587574208 -> 140517615593600 + 140509587574208 [label=SliceBackward0] + 140517615593744 -> 140509587574208 + 140517615593744 [label=SliceBackward0] + 140517615593840 -> 140517615593744 + 140517615593840 [label=SliceBackward0] + 140509587595024 -> 140517615593840 + 140509587609296 -> 140509587594016 + 140509587609296 [label=TBackward0] + 140517615593408 -> 140509587609296 + 140517615593408 [label=ToCopyBackward0] + 140517615593936 -> 140517615593408 + 140509590874608 [label="encoder.layer.4.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590874608 -> 140517615593936 + 140517615593936 [label=AccumulateGrad] + 140509587591952 -> 140509587591280 + 140509587591952 [label=TBackward0] + 140509587593680 -> 140509587591952 + 140509587593680 [label=ToCopyBackward0] + 140509587608144 -> 140509587593680 + 140509590874368 [label="encoder.layer.4.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590874368 -> 140509587608144 + 140509587608144 [label=AccumulateGrad] + 140509587574208 -> 140509587573872 + 140509587573680 -> 140509587573200 + 140509590874128 [label="encoder.layer.4.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590874128 -> 140509587573680 + 140509587573680 [label=AccumulateGrad] + 140509587573632 -> 140509587573200 + 140509590874208 [label="encoder.layer.4.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590874208 -> 140509587573632 + 140509587573632 [label=AccumulateGrad] + 140509587572480 -> 140509587572960 + 140509587572480 [label=TBackward0] + 140509587573152 -> 140509587572480 + 140509587573152 [label=ToCopyBackward0] + 140509587574160 -> 140509587573152 + 140509590872448 [label="encoder.layer.5.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590872448 -> 140509587574160 + 140509587574160 [label=AccumulateGrad] + 140509587572384 -> 140509587572336 + 140509587572384 [label=ReshapeAliasBackward0] + 140509587572720 -> 140509587572384 + 140509587572720 [label=ExpandBackward0] + 140509587572912 -> 140509587572720 + 140509587572912 [label=TransposeBackward0] + 140509587573392 -> 140509587572912 + 140509587573392 [label=PermuteBackward0] + 140509587573344 -> 140509587573392 + 140509587573344 [label=ViewBackward0] + 140509587572528 -> 140509587573344 + 140509587572528 [label=ViewBackward0] + 140509587592672 -> 140509587572528 + 
140509587592672 [label=AddmmBackward0] + 140509587593296 -> 140509587592672 + 140509587593296 [label=ToCopyBackward0] + 140517615593552 -> 140509587593296 + 140509590872288 [label="encoder.layer.5.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509590872288 -> 140517615593552 + 140517615593552 [label=AccumulateGrad] + 140509587591760 -> 140509587592672 + 140509587591760 [label=ViewBackward0] + 140517615593984 -> 140509587591760 + 140517615593984 [label=ToCopyBackward0] + 140509587562112 -> 140517615593984 + 140517615593648 -> 140509587592672 + 140517615593648 [label=TBackward0] + 140517615593696 -> 140517615593648 + 140517615593696 [label=ToCopyBackward0] + 140517615594128 -> 140517615593696 + 140509590872208 [label="encoder.layer.5.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509590872208 -> 140517615594128 + 140517615594128 [label=AccumulateGrad] + 140509587571472 -> 140509587571424 + 140509587571472 [label=ReshapeAliasBackward0] + 140509587571808 -> 140509587571472 + 140509587571808 [label=ExpandBackward0] + 140509587572000 -> 140509587571808 + 140509587572000 [label=PermuteBackward0] + 140509587572192 -> 140509587572000 + 140509587572192 [label=ViewBackward0] + 140509587571568 -> 140509587572192 + 140509587571568 [label=ViewBackward0] + 140509587572816 -> 140509587571568 + 140509587572816 [label=AddmmBackward0] + 140509587573968 -> 140509587572816 + 140509587573968 [label=ToCopyBackward0] + 140509587592288 -> 140509587573968 + 140509590859664 [label="encoder.layer.5.attention.self.value.bias + (768)" fillcolor=lightblue] + 140509590859664 -> 140509587592288 + 140509587592288 [label=AccumulateGrad] + 140509587573104 -> 140509587572816 + 140509587573104 [label=ViewBackward0] + 140517615593888 -> 140509587573104 + 140517615593888 [label=ToCopyBackward0] + 140509587562112 -> 140517615593888 + 140509587571616 -> 140509587572816 + 140509587571616 [label=TBackward0] + 140517615593792 -> 140509587571616 + 140517615593792 [label=ToCopyBackward0] + 140517615594032 -> 140517615593792 + 140509590859584 [label="encoder.layer.5.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509590859584 -> 140517615594032 + 140517615594032 [label=AccumulateGrad] + 140509587570752 -> 140509587562400 + 140509587570752 [label=TBackward0] + 140509587571184 -> 140509587570752 + 140509587571184 [label=ToCopyBackward0] + 140509587571376 -> 140509587571184 + 140509590859344 [label="encoder.layer.5.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509590859344 -> 140509587571376 + 140509587571376 [label=AccumulateGrad] + 140509587562112 -> 140509587561968 + 140509587561920 -> 140509587561872 + 140509590859104 [label="encoder.layer.5.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590859104 -> 140509587561920 + 140509587561920 [label=AccumulateGrad] + 140509587561200 -> 140509587561872 + 140509590859184 [label="encoder.layer.5.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590859184 -> 140509587561200 + 140509587561200 [label=AccumulateGrad] + 140509587560720 -> 140509587561008 + 140509587560720 [label=TBackward0] + 140509587561248 -> 140509587560720 + 140509587561248 [label=ToCopyBackward0] + 140509587561632 -> 140509587561248 + 140509590857424 [label="encoder.layer.5.experts.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590857424 -> 140509587561632 + 140509587561632 [label=AccumulateGrad] + 140509587560288 -> 140509587560480 + 140509587560288 [label=TBackward0] + 
140509587560960 -> 140509587560288 + 140509587560960 [label=ToCopyBackward0] + 140509587561440 -> 140509587560960 + 140509590857184 [label="encoder.layer.5.experts.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590857184 -> 140509587561440 + 140509587561440 [label=AccumulateGrad] + 140509587560192 -> 140509587560048 + 140509587560000 -> 140509587559904 + 140509590856944 [label="encoder.layer.5.experts.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590856944 -> 140509587560000 + 140509587560000 [label=AccumulateGrad] + 140509587559952 -> 140509587559904 + 140509590857024 [label="encoder.layer.5.experts.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590857024 -> 140509587559952 + 140509587559952 [label=AccumulateGrad] + 140509587559664 -> 140509587850432 + 140509587559664 [label=NativeLayerNormBackward0] + 140509587560336 -> 140509587559664 + 140509587560336 [label=AddBackward0] + 140509587561152 -> 140509587560336 + 140509587561152 [label=NativeDropoutBackward0] + 140509587560864 -> 140509587561152 + 140509587560864 [label=ViewBackward0] + 140509587561392 -> 140509587560864 + 140509587561392 [label=AddmmBackward0] + 140509587562064 -> 140509587561392 + 140509587562064 [label=ToCopyBackward0] + 140509587562352 -> 140509587562064 + 140509590858704 [label="encoder.layer.5.output.dense.bias + (768)" fillcolor=lightblue] + 140509590858704 -> 140509587562352 + 140509587562352 [label=AccumulateGrad] + 140509587562016 -> 140509587561392 + 140509587562016 [label=ViewBackward0] + 140509587571280 -> 140509587562016 + 140509587571280 [label=GeluBackward0] + 140509587570848 -> 140509587571280 + 140509587570848 [label=ViewBackward0] + 140509587571904 -> 140509587570848 + 140509587571904 [label=AddmmBackward0] + 140509587572288 -> 140509587571904 + 140509587572288 [label=ToCopyBackward0] + 140509587591520 -> 140509587572288 + 140509590858944 [label="encoder.layer.5.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509590858944 -> 140509587591520 + 140509587591520 [label=AccumulateGrad] + 140509587572096 -> 140509587571904 + 140509587572096 [label=ViewBackward0] + 140517615594320 -> 140509587572096 + 140517615594320 [label=ToCopyBackward0] + 140509587560672 -> 140517615594320 + 140509587560672 [label=SliceBackward0] + 140517615594368 -> 140509587560672 + 140517615594368 [label=SliceBackward0] + 140517615594464 -> 140517615594368 + 140517615594464 [label=SliceBackward0] + 140509587561872 -> 140517615594464 + 140509587571088 -> 140509587571904 + 140509587571088 [label=TBackward0] + 140517615594080 -> 140509587571088 + 140517615594080 [label=ToCopyBackward0] + 140517615594560 -> 140517615594080 + 140509590858864 [label="encoder.layer.5.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590858864 -> 140517615594560 + 140517615594560 [label=AccumulateGrad] + 140509587561824 -> 140509587561392 + 140509587561824 [label=TBackward0] + 140509587571040 -> 140509587561824 + 140509587571040 [label=ToCopyBackward0] + 140509587572624 -> 140509587571040 + 140509590858624 [label="encoder.layer.5.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590858624 -> 140509587572624 + 140509587572624 [label=AccumulateGrad] + 140509587560672 -> 140509587560336 + 140509587560144 -> 140509587559664 + 140509590858384 [label="encoder.layer.5.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590858384 -> 140509587560144 + 140509587560144 [label=AccumulateGrad] + 140509587560096 -> 140509587559664 + 140509590858464 
[label="encoder.layer.5.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590858464 -> 140509587560096 + 140509587560096 [label=AccumulateGrad] + 140509587558944 -> 140509587559424 + 140509587558944 [label=TBackward0] + 140509587559616 -> 140509587558944 + 140509587559616 [label=ToCopyBackward0] + 140509587560624 -> 140509587559616 + 140509590856704 [label="encoder.layer.6.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590856704 -> 140509587560624 + 140509587560624 [label=AccumulateGrad] + 140509587558848 -> 140509587558800 + 140509587558848 [label=ReshapeAliasBackward0] + 140509587559184 -> 140509587558848 + 140509587559184 [label=ExpandBackward0] + 140509587559376 -> 140509587559184 + 140509587559376 [label=TransposeBackward0] + 140509587559856 -> 140509587559376 + 140509587559856 [label=PermuteBackward0] + 140509587561728 -> 140509587559856 + 140509587561728 [label=ViewBackward0] + 140509587559808 -> 140509587561728 + 140509587559808 [label=ViewBackward0] + 140509587562256 -> 140509587559808 + 140509587562256 [label=AddmmBackward0] + 140509587571712 -> 140509587562256 + 140509587571712 [label=ToCopyBackward0] + 140517615594272 -> 140509587571712 + 140509590856544 [label="encoder.layer.6.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509590856544 -> 140517615594272 + 140517615594272 [label=AccumulateGrad] + 140509587570800 -> 140509587562256 + 140509587570800 [label=ViewBackward0] + 140517615594608 -> 140509587570800 + 140517615594608 [label=ToCopyBackward0] + 140509587850432 -> 140517615594608 + 140517615594176 -> 140509587562256 + 140517615594176 [label=TBackward0] + 140517615594224 -> 140517615594176 + 140517615594224 [label=ToCopyBackward0] + 140517615594752 -> 140517615594224 + 140509590856464 [label="encoder.layer.6.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509590856464 -> 140517615594752 + 140517615594752 [label=AccumulateGrad] + 140509587849376 -> 140509587849520 + 140509587849376 [label=ReshapeAliasBackward0] + 140509587853120 -> 140509587849376 + 140509587853120 [label=ExpandBackward0] + 140509587853216 -> 140509587853120 + 140509587853216 [label=PermuteBackward0] + 140509587558656 -> 140509587853216 + 140509587558656 [label=ViewBackward0] + 140509587558464 -> 140509587558656 + 140509587558464 [label=ViewBackward0] + 140509587559280 -> 140509587558464 + 140509587559280 [label=AddmmBackward0] + 140509587560432 -> 140509587559280 + 140509587560432 [label=ToCopyBackward0] + 140509587558992 -> 140509587560432 + 140509590856304 [label="encoder.layer.6.attention.self.value.bias + (768)" fillcolor=lightblue] + 140509590856304 -> 140509587558992 + 140509587558992 [label=AccumulateGrad] + 140509587559568 -> 140509587559280 + 140509587559568 [label=ViewBackward0] + 140517615594512 -> 140509587559568 + 140517615594512 [label=ToCopyBackward0] + 140509587850432 -> 140517615594512 + 140509587558512 -> 140509587559280 + 140509587558512 [label=TBackward0] + 140517615594416 -> 140509587558512 + 140517615594416 [label=ToCopyBackward0] + 140517615594656 -> 140517615594416 + 140509590856224 [label="encoder.layer.6.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509590856224 -> 140517615594656 + 140517615594656 [label=AccumulateGrad] + 140509587850336 -> 140509587850144 + 140509587850336 [label=TBackward0] + 140509587849664 -> 140509587850336 + 140509587849664 [label=ToCopyBackward0] + 140509587849472 -> 140509587849664 + 140509590855984 [label="encoder.layer.6.attention.output.dense.weight + (768, 768)" 
fillcolor=lightblue] + 140509590855984 -> 140509587849472 + 140509587849472 [label=AccumulateGrad] + 140509587850432 -> 140509587850672 + 140509587850624 -> 140509587850768 + 140509590855744 [label="encoder.layer.6.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590855744 -> 140509587850624 + 140509587850624 [label=AccumulateGrad] + 140509587851248 -> 140509587850768 + 140509590855824 [label="encoder.layer.6.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590855824 -> 140509587851248 + 140509587851248 [label=AccumulateGrad] + 140509587851920 -> 140509587851440 + 140509587851920 [label=TBackward0] + 140509587851104 -> 140509587851920 + 140509587851104 [label=ToCopyBackward0] + 140509587850720 -> 140509587851104 + 140509590843120 [label="encoder.layer.6.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590843120 -> 140509587850720 + 140509587850720 [label=AccumulateGrad] + 140509587852016 -> 140509587851968 + 140509587852016 [label=ReshapeAliasBackward0] + 140509587851584 -> 140509587852016 + 140509587851584 [label=ExpandBackward0] + 140509587851392 -> 140509587851584 + 140509587851392 [label=TransposeBackward0] + 140509587850912 -> 140509587851392 + 140509587850912 [label=PermuteBackward0] + 140509587850576 -> 140509587850912 + 140509587850576 [label=ViewBackward0] + 140509587851056 -> 140509587850576 + 140509587851056 [label=ViewBackward0] + 140509587850288 -> 140509587851056 + 140509587850288 [label=AddmmBackward0] + 140509587849952 -> 140509587850288 + 140509587849952 [label=ToCopyBackward0] + 140509587849760 -> 140509587849952 + 140509590842960 [label="encoder.layer.6.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140509590842960 -> 140509587849760 + 140509587849760 [label=AccumulateGrad] + 140509587850096 -> 140509587850288 + 140509587850096 [label=ViewBackward0] + 140509587853024 -> 140509587850096 + 140509587853024 [label=ToCopyBackward0] + 140517615539152 -> 140509587853024 + 140509587851776 -> 140509587850288 + 140509587851776 [label=TBackward0] + 140509587850000 -> 140509587851776 + 140509587850000 [label=ToCopyBackward0] + 140509587559088 -> 140509587850000 + 140509590842880 [label="encoder.layer.6.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140509590842880 -> 140509587559088 + 140509587559088 [label=AccumulateGrad] + 140509587852880 -> 140509587695984 + 140509587852880 [label=ReshapeAliasBackward0] + 140509587852592 -> 140509587852880 + 140509587852592 [label=ExpandBackward0] + 140509587852400 -> 140509587852592 + 140509587852400 [label=PermuteBackward0] + 140509587852208 -> 140509587852400 + 140509587852208 [label=ViewBackward0] + 140509587852736 -> 140509587852208 + 140509587852736 [label=ViewBackward0] + 140509587851488 -> 140509587852736 + 140509587851488 [label=AddmmBackward0] + 140509587850816 -> 140509587851488 + 140509587850816 [label=ToCopyBackward0] + 140509587849328 -> 140509587850816 + 140509590842720 [label="encoder.layer.6.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140509590842720 -> 140509587849328 + 140509587849328 [label=AccumulateGrad] + 140509587851200 -> 140509587851488 + 140509587851200 [label=ViewBackward0] + 140509587850384 -> 140509587851200 + 140509587850384 [label=ToCopyBackward0] + 140517615539152 -> 140509587850384 + 140509587852784 -> 140509587851488 + 140509587852784 [label=TBackward0] + 140509587849568 -> 140509587852784 + 140509587849568 [label=ToCopyBackward0] + 140509587561536 -> 140509587849568 + 140509590842640 
[label="encoder.layer.6.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140509590842640 -> 140509587561536 + 140509587561536 [label=AccumulateGrad] + 140509587695216 -> 140509587695600 + 140509587695216 [label=TBackward0] + 140509587697520 -> 140509587695216 + 140509587697520 [label=ToCopyBackward0] + 140509587695552 -> 140509587697520 + 140509590842400 [label="encoder.layer.6.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509590842400 -> 140509587695552 + 140509587695552 [label=AccumulateGrad] + 140509587695120 -> 140509587694832 + 140509587696080 -> 140509587694592 + 140509590842160 [label="encoder.layer.6.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590842160 -> 140509587696080 + 140509587696080 [label=AccumulateGrad] + 140509587697040 -> 140509587694592 + 140509590842240 [label="encoder.layer.6.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590842240 -> 140509587697040 + 140509587697040 [label=AccumulateGrad] + 140509587697328 -> 140509587696464 + 140509587697328 [label=TBackward0] + 140509587693632 -> 140509587697328 + 140509587693632 [label=ToCopyBackward0] + 140509587694256 -> 140509587693632 + 140509590826016 [label="encoder.layer.6.experts.experts.0.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590826016 -> 140509587694256 + 140509587694256 [label=AccumulateGrad] + 140509588196464 -> 140509588196752 + 140509588196464 [label=TBackward0] + 140509588197136 -> 140509588196464 + 140509588197136 [label=ToCopyBackward0] + 140509587693968 -> 140509588197136 + 140509590826176 [label="encoder.layer.6.experts.experts.0.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590826176 -> 140509587693968 + 140509587693968 [label=AccumulateGrad] + 140509588196272 -> 140509588195888 + 140509588195984 -> 140509588195696 + 140509590825696 [label="encoder.layer.6.experts.experts.0.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590825696 -> 140509588195984 + 140509588195984 [label=AccumulateGrad] + 140509588195456 -> 140509588195696 + 140509590826496 [label="encoder.layer.6.experts.experts.0.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590826496 -> 140509588195456 + 140509588195456 [label=AccumulateGrad] + 140509588195408 -> 140509588195216 + 140509588195408 [label=UnsqueezeBackward0] + 140509588195936 -> 140509588195408 + 140509588195936 [label=NativeLayerNormBackward0] + 140509588196416 -> 140509588195936 + 140509588196416 [label=AddBackward0] + 140509587694640 -> 140509588196416 + 140509587694640 [label=NativeDropoutBackward0] + 140509587697424 -> 140509587694640 + 140509587697424 [label=ViewBackward0] + 140509587693776 -> 140509587697424 + 140509587693776 [label=AddmmBackward0] + 140509587694928 -> 140509587693776 + 140509587694928 [label=ToCopyBackward0] + 140509587696848 -> 140509587694928 + 140509590825936 [label="encoder.layer.6.experts.experts.1.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509590825936 -> 140509587696848 + 140509587696848 [label=AccumulateGrad] + 140509587694736 -> 140509587693776 + 140509587694736 [label=ViewBackward0] + 140509587695888 -> 140509587694736 + 140509587695888 [label=GeluBackward0] + 140509587696176 -> 140509587695888 + 140509587696176 [label=ViewBackward0] + 140509587695504 -> 140509587696176 + 140509587695504 [label=AddmmBackward0] + 140509587852304 -> 140509587695504 + 140509587852304 [label=ToCopyBackward0] + 140509587850528 -> 140509587852304 + 
140509590825456 [label="encoder.layer.6.experts.experts.1.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509590825456 -> 140509587850528 + 140509587850528 [label=AccumulateGrad] + 140509587852496 -> 140509587695504 + 140509587852496 [label=ViewBackward0] + 140509587558560 -> 140509587852496 + 140509587558560 [label=ToCopyBackward0] + 140509588196272 -> 140509587558560 + 140509587852688 -> 140509587695504 + 140509587852688 [label=TBackward0] + 140509587851680 -> 140509587852688 + 140509587851680 [label=ToCopyBackward0] + 140517615594800 -> 140509587851680 + 140509590825536 [label="encoder.layer.6.experts.experts.1.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590825536 -> 140517615594800 + 140517615594800 [label=AccumulateGrad] + 140509587697136 -> 140509587693776 + 140509587697136 [label=TBackward0] + 140509587695312 -> 140509587697136 + 140509587695312 [label=ToCopyBackward0] + 140509587558752 -> 140509587695312 + 140509590825296 [label="encoder.layer.6.experts.experts.1.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590825296 -> 140509587558752 + 140509587558752 [label=AccumulateGrad] + 140509588196272 -> 140509588196416 + 140509588196368 -> 140509588195936 + 140509590825056 [label="encoder.layer.6.experts.experts.1.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590825056 -> 140509588196368 + 140509588196368 [label=AccumulateGrad] + 140509588195792 -> 140509588195936 + 140509590824976 [label="encoder.layer.6.experts.experts.1.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590824976 -> 140509588195792 + 140509588195792 [label=AccumulateGrad] + 140509588195312 -> 140509588194976 + 140509588195312 [label=UnsqueezeBackward0] + 140509588196848 -> 140509588195312 + 140509588196848 [label=UnsqueezeBackward0] + 140509588195504 -> 140509588196848 + 140509588195504 [label=MulBackward0] + 140509587695024 -> 140509588195504 + 140509587695024 [label=ViewBackward0] + 140509587696656 -> 140509587695024 + 140509587696656 [label=CloneBackward0] + 140509587852832 -> 140509587696656 + 140509587852832 [label=ExpandBackward0] + 140517615594896 -> 140509587852832 + 140517615594896 [label=UnsqueezeBackward0] + 140517615594992 -> 140517615594896 + 140517615594992 [label=SoftmaxBackward0] + 140517615595088 -> 140517615594992 + 140517615595088 [label=MmBackward0] + 140517615595184 -> 140517615595088 + 140517615595184 [label=ToCopyBackward0] + 140517615595328 -> 140517615595184 + 140517615595328 [label=DivBackward0] + 140517615595424 -> 140517615595328 + 140517615595424 [label=SumBackward1] + 140517615595472 -> 140517615595424 + 140517615595472 [label=MulBackward0] + 140509587694352 -> 140517615595472 + 140517615595136 -> 140517615595088 + 140517615595136 [label=TBackward0] + 140517615595232 -> 140517615595136 + 140517615595232 [label=ToCopyBackward0] + 140517615595280 -> 140517615595232 + 140509590839840 [label="encoder.layer.6.experts.gate.weight + (2, 768)" fillcolor=lightblue] + 140509590839840 -> 140517615595280 + 140517615595280 [label=AccumulateGrad] + 140509588194448 -> 140509588165008 + 140509588194448 [label=ViewBackward0] + 140509588196080 -> 140509588194448 + 140509588196080 [label=CloneBackward0] + 140509588195120 -> 140509588196080 + 140509588195120 [label=ExpandBackward0] + 140509587852112 -> 140509588195120 + 140509587852112 [label=UnsqueezeBackward0] + 140509587694160 -> 140509587852112 + 140509587694160 [label=NativeLayerNormBackward0] + 140517615594848 -> 140509587694160 + 
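+ // annotation (added): the MmBackward0/SoftmaxBackward0 chain just above is the
+ // two-expert PromptMoE gate of layer 6 -- token states are (apparently mask-weighted,
+ // MulBackward0) pooled via SumBackward1 -> DivBackward0 (i.e. a mean), projected by
+ // experts.gate.weight (2, 768), and softmaxed; the raw gate probabilities ("RawProb"
+ // in the file name) are then expanded and multiplied onto the stacked expert outputs.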
140517615594848 [label=AddBackward0] + 140517615726656 -> 140517615594848 + 140517615726656 [label=NativeDropoutBackward0] + 140517615726896 -> 140517615726656 + 140517615726896 [label=ViewBackward0] + 140517615726992 -> 140517615726896 + 140517615726992 [label=AddmmBackward0] + 140517615727088 -> 140517615726992 + 140517615727088 [label=ToCopyBackward0] + 140517615727280 -> 140517615727088 + 140509590841760 [label="encoder.layer.6.output.dense.bias + (768)" fillcolor=lightblue] + 140509590841760 -> 140517615727280 + 140517615727280 [label=AccumulateGrad] + 140517615727040 -> 140517615726992 + 140517615727040 [label=ViewBackward0] + 140517615727328 -> 140517615727040 + 140517615727328 [label=GeluBackward0] + 140517615727424 -> 140517615727328 + 140517615727424 [label=ViewBackward0] + 140517615727520 -> 140517615727424 + 140517615727520 [label=AddmmBackward0] + 140517615727616 -> 140517615727520 + 140517615727616 [label=ToCopyBackward0] + 140517615727808 -> 140517615727616 + 140509590842000 [label="encoder.layer.6.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509590842000 -> 140517615727808 + 140517615727808 [label=AccumulateGrad] + 140517615727568 -> 140517615727520 + 140517615727568 [label=ViewBackward0] + 140517615727856 -> 140517615727568 + 140517615727856 [label=ToCopyBackward0] + 140517615726800 -> 140517615727856 + 140517615726800 [label=SliceBackward0] + 140517615728000 -> 140517615726800 + 140517615728000 [label=SliceBackward0] + 140517615728096 -> 140517615728000 + 140517615728096 [label=SliceBackward0] + 140509587850768 -> 140517615728096 + 140517615727232 -> 140517615727520 + 140517615727232 [label=TBackward0] + 140517615727760 -> 140517615727232 + 140517615727760 [label=ToCopyBackward0] + 140517615728192 -> 140517615727760 + 140509590841920 [label="encoder.layer.6.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509590841920 -> 140517615728192 + 140517615728192 [label=AccumulateGrad] + 140517615726752 -> 140517615726992 + 140517615726752 [label=TBackward0] + 140517615727472 -> 140517615726752 + 140517615727472 [label=ToCopyBackward0] + 140517615727952 -> 140517615727472 + 140509590841680 [label="encoder.layer.6.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590841680 -> 140517615727952 + 140517615727952 [label=AccumulateGrad] + 140517615726800 -> 140517615594848 + 140517615595040 -> 140509587694160 + 140509590841440 [label="encoder.layer.6.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590841440 -> 140517615595040 + 140517615595040 [label=AccumulateGrad] + 140517615594944 -> 140509587694160 + 140509590841520 [label="encoder.layer.6.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590841520 -> 140517615594944 + 140517615594944 [label=AccumulateGrad] + 140509588193344 -> 140509588194160 + 140509588193344 [label=TBackward0] + 140509588194544 -> 140509588193344 + 140509588194544 [label=ToCopyBackward0] + 140509588194928 -> 140509588194544 + 140509590840000 [label="encoder.layer.7.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590840000 -> 140509588194928 + 140509588194928 [label=AccumulateGrad] + 140509588168464 -> 140509588168176 + 140509588168464 [label=UnsafeViewBackward0] + 140509588168560 -> 140509588168464 + 140509588168560 [label=CloneBackward0] + 140509588193776 -> 140509588168560 + 140509588193776 [label=ExpandBackward0] + 140509588194256 -> 140509588193776 + 140509588194256 [label=TransposeBackward0] + 140509588194832 -> 140509588194256 + 140509588194832 
[label=PermuteBackward0] + 140509587694448 -> 140509588194832 + 140509587694448 [label=ViewBackward0] + 140517615595376 -> 140509587694448 + 140517615595376 [label=ViewBackward0] + 140509588193392 -> 140517615595376 + 140509588193392 [label=AddmmBackward0] + 140517615727136 -> 140509588193392 + 140517615727136 [label=ToCopyBackward0] + 140517615728048 -> 140517615727136 + 140509590840560 [label="encoder.layer.7.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509590840560 -> 140517615728048 + 140517615728048 [label=AccumulateGrad] + 140517615726944 -> 140509588193392 + 140517615726944 [label=ViewBackward0] + 140517615727376 -> 140517615726944 + 140517615727376 [label=ToCopyBackward0] + 140509588165008 -> 140517615727376 + 140517615726704 -> 140509588193392 + 140517615726704 [label=TBackward0] + 140517615727664 -> 140517615726704 + 140517615727664 [label=ToCopyBackward0] + 140517615728240 -> 140517615727664 + 140509590840240 [label="encoder.layer.7.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509590840240 -> 140517615728240 + 140517615728240 [label=AccumulateGrad] + 140509588166736 -> 140509588166832 + 140509588166736 [label=UnsafeViewBackward0] + 140509588167504 -> 140509588166736 + 140509588167504 [label=CloneBackward0] + 140509588167792 -> 140509588167504 + 140509588167792 [label=ExpandBackward0] + 140509588168080 -> 140509588167792 + 140509588168080 [label=PermuteBackward0] + 140509588166928 -> 140509588168080 + 140509588166928 [label=ViewBackward0] + 140509588167120 -> 140509588166928 + 140509588167120 [label=ViewBackward0] + 140509588194736 -> 140509588167120 + 140509588194736 [label=AddmmBackward0] + 140517615594704 -> 140509588194736 + 140517615594704 [label=ToCopyBackward0] + 140517615727712 -> 140517615594704 + 140509590839760 [label="encoder.layer.7.attention.self.value.bias + (768)" fillcolor=lightblue] + 140509590839760 -> 140517615727712 + 140517615727712 [label=AccumulateGrad] + 140509587695792 -> 140509588194736 + 140509587695792 [label=ViewBackward0] + 140517615728336 -> 140509587695792 + 140517615728336 [label=ToCopyBackward0] + 140509588165008 -> 140517615728336 + 140509588193488 -> 140509588194736 + 140509588193488 [label=TBackward0] + 140517615727184 -> 140509588193488 + 140517615727184 [label=ToCopyBackward0] + 140517615728384 -> 140517615727184 + 140509590840480 [label="encoder.layer.7.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509590840480 -> 140517615728384 + 140517615728384 [label=AccumulateGrad] + 140509588165056 -> 140509588165488 + 140509588165056 [label=TBackward0] + 140509588166256 -> 140509588165056 + 140509588166256 [label=ToCopyBackward0] + 140509588166496 -> 140509588166256 + 140509590839600 [label="encoder.layer.7.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509590839600 -> 140509588166496 + 140509588166496 [label=AccumulateGrad] + 140509588165008 -> 140509588164912 + 140509588164720 -> 140509588139888 + 140509590839520 [label="encoder.layer.7.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590839520 -> 140509588164720 + 140509588164720 [label=AccumulateGrad] + 140509588164672 -> 140509588139888 + 140509985419152 [label="encoder.layer.7.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509985419152 -> 140509588164672 + 140509588164672 [label=AccumulateGrad] + 140509588138160 -> 140509588138640 + 140509588138160 [label=TBackward0] + 140509588138928 -> 140509588138160 + 140509588138928 [label=ToCopyBackward0] + 140509588139456 -> 
140509588138928 + 140509591342032 [label="encoder.layer.7.experts.experts.0.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591342032 -> 140509588139456 + 140509588139456 [label=AccumulateGrad] + 140509588137296 -> 140509588137536 + 140509588137296 [label=TBackward0] + 140509588138448 -> 140509588137296 + 140509588138448 [label=ToCopyBackward0] + 140509588139216 -> 140509588138448 + 140509591341712 [label="encoder.layer.7.experts.experts.0.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591341712 -> 140509588139216 + 140509588139216 [label=AccumulateGrad] + 140509588137056 -> 140509588137104 + 140509588136816 -> 140509588136912 + 140509591341472 [label="encoder.layer.7.experts.experts.0.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591341472 -> 140509588136816 + 140509588136816 [label=AccumulateGrad] + 140509588136720 -> 140509588136912 + 140509591341792 [label="encoder.layer.7.experts.experts.0.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591341792 -> 140509588136720 + 140509588136720 [label=AccumulateGrad] + 140509588136624 -> 140509588136432 + 140509588136624 [label=UnsqueezeBackward0] + 140509588137200 -> 140509588136624 + 140509588137200 [label=NativeLayerNormBackward0] + 140509588137680 -> 140509588137200 + 140509588137680 [label=AddBackward0] + 140509588139024 -> 140509588137680 + 140509588139024 [label=NativeDropoutBackward0] + 140509588138256 -> 140509588139024 + 140509588138256 [label=ViewBackward0] + 140509588139312 -> 140509588138256 + 140509588139312 [label=AddmmBackward0] + 140509588137968 -> 140509588139312 + 140509588137968 [label=ToCopyBackward0] + 140509588165776 -> 140509588137968 + 140509591342192 [label="encoder.layer.7.experts.experts.1.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509591342192 -> 140509588165776 + 140509588165776 [label=AccumulateGrad] + 140509588165104 -> 140509588139312 + 140509588165104 [label=ViewBackward0] + 140509588166448 -> 140509588165104 + 140509588166448 [label=GeluBackward0] + 140509588166064 -> 140509588166448 + 140509588166064 [label=ViewBackward0] + 140509588167600 -> 140509588166064 + 140509588167600 [label=AddmmBackward0] + 140509588168272 -> 140509588167600 + 140509588168272 [label=ToCopyBackward0] + 140509588193968 -> 140509588168272 + 140509591341552 [label="encoder.layer.7.experts.experts.1.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509591341552 -> 140509588193968 + 140509588193968 [label=AccumulateGrad] + 140509588167984 -> 140509588167600 + 140509588167984 [label=ViewBackward0] + 140517615727904 -> 140509588167984 + 140517615727904 [label=ToCopyBackward0] + 140509588137056 -> 140517615727904 + 140509588165872 -> 140509588167600 + 140509588165872 [label=TBackward0] + 140517615726848 -> 140509588165872 + 140517615726848 [label=ToCopyBackward0] + 140517615728288 -> 140517615726848 + 140509591341232 [label="encoder.layer.7.experts.experts.1.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591341232 -> 140517615728288 + 140517615728288 [label=AccumulateGrad] + 140509588164816 -> 140509588139312 + 140509588164816 [label=TBackward0] + 140509588166016 -> 140509588164816 + 140509588166016 [label=ToCopyBackward0] + 140509588193536 -> 140509588166016 + 140509591340992 [label="encoder.layer.7.experts.experts.1.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591340992 -> 140509588193536 + 140509588193536 [label=AccumulateGrad] + 140509588137056 -> 140509588137680 + 
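+ // annotation (added): unlike layer 5 above, whose experts module is a single
+ // query-FFN (experts.output_query.*), layers 6-10 each carry the two-expert block
+ // seen here (experts.experts.0 / experts.experts.1 plus a per-layer gate); only the
+ // parameter names and tensor ids change from layer to layer.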
140509588137584 -> 140509588137200 + 140509591340752 [label="encoder.layer.7.experts.experts.1.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591340752 -> 140509588137584 + 140509588137584 [label=AccumulateGrad] + 140509588136576 -> 140509588137200 + 140509591341072 [label="encoder.layer.7.experts.experts.1.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591341072 -> 140509588136576 + 140509588136576 [label=AccumulateGrad] + 140509588136096 -> 140509588136240 + 140509588136096 [label=UnsqueezeBackward0] + 140509588138064 -> 140509588136096 + 140509588138064 [label=UnsqueezeBackward0] + 140509588139408 -> 140509588138064 + 140509588139408 [label=MulBackward0] + 140509588139696 -> 140509588139408 + 140509588139696 [label=SoftmaxBackward0] + 140509588167312 -> 140509588139696 + 140509588167312 [label=MmBackward0] + 140509588165392 -> 140509588167312 + 140509588165392 [label=ToCopyBackward0] + 140517615728480 -> 140509588165392 + 140517615728480 [label=DivBackward0] + 140517615728672 -> 140517615728480 + 140517615728672 [label=SumBackward1] + 140517615728768 -> 140517615728672 + 140517615728768 [label=MulBackward0] + 140509588137056 -> 140517615728768 + 140517615728144 -> 140509588167312 + 140517615728144 [label=TBackward0] + 140517615728720 -> 140517615728144 + 140517615728720 [label=ToCopyBackward0] + 140517615728816 -> 140517615728720 + 140509590823376 [label="encoder.layer.7.experts.gate.weight + (2, 768)" fillcolor=lightblue] + 140509590823376 -> 140517615728816 + 140517615728816 [label=AccumulateGrad] + 140509588106928 -> 140509588077488 + 140509588106928 [label=IndexBackward0] + 140509588137008 -> 140509588106928 + 140509588137008 [label=NativeLayerNormBackward0] + 140509588136336 -> 140509588137008 + 140509588136336 [label=AddBackward0] + 140517615728864 -> 140509588136336 + 140517615728864 [label=NativeDropoutBackward0] + 140517615728528 -> 140517615728864 + 140517615728528 [label=ViewBackward0] + 140517615729008 -> 140517615728528 + 140517615729008 [label=AddmmBackward0] + 140517615729104 -> 140517615729008 + 140517615729104 [label=ToCopyBackward0] + 140517615729296 -> 140517615729104 + 140509590826656 [label="encoder.layer.7.output.dense.bias + (768)" fillcolor=lightblue] + 140509590826656 -> 140517615729296 + 140517615729296 [label=AccumulateGrad] + 140517615729056 -> 140517615729008 + 140517615729056 [label=ViewBackward0] + 140517615729344 -> 140517615729056 + 140517615729344 [label=GeluBackward0] + 140517615729440 -> 140517615729344 + 140517615729440 [label=ViewBackward0] + 140517615729536 -> 140517615729440 + 140517615729536 [label=AddmmBackward0] + 140517615729632 -> 140517615729536 + 140517615729632 [label=ToCopyBackward0] + 140517615729824 -> 140517615729632 + 140509590826896 [label="encoder.layer.7.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509590826896 -> 140517615729824 + 140517615729824 [label=AccumulateGrad] + 140517615729584 -> 140517615729536 + 140517615729584 [label=ViewBackward0] + 140517615729872 -> 140517615729584 + 140517615729872 [label=ToCopyBackward0] + 140517615728624 -> 140517615729872 + 140517615728624 [label=SliceBackward0] + 140517615730016 -> 140517615728624 + 140517615730016 [label=SliceBackward0] + 140517615730112 -> 140517615730016 + 140517615730112 [label=SliceBackward0] + 140509588139888 -> 140517615730112 + 140517615729248 -> 140517615729536 + 140517615729248 [label=TBackward0] + 140517615729776 -> 140517615729248 + 140517615729776 [label=ToCopyBackward0] + 140517615730208 -> 140517615729776 + 
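+ // annotation (added): the IndexBackward0 above shows layer 7's fused output
+ // (output.LayerNorm) being indexed before it feeds layer 8's self-attention --
+ // consistent with selecting or reordering token positions after the expert
+ // outputs are merged; the same IndexBackward0 step recurs after layers 8 and 9.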
140509985417872 [label="encoder.layer.7.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509985417872 -> 140517615730208 + 140517615730208 [label=AccumulateGrad] + 140517615728912 -> 140517615729008 + 140517615728912 [label=TBackward0] + 140517615729488 -> 140517615728912 + 140517615729488 [label=ToCopyBackward0] + 140517615729968 -> 140517615729488 + 140509590826416 [label="encoder.layer.7.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509590826416 -> 140517615729968 + 140517615729968 [label=AccumulateGrad] + 140517615728624 -> 140509588136336 + 140509588138736 -> 140509588137008 + 140509590826736 [label="encoder.layer.7.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590826736 -> 140509588138736 + 140509588138736 [label=AccumulateGrad] + 140509588136048 -> 140509588137008 + 140509590824496 [label="encoder.layer.7.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509590824496 -> 140509588136048 + 140509588136048 [label=AccumulateGrad] + 140509588105392 -> 140509588106352 + 140509588105392 [label=TBackward0] + 140509588106640 -> 140509588105392 + 140509588106640 [label=ToCopyBackward0] + 140509588165584 -> 140509588106640 + 140509590823616 [label="encoder.layer.8.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509590823616 -> 140509588165584 + 140509588165584 [label=AccumulateGrad] + 140509588105200 -> 140509588105296 + 140509588105200 [label=UnsafeViewBackward0] + 140509588136144 -> 140509588105200 + 140509588136144 [label=CloneBackward0] + 140509588106064 -> 140509588136144 + 140509588106064 [label=ExpandBackward0] + 140509588106448 -> 140509588106064 + 140509588106448 [label=TransposeBackward0] + 140509588107216 -> 140509588106448 + 140509588107216 [label=PermuteBackward0] + 140509588106880 -> 140509588107216 + 140509588106880 [label=ViewBackward0] + 140517615728960 -> 140509588106880 + 140517615728960 [label=ViewBackward0] + 140517615729200 -> 140517615728960 + 140517615729200 [label=AddmmBackward0] + 140517615729728 -> 140517615729200 + 140517615729728 [label=ToCopyBackward0] + 140517615729920 -> 140517615729728 + 140509590823776 [label="encoder.layer.8.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509590823776 -> 140517615729920 + 140517615729920 [label=AccumulateGrad] + 140517615729680 -> 140517615729200 + 140517615729680 [label=ViewBackward0] + 140517615730256 -> 140517615729680 + 140517615730256 [label=ToCopyBackward0] + 140509588077488 -> 140517615730256 + 140517615728432 -> 140517615729200 + 140517615728432 [label=TBackward0] + 140517615729392 -> 140517615728432 + 140517615729392 [label=ToCopyBackward0] + 140517615730400 -> 140517615729392 + 140509590823856 [label="encoder.layer.8.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509590823856 -> 140517615730400 + 140517615730400 [label=AccumulateGrad] + 140509588103856 -> 140509588103520 + 140509588103856 [label=UnsafeViewBackward0] + 140509588104240 -> 140509588103856 + 140509588104240 [label=CloneBackward0] + 140509588104480 -> 140509588104240 + 140509588104480 [label=ExpandBackward0] + 140509588104912 -> 140509588104480 + 140509588104912 [label=PermuteBackward0] + 140509588104048 -> 140509588104912 + 140509588104048 [label=ViewBackward0] + 140509588105968 -> 140509588104048 + 140509588105968 [label=ViewBackward0] + 140509588106736 -> 140509588105968 + 140509588106736 [label=AddmmBackward0] + 140509588105584 -> 140509588106736 + 140509588105584 [label=ToCopyBackward0] + 140517615730160 -> 140509588105584 + 140509590824016 
[label="encoder.layer.8.attention.self.value.bias + (768)" fillcolor=lightblue] + 140509590824016 -> 140517615730160 + 140517615730160 [label=AccumulateGrad] + 140509588103952 -> 140509588106736 + 140509588103952 [label=ViewBackward0] + 140517615730496 -> 140509588103952 + 140517615730496 [label=ToCopyBackward0] + 140509588077488 -> 140517615730496 + 140517615728576 -> 140509588106736 + 140517615728576 [label=TBackward0] + 140517615730064 -> 140517615728576 + 140517615730064 [label=ToCopyBackward0] + 140517615730544 -> 140517615730064 + 140509590824096 [label="encoder.layer.8.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509590824096 -> 140517615730544 + 140517615730544 [label=AccumulateGrad] + 140509588077584 -> 140509588077968 + 140509588077584 [label=TBackward0] + 140509588078256 -> 140509588077584 + 140509588078256 [label=ToCopyBackward0] + 140509588103664 -> 140509588078256 + 140509590823296 [label="encoder.layer.8.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509590823296 -> 140509588103664 + 140509588103664 [label=AccumulateGrad] + 140509588077488 -> 140509588076960 + 140509588077104 -> 140509588076912 + 140509590823136 [label="encoder.layer.8.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509590823136 -> 140509588077104 + 140509588077104 [label=AccumulateGrad] + 140509588076000 -> 140509588076912 + 140509591342912 [label="encoder.layer.8.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591342912 -> 140509588076000 + 140509588076000 [label=AccumulateGrad] + 140509588074800 -> 140509588075760 + 140509588074800 [label=TBackward0] + 140509588076336 -> 140509588074800 + 140509588076336 [label=ToCopyBackward0] + 140509588077008 -> 140509588076336 + 140509591342992 [label="encoder.layer.8.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509591342992 -> 140509588077008 + 140509588077008 [label=AccumulateGrad] + 140509588074704 -> 140509588074656 + 140509588074704 [label=UnsafeViewBackward0] + 140509588075376 -> 140509588074704 + 140509588075376 [label=CloneBackward0] + 140509588075664 -> 140509588075376 + 140509588075664 [label=ExpandBackward0] + 140509588076144 -> 140509588075664 + 140509588076144 [label=TransposeBackward0] + 140509588076816 -> 140509588076144 + 140509588076816 [label=PermuteBackward0] + 140509588077296 -> 140509588076816 + 140509588077296 [label=ViewBackward0] + 140509588077440 -> 140509588077296 + 140509588077440 [label=ViewBackward0] + 140509588077920 -> 140509588077440 + 140509588077920 [label=AddmmBackward0] + 140509588078544 -> 140509588077920 + 140509588078544 [label=ToCopyBackward0] + 140509588104432 -> 140509588078544 + 140509591342752 [label="encoder.layer.8.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140509591342752 -> 140509588104432 + 140509588104432 [label=AccumulateGrad] + 140509588075088 -> 140509588077920 + 140509588075088 [label=ViewBackward0] + 140509588104720 -> 140509588075088 + 140509588104720 [label=ToCopyBackward0] + 140509588105776 -> 140509588104720 + 140509588105776 [label=ViewBackward0] + 140509588106256 -> 140509588105776 + 140509588106256 [label=CloneBackward0] + 140517615730352 -> 140509588106256 + 140517615730352 [label=ExpandBackward0] + 140517615730592 -> 140517615730352 + 140517615730592 [label=UnsqueezeBackward0] + 140517615539152 -> 140517615730592 + 140509588103568 -> 140509588077920 + 140509588103568 [label=TBackward0] + 140509588103280 -> 140509588103568 + 140509588103280 [label=ToCopyBackward0] + 
140509588104960 -> 140509588103280 + 140509591342672 [label="encoder.layer.8.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140509591342672 -> 140509588104960 + 140509588104960 [label=AccumulateGrad] + 140509588048624 -> 140509588048432 + 140509588048624 [label=UnsafeViewBackward0] + 140509588048960 -> 140509588048624 + 140509588048960 [label=CloneBackward0] + 140509588049392 -> 140509588048960 + 140509588049392 [label=ExpandBackward0] + 140509588048816 -> 140509588049392 + 140509588048816 [label=PermuteBackward0] + 140509588048720 -> 140509588048816 + 140509588048720 [label=ViewBackward0] + 140509588075568 -> 140509588048720 + 140509588075568 [label=ViewBackward0] + 140509588076624 -> 140509588075568 + 140509588076624 [label=AddmmBackward0] + 140509588076432 -> 140509588076624 + 140509588076432 [label=ToCopyBackward0] + 140509588103376 -> 140509588076432 + 140509591340592 [label="encoder.layer.8.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140509591340592 -> 140509588103376 + 140509588103376 [label=AccumulateGrad] + 140509588077200 -> 140509588076624 + 140509588077200 [label=ViewBackward0] + 140509588104000 -> 140509588077200 + 140509588104000 [label=ToCopyBackward0] + 140509588105776 -> 140509588104000 + 140509588074608 -> 140509588076624 + 140509588074608 [label=TBackward0] + 140517615730640 -> 140509588074608 + 140517615730640 [label=ToCopyBackward0] + 140517615730448 -> 140517615730640 + 140509591342512 [label="encoder.layer.8.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140509591342512 -> 140517615730448 + 140517615730448 [label=AccumulateGrad] + 140509588047088 -> 140509588047376 + 140509588047088 [label=TBackward0] + 140509588048144 -> 140509588047088 + 140509588048144 [label=ToCopyBackward0] + 140509588048528 -> 140509588048144 + 140509591340832 [label="encoder.layer.8.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509591340832 -> 140509588048528 + 140509588048528 [label=AccumulateGrad] + 140509588046896 -> 140509588046608 + 140509588046320 -> 140509588046416 + 140509591340512 [label="encoder.layer.8.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591340512 -> 140509588046320 + 140509588046320 [label=AccumulateGrad] + 140509588045888 -> 140509588046416 + 140509591340272 [label="encoder.layer.8.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591340272 -> 140509588045888 + 140509588045888 [label=AccumulateGrad] + 140509588024432 -> 140509588024912 + 140509588024432 [label=TBackward0] + 140509588046128 -> 140509588024432 + 140509588046128 [label=ToCopyBackward0] + 140509588046512 -> 140509588046128 + 140509591319952 [label="encoder.layer.8.experts.experts.0.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591319952 -> 140509588046512 + 140509588046512 [label=AccumulateGrad] + 140509588023568 -> 140509588023856 + 140509588023568 [label=TBackward0] + 140509588024576 -> 140509588023568 + 140509588024576 [label=ToCopyBackward0] + 140509588025008 -> 140509588024576 + 140509591320032 [label="encoder.layer.8.experts.experts.0.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591320032 -> 140509588025008 + 140509588025008 [label=AccumulateGrad] + 140509588023376 -> 140509588023280 + 140509588023088 -> 140509588023184 + 140509591319792 [label="encoder.layer.8.experts.experts.0.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591319792 -> 140509588023088 + 140509588023088 
[label=AccumulateGrad] + 140509588022992 -> 140509588023184 + 140509591319712 [label="encoder.layer.8.experts.experts.0.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591319712 -> 140509588022992 + 140509588022992 [label=AccumulateGrad] + 140509588022800 -> 140509588022704 + 140509588022800 [label=UnsqueezeBackward0] + 140509588023472 -> 140509588022800 + 140509588023472 [label=NativeLayerNormBackward0] + 140509588023952 -> 140509588023472 + 140509588023952 [label=AddBackward0] + 140509588024528 -> 140509588023952 + 140509588024528 [label=NativeDropoutBackward0] + 140509588046032 -> 140509588024528 + 140509588046032 [label=ViewBackward0] + 140509588045936 -> 140509588046032 + 140509588045936 [label=AddmmBackward0] + 140509588047472 -> 140509588045936 + 140509588047472 [label=ToCopyBackward0] + 140509588047520 -> 140509588047472 + 140509591320512 [label="encoder.layer.8.experts.experts.1.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509591320512 -> 140509588047520 + 140509588047520 [label=AccumulateGrad] + 140509588047040 -> 140509588045936 + 140509588047040 [label=ViewBackward0] + 140509588048048 -> 140509588047040 + 140509588048048 [label=GeluBackward0] + 140509588049440 -> 140509588048048 + 140509588049440 [label=ViewBackward0] + 140509588048912 -> 140509588049440 + 140509588048912 [label=AddmmBackward0] + 140509588077680 -> 140509588048912 + 140509588077680 [label=ToCopyBackward0] + 140517615729152 -> 140509588077680 + 140509591319472 [label="encoder.layer.8.experts.experts.1.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509591319472 -> 140517615729152 + 140517615729152 [label=AccumulateGrad] + 140509588075856 -> 140509588048912 + 140509588075856 [label=ViewBackward0] + 140517615268000 -> 140509588075856 + 140517615268000 [label=ToCopyBackward0] + 140509588023376 -> 140517615268000 + 140509588074560 -> 140509588048912 + 140509588074560 [label=TBackward0] + 140517615267904 -> 140509588074560 + 140517615267904 [label=ToCopyBackward0] + 140517615268144 -> 140517615267904 + 140509591319552 [label="encoder.layer.8.experts.experts.1.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591319552 -> 140517615268144 + 140517615268144 [label=AccumulateGrad] + 140509588046992 -> 140509588045936 + 140509588046992 [label=TBackward0] + 140509588075184 -> 140509588046992 + 140509588075184 [label=ToCopyBackward0] + 140517615730304 -> 140509588075184 + 140509591319312 [label="encoder.layer.8.experts.experts.1.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591319312 -> 140517615730304 + 140517615730304 [label=AccumulateGrad] + 140509588023376 -> 140509588023952 + 140509588023760 -> 140509588023472 + 140509591319072 [label="encoder.layer.8.experts.experts.1.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591319072 -> 140509588023760 + 140509588023760 [label=AccumulateGrad] + 140509588022896 -> 140509588023472 + 140509591318992 [label="encoder.layer.8.experts.experts.1.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591318992 -> 140509588022896 + 140509588022896 [label=AccumulateGrad] + 140509588022416 -> 140509588022512 + 140509588022416 [label=UnsqueezeBackward0] + 140509588024240 -> 140509588022416 + 140509588024240 [label=UnsqueezeBackward0] + 140509588024096 -> 140509588024240 + 140509588024096 [label=MulBackward0] + 140509588047664 -> 140509588024096 + 140509588047664 [label=SoftmaxBackward0] + 140509588049200 -> 140509588047664 + 140509588049200 [label=MmBackward0] 
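+ // annotation (added): in the standard FFN path below (layer.8 intermediate/output.dense),
+ // the input arrives through a SliceBackward0 triple off the attention LayerNorm -- the
+ // same pattern as in layers 6, 7 and 9 -- consistent with only the text slice of the
+ // sequence passing through the plain FFN while the query tokens take the
+ // crossattention / experts (*_query) path.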
+ 140509588046080 -> 140509588049200 + 140509588046080 [label=ToCopyBackward0] + 140517615268048 -> 140509588046080 + 140517615268048 [label=DivBackward0] + 140517615268336 -> 140517615268048 + 140517615268336 [label=SumBackward1] + 140517615268432 -> 140517615268336 + 140517615268432 [label=MulBackward0] + 140509588023376 -> 140517615268432 + 140517615267952 -> 140509588049200 + 140517615267952 [label=TBackward0] + 140517615268384 -> 140517615267952 + 140517615268384 [label=ToCopyBackward0] + 140517615268480 -> 140517615268384 + 140509591321392 [label="encoder.layer.8.experts.gate.weight + (2, 768)" fillcolor=lightblue] + 140509591321392 -> 140517615268480 + 140517615268480 [label=AccumulateGrad] + 140509588021840 -> 140509587963664 + 140509588021840 [label=IndexBackward0] + 140509588023136 -> 140509588021840 + 140509588023136 [label=NativeLayerNormBackward0] + 140509588022608 -> 140509588023136 + 140509588022608 [label=AddBackward0] + 140517615268528 -> 140509588022608 + 140517615268528 [label=NativeDropoutBackward0] + 140517615268192 -> 140517615268528 + 140517615268192 [label=ViewBackward0] + 140517615268672 -> 140517615268192 + 140517615268672 [label=AddmmBackward0] + 140517615268768 -> 140517615268672 + 140517615268768 [label=ToCopyBackward0] + 140517615268960 -> 140517615268768 + 140509591339792 [label="encoder.layer.8.output.dense.bias + (768)" fillcolor=lightblue] + 140509591339792 -> 140517615268960 + 140517615268960 [label=AccumulateGrad] + 140517615268720 -> 140517615268672 + 140517615268720 [label=ViewBackward0] + 140517615269008 -> 140517615268720 + 140517615269008 [label=GeluBackward0] + 140517615269104 -> 140517615269008 + 140517615269104 [label=ViewBackward0] + 140517615269200 -> 140517615269104 + 140517615269200 [label=AddmmBackward0] + 140517615269296 -> 140517615269200 + 140517615269296 [label=ToCopyBackward0] + 140517615269488 -> 140517615269296 + 140509591340032 [label="encoder.layer.8.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509591340032 -> 140517615269488 + 140517615269488 [label=AccumulateGrad] + 140517615269248 -> 140517615269200 + 140517615269248 [label=ViewBackward0] + 140517615269536 -> 140517615269248 + 140517615269536 [label=ToCopyBackward0] + 140517615268288 -> 140517615269536 + 140517615268288 [label=SliceBackward0] + 140517615269680 -> 140517615268288 + 140517615269680 [label=SliceBackward0] + 140517615269776 -> 140517615269680 + 140517615269776 [label=SliceBackward0] + 140509588076912 -> 140517615269776 + 140517615268912 -> 140517615269200 + 140517615268912 [label=TBackward0] + 140517615269440 -> 140517615268912 + 140517615269440 [label=ToCopyBackward0] + 140517615269872 -> 140517615269440 + 140509591340352 [label="encoder.layer.8.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591340352 -> 140517615269872 + 140517615269872 [label=AccumulateGrad] + 140517615268576 -> 140517615268672 + 140517615268576 [label=TBackward0] + 140517615269152 -> 140517615268576 + 140517615269152 [label=ToCopyBackward0] + 140517615269632 -> 140517615269152 + 140509591340112 [label="encoder.layer.8.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591340112 -> 140517615269632 + 140517615269632 [label=AccumulateGrad] + 140517615268288 -> 140509588022608 + 140509588022176 -> 140509588023136 + 140509591339872 [label="encoder.layer.8.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591339872 -> 140509588022176 + 140509588022176 [label=AccumulateGrad] + 140509588046560 -> 140509588023136 + 140509591339552 
[label="encoder.layer.8.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591339552 -> 140509588046560 + 140509588046560 [label=AccumulateGrad] + 140509588021312 -> 140509587991520 + 140509588021312 [label=TBackward0] + 140509588021648 -> 140509588021312 + 140509588021648 [label=ToCopyBackward0] + 140509588048336 -> 140509588021648 + 140509591321632 [label="encoder.layer.9.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509591321632 -> 140509588048336 + 140509588048336 [label=AccumulateGrad] + 140509587991472 -> 140509587991568 + 140509587991472 [label=UnsafeViewBackward0] + 140509587992144 -> 140509587991472 + 140509587992144 [label=CloneBackward0] + 140509587992528 -> 140509587992144 + 140509587992528 [label=ExpandBackward0] + 140509587991856 -> 140509587992528 + 140509587991856 [label=TransposeBackward0] + 140509588022320 -> 140509587991856 + 140509588022320 [label=PermuteBackward0] + 140509588021936 -> 140509588022320 + 140509588021936 [label=ViewBackward0] + 140517615268624 -> 140509588021936 + 140517615268624 [label=ViewBackward0] + 140517615268864 -> 140517615268624 + 140517615268864 [label=AddmmBackward0] + 140517615269392 -> 140517615268864 + 140517615269392 [label=ToCopyBackward0] + 140517615269584 -> 140517615269392 + 140509591322192 [label="encoder.layer.9.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509591322192 -> 140517615269584 + 140517615269584 [label=AccumulateGrad] + 140517615269344 -> 140517615268864 + 140517615269344 [label=ViewBackward0] + 140517615269920 -> 140517615269344 + 140517615269920 [label=ToCopyBackward0] + 140509587963664 -> 140517615269920 + 140517615268096 -> 140517615268864 + 140517615268096 [label=TBackward0] + 140517615269056 -> 140517615268096 + 140517615269056 [label=ToCopyBackward0] + 140517615270064 -> 140517615269056 + 140509591321872 [label="encoder.layer.9.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509591321872 -> 140517615270064 + 140517615270064 [label=AccumulateGrad] + 140509587990128 -> 140509587989840 + 140509587990128 [label=UnsafeViewBackward0] + 140509587990512 -> 140509587990128 + 140509587990512 [label=CloneBackward0] + 140509587990800 -> 140509587990512 + 140509587990800 [label=ExpandBackward0] + 140509587991040 -> 140509587990800 + 140509587991040 [label=PermuteBackward0] + 140509587990224 -> 140509587991040 + 140509587990224 [label=ViewBackward0] + 140509587992336 -> 140509587990224 + 140509587992336 [label=ViewBackward0] + 140509587990080 -> 140509587992336 + 140509587990080 [label=AddmmBackward0] + 140509588021360 -> 140509587990080 + 140509588021360 [label=ToCopyBackward0] + 140517615269824 -> 140509588021360 + 140509591322432 [label="encoder.layer.9.attention.self.value.bias + (768)" fillcolor=lightblue] + 140509591322432 -> 140517615269824 + 140517615269824 [label=AccumulateGrad] + 140509588021744 -> 140509587990080 + 140509588021744 [label=ViewBackward0] + 140517615270160 -> 140509588021744 + 140517615270160 [label=ToCopyBackward0] + 140509587963664 -> 140517615270160 + 140517615268240 -> 140509587990080 + 140517615268240 [label=TBackward0] + 140517615269728 -> 140517615268240 + 140517615269728 [label=ToCopyBackward0] + 140517615270208 -> 140517615269728 + 140509591322112 [label="encoder.layer.9.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509591322112 -> 140517615270208 + 140517615270208 [label=AccumulateGrad] + 140509587988688 -> 140509587988784 + 140509587988688 [label=TBackward0] + 140509587989648 -> 140509587988688 + 140509587989648 
[label=ToCopyBackward0] + 140509587989936 -> 140509587989648 + 140509591321712 [label="encoder.layer.9.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509591321712 -> 140509587989936 + 140509587989936 [label=AccumulateGrad] + 140509587963664 -> 140509587963280 + 140509587963376 -> 140509587963040 + 140509591321232 [label="encoder.layer.9.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591321232 -> 140509587963376 + 140509587963376 [label=AccumulateGrad] + 140509587962032 -> 140509587963040 + 140509591321472 [label="encoder.layer.9.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591321472 -> 140509587962032 + 140509587962032 [label=AccumulateGrad] + 140509587961120 -> 140509587961600 + 140509587961120 [label=TBackward0] + 140509587962224 -> 140509587961120 + 140509587962224 [label=ToCopyBackward0] + 140509587962896 -> 140509587962224 + 140509591311760 [label="encoder.layer.9.experts.experts.0.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591311760 -> 140509587962896 + 140509587962896 [label=AccumulateGrad] + 140509587960688 -> 140509587960976 + 140509587960688 [label=TBackward0] + 140509587961744 -> 140509587960688 + 140509587961744 [label=ToCopyBackward0] + 140509587962608 -> 140509587961744 + 140509591311440 [label="encoder.layer.9.experts.experts.0.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591311440 -> 140509587962608 + 140509587962608 [label=AccumulateGrad] + 140509587960496 -> 140509587960112 + 140509587960208 -> 140509588463424 + 140509591311200 [label="encoder.layer.9.experts.experts.0.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591311200 -> 140509587960208 + 140509587960208 [label=AccumulateGrad] + 140509587960016 -> 140509588463424 + 140509591311520 [label="encoder.layer.9.experts.experts.0.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591311520 -> 140509587960016 + 140509587960016 [label=AccumulateGrad] + 140509588463376 -> 140509588463184 + 140509588463376 [label=UnsqueezeBackward0] + 140509587960160 -> 140509588463376 + 140509587960160 [label=NativeLayerNormBackward0] + 140509587960640 -> 140509587960160 + 140509587960640 [label=AddBackward0] + 140509587963184 -> 140509587960640 + 140509587963184 [label=NativeDropoutBackward0] + 140509587961648 -> 140509587963184 + 140509587961648 [label=ViewBackward0] + 140509587962320 -> 140509587961648 + 140509587962320 [label=AddmmBackward0] + 140509587963472 -> 140509587962320 + 140509587963472 [label=ToCopyBackward0] + 140509587989168 -> 140509587963472 + 140509591311920 [label="encoder.layer.9.experts.experts.1.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509591311920 -> 140509587989168 + 140509587989168 [label=AccumulateGrad] + 140509587963568 -> 140509587962320 + 140509587963568 [label=ViewBackward0] + 140509587989744 -> 140509587963568 + 140509587989744 [label=GeluBackward0] + 140509587989072 -> 140509587989744 + 140509587989072 [label=ViewBackward0] + 140509587990560 -> 140509587989072 + 140509587990560 [label=AddmmBackward0] + 140509587991280 -> 140509587990560 + 140509587991280 [label=ToCopyBackward0] + 140509588022224 -> 140509587991280 + 140509591311280 [label="encoder.layer.9.experts.experts.1.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509591311280 -> 140509588022224 + 140509588022224 [label=AccumulateGrad] + 140509587990992 -> 140509587990560 + 140509587990992 [label=ViewBackward0] + 140517615270016 -> 140509587990992 + 
140517615270016 [label=ToCopyBackward0] + 140509587960496 -> 140517615270016 + 140509587988880 -> 140509587990560 + 140509587988880 [label=TBackward0] + 140517615268816 -> 140509587988880 + 140517615268816 [label=ToCopyBackward0] + 140517615270112 -> 140517615268816 + 140509591310960 [label="encoder.layer.9.experts.experts.1.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591310960 -> 140517615270112 + 140517615270112 [label=AccumulateGrad] + 140509587961264 -> 140509587962320 + 140509587961264 [label=TBackward0] + 140509587989456 -> 140509587961264 + 140509587989456 [label=ToCopyBackward0] + 140509587992048 -> 140509587989456 + 140509591310720 [label="encoder.layer.9.experts.experts.1.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591310720 -> 140509587992048 + 140509587992048 [label=AccumulateGrad] + 140509587960496 -> 140509587960640 + 140509587960592 -> 140509587960160 + 140509591310480 [label="encoder.layer.9.experts.experts.1.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591310480 -> 140509587960592 + 140509587960592 [label=AccumulateGrad] + 140509587959920 -> 140509587960160 + 140509591310800 [label="encoder.layer.9.experts.experts.1.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591310800 -> 140509587959920 + 140509587959920 [label=AccumulateGrad] + 140509588463280 -> 140509588462944 + 140509588463280 [label=UnsqueezeBackward0] + 140509588463472 -> 140509588463280 + 140509588463472 [label=UnsqueezeBackward0] + 140509587962704 -> 140509588463472 + 140509587962704 [label=MulBackward0] + 140509587963856 -> 140509587962704 + 140509587963856 [label=SoftmaxBackward0] + 140509587990320 -> 140509587963856 + 140509587990320 [label=MmBackward0] + 140509587960304 -> 140509587990320 + 140509587960304 [label=ToCopyBackward0] + 140517615270304 -> 140509587960304 + 140517615270304 [label=DivBackward0] + 140517615270496 -> 140517615270304 + 140517615270496 [label=SumBackward1] + 140517615270592 -> 140517615270496 + 140517615270592 [label=MulBackward0] + 140509587960496 -> 140517615270592 + 140517615269968 -> 140509587990320 + 140517615269968 [label=TBackward0] + 140517615270544 -> 140517615269968 + 140517615270544 [label=ToCopyBackward0] + 140517615270640 -> 140517615270544 + 140509591313200 [label="encoder.layer.9.experts.gate.weight + (2, 768)" fillcolor=lightblue] + 140509591313200 -> 140517615270640 + 140517615270640 [label=AccumulateGrad] + 140509588462416 -> 140509588428880 + 140509588462416 [label=IndexBackward0] + 140509588462896 -> 140509588462416 + 140509588462896 [label=NativeLayerNormBackward0] + 140509587963088 -> 140509588462896 + 140509587963088 [label=AddBackward0] + 140517615270688 -> 140509587963088 + 140517615270688 [label=NativeDropoutBackward0] + 140517615270352 -> 140517615270688 + 140517615270352 [label=ViewBackward0] + 140517615270832 -> 140517615270352 + 140517615270832 [label=AddmmBackward0] + 140517615270928 -> 140517615270832 + 140517615270928 [label=ToCopyBackward0] + 140517615271120 -> 140517615270928 + 140509591320672 [label="encoder.layer.9.output.dense.bias + (768)" fillcolor=lightblue] + 140509591320672 -> 140517615271120 + 140517615271120 [label=AccumulateGrad] + 140517615270880 -> 140517615270832 + 140517615270880 [label=ViewBackward0] + 140517615271168 -> 140517615270880 + 140517615271168 [label=GeluBackward0] + 140517615271264 -> 140517615271168 + 140517615271264 [label=ViewBackward0] + 140517615271360 -> 140517615271264 + 140517615271360 [label=AddmmBackward0] + 
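+ // note (added; an assumption about provenance -- the file itself does not record it):
+ // a DOT dump of this shape is what torchviz produces, e.g.
+ //   from torchviz import make_dot
+ //   make_dot(output, params=dict(model.named_parameters())).save("Pre_PromptMoE_RawProb_backward_graph")
+ // lightblue boxes are leaf parameters (AccumulateGrad); the darkolivegreen1 box is the output tensor.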
140517615271456 -> 140517615271360 + 140517615271456 [label=ToCopyBackward0] + 140517615271648 -> 140517615271456 + 140509591320752 [label="encoder.layer.9.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509591320752 -> 140517615271648 + 140517615271648 [label=AccumulateGrad] + 140517615271408 -> 140517615271360 + 140517615271408 [label=ViewBackward0] + 140517615271696 -> 140517615271408 + 140517615271696 [label=ToCopyBackward0] + 140517615270448 -> 140517615271696 + 140517615270448 [label=SliceBackward0] + 140517615271840 -> 140517615270448 + 140517615271840 [label=SliceBackward0] + 140517615271888 -> 140517615271840 + 140517615271888 [label=SliceBackward0] + 140509587963040 -> 140517615271888 + 140517615271072 -> 140517615271360 + 140517615271072 [label=TBackward0] + 140517615271600 -> 140517615271072 + 140517615271600 [label=ToCopyBackward0] + 140517615271552 -> 140517615271600 + 140509591320912 [label="encoder.layer.9.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591320912 -> 140517615271552 + 140517615271552 [label=AccumulateGrad] + 140517615270736 -> 140517615270832 + 140517615270736 [label=TBackward0] + 140517615271312 -> 140517615270736 + 140517615271312 [label=ToCopyBackward0] + 140517615271792 -> 140517615271312 + 140509591320992 [label="encoder.layer.9.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591320992 -> 140517615271792 + 140517615271792 [label=AccumulateGrad] + 140517615270448 -> 140509587963088 + 140509587962128 -> 140509588462896 + 140509591320432 [label="encoder.layer.9.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591320432 -> 140509587962128 + 140509587962128 [label=AccumulateGrad] + 140509587961072 -> 140509588462896 + 140509591318592 [label="encoder.layer.9.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591318592 -> 140509587961072 + 140509587961072 [label=AccumulateGrad] + 140509588461168 -> 140509588462128 + 140509588461168 [label=TBackward0] + 140509588462512 -> 140509588461168 + 140509588462512 [label=ToCopyBackward0] + 140509587988592 -> 140509588462512 + 140509591313440 [label="encoder.layer.10.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509591313440 -> 140509587988592 + 140509587988592 [label=AccumulateGrad] + 140509588461072 -> 140509588460784 + 140509588461072 [label=UnsafeViewBackward0] + 140509588461456 -> 140509588461072 + 140509588461456 [label=CloneBackward0] + 140509588461744 -> 140509588461456 + 140509588461744 [label=ExpandBackward0] + 140509588462224 -> 140509588461744 + 140509588462224 [label=TransposeBackward0] + 140509588463088 -> 140509588462224 + 140509588463088 [label=PermuteBackward0] + 140509588462800 -> 140509588463088 + 140509588462800 [label=ViewBackward0] + 140517615270784 -> 140509588462800 + 140517615270784 [label=ViewBackward0] + 140517615271024 -> 140517615270784 + 140517615271024 [label=AddmmBackward0] + 140517615271744 -> 140517615271024 + 140517615271744 [label=ToCopyBackward0] + 140517615321248 -> 140517615271744 + 140509591313600 [label="encoder.layer.10.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509591313600 -> 140517615321248 + 140517615321248 [label=AccumulateGrad] + 140517615271504 -> 140517615271024 + 140517615271504 [label=ViewBackward0] + 140517615321296 -> 140517615271504 + 140517615321296 [label=ToCopyBackward0] + 140509588428880 -> 140517615321296 + 140517615270256 -> 140517615271024 + 140517615270256 [label=TBackward0] + 140517615321152 -> 140517615270256 + 140517615321152 
[label=ToCopyBackward0] + 140517615321440 -> 140517615321152 + 140509591313680 [label="encoder.layer.10.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509591313680 -> 140517615321440 + 140517615321440 [label=AccumulateGrad] + 140509588429936 -> 140509588430704 + 140509588429936 [label=UnsafeViewBackward0] + 140509588460112 -> 140509588429936 + 140509588460112 [label=CloneBackward0] + 140509588460400 -> 140509588460112 + 140509588460400 [label=ExpandBackward0] + 140509588460688 -> 140509588460400 + 140509588460688 [label=PermuteBackward0] + 140509588459632 -> 140509588460688 + 140509588459632 [label=ViewBackward0] + 140509588461504 -> 140509588459632 + 140509588461504 [label=ViewBackward0] + 140509588462704 -> 140509588461504 + 140509588462704 [label=AddmmBackward0] + 140509588461024 -> 140509588462704 + 140509588461024 [label=ToCopyBackward0] + 140517615271216 -> 140509588461024 + 140509591313840 [label="encoder.layer.10.attention.self.value.bias + (768)" fillcolor=lightblue] + 140509591313840 -> 140517615271216 + 140517615271216 [label=AccumulateGrad] + 140509588459680 -> 140509588462704 + 140509588459680 [label=ViewBackward0] + 140517615321536 -> 140509588459680 + 140517615321536 [label=ToCopyBackward0] + 140509588428880 -> 140517615321536 + 140517615270400 -> 140509588462704 + 140517615270400 [label=TBackward0] + 140517615321392 -> 140517615270400 + 140517615321392 [label=ToCopyBackward0] + 140517615321584 -> 140517615321392 + 140509591313920 [label="encoder.layer.10.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509591313920 -> 140517615321584 + 140517615321584 [label=AccumulateGrad] + 140509588428928 -> 140509588429360 + 140509588428928 [label=TBackward0] + 140509588430128 -> 140509588428928 + 140509588430128 [label=ToCopyBackward0] + 140509588430368 -> 140509588430128 + 140509591313120 [label="encoder.layer.10.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509591313120 -> 140509588430368 + 140509588430368 [label=AccumulateGrad] + 140509588428880 -> 140509588428784 + 140509588428448 -> 140509588428592 + 140509591312640 [label="encoder.layer.10.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591312640 -> 140509588428448 + 140509588428448 [label=AccumulateGrad] + 140509588427824 -> 140509588428592 + 140509591312880 [label="encoder.layer.10.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591312880 -> 140509588427824 + 140509588427824 [label=AccumulateGrad] + 140509588426816 -> 140509588427536 + 140509588426816 [label=TBackward0] + 140509588427728 -> 140509588426816 + 140509588427728 [label=ToCopyBackward0] + 140509588428400 -> 140509588427728 + 140509591312720 [label="encoder.layer.10.crossattention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509591312720 -> 140509588428400 + 140509588428400 [label=AccumulateGrad] + 140509588405840 -> 140509588405504 + 140509588405840 [label=UnsafeViewBackward0] + 140509588406032 -> 140509588405840 + 140509588406032 [label=CloneBackward0] + 140509588427008 -> 140509588406032 + 140509588427008 [label=ExpandBackward0] + 140509588427488 -> 140509588427008 + 140509588427488 [label=TransposeBackward0] + 140509588428208 -> 140509588427488 + 140509588428208 [label=PermuteBackward0] + 140509588428688 -> 140509588428208 + 140509588428688 [label=ViewBackward0] + 140509588429264 -> 140509588428688 + 140509588429264 [label=ViewBackward0] + 140509588429744 -> 140509588429264 + 140509588429744 [label=AddmmBackward0] + 140509588430320 -> 
140509588429744 + 140509588430320 [label=ToCopyBackward0] + 140509588460208 -> 140509588430320 + 140509591312480 [label="encoder.layer.10.crossattention.self.key.bias + (768)" fillcolor=lightblue] + 140509591312480 -> 140509588460208 + 140509588460208 [label=AccumulateGrad] + 140509588429648 -> 140509588429744 + 140509588429648 [label=ViewBackward0] + 140509588460592 -> 140509588429648 + 140509588460592 [label=ToCopyBackward0] + 140509588461264 -> 140509588460592 + 140509588461264 [label=ViewBackward0] + 140517615270976 -> 140509588461264 + 140517615270976 [label=CloneBackward0] + 140509588459584 -> 140517615270976 + 140509588459584 [label=ExpandBackward0] + 140517615321632 -> 140509588459584 + 140517615321632 [label=UnsqueezeBackward0] + 140517615539152 -> 140517615321632 + 140509588426864 -> 140509588429744 + 140509588426864 [label=TBackward0] + 140509588461936 -> 140509588426864 + 140509588461936 [label=ToCopyBackward0] + 140509588460880 -> 140509588461936 + 140509591312400 [label="encoder.layer.10.crossattention.self.key.weight + (768, 1408)" fillcolor=lightblue] + 140509591312400 -> 140509588460880 + 140509588460880 [label=AccumulateGrad] + 140509588404064 -> 140509588404208 + 140509588404064 [label=UnsafeViewBackward0] + 140509588404880 -> 140509588404064 + 140509588404880 [label=CloneBackward0] + 140509588405168 -> 140509588404880 + 140509588405168 [label=ExpandBackward0] + 140509588405552 -> 140509588405168 + 140509588405552 [label=PermuteBackward0] + 140509588404304 -> 140509588405552 + 140509588404304 [label=ViewBackward0] + 140509588405936 -> 140509588404304 + 140509588405936 [label=ViewBackward0] + 140509588427968 -> 140509588405936 + 140509588427968 [label=AddmmBackward0] + 140509588428112 -> 140509588427968 + 140509588428112 [label=ToCopyBackward0] + 140509588459920 -> 140509588428112 + 140509591310560 [label="encoder.layer.10.crossattention.self.value.bias + (768)" fillcolor=lightblue] + 140509591310560 -> 140509588459920 + 140509588459920 [label=AccumulateGrad] + 140509588428976 -> 140509588427968 + 140509588428976 [label=ViewBackward0] + 140509588429888 -> 140509588428976 + 140509588429888 [label=ToCopyBackward0] + 140509588461264 -> 140509588429888 + 140509588426960 -> 140509588427968 + 140509588426960 [label=TBackward0] + 140517615321680 -> 140509588426960 + 140517615321680 [label=ToCopyBackward0] + 140517615321344 -> 140517615321680 + 140509591312240 [label="encoder.layer.10.crossattention.self.value.weight + (768, 1408)" fillcolor=lightblue] + 140509591312240 -> 140517615321344 + 140517615321344 [label=AccumulateGrad] + 140509588402576 -> 140509588402864 + 140509588402576 [label=TBackward0] + 140509588403584 -> 140509588402576 + 140509588403584 [label=ToCopyBackward0] + 140509588404016 -> 140509588403584 + 140509591311040 [label="encoder.layer.10.crossattention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509591311040 -> 140509588404016 + 140509588404016 [label=AccumulateGrad] + 140509588402384 -> 140509588373360 + 140509588372784 -> 140509588373456 + 140509591293760 [label="encoder.layer.10.crossattention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591293760 -> 140509588372784 + 140509588372784 [label=AccumulateGrad] + 140509588402240 -> 140509588373456 + 140509591293520 [label="encoder.layer.10.crossattention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591293520 -> 140509588402240 + 140509588402240 [label=AccumulateGrad] + 140509588372016 -> 140509588372496 + 140509588372016 [label=TBackward0] + 140509588372688 -> 
140509588372016 + 140509588372688 [label=ToCopyBackward0] + 140509588373168 -> 140509588372688 + 140509591289920 [label="encoder.layer.10.experts.experts.0.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591289920 -> 140509588373168 + 140509588373168 [label=AccumulateGrad] + 140509588371008 -> 140509588371440 + 140509588371008 [label=TBackward0] + 140509588372208 -> 140509588371008 + 140509588372208 [label=ToCopyBackward0] + 140509588372928 -> 140509588372208 + 140509591290240 [label="encoder.layer.10.experts.experts.0.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591290240 -> 140509588372928 + 140509588372928 [label=AccumulateGrad] + 140509588370960 -> 140509588370864 + 140509588370528 -> 140509588370672 + 140509591285328 [label="encoder.layer.10.experts.experts.0.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591285328 -> 140509588370528 + 140509588370528 [label=AccumulateGrad] + 140509588370576 -> 140509588370672 + 140509591285248 [label="encoder.layer.10.experts.experts.0.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591285248 -> 140509588370576 + 140509588370576 [label=AccumulateGrad] + 140509588370384 -> 140509588370192 + 140509588370384 [label=UnsqueezeBackward0] + 140509588371056 -> 140509588370384 + 140509588371056 [label=NativeLayerNormBackward0] + 140509588371536 -> 140509588371056 + 140509588371536 [label=AddBackward0] + 140509588373072 -> 140509588371536 + 140509588373072 [label=NativeDropoutBackward0] + 140509588371968 -> 140509588373072 + 140509588371968 [label=ViewBackward0] + 140509588402288 -> 140509588371968 + 140509588402288 [label=AddmmBackward0] + 140509588403248 -> 140509588402288 + 140509588403248 [label=ToCopyBackward0] + 140509588403440 -> 140509588403248 + 140509591284528 [label="encoder.layer.10.experts.experts.1.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509591284528 -> 140509588403440 + 140509588403440 [label=AccumulateGrad] + 140509588402960 -> 140509588402288 + 140509588402960 [label=ViewBackward0] + 140509588403536 -> 140509588402960 + 140509588403536 [label=GeluBackward0] + 140509588405360 -> 140509588403536 + 140509588405360 [label=ViewBackward0] + 140509588404592 -> 140509588405360 + 140509588404592 [label=AddmmBackward0] + 140509588429456 -> 140509588404592 + 140509588429456 [label=ToCopyBackward0] + 140517615321776 -> 140509588429456 + 140509591284768 [label="encoder.layer.10.experts.experts.1.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509591284768 -> 140517615321776 + 140517615321776 [label=AccumulateGrad] + 140509588427248 -> 140509588404592 + 140509588427248 [label=ViewBackward0] + 140517615321488 -> 140509588427248 + 140517615321488 [label=ToCopyBackward0] + 140509588370960 -> 140517615321488 + 140509588404688 -> 140509588404592 + 140509588404688 [label=TBackward0] + 140517615321728 -> 140509588404688 + 140517615321728 [label=ToCopyBackward0] + 140517615321968 -> 140517615321728 + 140509591285088 [label="encoder.layer.10.experts.experts.1.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591285088 -> 140517615321968 + 140517615321968 [label=AccumulateGrad] + 140509588402768 -> 140509588402288 + 140509588402768 [label=TBackward0] + 140509588405648 -> 140509588402768 + 140509588405648 [label=ToCopyBackward0] + 140509588405072 -> 140509588405648 + 140509591284848 [label="encoder.layer.10.experts.experts.1.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591284848 -> 
140509588405072 + 140509588405072 [label=AccumulateGrad] + 140509588370960 -> 140509588371536 + 140509588371344 -> 140509588371056 + 140509591284608 [label="encoder.layer.10.experts.experts.1.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591284608 -> 140509588371344 + 140509588371344 [label=AccumulateGrad] + 140509588370480 -> 140509588371056 + 140509591285008 [label="encoder.layer.10.experts.experts.1.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591285008 -> 140509588370480 + 140509588370480 [label=AccumulateGrad] + 140509588370000 -> 140509588370096 + 140509588370000 [label=UnsqueezeBackward0] + 140509588371824 -> 140509588370000 + 140509588371824 [label=UnsqueezeBackward0] + 140509588371728 -> 140509588371824 + 140509588371728 [label=MulBackward0] + 140509588370048 -> 140509588371728 + 140509588370048 [label=SoftmaxBackward0] + 140509588403824 -> 140509588370048 + 140509588403824 [label=MmBackward0] + 140517615321824 -> 140509588403824 + 140517615321824 [label=ToCopyBackward0] + 140517615321872 -> 140517615321824 + 140517615321872 [label=DivBackward0] + 140517615322160 -> 140517615321872 + 140517615322160 [label=SumBackward1] + 140517615322256 -> 140517615322160 + 140517615322256 [label=MulBackward0] + 140509588370960 -> 140517615322256 + 140517615322064 -> 140509588403824 + 140517615322064 [label=TBackward0] + 140517615322208 -> 140517615322064 + 140517615322208 [label=ToCopyBackward0] + 140517615322304 -> 140517615322208 + 140509591291120 [label="encoder.layer.10.experts.gate.weight + (2, 768)" fillcolor=lightblue] + 140509591291120 -> 140517615322304 + 140517615322304 [label=AccumulateGrad] + 140509588369520 -> 140509588315344 + 140509588369520 [label=IndexBackward0] + 140509588370768 -> 140509588369520 + 140509588370768 [label=NativeLayerNormBackward0] + 140509588372448 -> 140509588370768 + 140509588372448 [label=AddBackward0] + 140517615322352 -> 140509588372448 + 140517615322352 [label=NativeDropoutBackward0] + 140517615322016 -> 140517615322352 + 140517615322016 [label=ViewBackward0] + 140517615322496 -> 140517615322016 + 140517615322496 [label=AddmmBackward0] + 140517615322592 -> 140517615322496 + 140517615322592 [label=ToCopyBackward0] + 140517615322784 -> 140517615322592 + 140509591293040 [label="encoder.layer.10.output.dense.bias + (768)" fillcolor=lightblue] + 140509591293040 -> 140517615322784 + 140517615322784 [label=AccumulateGrad] + 140517615322544 -> 140517615322496 + 140517615322544 [label=ViewBackward0] + 140517615322832 -> 140517615322544 + 140517615322832 [label=GeluBackward0] + 140517615322928 -> 140517615322832 + 140517615322928 [label=ViewBackward0] + 140517615323024 -> 140517615322928 + 140517615323024 [label=AddmmBackward0] + 140517615323120 -> 140517615323024 + 140517615323120 [label=ToCopyBackward0] + 140517615323312 -> 140517615323120 + 140509591293280 [label="encoder.layer.10.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509591293280 -> 140517615323312 + 140517615323312 [label=AccumulateGrad] + 140517615323072 -> 140517615323024 + 140517615323072 [label=ViewBackward0] + 140517615323360 -> 140517615323072 + 140517615323360 [label=ToCopyBackward0] + 140517615322112 -> 140517615323360 + 140517615322112 [label=SliceBackward0] + 140517615323504 -> 140517615322112 + 140517615323504 [label=SliceBackward0] + 140517615323600 -> 140517615323504 + 140517615323600 [label=SliceBackward0] + 140509588428592 -> 140517615323600 + 140517615322736 -> 140517615323024 + 140517615322736 [label=TBackward0] + 140517615323264 
-> 140517615322736 + 140517615323264 [label=ToCopyBackward0] + 140517615323696 -> 140517615323264 + 140509591293600 [label="encoder.layer.10.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591293600 -> 140517615323696 + 140517615323696 [label=AccumulateGrad] + 140517615322400 -> 140517615322496 + 140517615322400 [label=TBackward0] + 140517615322976 -> 140517615322400 + 140517615322976 [label=ToCopyBackward0] + 140517615323456 -> 140517615322976 + 140509591293360 [label="encoder.layer.10.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591293360 -> 140517615323456 + 140517615323456 [label=AccumulateGrad] + 140517615322112 -> 140509588372448 + 140509588369808 -> 140509588370768 + 140509591293120 [label="encoder.layer.10.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591293120 -> 140509588369808 + 140509588369808 [label=AccumulateGrad] + 140509588403104 -> 140509588370768 + 140509591292800 [label="encoder.layer.10.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591292800 -> 140509588403104 + 140509588403104 [label=AccumulateGrad] + 140509588347344 -> 140509588348304 + 140509588347344 [label=TBackward0] + 140509588348880 -> 140509588347344 + 140509588348880 [label=ToCopyBackward0] + 140509588402480 -> 140509588348880 + 140509591291360 [label="encoder.layer.11.attention.self.query.weight + (768, 768)" fillcolor=lightblue] + 140509591291360 -> 140509588402480 + 140509588402480 [label=AccumulateGrad] + 140509588347104 -> 140509588347248 + 140509588347104 [label=UnsafeViewBackward0] + 140509588347920 -> 140509588347104 + 140509588347920 [label=CloneBackward0] + 140509588348208 -> 140509588347920 + 140509588348208 [label=ExpandBackward0] + 140509588348688 -> 140509588348208 + 140509588348688 [label=TransposeBackward0] + 140509588347632 -> 140509588348688 + 140509588347632 [label=PermuteBackward0] + 140509588369712 -> 140509588347632 + 140509588369712 [label=ViewBackward0] + 140517615322448 -> 140509588369712 + 140517615322448 [label=ViewBackward0] + 140517615322688 -> 140517615322448 + 140517615322688 [label=AddmmBackward0] + 140517615323216 -> 140517615322688 + 140517615323216 [label=ToCopyBackward0] + 140517615323408 -> 140517615323216 + 140509591291920 [label="encoder.layer.11.attention.self.key.bias + (768)" fillcolor=lightblue] + 140509591291920 -> 140517615323408 + 140517615323408 [label=AccumulateGrad] + 140517615323168 -> 140517615322688 + 140517615323168 [label=ViewBackward0] + 140517615323744 -> 140517615323168 + 140517615323744 [label=ToCopyBackward0] + 140509588315344 -> 140517615323744 + 140517615321200 -> 140517615322688 + 140517615321200 [label=TBackward0] + 140517615322880 -> 140517615321200 + 140517615322880 [label=ToCopyBackward0] + 140517615323888 -> 140517615322880 + 140509591291600 [label="encoder.layer.11.attention.self.key.weight + (768, 768)" fillcolor=lightblue] + 140509591291600 -> 140517615323888 + 140517615323888 [label=AccumulateGrad] + 140509588345808 -> 140509588345616 + 140509588345808 [label=UnsafeViewBackward0] + 140509588346144 -> 140509588345808 + 140509588346144 [label=CloneBackward0] + 140509588346576 -> 140509588346144 + 140509588346576 [label=ExpandBackward0] + 140509588346864 -> 140509588346576 + 140509588346864 [label=PermuteBackward0] + 140509588346000 -> 140509588346864 + 140509588346000 [label=ViewBackward0] + 140509588348112 -> 140509588346000 + 140509588348112 [label=ViewBackward0] + 140509588348400 -> 140509588348112 + 140509588348400 [label=AddmmBackward0] + 140509588369616 -> 
140509588348400 + 140509588369616 [label=ToCopyBackward0] + 140517615323648 -> 140509588369616 + 140509591292160 [label="encoder.layer.11.attention.self.value.bias + (768)" fillcolor=lightblue] + 140509591292160 -> 140517615323648 + 140517615323648 [label=AccumulateGrad] + 140509588369904 -> 140509588348400 + 140509588369904 [label=ViewBackward0] + 140517615323984 -> 140509588369904 + 140517615323984 [label=ToCopyBackward0] + 140509588315344 -> 140517615323984 + 140517615321920 -> 140509588348400 + 140517615321920 [label=TBackward0] + 140517615323552 -> 140517615321920 + 140517615323552 [label=ToCopyBackward0] + 140517615324032 -> 140517615323552 + 140509591291840 [label="encoder.layer.11.attention.self.value.weight + (768, 768)" fillcolor=lightblue] + 140509591291840 -> 140517615324032 + 140517615324032 [label=AccumulateGrad] + 140509588315536 -> 140509588315824 + 140509588315536 [label=TBackward0] + 140509588345328 -> 140509588315536 + 140509588345328 [label=ToCopyBackward0] + 140509588345712 -> 140509588345328 + 140509591291440 [label="encoder.layer.11.attention.output.dense.weight + (768, 768)" fillcolor=lightblue] + 140509591291440 -> 140509588345712 + 140509588345712 [label=AccumulateGrad] + 140509588315344 -> 140509588314960 + 140509588315056 -> 140509588314768 + 140509591290960 [label="encoder.layer.11.attention.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591290960 -> 140509588315056 + 140509588315056 [label=AccumulateGrad] + 140509588313568 -> 140509588314768 + 140509591291200 [label="encoder.layer.11.attention.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591291200 -> 140509588313568 + 140509588313568 [label=AccumulateGrad] + 140509588312272 -> 140509588313328 + 140509588312272 [label=TBackward0] + 140509588313904 -> 140509588312272 + 140509588313904 [label=ToCopyBackward0] + 140509588314576 -> 140509588313904 + 140509591260912 [label="encoder.layer.11.experts.experts.0.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591260912 -> 140509588314576 + 140509588314576 [label=AccumulateGrad] + 140509588312848 -> 140509588313232 + 140509588312848 [label=TBackward0] + 140509588312128 -> 140509588312848 + 140509588312128 [label=ToCopyBackward0] + 140509588314192 -> 140509588312128 + 140509591260592 [label="encoder.layer.11.experts.experts.0.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591260592 -> 140509588314192 + 140509588314192 [label=AccumulateGrad] + 140509588312608 -> 140509591317376 + 140509591314832 -> 140509591314640 + 140509591260352 [label="encoder.layer.11.experts.experts.0.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591260352 -> 140509591314832 + 140509591314832 [label=AccumulateGrad] + 140509591317568 -> 140509591314640 + 140509591260832 [label="encoder.layer.11.experts.experts.0.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591260832 -> 140509591317568 + 140509591317568 [label=AccumulateGrad] + 140509591315408 -> 140509588282864 + 140509591315408 [label=UnsqueezeBackward0] + 140509591268800 -> 140509591315408 + 140509591268800 [label=NativeLayerNormBackward0] + 140509588313088 -> 140509591268800 + 140509588313088 [label=AddBackward0] + 140509588314864 -> 140509588313088 + 140509588314864 [label=NativeDropoutBackward0] + 140509588312224 -> 140509588314864 + 140509588312224 [label=ViewBackward0] + 140509588314000 -> 140509588312224 + 140509588314000 [label=AddmmBackward0] + 140509588315008 -> 140509588314000 + 140509588315008 [label=ToCopyBackward0] 
+ 140509588315920 -> 140509588315008 + 140509591259952 [label="encoder.layer.11.experts.experts.1.output_query.dense.bias + (768)" fillcolor=lightblue] + 140509591259952 -> 140509588315920 + 140509588315920 [label=AccumulateGrad] + 140509588315152 -> 140509588314000 + 140509588315152 [label=ViewBackward0] + 140509588315488 -> 140509588315152 + 140509588315488 [label=GeluBackward0] + 140509588345232 -> 140509588315488 + 140509588345232 [label=ViewBackward0] + 140509588346384 -> 140509588345232 + 140509588346384 [label=AddmmBackward0] + 140509588347056 -> 140509588346384 + 140509588347056 [label=ToCopyBackward0] + 140509588345904 -> 140509588347056 + 140509591260192 [label="encoder.layer.11.experts.experts.1.intermediate_query.dense.bias + (3072)" fillcolor=lightblue] + 140509591260192 -> 140509588345904 + 140509588345904 [label=AccumulateGrad] + 140509588346624 -> 140509588346384 + 140509588346624 [label=ViewBackward0] + 140517615323840 -> 140509588346624 + 140517615323840 [label=ToCopyBackward0] + 140509588312608 -> 140517615323840 + 140509588346096 -> 140509588346384 + 140509588346096 [label=TBackward0] + 140517615322640 -> 140509588346096 + 140517615322640 [label=ToCopyBackward0] + 140517615323936 -> 140517615322640 + 140509591260112 [label="encoder.layer.11.experts.experts.1.intermediate_query.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591260112 -> 140517615323936 + 140517615323936 [label=AccumulateGrad] + 140509588312464 -> 140509588314000 + 140509588312464 [label=TBackward0] + 140509588344944 -> 140509588312464 + 140509588344944 [label=ToCopyBackward0] + 140509588347728 -> 140509588344944 + 140509591259872 [label="encoder.layer.11.experts.experts.1.output_query.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591259872 -> 140509588347728 + 140509588347728 [label=AccumulateGrad] + 140509588312608 -> 140509588313088 + 140509588313136 -> 140509591268800 + 140509591259632 [label="encoder.layer.11.experts.experts.1.output_query.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591259632 -> 140509588313136 + 140509588313136 [label=AccumulateGrad] + 140509588312752 -> 140509591268800 + 140509591260432 [label="encoder.layer.11.experts.experts.1.output_query.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591260432 -> 140509588312752 + 140509588312752 [label=AccumulateGrad] + 140509588282672 -> 140509588283152 + 140509588282672 [label=UnsqueezeBackward0] + 140509591318432 -> 140509588282672 + 140509591318432 [label=UnsqueezeBackward0] + 140509588314384 -> 140509591318432 + 140509588314384 [label=MulBackward0] + 140509588315440 -> 140509588314384 + 140509588315440 [label=SoftmaxBackward0] + 140509588345520 -> 140509588315440 + 140509588345520 [label=MmBackward0] + 140509588312656 -> 140509588345520 + 140509588312656 [label=ToCopyBackward0] + 140517615324128 -> 140509588312656 + 140517615324128 [label=DivBackward0] + 140517615324320 -> 140517615324128 + 140517615324320 [label=SumBackward1] + 140517615324416 -> 140517615324320 + 140517615324416 [label=MulBackward0] + 140509588312608 -> 140517615324416 + 140517615323792 -> 140509588345520 + 140517615323792 [label=TBackward0] + 140517615324368 -> 140517615323792 + 140517615324368 [label=ToCopyBackward0] + 140517615324464 -> 140517615324368 + 140509591282928 [label="encoder.layer.11.experts.gate.weight + (2, 768)" fillcolor=lightblue] + 140509591282928 -> 140517615324464 + 140517615324464 [label=AccumulateGrad] + 140509588282432 -> 140509588281712 + 140509588282432 [label=IndexBackward0] + 140509588283248 -> 
140509588282432 + 140509588283248 [label=IndexBackward0] + 140509591317952 -> 140509588283248 + 140509591317952 [label=NativeLayerNormBackward0] + 140509588345040 -> 140509591317952 + 140509588345040 [label=AddBackward0] + 140517615324560 -> 140509588345040 + 140517615324560 [label=NativeDropoutBackward0] + 140517615324608 -> 140517615324560 + 140517615324608 [label=ViewBackward0] + 140517615324704 -> 140517615324608 + 140517615324704 [label=AddmmBackward0] + 140517615324800 -> 140517615324704 + 140517615324800 [label=ToCopyBackward0] + 140517615324992 -> 140517615324800 + 140509591290400 [label="encoder.layer.11.output.dense.bias + (768)" fillcolor=lightblue] + 140509591290400 -> 140517615324992 + 140517615324992 [label=AccumulateGrad] + 140517615324752 -> 140517615324704 + 140517615324752 [label=ViewBackward0] + 140517615325040 -> 140517615324752 + 140517615325040 [label=GeluBackward0] + 140517615325136 -> 140517615325040 + 140517615325136 [label=ViewBackward0] + 140517615324944 -> 140517615325136 + 140517615324944 [label=AddmmBackward0] + 140517615382736 -> 140517615324944 + 140517615382736 [label=ToCopyBackward0] + 140517615382928 -> 140517615382736 + 140509591290480 [label="encoder.layer.11.intermediate.dense.bias + (3072)" fillcolor=lightblue] + 140509591290480 -> 140517615382928 + 140517615382928 [label=AccumulateGrad] + 140517615382688 -> 140517615324944 + 140517615382688 [label=ViewBackward0] + 140517615382976 -> 140517615382688 + 140517615382976 [label=ToCopyBackward0] + 140517615324512 -> 140517615382976 + 140517615324512 [label=SliceBackward0] + 140517615383120 -> 140517615324512 + 140517615383120 [label=SliceBackward0] + 140517615383216 -> 140517615383120 + 140517615383216 [label=SliceBackward0] + 140509588314768 -> 140517615383216 + 140517615382592 -> 140517615324944 + 140517615382592 [label=TBackward0] + 140517615382880 -> 140517615382592 + 140517615382880 [label=ToCopyBackward0] + 140517615383312 -> 140517615382880 + 140509591290640 [label="encoder.layer.11.intermediate.dense.weight + (3072, 768)" fillcolor=lightblue] + 140509591290640 -> 140517615383312 + 140517615383312 [label=AccumulateGrad] + 140517615324080 -> 140517615324704 + 140517615324080 [label=TBackward0] + 140517615324896 -> 140517615324080 + 140517615324896 [label=ToCopyBackward0] + 140517615383072 -> 140517615324896 + 140509591290720 [label="encoder.layer.11.output.dense.weight + (768, 3072)" fillcolor=lightblue] + 140509591290720 -> 140517615383072 + 140517615383072 [label=AccumulateGrad] + 140517615324512 -> 140509588345040 + 140509588314672 -> 140509591317952 + 140509591290160 [label="encoder.layer.11.output.LayerNorm.weight + (768)" fillcolor=lightblue] + 140509591290160 -> 140509588314672 + 140509588314672 [label=AccumulateGrad] + 140509588313712 -> 140509591317952 + 140509591290000 [label="encoder.layer.11.output.LayerNorm.bias + (768)" fillcolor=lightblue] + 140509591290000 -> 140509588313712 + 140509588313712 [label=AccumulateGrad] + 140509588281712 -> 140509988778688 +} diff --git a/test.pdf/backward_graph.pdf b/test.pdf/backward_graph.pdf new file mode 100644 index 0000000..7f162b0 Binary files /dev/null and b/test.pdf/backward_graph.pdf differ diff --git a/test.txt b/test.txt new file mode 100644 index 0000000..516c092 --- /dev/null +++ b/test.txt @@ -0,0 +1,360 @@ +tmp_name = [name for name, p in model.named_parameters() if (p.requires_grad and '10.expert' in name)] + +tmp = [p for name, p in model.named_parameters() if (p.requires_grad and '10.expert' in name)] + +tensor([[-1.4032e-02, 
3.7242e-03, 8.4997e-03, -3.4016e-03, -6.4855e-03, + 4.3595e-02, 3.4423e-02, -8.6274e-03, -1.9702e-02, 9.1813e-03, + 1.1643e-02, 2.3939e-02, -2.0908e-02, 3.4555e-03, 9.1636e-03, + 1.5413e-02, 2.4148e-02, -1.0880e-03, 1.1193e-02, -1.3591e-02, + 9.3484e-03, 1.5999e-02, -9.6086e-04, 3.8322e-02, -8.0687e-03, + -1.4056e-02, 3.9486e-02, 3.5167e-02, -9.3226e-03, -1.0493e-02, + -2.5795e-02, -9.7541e-03, 4.4437e-03, 7.7226e-03, 7.5210e-03, + -1.3526e-02, -5.0316e-03, -1.1149e-02, 6.0583e-03, 2.0564e-02, + -6.4477e-03, 1.4170e-02, -3.7847e-02, 1.1780e-02, 1.3321e-02, + -8.2501e-03, -1.0298e-02, 1.4805e-02, -1.2432e-02, -1.9159e-02, + -5.7095e-04, -3.8618e-02, -2.4230e-02, -1.4991e-03, -1.4114e-02, + -1.5365e-02, 1.5640e-02, -4.8623e-02, -2.9991e-02, 1.2796e-02, + -4.9917e-03, 2.3846e-03, 7.7368e-03, 1.2913e-02, 1.5300e-02, + 8.5125e-03, 1.1582e-02, 8.1161e-03, 4.2259e-03, 7.6109e-03, + -2.0747e-02, -3.5099e-03, 2.2282e-02, 5.0493e-02, -1.7849e-02, + -3.7106e-02, -1.4944e-02, -1.4582e-02, -2.2458e-02, -4.6173e-05, + -8.1270e-03, 1.9037e-02, -2.0086e-02, 3.0980e-03, -9.3947e-03, + 1.3054e-02, 2.3203e-02, -9.9304e-03, -2.6038e-02, 1.8679e-02, + 9.2081e-03, -2.1770e-02, -1.6568e-03, -3.6503e-02, 2.0054e-02, + 1.2886e-02, -1.8021e-02, 3.4457e-02, -1.3704e-02, -6.1498e-03, + -8.6769e-03, 1.5024e-02, -1.3875e-02, 1.7416e-02, -1.1178e-02, + -2.4088e-02, -1.7802e-02, 3.3326e-02, -1.1216e-02, -8.6330e-03, + -5.5359e-03, -1.1939e-02, -1.7777e-02, -2.8666e-02, -3.8280e-02, + 4.2682e-02, 1.4946e-02, 9.6427e-03, 8.2754e-03, -1.0516e-03, + 2.9560e-02, 2.4552e-03, -4.8354e-02, 1.5568e-02, 2.5881e-02, + -1.7354e-02, -3.1232e-02, 2.3683e-02, -2.3239e-02, 2.2966e-02, + 5.6349e-03, -8.7595e-03, 1.5173e-02, 2.7660e-02, -4.3304e-03, + -2.5330e-02, -2.1795e-02, 1.6856e-02, -2.1587e-04, 2.3707e-02, + -2.3667e-02, 3.5378e-02, -7.9245e-03, 7.1029e-04, -3.2800e-02, + -1.5402e-03, -8.5634e-03, -1.1356e-02, -2.1935e-03, -1.8854e-02, + -1.9705e-03, -3.8333e-02, 2.9131e-02, -4.4470e-02, -2.0893e-03, + 1.2937e-02, -1.7116e-02, 2.7778e-02, 1.0311e-02, -6.4017e-03, + 3.7647e-02, -1.9953e-02, -5.3925e-03, 3.6978e-02, -1.5534e-02, + 1.2241e-02, 1.3597e-02, 2.0703e-03, 2.4213e-03, 9.2604e-03, + 6.6108e-03, -5.8213e-03, 9.8167e-03, -9.8300e-04, -1.0236e-02, + 2.9581e-02, 1.0987e-02, 2.0046e-02, -1.0500e-02, -3.2221e-03, + -2.6303e-02, 1.3688e-02, -2.2529e-02, -5.7654e-03, 1.1784e-02, + 1.6221e-02, 2.8743e-02, 5.7565e-03, 1.8129e-02, 1.5140e-02, + -1.1748e-02, -1.7528e-02, 4.7977e-02, 1.5568e-02, 4.7030e-04, + 3.2757e-03, 1.6631e-02, 1.9986e-02, -7.3463e-03, 1.1435e-02, + -1.4739e-02, -3.2959e-03, -2.8770e-03, 2.9260e-02, 1.7007e-02, + 3.0611e-02, 2.2102e-02, -3.3819e-02, -1.9403e-02, 2.5524e-02, + 3.0738e-02, -1.9951e-02, -1.4553e-02, -1.5796e-02, -2.3143e-02, + -2.8826e-02, 2.4739e-02, -5.8602e-03, 4.1871e-02, 5.0821e-04, + 3.3493e-02, 2.3524e-02, 2.3191e-02, 9.0416e-03, 3.3262e-02, + -1.6805e-02, 1.1545e-02, -1.7195e-02, -3.8696e-02, -8.4358e-04, + -8.1605e-03, 3.1372e-03, 1.0726e-03, 1.0865e-03, 1.0760e-02, + -5.2421e-03, 1.3039e-02, 3.6873e-04, 1.0464e-02, -1.1544e-02, + -2.2775e-02, -4.8439e-02, -1.0711e-02, 4.4236e-03, 2.0351e-02, + 2.4479e-03, -1.9968e-02, -2.2941e-02, -2.0486e-02, -1.9528e-02, + -2.3176e-02, -3.2731e-03, 1.1789e-02, 2.0921e-02, 2.9809e-03, + -8.8507e-03, -3.5716e-02, 8.8418e-03, 5.3665e-05, -1.1288e-02, + -7.5571e-03, 2.1053e-02, -3.7381e-03, -4.0165e-03, -2.2628e-03, + 3.7554e-03, -1.6597e-02, 7.6946e-03, -3.2689e-02, 2.2016e-02, + 5.5122e-03, 4.5455e-02, 6.7586e-03, 1.5714e-02, 5.2125e-03, + 3.9596e-03, 
1.8134e-02, 1.5834e-03, -1.6239e-02, -1.3889e-02, + -2.3522e-02, 1.4738e-02, 5.5867e-03, -7.0727e-03, -2.8140e-03, + 1.6849e-02, -3.1327e-02, -3.2443e-02, 4.7851e-03, 1.2980e-02, + -2.0014e-04, -9.9475e-03, 8.0657e-03, 1.9468e-02, -1.5774e-02, + 1.7017e-02, -8.7196e-03, -4.0681e-03, -6.9754e-03, -2.2007e-02, + -6.6217e-03, -1.8219e-02, 4.2186e-02, -5.6621e-03, -9.3449e-03, + -1.1662e-02, 2.8700e-02, -9.0654e-03, 3.1569e-02, -2.9825e-03, + -3.8198e-02, -5.2723e-02, -4.8325e-02, -2.7871e-03, 5.1127e-03, + 1.4511e-02, 9.3245e-03, -2.3339e-02, -8.6658e-03, 1.5276e-02, + -1.5823e-02, -3.4476e-03, 1.4601e-02, 6.3504e-03, -1.4307e-02, + 2.2817e-02, 2.1998e-02, 1.7330e-02, -2.4448e-02, 4.0178e-03, + 3.2280e-03, -1.2721e-02, 1.9661e-02, 7.5263e-03, 2.0245e-02, + 4.5525e-02, -1.5658e-02, -4.0676e-02, 9.3160e-03, 1.1920e-02, + -1.9317e-02, 1.7848e-02, -5.8601e-03, 1.1786e-03, 8.3864e-03, + -1.8341e-02, 2.5985e-02, -1.1387e-02, -1.5069e-02, -2.8097e-02, + 2.4966e-02, 1.4790e-02, 2.0424e-02, -1.3062e-02, 3.1314e-02, + 1.7811e-02, 7.2393e-03, 1.4413e-02, -1.2746e-02, 3.1039e-02, + -1.1697e-02, -1.4826e-02, -8.8397e-03, 1.5157e-02, -1.5855e-02, + -1.8157e-03, 1.3024e-02, -1.8902e-03, 2.5212e-02, -3.4886e-02, + 4.3029e-02, -4.0842e-02, 1.1362e-02, -1.4654e-02, -1.3337e-02, + -3.1832e-02, 3.6222e-03, 8.2804e-03, -1.4269e-02, 2.8399e-03, + -1.2008e-02, 2.4685e-02, -4.3070e-03, 6.3163e-03, -1.3517e-02, + -1.3807e-02, 2.4617e-02, 2.1453e-02, 4.7332e-03, 9.1636e-03, + -1.2881e-02, 1.9077e-02, 1.7571e-04, -5.2817e-03, -2.8821e-02, + 5.8223e-03, -3.0979e-02, 2.4609e-02, 3.6666e-02, -1.0950e-02, + 2.0421e-02, -2.6378e-03, 3.1825e-02, -9.6689e-04, -2.8398e-02, + -2.7513e-02, 1.6946e-02, -2.4110e-02, -1.3575e-02, -1.3443e-02, + 8.4217e-03, 2.6754e-02, -2.3309e-03, -2.5086e-02, 1.1844e-02, + 1.4152e-02, 1.2989e-02, -5.7336e-03, 4.7391e-03, 3.4106e-02, + 1.0142e-02, -1.8029e-02, -1.5410e-04, -1.3548e-02, 9.1742e-03, + -3.0150e-02, 1.5666e-02, 4.3049e-03, 1.6273e-02, 2.0672e-02, + -1.2458e-02, 4.5496e-02, 3.2131e-02, -3.0967e-03, 2.1891e-02, + 2.5524e-02, -1.1998e-02, -1.8866e-03, -1.0945e-02, 5.9930e-03, + -8.4233e-03, -8.9095e-03, -1.8261e-02, 1.9308e-02, -1.9728e-02, + -1.4216e-02, 1.4952e-02, 5.7355e-04, -2.4753e-02, -1.0948e-02, + 1.0965e-02, 1.3607e-03, 3.4974e-02, -4.1396e-03, 2.5519e-02, + 1.0364e-02, -1.5851e-02, -4.9224e-03, 1.0903e-02, -1.0523e-04, + 3.1355e-02, -1.5105e-02, 5.6972e-03, -8.4078e-03, -1.9868e-02, + 1.7186e-03, 2.9396e-02, -4.1439e-02, 1.4124e-02, -3.7745e-03, + 3.3007e-02, 8.0368e-04, 8.5574e-03, 1.7269e-02, 1.1955e-02, + 8.8142e-03, -1.3123e-02, 1.6817e-02, -1.5456e-02, -1.3868e-02, + 2.4139e-02, -9.1566e-03, -1.8477e-02, -4.7972e-03, -6.8459e-03, + 1.6818e-02, 3.1645e-03, -3.0901e-02, -5.6036e-03, -1.4758e-02, + 2.0473e-02, -7.5411e-05, 2.0673e-03, -7.0061e-03, 9.5544e-03, + 1.6600e-02, -1.7315e-02, -2.0168e-02, -5.3008e-03, 2.0206e-02, + 2.4209e-03, 2.1205e-02, -8.9188e-03, -4.1350e-04, -1.0638e-02, + 1.3705e-02, 9.5925e-05, 3.8877e-02, 3.2884e-02, -2.7730e-03, + 1.0052e-02, 1.9311e-02, 1.1341e-02, -1.2988e-02, -1.7157e-02, + 3.2095e-02, -1.8493e-02, -9.2551e-03, -2.6509e-03, -1.1130e-02, + 1.6581e-02, 1.0216e-02, 1.3687e-02, 1.1860e-02, -3.0462e-03, + -1.2082e-02, 2.8502e-03, -1.2620e-02, 8.8330e-03, 1.7357e-02, + 1.8383e-02, -2.3130e-02, -3.2654e-02, 1.2853e-02, -7.8144e-03, + 1.9418e-04, 3.8635e-03, 4.9333e-02, 1.9350e-02, -2.0643e-02, + 8.4650e-04, 5.0242e-02, 1.6576e-02, -8.9166e-03, -5.8805e-03, + -4.1484e-02, 9.3217e-03, -1.1292e-02, -8.7944e-03, -3.3190e-03, + 5.7970e-03, 
-6.6078e-03, -2.4052e-02, -5.6347e-03, 8.4539e-03, + 1.9250e-02, 7.9559e-03, -3.0055e-03, -3.0398e-04, 2.7007e-02, + 3.1046e-03, 1.8332e-02, 5.5470e-03, 6.6815e-03, 1.1466e-02, + 1.9738e-02, 1.2176e-02, -2.0220e-02, 8.6928e-03, 4.2451e-03, + 4.4517e-03, -5.1524e-03, 1.0805e-02, -2.1935e-02, -1.7575e-02, + -1.2529e-02, -2.2191e-02, -1.0854e-02, -9.4462e-03, -2.9102e-02, + 2.6752e-02, -1.0919e-02, -2.6724e-02, 8.3694e-04, 2.9832e-03, + 1.4416e-02, -2.9906e-02, 2.3556e-02, -6.6624e-03, 2.6671e-02, + -3.6474e-02, 1.7237e-02, -2.5176e-02, 6.5560e-03, -2.6062e-02, + -2.3838e-02, 3.0629e-02, 2.5382e-02, 1.2302e-02, -1.1665e-02, + -7.0603e-03, 1.9931e-02, 2.3401e-02, -2.6047e-03, -2.7728e-02, + -1.7212e-02, 2.3061e-02, -2.5961e-02, 3.9764e-04, -2.9022e-02, + -1.5546e-03, 4.5519e-03, 2.3589e-02, -3.5005e-02, 4.1890e-03, + -1.5586e-02, 1.2389e-02, -2.1045e-02, 1.6377e-03, -1.1328e-02, + 1.0195e-02, 6.4322e-03, -3.8431e-02, 2.2918e-02, -4.0123e-03, + 6.6680e-02, 4.1135e-02, -1.5031e-02, -1.3550e-02, -2.2566e-02, + -2.3622e-03, -2.9323e-02, 2.1756e-02, 1.8399e-03, -4.2460e-03, + -1.5128e-03, -2.4731e-02, 1.8663e-02, 1.3469e-02, -1.3897e-02, + 2.6399e-02, -8.0740e-03, -4.6753e-03, 3.9857e-02, 6.2364e-03, + 2.2371e-03, 2.1501e-03, 5.9443e-02, 1.3574e-02, 7.6483e-03, + -6.2290e-03, 1.4324e-02, 1.2572e-02, 2.7331e-02, -6.0165e-03, + -5.9154e-03, -3.7000e-02, 1.4001e-02, 1.2869e-02, -2.8854e-02, + -9.4147e-03, 8.3965e-03, -1.4530e-03, -7.4215e-03, 9.0369e-03, + -2.4612e-02, 2.0625e-02, 2.2329e-02, -1.5216e-02, 1.4947e-03, + -3.6020e-02, -2.0702e-02, -4.0410e-02, -1.3157e-02, -1.5085e-02, + 1.2911e-02, -2.7552e-02, -2.9781e-02, -4.7424e-03, 2.0521e-02, + -4.0043e-02, -4.8763e-02, -1.3175e-02, 2.6802e-02, 2.8869e-02, + 6.5014e-03, -2.3213e-02, 1.4438e-02, -7.6318e-03, -1.9928e-03, + 1.8509e-03, 2.9728e-03, 1.5225e-02, -2.9405e-03, -7.2875e-03, + 2.9562e-05, -1.8661e-02, 9.1341e-03, -2.4919e-02, 2.9786e-02, + 9.5186e-03, 1.5435e-02, -1.1080e-02, 1.1192e-02, -2.7315e-03, + 6.9769e-05, -1.5392e-02, 4.9892e-03, 7.9857e-03, 2.0063e-02, + -2.0283e-02, -1.2596e-02, -4.1985e-04, -6.9686e-03, -5.4704e-02, + -1.9142e-02, 9.9706e-03, 2.3217e-02, -5.0579e-03, -4.9132e-02, + 2.0023e-02, -2.6238e-02, 1.0709e-02, 2.1528e-02, -1.6390e-03, + -6.7829e-03, 1.3211e-02, -9.6793e-03, 1.3130e-02, -1.2878e-02, + 1.7365e-02, 1.2509e-02, 1.2986e-03, -3.9292e-02, 9.5784e-03, + -8.0514e-03, -3.5619e-02, -3.2298e-02, 6.5933e-04, 9.9298e-03, + 3.7268e-02, -3.4047e-02, -7.8385e-03, 2.3999e-02, 1.0386e-02, + 1.7853e-02, -1.0122e-04, 5.2483e-04, -7.3150e-03, 1.0818e-02, + 1.6245e-02, -3.5619e-02, -9.9190e-03, 4.0132e-03, 9.7788e-03, + 2.7039e-02, -4.7858e-02, -2.0010e-02, -2.3702e-02, 7.8376e-04, + -2.5326e-02, 1.1698e-02, -1.3041e-02, 3.8634e-03, 9.3083e-03, + 4.8204e-03, 3.9503e-02, -4.1356e-03]], requires_grad=True) +model.Qformer.bert.encoder.layer[10].experts.gate.weight + +layer 11 +0: +model.Qformer.bert.encoder.layer[11].output.dense.weight.grad +model.Qformer.bert.encoder.layer[11].intermediate.dense.weight.grad + +nan: +model.Qformer.bert.encoder.layer[11].attention.output.dense.weight.grad +model.Qformer.bert.encoder.layer[11].attention.self.query.weight.grad +model.Qformer.bert.encoder.layer[11].experts.intermediate_query.dense.weight.grad +model.Qformer.bert.encoder.layer[11].experts.output_query.dense.weight.grad + +None: +model.Qformer.bert.encoder.layer[11].intermediate_query.dense.weight.grad +model.Qformer.bert.encoder.layer[11].output_query.dense.weight.grad + +layer 8 +0: 
+model.Qformer.bert.encoder.layer[8].experts.experts[0].intermediate_query.dense.weight.grad +model.Qformer.bert.encoder.layer[8].experts.experts[2].intermediate_query.dense.weight.grad +model.Qformer.bert.encoder.layer[8].experts.experts[0].output_query.dense.weight.grad +model.Qformer.bert.encoder.layer[8].experts.experts[2].output_query.dense.weight.grad + +nan: +model.Qformer.bert.encoder.layer[8].experts.experts[1].intermediate_query.dense.weight.grad +model.Qformer.bert.encoder.layer[8].experts.experts[1].output_query.dense.weight.grad +(Qformer)model.Qformer.bert.encoder.layer[8].intermediate_query.dense.weight.grad + +None: +model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad == None +model.Qformer.bert.encoder.layer[8].experts.gate.weight.requires_grad == True + + +model.Qformer.bert.encoder.layer[6].experts.gate.weight +Qformer.bert.encoder.layer.6.experts.gate.weight + +tensor([[-0.0089, -0.0123, -0.0168, ..., -0.0072, 0.0295, -0.0167], + [ 0.0305, 0.0277, -0.0215, ..., 0.0149, 0.0016, -0.0415], + [ 0.0199, 0.0151, 0.0237, ..., 0.0007, 0.0023, 0.0167]], + requires_grad=True) + +tensor([[-0.0089, -0.0123, -0.0168, ..., -0.0072, 0.0295, -0.0167], + [ 0.0305, 0.0277, -0.0215, ..., 0.0149, 0.0016, -0.0415], + [ 0.0199, 0.0151, 0.0237, ..., 0.0007, 0.0023, 0.0167]], + requires_grad=True) + + +tensor([[ 4.5972e-02, -1.5231e-02, -6.9533e-03, 3.2431e-02, -7.9703e-03, + 1.5567e-02, 2.9619e-03, -2.2609e-04, 1.8580e-02, -2.8783e-02, + 1.3093e-02, -1.0594e-02, 1.1918e-02, 4.4701e-02, 2.0108e-02, + -1.1011e-03, -8.2449e-03, 8.8876e-03, 4.6096e-03, 2.3274e-02, + -9.2557e-03, 2.5704e-03, 1.8919e-02, -5.3251e-03, -3.2665e-03, + -3.2663e-02, -5.6756e-02, -2.3400e-02, 1.3674e-02, -6.6185e-03, + 1.4429e-03, 1.2354e-02, 2.5934e-03, 2.1895e-02, -1.9793e-02, + 1.5497e-03, 4.3056e-03, -4.0023e-02, 9.8740e-03, 3.8631e-03, + -1.2918e-02, -3.6782e-02, -9.8365e-03, 3.2182e-02, 2.3729e-02, + 2.3509e-03, 1.8473e-02, 1.5583e-02, -1.1029e-02, -1.0738e-02, + -3.0278e-02, -9.8731e-03, -1.0500e-02, 7.9832e-05, -1.0345e-02, + 8.2803e-03, -5.9923e-03, -1.2669e-02, 1.2065e-03, 7.5720e-03, + -1.9286e-02, 4.0070e-02, 3.6221e-03, -1.7486e-02, 2.1725e-02, + -3.3231e-02, 7.3948e-03, -1.0924e-02, 3.1448e-02, 1.2101e-02, + 6.1737e-03, -2.0851e-02, -3.7964e-02, 8.0938e-03, -8.8967e-03, + 2.5925e-02, -7.8063e-04, 8.6102e-03, 2.7370e-02, 1.2323e-02, + 4.0606e-03, 3.9316e-02, -1.0837e-02, -2.6835e-03, 3.1941e-03, + -1.2017e-02, -2.3022e-02, 8.3533e-03, -2.2668e-02, 1.4438e-02, + -2.3664e-02, 4.5595e-02, -1.0962e-02, 1.7547e-02, -1.6739e-03, + 1.2048e-02, 2.0544e-02, 2.8837e-02, -1.6736e-02, 2.1207e-02, + 8.7612e-03, 2.8757e-02, -3.8561e-03, 8.4050e-03, -1.1503e-02, + -5.8332e-03, 1.5734e-02, -1.0773e-02, 7.5827e-03, 6.5794e-03, + 2.4291e-02, 2.6811e-02, 1.1681e-02, -3.3246e-02, 4.5776e-03, + -9.0628e-04, -2.9400e-02, 4.2933e-03, 1.5885e-03, 5.5757e-02, + 7.5518e-03, 1.0099e-02, 5.3507e-03, -3.0182e-02, 2.0830e-02, + 1.0102e-02, -9.3074e-03, 3.1161e-02, -1.7800e-02, -4.4445e-03, + -3.1503e-02, 2.3028e-02, 8.3472e-03, 7.4444e-03, 1.8838e-02, + -1.1977e-02, -2.6713e-02, 1.1364e-02, 8.3522e-04, 3.3736e-03, + 6.9425e-03, -2.0632e-02, 1.8155e-02, -2.1711e-02, -3.4703e-02, + -3.6268e-03, -4.8810e-03, -2.8142e-02, -1.5781e-02, -3.3166e-02, + -2.9910e-02, -9.7459e-03, -6.7474e-03, 1.7988e-02, 9.0176e-03, + 1.9452e-02, 4.2009e-02, 1.7217e-02, 1.4959e-02, -1.6552e-02, + -3.8206e-03, -2.4889e-02, 7.7993e-03, -1.9285e-02, -1.9770e-02, + 2.6936e-02, -5.0484e-03, -2.5117e-02, -2.3122e-02, 1.3754e-02, + 1.6025e-02, -9.1569e-03, 
-2.0068e-02, -1.6013e-02, -2.1775e-02, + -2.4154e-02, 6.2840e-03, -1.3684e-02, 2.5378e-02, -1.3166e-02, + -1.2201e-02, 1.0011e-02, -8.2324e-03, -5.6623e-03, -1.0383e-02, + -1.6251e-02, 1.0723e-02, -3.0207e-03, -6.9374e-03, -2.3161e-03, + -2.0850e-03, -3.4216e-02, 3.3997e-02, 3.7444e-02, -3.4273e-02, + 1.5051e-02, -9.5605e-03, -2.6979e-03, 1.8848e-02, 2.3090e-02, + 1.9669e-02, -3.9656e-02, 1.0453e-02, 5.2222e-03, -7.2493e-03, + 1.4122e-02, 5.6583e-04, -1.3991e-02, 4.0975e-02, 1.3947e-02, + 4.6919e-03, 7.9121e-03, 2.6936e-02, 1.2338e-02, 1.9048e-02, + 7.7740e-03, -6.4494e-03, -5.2965e-02, 8.1929e-03, -1.3503e-02, + 3.7466e-03, -3.3504e-02, -8.1192e-03, 1.0463e-02, -2.1568e-02, + 1.0076e-02, -1.3420e-02, -6.3353e-04, 7.4253e-03, 2.2281e-02, + 5.2829e-03, 1.4102e-02, 1.4427e-02, 1.6331e-02, -2.3305e-04, + -4.4875e-02, 6.5300e-03, 2.4963e-02, 2.2141e-03, 3.9830e-02, + 1.1405e-02, 8.6810e-03, -2.0404e-03, -1.8579e-03, 1.4765e-02, + 5.4752e-03, -1.3364e-02, -1.3082e-03, 1.5873e-03, 1.9309e-02, + 3.4367e-02, 1.8459e-02, -1.1323e-02, -1.8764e-02, -1.5370e-02, + 3.6180e-03, 2.8253e-02, -1.6867e-03, 3.5884e-03, -2.1952e-02, + -1.5026e-02, -2.1070e-02, -1.2149e-02, 1.1162e-02, -3.0343e-02, + -4.1372e-02, 1.0880e-02, 2.2365e-02, 1.2896e-02, 2.9694e-02, + -8.4248e-03, -7.8876e-03, -6.7049e-03, 2.3700e-02, 4.7528e-03, + -7.8350e-03, -5.9220e-03, 3.8396e-02, -4.1598e-02, -2.3161e-03, + 1.3419e-02, 7.1029e-03, 1.4195e-02, -1.1124e-02, 1.5812e-02, + -1.9789e-02, -2.3883e-02, -8.2788e-04, 1.4670e-02, -2.1482e-02, + -1.1182e-02, -1.6532e-02, -8.0637e-03, -3.7822e-02, 3.9402e-02, + -1.4097e-03, -7.6648e-03, -3.7156e-02, 2.5791e-02, 6.1038e-03, + -6.3429e-03, 3.2865e-03, 3.6277e-02, 9.4312e-03, -2.1003e-02, + -3.6885e-03, 1.7147e-02, -1.3079e-02, -4.9414e-02, -3.2066e-02, + 1.4835e-02, -2.9742e-02, 1.8358e-02, -2.1733e-02, 3.0256e-03, + 1.7825e-02, 1.1079e-02, 1.1619e-02, -2.3680e-02, -7.8721e-03, + 2.4456e-03, 4.3608e-02, -4.5674e-03, -3.6818e-02, 3.3952e-02, + 3.3108e-02, -3.1665e-03, -2.3468e-03, 1.5091e-02, 7.0856e-03, + 1.1723e-02, -2.0713e-02, -6.9180e-03, 3.7929e-02, 3.7671e-03, + 4.6663e-02, 9.5301e-03, 1.2638e-02, -6.5623e-03, -3.1771e-03, + -1.7568e-02, 1.8711e-03, -1.2310e-02, 2.1518e-02, 4.3408e-03, + -6.7171e-03, -5.0451e-03, 2.6870e-02, -1.9832e-02, 7.0422e-03, + 1.1274e-02, -2.4637e-02, -4.8450e-03, 2.1892e-02, -2.6059e-02, + 1.5605e-02, -1.1617e-02, -1.9273e-02, -8.6735e-04, -9.8002e-04, + -1.8553e-02, 2.1239e-02, 2.1078e-02, -1.2091e-02, 9.7025e-03, + 1.3426e-02, -1.1710e-02, -2.2242e-03, 6.4133e-03, -1.4820e-02, + 1.4682e-02, 3.0679e-02, 1.1526e-02, 1.0072e-02, -1.1572e-02, + 2.6128e-02, 4.0879e-03, -1.7936e-02, 1.3715e-02, -2.3667e-02, + 2.0419e-03, -1.6887e-02, 1.2595e-02, -2.1988e-02, -2.3777e-02, + -1.0399e-02, 2.4868e-03, -1.2265e-02, -1.8543e-02, 3.4672e-02, + 2.1114e-02, 2.0523e-02, 7.6818e-03, 2.9282e-02, -5.9593e-03, + -2.8496e-02, 2.8482e-03, 3.6874e-04, 4.7455e-02, -2.9770e-02, + -2.0684e-02, -2.0749e-02, -5.7681e-02, -2.6175e-03, -2.4488e-02, + -5.2550e-03, -7.1191e-03, 3.8192e-02, 4.3438e-02, 5.4181e-03, + 2.8392e-02, 1.9493e-02, -3.5262e-02, 1.4839e-02, 4.6481e-03, + 1.7219e-02, 2.0160e-02, 4.9998e-03, 2.1316e-02, -8.7929e-04, + -2.1542e-02, 3.9816e-03, 1.5879e-02, 9.9231e-03, 1.3962e-02, + -5.3418e-03, 3.9857e-02, 2.0997e-02, -2.1291e-05, 1.8133e-02, + -1.2472e-02, 4.9437e-03, -1.5099e-02, 4.8860e-02, 6.1980e-03, + 2.0197e-02, 1.3141e-04, -3.1087e-03, -2.2718e-03, 2.3804e-02, + 6.0726e-03, -2.0485e-02, -2.0514e-02, -2.7679e-02, -3.0412e-02, + -1.7661e-02, -1.7462e-02, 
7.5216e-03, 2.2238e-02, 1.1413e-03, + 2.6647e-02, -2.3855e-02, 2.2652e-03, -4.3256e-03, -9.3274e-03, + 2.5149e-02, 6.8432e-03, 4.2664e-03, 3.8221e-02, 7.7480e-03, + 8.7203e-03, -1.2851e-03, -1.1325e-02, -1.0650e-02, -2.8079e-02, + -1.5375e-02, 2.2630e-02, -4.3439e-03, 1.3493e-02, -1.8223e-02, + 9.9750e-03, -2.4560e-02, 1.0904e-03, -3.1198e-02, 4.7331e-03, + 1.6713e-02, -1.7653e-02, -3.8674e-02, 1.5458e-02, 4.0555e-02, + 6.9451e-03, 1.1988e-03, 8.0718e-04, 3.9985e-03, -2.2781e-02, + 8.1173e-04, 2.0106e-02, -1.2800e-02, -1.2961e-02, -2.1273e-02, + -4.4104e-05, -3.6080e-02, -1.9392e-02, 3.2862e-02, -5.6041e-03, + 2.3288e-02, -4.6795e-02, 1.7282e-02, 5.7052e-03, 2.2405e-02, + 1.9871e-03, -1.4333e-02, 5.3773e-03, 4.3568e-02, 9.8980e-03, + -1.9403e-03, 1.8981e-02, -2.5712e-02, -3.3621e-03, 2.9886e-02, + 1.3326e-03, 1.1318e-02, -3.3238e-03, -1.5494e-02, -3.0565e-02, + 1.7137e-02, -2.7874e-02, -1.1257e-02, 3.2250e-02, -2.5293e-02, + -3.0693e-03, -2.7787e-02, 1.4931e-02, 2.4202e-03, -4.0572e-03, + 5.0273e-03, 9.7496e-03, 2.2601e-02, 3.2389e-02, -1.1910e-02, + 9.1037e-03, 5.6000e-02, -1.9640e-02, 1.5469e-02, -3.3027e-02, + 1.4839e-02, 2.5071e-02, -1.2687e-02, -1.3466e-02, 1.9031e-02, + -7.3403e-03, -1.5207e-02, -1.4486e-02, 2.0678e-02, -4.1996e-02, + 1.0585e-02, 3.6276e-02, 6.1149e-03, 1.6405e-02, 1.5643e-02, + 1.5060e-02, -5.1235e-03, -2.2824e-02, -1.3752e-02, -1.5742e-02, + 2.4032e-02, -2.1782e-03, -1.3158e-02, 3.9482e-03, 3.2267e-02, + -2.2632e-03, 1.2055e-02, 4.4731e-02, 1.8271e-02, -1.1486e-02, + 1.7836e-02, 1.7886e-03, -2.4020e-02, 2.6064e-02, -2.2122e-04, + 1.8643e-02, -2.9808e-02, -6.1845e-03, -4.4464e-03, 8.8374e-04, + 1.5268e-02, 1.7205e-03, 5.7832e-02, -1.7486e-02, 1.1897e-02, + 5.8081e-02, 1.7667e-02, -7.7282e-03, 1.4036e-02, -1.4936e-03, + 6.0635e-04, 1.6124e-03, -1.6916e-02, -1.1239e-02, 1.8497e-02, + 1.2334e-03, -2.0706e-02, 3.2959e-03, 2.9186e-02, 3.7506e-02, + 1.2037e-02, -1.4903e-02, 8.5606e-03, 3.4136e-03, 1.1850e-02, + -7.4782e-03, 5.3924e-03, -2.4772e-02, 2.6840e-02, -2.7656e-02, + -3.2637e-02, -1.2779e-02, 1.0730e-02, 1.4096e-03, 3.1572e-02, + 7.8976e-04, 3.1674e-02, 8.5333e-03, -1.2679e-02, 1.1176e-02, + -2.0446e-02, 1.8628e-02, -4.0158e-02, -2.3358e-02, -2.2504e-02, + -2.8759e-02, -1.4597e-02, -8.5879e-03, 1.0550e-02, -3.5556e-02, + -1.9046e-02, -1.9159e-02, -2.2703e-02, -7.2056e-03, 4.2380e-02, + -9.7475e-03, -2.4754e-02, 1.3992e-03, -1.0411e-02, 1.5708e-02, + -8.2899e-03, -6.4856e-03, 1.6359e-02, -5.1969e-04, -5.0958e-03, + -4.1232e-02, 2.7349e-03, -1.7723e-02, 1.3388e-02, 2.2776e-03, + -2.0786e-02, -1.8082e-02, -2.4866e-03, 2.2141e-02, 6.9998e-03, + -5.5714e-03, 2.1088e-02, 5.8745e-03, 1.2788e-02, 4.2977e-03, + 5.8631e-03, -1.8121e-02, 1.9242e-03, 2.3622e-02, 1.4917e-02, + -5.3198e-03, -3.9222e-02, -2.4697e-02, 9.1218e-03, -1.0711e-02, + 1.0268e-02, 1.5148e-02, -4.4508e-02, 4.6783e-03, 2.8093e-03, + 9.1253e-03, -7.3281e-03, 1.0114e-03, -9.2369e-04, 1.4841e-02, + 2.2642e-02, 2.3675e-02, 1.3902e-02, -5.6343e-03, 1.4851e-02, + -9.5169e-03, -3.1721e-02, 1.6696e-02, 2.9285e-02, -1.4090e-02, + 2.1128e-02, 4.8656e-02, 3.8431e-02, -3.5470e-02, -4.8230e-03, + -1.6513e-02, 4.1917e-02, 8.9090e-03, -1.4022e-04, 4.0182e-03, + 7.1723e-03, 3.1419e-02, -4.8508e-03, 1.7768e-03, -7.3688e-03, + 3.4637e-03, -2.3227e-02, 3.9606e-05, -2.4731e-02, -1.3640e-02, + -5.1718e-03, 2.6662e-02, -1.2871e-02, -1.6009e-02, -5.3720e-03, + 2.7397e-04, -3.4016e-03, 2.6429e-02, 3.8069e-02, 1.0929e-02, + -1.0620e-02, 1.2165e-02, -2.6018e-02, 1.6021e-02, 4.0644e-02, + -8.0898e-03, -3.5198e-02, -1.9602e-02, 
2.4986e-02, -5.8400e-03, + 3.2070e-02, -1.8265e-02, -5.4518e-03, 2.8195e-02, 5.5598e-02, + -3.9959e-02, 1.5521e-02, -2.8416e-02, 3.1130e-02, -1.0038e-02, + 2.1522e-02, -1.1654e-02, 2.2382e-02, -5.4467e-03, -2.2840e-02, + 2.7036e-03, -4.4607e-02, -4.1953e-02, 2.0079e-02, -5.0121e-03, + -1.7495e-02, 4.4070e-03, 3.7400e-04, 1.0899e-02, 1.7008e-02, + -1.6307e-02, -1.9986e-02, -2.3865e-02, -2.5618e-02, -2.9981e-02, + -2.7230e-03, 2.7079e-02, 5.2920e-03, 2.1069e-02, -2.5896e-02, + -1.6256e-02, -1.4182e-03, 1.1829e-02, 1.0360e-02, 2.8883e-02, + -6.8762e-03, 1.4032e-02, -4.3389e-03]], requires_grad=True)] \ No newline at end of file diff --git a/test1.txt b/test1.txt new file mode 100644 index 0000000..a6e7a8b --- /dev/null +++ b/test1.txt @@ -0,0 +1,109 @@ +from torchviz import make_dot +dot = make_dot(query_output.last_hidden_state, params=dict(self.Qformer.bert.named_parameters())) +log_dir = '/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/' +dot.render(filename="Pre_PromptMoE_RawProb_backward_graph", directory=log_dir, format="pdf") + + +# Pre-Prompt-MoE +model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad +model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad +model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad +model.Qformer.bert.encoder.layer[6].experts.experts[0].dense1.weight.grad +model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight.grad +model.Qformer.bert.encoder.layer[8].experts.experts[0].dense1.weight.grad +model.Qformer.bert.encoder.layer[8].experts.experts[1].dense1.weight.grad +model.Qformer.bert.encoder.layer[8].experts.experts[2].dense1.weight.grad + + +model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad +model.Qformer.bert.encoder.layer[9].intermediate_query.dense.weight +model.Qformer.bert.encoder.layer[9].intermediate_query.dense.weight.grad +model.Qformer.bert.encoder.layer[10].intermediate.dense.weight.grad +model.Qformer.bert.encoder.layer[11].intermediate.dense.weight.grad + +model.Qformer.bert.encoder.layer[10].intermediate_query.dense.weight +model.Qformer.bert.encoder.layer[10].experts.experts[2].dense1.weight +model.Qformer.bert.encoder.layer[10].experts.experts[1].dense1.weight +model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight +model.Qformer.bert.encoder.layer[10].intermediate_query.dense.weight == model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight + +# Pre-MoE gate-sentence +# model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad does not update + +# Pre-MoE gate-token +# updates normally + +# Post-MoE gate-sentence +model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad +# model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad updates normally +# model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad is all 0/-0 +# model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad is all 0/-0 + +# Route-MoE +# the beam_scores computed by Pre-MoE are problematic + +# Post-Route updates the parameters of multiple experts; it also updates the gate parameters +# Layer 6 updated the parameters of two experts (layer 6, layer 8) +# model.Qformer.bert.encoder.layer[11].intermediate.dense.weight.grad is 0? all of it is 0 +# model.Qformer.bert.encoder.layer[11].output.dense.weight.grad + +model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad +model.Qformer.bert.encoder.layer[6].experts.experts[0].intermediate_query.dense.weight.grad +model.Qformer.bert.encoder.layer[6].experts.experts[1].intermediate_query.dense.weight.grad + +model.Qformer.bert.encoder.layer[7].experts.gate.weight.grad +model.Qformer.bert.encoder.layer[7].experts.experts[0].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[7].experts.experts[1].intermediate_query.dense.weight.grad + +model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad +model.Qformer.bert.encoder.layer[8].experts.experts[0].intermediate_query.dense.weight.grad +model.Qformer.bert.encoder.layer[8].experts.experts[1].intermediate_query.dense.weight.grad + +model.Qformer.bert.encoder.layer[9].experts.gate.weight.grad +model.Qformer.bert.encoder.layer[9].experts.experts[0].intermediate_query.dense.weight.grad +model.Qformer.bert.encoder.layer[9].experts.experts[1].intermediate_query.dense.weight.grad + +model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad +model.Qformer.bert.encoder.layer[10].experts.experts[0].intermediate_query.dense.weight.grad +model.Qformer.bert.encoder.layer[10].experts.experts[1].intermediate_query.dense.weight.grad + +model.Qformer.bert.encoder.layer[11].experts.gate.weight.grad +model.Qformer.bert.encoder.layer[11].experts.experts[0].intermediate_query.dense.weight.grad +model.Qformer.bert.encoder.layer[11].experts.experts[1].intermediate_query.dense.weight.grad + + +(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.10.experts.experts.0.dense1.weight'] +[Parameter containing: +tensor([[-0.0328, 0.0414, 0.0010, ..., -0.0068, 0.0244, 0.0587], + [ 0.0120, 0.0458, 0.0171, ..., -0.0439, -0.0107, -0.0397], + [ 0.0239, 0.0191, -0.0145, ..., 0.0008, -0.0067, 0.0090], + ..., + [ 0.0174, -0.0465, -0.0106, ..., -0.0095, 0.0153, -0.0195], + [-0.0151, -0.0082, -0.0320, ..., -0.0016, -0.0232, -0.0147], + [ 0.0142, -0.0286, 0.0161, ..., -0.0160, -0.0306, -0.0272]], + device='cuda:0', requires_grad=True)] +(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.8.experts.experts.0.dense1.weight'] +[Parameter containing: +tensor([[ 0.0024, 0.0218, -0.0186, ..., -0.0178, -0.0067, 0.0820], + [-0.0759, -0.0002, -0.0548, ..., 0.0292, 0.0531, 0.0779], + [-0.0220, -0.0037, -0.0520, ..., -0.0426, -0.0261, -0.0357], + ..., + [-0.0448, 0.0471, 0.0133, ..., -0.0062, -0.0217, -0.0203], + [ 0.0532, 0.0197, 0.0320, ..., -0.0010, -0.0838, 0.0682], + [ 0.0284, 0.0038, -0.0007, ..., -0.0305, 0.0296, 0.0056]], + device='cuda:0', requires_grad=True)] +(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.6.experts.experts.0.dense1.weight'] +[Parameter containing: +tensor([[ 6.5176e-02, -4.6473e-02, -2.7396e-02, ..., 2.1774e-03, + 6.1457e-02, 1.9180e-03], + [ 7.3707e-03, 6.1392e-02, -2.7108e-02, ..., 4.0778e-02, + -1.9791e-02, -1.1612e-02], + [ 2.1193e-02, -3.8323e-02, -6.0238e-02, ..., -1.4539e-02, + 9.2965e-02, 3.9153e-02], + ..., + [ 5.3203e-03, -1.7276e-02, -3.2191e-02, ..., -1.6435e-02, + -1.8553e-02, -2.8158e-02], + [-6.9853e-02, 9.2719e-03, -1.8895e-03, ..., -2.6425e-02, + 1.4880e-03, 3.4505e-02], + [-1.2168e-03, 3.7038e-02, 4.8047e-02, ..., -3.4523e-03, + -1.3030e-05, -1.4778e-02]], device='cuda:0', requires_grad=True)] \ No newline at end of file
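Note on the 0 / nan / None gradient statuses recorded in test.txt and test1.txt: they were collected one parameter at a time under pdb. Below is a minimal sketch that mechanizes the same audit; it assumes `model` is the PromptMoE Q-Former model used throughout, that it is run right after loss.backward(), and the helper name audit_grads is made up for illustration.

import torch

def audit_grads(module, keyword='experts'):
    # Classify each matching parameter's gradient the way the notes above do:
    # 'None' = never reached by backward, 'nan' = numerically broken,
    # '0' = reached but the gradient is all zeros, 'ok' = a normal gradient.
    for name, p in module.named_parameters():
        if not p.requires_grad or keyword not in name:
            continue
        if p.grad is None:
            status = 'None'
        elif torch.isnan(p.grad).any():
            status = 'nan'
        elif (p.grad == 0).all():
            status = '0'
        else:
            status = 'ok'
        print(f'{status:>4}  {name}')

audit_grads(model.Qformer.bert, keyword='experts')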
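Likewise, the elementwise == used in test1.txt to compare layer[10].intermediate_query.dense.weight with experts.experts[0].dense1.weight returns a boolean tensor rather than a single answer, which is easy to misread in pdb. A sketch of a stricter check, under the same assumption that `model` is the loaded PromptMoE model:

import torch

layer = model.Qformer.bert.encoder.layer[10]
a = layer.intermediate_query.dense.weight
b = layer.experts.experts[0].dense1.weight

# torch.equal: same shape and values (e.g. the expert was initialized as a
#              copy of the dense FFN weights);
# data_ptr:    same underlying storage, i.e. the two names alias one
#              Parameter and would accumulate gradients together.
print(torch.equal(a, b))
print(a.data_ptr() == b.data_ptr())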