diff --git a/Pre_PromptMoE_RawProb_backward_graph b/Pre_PromptMoE_RawProb_backward_graph
new file mode 100644
index 0000000..3a8d029
--- /dev/null
+++ b/Pre_PromptMoE_RawProb_backward_graph
@@ -0,0 +1,5294 @@
+digraph {
+ graph [size="739.65,739.65"]
+ node [align=left fontname=monospace fontsize=10 height=0.2 ranksep=0.1 shape=box style=filled]
+ 140202223089520 [label="
+ (1, 46, 768)" fillcolor=darkolivegreen1]
+ 140202228657312 [label=CatBackward0]
+ 140202228615488 -> 140202228657312
+ 140202228615488 [label=NativeLayerNormBackward0]
+ 140202228614096 -> 140202228615488
+ 140202228614096 [label=AddBackward0]
+ 140202223538720 -> 140202228614096
+ 140202223538720 [label=NativeDropoutBackward0]
+ 140202223538912 -> 140202223538720
+ 140202223538912 [label=ViewBackward0]
+ 140202223539008 -> 140202223538912
+ 140202223539008 [label=AddmmBackward0]
+ 140202223539104 -> 140202223539008
+ 140202223539104 [label=ToCopyBackward0]
+ 140202223539296 -> 140202223539104
+ 140202228893712 [label="encoder.layer.11.experts.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228893712 -> 140202223539296
+ 140202223539296 [label=AccumulateGrad]
+ 140202223538864 -> 140202223539008
+ 140202223538864 [label=ViewBackward0]
+ 140202223539152 -> 140202223538864
+ 140202223539152 [label=GeluBackward0]
+ 140202223539248 -> 140202223539152
+ 140202223539248 [label=ViewBackward0]
+ 140202223539680 -> 140202223539248
+ 140202223539680 [label=AddmmBackward0]
+ 140202223539584 -> 140202223539680
+ 140202223539584 [label=ToCopyBackward0]
+ 140202223538528 -> 140202223539584
+ 140202228893952 [label="encoder.layer.11.experts.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228893952 -> 140202223538528
+ 140202223538528 [label=AccumulateGrad]
+ 140202223539440 -> 140202223539680
+ 140202223539440 [label=ViewBackward0]
+ 140202223538288 -> 140202223539440
+ 140202223538288 [label=ToCopyBackward0]
+ 140202223538480 -> 140202223538288
+ 140202223538480 [label=SliceBackward0]
+ 140202223538336 -> 140202223538480
+ 140202223538336 [label=SliceBackward0]
+ 140202223539776 -> 140202223538336
+ 140202223539776 [label=SliceBackward0]
+ 140202223539872 -> 140202223539776
+ 140202223539872 [label=SliceBackward0]
+ 140202223539968 -> 140202223539872
+ 140202223539968 [label=SliceBackward0]
+ 140202223540064 -> 140202223539968
+ 140202223540064 [label=NativeLayerNormBackward0]
+ 140202223540160 -> 140202223540064
+ 140202223540160 [label=AddBackward0]
+ 140202223540352 -> 140202223540160
+ 140202223540352 [label=NativeDropoutBackward0]
+ 140202223540304 -> 140202223540352
+ 140202223540304 [label=ViewBackward0]
+ 140202223540400 -> 140202223540304
+ 140202223540400 [label=AddmmBackward0]
+ 140202223540496 -> 140202223540400
+ 140202223540496 [label=ToCopyBackward0]
+ 140202223540688 -> 140202223540496
+ 140202228904080 [label="encoder.layer.11.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228904080 -> 140202223540688
+ 140202223540688 [label=AccumulateGrad]
+ 140202223540640 -> 140202223540400
+ 140202223540640 [label=ViewBackward0]
+ 140202223540928 -> 140202223540640
+ 140202223540928 [label=ViewBackward0]
+ 140202223541024 -> 140202223540928
+ 140202223541024 [label=CloneBackward0]
+ 140202223541120 -> 140202223541024
+ 140202223541120 [label=PermuteBackward0]
+ 140202223541216 -> 140202223541120
+ 140202223541216 [label=UnsafeViewBackward0]
+ 140202223541312 -> 140202223541216
+ 140202223541312 [label=BmmBackward0]
+ 140202223541408 -> 140202223541312
+ 140202223541408 [label=ReshapeAliasBackward0]
+ 140202223541504 -> 140202223541408
+ 140202223541504 [label=ExpandBackward0]
+ 140202223541600 -> 140202223541504
+ 140202223541600 [label=ToCopyBackward0]
+ 140202223541792 -> 140202223541600
+ 140202223541792 [label=NativeDropoutBackward0]
+ 140202223541984 -> 140202223541792
+ 140202223541984 [label=SoftmaxBackward0]
+ 140202223542080 -> 140202223541984
+ 140202223542080 [label=AddBackward0]
+ 140202223541264 -> 140202223542080
+ 140202223541264 [label=DivBackward0]
+ 140202223575296 -> 140202223541264
+ 140202223575296 [label=UnsafeViewBackward0]
+ 140202223575392 -> 140202223575296
+ 140202223575392 [label=BmmBackward0]
+ 140202223575584 -> 140202223575392
+ 140202223575584 [label=ReshapeAliasBackward0]
+ 140202223575968 -> 140202223575584
+ 140202223575968 [label=ExpandBackward0]
+ 140202223576160 -> 140202223575968
+ 140202223576160 [label=PermuteBackward0]
+ 140202223576208 -> 140202223576160
+ 140202223576208 [label=ViewBackward0]
+ 140202223576448 -> 140202223576208
+ 140202223576448 [label=ViewBackward0]
+ 140202223576640 -> 140202223576448
+ 140202223576640 [label=AddmmBackward0]
+ 140202223576688 -> 140202223576640
+ 140202223576688 [label=ToCopyBackward0]
+ 140202223577120 -> 140202223576688
+ 140202228906560 [label="encoder.layer.11.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202228906560 -> 140202223577120
+ 140202223577120 [label=AccumulateGrad]
+ 140202223576544 -> 140202223576640
+ 140202223576544 [label=ViewBackward0]
+ 140202223577024 -> 140202223576544
+ 140202223577024 [label=ToCopyBackward0]
+ 140202223540112 -> 140202223577024
+ 140202223540112 [label=CatBackward0]
+ 140202223577408 -> 140202223540112
+ 140202223577408 [label=NativeLayerNormBackward0]
+ 140202223577504 -> 140202223577408
+ 140202223577504 [label=AddBackward0]
+ 140202223577792 -> 140202223577504
+ 140202223577792 [label=SumBackward1]
+ 140202223578128 -> 140202223577792
+ 140202223578128 [label=MulBackward0]
+ 140202223578368 -> 140202223578128
+ 140202223578368 [label=PermuteBackward0]
+ 140202223578464 -> 140202223578368
+ 140202223578464 [label=CatBackward0]
+ 140202223578656 -> 140202223578464
+ 140202223578656 [label=UnsqueezeBackward0]
+ 140202223578944 -> 140202223578656
+ 140202223578944 [label=NativeDropoutBackward0]
+ 140202223578752 -> 140202223578944
+ 140202223578752 [label=ViewBackward0]
+ 140202223079536 -> 140202223578752
+ 140202223079536 [label=AddmmBackward0]
+ 140202223079776 -> 140202223079536
+ 140202223079776 [label=ToCopyBackward0]
+ 140202223080064 -> 140202223079776
+ 140202228905360 [label="encoder.layer.10.experts.experts.0.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228905360 -> 140202223080064
+ 140202223080064 [label=AccumulateGrad]
+ 140202223079872 -> 140202223079536
+ 140202223079872 [label=ViewBackward0]
+ 140202223080352 -> 140202223079872
+ 140202223080352 [label=GeluBackward0]
+ 140202223080400 -> 140202223080352
+ 140202223080400 [label=ViewBackward0]
+ 140202223080640 -> 140202223080400
+ 140202223080640 [label=AddmmBackward0]
+ 140202223080832 -> 140202223080640
+ 140202223080832 [label=ToCopyBackward0]
+ 140202223081120 -> 140202223080832
+ 140202228905280 [label="encoder.layer.10.experts.experts.0.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228905280 -> 140202223081120
+ 140202223081120 [label=AccumulateGrad]
+ 140202223080544 -> 140202223080640
+ 140202223080544 [label=ViewBackward0]
+ 140202223081024 -> 140202223080544
+ 140202223081024 [label=ToCopyBackward0]
+ 140202223577888 -> 140202223081024
+ 140202223577888 [label=SliceBackward0]
+ 140202223081360 -> 140202223577888
+ 140202223081360 [label=SliceBackward0]
+ 140202223081600 -> 140202223081360
+ 140202223081600 [label=NativeLayerNormBackward0]
+ 140202223081792 -> 140202223081600
+ 140202223081792 [label=AddBackward0]
+ 140202223082080 -> 140202223081792
+ 140202223082080 [label=NativeDropoutBackward0]
+ 140202223082176 -> 140202223082080
+ 140202223082176 [label=ViewBackward0]
+ 140202223082368 -> 140202223082176
+ 140202223082368 [label=AddmmBackward0]
+ 140202223082464 -> 140202223082368
+ 140202223082464 [label=ToCopyBackward0]
+ 140202223082848 -> 140202223082464
+ 140202228924880 [label="encoder.layer.10.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228924880 -> 140202223082848
+ 140202223082848 [label=AccumulateGrad]
+ 140202223082560 -> 140202223082368
+ 140202223082560 [label=ViewBackward0]
+ 140202223083040 -> 140202223082560
+ 140202223083040 [label=ViewBackward0]
+ 140202223083232 -> 140202223083040
+ 140202223083232 [label=CloneBackward0]
+ 140202223083280 -> 140202223083232
+ 140202223083280 [label=PermuteBackward0]
+ 140202223083424 -> 140202223083280
+ 140202223083424 [label=UnsafeViewBackward0]
+ 140202223082800 -> 140202223083424
+ 140202223082800 [label=BmmBackward0]
+ 140202223108400 -> 140202223082800
+ 140202223108400 [label=ReshapeAliasBackward0]
+ 140202223108544 -> 140202223108400
+ 140202223108544 [label=ExpandBackward0]
+ 140202223108736 -> 140202223108544
+ 140202223108736 [label=ToCopyBackward0]
+ 140202223108928 -> 140202223108736
+ 140202223108928 [label=NativeDropoutBackward0]
+ 140202223109024 -> 140202223108928
+ 140202223109024 [label=SoftmaxBackward0]
+ 140202223109216 -> 140202223109024
+ 140202223109216 [label=AddBackward0]
+ 140202223109408 -> 140202223109216
+ 140202223109408 [label=DivBackward0]
+ 140202223109504 -> 140202223109408
+ 140202223109504 [label=UnsafeViewBackward0]
+ 140202223109696 -> 140202223109504
+ 140202223109696 [label=BmmBackward0]
+ 140202223109888 -> 140202223109696
+ 140202223109888 [label=ReshapeAliasBackward0]
+ 140202223110272 -> 140202223109888
+ 140202223110272 [label=ExpandBackward0]
+ 140202223110320 -> 140202223110272
+ 140202223110320 [label=PermuteBackward0]
+ 140202223110560 -> 140202223110320
+ 140202223110560 [label=ViewBackward0]
+ 140202223110752 -> 140202223110560
+ 140202223110752 [label=ViewBackward0]
+ 140202223110800 -> 140202223110752
+ 140202223110800 [label=AddmmBackward0]
+ 140202223111040 -> 140202223110800
+ 140202223111040 [label=ToCopyBackward0]
+ 140202223111280 -> 140202223111040
+ 140202228925600 [label="encoder.layer.10.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202228925600 -> 140202223111280
+ 140202223111280 [label=AccumulateGrad]
+ 140202223110848 -> 140202223110800
+ 140202223110848 [label=ViewBackward0]
+ 140202223111328 -> 140202223110848
+ 140202223111328 [label=ToCopyBackward0]
+ 140202223081888 -> 140202223111328
+ 140202223081888 [label=SliceBackward0]
+ 140202223111712 -> 140202223081888
+ 140202223111712 [label=SliceBackward0]
+ 140202223111760 -> 140202223111712
+ 140202223111760 [label=SliceBackward0]
+ 140202223112000 -> 140202223111760
+ 140202223112000 [label=NativeLayerNormBackward0]
+ 140202223112096 -> 140202223112000
+ 140202223112096 [label=AddBackward0]
+ 140202223137120 -> 140202223112096
+ 140202223137120 [label=NativeDropoutBackward0]
+ 140202223137216 -> 140202223137120
+ 140202223137216 [label=ViewBackward0]
+ 140202223137408 -> 140202223137216
+ 140202223137408 [label=AddmmBackward0]
+ 140202223137504 -> 140202223137408
+ 140202223137504 [label=ToCopyBackward0]
+ 140202223137888 -> 140202223137504
+ 140202228926080 [label="encoder.layer.10.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228926080 -> 140202223137888
+ 140202223137888 [label=AccumulateGrad]
+ 140202223137600 -> 140202223137408
+ 140202223137600 [label=ViewBackward0]
+ 140202223138080 -> 140202223137600
+ 140202223138080 [label=ViewBackward0]
+ 140202223138272 -> 140202223138080
+ 140202223138272 [label=CloneBackward0]
+ 140202223138320 -> 140202223138272
+ 140202223138320 [label=PermuteBackward0]
+ 140202223138560 -> 140202223138320
+ 140202223138560 [label=UnsafeViewBackward0]
+ 140202223138752 -> 140202223138560
+ 140202223138752 [label=BmmBackward0]
+ 140202223138800 -> 140202223138752
+ 140202223138800 [label=ReshapeAliasBackward0]
+ 140202223138944 -> 140202223138800
+ 140202223138944 [label=ExpandBackward0]
+ 140202223139136 -> 140202223138944
+ 140202223139136 [label=ToCopyBackward0]
+ 140202223139328 -> 140202223139136
+ 140202223139328 [label=NativeDropoutBackward0]
+ 140202223139424 -> 140202223139328
+ 140202223139424 [label=SoftmaxBackward0]
+ 140202223139616 -> 140202223139424
+ 140202223139616 [label=AddBackward0]
+ 140202223139808 -> 140202223139616
+ 140202223139808 [label=DivBackward0]
+ 140202223139904 -> 140202223139808
+ 140202223139904 [label=UnsafeViewBackward0]
+ 140202223140096 -> 140202223139904
+ 140202223140096 [label=BmmBackward0]
+ 140202223140288 -> 140202223140096
+ 140202223140288 [label=ReshapeAliasBackward0]
+ 140202223140672 -> 140202223140288
+ 140202223140672 [label=ExpandBackward0]
+ 140202223140720 -> 140202223140672
+ 140202223140720 [label=PermuteBackward0]
+ 140202223140768 -> 140202223140720
+ 140202223140768 [label=ViewBackward0]
+ 140202223169888 -> 140202223140768
+ 140202223169888 [label=ViewBackward0]
+ 140202223169936 -> 140202223169888
+ 140202223169936 [label=AddmmBackward0]
+ 140202223170176 -> 140202223169936
+ 140202223170176 [label=ToCopyBackward0]
+ 140202223170416 -> 140202223170176
+ 140202228926800 [label="encoder.layer.10.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202228926800 -> 140202223170416
+ 140202223170416 [label=AccumulateGrad]
+ 140202223169984 -> 140202223169936
+ 140202223169984 [label=ViewBackward0]
+ 140202223170464 -> 140202223169984
+ 140202223170464 [label=ToCopyBackward0]
+ 140202223136928 -> 140202223170464
+ 140202223136928 [label=CatBackward0]
+ 140202223170848 -> 140202223136928
+ 140202223170848 [label=NativeLayerNormBackward0]
+ 140202223170944 -> 140202223170848
+ 140202223170944 [label=AddBackward0]
+ 140202223171232 -> 140202223170944
+ 140202223171232 [label=NativeDropoutBackward0]
+ 140202223171616 -> 140202223171232
+ 140202223171616 [label=ViewBackward0]
+ 140202223171808 -> 140202223171616
+ 140202223171808 [label=AddmmBackward0]
+ 140202223171856 -> 140202223171808
+ 140202223171856 [label=ToCopyBackward0]
+ 140202223172288 -> 140202223171856
+ 140202228927280 [label="encoder.layer.9.experts.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228927280 -> 140202223172288
+ 140202223172288 [label=AccumulateGrad]
+ 140202223171712 -> 140202223171808
+ 140202223171712 [label=ViewBackward0]
+ 140202223172192 -> 140202223171712
+ 140202223172192 [label=GeluBackward0]
+ 140202223172384 -> 140202223172192
+ 140202223172384 [label=ViewBackward0]
+ 140202223172480 -> 140202223172384
+ 140202223172480 [label=AddmmBackward0]
+ 140202223172672 -> 140202223172480
+ 140202223172672 [label=ToCopyBackward0]
+ 140202223172960 -> 140202223172672
+ 140202228927520 [label="encoder.layer.9.experts.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228927520 -> 140202223172960
+ 140202223172960 [label=AccumulateGrad]
+ 140202223172768 -> 140202223172480
+ 140202223172768 [label=ViewBackward0]
+ 140202223173248 -> 140202223172768
+ 140202223173248 [label=ToCopyBackward0]
+ 140202223171328 -> 140202223173248
+ 140202223171328 [label=SliceBackward0]
+ 140202223173344 -> 140202223171328
+ 140202223173344 [label=SliceBackward0]
+ 140202223173440 -> 140202223173344
+ 140202223173440 [label=SliceBackward0]
+ 140202223172864 -> 140202223173440
+ 140202223172864 [label=SliceBackward0]
+ 140202223194368 -> 140202223172864
+ 140202223194368 [label=SliceBackward0]
+ 140202223194464 -> 140202223194368
+ 140202223194464 [label=NativeLayerNormBackward0]
+ 140202223194656 -> 140202223194464
+ 140202223194656 [label=AddBackward0]
+ 140202223194944 -> 140202223194656
+ 140202223194944 [label=NativeDropoutBackward0]
+ 140202223195280 -> 140202223194944
+ 140202223195280 [label=ViewBackward0]
+ 140202223195520 -> 140202223195280
+ 140202223195520 [label=AddmmBackward0]
+ 140202223195712 -> 140202223195520
+ 140202223195712 [label=ToCopyBackward0]
+ 140202223196000 -> 140202223195712
+ 140202228933472 [label="encoder.layer.9.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228933472 -> 140202223196000
+ 140202223196000 [label=AccumulateGrad]
+ 140202223195424 -> 140202223195520
+ 140202223195424 [label=ViewBackward0]
+ 140202223195904 -> 140202223195424
+ 140202223195904 [label=ViewBackward0]
+ 140202223196096 -> 140202223195904
+ 140202223196096 [label=CloneBackward0]
+ 140202223196288 -> 140202223196096
+ 140202223196288 [label=PermuteBackward0]
+ 140202223196384 -> 140202223196288
+ 140202223196384 [label=UnsafeViewBackward0]
+ 140202223196576 -> 140202223196384
+ 140202223196576 [label=BmmBackward0]
+ 140202223196768 -> 140202223196576
+ 140202223196768 [label=ReshapeAliasBackward0]
+ 140202223197152 -> 140202223196768
+ 140202223197152 [label=ExpandBackward0]
+ 140202223197200 -> 140202223197152
+ 140202223197200 [label=ToCopyBackward0]
+ 140202223197440 -> 140202223197200
+ 140202223197440 [label=NativeDropoutBackward0]
+ 140202223197632 -> 140202223197440
+ 140202223197632 [label=SoftmaxBackward0]
+ 140202223197680 -> 140202223197632
+ 140202223197680 [label=AddBackward0]
+ 140202223197920 -> 140202223197680
+ 140202223197920 [label=DivBackward0]
+ 140202223198112 -> 140202223197920
+ 140202223198112 [label=UnsafeViewBackward0]
+ 140202223198016 -> 140202223198112
+ 140202223198016 [label=BmmBackward0]
+ 140202223227136 -> 140202223198016
+ 140202223227136 [label=ReshapeAliasBackward0]
+ 140202223227232 -> 140202223227136
+ 140202223227232 [label=ExpandBackward0]
+ 140202223227424 -> 140202223227232
+ 140202223227424 [label=PermuteBackward0]
+ 140202223227520 -> 140202223227424
+ 140202223227520 [label=ViewBackward0]
+ 140202223227712 -> 140202223227520
+ 140202223227712 [label=ViewBackward0]
+ 140202223227904 -> 140202223227712
+ 140202223227904 [label=AddmmBackward0]
+ 140202223228000 -> 140202223227904
+ 140202223228000 [label=ToCopyBackward0]
+ 140202223228384 -> 140202223228000
+ 140202228936032 [label="encoder.layer.9.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202228936032 -> 140202223228384
+ 140202223228384 [label=AccumulateGrad]
+ 140202223228096 -> 140202223227904
+ 140202223228096 [label=ViewBackward0]
+ 140202223228576 -> 140202223228096
+ 140202223228576 [label=ToCopyBackward0]
+ 140202223195040 -> 140202223228576
+ 140202223195040 [label=CatBackward0]
+ 140202223228672 -> 140202223195040
+ 140202223228672 [label=NativeLayerNormBackward0]
+ 140202223229056 -> 140202223228672
+ 140202223229056 [label=AddBackward0]
+ 140202223229296 -> 140202223229056
+ 140202223229296 [label=SumBackward1]
+ 140202223229440 -> 140202223229296
+ 140202223229440 [label=MulBackward0]
+ 140202223229632 -> 140202223229440
+ 140202223229632 [label=PermuteBackward0]
+ 140202223230016 -> 140202223229632
+ 140202223230016 [label=CatBackward0]
+ 140202223230208 -> 140202223230016
+ 140202223230208 [label=UnsqueezeBackward0]
+ 140202223230496 -> 140202223230208
+ 140202223230496 [label=NativeDropoutBackward0]
+ 140202223230688 -> 140202223230496
+ 140202223230688 [label=ViewBackward0]
+ 140202223230736 -> 140202223230688
+ 140202223230736 [label=AddmmBackward0]
+ 140202223230880 -> 140202223230736
+ 140202223230880 [label=ToCopyBackward0]
+ 140202223247664 -> 140202223230880
+ 140202228934832 [label="encoder.layer.8.experts.experts.0.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228934832 -> 140202223247664
+ 140202223247664 [label=AccumulateGrad]
+ 140202223230784 -> 140202223230736
+ 140202223230784 [label=ViewBackward0]
+ 140202223247712 -> 140202223230784
+ 140202223247712 [label=GeluBackward0]
+ 140202223247808 -> 140202223247712
+ 140202223247808 [label=ViewBackward0]
+ 140202223248000 -> 140202223247808
+ 140202223248000 [label=AddmmBackward0]
+ 140202223248192 -> 140202223248000
+ 140202223248192 [label=ToCopyBackward0]
+ 140202223248480 -> 140202223248192
+ 140202228935152 [label="encoder.layer.8.experts.experts.0.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228935152 -> 140202223248480
+ 140202223248480 [label=AccumulateGrad]
+ 140202223248144 -> 140202223248000
+ 140202223248144 [label=ViewBackward0]
+ 140202223248624 -> 140202223248144
+ 140202223248624 [label=ToCopyBackward0]
+ 140202223229152 -> 140202223248624
+ 140202223229152 [label=SliceBackward0]
+ 140202223248768 -> 140202223229152
+ 140202223248768 [label=SliceBackward0]
+ 140202223248960 -> 140202223248768
+ 140202223248960 [label=NativeLayerNormBackward0]
+ 140202223249152 -> 140202223248960
+ 140202223249152 [label=AddBackward0]
+ 140202223249440 -> 140202223249152
+ 140202223249440 [label=NativeDropoutBackward0]
+ 140202223249824 -> 140202223249440
+ 140202223249824 [label=ViewBackward0]
+ 140202223250016 -> 140202223249824
+ 140202223250016 [label=AddmmBackward0]
+ 140202223250064 -> 140202223250016
+ 140202223250064 [label=ToCopyBackward0]
+ 140202223250496 -> 140202223250064
+ 140202228950656 [label="encoder.layer.8.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228950656 -> 140202223250496
+ 140202223250496 [label=AccumulateGrad]
+ 140202223249920 -> 140202223250016
+ 140202223249920 [label=ViewBackward0]
+ 140202223250400 -> 140202223249920
+ 140202223250400 [label=ViewBackward0]
+ 140202223250592 -> 140202223250400
+ 140202223250592 [label=CloneBackward0]
+ 140202223250688 -> 140202223250592
+ 140202223250688 [label=PermuteBackward0]
+ 140202223250976 -> 140202223250688
+ 140202223250976 [label=UnsafeViewBackward0]
+ 140202223251264 -> 140202223250976
+ 140202223251264 [label=BmmBackward0]
+ 140202223251360 -> 140202223251264
+ 140202223251360 [label=ReshapeAliasBackward0]
+ 140202223284384 -> 140202223251360
+ 140202223284384 [label=ExpandBackward0]
+ 140202223284480 -> 140202223284384
+ 140202223284480 [label=ToCopyBackward0]
+ 140202223284672 -> 140202223284480
+ 140202223284672 [label=NativeDropoutBackward0]
+ 140202223284864 -> 140202223284672
+ 140202223284864 [label=SoftmaxBackward0]
+ 140202223284960 -> 140202223284864
+ 140202223284960 [label=AddBackward0]
+ 140202223285152 -> 140202223284960
+ 140202223285152 [label=DivBackward0]
+ 140202223285344 -> 140202223285152
+ 140202223285344 [label=UnsafeViewBackward0]
+ 140202223285440 -> 140202223285344
+ 140202223285440 [label=BmmBackward0]
+ 140202223285632 -> 140202223285440
+ 140202223285632 [label=ReshapeAliasBackward0]
+ 140202223286016 -> 140202223285632
+ 140202223286016 [label=ExpandBackward0]
+ 140202223286208 -> 140202223286016
+ 140202223286208 [label=PermuteBackward0]
+ 140202223286256 -> 140202223286208
+ 140202223286256 [label=ViewBackward0]
+ 140202223286496 -> 140202223286256
+ 140202223286496 [label=ViewBackward0]
+ 140202223286688 -> 140202223286496
+ 140202223286688 [label=AddmmBackward0]
+ 140202223286736 -> 140202223286688
+ 140202223286736 [label=ToCopyBackward0]
+ 140202223287168 -> 140202223286736
+ 140202228951376 [label="encoder.layer.8.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202228951376 -> 140202223287168
+ 140202223287168 [label=AccumulateGrad]
+ 140202223286592 -> 140202223286688
+ 140202223286592 [label=ViewBackward0]
+ 140202223287072 -> 140202223286592
+ 140202223287072 [label=ToCopyBackward0]
+ 140202223249536 -> 140202223287072
+ 140202223249536 [label=SliceBackward0]
+ 140202223287456 -> 140202223249536
+ 140202223287456 [label=SliceBackward0]
+ 140202223287648 -> 140202223287456
+ 140202223287648 [label=SliceBackward0]
+ 140202223287696 -> 140202223287648
+ 140202223287696 [label=NativeLayerNormBackward0]
+ 140202223287936 -> 140202223287696
+ 140202223287936 [label=AddBackward0]
+ 140202223288176 -> 140202223287936
+ 140202223288176 [label=NativeDropoutBackward0]
+ 140202223288224 -> 140202223288176
+ 140202223288224 [label=ViewBackward0]
+ 140202223313152 -> 140202223288224
+ 140202223313152 [label=AddmmBackward0]
+ 140202223313344 -> 140202223313152
+ 140202223313344 [label=ToCopyBackward0]
+ 140202223313632 -> 140202223313344
+ 140202228951856 [label="encoder.layer.8.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228951856 -> 140202223313632
+ 140202223313632 [label=AccumulateGrad]
+ 140202223313296 -> 140202223313152
+ 140202223313296 [label=ViewBackward0]
+ 140202223313776 -> 140202223313296
+ 140202223313776 [label=ViewBackward0]
+ 140202223314016 -> 140202223313776
+ 140202223314016 [label=CloneBackward0]
+ 140202223314208 -> 140202223314016
+ 140202223314208 [label=PermuteBackward0]
+ 140202223314256 -> 140202223314208
+ 140202223314256 [label=UnsafeViewBackward0]
+ 140202223314496 -> 140202223314256
+ 140202223314496 [label=BmmBackward0]
+ 140202223314688 -> 140202223314496
+ 140202223314688 [label=ReshapeAliasBackward0]
+ 140202223314784 -> 140202223314688
+ 140202223314784 [label=ExpandBackward0]
+ 140202223314880 -> 140202223314784
+ 140202223314880 [label=ToCopyBackward0]
+ 140202223315072 -> 140202223314880
+ 140202223315072 [label=NativeDropoutBackward0]
+ 140202223315264 -> 140202223315072
+ 140202223315264 [label=SoftmaxBackward0]
+ 140202223315360 -> 140202223315264
+ 140202223315360 [label=AddBackward0]
+ 140202223315552 -> 140202223315360
+ 140202223315552 [label=DivBackward0]
+ 140202223315744 -> 140202223315552
+ 140202223315744 [label=UnsafeViewBackward0]
+ 140202223315840 -> 140202223315744
+ 140202223315840 [label=BmmBackward0]
+ 140202223316032 -> 140202223315840
+ 140202223316032 [label=ReshapeAliasBackward0]
+ 140202223316416 -> 140202223316032
+ 140202223316416 [label=ExpandBackward0]
+ 140202223316608 -> 140202223316416
+ 140202223316608 [label=PermuteBackward0]
+ 140202223316656 -> 140202223316608
+ 140202223316656 [label=ViewBackward0]
+ 140202223316896 -> 140202223316656
+ 140202223316896 [label=ViewBackward0]
+ 140202223316800 -> 140202223316896
+ 140202223316800 [label=AddmmBackward0]
+ 140202222817488 -> 140202223316800
+ 140202222817488 [label=ToCopyBackward0]
+ 140202222817920 -> 140202222817488
+ 140202228952576 [label="encoder.layer.8.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202228952576 -> 140202222817920
+ 140202222817920 [label=AccumulateGrad]
+ 140202222817344 -> 140202223316800
+ 140202222817344 [label=ViewBackward0]
+ 140202222817824 -> 140202222817344
+ 140202222817824 [label=ToCopyBackward0]
+ 140202223288032 -> 140202222817824
+ 140202223288032 [label=CatBackward0]
+ 140202222818208 -> 140202223288032
+ 140202222818208 [label=NativeLayerNormBackward0]
+ 140202222818304 -> 140202222818208
+ 140202222818304 [label=AddBackward0]
+ 140202222818592 -> 140202222818304
+ 140202222818592 [label=NativeDropoutBackward0]
+ 140202222818928 -> 140202222818592
+ 140202222818928 [label=ViewBackward0]
+ 140202222819168 -> 140202222818928
+ 140202222819168 [label=AddmmBackward0]
+ 140202222819360 -> 140202222819168
+ 140202222819360 [label=ToCopyBackward0]
+ 140202222819648 -> 140202222819360
+ 140202228952976 [label="encoder.layer.7.experts.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228952976 -> 140202222819648
+ 140202222819648 [label=AccumulateGrad]
+ 140202222819072 -> 140202222819168
+ 140202222819072 [label=ViewBackward0]
+ 140202222819552 -> 140202222819072
+ 140202222819552 [label=GeluBackward0]
+ 140202222819744 -> 140202222819552
+ 140202222819744 [label=ViewBackward0]
+ 140202222819936 -> 140202222819744
+ 140202222819936 [label=AddmmBackward0]
+ 140202222820032 -> 140202222819936
+ 140202222820032 [label=ToCopyBackward0]
+ 140202222820416 -> 140202222820032
+ 140202228965680 [label="encoder.layer.7.experts.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228965680 -> 140202222820416
+ 140202222820416 [label=AccumulateGrad]
+ 140202222820128 -> 140202222819936
+ 140202222820128 [label=ViewBackward0]
+ 140202222820608 -> 140202222820128
+ 140202222820608 [label=ToCopyBackward0]
+ 140202222818688 -> 140202222820608
+ 140202222818688 [label=SliceBackward0]
+ 140202222820704 -> 140202222818688
+ 140202222820704 [label=SliceBackward0]
+ 140202222820896 -> 140202222820704
+ 140202222820896 [label=SliceBackward0]
+ 140202222820992 -> 140202222820896
+ 140202222820992 [label=SliceBackward0]
+ 140202222821184 -> 140202222820992
+ 140202222821184 [label=SliceBackward0]
+ 140202222820224 -> 140202222821184
+ 140202222820224 [label=NativeLayerNormBackward0]
+ 140202222841968 -> 140202222820224
+ 140202222841968 [label=AddBackward0]
+ 140202222842400 -> 140202222841968
+ 140202222842400 [label=NativeDropoutBackward0]
+ 140202222842784 -> 140202222842400
+ 140202222842784 [label=ViewBackward0]
+ 140202222842832 -> 140202222842784
+ 140202222842832 [label=AddmmBackward0]
+ 140202222843072 -> 140202222842832
+ 140202222843072 [label=ToCopyBackward0]
+ 140202222843312 -> 140202222843072
+ 140202228967040 [label="encoder.layer.7.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228967040 -> 140202222843312
+ 140202222843312 [label=AccumulateGrad]
+ 140202222842880 -> 140202222842832
+ 140202222842880 [label=ViewBackward0]
+ 140202222843360 -> 140202222842880
+ 140202222843360 [label=ViewBackward0]
+ 140202222843456 -> 140202222843360
+ 140202222843456 [label=CloneBackward0]
+ 140202222843648 -> 140202222843456
+ 140202222843648 [label=PermuteBackward0]
+ 140202222843840 -> 140202222843648
+ 140202222843840 [label=UnsafeViewBackward0]
+ 140202222843936 -> 140202222843840
+ 140202222843936 [label=BmmBackward0]
+ 140202222844128 -> 140202222843936
+ 140202222844128 [label=ReshapeAliasBackward0]
+ 140202222844512 -> 140202222844128
+ 140202222844512 [label=ExpandBackward0]
+ 140202222844704 -> 140202222844512
+ 140202222844704 [label=ToCopyBackward0]
+ 140202222844752 -> 140202222844704
+ 140202222844752 [label=NativeDropoutBackward0]
+ 140202222844992 -> 140202222844752
+ 140202222844992 [label=SoftmaxBackward0]
+ 140202222845184 -> 140202222844992
+ 140202222845184 [label=AddBackward0]
+ 140202222845232 -> 140202222845184
+ 140202222845232 [label=DivBackward0]
+ 140202222845472 -> 140202222845232
+ 140202222845472 [label=UnsafeViewBackward0]
+ 140202222845664 -> 140202222845472
+ 140202222845664 [label=BmmBackward0]
+ 140202222845712 -> 140202222845664
+ 140202222845712 [label=ReshapeAliasBackward0]
+ 140202222845856 -> 140202222845712
+ 140202222845856 [label=ExpandBackward0]
+ 140202222870688 -> 140202222845856
+ 140202222870688 [label=PermuteBackward0]
+ 140202222870880 -> 140202222870688
+ 140202222870880 [label=ViewBackward0]
+ 140202222870976 -> 140202222870880
+ 140202222870976 [label=ViewBackward0]
+ 140202222871168 -> 140202222870976
+ 140202222871168 [label=AddmmBackward0]
+ 140202222871360 -> 140202222871168
+ 140202222871360 [label=ToCopyBackward0]
+ 140202222871648 -> 140202222871360
+ 140202228982304 [label="encoder.layer.7.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202228982304 -> 140202222871648
+ 140202222871648 [label=AccumulateGrad]
+ 140202222871312 -> 140202222871168
+ 140202222871312 [label=ViewBackward0]
+ 140202222871792 -> 140202222871312
+ 140202222871792 [label=ToCopyBackward0]
+ 140202222842352 -> 140202222871792
+ 140202222842352 [label=CatBackward0]
+ 140202222871936 -> 140202222842352
+ 140202222871936 [label=NativeLayerNormBackward0]
+ 140202222872272 -> 140202222871936
+ 140202222872272 [label=AddBackward0]
+ 140202222872704 -> 140202222872272
+ 140202222872704 [label=SumBackward1]
+ 140202222872800 -> 140202222872704
+ 140202222872800 [label=MulBackward0]
+ 140202222872896 -> 140202222872800
+ 140202222872896 [label=PermuteBackward0]
+ 140202222873232 -> 140202222872896
+ 140202222873232 [label=CatBackward0]
+ 140202222873472 -> 140202222873232
+ 140202222873472 [label=UnsqueezeBackward0]
+ 140202222873712 -> 140202222873472
+ 140202222873712 [label=NativeDropoutBackward0]
+ 140202222873952 -> 140202222873712
+ 140202222873952 [label=ViewBackward0]
+ 140202222874144 -> 140202222873952
+ 140202222874144 [label=AddmmBackward0]
+ 140202222874192 -> 140202222874144
+ 140202222874192 [label=ToCopyBackward0]
+ 140202222874528 -> 140202222874192
+ 140202228968800 [label="encoder.layer.6.experts.experts.0.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228968800 -> 140202222874528
+ 140202222874528 [label=AccumulateGrad]
+ 140202222874048 -> 140202222874144
+ 140202222874048 [label=ViewBackward0]
+ 140202222874432 -> 140202222874048
+ 140202222874432 [label=GeluBackward0]
+ 140202222903456 -> 140202222874432
+ 140202222903456 [label=ViewBackward0]
+ 140202222903552 -> 140202222903456
+ 140202222903552 [label=AddmmBackward0]
+ 140202222903744 -> 140202222903552
+ 140202222903744 [label=ToCopyBackward0]
+ 140202222904032 -> 140202222903744
+ 140202228968720 [label="encoder.layer.6.experts.experts.0.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228968720 -> 140202222904032
+ 140202222904032 [label=AccumulateGrad]
+ 140202222903840 -> 140202222903552
+ 140202222903840 [label=ViewBackward0]
+ 140202222904320 -> 140202222903840
+ 140202222904320 [label=ToCopyBackward0]
+ 140202222872416 -> 140202222904320
+ 140202222872416 [label=SliceBackward0]
+ 140202222904416 -> 140202222872416
+ 140202222904416 [label=SliceBackward0]
+ 140202222904512 -> 140202222904416
+ 140202222904512 [label=NativeLayerNormBackward0]
+ 140202222904896 -> 140202222904512
+ 140202222904896 [label=AddBackward0]
+ 140202222905184 -> 140202222904896
+ 140202222905184 [label=NativeDropoutBackward0]
+ 140202222905280 -> 140202222905184
+ 140202222905280 [label=ViewBackward0]
+ 140202222905328 -> 140202222905280
+ 140202222905328 [label=AddmmBackward0]
+ 140202222905568 -> 140202222905328
+ 140202222905568 [label=ToCopyBackward0]
+ 140202222905808 -> 140202222905568
+ 140202228984224 [label="encoder.layer.6.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228984224 -> 140202222905808
+ 140202222905808 [label=AccumulateGrad]
+ 140202222905664 -> 140202222905328
+ 140202222905664 [label=ViewBackward0]
+ 140202222906144 -> 140202222905664
+ 140202222906144 [label=ViewBackward0]
+ 140202222906336 -> 140202222906144
+ 140202222906336 [label=CloneBackward0]
+ 140202222906432 -> 140202222906336
+ 140202222906432 [label=PermuteBackward0]
+ 140202222906624 -> 140202222906432
+ 140202222906624 [label=UnsafeViewBackward0]
+ 140202222906816 -> 140202222906624
+ 140202222906816 [label=BmmBackward0]
+ 140202222906912 -> 140202222906816
+ 140202222906912 [label=ReshapeAliasBackward0]
+ 140202222907008 -> 140202222906912
+ 140202222907008 [label=ExpandBackward0]
+ 140202222907200 -> 140202222907008
+ 140202222907200 [label=ToCopyBackward0]
+ 140202222907248 -> 140202222907200
+ 140202222907248 [label=NativeDropoutBackward0]
+ 140202222932128 -> 140202222907248
+ 140202222932128 [label=SoftmaxBackward0]
+ 140202222932320 -> 140202222932128
+ 140202222932320 [label=AddBackward0]
+ 140202222932368 -> 140202222932320
+ 140202222932368 [label=DivBackward0]
+ 140202222932608 -> 140202222932368
+ 140202222932608 [label=UnsafeViewBackward0]
+ 140202222932800 -> 140202222932608
+ 140202222932800 [label=BmmBackward0]
+ 140202222932848 -> 140202222932800
+ 140202222932848 [label=ReshapeAliasBackward0]
+ 140202222933376 -> 140202222932848
+ 140202222933376 [label=ExpandBackward0]
+ 140202222933472 -> 140202222933376
+ 140202222933472 [label=PermuteBackward0]
+ 140202222933664 -> 140202222933472
+ 140202222933664 [label=ViewBackward0]
+ 140202222933856 -> 140202222933664
+ 140202222933856 [label=ViewBackward0]
+ 140202222933952 -> 140202222933856
+ 140202222933952 [label=AddmmBackward0]
+ 140202222934144 -> 140202222933952
+ 140202222934144 [label=ToCopyBackward0]
+ 140202222934432 -> 140202222934144
+ 140202228984944 [label="encoder.layer.6.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202228984944 -> 140202222934432
+ 140202222934432 [label=AccumulateGrad]
+ 140202222933808 -> 140202222933952
+ 140202222933808 [label=ViewBackward0]
+ 140202222934288 -> 140202222933808
+ 140202222934288 [label=ToCopyBackward0]
+ 140202222904848 -> 140202222934288
+ 140202222904848 [label=SliceBackward0]
+ 140202222934816 -> 140202222904848
+ 140202222934816 [label=SliceBackward0]
+ 140202222934912 -> 140202222934816
+ 140202222934912 [label=SliceBackward0]
+ 140202222935104 -> 140202222934912
+ 140202222935104 [label=NativeLayerNormBackward0]
+ 140202222935296 -> 140202222935104
+ 140202222935296 [label=AddBackward0]
+ 140202222935584 -> 140202222935296
+ 140202222935584 [label=NativeDropoutBackward0]
+ 140202222935680 -> 140202222935584
+ 140202222935680 [label=ViewBackward0]
+ 140202222935728 -> 140202222935680
+ 140202222935728 [label=AddmmBackward0]
+ 140202222935968 -> 140202222935728
+ 140202222935968 [label=ToCopyBackward0]
+ 140202222960848 -> 140202222935968
+ 140202228985424 [label="encoder.layer.6.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228985424 -> 140202222960848
+ 140202222960848 [label=AccumulateGrad]
+ 140202222935488 -> 140202222935728
+ 140202222935488 [label=ViewBackward0]
+ 140202222961184 -> 140202222935488
+ 140202222961184 [label=ViewBackward0]
+ 140202222961376 -> 140202222961184
+ 140202222961376 [label=CloneBackward0]
+ 140202222961472 -> 140202222961376
+ 140202222961472 [label=PermuteBackward0]
+ 140202222961664 -> 140202222961472
+ 140202222961664 [label=UnsafeViewBackward0]
+ 140202222961856 -> 140202222961664
+ 140202222961856 [label=BmmBackward0]
+ 140202222961952 -> 140202222961856
+ 140202222961952 [label=ReshapeAliasBackward0]
+ 140202222962048 -> 140202222961952
+ 140202222962048 [label=ExpandBackward0]
+ 140202222962240 -> 140202222962048
+ 140202222962240 [label=ToCopyBackward0]
+ 140202222962288 -> 140202222962240
+ 140202222962288 [label=NativeDropoutBackward0]
+ 140202222962528 -> 140202222962288
+ 140202222962528 [label=SoftmaxBackward0]
+ 140202222962720 -> 140202222962528
+ 140202222962720 [label=AddBackward0]
+ 140202222962768 -> 140202222962720
+ 140202222962768 [label=DivBackward0]
+ 140202222963008 -> 140202222962768
+ 140202222963008 [label=UnsafeViewBackward0]
+ 140202222963200 -> 140202222963008
+ 140202222963200 [label=BmmBackward0]
+ 140202222963248 -> 140202222963200
+ 140202222963248 [label=ReshapeAliasBackward0]
+ 140202222963776 -> 140202222963248
+ 140202222963776 [label=ExpandBackward0]
+ 140202222963872 -> 140202222963776
+ 140202222963872 [label=PermuteBackward0]
+ 140202222964064 -> 140202222963872
+ 140202222964064 [label=ViewBackward0]
+ 140202222964256 -> 140202222964064
+ 140202222964256 [label=ViewBackward0]
+ 140202222964352 -> 140202222964256
+ 140202222964352 [label=AddmmBackward0]
+ 140202222964544 -> 140202222964352
+ 140202222964544 [label=ToCopyBackward0]
+ 140202222964640 -> 140202222964544
+ 140202228986240 [label="encoder.layer.6.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202228986240 -> 140202222964640
+ 140202222964640 [label=AccumulateGrad]
+ 140202222964208 -> 140202222964352
+ 140202222964208 [label=ViewBackward0]
+ 140202222988064 -> 140202222964208
+ 140202222988064 [label=ToCopyBackward0]
+ 140202222935248 -> 140202222988064
+ 140202222935248 [label=CatBackward0]
+ 140202222988736 -> 140202222935248
+ 140202222988736 [label=NativeLayerNormBackward0]
+ 140202222985280 -> 140202222988736
+ 140202222985280 [label=AddBackward0]
+ 140202222985472 -> 140202222985280
+ 140202222985472 [label=NativeDropoutBackward0]
+ 140202222985856 -> 140202222985472
+ 140202222985856 [label=ViewBackward0]
+ 140202222986048 -> 140202222985856
+ 140202222986048 [label=AddmmBackward0]
+ 140202222986240 -> 140202222986048
+ 140202222986240 [label=ToCopyBackward0]
+ 140202222987680 -> 140202222986240
+ 140202228986720 [label="encoder.layer.5.experts.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228986720 -> 140202222987680
+ 140202222987680 [label=AccumulateGrad]
+ 140202222985952 -> 140202222986048
+ 140202222985952 [label=ViewBackward0]
+ 140202222986432 -> 140202222985952
+ 140202222986432 [label=GeluBackward0]
+ 140202222986624 -> 140202222986432
+ 140202222986624 [label=ViewBackward0]
+ 140202222986672 -> 140202222986624
+ 140202222986672 [label=AddmmBackward0]
+ 140202222986912 -> 140202222986672
+ 140202222986912 [label=ToCopyBackward0]
+ 140202222989072 -> 140202222986912
+ 140202228986960 [label="encoder.layer.5.experts.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228986960 -> 140202222989072
+ 140202222989072 [label=AccumulateGrad]
+ 140202222986816 -> 140202222986672
+ 140202222986816 [label=ViewBackward0]
+ 140202222988832 -> 140202222986816
+ 140202222988832 [label=ToCopyBackward0]
+ 140202222985568 -> 140202222988832
+ 140202222985568 [label=SliceBackward0]
+ 140202222987632 -> 140202222985568
+ 140202222987632 [label=SliceBackward0]
+ 140202222989216 -> 140202222987632
+ 140202222989216 [label=SliceBackward0]
+ 140202222987872 -> 140202222989216
+ 140202222987872 [label=SliceBackward0]
+ 140202222987968 -> 140202222987872
+ 140202222987968 [label=SliceBackward0]
+ 140202222988352 -> 140202222987968
+ 140202222988352 [label=NativeLayerNormBackward0]
+ 140202222987584 -> 140202222988352
+ 140202222987584 [label=AddBackward0]
+ 140202224191520 -> 140202222987584
+ 140202224191520 [label=NativeDropoutBackward0]
+ 140202224191280 -> 140202224191520
+ 140202224191280 [label=ViewBackward0]
+ 140202224191184 -> 140202224191280
+ 140202224191184 [label=AddmmBackward0]
+ 140202224191088 -> 140202224191184
+ 140202224191088 [label=ToCopyBackward0]
+ 140202224190896 -> 140202224191088
+ 140202228988880 [label="encoder.layer.5.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228988880 -> 140202224190896
+ 140202224190896 [label=AccumulateGrad]
+ 140202224191232 -> 140202224191184
+ 140202224191232 [label=ViewBackward0]
+ 140202224190944 -> 140202224191232
+ 140202224190944 [label=ViewBackward0]
+ 140202224190848 -> 140202224190944
+ 140202224190848 [label=CloneBackward0]
+ 140202224190752 -> 140202224190848
+ 140202224190752 [label=PermuteBackward0]
+ 140202224190656 -> 140202224190752
+ 140202224190656 [label=UnsafeViewBackward0]
+ 140202224190560 -> 140202224190656
+ 140202224190560 [label=BmmBackward0]
+ 140202224190464 -> 140202224190560
+ 140202224190464 [label=ReshapeAliasBackward0]
+ 140202224190224 -> 140202224190464
+ 140202224190224 [label=ExpandBackward0]
+ 140202224190128 -> 140202224190224
+ 140202224190128 [label=ToCopyBackward0]
+ 140202224190032 -> 140202224190128
+ 140202224190032 [label=NativeDropoutBackward0]
+ 140202224189936 -> 140202224190032
+ 140202224189936 [label=SoftmaxBackward0]
+ 140202224189840 -> 140202224189936
+ 140202224189840 [label=AddBackward0]
+ 140202224189744 -> 140202224189840
+ 140202224189744 [label=DivBackward0]
+ 140202224189648 -> 140202224189744
+ 140202224189648 [label=UnsafeViewBackward0]
+ 140202224189552 -> 140202224189648
+ 140202224189552 [label=BmmBackward0]
+ 140202224189504 -> 140202224189552
+ 140202224189504 [label=ReshapeAliasBackward0]
+ 140202224191808 -> 140202224189504
+ 140202224191808 [label=ExpandBackward0]
+ 140202224191904 -> 140202224191808
+ 140202224191904 [label=PermuteBackward0]
+ 140202224192000 -> 140202224191904
+ 140202224192000 [label=ViewBackward0]
+ 140202224192096 -> 140202224192000
+ 140202224192096 [label=ViewBackward0]
+ 140202224192192 -> 140202224192096
+ 140202224192192 [label=AddmmBackward0]
+ 140202224192288 -> 140202224192192
+ 140202224192288 [label=ToCopyBackward0]
+ 140202224192480 -> 140202224192288
+ 140202228989600 [label="encoder.layer.5.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202228989600 -> 140202224192480
+ 140202224192480 [label=AccumulateGrad]
+ 140202224192240 -> 140202224192192
+ 140202224192240 [label=ViewBackward0]
+ 140202224192576 -> 140202224192240
+ 140202224192576 [label=ToCopyBackward0]
+ 140202224191472 -> 140202224192576
+ 140202224191472 [label=CatBackward0]
+ 140202224192720 -> 140202224191472
+ 140202224192720 [label=NativeLayerNormBackward0]
+ 140202224192864 -> 140202224192720
+ 140202224192864 [label=AddBackward0]
+ 140202224193056 -> 140202224192864
+ 140202224193056 [label=NativeDropoutBackward0]
+ 140202224193200 -> 140202224193056
+ 140202224193200 [label=ViewBackward0]
+ 140202224193296 -> 140202224193200
+ 140202224193296 [label=AddmmBackward0]
+ 140202224193392 -> 140202224193296
+ 140202224193392 [label=ToCopyBackward0]
+ 140202224193488 -> 140202224193392
+ 140202229010656 [label="encoder.layer.4.experts.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202229010656 -> 140202224193488
+ 140202224193488 [label=AccumulateGrad]
+ 140202224193344 -> 140202224193296
+ 140202224193344 [label=ViewBackward0]
+ 140210811924640 -> 140202224193344
+ 140210811924640 [label=GeluBackward0]
+ 140210811924736 -> 140210811924640
+ 140210811924736 [label=ViewBackward0]
+ 140210811924832 -> 140210811924736
+ 140210811924832 [label=AddmmBackward0]
+ 140210811924928 -> 140210811924832
+ 140210811924928 [label=ToCopyBackward0]
+ 140210811925120 -> 140210811924928
+ 140202229010896 [label="encoder.layer.4.experts.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202229010896 -> 140210811925120
+ 140210811925120 [label=AccumulateGrad]
+ 140210811924880 -> 140210811924832
+ 140210811924880 [label=ViewBackward0]
+ 140210811925168 -> 140210811924880
+ 140210811925168 [label=ToCopyBackward0]
+ 140202224193008 -> 140210811925168
+ 140202224193008 [label=SliceBackward0]
+ 140210811925312 -> 140202224193008
+ 140210811925312 [label=SliceBackward0]
+ 140210811925408 -> 140210811925312
+ 140210811925408 [label=NativeLayerNormBackward0]
+ 140210811925504 -> 140210811925408
+ 140210811925504 [label=AddBackward0]
+ 140210811925696 -> 140210811925504
+ 140210811925696 [label=NativeDropoutBackward0]
+ 140210811925840 -> 140210811925696
+ 140210811925840 [label=ViewBackward0]
+ 140210811925936 -> 140210811925840
+ 140210811925936 [label=AddmmBackward0]
+ 140210811926032 -> 140210811925936
+ 140210811926032 [label=ToCopyBackward0]
+ 140210811926224 -> 140210811926032
+ 140202229012816 [label="encoder.layer.4.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229012816 -> 140210811926224
+ 140210811926224 [label=AccumulateGrad]
+ 140210811925984 -> 140210811925936
+ 140210811925984 [label=ViewBackward0]
+ 140210811926272 -> 140210811925984
+ 140210811926272 [label=ViewBackward0]
+ 140210811926368 -> 140210811926272
+ 140210811926368 [label=CloneBackward0]
+ 140210811926464 -> 140210811926368
+ 140210811926464 [label=PermuteBackward0]
+ 140210811926560 -> 140210811926464
+ 140210811926560 [label=UnsafeViewBackward0]
+ 140210811926656 -> 140210811926560
+ 140210811926656 [label=BmmBackward0]
+ 140210811926752 -> 140210811926656
+ 140210811926752 [label=ReshapeAliasBackward0]
+ 140210811926896 -> 140210811926752
+ 140210811926896 [label=ExpandBackward0]
+ 140210811926992 -> 140210811926896
+ 140210811926992 [label=ToCopyBackward0]
+ 140210811927088 -> 140210811926992
+ 140210811927088 [label=NativeDropoutBackward0]
+ 140210811927184 -> 140210811927088
+ 140210811927184 [label=SoftmaxBackward0]
+ 140210811927280 -> 140210811927184
+ 140210811927280 [label=AddBackward0]
+ 140210811927376 -> 140210811927280
+ 140210811927376 [label=DivBackward0]
+ 140210811927472 -> 140210811927376
+ 140210811927472 [label=UnsafeViewBackward0]
+ 140210811927568 -> 140210811927472
+ 140210811927568 [label=BmmBackward0]
+ 140210811927664 -> 140210811927568
+ 140210811927664 [label=ReshapeAliasBackward0]
+ 140210811927808 -> 140210811927664
+ 140210811927808 [label=ExpandBackward0]
+ 140210811927904 -> 140210811927808
+ 140210811927904 [label=PermuteBackward0]
+ 140210811928000 -> 140210811927904
+ 140210811928000 [label=ViewBackward0]
+ 140210811928096 -> 140210811928000
+ 140210811928096 [label=ViewBackward0]
+ 140210811928192 -> 140210811928096
+ 140210811928192 [label=AddmmBackward0]
+ 140210811928288 -> 140210811928192
+ 140210811928288 [label=ToCopyBackward0]
+ 140210811928480 -> 140210811928288
+ 140202229013536 [label="encoder.layer.4.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202229013536 -> 140210811928480
+ 140210811928480 [label=AccumulateGrad]
+ 140210811928240 -> 140210811928192
+ 140210811928240 [label=ViewBackward0]
+ 140210811928384 -> 140210811928240
+ 140210811928384 [label=ToCopyBackward0]
+ 140210811925648 -> 140210811928384
+ 140210811925648 [label=SliceBackward0]
+ 140210811941024 -> 140210811925648
+ 140210811941024 [label=SliceBackward0]
+ 140210811941120 -> 140210811941024
+ 140210811941120 [label=SliceBackward0]
+ 140210811941216 -> 140210811941120
+ 140210811941216 [label=NativeLayerNormBackward0]
+ 140210811941312 -> 140210811941216
+ 140210811941312 [label=AddBackward0]
+ 140210811941504 -> 140210811941312
+ 140210811941504 [label=NativeDropoutBackward0]
+ 140210811941648 -> 140210811941504
+ 140210811941648 [label=ViewBackward0]
+ 140210811941744 -> 140210811941648
+ 140210811941744 [label=AddmmBackward0]
+ 140210811941840 -> 140210811941744
+ 140210811941840 [label=ToCopyBackward0]
+ 140210811942032 -> 140210811941840
+ 140202229014016 [label="encoder.layer.4.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229014016 -> 140210811942032
+ 140210811942032 [label=AccumulateGrad]
+ 140210811941792 -> 140210811941744
+ 140210811941792 [label=ViewBackward0]
+ 140210811942080 -> 140210811941792
+ 140210811942080 [label=ViewBackward0]
+ 140210811942176 -> 140210811942080
+ 140210811942176 [label=CloneBackward0]
+ 140210811942272 -> 140210811942176
+ 140210811942272 [label=PermuteBackward0]
+ 140210811942368 -> 140210811942272
+ 140210811942368 [label=UnsafeViewBackward0]
+ 140210811942464 -> 140210811942368
+ 140210811942464 [label=BmmBackward0]
+ 140210811942560 -> 140210811942464
+ 140210811942560 [label=ReshapeAliasBackward0]
+ 140210811942704 -> 140210811942560
+ 140210811942704 [label=ExpandBackward0]
+ 140210811942800 -> 140210811942704
+ 140210811942800 [label=ToCopyBackward0]
+ 140210811942896 -> 140210811942800
+ 140210811942896 [label=NativeDropoutBackward0]
+ 140210811942992 -> 140210811942896
+ 140210811942992 [label=SoftmaxBackward0]
+ 140210811943088 -> 140210811942992
+ 140210811943088 [label=AddBackward0]
+ 140210811943184 -> 140210811943088
+ 140210811943184 [label=DivBackward0]
+ 140210811943280 -> 140210811943184
+ 140210811943280 [label=UnsafeViewBackward0]
+ 140210811943376 -> 140210811943280
+ 140210811943376 [label=BmmBackward0]
+ 140210811943472 -> 140210811943376
+ 140210811943472 [label=ReshapeAliasBackward0]
+ 140210811943616 -> 140210811943472
+ 140210811943616 [label=ExpandBackward0]
+ 140210811943712 -> 140210811943616
+ 140210811943712 [label=PermuteBackward0]
+ 140210811943808 -> 140210811943712
+ 140210811943808 [label=ViewBackward0]
+ 140210811943904 -> 140210811943808
+ 140210811943904 [label=ViewBackward0]
+ 140210811944000 -> 140210811943904
+ 140210811944000 [label=AddmmBackward0]
+ 140210811944096 -> 140210811944000
+ 140210811944096 [label=ToCopyBackward0]
+ 140210811944288 -> 140210811944096
+ 140202229023024 [label="encoder.layer.4.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202229023024 -> 140210811944288
+ 140210811944288 [label=AccumulateGrad]
+ 140210811944048 -> 140210811944000
+ 140210811944048 [label=ViewBackward0]
+ 140210811944336 -> 140210811944048
+ 140210811944336 [label=ToCopyBackward0]
+ 140210811941456 -> 140210811944336
+ 140210811941456 [label=CatBackward0]
+ 140210811944480 -> 140210811941456
+ 140210811944480 [label=NativeLayerNormBackward0]
+ 140210811944624 -> 140210811944480
+ 140210811944624 [label=AddBackward0]
+ 140210811944816 -> 140210811944624
+ 140210811944816 [label=NativeDropoutBackward0]
+ 140210811944912 -> 140210811944816
+ 140210811944912 [label=ViewBackward0]
+ 140210811957408 -> 140210811944912
+ 140210811957408 [label=AddmmBackward0]
+ 140210811957504 -> 140210811957408
+ 140210811957504 [label=ToCopyBackward0]
+ 140210811957696 -> 140210811957504
+ 140202229023504 [label="encoder.layer.3.experts.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202229023504 -> 140210811957696
+ 140210811957696 [label=AccumulateGrad]
+ 140210811957456 -> 140210811957408
+ 140210811957456 [label=ViewBackward0]
+ 140210811957744 -> 140210811957456
+ 140210811957744 [label=GeluBackward0]
+ 140210811957840 -> 140210811957744
+ 140210811957840 [label=ViewBackward0]
+ 140210811957936 -> 140210811957840
+ 140210811957936 [label=AddmmBackward0]
+ 140210811958032 -> 140210811957936
+ 140210811958032 [label=ToCopyBackward0]
+ 140210811958224 -> 140210811958032
+ 140202229023744 [label="encoder.layer.3.experts.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202229023744 -> 140210811958224
+ 140210811958224 [label=AccumulateGrad]
+ 140210811957984 -> 140210811957936
+ 140210811957984 [label=ViewBackward0]
+ 140210811958272 -> 140210811957984
+ 140210811958272 [label=ToCopyBackward0]
+ 140210811944768 -> 140210811958272
+ 140210811944768 [label=SliceBackward0]
+ 140210811958416 -> 140210811944768
+ 140210811958416 [label=SliceBackward0]
+ 140210811958512 -> 140210811958416
+ 140210811958512 [label=SliceBackward0]
+ 140210811958608 -> 140210811958512
+ 140210811958608 [label=SliceBackward0]
+ 140210811958704 -> 140210811958608
+ 140210811958704 [label=SliceBackward0]
+ 140210811958800 -> 140210811958704
+ 140210811958800 [label=NativeLayerNormBackward0]
+ 140210811958896 -> 140210811958800
+ 140210811958896 [label=AddBackward0]
+ 140210811959088 -> 140210811958896
+ 140210811959088 [label=NativeDropoutBackward0]
+ 140210811959232 -> 140210811959088
+ 140210811959232 [label=ViewBackward0]
+ 140210811959328 -> 140210811959232
+ 140210811959328 [label=AddmmBackward0]
+ 140210811959424 -> 140210811959328
+ 140210811959424 [label=ToCopyBackward0]
+ 140210811959616 -> 140210811959424
+ 140202229025664 [label="encoder.layer.3.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229025664 -> 140210811959616
+ 140210811959616 [label=AccumulateGrad]
+ 140210811959376 -> 140210811959328
+ 140210811959376 [label=ViewBackward0]
+ 140210811959664 -> 140210811959376
+ 140210811959664 [label=ViewBackward0]
+ 140210811959760 -> 140210811959664
+ 140210811959760 [label=CloneBackward0]
+ 140210811959856 -> 140210811959760
+ 140210811959856 [label=PermuteBackward0]
+ 140210811959952 -> 140210811959856
+ 140210811959952 [label=UnsafeViewBackward0]
+ 140210811960048 -> 140210811959952
+ 140210811960048 [label=BmmBackward0]
+ 140210811960144 -> 140210811960048
+ 140210811960144 [label=ReshapeAliasBackward0]
+ 140210811960288 -> 140210811960144
+ 140210811960288 [label=ExpandBackward0]
+ 140210811960384 -> 140210811960288
+ 140210811960384 [label=ToCopyBackward0]
+ 140210811960480 -> 140210811960384
+ 140210811960480 [label=NativeDropoutBackward0]
+ 140210811960576 -> 140210811960480
+ 140210811960576 [label=SoftmaxBackward0]
+ 140210811960672 -> 140210811960576
+ 140210811960672 [label=AddBackward0]
+ 140210811960768 -> 140210811960672
+ 140210811960768 [label=DivBackward0]
+ 140210811960864 -> 140210811960768
+ 140210811960864 [label=UnsafeViewBackward0]
+ 140210811960960 -> 140210811960864
+ 140210811960960 [label=BmmBackward0]
+ 140210811961056 -> 140210811960960
+ 140210811961056 [label=ReshapeAliasBackward0]
+ 140210811961200 -> 140210811961056
+ 140210811961200 [label=ExpandBackward0]
+ 140210811961296 -> 140210811961200
+ 140210811961296 [label=PermuteBackward0]
+ 140210811961104 -> 140210811961296
+ 140210811961104 [label=ViewBackward0]
+ 140210811973840 -> 140210811961104
+ 140210811973840 [label=ViewBackward0]
+ 140210811973936 -> 140210811973840
+ 140210811973936 [label=AddmmBackward0]
+ 140210811974032 -> 140210811973936
+ 140210811974032 [label=ToCopyBackward0]
+ 140210811974224 -> 140210811974032
+ 140202229026384 [label="encoder.layer.3.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202229026384 -> 140210811974224
+ 140210811974224 [label=AccumulateGrad]
+ 140210811973984 -> 140210811973936
+ 140210811973984 [label=ViewBackward0]
+ 140210811974272 -> 140210811973984
+ 140210811974272 [label=ToCopyBackward0]
+ 140210811959040 -> 140210811974272
+ 140210811959040 [label=CatBackward0]
+ 140210811974416 -> 140210811959040
+ 140210811974416 [label=NativeLayerNormBackward0]
+ 140210811974560 -> 140210811974416
+ 140210811974560 [label=AddBackward0]
+ 140210811974752 -> 140210811974560
+ 140210811974752 [label=NativeDropoutBackward0]
+ 140210811974896 -> 140210811974752
+ 140210811974896 [label=ViewBackward0]
+ 140210811974992 -> 140210811974896
+ 140210811974992 [label=AddmmBackward0]
+ 140210811975088 -> 140210811974992
+ 140210811975088 [label=ToCopyBackward0]
+ 140210811975280 -> 140210811975088
+ 140202229039248 [label="encoder.layer.2.experts.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202229039248 -> 140210811975280
+ 140210811975280 [label=AccumulateGrad]
+ 140210811975040 -> 140210811974992
+ 140210811975040 [label=ViewBackward0]
+ 140210811975328 -> 140210811975040
+ 140210811975328 [label=GeluBackward0]
+ 140210811975424 -> 140210811975328
+ 140210811975424 [label=ViewBackward0]
+ 140210811975520 -> 140210811975424
+ 140210811975520 [label=AddmmBackward0]
+ 140210811975616 -> 140210811975520
+ 140210811975616 [label=ToCopyBackward0]
+ 140210811975808 -> 140210811975616
+ 140202229039488 [label="encoder.layer.2.experts.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202229039488 -> 140210811975808
+ 140210811975808 [label=AccumulateGrad]
+ 140210811975568 -> 140210811975520
+ 140210811975568 [label=ViewBackward0]
+ 140210811975856 -> 140210811975568
+ 140210811975856 [label=ToCopyBackward0]
+ 140210811974704 -> 140210811975856
+ 140210811974704 [label=SliceBackward0]
+ 140210811976000 -> 140210811974704
+ 140210811976000 [label=SliceBackward0]
+ 140210811976096 -> 140210811976000
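+ // The SliceBackward0 pair above corresponds to taking a slice of the
+ // cross-attention output (presumably the query-token span) as the input
+ // to experts.dense1 in the forward pass.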
+ 140210811976096 [label=NativeLayerNormBackward0]
+ 140210811976192 -> 140210811976096
+ 140210811976192 [label=AddBackward0]
+ 140210811976384 -> 140210811976192
+ 140210811976384 [label=NativeDropoutBackward0]
+ 140210811976528 -> 140210811976384
+ 140210811976528 [label=ViewBackward0]
+ 140210811976624 -> 140210811976528
+ 140210811976624 [label=AddmmBackward0]
+ 140210811976720 -> 140210811976624
+ 140210811976720 [label=ToCopyBackward0]
+ 140210811976912 -> 140210811976720
+ 140202229041408 [label="encoder.layer.2.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229041408 -> 140210811976912
+ 140210811976912 [label=AccumulateGrad]
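+ // encoder.layer.2 carries a cross-attention block; layers 1 and 3 in this graph
+ // do not, which matches a BLIP-2-style Q-Former with cross-attention inserted
+ // every other layer.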
+ 140210811976672 -> 140210811976624
+ 140210811976672 [label=ViewBackward0]
+ 140210811976960 -> 140210811976672
+ 140210811976960 [label=ViewBackward0]
+ 140210811977056 -> 140210811976960
+ 140210811977056 [label=CloneBackward0]
+ 140210811977152 -> 140210811977056
+ 140210811977152 [label=PermuteBackward0]
+ 140210811977248 -> 140210811977152
+ 140210811977248 [label=UnsafeViewBackward0]
+ 140210811977344 -> 140210811977248
+ 140210811977344 [label=BmmBackward0]
+ 140210811977440 -> 140210811977344
+ 140210811977440 [label=ReshapeAliasBackward0]
+ 140210811977584 -> 140210811977440
+ 140210811977584 [label=ExpandBackward0]
+ 140210811977680 -> 140210811977584
+ 140210811977680 [label=ToCopyBackward0]
+ 140210811977488 -> 140210811977680
+ 140210811977488 [label=NativeDropoutBackward0]
+ 140210811994320 -> 140210811977488
+ 140210811994320 [label=SoftmaxBackward0]
+ 140210811994416 -> 140210811994320
+ 140210811994416 [label=AddBackward0]
+ 140210811994512 -> 140210811994416
+ 140210811994512 [label=DivBackward0]
+ 140210811994608 -> 140210811994512
+ 140210811994608 [label=UnsafeViewBackward0]
+ 140210811994704 -> 140210811994608
+ 140210811994704 [label=BmmBackward0]
+ 140210811994800 -> 140210811994704
+ 140210811994800 [label=ReshapeAliasBackward0]
+ 140210811994944 -> 140210811994800
+ 140210811994944 [label=ExpandBackward0]
+ 140210811995040 -> 140210811994944
+ 140210811995040 [label=PermuteBackward0]
+ 140210811995136 -> 140210811995040
+ 140210811995136 [label=ViewBackward0]
+ 140210811995232 -> 140210811995136
+ 140210811995232 [label=ViewBackward0]
+ 140210811995328 -> 140210811995232
+ 140210811995328 [label=AddmmBackward0]
+ 140210811995424 -> 140210811995328
+ 140210811995424 [label=ToCopyBackward0]
+ 140210811995616 -> 140210811995424
+ 140202229042128 [label="encoder.layer.2.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202229042128 -> 140210811995616
+ 140210811995616 [label=AccumulateGrad]
+ 140210811995376 -> 140210811995328
+ 140210811995376 [label=ViewBackward0]
+ 140210811995664 -> 140210811995376
+ 140210811995664 [label=ToCopyBackward0]
+ 140210811976336 -> 140210811995664
+ 140210811976336 [label=SliceBackward0]
+ 140210811995808 -> 140210811976336
+ 140210811995808 [label=SliceBackward0]
+ 140210811995904 -> 140210811995808
+ 140210811995904 [label=SliceBackward0]
+ 140210811996000 -> 140210811995904
+ 140210811996000 [label=NativeLayerNormBackward0]
+ 140210811996096 -> 140210811996000
+ 140210811996096 [label=AddBackward0]
+ 140210811996288 -> 140210811996096
+ 140210811996288 [label=NativeDropoutBackward0]
+ 140210811996432 -> 140210811996288
+ 140210811996432 [label=ViewBackward0]
+ 140210811996528 -> 140210811996432
+ 140210811996528 [label=AddmmBackward0]
+ 140210811996624 -> 140210811996528
+ 140210811996624 [label=ToCopyBackward0]
+ 140210811996816 -> 140210811996624
+ 140202229042608 [label="encoder.layer.2.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229042608 -> 140210811996816
+ 140210811996816 [label=AccumulateGrad]
+ 140210811996576 -> 140210811996528
+ 140210811996576 [label=ViewBackward0]
+ 140210811996864 -> 140210811996576
+ 140210811996864 [label=ViewBackward0]
+ 140210811996960 -> 140210811996864
+ 140210811996960 [label=CloneBackward0]
+ 140210811997056 -> 140210811996960
+ 140210811997056 [label=PermuteBackward0]
+ 140210811997152 -> 140210811997056
+ 140210811997152 [label=UnsafeViewBackward0]
+ 140210811997248 -> 140210811997152
+ 140210811997248 [label=BmmBackward0]
+ 140210811997344 -> 140210811997248
+ 140210811997344 [label=ReshapeAliasBackward0]
+ 140210811997488 -> 140210811997344
+ 140210811997488 [label=ExpandBackward0]
+ 140210811997584 -> 140210811997488
+ 140210811997584 [label=ToCopyBackward0]
+ 140210811997680 -> 140210811997584
+ 140210811997680 [label=NativeDropoutBackward0]
+ 140210811997776 -> 140210811997680
+ 140210811997776 [label=SoftmaxBackward0]
+ 140210811997872 -> 140210811997776
+ 140210811997872 [label=AddBackward0]
+ 140210811997968 -> 140210811997872
+ 140210811997968 [label=DivBackward0]
+ 140210811998064 -> 140210811997968
+ 140210811998064 [label=UnsafeViewBackward0]
+ 140210811998160 -> 140210811998064
+ 140210811998160 [label=BmmBackward0]
+ 140210811997392 -> 140210811998160
+ 140210811997392 [label=ReshapeAliasBackward0]
+ 140210812006656 -> 140210811997392
+ 140210812006656 [label=ExpandBackward0]
+ 140210812006752 -> 140210812006656
+ 140210812006752 [label=PermuteBackward0]
+ 140210812006848 -> 140210812006752
+ 140210812006848 [label=ViewBackward0]
+ 140210812006944 -> 140210812006848
+ 140210812006944 [label=ViewBackward0]
+ 140210812007040 -> 140210812006944
+ 140210812007040 [label=AddmmBackward0]
+ 140210812007136 -> 140210812007040
+ 140210812007136 [label=ToCopyBackward0]
+ 140210812007328 -> 140210812007136
+ 140202229047520 [label="encoder.layer.2.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202229047520 -> 140210812007328
+ 140210812007328 [label=AccumulateGrad]
+ 140210812007088 -> 140210812007040
+ 140210812007088 [label=ViewBackward0]
+ 140210812007376 -> 140210812007088
+ 140210812007376 [label=ToCopyBackward0]
+ 140210811996240 -> 140210812007376
+ 140210811996240 [label=CatBackward0]
+ 140210812007520 -> 140210811996240
+ 140210812007520 [label=NativeLayerNormBackward0]
+ 140210812007664 -> 140210812007520
+ 140210812007664 [label=AddBackward0]
+ 140210812007856 -> 140210812007664
+ 140210812007856 [label=NativeDropoutBackward0]
+ 140210812008000 -> 140210812007856
+ 140210812008000 [label=ViewBackward0]
+ 140210812008096 -> 140210812008000
+ 140210812008096 [label=AddmmBackward0]
+ 140210812008192 -> 140210812008096
+ 140210812008192 [label=ToCopyBackward0]
+ 140210812008384 -> 140210812008192
+ 140202229048000 [label="encoder.layer.1.experts.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202229048000 -> 140210812008384
+ 140210812008384 [label=AccumulateGrad]
+ 140210812008144 -> 140210812008096
+ 140210812008144 [label=ViewBackward0]
+ 140210812008432 -> 140210812008144
+ 140210812008432 [label=GeluBackward0]
+ 140210812008528 -> 140210812008432
+ 140210812008528 [label=ViewBackward0]
+ 140210812008624 -> 140210812008528
+ 140210812008624 [label=AddmmBackward0]
+ 140210812008720 -> 140210812008624
+ 140210812008720 [label=ToCopyBackward0]
+ 140210812008912 -> 140210812008720
+ 140202229048240 [label="encoder.layer.1.experts.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202229048240 -> 140210812008912
+ 140210812008912 [label=AccumulateGrad]
+ 140210812008672 -> 140210812008624
+ 140210812008672 [label=ViewBackward0]
+ 140210812008960 -> 140210812008672
+ 140210812008960 [label=ToCopyBackward0]
+ 140210812007808 -> 140210812008960
+ 140210812007808 [label=SliceBackward0]
+ 140210812009104 -> 140210812007808
+ 140210812009104 [label=SliceBackward0]
+ 140210812009200 -> 140210812009104
+ 140210812009200 [label=SliceBackward0]
+ 140210812009296 -> 140210812009200
+ 140210812009296 [label=SliceBackward0]
+ 140210812009392 -> 140210812009296
+ 140210812009392 [label=SliceBackward0]
+ 140210812009488 -> 140210812009392
+ 140210812009488 [label=NativeLayerNormBackward0]
+ 140210812009584 -> 140210812009488
+ 140210812009584 [label=AddBackward0]
+ 140210812009776 -> 140210812009584
+ 140210812009776 [label=NativeDropoutBackward0]
+ 140210812009920 -> 140210812009776
+ 140210812009920 [label=ViewBackward0]
+ 140210812010016 -> 140210812009920
+ 140210812010016 [label=AddmmBackward0]
+ 140210812010112 -> 140210812010016
+ 140210812010112 [label=ToCopyBackward0]
+ 140210812010304 -> 140210812010112
+ 140202229050160 [label="encoder.layer.1.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229050160 -> 140210812010304
+ 140210812010304 [label=AccumulateGrad]
+ 140210812010064 -> 140210812010016
+ 140210812010064 [label=ViewBackward0]
+ 140210812010352 -> 140210812010064
+ 140210812010352 [label=ViewBackward0]
+ 140210812010448 -> 140210812010352
+ 140210812010448 [label=CloneBackward0]
+ 140210812010256 -> 140210812010448
+ 140210812010256 [label=PermuteBackward0]
+ 140210812022992 -> 140210812010256
+ 140210812022992 [label=UnsafeViewBackward0]
+ 140210812023088 -> 140210812022992
+ 140210812023088 [label=BmmBackward0]
+ 140210812023184 -> 140210812023088
+ 140210812023184 [label=ReshapeAliasBackward0]
+ 140210812023328 -> 140210812023184
+ 140210812023328 [label=ExpandBackward0]
+ 140210812023424 -> 140210812023328
+ 140210812023424 [label=ToCopyBackward0]
+ 140210812023520 -> 140210812023424
+ 140210812023520 [label=NativeDropoutBackward0]
+ 140210812023616 -> 140210812023520
+ 140210812023616 [label=SoftmaxBackward0]
+ 140210812023712 -> 140210812023616
+ 140210812023712 [label=AddBackward0]
+ 140210812023808 -> 140210812023712
+ 140210812023808 [label=DivBackward0]
+ 140210812023904 -> 140210812023808
+ 140210812023904 [label=UnsafeViewBackward0]
+ 140210812024000 -> 140210812023904
+ 140210812024000 [label=BmmBackward0]
+ 140210812024096 -> 140210812024000
+ 140210812024096 [label=ReshapeAliasBackward0]
+ 140210812024240 -> 140210812024096
+ 140210812024240 [label=ExpandBackward0]
+ 140210812024336 -> 140210812024240
+ 140210812024336 [label=PermuteBackward0]
+ 140210812024432 -> 140210812024336
+ 140210812024432 [label=ViewBackward0]
+ 140210812024528 -> 140210812024432
+ 140210812024528 [label=ViewBackward0]
+ 140210812024624 -> 140210812024528
+ 140210812024624 [label=AddmmBackward0]
+ 140210812024720 -> 140210812024624
+ 140210812024720 [label=ToCopyBackward0]
+ 140210812024912 -> 140210812024720
+ 140202229050880 [label="encoder.layer.1.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202229050880 -> 140210812024912
+ 140210812024912 [label=AccumulateGrad]
+ 140210812024672 -> 140210812024624
+ 140210812024672 [label=ViewBackward0]
+ 140210812024960 -> 140210812024672
+ 140210812024960 [label=ToCopyBackward0]
+ 140210812009728 -> 140210812024960
+ 140210812009728 [label=CatBackward0]
+ 140210812025104 -> 140210812009728
+ 140210812025104 [label=NativeLayerNormBackward0]
+ 140210812025248 -> 140210812025104
+ 140210812025248 [label=AddBackward0]
+ 140210812025440 -> 140210812025248
+ 140210812025440 [label=NativeDropoutBackward0]
+ 140210812025584 -> 140210812025440
+ 140210812025584 [label=ViewBackward0]
+ 140210812025680 -> 140210812025584
+ 140210812025680 [label=AddmmBackward0]
+ 140210812025776 -> 140210812025680
+ 140210812025776 [label=ToCopyBackward0]
+ 140210812025968 -> 140210812025776
+ 140202229067840 [label="encoder.layer.0.experts.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202229067840 -> 140210812025968
+ 140210812025968 [label=AccumulateGrad]
+ 140210812025728 -> 140210812025680
+ 140210812025728 [label=ViewBackward0]
+ 140210812026016 -> 140210812025728
+ 140210812026016 [label=GeluBackward0]
+ 140210812026112 -> 140210812026016
+ 140210812026112 [label=ViewBackward0]
+ 140210812026208 -> 140210812026112
+ 140210812026208 [label=AddmmBackward0]
+ 140210812026304 -> 140210812026208
+ 140210812026304 [label=ToCopyBackward0]
+ 140210812026496 -> 140210812026304
+ 140202229068080 [label="encoder.layer.0.experts.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202229068080 -> 140210812026496
+ 140210812026496 [label=AccumulateGrad]
+ 140210812026256 -> 140210812026208
+ 140210812026256 [label=ViewBackward0]
+ 140210812026544 -> 140210812026256
+ 140210812026544 [label=ToCopyBackward0]
+ 140210812025392 -> 140210812026544
+ 140210812025392 [label=SliceBackward0]
+ 140210812026688 -> 140210812025392
+ 140210812026688 [label=SliceBackward0]
+ 140210812026784 -> 140210812026688
+ 140210812026784 [label=NativeLayerNormBackward0]
+ 140210812026832 -> 140210812026784
+ 140210812026832 [label=AddBackward0]
+ 140210812039424 -> 140210812026832
+ 140210812039424 [label=NativeDropoutBackward0]
+ 140210812039568 -> 140210812039424
+ 140210812039568 [label=ViewBackward0]
+ 140210812039664 -> 140210812039568
+ 140210812039664 [label=AddmmBackward0]
+ 140210812039760 -> 140210812039664
+ 140210812039760 [label=ToCopyBackward0]
+ 140210812039952 -> 140210812039760
+ 140202229070000 [label="encoder.layer.0.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229070000 -> 140210812039952
+ 140210812039952 [label=AccumulateGrad]
+ 140210812039712 -> 140210812039664
+ 140210812039712 [label=ViewBackward0]
+ 140210812040000 -> 140210812039712
+ 140210812040000 [label=ViewBackward0]
+ 140210812040096 -> 140210812040000
+ 140210812040096 [label=CloneBackward0]
+ 140210812040192 -> 140210812040096
+ 140210812040192 [label=PermuteBackward0]
+ 140210812040288 -> 140210812040192
+ 140210812040288 [label=UnsafeViewBackward0]
+ 140210812040384 -> 140210812040288
+ 140210812040384 [label=BmmBackward0]
+ 140210812040480 -> 140210812040384
+ 140210812040480 [label=ReshapeAliasBackward0]
+ 140210812040624 -> 140210812040480
+ 140210812040624 [label=ExpandBackward0]
+ 140210812040720 -> 140210812040624
+ 140210812040720 [label=ToCopyBackward0]
+ 140210812040816 -> 140210812040720
+ 140210812040816 [label=NativeDropoutBackward0]
+ 140210812040912 -> 140210812040816
+ 140210812040912 [label=SoftmaxBackward0]
+ 140210812041008 -> 140210812040912
+ 140210812041008 [label=AddBackward0]
+ 140210812041104 -> 140210812041008
+ 140210812041104 [label=DivBackward0]
+ 140210812041200 -> 140210812041104
+ 140210812041200 [label=UnsafeViewBackward0]
+ 140210812041296 -> 140210812041200
+ 140210812041296 [label=BmmBackward0]
+ 140210812041392 -> 140210812041296
+ 140210812041392 [label=ReshapeAliasBackward0]
+ 140210812041536 -> 140210812041392
+ 140210812041536 [label=ExpandBackward0]
+ 140210812041632 -> 140210812041536
+ 140210812041632 [label=PermuteBackward0]
+ 140210812041728 -> 140210812041632
+ 140210812041728 [label=ViewBackward0]
+ 140210812041824 -> 140210812041728
+ 140210812041824 [label=ViewBackward0]
+ 140210812041920 -> 140210812041824
+ 140210812041920 [label=AddmmBackward0]
+ 140210812042016 -> 140210812041920
+ 140210812042016 [label=ToCopyBackward0]
+ 140210812042208 -> 140210812042016
+ 140202229070720 [label="encoder.layer.0.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202229070720 -> 140210812042208
+ 140210812042208 [label=AccumulateGrad]
+ 140210812041968 -> 140210812041920
+ 140210812041968 [label=ViewBackward0]
+ 140210812042256 -> 140210812041968
+ 140210812042256 [label=ToCopyBackward0]
+ 140210812039376 -> 140210812042256
+ 140210812039376 [label=SliceBackward0]
+ 140210812042400 -> 140210812039376
+ 140210812042400 [label=SliceBackward0]
+ 140210812042496 -> 140210812042400
+ 140210812042496 [label=SliceBackward0]
+ 140210812042592 -> 140210812042496
+ 140210812042592 [label=NativeLayerNormBackward0]
+ 140210812042688 -> 140210812042592
+ 140210812042688 [label=AddBackward0]
+ 140210812042880 -> 140210812042688
+ 140210812042880 [label=NativeDropoutBackward0]
+ 140210812043024 -> 140210812042880
+ 140210812043024 [label=ViewBackward0]
+ 140210812043120 -> 140210812043024
+ 140210812043120 [label=AddmmBackward0]
+ 140210812043216 -> 140210812043120
+ 140210812043216 [label=ToCopyBackward0]
+ 140210812051664 -> 140210812043216
+ 140202229071200 [label="encoder.layer.0.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229071200 -> 140210812051664
+ 140210812051664 [label=AccumulateGrad]
+ 140210812043168 -> 140210812043120
+ 140210812043168 [label=ViewBackward0]
+ 140210812051712 -> 140210812043168
+ 140210812051712 [label=ViewBackward0]
+ 140210812051808 -> 140210812051712
+ 140210812051808 [label=CloneBackward0]
+ 140210812051904 -> 140210812051808
+ 140210812051904 [label=PermuteBackward0]
+ 140210812052000 -> 140210812051904
+ 140210812052000 [label=UnsafeViewBackward0]
+ 140210812052096 -> 140210812052000
+ 140210812052096 [label=BmmBackward0]
+ 140210812052192 -> 140210812052096
+ 140210812052192 [label=ReshapeAliasBackward0]
+ 140210812052336 -> 140210812052192
+ 140210812052336 [label=ExpandBackward0]
+ 140210812052432 -> 140210812052336
+ 140210812052432 [label=ToCopyBackward0]
+ 140210812052528 -> 140210812052432
+ 140210812052528 [label=NativeDropoutBackward0]
+ 140210812052624 -> 140210812052528
+ 140210812052624 [label=SoftmaxBackward0]
+ 140210812052720 -> 140210812052624
+ 140210812052720 [label=AddBackward0]
+ 140210812052816 -> 140210812052720
+ 140210812052816 [label=DivBackward0]
+ 140210812052912 -> 140210812052816
+ 140210812052912 [label=UnsafeViewBackward0]
+ 140210812053008 -> 140210812052912
+ 140210812053008 [label=BmmBackward0]
+ 140210812053104 -> 140210812053008
+ 140210812053104 [label=ReshapeAliasBackward0]
+ 140210812053248 -> 140210812053104
+ 140210812053248 [label=ExpandBackward0]
+ 140210812053344 -> 140210812053248
+ 140210812053344 [label=PermuteBackward0]
+ 140210812053440 -> 140210812053344
+ 140210812053440 [label=ViewBackward0]
+ 140210812053536 -> 140210812053440
+ 140210812053536 [label=ViewBackward0]
+ 140210812053632 -> 140210812053536
+ 140210812053632 [label=AddmmBackward0]
+ 140210812053728 -> 140210812053632
+ 140210812053728 [label=ToCopyBackward0]
+ 140210812053920 -> 140210812053728
+ 140202228734688 [label="encoder.layer.0.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140202228734688 -> 140210812053920
+ 140210812053920 [label=AccumulateGrad]
+ 140210812053680 -> 140210812053632
+ 140210812053680 [label=ViewBackward0]
+ 140210812053968 -> 140210812053680
+ 140210812053968 [label=ToCopyBackward0]
+ 140210812042832 -> 140210812053968
+ 140210812042832 [label=NativeDropoutBackward0]
+ 140210812054112 -> 140210812042832
+ 140210812054112 [label=NativeLayerNormBackward0]
+ 140210812054208 -> 140210812054112
+ 140210812054208 [label=CatBackward0]
+ 140210812054400 -> 140210812054208
+ 140210812054400 [label=ExpandBackward0]
+ 140210812054544 -> 140210812054400
+ 140202228561216 [label="
+ (1, 32, 768)" fillcolor=lightblue]
+ 140202228561216 -> 140210812054544
+ 140210812054544 [label=AccumulateGrad]
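+ // Unnamed (1, 32, 768) leaf: presumably the learned query-token embedding,
+ // broadcast over the batch by ExpandBackward0 before being concatenated
+ // with the text embeddings below.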
+ 140210812054352 -> 140210812054208
+ 140210812054352 [label=AddBackward0]
+ 140210812054592 -> 140210812054352
+ 140210812054592 [label=EmbeddingBackward0]
+ 140210812054736 -> 140210812054592
+ 140202228561776 [label="embeddings.word_embeddings.weight
+ (30523, 768)" fillcolor=lightblue]
+ 140202228561776 -> 140210812054736
+ 140210812054736 [label=AccumulateGrad]
+ 140210812054640 -> 140210812054352
+ 140210812054640 [label=EmbeddingBackward0]
+ 140210812054784 -> 140210812054640
+ 140202228735888 [label="embeddings.position_embeddings.weight
+ (512, 768)" fillcolor=lightblue]
+ 140202228735888 -> 140210812054784
+ 140210812054784 [label=AccumulateGrad]
+ 140210812054160 -> 140210812054112
+ 140202228560576 [label="embeddings.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228560576 -> 140210812054160
+ 140210812054160 [label=AccumulateGrad]
+ 140210812053824 -> 140210812054112
+ 140202228560336 [label="embeddings.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228560336 -> 140210812053824
+ 140210812053824 [label=AccumulateGrad]
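+ // Input pipeline recovered from this subtree: Cat(expanded query tokens,
+ // word_embeddings + position_embeddings) -> embeddings.LayerNorm -> dropout,
+ // whose output feeds the query/key/value projections of encoder.layer.0.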
+ 140210812053152 -> 140210812053632
+ 140210812053152 [label=TBackward0]
+ 140210812053872 -> 140210812053152
+ 140210812053872 [label=ToCopyBackward0]
+ 140210812054304 -> 140210812053872
+ 140202228560096 [label="encoder.layer.0.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228560096 -> 140210812054304
+ 140210812054304 [label=AccumulateGrad]
+ 140210812053056 -> 140210812053008
+ 140210812053056 [label=ReshapeAliasBackward0]
+ 140210812053392 -> 140210812053056
+ 140210812053392 [label=ExpandBackward0]
+ 140210812053584 -> 140210812053392
+ 140210812053584 [label=TransposeBackward0]
+ 140210812054064 -> 140210812053584
+ 140210812054064 [label=PermuteBackward0]
+ 140210812054832 -> 140210812054064
+ 140210812054832 [label=ViewBackward0]
+ 140210812054016 -> 140210812054832
+ 140210812054016 [label=ViewBackward0]
+ 140210812054448 -> 140210812054016
+ 140210812054448 [label=AddmmBackward0]
+ 140210812054928 -> 140210812054448
+ 140210812054928 [label=ToCopyBackward0]
+ 140210812055120 -> 140210812054928
+ 140202229071680 [label="encoder.layer.0.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202229071680 -> 140210812055120
+ 140210812055120 [label=AccumulateGrad]
+ 140210812054688 -> 140210812054448
+ 140210812054688 [label=ViewBackward0]
+ 140210812055168 -> 140210812054688
+ 140210812055168 [label=ToCopyBackward0]
+ 140210812042832 -> 140210812055168
+ 140210812053200 -> 140210812054448
+ 140210812053200 [label=TBackward0]
+ 140210812055024 -> 140210812053200
+ 140210812055024 [label=ToCopyBackward0]
+ 140210812055312 -> 140210812055024
+ 140202228734048 [label="encoder.layer.0.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228734048 -> 140210812055312
+ 140210812055312 [label=AccumulateGrad]
+ 140210812052144 -> 140210812052096
+ 140210812052144 [label=ReshapeAliasBackward0]
+ 140210812052480 -> 140210812052144
+ 140210812052480 [label=ExpandBackward0]
+ 140210812052672 -> 140210812052480
+ 140210812052672 [label=PermuteBackward0]
+ 140210812052864 -> 140210812052672
+ 140210812052864 [label=ViewBackward0]
+ 140210812052240 -> 140210812052864
+ 140210812052240 [label=ViewBackward0]
+ 140210812053488 -> 140210812052240
+ 140210812053488 [label=AddmmBackward0]
+ 140210812054256 -> 140210812053488
+ 140210812054256 [label=ToCopyBackward0]
+ 140210812055264 -> 140210812054256
+ 140202229071440 [label="encoder.layer.0.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202229071440 -> 140210812055264
+ 140210812055264 [label=AccumulateGrad]
+ 140210812053776 -> 140210812053488
+ 140210812053776 [label=ViewBackward0]
+ 140210812055072 -> 140210812053776
+ 140210812055072 [label=ToCopyBackward0]
+ 140210812042832 -> 140210812055072
+ 140210812052288 -> 140210812053488
+ 140210812052288 [label=TBackward0]
+ 140210812054880 -> 140210812052288
+ 140210812054880 [label=ToCopyBackward0]
+ 140210812055216 -> 140210812054880
+ 140202229071760 [label="encoder.layer.0.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229071760 -> 140210812055216
+ 140210812055216 [label=AccumulateGrad]
+ 140210812042928 -> 140210812043120
+ 140210812042928 [label=TBackward0]
+ 140210812051856 -> 140210812042928
+ 140210812051856 [label=ToCopyBackward0]
+ 140210812052048 -> 140210812051856
+ 140202229071520 [label="encoder.layer.0.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229071520 -> 140210812052048
+ 140210812052048 [label=AccumulateGrad]
+ 140210812042832 -> 140210812042688
+ 140210812042640 -> 140210812042592
+ 140202229071280 [label="encoder.layer.0.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229071280 -> 140210812042640
+ 140210812042640 [label=AccumulateGrad]
+ 140210812042112 -> 140210812042592
+ 140202229070960 [label="encoder.layer.0.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229070960 -> 140210812042112
+ 140210812042112 [label=AccumulateGrad]
+ 140210812041440 -> 140210812041920
+ 140210812041440 [label=TBackward0]
+ 140210812042160 -> 140210812041440
+ 140210812042160 [label=ToCopyBackward0]
+ 140210812042544 -> 140210812042160
+ 140202229071040 [label="encoder.layer.0.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229071040 -> 140210812042544
+ 140210812042544 [label=AccumulateGrad]
+ 140210812041344 -> 140210812041296
+ 140210812041344 [label=ReshapeAliasBackward0]
+ 140210812041680 -> 140210812041344
+ 140210812041680 [label=ExpandBackward0]
+ 140210812041872 -> 140210812041680
+ 140210812041872 [label=TransposeBackward0]
+ 140210812042352 -> 140210812041872
+ 140210812042352 [label=PermuteBackward0]
+ 140210812042784 -> 140210812042352
+ 140210812042784 [label=ViewBackward0]
+ 140210812042304 -> 140210812042784
+ 140210812042304 [label=ViewBackward0]
+ 140210812043072 -> 140210812042304
+ 140210812043072 [label=AddmmBackward0]
+ 140210812041488 -> 140210812043072
+ 140210812041488 [label=ToCopyBackward0]
+ 140210812051760 -> 140210812041488
+ 140202229070480 [label="encoder.layer.0.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202229070480 -> 140210812051760
+ 140210812051760 [label=AccumulateGrad]
+ 140210812051568 -> 140210812043072
+ 140210812051568 [label=ViewBackward0]
+ 140210812052576 -> 140210812051568
+ 140210812052576 [label=ToCopyBackward0]
+ 140210812052960 -> 140210812052576
+ 140210812052960 [label=NativeLayerNormBackward0]
+ 140210812054496 -> 140210812052960
+ 140202228735248 [label="
+ (1408)" fillcolor=lightblue]
+ 140202228735248 -> 140210812054496
+ 140210812054496 [label=AccumulateGrad]
+ 140210812053296 -> 140210812052960
+ 140202228735488 [label="
+ (1408)" fillcolor=lightblue]
+ 140202228735488 -> 140210812053296
+ 140210812053296 [label=AccumulateGrad]
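+ // The two unnamed (1408) leaves are the weight/bias of a LayerNorm over the
+ // image-encoder features (width 1408, e.g. a ViT-g backbone); the (768, 1408)
+ // cross-attention key/value weights below project them into the 768-d hidden size.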
+ 140210812051520 -> 140210812043072
+ 140210812051520 [label=TBackward0]
+ 140210812051616 -> 140210812051520
+ 140210812051616 [label=ToCopyBackward0]
+ 140210812054976 -> 140210812051616
+ 140202229070800 [label="encoder.layer.0.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202229070800 -> 140210812054976
+ 140210812054976 [label=AccumulateGrad]
+ 140210812040432 -> 140210812040384
+ 140210812040432 [label=ReshapeAliasBackward0]
+ 140210812040768 -> 140210812040432
+ 140210812040768 [label=ExpandBackward0]
+ 140210812040960 -> 140210812040768
+ 140210812040960 [label=PermuteBackward0]
+ 140210812041152 -> 140210812040960
+ 140210812041152 [label=ViewBackward0]
+ 140210812040528 -> 140210812041152
+ 140210812040528 [label=ViewBackward0]
+ 140210812041776 -> 140210812040528
+ 140210812041776 [label=AddmmBackward0]
+ 140210812042448 -> 140210812041776
+ 140210812042448 [label=ToCopyBackward0]
+ 140210812042976 -> 140210812042448
+ 140202229070240 [label="encoder.layer.0.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202229070240 -> 140210812042976
+ 140210812042976 [label=AccumulateGrad]
+ 140210812042064 -> 140210812041776
+ 140210812042064 [label=ViewBackward0]
+ 140210812055360 -> 140210812042064
+ 140210812055360 [label=ToCopyBackward0]
+ 140210812052960 -> 140210812055360
+ 140210812040576 -> 140210812041776
+ 140210812040576 [label=TBackward0]
+ 140210812051952 -> 140210812040576
+ 140210812051952 [label=ToCopyBackward0]
+ 140210812052768 -> 140210812051952
+ 140202229070560 [label="encoder.layer.0.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202229070560 -> 140210812052768
+ 140210812052768 [label=AccumulateGrad]
+ 140210812039472 -> 140210812039664
+ 140210812039472 [label=TBackward0]
+ 140210812040144 -> 140210812039472
+ 140210812040144 [label=ToCopyBackward0]
+ 140210812040336 -> 140210812040144
+ 140202229070320 [label="encoder.layer.0.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229070320 -> 140210812040336
+ 140210812040336 [label=AccumulateGrad]
+ 140210812039376 -> 140210812026832
+ 140210812026400 -> 140210812026784
+ 140202229070080 [label="encoder.layer.0.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229070080 -> 140210812026400
+ 140210812026400 [label=AccumulateGrad]
+ 140210812039232 -> 140210812026784
+ 140202229069760 [label="encoder.layer.0.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229069760 -> 140210812039232
+ 140210812039232 [label=AccumulateGrad]
+ 140210812025920 -> 140210812026208
+ 140210812025920 [label=TBackward0]
+ 140210812026448 -> 140210812025920
+ 140210812026448 [label=ToCopyBackward0]
+ 140210812026736 -> 140210812026448
+ 140202229068400 [label="encoder.layer.0.experts.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202229068400 -> 140210812026736
+ 140210812026736 [label=AccumulateGrad]
+ 140210812025488 -> 140210812025680
+ 140210812025488 [label=TBackward0]
+ 140210812026160 -> 140210812025488
+ 140210812026160 [label=ToCopyBackward0]
+ 140210812026640 -> 140210812026160
+ 140202229068160 [label="encoder.layer.0.experts.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202229068160 -> 140210812026640
+ 140210812026640 [label=AccumulateGrad]
+ 140210812025392 -> 140210812025248
+ 140210812025200 -> 140210812025104
+ 140202229067920 [label="encoder.layer.0.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202229067920 -> 140210812025200
+ 140210812025200 [label=AccumulateGrad]
+ 140210812025152 -> 140210812025104
+ 140202229051120 [label="encoder.layer.0.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202229051120 -> 140210812025152
+ 140210812025152 [label=AccumulateGrad]
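+ // What follows is the parallel text-token branch of encoder.layer.0: the
+ // standard intermediate/output FFN plus output.LayerNorm, feeding the same
+ // CatBackward0 that closes the layer.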
+ 140210812024864 -> 140210812009728
+ 140210812024864 [label=NativeLayerNormBackward0]
+ 140210812025536 -> 140210812024864
+ 140210812025536 [label=AddBackward0]
+ 140210812026352 -> 140210812025536
+ 140210812026352 [label=NativeDropoutBackward0]
+ 140210812026064 -> 140210812026352
+ 140210812026064 [label=ViewBackward0]
+ 140210812039280 -> 140210812026064
+ 140210812039280 [label=AddmmBackward0]
+ 140210812039808 -> 140210812039280
+ 140210812039808 [label=ToCopyBackward0]
+ 140210812039904 -> 140210812039808
+ 140202229069280 [label="encoder.layer.0.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229069280 -> 140210812039904
+ 140210812039904 [label=AccumulateGrad]
+ 140210812039616 -> 140210812039280
+ 140210812039616 [label=ViewBackward0]
+ 140210812040048 -> 140210812039616
+ 140210812040048 [label=GeluBackward0]
+ 140210812041056 -> 140210812040048
+ 140210812041056 [label=ViewBackward0]
+ 140210812041584 -> 140210812041056
+ 140210812041584 [label=AddmmBackward0]
+ 140210812042736 -> 140210812041584
+ 140210812042736 [label=ToCopyBackward0]
+ 140210812055456 -> 140210812042736
+ 140202229069520 [label="encoder.layer.0.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202229069520 -> 140210812055456
+ 140210812055456 [label=AccumulateGrad]
+ 140210812040672 -> 140210812041584
+ 140210812040672 [label=ViewBackward0]
+ 140210812055408 -> 140210812040672
+ 140210812055408 [label=ToCopyBackward0]
+ 140210812025872 -> 140210812055408
+ 140210812025872 [label=SliceBackward0]
+ 140210812092672 -> 140210812025872
+ 140210812092672 [label=SliceBackward0]
+ 140210812092768 -> 140210812092672
+ 140210812092768 [label=SliceBackward0]
+ 140210812042592 -> 140210812092768
+ 140210812055504 -> 140210812041584
+ 140210812055504 [label=TBackward0]
+ 140210812092576 -> 140210812055504
+ 140210812092576 [label=ToCopyBackward0]
+ 140210812092864 -> 140210812092576
+ 140202229069840 [label="encoder.layer.0.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202229069840 -> 140210812092864
+ 140210812092864 [label=AccumulateGrad]
+ 140210812039520 -> 140210812039280
+ 140210812039520 [label=TBackward0]
+ 140210812041248 -> 140210812039520
+ 140210812041248 [label=ToCopyBackward0]
+ 140210812052384 -> 140210812041248
+ 140202229069600 [label="encoder.layer.0.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202229069600 -> 140210812052384
+ 140210812052384 [label=AccumulateGrad]
+ 140210812025872 -> 140210812025536
+ 140210812025344 -> 140210812024864
+ 140202229069360 [label="encoder.layer.0.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229069360 -> 140210812025344
+ 140210812025344 [label=AccumulateGrad]
+ 140210812025296 -> 140210812024864
+ 140202229069040 [label="encoder.layer.0.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229069040 -> 140210812025296
+ 140210812025296 [label=AccumulateGrad]
+ 140210812024144 -> 140210812024624
+ 140210812024144 [label=TBackward0]
+ 140210812024816 -> 140210812024144
+ 140210812024816 [label=ToCopyBackward0]
+ 140210812025824 -> 140210812024816
+ 140202229051200 [label="encoder.layer.1.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229051200 -> 140210812025824
+ 140210812025824 [label=AccumulateGrad]
+ 140210812024048 -> 140210812024000
+ 140210812024048 [label=ReshapeAliasBackward0]
+ 140210812024384 -> 140210812024048
+ 140210812024384 [label=ExpandBackward0]
+ 140210812024576 -> 140210812024384
+ 140210812024576 [label=TransposeBackward0]
+ 140210812025056 -> 140210812024576
+ 140210812025056 [label=PermuteBackward0]
+ 140210812026592 -> 140210812025056
+ 140210812026592 [label=ViewBackward0]
+ 140210812025008 -> 140210812026592
+ 140210812025008 [label=ViewBackward0]
+ 140210812040240 -> 140210812025008
+ 140210812040240 [label=AddmmBackward0]
+ 140210812040864 -> 140210812040240
+ 140210812040864 [label=ToCopyBackward0]
+ 140210812092528 -> 140210812040864
+ 140202229050640 [label="encoder.layer.1.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202229050640 -> 140210812092528
+ 140210812092528 [label=AccumulateGrad]
+ 140210812039328 -> 140210812040240
+ 140210812039328 [label=ViewBackward0]
+ 140210812092912 -> 140210812039328
+ 140210812092912 [label=ToCopyBackward0]
+ 140210812009728 -> 140210812092912
+ 140210812092480 -> 140210812040240
+ 140210812092480 [label=TBackward0]
+ 140210812092624 -> 140210812092480
+ 140210812092624 [label=ToCopyBackward0]
+ 140210812093056 -> 140210812092624
+ 140202229050960 [label="encoder.layer.1.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229050960 -> 140210812093056
+ 140210812093056 [label=AccumulateGrad]
+ 140210812023136 -> 140210812023088
+ 140210812023136 [label=ReshapeAliasBackward0]
+ 140210812023472 -> 140210812023136
+ 140210812023472 [label=ExpandBackward0]
+ 140210812023664 -> 140210812023472
+ 140210812023664 [label=PermuteBackward0]
+ 140210812023856 -> 140210812023664
+ 140210812023856 [label=ViewBackward0]
+ 140210812023232 -> 140210812023856
+ 140210812023232 [label=ViewBackward0]
+ 140210812024480 -> 140210812023232
+ 140210812024480 [label=AddmmBackward0]
+ 140210812025632 -> 140210812024480
+ 140210812025632 [label=ToCopyBackward0]
+ 140210812039856 -> 140210812025632
+ 140202229050400 [label="encoder.layer.1.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202229050400 -> 140210812039856
+ 140210812039856 [label=AccumulateGrad]
+ 140210812024768 -> 140210812024480
+ 140210812024768 [label=ViewBackward0]
+ 140210812092816 -> 140210812024768
+ 140210812092816 [label=ToCopyBackward0]
+ 140210812009728 -> 140210812092816
+ 140210812023280 -> 140210812024480
+ 140210812023280 [label=TBackward0]
+ 140210812092720 -> 140210812023280
+ 140210812092720 [label=ToCopyBackward0]
+ 140210812092960 -> 140210812092720
+ 140202229050720 [label="encoder.layer.1.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229050720 -> 140210812092960
+ 140210812092960 [label=AccumulateGrad]
+ 140210812009824 -> 140210812010016
+ 140210812009824 [label=TBackward0]
+ 140210812010208 -> 140210812009824
+ 140210812010208 [label=ToCopyBackward0]
+ 140210812023040 -> 140210812010208
+ 140202229050480 [label="encoder.layer.1.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229050480 -> 140210812023040
+ 140210812023040 [label=AccumulateGrad]
+ 140210812009728 -> 140210812009584
+ 140210812009536 -> 140210812009488
+ 140202229050240 [label="encoder.layer.1.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229050240 -> 140210812009536
+ 140210812009536 [label=AccumulateGrad]
+ 140210812008816 -> 140210812009488
+ 140202229049920 [label="encoder.layer.1.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229049920 -> 140210812008816
+ 140210812008816 [label=AccumulateGrad]
+ 140210812008336 -> 140210812008624
+ 140210812008336 [label=TBackward0]
+ 140210812008864 -> 140210812008336
+ 140210812008864 [label=ToCopyBackward0]
+ 140210812009248 -> 140210812008864
+ 140202229048560 [label="encoder.layer.1.experts.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202229048560 -> 140210812009248
+ 140210812009248 [label=AccumulateGrad]
+ 140210812007904 -> 140210812008096
+ 140210812007904 [label=TBackward0]
+ 140210812008576 -> 140210812007904
+ 140210812008576 [label=ToCopyBackward0]
+ 140210812009056 -> 140210812008576
+ 140202229048320 [label="encoder.layer.1.experts.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202229048320 -> 140210812009056
+ 140210812009056 [label=AccumulateGrad]
+ 140210812007808 -> 140210812007664
+ 140210812007616 -> 140210812007520
+ 140202229048080 [label="encoder.layer.1.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202229048080 -> 140210812007616
+ 140210812007616 [label=AccumulateGrad]
+ 140210812007568 -> 140210812007520
+ 140202229047760 [label="encoder.layer.1.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202229047760 -> 140210812007568
+ 140210812007568 [label=AccumulateGrad]
+ 140210812007280 -> 140210811996240
+ 140210812007280 [label=NativeLayerNormBackward0]
+ 140210812007952 -> 140210812007280
+ 140210812007952 [label=AddBackward0]
+ 140210812008768 -> 140210812007952
+ 140210812008768 [label=NativeDropoutBackward0]
+ 140210812008480 -> 140210812008768
+ 140210812008480 [label=ViewBackward0]
+ 140210812009008 -> 140210812008480
+ 140210812009008 [label=AddmmBackward0]
+ 140210812009680 -> 140210812009008
+ 140210812009680 [label=ToCopyBackward0]
+ 140210812010400 -> 140210812009680
+ 140202229049440 [label="encoder.layer.1.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229049440 -> 140210812010400
+ 140210812010400 [label=AccumulateGrad]
+ 140210812009632 -> 140210812009008
+ 140210812009632 [label=ViewBackward0]
+ 140210812009968 -> 140210812009632
+ 140210812009968 [label=GeluBackward0]
+ 140210812022848 -> 140210812009968
+ 140210812022848 [label=ViewBackward0]
+ 140210812023568 -> 140210812022848
+ 140210812023568 [label=AddmmBackward0]
+ 140210812023952 -> 140210812023568
+ 140210812023952 [label=ToCopyBackward0]
+ 140210812024192 -> 140210812023952
+ 140202229049680 [label="encoder.layer.1.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202229049680 -> 140210812024192
+ 140210812024192 [label=AccumulateGrad]
+ 140210812023760 -> 140210812023568
+ 140210812023760 [label=ViewBackward0]
+ 140210812093248 -> 140210812023760
+ 140210812093248 [label=ToCopyBackward0]
+ 140210812008288 -> 140210812093248
+ 140210812008288 [label=SliceBackward0]
+ 140210812093296 -> 140210812008288
+ 140210812093296 [label=SliceBackward0]
+ 140210812093392 -> 140210812093296
+ 140210812093392 [label=SliceBackward0]
+ 140210812009488 -> 140210812093392
+ 140210812023376 -> 140210812023568
+ 140210812023376 [label=TBackward0]
+ 140210812093008 -> 140210812023376
+ 140210812093008 [label=ToCopyBackward0]
+ 140210812093488 -> 140210812093008
+ 140202229050000 [label="encoder.layer.1.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202229050000 -> 140210812093488
+ 140210812093488 [label=AccumulateGrad]
+ 140210812009440 -> 140210812009008
+ 140210812009440 [label=TBackward0]
+ 140210812010160 -> 140210812009440
+ 140210812010160 [label=ToCopyBackward0]
+ 140210812024288 -> 140210812010160
+ 140202229049760 [label="encoder.layer.1.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202229049760 -> 140210812024288
+ 140210812024288 [label=AccumulateGrad]
+ 140210812008288 -> 140210812007952
+ 140210812007760 -> 140210812007280
+ 140202229049520 [label="encoder.layer.1.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229049520 -> 140210812007760
+ 140210812007760 [label=AccumulateGrad]
+ 140210812007712 -> 140210812007280
+ 140202229049200 [label="encoder.layer.1.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229049200 -> 140210812007712
+ 140210812007712 [label=AccumulateGrad]
+ 140210812006560 -> 140210812007040
+ 140210812006560 [label=TBackward0]
+ 140210812007232 -> 140210812006560
+ 140210812007232 [label=ToCopyBackward0]
+ 140210812008240 -> 140210812007232
+ 140202229047840 [label="encoder.layer.2.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229047840 -> 140210812008240
+ 140210812008240 [label=AccumulateGrad]
+ 140210812006464 -> 140210811998160
+ 140210812006464 [label=ReshapeAliasBackward0]
+ 140210812006800 -> 140210812006464
+ 140210812006800 [label=ExpandBackward0]
+ 140210812006992 -> 140210812006800
+ 140210812006992 [label=TransposeBackward0]
+ 140210812007472 -> 140210812006992
+ 140210812007472 [label=PermuteBackward0]
+ 140210812009344 -> 140210812007472
+ 140210812009344 [label=ViewBackward0]
+ 140210812007424 -> 140210812009344
+ 140210812007424 [label=ViewBackward0]
+ 140210812009872 -> 140210812007424
+ 140210812009872 [label=AddmmBackward0]
+ 140210812022944 -> 140210812009872
+ 140210812022944 [label=ToCopyBackward0]
+ 140210812093200 -> 140210812022944
+ 140202229047360 [label="encoder.layer.2.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202229047360 -> 140210812093200
+ 140210812093200 [label=AccumulateGrad]
+ 140210812022896 -> 140210812009872
+ 140210812022896 [label=ViewBackward0]
+ 140210812093536 -> 140210812022896
+ 140210812093536 [label=ToCopyBackward0]
+ 140210811996240 -> 140210812093536
+ 140210812093104 -> 140210812009872
+ 140210812093104 [label=TBackward0]
+ 140210812093152 -> 140210812093104
+ 140210812093152 [label=ToCopyBackward0]
+ 140210812093680 -> 140210812093152
+ 140202229047600 [label="encoder.layer.2.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229047600 -> 140210812093680
+ 140210812093680 [label=AccumulateGrad]
+ 140210811997296 -> 140210811997248
+ 140210811997296 [label=ReshapeAliasBackward0]
+ 140210811997632 -> 140210811997296
+ 140210811997632 [label=ExpandBackward0]
+ 140210811997824 -> 140210811997632
+ 140210811997824 [label=PermuteBackward0]
+ 140210811998016 -> 140210811997824
+ 140210811998016 [label=ViewBackward0]
+ 140210811998112 -> 140210811998016
+ 140210811998112 [label=ViewBackward0]
+ 140210812006896 -> 140210811998112
+ 140210812006896 [label=AddmmBackward0]
+ 140210812008048 -> 140210812006896
+ 140210812008048 [label=ToCopyBackward0]
+ 140210812006608 -> 140210812008048
+ 140202229042848 [label="encoder.layer.2.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202229042848 -> 140210812006608
+ 140210812006608 [label=AccumulateGrad]
+ 140210812007184 -> 140210812006896
+ 140210812007184 [label=ViewBackward0]
+ 140210812093440 -> 140210812007184
+ 140210812093440 [label=ToCopyBackward0]
+ 140210811996240 -> 140210812093440
+ 140210812006512 -> 140210812006896
+ 140210812006512 [label=TBackward0]
+ 140210812093344 -> 140210812006512
+ 140210812093344 [label=ToCopyBackward0]
+ 140210812093584 -> 140210812093344
+ 140202229043088 [label="encoder.layer.2.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229043088 -> 140210812093584
+ 140210812093584 [label=AccumulateGrad]
+ 140210811996336 -> 140210811996528
+ 140210811996336 [label=TBackward0]
+ 140210811997008 -> 140210811996336
+ 140210811997008 [label=ToCopyBackward0]
+ 140210811997200 -> 140210811997008
+ 140202229042928 [label="encoder.layer.2.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229042928 -> 140210811997200
+ 140210811997200 [label=AccumulateGrad]
+ 140210811996240 -> 140210811996096
+ 140210811996048 -> 140210811996000
+ 140202229042688 [label="encoder.layer.2.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229042688 -> 140210811996048
+ 140210811996048 [label=AccumulateGrad]
+ 140210811995520 -> 140210811996000
+ 140202229042368 [label="encoder.layer.2.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229042368 -> 140210811995520
+ 140210811995520 [label=AccumulateGrad]
+ 140210811994848 -> 140210811995328
+ 140210811994848 [label=TBackward0]
+ 140210811995568 -> 140210811994848
+ 140210811995568 [label=ToCopyBackward0]
+ 140210811995952 -> 140210811995568
+ 140202229042448 [label="encoder.layer.2.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229042448 -> 140210811995952
+ 140210811995952 [label=AccumulateGrad]
+ 140210811994752 -> 140210811994704
+ 140210811994752 [label=ReshapeAliasBackward0]
+ 140210811995088 -> 140210811994752
+ 140210811995088 [label=ExpandBackward0]
+ 140210811995280 -> 140210811995088
+ 140210811995280 [label=TransposeBackward0]
+ 140210811995760 -> 140210811995280
+ 140210811995760 [label=PermuteBackward0]
+ 140210811996192 -> 140210811995760
+ 140210811996192 [label=ViewBackward0]
+ 140210811995712 -> 140210811996192
+ 140210811995712 [label=ViewBackward0]
+ 140210811996480 -> 140210811995712
+ 140210811996480 [label=AddmmBackward0]
+ 140210811996720 -> 140210811996480
+ 140210811996720 [label=ToCopyBackward0]
+ 140210811996912 -> 140210811996720
+ 140202229041888 [label="encoder.layer.2.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202229041888 -> 140210811996912
+ 140210811996912 [label=AccumulateGrad]
+ 140210811996672 -> 140210811996480
+ 140210811996672 [label=ViewBackward0]
+ 140210811997728 -> 140210811996672
+ 140210811997728 [label=ToCopyBackward0]
+ 140210812052960 -> 140210811997728
+ 140210811994896 -> 140210811996480
+ 140210811994896 [label=TBackward0]
+ 140210811997536 -> 140210811994896
+ 140210811997536 [label=ToCopyBackward0]
+ 140210811996768 -> 140210811997536
+ 140202229042208 [label="encoder.layer.2.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202229042208 -> 140210811996768
+ 140210811996768 [label=AccumulateGrad]
+ 140210811977392 -> 140210811977344
+ 140210811977392 [label=ReshapeAliasBackward0]
+ 140210811977632 -> 140210811977392
+ 140210811977632 [label=ExpandBackward0]
+ 140210811994368 -> 140210811977632
+ 140210811994368 [label=PermuteBackward0]
+ 140210811994560 -> 140210811994368
+ 140210811994560 [label=ViewBackward0]
+ 140210811994176 -> 140210811994560
+ 140210811994176 [label=ViewBackward0]
+ 140210811995184 -> 140210811994176
+ 140210811995184 [label=AddmmBackward0]
+ 140210811995856 -> 140210811995184
+ 140210811995856 [label=ToCopyBackward0]
+ 140210811997440 -> 140210811995856
+ 140202229041648 [label="encoder.layer.2.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202229041648 -> 140210811997440
+ 140210811997440 [label=AccumulateGrad]
+ 140210811995472 -> 140210811995184
+ 140210811995472 [label=ViewBackward0]
+ 140210811996384 -> 140210811995472
+ 140210811996384 [label=ToCopyBackward0]
+ 140210812052960 -> 140210811996384
+ 140210811994224 -> 140210811995184
+ 140210811994224 [label=TBackward0]
+ 140210812009152 -> 140210811994224
+ 140210812009152 [label=ToCopyBackward0]
+ 140210811997104 -> 140210812009152
+ 140202229041968 [label="encoder.layer.2.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202229041968 -> 140210811997104
+ 140210811997104 [label=AccumulateGrad]
+ 140210811976432 -> 140210811976624
+ 140210811976432 [label=TBackward0]
+ 140210811977104 -> 140210811976432
+ 140210811977104 [label=ToCopyBackward0]
+ 140210811977296 -> 140210811977104
+ 140202229041728 [label="encoder.layer.2.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229041728 -> 140210811977296
+ 140210811977296 [label=AccumulateGrad]
+ 140210811976336 -> 140210811976192
+ 140210811976144 -> 140210811976096
+ 140202229041488 [label="encoder.layer.2.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229041488 -> 140210811976144
+ 140210811976144 [label=AccumulateGrad]
+ 140210811975712 -> 140210811976096
+ 140202229041168 [label="encoder.layer.2.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229041168 -> 140210811975712
+ 140210811975712 [label=AccumulateGrad]
+ 140210811975232 -> 140210811975520
+ 140210811975232 [label=TBackward0]
+ 140210811975760 -> 140210811975232
+ 140210811975760 [label=ToCopyBackward0]
+ 140210811976240 -> 140210811975760
+ 140202229039808 [label="encoder.layer.2.experts.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202229039808 -> 140210811976240
+ 140210811976240 [label=AccumulateGrad]
+ 140210811974800 -> 140210811974992
+ 140210811974800 [label=TBackward0]
+ 140210811975472 -> 140210811974800
+ 140210811975472 [label=ToCopyBackward0]
+ 140210811975952 -> 140210811975472
+ 140202229039568 [label="encoder.layer.2.experts.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202229039568 -> 140210811975952
+ 140210811975952 [label=AccumulateGrad]
+ 140210811974704 -> 140210811974560
+ 140210811974512 -> 140210811974416
+ 140202229039328 [label="encoder.layer.2.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202229039328 -> 140210811974512
+ 140210811974512 [label=AccumulateGrad]
+ 140210811974464 -> 140210811974416
+ 140202229026624 [label="encoder.layer.2.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202229026624 -> 140210811974464
+ 140210811974464 [label=AccumulateGrad]
+ 140210811974176 -> 140210811959040
+ 140210811974176 [label=NativeLayerNormBackward0]
+ 140210811974848 -> 140210811974176
+ 140210811974848 [label=AddBackward0]
+ 140210811975664 -> 140210811974848
+ 140210811975664 [label=NativeDropoutBackward0]
+ 140210811975376 -> 140210811975664
+ 140210811975376 [label=ViewBackward0]
+ 140210811975904 -> 140210811975376
+ 140210811975904 [label=AddmmBackward0]
+ 140210811976768 -> 140210811975904
+ 140210811976768 [label=ToCopyBackward0]
+ 140210811976864 -> 140210811976768
+ 140202229040688 [label="encoder.layer.2.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229040688 -> 140210811976864
+ 140210811976864 [label=AccumulateGrad]
+ 140210811976576 -> 140210811975904
+ 140210811976576 [label=ViewBackward0]
+ 140210812006704 -> 140210811976576
+ 140210812006704 [label=GeluBackward0]
+ 140210811977200 -> 140210812006704
+ 140210811977200 [label=ViewBackward0]
+ 140210811994656 -> 140210811977200
+ 140210811994656 [label=AddmmBackward0]
+ 140210811996144 -> 140210811994656
+ 140210811996144 [label=ToCopyBackward0]
+ 140210812093728 -> 140210811996144
+ 140202229040928 [label="encoder.layer.2.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202229040928 -> 140210812093728
+ 140210812093728 [label=AccumulateGrad]
+ 140210811994992 -> 140210811994656
+ 140210811994992 [label=ViewBackward0]
+ 140210812093824 -> 140210811994992
+ 140210812093824 [label=ToCopyBackward0]
+ 140210811975184 -> 140210812093824
+ 140210811975184 [label=SliceBackward0]
+ 140210812093968 -> 140210811975184
+ 140210812093968 [label=SliceBackward0]
+ 140210812094064 -> 140210812093968
+ 140210812094064 [label=SliceBackward0]
+ 140210811996000 -> 140210812094064
+ 140210811994272 -> 140210811994656
+ 140210811994272 [label=TBackward0]
+ 140210812093632 -> 140210811994272
+ 140210812093632 [label=ToCopyBackward0]
+ 140210812094160 -> 140210812093632
+ 140202229041248 [label="encoder.layer.2.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202229041248 -> 140210812094160
+ 140210812094160 [label=AccumulateGrad]
+ 140210811976480 -> 140210811975904
+ 140210811976480 [label=TBackward0]
+ 140210811977536 -> 140210811976480
+ 140210811977536 [label=ToCopyBackward0]
+ 140210811997920 -> 140210811977536
+ 140202229041008 [label="encoder.layer.2.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202229041008 -> 140210811997920
+ 140210811997920 [label=AccumulateGrad]
+ 140210811975184 -> 140210811974848
+ 140210811974656 -> 140210811974176
+ 140202229040768 [label="encoder.layer.2.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229040768 -> 140210811974656
+ 140210811974656 [label=AccumulateGrad]
+ 140210811974608 -> 140210811974176
+ 140202229040448 [label="encoder.layer.2.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229040448 -> 140210811974608
+ 140210811974608 [label=AccumulateGrad]
+ 140210811973696 -> 140210811973936
+ 140210811973696 [label=TBackward0]
+ 140210811974128 -> 140210811973696
+ 140210811974128 [label=ToCopyBackward0]
+ 140210811975136 -> 140210811974128
+ 140202229026704 [label="encoder.layer.3.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229026704 -> 140210811975136
+ 140210811975136 [label=AccumulateGrad]
+ 140210811961008 -> 140210811960960
+ 140210811961008 [label=ReshapeAliasBackward0]
+ 140210811961248 -> 140210811961008
+ 140210811961248 [label=ExpandBackward0]
+ 140210811973888 -> 140210811961248
+ 140210811973888 [label=TransposeBackward0]
+ 140210811974368 -> 140210811973888
+ 140210811974368 [label=PermuteBackward0]
+ 140210811976288 -> 140210811974368
+ 140210811976288 [label=ViewBackward0]
+ 140210811974320 -> 140210811976288
+ 140210811974320 [label=ViewBackward0]
+ 140210811977008 -> 140210811974320
+ 140210811977008 [label=AddmmBackward0]
+ 140210811994464 -> 140210811977008
+ 140210811994464 [label=ToCopyBackward0]
+ 140210812093776 -> 140210811994464
+ 140202229026144 [label="encoder.layer.3.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202229026144 -> 140210812093776
+ 140210812093776 [label=AccumulateGrad]
+ 140210811973744 -> 140210811977008
+ 140210811973744 [label=ViewBackward0]
+ 140210812094208 -> 140210811973744
+ 140210812094208 [label=ToCopyBackward0]
+ 140210811959040 -> 140210812094208
+ 140210812093872 -> 140210811977008
+ 140210812093872 [label=TBackward0]
+ 140210812093920 -> 140210812093872
+ 140210812093920 [label=ToCopyBackward0]
+ 140210812094352 -> 140210812093920
+ 140202229026464 [label="encoder.layer.3.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229026464 -> 140210812094352
+ 140210812094352 [label=AccumulateGrad]
+ 140210811960096 -> 140210811960048
+ 140210811960096 [label=ReshapeAliasBackward0]
+ 140210811960432 -> 140210811960096
+ 140210811960432 [label=ExpandBackward0]
+ 140210811960624 -> 140210811960432
+ 140210811960624 [label=PermuteBackward0]
+ 140210811960816 -> 140210811960624
+ 140210811960816 [label=ViewBackward0]
+ 140210811960192 -> 140210811960816
+ 140210811960192 [label=ViewBackward0]
+ 140210811961152 -> 140210811960192
+ 140210811961152 [label=AddmmBackward0]
+ 140210811974944 -> 140210811961152
+ 140210811974944 [label=ToCopyBackward0]
+ 140210811976816 -> 140210811974944
+ 140202229025904 [label="encoder.layer.3.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202229025904 -> 140210811976816
+ 140210811976816 [label=AccumulateGrad]
+ 140210811974080 -> 140210811961152
+ 140210811974080 [label=ViewBackward0]
+ 140210812094112 -> 140210811974080
+ 140210812094112 [label=ToCopyBackward0]
+ 140210811959040 -> 140210812094112
+ 140210811973792 -> 140210811961152
+ 140210811973792 [label=TBackward0]
+ 140210812094016 -> 140210811973792
+ 140210812094016 [label=ToCopyBackward0]
+ 140210812094256 -> 140210812094016
+ 140202229026224 [label="encoder.layer.3.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229026224 -> 140210812094256
+ 140210812094256 [label=AccumulateGrad]
+ 140210811959136 -> 140210811959328
+ 140210811959136 [label=TBackward0]
+ 140210811959808 -> 140210811959136
+ 140210811959808 [label=ToCopyBackward0]
+ 140210811960000 -> 140210811959808
+ 140202229025984 [label="encoder.layer.3.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229025984 -> 140210811960000
+ 140210811960000 [label=AccumulateGrad]
+ 140210811959040 -> 140210811958896
+ 140210811958848 -> 140210811958800
+ 140202229025744 [label="encoder.layer.3.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229025744 -> 140210811958848
+ 140210811958848 [label=AccumulateGrad]
+ 140210811958128 -> 140210811958800
+ 140202229025424 [label="encoder.layer.3.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229025424 -> 140210811958128
+ 140210811958128 [label=AccumulateGrad]
+ 140210811957648 -> 140210811957936
+ 140210811957648 [label=TBackward0]
+ 140210811958176 -> 140210811957648
+ 140210811958176 [label=ToCopyBackward0]
+ 140210811958560 -> 140210811958176
+ 140202229024064 [label="encoder.layer.3.experts.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202229024064 -> 140210811958560
+ 140210811958560 [label=AccumulateGrad]
+ 140210811957312 -> 140210811957408
+ 140210811957312 [label=TBackward0]
+ 140210811957888 -> 140210811957312
+ 140210811957888 [label=ToCopyBackward0]
+ 140210811958368 -> 140210811957888
+ 140202229023824 [label="encoder.layer.3.experts.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202229023824 -> 140210811958368
+ 140210811958368 [label=AccumulateGrad]
+ 140210811944768 -> 140210811944624
+ 140210811944576 -> 140210811944480
+ 140202229023584 [label="encoder.layer.3.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202229023584 -> 140210811944576
+ 140210811944576 [label=AccumulateGrad]
+ 140210811944528 -> 140210811944480
+ 140202229023264 [label="encoder.layer.3.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202229023264 -> 140210811944528
+ 140210811944528 [label=AccumulateGrad]
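+ /* Annotation: layers 3, 4, 5 and 7 in this span route through a single
+    expert FFN (experts.dense1 -> experts.dense2, normalized by expert_ln),
+    while layers 6 and 8 carry per-expert weights (experts.experts.N.dense1/
+    dense2) gated by experts.gate. The SliceBackward0 chains feeding each
+    intermediate.dense suggest the shared FFN sees only a slice of the
+    sequence (inferred from graph structure only). */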
+ 140210811944240 -> 140210811941456
+ 140210811944240 [label=NativeLayerNormBackward0]
+ 140210811944864 -> 140210811944240
+ 140210811944864 [label=AddBackward0]
+ 140210811958080 -> 140210811944864
+ 140210811958080 [label=NativeDropoutBackward0]
+ 140210811957792 -> 140210811958080
+ 140210811957792 [label=ViewBackward0]
+ 140210811958320 -> 140210811957792
+ 140210811958320 [label=AddmmBackward0]
+ 140210811958992 -> 140210811958320
+ 140210811958992 [label=ToCopyBackward0]
+ 140210811959520 -> 140210811958992
+ 140202229024944 [label="encoder.layer.3.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229024944 -> 140210811959520
+ 140210811959520 [label=AccumulateGrad]
+ 140210811958944 -> 140210811958320
+ 140210811958944 [label=ViewBackward0]
+ 140210811959904 -> 140210811958944
+ 140210811959904 [label=GeluBackward0]
+ 140210811959568 -> 140210811959904
+ 140210811959568 [label=ViewBackward0]
+ 140210811960528 -> 140210811959568
+ 140210811960528 [label=AddmmBackward0]
+ 140210811960912 -> 140210811960528
+ 140210811960912 [label=ToCopyBackward0]
+ 140210811976048 -> 140210811960912
+ 140202229025184 [label="encoder.layer.3.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202229025184 -> 140210811976048
+ 140210811976048 [label=AccumulateGrad]
+ 140210811960720 -> 140210811960528
+ 140210811960720 [label=ViewBackward0]
+ 140210812094544 -> 140210811960720
+ 140210812094544 [label=ToCopyBackward0]
+ 140210811957600 -> 140210812094544
+ 140210811957600 [label=SliceBackward0]
+ 140210812094592 -> 140210811957600
+ 140210812094592 [label=SliceBackward0]
+ 140210812094688 -> 140210812094592
+ 140210812094688 [label=SliceBackward0]
+ 140210811958800 -> 140210812094688
+ 140210811959472 -> 140210811960528
+ 140210811959472 [label=TBackward0]
+ 140210812094304 -> 140210811959472
+ 140210812094304 [label=ToCopyBackward0]
+ 140210812094784 -> 140210812094304
+ 140202229025504 [label="encoder.layer.3.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202229025504 -> 140210812094784
+ 140210812094784 [label=AccumulateGrad]
+ 140210811958752 -> 140210811958320
+ 140210811958752 [label=TBackward0]
+ 140210811959712 -> 140210811958752
+ 140210811959712 [label=ToCopyBackward0]
+ 140210811960240 -> 140210811959712
+ 140202229025264 [label="encoder.layer.3.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202229025264 -> 140210811960240
+ 140210811960240 [label=AccumulateGrad]
+ 140210811957600 -> 140210811944864
+ 140210811944720 -> 140210811944240
+ 140202229025024 [label="encoder.layer.3.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229025024 -> 140210811944720
+ 140210811944720 [label=AccumulateGrad]
+ 140210811944672 -> 140210811944240
+ 140202229024704 [label="encoder.layer.3.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229024704 -> 140210811944672
+ 140210811944672 [label=AccumulateGrad]
+ 140210811943520 -> 140210811944000
+ 140210811943520 [label=TBackward0]
+ 140210811944192 -> 140210811943520
+ 140210811944192 [label=ToCopyBackward0]
+ 140210811944384 -> 140210811944192
+ 140202229023344 [label="encoder.layer.4.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229023344 -> 140210811944384
+ 140210811944384 [label=AccumulateGrad]
+ 140210811943424 -> 140210811943376
+ 140210811943424 [label=ReshapeAliasBackward0]
+ 140210811943760 -> 140210811943424
+ 140210811943760 [label=ExpandBackward0]
+ 140210811943952 -> 140210811943760
+ 140210811943952 [label=TransposeBackward0]
+ 140210811944432 -> 140210811943952
+ 140210811944432 [label=PermuteBackward0]
+ 140210811943568 -> 140210811944432
+ 140210811943568 [label=ViewBackward0]
+ 140210811957360 -> 140210811943568
+ 140210811957360 [label=ViewBackward0]
+ 140210811959280 -> 140210811957360
+ 140210811959280 [label=AddmmBackward0]
+ 140210811960336 -> 140210811959280
+ 140210811960336 [label=ToCopyBackward0]
+ 140210812094496 -> 140210811960336
+ 140202229022784 [label="encoder.layer.4.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202229022784 -> 140210812094496
+ 140210812094496 [label=AccumulateGrad]
+ 140210811957552 -> 140210811959280
+ 140210811957552 [label=ViewBackward0]
+ 140210812094832 -> 140210811957552
+ 140210812094832 [label=ToCopyBackward0]
+ 140210811941456 -> 140210812094832
+ 140210812094400 -> 140210811959280
+ 140210812094400 [label=TBackward0]
+ 140210812094448 -> 140210812094400
+ 140210812094448 [label=ToCopyBackward0]
+ 140210812094976 -> 140210812094448
+ 140202229023104 [label="encoder.layer.4.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229023104 -> 140210812094976
+ 140210812094976 [label=AccumulateGrad]
+ 140210811942512 -> 140210811942464
+ 140210811942512 [label=ReshapeAliasBackward0]
+ 140210811942848 -> 140210811942512
+ 140210811942848 [label=ExpandBackward0]
+ 140210811943040 -> 140210811942848
+ 140210811943040 [label=PermuteBackward0]
+ 140210811943232 -> 140210811943040
+ 140210811943232 [label=ViewBackward0]
+ 140210811942608 -> 140210811943232
+ 140210811942608 [label=ViewBackward0]
+ 140210811943856 -> 140210811942608
+ 140210811943856 [label=AddmmBackward0]
+ 140210811944144 -> 140210811943856
+ 140210811944144 [label=ToCopyBackward0]
+ 140210811959184 -> 140210811944144
+ 140202229014256 [label="encoder.layer.4.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202229014256 -> 140210811959184
+ 140210811959184 [label=AccumulateGrad]
+ 140210811942656 -> 140210811943856
+ 140210811942656 [label=ViewBackward0]
+ 140210812094736 -> 140210811942656
+ 140210812094736 [label=ToCopyBackward0]
+ 140210811941456 -> 140210812094736
+ 140210811958656 -> 140210811943856
+ 140210811958656 [label=TBackward0]
+ 140210812094640 -> 140210811958656
+ 140210812094640 [label=ToCopyBackward0]
+ 140210812094880 -> 140210812094640
+ 140202229022864 [label="encoder.layer.4.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229022864 -> 140210812094880
+ 140210812094880 [label=AccumulateGrad]
+ 140210811941552 -> 140210811941744
+ 140210811941552 [label=TBackward0]
+ 140210811942224 -> 140210811941552
+ 140210811942224 [label=ToCopyBackward0]
+ 140210811942416 -> 140210811942224
+ 140202229014336 [label="encoder.layer.4.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229014336 -> 140210811942416
+ 140210811942416 [label=AccumulateGrad]
+ 140210811941456 -> 140210811941312
+ 140210811941264 -> 140210811941216
+ 140202229014096 [label="encoder.layer.4.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229014096 -> 140210811941264
+ 140210811941264 [label=AccumulateGrad]
+ 140210811940976 -> 140210811941216
+ 140202229013776 [label="encoder.layer.4.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229013776 -> 140210811940976
+ 140210811940976 [label=AccumulateGrad]
+ 140210811927712 -> 140210811928192
+ 140210811927712 [label=TBackward0]
+ 140210811928432 -> 140210811927712
+ 140210811928432 [label=ToCopyBackward0]
+ 140210811941168 -> 140210811928432
+ 140202229013856 [label="encoder.layer.4.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229013856 -> 140210811941168
+ 140210811941168 [label=AccumulateGrad]
+ 140210811927616 -> 140210811927568
+ 140210811927616 [label=ReshapeAliasBackward0]
+ 140210811927952 -> 140210811927616
+ 140210811927952 [label=ExpandBackward0]
+ 140210811928144 -> 140210811927952
+ 140210811928144 [label=TransposeBackward0]
+ 140210811928528 -> 140210811928144
+ 140210811928528 [label=PermuteBackward0]
+ 140210811927760 -> 140210811928528
+ 140210811927760 [label=ViewBackward0]
+ 140210811940928 -> 140210811927760
+ 140210811940928 [label=ViewBackward0]
+ 140210811941696 -> 140210811940928
+ 140210811941696 [label=AddmmBackward0]
+ 140210811941936 -> 140210811941696
+ 140210811941936 [label=ToCopyBackward0]
+ 140210811942128 -> 140210811941936
+ 140202229013296 [label="encoder.layer.4.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202229013296 -> 140210811942128
+ 140210811942128 [label=AccumulateGrad]
+ 140210811941888 -> 140210811941696
+ 140210811941888 [label=ViewBackward0]
+ 140210811942944 -> 140210811941888
+ 140210811942944 [label=ToCopyBackward0]
+ 140210812052960 -> 140210811942944
+ 140210811941072 -> 140210811941696
+ 140210811941072 [label=TBackward0]
+ 140210811942752 -> 140210811941072
+ 140210811942752 [label=ToCopyBackward0]
+ 140210811943664 -> 140210811942752
+ 140202229013616 [label="encoder.layer.4.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202229013616 -> 140210811943664
+ 140210811943664 [label=AccumulateGrad]
+ 140210811926704 -> 140210811926656
+ 140210811926704 [label=ReshapeAliasBackward0]
+ 140210811927040 -> 140210811926704
+ 140210811927040 [label=ExpandBackward0]
+ 140210811927232 -> 140210811927040
+ 140210811927232 [label=PermuteBackward0]
+ 140210811927424 -> 140210811927232
+ 140210811927424 [label=ViewBackward0]
+ 140210811926800 -> 140210811927424
+ 140210811926800 [label=ViewBackward0]
+ 140210811928048 -> 140210811926800
+ 140210811928048 [label=AddmmBackward0]
+ 140210811958464 -> 140210811928048
+ 140210811958464 [label=ToCopyBackward0]
+ 140210811942320 -> 140210811958464
+ 140202229013056 [label="encoder.layer.4.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202229013056 -> 140210811942320
+ 140210811942320 [label=AccumulateGrad]
+ 140210811928336 -> 140210811928048
+ 140210811928336 [label=ViewBackward0]
+ 140210811943328 -> 140210811928336
+ 140210811943328 [label=ToCopyBackward0]
+ 140210812052960 -> 140210811943328
+ 140210811926848 -> 140210811928048
+ 140210811926848 [label=TBackward0]
+ 140210811941360 -> 140210811926848
+ 140210811941360 [label=ToCopyBackward0]
+ 140210811941600 -> 140210811941360
+ 140202229013376 [label="encoder.layer.4.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202229013376 -> 140210811941600
+ 140210811941600 [label=AccumulateGrad]
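+ /* Annotation: the crossattention key/value weights are (768, 1408) and
+    their input chains trace back to one shared node (140210812052960) in
+    every crossattention layer, i.e. keys and values are projected from fixed
+    1408-dim encoder states while queries remain (768, 768) projections of
+    the layer's own hidden states -- a Q-Former-style layout attending into a
+    frozen encoder. Crossattention appears only in layers 4, 6 and 8 of this
+    span. Inferred from shapes and topology, not from source code. */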
+ 140210811925744 -> 140210811925936
+ 140210811925744 [label=TBackward0]
+ 140210811926416 -> 140210811925744
+ 140210811926416 [label=ToCopyBackward0]
+ 140210811926608 -> 140210811926416
+ 140202229013136 [label="encoder.layer.4.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202229013136 -> 140210811926608
+ 140210811926608 [label=AccumulateGrad]
+ 140210811925648 -> 140210811925504
+ 140210811925456 -> 140210811925408
+ 140202229012896 [label="encoder.layer.4.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229012896 -> 140210811925456
+ 140210811925456 [label=AccumulateGrad]
+ 140210811925024 -> 140210811925408
+ 140202229012576 [label="encoder.layer.4.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229012576 -> 140210811925024
+ 140210811925024 [label=AccumulateGrad]
+ 140210811924592 -> 140210811924832
+ 140210811924592 [label=TBackward0]
+ 140210811925072 -> 140210811924592
+ 140210811925072 [label=ToCopyBackward0]
+ 140210811925552 -> 140210811925072
+ 140202229011216 [label="encoder.layer.4.experts.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202229011216 -> 140210811925552
+ 140210811925552 [label=AccumulateGrad]
+ 140202224193104 -> 140202224193296
+ 140202224193104 [label=TBackward0]
+ 140210811924784 -> 140202224193104
+ 140210811924784 [label=ToCopyBackward0]
+ 140210811925264 -> 140210811924784
+ 140202229010976 [label="encoder.layer.4.experts.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202229010976 -> 140210811925264
+ 140210811925264 [label=AccumulateGrad]
+ 140202224193008 -> 140202224192864
+ 140202224192816 -> 140202224192720
+ 140202229010736 [label="encoder.layer.4.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202229010736 -> 140202224192816
+ 140202224192816 [label=AccumulateGrad]
+ 140202224192768 -> 140202224192720
+ 140202229010496 [label="encoder.layer.4.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202229010496 -> 140202224192768
+ 140202224192768 [label=AccumulateGrad]
+ 140202224192432 -> 140202224191472
+ 140202224192432 [label=NativeLayerNormBackward0]
+ 140202224193152 -> 140202224192432
+ 140202224193152 [label=AddBackward0]
+ 140202224193440 -> 140202224193152
+ 140202224193440 [label=NativeDropoutBackward0]
+ 140210811924688 -> 140202224193440
+ 140210811924688 [label=ViewBackward0]
+ 140210811925216 -> 140210811924688
+ 140210811925216 [label=AddmmBackward0]
+ 140210811926080 -> 140210811925216
+ 140210811926080 [label=ToCopyBackward0]
+ 140210811926176 -> 140210811926080
+ 140202229012096 [label="encoder.layer.4.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202229012096 -> 140210811926176
+ 140210811926176 [label=AccumulateGrad]
+ 140210811925888 -> 140210811925216
+ 140210811925888 [label=ViewBackward0]
+ 140210811926320 -> 140210811925888
+ 140210811926320 [label=GeluBackward0]
+ 140210811927328 -> 140210811926320
+ 140210811927328 [label=ViewBackward0]
+ 140210811927856 -> 140210811927328
+ 140210811927856 [label=AddmmBackward0]
+ 140210811926944 -> 140210811927856
+ 140210811926944 [label=ToCopyBackward0]
+ 140210812095024 -> 140210811926944
+ 140202229012336 [label="encoder.layer.4.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202229012336 -> 140210812095024
+ 140210812095024 [label=AccumulateGrad]
+ 140210811943136 -> 140210811927856
+ 140210811943136 [label=ViewBackward0]
+ 140210812095120 -> 140210811943136
+ 140210812095120 [label=ToCopyBackward0]
+ 140210811924976 -> 140210812095120
+ 140210811924976 [label=SliceBackward0]
+ 140210812095264 -> 140210811924976
+ 140210812095264 [label=SliceBackward0]
+ 140210812095360 -> 140210812095264
+ 140210812095360 [label=SliceBackward0]
+ 140210811941216 -> 140210812095360
+ 140210811941408 -> 140210811927856
+ 140210811941408 [label=TBackward0]
+ 140210812094928 -> 140210811941408
+ 140210812094928 [label=ToCopyBackward0]
+ 140210812095456 -> 140210812094928
+ 140202229012656 [label="encoder.layer.4.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202229012656 -> 140210812095456
+ 140210812095456 [label=AccumulateGrad]
+ 140210811925792 -> 140210811925216
+ 140210811925792 [label=TBackward0]
+ 140210811927520 -> 140210811925792
+ 140210811927520 [label=ToCopyBackward0]
+ 140210811941984 -> 140210811927520
+ 140202229012416 [label="encoder.layer.4.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202229012416 -> 140210811941984
+ 140210811941984 [label=AccumulateGrad]
+ 140210811924976 -> 140202224193152
+ 140202224192960 -> 140202224192432
+ 140202229012176 [label="encoder.layer.4.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202229012176 -> 140202224192960
+ 140202224192960 [label=AccumulateGrad]
+ 140202224192912 -> 140202224192432
+ 140202229011856 [label="encoder.layer.4.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202229011856 -> 140202224192912
+ 140202224192912 [label=AccumulateGrad]
+ 140202224191712 -> 140202224192192
+ 140202224191712 [label=TBackward0]
+ 140202224192528 -> 140202224191712
+ 140202224192528 [label=ToCopyBackward0]
+ 140202224193248 -> 140202224192528
+ 140202228989840 [label="encoder.layer.5.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228989840 -> 140202224193248
+ 140202224193248 [label=AccumulateGrad]
+ 140202224189600 -> 140202224189552
+ 140202224189600 [label=ReshapeAliasBackward0]
+ 140202224191952 -> 140202224189600
+ 140202224191952 [label=ExpandBackward0]
+ 140202224192144 -> 140202224191952
+ 140202224192144 [label=TransposeBackward0]
+ 140202224192672 -> 140202224192144
+ 140202224192672 [label=PermuteBackward0]
+ 140202224192624 -> 140202224192672
+ 140202224192624 [label=ViewBackward0]
+ 140210811924544 -> 140202224192624
+ 140210811924544 [label=ViewBackward0]
+ 140210811926512 -> 140210811924544
+ 140210811926512 [label=AddmmBackward0]
+ 140210811927136 -> 140210811926512
+ 140210811927136 [label=ToCopyBackward0]
+ 140210812095072 -> 140210811927136
+ 140202228989360 [label="encoder.layer.5.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202228989360 -> 140210812095072
+ 140210812095072 [label=AccumulateGrad]
+ 140210811925600 -> 140210811926512
+ 140210811925600 [label=ViewBackward0]
+ 140210812095504 -> 140210811925600
+ 140210812095504 [label=ToCopyBackward0]
+ 140202224191472 -> 140210812095504
+ 140210812095168 -> 140210811926512
+ 140210812095168 [label=TBackward0]
+ 140210812095216 -> 140210812095168
+ 140210812095216 [label=ToCopyBackward0]
+ 140210812095648 -> 140210812095216
+ 140202228989680 [label="encoder.layer.5.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228989680 -> 140210812095648
+ 140210812095648 [label=AccumulateGrad]
+ 140202224190416 -> 140202224190560
+ 140202224190416 [label=ReshapeAliasBackward0]
+ 140202224190176 -> 140202224190416
+ 140202224190176 [label=ExpandBackward0]
+ 140202224189984 -> 140202224190176
+ 140202224189984 [label=PermuteBackward0]
+ 140202224189792 -> 140202224189984
+ 140202224189792 [label=ViewBackward0]
+ 140202224190320 -> 140202224189792
+ 140202224190320 [label=ViewBackward0]
+ 140202224192048 -> 140202224190320
+ 140202224192048 [label=AddmmBackward0]
+ 140202224191760 -> 140202224192048
+ 140202224191760 [label=ToCopyBackward0]
+ 140210811926128 -> 140202224191760
+ 140202228989120 [label="encoder.layer.5.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202228989120 -> 140210811926128
+ 140210811926128 [label=AccumulateGrad]
+ 140202224192336 -> 140202224192048
+ 140202224192336 [label=ViewBackward0]
+ 140210812095408 -> 140202224192336
+ 140210812095408 [label=ToCopyBackward0]
+ 140202224191472 -> 140210812095408
+ 140202224190368 -> 140202224192048
+ 140202224190368 [label=TBackward0]
+ 140210812095312 -> 140202224190368
+ 140210812095312 [label=ToCopyBackward0]
+ 140210812095552 -> 140210812095312
+ 140202228989440 [label="encoder.layer.5.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228989440 -> 140210812095552
+ 140210812095552 [label=AccumulateGrad]
+ 140202224191376 -> 140202224191184
+ 140202224191376 [label=TBackward0]
+ 140202224190704 -> 140202224191376
+ 140202224190704 [label=ToCopyBackward0]
+ 140202224190512 -> 140202224190704
+ 140202228989200 [label="encoder.layer.5.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228989200 -> 140202224190512
+ 140202224190512 [label=AccumulateGrad]
+ 140202224191472 -> 140202222987584
+ 140202222987152 -> 140202222988352
+ 140202228988960 [label="encoder.layer.5.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228988960 -> 140202222987152
+ 140202222987152 [label=AccumulateGrad]
+ 140202222987104 -> 140202222988352
+ 140202228988640 [label="encoder.layer.5.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228988640 -> 140202222987104
+ 140202222987104 [label=AccumulateGrad]
+ 140202222986192 -> 140202222986672
+ 140202222986192 [label=TBackward0]
+ 140202222987200 -> 140202222986192
+ 140202222987200 [label=ToCopyBackward0]
+ 140202222988544 -> 140202222987200
+ 140202228987280 [label="encoder.layer.5.experts.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228987280 -> 140202222988544
+ 140202222988544 [label=AccumulateGrad]
+ 140202222985760 -> 140202222986048
+ 140202222985760 [label=TBackward0]
+ 140202222986720 -> 140202222985760
+ 140202222986720 [label=ToCopyBackward0]
+ 140202222987488 -> 140202222986720
+ 140202228987040 [label="encoder.layer.5.experts.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228987040 -> 140202222987488
+ 140202222987488 [label=AccumulateGrad]
+ 140202222985568 -> 140202222985280
+ 140202222988592 -> 140202222988736
+ 140202228986800 [label="encoder.layer.5.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202228986800 -> 140202222988592
+ 140202222988592 [label=AccumulateGrad]
+ 140202222987392 -> 140202222988736
+ 140202228986480 [label="encoder.layer.5.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202228986480 -> 140202222987392
+ 140202222987392 [label=AccumulateGrad]
+ 140202222988160 -> 140202222935248
+ 140202222988160 [label=NativeLayerNormBackward0]
+ 140202222985664 -> 140202222988160
+ 140202222985664 [label=AddBackward0]
+ 140202222987008 -> 140202222985664
+ 140202222987008 [label=NativeDropoutBackward0]
+ 140202222986528 -> 140202222987008
+ 140202222986528 [label=ViewBackward0]
+ 140202222988448 -> 140202222986528
+ 140202222988448 [label=AddmmBackward0]
+ 140202222988256 -> 140202222988448
+ 140202222988256 [label=ToCopyBackward0]
+ 140202224190992 -> 140202222988256
+ 140202228988160 [label="encoder.layer.5.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228988160 -> 140202224190992
+ 140202224190992 [label=AccumulateGrad]
+ 140202224191616 -> 140202222988448
+ 140202224191616 [label=ViewBackward0]
+ 140202224190608 -> 140202224191616
+ 140202224190608 [label=GeluBackward0]
+ 140202224191040 -> 140202224190608
+ 140202224191040 [label=ViewBackward0]
+ 140202224190080 -> 140202224191040
+ 140202224190080 [label=AddmmBackward0]
+ 140202224189696 -> 140202224190080
+ 140202224189696 [label=ToCopyBackward0]
+ 140210811925360 -> 140202224189696
+ 140202228988400 [label="encoder.layer.5.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202228988400 -> 140210811925360
+ 140210811925360 [label=AccumulateGrad]
+ 140202224189888 -> 140202224190080
+ 140202224189888 [label=ViewBackward0]
+ 140210812095840 -> 140202224189888
+ 140210812095840 [label=ToCopyBackward0]
+ 140202222986336 -> 140210812095840
+ 140202222986336 [label=SliceBackward0]
+ 140210812095888 -> 140202222986336
+ 140210812095888 [label=SliceBackward0]
+ 140210812095984 -> 140210812095888
+ 140210812095984 [label=SliceBackward0]
+ 140202222988352 -> 140210812095984
+ 140202224191136 -> 140202224190080
+ 140202224191136 [label=TBackward0]
+ 140210812095600 -> 140202224191136
+ 140210812095600 [label=ToCopyBackward0]
+ 140210812096080 -> 140210812095600
+ 140202228988720 [label="encoder.layer.5.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228988720 -> 140210812096080
+ 140210812096080 [label=AccumulateGrad]
+ 140202224191568 -> 140202222988448
+ 140202224191568 [label=TBackward0]
+ 140202224190800 -> 140202224191568
+ 140202224190800 [label=ToCopyBackward0]
+ 140202224191856 -> 140202224190800
+ 140202228988480 [label="encoder.layer.5.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228988480 -> 140202224191856
+ 140202224191856 [label=AccumulateGrad]
+ 140202222986336 -> 140202222985664
+ 140202222985376 -> 140202222988160
+ 140202228988240 [label="encoder.layer.5.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228988240 -> 140202222985376
+ 140202222985376 [label=AccumulateGrad]
+ 140202222985328 -> 140202222988160
+ 140202228987920 [label="encoder.layer.5.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228987920 -> 140202222985328
+ 140202222985328 [label=AccumulateGrad]
+ 140202222963584 -> 140202222964352
+ 140202222963584 [label=TBackward0]
+ 140202222988928 -> 140202222963584
+ 140202222988928 [label=ToCopyBackward0]
+ 140202222986144 -> 140202222988928
+ 140202228986560 [label="encoder.layer.6.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228986560 -> 140202222986144
+ 140202222986144 [label=AccumulateGrad]
+ 140202222963392 -> 140202222963200
+ 140202222963392 [label=ReshapeAliasBackward0]
+ 140202222963728 -> 140202222963392
+ 140202222963728 [label=ExpandBackward0]
+ 140202222964160 -> 140202222963728
+ 140202222964160 [label=TransposeBackward0]
+ 140202222964448 -> 140202222964160
+ 140202222964448 [label=PermuteBackward0]
+ 140202222989120 -> 140202222964448
+ 140202222989120 [label=ViewBackward0]
+ 140202222988112 -> 140202222989120
+ 140202222988112 [label=ViewBackward0]
+ 140202222988640 -> 140202222988112
+ 140202222988640 [label=AddmmBackward0]
+ 140202224190272 -> 140202222988640
+ 140202224190272 [label=ToCopyBackward0]
+ 140210812095792 -> 140202224190272
+ 140202228986000 [label="encoder.layer.6.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202228986000 -> 140210812095792
+ 140210812095792 [label=AccumulateGrad]
+ 140202224191424 -> 140202222988640
+ 140202224191424 [label=ViewBackward0]
+ 140210812096128 -> 140202224191424
+ 140210812096128 [label=ToCopyBackward0]
+ 140202222935248 -> 140210812096128
+ 140210812095696 -> 140202222988640
+ 140210812095696 [label=TBackward0]
+ 140210812095744 -> 140210812095696
+ 140210812095744 [label=ToCopyBackward0]
+ 140210812096272 -> 140210812095744
+ 140202228986320 [label="encoder.layer.6.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228986320 -> 140210812096272
+ 140210812096272 [label=AccumulateGrad]
+ 140202222961760 -> 140202222961856
+ 140202222961760 [label=ReshapeAliasBackward0]
+ 140202222962432 -> 140202222961760
+ 140202222962432 [label=ExpandBackward0]
+ 140202222962816 -> 140202222962432
+ 140202222962816 [label=PermuteBackward0]
+ 140202222963104 -> 140202222962816
+ 140202222963104 [label=ViewBackward0]
+ 140202222961808 -> 140202222963104
+ 140202222961808 [label=ViewBackward0]
+ 140202222963968 -> 140202222961808
+ 140202222963968 [label=AddmmBackward0]
+ 140202222963488 -> 140202222963968
+ 140202222963488 [label=ToCopyBackward0]
+ 140202224191328 -> 140202222963488
+ 140202228985664 [label="encoder.layer.6.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202228985664 -> 140202224191328
+ 140202224191328 [label=AccumulateGrad]
+ 140202222962144 -> 140202222963968
+ 140202222962144 [label=ViewBackward0]
+ 140210812096032 -> 140202222962144
+ 140210812096032 [label=ToCopyBackward0]
+ 140202222935248 -> 140210812096032
+ 140202222985712 -> 140202222963968
+ 140202222985712 [label=TBackward0]
+ 140210812095936 -> 140202222985712
+ 140210812095936 [label=ToCopyBackward0]
+ 140210812096176 -> 140210812095936
+ 140202228986080 [label="encoder.layer.6.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228986080 -> 140210812096176
+ 140210812096176 [label=AccumulateGrad]
+ 140202222960704 -> 140202222935728
+ 140202222960704 [label=TBackward0]
+ 140202222961280 -> 140202222960704
+ 140202222961280 [label=ToCopyBackward0]
+ 140202222961568 -> 140202222961280
+ 140202228985744 [label="encoder.layer.6.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228985744 -> 140202222961568
+ 140202222961568 [label=AccumulateGrad]
+ 140202222935248 -> 140202222935296
+ 140202222935008 -> 140202222935104
+ 140202228985504 [label="encoder.layer.6.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228985504 -> 140202222935008
+ 140202222935008 [label=AccumulateGrad]
+ 140202222934336 -> 140202222935104
+ 140202228985184 [label="encoder.layer.6.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228985184 -> 140202222934336
+ 140202222934336 [label=AccumulateGrad]
+ 140202222933184 -> 140202222933952
+ 140202222933184 [label=TBackward0]
+ 140202222934240 -> 140202222933184
+ 140202222934240 [label=ToCopyBackward0]
+ 140202222934768 -> 140202222934240
+ 140202228985264 [label="encoder.layer.6.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228985264 -> 140202222934768
+ 140202222934768 [label=AccumulateGrad]
+ 140202222932992 -> 140202222932800
+ 140202222932992 [label=ReshapeAliasBackward0]
+ 140202222933328 -> 140202222932992
+ 140202222933328 [label=ExpandBackward0]
+ 140202222933760 -> 140202222933328
+ 140202222933760 [label=TransposeBackward0]
+ 140202222934528 -> 140202222933760
+ 140202222934528 [label=PermuteBackward0]
+ 140202222935392 -> 140202222934528
+ 140202222935392 [label=ViewBackward0]
+ 140202222934624 -> 140202222935392
+ 140202222934624 [label=ViewBackward0]
+ 140202222935872 -> 140202222934624
+ 140202222935872 [label=AddmmBackward0]
+ 140202222933088 -> 140202222935872
+ 140202222933088 [label=ToCopyBackward0]
+ 140202222961088 -> 140202222933088
+ 140202228984704 [label="encoder.layer.6.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202228984704 -> 140202222961088
+ 140202222961088 [label=AccumulateGrad]
+ 140202222960800 -> 140202222935872
+ 140202222960800 [label=ViewBackward0]
+ 140202222962624 -> 140202222960800
+ 140202222962624 [label=ToCopyBackward0]
+ 140210812052960 -> 140202222962624
+ 140202222960896 -> 140202222935872
+ 140202222960896 [label=TBackward0]
+ 140202222962336 -> 140202222960896
+ 140202222962336 [label=ToCopyBackward0]
+ 140202222963680 -> 140202222962336
+ 140202228985024 [label="encoder.layer.6.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202228985024 -> 140202222963680
+ 140202222963680 [label=AccumulateGrad]
+ 140202222906720 -> 140202222906816
+ 140202222906720 [label=ReshapeAliasBackward0]
+ 140202222906768 -> 140202222906720
+ 140202222906768 [label=ExpandBackward0]
+ 140202222907104 -> 140202222906768
+ 140202222907104 [label=PermuteBackward0]
+ 140202222932704 -> 140202222907104
+ 140202222932704 [label=ViewBackward0]
+ 140202222932032 -> 140202222932704
+ 140202222932032 [label=ViewBackward0]
+ 140202222933568 -> 140202222932032
+ 140202222933568 [label=AddmmBackward0]
+ 140202222934720 -> 140202222933568
+ 140202222934720 [label=ToCopyBackward0]
+ 140202222987776 -> 140202222934720
+ 140202228984464 [label="encoder.layer.6.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202228984464 -> 140202222987776
+ 140202222987776 [label=AccumulateGrad]
+ 140202222934048 -> 140202222933568
+ 140202222934048 [label=ViewBackward0]
+ 140202222935776 -> 140202222934048
+ 140202222935776 [label=ToCopyBackward0]
+ 140210812052960 -> 140202222935776
+ 140202222932224 -> 140202222933568
+ 140202222932224 [label=TBackward0]
+ 140202222961328 -> 140202222932224
+ 140202222961328 [label=ToCopyBackward0]
+ 140202222960992 -> 140202222961328
+ 140202228984784 [label="encoder.layer.6.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202228984784 -> 140202222960992
+ 140202222960992 [label=AccumulateGrad]
+ 140202222905088 -> 140202222905328
+ 140202222905088 [label=TBackward0]
+ 140202222906240 -> 140202222905088
+ 140202222906240 [label=ToCopyBackward0]
+ 140202222906528 -> 140202222906240
+ 140202228984544 [label="encoder.layer.6.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228984544 -> 140202222906528
+ 140202222906528 [label=AccumulateGrad]
+ 140202222904848 -> 140202222904896
+ 140202222904704 -> 140202222904512
+ 140202228984304 [label="encoder.layer.6.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228984304 -> 140202222904704
+ 140202222904704 [label=AccumulateGrad]
+ 140202222903936 -> 140202222904512
+ 140202228983984 [label="encoder.layer.6.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228983984 -> 140202222903936
+ 140202222903936 [label=AccumulateGrad]
+ 140202222903408 -> 140202222903552
+ 140202222903408 [label=TBackward0]
+ 140202222904128 -> 140202222903408
+ 140202222904128 [label=ToCopyBackward0]
+ 140202222904800 -> 140202222904128
+ 140202228968480 [label="encoder.layer.6.experts.experts.0.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228968480 -> 140202222904800
+ 140202222904800 [label=AccumulateGrad]
+ 140202222873664 -> 140202222874144
+ 140202222873664 [label=TBackward0]
+ 140202222874336 -> 140202222873664
+ 140202222874336 [label=ToCopyBackward0]
+ 140202222904368 -> 140202222874336
+ 140202228968560 [label="encoder.layer.6.experts.experts.0.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228968560 -> 140202222904368
+ 140202222904368 [label=AccumulateGrad]
+ 140202222873280 -> 140202222873232
+ 140202222873280 [label=UnsqueezeBackward0]
+ 140202222873856 -> 140202222873280
+ 140202222873856 [label=NativeDropoutBackward0]
+ 140202222874240 -> 140202222873856
+ 140202222874240 [label=ViewBackward0]
+ 140202222905376 -> 140202222874240
+ 140202222905376 [label=AddmmBackward0]
+ 140202222903360 -> 140202222905376
+ 140202222903360 [label=ToCopyBackward0]
+ 140202222905856 -> 140202222903360
+ 140202228968240 [label="encoder.layer.6.experts.experts.1.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228968240 -> 140202222905856
+ 140202222905856 [label=AccumulateGrad]
+ 140202222904608 -> 140202222905376
+ 140202222904608 [label=ViewBackward0]
+ 140202222905760 -> 140202222904608
+ 140202222905760 [label=GeluBackward0]
+ 140202222907296 -> 140202222905760
+ 140202222907296 [label=ViewBackward0]
+ 140202222906048 -> 140202222907296
+ 140202222906048 [label=AddmmBackward0]
+ 140202222905472 -> 140202222906048
+ 140202222905472 [label=ToCopyBackward0]
+ 140202222935200 -> 140202222905472
+ 140202228969040 [label="encoder.layer.6.experts.experts.1.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228969040 -> 140202222935200
+ 140202222935200 [label=AccumulateGrad]
+ 140202222932512 -> 140202222906048
+ 140202222932512 [label=ViewBackward0]
+ 140202222962912 -> 140202222932512
+ 140202222962912 [label=ToCopyBackward0]
+ 140202222872416 -> 140202222962912
+ 140202222932416 -> 140202222906048
+ 140202222932416 [label=TBackward0]
+ 140202222933280 -> 140202222932416
+ 140202222933280 [label=ToCopyBackward0]
+ 140210812096320 -> 140202222933280
+ 140202228968320 [label="encoder.layer.6.experts.experts.1.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228968320 -> 140210812096320
+ 140210812096320 [label=AccumulateGrad]
+ 140202222903648 -> 140202222905376
+ 140202222903648 [label=TBackward0]
+ 140202222905952 -> 140202222903648
+ 140202222905952 [label=ToCopyBackward0]
+ 140202222963296 -> 140202222905952
+ 140202228968080 [label="encoder.layer.6.experts.experts.1.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228968080 -> 140202222963296
+ 140202222963296 [label=AccumulateGrad]
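+ /* Annotation: each expert branch of layer 6 ends in UnsqueezeBackward0 over
+    Dropout(dense2(GELU(dense1(h)))) -- see the two UnsqueezeBackward0 chains
+    feeding node 140202222873232 -- so the per-expert outputs are stacked
+    along a new dimension before being combined by the gate traced below. */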
+ 140202222873184 -> 140202222873232
+ 140202222873184 [label=UnsqueezeBackward0]
+ 140202222932896 -> 140202222873184
+ 140202222932896 [label=NativeDropoutBackward0]
+ 140202222873760 -> 140202222932896
+ 140202222873760 [label=ViewBackward0]
+ 140202222906288 -> 140202222873760
+ 140202222906288 [label=AddmmBackward0]
+ 140202222903888 -> 140202222906288
+ 140202222903888 [label=ToCopyBackward0]
+ 140210812096224 -> 140202222903888
+ 140202228967760 [label="encoder.layer.6.experts.experts.2.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228967760 -> 140210812096224
+ 140210812096224 [label=AccumulateGrad]
+ 140210812096464 -> 140202222906288
+ 140210812096464 [label=ViewBackward0]
+ 140210811723936 -> 140210812096464
+ 140210811723936 [label=GeluBackward0]
+ 140210811724032 -> 140210811723936
+ 140210811724032 [label=ViewBackward0]
+ 140210811724128 -> 140210811724032
+ 140210811724128 [label=AddmmBackward0]
+ 140210811724224 -> 140210811724128
+ 140210811724224 [label=ToCopyBackward0]
+ 140210811724416 -> 140210811724224
+ 140202228968000 [label="encoder.layer.6.experts.experts.2.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228968000 -> 140210811724416
+ 140210811724416 [label=AccumulateGrad]
+ 140210811724176 -> 140210811724128
+ 140210811724176 [label=ViewBackward0]
+ 140210811724464 -> 140210811724176
+ 140210811724464 [label=ToCopyBackward0]
+ 140202222872416 -> 140210811724464
+ 140210811723888 -> 140210811724128
+ 140210811723888 [label=TBackward0]
+ 140210811724320 -> 140210811723888
+ 140210811724320 [label=ToCopyBackward0]
+ 140210811724608 -> 140210811724320
+ 140202228967840 [label="encoder.layer.6.experts.experts.2.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228967840 -> 140210811724608
+ 140210811724608 [label=AccumulateGrad]
+ 140210812096368 -> 140202222906288
+ 140210812096368 [label=TBackward0]
+ 140210811724080 -> 140210812096368
+ 140210811724080 [label=ToCopyBackward0]
+ 140210811724560 -> 140210811724080
+ 140202228967600 [label="encoder.layer.6.experts.experts.2.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228967600 -> 140210811724560
+ 140210811724560 [label=AccumulateGrad]
+ 140202222872992 -> 140202222872800
+ 140202222872992 [label=UnsqueezeBackward0]
+ 140202222873568 -> 140202222872992
+ 140202222873568 [label=UnsqueezeBackward0]
+ 140202222904224 -> 140202222873568
+ 140202222904224 [label=SumBackward1]
+ 140210812096416 -> 140202222904224
+ 140210812096416 [label=MulBackward0]
+ 140210811724704 -> 140210812096416
+ 140210811724704 [label=UnsqueezeBackward0]
+ 140210811723984 -> 140210811724704
+ 140210811723984 [label=TopkBackward0]
+ 140210811724512 -> 140210811723984
+ 140210811724512 [label=SoftmaxBackward0]
+ 140210811724800 -> 140210811724512
+ 140210811724800 [label=MmBackward0]
+ 140210811724896 -> 140210811724800
+ 140210811724896 [label=ToCopyBackward0]
+ 140210811725040 -> 140210811724896
+ 140210811725040 [label=MeanBackward1]
+ 140210811725136 -> 140210811725040
+ 140210811725136 [label=MulBackward0]
+ 140202222872416 -> 140210811725136
+ 140210811724848 -> 140210811724800
+ 140210811724848 [label=TBackward0]
+ 140210811725232 -> 140210811724848
+ 140210811725232 [label=ToCopyBackward0]
+ 140210811724944 -> 140210811725232
+ 140202228981824 [label="encoder.layer.6.experts.gate.weight
+ (3, 768)" fillcolor=lightblue]
+ 140202228981824 -> 140210811724944
+ 140210811724944 [label=AccumulateGrad]
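+ /* Annotation: the chain just above is the router's backward trace. Read
+    with edges pointing input -> consumer: hidden states go through
+    MulBackward0 and MeanBackward1 (a pooled summary, possibly mask-weighted),
+    are cast, matmul'd against the (3, 768) experts.gate weight (MmBackward0),
+    softmaxed and top-k'd; the resulting probabilities then weight the
+    stacked expert outputs via MulBackward0 -> SumBackward1. Using the raw
+    softmax probabilities as mixture weights matches the "RawProb" tag in the
+    file name. Interpretation inferred from op types only. */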
+ 140202222872416 -> 140202222872272
+ 140202222872128 -> 140202222871936
+ 140202228982144 [label="encoder.layer.6.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202228982144 -> 140202222872128
+ 140202222872128 [label=AccumulateGrad]
+ 140202222872224 -> 140202222871936
+ 140202228981904 [label="encoder.layer.6.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202228981904 -> 140202222872224
+ 140202222872224 [label=AccumulateGrad]
+ 140202222871744 -> 140202222842352
+ 140202222871744 [label=NativeLayerNormBackward0]
+ 140202222904992 -> 140202222871744
+ 140202222904992 [label=AddBackward0]
+ 140202222873088 -> 140202222904992
+ 140202222873088 [label=NativeDropoutBackward0]
+ 140210811724656 -> 140202222873088
+ 140210811724656 [label=ViewBackward0]
+ 140210811723840 -> 140210811724656
+ 140210811723840 [label=AddmmBackward0]
+ 140210811725184 -> 140210811723840
+ 140210811725184 [label=ToCopyBackward0]
+ 140210811725376 -> 140210811725184
+ 140202228983504 [label="encoder.layer.6.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228983504 -> 140210811725376
+ 140210811725376 [label=AccumulateGrad]
+ 140210811725088 -> 140210811723840
+ 140210811725088 [label=ViewBackward0]
+ 140210811725424 -> 140210811725088
+ 140210811725424 [label=GeluBackward0]
+ 140210811725520 -> 140210811725424
+ 140210811725520 [label=ViewBackward0]
+ 140210811725616 -> 140210811725520
+ 140210811725616 [label=AddmmBackward0]
+ 140210811725712 -> 140210811725616
+ 140210811725712 [label=ToCopyBackward0]
+ 140210811725904 -> 140210811725712
+ 140202228983744 [label="encoder.layer.6.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202228983744 -> 140210811725904
+ 140210811725904 [label=AccumulateGrad]
+ 140210811725664 -> 140210811725616
+ 140210811725664 [label=ViewBackward0]
+ 140210811725952 -> 140210811725664
+ 140210811725952 [label=ToCopyBackward0]
+ 140202222873376 -> 140210811725952
+ 140202222873376 [label=SliceBackward0]
+ 140210811726096 -> 140202222873376
+ 140210811726096 [label=SliceBackward0]
+ 140210811726192 -> 140210811726096
+ 140210811726192 [label=SliceBackward0]
+ 140202222935104 -> 140210811726192
+ 140210811724992 -> 140210811725616
+ 140210811724992 [label=TBackward0]
+ 140210811725856 -> 140210811724992
+ 140210811725856 [label=ToCopyBackward0]
+ 140210811726288 -> 140210811725856
+ 140202228984064 [label="encoder.layer.6.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228984064 -> 140210811726288
+ 140210811726288 [label=AccumulateGrad]
+ 140210811724272 -> 140210811723840
+ 140210811724272 [label=TBackward0]
+ 140210811725568 -> 140210811724272
+ 140210811725568 [label=ToCopyBackward0]
+ 140210811726048 -> 140210811725568
+ 140202228983824 [label="encoder.layer.6.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228983824 -> 140210811726048
+ 140210811726048 [label=AccumulateGrad]
+ 140202222873376 -> 140202222904992
+ 140202222872512 -> 140202222871744
+ 140202228983584 [label="encoder.layer.6.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228983584 -> 140202222872512
+ 140202222872512 [label=AccumulateGrad]
+ 140202222872320 -> 140202222871744
+ 140202228983264 [label="encoder.layer.6.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228983264 -> 140202222872320
+ 140202222872320 [label=AccumulateGrad]
+ 140202222870592 -> 140202222871168
+ 140202222870592 [label=TBackward0]
+ 140202222871456 -> 140202222870592
+ 140202222871456 [label=ToCopyBackward0]
+ 140202222872608 -> 140202222871456
+ 140202228982384 [label="encoder.layer.7.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228982384 -> 140202222872608
+ 140202222872608 [label=AccumulateGrad]
+ 140202222845568 -> 140202222845664
+ 140202222845568 [label=ReshapeAliasBackward0]
+ 140202222845760 -> 140202222845568
+ 140202222845760 [label=ExpandBackward0]
+ 140202222871264 -> 140202222845760
+ 140202222871264 [label=TransposeBackward0]
+ 140202222872032 -> 140202222871264
+ 140202222872032 [label=PermuteBackward0]
+ 140202222871840 -> 140202222872032
+ 140202222871840 [label=ViewBackward0]
+ 140202222870784 -> 140202222871840
+ 140202222870784 [label=ViewBackward0]
+ 140210811725280 -> 140202222870784
+ 140210811725280 [label=AddmmBackward0]
+ 140210811725808 -> 140210811725280
+ 140210811725808 [label=ToCopyBackward0]
+ 140210811726000 -> 140210811725808
+ 140203184706720 [label="encoder.layer.7.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140203184706720 -> 140210811726000
+ 140210811726000 [label=AccumulateGrad]
+ 140210811725760 -> 140210811725280
+ 140210811725760 [label=ViewBackward0]
+ 140210811726336 -> 140210811725760
+ 140210811726336 [label=ToCopyBackward0]
+ 140202222842352 -> 140210811726336
+ 140210811724752 -> 140210811725280
+ 140210811724752 [label=TBackward0]
+ 140210811725472 -> 140210811724752
+ 140210811725472 [label=ToCopyBackward0]
+ 140210811726480 -> 140210811725472
+ 140202228982624 [label="encoder.layer.7.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228982624 -> 140210811726480
+ 140210811726480 [label=AccumulateGrad]
+ 140202222844224 -> 140202222843936
+ 140202222844224 [label=ReshapeAliasBackward0]
+ 140202222844608 -> 140202222844224
+ 140202222844608 [label=ExpandBackward0]
+ 140202222844896 -> 140202222844608
+ 140202222844896 [label=PermuteBackward0]
+ 140202222845280 -> 140202222844896
+ 140202222845280 [label=ViewBackward0]
+ 140202222844272 -> 140202222845280
+ 140202222844272 [label=ViewBackward0]
+ 140202222844320 -> 140202222844272
+ 140202222844320 [label=AddmmBackward0]
+ 140202222872752 -> 140202222844320
+ 140202222872752 [label=ToCopyBackward0]
+ 140210811726432 -> 140202222872752
+ 140202228969200 [label="encoder.layer.7.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202228969200 -> 140210811726432
+ 140210811726432 [label=AccumulateGrad]
+ 140202222871552 -> 140202222844320
+ 140202222871552 [label=ViewBackward0]
+ 140210811726240 -> 140202222871552
+ 140210811726240 [label=ToCopyBackward0]
+ 140202222842352 -> 140210811726240
+ 140202222870832 -> 140202222844320
+ 140202222870832 [label=TBackward0]
+ 140210811725328 -> 140202222870832
+ 140210811725328 [label=ToCopyBackward0]
+ 140210811726384 -> 140210811725328
+ 140202228969280 [label="encoder.layer.7.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228969280 -> 140210811726384
+ 140210811726384 [label=AccumulateGrad]
+ 140202222842592 -> 140202222842832
+ 140202222842592 [label=TBackward0]
+ 140202222843744 -> 140202222842592
+ 140202222843744 [label=ToCopyBackward0]
+ 140202222844032 -> 140202222843744
+ 140202228968960 [label="encoder.layer.7.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228968960 -> 140202222844032
+ 140202222844032 [label=AccumulateGrad]
+ 140202222842352 -> 140202222841968
+ 140202222842112 -> 140202222820224
+ 140202228967520 [label="encoder.layer.7.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228967520 -> 140202222842112
+ 140202222842112 [label=AccumulateGrad]
+ 140202222842016 -> 140202222820224
+ 140202228967280 [label="encoder.layer.7.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228967280 -> 140202222842016
+ 140202222842016 [label=AccumulateGrad]
+ 140202222819456 -> 140202222819936
+ 140202222819456 [label=TBackward0]
+ 140202222820368 -> 140202222819456
+ 140202222820368 [label=ToCopyBackward0]
+ 140202222821088 -> 140202222820368
+ 140202228965600 [label="encoder.layer.7.experts.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228965600 -> 140202222821088
+ 140202222821088 [label=AccumulateGrad]
+ 140202222818880 -> 140202222819168
+ 140202222818880 [label=TBackward0]
+ 140202222819888 -> 140202222818880
+ 140202222819888 [label=ToCopyBackward0]
+ 140202222820800 -> 140202222819888
+ 140202228965440 [label="encoder.layer.7.experts.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228965440 -> 140202222820800
+ 140202222820800 [label=AccumulateGrad]
+ 140202222818688 -> 140202222818304
+ 140202222818400 -> 140202222818208
+ 140202228952736 [label="encoder.layer.7.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202228952736 -> 140202222818400
+ 140202222818400 [label=AccumulateGrad]
+ 140202222818112 -> 140202222818208
+ 140202228952816 [label="encoder.layer.7.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202228952816 -> 140202222818112
+ 140202222818112 [label=AccumulateGrad]
+ 140202222817632 -> 140202223288032
+ 140202222817632 [label=NativeLayerNormBackward0]
+ 140202222818784 -> 140202222817632
+ 140202222818784 [label=AddBackward0]
+ 140202222820320 -> 140202222818784
+ 140202222820320 [label=NativeDropoutBackward0]
+ 140202222819840 -> 140202222820320
+ 140202222819840 [label=ViewBackward0]
+ 140202222820512 -> 140202222819840
+ 140202222820512 [label=AddmmBackward0]
+ 140202222842208 -> 140202222820512
+ 140202222842208 [label=ToCopyBackward0]
+ 140202222843264 -> 140202222842208
+ 140202228966880 [label="encoder.layer.7.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228966880 -> 140202222843264
+ 140202222843264 [label=AccumulateGrad]
+ 140202222842304 -> 140202222820512
+ 140202222842304 [label=ViewBackward0]
+ 140202222843792 -> 140202222842304
+ 140202222843792 [label=GeluBackward0]
+ 140202222843168 -> 140202222843792
+ 140202222843168 [label=ViewBackward0]
+ 140202222844800 -> 140202222843168
+ 140202222844800 [label=AddmmBackward0]
+ 140202222845376 -> 140202222844800
+ 140202222845376 [label=ToCopyBackward0]
+ 140210811726144 -> 140202222845376
+ 140202228967120 [label="encoder.layer.7.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202228967120 -> 140210811726144
+ 140210811726144 [label=AccumulateGrad]
+ 140202222845088 -> 140202222844800
+ 140202222845088 [label=ViewBackward0]
+ 140210811726672 -> 140202222845088
+ 140210811726672 [label=ToCopyBackward0]
+ 140202222819408 -> 140210811726672
+ 140202222819408 [label=SliceBackward0]
+ 140210811726720 -> 140202222819408
+ 140210811726720 [label=SliceBackward0]
+ 140210811726816 -> 140210811726720
+ 140210811726816 [label=SliceBackward0]
+ 140202222820224 -> 140210811726816
+ 140202222842976 -> 140202222844800
+ 140202222842976 [label=TBackward0]
+ 140210811726528 -> 140202222842976
+ 140210811726528 [label=ToCopyBackward0]
+ 140210811726912 -> 140210811726528
+ 140202228967360 [label="encoder.layer.7.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228967360 -> 140210811726912
+ 140210811726912 [label=AccumulateGrad]
+ 140202222841920 -> 140202222820512
+ 140202222841920 [label=TBackward0]
+ 140202222843552 -> 140202222841920
+ 140202222843552 [label=ToCopyBackward0]
+ 140202222871072 -> 140202222843552
+ 140202228966800 [label="encoder.layer.7.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228966800 -> 140202222871072
+ 140202222871072 [label=AccumulateGrad]
+ 140202222819408 -> 140202222818784
+ 140202222818496 -> 140202222817632
+ 140202228966560 [label="encoder.layer.7.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228966560 -> 140202222818496
+ 140202222818496 [label=AccumulateGrad]
+ 140202222818448 -> 140202222817632
+ 140202228966640 [label="encoder.layer.7.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228966640 -> 140202222818448
+ 140202222818448 [label=AccumulateGrad]
+ 140202222817440 -> 140202223316800
+ 140202222817440 [label=TBackward0]
+ 140202222817728 -> 140202222817440
+ 140202222817728 [label=ToCopyBackward0]
+ 140202222819264 -> 140202222817728
+ 140202228952496 [label="encoder.layer.8.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228952496 -> 140202222819264
+ 140202222819264 [label=AccumulateGrad]
+ 140202223316128 -> 140202223315840
+ 140202223316128 [label=ReshapeAliasBackward0]
+ 140202223316512 -> 140202223316128
+ 140202223316512 [label=ExpandBackward0]
+ 140202223316176 -> 140202223316512
+ 140202223316176 [label=TransposeBackward0]
+ 140202223316224 -> 140202223316176
+ 140202223316224 [label=PermuteBackward0]
+ 140202222821280 -> 140202223316224
+ 140202222821280 [label=ViewBackward0]
+ 140202222817968 -> 140202222821280
+ 140202222817968 [label=ViewBackward0]
+ 140202222817536 -> 140202222817968
+ 140202222817536 [label=AddmmBackward0]
+ 140202222844416 -> 140202222817536
+ 140202222844416 [label=ToCopyBackward0]
+ 140210811726624 -> 140202222844416
+ 140202228952336 [label="encoder.layer.8.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202228952336 -> 140210811726624
+ 140210811726624 [label=AccumulateGrad]
+ 140202222842496 -> 140202222817536
+ 140202222842496 [label=ViewBackward0]
+ 140210811726960 -> 140202222842496
+ 140210811726960 [label=ToCopyBackward0]
+ 140202223288032 -> 140210811726960
+ 140210811724368 -> 140202222817536
+ 140210811724368 [label=TBackward0]
+ 140210811726576 -> 140210811724368
+ 140210811726576 [label=ToCopyBackward0]
+ 140210811727104 -> 140210811726576
+ 140202228952256 [label="encoder.layer.8.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228952256 -> 140210811727104
+ 140210811727104 [label=AccumulateGrad]
+ 140202223314400 -> 140202223314496
+ 140202223314400 [label=ReshapeAliasBackward0]
+ 140202223315168 -> 140202223314400
+ 140202223315168 [label=ExpandBackward0]
+ 140202223315456 -> 140202223315168
+ 140202223315456 [label=PermuteBackward0]
+ 140202223315696 -> 140202223315456
+ 140202223315696 [label=ViewBackward0]
+ 140202223314592 -> 140202223315696
+ 140202223314592 [label=ViewBackward0]
+ 140202223316704 -> 140202223314592
+ 140202223316704 [label=AddmmBackward0]
+ 140202223314736 -> 140202223316704
+ 140202223314736 [label=ToCopyBackward0]
+ 140202222842688 -> 140202223314736
+ 140202228952096 [label="encoder.layer.8.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202228952096 -> 140202222842688
+ 140202222842688 [label=AccumulateGrad]
+ 140202222818976 -> 140202223316704
+ 140202222818976 [label=ViewBackward0]
+ 140210811726864 -> 140202222818976
+ 140210811726864 [label=ToCopyBackward0]
+ 140202223288032 -> 140210811726864
+ 140202222818016 -> 140202223316704
+ 140202222818016 [label=TBackward0]
+ 140210811726768 -> 140202222818016
+ 140210811726768 [label=ToCopyBackward0]
+ 140210811727008 -> 140210811726768
+ 140202228952016 [label="encoder.layer.8.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228952016 -> 140210811727008
+ 140210811727008 [label=AccumulateGrad]
+ 140202223313056 -> 140202223313152
+ 140202223313056 [label=TBackward0]
+ 140202223313920 -> 140202223313056
+ 140202223313920 [label=ToCopyBackward0]
+ 140202223314304 -> 140202223313920
+ 140202228951776 [label="encoder.layer.8.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228951776 -> 140202223314304
+ 140202223314304 [label=AccumulateGrad]
+ 140202223288032 -> 140202223287936
+ 140202223287744 -> 140202223287696
+ 140202228951536 [label="encoder.layer.8.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228951536 -> 140202223287744
+ 140202223287744 [label=AccumulateGrad]
+ 140202223286976 -> 140202223287696
+ 140202228951616 [label="encoder.layer.8.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228951616 -> 140202223286976
+ 140202223286976 [label=AccumulateGrad]
+ 140202223285776 -> 140202223286688
+ 140202223285776 [label=TBackward0]
+ 140202223286880 -> 140202223285776
+ 140202223286880 [label=ToCopyBackward0]
+ 140202223287552 -> 140202223286880
+ 140202228951296 [label="encoder.layer.8.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228951296 -> 140202223287552
+ 140202223287552 [label=AccumulateGrad]
+ 140202223285728 -> 140202223285440
+ 140202223285728 [label=ReshapeAliasBackward0]
+ 140202223286112 -> 140202223285728
+ 140202223286112 [label=ExpandBackward0]
+ 140202223286400 -> 140202223286112
+ 140202223286400 [label=TransposeBackward0]
+ 140202223287264 -> 140202223286400
+ 140202223287264 [label=PermuteBackward0]
+ 140202223288128 -> 140202223287264
+ 140202223288128 [label=ViewBackward0]
+ 140202223287216 -> 140202223288128
+ 140202223287216 [label=ViewBackward0]
+ 140202223285824 -> 140202223287216
+ 140202223285824 [label=AddmmBackward0]
+ 140202223313440 -> 140202223285824
+ 140202223313440 [label=ToCopyBackward0]
+ 140202223313824 -> 140202223313440
+ 140202228951136 [label="encoder.layer.8.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202228951136 -> 140202223313824
+ 140202223313824 [label=AccumulateGrad]
+ 140202223313536 -> 140202223285824
+ 140202223313536 [label=ViewBackward0]
+ 140202223315216 -> 140202223313536
+ 140202223315216 [label=ToCopyBackward0]
+ 140210812052960 -> 140202223315216
+ 140202223312960 -> 140202223285824
+ 140202223312960 [label=TBackward0]
+ 140202223314976 -> 140202223312960
+ 140202223314976 [label=ToCopyBackward0]
+ 140202223316320 -> 140202223314976
+ 140202228951056 [label="encoder.layer.8.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202228951056 -> 140202223316320
+ 140202223316320 [label=AccumulateGrad]
+ 140202223251168 -> 140202223251264
+ 140202223251168 [label=ReshapeAliasBackward0]
+ 140202223284768 -> 140202223251168
+ 140202223284768 [label=ExpandBackward0]
+ 140202223285056 -> 140202223284768
+ 140202223285056 [label=PermuteBackward0]
+ 140202223285296 -> 140202223285056
+ 140202223285296 [label=ViewBackward0]
+ 140202223284288 -> 140202223285296
+ 140202223284288 [label=ViewBackward0]
+ 140202223286304 -> 140202223284288
+ 140202223286304 [label=AddmmBackward0]
+ 140202223287360 -> 140202223286304
+ 140202223287360 [label=ToCopyBackward0]
+ 140202222820848 -> 140202223287360
+ 140202228950896 [label="encoder.layer.8.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202228950896 -> 140202222820848
+ 140202222820848 [label=AccumulateGrad]
+ 140202223286784 -> 140202223286304
+ 140202223286784 [label=ViewBackward0]
+ 140202223315936 -> 140202223286784
+ 140202223315936 [label=ToCopyBackward0]
+ 140210812052960 -> 140202223315936
+ 140202223284336 -> 140202223286304
+ 140202223284336 [label=TBackward0]
+ 140202223313248 -> 140202223284336
+ 140202223313248 [label=ToCopyBackward0]
+ 140202223314112 -> 140202223313248
+ 140202228950816 [label="encoder.layer.8.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202228950816 -> 140202223314112
+ 140202223314112 [label=AccumulateGrad]
+ 140202223249584 -> 140202223250016
+ 140202223249584 [label=TBackward0]
+ 140202223250784 -> 140202223249584
+ 140202223250784 [label=ToCopyBackward0]
+ 140202223251072 -> 140202223250784
+ 140202228950576 [label="encoder.layer.8.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228950576 -> 140202223251072
+ 140202223251072 [label=AccumulateGrad]
+ 140202223249536 -> 140202223249152
+ 140202223249104 -> 140202223248960
+ 140202228950336 [label="encoder.layer.8.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228950336 -> 140202223249104
+ 140202223249104 [label=AccumulateGrad]
+ 140202223248288 -> 140202223248960
+ 140202228950416 [label="encoder.layer.8.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228950416 -> 140202223248288
+ 140202223248288 [label=AccumulateGrad]
+ 140202223247520 -> 140202223248000
+ 140202223247520 [label=TBackward0]
+ 140202223248576 -> 140202223247520
+ 140202223248576 [label=ToCopyBackward0]
+ 140202223249344 -> 140202223248576
+ 140202228934912 [label="encoder.layer.8.experts.experts.0.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228934912 -> 140202223249344
+ 140202223249344 [label=AccumulateGrad]
+ 140202223230256 -> 140202223230736
+ 140202223230256 [label=TBackward0]
+ 140202223248096 -> 140202223230256
+ 140202223248096 [label=ToCopyBackward0]
+ 140202223248864 -> 140202223248096
+ 140202228934592 [label="encoder.layer.8.experts.experts.0.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228934592 -> 140202223248864
+ 140202223248864 [label=AccumulateGrad]
+ 140202223229920 -> 140202223230016
+ 140202223229920 [label=UnsqueezeBackward0]
+ 140202223230592 -> 140202223229920
+ 140202223230592 [label=NativeDropoutBackward0]
+ 140202223230304 -> 140202223230592
+ 140202223230304 [label=ViewBackward0]
+ 140202223249632 -> 140202223230304
+ 140202223249632 [label=AddmmBackward0]
+ 140202223247904 -> 140202223249632
+ 140202223247904 [label=ToCopyBackward0]
+ 140202223250112 -> 140202223247904
+ 140202228934672 [label="encoder.layer.8.experts.experts.1.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228934672 -> 140202223250112
+ 140202223250112 [label=AccumulateGrad]
+ 140202223249056 -> 140202223249632
+ 140202223249056 [label=ViewBackward0]
+ 140202223250304 -> 140202223249056
+ 140202223250304 [label=GeluBackward0]
+ 140202223251024 -> 140202223250304
+ 140202223251024 [label=ViewBackward0]
+ 140202223250544 -> 140202223251024
+ 140202223250544 [label=AddmmBackward0]
+ 140202223285248 -> 140202223250544
+ 140202223285248 [label=ToCopyBackward0]
+ 140202223287840 -> 140202223285248
+ 140202228935072 [label="encoder.layer.8.experts.experts.1.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228935072 -> 140202223287840
+ 140202223287840 [label=AccumulateGrad]
+ 140202223284816 -> 140202223250544
+ 140202223284816 [label=ViewBackward0]
+ 140202223315648 -> 140202223284816
+ 140202223315648 [label=ToCopyBackward0]
+ 140202223229152 -> 140202223315648
+ 140202223284576 -> 140202223250544
+ 140202223284576 [label=TBackward0]
+ 140202223285920 -> 140202223284576
+ 140202223285920 [label=ToCopyBackward0]
+ 140210811727152 -> 140202223285920
+ 140202228934352 [label="encoder.layer.8.experts.experts.1.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228934352 -> 140210811727152
+ 140210811727152 [label=AccumulateGrad]
+ 140202223247424 -> 140202223249632
+ 140202223247424 [label=TBackward0]
+ 140202223249728 -> 140202223247424
+ 140202223249728 [label=ToCopyBackward0]
+ 140202223313728 -> 140202223249728
+ 140202228934112 [label="encoder.layer.8.experts.experts.1.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228934112 -> 140202223313728
+ 140202223313728 [label=AccumulateGrad]
+ 140202223229776 -> 140202223230016
+ 140202223229776 [label=UnsqueezeBackward0]
+ 140202223285536 -> 140202223229776
+ 140202223285536 [label=NativeDropoutBackward0]
+ 140202223249248 -> 140202223285536
+ 140202223249248 [label=ViewBackward0]
+ 140202223250208 -> 140202223249248
+ 140202223250208 [label=AddmmBackward0]
+ 140202223247616 -> 140202223250208
+ 140202223247616 [label=ToCopyBackward0]
+ 140210811727392 -> 140202223247616
+ 140202228934192 [label="encoder.layer.8.experts.experts.2.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228934192 -> 140210811727392
+ 140210811727392 [label=AccumulateGrad]
+ 140210811727296 -> 140202223250208
+ 140210811727296 [label=ViewBackward0]
+ 140210811727440 -> 140210811727296
+ 140210811727440 [label=GeluBackward0]
+ 140210811727536 -> 140210811727440
+ 140210811727536 [label=ViewBackward0]
+ 140210811727632 -> 140210811727536
+ 140210811727632 [label=AddmmBackward0]
+ 140210811727728 -> 140210811727632
+ 140210811727728 [label=ToCopyBackward0]
+ 140210811727824 -> 140210811727728
+ 140202228934432 [label="encoder.layer.8.experts.experts.2.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228934432 -> 140210811727824
+ 140210811727824 [label=AccumulateGrad]
+ 140210811727680 -> 140210811727632
+ 140210811727680 [label=ViewBackward0]
+ 140210811781280 -> 140210811727680
+ 140210811781280 [label=ToCopyBackward0]
+ 140202223229152 -> 140210811781280
+ 140210811727344 -> 140210811727632
+ 140210811727344 [label=TBackward0]
+ 140210811781232 -> 140210811727344
+ 140210811781232 [label=ToCopyBackward0]
+ 140210811781424 -> 140210811781232
+ 140202228933872 [label="encoder.layer.8.experts.experts.2.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228933872 -> 140210811781424
+ 140210811781424 [label=AccumulateGrad]
+ 140210811727200 -> 140202223250208
+ 140210811727200 [label=TBackward0]
+ 140210811727584 -> 140210811727200
+ 140210811727584 [label=ToCopyBackward0]
+ 140210811727776 -> 140210811727584
+ 140202228933632 [label="encoder.layer.8.experts.experts.2.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228933632 -> 140210811727776
+ 140210811727776 [label=AccumulateGrad]
+ 140202223229728 -> 140202223229440
+ 140202223229728 [label=UnsqueezeBackward0]
+ 140202223230400 -> 140202223229728
+ 140202223230400 [label=UnsqueezeBackward0]
+ 140202223248672 -> 140202223230400
+ 140202223248672 [label=SumBackward1]
+ 140202223229824 -> 140202223248672
+ 140202223229824 [label=MulBackward0]
+ 140210811727488 -> 140202223229824
+ 140210811727488 [label=UnsqueezeBackward0]
+ 140210811781376 -> 140210811727488
+ 140210811781376 [label=TopkBackward0]
+ 140210811781328 -> 140210811781376
+ 140210811781328 [label=SoftmaxBackward0]
+ 140210811781616 -> 140210811781328
+ 140210811781616 [label=MmBackward0]
+ 140210811781712 -> 140210811781616
+ 140210811781712 [label=ToCopyBackward0]
+ 140210811781856 -> 140210811781712
+ 140210811781856 [label=MeanBackward1]
+ 140210811781952 -> 140210811781856
+ 140210811781952 [label=MulBackward0]
+ 140202223229152 -> 140210811781952
+ 140210811781664 -> 140210811781616
+ 140210811781664 [label=TBackward0]
+ 140210811782048 -> 140210811781664
+ 140210811782048 [label=ToCopyBackward0]
+ 140210811781760 -> 140210811782048
+ 140202228935872 [label="encoder.layer.8.experts.gate.weight
+ (3, 768)" fillcolor=lightblue]
+ 140202228935872 -> 140210811781760
+ 140210811781760 [label=AccumulateGrad]
+ 140202223229152 -> 140202223229056
+ 140202223228864 -> 140202223228672
+ 140202228935792 [label="encoder.layer.8.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202228935792 -> 140202223228864
+ 140202223228864 [label=AccumulateGrad]
+ 140202223228816 -> 140202223228672
+ 140202228935552 [label="encoder.layer.8.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202228935552 -> 140202223228816
+ 140202223228816 [label=AccumulateGrad]
+ 140202223228336 -> 140202223195040
+ 140202223228336 [label=NativeLayerNormBackward0]
+ 140202223229536 -> 140202223228336
+ 140202223229536 [label=AddBackward0]
+ 140202223248384 -> 140202223229536
+ 140202223248384 [label=NativeDropoutBackward0]
+ 140210811727248 -> 140202223248384
+ 140210811727248 [label=ViewBackward0]
+ 140210811781520 -> 140210811727248
+ 140210811781520 [label=AddmmBackward0]
+ 140210811782000 -> 140210811781520
+ 140210811782000 [label=ToCopyBackward0]
+ 140210811782192 -> 140210811782000
+ 140202228949936 [label="encoder.layer.8.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228949936 -> 140210811782192
+ 140210811782192 [label=AccumulateGrad]
+ 140210811781904 -> 140210811781520
+ 140210811781904 [label=ViewBackward0]
+ 140210811782240 -> 140210811781904
+ 140210811782240 [label=GeluBackward0]
+ 140210811782336 -> 140210811782240
+ 140210811782336 [label=ViewBackward0]
+ 140210811782432 -> 140210811782336
+ 140210811782432 [label=AddmmBackward0]
+ 140210811782528 -> 140210811782432
+ 140210811782528 [label=ToCopyBackward0]
+ 140210811782720 -> 140210811782528
+ 140202228950176 [label="encoder.layer.8.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202228950176 -> 140210811782720
+ 140210811782720 [label=AccumulateGrad]
+ 140210811782480 -> 140210811782432
+ 140210811782480 [label=ViewBackward0]
+ 140210811782768 -> 140210811782480
+ 140210811782768 [label=ToCopyBackward0]
+ 140202223230112 -> 140210811782768
+ 140202223230112 [label=SliceBackward0]
+ 140210811782912 -> 140202223230112
+ 140210811782912 [label=SliceBackward0]
+ 140210811783008 -> 140210811782912
+ 140210811783008 [label=SliceBackward0]
+ 140202223287696 -> 140210811783008
+ 140210811781808 -> 140210811782432
+ 140210811781808 [label=TBackward0]
+ 140210811782672 -> 140210811781808
+ 140210811782672 [label=ToCopyBackward0]
+ 140210811783104 -> 140210811782672
+ 140202228950096 [label="encoder.layer.8.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228950096 -> 140210811783104
+ 140210811783104 [label=AccumulateGrad]
+ 140210811781184 -> 140210811781520
+ 140210811781184 [label=TBackward0]
+ 140210811782384 -> 140210811781184
+ 140210811782384 [label=ToCopyBackward0]
+ 140210811782864 -> 140210811782384
+ 140202228949856 [label="encoder.layer.8.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228949856 -> 140210811782864
+ 140210811782864 [label=AccumulateGrad]
+ 140202223230112 -> 140202223229536
+ 140202223229248 -> 140202223228336
+ 140202228949616 [label="encoder.layer.8.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228949616 -> 140202223229248
+ 140202223229248 [label=AccumulateGrad]
+ 140202223228960 -> 140202223228336
+ 140202228949696 [label="encoder.layer.8.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228949696 -> 140202223228960
+ 140202223228960 [label=AccumulateGrad]
+ 140202223226992 -> 140202223227904
+ 140202223226992 [label=TBackward0]
+ 140202223228192 -> 140202223226992
+ 140202223228192 [label=ToCopyBackward0]
+ 140210811727056 -> 140202223228192
+ 140202228936112 [label="encoder.layer.9.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228936112 -> 140210811727056
+ 140210811727056 [label=AccumulateGrad]
+ 140202223226944 -> 140202223198016
+ 140202223226944 [label=ReshapeAliasBackward0]
+ 140202223227616 -> 140202223226944
+ 140202223227616 [label=ExpandBackward0]
+ 140202223227856 -> 140202223227616
+ 140202223227856 [label=TransposeBackward0]
+ 140202223228768 -> 140202223227856
+ 140202223228768 [label=PermuteBackward0]
+ 140202223228480 -> 140202223228768
+ 140202223228480 [label=ViewBackward0]
+ 140202223227328 -> 140202223228480
+ 140202223227328 [label=ViewBackward0]
+ 140210811782096 -> 140202223227328
+ 140210811782096 [label=AddmmBackward0]
+ 140210811782624 -> 140210811782096
+ 140210811782624 [label=ToCopyBackward0]
+ 140210811782816 -> 140210811782624
+ 140202228936272 [label="encoder.layer.9.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202228936272 -> 140210811782816
+ 140210811782816 [label=AccumulateGrad]
+ 140210811782576 -> 140210811782096
+ 140210811782576 [label=ViewBackward0]
+ 140210811783152 -> 140210811782576
+ 140210811783152 [label=ToCopyBackward0]
+ 140202223195040 -> 140210811783152
+ 140210811781568 -> 140210811782096
+ 140210811781568 [label=TBackward0]
+ 140210811782288 -> 140210811781568
+ 140210811782288 [label=ToCopyBackward0]
+ 140210811783296 -> 140210811782288
+ 140202228936352 [label="encoder.layer.9.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228936352 -> 140210811783296
+ 140210811783296 [label=AccumulateGrad]
+ 140202223196720 -> 140202223196576
+ 140202223196720 [label=ReshapeAliasBackward0]
+ 140202223197248 -> 140202223196720
+ 140202223197248 [label=ExpandBackward0]
+ 140202223197536 -> 140202223197248
+ 140202223197536 [label=PermuteBackward0]
+ 140202223197824 -> 140202223197536
+ 140202223197824 [label=ViewBackward0]
+ 140202223196960 -> 140202223197824
+ 140202223196960 [label=ViewBackward0]
+ 140202223227808 -> 140202223196960
+ 140202223227808 [label=AddmmBackward0]
+ 140202223229344 -> 140202223227808
+ 140202223229344 [label=ToCopyBackward0]
+ 140210811783248 -> 140202223229344
+ 140202228935312 [label="encoder.layer.9.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202228935312 -> 140210811783248
+ 140210811783248 [label=AccumulateGrad]
+ 140202223228288 -> 140202223227808
+ 140202223228288 [label=ViewBackward0]
+ 140210811783056 -> 140202223228288
+ 140210811783056 [label=ToCopyBackward0]
+ 140202223195040 -> 140210811783056
+ 140202223227040 -> 140202223227808
+ 140202223227040 [label=TBackward0]
+ 140210811782144 -> 140202223227040
+ 140210811782144 [label=ToCopyBackward0]
+ 140210811783200 -> 140210811782144
+ 140202228935632 [label="encoder.layer.9.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228935632 -> 140210811783200
+ 140210811783200 [label=AccumulateGrad]
+ 140202223195232 -> 140202223195520
+ 140202223195232 [label=TBackward0]
+ 140202223196240 -> 140202223195232
+ 140202223196240 [label=ToCopyBackward0]
+ 140202223196672 -> 140202223196240
+ 140202228935392 [label="encoder.layer.9.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228935392 -> 140202223196672
+ 140202223196672 [label=AccumulateGrad]
+ 140202223195040 -> 140202223194656
+ 140202223194752 -> 140202223194464
+ 140202228933952 [label="encoder.layer.9.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228933952 -> 140202223194752
+ 140202223194752 [label=AccumulateGrad]
+ 140202223194272 -> 140202223194464
+ 140202228933712 [label="encoder.layer.9.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228933712 -> 140202223194272
+ 140202223194272 [label=AccumulateGrad]
+ 140202223172000 -> 140202223172480
+ 140202223172000 [label=TBackward0]
+ 140202223173056 -> 140202223172000
+ 140202223173056 [label=ToCopyBackward0]
+ 140202223173536 -> 140202223173056
+ 140202228927840 [label="encoder.layer.9.experts.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228927840 -> 140202223173536
+ 140202223173536 [label=AccumulateGrad]
+ 140202223171376 -> 140202223171808
+ 140202223171376 [label=TBackward0]
+ 140202223172576 -> 140202223171376
+ 140202223172576 [label=ToCopyBackward0]
+ 140202223173296 -> 140202223172576
+ 140202228927600 [label="encoder.layer.9.experts.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228927600 -> 140202223173296
+ 140202223173296 [label=AccumulateGrad]
+ 140202223171328 -> 140202223170944
+ 140202223170896 -> 140202223170848
+ 140202228927360 [label="encoder.layer.9.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202228927360 -> 140202223170896
+ 140202223170896 [label=AccumulateGrad]
+ 140202223170752 -> 140202223170848
+ 140202228927040 [label="encoder.layer.9.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202228927040 -> 140202223170752
+ 140202223170752 [label=AccumulateGrad]
+ 140202223170272 -> 140202223136928
+ 140202223170272 [label=NativeLayerNormBackward0]
+ 140202223171424 -> 140202223170272
+ 140202223171424 [label=AddBackward0]
+ 140202223172816 -> 140202223171424
+ 140202223172816 [label=NativeDropoutBackward0]
+ 140202223172336 -> 140202223172816
+ 140202223172336 [label=ViewBackward0]
+ 140202223194176 -> 140202223172336
+ 140202223194176 [label=AddmmBackward0]
+ 140202223194848 -> 140202223194176
+ 140202223194848 [label=ToCopyBackward0]
+ 140202223195760 -> 140202223194848
+ 140202228932912 [label="encoder.layer.9.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228932912 -> 140202223195760
+ 140202223195760 [label=AccumulateGrad]
+ 140202223194800 -> 140202223194176
+ 140202223194800 [label=ViewBackward0]
+ 140202223196480 -> 140202223194800
+ 140202223196480 [label=GeluBackward0]
+ 140202223195808 -> 140202223196480
+ 140202223195808 [label=ViewBackward0]
+ 140202223197344 -> 140202223195808
+ 140202223197344 [label=AddmmBackward0]
+ 140202223196864 -> 140202223197344
+ 140202223196864 [label=ToCopyBackward0]
+ 140210811782960 -> 140202223196864
+ 140202228933152 [label="encoder.layer.9.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202228933152 -> 140210811782960
+ 140210811782960 [label=AccumulateGrad]
+ 140202223197728 -> 140202223197344
+ 140202223197728 [label=ViewBackward0]
+ 140210811783488 -> 140202223197728
+ 140210811783488 [label=ToCopyBackward0]
+ 140202223172096 -> 140210811783488
+ 140202223172096 [label=SliceBackward0]
+ 140210811783536 -> 140202223172096
+ 140210811783536 [label=SliceBackward0]
+ 140210811783632 -> 140210811783536
+ 140210811783632 [label=SliceBackward0]
+ 140202223194464 -> 140210811783632
+ 140202223195616 -> 140202223197344
+ 140202223195616 [label=TBackward0]
+ 140210811783344 -> 140202223195616
+ 140210811783344 [label=ToCopyBackward0]
+ 140210811783728 -> 140210811783344
+ 140202228933392 [label="encoder.layer.9.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228933392 -> 140210811783728
+ 140210811783728 [label=AccumulateGrad]
+ 140202223194560 -> 140202223194176
+ 140202223194560 [label=TBackward0]
+ 140202223196192 -> 140202223194560
+ 140202223196192 [label=ToCopyBackward0]
+ 140202223227376 -> 140202223196192
+ 140202228933232 [label="encoder.layer.9.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228933232 -> 140202223227376
+ 140202223227376 [label=AccumulateGrad]
+ 140202223172096 -> 140202223171424
+ 140202223171040 -> 140202223170272
+ 140202228932992 [label="encoder.layer.9.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228932992 -> 140202223171040
+ 140202223171040 [label=AccumulateGrad]
+ 140202223171136 -> 140202223170272
+ 140202228932672 [label="encoder.layer.9.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228932672 -> 140202223171136
+ 140202223171136 [label=AccumulateGrad]
+ 140202223169696 -> 140202223169936
+ 140202223169696 [label=TBackward0]
+ 140202223170368 -> 140202223169696
+ 140202223170368 [label=ToCopyBackward0]
+ 140202223171904 -> 140202223170368
+ 140202228927120 [label="encoder.layer.10.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228927120 -> 140202223171904
+ 140202223171904 [label=AccumulateGrad]
+ 140202223140240 -> 140202223140096
+ 140202223140240 [label=ReshapeAliasBackward0]
+ 140202223140480 -> 140202223140240
+ 140202223140480 [label=ExpandBackward0]
+ 140202223140384 -> 140202223140480
+ 140202223140384 [label=TransposeBackward0]
+ 140202223170560 -> 140202223140384
+ 140202223170560 [label=PermuteBackward0]
+ 140202223173152 -> 140202223170560
+ 140202223173152 [label=ViewBackward0]
+ 140202223170656 -> 140202223173152
+ 140202223170656 [label=ViewBackward0]
+ 140202223195328 -> 140202223170656
+ 140202223195328 [label=AddmmBackward0]
+ 140202223197056 -> 140202223195328
+ 140202223197056 [label=ToCopyBackward0]
+ 140210811783440 -> 140202223197056
+ 140202228926560 [label="encoder.layer.10.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202228926560 -> 140210811783440
+ 140210811783440 [label=AccumulateGrad]
+ 140202223194320 -> 140202223195328
+ 140202223194320 [label=ViewBackward0]
+ 140210811783776 -> 140202223194320
+ 140210811783776 [label=ToCopyBackward0]
+ 140202223136928 -> 140210811783776
+ 140210811781472 -> 140202223195328
+ 140210811781472 [label=TBackward0]
+ 140210811783392 -> 140210811781472
+ 140210811783392 [label=ToCopyBackward0]
+ 140210811783920 -> 140210811783392
+ 140202228926880 [label="encoder.layer.10.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228926880 -> 140210811783920
+ 140210811783920 [label=AccumulateGrad]
+ 140202223138656 -> 140202223138752
+ 140202223138656 [label=ReshapeAliasBackward0]
+ 140202223139280 -> 140202223138656
+ 140202223139280 [label=ExpandBackward0]
+ 140202223139712 -> 140202223139280
+ 140202223139712 [label=PermuteBackward0]
+ 140202223140000 -> 140202223139712
+ 140202223140000 [label=ViewBackward0]
+ 140202223138848 -> 140202223140000
+ 140202223138848 [label=ViewBackward0]
+ 140202223140576 -> 140202223138848
+ 140202223140576 [label=AddmmBackward0]
+ 140202223171520 -> 140202223140576
+ 140202223171520 [label=ToCopyBackward0]
+ 140202223195136 -> 140202223171520
+ 140202228926320 [label="encoder.layer.10.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202228926320 -> 140202223195136
+ 140202223195136 [label=AccumulateGrad]
+ 140202223170080 -> 140202223140576
+ 140202223170080 [label=ViewBackward0]
+ 140210811783680 -> 140202223170080
+ 140210811783680 [label=ToCopyBackward0]
+ 140202223136928 -> 140210811783680
+ 140202223169792 -> 140202223140576
+ 140202223169792 [label=TBackward0]
+ 140210811783584 -> 140202223169792
+ 140210811783584 [label=ToCopyBackward0]
+ 140210811783824 -> 140210811783584
+ 140202228926640 [label="encoder.layer.10.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228926640 -> 140210811783824
+ 140210811783824 [label=AccumulateGrad]
+ 140202223137024 -> 140202223137408
+ 140202223137024 [label=TBackward0]
+ 140202223138176 -> 140202223137024
+ 140202223138176 [label=ToCopyBackward0]
+ 140202223138464 -> 140202223138176
+ 140202228926400 [label="encoder.layer.10.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228926400 -> 140202223138464
+ 140202223138464 [label=AccumulateGrad]
+ 140202223136928 -> 140202223112096
+ 140202223111904 -> 140202223112000
+ 140202228926160 [label="encoder.layer.10.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228926160 -> 140202223111904
+ 140202223111904 [label=AccumulateGrad]
+ 140202223111232 -> 140202223112000
+ 140202228925840 [label="encoder.layer.10.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228925840 -> 140202223111232
+ 140202223111232 [label=AccumulateGrad]
+ 140202223110080 -> 140202223110800
+ 140202223110080 [label=TBackward0]
+ 140202223111136 -> 140202223110080
+ 140202223111136 [label=ToCopyBackward0]
+ 140202223111808 -> 140202223111136
+ 140202228925920 [label="encoder.layer.10.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228925920 -> 140202223111808
+ 140202223111808 [label=AccumulateGrad]
+ 140202223109840 -> 140202223109696
+ 140202223109840 [label=ReshapeAliasBackward0]
+ 140202223110368 -> 140202223109840
+ 140202223110368 [label=ExpandBackward0]
+ 140202223110656 -> 140202223110368
+ 140202223110656 [label=TransposeBackward0]
+ 140202223111424 -> 140202223110656
+ 140202223111424 [label=PermuteBackward0]
+ 140202223111520 -> 140202223111424
+ 140202223111520 [label=ViewBackward0]
+ 140202223109984 -> 140202223111520
+ 140202223109984 [label=ViewBackward0]
+ 140202223137360 -> 140202223109984
+ 140202223137360 [label=AddmmBackward0]
+ 140202223137696 -> 140202223137360
+ 140202223137696 [label=ToCopyBackward0]
+ 140202223137984 -> 140202223137696
+ 140202228925360 [label="encoder.layer.10.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202228925360 -> 140202223137984
+ 140202223137984 [label=AccumulateGrad]
+ 140202223137792 -> 140202223137360
+ 140202223137792 [label=ViewBackward0]
+ 140202223139520 -> 140202223137792
+ 140202223139520 [label=ToCopyBackward0]
+ 140210812052960 -> 140202223139520
+ 140202223136880 -> 140202223137360
+ 140202223136880 [label=TBackward0]
+ 140202223139232 -> 140202223136880
+ 140202223139232 [label=ToCopyBackward0]
+ 140202223139040 -> 140202223139232
+ 140202228925680 [label="encoder.layer.10.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202228925680 -> 140202223139040
+ 140202223139040 [label=AccumulateGrad]
+ 140202223108256 -> 140202223082800
+ 140202223108256 [label=ReshapeAliasBackward0]
+ 140202223108880 -> 140202223108256
+ 140202223108880 [label=ExpandBackward0]
+ 140202223109312 -> 140202223108880
+ 140202223109312 [label=PermuteBackward0]
+ 140202223109600 -> 140202223109312
+ 140202223109600 [label=ViewBackward0]
+ 140202223108448 -> 140202223109600
+ 140202223108448 [label=ViewBackward0]
+ 140202223110464 -> 140202223108448
+ 140202223110464 [label=AddmmBackward0]
+ 140202223111616 -> 140202223110464
+ 140202223111616 [label=ToCopyBackward0]
+ 140202223138368 -> 140202223111616
+ 140202228925120 [label="encoder.layer.10.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202228925120 -> 140202223138368
+ 140202223138368 [label=AccumulateGrad]
+ 140202223110944 -> 140202223110464
+ 140202223110944 [label=ViewBackward0]
+ 140202223140192 -> 140202223110944
+ 140202223140192 [label=ToCopyBackward0]
+ 140210812052960 -> 140202223140192
+ 140202223108640 -> 140202223110464
+ 140202223108640 [label=TBackward0]
+ 140202223136832 -> 140202223108640
+ 140202223136832 [label=ToCopyBackward0]
+ 140202223137312 -> 140202223136832
+ 140202228925440 [label="encoder.layer.10.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140202228925440 -> 140202223137312
+ 140202223137312 [label=AccumulateGrad]
+ 140202223081984 -> 140202223082368
+ 140202223081984 [label=TBackward0]
+ 140202223083136 -> 140202223081984
+ 140202223083136 [label=ToCopyBackward0]
+ 140202223083328 -> 140202223083136
+ 140202228925200 [label="encoder.layer.10.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228925200 -> 140202223083328
+ 140202223083328 [label=AccumulateGrad]
+ 140202223081888 -> 140202223081792
+ 140202223081504 -> 140202223081600
+ 140202228924960 [label="encoder.layer.10.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228924960 -> 140202223081504
+ 140202223081504 [label=AccumulateGrad]
+ 140202223080880 -> 140202223081600
+ 140202228924640 [label="encoder.layer.10.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228924640 -> 140202223080880
+ 140202223080880 [label=AccumulateGrad]
+ 140202223080160 -> 140202223080640
+ 140202223080160 [label=TBackward0]
+ 140202223080928 -> 140202223080160
+ 140202223080928 [label=ToCopyBackward0]
+ 140202223081696 -> 140202223080928
+ 140202228905040 [label="encoder.layer.10.experts.experts.0.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228905040 -> 140202223081696
+ 140202223081696 [label=AccumulateGrad]
+ 140202223079584 -> 140202223079536
+ 140202223079584 [label=TBackward0]
+ 140202223080448 -> 140202223079584
+ 140202223080448 [label=ToCopyBackward0]
+ 140202223081216 -> 140202223080448
+ 140202228905120 [label="encoder.layer.10.experts.experts.0.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228905120 -> 140202223081216
+ 140202223081216 [label=AccumulateGrad]
+ 140202223578608 -> 140202223578464
+ 140202223578608 [label=UnsqueezeBackward0]
+ 140202223579040 -> 140202223578608
+ 140202223579040 [label=NativeDropoutBackward0]
+ 140202223079968 -> 140202223579040
+ 140202223079968 [label=ViewBackward0]
+ 140202223082272 -> 140202223079968
+ 140202223082272 [label=AddmmBackward0]
+ 140202223080256 -> 140202223082272
+ 140202223080256 [label=ToCopyBackward0]
+ 140202223082752 -> 140202223080256
+ 140202228904800 [label="encoder.layer.10.experts.experts.1.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228904800 -> 140202223082752
+ 140202223082752 [label=AccumulateGrad]
+ 140202223081408 -> 140202223082272
+ 140202223081408 [label=ViewBackward0]
+ 140202223082656 -> 140202223081408
+ 140202223082656 [label=GeluBackward0]
+ 140202223081312 -> 140202223082656
+ 140202223081312 [label=ViewBackward0]
+ 140202223108352 -> 140202223081312
+ 140202223108352 [label=AddmmBackward0]
+ 140202223109360 -> 140202223108352
+ 140202223109360 [label=ToCopyBackward0]
+ 140202223169600 -> 140202223109360
+ 140202228905600 [label="encoder.layer.10.experts.experts.1.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228905600 -> 140202223169600
+ 140202223169600 [label=AccumulateGrad]
+ 140202223109120 -> 140202223108352
+ 140202223109120 [label=ViewBackward0]
+ 140202223139760 -> 140202223109120
+ 140202223139760 [label=ToCopyBackward0]
+ 140202223577888 -> 140202223139760
+ 140202223108832 -> 140202223108352
+ 140202223108832 [label=TBackward0]
+ 140202223110176 -> 140202223108832
+ 140202223110176 [label=ToCopyBackward0]
+ 140210811783968 -> 140202223110176
+ 140202228904880 [label="encoder.layer.10.experts.experts.1.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228904880 -> 140210811783968
+ 140210811783968 [label=AccumulateGrad]
+ 140202223079488 -> 140202223082272
+ 140202223079488 [label=TBackward0]
+ 140202223082320 -> 140202223079488
+ 140202223082320 [label=ToCopyBackward0]
+ 140202223137840 -> 140202223082320
+ 140202228904640 [label="encoder.layer.10.experts.experts.1.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228904640 -> 140202223137840
+ 140202223137840 [label=AccumulateGrad]
+ 140202223578272 -> 140202223578464
+ 140202223578272 [label=UnsqueezeBackward0]
+ 140202223080736 -> 140202223578272
+ 140202223080736 [label=NativeDropoutBackward0]
+ 140202223082944 -> 140202223080736
+ 140202223082944 [label=ViewBackward0]
+ 140202223108160 -> 140202223082944
+ 140202223108160 [label=AddmmBackward0]
+ 140202223079680 -> 140202223108160
+ 140202223079680 [label=ToCopyBackward0]
+ 140210811784208 -> 140202223079680
+ 140202228895552 [label="encoder.layer.10.experts.experts.2.dense2.bias
+ (768)" fillcolor=lightblue]
+ 140202228895552 -> 140210811784208
+ 140210811784208 [label=AccumulateGrad]
+ 140210811784112 -> 140202223108160
+ 140210811784112 [label=ViewBackward0]
+ 140210811784256 -> 140210811784112
+ 140210811784256 [label=GeluBackward0]
+ 140210811784352 -> 140210811784256
+ 140210811784352 [label=ViewBackward0]
+ 140210811784448 -> 140210811784352
+ 140210811784448 [label=AddmmBackward0]
+ 140210811784544 -> 140210811784448
+ 140210811784544 [label=ToCopyBackward0]
+ 140210811784736 -> 140210811784544
+ 140202228904560 [label="encoder.layer.10.experts.experts.2.dense1.bias
+ (3072)" fillcolor=lightblue]
+ 140202228904560 -> 140210811784736
+ 140210811784736 [label=AccumulateGrad]
+ 140210811784496 -> 140210811784448
+ 140210811784496 [label=ViewBackward0]
+ 140210811784784 -> 140210811784496
+ 140210811784784 [label=ToCopyBackward0]
+ 140202223577888 -> 140210811784784
+ 140210811784160 -> 140210811784448
+ 140210811784160 [label=TBackward0]
+ 140210811784640 -> 140210811784160
+ 140210811784640 [label=ToCopyBackward0]
+ 140210811784928 -> 140210811784640
+ 140202228904400 [label="encoder.layer.10.experts.experts.2.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228904400 -> 140210811784928
+ 140210811784928 [label=AccumulateGrad]
+ 140210811784016 -> 140202223108160
+ 140210811784016 [label=TBackward0]
+ 140210811784400 -> 140210811784016
+ 140210811784400 [label=ToCopyBackward0]
+ 140210811784880 -> 140210811784400
+ 140202228904320 [label="encoder.layer.10.experts.experts.2.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228904320 -> 140210811784880
+ 140210811784880 [label=AccumulateGrad]
+ 140202223578176 -> 140202223578128
+ 140202223578176 [label=UnsqueezeBackward0]
+ 140202223578848 -> 140202223578176
+ 140202223578848 [label=UnsqueezeBackward0]
+ 140202223109792 -> 140202223578848
+ 140202223109792 [label=SumBackward1]
+ 140202223079920 -> 140202223109792
+ 140202223079920 [label=MulBackward0]
+ 140210811785024 -> 140202223079920
+ 140210811785024 [label=UnsqueezeBackward0]
+ 140210811784304 -> 140210811785024
+ 140210811784304 [label=TopkBackward0]
+ 140210811784832 -> 140210811784304
+ 140210811784832 [label=SoftmaxBackward0]
+ 140210811785120 -> 140210811784832
+ 140210811785120 [label=MmBackward0]
+ 140210811785168 -> 140210811785120
+ 140210811785168 [label=ToCopyBackward0]
+ 140210811850960 -> 140210811785168
+ 140210811850960 [label=MeanBackward1]
+ 140210811851056 -> 140210811850960
+ 140210811851056 [label=MulBackward0]
+ 140202223577888 -> 140210811851056
+ 140210811784064 -> 140210811785120
+ 140210811784064 [label=TBackward0]
+ 140210811851152 -> 140210811784064
+ 140210811851152 [label=ToCopyBackward0]
+ 140210811850864 -> 140210811851152
+ 140202228906000 [label="encoder.layer.10.experts.gate.weight
+ (3, 768)" fillcolor=lightblue]
+ 140202228906000 -> 140210811850864
+ 140210811850864 [label=AccumulateGrad]
+ 140202223577888 -> 140202223577504
+ 140202223577600 -> 140202223577408
+ 140202228906320 [label="encoder.layer.10.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202228906320 -> 140202223577600
+ 140202223577600 [label=AccumulateGrad]
+ 140202223577312 -> 140202223577408
+ 140202228906080 [label="encoder.layer.10.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202228906080 -> 140202223577312
+ 140202223577312 [label=AccumulateGrad]
+ 140202223576832 -> 140202223540112
+ 140202223576832 [label=NativeLayerNormBackward0]
+ 140202223577984 -> 140202223576832
+ 140202223577984 [label=AddBackward0]
+ 140202223081840 -> 140202223577984
+ 140202223081840 [label=NativeDropoutBackward0]
+ 140210811784976 -> 140202223081840
+ 140210811784976 [label=ViewBackward0]
+ 140210811785072 -> 140210811784976
+ 140210811785072 [label=AddmmBackward0]
+ 140210811851104 -> 140210811785072
+ 140210811851104 [label=ToCopyBackward0]
+ 140210811851296 -> 140210811851104
+ 140202228907680 [label="encoder.layer.10.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228907680 -> 140210811851296
+ 140210811851296 [label=AccumulateGrad]
+ 140210811851008 -> 140210811785072
+ 140210811851008 [label=ViewBackward0]
+ 140210811851344 -> 140210811851008
+ 140210811851344 [label=GeluBackward0]
+ 140210811851440 -> 140210811851344
+ 140210811851440 [label=ViewBackward0]
+ 140210811851536 -> 140210811851440
+ 140210811851536 [label=AddmmBackward0]
+ 140210811851632 -> 140210811851536
+ 140210811851632 [label=ToCopyBackward0]
+ 140210811851824 -> 140210811851632
+ 140202228924480 [label="encoder.layer.10.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202228924480 -> 140210811851824
+ 140210811851824 [label=AccumulateGrad]
+ 140210811851584 -> 140210811851536
+ 140210811851584 [label=ViewBackward0]
+ 140210811851872 -> 140210811851584
+ 140210811851872 [label=ToCopyBackward0]
+ 140202223578560 -> 140210811851872
+ 140202223578560 [label=SliceBackward0]
+ 140210811852016 -> 140202223578560
+ 140210811852016 [label=SliceBackward0]
+ 140210811852112 -> 140210811852016
+ 140210811852112 [label=SliceBackward0]
+ 140202223112000 -> 140210811852112
+ 140210811850912 -> 140210811851536
+ 140210811850912 [label=TBackward0]
+ 140210811851776 -> 140210811850912
+ 140210811851776 [label=ToCopyBackward0]
+ 140210811852208 -> 140210811851776
+ 140202228924720 [label="encoder.layer.10.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228924720 -> 140210811852208
+ 140210811852208 [label=AccumulateGrad]
+ 140210811850816 -> 140210811785072
+ 140210811850816 [label=TBackward0]
+ 140210811851488 -> 140210811850816
+ 140210811851488 [label=ToCopyBackward0]
+ 140210811851968 -> 140210811851488
+ 140202228907920 [label="encoder.layer.10.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228907920 -> 140210811851968
+ 140210811851968 [label=AccumulateGrad]
+ 140202223578560 -> 140202223577984
+ 140202223577696 -> 140202223576832
+ 140202228907760 [label="encoder.layer.10.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228907760 -> 140202223577696
+ 140202223577696 [label=AccumulateGrad]
+ 140202223577648 -> 140202223576832
+ 140202228907440 [label="encoder.layer.10.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228907440 -> 140202223577648
+ 140202223577648 [label=AccumulateGrad]
+ 140202223575728 -> 140202223576640
+ 140202223575728 [label=TBackward0]
+ 140202223576928 -> 140202223575728
+ 140202223576928 [label=ToCopyBackward0]
+ 140202223578080 -> 140202223576928
+ 140202228906240 [label="encoder.layer.11.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228906240 -> 140202223578080
+ 140202223578080 [label=AccumulateGrad]
+ 140202223575680 -> 140202223575392
+ 140202223575680 [label=ReshapeAliasBackward0]
+ 140202223576064 -> 140202223575680
+ 140202223576064 [label=ExpandBackward0]
+ 140202223576352 -> 140202223576064
+ 140202223576352 [label=TransposeBackward0]
+ 140202223577216 -> 140202223576352
+ 140202223577216 [label=PermuteBackward0]
+ 140202223577168 -> 140202223577216
+ 140202223577168 [label=ViewBackward0]
+ 140210811783872 -> 140202223577168
+ 140210811783872 [label=ViewBackward0]
+ 140210811784592 -> 140210811783872
+ 140210811784592 [label=AddmmBackward0]
+ 140210811851728 -> 140210811784592
+ 140210811851728 [label=ToCopyBackward0]
+ 140210811851920 -> 140210811851728
+ 140202228906800 [label="encoder.layer.11.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140202228906800 -> 140210811851920
+ 140210811851920 [label=AccumulateGrad]
+ 140210811851680 -> 140210811784592
+ 140210811851680 [label=ViewBackward0]
+ 140210811852256 -> 140210811851680
+ 140210811852256 [label=ToCopyBackward0]
+ 140202223540112 -> 140210811852256
+ 140210811851248 -> 140210811784592
+ 140210811851248 [label=TBackward0]
+ 140210811851392 -> 140210811851248
+ 140210811851392 [label=ToCopyBackward0]
+ 140210811852400 -> 140210811851392
+ 140202228906480 [label="encoder.layer.11.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228906480 -> 140210811852400
+ 140210811852400 [label=AccumulateGrad]
+ 140202223541168 -> 140202223541312
+ 140202223541168 [label=ReshapeAliasBackward0]
+ 140202223541888 -> 140202223541168
+ 140202223541888 [label=ExpandBackward0]
+ 140202223542176 -> 140202223541888
+ 140202223542176 [label=PermuteBackward0]
+ 140202223541456 -> 140202223542176
+ 140202223541456 [label=ViewBackward0]
+ 140202223575200 -> 140202223541456
+ 140202223575200 [label=ViewBackward0]
+ 140202223576256 -> 140202223575200
+ 140202223576256 [label=AddmmBackward0]
+ 140202223575776 -> 140202223576256
+ 140202223575776 [label=ToCopyBackward0]
+ 140210811852352 -> 140202223575776
+ 140202228905840 [label="encoder.layer.11.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140202228905840 -> 140210811852352
+ 140210811852352 [label=AccumulateGrad]
+ 140202223576736 -> 140202223576256
+ 140202223576736 [label=ViewBackward0]
+ 140210811852160 -> 140202223576736
+ 140210811852160 [label=ToCopyBackward0]
+ 140202223540112 -> 140210811852160
+ 140202223575104 -> 140202223576256
+ 140202223575104 [label=TBackward0]
+ 140210811851200 -> 140202223575104
+ 140210811851200 [label=ToCopyBackward0]
+ 140210811852304 -> 140210811851200
+ 140202228905760 [label="encoder.layer.11.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228905760 -> 140210811852304
+ 140210811852304 [label=AccumulateGrad]
+ 140202223540208 -> 140202223540400
+ 140202223540208 [label=TBackward0]
+ 140202223540880 -> 140202223540208
+ 140202223540880 [label=ToCopyBackward0]
+ 140202223541072 -> 140202223540880
+ 140202228905520 [label="encoder.layer.11.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140202228905520 -> 140202223541072
+ 140202223541072 [label=AccumulateGrad]
+ 140202223540112 -> 140202223540160
+ 140202223539920 -> 140202223540064
+ 140202228904160 [label="encoder.layer.11.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228904160 -> 140202223539920
+ 140202223539920 [label=AccumulateGrad]
+ 140202223538576 -> 140202223540064
+ 140202228895312 [label="encoder.layer.11.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228895312 -> 140202223538576
+ 140202223538576 [label=AccumulateGrad]
+ 140202223539056 -> 140202223539680
+ 140202223539056 [label=TBackward0]
+ 140202223538384 -> 140202223539056
+ 140202223538384 [label=ToCopyBackward0]
+ 140202223539632 -> 140202223538384
+ 140202228893872 [label="encoder.layer.11.experts.dense1.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228893872 -> 140202223539632
+ 140202223539632 [label=AccumulateGrad]
+ 140202223539344 -> 140202223539008
+ 140202223539344 [label=TBackward0]
+ 140202223539488 -> 140202223539344
+ 140202223539488 [label=ToCopyBackward0]
+ 140202223538240 -> 140202223539488
+ 140202228893632 [label="encoder.layer.11.experts.dense2.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228893632 -> 140202223538240
+ 140202223538240 [label=AccumulateGrad]
+ 140202223538480 -> 140202228614096
+ 140202228614192 -> 140202228615488
+ 140202228893392 [label="encoder.layer.11.expert_ln.weight
+ (768)" fillcolor=lightblue]
+ 140202228893392 -> 140202228614192
+ 140202228614192 [label=AccumulateGrad]
+ 140202228614336 -> 140202228615488
+ 140202228893472 [label="encoder.layer.11.expert_ln.bias
+ (768)" fillcolor=lightblue]
+ 140202228893472 -> 140202228614336
+ 140202228614336 [label=AccumulateGrad]
+ 140202228614480 -> 140202228657312
+ 140202228614480 [label=NativeLayerNormBackward0]
+ 140202228614432 -> 140202228614480
+ 140202228614432 [label=AddBackward0]
+ 140202223538816 -> 140202228614432
+ 140202223538816 [label=NativeDropoutBackward0]
+ 140202223539392 -> 140202223538816
+ 140202223539392 [label=ViewBackward0]
+ 140202223538432 -> 140202223539392
+ 140202223538432 [label=AddmmBackward0]
+ 140202223540256 -> 140202223538432
+ 140202223540256 [label=ToCopyBackward0]
+ 140202223540592 -> 140202223540256
+ 140202228895152 [label="encoder.layer.11.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140202228895152 -> 140202223540592
+ 140202223540592 [label=AccumulateGrad]
+ 140202223540016 -> 140202223538432
+ 140202223540016 [label=ViewBackward0]
+ 140202223540976 -> 140202223540016
+ 140202223540976 [label=GeluBackward0]
+ 140202223540832 -> 140202223540976
+ 140202223540832 [label=ViewBackward0]
+ 140202223541936 -> 140202223540832
+ 140202223541936 [label=AddmmBackward0]
+ 140202223540736 -> 140202223541936
+ 140202223540736 [label=ToCopyBackward0]
+ 140210811784688 -> 140202223540736
+ 140202228895392 [label="encoder.layer.11.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140202228895392 -> 140210811784688
+ 140210811784688 [label=AccumulateGrad]
+ 140202223575488 -> 140202223541936
+ 140202223575488 [label=ViewBackward0]
+ 140210811852592 -> 140202223575488
+ 140210811852592 [label=ToCopyBackward0]
+ 140202223539200 -> 140210811852592
+ 140202223539200 [label=SliceBackward0]
+ 140210811852640 -> 140202223539200
+ 140210811852640 [label=SliceBackward0]
+ 140210811852736 -> 140210811852640
+ 140210811852736 [label=SliceBackward0]
+ 140202223540064 -> 140210811852736
+ 140202223575248 -> 140202223541936
+ 140202223575248 [label=TBackward0]
+ 140210811852064 -> 140202223575248
+ 140210811852064 [label=ToCopyBackward0]
+ 140210811852832 -> 140210811852064
+ 140202228895632 [label="encoder.layer.11.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140202228895632 -> 140210811852832
+ 140210811852832 [label=AccumulateGrad]
+ 140202223539824 -> 140202223538432
+ 140202223539824 [label=TBackward0]
+ 140202223540784 -> 140202223539824
+ 140202223540784 [label=ToCopyBackward0]
+ 140202223575872 -> 140202223540784
+ 140202228895072 [label="encoder.layer.11.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140202228895072 -> 140202223575872
+ 140202223575872 [label=AccumulateGrad]
+ 140202223539200 -> 140202228614432
+ 140202223538672 -> 140202228614480
+ 140202228894832 [label="encoder.layer.11.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140202228894832 -> 140202223538672
+ 140202223538672 [label=AccumulateGrad]
+ 140202223538624 -> 140202228614480
+ 140202228894912 [label="encoder.layer.11.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140202228894912 -> 140202223538624
+ 140202223538624 [label=AccumulateGrad]
+ 140202228657312 -> 140202223089520
+}
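The DOT source above is the autograd graph of the MoE Q-Former output: lightblue leaves are named parameters feeding AccumulateGrad nodes, the darkolivegreen root is the (1, 46, 768) output tensor, and the expert-gating path is visible as the Mean -> Mm(experts.gate.weight) -> Softmax -> Topk -> Mul -> Sum chain, i.e. raw softmax probabilities over the three experts with top-k selection, matching the "RawProb" naming. A minimal sketch of how such a dump is typically produced, assuming torchviz (the generating script is not part of this diff; qformer and batch are hypothetical stand-ins):

from torchviz import make_dot  # assumption: a torchviz-style dump

out = qformer(batch).last_hidden_state        # (1, 46, 768), the green root node
dot = make_dot(out, params=dict(qformer.named_parameters()))
dot.save("Pre_PromptMoE_RawProb_backward_graph")    # writes the DOT text above
dot.render("Pre_PromptMoE_RawProb_backward_graph")  # renders the .pdf added below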
diff --git a/Pre_PromptMoE_RawProb_backward_graph.pdf b/Pre_PromptMoE_RawProb_backward_graph.pdf
new file mode 100644
index 0000000..54f7e67
Binary files /dev/null and b/Pre_PromptMoE_RawProb_backward_graph.pdf differ
diff --git a/environment.yml b/environment.yml
index 51561c7..5230311 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,4 +1,4 @@
-name: minigptv
+name: promptmoe
channels:
- pytorch
- defaults
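With the rename above, recreating the environment from this file now produces a conda env named promptmoe instead of minigptv (conda env create -f environment.yml, then conda activate promptmoe); the dependency spec itself is unchanged.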
diff --git a/minigpt4/configs/datasets/coco/caption.yaml b/minigpt4/configs/datasets/coco/caption.yaml
index 8d62c89..8e96a13 100644
--- a/minigpt4/configs/datasets/coco/caption.yaml
+++ b/minigpt4/configs/datasets/coco/caption.yaml
@@ -17,14 +17,14 @@ datasets:
# md5: aa31ac474cf6250ebb81d18348a07ed8
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_train.json
- val:
- url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
- storage:
- - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
- test:
- url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
- storage:
- - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
+ # val:
+ # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
+ # storage:
+ # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_val.json
+ # test:
+ # url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
+ # storage:
+ # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/COCO_Cap/coco_karpathy_test.json
images:
storage: /mnt/pfs-guan-ssai/nlu/dingyifeng/data/COCO
diff --git a/minigpt4/configs/datasets/coco/defaults_vqa_eval.yaml b/minigpt4/configs/datasets/coco/defaults_vqa_eval.yaml
index f281d88..7943d6a 100755
--- a/minigpt4/configs/datasets/coco/defaults_vqa_eval.yaml
+++ b/minigpt4/configs/datasets/coco/defaults_vqa_eval.yaml
@@ -20,6 +20,7 @@ datasets:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_val_eval.json
+ # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_OpenEnded_mscoco_val2014_questions.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/v2_mscoco_val2014_annotations.json
@@ -29,6 +30,7 @@ datasets:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_test.json
+ # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/vqa_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/VQAv2/answer_list.json
images:
diff --git a/minigpt4/configs/datasets/okvqa/eval.yaml b/minigpt4/configs/datasets/okvqa/eval.yaml
index 244398c..d58c446 100755
--- a/minigpt4/configs/datasets/okvqa/eval.yaml
+++ b/minigpt4/configs/datasets/okvqa/eval.yaml
@@ -20,6 +20,7 @@ datasets:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json
+ # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/mscoco_val2014_annotations.json
@@ -32,6 +33,7 @@ datasets:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
storage:
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval.json
+ # - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_train.json
# - /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_val_eval_part100.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/okvqa_answer_list_train.json
- /mnt/pfs-guan-ssai/nlu/wanghanzi/data/OKVQA/OpenEnded_mscoco_val2014_questions.json
diff --git a/minigpt4/datasets/datasets/caption_datasets.py b/minigpt4/datasets/datasets/caption_datasets.py
index e412dd4..6b74cb5 100644
--- a/minigpt4/datasets/datasets/caption_datasets.py
+++ b/minigpt4/datasets/datasets/caption_datasets.py
@@ -105,6 +105,8 @@ class COCOCaptionDataset(BaseDataset, __DisplMixin):
'Using language, provide a short account of the image.',
'Use a few words to illustrate what is happening in the picture.',
]
+ self.source = 'coco_cap'
+
def __getitem__(self, index):
# TODO this assumes image input, not general enough
@@ -118,13 +120,20 @@ class COCOCaptionDataset(BaseDataset, __DisplMixin):
image = self.vis_processor(image)
caption = self.text_processor(ann["caption"])
- instruction = random.choice(self.instruction_pool)
- instruction = "<Img><ImageHere></Img> [caption] {} ".format(instruction)
+ # instruction = random.choice(self.instruction_pool)
+ # instruction = "<Img><ImageHere></Img> [caption] {} ".format(instruction)
+ q_input = ""
+ llm_input = random.choice(self.instruction_pool)
return {
"image": image,
+ "image_id": ann["image"],
"answer": caption,
- "instruction_input": instruction,
+ "q_input": q_input,
+ "llm_input": llm_input,
+ "text_input": llm_input,
+ "text_output": caption,
+ "source": 'coco_cap',
}
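A hedged sketch of the sample the rewritten __getitem__ now returns (keys as added above; the values shown are illustrative, not taken from the dataset):

sample = COCOCaptionDataset(...)[0]
# sample == {
#     "image":       <Tensor from vis_processor>,
#     "image_id":    ann["image"],                   # raw image path/name
#     "answer":      "a cat sleeping on a couch",    # processed caption
#     "q_input":     "",                             # Q-Former text prompt left empty
#     "llm_input":   "Briefly describe this image.", # drawn from instruction_pool
#     "text_input":  sample["llm_input"],            # duplicated for the LLM side
#     "text_output": sample["answer"],
#     "source":      "coco_cap",
# }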
class CaptionEvalDataset(BaseDataset, __DisplMixin):
diff --git a/minigpt4/datasets/datasets/coco_caption.py b/minigpt4/datasets/datasets/coco_caption.py
index 76f86e4..e388956 100755
--- a/minigpt4/datasets/datasets/coco_caption.py
+++ b/minigpt4/datasets/datasets/coco_caption.py
@@ -31,6 +31,7 @@ class COCOCapEvalDataset(CaptionEvalDataset):
split (string): val or test
"""
super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+ self.source = 'coco_cap'
def __getitem__(self, index):
ann = self.annotation[index]
diff --git a/minigpt4/datasets/datasets/dataloader_utils.py b/minigpt4/datasets/datasets/dataloader_utils.py
index c827643..08f64da 100644
--- a/minigpt4/datasets/datasets/dataloader_utils.py
+++ b/minigpt4/datasets/datasets/dataloader_utils.py
@@ -31,7 +31,6 @@ class MultiIterLoader:
if ratios is None:
ratios = [1.0] * len(loaders)
else:
- # import pdb; pdb.set_trace()
assert len(ratios) == len(loaders)
ratios = [float(ratio) / sum(ratios) for ratio in ratios]
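The normalization above feeds ratio-weighted sampling; a minimal sketch, assuming MultiIterLoader draws one loader per step with probability proportional to its ratio (as in LAVIS):

import random

ratios = [3.0, 1.0]                                  # e.g. caption : vqa
ratios = [float(r) / sum(ratios) for r in ratios]    # -> [0.75, 0.25]
idx = random.choices(range(len(ratios)), weights=ratios, k=1)[0]
# batch = next(loaders[idx])                         # ~75% caption, ~25% vqa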
diff --git a/minigpt4/eval_scripts/eval_vqa.py b/minigpt4/eval_scripts/eval_vqa.py
index e8aa39d..6d92b11 100644
--- a/minigpt4/eval_scripts/eval_vqa.py
+++ b/minigpt4/eval_scripts/eval_vqa.py
@@ -12,7 +12,6 @@ from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn
-from datasets import load_dataset
import sys
sys.path.append("/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE")
@@ -248,6 +247,7 @@ if 'vsr' in args.dataset:
img_path = cfg.evaluation_datasets_cfg["vsr"]["img_path"]
batch_size = cfg.evaluation_datasets_cfg["vsr"]["batch_size"]
max_new_tokens = cfg.evaluation_datasets_cfg["vsr"]["max_new_tokens"]
+ from datasets import load_dataset
annotation = load_dataset("cambridgeltl/vsr_zeroshot", split='test')
data = VSREvalData(annotation, vis_processor, img_path)
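Relocating the import keeps HuggingFace datasets an optional dependency: it is only loaded when the VSR split is actually evaluated. A sketch of the deferred-import pattern (the wrapper function is hypothetical):

def load_vsr_annotations():
    from datasets import load_dataset  # deferred: only needed for VSR eval
    return load_dataset("cambridgeltl/vsr_zeroshot", split="test")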
diff --git a/minigpt4/models/QformerMoE.py b/minigpt4/models/QformerMoE.py
index 5002448..5cc8c1f 100644
--- a/minigpt4/models/QformerMoE.py
+++ b/minigpt4/models/QformerMoE.py
@@ -386,17 +386,23 @@ class BertOutput(nn.Module): # Add & Norm
class FeedForward(nn.Module):
+ # LayerNorm removed from the FFN; residual + LayerNorm now applied outside via expert_ln (see feed_forward_query_moe)
def __init__(self, config):
- nn.Module.__init__(self)
- # first layer
- self.intermediate_query = BertIntermediate(config)
- # second layer
- self.output_query = BertOutput(config)
+ super().__init__()
+ self.dense1 = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+ self.dense2 = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ # self.dropout = nn.Dropout(0.2)  # optionally raise the dropout ratio 0.1 -> 0.2
def forward(self, hidden_states: Tensor):
- input_tensor = hidden_states
- intermediate_output = self.intermediate_query(hidden_states)
- hidden_states = self.output_query(intermediate_output, input_tensor)
+ hidden_states = self.dense1(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ hidden_states = self.dense2(hidden_states)
+ hidden_states = self.dropout(hidden_states)
return hidden_states
@@ -440,6 +446,7 @@ class BertLayer(nn.Module):
)
else:
self.experts = ffn
+ self.expert_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
@@ -494,7 +501,8 @@ class BertLayer(nn.Module):
moe_ffn_attention_input = query_attention_output[:, :query_length, :]
moe_ffn_attention_mask = attention_mask.squeeze(dim=1).squeeze(dim=1)[:, :query_length]
layer_output = self.feed_forward_query_moe(moe_ffn_attention_input, moe_ffn_attention_mask) # layer_output, gate_loss, gate_load
-
+ # import pdb; pdb.set_trace() # test0107
+
if attention_output.shape[1] > query_length: # have text input in Qformer
layer_output_text = apply_chunking_to_forward(
self.feed_forward_chunk,
@@ -503,6 +511,7 @@ class BertLayer(nn.Module):
attention_output[:, query_length:, :],
)
layer_output = (torch.cat([layer_output[0], layer_output_text], dim=1), layer_output[1], layer_output[2])
+
else:
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk,
@@ -524,15 +533,14 @@ class BertLayer(nn.Module):
def feed_forward_query_moe(self, attention_output, expert_attention_mask):
if not self.use_experts:
- layer_output = self.experts(attention_output)
+ hidden_states = self.experts(attention_output)
+ layer_output = self.expert_ln(hidden_states + attention_output)
return layer_output, 0.0, []
- # if not self.importance_processor.is_moe:
- # raise RuntimeError("Need to turn the model to a MoE first.")
-
- layer_output, gate_loss, gate_load = self.experts(
+ hidden_states, gate_loss, gate_load = self.experts(
attention_output, expert_attention_mask
)
+ layer_output = self.expert_ln(hidden_states + attention_output)
return layer_output, gate_loss, gate_load
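Taken together with the FeedForward rewrite above, the Add & Norm has moved out of each expert and is applied once per layer; a comment-only sketch of the resulting flow:

# Post-refactor flow in feed_forward_query_moe (sketch of the code above):
#   hidden = self.experts(x)             # bare Linear-GELU-Linear-Dropout
#   out    = self.expert_ln(hidden + x)  # one shared residual + LayerNorm
# Expert and non-expert paths now share a single expert_ln per layer instead of
# each FeedForward carrying its own Add & Norm (the old BertOutput behavior).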
class BertEncoder(nn.Module):
diff --git a/minigpt4/models/QformerRouteMoE.py b/minigpt4/models/QformerRouteMoE.py
index 910a7d0..8595dc6 100644
--- a/minigpt4/models/QformerRouteMoE.py
+++ b/minigpt4/models/QformerRouteMoE.py
@@ -46,10 +46,9 @@ from transformers.utils import logging
from transformers.models.bert.configuration_bert import BertConfig
from minigpt4.models.moe.utils import (
- FeedForward,
MoEModelOutput,
MoEModelOutputWithPooling,
- use_experts,
+ use_experts_route,
moe_layer_judge,
)
from minigpt4.models.moe.route_moe_layer import RouteMoELayer
@@ -378,13 +377,14 @@ class BertOutput(nn.Module): # Add & Norm
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
- self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) # 1
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
- hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ # Move LayerNorm & residual connection out of the FFN to after the MoE FFN
+ hidden_states = self.LayerNorm(hidden_states + input_tensor) # 1
return hidden_states
@@ -429,7 +429,7 @@ class BertLayer(nn.Module):
self.output_query = BertOutput(config)
# Add MoE FFN
- self.use_experts = use_experts(layer_num)
+ self.use_experts = use_experts_route(layer_num)
self.layer_judge = moe_layer_judge(layer_num)
self.num_beams = config.moebert_num_beams
ffn = FeedForward(config)
@@ -442,10 +442,13 @@ class BertLayer(nn.Module):
num_beams=config.moebert_num_beams,
layer_judge = self.layer_judge,
route_method=config.route_method,
+ weight_type=config.moe_weight_type,
)
else:
self.experts = ffn
+ # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
def forward(
self,
hidden_states,
@@ -463,8 +466,8 @@ class BertLayer(nn.Module):
self_attn_past_key_value = (
past_key_value[:2] if past_key_value is not None else None
)
- # import pdb;pdb.set_trace()
-
+ # import pdb; pdb.set_trace() # 0107test
+
# adjust the dimension of hidden_states, attention_mask, encoder_attention_mask and encoder_hidden_states to be the same
if self.num_beams > 1:
if hidden_states.shape[0]== attention_mask.shape[0]*self.num_beams:
@@ -494,10 +497,6 @@ class BertLayer(nn.Module):
present_key_value = self_attention_outputs[-1]
- # import pdb;pdb.set_trace()
- # print(self.layer_num, hidden_states.shape, attention_mask.shape)
-
-
if query_length > 0:
query_attention_output = attention_output[:, :query_length, :]
@@ -526,7 +525,8 @@ class BertLayer(nn.Module):
moe_ffn_attention_input = query_attention_output[:, :query_length, :]
moe_ffn_attention_mask = attention_mask.squeeze(dim=1).squeeze(dim=1)[:, :query_length]
layer_output = self.feed_forward_query_moe(moe_ffn_attention_input, moe_ffn_attention_mask, beam_scores, expert_route)
- # layer_output = (layer_output, beam_scores, expert_route, beam_idx)
+ # layer_output = (layer_output, beam_scores, expert_route, beam_idx, importance_loss)
+ # import pdb; pdb.set_trace() # 0107test
if attention_output.shape[1] > query_length: # have text input in Qformer
layer_output_text = apply_chunking_to_forward(
@@ -535,7 +535,8 @@ class BertLayer(nn.Module):
self.seq_len_dim,
attention_output[:, query_length:, :],
)
- if layer_output[0].shape[0] == layer_output_text.shape[0]*self.num_beams and self.num_beams>1:
+ if self.layer_judge == 'first' and self.num_beams>1:
+ # if layer_output[0].shape[0] == layer_output_text.shape[0]*self.num_beams and self.num_beams>1:
# adjust the dimension of layer_output_text to bz*num_beams
layer_output_text = self.adjust_layer_output_text(layer_output_text)
@@ -550,7 +551,8 @@ class BertLayer(nn.Module):
# layer_output & layer_output_text dimen_0 from bz*num_beams to bz
layer_output, layer_output_text = self.route_moe_last_layer_top1(layer_output, layer_output_text)
- layer_output = (torch.cat([layer_output[0], layer_output_text], dim=1), layer_output[1], layer_output[2])
+ layer_output = (torch.cat([layer_output[0], layer_output_text], dim=1), layer_output[1], layer_output[2], layer_output[3],layer_output[4])
+ # import pdb; pdb.set_trace() # 0107test
else:
layer_output = apply_chunking_to_forward(
@@ -559,7 +561,7 @@ class BertLayer(nn.Module):
self.seq_len_dim,
attention_output,
)
- layer_output = (layer_output, None, None)
+ layer_output = (layer_output, None, None, None, 0.0)
outputs = (layer_output,) + outputs
@@ -594,24 +596,27 @@ class BertLayer(nn.Module):
beam_scores_new = beam_scores[selects]
expert_route_new = expert_route[selects]
- return (hidden_states_new, beam_scores_new, expert_route_new), layer_output_text
+ return (hidden_states_new, beam_scores_new, expert_route_new, layer_output[3], layer_output[4]), layer_output_text
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
+ # layer_output = self.LayerNorm(layer_output + attention_output)
return layer_output
def feed_forward_query_moe(self, attention_output, expert_attention_mask, beam_scores, expert_route):
-
if not self.use_experts:
layer_output = self.experts(attention_output)
- return layer_output, None, None, None
+ # layer_output = self.LayerNorm(layer_output + attention_output)
+ return layer_output, None, None, None, 0.0
- layer_output, beam_scores, expert_route, beam_idx = self.experts(
+ layer_output, beam_scores, expert_route, beam_idx, importance_loss = self.experts(
attention_output, expert_attention_mask, beam_scores, expert_route
)
- return layer_output, beam_scores, expert_route, beam_idx
+
+ # layer_output = self.LayerNorm(layer_output + attention_output)
+ return layer_output, beam_scores, expert_route, beam_idx, importance_loss
class BertEncoder(nn.Module):
def __init__(self, config):
@@ -645,6 +650,7 @@ class BertEncoder(nn.Module):
next_decoder_cache = () if use_cache else None
beam_scores=None
expert_route=None
+ importance_loss = 0
for i in range(self.config.num_hidden_layers):
layer_module = self.layer[i]
@@ -693,6 +699,7 @@ class BertEncoder(nn.Module):
hidden_states = layer_outputs[0][0]
beam_scores = beam_scores if layer_outputs[0][1] == None else layer_outputs[0][1]
expert_route = expert_route if layer_outputs[0][2] == None else layer_outputs[0][2]
+ importance_loss += layer_outputs[0][4]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
@@ -724,6 +731,7 @@ class BertEncoder(nn.Module):
cross_attentions=all_cross_attentions,
beam_scores=beam_scores,
expert_route=expert_route,
+ gate_loss=importance_loss,
)
@@ -1103,6 +1111,7 @@ class BertModel(BertPreTrainedModel):
cross_attentions=encoder_outputs.cross_attentions,
beam_scores=encoder_outputs.beam_scores,
expert_route=encoder_outputs.expert_route,
+ gate_loss=encoder_outputs.gate_loss
)
diff --git a/minigpt4/models/blip2.py b/minigpt4/models/blip2.py
index d79f31d..a6bf474 100644
--- a/minigpt4/models/blip2.py
+++ b/minigpt4/models/blip2.py
@@ -62,7 +62,7 @@ class Blip2Base(BaseModel):
return Qformer, query_tokens
@classmethod
- def init_RouteMoEQformer(cls, num_query_token, vision_width, moebert_expert_num, moebert_num_beams, route_method, cross_attention_freq=2):
+ def init_RouteMoEQformer(cls, num_query_token, vision_width, moebert_expert_num, moebert_num_beams, route_method, moe_weight_type, cross_attention_freq=2):
moe_encoder_config = BertConfig.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased")
moe_encoder_config.encoder_width = vision_width
@@ -74,6 +74,7 @@ class Blip2Base(BaseModel):
moe_encoder_config.moebert_expert_num = moebert_expert_num
moe_encoder_config.moebert_num_beams = moebert_num_beams
moe_encoder_config.route_method = route_method
+ moe_encoder_config.moe_weight_type = moe_weight_type
RouteMoEQformer = BertMoERouteLMHeadModel.from_pretrained(
"/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased", config=moe_encoder_config
diff --git a/minigpt4/models/blip2_vicuna_instruct.py b/minigpt4/models/blip2_vicuna_instruct.py
index 34acf28..13421ab 100644
--- a/minigpt4/models/blip2_vicuna_instruct.py
+++ b/minigpt4/models/blip2_vicuna_instruct.py
@@ -99,6 +99,7 @@ class Blip2VicunaInstruct(Blip2Base):
moebert_expert_num=moebert_expert_num,
moebert_num_beams=moebert_num_beams,
route_method=moebert_route_method,
+ moe_weight_type=moe_weight_type,
cross_attention_freq=2
)
else:
@@ -118,7 +119,6 @@ class Blip2VicunaInstruct(Blip2Base):
num_query_token, self.visual_encoder.num_features
)
- # import pdb;pdb.set_trace()
if not qformer_text_input:
self.Qformer.bert.embeddings.word_embeddings = None
self.Qformer.bert.embeddings.position_embeddings = None
@@ -178,6 +178,19 @@ class Blip2VicunaInstruct(Blip2Base):
if "_query" in name and "experts" not in name: # raw ffn_query not update
param.requires_grad = False
+ ln_pattern = r"bert\.encoder\.layer\.\d+\.expert_ln\.(weight|bias)"
+ if re.match(ln_pattern, name):
+ key_orig = re.sub('expert_ln', 'output_query.LayerNorm', name)
+ param.data.copy_(state_dict[key_orig])
+ d1_pattern = r"bert\.encoder\.layer\.(\d+)\.experts(\.|\.experts\.\d+\.)dense1\.(weight|bias)"
+ if re.match(d1_pattern, name):
+ key_orig = re.sub(r'experts(\.|\.experts\.\d+\.)dense1', 'intermediate_query.dense', name)
+ param.data.copy_(state_dict[key_orig])
+ d2_pattern = r"bert\.encoder\.layer\.(\d+)\.experts(\.|\.experts\.\d+\.)dense2\.(weight|bias)"
+ if re.match(d2_pattern, name):
+ key_orig = re.sub(r'experts(\.|\.experts\.\d+\.)dense2', 'output_query.dense', name)
+ param.data.copy_(state_dict[key_orig])
+
# freeze qformer
if freeze_qformer:
for name, param in self.Qformer.named_parameters():
@@ -205,6 +218,7 @@ class Blip2VicunaInstruct(Blip2Base):
self.use_moeqformer = use_moeqformer
self.use_route_moe = use_route_moe
self.moebert_load_balance = moebert_load_balance
+ self.moebert_num_beams = moebert_num_beams
self.gate_save_path = gate_save_path
# if self.gate_save_path != None:
@@ -242,7 +256,7 @@ class Blip2VicunaInstruct(Blip2Base):
# print(samples["text_input"])
# print(samples["text_output"])
# print('-----------------')
- # import pdb;pdb.set_trace()
+ # import pdb;pdb.set_trace() # 0107test
image = samples["image"]
with self.maybe_autocast():
image_embeds = self.ln_vision(self.visual_encoder(image))
@@ -278,10 +292,10 @@ class Blip2VicunaInstruct(Blip2Base):
return_dict=True,
output_hidden_states=True,
)
-
+ # import pdb; pdb.set_trace()# 0107test
query_output_to_linear = query_output.last_hidden_state[:,:query_tokens.size(1),:]
- if self.use_moeqformer and not self.use_route_moe:
+ if self.use_moeqformer:
gate_loss = query_output.gate_loss # only available in QformerMoE
if self.gate_save_path != None:
@@ -312,7 +326,7 @@ class Blip2VicunaInstruct(Blip2Base):
# 'gate_route_1': prob_gate_normalized[0][i].tolist(),
})
# for layer in [6,8,10]:
- # layer_data = all_hidden_states[layer]
+ # layer_data = all_hidden_states[layer]
# file_path = os.path.join(self.gate_save_path, f'{image_id}_{str(layer)}.npy')
# x = layer_data.data.cpu().numpy()
# np.save(file_path,x)
@@ -323,7 +337,6 @@ class Blip2VicunaInstruct(Blip2Base):
print("Gate Save Error....")
print(e)
-
inputs_llm = self.llm_proj(query_output_to_linear)
atts_llm = torch.ones(inputs_llm.size()[:-1], dtype=torch.long).to(image.device)
@@ -380,7 +393,7 @@ class Blip2VicunaInstruct(Blip2Base):
labels=targets,
)
- if self.use_moeqformer and not self.use_route_moe:
+ if self.use_moeqformer:
loss = outputs.loss + self.moebert_load_balance * gate_loss
else:
loss = outputs.loss
@@ -441,6 +454,8 @@ class Blip2VicunaInstruct(Blip2Base):
output_hidden_states=True,
)
+ # import pdb; pdb.set_trace()
+
if self.gate_save_path != None:
all_hidden_states = query_output.hidden_states
# prob_gate_normalized = query_output.gate_loads
@@ -471,11 +486,11 @@ class Blip2VicunaInstruct(Blip2Base):
# 'gate_route_3': prob_gate_normalized[2][i].tolist(),
# 'gate_route_1': prob_gate_normalized[0][i].tolist(),
})
- for layer in [6,8,10]:
- if layer == 6:
- layer_data = all_hidden_states[layer][i, :32, :]
+ for layer in [6,7,8,9,10,11]:
+ if layer in [6,11]:
+ layer_data = all_hidden_states[layer][i, :, :]
else:
- layer_data = all_hidden_states[layer][i*3, :32, :]
+ layer_data = all_hidden_states[layer][i*self.moebert_num_beams, :, :]
file_path = os.path.join(self.gate_save_path, f'{image_id}_{str(layer)}.npy')
x = layer_data.data.cpu().numpy()
np.save(file_path,x) # all done
@@ -683,5 +698,6 @@ class Blip2VicunaInstruct(Blip2Base):
for name, param in model.named_parameters():
if param.requires_grad == True:
print(name)
-
+ # [name for name, param in model.named_parameters() if (param.requires_grad == False and 'Qformer' in name and 'intermediate_query' in name)]
+ # import pdb; pdb.set_trace()# 0107test
return model
diff --git a/minigpt4/models/moe/beam_search.py b/minigpt4/models/moe/beam_search.py
index 676d707..c4b3c5b 100644
--- a/minigpt4/models/moe/beam_search.py
+++ b/minigpt4/models/moe/beam_search.py
@@ -21,7 +21,6 @@ class MoELayer(nn.Module):
else:
raise KeyError("Routing method not supported.")
-
def _forward_gate_sentence(self, x, attention_mask):
"""
x: query_attention_output , torch.Size([bz, 32, 768])
@@ -77,7 +76,65 @@ class MoELayer(nn.Module):
print('Layer Qformer MoE: \n',prob_gate)
return moe_result, select_prob_gate, gate
+ def _forward_gate_sentence_post(self, x, attention_mask):
+ """
+ x: query_attention_output; torch.Size([bz, 32, 768])
+ attention_mask: torch.ones([bz, 32])
+ bz = 4
+ x = torch.randn(bz,32,768)
+ attention_mask = torch.ones([bz, 32])
+ """
+ attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
+ x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
+
+ def forward_expert(input_x, expert_idx):
+ # input_x += torch.randn(4,32,768)
+ # return input_x
+ output_x = self.experts[expert_idx].forward(input_x)
+ return output_x
+
+ outputs = list()
+ logits_gate_lst = list()
+ for expert_idx in range(self.num_experts):
+ output_x = forward_expert(x_masked, expert_idx)
+ outputs.append(output_x.unsqueeze(0))
+
+ output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768])
+ # gate_score = self.gates[expert_idx](output_x_aver)
+ gate_score = self.gate(output_x_aver)
+ logits_gate_lst.append(gate_score)
+
+ candidate_output = torch.cat(outputs) # torch.Size([num_expert, bz, 32, 768])
+ logits_gate = torch.cat(logits_gate_lst,dim=1)# torch.Size([bz, num_expert])
+ prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts])
+ topk_values, gate = torch.topk(prob_gate, self.topk, dim=1) # gate: the experts assigned to each sample, torch.Size([bz, topk])
+ num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # number of samples assigned to each expert, torch.Size([num_expert])
+ gate_load = num_sentences.clone()
+
+ # load balancing loss
+ if self.use_balance_loss:
+ balance_loss = self._balancing_loss(prob_gate, num_sentences)
+ else:
+ balance_loss = 0.0
+
+ # importance loss
+ importance_loss = self._importance_auxiliary_loss(prob_gate)
+
+ # output_average = candidate_output.sum(2) / candidate_attn_mask.unsqueeze(-1).sum(2) # torch.Size([num_expert, bz, 768])
+ # output_average = torch.permute(output_average, (1, 0, 2)) # torch.Size([bz, num_expert, 768])
+ # logits_gate = self.gate(output_average) # torch.Size([bz, num_experts, 1])
+
+ prob_gate_topk = torch.zeros_like(prob_gate)
+ prob_gate_topk.scatter_(1, gate, topk_values)
+ prob_gate_normalized = prob_gate_topk / prob_gate_topk.sum(dim=1, keepdim=True) # torch.Size([bz, num_expert])
+ candidate_output_ad = torch.permute(candidate_output, (1, 0, 2, 3)) # torch.Size([bz, num_expert, 32, 768])
+ results = prob_gate_normalized.unsqueeze(-1).unsqueeze(-1) * candidate_output_ad # torch.Size([bz, num_expert, 32, 768])
+ moe_result = torch.sum(results, dim=1) # torch.Size([bz, 32, 768])
+ # import pdb;pdb.set_trace()
+
+ return moe_result, (balance_loss+importance_loss), prob_gate_normalized
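+ # Example of the top-k renormalization above (hypothetical values, topk=2):
+ # prob_gate = [[0.5, 0.3, 0.2]] -> prob_gate_topk = [[0.5, 0.3, 0.0]]
+ # -> prob_gate_normalized = [[0.625, 0.375, 0.0]], the per-expert mixing weights.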
+
def forward(self, x, attention_mask):
if self.route_method == "gate-token":
x, balance_loss, gate_load = self._forward_gate_token(x)
@@ -95,7 +152,7 @@ class MoELayer(nn.Module):
class RouteMoELayer(nn.Module):
- def __init__(self, hidden_size, expert, gate, num_experts, num_beams=2, layer_judge=None, route_method="pre-route"):
+ def __init__(self, hidden_size, expert, num_experts, num_beams=2, layer_judge=None, route_method="pre-route", weight_type="ffn_prob"):
# remove hash list
nn.Module.__init__(self)
self.num_experts = num_experts
@@ -103,13 +160,26 @@ class RouteMoELayer(nn.Module):
self.num_beams = num_beams
self.hidden_size = hidden_size
self.layer_judge = layer_judge
+ self.weight_type = weight_type
self.route_method = route_method
if self.route_method == "pre-route":
self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
elif self.route_method == "post-route":
- # gate = nn.Linear(hidden_size, 1, bias=False).float()
- self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
+ gate = nn.Linear(hidden_size, 1, bias=False).float()
+ self.gate = gate
+ # self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
+
+ def _importance_auxiliary_loss(self, prob_gate):
+ # From VMOE
+ # _importance_auxiliary_loss
+ axis = tuple(range(prob_gate.ndim - 1)) # All except last.
+ importance_per_expert = torch.sum(prob_gate, dim=axis)
+ std_importance_per_expert = torch.std(importance_per_expert)
+ mean_importance_per_expert = torch.mean(importance_per_expert)
+ # Compute coefficient of variation (i.e. std/mean) squared.
+ return (std_importance_per_expert / mean_importance_per_expert)**2
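+ # Worked example (hypothetical batch): prob_gate = [[0.6, 0.4], [0.7, 0.3]]
+ # -> importance_per_expert = [1.3, 0.7], mean = 1.0, unbiased std ~= 0.4243,
+ # so the loss is (0.4243 / 1.0)**2 = 0.18; it grows as expert usage gets uneven.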
+
def forward_gate(self, x):
"""
@@ -123,19 +193,21 @@ class RouteMoELayer(nn.Module):
prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts])
return prob_gate
- def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
- import pdb;pdb.set_trace()
+
+ def beam_search_backup(self, current_scores_log, beam_scores, expert_route, batch_size):
if self.layer_judge=='first' and self.route_method=='pre-route':
+ # current_scores_log torch.Size([bz, num_experts])
assert beam_scores==None and expert_route==None
current_scores = torch.exp(current_scores_log)
topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate: the experts assigned to each sample, torch.Size([bz, topk])
beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
- beam_idx = None
+ beam_idx = torch.tensor(range(self.num_beams * batch_size))
+
else:
if self.layer_judge=='first' and self.route_method == 'post-route':
batch_size = batch_size
- next_scores_raw1 = torch.exp(current_scores_log) # torch.Size([bz, num_experts])
+ next_scores_raw1 = torch.exp(current_scores_log) # torch.Size([bz, num_beams*num_experts])
else:
batch_size = int(batch_size // self.num_beams)
next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # in log space, probabilities can be accumulated by addition
@@ -147,9 +219,6 @@ class RouteMoELayer(nn.Module):
next_scores, next_experts = torch.topk(next_scores_raw1, self.num_beams, dim=1, largest=True, sorted=True)
# next_scores torch.Size([bz, num_beams])
# next_tokens torch.Size([bz, num_beams])
- print(next_scores_raw1)
- print(next_scores)
- print(next_experts)
next_batch_beam = list()
for batch_idx in range(batch_size):
@@ -166,7 +235,7 @@ class RouteMoELayer(nn.Module):
next_batch_beam.extend(next_sent_beam)
import pdb;pdb.set_trace()
-
+
if self.layer_judge=='first' and self.route_method == 'post-route':
beam_scores = next_scores.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
expert_route = next_experts.view(self.num_beams * batch_size)
@@ -181,33 +250,91 @@ class RouteMoELayer(nn.Module):
pre_route = expert_route[beam_idx,:]
expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
- import pdb;pdb.set_trace()
+ return beam_scores, expert_route, beam_idx
+
+ def beam_search(self, current_scores_log, beam_scores, expert_route, batch_size):
+ if self.layer_judge=='first' and self.route_method in ['pre-route', 'post-route']:
+ # current_scores_log torch.Size([bz, num_experts])
+ assert beam_scores==None and expert_route==None
+ current_scores = torch.exp(current_scores_log)
+ topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate: the experts assigned to each sample, torch.Size([bz, topk])
+ beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
+ expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
+ beam_idx = torch.tensor(range(self.num_beams * batch_size))
+ # import pdb;pdb.set_trace()
+
+ else:
+ batch_size = int(batch_size // self.num_beams)
+ next_scores_raw = current_scores_log + torch.log(beam_scores).unsqueeze(1) # torch.Size([4*3, 5]) # in log space, probabilities can be accumulated by addition
+ next_scores_exp = torch.exp(next_scores_raw)
+ next_scores_raw1 = next_scores_exp.view(
+ batch_size, self.num_beams * self.num_experts
+ ) # torch.Size([bz, num_beams*num_experts])
+
+ next_scores, next_experts = torch.topk(next_scores_raw1, self.num_beams, dim=1, largest=True, sorted=True)
+ # next_scores torch.Size([bz, num_beams])
+ # next_tokens torch.Size([bz, num_beams])
+
+ next_batch_beam = list()
+ for batch_idx in range(batch_size):
+ next_sent_beam = list()
+ for rank, (expert_id, expert_score) in enumerate(
+ zip(next_experts[batch_idx], next_scores[batch_idx])
+ ):
+ expert_id = expert_id.item()
+ beam_id = expert_id // self.num_experts
+ ex_id = expert_id % self.num_experts
+ effective_beam_id = batch_idx*self.num_beams + beam_id
+
+ next_sent_beam.append((expert_score, ex_id, effective_beam_id))
+ next_batch_beam.extend(next_sent_beam)
+
+ # import pdb;pdb.set_trace()
+
+ beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
+ beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
+ beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
+ pre_route = expert_route[beam_idx,:]
+ expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
+
+ print("next_scores_raw1:\n",next_scores_raw1)
return beam_scores, expert_route, beam_idx
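+ # Flat-index decoding in the branch above, e.g. num_beams=2, num_experts=3
+ # (hypothetical sizes): next_scores_raw1 is [bz, 6]; a top index of 4 gives
+ # beam_id = 4 // 3 = 1 and ex_id = 4 % 3 = 1, i.e. beam 1 is extended with
+ # expert 1, and effective_beam_id = batch_idx * 2 + 1 selects that beam's row.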
-
- def forward_expert_ffn(self, x, expert_select, beam_scores):
+
+
+ def forward_expert_ffn(self, x, expert_select, current_scores):
"""
x_repeat : [bz*num_beams, 32,768]
expert_select : [bz*num_beams]
+ current_scores : [bz*num_beams, num_experts] / [bz, num_experts]
"""
- # add_1212 l2_normalization
- # normalized_tensor = torch.nn.functional.normalize(beam_scores, p=2, dim=0) # L2 Normalization torch.Size([bz, topk])
+ # add_1228 l2_normalization
+ # normalized_tensor = torch.nn.functional.normalize(current_scores, p=2, dim=0) # L2 Normalization torch.Size([bz, topk])
# tmp_prob = normalized_tensor.unsqueeze(-1).unsqueeze(-1)
-
+ # import pdb;pdb.set_trace()
outputs = list()
- for i in range(x.shape[0]):
- output_x = self.experts[expert_select[i]].forward(x[i])
- outputs.append(output_x.unsqueeze(0))
- candidate_output = torch.cat(outputs)
+ for i in range(self.num_experts):
+ output_x = self.experts[i].forward(x)
+ outputs.append(output_x.unsqueeze(1))
+ candidate_output = torch.cat(outputs, dim=1)
+ expert_select_matrix = F.one_hot(expert_select, self.num_experts)
- # candidate_output = candidate_output * tmp_prob
- return candidate_output # torch.Size([bz*num_beams, 32, 768])
+ if self.weight_type == 'ffn_prob':
+ tmp_prob = current_scores * expert_select_matrix
+ candidate_output = candidate_output * tmp_prob.unsqueeze(-1).unsqueeze(-1)
+ else:
+ candidate_output = candidate_output * expert_select_matrix.unsqueeze(-1).unsqueeze(-1)
+ # import pdb;pdb.set_trace()
+ output = torch.sum(candidate_output, dim=1)
+ return output # torch.Size([bz*num_beams, 32, 768])
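+ # Example of the one-hot weighting (hypothetical, num_experts=3): for one row
+ # with expert_select = 2, expert_select_matrix = [0, 0, 1]; under 'ffn_prob'
+ # with current_scores = [0.2, 0.3, 0.5], tmp_prob = [0, 0, 0.5], so the sum
+ # over dim=1 keeps only expert 2's output, scaled by its gate probability 0.5.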
def forward_pre_route(self, x, beam_scores, expert_route, use_log=True):
-
- current_scores = self.forward_gate(x) # [bz*num_beams, 5]
+ # import pdb;pdb.set_trace()
+ current_scores = self.forward_gate(x) # [bz, num_experts] / [bz*num_beams, num_experts]
+
+ importance_loss = self._importance_auxiliary_loss(current_scores)
if use_log:
current_scores_log = torch.log(current_scores) # in log space, scores can be accumulated by addition
@@ -215,42 +342,45 @@ class RouteMoELayer(nn.Module):
current_scores_log = current_scores
batch_size, num_tokens = x.shape[0], x.shape[1]
- beam_scores, expert_route, _ = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
-
+ beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
current_expert_select = expert_route[:,-1]
+ # import pdb;pdb.set_trace()
+
if self.layer_judge=='first': # expand first dim to batch_size * num_beams
replicated_tensor = x.unsqueeze(1).expand(batch_size, self.num_beams, num_tokens, self.hidden_size)
x = replicated_tensor.contiguous().view(-1, num_tokens, self.hidden_size) # [bz*num_beams, 32,768]
+ current_scores_t = current_scores.unsqueeze(1).expand(batch_size, self.num_beams, self.num_experts)
+ current_scores = current_scores_t.contiguous().view(-1, self.num_experts) # [bz*num_beams, num_experts]
- candidate_output = self.forward_expert_ffn(x, current_expert_select, beam_scores) # [bz*num_beams, 32,768]
-
- return candidate_output, beam_scores, expert_route
+ input_x = x[beam_idx]
+ candidate_output = self.forward_expert_ffn(input_x, current_expert_select, current_scores) # [bz*num_beams, 32,768]
+ # import pdb;pdb.set_trace()
+ return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
def forward_post_route(self, x, beam_scores, expert_route, use_log=True):
- # if self.layer_judge=='first': # expand first dim to batch_size * num_beams
- # batch_size, num_tokens = x.shape[0], x.shape[1]
- # replicated_tensor = x.unsqueeze(1).expand(batch_size, self.num_beams, num_tokens, self.hidden_size)
- # x = replicated_tensor.contiguous().view(-1, num_tokens, self.hidden_size) # [bz*num_beams, 32,768]
-
attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device)
x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
-
+
def forward_expert(input_x, expert_idx):
output_x = self.experts[expert_idx].forward(input_x)
return output_x
+ # import pdb; pdb.set_trace()
outputs = list()
logits_gate_lst = list()
for expert_idx in range(self.num_experts):
output_x = forward_expert(x_masked, expert_idx)
+ # output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beam, 768])
+ output_x_aver = torch.mean(output_x, dim=1)
+ # gate_score = self.gates[expert_idx](output_x_aver)
+ gate_score = self.gate(output_x_aver)
+ logits_gate_lst.append(gate_score)
outputs.append(output_x.unsqueeze(0))
- output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beam, 768])
- gate_acore = self.gates[expert_idx](output_x_aver)
- logits_gate_lst.append(gate_acore)
- candidate_output = torch.cat(outputs) # torch.Size([num_expert, bz*num_beam, 32, 768])
+
+ candidate_output_raw = torch.cat(outputs) # torch.Size([num_expert, bz*num_beam, 32, 768])
logits_gate = torch.cat(logits_gate_lst,dim=1)# torch.Size([bz*num_beam, num_expert])
current_scores = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beam, num_experts])
@@ -259,25 +389,39 @@ class RouteMoELayer(nn.Module):
else:
current_scores_log = current_scores
- import pdb;pdb.set_trace()
+ # importance loss
+ importance_loss = self._importance_auxiliary_loss(current_scores)
+
+ # import pdb; pdb.set_trace()
- batch_size = x.shape[0] # bz*num_beam
+ batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
# beam_scores torch.Size([bz*num_beam])
# expert_route torch.Size([bz*num_beam, layer_n])
current_select_expert = expert_route[:,-1]
+ # current_select_expert torch.Size([bz*num_beam, 1])
- output = list()
- for i in range(beam_idx.shape[0]):
- b_idx = beam_idx[i]
- ex_idx = current_select_expert[i]
- ex_out = candidate_output[ex_idx, b_idx, :,:]
- output.append(ex_out.unsqueeze(0))
-
- final_output = torch.concat(output, dim=0)
-
- return final_output, beam_scores, expert_route, beam_idx
+ # import pdb; pdb.set_trace()
+
+ if self.layer_judge == 'first':
+ replicated_tensor = candidate_output_raw.unsqueeze(2).expand(self.num_experts, batch_size, self.num_beams, num_tokens, self.hidden_size)
+ candidate_output_raw = replicated_tensor.contiguous().view(self.num_experts, -1, num_tokens, self.hidden_size) # [num_experts, bz*num_beams, 32, 768]
+ current_scores_t = current_scores.unsqueeze(1).expand(batch_size, self.num_beams, self.num_experts)
+ current_scores = current_scores_t.contiguous().view(-1, self.num_experts) # [bz*num_beams, num_experts]
+
+ candidate_output = candidate_output_raw.permute(1, 0, 2, 3)[beam_idx] # torch.Size([8, 2, 32, 768])
+ expert_select_matrix = F.one_hot(current_select_expert, self.num_experts)
+ if self.weight_type == 'ffn_prob':
+ tmp_prob = current_scores[beam_idx] * expert_select_matrix
+ output = candidate_output * tmp_prob.unsqueeze(-1).unsqueeze(-1)
+ else:
+ output = candidate_output * expert_select_matrix.unsqueeze(-1).unsqueeze(-1)
+ final_output = torch.sum(output, dim=1)
+
+ # import pdb; pdb.set_trace()
+ # print("current_scores:\n", current_scores)
+ return final_output, beam_scores, expert_route, beam_idx, importance_loss
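+ # Gathering sketch: candidate_output_raw is [num_experts, bz*num_beams, 32, 768];
+ # permute(1, 0, 2, 3)[beam_idx] reorders rows to the surviving beams, and the
+ # one-hot expert_select_matrix (scaled by the gate probability when
+ # weight_type == 'ffn_prob') zeroes every expert but the routed one before the
+ # sum over the expert dimension.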
def forward(self, x, attention_mask, beam_scores, expert_route, use_log=True):
"""
@@ -286,13 +430,12 @@ class RouteMoELayer(nn.Module):
"""
if self.route_method == 'pre-route':
- candidate_output, beam_scores, expert_route, _ = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
+ candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
elif self.route_method == "post-route":
- candidate_output, beam_scores, expert_route, beam_idx = self.forward_post_route(x, beam_scores, expert_route, use_log=True)
+ candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True)
- return candidate_output, beam_scores, expert_route, beam_idx
+ return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
-
if __name__ == '__main__':
import sys
@@ -314,8 +457,8 @@ if __name__ == '__main__':
config.add_cross_attention = True
config.cross_attention_freq = cross_attention_freq
config.query_length = num_query_token
- config.moebert_expert_num = 3
- config.moebert_num_beams = 3
+ config.moebert_expert_num = 2
+ config.moebert_num_beams = 2
config.moebert_route_method = 'gate-sentence'
config.moe_topk = 2
config.use_balance_loss = False
@@ -332,40 +475,46 @@ if __name__ == '__main__':
for layer_num in [6, 8, 10]:
layer_judge = moe_layer_judge(layer_num)
ffn = FeedForward(config)
- gate = nn.Linear(768, config.moebert_expert_num, bias=False).float()
# experts = RouteMoELayer(
# hidden_size=768,
# expert=ffn,
- # gate = gate,
# num_experts=config.moebert_expert_num,
# num_beams=config.moebert_num_beams,
# layer_judge = layer_judge,
- # route_method = "pre-route"
+ # route_method = "pre-route",
+ # weight_type="no_ffn_prob"
# )
# layer_output = experts(x, None, beam_scores, expert_route)
- # hidden_states1, beam_scores, expert_route,_ = layer_output
+ # hidden_states1, beam_scores, expert_route, beam_idx, importance_loss = layer_output
# print(beam_scores)
# print(expert_route)
+ # print(beam_idx)
+ # print(importance_loss)
+ # x = hidden_states1
gate1 = nn.Linear(768, 1, bias=False).float()
experts_post = RouteMoELayer(
hidden_size=768,
expert=ffn,
- gate = gate1,
num_experts=config.moebert_expert_num,
num_beams=config.moebert_num_beams,
layer_judge = layer_judge,
- route_method = "post-route"
+ route_method = "post-route",
+ weight_type="ffn_prob"
)
layer_output = experts_post(x1, None, beam_scores1, expert_route1, False)
- hidden_states2, beam_scores1, expert_route1, beam_idx = layer_output
+ hidden_states2, beam_scores1, expert_route1, beam_idx, importance_loss = layer_output
print(beam_scores1)
print(expert_route1)
print(beam_idx)
+ print(importance_loss)
+ x1 = hidden_states2
+
+ # gate = nn.Linear(768, config.moebert_expert_num, bias=False).float()
# experts_moe = MoELayer(
# hidden_size=config.hidden_size,
# expert=ffn,
@@ -382,11 +531,62 @@ if __name__ == '__main__':
# print(select_prob_gate)
# print(gate_load)
-
-
- # x = hidden_states1
- x1 = hidden_states2
# x2 = hidden_states3
print("------------------------------------")
+ # import pdb; pdb.set_trace()
+
+
+
+ def forward_post_route_backup(self, x, beam_scores, expert_route, use_log=True):
+
+ attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device)
+ x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
+
+ def forward_expert(input_x, expert_idx):
+ output_x = self.experts[expert_idx].forward(input_x)
+ return output_x
+
+ outputs = list()
+ logits_gate_lst = list()
+ for expert_idx in range(self.num_experts):
+ output_x = forward_expert(x_masked, expert_idx)
+ outputs.append(output_x.unsqueeze(0))
+ # output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beam, 768])
+ # gate_score = self.gates[expert_idx](output_x_aver)
+ output_x_aver = torch.mean(output_x, dim=1)
+ gate_score = self.gate(output_x_aver)
+ logits_gate_lst.append(gate_score)
+ candidate_output = torch.cat(outputs) # torch.Size([num_expert, bz*num_beam, 32, 768])
+ logits_gate = torch.cat(logits_gate_lst,dim=1)# torch.Size([bz*num_beam, num_expert])
+ current_scores = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beam, num_experts])
+
+ if use_log:
+ current_scores_log = torch.log(current_scores) # in log space, scores can be accumulated by addition
+ else:
+ current_scores_log = current_scores
+
+ # importance loss
+ importance_loss = self._importance_auxiliary_loss(current_scores)
+
+ batch_size = x.shape[0] # bz*num_beam
+ beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
+ # beam_scores torch.Size([bz*num_beam])
+ # expert_route torch.Size([bz*num_beam, layer_n])
+ current_select_expert = expert_route[:,-1]
+ # current_select_expert torch.Size([bz*num_beam, 1])
+
+ output = list()
+ for i in range(beam_idx.shape[0]):
+ b_idx = beam_idx[i]
+ ex_idx = current_select_expert[i]
+ ex_out = candidate_output[ex_idx, b_idx, :,:]
+ if self.weight_type == 'ffn_prob':
+ prob = current_scores[b_idx, ex_idx]
+ ex_out = ex_out*prob
+ output.append(ex_out.unsqueeze(0))
+
+ final_output = torch.concat(output, dim=0)
+ # import pdb;pdb.set_trace()
+ return final_output, beam_scores, expert_route, beam_idx, importance_loss
diff --git a/minigpt4/models/moe/beam_search_test.py b/minigpt4/models/moe/beam_search_test.py
deleted file mode 100644
index 8a8f128..0000000
--- a/minigpt4/models/moe/beam_search_test.py
+++ /dev/null
@@ -1,155 +0,0 @@
-import torch
-import copy
-import pickle
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
-
-
-def forward_expert(input_x, expert_idx):
- input_x += torch.randn(32,768)
- return input_x
- # output_x = self.experts[expert_idx].forward(input_x)
- # return output_x
-
-
-def forward_ffn(x_repeat, expert_select):
- """
- x_repeat : [bz*num_beams, 32,768]
- expert_select : [bz*num_beams]
- """
- outputs = list()
- num_beams_bz = x_repeat.shape[0]
- for i in range(num_beams_bz):
- output_x = forward_expert(x_repeat[i], expert_select[i]) # (32,768)
- outputs.append(output_x.unsqueeze(0))
- candidate_output = torch.cat(outputs)
- return candidate_output # torch.Size([bz*num_beams, 32, 768])
-
-def forward_gate(x, num_expert):
- """
- x : torch.Size([bz*num_beams, 32, 768]) or torch.Size([bz, 32, 768])
- prob_gate : torch.Size([bz*num_beams, num_experts]) or torch.Size([bz, num_experts])
- """
- # attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device)
- # x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz*num_beams, 32, 768])
- # x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beams, 768])
- # logits_gate = gate(x_average) # torch.Size([bz, num_experts])
- logits_gate = torch.randn(x.shape[0], num_expert)
- prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts])
- return prob_gate
-
-def beam_search(layer, current_scores, beam_scores, expert_route, num_beams):
- if layer == 0 and beam_scores==None and expert_route==None:
- topk_values, gate = torch.topk(current_scores, num_beams, dim=1) # gate, 每个样本被分配的expert: torch.Size([bz, topk])
- beam_scores = topk_values.view(num_beams*batch_size) # torch.Size([bz * num_beams])
- expert_route = gate.view(num_beams*batch_size).unsqueeze(1) # torch.Size([bz * num_beams])
-
- else:
- next_scores_raw = current_scores + beam_scores.unsqueeze(1) # torch.Size([4*3, 5]) # 取log 之后,可以直接相加概率
- next_scores_raw1 = next_scores_raw.view(
- batch_size, num_beams * num_expert
- ) # torch.Size([4, 3*5])
- next_scores, next_experts = torch.topk(next_scores_raw1, num_beams, dim=1, largest=True, sorted=True)
- # next_scores torch.Size([4, 3*num_beams])
- # next_tokens torch.Size([4, 3*num_beams])
-
- next_batch_beam = list()
- for batch_idx in range(batch_size):
- next_sent_beam = list()
- print(batch_idx)
- for rank, (expert_id, expert_score) in enumerate(
- zip(next_experts[batch_idx], next_scores[batch_idx])
- ):
- expert_id = expert_id.item()
- beam_id = expert_id // num_expert
- ex_id = expert_id % num_expert
- effective_beam_id = batch_idx*num_beams + beam_id
-
- # print(expert_id, beam_id, ex_id, effective_beam_id, expert_score)
-
- next_sent_beam.append((expert_score, ex_id, effective_beam_id))
- next_batch_beam.extend(next_sent_beam)
-
- # print()
-
- import pdb;pdb.set_trace()
-
- beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
- beam_experts = expert_route[:,-1].new([x[1] for x in next_batch_beam])
- beam_idx = expert_route[:,-1].new([x[2] for x in next_batch_beam])
-
- pre_route = expert_route[beam_idx,:]
- expert_route = torch.cat([pre_route, beam_experts.unsqueeze(1)], dim=-1)
-
- return beam_scores, expert_route
-
-
-if __name__ == '__main__':
-
- batch_size = 3
- num_beams = 2
- num_expert = 5
- x = torch.randn(batch_size, 32, 768)
- beam_scores, expert_route = None, None
-
- for layer in range(0,3):
- # import pdb;pdb.set_trace()
-
- current_scores = forward_gate(x, num_expert)
- import pdb;pdb.set_trace()
-
- beam_scores, expert_route = beam_search(layer, current_scores, beam_scores, expert_route, num_beams)
- current_expert_select = expert_route[:,-1]
-
- if layer == 0:
- replicated_tensor = x.unsqueeze(1).expand(batch_size, num_beams, 32, 768)
- x = replicated_tensor.contiguous().view(-1, 32, 768) # [12,32,768] [bz*num_beams, 32,768]
- else:
- x = candidate_output
-
- candidate_output = forward_ffn(x, current_expert_select) # torch.Size([4*3, 5])
-
- x = candidate_output
-
-
- scores = beam_scores.view(batch_size, num_beams)
- topk_values, gate = torch.topk(scores, 1, dim=1)
- # gate [batch_size, 1]
- # topk_values [batch_size, 1]
- selects = [ (bz_idx * num_beams + gate[bz_idx].item()) for bz_idx in range(batch_size)]
- final_scores = beam_scores[selects]
- final_expert_route = expert_route[selects]
- final_output = candidate_output[selects]
-
-
-
-
-
-
-
-# def forward_ffn_post(x_repeat, expert_select):
-# """
-# x_repeat : [bz*num_beams, 32,768]
-# expert_select : [bz*num_beams]
-# prob_gate : torch.Size([bz*num_beams, num_experts])
-# """
-# outputs = list()
-# logits_gate_lst = list()
-# # attention_mask = torch.ones([batch_size, 32])
-# for i in range(num_beams*batch_size):
-# output_x = forward_expert(x_repeat[i], expert_select[i]) # (32,768)
-# outputs.append(output_x.unsqueeze(0))
-# # output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768])
-# # gate_acore = self.gates[expert_idx](output_x_aver)
-# # gate_score = self.gate(output_x_aver)
-# num_expert = 5
-# gate_score = torch.randn(1,num_expert)
-# logits_gate_lst.append(gate_score)
-
-# candidate_output = torch.cat(outputs) # torch.Size([bz*num_beams, 32, 768])
-# logits_gate = torch.cat(logits_gate_lst,dim=0)# torch.Size([bz*num_beams, num_expert])
-# prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts])
-# return prob_gate, candidate_output
\ No newline at end of file
diff --git a/minigpt4/models/moe/moe_layer.py b/minigpt4/models/moe/moe_layer.py
index 303862c..abd24b9 100644
--- a/minigpt4/models/moe/moe_layer.py
+++ b/minigpt4/models/moe/moe_layer.py
@@ -5,7 +5,7 @@ import torch.nn as nn
import torch.nn.functional as F
class MoELayer(nn.Module):
- def __init__(self, hidden_size, expert, num_experts, route_method, topk=1, use_balance_loss=True, weight_type='l2_norm'):
+ def __init__(self, hidden_size, expert, num_experts, route_method, topk=1, use_balance_loss=True, weight_type='raw_prob'):
# remove hash list
nn.Module.__init__(self)
self.num_experts = num_experts
@@ -81,54 +81,6 @@ class MoELayer(nn.Module):
return x, balance_loss, gate_load
- def _forward_gate_sentence_top1_raw(self, x, attention_mask):
- """
- x: query_attention_output , torch.Size([bz, 32, 768])
- attention_mask: torch.ones([bz, 32])
-
- ### Notice:
- the raw version of expert_attention_mask is the extended_attention_mask,
- which will be add to attention_score directly
- the values of extended_attention_mask are -0.0 or -10000
- it should be adjust to 1/0 version to be processed by experts
- """
- attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
- x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
- x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768])
- logits_gate = self.gate(x_average) # torch.Size([bz, num_experts])
- prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts])
- gate = torch.argmax(prob_gate, dim=-1) # torch.Size([bz])
-
- order = gate.argsort(0)
- num_sentences = F.one_hot(gate, self.num_experts).gt(0).sum(0)
- gate_load = num_sentences.clone()
- x = x[order] # reorder according to expert number
- x = x.split(num_sentences.tolist(), dim=0) # a list of length self.num_experts
-
- # compute the load balancing loss
- P = prob_gate.mean(0)
- temp = num_sentences.float()
- f = temp / temp.sum(0, keepdim=True)
- balance_loss = self.num_experts * torch.sum(P * f)
-
- prob_gate = prob_gate.gather(dim=1, index=gate.unsqueeze(1))
- prob_gate = prob_gate[order]
- prob_gate = prob_gate.split(num_sentences.tolist(), dim=0)
-
- def forward_expert(input_x, prob_x, expert_idx):
- input_x = self.experts[expert_idx].forward(input_x)
- input_x = input_x * prob_x.unsqueeze(-1)
- return input_x
-
- result = []
- for i in range(self.num_experts):
- if x[i].size(0) > 0:
- result.append(forward_expert(x[i], prob_gate[i], i))
- result = torch.vstack(result)
- result = result[order.argsort(0)] # restore original order
-
- return result, balance_loss, gate_load
-
def _forward_gate_sentence_post(self, x, attention_mask):
"""
x: query_attention_output; torch.Size([bz, 32, 768])
@@ -174,13 +126,17 @@ class MoELayer(nn.Module):
# importance loss
importance_loss = self._importance_auxiliary_loss(prob_gate)
- # output_average = candidate_output.sum(2) / candidate_attn_mask.unsqueeze(-1).sum(2) # torch.Size([num_expert, bz, 768])
- # output_average = torch.permute(output_average, (1, 0, 2)) # torch.Size([bz, num_expert, 768])
- # logits_gate = self.gate(output_average) # torch.Size([bz, num_experts, 1])
-
prob_gate_topk = torch.zeros_like(prob_gate)
prob_gate_topk.scatter_(1, gate, topk_values)
- prob_gate_normalized = prob_gate_topk / prob_gate_topk.sum(dim=1, keepdim=True) # torch.Size([bz, num_expert])
+
+ if self.weight_type == 'average':
+ # torch.Size([bz, num_expert]); unselected experts get prob_gate_norm = 0
+ prob_gate_normalized = prob_gate_topk / prob_gate_topk.sum(dim=1, keepdim=True)
+ elif self.weight_type == 'raw_prob':
+ prob_gate_normalized = prob_gate_topk
+ elif self.weight_type == 'softmax_norm':
+ prob_gate_normalized = F.softmax(prob_gate_topk, dim=-1) # torch.Size([bz, num_expert])
+
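+ # Example with prob_gate_topk = [[0.5, 0.3, 0.0]] (hypothetical, topk=2):
+ # 'average' -> [[0.625, 0.375, 0.0]] (renormalized over the selected experts)
+ # 'raw_prob' -> [[0.5, 0.3, 0.0]] (unnormalized gate probabilities)
+ # 'softmax_norm' -> softmax([0.5, 0.3, 0.0]) ~= [[0.412, 0.338, 0.250]];
+ # note the softmax variant leaks weight onto unselected experts via the zero logits.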
candidate_output_ad = torch.permute(candidate_output, (1, 0, 2, 3)) # torch.Size([bz, num_expert, 32, 768])
results = prob_gate_normalized.unsqueeze(-1).unsqueeze(-1) * candidate_output_ad # torch.Size([bz, num_expert, 32, 768])
moe_result = torch.sum(results, dim=1) # torch.Size([bz, 32, 768])
@@ -188,6 +144,46 @@ class MoELayer(nn.Module):
return moe_result, (balance_loss+importance_loss), prob_gate_normalized
+ def router(self, x, attention_mask):
+ # Prepare input x
+ attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
+ x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
+ x_average = torch.mean(x_masked, dim=1) # torch.Size([bz, 768])
+
+ # Forward Gate
+ # logits_gate: [bz, num_experts]
+ logits_gate = self.gate(x_average)
+
+ # Probability, for each sample, of being sent to each expert.
+ # prob_gate: [bz, num_experts]
+ prob_gate = F.softmax(logits_gate, dim=-1)
+
+ # Get Top-K experts for each sample
+ # gate: [bz, topk]
+ # select_prob_gate: [bz, topk]
+ select_prob_gate, gate = torch.topk(prob_gate, self.topk, dim=1)
+
+ # Reshape prob_gate & gate
+ # expert_mask: [batch_size, topk, num_experts]
+ # expert_gate: [batch_size, topk, num_experts]
+ # combine_tensor: [batch_size, num_experts]
+ expert_mask = F.one_hot(gate, self.num_experts)
+ expert_gate = select_prob_gate.unsqueeze(-1) * expert_mask
+ combine_tensor = torch.sum(expert_gate, dim=1)
+
+ # Calculate Balancing Loss
+ if self.use_balance_loss:
+ num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # number of samples assigned to each expert, torch.Size([num_expert])
+ balance_loss = self._balancing_loss(prob_gate, num_sentences)
+ else:
+ balance_loss = 0.0
+
+ # Calculate Importance Loss
+ importance_loss = self._importance_auxiliary_loss(prob_gate)
+
+ # import pdb; pdb.set_trace()
+
+ return expert_mask, combine_tensor, balance_loss, importance_loss
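+ # Example (hypothetical, num_experts=3, topk=2): gate = [[0, 2]] and
+ # select_prob_gate = [[0.6, 0.3]] -> expert_mask one-hots each pick, and
+ # combine_tensor = [[0.6, 0.0, 0.3]], the mixing weight of each expert per sample.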
def _forward_gate_sentence(self, x, attention_mask):
"""
@@ -200,81 +196,37 @@ class MoELayer(nn.Module):
the values of extended_attention_mask are -0.0 or -10000
it should be adjust to 1/0 version to be processed by experts
"""
- attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
- x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
- x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768])
- logits_gate = self.gate(x_average) # torch.Size([bz, num_experts])
- prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts])
- select_prob_gate, gate = torch.topk(prob_gate, self.topk, dim=1) # gate, 每个样本被分配的expert: torch.Size([bz, topk])
+ # Forward Router
+ expert_mask, combine_tensor, balance_loss, importance_loss = self.router(x, attention_mask)
+
+ # Forward Expert FFN
+ result = []
+ for expert_idx in range(self.num_experts):
+ output_x = self.experts[expert_idx].forward(x)
+ result.append(output_x.unsqueeze(0))
+ expert_output = torch.cat(result).permute(1,0,2,3) # torch.Size([batch_size, num_expert, num_tokens, hidden_states])
- # 这里用l2 norm 去加权
- if self.weight_type == 'l2_norm':
- normalized_tensor = torch.nn.functional.normalize(select_prob_gate, p=2, dim=0) # L2 Normalization torch.Size([bz, topk])
- elif self.weight_type == 'average':
- normalized_tensor = select_prob_gate / select_prob_gate.sum(dim=1, keepdim=True)
+ # multiply outputs of experts by the routing probability
+ if self.weight_type == 'raw_prob':
+ expert_outputs_combined = expert_output * combine_tensor.unsqueeze(-1).unsqueeze(-1) # torch.Size([batch_size, num_expert, num_tokens, hidden_states])
+ elif self.weight_type == 'no_prob':
+ combine_index = torch.sum(expert_mask, dim=1)
+ expert_outputs_combined = expert_output * combine_index.unsqueeze(-1).unsqueeze(-1) # torch.Size([batch_size, num_expert, num_tokens, hidden_states])
- num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # 每个expert被分配的样本数 torch.Size([num_expert])
- gate_load = num_sentences.clone()
+ outputs = torch.sum(expert_outputs_combined, dim=1) # torch.Size([batch_size, num_tokens, hidden_states])
- # load balancing loss
- if self.use_balance_loss:
- balance_loss = self._balancing_loss(prob_gate, num_sentences)
- else:
- balance_loss = 0.0
+ # import pdb; pdb.set_trace()
- # importance loss
- importance_loss = self._importance_auxiliary_loss(prob_gate)
-
- # forward experts
- def forward_expert(input_x, expert_idx):
- input_x = self.experts[expert_idx].forward(input_x)
- return input_x
-
- result_lst = list()
- for i in range(self.topk):
- # top1、top2... 分别为一组,进行gate分组之后过expert,然后乘以概率后相加
- tmp_gate = gate[:,i]
- tmp_prob = normalized_tensor[:,i].unsqueeze(-1).unsqueeze(-1)
- order = tmp_gate.argsort(0)
- num_sentences_t = F.one_hot(tmp_gate, self.num_experts).gt(0).sum(0)
- x1 = x[order] # reorder according to expert number
- x1 = x1.split(num_sentences_t.tolist(), dim=0) # a list of length self.num_experts
-
- result = []
- for i in range(self.num_experts):
- if x1[i].size(0) > 0:
- result.append(forward_expert(x1[i], i))
- result = torch.vstack(result)
- result = result[order.argsort(0)] # restore original order
- # result_lst.append(result * tmp_prob) # result * prob
- result_lst.append(result) # result * prob # add_1212
-
- moe_result = sum(result_lst)
- # import pdb;pdb.set_trace()
- return moe_result, (balance_loss+importance_loss), gate
-
- def _forward_sentence_single_expert(self, x, attention_mask):
- x_masked = x * attention_mask.unsqueeze(-1)
- x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1)
- logits_gate = self.gate(x_average)
- prob_gate = F.softmax(logits_gate, dim=-1)
- gate = torch.argmax(prob_gate, dim=-1)
-
- gate_load = F.one_hot(gate, self.num_experts).gt(0).sum(0)
- x = self.experts[gate.cpu().item()].forward(x)
- return x, 0.0, gate_load
+ return outputs, (balance_loss+importance_loss), combine_tensor
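+ # Mixing sketch: expert_output is [bz, num_experts, num_tokens, hidden] and
+ # combine_tensor is [bz, num_experts]; broadcasting the weights and summing over
+ # dim=1 gives the dense mixture. With weight_type == 'no_prob' the weights
+ # collapse to 0/1, i.e. an unweighted sum over the selected experts.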
def forward(self, x, attention_mask):
if self.route_method == "gate-token":
x, balance_loss, gate_load = self._forward_gate_token(x)
elif self.route_method == "gate-sentence":
- if x.size(0) == 1:
- x, balance_loss, gate_load = self._forward_sentence_single_expert(x, attention_mask)
- else:
- x, balance_loss, gate_load = self._forward_gate_sentence(x, attention_mask)
+ x, balance_loss, gate_load = self._forward_gate_sentence(x, attention_mask)
elif self.route_method == "gate-sentence-post":
x, balance_loss, gate_load = self._forward_gate_sentence_post(x, attention_mask)
else:
raise KeyError("Routing method not supported.")
-
+ # import pdb; pdb.set_trace()
return x, balance_loss, gate_load
diff --git a/minigpt4/models/moe/moe_layer_backup.py b/minigpt4/models/moe/moe_layer_backup.py
new file mode 100644
index 0000000..25f2e59
--- /dev/null
+++ b/minigpt4/models/moe/moe_layer_backup.py
@@ -0,0 +1,330 @@
+import copy
+import pickle
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class MoELayer(nn.Module):
+ def __init__(self, hidden_size, expert, num_experts, route_method, topk=1, use_balance_loss=True, weight_type='l2_norm'):
+ # remove hash list
+ nn.Module.__init__(self)
+ self.num_experts = num_experts
+ self.experts = nn.ModuleList([copy.deepcopy(expert) for i in range(num_experts)])
+ self.route_method = route_method
+ self.topk = topk
+ self.use_balance_loss = use_balance_loss
+ self.weight_type = weight_type
+
+ if route_method in ["gate-token", "gate-sentence"]:
+ self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
+ elif route_method in ["gate-sentence-post"]:
+ gate = nn.Linear(hidden_size, 1, bias=False).float()
+ # self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
+ self.gate = gate
+ else:
+ raise KeyError("Routing method not supported.")
+
+ def _balancing_loss(self, prob_gate, num_tokens):
+ # From MOEBERT
+ # compute the load balancing loss
+ # prob_gate: [bz, num_expert], the probability of each sample being routed to each expert
+ # equivalent to _gshard_auxiliary_loss in VMoE
+ P = prob_gate.mean(0) # torch.Size([num_expert]), average routing probability per expert
+ temp = num_tokens.float()
+ f = temp / temp.sum(0, keepdim=True) # fraction of samples assigned to each expert
+ balance_loss = self.num_experts * torch.sum(P * f)
+ return balance_loss
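+ # Worked example (hypothetical, num_experts=2): P = [0.6, 0.4] and
+ # num_tokens = [3, 1] -> f = [0.75, 0.25], so
+ # balance_loss = 2 * (0.6*0.75 + 0.4*0.25) = 1.1; uniform routing gives 1.0.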
+
+ def _importance_auxiliary_loss(self, prob_gate):
+ # From VMOE
+ # _importance_auxiliary_loss
+ axis = tuple(range(prob_gate.ndim - 1)) # All except last.
+ importance_per_expert = torch.sum(prob_gate, dim=axis)
+ std_importance_per_expert = torch.std(importance_per_expert)
+ mean_importance_per_expert = torch.mean(importance_per_expert)
+ # Compute coefficient of variation (i.e. std/mean) squared.
+ return (std_importance_per_expert / mean_importance_per_expert)**2
+
+ def _forward_gate_token(self, x):
+ bsz, seq_len, dim = x.size()
+
+ x = x.view(-1, dim)
+ logits_gate = self.gate(x)
+ prob_gate = F.softmax(logits_gate, dim=-1)
+ gate = torch.argmax(prob_gate, dim=-1)
+
+ order = gate.argsort(0)
+ num_tokens = F.one_hot(gate, self.num_experts).gt(0).sum(0)
+ gate_load = num_tokens.clone()
+ x = x[order] # reorder according to expert number
+ x = x.split(num_tokens.tolist(), dim=0) # a list of length self.num_experts
+
+ # compute the load balancing loss
+ P = prob_gate.mean(0)
+ temp = num_tokens.float()
+ f = temp / temp.sum(0, keepdim=True)
+ balance_loss = self.num_experts * torch.sum(P * f)
+
+ prob_gate = prob_gate.gather(dim=1, index=gate.unsqueeze(1))
+ prob_gate = prob_gate[order]
+ prob_gate = prob_gate.split(num_tokens.tolist(), dim=0)
+
+ def forward_expert(input_x, prob_x, expert_idx):
+ input_x = self.experts[expert_idx].forward(input_x)
+ input_x = input_x * prob_x
+ return input_x
+
+ x = [forward_expert(x[i], prob_gate[i], i) for i in range(self.num_experts)]
+ x = torch.vstack(x)
+ x = x[order.argsort(0)] # restore original order
+ x = x.view(bsz, seq_len, dim)
+
+ return x, balance_loss, gate_load
+
+ def _forward_gate_sentence_top1_raw(self, x, attention_mask):
+ """
+ x: query_attention_output , torch.Size([bz, 32, 768])
+ attention_mask: torch.ones([bz, 32])
+
+ ### Notice:
+ the raw version of expert_attention_mask is the extended_attention_mask,
+ which will be added to the attention score directly;
+ the values of extended_attention_mask are -0.0 or -10000,
+ so it should be adjusted to a 1/0 version to be processed by the experts
+ """
+ attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
+ x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
+ x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768])
+ logits_gate = self.gate(x_average) # torch.Size([bz, num_experts])
+ prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts])
+ gate = torch.argmax(prob_gate, dim=-1) # torch.Size([bz])
+
+ order = gate.argsort(0)
+ num_sentences = F.one_hot(gate, self.num_experts).gt(0).sum(0)
+ gate_load = num_sentences.clone()
+ x = x[order] # reorder according to expert number
+ x = x.split(num_sentences.tolist(), dim=0) # a list of length self.num_experts
+
+ # compute the load balancing loss
+ P = prob_gate.mean(0)
+ temp = num_sentences.float()
+ f = temp / temp.sum(0, keepdim=True)
+ balance_loss = self.num_experts * torch.sum(P * f)
+
+ prob_gate = prob_gate.gather(dim=1, index=gate.unsqueeze(1))
+ prob_gate = prob_gate[order]
+ prob_gate = prob_gate.split(num_sentences.tolist(), dim=0)
+
+ def forward_expert(input_x, prob_x, expert_idx):
+ input_x = self.experts[expert_idx].forward(input_x)
+ input_x = input_x * prob_x.unsqueeze(-1)
+ return input_x
+
+ result = []
+ for i in range(self.num_experts):
+ if x[i].size(0) > 0:
+ result.append(forward_expert(x[i], prob_gate[i], i))
+ result = torch.vstack(result)
+ result = result[order.argsort(0)] # restore original order
+
+ return result, balance_loss, gate_load
+
+ def _forward_gate_sentence_post(self, x, attention_mask):
+ """
+ x: query_attention_output; torch.Size([bz, 32, 768])
+ attention_mask: torch.ones([bz, 32])
+ bz = 4
+ x = torch.randn(bz,32,768)
+ attention_mask = torch.ones([bz, 32])
+
+ """
+ attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
+ x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
+
+ def forward_expert(input_x, expert_idx):
+ # input_x += torch.randn(4,32,768)
+ # return input_x
+ output_x = self.experts[expert_idx].forward(input_x)
+ return output_x
+
+ outputs = list()
+ logits_gate_lst = list()
+ for expert_idx in range(self.num_experts):
+ output_x = forward_expert(x_masked, expert_idx)
+ outputs.append(output_x.unsqueeze(0))
+
+ output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768])
+ # gate_score = self.gates[expert_idx](output_x_aver)
+ gate_score = self.gate(output_x_aver)
+ logits_gate_lst.append(gate_score)
+
+ candidate_output = torch.cat(outputs) # torch.Size([num_expert, bz, 32, 768])
+ logits_gate = torch.cat(logits_gate_lst,dim=1)# torch.Size([bz, num_expert])
+ prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts])
+ topk_values, gate = torch.topk(prob_gate, self.topk, dim=1) # gate: the experts assigned to each sample, torch.Size([bz, topk])
+ num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # number of samples assigned to each expert, torch.Size([num_expert])
+ gate_load = num_sentences.clone()
+
+ # load balancing loss
+ if self.use_balance_loss:
+ balance_loss = self._balancing_loss(prob_gate, num_sentences)
+ else:
+ balance_loss = 0.0
+
+ # importance loss
+ importance_loss = self._importance_auxiliary_loss(prob_gate)
+
+ # output_average = candidate_output.sum(2) / candidate_attn_mask.unsqueeze(-1).sum(2) # torch.Size([num_expert, bz, 768])
+ # output_average = torch.permute(output_average, (1, 0, 2)) # torch.Size([bz, num_expert, 768])
+ # logits_gate = self.gate(output_average) # torch.Size([bz, num_experts, 1])
+
+ prob_gate_topk = torch.zeros_like(prob_gate)
+ prob_gate_topk.scatter_(1, gate, topk_values)
+
+ if self.weight_type == 'average':
+ # torch.Size([bz, num_expert]); unselected experts get prob_gate_norm = 0
+ prob_gate_normalized = prob_gate_topk / prob_gate_topk.sum(dim=1, keepdim=True)
+ elif self.weight_type == 'raw_prob':
+ prob_gate_normalized = prob_gate_topk
+ elif self.weight_type == 'softmax_norm':
+ prob_gate_normalized = F.softmax(prob_gate_topk, dim=-1) # torch.Size([bz, num_expert])
+
+ candidate_output_ad = torch.permute(candidate_output, (1, 0, 2, 3)) # torch.Size([bz, num_expert, 32, 768])
+ results = prob_gate_normalized.unsqueeze(-1).unsqueeze(-1) * candidate_output_ad # torch.Size([bz, num_expert, 32, 768])
+ moe_result = torch.sum(results, dim=1) # torch.Size([bz, 32, 768])
+ # import pdb;pdb.set_trace()
+
+ return moe_result, (balance_loss+importance_loss), prob_gate_normalized
+
+ def _forward_gate_sentence(self, x, attention_mask):
+ """
+ x: query_attention_output , torch.Size([bz, 32, 768])
+ attention_mask: torch.ones([bz, 32])
+
+ ### Notice:
+ the raw version of expert_attention_mask is the extended_attention_mask,
+ which is added to the attention scores directly;
+ its values are -0.0 (keep) or -10000 (mask),
+ so it must be converted to a 1/0 mask before the experts can process it
+ """
+ attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
+ x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
+ x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz, 768])
+ logits_gate = self.gate(x_average) # torch.Size([bz, num_experts])
+ prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts])
+ select_prob_gate, gate = torch.topk(prob_gate, self.topk, dim=1) # gate: experts assigned to each sample, torch.Size([bz, topk])
+
+ # weight the selected experts, optionally via an L2 norm
+ if self.weight_type == 'l2_norm':
+ # note: strictly speaking, neither dim=0 nor dim=1 is the "right" axis here
+ normalized_tensor = torch.nn.functional.normalize(select_prob_gate, p=2, dim=1) # L2 normalization, torch.Size([bz, topk])
+ elif self.weight_type == 'l2_norm_0':
+ normalized_tensor = torch.nn.functional.normalize(select_prob_gate, p=2, dim=0) # L2 normalization, torch.Size([bz, topk])
+ elif self.weight_type == 'average':
+ normalized_tensor = select_prob_gate / select_prob_gate.sum(dim=1, keepdim=True)
+ elif self.weight_type == 'raw_prob':
+ normalized_tensor = select_prob_gate
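+ # E.g. (illustrative) select_prob_gate = [0.5, 0.3]: 'average' gives
+ # [0.625, 0.375] (sums to 1), while 'l2_norm' divides by sqrt(0.34) ~ 0.583,
+ # giving [0.857, 0.514] -- note the L2-normalized weights do not sum to 1.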
+
+ num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # number of samples assigned to each expert, torch.Size([num_expert])
+ gate_load = num_sentences.clone()
+
+ # load balancing loss
+ if self.use_balance_loss:
+ balance_loss = self._balancing_loss(prob_gate, num_sentences)
+ else:
+ balance_loss = 0.0
+
+ # importance loss
+ importance_loss = self._importance_auxiliary_loss(prob_gate)
+
+ # forward experts
+ def forward_expert(input_x, expert_idx):
+ input_x = self.experts[expert_idx].forward(input_x)
+ return input_x
+
+ result_lst = list()
+ for i in range(self.topk):
+ # group samples by their i-th choice (top-1, top-2, ...): dispatch each group
+ # to its expert, then weight the outputs by the gate probability and sum
+ tmp_gate = gate[:, i]
+ tmp_prob = normalized_tensor[:, i].unsqueeze(-1).unsqueeze(-1)
+ order = tmp_gate.argsort(0)
+ num_sentences_t = F.one_hot(tmp_gate, self.num_experts).gt(0).sum(0)
+ x1 = x[order] # reorder according to expert number
+ x1 = x1.split(num_sentences_t.tolist(), dim=0) # a list of length self.num_experts
+
+ result = []
+ for expert_idx in range(self.num_experts): # renamed to avoid shadowing the outer i
+ if x1[expert_idx].size(0) > 0:
+ result.append(forward_expert(x1[expert_idx], expert_idx))
+ result = torch.vstack(result)
+ result = result[order.argsort(0)] # restore original order
+ result_lst.append(result * tmp_prob) # weight each result by its gate probability
+
+ moe_result = sum(result_lst)
+ return moe_result, (balance_loss+importance_loss), gate
+
+ def _forward_sentence_single_expert(self, x, attention_mask):
+ x_masked = x * attention_mask.unsqueeze(-1)
+ x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1)
+ logits_gate = self.gate(x_average)
+ prob_gate = F.softmax(logits_gate, dim=-1)
+ gate = torch.argmax(prob_gate, dim=-1)
+
+ gate_load = F.one_hot(gate, self.num_experts).gt(0).sum(0)
+ x = self.experts[gate.cpu().item()].forward(x)
+ return x, 0.0, gate_load
+
+ def forward(self, x, attention_mask):
+ if self.route_method == "gate-token":
+ x, balance_loss, gate_load = self._forward_gate_token(x)
+ elif self.route_method == "gate-sentence":
+ if x.size(0) == 1:
+ x, balance_loss, gate_load = self._forward_sentence_single_expert(x, attention_mask)
+ else:
+ x, balance_loss, gate_load = self._forward_gate_sentence(x, attention_mask)
+ elif self.route_method == "gate-sentence-post":
+ x, balance_loss, gate_load = self._forward_gate_sentence_post(x, attention_mask)
+ else:
+ raise KeyError("Routing method not supported.")
+ # import pdb; pdb.set_trace()
+ return x, balance_loss, gate_load
diff --git a/minigpt4/models/moe/prompt_moe.py b/minigpt4/models/moe/prompt_moe.py
index 8b5e2d2..8ea4cea 100644
--- a/minigpt4/models/moe/prompt_moe.py
+++ b/minigpt4/models/moe/prompt_moe.py
@@ -92,7 +92,6 @@ class PrePromptMoE(PromptMoEBase):
self.topk = topk
if route_method in ["gate-token", "gate-single-token", "gate-sentence"]:
self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
- print(self.gate)
else:
raise KeyError("Routing method not supported.")
diff --git a/minigpt4/models/moe/route_moe_layer.py b/minigpt4/models/moe/route_moe_layer.py
index 31b75c2..6012dd2 100644
--- a/minigpt4/models/moe/route_moe_layer.py
+++ b/minigpt4/models/moe/route_moe_layer.py
@@ -5,7 +5,7 @@ import torch.nn as nn
import torch.nn.functional as F
class RouteMoELayer(nn.Module):
- def __init__(self, hidden_size, expert, num_experts, num_beams=2, layer_judge=None, route_method="pre-route"):
+ def __init__(self, hidden_size, expert, num_experts, num_beams=2, layer_judge=None, route_method="pre-route", weight_type="ffn_prob"):
# remove hash list
nn.Module.__init__(self)
self.num_experts = num_experts
@@ -13,6 +13,7 @@ class RouteMoELayer(nn.Module):
self.num_beams = num_beams
self.hidden_size = hidden_size
self.layer_judge = layer_judge
+ self.weight_type = weight_type
self.route_method = route_method
if self.route_method == "pre-route":
@@ -22,6 +23,17 @@ class RouteMoELayer(nn.Module):
self.gate = gate
# self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
+ def _importance_auxiliary_loss(self, prob_gate):
+ # importance auxiliary loss, following V-MoE
+ axis = tuple(range(prob_gate.ndim - 1)) # All except last.
+ importance_per_expert = torch.sum(prob_gate, dim=axis)
+ std_importance_per_expert = torch.std(importance_per_expert)
+ mean_importance_per_expert = torch.mean(importance_per_expert)
+ # Compute coefficient of variation (i.e. std/mean) squared.
+ return (std_importance_per_expert / mean_importance_per_expert)**2
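+ # E.g. (illustrative) prob_gate = [[0.9, 0.1], [0.8, 0.2]] gives
+ # importance_per_expert = [1.7, 0.3], mean = 1.0, std ~ 0.99 (torch.std uses
+ # the unbiased sample std), so the loss is ~0.98; a perfectly balanced gate
+ # yields std = 0 and therefore a loss of 0.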
+
+
def forward_gate(self, x):
"""
x : torch.Size([bz*num_beams, 32, 768]) or torch.Size([bz, 32, 768])
@@ -29,7 +41,8 @@ class RouteMoELayer(nn.Module):
"""
attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device)
x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz*num_beams, 32, 768])
- x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beams, 768])
+ # x_average = x_masked.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beams, 768])
+ x_average = torch.mean(x_masked, dim=1) # torch.Size([bz*num_beams, 768])
logits_gate = self.gate(x_average) # torch.Size([bz*num_beams, num_experts])
prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beams, num_experts])
return prob_gate
@@ -42,7 +55,7 @@ class RouteMoELayer(nn.Module):
topk_values, gate = torch.topk(current_scores, self.num_beams, dim=1) # gate: experts assigned to each sample, torch.Size([bz, topk])
beam_scores = topk_values.view(self.num_beams * batch_size) # torch.Size([bz * num_beams])
expert_route = gate.view(self.num_beams * batch_size).unsqueeze(1) # torch.Size([bz * num_beams,1])
- beam_idx = None
+ beam_idx = torch.arange(self.num_beams * batch_size) # identity beam index at the first routing layer
else:
if self.layer_judge=='first' and self.route_method == 'post-route':
batch_size = batch_size
@@ -89,54 +102,63 @@ class RouteMoELayer(nn.Module):
return beam_scores, expert_route, beam_idx
-
- def forward_expert_ffn(self, x, expert_select, beam_scores):
+ def forward_expert_ffn(self, x, expert_select, current_scores):
"""
x_repeat : [bz*num_beams, 32,768]
expert_select : [bz*num_beams]
+ current_scores : [bz*num_beams, num_experts] / [bz, num_experts]
"""
- # add_1212 l2_normalization
- # normalized_tensor = torch.nn.functional.normalize(beam_scores, p=2, dim=0) # L2 Normalization torch.Size([bz, topk])
+ # add_1228 l2_normalization
+ # normalized_tensor = torch.nn.functional.normalize(current_scores, p=2, dim=0) # L2 Normalization torch.Size([bz, topk])
# tmp_prob = normalized_tensor.unsqueeze(-1).unsqueeze(-1)
-
+ # import pdb;pdb.set_trace()
outputs = list()
- for i in range(x.shape[0]):
- output_x = self.experts[expert_select[i]].forward(x[i])
- outputs.append(output_x.unsqueeze(0))
- candidate_output = torch.cat(outputs)
-
- # candidate_output = candidate_output * tmp_prob
- return candidate_output # torch.Size([bz*num_beams, 32, 768])
-
+ for i in range(self.num_experts):
+ output_x = self.experts[i].forward(x)
+ outputs.append(output_x.unsqueeze(1))
+ candidate_output = torch.cat(outputs, dim=1)
+ expert_select_matrix = F.one_hot(expert_select, self.num_experts)
+ if self.weight_type == 'ffn_prob':
+ tmp_prob = current_scores * expert_select_matrix
+ candidate_output = candidate_output * tmp_prob.unsqueeze(-1).unsqueeze(-1)
+ else:
+ candidate_output = candidate_output * expert_select_matrix.unsqueeze(-1).unsqueeze(-1)
+ output = torch.sum(candidate_output, dim=1)
+ # import pdb;pdb.set_trace()
+ return output # torch.Size([bz*num_beams, 32, 768])
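+ # E.g. (illustrative) num_experts = 2, expert_select = [1]: the one-hot row is
+ # [0, 1]; with weight_type 'ffn_prob' and current_scores = [0.3, 0.7] the
+ # weights become [0.0, 0.7], so summing over the expert dim keeps only
+ # 0.7 * experts[1](x) for that sample.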
def forward_pre_route(self, x, beam_scores, expert_route, use_log=True):
- current_scores = self.forward_gate(x) # [bz*num_beams, 5]
+ current_scores = self.forward_gate(x) # [bz, num_experts] / [bz*num_beams, num_experts]
+
+ importance_loss = self._importance_auxiliary_loss(current_scores)
if use_log:
current_scores_log = torch.log(current_scores) # taking the log lets per-layer scores be summed along the route
else:
current_scores_log = current_scores
-
+ # import pdb;pdb.set_trace()
batch_size, num_tokens = x.shape[0], x.shape[1]
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
-
current_expert_select = expert_route[:,-1]
if self.layer_judge=='first': # expand first dim to batch_size * num_beams
replicated_tensor = x.unsqueeze(1).expand(batch_size, self.num_beams, num_tokens, self.hidden_size)
x = replicated_tensor.contiguous().view(-1, num_tokens, self.hidden_size) # [bz*num_beams, 32,768]
+ current_scores_t = current_scores.unsqueeze(1).expand(batch_size, self.num_beams, self.num_experts)
+ current_scores = current_scores_t.contiguous().view(-1, self.num_experts) # [bz*num_beams, num_experts]
- candidate_output = self.forward_expert_ffn(x, current_expert_select, beam_scores) # [bz*num_beams, 32,768]
-
- return candidate_output, beam_scores, expert_route, beam_idx
+ input_x = x[beam_idx]
+ candidate_output = self.forward_expert_ffn(input_x, current_expert_select, current_scores) # [bz*num_beams, 32,768]
+ # import pdb;pdb.set_trace()
+ return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
def forward_post_route(self, x, beam_scores, expert_route, use_log=True):
attention_mask = torch.ones(x.shape[0], x.shape[1]).to(x.device)
x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
-
+
def forward_expert(input_x, expert_idx):
output_x = self.experts[expert_idx].forward(input_x)
return output_x
@@ -145,12 +167,14 @@ class RouteMoELayer(nn.Module):
logits_gate_lst = list()
for expert_idx in range(self.num_experts):
output_x = forward_expert(x_masked, expert_idx)
- outputs.append(output_x.unsqueeze(0))
- output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beam, 768])
+ # output_x_aver = output_x.sum(1) / attention_mask.unsqueeze(-1).sum(1) # torch.Size([bz*num_beam, 768])
+ output_x_aver = torch.mean(output_x, dim=1)
# gate_score = self.gates[expert_idx](output_x_aver)
gate_score = self.gate(output_x_aver)
logits_gate_lst.append(gate_score)
- candidate_output = torch.cat(outputs) # torch.Size([num_expert, bz*num_beam, 32, 768])
+ outputs.append(output_x.unsqueeze(0))
+
+ candidate_output_raw = torch.cat(outputs) # torch.Size([num_expert, bz*num_beam, 32, 768])
logits_gate = torch.cat(logits_gate_lst,dim=1)# torch.Size([bz*num_beam, num_expert])
current_scores = F.softmax(logits_gate, dim=-1) # torch.Size([bz*num_beam, num_experts])
@@ -159,24 +183,33 @@ class RouteMoELayer(nn.Module):
else:
current_scores_log = current_scores
- batch_size = x.shape[0] # bz*num_beam
+ # importance loss
+ importance_loss = self._importance_auxiliary_loss(current_scores)
+
+ batch_size, num_tokens = x.shape[0], x.shape[1] # bz*num_beam
beam_scores, expert_route, beam_idx = self.beam_search(current_scores_log, beam_scores, expert_route, batch_size)
# beam_scores torch.Size([bz*num_beam])
# expert_route torch.Size([bz*num_beam, layer_n])
current_select_expert = expert_route[:,-1]
+ # current_select_expert torch.Size([bz*num_beam])
- output = list()
- for i in range(beam_idx.shape[0]):
- b_idx = beam_idx[i]
- ex_idx = current_select_expert[i]
- ex_out = candidate_output[ex_idx, b_idx, :,:]
- output.append(ex_out.unsqueeze(0))
-
- final_output = torch.concat(output, dim=0)
-
- return final_output, beam_scores, expert_route, beam_idx
-
-
+ if self.layer_judge == 'first':
+ replicated_tensor = candidate_output_raw.unsqueeze(2).expand(self.num_experts, batch_size, self.num_beams, num_tokens, self.hidden_size)
+ candidate_output_raw = replicated_tensor.contiguous().view(self.num_experts, -1, num_tokens, self.hidden_size) # [bz*num_beams, 32,768]
+ current_scores_t = current_scores.unsqueeze(1).expand(batch_size, self.num_beams, self.num_experts)
+ current_scores = current_scores_t.contiguous().view(-1, self.num_experts) # [bz*num_beams, num_experts]
+
+ candidate_output = candidate_output_raw.permute(1, 0, 2, 3)[beam_idx] # torch.Size([bz*num_beams, num_experts, 32, 768])
+ expert_select_matrix = F.one_hot(current_select_expert, self.num_experts)
+ if self.weight_type == 'ffn_prob':
+ tmp_prob = current_scores[beam_idx] * expert_select_matrix
+ output = candidate_output * tmp_prob.unsqueeze(-1).unsqueeze(-1)
+ else:
+ output = candidate_output * expert_select_matrix.unsqueeze(-1).unsqueeze(-1)
+ final_output = torch.sum(output, dim=1)
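+ # At the first layer each of the bz inputs has been expanded to num_beams
+ # candidates, so candidate_output is [bz*num_beams, num_experts, 32, 768] and
+ # the one-hot select keeps exactly one expert's output per beam before the sum.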
+
+ return final_output, beam_scores, expert_route, beam_idx, importance_loss
+
def forward(self, x, attention_mask, beam_scores, expert_route, use_log=True):
"""
if first_layer: x [bz, 32, 768]
@@ -184,11 +217,11 @@ class RouteMoELayer(nn.Module):
"""
if self.route_method == 'pre-route':
- candidate_output, beam_scores, expert_route, beam_idx = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
+ candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_pre_route(x, beam_scores, expert_route, use_log=True)
elif self.route_method == "post-route":
- candidate_output, beam_scores, expert_route, beam_idx = self.forward_post_route(x, beam_scores, expert_route, use_log=True)
+ candidate_output, beam_scores, expert_route, beam_idx, importance_loss = self.forward_post_route(x, beam_scores, expert_route, use_log=True)
- return candidate_output, beam_scores, expert_route, beam_idx
+ return candidate_output, beam_scores, expert_route, beam_idx, importance_loss
diff --git a/minigpt4/models/moe/test_moe_layer.py b/minigpt4/models/moe/test_moe_layer.py
new file mode 100644
index 0000000..5253340
--- /dev/null
+++ b/minigpt4/models/moe/test_moe_layer.py
@@ -0,0 +1,294 @@
+import copy
+import pickle
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class MoELayer(nn.Module):
+ def __init__(self, hidden_size, expert, num_experts, route_method, topk=1, use_balance_loss=True, weight_type='raw_prob, topk(softmax)'):
+ # remove hash list
+ nn.Module.__init__(self)
+ self.num_experts = num_experts
+ self.experts = nn.ModuleList([copy.deepcopy(expert) for i in range(num_experts)])
+ self.route_method = route_method
+ self.topk = topk
+ self.use_balance_loss = use_balance_loss
+ self.weight_type = weight_type
+
+ if route_method in ["gate-token", "gate-sentence"]:
+ self.gate = nn.Linear(hidden_size, num_experts, bias=False).float()
+ elif route_method in ["gate-sentence-post"]:
+ gate = nn.Linear(hidden_size, 1, bias=False).float()
+ # self.gates = nn.ModuleList([copy.deepcopy(gate) for i in range(num_experts)])
+ self.gate = gate
+ else:
+ raise KeyError("Routing method not supported.")
+
+ def _balancing_loss(self, prob_gate, num_tokens):
+ # From MOEBERT
+ # compute the load balancing loss
+ # prob_gate: [bz, num_expert], probability of each sample being routed to each expert
+ # equivalent to _gshard_auxiliary_loss in V-MoE
+ P = prob_gate.mean(0) # torch.Size([num_expert]), mean probability routed to each expert
+ temp = num_tokens.float()
+ f = temp / temp.sum(0, keepdim=True) # fraction of samples assigned to each expert
+ balance_loss = self.num_experts * torch.sum(P * f)
+ return balance_loss
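+ # E.g. (illustrative) num_experts = 2, P = [0.7, 0.3], num_tokens = [3, 1]
+ # gives f = [0.75, 0.25] and balance_loss = 2 * (0.7*0.75 + 0.3*0.25) = 1.2;
+ # a perfectly balanced gate (P = f = [0.5, 0.5]) attains the minimum of 1.0.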
+
+ def _importance_auxiliary_loss(self, prob_gate):
+ # importance auxiliary loss, following V-MoE
+ axis = tuple(range(prob_gate.ndim - 1)) # All except last.
+ importance_per_expert = torch.sum(prob_gate, dim=axis)
+ std_importance_per_expert = torch.std(importance_per_expert)
+ mean_importance_per_expert = torch.mean(importance_per_expert)
+ # Compute coefficient of variation (i.e. std/mean) squared.
+ return (std_importance_per_expert / mean_importance_per_expert)**2
+
+ def _forward_gate_token(self, x):
+ bsz, seq_len, dim = x.size()
+
+ x = x.view(-1, dim)
+ logits_gate = self.gate(x)
+ prob_gate = F.softmax(logits_gate, dim=-1)
+ gate = torch.argmax(prob_gate, dim=-1)
+
+ order = gate.argsort(0)
+ num_tokens = F.one_hot(gate, self.num_experts).gt(0).sum(0)
+ gate_load = num_tokens.clone()
+ x = x[order] # reorder according to expert number
+ x = x.split(num_tokens.tolist(), dim=0) # a list of length self.num_experts
+
+ # compute the load balancing loss
+ P = prob_gate.mean(0)
+ temp = num_tokens.float()
+ f = temp / temp.sum(0, keepdim=True)
+ balance_loss = self.num_experts * torch.sum(P * f)
+
+ prob_gate = prob_gate.gather(dim=1, index=gate.unsqueeze(1))
+ prob_gate = prob_gate[order]
+ prob_gate = prob_gate.split(num_tokens.tolist(), dim=0)
+
+ def forward_expert(input_x, prob_x, expert_idx):
+ input_x = self.experts[expert_idx].forward(input_x)
+ input_x = input_x * prob_x
+ return input_x
+
+ x = [forward_expert(x[i], prob_gate[i], i) for i in range(self.num_experts)]
+ x = torch.vstack(x)
+ x = x[order.argsort(0)] # restore original order
+ x = x.view(bsz, seq_len, dim)
+
+ return x, balance_loss, gate_load
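+ # Dispatch mechanics (illustrative): gate = [1, 0, 1] over 2 experts gives
+ # order = [1, 0, 2] and num_tokens = [1, 2]; x[order] groups expert-0's token
+ # first, split() yields per-expert chunks, and indexing the stacked result
+ # with order.argsort(0) applies the inverse permutation to restore token order.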
+
+ def _forward_gate_sentence_post(self, x, attention_mask):
+ """
+ x: query_attention_output; torch.Size([bz, 32, 768])
+ attention_mask: torch.ones([bz, 32])
+
+ Example inputs for a quick smoke test:
+ bz = 4
+ x = torch.randn(bz, 32, 768)
+ attention_mask = torch.ones([bz, 32])
+ """
+ # Prepare Input x
+ attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
+ x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
+
+ # FeedForward(x) & Forward Gate
+ outputs = list()
+ logits_gate_lst = list()
+ for expert_idx in range(self.num_experts):
+ output_x = self.experts[expert_idx].forward(x_masked)
+ outputs.append(output_x.unsqueeze(0))
+
+ output_x_aver = torch.mean(output_x, dim=1)
+ # gate_score = self.gates[expert_idx](output_x_aver)
+ gate_score = self.gate(output_x_aver)
+ logits_gate_lst.append(gate_score)
+ candidate_output = torch.cat(outputs) # torch.Size([num_expert, bz, 32, 768])
+ logits_gate = torch.cat(logits_gate_lst,dim=1)# torch.Size([bz, num_expert])
+
+ # Probabilities for each sample of what expert it should be sent to.
+ prob_gate = F.softmax(logits_gate, dim=-1) # torch.Size([bz, num_experts])
+ if 'softmax(topk)' in self.weight_type:
+ prob_gate1, gate = torch.topk(logits_gate, self.topk, dim=1)
+ select_prob_gate = F.softmax(prob_gate1, dim=-1)
+ else:
+ select_prob_gate, gate = torch.topk(prob_gate, self.topk, dim=1) # gate: experts assigned to each sample, torch.Size([bz, topk])
+
+ # Calculate Balancing Loss
+ if self.use_balance_loss:
+ num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # samples assigned to each expert, torch.Size([num_expert])
+ balance_loss = self._balancing_loss(prob_gate, num_sentences)
+ else:
+ balance_loss = 0.0
+ # Calculate Importance Loss
+ importance_loss = self._importance_auxiliary_loss(prob_gate)
+
+ # Reshape prob_gate & gate
+ # expert_mask: [batch_size, topk, num_experts]
+ # expert_gate: [batch_size, topk, num_experts]
+ # combine_tensor: [batch_size, num_experts]
+ expert_mask = F.one_hot(gate, self.num_experts)
+ expert_gate = select_prob_gate.unsqueeze(-1) * expert_mask
+ combine_tensor = torch.sum(expert_gate, dim=1)
+ # combine_tensor = torch.zeros_like(prob_gate)
+ # combine_tensor.scatter_(1, gate, select_prob_gate) # equivalent, but scatter_ may not be differentiable
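+ # E.g. (illustrative) num_experts = 3, topk = 2, gate = [2, 0] with
+ # select_prob_gate = [0.6, 0.4]: expert_mask = [[0,0,1],[1,0,0]], expert_gate =
+ # [[0,0,0.6],[0.4,0,0]], so combine_tensor = [0.4, 0.0, 0.6] for that sample.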
+
+ candidate_output_ad = torch.permute(candidate_output, (1, 0, 2, 3)) # torch.Size([bz, num_expert, 32, 768])
+ results = candidate_output_ad * combine_tensor.unsqueeze(-1).unsqueeze(-1) # torch.Size([bz, num_expert, 32, 768])
+ outputs = torch.sum(results, dim=1) # torch.Size([bz, 32, 768])
+ # import pdb; pdb.set_trace()
+
+ return outputs, (balance_loss+importance_loss), combine_tensor
+
+ def pre_router(self, x, attention_mask):
+ # Prepare input x
+ attention_mask = torch.ones(attention_mask.shape[0], attention_mask.shape[1]).to(x.device)
+ x_masked = x * attention_mask.unsqueeze(-1) # torch.Size([bz, 32, 768])
+ x_average = torch.mean(x_masked, dim=1) # torch.Size([bz, 768])
+
+ # Forward Gate
+ # logits_gate: [bz, num_experts]
+ logits_gate = self.gate(x_average)
+
+ # Probabilities for each sample of what expert it should be sent to.
+ # prob_gate: [bz, num_experts]
+ prob_gate = F.softmax(logits_gate, dim=-1)
+
+ if 'softmax(topk)' in self.weight_type:
+ prob_gate1, gate = torch.topk(logits_gate, self.topk, dim=1)
+ select_prob_gate = F.softmax(prob_gate1, dim=-1)
+ else:
+ # topk(softmax)
+ # Get Top-K experts for each sample
+ # gate: [bz, topk]
+ # select_prob_gate: [bz, topk]
+ select_prob_gate, gate = torch.topk(prob_gate, self.topk, dim=1)
+
+ # Reshape prob_gate & gate
+ # expert_mask: [batch_size, topk, num_experts]
+ # expert_gate: [batch_size, topk, num_experts]
+ # combine_tensor: [batch_size, num_experts]
+ expert_mask = F.one_hot(gate, self.num_experts)
+ expert_gate = select_prob_gate.unsqueeze(-1) * expert_mask
+ combine_tensor = torch.sum(expert_gate, dim=1)
+
+ # Calculate Balancing Loss
+ if self.use_balance_loss:
+ num_sentences = F.one_hot(gate, self.num_experts).sum(1).gt(0).sum(0) # samples assigned to each expert, torch.Size([num_expert])
+ balance_loss = self._balancing_loss(prob_gate, num_sentences)
+ else:
+ balance_loss = 0.0
+
+ # Calculate Importance Loss
+ importance_loss = self._importance_auxiliary_loss(prob_gate)
+
+ # import pdb; pdb.set_trace()
+
+ return expert_mask, combine_tensor, balance_loss, importance_loss
+
+ def _forward_gate_sentence(self, x, attention_mask):
+ """
+ x: query_attention_output , torch.Size([bz, 32, 768])
+ attention_mask: torch.ones([bz, 32])
+
+ ### Notice:
+ the raw version of expert_attention_mask is the extended_attention_mask,
+ which is added to the attention scores directly;
+ its values are -0.0 (keep) or -10000 (mask),
+ so it must be converted to a 1/0 mask before the experts can process it
+ """
+ # Forward Router
+ expert_mask, combine_tensor, balance_loss, importance_loss = self.pre_router(x, attention_mask)
+
+ # Forward Expert FFN
+ result = []
+ for expert_idx in range(self.num_experts):
+ output_x = self.experts[expert_idx].forward(x)
+ result.append(output_x.unsqueeze(0))
+ expert_output = torch.cat(result).permute(1,0,2,3) # torch.Size([batch_size, num_expert, num_tokens, hidden_states])
+
+ # multiply outputs of experts by the routing probability
+ expert_outputs_combined = expert_output * combine_tensor.unsqueeze(-1).unsqueeze(-1) # torch.Size([batch_size, num_expert, num_tokens, hidden_states])
+ outputs = torch.sum(expert_outputs_combined, dim=1) # torch.Size([batch_size, num_tokens, hidden_states])
+
+ # import pdb; pdb.set_trace()
+
+ return outputs, (balance_loss+importance_loss), combine_tensor
+
+
+ def forward(self, x, attention_mask):
+ if self.route_method == "gate-token":
+ x, balance_loss, gate_load = self._forward_gate_token(x)
+ elif self.route_method == "gate-sentence":
+ x, balance_loss, gate_load = self._forward_gate_sentence(x, attention_mask)
+ elif self.route_method == "gate-sentence-post":
+ x, balance_loss, gate_load = self._forward_gate_sentence_post(x, attention_mask)
+ else:
+ raise KeyError("Routing method not supported.")
+ # import pdb; pdb.set_trace()
+ return x, balance_loss, gate_load
+
+if __name__ == '__main__':
+
+ import sys
+ sys.path.append("/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE")
+ from minigpt4.models.QformerRouteMoE import BertConfig
+ from minigpt4.models.QformerRouteMoE import FeedForward
+ from minigpt4.models.moe.utils import (
+ moe_layer_judge,
+ )
+
+ vision_width = 1408
+ cross_attention_freq = 2
+ num_query_token = 32
+ # init_QformerMoE
+ config = BertConfig.from_pretrained("/mnt/pfs-guan-ssai/nlu/wanghanzi/models/bert-base-uncased")
+ config.encoder_width = vision_width
+ # insert cross-attention layer every other block
+ config.add_cross_attention = True
+ config.cross_attention_freq = cross_attention_freq
+ config.query_length = num_query_token
+ config.moebert_expert_num = 3
+ config.moebert_num_beams = 2
+ config.moebert_route_method = 'gate-sentence-post'
+ config.moe_topk = 1
+ config.use_balance_loss = False
+ # config.moe_weight_type = 'raw_prob, softmax(topk)'
+ config.moe_weight_type = 'raw_prob, topk(softmax)'
+
+ batch_size = 4
+ x2 = torch.randn(batch_size, 32, 768)
+ beam_scores, expert_route = None, None
+
+ for layer_num in [6, 8, 10]:
+ layer_judge = moe_layer_judge(layer_num)
+ ffn = FeedForward(config)
+ gate = nn.Linear(768, config.moebert_expert_num, bias=False).float() # unused here; MoELayer builds its own gate
+
+ experts_moe = MoELayer(
+ hidden_size=config.hidden_size,
+ expert=ffn,
+ num_experts=config.moebert_expert_num,
+ route_method=config.moebert_route_method,
+ topk=config.moe_topk,
+ use_balance_loss=config.use_balance_loss,
+ weight_type=config.moe_weight_type,
+ )
+ attn_mask = torch.ones([batch_size, 32])
+ layer_output = experts_moe(x2, attn_mask)
+ hidden_states3, aux_loss, combine_tensor = layer_output
+
+ print(combine_tensor)
+ print(aux_loss)
+ x2 = hidden_states3
+
+ print("------------------------------------")
+ # import pdb; pdb.set_trace()
\ No newline at end of file
diff --git a/minigpt4/models/moe/utils.py b/minigpt4/models/moe/utils.py
index 6f5858d..52f78b8 100644
--- a/minigpt4/models/moe/utils.py
+++ b/minigpt4/models/moe/utils.py
@@ -19,15 +19,33 @@ def use_experts(layer_idx):
else:
return False
+def use_experts_route(layer_idx):
+ # if layer_idx % 2 == 0:
+ # use moe_ffn after cross_attns
+ # if int(layer_idx) in [0,2,4,6,8,10]:
+ if int(layer_idx) in [6,7,8,9,10,11]:
+ return True
+ else:
+ return False
+
def moe_layer_judge(layer_idx):
if layer_idx == 6:
return 'first'
- elif layer_idx == 8:
+ elif layer_idx in [7,8,9,10]:
return 'mid'
- elif layer_idx == 10:
+ elif layer_idx == 11:
return 'last'
else:
return None
+
+ # if layer_idx == 0:
+ # return 'first'
+ # elif layer_idx in [2,4,6,8]:
+ # return 'mid'
+ # elif layer_idx == 10:
+ # return 'last'
+ # else:
+ # return None
def process_ffn(model):
if model.config.model_type == "bert":
diff --git a/minigpt4/projects/qformer_moe_post_vicuna/train/mix_qformer_moe_post_blip2_vicuna7b_data_balance.yaml b/minigpt4/projects/qformer_moe_post_vicuna/train/mix_qformer_moe_post_blip2_vicuna7b_data_balance.yaml
index b74d7aa..8c5e050 100644
--- a/minigpt4/projects/qformer_moe_post_vicuna/train/mix_qformer_moe_post_blip2_vicuna7b_data_balance.yaml
+++ b/minigpt4/projects/qformer_moe_post_vicuna/train/mix_qformer_moe_post_blip2_vicuna7b_data_balance.yaml
@@ -10,7 +10,6 @@ model:
load_finetuned: False
vit_model: eva_clip_g
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
- # finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/mix_coco_gqa_balance_raw_QformerMoE_Post_train_qf_train_qt_aver_weight_5ex_top1_1loss_textinqf_epo3_s42_1201/20231201184/checkpoint_best.pth"
finetuned: ""
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
@@ -38,7 +37,7 @@ model:
# moe
use_moeqformer: True
- moebert_expert_num: 5
+ moebert_expert_num: 3
moebert_route_method: "gate-sentence-post"
moebert_load_balance: 0
moe_topk: 1
@@ -110,6 +109,7 @@ run:
max_epoch: 1
num_workers: 4
warmup_steps: 600
+ iters_per_epoch: 1000
seed: 42
output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/mix_coco_gqa_balance_raw_QformerMoE_Post_train_qf_train_qt_aver_weight_5ex_top1_1loss_textinqf_epo3_s42_1201/"
diff --git a/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml b/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml
index c5a4d5a..74f4ab0 100644
--- a/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml
+++ b/minigpt4/projects/qformer_moe_route_vicuna/eval/mix_vqa_coco_vicuna_eval.yaml
@@ -10,7 +10,7 @@ model:
load_finetuned: True
vit_model: eva_clip_g
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
- finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_linear_gate_3ex_3beam_1loss_top3layer_log_textinqf_epo3_1216/20231216155/checkpoint_best.pth"
+ finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/20240112212/checkpoint_best.pth"
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# vit encoder
@@ -38,10 +38,12 @@ model:
# moe
use_moeqformer: True
use_route_moe: True
- moebert_expert_num: 3
- moebert_num_beams: 3
moebert_route_method: "post-route"
- gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209_eval_latest1/"
+ moebert_load_balance: 0
+ moebert_expert_num: 2
+ moebert_num_beams: 2
+ moe_weight_type: 'ffn_prob'
+ gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/route_save/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/"
datasets:
gqa:
@@ -81,19 +83,20 @@ run:
task: instruction_tuning
# optimizer
lr_sched: "linear_warmup_cosine_lr"
- init_lr: 2e-5
+ init_lr: 5e-5
min_lr: 1e-6
warmup_lr: 1e-6
log_freq: 5
save_freq: 1500
weight_decay: 0.05
- max_epoch: 5
+ max_epoch: 10
num_workers: 4
warmup_steps: 600
+ iters_per_epoch: 3000
seed: 42
- output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/"
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/"
amp: True
resume_ckpt_path: null
diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml
index 5ec25e0..16440dc 100644
--- a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml
+++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance.yaml
@@ -38,10 +38,12 @@ model:
# moe
use_moeqformer: True
use_route_moe: True
+ moebert_route_method: "post-route"
+ moebert_load_balance: 0
moebert_expert_num: 3
moebert_num_beams: 3
- moebert_route_method: "post-route"
- gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/route_save/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/"
+ moe_weight_type: 'ffn_prob'
+ use_balance_loss: False
datasets:
gqa: # train: 943000, 12578, 12578)
@@ -97,19 +99,20 @@ run:
task: instruction_tuning
# optimizer
lr_sched: "linear_warmup_cosine_lr"
- init_lr: 2e-5
+ init_lr: 5e-5
min_lr: 1e-6
warmup_lr: 1e-6
log_freq: 5
save_freq: 1500
weight_decay: 0.05
- max_epoch: 5
+ max_epoch: 8
num_workers: 4
warmup_steps: 600
+ iters_per_epoch: 5000
seed: 42
- output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/"
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_1gate_3ex_3beam_1loss_5e5lr_top6layer_textinqf_epo8_0117/"
amp: True
resume_ckpt_path: null
diff --git a/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_1220.yaml b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_1220.yaml
new file mode 100644
index 0000000..8818143
--- /dev/null
+++ b/minigpt4/projects/qformer_moe_route_vicuna/train/mix_qformer_moe_route_blip2_vicuna7b_data_balance_1220.yaml
@@ -0,0 +1,129 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+ arch: blip2_vicuna_instruct
+ model_type: vicuna7b_pretrain
+ load_pretrained: True
+ load_finetuned: False
+ vit_model: eva_clip_g
+ pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+ # finetuned: ""
+ q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+
+ # vit encoder
+ image_size: 224
+ drop_path_rate: 0
+ use_grad_checkpoint: False
+ vit_precision: "fp16"
+
+ # Q-Former
+ num_query_token: 32
+ qformer_text_input: True
+
+ # vicuna
+ llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
+ prompt: ""
+ max_txt_len: 256
+ max_output_txt_len: 256
+
+ # freeze
+ freeze_vit: True
+ freeze_llm: True
+ freeze_qformer: False
+ freeze_t5_proj: False
+
+ # moe
+ use_moeqformer: True
+ use_route_moe: True
+ moebert_route_method: "post-route"
+ moebert_load_balance: 0
+ moebert_expert_num: 2
+ moebert_num_beams: 2
+ moe_weight_type: 'ffn_prob'
+ use_balance_loss: False
+ # gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/route_save/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/"
+
+datasets:
+ # gqa: # train: (943000, 12578, 12578)
+ # type: balanced_sft_raw
+ # batch_size: 1
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 224
+ # eval:
+ # name: "blip2_image_eval"
+ # image_size: 224
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # eval:
+ # name: "blip_caption"
+ # sample_ratio: 10
+
+ ok_vqa: # train, valid (9009, 5046)
+ batch_size: 1
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 1
+
+ # coco_vqa: # 658104
+ # batch_size: 1
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 224
+ # eval:
+ # name: "blip2_image_eval"
+ # image_size: 224
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # eval:
+ # name: "blip_caption"
+ # sample_ratio: 9
+
+run:
+ task: instruction_tuning
+ # optimizer
+ lr_sched: "linear_warmup_cosine_lr"
+ init_lr: 2e-5
+ min_lr: 1e-6
+ warmup_lr: 1e-6
+ log_freq: 5
+ save_freq: 1500
+
+ weight_decay: 0.05
+ max_epoch: 5
+ num_workers: 4
+ warmup_steps: 600
+
+ seed: 42
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_balance_raw_QformerMoE_Route_linear_gate_5ex_2beam_1loss_textinqf_epo5_toplayer3_1209/"
+
+ amp: True
+ resume_ckpt_path: null
+
+ evaluate: False
+ train_splits: ["train"]
+ valid_splits: ["val"]
+ # test_splits: ["val"]
+
+ device: "cuda"
+ world_size: 1
+ dist_url: "env://"
+ distributed: True
\ No newline at end of file
diff --git a/minigpt4/projects/qformer_moe_vicuna/eval/vqa_benchmark_evaluation.yaml b/minigpt4/projects/qformer_moe_vicuna/eval/vqa_benchmark_evaluation.yaml
index 3e02942..98de298 100644
--- a/minigpt4/projects/qformer_moe_vicuna/eval/vqa_benchmark_evaluation.yaml
+++ b/minigpt4/projects/qformer_moe_vicuna/eval/vqa_benchmark_evaluation.yaml
@@ -10,7 +10,7 @@ model:
load_finetuned: True
vit_model: eva_clip_g
pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
- finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_1048k_raw_QformerMoE_Route_Post_NoNorm_5ex_2beam_1loss_top3layer_textinqf_epo6_1215/20231216161/checkpoint_best.pth"
+ finetuned: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_1loss_5e5lr_top6layer_textinqf_epo8_0111/20240111145/checkpoint_best.pth"
q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
# vit encoder
@@ -39,8 +39,11 @@ model:
use_moeqformer: True
use_route_moe: True
moebert_route_method: "post-route"
- moebert_expert_num: 5
+ moebert_load_balance: 0
+ moebert_expert_num: 2
moebert_num_beams: 2
+ moe_weight_type: 'ffn_prob'
+ use_balance_loss: False
datasets:
ok_vqa: # train, valid (9009, 5046)
@@ -78,7 +81,7 @@ evaluation_datasets:
run:
task: instruction_tuning
name: vqa_benchmark_evaluation
- save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/eval/benchmarks/mix_1048k_raw_QformerMoE_Route_Post_NoNorm_5ex_2beam_1loss_top3layer_textinqf_epo6_1215/"
+ save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/eval/benchmarks/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_2ex_2beam_1gate_1loss_5e5lr_top6layer_textinqf_epo8_0111/"
seed: 42
diff --git a/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_3ex3beam_0112.yaml b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_3ex3beam_0112.yaml
new file mode 100644
index 0000000..979e0a1
--- /dev/null
+++ b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_3ex3beam_0112.yaml
@@ -0,0 +1,131 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+ arch: blip2_vicuna_instruct
+ model_type: vicuna7b_pretrain
+ load_pretrained: True
+ load_finetuned: False
+ vit_model: eva_clip_g
+ pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+ # finetuned: ""
+ q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+
+ # vit encoder
+ image_size: 224
+ drop_path_rate: 0
+ use_grad_checkpoint: False
+ vit_precision: "fp16"
+
+ # Q-Former
+ num_query_token: 32
+ qformer_text_input: True
+
+ # vicuna7b
+ llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
+ prompt: ""
+ max_txt_len: 256
+ max_output_txt_len: 256
+
+ # freeze
+ freeze_vit: True
+ freeze_llm: True
+ freeze_qformer: False
+ freeze_t5_proj: False
+
+ # moe
+ use_moeqformer: True
+ use_route_moe: True
+ moebert_route_method: "post-route"
+ moebert_load_balance: 0.05
+ moebert_expert_num: 3
+ moebert_num_beams: 3
+ moe_weight_type: 'ffn_prob'
+ use_balance_loss: False
+
+datasets:
+ gqa: # train: (943000, 12578, 12578)
+ type: balanced_sft_raw
+ # batch_size: 16
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 50
+
+ ok_vqa: # train, valid (9009, 5046)
+ # batch_size: 16
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 8
+
+ coco_vqa: # 658104
+ # batch_size: 16
+ batch_size: 32
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 15
+
+run:
+ task: instruction_tuning
+ # optimizer
+ lr_sched: "linear_warmup_cosine_lr"
+ # init_lr: 2e-5
+ init_lr: 5e-5
+ min_lr: 1e-6
+ warmup_lr: 1e-6
+ log_freq: 5
+ save_freq: 1500
+
+ weight_decay: 0.05
+ max_epoch: 8
+ num_workers: 4
+ warmup_steps: 600
+ iters_per_epoch: 5000
+
+ seed: 42
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe_route/mix_coco_gqa_1610k_raw_QformerMoE_Route_Post_ffn_prob_linear_gate_3ex_3beam_1gate_2loss_5e5lr_top6layer_textinqf_epo8_0112/"
+
+ amp: True
+ resume_ckpt_path: null
+
+ evaluate: False
+ train_splits: ["train"]
+ valid_splits: ["val"]
+
+ device: "cuda"
+ world_size: 1
+ dist_url: "env://"
+ distributed: True
\ No newline at end of file
diff --git a/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_balance.yaml b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_balance.yaml
index 2eccb6b..d3f21ec 100644
--- a/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_balance.yaml
+++ b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_balance.yaml
@@ -37,19 +37,19 @@ model:
# moe
use_moeqformer: True
- moebert_expert_num: 5
+ moebert_expert_num: 3
moebert_route_method: "gate-sentence"
moebert_load_balance: 0
moe_topk: 1
use_balance_loss: False
- moe_weight_type: 'l2_norm'
- gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/gate_save/mix_coco_gqa_balance_raw_QformerMoE_train_qf_train_qt_linear_gate_5ex_top1_1loss_textinqf_training_epo5_toplayer3_1206/"
+ moe_weight_type: 'raw_prob'
+ # gate_save_path: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/gate_save/mix_coco_gqa_balance_raw_QformerMoE_train_qf_train_qt_linear_gate_5ex_top1_1loss_textinqf_training_epo5_toplayer3_1206/"
datasets:
gqa: # train: 94254
type: balanced_sft_raw_part
- batch_size: 32
+ batch_size: 1
vis_processor:
train:
name: "blip2_image_train"
@@ -65,7 +65,7 @@ datasets:
sample_ratio: 50
ok_vqa: # train, valid (9009, 5046
- batch_size: 32
+ batch_size: 1
vis_processor:
train:
name: "blip2_image_train"
@@ -80,22 +80,22 @@ datasets:
name: "blip_caption"
sample_ratio: 8
- coco_vqa: # 214352 vqa_val
- type: vqa_v2_part
- batch_size: 32
- vis_processor:
- train:
- name: "blip2_image_train"
- image_size: 224
- eval:
- name: "blip2_image_eval"
- image_size: 224
- text_processor:
- train:
- name: "blip_caption"
- eval:
- name: "blip_caption"
- sample_ratio: 15
+ # coco_vqa: # 214352 vqa_val
+ # type: vqa_v2_part
+ # batch_size: 1
+ # vis_processor:
+ # train:
+ # name: "blip2_image_train"
+ # image_size: 224
+ # eval:
+ # name: "blip2_image_eval"
+ # image_size: 224
+ # text_processor:
+ # train:
+ # name: "blip_caption"
+ # eval:
+ # name: "blip_caption"
+ # sample_ratio: 15
run:
task: instruction_tuning
@@ -108,12 +108,13 @@ run:
save_freq: 1500
weight_decay: 0.05
- max_epoch: 5
+ max_epoch: 1
num_workers: 4
warmup_steps: 600
+ iters_per_epoch: 1000
seed: 42
- output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/mix_coco_gqa_balance_raw_QformerMoE_train_qf_train_qt_linear_gate_5ex_top1_1loss_textinqf_training_epo5_toplayer3_1206/"
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/mix_coco_gqa_balance_raw_QformerMoE_train_qf_train_qt_linear_gate_5ex_top1_1loss_textinqf_training_epo5_toplayer3_1220_test/"
amp: True
resume_ckpt_path: null
diff --git a/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_raw_0112.yaml b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_raw_0112.yaml
new file mode 100644
index 0000000..afdb4eb
--- /dev/null
+++ b/minigpt4/projects/qformer_moe_vicuna/train/mix_qformer_moe_blip2_vicuna7b_data_raw_0112.yaml
@@ -0,0 +1,125 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+ arch: blip2_vicuna_instruct
+ model_type: vicuna7b_pretrain
+ load_pretrained: True
+ load_finetuned: False
+ vit_model: eva_clip_g
+ pretrained: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+ # finetuned: ""
+ q_former_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/blip2/blip2_vicuna7b/blip2_pretrained_vicuna7b.pth"
+
+ # vit encoder
+ image_size: 224
+ drop_path_rate: 0
+ use_grad_checkpoint: False
+ vit_precision: "fp16"
+
+ # Q-Former
+ num_query_token: 32
+ qformer_text_input: True
+
+ # vicuna7b
+ llm_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/vicuna-7b-v1.1"
+ prompt: ""
+ max_txt_len: 256
+ max_output_txt_len: 256
+
+ # freeze
+ freeze_vit: True
+ freeze_llm: True
+ freeze_qformer: False
+ freeze_t5_proj: False
+
+ # moe
+ use_moeqformer: False
+ moebert_expert_num: 1
+ moebert_route_method: "gate-sentence"
+ moebert_load_balance: 0.05
+ moe_topk: 1
+
+datasets:
+ gqa: # train: 943000, 12578, 12578)
+ type: balanced_sft_raw
+ batch_size: 16
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 50
+
+ ok_vqa: # train, valid (9009, 5046)
+ batch_size: 16
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 8
+
+ coco_vqa: # 658104
+ batch_size: 16
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
+ sample_ratio: 15
+
+run:
+ task: instruction_tuning
+ # optimizer
+ lr_sched: "linear_warmup_cosine_lr"
+ # init_lr: 2e-5
+ init_lr: 5e-5
+ min_lr: 1e-6
+ warmup_lr: 1e-6
+ log_freq: 5
+ save_freq: 1500
+
+ weight_decay: 0.05
+ max_epoch: 8
+ num_workers: 4
+ warmup_steps: 600
+ iters_per_epoch: 5000
+
+ seed: 42
+ output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/vicuna7b/qformer_moe/mix_coco_gqa_1610k_raw_QformerMoE_train_qf_train_qt_1ex_top1_textinqf_epo8_lr5e5_seed42_0112/"
+
+ amp: True
+ resume_ckpt_path: null
+
+ evaluate: False
+ train_splits: ["train"]
+ valid_splits: ["val"]
+
+ device: "cuda"
+ world_size: 1
+ dist_url: "env://"
+ distributed: True
\ No newline at end of file
diff --git a/minigpt4/runners/runner_base.py b/minigpt4/runners/runner_base.py
index 89413a3..8bc071b 100644
--- a/minigpt4/runners/runner_base.py
+++ b/minigpt4/runners/runner_base.py
@@ -110,6 +110,7 @@ class RunnerBase:
else:
p_wd.append(p)
num_parameters += p.data.nelement()
+ # import pdb; pdb.set_trace() # 0107test
logging.info("number of trainable parameters: %d" % num_parameters)
optim_params = [
{
diff --git a/minigpt4/tasks/base_task.py b/minigpt4/tasks/base_task.py
index 3a39fc8..f0993ce 100644
--- a/minigpt4/tasks/base_task.py
+++ b/minigpt4/tasks/base_task.py
@@ -238,13 +238,17 @@ class BaseTask:
with torch.cuda.amp.autocast(enabled=use_amp):
loss = self.train_step(model=model, samples=samples)
-
+
# after_train_step()
if use_amp:
+ # torch.autograd.set_detect_anomaly(True)
+ # detect anomalous values (NaN/Inf) during the backward pass to locate the offending op
+ # with torch.autograd.detect_anomaly():
scaler.scale(loss).backward()
else:
loss.backward()
+ # import pdb; pdb.set_trace() # 0107test
# update gradients every accum_grad_iters iterations
if (i + 1) % accum_grad_iters == 0:
if use_amp:
@@ -252,6 +256,9 @@ class BaseTask:
scaler.update()
else:
optimizer.step()
+
+ # import pdb; pdb.set_trace()# 0107test
+
optimizer.zero_grad()
# if self.cfg.wandb_log:
# if self.cfg.run_cfg.wandb_log:
diff --git a/requirements.txt b/requirements.txt
index cbfa260..0d7634c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -44,4 +44,6 @@ wheel
visualizer
tensorboard
kmeans_pytorch
-visual_genome
\ No newline at end of file
+visual_genome
+gpustat
+torchviz
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..6a67455
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,36 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from setuptools import setup, find_namespace_packages
+import platform
+
+DEPENDENCY_LINKS = []
+if platform.system() == "Windows":
+ DEPENDENCY_LINKS.append("https://download.pytorch.org/whl/torch_stable.html")
+
+
+def fetch_requirements(filename):
+ with open(filename) as f:
+ return [ln.strip() for ln in f.read().split("\n") if ln.strip()] # skip blank lines
+
+
+setup(
+ name="PromptMoE",
+ version="1.0.1",
+ author="Hanzi Wang",
+ description="PromptMoE & QformerMoE Based on LAVIS",
+ long_description=open("README.md", "r", encoding="utf-8").read(),
+ long_description_content_type="text/markdown",
+ keywords="Vision-Language, Multimodal, Image Captioning, Generative AI, Deep Learning, Library, PyTorch",
+ license="3-Clause BSD",
+ packages=find_namespace_packages(include="minigpt4.*"), # the package dir in this repo is minigpt4, not lavis
+ install_requires=fetch_requirements("requirements.txt"),
+ python_requires=">=3.7.0",
+ include_package_data=True,
+ dependency_links=DEPENDENCY_LINKS,
+ zip_safe=False,
+)
\ No newline at end of file
diff --git a/test.pdf/backward_graph b/test.pdf/backward_graph
new file mode 100644
index 0000000..7867fb1
--- /dev/null
+++ b/test.pdf/backward_graph
@@ -0,0 +1,5570 @@
+digraph {
+ graph [size="778.8,778.8"]
+ node [align=left fontname=monospace fontsize=10 height=0.2 ranksep=0.1 shape=box style=filled]
+ 140509988778688 [label="
+ (1, 49, 768)" fillcolor=darkolivegreen1]
+ 140509588281712 [label=CatBackward0]
+ 140509588282912 -> 140509588281712
+ 140509588282912 [label=IndexBackward0]
+ 140509588281808 -> 140509588282912
+ 140509588281808 [label=SumBackward1]
+ 140509588283152 -> 140509588281808
+ 140509588283152 [label=MulBackward0]
+ 140509588282864 -> 140509588283152
+ 140509588282864 [label=CatBackward0]
+ 140509591316848 -> 140509588282864
+ 140509591316848 [label=UnsqueezeBackward0]
+ 140509591314640 -> 140509591316848
+ 140509591314640 [label=NativeLayerNormBackward0]
+ 140509591317376 -> 140509591314640
+ 140509591317376 [label=AddBackward0]
+ 140509588312944 -> 140509591317376
+ 140509588312944 [label=NativeDropoutBackward0]
+ 140509588313424 -> 140509588312944
+ 140509588313424 [label=ViewBackward0]
+ 140509588313232 -> 140509588313424
+ 140509588313232 [label=AddmmBackward0]
+ 140509588312560 -> 140509588313232
+ 140509588312560 [label=ToCopyBackward0]
+ 140509591318384 -> 140509588312560
+ 140509591260672 [label="encoder.layer.11.experts.experts.0.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591260672 -> 140509591318384
+ 140509591318384 [label=AccumulateGrad]
+ 140509588313040 -> 140509588313232
+ 140509588313040 [label=ViewBackward0]
+ 140509588312368 -> 140509588313040
+ 140509588312368 [label=GeluBackward0]
+ 140509588312176 -> 140509588312368
+ 140509588312176 [label=ViewBackward0]
+ 140509588313328 -> 140509588312176
+ 140509588313328 [label=AddmmBackward0]
+ 140509588313520 -> 140509588313328
+ 140509588313520 [label=ToCopyBackward0]
+ 140509588313808 -> 140509588313520
+ 140509591261072 [label="encoder.layer.11.experts.experts.0.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591261072 -> 140509588313808
+ 140509588313808 [label=AccumulateGrad]
+ 140509588313616 -> 140509588313328
+ 140509588313616 [label=ViewBackward0]
+ 140509588314096 -> 140509588313616
+ 140509588314096 [label=ToCopyBackward0]
+ 140509588312608 -> 140509588314096
+ 140509588312608 [label=SliceBackward0]
+ 140509588314048 -> 140509588312608
+ 140509588314048 [label=SliceBackward0]
+ 140509588314288 -> 140509588314048
+ 140509588314288 [label=SliceBackward0]
+ 140509588314480 -> 140509588314288
+ 140509588314480 [label=SliceBackward0]
+ 140509588314528 -> 140509588314480
+ 140509588314528 [label=SliceBackward0]
+ 140509588314768 -> 140509588314528
+ 140509588314768 [label=NativeLayerNormBackward0]
+ 140509588314960 -> 140509588314768
+ 140509588314960 [label=AddBackward0]
+ 140509588315248 -> 140509588314960
+ 140509588315248 [label=NativeDropoutBackward0]
+ 140509588315632 -> 140509588315248
+ 140509588315632 [label=ViewBackward0]
+ 140509588315824 -> 140509588315632
+ 140509588315824 [label=AddmmBackward0]
+ 140509588316016 -> 140509588315824
+ 140509588316016 [label=ToCopyBackward0]
+ 140509588315968 -> 140509588316016
+ 140509591290880 [label="encoder.layer.11.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591290880 -> 140509588315968
+ 140509588315968 [label=AccumulateGrad]
+ 140509588315728 -> 140509588315824
+ 140509588315728 [label=ViewBackward0]
+ 140509588316112 -> 140509588315728
+ 140509588316112 [label=ViewBackward0]
+ 140509588345136 -> 140509588316112
+ 140509588345136 [label=CloneBackward0]
+ 140509588345184 -> 140509588345136
+ 140509588345184 [label=PermuteBackward0]
+ 140509588345424 -> 140509588345184
+ 140509588345424 [label=UnsafeViewBackward0]
+ 140509588345616 -> 140509588345424
+ 140509588345616 [label=BmmBackward0]
+ 140509588345664 -> 140509588345616
+ 140509588345664 [label=ReshapeAliasBackward0]
+ 140509588346192 -> 140509588345664
+ 140509588346192 [label=ExpandBackward0]
+ 140509588346288 -> 140509588346192
+ 140509588346288 [label=ToCopyBackward0]
+ 140509588346480 -> 140509588346288
+ 140509588346480 [label=NativeDropoutBackward0]
+ 140509588346672 -> 140509588346480
+ 140509588346672 [label=SoftmaxBackward0]
+ 140509588346768 -> 140509588346672
+ 140509588346768 [label=AddBackward0]
+ 140509588346960 -> 140509588346768
+ 140509588346960 [label=DivBackward0]
+ 140509588347152 -> 140509588346960
+ 140509588347152 [label=UnsafeViewBackward0]
+ 140509588347248 -> 140509588347152
+ 140509588347248 [label=BmmBackward0]
+ 140509588347440 -> 140509588347248
+ 140509588347440 [label=UnsafeViewBackward0]
+ 140509588347536 -> 140509588347440
+ 140509588347536 [label=CloneBackward0]
+ 140509588347584 -> 140509588347536
+ 140509588347584 [label=ExpandBackward0]
+ 140509588347824 -> 140509588347584
+ 140509588347824 [label=PermuteBackward0]
+ 140509588348016 -> 140509588347824
+ 140509588348016 [label=ViewBackward0]
+ 140509588348064 -> 140509588348016
+ 140509588348064 [label=ViewBackward0]
+ 140509588348304 -> 140509588348064
+ 140509588348304 [label=AddmmBackward0]
+ 140509588348496 -> 140509588348304
+ 140509588348496 [label=ToCopyBackward0]
+ 140509588348784 -> 140509588348496
+ 140509591291680 [label="encoder.layer.11.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509591291680 -> 140509588348784
+ 140509588348784 [label=AccumulateGrad]
+ 140509588348592 -> 140509588348304
+ 140509588348592 [label=ViewBackward0]
+ 140509588348544 -> 140509588348592
+ 140509588348544 [label=ToCopyBackward0]
+ 140509588315344 -> 140509588348544
+ 140509588315344 [label=CatBackward0]
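+ // expert mixture below: expert outputs are unsqueezed and concatenated (CatBackward0/UnsqueezeBackward0), multiplied by routing weights (MulBackward0), and summed (SumBackward1) -- consistent with the raw-probability weighting in this graph's title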
+ 140509588369568 -> 140509588315344
+ 140509588369568 [label=SumBackward1]
+ 140509588370096 -> 140509588369568
+ 140509588370096 [label=MulBackward0]
+ 140509588370192 -> 140509588370096
+ 140509588370192 [label=CatBackward0]
+ 140509588370288 -> 140509588370192
+ 140509588370288 [label=UnsqueezeBackward0]
+ 140509588370672 -> 140509588370288
+ 140509588370672 [label=NativeLayerNormBackward0]
+ 140509588370864 -> 140509588370672
+ 140509588370864 [label=AddBackward0]
+ 140509588371152 -> 140509588370864
+ 140509588371152 [label=NativeDropoutBackward0]
+ 140509588371248 -> 140509588371152
+ 140509588371248 [label=ViewBackward0]
+ 140509588371440 -> 140509588371248
+ 140509588371440 [label=AddmmBackward0]
+ 140509588371488 -> 140509588371440
+ 140509588371488 [label=ToCopyBackward0]
+ 140509588371920 -> 140509588371488
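+ // --- encoder.layer.10 block (backward order): expert query-FFN -> cross-attention -> self-attention ---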
+ 140509591285568 [label="encoder.layer.10.experts.experts.0.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591285568 -> 140509588371920
+ 140509588371920 [label=AccumulateGrad]
+ 140509588371632 -> 140509588371440
+ 140509588371632 [label=ViewBackward0]
+ 140509588372112 -> 140509588371632
+ 140509588372112 [label=GeluBackward0]
+ 140509588372304 -> 140509588372112
+ 140509588372304 [label=ViewBackward0]
+ 140509588372496 -> 140509588372304
+ 140509588372496 [label=AddmmBackward0]
+ 140509588372592 -> 140509588372496
+ 140509588372592 [label=ToCopyBackward0]
+ 140509588372976 -> 140509588372592
+ 140509591285488 [label="encoder.layer.10.experts.experts.0.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591285488 -> 140509588372976
+ 140509588372976 [label=AccumulateGrad]
+ 140509588372400 -> 140509588372496
+ 140509588372400 [label=ViewBackward0]
+ 140509588372880 -> 140509588372400
+ 140509588372880 [label=ToCopyBackward0]
+ 140509588370960 -> 140509588372880
+ 140509588370960 [label=SliceBackward0]
+ 140509588373264 -> 140509588370960
+ 140509588373264 [label=SliceBackward0]
+ 140509588373456 -> 140509588373264
+ 140509588373456 [label=NativeLayerNormBackward0]
+ 140509588373360 -> 140509588373456
+ 140509588373360 [label=AddBackward0]
+ 140509588402672 -> 140509588373360
+ 140509588402672 [label=NativeDropoutBackward0]
+ 140509588402624 -> 140509588402672
+ 140509588402624 [label=ViewBackward0]
+ 140509588402864 -> 140509588402624
+ 140509588402864 [label=AddmmBackward0]
+ 140509588403056 -> 140509588402864
+ 140509588403056 [label=ToCopyBackward0]
+ 140509588403344 -> 140509588403056
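+ // layer 10 cross-attention sub-block: output.dense and self.query leaves below (same pattern repeats in the other cross-attention layers)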
+ 140509591293840 [label="encoder.layer.10.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591293840 -> 140509588403344
+ 140509588403344 [label=AccumulateGrad]
+ 140509588403152 -> 140509588402864
+ 140509588403152 [label=ViewBackward0]
+ 140509588403632 -> 140509588403152
+ 140509588403632 [label=ViewBackward0]
+ 140509588403728 -> 140509588403632
+ 140509588403728 [label=CloneBackward0]
+ 140509588403920 -> 140509588403728
+ 140509588403920 [label=PermuteBackward0]
+ 140509588404112 -> 140509588403920
+ 140509588404112 [label=UnsafeViewBackward0]
+ 140509588404208 -> 140509588404112
+ 140509588404208 [label=BmmBackward0]
+ 140509588404400 -> 140509588404208
+ 140509588404400 [label=ReshapeAliasBackward0]
+ 140509588404496 -> 140509588404400
+ 140509588404496 [label=ExpandBackward0]
+ 140509588404544 -> 140509588404496
+ 140509588404544 [label=ToCopyBackward0]
+ 140509588404784 -> 140509588404544
+ 140509588404784 [label=NativeDropoutBackward0]
+ 140509588404976 -> 140509588404784
+ 140509588404976 [label=SoftmaxBackward0]
+ 140509588405024 -> 140509588404976
+ 140509588405024 [label=AddBackward0]
+ 140509588405264 -> 140509588405024
+ 140509588405264 [label=DivBackward0]
+ 140509588405456 -> 140509588405264
+ 140509588405456 [label=UnsafeViewBackward0]
+ 140509588405504 -> 140509588405456
+ 140509588405504 [label=BmmBackward0]
+ 140509588405744 -> 140509588405504
+ 140509588405744 [label=UnsafeViewBackward0]
+ 140509588406128 -> 140509588405744
+ 140509588406128 [label=CloneBackward0]
+ 140509588405984 -> 140509588406128
+ 140509588405984 [label=ExpandBackward0]
+ 140509588427056 -> 140509588405984
+ 140509588427056 [label=PermuteBackward0]
+ 140509588427152 -> 140509588427056
+ 140509588427152 [label=ViewBackward0]
+ 140509588427344 -> 140509588427152
+ 140509588427344 [label=ViewBackward0]
+ 140509588427536 -> 140509588427344
+ 140509588427536 [label=AddmmBackward0]
+ 140509588427632 -> 140509588427536
+ 140509588427632 [label=ToCopyBackward0]
+ 140509588428016 -> 140509588427632
+ 140509591312160 [label="encoder.layer.10.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509591312160 -> 140509588428016
+ 140509588428016 [label=AccumulateGrad]
+ 140509588427440 -> 140509588427536
+ 140509588427440 [label=ViewBackward0]
+ 140509588427920 -> 140509588427440
+ 140509588427920 [label=ToCopyBackward0]
+ 140509588402384 -> 140509588427920
+ 140509588402384 [label=SliceBackward0]
+ 140509588428304 -> 140509588402384
+ 140509588428304 [label=SliceBackward0]
+ 140509588428496 -> 140509588428304
+ 140509588428496 [label=SliceBackward0]
+ 140509588428592 -> 140509588428496
+ 140509588428592 [label=NativeLayerNormBackward0]
+ 140509588428784 -> 140509588428592
+ 140509588428784 [label=AddBackward0]
+ 140509588429072 -> 140509588428784
+ 140509588429072 [label=NativeDropoutBackward0]
+ 140509588429168 -> 140509588429072
+ 140509588429168 [label=ViewBackward0]
+ 140509588429360 -> 140509588429168
+ 140509588429360 [label=AddmmBackward0]
+ 140509588429408 -> 140509588429360
+ 140509588429408 [label=ToCopyBackward0]
+ 140509588429840 -> 140509588429408
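+ // layer 10 self-attention sub-block starts here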
+ 140509591312960 [label="encoder.layer.10.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591312960 -> 140509588429840
+ 140509588429840 [label=AccumulateGrad]
+ 140509588429552 -> 140509588429360
+ 140509588429552 [label=ViewBackward0]
+ 140509588430032 -> 140509588429552
+ 140509588430032 [label=ViewBackward0]
+ 140509588430224 -> 140509588430032
+ 140509588430224 [label=CloneBackward0]
+ 140509588430416 -> 140509588430224
+ 140509588430416 [label=PermuteBackward0]
+ 140509588430512 -> 140509588430416
+ 140509588430512 [label=UnsafeViewBackward0]
+ 140509588430704 -> 140509588430512
+ 140509588430704 [label=BmmBackward0]
+ 140509588430608 -> 140509588430704
+ 140509588430608 [label=ReshapeAliasBackward0]
+ 140509588459728 -> 140509588430608
+ 140509588459728 [label=ExpandBackward0]
+ 140509588459824 -> 140509588459728
+ 140509588459824 [label=ToCopyBackward0]
+ 140509588460016 -> 140509588459824
+ 140509588460016 [label=NativeDropoutBackward0]
+ 140509588460064 -> 140509588460016
+ 140509588460064 [label=SoftmaxBackward0]
+ 140509588460304 -> 140509588460064
+ 140509588460304 [label=AddBackward0]
+ 140509588460496 -> 140509588460304
+ 140509588460496 [label=DivBackward0]
+ 140509588460544 -> 140509588460496
+ 140509588460544 [label=UnsafeViewBackward0]
+ 140509588460784 -> 140509588460544
+ 140509588460784 [label=BmmBackward0]
+ 140509588460976 -> 140509588460784
+ 140509588460976 [label=UnsafeViewBackward0]
+ 140509588461360 -> 140509588460976
+ 140509588461360 [label=CloneBackward0]
+ 140509588461552 -> 140509588461360
+ 140509588461552 [label=ExpandBackward0]
+ 140509588461648 -> 140509588461552
+ 140509588461648 [label=PermuteBackward0]
+ 140509588461840 -> 140509588461648
+ 140509588461840 [label=ViewBackward0]
+ 140509588462032 -> 140509588461840
+ 140509588462032 [label=ViewBackward0]
+ 140509588462128 -> 140509588462032
+ 140509588462128 [label=AddmmBackward0]
+ 140509588462320 -> 140509588462128
+ 140509588462320 [label=ToCopyBackward0]
+ 140509588462608 -> 140509588462320
+ 140509591313360 [label="encoder.layer.10.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509591313360 -> 140509588462608
+ 140509588462608 [label=AccumulateGrad]
+ 140509588461984 -> 140509588462128
+ 140509588461984 [label=ViewBackward0]
+ 140509588462464 -> 140509588461984
+ 140509588462464 [label=ToCopyBackward0]
+ 140509588428880 -> 140509588462464
+ 140509588428880 [label=CatBackward0]
+ 140509588462992 -> 140509588428880
+ 140509588462992 [label=SumBackward1]
+ 140509588462944 -> 140509588462992
+ 140509588462944 [label=MulBackward0]
+ 140509588463184 -> 140509588462944
+ 140509588463184 [label=CatBackward0]
+ 140509588463568 -> 140509588463184
+ 140509588463568 [label=UnsqueezeBackward0]
+ 140509588463424 -> 140509588463568
+ 140509588463424 [label=NativeLayerNormBackward0]
+ 140509587960112 -> 140509588463424
+ 140509587960112 [label=AddBackward0]
+ 140509587960400 -> 140509587960112
+ 140509587960400 [label=NativeDropoutBackward0]
+ 140509587960784 -> 140509587960400
+ 140509587960784 [label=ViewBackward0]
+ 140509587960976 -> 140509587960784
+ 140509587960976 [label=AddmmBackward0]
+ 140509587961168 -> 140509587960976
+ 140509587961168 [label=ToCopyBackward0]
+ 140509587961456 -> 140509587961168
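+ // --- encoder.layer.9 block (backward order): expert query-FFN -> self-attention; no cross-attention leaves appear for odd layers in this section of the graph ---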
+ 140509591311680 [label="encoder.layer.9.experts.experts.0.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591311680 -> 140509587961456
+ 140509587961456 [label=AccumulateGrad]
+ 140509587960880 -> 140509587960976
+ 140509587960880 [label=ViewBackward0]
+ 140509587961360 -> 140509587960880
+ 140509587961360 [label=GeluBackward0]
+ 140509587961552 -> 140509587961360
+ 140509587961552 [label=ViewBackward0]
+ 140509587961600 -> 140509587961552
+ 140509587961600 [label=AddmmBackward0]
+ 140509587961840 -> 140509587961600
+ 140509587961840 [label=ToCopyBackward0]
+ 140509587962080 -> 140509587961840
+ 140509591312000 [label="encoder.layer.9.experts.experts.0.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591312000 -> 140509587962080
+ 140509587962080 [label=AccumulateGrad]
+ 140509587961936 -> 140509587961600
+ 140509587961936 [label=ViewBackward0]
+ 140509587962416 -> 140509587961936
+ 140509587962416 [label=ToCopyBackward0]
+ 140509587960496 -> 140509587962416
+ 140509587960496 [label=SliceBackward0]
+ 140509587962512 -> 140509587960496
+ 140509587962512 [label=SliceBackward0]
+ 140509587962560 -> 140509587962512
+ 140509587962560 [label=SliceBackward0]
+ 140509587962800 -> 140509587962560
+ 140509587962800 [label=SliceBackward0]
+ 140509587962992 -> 140509587962800
+ 140509587962992 [label=SliceBackward0]
+ 140509587963040 -> 140509587962992
+ 140509587963040 [label=NativeLayerNormBackward0]
+ 140509587963280 -> 140509587963040
+ 140509587963280 [label=AddBackward0]
+ 140509587963520 -> 140509587963280
+ 140509587963520 [label=NativeDropoutBackward0]
+ 140509587963760 -> 140509587963520
+ 140509587963760 [label=ViewBackward0]
+ 140509587988784 -> 140509587963760
+ 140509587988784 [label=AddmmBackward0]
+ 140509587988976 -> 140509587988784
+ 140509587988976 [label=ToCopyBackward0]
+ 140509587989264 -> 140509587988976
+ 140509591321152 [label="encoder.layer.9.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591321152 -> 140509587989264
+ 140509587989264 [label=AccumulateGrad]
+ 140509587988640 -> 140509587988784
+ 140509587988640 [label=ViewBackward0]
+ 140509587989120 -> 140509587988640
+ 140509587989120 [label=ViewBackward0]
+ 140509587989360 -> 140509587989120
+ 140509587989360 [label=CloneBackward0]
+ 140509587989552 -> 140509587989360
+ 140509587989552 [label=PermuteBackward0]
+ 140509587989600 -> 140509587989552
+ 140509587989600 [label=UnsafeViewBackward0]
+ 140509587989840 -> 140509587989600
+ 140509587989840 [label=BmmBackward0]
+ 140509587990032 -> 140509587989840
+ 140509587990032 [label=ReshapeAliasBackward0]
+ 140509587990416 -> 140509587990032
+ 140509587990416 [label=ExpandBackward0]
+ 140509587990608 -> 140509587990416
+ 140509587990608 [label=ToCopyBackward0]
+ 140509587990704 -> 140509587990608
+ 140509587990704 [label=NativeDropoutBackward0]
+ 140509587990896 -> 140509587990704
+ 140509587990896 [label=SoftmaxBackward0]
+ 140509587991088 -> 140509587990896
+ 140509587991088 [label=AddBackward0]
+ 140509587991184 -> 140509587991088
+ 140509587991184 [label=DivBackward0]
+ 140509587991376 -> 140509587991184
+ 140509587991376 [label=UnsafeViewBackward0]
+ 140509587991568 -> 140509587991376
+ 140509587991568 [label=BmmBackward0]
+ 140509587991664 -> 140509587991568
+ 140509587991664 [label=UnsafeViewBackward0]
+ 140509587991760 -> 140509587991664
+ 140509587991760 [label=CloneBackward0]
+ 140509587991952 -> 140509587991760
+ 140509587991952 [label=ExpandBackward0]
+ 140509587992000 -> 140509587991952
+ 140509587992000 [label=PermuteBackward0]
+ 140509587992240 -> 140509587992000
+ 140509587992240 [label=ViewBackward0]
+ 140509587992432 -> 140509587992240
+ 140509587992432 [label=ViewBackward0]
+ 140509587991520 -> 140509587992432
+ 140509587991520 [label=AddmmBackward0]
+ 140509588021456 -> 140509587991520
+ 140509588021456 [label=ToCopyBackward0]
+ 140509588021696 -> 140509588021456
+ 140509591321952 [label="encoder.layer.9.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509591321952 -> 140509588021696
+ 140509588021696 [label=AccumulateGrad]
+ 140509588021552 -> 140509587991520
+ 140509588021552 [label=ViewBackward0]
+ 140509588022032 -> 140509588021552
+ 140509588022032 [label=ToCopyBackward0]
+ 140509587963664 -> 140509588022032
+ 140509587963664 [label=CatBackward0]
+ 140509588022128 -> 140509587963664
+ 140509588022128 [label=SumBackward1]
+ 140509588022512 -> 140509588022128
+ 140509588022512 [label=MulBackward0]
+ 140509588022704 -> 140509588022512
+ 140509588022704 [label=CatBackward0]
+ 140509588022656 -> 140509588022704
+ 140509588022656 [label=UnsqueezeBackward0]
+ 140509588023184 -> 140509588022656
+ 140509588023184 [label=NativeLayerNormBackward0]
+ 140509588023280 -> 140509588023184
+ 140509588023280 [label=AddBackward0]
+ 140509588023664 -> 140509588023280
+ 140509588023664 [label=NativeDropoutBackward0]
+ 140509588023616 -> 140509588023664
+ 140509588023616 [label=ViewBackward0]
+ 140509588023856 -> 140509588023616
+ 140509588023856 [label=AddmmBackward0]
+ 140509588024048 -> 140509588023856
+ 140509588024048 [label=ToCopyBackward0]
+ 140509588024336 -> 140509588024048
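+ // --- encoder.layer.8 block (backward order): expert query-FFN -> cross-attention -> self-attention ---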
+ 140509591320272 [label="encoder.layer.8.experts.experts.0.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591320272 -> 140509588024336
+ 140509588024336 [label=AccumulateGrad]
+ 140509588024144 -> 140509588023856
+ 140509588024144 [label=ViewBackward0]
+ 140509588024624 -> 140509588024144
+ 140509588024624 [label=GeluBackward0]
+ 140509588024720 -> 140509588024624
+ 140509588024720 [label=ViewBackward0]
+ 140509588024912 -> 140509588024720
+ 140509588024912 [label=AddmmBackward0]
+ 140509588025104 -> 140509588024912
+ 140509588025104 [label=ToCopyBackward0]
+ 140509588025056 -> 140509588025104
+ 140509591320192 [label="encoder.layer.8.experts.experts.0.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591320192 -> 140509588025056
+ 140509588025056 [label=AccumulateGrad]
+ 140509588024816 -> 140509588024912
+ 140509588024816 [label=ViewBackward0]
+ 140509588025200 -> 140509588024816
+ 140509588025200 [label=ToCopyBackward0]
+ 140509588023376 -> 140509588025200
+ 140509588023376 [label=SliceBackward0]
+ 140509588046224 -> 140509588023376
+ 140509588046224 [label=SliceBackward0]
+ 140509588046416 -> 140509588046224
+ 140509588046416 [label=NativeLayerNormBackward0]
+ 140509588046608 -> 140509588046416
+ 140509588046608 [label=AddBackward0]
+ 140509588046800 -> 140509588046608
+ 140509588046800 [label=NativeDropoutBackward0]
+ 140509588047184 -> 140509588046800
+ 140509588047184 [label=ViewBackward0]
+ 140509588047376 -> 140509588047184
+ 140509588047376 [label=AddmmBackward0]
+ 140509588047568 -> 140509588047376
+ 140509588047568 [label=ToCopyBackward0]
+ 140509588047856 -> 140509588047568
+ 140509591341312 [label="encoder.layer.8.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591341312 -> 140509588047856
+ 140509588047856 [label=AccumulateGrad]
+ 140509588047280 -> 140509588047376
+ 140509588047280 [label=ViewBackward0]
+ 140509588047760 -> 140509588047280
+ 140509588047760 [label=ViewBackward0]
+ 140509588047952 -> 140509588047760
+ 140509588047952 [label=CloneBackward0]
+ 140509588048000 -> 140509588047952
+ 140509588048000 [label=PermuteBackward0]
+ 140509588048240 -> 140509588048000
+ 140509588048240 [label=UnsafeViewBackward0]
+ 140509588048432 -> 140509588048240
+ 140509588048432 [label=BmmBackward0]
+ 140509588048480 -> 140509588048432
+ 140509588048480 [label=ReshapeAliasBackward0]
+ 140509588049008 -> 140509588048480
+ 140509588049008 [label=ExpandBackward0]
+ 140509588049104 -> 140509588049008
+ 140509588049104 [label=ToCopyBackward0]
+ 140509588049296 -> 140509588049104
+ 140509588049296 [label=NativeDropoutBackward0]
+ 140509588049488 -> 140509588049296
+ 140509588049488 [label=SoftmaxBackward0]
+ 140509588049584 -> 140509588049488
+ 140509588049584 [label=AddBackward0]
+ 140509588049776 -> 140509588049584
+ 140509588049776 [label=DivBackward0]
+ 140509588049680 -> 140509588049776
+ 140509588049680 [label=UnsafeViewBackward0]
+ 140509588074656 -> 140509588049680
+ 140509588074656 [label=BmmBackward0]
+ 140509588074896 -> 140509588074656
+ 140509588074896 [label=UnsafeViewBackward0]
+ 140509588074992 -> 140509588074896
+ 140509588074992 [label=CloneBackward0]
+ 140509588075040 -> 140509588074992
+ 140509588075040 [label=ExpandBackward0]
+ 140509588075280 -> 140509588075040
+ 140509588075280 [label=PermuteBackward0]
+ 140509588075472 -> 140509588075280
+ 140509588075472 [label=ViewBackward0]
+ 140509588075520 -> 140509588075472
+ 140509588075520 [label=ViewBackward0]
+ 140509588075760 -> 140509588075520
+ 140509588075760 [label=AddmmBackward0]
+ 140509588075952 -> 140509588075760
+ 140509588075952 [label=ToCopyBackward0]
+ 140509588076240 -> 140509588075952
+ 140509591342432 [label="encoder.layer.8.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509591342432 -> 140509588076240
+ 140509588076240 [label=AccumulateGrad]
+ 140509588076048 -> 140509588075760
+ 140509588076048 [label=ViewBackward0]
+ 140509588076528 -> 140509588076048
+ 140509588076528 [label=ToCopyBackward0]
+ 140509588046896 -> 140509588076528
+ 140509588046896 [label=SliceBackward0]
+ 140509588076480 -> 140509588046896
+ 140509588076480 [label=SliceBackward0]
+ 140509588076720 -> 140509588076480
+ 140509588076720 [label=SliceBackward0]
+ 140509588076912 -> 140509588076720
+ 140509588076912 [label=NativeLayerNormBackward0]
+ 140509588076960 -> 140509588076912
+ 140509588076960 [label=AddBackward0]
+ 140509588077392 -> 140509588076960
+ 140509588077392 [label=NativeDropoutBackward0]
+ 140509588077776 -> 140509588077392
+ 140509588077776 [label=ViewBackward0]
+ 140509588077968 -> 140509588077776
+ 140509588077968 [label=AddmmBackward0]
+ 140509588078064 -> 140509588077968
+ 140509588078064 [label=ToCopyBackward0]
+ 140509588078448 -> 140509588078064
+ 140509590823056 [label="encoder.layer.8.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590823056 -> 140509588078448
+ 140509588078448 [label=AccumulateGrad]
+ 140509588077872 -> 140509588077968
+ 140509588077872 [label=ViewBackward0]
+ 140509588078352 -> 140509588077872
+ 140509588078352 [label=ViewBackward0]
+ 140509588078400 -> 140509588078352
+ 140509588078400 [label=CloneBackward0]
+ 140509588078160 -> 140509588078400
+ 140509588078160 [label=PermuteBackward0]
+ 140509588103472 -> 140509588078160
+ 140509588103472 [label=UnsafeViewBackward0]
+ 140509588103520 -> 140509588103472
+ 140509588103520 [label=BmmBackward0]
+ 140509588103760 -> 140509588103520
+ 140509588103760 [label=ReshapeAliasBackward0]
+ 140509588104144 -> 140509588103760
+ 140509588104144 [label=ExpandBackward0]
+ 140509588104336 -> 140509588104144
+ 140509588104336 [label=ToCopyBackward0]
+ 140509588104528 -> 140509588104336
+ 140509588104528 [label=NativeDropoutBackward0]
+ 140509588104624 -> 140509588104528
+ 140509588104624 [label=SoftmaxBackward0]
+ 140509588104816 -> 140509588104624
+ 140509588104816 [label=AddBackward0]
+ 140509588105008 -> 140509588104816
+ 140509588105008 [label=DivBackward0]
+ 140509588105104 -> 140509588105008
+ 140509588105104 [label=UnsafeViewBackward0]
+ 140509588105296 -> 140509588105104
+ 140509588105296 [label=BmmBackward0]
+ 140509588105488 -> 140509588105296
+ 140509588105488 [label=UnsafeViewBackward0]
+ 140509588105440 -> 140509588105488
+ 140509588105440 [label=CloneBackward0]
+ 140509588105680 -> 140509588105440
+ 140509588105680 [label=ExpandBackward0]
+ 140509588105872 -> 140509588105680
+ 140509588105872 [label=PermuteBackward0]
+ 140509588105920 -> 140509588105872
+ 140509588105920 [label=ViewBackward0]
+ 140509588106160 -> 140509588105920
+ 140509588106160 [label=ViewBackward0]
+ 140509588106352 -> 140509588106160
+ 140509588106352 [label=AddmmBackward0]
+ 140509588106400 -> 140509588106352
+ 140509588106400 [label=ToCopyBackward0]
+ 140509588106832 -> 140509588106400
+ 140509590823536 [label="encoder.layer.8.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590823536 -> 140509588106832
+ 140509588106832 [label=AccumulateGrad]
+ 140509588106544 -> 140509588106352
+ 140509588106544 [label=ViewBackward0]
+ 140509588107024 -> 140509588106544
+ 140509588107024 [label=ToCopyBackward0]
+ 140509588077488 -> 140509588107024
+ 140509588077488 [label=CatBackward0]
+ 140509588107120 -> 140509588077488
+ 140509588107120 [label=SumBackward1]
+ 140509588136240 -> 140509588107120
+ 140509588136240 [label=MulBackward0]
+ 140509588136432 -> 140509588136240
+ 140509588136432 [label=CatBackward0]
+ 140509588136528 -> 140509588136432
+ 140509588136528 [label=UnsqueezeBackward0]
+ 140509588136912 -> 140509588136528
+ 140509588136912 [label=NativeLayerNormBackward0]
+ 140509588137104 -> 140509588136912
+ 140509588137104 [label=AddBackward0]
+ 140509588137392 -> 140509588137104
+ 140509588137392 [label=NativeDropoutBackward0]
+ 140509588137488 -> 140509588137392
+ 140509588137488 [label=ViewBackward0]
+ 140509588137536 -> 140509588137488
+ 140509588137536 [label=AddmmBackward0]
+ 140509588137776 -> 140509588137536
+ 140509588137776 [label=ToCopyBackward0]
+ 140509588138016 -> 140509588137776
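+ // --- encoder.layer.7 block (backward order): expert query-FFN -> self-attention ---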
+ 140509591341952 [label="encoder.layer.7.experts.experts.0.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591341952 -> 140509588138016
+ 140509588138016 [label=AccumulateGrad]
+ 140509588137872 -> 140509588137536
+ 140509588137872 [label=ViewBackward0]
+ 140509588138352 -> 140509588137872
+ 140509588138352 [label=GeluBackward0]
+ 140509588138544 -> 140509588138352
+ 140509588138544 [label=ViewBackward0]
+ 140509588138640 -> 140509588138544
+ 140509588138640 [label=AddmmBackward0]
+ 140509588138832 -> 140509588138640
+ 140509588138832 [label=ToCopyBackward0]
+ 140509588139120 -> 140509588138832
+ 140509591342272 [label="encoder.layer.7.experts.experts.0.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591342272 -> 140509588139120
+ 140509588139120 [label=AccumulateGrad]
+ 140509588138496 -> 140509588138640
+ 140509588138496 [label=ViewBackward0]
+ 140509588138976 -> 140509588138496
+ 140509588138976 [label=ToCopyBackward0]
+ 140509588137056 -> 140509588138976
+ 140509588137056 [label=SliceBackward0]
+ 140509588139504 -> 140509588137056
+ 140509588139504 [label=SliceBackward0]
+ 140509588139600 -> 140509588139504
+ 140509588139600 [label=SliceBackward0]
+ 140509588139792 -> 140509588139600
+ 140509588139792 [label=SliceBackward0]
+ 140509588139984 -> 140509588139792
+ 140509588139984 [label=SliceBackward0]
+ 140509588139888 -> 140509588139984
+ 140509588139888 [label=NativeLayerNormBackward0]
+ 140509588164912 -> 140509588139888
+ 140509588164912 [label=AddBackward0]
+ 140509588165200 -> 140509588164912
+ 140509588165200 [label=NativeDropoutBackward0]
+ 140509588165296 -> 140509588165200
+ 140509588165296 [label=ViewBackward0]
+ 140509588165488 -> 140509588165296
+ 140509588165488 [label=AddmmBackward0]
+ 140509588165536 -> 140509588165488
+ 140509588165536 [label=ToCopyBackward0]
+ 140509588165968 -> 140509588165536
+ 140509590839360 [label="encoder.layer.7.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590839360 -> 140509588165968
+ 140509588165968 [label=AccumulateGrad]
+ 140509588165680 -> 140509588165488
+ 140509588165680 [label=ViewBackward0]
+ 140509588166160 -> 140509588165680
+ 140509588166160 [label=ViewBackward0]
+ 140509588166352 -> 140509588166160
+ 140509588166352 [label=CloneBackward0]
+ 140509588166544 -> 140509588166352
+ 140509588166544 [label=PermuteBackward0]
+ 140509588166640 -> 140509588166544
+ 140509588166640 [label=UnsafeViewBackward0]
+ 140509588166832 -> 140509588166640
+ 140509588166832 [label=BmmBackward0]
+ 140509588167024 -> 140509588166832
+ 140509588167024 [label=ReshapeAliasBackward0]
+ 140509588166976 -> 140509588167024
+ 140509588166976 [label=ExpandBackward0]
+ 140509588167216 -> 140509588166976
+ 140509588167216 [label=ToCopyBackward0]
+ 140509588167408 -> 140509588167216
+ 140509588167408 [label=NativeDropoutBackward0]
+ 140509588167456 -> 140509588167408
+ 140509588167456 [label=SoftmaxBackward0]
+ 140509588167696 -> 140509588167456
+ 140509588167696 [label=AddBackward0]
+ 140509588167888 -> 140509588167696
+ 140509588167888 [label=DivBackward0]
+ 140509588167936 -> 140509588167888
+ 140509588167936 [label=UnsafeViewBackward0]
+ 140509588168176 -> 140509588167936
+ 140509588168176 [label=BmmBackward0]
+ 140509588168368 -> 140509588168176
+ 140509588168368 [label=UnsafeViewBackward0]
+ 140509588168416 -> 140509588168368
+ 140509588168416 [label=CloneBackward0]
+ 140509588193584 -> 140509588168416
+ 140509588193584 [label=ExpandBackward0]
+ 140509588193680 -> 140509588193584
+ 140509588193680 [label=PermuteBackward0]
+ 140509588193872 -> 140509588193680
+ 140509588193872 [label=ViewBackward0]
+ 140509588194064 -> 140509588193872
+ 140509588194064 [label=ViewBackward0]
+ 140509588194160 -> 140509588194064
+ 140509588194160 [label=AddmmBackward0]
+ 140509588194352 -> 140509588194160
+ 140509588194352 [label=ToCopyBackward0]
+ 140509588194640 -> 140509588194352
+ 140509590840320 [label="encoder.layer.7.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590840320 -> 140509588194640
+ 140509588194640 [label=AccumulateGrad]
+ 140509588194016 -> 140509588194160
+ 140509588194016 [label=ViewBackward0]
+ 140509588194496 -> 140509588194016
+ 140509588194496 [label=ToCopyBackward0]
+ 140509588165008 -> 140509588194496
+ 140509588165008 [label=CatBackward0]
+ 140509588195024 -> 140509588165008
+ 140509588195024 [label=SumBackward1]
+ 140509588194976 -> 140509588195024
+ 140509588194976 [label=MulBackward0]
+ 140509588195216 -> 140509588194976
+ 140509588195216 [label=CatBackward0]
+ 140509588195600 -> 140509588195216
+ 140509588195600 [label=UnsqueezeBackward0]
+ 140509588195696 -> 140509588195600
+ 140509588195696 [label=NativeLayerNormBackward0]
+ 140509588195888 -> 140509588195696
+ 140509588195888 [label=AddBackward0]
+ 140509588196176 -> 140509588195888
+ 140509588196176 [label=NativeDropoutBackward0]
+ 140509588196560 -> 140509588196176
+ 140509588196560 [label=ViewBackward0]
+ 140509588196752 -> 140509588196560
+ 140509588196752 [label=AddmmBackward0]
+ 140509588196944 -> 140509588196752
+ 140509588196944 [label=ToCopyBackward0]
+ 140509588197232 -> 140509588196944
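+ // --- encoder.layer.6 block (backward order): expert query-FFN -> cross-attention -> self-attention ---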
+ 140509590825776 [label="encoder.layer.6.experts.experts.0.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590825776 -> 140509588197232
+ 140509588197232 [label=AccumulateGrad]
+ 140509588196656 -> 140509588196752
+ 140509588196656 [label=ViewBackward0]
+ 140509588197040 -> 140509588196656
+ 140509588197040 [label=GeluBackward0]
+ 140509588196896 -> 140509588197040
+ 140509588196896 [label=ViewBackward0]
+ 140509587696464 -> 140509588196896
+ 140509587696464 [label=AddmmBackward0]
+ 140509587696368 -> 140509587696464
+ 140509587696368 [label=ToCopyBackward0]
+ 140509587693680 -> 140509587696368
+ 140509590826256 [label="encoder.layer.6.experts.experts.0.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590826256 -> 140509587693680
+ 140509587693680 [label=AccumulateGrad]
+ 140509587696752 -> 140509587696464
+ 140509587696752 [label=ViewBackward0]
+ 140509587693728 -> 140509587696752
+ 140509587693728 [label=ToCopyBackward0]
+ 140509588196272 -> 140509587693728
+ 140509588196272 [label=ViewBackward0]
+ 140509587693872 -> 140509588196272
+ 140509587693872 [label=CloneBackward0]
+ 140509587694064 -> 140509587693872
+ 140509587694064 [label=ExpandBackward0]
+ 140509587694112 -> 140509587694064
+ 140509587694112 [label=UnsqueezeBackward0]
+ 140509587694352 -> 140509587694112
+ 140509587694352 [label=SliceBackward0]
+ 140509587694544 -> 140509587694352
+ 140509587694544 [label=SliceBackward0]
+ 140509587694592 -> 140509587694544
+ 140509587694592 [label=NativeLayerNormBackward0]
+ 140509587694832 -> 140509587694592
+ 140509587694832 [label=AddBackward0]
+ 140509587695072 -> 140509587694832
+ 140509587695072 [label=NativeDropoutBackward0]
+ 140509587695408 -> 140509587695072
+ 140509587695408 [label=ViewBackward0]
+ 140509587695600 -> 140509587695408
+ 140509587695600 [label=AddmmBackward0]
+ 140509587697232 -> 140509587695600
+ 140509587697232 [label=ToCopyBackward0]
+ 140509587696992 -> 140509587697232
+ 140509590842480 [label="encoder.layer.6.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590842480 -> 140509587696992
+ 140509587696992 [label=AccumulateGrad]
+ 140509587697472 -> 140509587695600
+ 140509587697472 [label=ViewBackward0]
+ 140509587697616 -> 140509587697472
+ 140509587697616 [label=ViewBackward0]
+ 140509587696272 -> 140509587697616
+ 140509587696272 [label=CloneBackward0]
+ 140509587696944 -> 140509587696272
+ 140509587696944 [label=PermuteBackward0]
+ 140509587696512 -> 140509587696944
+ 140509587696512 [label=UnsafeViewBackward0]
+ 140509587695984 -> 140509587696512
+ 140509587695984 [label=BmmBackward0]
+ 140509587696032 -> 140509587695984
+ 140509587696032 [label=ReshapeAliasBackward0]
+ 140509587852640 -> 140509587696032
+ 140509587852640 [label=ExpandBackward0]
+ 140509587852544 -> 140509587852640
+ 140509587852544 [label=ToCopyBackward0]
+ 140509587852448 -> 140509587852544
+ 140509587852448 [label=NativeDropoutBackward0]
+ 140509587852352 -> 140509587852448
+ 140509587852352 [label=SoftmaxBackward0]
+ 140509587852256 -> 140509587852352
+ 140509587852256 [label=AddBackward0]
+ 140509587852160 -> 140509587852256
+ 140509587852160 [label=DivBackward0]
+ 140509587852064 -> 140509587852160
+ 140509587852064 [label=UnsafeViewBackward0]
+ 140509587851968 -> 140509587852064
+ 140509587851968 [label=BmmBackward0]
+ 140509587851872 -> 140509587851968
+ 140509587851872 [label=ReshapeAliasBackward0]
+ 140509587851824 -> 140509587851872
+ 140509587851824 [label=ExpandBackward0]
+ 140509587851728 -> 140509587851824
+ 140509587851728 [label=PermuteBackward0]
+ 140509587851632 -> 140509587851728
+ 140509587851632 [label=ViewBackward0]
+ 140509587851536 -> 140509587851632
+ 140509587851536 [label=ViewBackward0]
+ 140509587851440 -> 140509587851536
+ 140509587851440 [label=AddmmBackward0]
+ 140509587851344 -> 140509587851440
+ 140509587851344 [label=ToCopyBackward0]
+ 140509587851152 -> 140509587851344
+ 140509590843200 [label="encoder.layer.6.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590843200 -> 140509587851152
+ 140509587851152 [label=AccumulateGrad]
+ 140509587851296 -> 140509587851440
+ 140509587851296 [label=ViewBackward0]
+ 140509587851008 -> 140509587851296
+ 140509587851008 [label=ToCopyBackward0]
+ 140509587695120 -> 140509587851008
+ 140509587695120 [label=SliceBackward0]
+ 140509587850960 -> 140509587695120
+ 140509587850960 [label=SliceBackward0]
+ 140509587850864 -> 140509587850960
+ 140509587850864 [label=SliceBackward0]
+ 140509587850768 -> 140509587850864
+ 140509587850768 [label=NativeLayerNormBackward0]
+ 140509587850672 -> 140509587850768
+ 140509587850672 [label=AddBackward0]
+ 140509587850480 -> 140509587850672
+ 140509587850480 [label=NativeDropoutBackward0]
+ 140509587850240 -> 140509587850480
+ 140509587850240 [label=ViewBackward0]
+ 140509587850144 -> 140509587850240
+ 140509587850144 [label=AddmmBackward0]
+ 140509587850048 -> 140509587850144
+ 140509587850048 [label=ToCopyBackward0]
+ 140509587849856 -> 140509587850048
+ 140509590856064 [label="encoder.layer.6.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590856064 -> 140509587849856
+ 140509587849856 [label=AccumulateGrad]
+ 140509587850192 -> 140509587850144
+ 140509587850192 [label=ViewBackward0]
+ 140509587849904 -> 140509587850192
+ 140509587849904 [label=ViewBackward0]
+ 140509587849808 -> 140509587849904
+ 140509587849808 [label=CloneBackward0]
+ 140509587849712 -> 140509587849808
+ 140509587849712 [label=PermuteBackward0]
+ 140509587849616 -> 140509587849712
+ 140509587849616 [label=UnsafeViewBackward0]
+ 140509587849520 -> 140509587849616
+ 140509587849520 [label=BmmBackward0]
+ 140509587849424 -> 140509587849520
+ 140509587849424 [label=ReshapeAliasBackward0]
+ 140509587852976 -> 140509587849424
+ 140509587852976 [label=ExpandBackward0]
+ 140509587853072 -> 140509587852976
+ 140509587853072 [label=ToCopyBackward0]
+ 140509587853168 -> 140509587853072
+ 140509587853168 [label=NativeDropoutBackward0]
+ 140509587853264 -> 140509587853168
+ 140509587853264 [label=SoftmaxBackward0]
+ 140509587849280 -> 140509587853264
+ 140509587849280 [label=AddBackward0]
+ 140509587558608 -> 140509587849280
+ 140509587558608 [label=DivBackward0]
+ 140509587558704 -> 140509587558608
+ 140509587558704 [label=UnsafeViewBackward0]
+ 140509587558800 -> 140509587558704
+ 140509587558800 [label=BmmBackward0]
+ 140509587558896 -> 140509587558800
+ 140509587558896 [label=ReshapeAliasBackward0]
+ 140509587559040 -> 140509587558896
+ 140509587559040 [label=ExpandBackward0]
+ 140509587559136 -> 140509587559040
+ 140509587559136 [label=PermuteBackward0]
+ 140509587559232 -> 140509587559136
+ 140509587559232 [label=ViewBackward0]
+ 140509587559328 -> 140509587559232
+ 140509587559328 [label=ViewBackward0]
+ 140509587559424 -> 140509587559328
+ 140509587559424 [label=AddmmBackward0]
+ 140509587559520 -> 140509587559424
+ 140509587559520 [label=ToCopyBackward0]
+ 140509587559712 -> 140509587559520
+ 140509590856784 [label="encoder.layer.6.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590856784 -> 140509587559712
+ 140509587559712 [label=AccumulateGrad]
+ 140509587559472 -> 140509587559424
+ 140509587559472 [label=ViewBackward0]
+ 140509587559760 -> 140509587559472
+ 140509587559760 [label=ToCopyBackward0]
+ 140509587850432 -> 140509587559760
+ 140509587850432 [label=CatBackward0]
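+ // from encoder.layer.5 down, CatBackward0 feeds LayerNorm directly (no Mul/Sum mixture) and parameters read experts.* instead of experts.experts.0.*, suggesting a single shared query-FFN rather than a routed mixture in these layers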
+ 140509587559904 -> 140509587850432
+ 140509587559904 [label=NativeLayerNormBackward0]
+ 140509587560048 -> 140509587559904
+ 140509587560048 [label=AddBackward0]
+ 140509587560240 -> 140509587560048
+ 140509587560240 [label=NativeDropoutBackward0]
+ 140509587560384 -> 140509587560240
+ 140509587560384 [label=ViewBackward0]
+ 140509587560480 -> 140509587560384
+ 140509587560480 [label=AddmmBackward0]
+ 140509587560576 -> 140509587560480
+ 140509587560576 [label=ToCopyBackward0]
+ 140509587560768 -> 140509587560576
+ 140509590857264 [label="encoder.layer.5.experts.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590857264 -> 140509587560768
+ 140509587560768 [label=AccumulateGrad]
+ 140509587560528 -> 140509587560480
+ 140509587560528 [label=ViewBackward0]
+ 140509587560816 -> 140509587560528
+ 140509587560816 [label=GeluBackward0]
+ 140509587560912 -> 140509587560816
+ 140509587560912 [label=ViewBackward0]
+ 140509587561008 -> 140509587560912
+ 140509587561008 [label=AddmmBackward0]
+ 140509587561104 -> 140509587561008
+ 140509587561104 [label=ToCopyBackward0]
+ 140509587561296 -> 140509587561104
+ 140509590857504 [label="encoder.layer.5.experts.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590857504 -> 140509587561296
+ 140509587561296 [label=AccumulateGrad]
+ 140509587561056 -> 140509587561008
+ 140509587561056 [label=ViewBackward0]
+ 140509587561344 -> 140509587561056
+ 140509587561344 [label=ToCopyBackward0]
+ 140509587560192 -> 140509587561344
+ 140509587560192 [label=SliceBackward0]
+ 140509587561488 -> 140509587560192
+ 140509587561488 [label=SliceBackward0]
+ 140509587561584 -> 140509587561488
+ 140509587561584 [label=SliceBackward0]
+ 140509587561680 -> 140509587561584
+ 140509587561680 [label=SliceBackward0]
+ 140509587561776 -> 140509587561680
+ 140509587561776 [label=SliceBackward0]
+ 140509587561872 -> 140509587561776
+ 140509587561872 [label=NativeLayerNormBackward0]
+ 140509587561968 -> 140509587561872
+ 140509587561968 [label=AddBackward0]
+ 140509587562160 -> 140509587561968
+ 140509587562160 [label=NativeDropoutBackward0]
+ 140509587562304 -> 140509587562160
+ 140509587562304 [label=ViewBackward0]
+ 140509587562400 -> 140509587562304
+ 140509587562400 [label=AddmmBackward0]
+ 140509587562448 -> 140509587562400
+ 140509587562448 [label=ToCopyBackward0]
+ 140509587570944 -> 140509587562448
+ 140509590859424 [label="encoder.layer.5.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590859424 -> 140509587570944
+ 140509587570944 [label=AccumulateGrad]
+ 140509587562208 -> 140509587562400
+ 140509587562208 [label=ViewBackward0]
+ 140509587570992 -> 140509587562208
+ 140509587570992 [label=ViewBackward0]
+ 140509587571136 -> 140509587570992
+ 140509587571136 [label=CloneBackward0]
+ 140509587571232 -> 140509587571136
+ 140509587571232 [label=PermuteBackward0]
+ 140509587571328 -> 140509587571232
+ 140509587571328 [label=UnsafeViewBackward0]
+ 140509587571424 -> 140509587571328
+ 140509587571424 [label=BmmBackward0]
+ 140509587571520 -> 140509587571424
+ 140509587571520 [label=ReshapeAliasBackward0]
+ 140509587571664 -> 140509587571520
+ 140509587571664 [label=ExpandBackward0]
+ 140509587571760 -> 140509587571664
+ 140509587571760 [label=ToCopyBackward0]
+ 140509587571856 -> 140509587571760
+ 140509587571856 [label=NativeDropoutBackward0]
+ 140509587571952 -> 140509587571856
+ 140509587571952 [label=SoftmaxBackward0]
+ 140509587572048 -> 140509587571952
+ 140509587572048 [label=AddBackward0]
+ 140509587572144 -> 140509587572048
+ 140509587572144 [label=DivBackward0]
+ 140509587572240 -> 140509587572144
+ 140509587572240 [label=UnsafeViewBackward0]
+ 140509587572336 -> 140509587572240
+ 140509587572336 [label=BmmBackward0]
+ 140509587572432 -> 140509587572336
+ 140509587572432 [label=ReshapeAliasBackward0]
+ 140509587572576 -> 140509587572432
+ 140509587572576 [label=ExpandBackward0]
+ 140509587572672 -> 140509587572576
+ 140509587572672 [label=PermuteBackward0]
+ 140509587572768 -> 140509587572672
+ 140509587572768 [label=ViewBackward0]
+ 140509587572864 -> 140509587572768
+ 140509587572864 [label=ViewBackward0]
+ 140509587572960 -> 140509587572864
+ 140509587572960 [label=AddmmBackward0]
+ 140509587573056 -> 140509587572960
+ 140509587573056 [label=ToCopyBackward0]
+ 140509587573248 -> 140509587573056
+ 140509590872528 [label="encoder.layer.5.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590872528 -> 140509587573248
+ 140509587573248 [label=AccumulateGrad]
+ 140509587573008 -> 140509587572960
+ 140509587573008 [label=ViewBackward0]
+ 140509587573296 -> 140509587573008
+ 140509587573296 [label=ToCopyBackward0]
+ 140509587562112 -> 140509587573296
+ 140509587562112 [label=CatBackward0]
+ 140509587573440 -> 140509587562112
+ 140509587573440 [label=NativeLayerNormBackward0]
+ 140509587573584 -> 140509587573440
+ 140509587573584 [label=AddBackward0]
+ 140509587573776 -> 140509587573584
+ 140509587573776 [label=NativeDropoutBackward0]
+ 140509587573920 -> 140509587573776
+ 140509587573920 [label=ViewBackward0]
+ 140509587574016 -> 140509587573920
+ 140509587574016 [label=AddmmBackward0]
+ 140509587574112 -> 140509587574016
+ 140509587574112 [label=ToCopyBackward0]
+ 140509587574304 -> 140509587574112
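+ // --- encoder.layer.4 block (backward order): shared query-FFN -> cross-attention -> self-attention ---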
+ 140509590873008 [label="encoder.layer.4.experts.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590873008 -> 140509587574304
+ 140509587574304 [label=AccumulateGrad]
+ 140509587574064 -> 140509587574016
+ 140509587574064 [label=ViewBackward0]
+ 140509587574352 -> 140509587574064
+ 140509587574352 [label=GeluBackward0]
+ 140509587574448 -> 140509587574352
+ 140509587574448 [label=ViewBackward0]
+ 140509587574544 -> 140509587574448
+ 140509587574544 [label=AddmmBackward0]
+ 140509587574640 -> 140509587574544
+ 140509587574640 [label=ToCopyBackward0]
+ 140509587574736 -> 140509587574640
+ 140509590873248 [label="encoder.layer.4.experts.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590873248 -> 140509587574736
+ 140509587574736 [label=AccumulateGrad]
+ 140509587574592 -> 140509587574544
+ 140509587574592 [label=ViewBackward0]
+ 140509587591232 -> 140509587574592
+ 140509587591232 [label=ToCopyBackward0]
+ 140509587573728 -> 140509587591232
+ 140509587573728 [label=SliceBackward0]
+ 140509587591472 -> 140509587573728
+ 140509587591472 [label=SliceBackward0]
+ 140509587591568 -> 140509587591472
+ 140509587591568 [label=NativeLayerNormBackward0]
+ 140509587591664 -> 140509587591568
+ 140509587591664 [label=AddBackward0]
+ 140509587591856 -> 140509587591664
+ 140509587591856 [label=NativeDropoutBackward0]
+ 140509587592000 -> 140509587591856
+ 140509587592000 [label=ViewBackward0]
+ 140509587592096 -> 140509587592000
+ 140509587592096 [label=AddmmBackward0]
+ 140509587592192 -> 140509587592096
+ 140509587592192 [label=ToCopyBackward0]
+ 140509587592384 -> 140509587592192
+ 140509590875168 [label="encoder.layer.4.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590875168 -> 140509587592384
+ 140509587592384 [label=AccumulateGrad]
+ 140509587592144 -> 140509587592096
+ 140509587592144 [label=ViewBackward0]
+ 140509587592432 -> 140509587592144
+ 140509587592432 [label=ViewBackward0]
+ 140509587592528 -> 140509587592432
+ 140509587592528 [label=CloneBackward0]
+ 140509587592624 -> 140509587592528
+ 140509587592624 [label=PermuteBackward0]
+ 140509587592720 -> 140509587592624
+ 140509587592720 [label=UnsafeViewBackward0]
+ 140509587592816 -> 140509587592720
+ 140509587592816 [label=BmmBackward0]
+ 140509587592912 -> 140509587592816
+ 140509587592912 [label=ReshapeAliasBackward0]
+ 140509587593056 -> 140509587592912
+ 140509587593056 [label=ExpandBackward0]
+ 140509587593152 -> 140509587593056
+ 140509587593152 [label=ToCopyBackward0]
+ 140509587593248 -> 140509587593152
+ 140509587593248 [label=NativeDropoutBackward0]
+ 140509587593344 -> 140509587593248
+ 140509587593344 [label=SoftmaxBackward0]
+ 140509587593440 -> 140509587593344
+ 140509587593440 [label=AddBackward0]
+ 140509587593536 -> 140509587593440
+ 140509587593536 [label=DivBackward0]
+ 140509587593632 -> 140509587593536
+ 140509587593632 [label=UnsafeViewBackward0]
+ 140509587593728 -> 140509587593632
+ 140509587593728 [label=BmmBackward0]
+ 140509587593824 -> 140509587593728
+ 140509587593824 [label=ReshapeAliasBackward0]
+ 140509587593968 -> 140509587593824
+ 140509587593968 [label=ExpandBackward0]
+ 140509587594064 -> 140509587593968
+ 140509587594064 [label=PermuteBackward0]
+ 140509587594160 -> 140509587594064
+ 140509587594160 [label=ViewBackward0]
+ 140509587594256 -> 140509587594160
+ 140509587594256 [label=ViewBackward0]
+ 140509587594352 -> 140509587594256
+ 140509587594352 [label=AddmmBackward0]
+ 140509587594448 -> 140509587594352
+ 140509587594448 [label=ToCopyBackward0]
+ 140509587594640 -> 140509587594448
+ 140509590875888 [label="encoder.layer.4.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590875888 -> 140509587594640
+ 140509587594640 [label=AccumulateGrad]
+ 140509587594400 -> 140509587594352
+ 140509587594400 [label=ViewBackward0]
+ 140509587594688 -> 140509587594400
+ 140509587594688 [label=ToCopyBackward0]
+ 140509587591808 -> 140509587594688
+ 140509587591808 [label=SliceBackward0]
+ 140509587594832 -> 140509587591808
+ 140509587594832 [label=SliceBackward0]
+ 140509587594928 -> 140509587594832
+ 140509587594928 [label=SliceBackward0]
+ 140509587595024 -> 140509587594928
+ 140509587595024 [label=NativeLayerNormBackward0]
+ 140509587595120 -> 140509587595024
+ 140509587595120 [label=AddBackward0]
+ 140509587595216 -> 140509587595120
+ 140509587595216 [label=NativeDropoutBackward0]
+ 140509587607808 -> 140509587595216
+ 140509587607808 [label=ViewBackward0]
+ 140509587607904 -> 140509587607808
+ 140509587607904 [label=AddmmBackward0]
+ 140509587608000 -> 140509587607904
+ 140509587608000 [label=ToCopyBackward0]
+ 140509587608192 -> 140509587608000
+ 140509590892848 [label="encoder.layer.4.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590892848 -> 140509587608192
+ 140509587608192 [label=AccumulateGrad]
+ 140509587607952 -> 140509587607904
+ 140509587607952 [label=ViewBackward0]
+ 140509587608240 -> 140509587607952
+ 140509587608240 [label=ViewBackward0]
+ 140509587608336 -> 140509587608240
+ 140509587608336 [label=CloneBackward0]
+ 140509587608432 -> 140509587608336
+ 140509587608432 [label=PermuteBackward0]
+ 140509587608528 -> 140509587608432
+ 140509587608528 [label=UnsafeViewBackward0]
+ 140509587608624 -> 140509587608528
+ 140509587608624 [label=BmmBackward0]
+ 140509587608720 -> 140509587608624
+ 140509587608720 [label=ReshapeAliasBackward0]
+ 140509587608864 -> 140509587608720
+ 140509587608864 [label=ExpandBackward0]
+ 140509587608960 -> 140509587608864
+ 140509587608960 [label=ToCopyBackward0]
+ 140509587609056 -> 140509587608960
+ 140509587609056 [label=NativeDropoutBackward0]
+ 140509587609152 -> 140509587609056
+ 140509587609152 [label=SoftmaxBackward0]
+ 140509587609248 -> 140509587609152
+ 140509587609248 [label=AddBackward0]
+ 140509587609344 -> 140509587609248
+ 140509587609344 [label=DivBackward0]
+ 140509587609440 -> 140509587609344
+ 140509587609440 [label=UnsafeViewBackward0]
+ 140509587609536 -> 140509587609440
+ 140509587609536 [label=BmmBackward0]
+ 140509587609632 -> 140509587609536
+ 140509587609632 [label=ReshapeAliasBackward0]
+ 140509587609776 -> 140509587609632
+ 140509587609776 [label=ExpandBackward0]
+ 140509587609872 -> 140509587609776
+ 140509587609872 [label=PermuteBackward0]
+ 140509587609968 -> 140509587609872
+ 140509587609968 [label=ViewBackward0]
+ 140509587610064 -> 140509587609968
+ 140509587610064 [label=ViewBackward0]
+ 140509587610160 -> 140509587610064
+ 140509587610160 [label=AddmmBackward0]
+ 140509587610256 -> 140509587610160
+ 140509587610256 [label=ToCopyBackward0]
+ 140509587610448 -> 140509587610256
+ 140509590893568 [label="encoder.layer.4.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590893568 -> 140509587610448
+ 140509587610448 [label=AccumulateGrad]
+ 140509587610208 -> 140509587610160
+ 140509587610208 [label=ViewBackward0]
+ 140509587610496 -> 140509587610208
+ 140509587610496 [label=ToCopyBackward0]
+ 140509587607664 -> 140509587610496
+ 140509587607664 [label=CatBackward0]
+ 140509587610640 -> 140509587607664
+ 140509587610640 [label=NativeLayerNormBackward0]
+ 140509587610784 -> 140509587610640
+ 140509587610784 [label=AddBackward0]
+ 140509587610976 -> 140509587610784
+ 140509587610976 [label=NativeDropoutBackward0]
+ 140509587611120 -> 140509587610976
+ 140509587611120 [label=ViewBackward0]
+ 140509587611216 -> 140509587611120
+ 140509587611216 [label=AddmmBackward0]
+ 140509587611312 -> 140509587611216
+ 140509587611312 [label=ToCopyBackward0]
+ 140509587611504 -> 140509587611312
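+ // --- encoder.layer.3 block (backward order): shared query-FFN -> self-attention ---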
+ 140509590894048 [label="encoder.layer.3.experts.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590894048 -> 140509587611504
+ 140509587611504 [label=AccumulateGrad]
+ 140509587611264 -> 140509587611216
+ 140509587611264 [label=ViewBackward0]
+ 140509587611552 -> 140509587611264
+ 140509587611552 [label=GeluBackward0]
+ 140509587611408 -> 140509587611552
+ 140509587611408 [label=ViewBackward0]
+ 140509587624096 -> 140509587611408
+ 140509587624096 [label=AddmmBackward0]
+ 140509587624192 -> 140509587624096
+ 140509587624192 [label=ToCopyBackward0]
+ 140509587624384 -> 140509587624192
+ 140509590894288 [label="encoder.layer.3.experts.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590894288 -> 140509587624384
+ 140509587624384 [label=AccumulateGrad]
+ 140509587624144 -> 140509587624096
+ 140509587624144 [label=ViewBackward0]
+ 140509587624432 -> 140509587624144
+ 140509587624432 [label=ToCopyBackward0]
+ 140509587610928 -> 140509587624432
+ 140509587610928 [label=SliceBackward0]
+ 140509587624576 -> 140509587610928
+ 140509587624576 [label=SliceBackward0]
+ 140509587624672 -> 140509587624576
+ 140509587624672 [label=SliceBackward0]
+ 140509587624768 -> 140509587624672
+ 140509587624768 [label=SliceBackward0]
+ 140509587624864 -> 140509587624768
+ 140509587624864 [label=SliceBackward0]
+ 140509587624960 -> 140509587624864
+ 140509587624960 [label=NativeLayerNormBackward0]
+ 140509587625056 -> 140509587624960
+ 140509587625056 [label=AddBackward0]
+ 140509587625248 -> 140509587625056
+ 140509587625248 [label=NativeDropoutBackward0]
+ 140509587625392 -> 140509587625248
+ 140509587625392 [label=ViewBackward0]
+ 140509587625488 -> 140509587625392
+ 140509587625488 [label=AddmmBackward0]
+ 140509587625584 -> 140509587625488
+ 140509587625584 [label=ToCopyBackward0]
+ 140509587625776 -> 140509587625584
+ 140509590896208 [label="encoder.layer.3.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590896208 -> 140509587625776
+ 140509587625776 [label=AccumulateGrad]
+ 140509587625536 -> 140509587625488
+ 140509587625536 [label=ViewBackward0]
+ 140509587625824 -> 140509587625536
+ 140509587625824 [label=ViewBackward0]
+ 140509587625920 -> 140509587625824
+ 140509587625920 [label=CloneBackward0]
+ 140509587626016 -> 140509587625920
+ 140509587626016 [label=PermuteBackward0]
+ 140509587626112 -> 140509587626016
+ 140509587626112 [label=UnsafeViewBackward0]
+ 140509587626208 -> 140509587626112
+ 140509587626208 [label=BmmBackward0]
+ 140509587626304 -> 140509587626208
+ 140509587626304 [label=ReshapeAliasBackward0]
+ 140509587626448 -> 140509587626304
+ 140509587626448 [label=ExpandBackward0]
+ 140509587626544 -> 140509587626448
+ 140509587626544 [label=ToCopyBackward0]
+ 140509587626640 -> 140509587626544
+ 140509587626640 [label=NativeDropoutBackward0]
+ 140509587626736 -> 140509587626640
+ 140509587626736 [label=SoftmaxBackward0]
+ 140509587626832 -> 140509587626736
+ 140509587626832 [label=AddBackward0]
+ 140509587626928 -> 140509587626832
+ 140509587626928 [label=DivBackward0]
+ 140509587627024 -> 140509587626928
+ 140509587627024 [label=UnsafeViewBackward0]
+ 140509587627120 -> 140509587627024
+ 140509587627120 [label=BmmBackward0]
+ 140509587627216 -> 140509587627120
+ 140509587627216 [label=ReshapeAliasBackward0]
+ 140509587627360 -> 140509587627216
+ 140509587627360 [label=ExpandBackward0]
+ 140509587627456 -> 140509587627360
+ 140509587627456 [label=PermuteBackward0]
+ 140509587627552 -> 140509587627456
+ 140509587627552 [label=ViewBackward0]
+ 140509587627648 -> 140509587627552
+ 140509587627648 [label=ViewBackward0]
+ 140509587627744 -> 140509587627648
+ 140509587627744 [label=AddmmBackward0]
+ 140509587627840 -> 140509587627744
+ 140509587627840 [label=ToCopyBackward0]
+ 140509587627984 -> 140509587627840
+ 140509590901120 [label="encoder.layer.3.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590901120 -> 140509587627984
+ 140509587627984 [label=AccumulateGrad]
+ 140509587627792 -> 140509587627744
+ 140509587627792 [label=ViewBackward0]
+ 140509587627936 -> 140509587627792
+ 140509587627936 [label=ToCopyBackward0]
+ 140509587625200 -> 140509587627936
+ 140509587625200 [label=CatBackward0]
+ 140509587640576 -> 140509587625200
+ 140509587640576 [label=NativeLayerNormBackward0]
+ 140509587640720 -> 140509587640576
+ 140509587640720 [label=AddBackward0]
+ 140509587640912 -> 140509587640720
+ 140509587640912 [label=NativeDropoutBackward0]
+ 140509587641056 -> 140509587640912
+ 140509587641056 [label=ViewBackward0]
+ 140509587641152 -> 140509587641056
+ 140509587641152 [label=AddmmBackward0]
+ 140509587641248 -> 140509587641152
+ 140509587641248 [label=ToCopyBackward0]
+ 140509587641440 -> 140509587641248
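+ // --- encoder.layer.2 block (backward order): shared query-FFN -> cross-attention -> self-attention ---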
+ 140509590901600 [label="encoder.layer.2.experts.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590901600 -> 140509587641440
+ 140509587641440 [label=AccumulateGrad]
+ 140509587641200 -> 140509587641152
+ 140509587641200 [label=ViewBackward0]
+ 140509587641488 -> 140509587641200
+ 140509587641488 [label=GeluBackward0]
+ 140509587641584 -> 140509587641488
+ 140509587641584 [label=ViewBackward0]
+ 140509587641680 -> 140509587641584
+ 140509587641680 [label=AddmmBackward0]
+ 140509587641776 -> 140509587641680
+ 140509587641776 [label=ToCopyBackward0]
+ 140509587641968 -> 140509587641776
+ 140509590901840 [label="encoder.layer.2.experts.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590901840 -> 140509587641968
+ 140509587641968 [label=AccumulateGrad]
+ 140509587641728 -> 140509587641680
+ 140509587641728 [label=ViewBackward0]
+ 140509587642016 -> 140509587641728
+ 140509587642016 [label=ToCopyBackward0]
+ 140509587640864 -> 140509587642016
+ 140509587640864 [label=SliceBackward0]
+ 140509587642160 -> 140509587640864
+ 140509587642160 [label=SliceBackward0]
+ 140509587642256 -> 140509587642160
+ 140509587642256 [label=NativeLayerNormBackward0]
+ 140509587642352 -> 140509587642256
+ 140509587642352 [label=AddBackward0]
+ 140509587642544 -> 140509587642352
+ 140509587642544 [label=NativeDropoutBackward0]
+ 140509587642688 -> 140509587642544
+ 140509587642688 [label=ViewBackward0]
+ 140509587642784 -> 140509587642688
+ 140509587642784 [label=AddmmBackward0]
+ 140509587642880 -> 140509587642784
+ 140509587642880 [label=ToCopyBackward0]
+ 140509587643072 -> 140509587642880
+ 140509590903760 [label="encoder.layer.2.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590903760 -> 140509587643072
+ 140509587643072 [label=AccumulateGrad]
+ 140509587642832 -> 140509587642784
+ 140509587642832 [label=ViewBackward0]
+ 140509587643120 -> 140509587642832
+ 140509587643120 [label=ViewBackward0]
+ 140509587643216 -> 140509587643120
+ 140509587643216 [label=CloneBackward0]
+ 140509587643312 -> 140509587643216
+ 140509587643312 [label=PermuteBackward0]
+ 140509587643408 -> 140509587643312
+ 140509587643408 [label=UnsafeViewBackward0]
+ 140509587643504 -> 140509587643408
+ 140509587643504 [label=BmmBackward0]
+ 140509587643600 -> 140509587643504
+ 140509587643600 [label=ReshapeAliasBackward0]
+ 140509587643744 -> 140509587643600
+ 140509587643744 [label=ExpandBackward0]
+ 140509587643840 -> 140509587643744
+ 140509587643840 [label=ToCopyBackward0]
+ 140509587643936 -> 140509587643840
+ 140509587643936 [label=NativeDropoutBackward0]
+ 140509587644032 -> 140509587643936
+ 140509587644032 [label=SoftmaxBackward0]
+ 140509587644128 -> 140509587644032
+ 140509587644128 [label=AddBackward0]
+ 140509587644224 -> 140509587644128
+ 140509587644224 [label=DivBackward0]
+ 140509587644320 -> 140509587644224
+ 140509587644320 [label=UnsafeViewBackward0]
+ 140509587644368 -> 140509587644320
+ 140509587644368 [label=BmmBackward0]
+ 140509587656864 -> 140509587644368
+ 140509587656864 [label=ReshapeAliasBackward0]
+ 140509587657008 -> 140509587656864
+ 140509587657008 [label=ExpandBackward0]
+ 140509587657104 -> 140509587657008
+ 140509587657104 [label=PermuteBackward0]
+ 140509587657200 -> 140509587657104
+ 140509587657200 [label=ViewBackward0]
+ 140509587657296 -> 140509587657200
+ 140509587657296 [label=ViewBackward0]
+ 140509587657392 -> 140509587657296
+ 140509587657392 [label=AddmmBackward0]
+ 140509587657488 -> 140509587657392
+ 140509587657488 [label=ToCopyBackward0]
+ 140509587657680 -> 140509587657488
+ 140509590904480 [label="encoder.layer.2.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590904480 -> 140509587657680
+ 140509587657680 [label=AccumulateGrad]
+ 140509587657440 -> 140509587657392
+ 140509587657440 [label=ViewBackward0]
+ 140509587657728 -> 140509587657440
+ 140509587657728 [label=ToCopyBackward0]
+ 140509587642496 -> 140509587657728
+ 140509587642496 [label=SliceBackward0]
+ 140509587657872 -> 140509587642496
+ 140509587657872 [label=SliceBackward0]
+ 140509587657968 -> 140509587657872
+ 140509587657968 [label=SliceBackward0]
+ 140509587658064 -> 140509587657968
+ 140509587658064 [label=NativeLayerNormBackward0]
+ 140509587658160 -> 140509587658064
+ 140509587658160 [label=AddBackward0]
+ 140509587658352 -> 140509587658160
+ 140509587658352 [label=NativeDropoutBackward0]
+ 140509587658496 -> 140509587658352
+ 140509587658496 [label=ViewBackward0]
+ 140509587658592 -> 140509587658496
+ 140509587658592 [label=AddmmBackward0]
+ 140509587658688 -> 140509587658592
+ 140509587658688 [label=ToCopyBackward0]
+ 140509587658880 -> 140509587658688
+ 140509590913248 [label="encoder.layer.2.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590913248 -> 140509587658880
+ 140509587658880 [label=AccumulateGrad]
+ 140509587658640 -> 140509587658592
+ 140509587658640 [label=ViewBackward0]
+ 140509587658928 -> 140509587658640
+ 140509587658928 [label=ViewBackward0]
+ 140509587659024 -> 140509587658928
+ 140509587659024 [label=CloneBackward0]
+ 140509587659120 -> 140509587659024
+ 140509587659120 [label=PermuteBackward0]
+ 140509587659216 -> 140509587659120
+ 140509587659216 [label=UnsafeViewBackward0]
+ 140509587659312 -> 140509587659216
+ 140509587659312 [label=BmmBackward0]
+ 140509587659408 -> 140509587659312
+ 140509587659408 [label=ReshapeAliasBackward0]
+ 140509587659552 -> 140509587659408
+ 140509587659552 [label=ExpandBackward0]
+ 140509587659648 -> 140509587659552
+ 140509587659648 [label=ToCopyBackward0]
+ 140509587659744 -> 140509587659648
+ 140509587659744 [label=NativeDropoutBackward0]
+ 140509587659840 -> 140509587659744
+ 140509587659840 [label=SoftmaxBackward0]
+ 140509587659936 -> 140509587659840
+ 140509587659936 [label=AddBackward0]
+ 140509587660032 -> 140509587659936
+ 140509587660032 [label=DivBackward0]
+ 140509587660128 -> 140509587660032
+ 140509587660128 [label=UnsafeViewBackward0]
+ 140509587660224 -> 140509587660128
+ 140509587660224 [label=BmmBackward0]
+ 140509587660320 -> 140509587660224
+ 140509587660320 [label=ReshapeAliasBackward0]
+ 140509587660464 -> 140509587660320
+ 140509587660464 [label=ExpandBackward0]
+ 140509587660560 -> 140509587660464
+ 140509587660560 [label=PermuteBackward0]
+ 140509587660656 -> 140509587660560
+ 140509587660656 [label=ViewBackward0]
+ 140509587660752 -> 140509587660656
+ 140509587660752 [label=ViewBackward0]
+ 140509587660368 -> 140509587660752
+ 140509587660368 [label=AddmmBackward0]
+ 140509587673296 -> 140509587660368
+ 140509587673296 [label=ToCopyBackward0]
+ 140509587673488 -> 140509587673296
+ 140509590913968 [label="encoder.layer.2.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590913968 -> 140509587673488
+ 140509587673488 [label=AccumulateGrad]
+ 140509587673248 -> 140509587660368
+ 140509587673248 [label=ViewBackward0]
+ 140509587673536 -> 140509587673248
+ 140509587673536 [label=ToCopyBackward0]
+ 140509587658304 -> 140509587673536
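+ // CatBackward0: layer 2's input hidden states are the concatenation of layer 1's
+ // query branch (experts.output_query) and its text branch (output.dense/LayerNorm, traversed further below).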
+ 140509587658304 [label=CatBackward0]
+ 140509587673680 -> 140509587658304
+ 140509587673680 [label=NativeLayerNormBackward0]
+ 140509587673824 -> 140509587673680
+ 140509587673824 [label=AddBackward0]
+ 140509587674016 -> 140509587673824
+ 140509587674016 [label=NativeDropoutBackward0]
+ 140509587674160 -> 140509587674016
+ 140509587674160 [label=ViewBackward0]
+ 140509587674256 -> 140509587674160
+ 140509587674256 [label=AddmmBackward0]
+ 140509587674352 -> 140509587674256
+ 140509587674352 [label=ToCopyBackward0]
+ 140509587674544 -> 140509587674352
+ 140509590914448 [label="encoder.layer.1.experts.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590914448 -> 140509587674544
+ 140509587674544 [label=AccumulateGrad]
+ 140509587674304 -> 140509587674256
+ 140509587674304 [label=ViewBackward0]
+ 140509587674592 -> 140509587674304
+ 140509587674592 [label=GeluBackward0]
+ 140509587674688 -> 140509587674592
+ 140509587674688 [label=ViewBackward0]
+ 140509587674784 -> 140509587674688
+ 140509587674784 [label=AddmmBackward0]
+ 140509587674880 -> 140509587674784
+ 140509587674880 [label=ToCopyBackward0]
+ 140509587675072 -> 140509587674880
+ 140509590914688 [label="encoder.layer.1.experts.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590914688 -> 140509587675072
+ 140509587675072 [label=AccumulateGrad]
+ 140509587674832 -> 140509587674784
+ 140509587674832 [label=ViewBackward0]
+ 140509587675120 -> 140509587674832
+ 140509587675120 [label=ToCopyBackward0]
+ 140509587673968 -> 140509587675120
+ 140509587673968 [label=SliceBackward0]
+ 140509587675264 -> 140509587673968
+ 140509587675264 [label=SliceBackward0]
+ 140509587675360 -> 140509587675264
+ 140509587675360 [label=SliceBackward0]
+ 140509587675456 -> 140509587675360
+ 140509587675456 [label=SliceBackward0]
+ 140509587675552 -> 140509587675456
+ 140509587675552 [label=SliceBackward0]
+ 140509587675648 -> 140509587675552
+ 140509587675648 [label=NativeLayerNormBackward0]
+ 140509587675744 -> 140509587675648
+ 140509587675744 [label=AddBackward0]
+ 140509587675936 -> 140509587675744
+ 140509587675936 [label=NativeDropoutBackward0]
+ 140509587676080 -> 140509587675936
+ 140509587676080 [label=ViewBackward0]
+ 140509587676176 -> 140509587676080
+ 140509587676176 [label=AddmmBackward0]
+ 140509587676272 -> 140509587676176
+ 140509587676272 [label=ToCopyBackward0]
+ 140509587676464 -> 140509587676272
+ 140509590916608 [label="encoder.layer.1.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590916608 -> 140509587676464
+ 140509587676464 [label=AccumulateGrad]
+ 140509587676224 -> 140509587676176
+ 140509587676224 [label=ViewBackward0]
+ 140509587676512 -> 140509587676224
+ 140509587676512 [label=ViewBackward0]
+ 140509587676608 -> 140509587676512
+ 140509587676608 [label=CloneBackward0]
+ 140509587676704 -> 140509587676608
+ 140509587676704 [label=PermuteBackward0]
+ 140509587676800 -> 140509587676704
+ 140509587676800 [label=UnsafeViewBackward0]
+ 140509587676896 -> 140509587676800
+ 140509587676896 [label=BmmBackward0]
+ 140509587676992 -> 140509587676896
+ 140509587676992 [label=ReshapeAliasBackward0]
+ 140509587677136 -> 140509587676992
+ 140509587677136 [label=ExpandBackward0]
+ 140509587677040 -> 140509587677136
+ 140509587677040 [label=ToCopyBackward0]
+ 140517615505616 -> 140509587677040
+ 140517615505616 [label=NativeDropoutBackward0]
+ 140517615505712 -> 140517615505616
+ 140517615505712 [label=SoftmaxBackward0]
+ 140517615505808 -> 140517615505712
+ 140517615505808 [label=AddBackward0]
+ 140517615505904 -> 140517615505808
+ 140517615505904 [label=DivBackward0]
+ 140517615506000 -> 140517615505904
+ 140517615506000 [label=UnsafeViewBackward0]
+ 140517615506096 -> 140517615506000
+ 140517615506096 [label=BmmBackward0]
+ 140517615506192 -> 140517615506096
+ 140517615506192 [label=ReshapeAliasBackward0]
+ 140517615506336 -> 140517615506192
+ 140517615506336 [label=ExpandBackward0]
+ 140517615506432 -> 140517615506336
+ 140517615506432 [label=PermuteBackward0]
+ 140517615506528 -> 140517615506432
+ 140517615506528 [label=ViewBackward0]
+ 140517615506624 -> 140517615506528
+ 140517615506624 [label=ViewBackward0]
+ 140517615506720 -> 140517615506624
+ 140517615506720 [label=AddmmBackward0]
+ 140517615506816 -> 140517615506720
+ 140517615506816 [label=ToCopyBackward0]
+ 140517615507008 -> 140517615506816
+ 140509590933808 [label="encoder.layer.1.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590933808 -> 140517615507008
+ 140517615507008 [label=AccumulateGrad]
+ 140517615506768 -> 140517615506720
+ 140517615506768 [label=ViewBackward0]
+ 140517615507056 -> 140517615506768
+ 140517615507056 [label=ToCopyBackward0]
+ 140509587675888 -> 140517615507056
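+ // CatBackward0: likewise, layer 1's input concatenates layer 0's query branch
+ // (experts.output_query) with layer 0's text branch.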
+ 140509587675888 [label=CatBackward0]
+ 140517615507200 -> 140509587675888
+ 140517615507200 [label=NativeLayerNormBackward0]
+ 140517615507344 -> 140517615507200
+ 140517615507344 [label=AddBackward0]
+ 140517615507536 -> 140517615507344
+ 140517615507536 [label=NativeDropoutBackward0]
+ 140517615507680 -> 140517615507536
+ 140517615507680 [label=ViewBackward0]
+ 140517615507776 -> 140517615507680
+ 140517615507776 [label=AddmmBackward0]
+ 140517615507872 -> 140517615507776
+ 140517615507872 [label=ToCopyBackward0]
+ 140517615508064 -> 140517615507872
+ 140509590934288 [label="encoder.layer.0.experts.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590934288 -> 140517615508064
+ 140517615508064 [label=AccumulateGrad]
+ 140517615507824 -> 140517615507776
+ 140517615507824 [label=ViewBackward0]
+ 140517615508112 -> 140517615507824
+ 140517615508112 [label=GeluBackward0]
+ 140517615508208 -> 140517615508112
+ 140517615508208 [label=ViewBackward0]
+ 140517615508304 -> 140517615508208
+ 140517615508304 [label=AddmmBackward0]
+ 140517615508400 -> 140517615508304
+ 140517615508400 [label=ToCopyBackward0]
+ 140517615508592 -> 140517615508400
+ 140509590934528 [label="encoder.layer.0.experts.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590934528 -> 140517615508592
+ 140517615508592 [label=AccumulateGrad]
+ 140517615508352 -> 140517615508304
+ 140517615508352 [label=ViewBackward0]
+ 140517615508640 -> 140517615508352
+ 140517615508640 [label=ToCopyBackward0]
+ 140517615507488 -> 140517615508640
+ 140517615507488 [label=SliceBackward0]
+ 140517615508784 -> 140517615507488
+ 140517615508784 [label=SliceBackward0]
+ 140517615508880 -> 140517615508784
+ 140517615508880 [label=NativeLayerNormBackward0]
+ 140517615508976 -> 140517615508880
+ 140517615508976 [label=AddBackward0]
+ 140517615509168 -> 140517615508976
+ 140517615509168 [label=NativeDropoutBackward0]
+ 140517615509312 -> 140517615509168
+ 140517615509312 [label=ViewBackward0]
+ 140517615509408 -> 140517615509312
+ 140517615509408 [label=AddmmBackward0]
+ 140517615509456 -> 140517615509408
+ 140517615509456 [label=ToCopyBackward0]
+ 140517615522048 -> 140517615509456
+ 140509590936448 [label="encoder.layer.0.crossattention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590936448 -> 140517615522048
+ 140517615522048 [label=AccumulateGrad]
+ 140517615509216 -> 140517615509408
+ 140517615509216 [label=ViewBackward0]
+ 140517615522096 -> 140517615509216
+ 140517615522096 [label=ViewBackward0]
+ 140517615522192 -> 140517615522096
+ 140517615522192 [label=CloneBackward0]
+ 140517615522288 -> 140517615522192
+ 140517615522288 [label=PermuteBackward0]
+ 140517615522384 -> 140517615522288
+ 140517615522384 [label=UnsafeViewBackward0]
+ 140517615522480 -> 140517615522384
+ 140517615522480 [label=BmmBackward0]
+ 140517615522576 -> 140517615522480
+ 140517615522576 [label=ReshapeAliasBackward0]
+ 140517615522720 -> 140517615522576
+ 140517615522720 [label=ExpandBackward0]
+ 140517615522816 -> 140517615522720
+ 140517615522816 [label=ToCopyBackward0]
+ 140517615522912 -> 140517615522816
+ 140517615522912 [label=NativeDropoutBackward0]
+ 140517615523008 -> 140517615522912
+ 140517615523008 [label=SoftmaxBackward0]
+ 140517615523104 -> 140517615523008
+ 140517615523104 [label=AddBackward0]
+ 140517615523200 -> 140517615523104
+ 140517615523200 [label=DivBackward0]
+ 140517615523296 -> 140517615523200
+ 140517615523296 [label=UnsafeViewBackward0]
+ 140517615523392 -> 140517615523296
+ 140517615523392 [label=BmmBackward0]
+ 140517615523488 -> 140517615523392
+ 140517615523488 [label=ReshapeAliasBackward0]
+ 140517615523632 -> 140517615523488
+ 140517615523632 [label=ExpandBackward0]
+ 140517615523728 -> 140517615523632
+ 140517615523728 [label=PermuteBackward0]
+ 140517615523824 -> 140517615523728
+ 140517615523824 [label=ViewBackward0]
+ 140517615523920 -> 140517615523824
+ 140517615523920 [label=ViewBackward0]
+ 140517615524016 -> 140517615523920
+ 140517615524016 [label=AddmmBackward0]
+ 140517615524112 -> 140517615524016
+ 140517615524112 [label=ToCopyBackward0]
+ 140517615524304 -> 140517615524112
+ 140509590937168 [label="encoder.layer.0.crossattention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590937168 -> 140517615524304
+ 140517615524304 [label=AccumulateGrad]
+ 140517615524064 -> 140517615524016
+ 140517615524064 [label=ViewBackward0]
+ 140517615524352 -> 140517615524064
+ 140517615524352 [label=ToCopyBackward0]
+ 140517615509120 -> 140517615524352
+ 140517615509120 [label=SliceBackward0]
+ 140517615524496 -> 140517615509120
+ 140517615524496 [label=SliceBackward0]
+ 140517615524592 -> 140517615524496
+ 140517615524592 [label=SliceBackward0]
+ 140517615524688 -> 140517615524592
+ 140517615524688 [label=NativeLayerNormBackward0]
+ 140517615524784 -> 140517615524688
+ 140517615524784 [label=AddBackward0]
+ 140517615524976 -> 140517615524784
+ 140517615524976 [label=NativeDropoutBackward0]
+ 140517615525120 -> 140517615524976
+ 140517615525120 [label=ViewBackward0]
+ 140517615525216 -> 140517615525120
+ 140517615525216 [label=AddmmBackward0]
+ 140517615525312 -> 140517615525216
+ 140517615525312 [label=ToCopyBackward0]
+ 140517615525504 -> 140517615525312
+ 140509590945936 [label="encoder.layer.0.attention.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590945936 -> 140517615525504
+ 140517615525504 [label=AccumulateGrad]
+ 140517615525264 -> 140517615525216
+ 140517615525264 [label=ViewBackward0]
+ 140517615525552 -> 140517615525264
+ 140517615525552 [label=ViewBackward0]
+ 140517615525648 -> 140517615525552
+ 140517615525648 [label=CloneBackward0]
+ 140517615525744 -> 140517615525648
+ 140517615525744 [label=PermuteBackward0]
+ 140517615525840 -> 140517615525744
+ 140517615525840 [label=UnsafeViewBackward0]
+ 140517615525456 -> 140517615525840
+ 140517615525456 [label=BmmBackward0]
+ 140517615538384 -> 140517615525456
+ 140517615538384 [label=ReshapeAliasBackward0]
+ 140517615538528 -> 140517615538384
+ 140517615538528 [label=ExpandBackward0]
+ 140517615538624 -> 140517615538528
+ 140517615538624 [label=ToCopyBackward0]
+ 140517615538720 -> 140517615538624
+ 140517615538720 [label=NativeDropoutBackward0]
+ 140517615538816 -> 140517615538720
+ 140517615538816 [label=SoftmaxBackward0]
+ 140517615538912 -> 140517615538816
+ 140517615538912 [label=AddBackward0]
+ 140517615539008 -> 140517615538912
+ 140517615539008 [label=DivBackward0]
+ 140517615539104 -> 140517615539008
+ 140517615539104 [label=UnsafeViewBackward0]
+ 140517615539200 -> 140517615539104
+ 140517615539200 [label=BmmBackward0]
+ 140517615539296 -> 140517615539200
+ 140517615539296 [label=ReshapeAliasBackward0]
+ 140517615539440 -> 140517615539296
+ 140517615539440 [label=ExpandBackward0]
+ 140517615539536 -> 140517615539440
+ 140517615539536 [label=PermuteBackward0]
+ 140517615539632 -> 140517615539536
+ 140517615539632 [label=ViewBackward0]
+ 140517615539728 -> 140517615539632
+ 140517615539728 [label=ViewBackward0]
+ 140517615539824 -> 140517615539728
+ 140517615539824 [label=AddmmBackward0]
+ 140517615539920 -> 140517615539824
+ 140517615539920 [label=ToCopyBackward0]
+ 140517615540112 -> 140517615539920
+ 140509590600896 [label="encoder.layer.0.attention.self.query.bias
+ (768)" fillcolor=lightblue]
+ 140509590600896 -> 140517615540112
+ 140517615540112 [label=AccumulateGrad]
+ 140517615539872 -> 140517615539824
+ 140517615539872 [label=ViewBackward0]
+ 140517615540160 -> 140517615539872
+ 140517615540160 [label=ToCopyBackward0]
+ 140517615524928 -> 140517615540160
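+ // backward reaches the embedding layer: dropout(LayerNorm(cat(query_tokens, word_emb + pos_emb))).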
+ 140517615524928 [label=NativeDropoutBackward0]
+ 140517615540304 -> 140517615524928
+ 140517615540304 [label=NativeLayerNormBackward0]
+ 140517615540400 -> 140517615540304
+ 140517615540400 [label=CatBackward0]
+ 140517615540592 -> 140517615540400
+ 140517615540592 [label=ExpandBackward0]
+ 140517615540736 -> 140517615540592
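+ // unnamed (1, 32, 768) leaf: presumably the learned query/prompt tokens,
+ // expanded over the batch and concatenated in front of the text embeddings.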
+ 140509590947296 [label="
+ (1, 32, 768)" fillcolor=lightblue]
+ 140509590947296 -> 140517615540736
+ 140517615540736 [label=AccumulateGrad]
+ 140517615540544 -> 140517615540400
+ 140517615540544 [label=AddBackward0]
+ 140517615540784 -> 140517615540544
+ 140517615540784 [label=EmbeddingBackward0]
+ 140517615540928 -> 140517615540784
+ 140509590947856 [label="embeddings.word_embeddings.weight
+ (30523, 768)" fillcolor=lightblue]
+ 140509590947856 -> 140517615540928
+ 140517615540928 [label=AccumulateGrad]
+ 140517615540832 -> 140517615540544
+ 140517615540832 [label=EmbeddingBackward0]
+ 140517615540976 -> 140517615540832
+ 140509939919504 [label="embeddings.position_embeddings.weight
+ (512, 768)" fillcolor=lightblue]
+ 140509939919504 -> 140517615540976
+ 140517615540976 [label=AccumulateGrad]
+ 140517615540352 -> 140517615540304
+ 140509590958304 [label="embeddings.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590958304 -> 140517615540352
+ 140517615540352 [label=AccumulateGrad]
+ 140517615540016 -> 140517615540304
+ 140509590946656 [label="embeddings.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590946656 -> 140517615540016
+ 140517615540016 [label=AccumulateGrad]
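+ // leaf weights of layer 0 self-attention: query.weight, then the key and value projection branches.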
+ 140517615539344 -> 140517615539824
+ 140517615539344 [label=TBackward0]
+ 140517615540064 -> 140517615539344
+ 140517615540064 [label=ToCopyBackward0]
+ 140517615540496 -> 140517615540064
+ 140509986890912 [label="encoder.layer.0.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509986890912 -> 140517615540496
+ 140517615540496 [label=AccumulateGrad]
+ 140517615539248 -> 140517615539200
+ 140517615539248 [label=ReshapeAliasBackward0]
+ 140517615539584 -> 140517615539248
+ 140517615539584 [label=ExpandBackward0]
+ 140517615539776 -> 140517615539584
+ 140517615539776 [label=TransposeBackward0]
+ 140517615540256 -> 140517615539776
+ 140517615540256 [label=PermuteBackward0]
+ 140517615541024 -> 140517615540256
+ 140517615541024 [label=ViewBackward0]
+ 140517615540208 -> 140517615541024
+ 140517615540208 [label=ViewBackward0]
+ 140517615540640 -> 140517615540208
+ 140517615540640 [label=AddmmBackward0]
+ 140517615541120 -> 140517615540640
+ 140517615541120 [label=ToCopyBackward0]
+ 140517615541312 -> 140517615541120
+ 140509590946096 [label="encoder.layer.0.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590946096 -> 140517615541312
+ 140517615541312 [label=AccumulateGrad]
+ 140517615540880 -> 140517615540640
+ 140517615540880 [label=ViewBackward0]
+ 140517615541360 -> 140517615540880
+ 140517615541360 [label=ToCopyBackward0]
+ 140517615524928 -> 140517615541360
+ 140517615539392 -> 140517615540640
+ 140517615539392 [label=TBackward0]
+ 140517615541216 -> 140517615539392
+ 140517615541216 [label=ToCopyBackward0]
+ 140517615541504 -> 140517615541216
+ 140509590600816 [label="encoder.layer.0.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590600816 -> 140517615541504
+ 140517615541504 [label=AccumulateGrad]
+ 140517615538336 -> 140517615525456
+ 140517615538336 [label=ReshapeAliasBackward0]
+ 140517615538672 -> 140517615538336
+ 140517615538672 [label=ExpandBackward0]
+ 140517615538864 -> 140517615538672
+ 140517615538864 [label=PermuteBackward0]
+ 140517615539056 -> 140517615538864
+ 140517615539056 [label=ViewBackward0]
+ 140517615538432 -> 140517615539056
+ 140517615538432 [label=ViewBackward0]
+ 140517615539680 -> 140517615538432
+ 140517615539680 [label=AddmmBackward0]
+ 140517615540448 -> 140517615539680
+ 140517615540448 [label=ToCopyBackward0]
+ 140517615541456 -> 140517615540448
+ 140509590945856 [label="encoder.layer.0.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590945856 -> 140517615541456
+ 140517615541456 [label=AccumulateGrad]
+ 140517615539968 -> 140517615539680
+ 140517615539968 [label=ViewBackward0]
+ 140517615541264 -> 140517615539968
+ 140517615541264 [label=ToCopyBackward0]
+ 140517615524928 -> 140517615541264
+ 140517615538480 -> 140517615539680
+ 140517615538480 [label=TBackward0]
+ 140517615541072 -> 140517615538480
+ 140517615541072 [label=ToCopyBackward0]
+ 140517615541408 -> 140517615541072
+ 140509590946176 [label="encoder.layer.0.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590946176 -> 140517615541408
+ 140517615541408 [label=AccumulateGrad]
+ 140517615525024 -> 140517615525216
+ 140517615525024 [label=TBackward0]
+ 140517615525696 -> 140517615525024
+ 140517615525696 [label=ToCopyBackward0]
+ 140517615525792 -> 140517615525696
+ 140509987117712 [label="encoder.layer.0.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509987117712 -> 140517615525792
+ 140517615525792 [label=AccumulateGrad]
+ 140517615524928 -> 140517615524784
+ 140517615524736 -> 140517615524688
+ 140509590937328 [label="encoder.layer.0.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590937328 -> 140517615524736
+ 140517615524736 [label=AccumulateGrad]
+ 140517615524208 -> 140517615524688
+ 140509590937408 [label="encoder.layer.0.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590937408 -> 140517615524208
+ 140517615524208 [label=AccumulateGrad]
+ 140517615523536 -> 140517615524016
+ 140517615523536 [label=TBackward0]
+ 140517615524256 -> 140517615523536
+ 140517615524256 [label=ToCopyBackward0]
+ 140517615524640 -> 140517615524256
+ 140509590937088 [label="encoder.layer.0.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590937088 -> 140517615524640
+ 140517615524640 [label=AccumulateGrad]
+ 140517615523440 -> 140517615523392
+ 140517615523440 [label=ReshapeAliasBackward0]
+ 140517615523776 -> 140517615523440
+ 140517615523776 [label=ExpandBackward0]
+ 140517615523968 -> 140517615523776
+ 140517615523968 [label=TransposeBackward0]
+ 140517615524448 -> 140517615523968
+ 140517615524448 [label=PermuteBackward0]
+ 140517615524880 -> 140517615524448
+ 140517615524880 [label=ViewBackward0]
+ 140517615524400 -> 140517615524880
+ 140517615524400 [label=ViewBackward0]
+ 140517615525168 -> 140517615524400
+ 140517615525168 [label=AddmmBackward0]
+ 140517615525408 -> 140517615525168
+ 140517615525408 [label=ToCopyBackward0]
+ 140517615538288 -> 140517615525408
+ 140509590936928 [label="encoder.layer.0.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590936928 -> 140517615538288
+ 140517615538288 [label=AccumulateGrad]
+ 140517615525360 -> 140517615525168
+ 140517615525360 [label=ViewBackward0]
+ 140517615538768 -> 140517615525360
+ 140517615538768 [label=ToCopyBackward0]
+ 140517615539152 -> 140517615538768
+ 140517615539152 [label=NativeLayerNormBackward0]
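+ // the two unnamed (1408) leaves below are a LayerNorm weight/bias over the cross-attention
+ // key/value input; given the (768, 1408) key/value weights, this is likely the LayerNorm
+ // on a frozen 1408-dim image-encoder output (leaves without registered parameter names).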
+ 140517615540688 -> 140517615539152
+ 140509590598736 [label="
+ (1408)" fillcolor=lightblue]
+ 140509590598736 -> 140517615540688
+ 140517615540688 [label=AccumulateGrad]
+ 140517615539488 -> 140517615539152
+ 140509590598976 [label="
+ (1408)" fillcolor=lightblue]
+ 140509590598976 -> 140517615539488
+ 140517615539488 [label=AccumulateGrad]
+ 140517615523584 -> 140517615525168
+ 140517615523584 [label=TBackward0]
+ 140517615538240 -> 140517615523584
+ 140517615538240 [label=ToCopyBackward0]
+ 140517615541168 -> 140517615538240
+ 140509590936848 [label="encoder.layer.0.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509590936848 -> 140517615541168
+ 140517615541168 [label=AccumulateGrad]
+ 140517615522528 -> 140517615522480
+ 140517615522528 [label=ReshapeAliasBackward0]
+ 140517615522864 -> 140517615522528
+ 140517615522864 [label=ExpandBackward0]
+ 140517615523056 -> 140517615522864
+ 140517615523056 [label=PermuteBackward0]
+ 140517615523248 -> 140517615523056
+ 140517615523248 [label=ViewBackward0]
+ 140517615522624 -> 140517615523248
+ 140517615522624 [label=ViewBackward0]
+ 140517615523872 -> 140517615522624
+ 140517615523872 [label=AddmmBackward0]
+ 140517615524544 -> 140517615523872
+ 140517615524544 [label=ToCopyBackward0]
+ 140517615525600 -> 140517615524544
+ 140509590936688 [label="encoder.layer.0.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590936688 -> 140517615525600
+ 140517615525600 [label=AccumulateGrad]
+ 140517615524160 -> 140517615523872
+ 140517615524160 [label=ViewBackward0]
+ 140517615525072 -> 140517615524160
+ 140517615525072 [label=ToCopyBackward0]
+ 140517615539152 -> 140517615525072
+ 140517615522672 -> 140517615523872
+ 140517615522672 [label=TBackward0]
+ 140517615538576 -> 140517615522672
+ 140517615538576 [label=ToCopyBackward0]
+ 140517615538960 -> 140517615538576
+ 140509590936608 [label="encoder.layer.0.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509590936608 -> 140517615538960
+ 140517615538960 [label=AccumulateGrad]
+ 140517615521856 -> 140517615509408
+ 140517615521856 [label=TBackward0]
+ 140517615522240 -> 140517615521856
+ 140517615522240 [label=ToCopyBackward0]
+ 140517615522432 -> 140517615522240
+ 140509590936368 [label="encoder.layer.0.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590936368 -> 140517615522432
+ 140517615522432 [label=AccumulateGrad]
+ 140517615509120 -> 140517615508976
+ 140517615508928 -> 140517615508880
+ 140509590936128 [label="encoder.layer.0.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590936128 -> 140517615508928
+ 140517615508928 [label=AccumulateGrad]
+ 140517615508496 -> 140517615508880
+ 140509590936208 [label="encoder.layer.0.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590936208 -> 140517615508496
+ 140517615508496 [label=AccumulateGrad]
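+ // layer 0 query-branch FFN: experts.intermediate_query / experts.output_query weights and LayerNorm.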
+ 140517615508016 -> 140517615508304
+ 140517615508016 [label=TBackward0]
+ 140517615508544 -> 140517615508016
+ 140517615508544 [label=ToCopyBackward0]
+ 140517615509024 -> 140517615508544
+ 140509590934448 [label="encoder.layer.0.experts.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590934448 -> 140517615509024
+ 140517615509024 [label=AccumulateGrad]
+ 140517615507584 -> 140517615507776
+ 140517615507584 [label=TBackward0]
+ 140517615508256 -> 140517615507584
+ 140517615508256 [label=ToCopyBackward0]
+ 140517615508736 -> 140517615508256
+ 140509590934208 [label="encoder.layer.0.experts.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590934208 -> 140517615508736
+ 140517615508736 [label=AccumulateGrad]
+ 140517615507488 -> 140517615507344
+ 140517615507296 -> 140517615507200
+ 140509590933968 [label="encoder.layer.0.experts.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590933968 -> 140517615507296
+ 140517615507296 [label=AccumulateGrad]
+ 140517615507248 -> 140517615507200
+ 140509590934048 [label="encoder.layer.0.experts.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590934048 -> 140517615507248
+ 140517615507248 [label=AccumulateGrad]
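+ // second Cat input for layer 1: layer 0's text branch
+ // (intermediate.dense -> output.dense -> output.LayerNorm).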
+ 140517615506960 -> 140509587675888
+ 140517615506960 [label=NativeLayerNormBackward0]
+ 140517615507632 -> 140517615506960
+ 140517615507632 [label=AddBackward0]
+ 140517615508448 -> 140517615507632
+ 140517615508448 [label=NativeDropoutBackward0]
+ 140517615508160 -> 140517615508448
+ 140517615508160 [label=ViewBackward0]
+ 140517615508688 -> 140517615508160
+ 140517615508688 [label=AddmmBackward0]
+ 140517615509360 -> 140517615508688
+ 140517615509360 [label=ToCopyBackward0]
+ 140517615522000 -> 140517615509360
+ 140509590935728 [label="encoder.layer.0.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590935728 -> 140517615522000
+ 140517615522000 [label=AccumulateGrad]
+ 140517615509264 -> 140517615508688
+ 140517615509264 [label=ViewBackward0]
+ 140517615522144 -> 140517615509264
+ 140517615522144 [label=GeluBackward0]
+ 140517615523152 -> 140517615522144
+ 140517615523152 [label=ViewBackward0]
+ 140517615523680 -> 140517615523152
+ 140517615523680 [label=AddmmBackward0]
+ 140517615524832 -> 140517615523680
+ 140517615524832 [label=ToCopyBackward0]
+ 140517615541552 -> 140517615524832
+ 140509590935968 [label="encoder.layer.0.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590935968 -> 140517615541552
+ 140517615541552 [label=AccumulateGrad]
+ 140517615522768 -> 140517615523680
+ 140517615522768 [label=ViewBackward0]
+ 140517615541792 -> 140517615522768
+ 140517615541792 [label=ToCopyBackward0]
+ 140517615507968 -> 140517615541792
+ 140517615507968 [label=SliceBackward0]
+ 140517615541936 -> 140517615507968
+ 140517615541936 [label=SliceBackward0]
+ 140517615542032 -> 140517615541936
+ 140517615542032 [label=SliceBackward0]
+ 140517615524688 -> 140517615542032
+ 140517615541696 -> 140517615523680
+ 140517615541696 [label=TBackward0]
+ 140517615541600 -> 140517615541696
+ 140517615541600 [label=ToCopyBackward0]
+ 140517615542128 -> 140517615541600
+ 140509590935888 [label="encoder.layer.0.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590935888 -> 140517615542128
+ 140517615542128 [label=AccumulateGrad]
+ 140517615521904 -> 140517615508688
+ 140517615521904 [label=TBackward0]
+ 140517615523344 -> 140517615521904
+ 140517615523344 [label=ToCopyBackward0]
+ 140517615522960 -> 140517615523344
+ 140509590935648 [label="encoder.layer.0.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590935648 -> 140517615522960
+ 140517615522960 [label=AccumulateGrad]
+ 140517615507968 -> 140517615507632
+ 140517615507440 -> 140517615506960
+ 140509590935408 [label="encoder.layer.0.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590935408 -> 140517615507440
+ 140517615507440 [label=AccumulateGrad]
+ 140517615507392 -> 140517615506960
+ 140509590935488 [label="encoder.layer.0.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590935488 -> 140517615507392
+ 140517615507392 [label=AccumulateGrad]
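+ // layer 1 self-attention weight leaves (query/key/value) and attention.output.dense.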
+ 140517615506240 -> 140517615506720
+ 140517615506240 [label=TBackward0]
+ 140517615506912 -> 140517615506240
+ 140517615506912 [label=ToCopyBackward0]
+ 140517615507920 -> 140517615506912
+ 140509590933728 [label="encoder.layer.1.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590933728 -> 140517615507920
+ 140517615507920 [label=AccumulateGrad]
+ 140517615506144 -> 140517615506096
+ 140517615506144 [label=ReshapeAliasBackward0]
+ 140517615506480 -> 140517615506144
+ 140517615506480 [label=ExpandBackward0]
+ 140517615506672 -> 140517615506480
+ 140517615506672 [label=TransposeBackward0]
+ 140517615507152 -> 140517615506672
+ 140517615507152 [label=PermuteBackward0]
+ 140517615509072 -> 140517615507152
+ 140517615509072 [label=ViewBackward0]
+ 140517615507104 -> 140517615509072
+ 140517615507104 [label=ViewBackward0]
+ 140517615522336 -> 140517615507104
+ 140517615522336 [label=AddmmBackward0]
+ 140517615506288 -> 140517615522336
+ 140517615506288 [label=ToCopyBackward0]
+ 140517615541840 -> 140517615506288
+ 140509590917008 [label="encoder.layer.1.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590917008 -> 140517615541840
+ 140517615541840 [label=AccumulateGrad]
+ 140517615541744 -> 140517615522336
+ 140517615541744 [label=ViewBackward0]
+ 140517615542176 -> 140517615541744
+ 140517615542176 [label=ToCopyBackward0]
+ 140509587675888 -> 140517615542176
+ 140517615541888 -> 140517615522336
+ 140517615541888 [label=TBackward0]
+ 140517615542080 -> 140517615541888
+ 140517615542080 [label=ToCopyBackward0]
+ 140517615542224 -> 140517615542080
+ 140509590933568 [label="encoder.layer.1.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590933568 -> 140517615542224
+ 140517615542224 [label=AccumulateGrad]
+ 140509587676944 -> 140509587676896
+ 140509587676944 [label=ReshapeAliasBackward0]
+ 140509587677088 -> 140509587676944
+ 140509587677088 [label=ExpandBackward0]
+ 140517615505760 -> 140509587677088
+ 140517615505760 [label=PermuteBackward0]
+ 140517615505952 -> 140517615505760
+ 140517615505952 [label=ViewBackward0]
+ 140517615505472 -> 140517615505952
+ 140517615505472 [label=ViewBackward0]
+ 140517615506576 -> 140517615505472
+ 140517615506576 [label=AddmmBackward0]
+ 140517615507728 -> 140517615506576
+ 140517615507728 [label=ToCopyBackward0]
+ 140517615541648 -> 140517615507728
+ 140509590916848 [label="encoder.layer.1.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590916848 -> 140517615541648
+ 140517615541648 [label=AccumulateGrad]
+ 140517615506864 -> 140517615506576
+ 140517615506864 [label=ViewBackward0]
+ 140517615521952 -> 140517615506864
+ 140517615521952 [label=ToCopyBackward0]
+ 140509587675888 -> 140517615521952
+ 140517615505520 -> 140517615506576
+ 140517615505520 [label=TBackward0]
+ 140517615541984 -> 140517615505520
+ 140517615541984 [label=ToCopyBackward0]
+ 140517615591632 -> 140517615541984
+ 140509590916768 [label="encoder.layer.1.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590916768 -> 140517615591632
+ 140517615591632 [label=AccumulateGrad]
+ 140509587675984 -> 140509587676176
+ 140509587675984 [label=TBackward0]
+ 140509587676656 -> 140509587675984
+ 140509587676656 [label=ToCopyBackward0]
+ 140509587676848 -> 140509587676656
+ 140509590916528 [label="encoder.layer.1.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590916528 -> 140509587676848
+ 140509587676848 [label=AccumulateGrad]
+ 140509587675888 -> 140509587675744
+ 140509587675696 -> 140509587675648
+ 140509590916288 [label="encoder.layer.1.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590916288 -> 140509587675696
+ 140509587675696 [label=AccumulateGrad]
+ 140509587674976 -> 140509587675648
+ 140509590916368 [label="encoder.layer.1.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590916368 -> 140509587674976
+ 140509587674976 [label=AccumulateGrad]
+ 140509587674496 -> 140509587674784
+ 140509587674496 [label=TBackward0]
+ 140509587675024 -> 140509587674496
+ 140509587675024 [label=ToCopyBackward0]
+ 140509587675408 -> 140509587675024
+ 140509590914608 [label="encoder.layer.1.experts.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590914608 -> 140509587675408
+ 140509587675408 [label=AccumulateGrad]
+ 140509587674064 -> 140509587674256
+ 140509587674064 [label=TBackward0]
+ 140509587674736 -> 140509587674064
+ 140509587674736 [label=ToCopyBackward0]
+ 140509587675216 -> 140509587674736
+ 140509590914368 [label="encoder.layer.1.experts.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590914368 -> 140509587675216
+ 140509587675216 [label=AccumulateGrad]
+ 140509587673968 -> 140509587673824
+ 140509587673776 -> 140509587673680
+ 140509590914128 [label="encoder.layer.1.experts.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590914128 -> 140509587673776
+ 140509587673776 [label=AccumulateGrad]
+ 140509587673728 -> 140509587673680
+ 140509590914208 [label="encoder.layer.1.experts.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590914208 -> 140509587673728
+ 140509587673728 [label=AccumulateGrad]
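+ // second Cat input for layer 2: layer 1's text branch FFN (intermediate/output dense + LayerNorm).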
+ 140509587673440 -> 140509587658304
+ 140509587673440 [label=NativeLayerNormBackward0]
+ 140509587674112 -> 140509587673440
+ 140509587674112 [label=AddBackward0]
+ 140509587674928 -> 140509587674112
+ 140509587674928 [label=NativeDropoutBackward0]
+ 140509587674640 -> 140509587674928
+ 140509587674640 [label=ViewBackward0]
+ 140509587675168 -> 140509587674640
+ 140509587675168 [label=AddmmBackward0]
+ 140509587675840 -> 140509587675168
+ 140509587675840 [label=ToCopyBackward0]
+ 140509587676368 -> 140509587675840
+ 140509590915888 [label="encoder.layer.1.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590915888 -> 140509587676368
+ 140509587676368 [label=AccumulateGrad]
+ 140509587675792 -> 140509587675168
+ 140509587675792 [label=ViewBackward0]
+ 140509587676752 -> 140509587675792
+ 140509587676752 [label=GeluBackward0]
+ 140509587676560 -> 140509587676752
+ 140509587676560 [label=ViewBackward0]
+ 140509587676320 -> 140509587676560
+ 140509587676320 [label=AddmmBackward0]
+ 140517615506048 -> 140509587676320
+ 140517615506048 [label=ToCopyBackward0]
+ 140517615508832 -> 140517615506048
+ 140509590916128 [label="encoder.layer.1.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590916128 -> 140517615508832
+ 140517615508832 [label=AccumulateGrad]
+ 140517615505856 -> 140509587676320
+ 140517615505856 [label=ViewBackward0]
+ 140517615591728 -> 140517615505856
+ 140517615591728 [label=ToCopyBackward0]
+ 140509587674448 -> 140517615591728
+ 140509587674448 [label=SliceBackward0]
+ 140517615591776 -> 140509587674448
+ 140517615591776 [label=SliceBackward0]
+ 140517615591872 -> 140517615591776
+ 140517615591872 [label=SliceBackward0]
+ 140509587675648 -> 140517615591872
+ 140517615505568 -> 140509587676320
+ 140517615505568 [label=TBackward0]
+ 140517615591536 -> 140517615505568
+ 140517615591536 [label=ToCopyBackward0]
+ 140517615591968 -> 140517615591536
+ 140509590916048 [label="encoder.layer.1.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590916048 -> 140517615591968
+ 140517615591968 [label=AccumulateGrad]
+ 140509587675600 -> 140509587675168
+ 140509587675600 [label=TBackward0]
+ 140509587676128 -> 140509587675600
+ 140509587676128 [label=ToCopyBackward0]
+ 140517615506384 -> 140509587676128
+ 140509590915808 [label="encoder.layer.1.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590915808 -> 140517615506384
+ 140517615506384 [label=AccumulateGrad]
+ 140509587674448 -> 140509587674112
+ 140509587673920 -> 140509587673440
+ 140509590915568 [label="encoder.layer.1.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590915568 -> 140509587673920
+ 140509587673920 [label=AccumulateGrad]
+ 140509587673872 -> 140509587673440
+ 140509590915648 [label="encoder.layer.1.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590915648 -> 140509587673872
+ 140509587673872 [label=AccumulateGrad]
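+ // layer 2 self-attention weight leaves (query/key/value) and attention.output parameters.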
+ 140509587673152 -> 140509587660368
+ 140509587673152 [label=TBackward0]
+ 140509587673392 -> 140509587673152
+ 140509587673392 [label=ToCopyBackward0]
+ 140509587674400 -> 140509587673392
+ 140509590913888 [label="encoder.layer.2.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590913888 -> 140509587674400
+ 140509587674400 [label=AccumulateGrad]
+ 140509587660272 -> 140509587660224
+ 140509587660272 [label=ReshapeAliasBackward0]
+ 140509587660608 -> 140509587660272
+ 140509587660608 [label=ExpandBackward0]
+ 140509587660704 -> 140509587660608
+ 140509587660704 [label=TransposeBackward0]
+ 140509587673632 -> 140509587660704
+ 140509587673632 [label=PermuteBackward0]
+ 140509587675504 -> 140509587673632
+ 140509587675504 [label=ViewBackward0]
+ 140509587673584 -> 140509587675504
+ 140509587673584 [label=ViewBackward0]
+ 140509587676416 -> 140509587673584
+ 140509587676416 [label=AddmmBackward0]
+ 140517615505664 -> 140509587676416
+ 140517615505664 [label=ToCopyBackward0]
+ 140517615591680 -> 140517615505664
+ 140509590913728 [label="encoder.layer.2.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590913728 -> 140517615591680
+ 140517615591680 [label=AccumulateGrad]
+ 140509587673200 -> 140509587676416
+ 140509587673200 [label=ViewBackward0]
+ 140517615592016 -> 140509587673200
+ 140517615592016 [label=ToCopyBackward0]
+ 140509587658304 -> 140517615592016
+ 140517615591488 -> 140509587676416
+ 140517615591488 [label=TBackward0]
+ 140517615591584 -> 140517615591488
+ 140517615591584 [label=ToCopyBackward0]
+ 140517615592160 -> 140517615591584
+ 140509590913648 [label="encoder.layer.2.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590913648 -> 140517615592160
+ 140517615592160 [label=AccumulateGrad]
+ 140509587659360 -> 140509587659312
+ 140509587659360 [label=ReshapeAliasBackward0]
+ 140509587659696 -> 140509587659360
+ 140509587659696 [label=ExpandBackward0]
+ 140509587659888 -> 140509587659696
+ 140509587659888 [label=PermuteBackward0]
+ 140509587660080 -> 140509587659888
+ 140509587660080 [label=ViewBackward0]
+ 140509587659456 -> 140509587660080
+ 140509587659456 [label=ViewBackward0]
+ 140509587660416 -> 140509587659456
+ 140509587660416 [label=AddmmBackward0]
+ 140509587659504 -> 140509587660416
+ 140509587659504 [label=ToCopyBackward0]
+ 140509587676032 -> 140509587659504
+ 140509590913488 [label="encoder.layer.2.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590913488 -> 140509587676032
+ 140509587676032 [label=AccumulateGrad]
+ 140509587674208 -> 140509587660416
+ 140509587674208 [label=ViewBackward0]
+ 140517615591920 -> 140509587674208
+ 140517615591920 [label=ToCopyBackward0]
+ 140509587658304 -> 140517615591920
+ 140509587673344 -> 140509587660416
+ 140509587673344 [label=TBackward0]
+ 140517615591824 -> 140509587673344
+ 140517615591824 [label=ToCopyBackward0]
+ 140517615592064 -> 140517615591824
+ 140509590913408 [label="encoder.layer.2.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590913408 -> 140517615592064
+ 140517615592064 [label=AccumulateGrad]
+ 140509587658400 -> 140509587658592
+ 140509587658400 [label=TBackward0]
+ 140509587659072 -> 140509587658400
+ 140509587659072 [label=ToCopyBackward0]
+ 140509587659264 -> 140509587659072
+ 140509590913168 [label="encoder.layer.2.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590913168 -> 140509587659264
+ 140509587659264 [label=AccumulateGrad]
+ 140509587658304 -> 140509587658160
+ 140509587658112 -> 140509587658064
+ 140509590904640 [label="encoder.layer.2.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590904640 -> 140509587658112
+ 140509587658112 [label=AccumulateGrad]
+ 140509587657584 -> 140509587658064
+ 140509590904720 [label="encoder.layer.2.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590904720 -> 140509587657584
+ 140509587657584 [label=AccumulateGrad]
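+ // layer 2 cross-attention weights; the key/value projections ((768, 1408)) read the same
+ // unnamed 1408-dim LayerNorm node (140517615539152) as layer 0's cross-attention.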
+ 140509587656912 -> 140509587657392
+ 140509587656912 [label=TBackward0]
+ 140509587657632 -> 140509587656912
+ 140509587657632 [label=ToCopyBackward0]
+ 140509587658016 -> 140509587657632
+ 140509590904400 [label="encoder.layer.2.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590904400 -> 140509587658016
+ 140509587658016 [label=AccumulateGrad]
+ 140509587656816 -> 140509587644368
+ 140509587656816 [label=ReshapeAliasBackward0]
+ 140509587657152 -> 140509587656816
+ 140509587657152 [label=ExpandBackward0]
+ 140509587657344 -> 140509587657152
+ 140509587657344 [label=TransposeBackward0]
+ 140509587657824 -> 140509587657344
+ 140509587657824 [label=PermuteBackward0]
+ 140509587658256 -> 140509587657824
+ 140509587658256 [label=ViewBackward0]
+ 140509587657776 -> 140509587658256
+ 140509587657776 [label=ViewBackward0]
+ 140509587658544 -> 140509587657776
+ 140509587658544 [label=AddmmBackward0]
+ 140509587658784 -> 140509587658544
+ 140509587658784 [label=ToCopyBackward0]
+ 140509587658976 -> 140509587658784
+ 140509590904240 [label="encoder.layer.2.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590904240 -> 140509587658976
+ 140509587658976 [label=AccumulateGrad]
+ 140509587658736 -> 140509587658544
+ 140509587658736 [label=ViewBackward0]
+ 140509587659792 -> 140509587658736
+ 140509587659792 [label=ToCopyBackward0]
+ 140517615539152 -> 140509587659792
+ 140509587656960 -> 140509587658544
+ 140509587656960 [label=TBackward0]
+ 140509587659600 -> 140509587656960
+ 140509587659600 [label=ToCopyBackward0]
+ 140509587660512 -> 140509587659600
+ 140509590904160 [label="encoder.layer.2.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509590904160 -> 140509587660512
+ 140509587660512 [label=AccumulateGrad]
+ 140509587643552 -> 140509587643504
+ 140509587643552 [label=ReshapeAliasBackward0]
+ 140509587643888 -> 140509587643552
+ 140509587643888 [label=ExpandBackward0]
+ 140509587644080 -> 140509587643888
+ 140509587644080 [label=PermuteBackward0]
+ 140509587644272 -> 140509587644080
+ 140509587644272 [label=ViewBackward0]
+ 140509587675312 -> 140509587644272
+ 140509587675312 [label=ViewBackward0]
+ 140509587643696 -> 140509587675312
+ 140509587643696 [label=AddmmBackward0]
+ 140509587657536 -> 140509587643696
+ 140509587657536 [label=ToCopyBackward0]
+ 140509587659168 -> 140509587657536
+ 140509590904000 [label="encoder.layer.2.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590904000 -> 140509587659168
+ 140509587659168 [label=AccumulateGrad]
+ 140509587657248 -> 140509587643696
+ 140509587657248 [label=ViewBackward0]
+ 140509587660176 -> 140509587657248
+ 140509587660176 [label=ToCopyBackward0]
+ 140517615539152 -> 140509587660176
+ 140509587656768 -> 140509587643696
+ 140509587656768 [label=TBackward0]
+ 140509587658208 -> 140509587656768
+ 140509587658208 [label=ToCopyBackward0]
+ 140509587658448 -> 140509587658208
+ 140509590903920 [label="encoder.layer.2.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509590903920 -> 140509587658448
+ 140509587658448 [label=AccumulateGrad]
+ 140509587642592 -> 140509587642784
+ 140509587642592 [label=TBackward0]
+ 140509587643264 -> 140509587642592
+ 140509587643264 [label=ToCopyBackward0]
+ 140509587643456 -> 140509587643264
+ 140509590903680 [label="encoder.layer.2.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590903680 -> 140509587643456
+ 140509587643456 [label=AccumulateGrad]
+ 140509587642496 -> 140509587642352
+ 140509587642304 -> 140509587642256
+ 140509590903440 [label="encoder.layer.2.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590903440 -> 140509587642304
+ 140509587642304 [label=AccumulateGrad]
+ 140509587641872 -> 140509587642256
+ 140509590903520 [label="encoder.layer.2.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590903520 -> 140509587641872
+ 140509587641872 [label=AccumulateGrad]
+ 140509587641392 -> 140509587641680
+ 140509587641392 [label=TBackward0]
+ 140509587641920 -> 140509587641392
+ 140509587641920 [label=ToCopyBackward0]
+ 140509587642400 -> 140509587641920
+ 140509590901760 [label="encoder.layer.2.experts.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590901760 -> 140509587642400
+ 140509587642400 [label=AccumulateGrad]
+ 140509587640960 -> 140509587641152
+ 140509587640960 [label=TBackward0]
+ 140509587641632 -> 140509587640960
+ 140509587641632 [label=ToCopyBackward0]
+ 140509587642112 -> 140509587641632
+ 140509590901520 [label="encoder.layer.2.experts.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590901520 -> 140509587642112
+ 140509587642112 [label=AccumulateGrad]
+ 140509587640864 -> 140509587640720
+ 140509587640672 -> 140509587640576
+ 140509590901280 [label="encoder.layer.2.experts.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590901280 -> 140509587640672
+ 140509587640672 [label=AccumulateGrad]
+ 140509587640624 -> 140509587640576
+ 140509590901360 [label="encoder.layer.2.experts.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590901360 -> 140509587640624
+ 140509587640624 [label=AccumulateGrad]
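+ // layer 2 text branch (intermediate/output dense + LayerNorm), completing the inputs
+ // to layer 3's hidden states.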
+ 140509587640480 -> 140509587625200
+ 140509587640480 [label=NativeLayerNormBackward0]
+ 140509587641008 -> 140509587640480
+ 140509587641008 [label=AddBackward0]
+ 140509587641824 -> 140509587641008
+ 140509587641824 [label=NativeDropoutBackward0]
+ 140509587641536 -> 140509587641824
+ 140509587641536 [label=ViewBackward0]
+ 140509587642064 -> 140509587641536
+ 140509587642064 [label=AddmmBackward0]
+ 140509587642928 -> 140509587642064
+ 140509587642928 [label=ToCopyBackward0]
+ 140509587643024 -> 140509587642928
+ 140509590903040 [label="encoder.layer.2.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590903040 -> 140509587643024
+ 140509587643024 [label=AccumulateGrad]
+ 140509587642736 -> 140509587642064
+ 140509587642736 [label=ViewBackward0]
+ 140509587643168 -> 140509587642736
+ 140509587643168 [label=GeluBackward0]
+ 140509587644176 -> 140509587643168
+ 140509587644176 [label=ViewBackward0]
+ 140509587643648 -> 140509587644176
+ 140509587643648 [label=AddmmBackward0]
+ 140509587659984 -> 140509587643648
+ 140509587659984 [label=ToCopyBackward0]
+ 140517615592208 -> 140509587659984
+ 140509590903280 [label="encoder.layer.2.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590903280 -> 140517615592208
+ 140517615592208 [label=AccumulateGrad]
+ 140509587657920 -> 140509587643648
+ 140509587657920 [label=ViewBackward0]
+ 140517615592304 -> 140509587657920
+ 140517615592304 [label=ToCopyBackward0]
+ 140509587641344 -> 140517615592304
+ 140509587641344 [label=SliceBackward0]
+ 140517615592448 -> 140509587641344
+ 140517615592448 [label=SliceBackward0]
+ 140517615592544 -> 140517615592448
+ 140517615592544 [label=SliceBackward0]
+ 140509587658064 -> 140517615592544
+ 140509587657056 -> 140509587643648
+ 140509587657056 [label=TBackward0]
+ 140517615592112 -> 140509587657056
+ 140517615592112 [label=ToCopyBackward0]
+ 140517615592640 -> 140517615592112
+ 140509590903200 [label="encoder.layer.2.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590903200 -> 140517615592640
+ 140517615592640 [label=AccumulateGrad]
+ 140509587642640 -> 140509587642064
+ 140509587642640 [label=TBackward0]
+ 140509587643792 -> 140509587642640
+ 140509587643792 [label=ToCopyBackward0]
+ 140509587658832 -> 140509587643792
+ 140509590902960 [label="encoder.layer.2.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590902960 -> 140509587658832
+ 140509587658832 [label=AccumulateGrad]
+ 140509587641344 -> 140509587641008
+ 140509587640816 -> 140509587640480
+ 140509590902720 [label="encoder.layer.2.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590902720 -> 140509587640816
+ 140509587640816 [label=AccumulateGrad]
+ 140509587640768 -> 140509587640480
+ 140509590902800 [label="encoder.layer.2.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590902800 -> 140509587640768
+ 140509587640768 [label=AccumulateGrad]
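+ // layer 3 self-attention weight leaves; no crossattention parameters appear for layers 1 and 3,
+ // consistent with cross-attention being inserted only every other layer.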
+ 140509587627264 -> 140509587627744
+ 140509587627264 [label=TBackward0]
+ 140509587640384 -> 140509587627264
+ 140509587640384 [label=ToCopyBackward0]
+ 140509587641296 -> 140509587640384
+ 140509590901040 [label="encoder.layer.3.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590901040 -> 140509587641296
+ 140509587641296 [label=AccumulateGrad]
+ 140509587627168 -> 140509587627120
+ 140509587627168 [label=ReshapeAliasBackward0]
+ 140509587627504 -> 140509587627168
+ 140509587627504 [label=ExpandBackward0]
+ 140509587627696 -> 140509587627504
+ 140509587627696 [label=TransposeBackward0]
+ 140509587627888 -> 140509587627696
+ 140509587627888 [label=PermuteBackward0]
+ 140509587642448 -> 140509587627888
+ 140509587642448 [label=ViewBackward0]
+ 140509587640432 -> 140509587642448
+ 140509587640432 [label=ViewBackward0]
+ 140509587643360 -> 140509587640432
+ 140509587643360 [label=AddmmBackward0]
+ 140509587643984 -> 140509587643360
+ 140509587643984 [label=ToCopyBackward0]
+ 140517615592256 -> 140509587643984
+ 140509590900880 [label="encoder.layer.3.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590900880 -> 140517615592256
+ 140517615592256 [label=AccumulateGrad]
+ 140509587640528 -> 140509587643360
+ 140509587640528 [label=ViewBackward0]
+ 140517615592688 -> 140509587640528
+ 140517615592688 [label=ToCopyBackward0]
+ 140509587625200 -> 140517615592688
+ 140517615592352 -> 140509587643360
+ 140517615592352 [label=TBackward0]
+ 140517615592400 -> 140517615592352
+ 140517615592400 [label=ToCopyBackward0]
+ 140517615592832 -> 140517615592400
+ 140509590900800 [label="encoder.layer.3.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590900800 -> 140517615592832
+ 140517615592832 [label=AccumulateGrad]
+ 140509587626256 -> 140509587626208
+ 140509587626256 [label=ReshapeAliasBackward0]
+ 140509587626592 -> 140509587626256
+ 140509587626592 [label=ExpandBackward0]
+ 140509587626784 -> 140509587626592
+ 140509587626784 [label=PermuteBackward0]
+ 140509587626976 -> 140509587626784
+ 140509587626976 [label=ViewBackward0]
+ 140509587626352 -> 140509587626976
+ 140509587626352 [label=ViewBackward0]
+ 140509587627600 -> 140509587626352
+ 140509587627600 [label=AddmmBackward0]
+ 140509587627312 -> 140509587627600
+ 140509587627312 [label=ToCopyBackward0]
+ 140509587642976 -> 140509587627312
+ 140509590896448 [label="encoder.layer.3.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590896448 -> 140509587642976
+ 140509587642976 [label=AccumulateGrad]
+ 140509587626400 -> 140509587627600
+ 140509587626400 [label=ViewBackward0]
+ 140517615592592 -> 140509587626400
+ 140517615592592 [label=ToCopyBackward0]
+ 140509587625200 -> 140517615592592
+ 140509587641104 -> 140509587627600
+ 140509587641104 [label=TBackward0]
+ 140517615592496 -> 140509587641104
+ 140517615592496 [label=ToCopyBackward0]
+ 140517615592736 -> 140517615592496
+ 140509590896368 [label="encoder.layer.3.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590896368 -> 140517615592736
+ 140517615592736 [label=AccumulateGrad]
+ 140509587625296 -> 140509587625488
+ 140509587625296 [label=TBackward0]
+ 140509587625968 -> 140509587625296
+ 140509587625968 [label=ToCopyBackward0]
+ 140509587626160 -> 140509587625968
+ 140509590896128 [label="encoder.layer.3.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590896128 -> 140509587626160
+ 140509587626160 [label=AccumulateGrad]
+ 140509587625200 -> 140509587625056
+ 140509587625008 -> 140509587624960
+ 140509590895888 [label="encoder.layer.3.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590895888 -> 140509587625008
+ 140509587625008 [label=AccumulateGrad]
+ 140509587624288 -> 140509587624960
+ 140509590895968 [label="encoder.layer.3.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590895968 -> 140509587624288
+ 140509587624288 [label=AccumulateGrad]
+ 140509587624000 -> 140509587624096
+ 140509587624000 [label=TBackward0]
+ 140509587624336 -> 140509587624000
+ 140509587624336 [label=ToCopyBackward0]
+ 140509587624720 -> 140509587624336
+ 140509590894208 [label="encoder.layer.3.experts.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590894208 -> 140509587624720
+ 140509587624720 [label=AccumulateGrad]
+ 140509587611024 -> 140509587611216
+ 140509587611024 [label=TBackward0]
+ 140509587611456 -> 140509587611024
+ 140509587611456 [label=ToCopyBackward0]
+ 140509587624528 -> 140509587611456
+ 140509590893968 [label="encoder.layer.3.experts.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590893968 -> 140509587624528
+ 140509587624528 [label=AccumulateGrad]
+ 140509587610928 -> 140509587610784
+ 140509587610736 -> 140509587610640
+ 140509590893728 [label="encoder.layer.3.experts.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590893728 -> 140509587610736
+ 140509587610736 [label=AccumulateGrad]
+ 140509587610688 -> 140509587610640
+ 140509590893808 [label="encoder.layer.3.experts.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590893808 -> 140509587610688
+ 140509587610688 [label=AccumulateGrad]
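+ // layer 3 text branch FFN (output.dense, intermediate.dense, output.LayerNorm).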
+ 140509587610400 -> 140509587607664
+ 140509587610400 [label=NativeLayerNormBackward0]
+ 140509587611072 -> 140509587610400
+ 140509587611072 [label=AddBackward0]
+ 140509587611600 -> 140509587611072
+ 140509587611600 [label=NativeDropoutBackward0]
+ 140509587624048 -> 140509587611600
+ 140509587624048 [label=ViewBackward0]
+ 140509587624480 -> 140509587624048
+ 140509587624480 [label=AddmmBackward0]
+ 140509587625152 -> 140509587624480
+ 140509587625152 [label=ToCopyBackward0]
+ 140509587625680 -> 140509587625152
+ 140509590895488 [label="encoder.layer.3.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590895488 -> 140509587625680
+ 140509587625680 [label=AccumulateGrad]
+ 140509587625104 -> 140509587624480
+ 140509587625104 [label=ViewBackward0]
+ 140509587626064 -> 140509587625104
+ 140509587626064 [label=GeluBackward0]
+ 140509587625728 -> 140509587626064
+ 140509587625728 [label=ViewBackward0]
+ 140509587626688 -> 140509587625728
+ 140509587626688 [label=AddmmBackward0]
+ 140509587627072 -> 140509587626688
+ 140509587627072 [label=ToCopyBackward0]
+ 140509587642208 -> 140509587627072
+ 140509590895728 [label="encoder.layer.3.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590895728 -> 140509587642208
+ 140509587642208 [label=AccumulateGrad]
+ 140509587626880 -> 140509587626688
+ 140509587626880 [label=ViewBackward0]
+ 140517615593024 -> 140509587626880
+ 140517615593024 [label=ToCopyBackward0]
+ 140509587611360 -> 140517615593024
+ 140509587611360 [label=SliceBackward0]
+ 140517615593072 -> 140509587611360
+ 140517615593072 [label=SliceBackward0]
+ 140517615593168 -> 140517615593072
+ 140517615593168 [label=SliceBackward0]
+ 140509587624960 -> 140517615593168
+ 140509587625632 -> 140509587626688
+ 140509587625632 [label=TBackward0]
+ 140517615592784 -> 140509587625632
+ 140517615592784 [label=ToCopyBackward0]
+ 140517615593264 -> 140517615592784
+ 140509590895648 [label="encoder.layer.3.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590895648 -> 140517615593264
+ 140517615593264 [label=AccumulateGrad]
+ 140509587624912 -> 140509587624480
+ 140509587624912 [label=TBackward0]
+ 140509587625872 -> 140509587624912
+ 140509587625872 [label=ToCopyBackward0]
+ 140509587627408 -> 140509587625872
+ 140509590895408 [label="encoder.layer.3.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590895408 -> 140509587627408
+ 140509587627408 [label=AccumulateGrad]
+ 140509587611360 -> 140509587611072
+ 140509587610880 -> 140509587610400
+ 140509590895168 [label="encoder.layer.3.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590895168 -> 140509587610880
+ 140509587610880 [label=AccumulateGrad]
+ 140509587610832 -> 140509587610400
+ 140509590895248 [label="encoder.layer.3.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590895248 -> 140509587610832
+ 140509587610832 [label=AccumulateGrad]
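+ // --- encoder.layer.4 begins: the per-layer block structure repeats from here ---
+ // Self-attention Q/K/V are (768, 768) AddmmBackward0 projections followed by the
+ // attention output dense + LayerNorm, as in the preceding layers.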
+ 140509587609680 -> 140509587610160
+ 140509587609680 [label=TBackward0]
+ 140509587610352 -> 140509587609680
+ 140509587610352 [label=ToCopyBackward0]
+ 140509587611168 -> 140509587610352
+ 140509590893488 [label="encoder.layer.4.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590893488 -> 140509587611168
+ 140509587611168 [label=AccumulateGrad]
+ 140509587609584 -> 140509587609536
+ 140509587609584 [label=ReshapeAliasBackward0]
+ 140509587609920 -> 140509587609584
+ 140509587609920 [label=ExpandBackward0]
+ 140509587610112 -> 140509587609920
+ 140509587610112 [label=TransposeBackward0]
+ 140509587610592 -> 140509587610112
+ 140509587610592 [label=PermuteBackward0]
+ 140509587610544 -> 140509587610592
+ 140509587610544 [label=ViewBackward0]
+ 140509587624240 -> 140509587610544
+ 140509587624240 [label=ViewBackward0]
+ 140509587625440 -> 140509587624240
+ 140509587625440 [label=AddmmBackward0]
+ 140509587626496 -> 140509587625440
+ 140509587626496 [label=ToCopyBackward0]
+ 140517615592976 -> 140509587626496
+ 140509590893328 [label="encoder.layer.4.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590893328 -> 140517615592976
+ 140517615592976 [label=AccumulateGrad]
+ 140509587624816 -> 140509587625440
+ 140509587624816 [label=ViewBackward0]
+ 140517615593312 -> 140509587624816
+ 140517615593312 [label=ToCopyBackward0]
+ 140509587607664 -> 140517615593312
+ 140517615592880 -> 140509587625440
+ 140517615592880 [label=TBackward0]
+ 140517615592928 -> 140517615592880
+ 140517615592928 [label=ToCopyBackward0]
+ 140517615593456 -> 140517615592928
+ 140509590893248 [label="encoder.layer.4.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590893248 -> 140517615593456
+ 140517615593456 [label=AccumulateGrad]
+ 140509587608672 -> 140509587608624
+ 140509587608672 [label=ReshapeAliasBackward0]
+ 140509587609008 -> 140509587608672
+ 140509587609008 [label=ExpandBackward0]
+ 140509587609200 -> 140509587609008
+ 140509587609200 [label=PermuteBackward0]
+ 140509587609392 -> 140509587609200
+ 140509587609392 [label=ViewBackward0]
+ 140509587608768 -> 140509587609392
+ 140509587608768 [label=ViewBackward0]
+ 140509587610016 -> 140509587608768
+ 140509587610016 [label=AddmmBackward0]
+ 140509587609728 -> 140509587610016
+ 140509587609728 [label=ToCopyBackward0]
+ 140509587625344 -> 140509587609728
+ 140509590893088 [label="encoder.layer.4.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590893088 -> 140509587625344
+ 140509587625344 [label=AccumulateGrad]
+ 140509587610304 -> 140509587610016
+ 140509587610304 [label=ViewBackward0]
+ 140517615593216 -> 140509587610304
+ 140517615593216 [label=ToCopyBackward0]
+ 140509587607664 -> 140517615593216
+ 140509587608816 -> 140509587610016
+ 140509587608816 [label=TBackward0]
+ 140517615593120 -> 140509587608816
+ 140517615593120 [label=ToCopyBackward0]
+ 140517615593360 -> 140517615593120
+ 140509590893008 [label="encoder.layer.4.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590893008 -> 140517615593360
+ 140517615593360 [label=AccumulateGrad]
+ 140509587607712 -> 140509587607904
+ 140509587607712 [label=TBackward0]
+ 140509587608384 -> 140509587607712
+ 140509587608384 [label=ToCopyBackward0]
+ 140509587608576 -> 140509587608384
+ 140509590892768 [label="encoder.layer.4.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590892768 -> 140509587608576
+ 140509587608576 [label=AccumulateGrad]
+ 140509587607664 -> 140509587595120
+ 140509587595072 -> 140509587595024
+ 140509590892608 [label="encoder.layer.4.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590892608 -> 140509587595072
+ 140509587595072 [label=AccumulateGrad]
+ 140509587594544 -> 140509587595024
+ 140509590876048 [label="encoder.layer.4.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590876048 -> 140509587594544
+ 140509587594544 [label=AccumulateGrad]
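+ // Cross-attention K/V weights below are (768, 1408): keys/values are projected
+ // from a 1408-dim encoder sequence. Both branches read the same leaf
+ // 140517615539152 (presumably the frozen image-encoder states; the leaf is not
+ // named in this graph).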
+ 140509587593872 -> 140509587594352
+ 140509587593872 [label=TBackward0]
+ 140509587594592 -> 140509587593872
+ 140509587594592 [label=ToCopyBackward0]
+ 140509587594976 -> 140509587594592
+ 140509590875808 [label="encoder.layer.4.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590875808 -> 140509587594976
+ 140509587594976 [label=AccumulateGrad]
+ 140509587593776 -> 140509587593728
+ 140509587593776 [label=ReshapeAliasBackward0]
+ 140509587594112 -> 140509587593776
+ 140509587594112 [label=ExpandBackward0]
+ 140509587594304 -> 140509587594112
+ 140509587594304 [label=TransposeBackward0]
+ 140509587594784 -> 140509587594304
+ 140509587594784 [label=PermuteBackward0]
+ 140509587595168 -> 140509587594784
+ 140509587595168 [label=ViewBackward0]
+ 140509587594736 -> 140509587595168
+ 140509587594736 [label=ViewBackward0]
+ 140509587607856 -> 140509587594736
+ 140509587607856 [label=AddmmBackward0]
+ 140509587608096 -> 140509587607856
+ 140509587608096 [label=ToCopyBackward0]
+ 140509587608288 -> 140509587608096
+ 140509590875648 [label="encoder.layer.4.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590875648 -> 140509587608288
+ 140509587608288 [label=AccumulateGrad]
+ 140509587608048 -> 140509587607856
+ 140509587608048 [label=ViewBackward0]
+ 140509587609104 -> 140509587608048
+ 140509587609104 [label=ToCopyBackward0]
+ 140517615539152 -> 140509587609104
+ 140509587607616 -> 140509587607856
+ 140509587607616 [label=TBackward0]
+ 140509587608912 -> 140509587607616
+ 140509587608912 [label=ToCopyBackward0]
+ 140509587609824 -> 140509587608912
+ 140509590875568 [label="encoder.layer.4.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509590875568 -> 140509587609824
+ 140509587609824 [label=AccumulateGrad]
+ 140509587592864 -> 140509587592816
+ 140509587592864 [label=ReshapeAliasBackward0]
+ 140509587593200 -> 140509587592864
+ 140509587593200 [label=ExpandBackward0]
+ 140509587593392 -> 140509587593200
+ 140509587593392 [label=PermuteBackward0]
+ 140509587593584 -> 140509587593392
+ 140509587593584 [label=ViewBackward0]
+ 140509587592960 -> 140509587593584
+ 140509587592960 [label=ViewBackward0]
+ 140509587594208 -> 140509587592960
+ 140509587594208 [label=AddmmBackward0]
+ 140509587594880 -> 140509587594208
+ 140509587594880 [label=ToCopyBackward0]
+ 140509587624624 -> 140509587594880
+ 140509590875408 [label="encoder.layer.4.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590875408 -> 140509587624624
+ 140509587624624 [label=AccumulateGrad]
+ 140509587594496 -> 140509587594208
+ 140509587594496 [label=ViewBackward0]
+ 140509587609488 -> 140509587594496
+ 140509587609488 [label=ToCopyBackward0]
+ 140517615539152 -> 140509587609488
+ 140509587593008 -> 140509587594208
+ 140509587593008 [label=TBackward0]
+ 140509587607760 -> 140509587593008
+ 140509587607760 [label=ToCopyBackward0]
+ 140509587608480 -> 140509587607760
+ 140509590875328 [label="encoder.layer.4.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509590875328 -> 140509587608480
+ 140509587608480 [label=AccumulateGrad]
+ 140509587591904 -> 140509587592096
+ 140509587591904 [label=TBackward0]
+ 140509587592576 -> 140509587591904
+ 140509587592576 [label=ToCopyBackward0]
+ 140509587592768 -> 140509587592576
+ 140509590875088 [label="encoder.layer.4.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590875088 -> 140509587592768
+ 140509587592768 [label=AccumulateGrad]
+ 140509587591808 -> 140509587591664
+ 140509587591616 -> 140509587591568
+ 140509590874848 [label="encoder.layer.4.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590874848 -> 140509587591616
+ 140509587591616 [label=AccumulateGrad]
+ 140509587591376 -> 140509587591568
+ 140509590874928 [label="encoder.layer.4.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590874928 -> 140509587591376
+ 140509587591376 [label=AccumulateGrad]
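+ // Expert branch for the query stream: intermediate_query (3072, 768) and
+ // output_query (768, 3072) + LayerNorm mirror the 3072-wide bottleneck of the
+ // standard intermediate/output FFN recorded just below for this layer.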
+ 140509587574256 -> 140509587574544
+ 140509587574256 [label=TBackward0]
+ 140509587591328 -> 140509587574256
+ 140509587591328 [label=ToCopyBackward0]
+ 140509587591712 -> 140509587591328
+ 140509590873168 [label="encoder.layer.4.experts.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590873168 -> 140509587591712
+ 140509587591712 [label=AccumulateGrad]
+ 140509587573824 -> 140509587574016
+ 140509587573824 [label=TBackward0]
+ 140509587574496 -> 140509587573824
+ 140509587574496 [label=ToCopyBackward0]
+ 140509587574688 -> 140509587574496
+ 140509590872928 [label="encoder.layer.4.experts.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590872928 -> 140509587574688
+ 140509587574688 [label=AccumulateGrad]
+ 140509587573728 -> 140509587573584
+ 140509587573536 -> 140509587573440
+ 140509590872688 [label="encoder.layer.4.experts.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590872688 -> 140509587573536
+ 140509587573536 [label=AccumulateGrad]
+ 140509587573488 -> 140509587573440
+ 140509590872768 [label="encoder.layer.4.experts.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590872768 -> 140509587573488
+ 140509587573488 [label=AccumulateGrad]
+ 140509587573200 -> 140509587562112
+ 140509587573200 [label=NativeLayerNormBackward0]
+ 140509587573872 -> 140509587573200
+ 140509587573872 [label=AddBackward0]
+ 140509587574400 -> 140509587573872
+ 140509587574400 [label=NativeDropoutBackward0]
+ 140509587591424 -> 140509587574400
+ 140509587591424 [label=ViewBackward0]
+ 140509587591280 -> 140509587591424
+ 140509587591280 [label=AddmmBackward0]
+ 140509587592240 -> 140509587591280
+ 140509587592240 [label=ToCopyBackward0]
+ 140509587592336 -> 140509587592240
+ 140509590874448 [label="encoder.layer.4.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590874448 -> 140509587592336
+ 140509587592336 [label=AccumulateGrad]
+ 140509587592048 -> 140509587591280
+ 140509587592048 [label=ViewBackward0]
+ 140509587592480 -> 140509587592048
+ 140509587592480 [label=GeluBackward0]
+ 140509587593488 -> 140509587592480
+ 140509587593488 [label=ViewBackward0]
+ 140509587594016 -> 140509587593488
+ 140509587594016 [label=AddmmBackward0]
+ 140509587593920 -> 140509587594016
+ 140509587593920 [label=ToCopyBackward0]
+ 140517615593504 -> 140509587593920
+ 140509590874688 [label="encoder.layer.4.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590874688 -> 140517615593504
+ 140517615593504 [label=AccumulateGrad]
+ 140509587593104 -> 140509587594016
+ 140509587593104 [label=ViewBackward0]
+ 140517615593600 -> 140509587593104
+ 140517615593600 [label=ToCopyBackward0]
+ 140509587574208 -> 140517615593600
+ 140509587574208 [label=SliceBackward0]
+ 140517615593744 -> 140509587574208
+ 140517615593744 [label=SliceBackward0]
+ 140517615593840 -> 140517615593744
+ 140517615593840 [label=SliceBackward0]
+ 140509587595024 -> 140517615593840
+ 140509587609296 -> 140509587594016
+ 140509587609296 [label=TBackward0]
+ 140517615593408 -> 140509587609296
+ 140517615593408 [label=ToCopyBackward0]
+ 140517615593936 -> 140517615593408
+ 140509590874608 [label="encoder.layer.4.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590874608 -> 140517615593936
+ 140517615593936 [label=AccumulateGrad]
+ 140509587591952 -> 140509587591280
+ 140509587591952 [label=TBackward0]
+ 140509587593680 -> 140509587591952
+ 140509587593680 [label=ToCopyBackward0]
+ 140509587608144 -> 140509587593680
+ 140509590874368 [label="encoder.layer.4.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590874368 -> 140509587608144
+ 140509587608144 [label=AccumulateGrad]
+ 140509587574208 -> 140509587573872
+ 140509587573680 -> 140509587573200
+ 140509590874128 [label="encoder.layer.4.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590874128 -> 140509587573680
+ 140509587573680 [label=AccumulateGrad]
+ 140509587573632 -> 140509587573200
+ 140509590874208 [label="encoder.layer.4.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590874208 -> 140509587573632
+ 140509587573632 [label=AccumulateGrad]
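+ // --- encoder.layer.5 begins ---
+ // Unlike layer 4, no crossattention.* parameters appear in this layer; in the
+ // portion of the graph shown, cross-attention is recorded only in every other
+ // layer (4, 6, 8).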
+ 140509587572480 -> 140509587572960
+ 140509587572480 [label=TBackward0]
+ 140509587573152 -> 140509587572480
+ 140509587573152 [label=ToCopyBackward0]
+ 140509587574160 -> 140509587573152
+ 140509590872448 [label="encoder.layer.5.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590872448 -> 140509587574160
+ 140509587574160 [label=AccumulateGrad]
+ 140509587572384 -> 140509587572336
+ 140509587572384 [label=ReshapeAliasBackward0]
+ 140509587572720 -> 140509587572384
+ 140509587572720 [label=ExpandBackward0]
+ 140509587572912 -> 140509587572720
+ 140509587572912 [label=TransposeBackward0]
+ 140509587573392 -> 140509587572912
+ 140509587573392 [label=PermuteBackward0]
+ 140509587573344 -> 140509587573392
+ 140509587573344 [label=ViewBackward0]
+ 140509587572528 -> 140509587573344
+ 140509587572528 [label=ViewBackward0]
+ 140509587592672 -> 140509587572528
+ 140509587592672 [label=AddmmBackward0]
+ 140509587593296 -> 140509587592672
+ 140509587593296 [label=ToCopyBackward0]
+ 140517615593552 -> 140509587593296
+ 140509590872288 [label="encoder.layer.5.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590872288 -> 140517615593552
+ 140517615593552 [label=AccumulateGrad]
+ 140509587591760 -> 140509587592672
+ 140509587591760 [label=ViewBackward0]
+ 140517615593984 -> 140509587591760
+ 140517615593984 [label=ToCopyBackward0]
+ 140509587562112 -> 140517615593984
+ 140517615593648 -> 140509587592672
+ 140517615593648 [label=TBackward0]
+ 140517615593696 -> 140517615593648
+ 140517615593696 [label=ToCopyBackward0]
+ 140517615594128 -> 140517615593696
+ 140509590872208 [label="encoder.layer.5.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590872208 -> 140517615594128
+ 140517615594128 [label=AccumulateGrad]
+ 140509587571472 -> 140509587571424
+ 140509587571472 [label=ReshapeAliasBackward0]
+ 140509587571808 -> 140509587571472
+ 140509587571808 [label=ExpandBackward0]
+ 140509587572000 -> 140509587571808
+ 140509587572000 [label=PermuteBackward0]
+ 140509587572192 -> 140509587572000
+ 140509587572192 [label=ViewBackward0]
+ 140509587571568 -> 140509587572192
+ 140509587571568 [label=ViewBackward0]
+ 140509587572816 -> 140509587571568
+ 140509587572816 [label=AddmmBackward0]
+ 140509587573968 -> 140509587572816
+ 140509587573968 [label=ToCopyBackward0]
+ 140509587592288 -> 140509587573968
+ 140509590859664 [label="encoder.layer.5.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590859664 -> 140509587592288
+ 140509587592288 [label=AccumulateGrad]
+ 140509587573104 -> 140509587572816
+ 140509587573104 [label=ViewBackward0]
+ 140517615593888 -> 140509587573104
+ 140517615593888 [label=ToCopyBackward0]
+ 140509587562112 -> 140517615593888
+ 140509587571616 -> 140509587572816
+ 140509587571616 [label=TBackward0]
+ 140517615593792 -> 140509587571616
+ 140517615593792 [label=ToCopyBackward0]
+ 140517615594032 -> 140517615593792
+ 140509590859584 [label="encoder.layer.5.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590859584 -> 140517615594032
+ 140517615594032 [label=AccumulateGrad]
+ 140509587570752 -> 140509587562400
+ 140509587570752 [label=TBackward0]
+ 140509587571184 -> 140509587570752
+ 140509587571184 [label=ToCopyBackward0]
+ 140509587571376 -> 140509587571184
+ 140509590859344 [label="encoder.layer.5.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590859344 -> 140509587571376
+ 140509587571376 [label=AccumulateGrad]
+ 140509587562112 -> 140509587561968
+ 140509587561920 -> 140509587561872
+ 140509590859104 [label="encoder.layer.5.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590859104 -> 140509587561920
+ 140509587561920 [label=AccumulateGrad]
+ 140509587561200 -> 140509587561872
+ 140509590859184 [label="encoder.layer.5.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590859184 -> 140509587561200
+ 140509587561200 [label=AccumulateGrad]
+ 140509587560720 -> 140509587561008
+ 140509587560720 [label=TBackward0]
+ 140509587561248 -> 140509587560720
+ 140509587561248 [label=ToCopyBackward0]
+ 140509587561632 -> 140509587561248
+ 140509590857424 [label="encoder.layer.5.experts.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590857424 -> 140509587561632
+ 140509587561632 [label=AccumulateGrad]
+ 140509587560288 -> 140509587560480
+ 140509587560288 [label=TBackward0]
+ 140509587560960 -> 140509587560288
+ 140509587560960 [label=ToCopyBackward0]
+ 140509587561440 -> 140509587560960
+ 140509590857184 [label="encoder.layer.5.experts.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590857184 -> 140509587561440
+ 140509587561440 [label=AccumulateGrad]
+ 140509587560192 -> 140509587560048
+ 140509587560000 -> 140509587559904
+ 140509590856944 [label="encoder.layer.5.experts.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590856944 -> 140509587560000
+ 140509587560000 [label=AccumulateGrad]
+ 140509587559952 -> 140509587559904
+ 140509590857024 [label="encoder.layer.5.experts.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590857024 -> 140509587559952
+ 140509587559952 [label=AccumulateGrad]
+ 140509587559664 -> 140509587850432
+ 140509587559664 [label=NativeLayerNormBackward0]
+ 140509587560336 -> 140509587559664
+ 140509587560336 [label=AddBackward0]
+ 140509587561152 -> 140509587560336
+ 140509587561152 [label=NativeDropoutBackward0]
+ 140509587560864 -> 140509587561152
+ 140509587560864 [label=ViewBackward0]
+ 140509587561392 -> 140509587560864
+ 140509587561392 [label=AddmmBackward0]
+ 140509587562064 -> 140509587561392
+ 140509587562064 [label=ToCopyBackward0]
+ 140509587562352 -> 140509587562064
+ 140509590858704 [label="encoder.layer.5.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590858704 -> 140509587562352
+ 140509587562352 [label=AccumulateGrad]
+ 140509587562016 -> 140509587561392
+ 140509587562016 [label=ViewBackward0]
+ 140509587571280 -> 140509587562016
+ 140509587571280 [label=GeluBackward0]
+ 140509587570848 -> 140509587571280
+ 140509587570848 [label=ViewBackward0]
+ 140509587571904 -> 140509587570848
+ 140509587571904 [label=AddmmBackward0]
+ 140509587572288 -> 140509587571904
+ 140509587572288 [label=ToCopyBackward0]
+ 140509587591520 -> 140509587572288
+ 140509590858944 [label="encoder.layer.5.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590858944 -> 140509587591520
+ 140509587591520 [label=AccumulateGrad]
+ 140509587572096 -> 140509587571904
+ 140509587572096 [label=ViewBackward0]
+ 140517615594320 -> 140509587572096
+ 140517615594320 [label=ToCopyBackward0]
+ 140509587560672 -> 140517615594320
+ 140509587560672 [label=SliceBackward0]
+ 140517615594368 -> 140509587560672
+ 140517615594368 [label=SliceBackward0]
+ 140517615594464 -> 140517615594368
+ 140517615594464 [label=SliceBackward0]
+ 140509587561872 -> 140517615594464
+ 140509587571088 -> 140509587571904
+ 140509587571088 [label=TBackward0]
+ 140517615594080 -> 140509587571088
+ 140517615594080 [label=ToCopyBackward0]
+ 140517615594560 -> 140517615594080
+ 140509590858864 [label="encoder.layer.5.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590858864 -> 140517615594560
+ 140517615594560 [label=AccumulateGrad]
+ 140509587561824 -> 140509587561392
+ 140509587561824 [label=TBackward0]
+ 140509587571040 -> 140509587561824
+ 140509587571040 [label=ToCopyBackward0]
+ 140509587572624 -> 140509587571040
+ 140509590858624 [label="encoder.layer.5.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590858624 -> 140509587572624
+ 140509587572624 [label=AccumulateGrad]
+ 140509587560672 -> 140509587560336
+ 140509587560144 -> 140509587559664
+ 140509590858384 [label="encoder.layer.5.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590858384 -> 140509587560144
+ 140509587560144 [label=AccumulateGrad]
+ 140509587560096 -> 140509587559664
+ 140509590858464 [label="encoder.layer.5.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590858464 -> 140509587560096
+ 140509587560096 [label=AccumulateGrad]
+ 140509587558944 -> 140509587559424
+ 140509587558944 [label=TBackward0]
+ 140509587559616 -> 140509587558944
+ 140509587559616 [label=ToCopyBackward0]
+ 140509587560624 -> 140509587559616
+ 140509590856704 [label="encoder.layer.6.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590856704 -> 140509587560624
+ 140509587560624 [label=AccumulateGrad]
+ 140509587558848 -> 140509587558800
+ 140509587558848 [label=ReshapeAliasBackward0]
+ 140509587559184 -> 140509587558848
+ 140509587559184 [label=ExpandBackward0]
+ 140509587559376 -> 140509587559184
+ 140509587559376 [label=TransposeBackward0]
+ 140509587559856 -> 140509587559376
+ 140509587559856 [label=PermuteBackward0]
+ 140509587561728 -> 140509587559856
+ 140509587561728 [label=ViewBackward0]
+ 140509587559808 -> 140509587561728
+ 140509587559808 [label=ViewBackward0]
+ 140509587562256 -> 140509587559808
+ 140509587562256 [label=AddmmBackward0]
+ 140509587571712 -> 140509587562256
+ 140509587571712 [label=ToCopyBackward0]
+ 140517615594272 -> 140509587571712
+ 140509590856544 [label="encoder.layer.6.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590856544 -> 140517615594272
+ 140517615594272 [label=AccumulateGrad]
+ 140509587570800 -> 140509587562256
+ 140509587570800 [label=ViewBackward0]
+ 140517615594608 -> 140509587570800
+ 140517615594608 [label=ToCopyBackward0]
+ 140509587850432 -> 140517615594608
+ 140517615594176 -> 140509587562256
+ 140517615594176 [label=TBackward0]
+ 140517615594224 -> 140517615594176
+ 140517615594224 [label=ToCopyBackward0]
+ 140517615594752 -> 140517615594224
+ 140509590856464 [label="encoder.layer.6.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590856464 -> 140517615594752
+ 140517615594752 [label=AccumulateGrad]
+ 140509587849376 -> 140509587849520
+ 140509587849376 [label=ReshapeAliasBackward0]
+ 140509587853120 -> 140509587849376
+ 140509587853120 [label=ExpandBackward0]
+ 140509587853216 -> 140509587853120
+ 140509587853216 [label=PermuteBackward0]
+ 140509587558656 -> 140509587853216
+ 140509587558656 [label=ViewBackward0]
+ 140509587558464 -> 140509587558656
+ 140509587558464 [label=ViewBackward0]
+ 140509587559280 -> 140509587558464
+ 140509587559280 [label=AddmmBackward0]
+ 140509587560432 -> 140509587559280
+ 140509587560432 [label=ToCopyBackward0]
+ 140509587558992 -> 140509587560432
+ 140509590856304 [label="encoder.layer.6.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590856304 -> 140509587558992
+ 140509587558992 [label=AccumulateGrad]
+ 140509587559568 -> 140509587559280
+ 140509587559568 [label=ViewBackward0]
+ 140517615594512 -> 140509587559568
+ 140517615594512 [label=ToCopyBackward0]
+ 140509587850432 -> 140517615594512
+ 140509587558512 -> 140509587559280
+ 140509587558512 [label=TBackward0]
+ 140517615594416 -> 140509587558512
+ 140517615594416 [label=ToCopyBackward0]
+ 140517615594656 -> 140517615594416
+ 140509590856224 [label="encoder.layer.6.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590856224 -> 140517615594656
+ 140517615594656 [label=AccumulateGrad]
+ 140509587850336 -> 140509587850144
+ 140509587850336 [label=TBackward0]
+ 140509587849664 -> 140509587850336
+ 140509587849664 [label=ToCopyBackward0]
+ 140509587849472 -> 140509587849664
+ 140509590855984 [label="encoder.layer.6.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590855984 -> 140509587849472
+ 140509587849472 [label=AccumulateGrad]
+ 140509587850432 -> 140509587850672
+ 140509587850624 -> 140509587850768
+ 140509590855744 [label="encoder.layer.6.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590855744 -> 140509587850624
+ 140509587850624 [label=AccumulateGrad]
+ 140509587851248 -> 140509587850768
+ 140509590855824 [label="encoder.layer.6.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590855824 -> 140509587851248
+ 140509587851248 [label=AccumulateGrad]
+ 140509587851920 -> 140509587851440
+ 140509587851920 [label=TBackward0]
+ 140509587851104 -> 140509587851920
+ 140509587851104 [label=ToCopyBackward0]
+ 140509587850720 -> 140509587851104
+ 140509590843120 [label="encoder.layer.6.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590843120 -> 140509587850720
+ 140509587850720 [label=AccumulateGrad]
+ 140509587852016 -> 140509587851968
+ 140509587852016 [label=ReshapeAliasBackward0]
+ 140509587851584 -> 140509587852016
+ 140509587851584 [label=ExpandBackward0]
+ 140509587851392 -> 140509587851584
+ 140509587851392 [label=TransposeBackward0]
+ 140509587850912 -> 140509587851392
+ 140509587850912 [label=PermuteBackward0]
+ 140509587850576 -> 140509587850912
+ 140509587850576 [label=ViewBackward0]
+ 140509587851056 -> 140509587850576
+ 140509587851056 [label=ViewBackward0]
+ 140509587850288 -> 140509587851056
+ 140509587850288 [label=AddmmBackward0]
+ 140509587849952 -> 140509587850288
+ 140509587849952 [label=ToCopyBackward0]
+ 140509587849760 -> 140509587849952
+ 140509590842960 [label="encoder.layer.6.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590842960 -> 140509587849760
+ 140509587849760 [label=AccumulateGrad]
+ 140509587850096 -> 140509587850288
+ 140509587850096 [label=ViewBackward0]
+ 140509587853024 -> 140509587850096
+ 140509587853024 [label=ToCopyBackward0]
+ 140517615539152 -> 140509587853024
+ 140509587851776 -> 140509587850288
+ 140509587851776 [label=TBackward0]
+ 140509587850000 -> 140509587851776
+ 140509587850000 [label=ToCopyBackward0]
+ 140509587559088 -> 140509587850000
+ 140509590842880 [label="encoder.layer.6.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509590842880 -> 140509587559088
+ 140509587559088 [label=AccumulateGrad]
+ 140509587852880 -> 140509587695984
+ 140509587852880 [label=ReshapeAliasBackward0]
+ 140509587852592 -> 140509587852880
+ 140509587852592 [label=ExpandBackward0]
+ 140509587852400 -> 140509587852592
+ 140509587852400 [label=PermuteBackward0]
+ 140509587852208 -> 140509587852400
+ 140509587852208 [label=ViewBackward0]
+ 140509587852736 -> 140509587852208
+ 140509587852736 [label=ViewBackward0]
+ 140509587851488 -> 140509587852736
+ 140509587851488 [label=AddmmBackward0]
+ 140509587850816 -> 140509587851488
+ 140509587850816 [label=ToCopyBackward0]
+ 140509587849328 -> 140509587850816
+ 140509590842720 [label="encoder.layer.6.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590842720 -> 140509587849328
+ 140509587849328 [label=AccumulateGrad]
+ 140509587851200 -> 140509587851488
+ 140509587851200 [label=ViewBackward0]
+ 140509587850384 -> 140509587851200
+ 140509587850384 [label=ToCopyBackward0]
+ 140517615539152 -> 140509587850384
+ 140509587852784 -> 140509587851488
+ 140509587852784 [label=TBackward0]
+ 140509587849568 -> 140509587852784
+ 140509587849568 [label=ToCopyBackward0]
+ 140509587561536 -> 140509587849568
+ 140509590842640 [label="encoder.layer.6.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509590842640 -> 140509587561536
+ 140509587561536 [label=AccumulateGrad]
+ 140509587695216 -> 140509587695600
+ 140509587695216 [label=TBackward0]
+ 140509587697520 -> 140509587695216
+ 140509587697520 [label=ToCopyBackward0]
+ 140509587695552 -> 140509587697520
+ 140509590842400 [label="encoder.layer.6.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590842400 -> 140509587695552
+ 140509587695552 [label=AccumulateGrad]
+ 140509587695120 -> 140509587694832
+ 140509587696080 -> 140509587694592
+ 140509590842160 [label="encoder.layer.6.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590842160 -> 140509587696080
+ 140509587696080 [label=AccumulateGrad]
+ 140509587697040 -> 140509587694592
+ 140509590842240 [label="encoder.layer.6.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590842240 -> 140509587697040
+ 140509587697040 [label=AccumulateGrad]
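+ // From layer 6 on, expert weights are registered per expert
+ // (experts.experts.0.*, experts.experts.1.*) rather than as the single shared
+ // experts.* branch of layers 3-5: a two-expert PromptMoE block.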
+ 140509587697328 -> 140509587696464
+ 140509587697328 [label=TBackward0]
+ 140509587693632 -> 140509587697328
+ 140509587693632 [label=ToCopyBackward0]
+ 140509587694256 -> 140509587693632
+ 140509590826016 [label="encoder.layer.6.experts.experts.0.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590826016 -> 140509587694256
+ 140509587694256 [label=AccumulateGrad]
+ 140509588196464 -> 140509588196752
+ 140509588196464 [label=TBackward0]
+ 140509588197136 -> 140509588196464
+ 140509588197136 [label=ToCopyBackward0]
+ 140509587693968 -> 140509588197136
+ 140509590826176 [label="encoder.layer.6.experts.experts.0.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590826176 -> 140509587693968
+ 140509587693968 [label=AccumulateGrad]
+ 140509588196272 -> 140509588195888
+ 140509588195984 -> 140509588195696
+ 140509590825696 [label="encoder.layer.6.experts.experts.0.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590825696 -> 140509588195984
+ 140509588195984 [label=AccumulateGrad]
+ 140509588195456 -> 140509588195696
+ 140509590826496 [label="encoder.layer.6.experts.experts.0.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590826496 -> 140509588195456
+ 140509588195456 [label=AccumulateGrad]
+ 140509588195408 -> 140509588195216
+ 140509588195408 [label=UnsqueezeBackward0]
+ 140509588195936 -> 140509588195408
+ 140509588195936 [label=NativeLayerNormBackward0]
+ 140509588196416 -> 140509588195936
+ 140509588196416 [label=AddBackward0]
+ 140509587694640 -> 140509588196416
+ 140509587694640 [label=NativeDropoutBackward0]
+ 140509587697424 -> 140509587694640
+ 140509587697424 [label=ViewBackward0]
+ 140509587693776 -> 140509587697424
+ 140509587693776 [label=AddmmBackward0]
+ 140509587694928 -> 140509587693776
+ 140509587694928 [label=ToCopyBackward0]
+ 140509587696848 -> 140509587694928
+ 140509590825936 [label="encoder.layer.6.experts.experts.1.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590825936 -> 140509587696848
+ 140509587696848 [label=AccumulateGrad]
+ 140509587694736 -> 140509587693776
+ 140509587694736 [label=ViewBackward0]
+ 140509587695888 -> 140509587694736
+ 140509587695888 [label=GeluBackward0]
+ 140509587696176 -> 140509587695888
+ 140509587696176 [label=ViewBackward0]
+ 140509587695504 -> 140509587696176
+ 140509587695504 [label=AddmmBackward0]
+ 140509587852304 -> 140509587695504
+ 140509587852304 [label=ToCopyBackward0]
+ 140509587850528 -> 140509587852304
+ 140509590825456 [label="encoder.layer.6.experts.experts.1.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590825456 -> 140509587850528
+ 140509587850528 [label=AccumulateGrad]
+ 140509587852496 -> 140509587695504
+ 140509587852496 [label=ViewBackward0]
+ 140509587558560 -> 140509587852496
+ 140509587558560 [label=ToCopyBackward0]
+ 140509588196272 -> 140509587558560
+ 140509587852688 -> 140509587695504
+ 140509587852688 [label=TBackward0]
+ 140509587851680 -> 140509587852688
+ 140509587851680 [label=ToCopyBackward0]
+ 140517615594800 -> 140509587851680
+ 140509590825536 [label="encoder.layer.6.experts.experts.1.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590825536 -> 140517615594800
+ 140517615594800 [label=AccumulateGrad]
+ 140509587697136 -> 140509587693776
+ 140509587697136 [label=TBackward0]
+ 140509587695312 -> 140509587697136
+ 140509587695312 [label=ToCopyBackward0]
+ 140509587558752 -> 140509587695312
+ 140509590825296 [label="encoder.layer.6.experts.experts.1.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590825296 -> 140509587558752
+ 140509587558752 [label=AccumulateGrad]
+ 140509588196272 -> 140509588196416
+ 140509588196368 -> 140509588195936
+ 140509590825056 [label="encoder.layer.6.experts.experts.1.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590825056 -> 140509588196368
+ 140509588196368 [label=AccumulateGrad]
+ 140509588195792 -> 140509588195936
+ 140509590824976 [label="encoder.layer.6.experts.experts.1.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590824976 -> 140509588195792
+ 140509588195792 [label=AccumulateGrad]
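+ // Routing gate for the two experts: hidden states are pooled (apparently a
+ // masked mean: MulBackward0 -> SumBackward1 -> DivBackward0), multiplied with
+ // gate.weight (2, 768) via MmBackward0, and passed through SoftmaxBackward0.
+ // The raw probabilities (cf. "RawProb" in the file name) are broadcast with
+ // Unsqueeze/Expand and weight the stacked expert outputs through MulBackward0.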
+ 140509588195312 -> 140509588194976
+ 140509588195312 [label=UnsqueezeBackward0]
+ 140509588196848 -> 140509588195312
+ 140509588196848 [label=UnsqueezeBackward0]
+ 140509588195504 -> 140509588196848
+ 140509588195504 [label=MulBackward0]
+ 140509587695024 -> 140509588195504
+ 140509587695024 [label=ViewBackward0]
+ 140509587696656 -> 140509587695024
+ 140509587696656 [label=CloneBackward0]
+ 140509587852832 -> 140509587696656
+ 140509587852832 [label=ExpandBackward0]
+ 140517615594896 -> 140509587852832
+ 140517615594896 [label=UnsqueezeBackward0]
+ 140517615594992 -> 140517615594896
+ 140517615594992 [label=SoftmaxBackward0]
+ 140517615595088 -> 140517615594992
+ 140517615595088 [label=MmBackward0]
+ 140517615595184 -> 140517615595088
+ 140517615595184 [label=ToCopyBackward0]
+ 140517615595328 -> 140517615595184
+ 140517615595328 [label=DivBackward0]
+ 140517615595424 -> 140517615595328
+ 140517615595424 [label=SumBackward1]
+ 140517615595472 -> 140517615595424
+ 140517615595472 [label=MulBackward0]
+ 140509587694352 -> 140517615595472
+ 140517615595136 -> 140517615595088
+ 140517615595136 [label=TBackward0]
+ 140517615595232 -> 140517615595136
+ 140517615595232 [label=ToCopyBackward0]
+ 140517615595280 -> 140517615595232
+ 140509590839840 [label="encoder.layer.6.experts.gate.weight
+ (2, 768)" fillcolor=lightblue]
+ 140509590839840 -> 140517615595280
+ 140517615595280 [label=AccumulateGrad]
+ 140509588194448 -> 140509588165008
+ 140509588194448 [label=ViewBackward0]
+ 140509588196080 -> 140509588194448
+ 140509588196080 [label=CloneBackward0]
+ 140509588195120 -> 140509588196080
+ 140509588195120 [label=ExpandBackward0]
+ 140509587852112 -> 140509588195120
+ 140509587852112 [label=UnsqueezeBackward0]
+ 140509587694160 -> 140509587852112
+ 140509587694160 [label=NativeLayerNormBackward0]
+ 140517615594848 -> 140509587694160
+ 140517615594848 [label=AddBackward0]
+ 140517615726656 -> 140517615594848
+ 140517615726656 [label=NativeDropoutBackward0]
+ 140517615726896 -> 140517615726656
+ 140517615726896 [label=ViewBackward0]
+ 140517615726992 -> 140517615726896
+ 140517615726992 [label=AddmmBackward0]
+ 140517615727088 -> 140517615726992
+ 140517615727088 [label=ToCopyBackward0]
+ 140517615727280 -> 140517615727088
+ 140509590841760 [label="encoder.layer.6.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590841760 -> 140517615727280
+ 140517615727280 [label=AccumulateGrad]
+ 140517615727040 -> 140517615726992
+ 140517615727040 [label=ViewBackward0]
+ 140517615727328 -> 140517615727040
+ 140517615727328 [label=GeluBackward0]
+ 140517615727424 -> 140517615727328
+ 140517615727424 [label=ViewBackward0]
+ 140517615727520 -> 140517615727424
+ 140517615727520 [label=AddmmBackward0]
+ 140517615727616 -> 140517615727520
+ 140517615727616 [label=ToCopyBackward0]
+ 140517615727808 -> 140517615727616
+ 140509590842000 [label="encoder.layer.6.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590842000 -> 140517615727808
+ 140517615727808 [label=AccumulateGrad]
+ 140517615727568 -> 140517615727520
+ 140517615727568 [label=ViewBackward0]
+ 140517615727856 -> 140517615727568
+ 140517615727856 [label=ToCopyBackward0]
+ 140517615726800 -> 140517615727856
+ 140517615726800 [label=SliceBackward0]
+ 140517615728000 -> 140517615726800
+ 140517615728000 [label=SliceBackward0]
+ 140517615728096 -> 140517615728000
+ 140517615728096 [label=SliceBackward0]
+ 140509587850768 -> 140517615728096
+ 140517615727232 -> 140517615727520
+ 140517615727232 [label=TBackward0]
+ 140517615727760 -> 140517615727232
+ 140517615727760 [label=ToCopyBackward0]
+ 140517615728192 -> 140517615727760
+ 140509590841920 [label="encoder.layer.6.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509590841920 -> 140517615728192
+ 140517615728192 [label=AccumulateGrad]
+ 140517615726752 -> 140517615726992
+ 140517615726752 [label=TBackward0]
+ 140517615727472 -> 140517615726752
+ 140517615727472 [label=ToCopyBackward0]
+ 140517615727952 -> 140517615727472
+ 140509590841680 [label="encoder.layer.6.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590841680 -> 140517615727952
+ 140517615727952 [label=AccumulateGrad]
+ 140517615726800 -> 140517615594848
+ 140517615595040 -> 140509587694160
+ 140509590841440 [label="encoder.layer.6.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590841440 -> 140517615595040
+ 140517615595040 [label=AccumulateGrad]
+ 140517615594944 -> 140509587694160
+ 140509590841520 [label="encoder.layer.6.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590841520 -> 140517615594944
+ 140517615594944 [label=AccumulateGrad]
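+ // --- encoder.layer.7 begins ---
+ // The attention reshape chains in layers 7-8 record
+ // UnsafeViewBackward0 -> CloneBackward0 where earlier layers recorded
+ // ReshapeAliasBackward0, i.e. the permuted tensors were materialized (copied)
+ // here rather than aliased.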
+ 140509588193344 -> 140509588194160
+ 140509588193344 [label=TBackward0]
+ 140509588194544 -> 140509588193344
+ 140509588194544 [label=ToCopyBackward0]
+ 140509588194928 -> 140509588194544
+ 140509590840000 [label="encoder.layer.7.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590840000 -> 140509588194928
+ 140509588194928 [label=AccumulateGrad]
+ 140509588168464 -> 140509588168176
+ 140509588168464 [label=UnsafeViewBackward0]
+ 140509588168560 -> 140509588168464
+ 140509588168560 [label=CloneBackward0]
+ 140509588193776 -> 140509588168560
+ 140509588193776 [label=ExpandBackward0]
+ 140509588194256 -> 140509588193776
+ 140509588194256 [label=TransposeBackward0]
+ 140509588194832 -> 140509588194256
+ 140509588194832 [label=PermuteBackward0]
+ 140509587694448 -> 140509588194832
+ 140509587694448 [label=ViewBackward0]
+ 140517615595376 -> 140509587694448
+ 140517615595376 [label=ViewBackward0]
+ 140509588193392 -> 140517615595376
+ 140509588193392 [label=AddmmBackward0]
+ 140517615727136 -> 140509588193392
+ 140517615727136 [label=ToCopyBackward0]
+ 140517615728048 -> 140517615727136
+ 140509590840560 [label="encoder.layer.7.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590840560 -> 140517615728048
+ 140517615728048 [label=AccumulateGrad]
+ 140517615726944 -> 140509588193392
+ 140517615726944 [label=ViewBackward0]
+ 140517615727376 -> 140517615726944
+ 140517615727376 [label=ToCopyBackward0]
+ 140509588165008 -> 140517615727376
+ 140517615726704 -> 140509588193392
+ 140517615726704 [label=TBackward0]
+ 140517615727664 -> 140517615726704
+ 140517615727664 [label=ToCopyBackward0]
+ 140517615728240 -> 140517615727664
+ 140509590840240 [label="encoder.layer.7.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590840240 -> 140517615728240
+ 140517615728240 [label=AccumulateGrad]
+ 140509588166736 -> 140509588166832
+ 140509588166736 [label=UnsafeViewBackward0]
+ 140509588167504 -> 140509588166736
+ 140509588167504 [label=CloneBackward0]
+ 140509588167792 -> 140509588167504
+ 140509588167792 [label=ExpandBackward0]
+ 140509588168080 -> 140509588167792
+ 140509588168080 [label=PermuteBackward0]
+ 140509588166928 -> 140509588168080
+ 140509588166928 [label=ViewBackward0]
+ 140509588167120 -> 140509588166928
+ 140509588167120 [label=ViewBackward0]
+ 140509588194736 -> 140509588167120
+ 140509588194736 [label=AddmmBackward0]
+ 140517615594704 -> 140509588194736
+ 140517615594704 [label=ToCopyBackward0]
+ 140517615727712 -> 140517615594704
+ 140509590839760 [label="encoder.layer.7.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590839760 -> 140517615727712
+ 140517615727712 [label=AccumulateGrad]
+ 140509587695792 -> 140509588194736
+ 140509587695792 [label=ViewBackward0]
+ 140517615728336 -> 140509587695792
+ 140517615728336 [label=ToCopyBackward0]
+ 140509588165008 -> 140517615728336
+ 140509588193488 -> 140509588194736
+ 140509588193488 [label=TBackward0]
+ 140517615727184 -> 140509588193488
+ 140517615727184 [label=ToCopyBackward0]
+ 140517615728384 -> 140517615727184
+ 140509590840480 [label="encoder.layer.7.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590840480 -> 140517615728384
+ 140517615728384 [label=AccumulateGrad]
+ 140509588165056 -> 140509588165488
+ 140509588165056 [label=TBackward0]
+ 140509588166256 -> 140509588165056
+ 140509588166256 [label=ToCopyBackward0]
+ 140509588166496 -> 140509588166256
+ 140509590839600 [label="encoder.layer.7.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590839600 -> 140509588166496
+ 140509588166496 [label=AccumulateGrad]
+ 140509588165008 -> 140509588164912
+ 140509588164720 -> 140509588139888
+ 140509590839520 [label="encoder.layer.7.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590839520 -> 140509588164720
+ 140509588164720 [label=AccumulateGrad]
+ 140509588164672 -> 140509588139888
+ 140509985419152 [label="encoder.layer.7.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509985419152 -> 140509588164672
+ 140509588164672 [label=AccumulateGrad]
+ 140509588138160 -> 140509588138640
+ 140509588138160 [label=TBackward0]
+ 140509588138928 -> 140509588138160
+ 140509588138928 [label=ToCopyBackward0]
+ 140509588139456 -> 140509588138928
+ 140509591342032 [label="encoder.layer.7.experts.experts.0.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591342032 -> 140509588139456
+ 140509588139456 [label=AccumulateGrad]
+ 140509588137296 -> 140509588137536
+ 140509588137296 [label=TBackward0]
+ 140509588138448 -> 140509588137296
+ 140509588138448 [label=ToCopyBackward0]
+ 140509588139216 -> 140509588138448
+ 140509591341712 [label="encoder.layer.7.experts.experts.0.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591341712 -> 140509588139216
+ 140509588139216 [label=AccumulateGrad]
+ 140509588137056 -> 140509588137104
+ 140509588136816 -> 140509588136912
+ 140509591341472 [label="encoder.layer.7.experts.experts.0.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591341472 -> 140509588136816
+ 140509588136816 [label=AccumulateGrad]
+ 140509588136720 -> 140509588136912
+ 140509591341792 [label="encoder.layer.7.experts.experts.0.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591341792 -> 140509588136720
+ 140509588136720 [label=AccumulateGrad]
+ 140509588136624 -> 140509588136432
+ 140509588136624 [label=UnsqueezeBackward0]
+ 140509588137200 -> 140509588136624
+ 140509588137200 [label=NativeLayerNormBackward0]
+ 140509588137680 -> 140509588137200
+ 140509588137680 [label=AddBackward0]
+ 140509588139024 -> 140509588137680
+ 140509588139024 [label=NativeDropoutBackward0]
+ 140509588138256 -> 140509588139024
+ 140509588138256 [label=ViewBackward0]
+ 140509588139312 -> 140509588138256
+ 140509588139312 [label=AddmmBackward0]
+ 140509588137968 -> 140509588139312
+ 140509588137968 [label=ToCopyBackward0]
+ 140509588165776 -> 140509588137968
+ 140509591342192 [label="encoder.layer.7.experts.experts.1.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591342192 -> 140509588165776
+ 140509588165776 [label=AccumulateGrad]
+ 140509588165104 -> 140509588139312
+ 140509588165104 [label=ViewBackward0]
+ 140509588166448 -> 140509588165104
+ 140509588166448 [label=GeluBackward0]
+ 140509588166064 -> 140509588166448
+ 140509588166064 [label=ViewBackward0]
+ 140509588167600 -> 140509588166064
+ 140509588167600 [label=AddmmBackward0]
+ 140509588168272 -> 140509588167600
+ 140509588168272 [label=ToCopyBackward0]
+ 140509588193968 -> 140509588168272
+ 140509591341552 [label="encoder.layer.7.experts.experts.1.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591341552 -> 140509588193968
+ 140509588193968 [label=AccumulateGrad]
+ 140509588167984 -> 140509588167600
+ 140509588167984 [label=ViewBackward0]
+ 140517615727904 -> 140509588167984
+ 140517615727904 [label=ToCopyBackward0]
+ 140509588137056 -> 140517615727904
+ 140509588165872 -> 140509588167600
+ 140509588165872 [label=TBackward0]
+ 140517615726848 -> 140509588165872
+ 140517615726848 [label=ToCopyBackward0]
+ 140517615728288 -> 140517615726848
+ 140509591341232 [label="encoder.layer.7.experts.experts.1.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591341232 -> 140517615728288
+ 140517615728288 [label=AccumulateGrad]
+ 140509588164816 -> 140509588139312
+ 140509588164816 [label=TBackward0]
+ 140509588166016 -> 140509588164816
+ 140509588166016 [label=ToCopyBackward0]
+ 140509588193536 -> 140509588166016
+ 140509591340992 [label="encoder.layer.7.experts.experts.1.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591340992 -> 140509588193536
+ 140509588193536 [label=AccumulateGrad]
+ 140509588137056 -> 140509588137680
+ 140509588137584 -> 140509588137200
+ 140509591340752 [label="encoder.layer.7.experts.experts.1.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591340752 -> 140509588137584
+ 140509588137584 [label=AccumulateGrad]
+ 140509588136576 -> 140509588137200
+ 140509591341072 [label="encoder.layer.7.experts.experts.1.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591341072 -> 140509588136576
+ 140509588136576 [label=AccumulateGrad]
+ 140509588136096 -> 140509588136240
+ 140509588136096 [label=UnsqueezeBackward0]
+ 140509588138064 -> 140509588136096
+ 140509588138064 [label=UnsqueezeBackward0]
+ 140509588139408 -> 140509588138064
+ 140509588139408 [label=MulBackward0]
+ 140509588139696 -> 140509588139408
+ 140509588139696 [label=SoftmaxBackward0]
+ 140509588167312 -> 140509588139696
+ 140509588167312 [label=MmBackward0]
+ 140509588165392 -> 140509588167312
+ 140509588165392 [label=ToCopyBackward0]
+ 140517615728480 -> 140509588165392
+ 140517615728480 [label=DivBackward0]
+ 140517615728672 -> 140517615728480
+ 140517615728672 [label=SumBackward1]
+ 140517615728768 -> 140517615728672
+ 140517615728768 [label=MulBackward0]
+ 140509588137056 -> 140517615728768
+ 140517615728144 -> 140509588167312
+ 140517615728144 [label=TBackward0]
+ 140517615728720 -> 140517615728144
+ 140517615728720 [label=ToCopyBackward0]
+ 140517615728816 -> 140517615728720
+ 140509590823376 [label="encoder.layer.7.experts.gate.weight
+ (2, 768)" fillcolor=lightblue]
+ 140509590823376 -> 140517615728816
+ 140517615728816 [label=AccumulateGrad]
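+ // The layer-7 output reaches layer 8 through IndexBackward0 (below), i.e. the
+ // normalized hidden states are gathered by index, whereas layer 6's output was
+ // broadcast (Unsqueeze/Expand/Clone) before entering layer 7.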
+ 140509588106928 -> 140509588077488
+ 140509588106928 [label=IndexBackward0]
+ 140509588137008 -> 140509588106928
+ 140509588137008 [label=NativeLayerNormBackward0]
+ 140509588136336 -> 140509588137008
+ 140509588136336 [label=AddBackward0]
+ 140517615728864 -> 140509588136336
+ 140517615728864 [label=NativeDropoutBackward0]
+ 140517615728528 -> 140517615728864
+ 140517615728528 [label=ViewBackward0]
+ 140517615729008 -> 140517615728528
+ 140517615729008 [label=AddmmBackward0]
+ 140517615729104 -> 140517615729008
+ 140517615729104 [label=ToCopyBackward0]
+ 140517615729296 -> 140517615729104
+ 140509590826656 [label="encoder.layer.7.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509590826656 -> 140517615729296
+ 140517615729296 [label=AccumulateGrad]
+ 140517615729056 -> 140517615729008
+ 140517615729056 [label=ViewBackward0]
+ 140517615729344 -> 140517615729056
+ 140517615729344 [label=GeluBackward0]
+ 140517615729440 -> 140517615729344
+ 140517615729440 [label=ViewBackward0]
+ 140517615729536 -> 140517615729440
+ 140517615729536 [label=AddmmBackward0]
+ 140517615729632 -> 140517615729536
+ 140517615729632 [label=ToCopyBackward0]
+ 140517615729824 -> 140517615729632
+ 140509590826896 [label="encoder.layer.7.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509590826896 -> 140517615729824
+ 140517615729824 [label=AccumulateGrad]
+ 140517615729584 -> 140517615729536
+ 140517615729584 [label=ViewBackward0]
+ 140517615729872 -> 140517615729584
+ 140517615729872 [label=ToCopyBackward0]
+ 140517615728624 -> 140517615729872
+ 140517615728624 [label=SliceBackward0]
+ 140517615730016 -> 140517615728624
+ 140517615730016 [label=SliceBackward0]
+ 140517615730112 -> 140517615730016
+ 140517615730112 [label=SliceBackward0]
+ 140509588139888 -> 140517615730112
+ 140517615729248 -> 140517615729536
+ 140517615729248 [label=TBackward0]
+ 140517615729776 -> 140517615729248
+ 140517615729776 [label=ToCopyBackward0]
+ 140517615730208 -> 140517615729776
+ 140509985417872 [label="encoder.layer.7.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509985417872 -> 140517615730208
+ 140517615730208 [label=AccumulateGrad]
+ 140517615728912 -> 140517615729008
+ 140517615728912 [label=TBackward0]
+ 140517615729488 -> 140517615728912
+ 140517615729488 [label=ToCopyBackward0]
+ 140517615729968 -> 140517615729488
+ 140509590826416 [label="encoder.layer.7.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509590826416 -> 140517615729968
+ 140517615729968 [label=AccumulateGrad]
+ 140517615728624 -> 140509588136336
+ 140509588138736 -> 140509588137008
+ 140509590826736 [label="encoder.layer.7.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590826736 -> 140509588138736
+ 140509588138736 [label=AccumulateGrad]
+ 140509588136048 -> 140509588137008
+ 140509590824496 [label="encoder.layer.7.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509590824496 -> 140509588136048
+ 140509588136048 [label=AccumulateGrad]
+ 140509588105392 -> 140509588106352
+ 140509588105392 [label=TBackward0]
+ 140509588106640 -> 140509588105392
+ 140509588106640 [label=ToCopyBackward0]
+ 140509588165584 -> 140509588106640
+ 140509590823616 [label="encoder.layer.8.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590823616 -> 140509588165584
+ 140509588165584 [label=AccumulateGrad]
+ 140509588105200 -> 140509588105296
+ 140509588105200 [label=UnsafeViewBackward0]
+ 140509588136144 -> 140509588105200
+ 140509588136144 [label=CloneBackward0]
+ 140509588106064 -> 140509588136144
+ 140509588106064 [label=ExpandBackward0]
+ 140509588106448 -> 140509588106064
+ 140509588106448 [label=TransposeBackward0]
+ 140509588107216 -> 140509588106448
+ 140509588107216 [label=PermuteBackward0]
+ 140509588106880 -> 140509588107216
+ 140509588106880 [label=ViewBackward0]
+ 140517615728960 -> 140509588106880
+ 140517615728960 [label=ViewBackward0]
+ 140517615729200 -> 140517615728960
+ 140517615729200 [label=AddmmBackward0]
+ 140517615729728 -> 140517615729200
+ 140517615729728 [label=ToCopyBackward0]
+ 140517615729920 -> 140517615729728
+ 140509590823776 [label="encoder.layer.8.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509590823776 -> 140517615729920
+ 140517615729920 [label=AccumulateGrad]
+ 140517615729680 -> 140517615729200
+ 140517615729680 [label=ViewBackward0]
+ 140517615730256 -> 140517615729680
+ 140517615730256 [label=ToCopyBackward0]
+ 140509588077488 -> 140517615730256
+ 140517615728432 -> 140517615729200
+ 140517615728432 [label=TBackward0]
+ 140517615729392 -> 140517615728432
+ 140517615729392 [label=ToCopyBackward0]
+ 140517615730400 -> 140517615729392
+ 140509590823856 [label="encoder.layer.8.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590823856 -> 140517615730400
+ 140517615730400 [label=AccumulateGrad]
+ 140509588103856 -> 140509588103520
+ 140509588103856 [label=UnsafeViewBackward0]
+ 140509588104240 -> 140509588103856
+ 140509588104240 [label=CloneBackward0]
+ 140509588104480 -> 140509588104240
+ 140509588104480 [label=ExpandBackward0]
+ 140509588104912 -> 140509588104480
+ 140509588104912 [label=PermuteBackward0]
+ 140509588104048 -> 140509588104912
+ 140509588104048 [label=ViewBackward0]
+ 140509588105968 -> 140509588104048
+ 140509588105968 [label=ViewBackward0]
+ 140509588106736 -> 140509588105968
+ 140509588106736 [label=AddmmBackward0]
+ 140509588105584 -> 140509588106736
+ 140509588105584 [label=ToCopyBackward0]
+ 140517615730160 -> 140509588105584
+ 140509590824016 [label="encoder.layer.8.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509590824016 -> 140517615730160
+ 140517615730160 [label=AccumulateGrad]
+ 140509588103952 -> 140509588106736
+ 140509588103952 [label=ViewBackward0]
+ 140517615730496 -> 140509588103952
+ 140517615730496 [label=ToCopyBackward0]
+ 140509588077488 -> 140517615730496
+ 140517615728576 -> 140509588106736
+ 140517615728576 [label=TBackward0]
+ 140517615730064 -> 140517615728576
+ 140517615730064 [label=ToCopyBackward0]
+ 140517615730544 -> 140517615730064
+ 140509590824096 [label="encoder.layer.8.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590824096 -> 140517615730544
+ 140517615730544 [label=AccumulateGrad]
+ 140509588077584 -> 140509588077968
+ 140509588077584 [label=TBackward0]
+ 140509588078256 -> 140509588077584
+ 140509588078256 [label=ToCopyBackward0]
+ 140509588103664 -> 140509588078256
+ 140509590823296 [label="encoder.layer.8.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509590823296 -> 140509588103664
+ 140509588103664 [label=AccumulateGrad]
+ 140509588077488 -> 140509588076960
+ 140509588077104 -> 140509588076912
+ 140509590823136 [label="encoder.layer.8.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509590823136 -> 140509588077104
+ 140509588077104 [label=AccumulateGrad]
+ 140509588076000 -> 140509588076912
+ 140509591342912 [label="encoder.layer.8.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591342912 -> 140509588076000
+ 140509588076000 [label=AccumulateGrad]
+ 140509588074800 -> 140509588075760
+ 140509588074800 [label=TBackward0]
+ 140509588076336 -> 140509588074800
+ 140509588076336 [label=ToCopyBackward0]
+ 140509588077008 -> 140509588076336
+ 140509591342992 [label="encoder.layer.8.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591342992 -> 140509588077008
+ 140509588077008 [label=AccumulateGrad]
+ 140509588074704 -> 140509588074656
+ 140509588074704 [label=UnsafeViewBackward0]
+ 140509588075376 -> 140509588074704
+ 140509588075376 [label=CloneBackward0]
+ 140509588075664 -> 140509588075376
+ 140509588075664 [label=ExpandBackward0]
+ 140509588076144 -> 140509588075664
+ 140509588076144 [label=TransposeBackward0]
+ 140509588076816 -> 140509588076144
+ 140509588076816 [label=PermuteBackward0]
+ 140509588077296 -> 140509588076816
+ 140509588077296 [label=ViewBackward0]
+ 140509588077440 -> 140509588077296
+ 140509588077440 [label=ViewBackward0]
+ 140509588077920 -> 140509588077440
+ 140509588077920 [label=AddmmBackward0]
+ 140509588078544 -> 140509588077920
+ 140509588078544 [label=ToCopyBackward0]
+ 140509588104432 -> 140509588078544
+ 140509591342752 [label="encoder.layer.8.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509591342752 -> 140509588104432
+ 140509588104432 [label=AccumulateGrad]
+ 140509588075088 -> 140509588077920
+ 140509588075088 [label=ViewBackward0]
+ 140509588104720 -> 140509588075088
+ 140509588104720 [label=ToCopyBackward0]
+ 140509588105776 -> 140509588104720
+ 140509588105776 [label=ViewBackward0]
+ 140509588106256 -> 140509588105776
+ 140509588106256 [label=CloneBackward0]
+ 140517615730352 -> 140509588106256
+ 140517615730352 [label=ExpandBackward0]
+ 140517615730592 -> 140517615730352
+ 140517615730592 [label=UnsqueezeBackward0]
+ 140517615539152 -> 140517615730592
+ 140509588103568 -> 140509588077920
+ 140509588103568 [label=TBackward0]
+ 140509588103280 -> 140509588103568
+ 140509588103280 [label=ToCopyBackward0]
+ 140509588104960 -> 140509588103280
+ 140509591342672 [label="encoder.layer.8.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509591342672 -> 140509588104960
+ 140509588104960 [label=AccumulateGrad]
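+ // In layer 8 the shared encoder states (leaf 140517615539152) reach the
+ // cross-attention K/V through an Unsqueeze/Expand/Clone chain, i.e. they are
+ // broadcast (presumably across the batch) before the (768, 1408) projections.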
+ 140509588048624 -> 140509588048432
+ 140509588048624 [label=UnsafeViewBackward0]
+ 140509588048960 -> 140509588048624
+ 140509588048960 [label=CloneBackward0]
+ 140509588049392 -> 140509588048960
+ 140509588049392 [label=ExpandBackward0]
+ 140509588048816 -> 140509588049392
+ 140509588048816 [label=PermuteBackward0]
+ 140509588048720 -> 140509588048816
+ 140509588048720 [label=ViewBackward0]
+ 140509588075568 -> 140509588048720
+ 140509588075568 [label=ViewBackward0]
+ 140509588076624 -> 140509588075568
+ 140509588076624 [label=AddmmBackward0]
+ 140509588076432 -> 140509588076624
+ 140509588076432 [label=ToCopyBackward0]
+ 140509588103376 -> 140509588076432
+ 140509591340592 [label="encoder.layer.8.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509591340592 -> 140509588103376
+ 140509588103376 [label=AccumulateGrad]
+ 140509588077200 -> 140509588076624
+ 140509588077200 [label=ViewBackward0]
+ 140509588104000 -> 140509588077200
+ 140509588104000 [label=ToCopyBackward0]
+ 140509588105776 -> 140509588104000
+ 140509588074608 -> 140509588076624
+ 140509588074608 [label=TBackward0]
+ 140517615730640 -> 140509588074608
+ 140517615730640 [label=ToCopyBackward0]
+ 140517615730448 -> 140517615730640
+ 140509591342512 [label="encoder.layer.8.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509591342512 -> 140517615730448
+ 140517615730448 [label=AccumulateGrad]
+ 140509588047088 -> 140509588047376
+ 140509588047088 [label=TBackward0]
+ 140509588048144 -> 140509588047088
+ 140509588048144 [label=ToCopyBackward0]
+ 140509588048528 -> 140509588048144
+ 140509591340832 [label="encoder.layer.8.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591340832 -> 140509588048528
+ 140509588048528 [label=AccumulateGrad]
+ 140509588046896 -> 140509588046608
+ 140509588046320 -> 140509588046416
+ 140509591340512 [label="encoder.layer.8.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591340512 -> 140509588046320
+ 140509588046320 [label=AccumulateGrad]
+ 140509588045888 -> 140509588046416
+ 140509591340272 [label="encoder.layer.8.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591340272 -> 140509588045888
+ 140509588045888 [label=AccumulateGrad]
+ 140509588024432 -> 140509588024912
+ 140509588024432 [label=TBackward0]
+ 140509588046128 -> 140509588024432
+ 140509588046128 [label=ToCopyBackward0]
+ 140509588046512 -> 140509588046128
+ 140509591319952 [label="encoder.layer.8.experts.experts.0.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591319952 -> 140509588046512
+ 140509588046512 [label=AccumulateGrad]
+ 140509588023568 -> 140509588023856
+ 140509588023568 [label=TBackward0]
+ 140509588024576 -> 140509588023568
+ 140509588024576 [label=ToCopyBackward0]
+ 140509588025008 -> 140509588024576
+ 140509591320032 [label="encoder.layer.8.experts.experts.0.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591320032 -> 140509588025008
+ 140509588025008 [label=AccumulateGrad]
+ 140509588023376 -> 140509588023280
+ 140509588023088 -> 140509588023184
+ 140509591319792 [label="encoder.layer.8.experts.experts.0.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591319792 -> 140509588023088
+ 140509588023088 [label=AccumulateGrad]
+ 140509588022992 -> 140509588023184
+ 140509591319712 [label="encoder.layer.8.experts.experts.0.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591319712 -> 140509588022992
+ 140509588022992 [label=AccumulateGrad]
+ 140509588022800 -> 140509588022704
+ 140509588022800 [label=UnsqueezeBackward0]
+ 140509588023472 -> 140509588022800
+ 140509588023472 [label=NativeLayerNormBackward0]
+ 140509588023952 -> 140509588023472
+ 140509588023952 [label=AddBackward0]
+ 140509588024528 -> 140509588023952
+ 140509588024528 [label=NativeDropoutBackward0]
+ 140509588046032 -> 140509588024528
+ 140509588046032 [label=ViewBackward0]
+ 140509588045936 -> 140509588046032
+ 140509588045936 [label=AddmmBackward0]
+ 140509588047472 -> 140509588045936
+ 140509588047472 [label=ToCopyBackward0]
+ 140509588047520 -> 140509588047472
+ 140509591320512 [label="encoder.layer.8.experts.experts.1.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591320512 -> 140509588047520
+ 140509588047520 [label=AccumulateGrad]
+ 140509588047040 -> 140509588045936
+ 140509588047040 [label=ViewBackward0]
+ 140509588048048 -> 140509588047040
+ 140509588048048 [label=GeluBackward0]
+ 140509588049440 -> 140509588048048
+ 140509588049440 [label=ViewBackward0]
+ 140509588048912 -> 140509588049440
+ 140509588048912 [label=AddmmBackward0]
+ 140509588077680 -> 140509588048912
+ 140509588077680 [label=ToCopyBackward0]
+ 140517615729152 -> 140509588077680
+ 140509591319472 [label="encoder.layer.8.experts.experts.1.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591319472 -> 140517615729152
+ 140517615729152 [label=AccumulateGrad]
+ 140509588075856 -> 140509588048912
+ 140509588075856 [label=ViewBackward0]
+ 140517615268000 -> 140509588075856
+ 140517615268000 [label=ToCopyBackward0]
+ 140509588023376 -> 140517615268000
+ 140509588074560 -> 140509588048912
+ 140509588074560 [label=TBackward0]
+ 140517615267904 -> 140509588074560
+ 140517615267904 [label=ToCopyBackward0]
+ 140517615268144 -> 140517615267904
+ 140509591319552 [label="encoder.layer.8.experts.experts.1.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591319552 -> 140517615268144
+ 140517615268144 [label=AccumulateGrad]
+ 140509588046992 -> 140509588045936
+ 140509588046992 [label=TBackward0]
+ 140509588075184 -> 140509588046992
+ 140509588075184 [label=ToCopyBackward0]
+ 140517615730304 -> 140509588075184
+ 140509591319312 [label="encoder.layer.8.experts.experts.1.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591319312 -> 140517615730304
+ 140517615730304 [label=AccumulateGrad]
+ 140509588023376 -> 140509588023952
+ 140509588023760 -> 140509588023472
+ 140509591319072 [label="encoder.layer.8.experts.experts.1.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591319072 -> 140509588023760
+ 140509588023760 [label=AccumulateGrad]
+ 140509588022896 -> 140509588023472
+ 140509591318992 [label="encoder.layer.8.experts.experts.1.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591318992 -> 140509588022896
+ 140509588022896 [label=AccumulateGrad]
+ 140509588022416 -> 140509588022512
+ 140509588022416 [label=UnsqueezeBackward0]
+ 140509588024240 -> 140509588022416
+ 140509588024240 [label=UnsqueezeBackward0]
+ 140509588024096 -> 140509588024240
+ 140509588024096 [label=MulBackward0]
+ 140509588047664 -> 140509588024096
+ 140509588047664 [label=SoftmaxBackward0]
+ 140509588049200 -> 140509588047664
+ 140509588049200 [label=MmBackward0]
+ 140509588046080 -> 140509588049200
+ 140509588046080 [label=ToCopyBackward0]
+ 140517615268048 -> 140509588046080
+ 140517615268048 [label=DivBackward0]
+ 140517615268336 -> 140517615268048
+ 140517615268336 [label=SumBackward1]
+ 140517615268432 -> 140517615268336
+ 140517615268432 [label=MulBackward0]
+ 140509588023376 -> 140517615268432
+ 140517615267952 -> 140509588049200
+ 140517615267952 [label=TBackward0]
+ 140517615268384 -> 140517615267952
+ 140517615268384 [label=ToCopyBackward0]
+ 140517615268480 -> 140517615268384
+ 140509591321392 [label="encoder.layer.8.experts.gate.weight
+ (2, 768)" fillcolor=lightblue]
+ 140509591321392 -> 140517615268480
+ 140517615268480 [label=AccumulateGrad]
+ 140509588021840 -> 140509587963664
+ 140509588021840 [label=IndexBackward0]
+ 140509588023136 -> 140509588021840
+ 140509588023136 [label=NativeLayerNormBackward0]
+ 140509588022608 -> 140509588023136
+ 140509588022608 [label=AddBackward0]
+ 140517615268528 -> 140509588022608
+ 140517615268528 [label=NativeDropoutBackward0]
+ 140517615268192 -> 140517615268528
+ 140517615268192 [label=ViewBackward0]
+ 140517615268672 -> 140517615268192
+ 140517615268672 [label=AddmmBackward0]
+ 140517615268768 -> 140517615268672
+ 140517615268768 [label=ToCopyBackward0]
+ 140517615268960 -> 140517615268768
+ 140509591339792 [label="encoder.layer.8.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591339792 -> 140517615268960
+ 140517615268960 [label=AccumulateGrad]
+ 140517615268720 -> 140517615268672
+ 140517615268720 [label=ViewBackward0]
+ 140517615269008 -> 140517615268720
+ 140517615269008 [label=GeluBackward0]
+ 140517615269104 -> 140517615269008
+ 140517615269104 [label=ViewBackward0]
+ 140517615269200 -> 140517615269104
+ 140517615269200 [label=AddmmBackward0]
+ 140517615269296 -> 140517615269200
+ 140517615269296 [label=ToCopyBackward0]
+ 140517615269488 -> 140517615269296
+ 140509591340032 [label="encoder.layer.8.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591340032 -> 140517615269488
+ 140517615269488 [label=AccumulateGrad]
+ 140517615269248 -> 140517615269200
+ 140517615269248 [label=ViewBackward0]
+ 140517615269536 -> 140517615269248
+ 140517615269536 [label=ToCopyBackward0]
+ 140517615268288 -> 140517615269536
+ 140517615268288 [label=SliceBackward0]
+ 140517615269680 -> 140517615268288
+ 140517615269680 [label=SliceBackward0]
+ 140517615269776 -> 140517615269680
+ 140517615269776 [label=SliceBackward0]
+ 140509588076912 -> 140517615269776
+ 140517615268912 -> 140517615269200
+ 140517615268912 [label=TBackward0]
+ 140517615269440 -> 140517615268912
+ 140517615269440 [label=ToCopyBackward0]
+ 140517615269872 -> 140517615269440
+ 140509591340352 [label="encoder.layer.8.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591340352 -> 140517615269872
+ 140517615269872 [label=AccumulateGrad]
+ 140517615268576 -> 140517615268672
+ 140517615268576 [label=TBackward0]
+ 140517615269152 -> 140517615268576
+ 140517615269152 [label=ToCopyBackward0]
+ 140517615269632 -> 140517615269152
+ 140509591340112 [label="encoder.layer.8.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591340112 -> 140517615269632
+ 140517615269632 [label=AccumulateGrad]
+ 140517615268288 -> 140509588022608
+ 140509588022176 -> 140509588023136
+ 140509591339872 [label="encoder.layer.8.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591339872 -> 140509588022176
+ 140509588022176 [label=AccumulateGrad]
+ 140509588046560 -> 140509588023136
+ 140509591339552 [label="encoder.layer.8.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591339552 -> 140509588046560
+ 140509588046560 [label=AccumulateGrad]
+ 140509588021312 -> 140509587991520
+ 140509588021312 [label=TBackward0]
+ 140509588021648 -> 140509588021312
+ 140509588021648 [label=ToCopyBackward0]
+ 140509588048336 -> 140509588021648
+ 140509591321632 [label="encoder.layer.9.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591321632 -> 140509588048336
+ 140509588048336 [label=AccumulateGrad]
+ 140509587991472 -> 140509587991568
+ 140509587991472 [label=UnsafeViewBackward0]
+ 140509587992144 -> 140509587991472
+ 140509587992144 [label=CloneBackward0]
+ 140509587992528 -> 140509587992144
+ 140509587992528 [label=ExpandBackward0]
+ 140509587991856 -> 140509587992528
+ 140509587991856 [label=TransposeBackward0]
+ 140509588022320 -> 140509587991856
+ 140509588022320 [label=PermuteBackward0]
+ 140509588021936 -> 140509588022320
+ 140509588021936 [label=ViewBackward0]
+ 140517615268624 -> 140509588021936
+ 140517615268624 [label=ViewBackward0]
+ 140517615268864 -> 140517615268624
+ 140517615268864 [label=AddmmBackward0]
+ 140517615269392 -> 140517615268864
+ 140517615269392 [label=ToCopyBackward0]
+ 140517615269584 -> 140517615269392
+ 140509591322192 [label="encoder.layer.9.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509591322192 -> 140517615269584
+ 140517615269584 [label=AccumulateGrad]
+ 140517615269344 -> 140517615268864
+ 140517615269344 [label=ViewBackward0]
+ 140517615269920 -> 140517615269344
+ 140517615269920 [label=ToCopyBackward0]
+ 140509587963664 -> 140517615269920
+ 140517615268096 -> 140517615268864
+ 140517615268096 [label=TBackward0]
+ 140517615269056 -> 140517615268096
+ 140517615269056 [label=ToCopyBackward0]
+ 140517615270064 -> 140517615269056
+ 140509591321872 [label="encoder.layer.9.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591321872 -> 140517615270064
+ 140517615270064 [label=AccumulateGrad]
+ 140509587990128 -> 140509587989840
+ 140509587990128 [label=UnsafeViewBackward0]
+ 140509587990512 -> 140509587990128
+ 140509587990512 [label=CloneBackward0]
+ 140509587990800 -> 140509587990512
+ 140509587990800 [label=ExpandBackward0]
+ 140509587991040 -> 140509587990800
+ 140509587991040 [label=PermuteBackward0]
+ 140509587990224 -> 140509587991040
+ 140509587990224 [label=ViewBackward0]
+ 140509587992336 -> 140509587990224
+ 140509587992336 [label=ViewBackward0]
+ 140509587990080 -> 140509587992336
+ 140509587990080 [label=AddmmBackward0]
+ 140509588021360 -> 140509587990080
+ 140509588021360 [label=ToCopyBackward0]
+ 140517615269824 -> 140509588021360
+ 140509591322432 [label="encoder.layer.9.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509591322432 -> 140517615269824
+ 140517615269824 [label=AccumulateGrad]
+ 140509588021744 -> 140509587990080
+ 140509588021744 [label=ViewBackward0]
+ 140517615270160 -> 140509588021744
+ 140517615270160 [label=ToCopyBackward0]
+ 140509587963664 -> 140517615270160
+ 140517615268240 -> 140509587990080
+ 140517615268240 [label=TBackward0]
+ 140517615269728 -> 140517615268240
+ 140517615269728 [label=ToCopyBackward0]
+ 140517615270208 -> 140517615269728
+ 140509591322112 [label="encoder.layer.9.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591322112 -> 140517615270208
+ 140517615270208 [label=AccumulateGrad]
+ 140509587988688 -> 140509587988784
+ 140509587988688 [label=TBackward0]
+ 140509587989648 -> 140509587988688
+ 140509587989648 [label=ToCopyBackward0]
+ 140509587989936 -> 140509587989648
+ 140509591321712 [label="encoder.layer.9.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591321712 -> 140509587989936
+ 140509587989936 [label=AccumulateGrad]
+ 140509587963664 -> 140509587963280
+ 140509587963376 -> 140509587963040
+ 140509591321232 [label="encoder.layer.9.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591321232 -> 140509587963376
+ 140509587963376 [label=AccumulateGrad]
+ 140509587962032 -> 140509587963040
+ 140509591321472 [label="encoder.layer.9.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591321472 -> 140509587962032
+ 140509587962032 [label=AccumulateGrad]
+ 140509587961120 -> 140509587961600
+ 140509587961120 [label=TBackward0]
+ 140509587962224 -> 140509587961120
+ 140509587962224 [label=ToCopyBackward0]
+ 140509587962896 -> 140509587962224
+ 140509591311760 [label="encoder.layer.9.experts.experts.0.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591311760 -> 140509587962896
+ 140509587962896 [label=AccumulateGrad]
+ 140509587960688 -> 140509587960976
+ 140509587960688 [label=TBackward0]
+ 140509587961744 -> 140509587960688
+ 140509587961744 [label=ToCopyBackward0]
+ 140509587962608 -> 140509587961744
+ 140509591311440 [label="encoder.layer.9.experts.experts.0.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591311440 -> 140509587962608
+ 140509587962608 [label=AccumulateGrad]
+ 140509587960496 -> 140509587960112
+ 140509587960208 -> 140509588463424
+ 140509591311200 [label="encoder.layer.9.experts.experts.0.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591311200 -> 140509587960208
+ 140509587960208 [label=AccumulateGrad]
+ 140509587960016 -> 140509588463424
+ 140509591311520 [label="encoder.layer.9.experts.experts.0.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591311520 -> 140509587960016
+ 140509587960016 [label=AccumulateGrad]
+ 140509588463376 -> 140509588463184
+ 140509588463376 [label=UnsqueezeBackward0]
+ 140509587960160 -> 140509588463376
+ 140509587960160 [label=NativeLayerNormBackward0]
+ 140509587960640 -> 140509587960160
+ 140509587960640 [label=AddBackward0]
+ 140509587963184 -> 140509587960640
+ 140509587963184 [label=NativeDropoutBackward0]
+ 140509587961648 -> 140509587963184
+ 140509587961648 [label=ViewBackward0]
+ 140509587962320 -> 140509587961648
+ 140509587962320 [label=AddmmBackward0]
+ 140509587963472 -> 140509587962320
+ 140509587963472 [label=ToCopyBackward0]
+ 140509587989168 -> 140509587963472
+ 140509591311920 [label="encoder.layer.9.experts.experts.1.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591311920 -> 140509587989168
+ 140509587989168 [label=AccumulateGrad]
+ 140509587963568 -> 140509587962320
+ 140509587963568 [label=ViewBackward0]
+ 140509587989744 -> 140509587963568
+ 140509587989744 [label=GeluBackward0]
+ 140509587989072 -> 140509587989744
+ 140509587989072 [label=ViewBackward0]
+ 140509587990560 -> 140509587989072
+ 140509587990560 [label=AddmmBackward0]
+ 140509587991280 -> 140509587990560
+ 140509587991280 [label=ToCopyBackward0]
+ 140509588022224 -> 140509587991280
+ 140509591311280 [label="encoder.layer.9.experts.experts.1.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591311280 -> 140509588022224
+ 140509588022224 [label=AccumulateGrad]
+ 140509587990992 -> 140509587990560
+ 140509587990992 [label=ViewBackward0]
+ 140517615270016 -> 140509587990992
+ 140517615270016 [label=ToCopyBackward0]
+ 140509587960496 -> 140517615270016
+ 140509587988880 -> 140509587990560
+ 140509587988880 [label=TBackward0]
+ 140517615268816 -> 140509587988880
+ 140517615268816 [label=ToCopyBackward0]
+ 140517615270112 -> 140517615268816
+ 140509591310960 [label="encoder.layer.9.experts.experts.1.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591310960 -> 140517615270112
+ 140517615270112 [label=AccumulateGrad]
+ 140509587961264 -> 140509587962320
+ 140509587961264 [label=TBackward0]
+ 140509587989456 -> 140509587961264
+ 140509587989456 [label=ToCopyBackward0]
+ 140509587992048 -> 140509587989456
+ 140509591310720 [label="encoder.layer.9.experts.experts.1.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591310720 -> 140509587992048
+ 140509587992048 [label=AccumulateGrad]
+ 140509587960496 -> 140509587960640
+ 140509587960592 -> 140509587960160
+ 140509591310480 [label="encoder.layer.9.experts.experts.1.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591310480 -> 140509587960592
+ 140509587960592 [label=AccumulateGrad]
+ 140509587959920 -> 140509587960160
+ 140509591310800 [label="encoder.layer.9.experts.experts.1.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591310800 -> 140509587959920
+ 140509587959920 [label=AccumulateGrad]
+ 140509588463280 -> 140509588462944
+ 140509588463280 [label=UnsqueezeBackward0]
+ 140509588463472 -> 140509588463280
+ 140509588463472 [label=UnsqueezeBackward0]
+ 140509587962704 -> 140509588463472
+ 140509587962704 [label=MulBackward0]
+ 140509587963856 -> 140509587962704
+ 140509587963856 [label=SoftmaxBackward0]
+ 140509587990320 -> 140509587963856
+ 140509587990320 [label=MmBackward0]
+ 140509587960304 -> 140509587990320
+ 140509587960304 [label=ToCopyBackward0]
+ 140517615270304 -> 140509587960304
+ 140517615270304 [label=DivBackward0]
+ 140517615270496 -> 140517615270304
+ 140517615270496 [label=SumBackward1]
+ 140517615270592 -> 140517615270496
+ 140517615270592 [label=MulBackward0]
+ 140509587960496 -> 140517615270592
+ 140517615269968 -> 140509587990320
+ 140517615269968 [label=TBackward0]
+ 140517615270544 -> 140517615269968
+ 140517615270544 [label=ToCopyBackward0]
+ 140517615270640 -> 140517615270544
+ 140509591313200 [label="encoder.layer.9.experts.gate.weight
+ (2, 768)" fillcolor=lightblue]
+ 140509591313200 -> 140517615270640
+ 140517615270640 [label=AccumulateGrad]
+ 140509588462416 -> 140509588428880
+ 140509588462416 [label=IndexBackward0]
+ 140509588462896 -> 140509588462416
+ 140509588462896 [label=NativeLayerNormBackward0]
+ 140509587963088 -> 140509588462896
+ 140509587963088 [label=AddBackward0]
+ 140517615270688 -> 140509587963088
+ 140517615270688 [label=NativeDropoutBackward0]
+ 140517615270352 -> 140517615270688
+ 140517615270352 [label=ViewBackward0]
+ 140517615270832 -> 140517615270352
+ 140517615270832 [label=AddmmBackward0]
+ 140517615270928 -> 140517615270832
+ 140517615270928 [label=ToCopyBackward0]
+ 140517615271120 -> 140517615270928
+ 140509591320672 [label="encoder.layer.9.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591320672 -> 140517615271120
+ 140517615271120 [label=AccumulateGrad]
+ 140517615270880 -> 140517615270832
+ 140517615270880 [label=ViewBackward0]
+ 140517615271168 -> 140517615270880
+ 140517615271168 [label=GeluBackward0]
+ 140517615271264 -> 140517615271168
+ 140517615271264 [label=ViewBackward0]
+ 140517615271360 -> 140517615271264
+ 140517615271360 [label=AddmmBackward0]
+ 140517615271456 -> 140517615271360
+ 140517615271456 [label=ToCopyBackward0]
+ 140517615271648 -> 140517615271456
+ 140509591320752 [label="encoder.layer.9.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591320752 -> 140517615271648
+ 140517615271648 [label=AccumulateGrad]
+ 140517615271408 -> 140517615271360
+ 140517615271408 [label=ViewBackward0]
+ 140517615271696 -> 140517615271408
+ 140517615271696 [label=ToCopyBackward0]
+ 140517615270448 -> 140517615271696
+ 140517615270448 [label=SliceBackward0]
+ 140517615271840 -> 140517615270448
+ 140517615271840 [label=SliceBackward0]
+ 140517615271888 -> 140517615271840
+ 140517615271888 [label=SliceBackward0]
+ 140509587963040 -> 140517615271888
+ 140517615271072 -> 140517615271360
+ 140517615271072 [label=TBackward0]
+ 140517615271600 -> 140517615271072
+ 140517615271600 [label=ToCopyBackward0]
+ 140517615271552 -> 140517615271600
+ 140509591320912 [label="encoder.layer.9.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591320912 -> 140517615271552
+ 140517615271552 [label=AccumulateGrad]
+ 140517615270736 -> 140517615270832
+ 140517615270736 [label=TBackward0]
+ 140517615271312 -> 140517615270736
+ 140517615271312 [label=ToCopyBackward0]
+ 140517615271792 -> 140517615271312
+ 140509591320992 [label="encoder.layer.9.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591320992 -> 140517615271792
+ 140517615271792 [label=AccumulateGrad]
+ 140517615270448 -> 140509587963088
+ 140509587962128 -> 140509588462896
+ 140509591320432 [label="encoder.layer.9.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591320432 -> 140509587962128
+ 140509587962128 [label=AccumulateGrad]
+ 140509587961072 -> 140509588462896
+ 140509591318592 [label="encoder.layer.9.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591318592 -> 140509587961072
+ 140509587961072 [label=AccumulateGrad]
+ 140509588461168 -> 140509588462128
+ 140509588461168 [label=TBackward0]
+ 140509588462512 -> 140509588461168
+ 140509588462512 [label=ToCopyBackward0]
+ 140509587988592 -> 140509588462512
+ 140509591313440 [label="encoder.layer.10.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591313440 -> 140509587988592
+ 140509587988592 [label=AccumulateGrad]
+ 140509588461072 -> 140509588460784
+ 140509588461072 [label=UnsafeViewBackward0]
+ 140509588461456 -> 140509588461072
+ 140509588461456 [label=CloneBackward0]
+ 140509588461744 -> 140509588461456
+ 140509588461744 [label=ExpandBackward0]
+ 140509588462224 -> 140509588461744
+ 140509588462224 [label=TransposeBackward0]
+ 140509588463088 -> 140509588462224
+ 140509588463088 [label=PermuteBackward0]
+ 140509588462800 -> 140509588463088
+ 140509588462800 [label=ViewBackward0]
+ 140517615270784 -> 140509588462800
+ 140517615270784 [label=ViewBackward0]
+ 140517615271024 -> 140517615270784
+ 140517615271024 [label=AddmmBackward0]
+ 140517615271744 -> 140517615271024
+ 140517615271744 [label=ToCopyBackward0]
+ 140517615321248 -> 140517615271744
+ 140509591313600 [label="encoder.layer.10.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509591313600 -> 140517615321248
+ 140517615321248 [label=AccumulateGrad]
+ 140517615271504 -> 140517615271024
+ 140517615271504 [label=ViewBackward0]
+ 140517615321296 -> 140517615271504
+ 140517615321296 [label=ToCopyBackward0]
+ 140509588428880 -> 140517615321296
+ 140517615270256 -> 140517615271024
+ 140517615270256 [label=TBackward0]
+ 140517615321152 -> 140517615270256
+ 140517615321152 [label=ToCopyBackward0]
+ 140517615321440 -> 140517615321152
+ 140509591313680 [label="encoder.layer.10.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591313680 -> 140517615321440
+ 140517615321440 [label=AccumulateGrad]
+ 140509588429936 -> 140509588430704
+ 140509588429936 [label=UnsafeViewBackward0]
+ 140509588460112 -> 140509588429936
+ 140509588460112 [label=CloneBackward0]
+ 140509588460400 -> 140509588460112
+ 140509588460400 [label=ExpandBackward0]
+ 140509588460688 -> 140509588460400
+ 140509588460688 [label=PermuteBackward0]
+ 140509588459632 -> 140509588460688
+ 140509588459632 [label=ViewBackward0]
+ 140509588461504 -> 140509588459632
+ 140509588461504 [label=ViewBackward0]
+ 140509588462704 -> 140509588461504
+ 140509588462704 [label=AddmmBackward0]
+ 140509588461024 -> 140509588462704
+ 140509588461024 [label=ToCopyBackward0]
+ 140517615271216 -> 140509588461024
+ 140509591313840 [label="encoder.layer.10.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509591313840 -> 140517615271216
+ 140517615271216 [label=AccumulateGrad]
+ 140509588459680 -> 140509588462704
+ 140509588459680 [label=ViewBackward0]
+ 140517615321536 -> 140509588459680
+ 140517615321536 [label=ToCopyBackward0]
+ 140509588428880 -> 140517615321536
+ 140517615270400 -> 140509588462704
+ 140517615270400 [label=TBackward0]
+ 140517615321392 -> 140517615270400
+ 140517615321392 [label=ToCopyBackward0]
+ 140517615321584 -> 140517615321392
+ 140509591313920 [label="encoder.layer.10.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591313920 -> 140517615321584
+ 140517615321584 [label=AccumulateGrad]
+ 140509588428928 -> 140509588429360
+ 140509588428928 [label=TBackward0]
+ 140509588430128 -> 140509588428928
+ 140509588430128 [label=ToCopyBackward0]
+ 140509588430368 -> 140509588430128
+ 140509591313120 [label="encoder.layer.10.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591313120 -> 140509588430368
+ 140509588430368 [label=AccumulateGrad]
+ 140509588428880 -> 140509588428784
+ 140509588428448 -> 140509588428592
+ 140509591312640 [label="encoder.layer.10.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591312640 -> 140509588428448
+ 140509588428448 [label=AccumulateGrad]
+ 140509588427824 -> 140509588428592
+ 140509591312880 [label="encoder.layer.10.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591312880 -> 140509588427824
+ 140509588427824 [label=AccumulateGrad]
+ 140509588426816 -> 140509588427536
+ 140509588426816 [label=TBackward0]
+ 140509588427728 -> 140509588426816
+ 140509588427728 [label=ToCopyBackward0]
+ 140509588428400 -> 140509588427728
+ 140509591312720 [label="encoder.layer.10.crossattention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591312720 -> 140509588428400
+ 140509588428400 [label=AccumulateGrad]
+ 140509588405840 -> 140509588405504
+ 140509588405840 [label=UnsafeViewBackward0]
+ 140509588406032 -> 140509588405840
+ 140509588406032 [label=CloneBackward0]
+ 140509588427008 -> 140509588406032
+ 140509588427008 [label=ExpandBackward0]
+ 140509588427488 -> 140509588427008
+ 140509588427488 [label=TransposeBackward0]
+ 140509588428208 -> 140509588427488
+ 140509588428208 [label=PermuteBackward0]
+ 140509588428688 -> 140509588428208
+ 140509588428688 [label=ViewBackward0]
+ 140509588429264 -> 140509588428688
+ 140509588429264 [label=ViewBackward0]
+ 140509588429744 -> 140509588429264
+ 140509588429744 [label=AddmmBackward0]
+ 140509588430320 -> 140509588429744
+ 140509588430320 [label=ToCopyBackward0]
+ 140509588460208 -> 140509588430320
+ 140509591312480 [label="encoder.layer.10.crossattention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509591312480 -> 140509588460208
+ 140509588460208 [label=AccumulateGrad]
+ 140509588429648 -> 140509588429744
+ 140509588429648 [label=ViewBackward0]
+ 140509588460592 -> 140509588429648
+ 140509588460592 [label=ToCopyBackward0]
+ 140509588461264 -> 140509588460592
+ 140509588461264 [label=ViewBackward0]
+ 140517615270976 -> 140509588461264
+ 140517615270976 [label=CloneBackward0]
+ 140509588459584 -> 140517615270976
+ 140509588459584 [label=ExpandBackward0]
+ 140517615321632 -> 140509588459584
+ 140517615321632 [label=UnsqueezeBackward0]
+ 140517615539152 -> 140517615321632
+ 140509588426864 -> 140509588429744
+ 140509588426864 [label=TBackward0]
+ 140509588461936 -> 140509588426864
+ 140509588461936 [label=ToCopyBackward0]
+ 140509588460880 -> 140509588461936
+ 140509591312400 [label="encoder.layer.10.crossattention.self.key.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509591312400 -> 140509588460880
+ 140509588460880 [label=AccumulateGrad]
+ 140509588404064 -> 140509588404208
+ 140509588404064 [label=UnsafeViewBackward0]
+ 140509588404880 -> 140509588404064
+ 140509588404880 [label=CloneBackward0]
+ 140509588405168 -> 140509588404880
+ 140509588405168 [label=ExpandBackward0]
+ 140509588405552 -> 140509588405168
+ 140509588405552 [label=PermuteBackward0]
+ 140509588404304 -> 140509588405552
+ 140509588404304 [label=ViewBackward0]
+ 140509588405936 -> 140509588404304
+ 140509588405936 [label=ViewBackward0]
+ 140509588427968 -> 140509588405936
+ 140509588427968 [label=AddmmBackward0]
+ 140509588428112 -> 140509588427968
+ 140509588428112 [label=ToCopyBackward0]
+ 140509588459920 -> 140509588428112
+ 140509591310560 [label="encoder.layer.10.crossattention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509591310560 -> 140509588459920
+ 140509588459920 [label=AccumulateGrad]
+ 140509588428976 -> 140509588427968
+ 140509588428976 [label=ViewBackward0]
+ 140509588429888 -> 140509588428976
+ 140509588429888 [label=ToCopyBackward0]
+ 140509588461264 -> 140509588429888
+ 140509588426960 -> 140509588427968
+ 140509588426960 [label=TBackward0]
+ 140517615321680 -> 140509588426960
+ 140517615321680 [label=ToCopyBackward0]
+ 140517615321344 -> 140517615321680
+ 140509591312240 [label="encoder.layer.10.crossattention.self.value.weight
+ (768, 1408)" fillcolor=lightblue]
+ 140509591312240 -> 140517615321344
+ 140517615321344 [label=AccumulateGrad]
+ 140509588402576 -> 140509588402864
+ 140509588402576 [label=TBackward0]
+ 140509588403584 -> 140509588402576
+ 140509588403584 [label=ToCopyBackward0]
+ 140509588404016 -> 140509588403584
+ 140509591311040 [label="encoder.layer.10.crossattention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591311040 -> 140509588404016
+ 140509588404016 [label=AccumulateGrad]
+ 140509588402384 -> 140509588373360
+ 140509588372784 -> 140509588373456
+ 140509591293760 [label="encoder.layer.10.crossattention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591293760 -> 140509588372784
+ 140509588372784 [label=AccumulateGrad]
+ 140509588402240 -> 140509588373456
+ 140509591293520 [label="encoder.layer.10.crossattention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591293520 -> 140509588402240
+ 140509588402240 [label=AccumulateGrad]
+ 140509588372016 -> 140509588372496
+ 140509588372016 [label=TBackward0]
+ 140509588372688 -> 140509588372016
+ 140509588372688 [label=ToCopyBackward0]
+ 140509588373168 -> 140509588372688
+ 140509591289920 [label="encoder.layer.10.experts.experts.0.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591289920 -> 140509588373168
+ 140509588373168 [label=AccumulateGrad]
+ 140509588371008 -> 140509588371440
+ 140509588371008 [label=TBackward0]
+ 140509588372208 -> 140509588371008
+ 140509588372208 [label=ToCopyBackward0]
+ 140509588372928 -> 140509588372208
+ 140509591290240 [label="encoder.layer.10.experts.experts.0.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591290240 -> 140509588372928
+ 140509588372928 [label=AccumulateGrad]
+ 140509588370960 -> 140509588370864
+ 140509588370528 -> 140509588370672
+ 140509591285328 [label="encoder.layer.10.experts.experts.0.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591285328 -> 140509588370528
+ 140509588370528 [label=AccumulateGrad]
+ 140509588370576 -> 140509588370672
+ 140509591285248 [label="encoder.layer.10.experts.experts.0.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591285248 -> 140509588370576
+ 140509588370576 [label=AccumulateGrad]
+ 140509588370384 -> 140509588370192
+ 140509588370384 [label=UnsqueezeBackward0]
+ 140509588371056 -> 140509588370384
+ 140509588371056 [label=NativeLayerNormBackward0]
+ 140509588371536 -> 140509588371056
+ 140509588371536 [label=AddBackward0]
+ 140509588373072 -> 140509588371536
+ 140509588373072 [label=NativeDropoutBackward0]
+ 140509588371968 -> 140509588373072
+ 140509588371968 [label=ViewBackward0]
+ 140509588402288 -> 140509588371968
+ 140509588402288 [label=AddmmBackward0]
+ 140509588403248 -> 140509588402288
+ 140509588403248 [label=ToCopyBackward0]
+ 140509588403440 -> 140509588403248
+ 140509591284528 [label="encoder.layer.10.experts.experts.1.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591284528 -> 140509588403440
+ 140509588403440 [label=AccumulateGrad]
+ 140509588402960 -> 140509588402288
+ 140509588402960 [label=ViewBackward0]
+ 140509588403536 -> 140509588402960
+ 140509588403536 [label=GeluBackward0]
+ 140509588405360 -> 140509588403536
+ 140509588405360 [label=ViewBackward0]
+ 140509588404592 -> 140509588405360
+ 140509588404592 [label=AddmmBackward0]
+ 140509588429456 -> 140509588404592
+ 140509588429456 [label=ToCopyBackward0]
+ 140517615321776 -> 140509588429456
+ 140509591284768 [label="encoder.layer.10.experts.experts.1.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591284768 -> 140517615321776
+ 140517615321776 [label=AccumulateGrad]
+ 140509588427248 -> 140509588404592
+ 140509588427248 [label=ViewBackward0]
+ 140517615321488 -> 140509588427248
+ 140517615321488 [label=ToCopyBackward0]
+ 140509588370960 -> 140517615321488
+ 140509588404688 -> 140509588404592
+ 140509588404688 [label=TBackward0]
+ 140517615321728 -> 140509588404688
+ 140517615321728 [label=ToCopyBackward0]
+ 140517615321968 -> 140517615321728
+ 140509591285088 [label="encoder.layer.10.experts.experts.1.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591285088 -> 140517615321968
+ 140517615321968 [label=AccumulateGrad]
+ 140509588402768 -> 140509588402288
+ 140509588402768 [label=TBackward0]
+ 140509588405648 -> 140509588402768
+ 140509588405648 [label=ToCopyBackward0]
+ 140509588405072 -> 140509588405648
+ 140509591284848 [label="encoder.layer.10.experts.experts.1.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591284848 -> 140509588405072
+ 140509588405072 [label=AccumulateGrad]
+ 140509588370960 -> 140509588371536
+ 140509588371344 -> 140509588371056
+ 140509591284608 [label="encoder.layer.10.experts.experts.1.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591284608 -> 140509588371344
+ 140509588371344 [label=AccumulateGrad]
+ 140509588370480 -> 140509588371056
+ 140509591285008 [label="encoder.layer.10.experts.experts.1.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591285008 -> 140509588370480
+ 140509588370480 [label=AccumulateGrad]
+ 140509588370000 -> 140509588370096
+ 140509588370000 [label=UnsqueezeBackward0]
+ 140509588371824 -> 140509588370000
+ 140509588371824 [label=UnsqueezeBackward0]
+ 140509588371728 -> 140509588371824
+ 140509588371728 [label=MulBackward0]
+ 140509588370048 -> 140509588371728
+ 140509588370048 [label=SoftmaxBackward0]
+ 140509588403824 -> 140509588370048
+ 140509588403824 [label=MmBackward0]
+ 140517615321824 -> 140509588403824
+ 140517615321824 [label=ToCopyBackward0]
+ 140517615321872 -> 140517615321824
+ 140517615321872 [label=DivBackward0]
+ 140517615322160 -> 140517615321872
+ 140517615322160 [label=SumBackward1]
+ 140517615322256 -> 140517615322160
+ 140517615322256 [label=MulBackward0]
+ 140509588370960 -> 140517615322256
+ 140517615322064 -> 140509588403824
+ 140517615322064 [label=TBackward0]
+ 140517615322208 -> 140517615322064
+ 140517615322208 [label=ToCopyBackward0]
+ 140517615322304 -> 140517615322208
+ 140509591291120 [label="encoder.layer.10.experts.gate.weight
+ (2, 768)" fillcolor=lightblue]
+ 140509591291120 -> 140517615322304
+ 140517615322304 [label=AccumulateGrad]
+ 140509588369520 -> 140509588315344
+ 140509588369520 [label=IndexBackward0]
+ 140509588370768 -> 140509588369520
+ 140509588370768 [label=NativeLayerNormBackward0]
+ 140509588372448 -> 140509588370768
+ 140509588372448 [label=AddBackward0]
+ 140517615322352 -> 140509588372448
+ 140517615322352 [label=NativeDropoutBackward0]
+ 140517615322016 -> 140517615322352
+ 140517615322016 [label=ViewBackward0]
+ 140517615322496 -> 140517615322016
+ 140517615322496 [label=AddmmBackward0]
+ 140517615322592 -> 140517615322496
+ 140517615322592 [label=ToCopyBackward0]
+ 140517615322784 -> 140517615322592
+ 140509591293040 [label="encoder.layer.10.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591293040 -> 140517615322784
+ 140517615322784 [label=AccumulateGrad]
+ 140517615322544 -> 140517615322496
+ 140517615322544 [label=ViewBackward0]
+ 140517615322832 -> 140517615322544
+ 140517615322832 [label=GeluBackward0]
+ 140517615322928 -> 140517615322832
+ 140517615322928 [label=ViewBackward0]
+ 140517615323024 -> 140517615322928
+ 140517615323024 [label=AddmmBackward0]
+ 140517615323120 -> 140517615323024
+ 140517615323120 [label=ToCopyBackward0]
+ 140517615323312 -> 140517615323120
+ 140509591293280 [label="encoder.layer.10.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591293280 -> 140517615323312
+ 140517615323312 [label=AccumulateGrad]
+ 140517615323072 -> 140517615323024
+ 140517615323072 [label=ViewBackward0]
+ 140517615323360 -> 140517615323072
+ 140517615323360 [label=ToCopyBackward0]
+ 140517615322112 -> 140517615323360
+ 140517615322112 [label=SliceBackward0]
+ 140517615323504 -> 140517615322112
+ 140517615323504 [label=SliceBackward0]
+ 140517615323600 -> 140517615323504
+ 140517615323600 [label=SliceBackward0]
+ 140509588428592 -> 140517615323600
+ 140517615322736 -> 140517615323024
+ 140517615322736 [label=TBackward0]
+ 140517615323264 -> 140517615322736
+ 140517615323264 [label=ToCopyBackward0]
+ 140517615323696 -> 140517615323264
+ 140509591293600 [label="encoder.layer.10.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591293600 -> 140517615323696
+ 140517615323696 [label=AccumulateGrad]
+ 140517615322400 -> 140517615322496
+ 140517615322400 [label=TBackward0]
+ 140517615322976 -> 140517615322400
+ 140517615322976 [label=ToCopyBackward0]
+ 140517615323456 -> 140517615322976
+ 140509591293360 [label="encoder.layer.10.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591293360 -> 140517615323456
+ 140517615323456 [label=AccumulateGrad]
+ 140517615322112 -> 140509588372448
+ 140509588369808 -> 140509588370768
+ 140509591293120 [label="encoder.layer.10.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591293120 -> 140509588369808
+ 140509588369808 [label=AccumulateGrad]
+ 140509588403104 -> 140509588370768
+ 140509591292800 [label="encoder.layer.10.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591292800 -> 140509588403104
+ 140509588403104 [label=AccumulateGrad]
+ 140509588347344 -> 140509588348304
+ 140509588347344 [label=TBackward0]
+ 140509588348880 -> 140509588347344
+ 140509588348880 [label=ToCopyBackward0]
+ 140509588402480 -> 140509588348880
+ 140509591291360 [label="encoder.layer.11.attention.self.query.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591291360 -> 140509588402480
+ 140509588402480 [label=AccumulateGrad]
+ 140509588347104 -> 140509588347248
+ 140509588347104 [label=UnsafeViewBackward0]
+ 140509588347920 -> 140509588347104
+ 140509588347920 [label=CloneBackward0]
+ 140509588348208 -> 140509588347920
+ 140509588348208 [label=ExpandBackward0]
+ 140509588348688 -> 140509588348208
+ 140509588348688 [label=TransposeBackward0]
+ 140509588347632 -> 140509588348688
+ 140509588347632 [label=PermuteBackward0]
+ 140509588369712 -> 140509588347632
+ 140509588369712 [label=ViewBackward0]
+ 140517615322448 -> 140509588369712
+ 140517615322448 [label=ViewBackward0]
+ 140517615322688 -> 140517615322448
+ 140517615322688 [label=AddmmBackward0]
+ 140517615323216 -> 140517615322688
+ 140517615323216 [label=ToCopyBackward0]
+ 140517615323408 -> 140517615323216
+ 140509591291920 [label="encoder.layer.11.attention.self.key.bias
+ (768)" fillcolor=lightblue]
+ 140509591291920 -> 140517615323408
+ 140517615323408 [label=AccumulateGrad]
+ 140517615323168 -> 140517615322688
+ 140517615323168 [label=ViewBackward0]
+ 140517615323744 -> 140517615323168
+ 140517615323744 [label=ToCopyBackward0]
+ 140509588315344 -> 140517615323744
+ 140517615321200 -> 140517615322688
+ 140517615321200 [label=TBackward0]
+ 140517615322880 -> 140517615321200
+ 140517615322880 [label=ToCopyBackward0]
+ 140517615323888 -> 140517615322880
+ 140509591291600 [label="encoder.layer.11.attention.self.key.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591291600 -> 140517615323888
+ 140517615323888 [label=AccumulateGrad]
+ 140509588345808 -> 140509588345616
+ 140509588345808 [label=UnsafeViewBackward0]
+ 140509588346144 -> 140509588345808
+ 140509588346144 [label=CloneBackward0]
+ 140509588346576 -> 140509588346144
+ 140509588346576 [label=ExpandBackward0]
+ 140509588346864 -> 140509588346576
+ 140509588346864 [label=PermuteBackward0]
+ 140509588346000 -> 140509588346864
+ 140509588346000 [label=ViewBackward0]
+ 140509588348112 -> 140509588346000
+ 140509588348112 [label=ViewBackward0]
+ 140509588348400 -> 140509588348112
+ 140509588348400 [label=AddmmBackward0]
+ 140509588369616 -> 140509588348400
+ 140509588369616 [label=ToCopyBackward0]
+ 140517615323648 -> 140509588369616
+ 140509591292160 [label="encoder.layer.11.attention.self.value.bias
+ (768)" fillcolor=lightblue]
+ 140509591292160 -> 140517615323648
+ 140517615323648 [label=AccumulateGrad]
+ 140509588369904 -> 140509588348400
+ 140509588369904 [label=ViewBackward0]
+ 140517615323984 -> 140509588369904
+ 140517615323984 [label=ToCopyBackward0]
+ 140509588315344 -> 140517615323984
+ 140517615321920 -> 140509588348400
+ 140517615321920 [label=TBackward0]
+ 140517615323552 -> 140517615321920
+ 140517615323552 [label=ToCopyBackward0]
+ 140517615324032 -> 140517615323552
+ 140509591291840 [label="encoder.layer.11.attention.self.value.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591291840 -> 140517615324032
+ 140517615324032 [label=AccumulateGrad]
+ 140509588315536 -> 140509588315824
+ 140509588315536 [label=TBackward0]
+ 140509588345328 -> 140509588315536
+ 140509588345328 [label=ToCopyBackward0]
+ 140509588345712 -> 140509588345328
+ 140509591291440 [label="encoder.layer.11.attention.output.dense.weight
+ (768, 768)" fillcolor=lightblue]
+ 140509591291440 -> 140509588345712
+ 140509588345712 [label=AccumulateGrad]
+ 140509588315344 -> 140509588314960
+ 140509588315056 -> 140509588314768
+ 140509591290960 [label="encoder.layer.11.attention.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591290960 -> 140509588315056
+ 140509588315056 [label=AccumulateGrad]
+ 140509588313568 -> 140509588314768
+ 140509591291200 [label="encoder.layer.11.attention.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591291200 -> 140509588313568
+ 140509588313568 [label=AccumulateGrad]
+ 140509588312272 -> 140509588313328
+ 140509588312272 [label=TBackward0]
+ 140509588313904 -> 140509588312272
+ 140509588313904 [label=ToCopyBackward0]
+ 140509588314576 -> 140509588313904
+ 140509591260912 [label="encoder.layer.11.experts.experts.0.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591260912 -> 140509588314576
+ 140509588314576 [label=AccumulateGrad]
+ 140509588312848 -> 140509588313232
+ 140509588312848 [label=TBackward0]
+ 140509588312128 -> 140509588312848
+ 140509588312128 [label=ToCopyBackward0]
+ 140509588314192 -> 140509588312128
+ 140509591260592 [label="encoder.layer.11.experts.experts.0.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591260592 -> 140509588314192
+ 140509588314192 [label=AccumulateGrad]
+ 140509588312608 -> 140509591317376
+ 140509591314832 -> 140509591314640
+ 140509591260352 [label="encoder.layer.11.experts.experts.0.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591260352 -> 140509591314832
+ 140509591314832 [label=AccumulateGrad]
+ 140509591317568 -> 140509591314640
+ 140509591260832 [label="encoder.layer.11.experts.experts.0.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591260832 -> 140509591317568
+ 140509591317568 [label=AccumulateGrad]
+ 140509591315408 -> 140509588282864
+ 140509591315408 [label=UnsqueezeBackward0]
+ 140509591268800 -> 140509591315408
+ 140509591268800 [label=NativeLayerNormBackward0]
+ 140509588313088 -> 140509591268800
+ 140509588313088 [label=AddBackward0]
+ 140509588314864 -> 140509588313088
+ 140509588314864 [label=NativeDropoutBackward0]
+ 140509588312224 -> 140509588314864
+ 140509588312224 [label=ViewBackward0]
+ 140509588314000 -> 140509588312224
+ 140509588314000 [label=AddmmBackward0]
+ 140509588315008 -> 140509588314000
+ 140509588315008 [label=ToCopyBackward0]
+ 140509588315920 -> 140509588315008
+ 140509591259952 [label="encoder.layer.11.experts.experts.1.output_query.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591259952 -> 140509588315920
+ 140509588315920 [label=AccumulateGrad]
+ 140509588315152 -> 140509588314000
+ 140509588315152 [label=ViewBackward0]
+ 140509588315488 -> 140509588315152
+ 140509588315488 [label=GeluBackward0]
+ 140509588345232 -> 140509588315488
+ 140509588345232 [label=ViewBackward0]
+ 140509588346384 -> 140509588345232
+ 140509588346384 [label=AddmmBackward0]
+ 140509588347056 -> 140509588346384
+ 140509588347056 [label=ToCopyBackward0]
+ 140509588345904 -> 140509588347056
+ 140509591260192 [label="encoder.layer.11.experts.experts.1.intermediate_query.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591260192 -> 140509588345904
+ 140509588345904 [label=AccumulateGrad]
+ 140509588346624 -> 140509588346384
+ 140509588346624 [label=ViewBackward0]
+ 140517615323840 -> 140509588346624
+ 140517615323840 [label=ToCopyBackward0]
+ 140509588312608 -> 140517615323840
+ 140509588346096 -> 140509588346384
+ 140509588346096 [label=TBackward0]
+ 140517615322640 -> 140509588346096
+ 140517615322640 [label=ToCopyBackward0]
+ 140517615323936 -> 140517615322640
+ 140509591260112 [label="encoder.layer.11.experts.experts.1.intermediate_query.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591260112 -> 140517615323936
+ 140517615323936 [label=AccumulateGrad]
+ 140509588312464 -> 140509588314000
+ 140509588312464 [label=TBackward0]
+ 140509588344944 -> 140509588312464
+ 140509588344944 [label=ToCopyBackward0]
+ 140509588347728 -> 140509588344944
+ 140509591259872 [label="encoder.layer.11.experts.experts.1.output_query.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591259872 -> 140509588347728
+ 140509588347728 [label=AccumulateGrad]
+ 140509588312608 -> 140509588313088
+ 140509588313136 -> 140509591268800
+ 140509591259632 [label="encoder.layer.11.experts.experts.1.output_query.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591259632 -> 140509588313136
+ 140509588313136 [label=AccumulateGrad]
+ 140509588312752 -> 140509591268800
+ 140509591260432 [label="encoder.layer.11.experts.experts.1.output_query.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591260432 -> 140509588312752
+ 140509588312752 [label=AccumulateGrad]
+ 140509588282672 -> 140509588283152
+ 140509588282672 [label=UnsqueezeBackward0]
+ 140509591318432 -> 140509588282672
+ 140509591318432 [label=UnsqueezeBackward0]
+ 140509588314384 -> 140509591318432
+ 140509588314384 [label=MulBackward0]
+ 140509588315440 -> 140509588314384
+ 140509588315440 [label=SoftmaxBackward0]
+ 140509588345520 -> 140509588315440
+ 140509588345520 [label=MmBackward0]
+ 140509588312656 -> 140509588345520
+ 140509588312656 [label=ToCopyBackward0]
+ 140517615324128 -> 140509588312656
+ 140517615324128 [label=DivBackward0]
+ 140517615324320 -> 140517615324128
+ 140517615324320 [label=SumBackward1]
+ 140517615324416 -> 140517615324320
+ 140517615324416 [label=MulBackward0]
+ 140509588312608 -> 140517615324416
+ 140517615323792 -> 140509588345520
+ 140517615323792 [label=TBackward0]
+ 140517615324368 -> 140517615323792
+ 140517615324368 [label=ToCopyBackward0]
+ 140517615324464 -> 140517615324368
+ 140509591282928 [label="encoder.layer.11.experts.gate.weight
+ (2, 768)" fillcolor=lightblue]
+ 140509591282928 -> 140517615324464
+ 140517615324464 [label=AccumulateGrad]
+ 140509588282432 -> 140509588281712
+ 140509588282432 [label=IndexBackward0]
+ 140509588283248 -> 140509588282432
+ 140509588283248 [label=IndexBackward0]
+ 140509591317952 -> 140509588283248
+ 140509591317952 [label=NativeLayerNormBackward0]
+ 140509588345040 -> 140509591317952
+ 140509588345040 [label=AddBackward0]
+ 140517615324560 -> 140509588345040
+ 140517615324560 [label=NativeDropoutBackward0]
+ 140517615324608 -> 140517615324560
+ 140517615324608 [label=ViewBackward0]
+ 140517615324704 -> 140517615324608
+ 140517615324704 [label=AddmmBackward0]
+ 140517615324800 -> 140517615324704
+ 140517615324800 [label=ToCopyBackward0]
+ 140517615324992 -> 140517615324800
+ 140509591290400 [label="encoder.layer.11.output.dense.bias
+ (768)" fillcolor=lightblue]
+ 140509591290400 -> 140517615324992
+ 140517615324992 [label=AccumulateGrad]
+ 140517615324752 -> 140517615324704
+ 140517615324752 [label=ViewBackward0]
+ 140517615325040 -> 140517615324752
+ 140517615325040 [label=GeluBackward0]
+ 140517615325136 -> 140517615325040
+ 140517615325136 [label=ViewBackward0]
+ 140517615324944 -> 140517615325136
+ 140517615324944 [label=AddmmBackward0]
+ 140517615382736 -> 140517615324944
+ 140517615382736 [label=ToCopyBackward0]
+ 140517615382928 -> 140517615382736
+ 140509591290480 [label="encoder.layer.11.intermediate.dense.bias
+ (3072)" fillcolor=lightblue]
+ 140509591290480 -> 140517615382928
+ 140517615382928 [label=AccumulateGrad]
+ 140517615382688 -> 140517615324944
+ 140517615382688 [label=ViewBackward0]
+ 140517615382976 -> 140517615382688
+ 140517615382976 [label=ToCopyBackward0]
+ 140517615324512 -> 140517615382976
+ 140517615324512 [label=SliceBackward0]
+ 140517615383120 -> 140517615324512
+ 140517615383120 [label=SliceBackward0]
+ 140517615383216 -> 140517615383120
+ 140517615383216 [label=SliceBackward0]
+ 140509588314768 -> 140517615383216
+ 140517615382592 -> 140517615324944
+ 140517615382592 [label=TBackward0]
+ 140517615382880 -> 140517615382592
+ 140517615382880 [label=ToCopyBackward0]
+ 140517615383312 -> 140517615382880
+ 140509591290640 [label="encoder.layer.11.intermediate.dense.weight
+ (3072, 768)" fillcolor=lightblue]
+ 140509591290640 -> 140517615383312
+ 140517615383312 [label=AccumulateGrad]
+ 140517615324080 -> 140517615324704
+ 140517615324080 [label=TBackward0]
+ 140517615324896 -> 140517615324080
+ 140517615324896 [label=ToCopyBackward0]
+ 140517615383072 -> 140517615324896
+ 140509591290720 [label="encoder.layer.11.output.dense.weight
+ (768, 3072)" fillcolor=lightblue]
+ 140509591290720 -> 140517615383072
+ 140517615383072 [label=AccumulateGrad]
+ 140517615324512 -> 140509588345040
+ 140509588314672 -> 140509591317952
+ 140509591290160 [label="encoder.layer.11.output.LayerNorm.weight
+ (768)" fillcolor=lightblue]
+ 140509591290160 -> 140509588314672
+ 140509588314672 [label=AccumulateGrad]
+ 140509588313712 -> 140509591317952
+ 140509591290000 [label="encoder.layer.11.output.LayerNorm.bias
+ (768)" fillcolor=lightblue]
+ 140509591290000 -> 140509588313712
+ 140509588313712 [label=AccumulateGrad]
+ 140509588281712 -> 140509988778688
+}
diff --git a/test.pdf/backward_graph.pdf b/test.pdf/backward_graph.pdf
new file mode 100644
index 0000000..7f162b0
Binary files /dev/null and b/test.pdf/backward_graph.pdf differ
diff --git a/test.txt b/test.txt
new file mode 100644
index 0000000..516c092
--- /dev/null
+++ b/test.txt
@@ -0,0 +1,360 @@
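+# How the DOT/PDF backward-graph artifacts in this commit can be regenerated.
+# Assumption: they were produced with torchviz's make_dot (the exact call is
+# not part of this diff); `model` and `batch` are placeholders, as elsewhere
+# in this file.
+from torchviz import make_dot
+
+loss = model(batch).loss  # any scalar output works as the graph root
+dot = make_dot(loss, params=dict(model.named_parameters()))  # named params become the blue nodes
+dot.render("backward_graph", format="pdf")  # writes backward_graph (DOT) and backward_graph.pdf
+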
+# Names of the trainable parameters belonging to layer 10's MoE experts and gate.
+tmp_name = [name for name, p in model.named_parameters() if (p.requires_grad and '10.expert' in name)]
+
+# The corresponding parameter tensors, in the same order.
+tmp = [p for name, p in model.named_parameters() if (p.requires_grad and '10.expert' in name)]
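+
+# Hedged sketch of the expert routing that the backward graph above encodes
+# (MmBackward0 -> SoftmaxBackward0 -> MulBackward0 around experts.gate.weight,
+# shape (2, 768), with SumBackward1/DivBackward0 pooling the hidden states).
+# Function and argument names here are illustrative, not the repo's module code.
+import torch
+import torch.nn.functional as F
+
+def route_two_experts(hidden, gate_weight, expert_outs):
+    """hidden: (B, S, 768); gate_weight: (2, 768); expert_outs: two (B, S, 768) tensors."""
+    pooled = hidden.sum(dim=1) / hidden.size(1)      # SumBackward1 + DivBackward0 (mean pool)
+    logits = pooled @ gate_weight.t()                # MmBackward0 against the gate weight
+    probs = F.softmax(logits, dim=-1)                # SoftmaxBackward0: raw routing probabilities
+    stacked = torch.stack(expert_outs, dim=1)        # experts stacked, matching the Unsqueeze/Cat nodes
+    return (probs[:, :, None, None] * stacked).sum(dim=1)  # MulBackward0: probability-weighted mix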
+
+tensor([[-1.4032e-02, 3.7242e-03, 8.4997e-03, -3.4016e-03, -6.4855e-03,
+ 4.3595e-02, 3.4423e-02, -8.6274e-03, -1.9702e-02, 9.1813e-03,
+ 1.1643e-02, 2.3939e-02, -2.0908e-02, 3.4555e-03, 9.1636e-03,
+ 1.5413e-02, 2.4148e-02, -1.0880e-03, 1.1193e-02, -1.3591e-02,
+ 9.3484e-03, 1.5999e-02, -9.6086e-04, 3.8322e-02, -8.0687e-03,
+ -1.4056e-02, 3.9486e-02, 3.5167e-02, -9.3226e-03, -1.0493e-02,
+ -2.5795e-02, -9.7541e-03, 4.4437e-03, 7.7226e-03, 7.5210e-03,
+ -1.3526e-02, -5.0316e-03, -1.1149e-02, 6.0583e-03, 2.0564e-02,
+ -6.4477e-03, 1.4170e-02, -3.7847e-02, 1.1780e-02, 1.3321e-02,
+ -8.2501e-03, -1.0298e-02, 1.4805e-02, -1.2432e-02, -1.9159e-02,
+ -5.7095e-04, -3.8618e-02, -2.4230e-02, -1.4991e-03, -1.4114e-02,
+ -1.5365e-02, 1.5640e-02, -4.8623e-02, -2.9991e-02, 1.2796e-02,
+ -4.9917e-03, 2.3846e-03, 7.7368e-03, 1.2913e-02, 1.5300e-02,
+ 8.5125e-03, 1.1582e-02, 8.1161e-03, 4.2259e-03, 7.6109e-03,
+ -2.0747e-02, -3.5099e-03, 2.2282e-02, 5.0493e-02, -1.7849e-02,
+ -3.7106e-02, -1.4944e-02, -1.4582e-02, -2.2458e-02, -4.6173e-05,
+          ...(remaining gate-weight values elided for readability)...,
+          4.8204e-03,  3.9503e-02, -4.1356e-03]], requires_grad=True)
+model.Qformer.bert.encoder.layer[10].experts.gate.weight  (the tensor printed above)
+
+layer 11
+grad is all zeros:
+model.Qformer.bert.encoder.layer[11].output.dense.weight.grad
+model.Qformer.bert.encoder.layer[11].intermediate.dense.weight.grad
+
+grad contains nan:
+model.Qformer.bert.encoder.layer[11].attention.output.dense.weight.grad
+model.Qformer.bert.encoder.layer[11].attention.self.query.weight.grad
+model.Qformer.bert.encoder.layer[11].experts.intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[11].experts.output_query.dense.weight.grad
+
+grad is None:
+model.Qformer.bert.encoder.layer[11].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[11].output_query.dense.weight.grad
+
+layer 8
+grad is all zeros:
+model.Qformer.bert.encoder.layer[8].experts.experts[0].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[8].experts.experts[2].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[8].experts.experts[0].output_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[8].experts.experts[2].output_query.dense.weight.grad
+
+grad contains nan:
+model.Qformer.bert.encoder.layer[8].experts.experts[1].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[8].experts.experts[1].output_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[8].intermediate_query.dense.weight.grad  (base Qformer branch, not an expert)
+
+grad is None:
+model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad is None
+model.Qformer.bert.encoder.layer[8].experts.gate.weight.requires_grad is True
+
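+The groups above were collected by hand; a minimal sketch that buckets every
+parameter gradient the same way (assuming `model` is the loaded checkpoint):
+
+import torch
+
+def bucket_grads(model):
+    buckets = {"zero": [], "nan": [], "none": [], "ok": []}
+    for name, p in model.named_parameters():
+        if p.grad is None:
+            buckets["none"].append(name)        # e.g. experts.gate.weight
+        elif torch.isnan(p.grad).any():
+            buckets["nan"].append(name)
+        elif torch.count_nonzero(p.grad) == 0:
+            buckets["zero"].append(name)
+        else:
+            buckets["ok"].append(name)
+    return buckets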
+
+model.Qformer.bert.encoder.layer[6].experts.gate.weight
+Qformer.bert.encoder.layer.6.experts.gate.weight  (same parameter: attribute path vs. named_parameters key)
+
+tensor([[-0.0089, -0.0123, -0.0168, ..., -0.0072, 0.0295, -0.0167],
+ [ 0.0305, 0.0277, -0.0215, ..., 0.0149, 0.0016, -0.0415],
+ [ 0.0199, 0.0151, 0.0237, ..., 0.0007, 0.0023, 0.0167]],
+ requires_grad=True)
+
+tensor([[-0.0089, -0.0123, -0.0168, ..., -0.0072, 0.0295, -0.0167],
+ [ 0.0305, 0.0277, -0.0215, ..., 0.0149, 0.0016, -0.0415],
+ [ 0.0199, 0.0151, 0.0237, ..., 0.0007, 0.0023, 0.0167]],
+ requires_grad=True)
+
+(printed twice with identical values — consistent with this gate weight never
+being updated, since its grad is None above)
+
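+A direct way to confirm the gate weight is frozen in practice (sketch; assumes
+`model` and one completed optimizer step):
+
+import torch
+before = model.Qformer.bert.encoder.layer[6].experts.gate.weight.detach().clone()
+# ... run one forward / backward / optimizer.step() ...
+after = model.Qformer.bert.encoder.layer[6].experts.gate.weight.detach()
+print(torch.equal(before, after))   # True here — the weight never moved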
+
+tensor([[ 4.5972e-02, -1.5231e-02, -6.9533e-03,
+          ...(full row of values elided for readability)...,
+         -6.8762e-03,  1.4032e-02, -4.3389e-03]], requires_grad=True)
\ No newline at end of file
diff --git a/test1.txt b/test1.txt
new file mode 100644
index 0000000..a6e7a8b
--- /dev/null
+++ b/test1.txt
@@ -0,0 +1,109 @@
+# Render the autograd graph of the Q-Former output so we can see which
+# parameters are reachable from the loss (torchviz).
+from torchviz import make_dot
+dot = make_dot(query_output.last_hidden_state, params=dict(self.Qformer.bert.named_parameters()))
+log_dir = '/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/'
+dot.render(filename="Pre_PromptMoE_RawProb_backward_graph", directory=log_dir, format="pdf")
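+# Note: every parameter reachable from the output appears in the rendered
+# graph as an AccumulateGrad leaf; if a parameter (e.g. experts.gate.weight)
+# has no such node, autograd sees no path from the output to it, which matches
+# the grad-is-None observations below.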
+
+
+# Pre-Prompt-MoE
+model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad
+model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
+model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad
+model.Qformer.bert.encoder.layer[6].experts.experts[0].dense1.weight.grad
+model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight.grad
+model.Qformer.bert.encoder.layer[8].experts.experts[0].dense1.weight.grad
+model.Qformer.bert.encoder.layer[8].experts.experts[1].dense1.weight.grad
+model.Qformer.bert.encoder.layer[8].experts.experts[2].dense1.weight.grad
+
+
+model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
+model.Qformer.bert.encoder.layer[9].intermediate_query.dense.weight
+model.Qformer.bert.encoder.layer[9].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[10].intermediate.dense.weight.grad
+model.Qformer.bert.encoder.layer[11].intermediate.dense.weight.grad
+
+model.Qformer.bert.encoder.layer[10].intermediate_query.dense.weight
+model.Qformer.bert.encoder.layer[10].experts.experts[2].dense1.weight
+model.Qformer.bert.encoder.layer[10].experts.experts[1].dense1.weight
+model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight
+model.Qformer.bert.encoder.layer[10].intermediate_query.dense.weight == model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight  # elementwise — returns a bool tensor; see the torch.equal sketch below
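+# For a single yes/no answer, use torch.equal (sketch — checks whether expert
+# 0's dense1 was initialized from the shared intermediate_query weight):
+import torch
+same = torch.equal(
+    model.Qformer.bert.encoder.layer[10].intermediate_query.dense.weight,
+    model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight,
+)
+print(same)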
+
+# Pre-MoE gate-sentence
+# model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad is not updated
+
+# Pre-MoE gate-token
+# updates normally
+
+# Post-MoE gate-sentence
+model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
+# model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad updates normally
+# model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad is all 0 / -0
+# model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad is all 0 / -0
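+
+# Why a gate can end up with grad None or all zeros (illustrative sketch, not
+# the actual PromptMoE code): if the routing probability is only used to
+# hard-select an expert, it never enters the output expression, so autograd has
+# nothing to propagate into gate.weight; multiplying the expert output by the
+# probability restores the gradient path.
+import torch
+import torch.nn as nn
+
+gate = nn.Linear(768, 3, bias=False)
+experts = nn.ModuleList(nn.Linear(768, 768) for _ in range(3))
+x = torch.randn(1, 768)
+probs = gate(x).softmax(dim=-1)
+
+y_hard = experts[int(probs.argmax())](x)      # prob only picks an index
+y_hard.sum().backward()
+print(gate.weight.grad)                       # None — gate is not in this graph
+
+y_soft = sum(probs[0, i] * experts[i](x) for i in range(3))
+y_soft.sum().backward()
+print(gate.weight.grad is not None)           # True — grads reach the gate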
+
+# Route-MoE
+# Pre-MoE: the beam_scores it computes are problematic
+
+# Post-Route updates the parameters of several experts, and also updates the gate parameters
+# Layer 6 updated two experts' parameters (layer 6, layer 8)
+# model.Qformer.bert.encoder.layer[11].intermediate.dense.weight.grad is 0? all zeros
+# model.Qformer.bert.encoder.layer[11].output.dense.weight.grad
+
+model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad
+model.Qformer.bert.encoder.layer[6].experts.experts[0].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[6].experts.experts[1].intermediate_query.dense.weight.grad
+
+model.Qformer.bert.encoder.layer[7].experts.gate.weight.grad
+model.Qformer.bert.encoder.layer[7].experts.experts[0].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[7].experts.experts[1].intermediate_query.dense.weight.grad
+
+model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
+model.Qformer.bert.encoder.layer[8].experts.experts[0].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[8].experts.experts[1].intermediate_query.dense.weight.grad
+
+model.Qformer.bert.encoder.layer[9].experts.gate.weight.grad
+model.Qformer.bert.encoder.layer[9].experts.experts[0].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[9].experts.experts[1].intermediate_query.dense.weight.grad
+
+model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad
+model.Qformer.bert.encoder.layer[10].experts.experts[0].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[10].experts.experts[1].intermediate_query.dense.weight.grad
+
+model.Qformer.bert.encoder.layer[11].experts.gate.weight.grad
+model.Qformer.bert.encoder.layer[11].experts.experts[0].intermediate_query.dense.weight.grad
+model.Qformer.bert.encoder.layer[11].experts.experts[1].intermediate_query.dense.weight.grad
+
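+# The checklist above can be generated in a loop instead of line by line
+# (sketch; assumes layers 6..11 all have the experts/gate layout used above):
+for i in range(6, 12):
+    layer = model.Qformer.bert.encoder.layer[i]
+    print(i, "gate", layer.experts.gate.weight.grad)
+    for j, e in enumerate(layer.experts.experts):
+        print(i, j, e.intermediate_query.dense.weight.grad)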
+
+(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.10.experts.experts.0.dense1.weight']
+[Parameter containing:
+tensor([[-0.0328, 0.0414, 0.0010, ..., -0.0068, 0.0244, 0.0587],
+ [ 0.0120, 0.0458, 0.0171, ..., -0.0439, -0.0107, -0.0397],
+ [ 0.0239, 0.0191, -0.0145, ..., 0.0008, -0.0067, 0.0090],
+ ...,
+ [ 0.0174, -0.0465, -0.0106, ..., -0.0095, 0.0153, -0.0195],
+ [-0.0151, -0.0082, -0.0320, ..., -0.0016, -0.0232, -0.0147],
+ [ 0.0142, -0.0286, 0.0161, ..., -0.0160, -0.0306, -0.0272]],
+ device='cuda:0', requires_grad=True)]
+(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.8.experts.experts.0.dense1.weight']
+[Parameter containing:
+tensor([[ 0.0024, 0.0218, -0.0186, ..., -0.0178, -0.0067, 0.0820],
+ [-0.0759, -0.0002, -0.0548, ..., 0.0292, 0.0531, 0.0779],
+ [-0.0220, -0.0037, -0.0520, ..., -0.0426, -0.0261, -0.0357],
+ ...,
+ [-0.0448, 0.0471, 0.0133, ..., -0.0062, -0.0217, -0.0203],
+ [ 0.0532, 0.0197, 0.0320, ..., -0.0010, -0.0838, 0.0682],
+ [ 0.0284, 0.0038, -0.0007, ..., -0.0305, 0.0296, 0.0056]],
+ device='cuda:0', requires_grad=True)]
+(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.6.experts.experts.0.dense1.weight']
+[Parameter containing:
+tensor([[ 6.5176e-02, -4.6473e-02, -2.7396e-02, ..., 2.1774e-03,
+ 6.1457e-02, 1.9180e-03],
+ [ 7.3707e-03, 6.1392e-02, -2.7108e-02, ..., 4.0778e-02,
+ -1.9791e-02, -1.1612e-02],
+ [ 2.1193e-02, -3.8323e-02, -6.0238e-02, ..., -1.4539e-02,
+ 9.2965e-02, 3.9153e-02],
+ ...,
+ [ 5.3203e-03, -1.7276e-02, -3.2191e-02, ..., -1.6435e-02,
+ -1.8553e-02, -2.8158e-02],
+ [-6.9853e-02, 9.2719e-03, -1.8895e-03, ..., -2.6425e-02,
+ 1.4880e-03, 3.4505e-02],
+ [-1.2168e-03, 3.7038e-02, 4.8047e-02, ..., -3.4523e-03,
+ -1.3030e-05, -1.4778e-02]], device='cuda:0', requires_grad=True)]
\ No newline at end of file