mirror of
https://github.com/Vision-CAIR/MiniGPT-4.git
synced 2025-04-23 22:20:50 +00:00
109 lines
5.8 KiB
Plaintext
109 lines
5.8 KiB
Plaintext
from torchviz import make_dot
|
||
dot = make_dot(query_output.last_hidden_state, params=dict(self.Qformer.bert.named_parameters()))
|
||
log_dir = '/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/'
|
||
dot.render(filename="Post_Route_Universal_PromptMoE_RawProb_backward_graph", directory=log_dir, format="pdf")
|
||
|
||
|
||
# Pre-Prompt-MoE
|
||
model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad
|
||
model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
|
||
model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad
|
||
model.Qformer.bert.encoder.layer[6].experts.experts[0].dense1.weight.grad
|
||
model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight.grad
|
||
model.Qformer.bert.encoder.layer[8].experts.experts[0].dense1.weight.grad
|
||
model.Qformer.bert.encoder.layer[8].experts.experts[1].dense1.weight.grad
|
||
model.Qformer.bert.encoder.layer[8].experts.experts[2].dense1.weight.grad
|
||
|
||
|
||
model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
|
||
model.Qformer.bert.encoder.layer[9].intermediate_query.dense.weight
|
||
model.Qformer.bert.encoder.layer[9].intermediate_query.dense.weight.grad
|
||
model.Qformer.bert.encoder.layer[10].intermediate.dense.weight.grad
|
||
model.Qformer.bert.encoder.layer[11].intermediate.dense.weight.grad
|
||
|
||
model.Qformer.bert.encoder.layer[10].intermediate_query.dense.weight
|
||
model.Qformer.bert.encoder.layer[10].experts.experts[2].dense1.weight
|
||
model.Qformer.bert.encoder.layer[10].experts.experts[1].dense1.weight
|
||
model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight
|
||
model.Qformer.bert.encoder.layer[10].intermediate_query.dense.weight == model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight
|
||
|
||
# Pre-MoE gate-sentence
|
||
# model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad is not updated
|
||
|
||
# Pre-MoE gate-token
|
||
# gradients update normally
|
||
|
||
# Post-MoE gate-sentence
|
||
model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
|
||
# model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad updates normally
|
||
# model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad is all 0/-0
|
||
# model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad is all 0/-0
|
||
|
||
# Route-MoE
|
||
# Pre-MoE: the computed beam_scores are problematic
|
||
|
||
# Post-Route: updates the parameters of multiple experts; also updates the gate parameters
|
||
# Layer 6 updated the parameters of two experts (layer 6, layer 8)
|
||
# model.Qformer.bert.encoder.layer[11].intermediate.dense.weight.grad is 0? Yes, all zeros
|
||
# model.Qformer.bert.encoder.layer[11].output.dense.weight.grad
|
||
|
||
model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad
|
||
model.Qformer.bert.encoder.layer[6].experts.experts[0].intermediate_query.dense.weight.grad
|
||
model.Qformer.bert.encoder.layer[6].experts.experts[1].intermediate_query.dense.weight.grad
|
||
|
||
model.Qformer.bert.encoder.layer[7].experts.gate.weight.grad
|
||
model.Qformer.bert.encoder.layer[7].experts.experts[0].intermediate_query.dense.weight.grad
|
||
model.Qformer.bert.encoder.layer[7].experts.experts[1].intermediate_query.dense.weight.grad
|
||
|
||
model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
|
||
model.Qformer.bert.encoder.layer[8].experts.experts[0].intermediate_query.dense.weight.grad
|
||
model.Qformer.bert.encoder.layer[8].experts.experts[1].intermediate_query.dense.weight.grad
|
||
|
||
model.Qformer.bert.encoder.layer[9].experts.gate.weight.grad
|
||
model.Qformer.bert.encoder.layer[9].experts.experts[0].intermediate_query.dense.weight.grad
|
||
model.Qformer.bert.encoder.layer[9].experts.experts[1].intermediate_query.dense.weight.grad
|
||
|
||
model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad
|
||
model.Qformer.bert.encoder.layer[10].experts.experts[0].intermediate_query.dense.weight.grad
|
||
model.Qformer.bert.encoder.layer[10].experts.experts[1].intermediate_query.dense.weight.grad
|
||
|
||
model.Qformer.bert.encoder.layer[11].experts.gate.weight.grad
|
||
model.Qformer.bert.encoder.layer[11].experts.experts[0].intermediate_query.dense.weight.grad
|
||
model.Qformer.bert.encoder.layer[11].experts.experts[1].intermediate_query.dense.weight.grad
|
||
|
||
|
||
(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.10.experts.experts.0.dense1.weight']
|
||
[Parameter containing:
|
||
tensor([[-0.0328, 0.0414, 0.0010, ..., -0.0068, 0.0244, 0.0587],
|
||
[ 0.0120, 0.0458, 0.0171, ..., -0.0439, -0.0107, -0.0397],
|
||
[ 0.0239, 0.0191, -0.0145, ..., 0.0008, -0.0067, 0.0090],
|
||
...,
|
||
[ 0.0174, -0.0465, -0.0106, ..., -0.0095, 0.0153, -0.0195],
|
||
[-0.0151, -0.0082, -0.0320, ..., -0.0016, -0.0232, -0.0147],
|
||
[ 0.0142, -0.0286, 0.0161, ..., -0.0160, -0.0306, -0.0272]],
|
||
device='cuda:0', requires_grad=True)]
|
||
(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.8.experts.experts.0.dense1.weight']
|
||
[Parameter containing:
|
||
tensor([[ 0.0024, 0.0218, -0.0186, ..., -0.0178, -0.0067, 0.0820],
|
||
[-0.0759, -0.0002, -0.0548, ..., 0.0292, 0.0531, 0.0779],
|
||
[-0.0220, -0.0037, -0.0520, ..., -0.0426, -0.0261, -0.0357],
|
||
...,
|
||
[-0.0448, 0.0471, 0.0133, ..., -0.0062, -0.0217, -0.0203],
|
||
[ 0.0532, 0.0197, 0.0320, ..., -0.0010, -0.0838, 0.0682],
|
||
[ 0.0284, 0.0038, -0.0007, ..., -0.0305, 0.0296, 0.0056]],
|
||
device='cuda:0', requires_grad=True)]
|
||
(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.6.experts.experts.0.dense1.weight']
|
||
[Parameter containing:
|
||
tensor([[ 6.5176e-02, -4.6473e-02, -2.7396e-02, ..., 2.1774e-03,
|
||
6.1457e-02, 1.9180e-03],
|
||
[ 7.3707e-03, 6.1392e-02, -2.7108e-02, ..., 4.0778e-02,
|
||
-1.9791e-02, -1.1612e-02],
|
||
[ 2.1193e-02, -3.8323e-02, -6.0238e-02, ..., -1.4539e-02,
|
||
9.2965e-02, 3.9153e-02],
|
||
...,
|
||
[ 5.3203e-03, -1.7276e-02, -3.2191e-02, ..., -1.6435e-02,
|
||
-1.8553e-02, -2.8158e-02],
|
||
[-6.9853e-02, 9.2719e-03, -1.8895e-03, ..., -2.6425e-02,
|
||
1.4880e-03, 3.4505e-02],
|
||
[-1.2168e-03, 3.7038e-02, 4.8047e-02, ..., -3.4523e-03,
|
||
-1.3030e-05, -1.4778e-02]], device='cuda:0', requires_grad=True)] |