MiniGPT-4/test1.txt

109 lines
5.8 KiB
Plaintext
Raw Normal View History

from torchviz import make_dot
dot = make_dot(query_output.last_hidden_state, params=dict(self.Qformer.bert.named_parameters()))
log_dir = '/mnt/pfs-guan-ssai/nlu/wanghanzi/multimodal/PromptMoE/'
dot.render(filename="Pre_PromptMoE_RawProb_backward_graph", directory=log_dir, format="pdf")
# Pre-Prompt-MoE
model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad
model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad
model.Qformer.bert.encoder.layer[6].experts.experts[0].dense1.weight.grad
model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight.grad
model.Qformer.bert.encoder.layer[8].experts.experts[0].dense1.weight.grad
model.Qformer.bert.encoder.layer[8].experts.experts[1].dense1.weight.grad
model.Qformer.bert.encoder.layer[8].experts.experts[2].dense1.weight.grad
model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
model.Qformer.bert.encoder.layer[9].intermediate_query.dense.weight
model.Qformer.bert.encoder.layer[9].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[10].intermediate.dense.weight.grad
model.Qformer.bert.encoder.layer[11].intermediate.dense.weight.grad
model.Qformer.bert.encoder.layer[10].intermediate_query.dense.weight
model.Qformer.bert.encoder.layer[10].experts.experts[2].dense1.weight
model.Qformer.bert.encoder.layer[10].experts.experts[1].dense1.weight
model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight
model.Qformer.bert.encoder.layer[10].intermediate_query.dense.weight == model.Qformer.bert.encoder.layer[10].experts.experts[0].dense1.weight
# Pre-MoE gate-sentence
# model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad is not updated
# Pre-MoE gate-token
# Updates normally
# Post-MoE gate-sentence
model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
# model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad updates normally
# model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad is all 0/-0
# model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad is all 0/-0
# Route-MoE
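# Pre-MoE: there is a problem with the computed beam_scores
# Post-Route: updates the parameters of multiple experts, and also updates the gate parameters
# Layer 6 updated the parameters of two experts (layer 6, layer 8)
# model.Qformer.bert.encoder.layer[11].intermediate.dense.weight.grad is 0 (all zeros)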
# model.Qformer.bert.encoder.layer[11].output.dense.weight.grad
model.Qformer.bert.encoder.layer[6].experts.gate.weight.grad
model.Qformer.bert.encoder.layer[6].experts.experts[0].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[6].experts.experts[1].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[7].experts.gate.weight.grad
model.Qformer.bert.encoder.layer[7].experts.experts[0].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[7].experts.experts[1].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[8].experts.gate.weight.grad
model.Qformer.bert.encoder.layer[8].experts.experts[0].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[8].experts.experts[1].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[9].experts.gate.weight.grad
model.Qformer.bert.encoder.layer[9].experts.experts[0].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[9].experts.experts[1].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[10].experts.gate.weight.grad
model.Qformer.bert.encoder.layer[10].experts.experts[0].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[10].experts.experts[1].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[11].experts.gate.weight.grad
model.Qformer.bert.encoder.layer[11].experts.experts[0].intermediate_query.dense.weight.grad
model.Qformer.bert.encoder.layer[11].experts.experts[1].intermediate_query.dense.weight.grad
(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.10.experts.experts.0.dense1.weight']
[Parameter containing:
tensor([[-0.0328, 0.0414, 0.0010, ..., -0.0068, 0.0244, 0.0587],
[ 0.0120, 0.0458, 0.0171, ..., -0.0439, -0.0107, -0.0397],
[ 0.0239, 0.0191, -0.0145, ..., 0.0008, -0.0067, 0.0090],
...,
[ 0.0174, -0.0465, -0.0106, ..., -0.0095, 0.0153, -0.0195],
[-0.0151, -0.0082, -0.0320, ..., -0.0016, -0.0232, -0.0147],
[ 0.0142, -0.0286, 0.0161, ..., -0.0160, -0.0306, -0.0272]],
device='cuda:0', requires_grad=True)]
(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.8.experts.experts.0.dense1.weight']
[Parameter containing:
tensor([[ 0.0024, 0.0218, -0.0186, ..., -0.0178, -0.0067, 0.0820],
[-0.0759, -0.0002, -0.0548, ..., 0.0292, 0.0531, 0.0779],
[-0.0220, -0.0037, -0.0520, ..., -0.0426, -0.0261, -0.0357],
...,
[-0.0448, 0.0471, 0.0133, ..., -0.0062, -0.0217, -0.0203],
[ 0.0532, 0.0197, 0.0320, ..., -0.0010, -0.0838, 0.0682],
[ 0.0284, 0.0038, -0.0007, ..., -0.0305, 0.0296, 0.0056]],
device='cuda:0', requires_grad=True)]
(Pdb) [p for n, p in self.model.named_parameters() if n == 'Qformer.bert.encoder.layer.6.experts.experts.0.dense1.weight']
[Parameter containing:
tensor([[ 6.5176e-02, -4.6473e-02, -2.7396e-02, ..., 2.1774e-03,
6.1457e-02, 1.9180e-03],
[ 7.3707e-03, 6.1392e-02, -2.7108e-02, ..., 4.0778e-02,
-1.9791e-02, -1.1612e-02],
[ 2.1193e-02, -3.8323e-02, -6.0238e-02, ..., -1.4539e-02,
9.2965e-02, 3.9153e-02],
...,
[ 5.3203e-03, -1.7276e-02, -3.2191e-02, ..., -1.6435e-02,
-1.8553e-02, -2.8158e-02],
[-6.9853e-02, 9.2719e-03, -1.8895e-03, ..., -2.6425e-02,
1.4880e-03, 3.4505e-02],
[-1.2168e-03, 3.7038e-02, 4.8047e-02, ..., -3.4523e-03,
-1.3030e-05, -1.4778e-02]], device='cuda:0', requires_grad=True)]