# 序列到序列学习（seq2seq）

!!!

1. 非常不理解原文为什么要用l.sum().backward(), 而不用l.mean().backward(), 在softmax的scratch那一节中明确采用了l.mean(), 然后再用optimizer.step(). 这里的损失l.shape=(`batch_size`,) . 每一个元素表示【一个样本在各个时间步的平均损失】
2. 也不懂为什么用l.sum()除以num_tokens表示的到底是什么意思，l.sum 既然是batch的综合【对时间步求平均后再对batch求和】，就应该除以batch_size, 但是原文除以num_tokens是什么鬼？？？

``````class Seq2SeqDecoder(d2l.Decoder):
"""用于序列到序列学习的循环神经网络解码器"""
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqDecoder, self).__init__(**kwargs)
self.embedding = nn.Embedding(vocab_size, embed_size)
self.rnn = nn.GRU(embed_size + num_hiddens, num_hiddens, num_layers,
dropout=dropout)
self.dense = nn.Linear(num_hiddens, vocab_size)

def init_state(self, enc_outputs, *args):
#return enc_outputs[1]
return (enc_outputs[1], enc_outputs[1][-1])

def forward(self, X, state):
# 输出'X'的形状：(batch_size,num_steps,embed_size)
X = self.embedding(X).permute(1, 0, 2)
# 广播context，使其具有与X相同的num_steps
context = state[-1].repeat(X.shape[0], 1, 1)
# new
encode = state[1]
state = state[0]
# new end
X_and_context = torch.cat((X, context), 2)
output, state = self.rnn(X_and_context, state)
output = self.dense(output).permute(1, 0, 2)
# output的形状:(batch_size,num_steps,vocab_size)
# state[0]的形状:(num_layers,batch_size,num_hiddens)
#return output, state
return output, (state, encode)
``````

``````go . => va !, bleu 1.000
i lost . => j'ai perdu ., bleu 1.000
he's calm . => il est paresseux ., bleu 0.658
i'm home . => je suis chez moi ., bleu 1.000
``````

``````loss = MaskedSoftmaxCELoss()
# One loss value per sample: cross-entropy averaged over all num_steps, with
# masked steps contributing 0 — so valid_len=2 of 4 steps gives
# ln(10)*2/4 = 1.1513, and valid_len=0 gives 0 (see the printed tensor below).
loss(torch.ones(3, 4, 10), torch.ones((3, 4), dtype=torch.long),
torch.tensor([4, 2, 0]))
``````

tensor([2.3026, 1.1513, 0.0000])

l.sum()是3.4539， num_tokens是4+2+0=6，这时候print输出的loss显示就是3.4539/6=0.57565。注意：严格的 per-token 平均应是 (4+2)×2.3026/6 = 2.3026；由于 l 的每个元素已先对 num_steps=4 取过平均，l.sum()/num_tokens 实际等于 per-token 平均再除以 num_steps（2.3026/4=0.57565），所以它只是一个与 per-token 平均成比例的显示指标。

``````loss(torch.ones(3, 4, 10), torch.ones((3, 4), dtype=torch.long),
torch.tensor([4, 4, 4]))
# All 4 time steps are valid for every sample, so each entry is the full
# ln(10) ≈ 2.3026 (uniform prediction over 10 classes); compare the
# [4, 2, 0] case above.
``````

tensor([2.3026, 2.3026, 2.3026])

l.sum()是6.9078， num_tokens是4+4+4=12，这时候print输出的loss显示就是6.9078/12=0.57565

`output, state = self.rnn(X)`

改成 `output, state = self.rnn(X, state)` 吗？