I am trying to build an English-to-Spanish translator based on the Transformer architecture, with code mostly taken from the PyTorch docs. For training I used a Kaggle dataset.
I trained the network until it performed well: during training, where both input and target are available, the predictions are good. Now I want to translate custom sentences for which I have no translation/target, but my implementation for this only ever returns SOS tokens.
Here is the architecture of the Transformer model:
import torch
import torch.nn as nn


class Seq2SeqTransformer(nn.Module):
    """
    Basic Transformer for Neural Machine Translation tasks.
    """
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1
                 ):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = nn.Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: torch.Tensor,
                trg: torch.Tensor,
                src_mask: torch.Tensor,
                tgt_mask: torch.Tensor,
                src_padding_mask: torch.Tensor,
                tgt_padding_mask: torch.Tensor,
                memory_key_padding_mask: torch.Tensor
                ):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask,
            tgt_mask,
            None,  # memory_mask
            src_padding_mask,
            tgt_padding_mask,
            memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: torch.Tensor, src_mask: torch.Tensor):
        return self.transformer.encoder(
            self.positional_encoding(self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: torch.Tensor, memory: torch.Tensor, tgt_mask: torch.Tensor):
        return self.transformer.decoder(
            self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask)
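(TokenEmbedding and PositionalEncoding are not shown here; they come from the same PyTorch translation tutorial, so they should look roughly like this:)
import math

class TokenEmbedding(nn.Module):
    # converts token indices into scaled embedding vectors
    def __init__(self, vocab_size: int, emb_size: int):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: torch.Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

class PositionalEncoding(nn.Module):
    # adds fixed sinusoidal position information to the token embeddings
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)  # (maxlen, 1, emb_size), seq-first layout
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])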
Now that training is finished, the model seems to work well via model.forward(src, trg, ...). Based on model.forward(), I implemented a function that generates translations for random sentences drawn from a torch.utils.data.DataLoader; I used it to monitor progress while training and evaluating the model (get_sentence_variants() decodes a sentence into different human-readable formats, and get_bleu_score() computes the BLEU score):
def get_transformer_translation(loader: torch.utils.data.DataLoader):
    model.eval()
    data_iterator = iter(loader)
    input_batch, target_batch = next(data_iterator)
    input_sentence, target_sentence = input_batch[:, 0], target_batch[:, 0]
    # input shape: (83) = (eng_seq_len), target shape: (80) = (spa_seq_len)
    eng_decoded_words, eng_filtered_words, eng_decoded_sentence = get_sentence_variants(input_sentence, eng_itos)
    spa_decoded_words, spa_filtered_words, spa_decoded_sentence = get_sentence_variants(target_sentence, spa_itos)
    input_sentence = input_sentence.unsqueeze(1)
    target_sentence = target_sentence.unsqueeze(1)
    # input shape: (83, 1) = (seq_len_input, batch_size), target shape: (80, 1) = (seq_len_target, batch_size)
    outputs = model.forward(
        src=input_sentence,
        trg=target_sentence,
        src_mask=None,
        tgt_mask=None,
        src_padding_mask=None,
        tgt_padding_mask=None,
        memory_key_padding_mask=None
    )
    # outputs: (80, 1, 16557) = (target_seq_len, batch_size, vocab_size_target)
    result_indices = torch.argmax(outputs, dim=-1)
    # result_indices: (80, 1) = (seq_len_target, batch_size)
    model_predicted_words, model_filtered_words, model_decoded_sentence = get_sentence_variants(result_indices, spa_itos)
    bleu_score = get_bleu_score(model_predicted_words, spa_decoded_words)
    model.train()
    return (eng_decoded_words, eng_filtered_words, eng_decoded_sentence,
            spa_decoded_words, spa_filtered_words, spa_decoded_sentence,
            model_predicted_words, model_filtered_words, model_decoded_sentence,
            bleu_score)
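(For context, simplified sketches of the two helpers; the BLEU computation via nltk is one reasonable choice, not necessarily exactly what I run:)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

SPECIAL_TOKENS = {"<SOS>", "<EOS>", "<PAD>"}

def get_sentence_variants(indices: torch.Tensor, itos):
    # maps token indices back to words, once with and once without special tokens
    decoded_words = [itos[idx] for idx in indices.flatten().tolist()]
    filtered_words = [w for w in decoded_words if w not in SPECIAL_TOKENS]
    decoded_sentence = " ".join(filtered_words)
    return decoded_words, filtered_words, decoded_sentence

def get_bleu_score(candidate_words, reference_words):
    # single-sentence BLEU in percent, smoothed to avoid zero scores on short sentences
    smoothing = SmoothingFunction().method1
    return 100.0 * sentence_bleu([reference_words], candidate_words, smoothing_function=smoothing)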
During training, for example, it produced output like the following (which suggests the model is trained properly):
Eng: if i find your passport , i ' ll call you .
Spa: si encuentro tu pasaporte , te llamaré .
Pre: si encuentro tu pasaporte , te llamaré .
Loss: 0.010215843096375465, Bleu: 100.0
Now I want to translate my own sentences, but I am struggling to implement a function that does this. My guess is that I used the wrong masks during training (I am not a neural-net expert yet). Here is the code:
def get_translation(model, src, src_mask, max_len, start_symbol):
    src = src.to(device)
    src_mask = src_mask.to(device)
    memory = model.encode(src, src_mask)
    # memory: (input_seq_len, 1, src_emb_dim)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    for i in range(max_len - 1):
        memory = memory.to(device)
        tgt_mask = generate_square_subsequent_mask(ys.size(0)).to(device)
        # tgt_mask: (num_predicted_tokens, num_predicted_tokens)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        # out after transpose: (1, num_predicted_tokens, tgt_emb_dim)
        prob = model.generator(out[:, -1])
        # prob: (batch_size, tgt_vocab_size)
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        # ys: (num_predicted_tokens, 1)
        # if next_word == EOS_IDX:
        #     break
    return ys
...and here is the code that calls the function above:
import numpy as np

user_input = "i love you."
padded_user_input = pad_punctuation(user_input.lower())
tokenized_user_input = padded_user_input.split() + ["<EOS>"]
padded_token_sequence = np.zeros(eng_padded_seq_len, dtype=int)
token_sequence = [word_to_number(word, eng_stoi) for word in tokenized_user_input]
padded_token_sequence[:len(token_sequence)] = token_sequence
padded_token_sequence = torch.tensor(padded_token_sequence, dtype=torch.long).to(device)
input_sentence = padded_token_sequence.unsqueeze(1)
# input_sentence: (seq_len_input, 1), needs batch size = 1
src_mask = generate_square_subsequent_mask(eng_padded_seq_len)  # alternative: torch.zeros(eng_padded_seq_len, eng_padded_seq_len).type(torch.bool)
print(src_mask)
tgt_tokens = get_translation(model=model, src=input_sentence, src_mask=src_mask,
                             max_len=spa_padded_seq_len, start_symbol=spa_stoi["<SOS>"])
print(tgt_tokens)
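(For reference, simplified versions of the two preprocessing helpers; the <UNK> fallback in word_to_number() is an assumption about my vocab setup:)
import re

def pad_punctuation(text: str) -> str:
    # surrounds punctuation with spaces so it splits into separate tokens:
    # "i love you." -> "i love you . "
    return re.sub(r"([.!?,¿¡])", r" \1 ", text)

def word_to_number(word: str, stoi: dict) -> int:
    # looks a word up in the string-to-index dict, falling back to <UNK> (assumption)
    return stoi.get(word, stoi.get("<UNK>", 0))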
generate_square_subsequent_mask() creates masks in the following format:
tensor([[0., -inf, -inf,  ..., -inf, -inf, -inf],
        [0., 0., -inf,  ..., -inf, -inf, -inf],
        [0., 0., 0.,  ..., -inf, -inf, -inf],
        ...,
        [0., 0., 0.,  ..., 0., -inf, -inf],
        [0., 0., 0.,  ..., 0., 0., -inf],
        [0., 0., 0.,  ..., 0., 0., 0.]])
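(For completeness, it is essentially the standard helper from the PyTorch tutorial:)
def generate_square_subsequent_mask(sz: int) -> torch.Tensor:
    # lower-triangular additive mask: position i may only attend to positions <= i
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask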
tgt_tokens always has shape (tgt_seq_len, 1), and every value in it equals start_symbol (the last parameter of get_translation), so all I ever get back are SOS tokens.
Where is the mistake and how can I fix it?