Problem during Custom Sentence Translations with Seq2Seq Transformer Model (English to Spanish)

37 Views Asked by At

I am trying to create an English-to-Spanish translator based on the Transformer architecture, with code mostly taken from the PyTorch docs. For training I used a Kaggle dataset.

I trained the neural network until it was doing well: when it is given both the input and the target, its predictions match the target. Now I want to translate custom sentences, for which I do not have a translation/target. However, the implementation that should do this job only returns SOS tokens.

Here again the architecture of the Transformer Model:

class Seq2SeqTransformer(nn.Module):
  """
  Encoder-decoder Transformer for neural machine translation.

  Bundles an ``nn.Transformer`` with separate source/target token
  embeddings, one positional encoding applied to both sides, and a linear
  layer that projects decoder outputs onto the target vocabulary.
  """
  def __init__(self,
    num_encoder_layers: int,
    num_decoder_layers: int,
    emb_size: int,
    nhead: int,
    src_vocab_size: int,
    tgt_vocab_size: int,
    dim_feedforward: int = 512,
    dropout: float = 0.1
  ):
    """
    Build the transformer core, the output projection, both token
    embeddings and the positional encoding (created in that order).

    ``emb_size`` is used as the transformer's ``d_model``, as the embedding
    width, and as the input width of the output projection.
    """
    super().__init__()

    transformer_config = dict(
      d_model=emb_size,
      nhead=nhead,
      num_encoder_layers=num_encoder_layers,
      num_decoder_layers=num_decoder_layers,
      dim_feedforward=dim_feedforward,
      dropout=dropout,
    )
    self.transformer = nn.Transformer(**transformer_config)

    # Maps each decoder output vector to one score per target-vocabulary entry.
    self.generator = nn.Linear(emb_size, tgt_vocab_size)
    self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
    self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
    self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

  def _embed_source(self, src: torch.Tensor):
    """Embed source token indices and add the positional encoding."""
    token_vectors = self.src_tok_emb(src)
    return self.positional_encoding(token_vectors)

  def _embed_target(self, tgt: torch.Tensor):
    """Embed target token indices and add the positional encoding."""
    token_vectors = self.tgt_tok_emb(tgt)
    return self.positional_encoding(token_vectors)

  def forward(self,
    src: torch.Tensor,
    trg: torch.Tensor,
    src_mask: torch.Tensor,
    tgt_mask: torch.Tensor,
    src_padding_mask: torch.Tensor,
    tgt_padding_mask: torch.Tensor,
    memory_key_padding_mask: torch.Tensor
  ):
    """
    Run the full encoder-decoder pass and return target-vocabulary scores.

    ``src`` and ``trg`` are token-index tensors; the mask arguments are
    handed to ``nn.Transformer`` unchanged. No memory (cross-attention)
    mask is ever applied.
    """
    embedded_src = self._embed_source(src)
    embedded_tgt = self._embed_target(trg)
    decoded = self.transformer(
      embedded_src,
      embedded_tgt,
      src_mask=src_mask,
      tgt_mask=tgt_mask,
      memory_mask=None,
      src_key_padding_mask=src_padding_mask,
      tgt_key_padding_mask=tgt_padding_mask,
      memory_key_padding_mask=memory_key_padding_mask,
    )
    return self.generator(decoded)

  def encode(self, src: torch.Tensor, src_mask: torch.Tensor):
    """
    Run only the encoder stack on ``src`` and return its output ("memory").

    No key-padding mask is applied on this path.
    """
    embedded_src = self._embed_source(src)
    return self.transformer.encoder(embedded_src, mask=src_mask)

  def decode(self, tgt: torch.Tensor, memory: torch.Tensor, tgt_mask: torch.Tensor):
    """
    Run only the decoder stack on ``tgt`` against ``memory``.

    Returns the raw decoder output; the caller applies ``self.generator``
    to obtain vocabulary scores.
    """
    embedded_tgt = self._embed_target(tgt)
    return self.transformer.decoder(embedded_tgt, memory, tgt_mask=tgt_mask)

Now (after training) I have a well trained model that seems to work well with the model.forward(src, target, ...) function. I implemented a function based on model.forward() to generate some translations for random sentences taken from a torch.utils.data.DataLoader. I used it to get progress insights while training and evaluating my model (get_sentence_variants() is for decoding sentences in different human readable formats and get_bleu_score() is implemented to calculate the BLEU score):

def get_transformer_translation(loader: torch.utils.data.DataLoader):
  """
  Run the model on the first sentence pair of the next batch from ``loader``
  and return readable versions of input, target and prediction plus a BLEU
  score.

  The module-level ``model`` is put into eval mode for the duration of the
  call and is always switched back to train mode afterwards, even if an
  error is raised.

  Args:
    loader: yields ``(input_batch, target_batch)`` pairs; column 0 of each
      batch is used (the author's notes give shapes ``(eng_seq_len)`` and
      ``(spa_seq_len)`` for the selected sentences).

  Returns:
    A 10-tuple: the three ``get_sentence_variants`` results for the English
    input, the three for the Spanish target, the three for the model
    prediction, and the BLEU score of predicted vs. target words.

  Raises:
    StopIteration: if ``loader`` yields no batch.
  """
  model.eval()
  try:
    input_batch, target_batch = next(iter(loader))
    input_sentence, target_sentence = input_batch[:, 0], target_batch[:, 0]

    eng_decoded_words, eng_filtered_words, eng_decoded_sentence = get_sentence_variants(input_sentence, eng_itos)
    spa_decoded_words, spa_filtered_words, spa_decoded_sentence = get_sentence_variants(target_sentence, spa_itos)

    # Add a batch dimension of size 1: (seq_len,) -> (seq_len, 1).
    input_sentence = input_sentence.unsqueeze(1)
    target_sentence = target_sentence.unsqueeze(1)

    # NOTE(review): the complete target is fed in with tgt_mask=None, so each
    # decoder position can attend to the very token it is scored against.
    # The resulting prediction/BLEU therefore does not measure free-running
    # translation quality -- confirm whether a causal mask and a shifted
    # target are used in training before relying on these numbers.
    with torch.no_grad():  # evaluation only: no autograd graph needed
      # Call the module itself (not .forward) so registered hooks still run.
      outputs = model(
        src=input_sentence,
        trg=target_sentence,
        src_mask=None,
        tgt_mask=None,
        src_padding_mask=None,
        tgt_padding_mask=None,
        memory_key_padding_mask=None
      )

    # Highest-scoring vocabulary index at every target position.
    result_indices = torch.argmax(outputs, dim=-1)

    model_predicted_words, model_filtered_words, model_decoded_sentence = get_sentence_variants(result_indices, spa_itos)

    bleu_score = get_bleu_score(model_predicted_words, spa_decoded_words)
  finally:
    # Callers use this during training, so always hand the model back in train mode.
    model.train()

  return eng_decoded_words, eng_filtered_words, eng_decoded_sentence, spa_decoded_words, spa_filtered_words, spa_decoded_sentence, model_predicted_words, model_filtered_words, model_decoded_sentence, bleu_score

For example during training, I used it to generate the outputs in the following manner (shows that model is trained properly):

Eng: if i find your passport , i ' ll call you .
Spa: si encuentro tu pasaporte , te llamaré .
Pre: si encuentro tu pasaporte , te llamaré .
Loss: 0.010215843096375465, Bleu: 100.0

Now I want to translate my own sentences, but I am struggling to implement a function that does this job for me. I suspect that I used the wrong masks during training (I am no neural-network expert yet). Here is the code:

def get_translation(model, src, src_mask, max_len, start_symbol, end_symbol=None):
    """Greedily decode a target token sequence for one source sentence.

    The source is encoded once; the target is then grown one token at a
    time by feeding everything generated so far back into the decoder and
    appending the highest-scoring next token.

    Decoding runs in eval mode (so dropout cannot perturb the result) and
    without gradient tracking; the model's previous train/eval mode is
    restored before returning.

    Args:
        model: module exposing ``encode``, ``decode`` and ``generator``.
        src: source token indices with a batch dimension of 1
            (the caller builds it as ``(seq_len_input, 1)``).
        src_mask: attention mask passed to ``model.encode``.
        max_len: maximum number of tokens in the result, start symbol
            included.
        start_symbol: index of the token that seeds the target sequence.
        end_symbol: optional index of the end-of-sequence token. When
            given, decoding stops right after this token is produced;
            when ``None`` (default) exactly ``max_len`` tokens are returned.

    Returns:
        Long tensor of shape ``(num_tokens, 1)`` beginning with
        ``start_symbol``.
    """
    src = src.to(device)
    src_mask = src_mask.to(device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # The encoder output does not change while decoding, so compute
            # it and move it to the device once, outside the loop.
            memory = model.encode(src, src_mask).to(device)
            ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=device)

            for _ in range(max_len - 1):
                # tgt_mask: (num_generated_tokens, num_generated_tokens)
                tgt_mask = generate_square_subsequent_mask(ys.size(0)).to(device)

                out = model.decode(ys, memory, tgt_mask)
                # Only the newest position decides the next token; with the
                # sequence-first layout used here that is out[-1], one row
                # per batch element.
                prob = model.generator(out[-1])
                next_word = torch.argmax(prob, dim=1).item()

                next_token = torch.full((1, 1), next_word, dtype=ys.dtype, device=ys.device)
                ys = torch.cat([ys, next_token], dim=0)

                if end_symbol is not None and next_word == end_symbol:
                    break
    finally:
        model.train(was_training)
    return ys

...and here the code where I call the function above:

user_input = "i love you."

# Tokenise the sentence the same way for every input: lower-case, split
# punctuation from words, then append the end-of-sentence marker.
padded_user_input = pad_punctuation(user_input.lower())
tokenized_user_input = padded_user_input.split() + ["<EOS>"]
token_sequence = [word_to_number(word, eng_stoi) for word in tokenized_user_input]

# Right-pad with zeros up to the fixed source length.
# NOTE(review): assumes index 0 is the padding token and that the sentence
# has at most eng_padded_seq_len tokens -- confirm against the vocabulary.
padded_token_sequence = np.zeros(eng_padded_seq_len, dtype=int)
padded_token_sequence[:len(token_sequence)] = token_sequence
padded_token_sequence = torch.tensor(padded_token_sequence, dtype=torch.long).to(device)
input_sentence = padded_token_sequence.unsqueeze(1)
# input_sentence: (seq_len_input, 1), needs batch size = 1

# The encoder must see the whole source sentence, so use an all-False mask
# (nothing blocked). A causal mask from generate_square_subsequent_mask()
# belongs on the decoder only; on the encoder it would stop every source
# token from attending to later tokens, unlike the src_mask=None used when
# the model is evaluated through forward().
src_mask = torch.zeros(eng_padded_seq_len, eng_padded_seq_len).type(torch.bool)
print(src_mask)

# Inference must not run with dropout active.
model.eval()
tgt_tokens = get_translation(model=model, src=input_sentence, src_mask=src_mask, max_len=spa_padded_seq_len, start_symbol=spa_stoi["<SOS>"])
print(tgt_tokens)

generate_square_subsequent_mask() creates masks in the following format:

tensor([[0., -inf, -inf,  ..., -inf, -inf, -inf],
        [0., 0., -inf,  ..., -inf, -inf, -inf],
        [0., 0., 0.,  ..., -inf, -inf, -inf],
        ...,
        [0., 0., 0.,  ..., 0., -inf, -inf],
        [0., 0., 0.,  ..., 0., 0., -inf],
        [0., 0., 0.,  ..., 0., 0., 0.]])

tgt_tokens always has shape (tgt_seq_len, 1). All values in it are equal to start_symbol, the last param in get_translation, so currently I only receive back SOS tokens.

Where is the mistake and how can I fix it?

0

There are 0 best solutions below