gpt2 tokenizer issue (AssertionError: Cannot handle batch sizes > 1 if no padding token is defined)


I am trying to train GPT-2 on the IMDB sentiment dataset for a classification task.

The dataset looks like the following:

(screenshot of the dataset: a text column containing the review and a label column containing 0 or 1)

My code is as follows:

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from torch.utils.data import DataLoader, TensorDataset
    from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
    import torch

    df = pd.read_csv('data/IMDB.csv')
    x = df['text'].tolist()
    y = df['label'].tolist()
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.add_special_tokens({'pad_token':'<|endoftext|>'})
    tokenizer.padding_side="left"
    tokenized_text = [tokenizer.encode(text, truncation=True, padding='max_length', max_length=128) for text in x]
    
    # Step 3: Split the dataset into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(tokenized_text, y, test_size=0.2, random_state=42)
    # Convert tokens to PyTorch tensors
    x_train_tensors = torch.tensor(x_train)
    y_train_tensors = torch.tensor(y_train)
    x_test_tensors = torch.tensor(x_test)
    y_test_tensors = torch.tensor(y_test)
    
    # Create DataLoader for training and testing sets
    train_dataset = TensorDataset(x_train_tensors, y_train_tensors)
    test_dataset = TensorDataset(x_test_tensors, y_test_tensors)
    
    batch_size = 8
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)  # Assuming binary classification
    # Move the model to the appropriate device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    # Training loop
    num_epochs = 3
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
    
            optimizer.zero_grad()
            outputs = model(inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
    
        # Evaluation on the test set
        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for inputs, labels in test_dataloader:
                inputs, labels = inputs.to(device), labels.to(device)
    
                outputs = model(inputs)
                predictions = torch.argmax(outputs.logits, dim=1)
    
                total += labels.size(0)
                correct += (predictions == labels).sum().item()
    
            accuracy = correct / total
            print(f"Epoch {epoch + 1}, Test Accuracy: {accuracy:.4f}")

I got the following error at outputs = model(inputs, labels=labels):

AssertionError: Cannot handle batch sizes > 1 if no padding token is defined.

I already defined the padding token, so I have no idea why this assertion error appears.

1 Answer

Answered by MMM:

The issue was solved by setting the pad token ID on the model config:

    model.config.pad_token_id = 50256
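
Hard-coding 50256 works because that is the id of <|endoftext|>, but it is cleaner to take the id from the tokenizer. GPT2ForSequenceClassification pools the hidden state of the last non-padding token of each sequence, so the padding id must be set on the model config, not only on the tokenizer. Below is a minimal sketch of the fix (not the full training loop from the question):

    from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
    import torch

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token            # reuse <|endoftext|> (id 50256) as padding
    tokenizer.padding_side = "left"

    model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
    model.config.pad_token_id = tokenizer.pad_token_id   # tell the model which id is padding

    # quick smoke test with batch size 2 to confirm the assertion is gone
    batch = tokenizer(["a great movie", "a terrible movie"],
                      padding="max_length", truncation=True, max_length=16,
                      return_tensors="pt")
    with torch.no_grad():
        out = model(**batch)
    print(out.logits.shape)  # torch.Size([2, 2])

With the config set, the original training loop runs with batch_size = 8 without raising the assertion.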