BERT classifier adding previous and next row as context


I'm working on a classifier to sort the text of a page into utterances and other textual information. I've annotated some training data, and I'm wondering if there is a way to pass contextual information, i.e. the tag of the previous segment and of the next segment, to the model in order to improve prediction.
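Conceptually, what I'd like is something along these lines (just a sketch of the idea, not my actual code; the toy dataframe and the with_neighbour_tags helper are made up for illustration):

import pandas as pd
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

# Toy frame in the same shape as my data: one text segment per row,
# plus the annotated tag for that segment.
df = pd.DataFrame({
    'content': ['Buenos días.', '¿Cómo está usted?', 'Muy bien, gracias.'],
    'tag': ['other', 'utterance', 'utterance'],
})

def with_neighbour_tags(df, i):
    # Fall back to a placeholder at the edges of the page
    prev_tag = df['tag'].iloc[i - 1] if i > 0 else 'none'
    next_tag = df['tag'].iloc[i + 1] if i + 1 < len(df) else 'none'
    # Splice the neighbouring tags in as plain text around the segment
    return f"{prev_tag} {tokenizer.sep_token} {df['content'].iloc[i]} {tokenizer.sep_token} {next_tag}"

print(with_neighbour_tags(df, 1))
# other [SEP] ¿Cómo está usted? [SEP] utterance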

I've tried to encode my df this way (though I might be going about it all wrong):

import torch
from transformers import AutoTokenizer

def encode(df):
    tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

    # Replace 'id' with an integer group number (ngroup labels each group 0..n-1)
    df['id'] = df.groupby('id').ngroup()

    input_ids = []
    attention_masks = []
    token_type_ids = []

    for i, row in df.iterrows():
        # Encode the current content
        encoded_dict = tokenizer.encode_plus(
            row['content'],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

        # Positions of the neighbouring rows (assumes a default 0..n-1 index)
        previous_index = i - 1
        next_index = i + 1

        # token_type_ids for the current sentence: all zeros
        current_token_type_ids = torch.zeros_like(encoded_dict['input_ids'])

        # Mark the previous segment with 1s if it exists
        if previous_index >= 0:
            previous_token_type_ids = torch.ones_like(encoded_dict['input_ids'])
        else:
            previous_token_type_ids = torch.zeros_like(encoded_dict['input_ids'])

        # Mark the next segment with 2s if it exists
        if next_index < len(df):
            next_token_type_ids = 2 * torch.ones_like(encoded_dict['input_ids'])
        else:
            next_token_type_ids = torch.zeros_like(encoded_dict['input_ids'])

        # Combine token_type_ids for all segments
        combined_token_type_ids = torch.cat(
            [current_token_type_ids, previous_token_type_ids, next_token_type_ids],
            dim=1
        )
        token_type_ids.append(combined_token_type_ids)

    # Convert the lists into tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)
    labels = torch.tensor(df['tag'].tolist())

    return input_ids, attention_masks, token_type_ids, labels
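For context, the encoded tensors feed the train_loader used below, roughly like this (train_df and the batch size are placeholders):

from torch.utils.data import TensorDataset, DataLoader, RandomSampler

input_ids, attention_masks, token_type_ids, labels = encode(train_df)
train_dataset = TensorDataset(input_ids, attention_masks, token_type_ids, labels)
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)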

but I'm getting the following error while running this training loop:

for epoch in range(args.n_epochs):
    train_loss = 0
    model.train()
    for batch in tqdm(train_loader, total=len(train_loader)):
        model.zero_grad()

        input_ids = batch[0].to(args.device)
        input_mask = batch[1].to(args.device)
        token_type_ids = batch[2].to(args.device)
        labels = batch[3].to(args.device)
        output = model(input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=input_mask,
                       labels=labels)
        loss = output.loss
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

in embedding
    return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
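In case it's useful, here's a quick sanity check I can run on the encoded tensors (model is assumed to be a standard BertForSequenceClassification):

print(input_ids.shape, token_type_ids.shape)  # sequence lengths of the two tensors
print(token_type_ids.max().item())            # largest token type id produced above
print(model.config.type_vocab_size)           # size of the token type embedding (2 for standard BERT configs)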

Any and all help is much appreciated!
