I'm working on a classifier to sort the text of a page into utterances and other textual information. I've annotated some training data and am wondering if there is a way to pass this contextual information, i.e. the tag of the previous segment and of the next segment, in order to improve prediction.
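Conceptually, something like the following is what I have in mind (just a sketch of the idea, not my actual code; prev_tag and next_tag are hypothetical variables holding the tags of the adjacent rows):

    # Sketch: prepend the neighbouring tags as plain text so the model
    # can condition on them. prev_tag / next_tag are the string labels
    # of the adjacent rows, or a placeholder like 'NONE' at page boundaries.
    context = f"{prev_tag} [SEP] {next_tag}"
    encoded = tokenizer.encode_plus(
        context,            # neighbouring tags as the first segment
        row['content'],     # current segment as the second segment
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )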
I've tried to encode my df in this way (I might be going about this all wrong):
import torch
from transformers import AutoTokenizer

def encode(df):
    tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
    # Replace each 'id' with a consecutive group number
    df['id'] = df.groupby('id').ngroup()
    input_ids = []
    attention_masks = []
    token_type_ids = []
    for i, row in df.iterrows():
        # Encode the current content
        encoded_dict = tokenizer.encode_plus(
            row['content'],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        # Get the index of the current row
        current_index = i
        # Get the previous and next rows
        previous_index = current_index - 1
        next_index = current_index + 1
        # Create token_type_ids for the current, previous, and next sentences
        current_token_type_ids = torch.zeros_like(encoded_dict['input_ids'])
        # Check if the previous and next rows exist
        if previous_index >= 0:
            previous_token_type_ids = torch.ones_like(encoded_dict['input_ids'])
        else:
            previous_token_type_ids = torch.zeros_like(encoded_dict['input_ids'])
        if next_index < len(df):
            next_token_type_ids = 2 * torch.ones_like(encoded_dict['input_ids'])
        else:
            next_token_type_ids = torch.zeros_like(encoded_dict['input_ids'])
        # Combine token_type_ids for all segments
        combined_token_type_ids = torch.cat([current_token_type_ids, previous_token_type_ids, next_token_type_ids], dim=1)
        token_type_ids.append(combined_token_type_ids)
    # Convert the lists into tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)
    labels = torch.tensor(df['tag'].tolist())
    return input_ids, attention_masks, token_type_ids, labels
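For completeness, the tensors are then wrapped into a DataLoader roughly like this (simplified; train_df and args.batch_size stand in for my actual variables):

    from torch.utils.data import TensorDataset, DataLoader, RandomSampler

    # train_df is the annotated training dataframe
    input_ids, attention_masks, token_type_ids, labels = encode(train_df)
    train_dataset = TensorDataset(input_ids, attention_masks, token_type_ids, labels)
    train_loader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=args.batch_size)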
but I'm getting the following error during this training loop:
from tqdm import tqdm

for epoch in range(args.n_epochs):
    train_loss = 0
    model.train()
    for batch in tqdm(train_loader, total=len(train_loader)):
        model.zero_grad()
        input_ids = batch[0].to(args.device)
        input_mask = batch[1].to(args.device)
        token_type_ids = batch[2].to(args.device)
        labels = batch[3].to(args.device)
        output = model(input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=input_mask,
                       labels=labels)
        loss = output.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
in embedding
    return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
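In case it's relevant, the model and optimizer are set up roughly like this (simplified; num_tags is the number of distinct tag values):

    from torch.optim import AdamW
    from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup

    model = AutoModelForSequenceClassification.from_pretrained(
        'dccuchile/bert-base-spanish-wwm-cased',
        num_labels=num_tags  # number of distinct tags in the training data
    ).to(args.device)
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * args.n_epochs
    )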
Any and all help is much appreciated!