```python
for epoch in range(1, num_epochs + 1):
    l, acc = train_epoch(epoch, num_epochs, train_loader, model, criterion, optimizer)  # <-- the error is raised here
    train_loss_avg.append(l)
    train_accuracy.append(acc)
```
This is the code I'm trying to run; the error is raised at the marked line, which calls this function:
```python
import sys
import torch

def train_epoch(epoch, num_epochs, data_loader, model, criterion, optimizer):
    model.train()
    losses = AverageMeter()
    accuracies = AverageMeter()
    t = []
    for i, (inputs, targets) in enumerate(data_loader):
        if torch.cuda.is_available():
            targets = targets.type(torch.cuda.LongTensor)
            inputs = inputs.cuda()
        # The model returns a tuple; the second element is the logits
        _, outputs = model(inputs)
        loss = criterion(outputs, targets.type(torch.cuda.LongTensor))
        acc = calculate_accuracy(outputs, targets.type(torch.cuda.LongTensor))
        losses.update(loss.item(), inputs.size(0))
        accuracies.update(acc, inputs.size(0))
        optimizer.zero_grad()
        loss.backward()
        # Apply gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        sys.stdout.write(
            "\r[Epoch %d/%d] [Batch %d / %d] [Loss: %f, Acc: %.2f%%]"
            % (
                epoch,
                num_epochs,
                i,
                len(data_loader),
                losses.avg,
                accuracies.avg))
    torch.save(model.state_dict(), '/content/checkpoint.pt')
    return losses.avg, accuracies.avg
```
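`AverageMeter` and `calculate_accuracy` are small helpers of mine; they follow the standard running-average / top-1-accuracy pattern, roughly:

```python
import torch

class AverageMeter:
    """Running average of a scalar value (loss, accuracy, ...)."""
    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # 'val' is a batch average, so weight it by the batch size 'n'
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def calculate_accuracy(outputs, targets):
    """Top-1 accuracy (in %) for a batch of logits."""
    with torch.no_grad():
        preds = outputs.argmax(dim=1)
        return 100.0 * (preds == targets).float().mean().item()
```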
I can't figure out where the problem is.
I've tried using
```python
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch._C._cuda_init()
torch.manual_seed(42)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
```
and changing the `targets` variable in the `train_epoch` function, but nothing worked.
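For example, one variant I tried replaces the `torch.cuda.LongTensor` casts with device-agnostic `.to()` calls (a sketch of what that looked like; `device` is defined once at setup):

```python
# Sketch of one variant: device-agnostic transfers instead of
# torch.cuda.LongTensor casts ('device' set once at the top of the script).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for i, (inputs, targets) in enumerate(data_loader):
    inputs = inputs.to(device)
    targets = targets.to(device, dtype=torch.long)
    _, outputs = model(inputs)
    loss = criterion(outputs, targets)  # no extra casts needed here
    acc = calculate_accuracy(outputs, targets)
    # ... rest of the loop unchanged
```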