RuntimeError: CUDA error: device-side assert triggered Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions

149 Views Asked by At
# Run one training epoch per iteration and record the per-epoch averages.
for epoch in range(1, num_epochs + 1):
    # <-- marked line: the RuntimeError is raised by this call
    l, acc = train_epoch(epoch, num_epochs, train_loader, model, criterion, optimizer)
    train_loss_avg.append(l)
    train_accuracy.append(acc)

This is the code I'm trying to run. The error is raised on the marked line, which calls the following function:

def train_epoch(epoch, num_epochs, data_loader, model, criterion, optimizer,
                checkpoint_path='/content/checkpoint.pt'):
    """Train ``model`` for one pass over ``data_loader`` and save a checkpoint.

    For every batch the function runs a forward pass, computes the loss and
    accuracy, back-propagates, clips the gradient norm to 1, and steps the
    optimizer. Running averages are printed on a single, continuously
    overwritten console line.

    Args:
        epoch: Index of the current epoch (only used in the progress line).
        num_epochs: Total number of epochs (only used in the progress line).
        data_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Module whose call returns a 2-tuple; the second element is
            passed to ``criterion`` and ``calculate_accuracy``.
        criterion: Callable ``criterion(outputs, targets)`` returning the loss.
        optimizer: Optimizer updating ``model``'s parameters.
        checkpoint_path: Where ``model.state_dict()`` is saved after the epoch.

    Returns:
        Tuple ``(losses.avg, accuracies.avg)`` for the epoch.
    """
    model.train()
    losses = AverageMeter()
    accuracies = AverageMeter()

    # Choose the device once. The original cast targets to
    # torch.cuda.LongTensor unconditionally, which fails on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for i, (inputs, targets) in enumerate(data_loader):
        inputs = inputs.to(device)
        # Convert once; the same tensor is reused for the loss and the accuracy.
        targets = targets.to(device=device, dtype=torch.long)

        _, outputs = model(inputs)
        # NOTE(review): a CUDA "device-side assert" surfacing here is commonly
        # caused by target values outside the range of classes the model
        # outputs -- verify the labels produced by the data loader.
        loss = criterion(outputs, targets)
        acc = calculate_accuracy(outputs, targets)

        batch_size = inputs.size(0)
        losses.update(loss.item(), batch_size)
        accuracies.update(acc, batch_size)

        optimizer.zero_grad()
        loss.backward()

        # Apply gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()

        sys.stdout.write(
            "\r[Epoch %d/%d] [Batch %d / %d] [Loss: %f, Acc: %.2f%%]"
            % (
                epoch,
                num_epochs,
                i,
                len(data_loader),
                losses.avg,
                accuracies.avg))
        # The line has no trailing newline, so flush to make progress visible.
        sys.stdout.flush()

    torch.save(model.state_dict(), checkpoint_path)
    return losses.avg, accuracies.avg

I can't find where the problem is.

I've tried using

# Debugging / reproducibility settings tried while chasing the error.
# NOTE(review): CUDA_LAUNCH_BLOCKING is presumably only honoured if it is set
# before CUDA is initialised -- confirm this runs early enough.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# NOTE(review): torch._C._cuda_init is a private API -- confirm it is needed.
torch._C._cuda_init()
# Fixed seed so runs are repeatable.
torch.manual_seed(42)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

and changing the `targets` variable in the `train_epoch` function, but nothing worked.

0

There are 0 best solutions below