I am implementing nn.DataParallel class to utilize multiple GPUs on single machine. I have followed some stack overflow questions and answers but still get a simple error. I have no idea why I am getting this error.
Followed Questions
Code
# Utilize multiple GPUS
if 'cuda' in device:
print(device)
print("using data parallel")
net = torch.nn.DataParallel(model_ft) # make parallel
cudnn.benchmark = True
# Transfer the model to GPU
#model_ft = model_ft.to(device)
# # Print model summary
# print('Model Summary:-\n')
# for num, (name, param) in enumerate(model_ft.named_parameters()):
# print(num, name, param.requires_grad)
# summary(model_ft, input_size=(3, size, size))
# print(model_ft)
# Loss function
criterion = nn.CrossEntropyLoss()
# Optimizer
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Learning rate decay
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
# Model training routine
print("\nTraining:-\n")
def train_model(model, criterion, optimizer, scheduler, num_epochs=30):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
# Tensorboard summary
writer = SummaryWriter()
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'valid']:
if phase == 'train':
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels in dataloaders[phase]:
inputs = inputs
labels = labels
inputs = inputs.to(device, non_blocking=True)
labels = labels.to(device, non_blocking=True)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
if phase == 'train':
scheduler.step()
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# Record training loss and accuracy for each phase
if phase == 'train':
writer.add_scalar('Train/Loss', epoch_loss, epoch)
writer.add_scalar('Train/Accuracy', epoch_acc, epoch)
writer.flush()
else:
writer.add_scalar('Valid/Loss', epoch_loss, epoch)
writer.add_scalar('Valid/Accuracy', epoch_acc, epoch)
writer.flush()
# deep copy the model
if phase == 'valid' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model
# Train the model
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
num_epochs=num_epochs)
# Save the entire model
print("\nSaving the model...")
torch.save(model_ft, PATH)
Traceback
Traceback (most recent call last):
File "/home2/coremax/Documents/pytorch-image-classification/train.py", line 263, in <module>
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
File "/home2/coremax/Documents/pytorch-image-classification/train.py", line 214, in train_model
outputs = model(inputs)
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/timm/models/resnet.py", line 730, in forward
x = self.forward_features(x)
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/timm/models/resnet.py", line 709, in forward_features
x = self.conv1(x)
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/modules/container.py", line 217, in forward
input = module(input)
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 463, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 459, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same
As shown in the error, the issue comes from the fact that the input you provided and the model are not the same type, the first one being
torch.cuda.FloatTensorand the second onetorch.FloatTensor. As you can see, the issue is that one (the input) in on GPU while the other (the weights of the model) is still on CPU. This issue can be fixed by moving the model to GPU in the beginning. I see that the correct line is commented in the beginning of the code you provided,model_ft = model_ft.to(device). Uncommenting this line should fix this problem.