PyTorch model runs fine standalone but throws a runtime error when run with Optuna


I am trying to tune the hyperparameters of my PyTorch model using Optuna, but every time I run the optimization it fails with the following error.

[W 2024-02-05 17:19:26,007] Trial 2 failed with parameters: {'hidden_state': 64, 'droup_out_prec': 0.18615371906093597, 'num_epochs': 14, 'encoder_lr': 0.021112576066074633, 'decoder_lr': 0.0006833950215216012, 'learning_rate': 1.9257784640609453e-05, 'control_factor_ce': 0.03524950489764759, 'control_factor_kl': 0.13410725114961825, 'batch_size': 256} because of the following error: RuntimeError('one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!').
Traceback (most recent call last):
  File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/root/.ipykernel/38222/command-4071806974828746-3917727534", line 149, in __call__
    output = train_model(model = model_to_train,
  File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 386, in train_model
    raise e
  File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 132, in train_model
    loss.backward()
  File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/_tensor.py", line 522, in backward
    torch.autograd.backward(
  File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/autograd/__init__.py", line 266, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
[W 2024-02-05 17:19:26,018] Trial 2 failed with value None.
backword success:  0
Traceback (most recent call last):
  File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 132, in train_model
    loss.backward()
  File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/_tensor.py", line 522, in backward
    torch.autograd.backward(
  File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/autograd/__init__.py", line 266, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

    

The model architecture is:

    class EncoderLSTM(nn.Module):
        def __init__(self, feature_num, hidden_size_lstm, num_layers_lstm, bias, has_channel, bidirectional=False):
            super(EncoderLSTM, self).__init__()
            self.hidden_size_lstm = hidden_size_lstm
            self.num_layers_lstm = num_layers_lstm
            self.feature_num = feature_num
            self.bias = bias
            self.has_channel = has_channel
            self.bidirectional = bidirectional

            self.lstm1 = nn.LSTM(input_size=self.feature_num, hidden_size=self.hidden_size_lstm, num_layers=self.num_layers_lstm, batch_first=True, bidirectional=False, bias=self.bias)

            self.fc_encoder = nn.Linear(self.hidden_size_lstm, self.hidden_size_lstm)
            self.fc_encoder.time_distributed = True

        def forward(self, x):
            if self.has_channel:
                x = x.view(x.size(0), x.size(2), x.size(3))

            h0 = torch.zeros(self.num_layers_lstm, x.size(0), self.hidden_size_lstm).to(device=x.device)  # Hidden state
            c0 = torch.zeros(self.num_layers_lstm, x.size(0), self.hidden_size_lstm).to(device=x.device)  # Cell state

            out, (hn, cn) = self.lstm1(x, (h0, c0))
            out = self.fc_encoder(out)
            return out, (hn, cn)

    class DecoderLSTM(nn.Module):
        def __init__(self, feature_num, hidden_size_lstm, num_layers_lstm, bias, output_size, droup_out_prec=0.2):
            super(DecoderLSTM, self).__init__()

            self.hidden_size_lstm = hidden_size_lstm
            self.num_layers_lstm = num_layers_lstm
            self.feature_num = feature_num
            self.bias = bias
            self.output_size = output_size
            self.droup_out_prec = droup_out_prec

            self.decoder_net = nn.LSTM(input_size=self.hidden_size_lstm, hidden_size=self.hidden_size_lstm, num_layers=self.num_layers_lstm, batch_first=True, bidirectional=False, bias=self.bias)

            self.fc_decoder_1 = nn.Linear(self.hidden_size_lstm, int(self.hidden_size_lstm / 2))
            self.fc_decoder_1.time_distributed = True

            self.fc_decoder_2 = nn.Linear(int(self.hidden_size_lstm / 2), self.output_size)
            self.fc_decoder_2.time_distributed = True

            self.relu_decoder_1 = nn.ReLU(inplace=False)
            self.dropout_decoder_1 = nn.Dropout(self.droup_out_prec)
            self.relu_decoder_2 = nn.ReLU(inplace=False)

        def forward(self, out, hn, cn, MAX_TIMESTEP=4, target_tensor=None, return_state=False):
            out_decoder_list = []
            for time in range(MAX_TIMESTEP):
                out, (hn, cn) = self.decoder_net(out, (hn, cn))

                out_reg = torch.squeeze(hn, 0).clone()

                out_reg = self.fc_decoder_1(out_reg)
                out_reg = self.relu_decoder_1(out_reg)
                out_reg = self.dropout_decoder_1(out_reg)

                out_reg = self.fc_decoder_2(out_reg)

                out_decoder_list.append(out_reg)

                if target_tensor is not None:
                    output = target_tensor[time]

            out_decoder_list = torch.cat(out_decoder_list, dim=1)
            if return_state:
                return out_decoder_list, (hn, cn)
            else:
                return out_decoder_list


    class seq2seqModel_indipendent(nn.Module):
        def __init__(self, encoder=None, decoder=None, training=True):
            super(seq2seqModel_indipendent, self).__init__()
            self.encoder = encoder
            self.decoder = decoder
            self.training = training

        def __call__(self, x, MAX_TIMESTEP=4, return_state=False):
            if self.training:
                self.encoder.train()
                self.decoder.train()
            else:
                self.encoder.eval()
                self.decoder.eval()
            out, (hn, cn) = self.encoder(x)
            out = self.decoder(out=out, hn=hn, cn=cn, MAX_TIMESTEP=MAX_TIMESTEP, return_state=return_state)
            return out

        def train(self):
            self.training = True

        def eval(self):
            self.training = False

        def state_dict(self):
            return (self.encoder.state_dict(), self.decoder.state_dict())

And the training loop is:

    for epoch in range(num_epochs):
        count = 0
        if verbos == 3 or verbos == 2:
            print('*' * 100)
            print(f'Running epoch: {epoch}')
        if train:
            if not model.training:
                model.train()
            temp_train = []
            for i, (inputs, labels_reg) in enumerate(dataloader_train):
                inputs = inputs.to(device)
                labels_reg = labels_reg.to(device).reshape(-1, 4)

                # optimizer.zero_grad()
                out_reg = model(inputs)

                loss = criterion_train(out_reg, labels_reg)
                # loss.backward(retain_graph=True)
                loss.backward()

                if (i - count) > 0:
                    print('iter: ', i)
                    print('epoch:', epoch)
                count += 1

                if grad_clip is not None:
                    torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)

                # optimizer.step()
                for opt in optimizer:
                    opt.step()

                for opt in optimizer:
                    opt.zero_grad()

                loss_list_train.append(loss.item())
                temp_train.append(loss.item())

                if verbos == 3:
                    print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                          .format(epoch, num_epochs, i, len(dataset_train) // batch_size, loss.item()))

            loss_list_train_epoch.append(sum(temp_train) / len(temp_train))
            if scheduler is not None and verbos == 3:
                for schedule, opt in zip(scheduler, optimizer):
                    before_lr = opt.param_groups[0]["lr"]
                    schedule.step()
                    after_lr = opt.param_groups[0]["lr"]
                    print("Epoch %d: lr %.6f -> %.6f" % (epoch, before_lr, after_lr))

The loss function used is:

    class CE_prob_KL_loss_function(nn.Module):

        def __init__(self, main_loss=MeanAbsoluteError(), control_factor_ce=1e-2, control_factor_kl=1e-2, check=False):
            super().__init__()
            self.main_loss = main_loss
            self.control_factor_ce = control_factor_ce
            self.control_factor_kl = control_factor_kl
            self.check = check

        def forward(self, predicted, original, MAX_TIMESTEP=4):

            orig_mean = torch.mean(original, dim=1).view(-1, 1)
            orig_std = torch.std(original, dim=1).view(-1, 1)

            # Setting stdev to 1 for distributions that have less than 1 stdev in the original
            orig_std = torch.where(orig_std < 1, 1, orig_std)

            # orig_std_temp = orig_std + 2
            orig_std_temp = torch.add(orig_std, 2)

            p = torch.distributions.Normal(orig_mean, orig_std_temp)

            log_loss = 0

            for time in range(MAX_TIMESTEP):

                # We compute the probability of the predictions under the original distribution
                pz_mean = torch.exp(p.log_prob(predicted[:, time]))

                # We compute the probability of the actuals under the original distribution
                qz_mean = torch.exp(p.log_prob(original[:, time]))

                # We multiply the probability to the left of the mean by -1: the distribution is
                # symmetric, so mean +- stdev have the same probability, and this distinguishes the two sides
                original = torch.where(original[:, time] >= orig_mean, pz_mean, torch.mul(torch.tensor(-1), pz_mean))
                predicted = torch.where(predicted[:, time] >= orig_mean, qz_mean, torch.mul(torch.tensor(-1), qz_mean))

                # We compute the difference between the probabilities of predicted and original
                # under the distribution defined by original
                mk = torch.abs(pz_mean - qz_mean)

                # We cut off the values at 1 as we are dealing with probabilities
                mk = torch.where(mk > 1, 1, mk)

                # This is where we compute the cross entropy, across all batches for a timestep
                log_loss += torch.mul(torch.mean(torch.log(1 - mk)), -1)

            loss_acc = torch.mean(log_loss)

            if self.check:
                print("loss_acc: ", loss_acc)

            var_lable = torch.var(original, dim=1).mean()
            var_pred = torch.var(predicted, dim=1).mean()
            mean_lable = torch.mean(original, dim=1).mean()
            mean_pred = torch.mean(predicted, dim=1).mean()

            # Introduced to avoid division by 0 when var_pred is 0
            var_pred = torch.add(var_pred, 1e-8)

            # used earlier
            kl_temp = (torch.pow(mean_lable - mean_pred, 2) / var_pred + var_lable / var_pred - 1.0 - torch.log(var_lable) + torch.log(var_pred))  # .mean()

            # KL_loss = 0.5 * torch.sum(kl_temp)
            KL_loss = torch.mul(torch.sum(kl_temp), 0.5)

            if self.main_loss is None:
                return loss_acc
            else:
                if self.check:
                    print("ce_loss: ", (self.control_factor_ce * loss_acc))
                    print("kl_loss: ", (self.control_factor_kl * KL_loss))
                    print("main_loss: ", self.main_loss(predicted, original))
                return self.main_loss(predicted, original) + torch.mul(self.control_factor_ce, loss_acc) + torch.mul(self.control_factor_kl, KL_loss)

1 Answer

Answered by Muhammed Yunus:

Update

Replace `log_loss += torch.mul(...)` with `log_loss = log_loss + torch.mul(...)`.

Both forms compute the same value, but `+=` is an in-place operation: it modifies the existing tensor instead of creating a new one. Autograd saves tensors during the forward pass and checks their version counters when `backward()` runs, so the in-place update triggers the version-mismatch error you see. The out-of-place form `log_loss = log_loss + ...` builds a new tensor on each iteration and leaves the saved tensors untouched, which avoids the problem.
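
As a minimal, self-contained sketch of that accumulation pattern (the tensors below are stand-ins, not the ones from your post):

    import torch

    # Stand-in tensors, only to illustrate accumulating a per-timestep loss.
    predicted = torch.randn(8, 4, requires_grad=True)
    original = torch.randn(8, 4)

    log_loss = 0
    for time in range(4):
        step_loss = torch.mean(torch.abs(predicted[:, time] - original[:, time]))
        # In-place accumulation (log_loss += step_loss) mutates a tensor that may
        # already have been saved for the backward pass; out-of-place addition
        # creates a fresh tensor each iteration instead.
        log_loss = log_loss + step_loss

    log_loss.backward()  # completes without the "modified by an inplace operation" error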


The error means that a tensor autograd needs for the backward pass was modified in place after it was recorded in the graph. It surfaces in `train_model()` at the `loss.backward()` call, but the offending in-place operation happens wherever the loss is computed. Check how you compute `loss` and avoid in-place operations on tensors that are part of the graph - that should resolve the problem based on the information in the message.

It would be helpful to see more code, as the part that's erroring doesn't seem to be included in your original post.
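
For reference, here is a minimal example (not taken from your code) that triggers the same RuntimeError by modifying a tensor in place after autograd has saved it:

    import torch

    x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    y = torch.exp(x)   # exp() saves its output for the backward pass
    y += 1             # in-place op bumps y's version counter

    try:
        y.sum().backward()
    except RuntimeError as e:
        # "... has been modified by an inplace operation ..."
        print(e)

    # Writing y = y + 1 instead creates a new tensor, and backward() succeeds.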