Neural network:
self.fc1 = nn.Linear(1024, 512)
self.bn1 = nn.BatchNorm1d(512)
self.drop1 = nn.Dropout(0.4)
self.fc2 = nn.Linear(512, 256)
self.bn2 = nn.BatchNorm1d(256)
self.drop2 = nn.Dropout(0.4)
self.fc3 = nn.Linear(256, 10)  # 10 output classes
Forward function looks like this:
def fc1_hook_fn(grad):
    fc1_x.grad = grad.clone()
def bn1_hook_fn(grad):
    bn1_x.grad = grad.clone()
def relu1_hook_fn(grad):
    relu1_x.grad = grad.clone()
def drop1_hook_fn(grad):
    drop1_x.grad = grad.clone()
fc1_x = self.fc1(x)
fc1_x.register_hook(fc1_hook_fn)
bn1_x = self.bn1(fc1_x)
bn1_x.register_hook(bn1_hook_fn)
relu1_x = F.relu(bn1_x)
relu1_x.register_hook(relu1_hook_fn)
drop1_x, mask1 = self.dropout_layer(relu1_x, 0.4)
drop1_x.register_hook(drop1_hook_fn)
def fc2_hook_fn(grad):
    fc2_x.grad = grad.clone()
def bn2_hook_fn(grad):
    bn2_x.grad = grad.clone()
def relu2_hook_fn(grad):
    relu2_x.grad = grad.clone()
def drop2_hook_fn(grad):
    drop2_x.grad = grad.clone()
fc2_x = self.fc2(drop1_x)
fc2_x.register_hook(fc2_hook_fn)
bn2_x = self.bn2(fc2_x)
bn2_x.register_hook(bn2_hook_fn)
relu2_x = F.relu(bn2_x)
relu2_x.register_hook(relu2_hook_fn)
drop2_x, mask2 = self.dropout_layer(relu2_x, 0.4)
drop2_x.register_hook(drop2_hook_fn)
def fc3_hook_fn(grad):
    fc3_x.grad = grad.clone()
fc3_x = self.fc3(drop2_x)
fc3_x.register_hook(fc3_hook_fn)
x = F.log_softmax(fc3_x, -1)
Then I apply nll_loss to the log_softmax output. I used the following lines to print the weight and bias gradients directly:
for name, param in classifier.named_parameters():
    print(f'parameters name is :{name}')
    print(f'parameters shape is :{param.shape}')
    param.register_hook(lambda grad, name=name: print(name, grad))  # register a hook to print gradients
Now I write the backpropagation as follows to match the weight and bias gradients against my manual calculations. The Adam optimizer is used during training, but to my knowledge Adam only updates the weights and biases and has nothing to do with the gradients computed and printed by this step [param.register_hook(lambda grad, name=name: print(name, grad))]. If I am wrong here, please let me know.
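For reference, here is a simplified sketch of the training step (classifier, optimizer, and loader are placeholder names, not my exact code). It shows the order of events: all registered hooks fire while loss.backward() computes the gradients, and optimizer.step() only reads param.grad afterwards to update the parameters.

import torch
import torch.nn.functional as F

optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)

for data, target in loader:
    optimizer.zero_grad()
    output = classifier(data)          # forward pass; the tensor hooks above get registered here
    loss = F.nll_loss(output, target)  # log_softmax is already applied inside forward
    loss.backward()                    # autograd computes the gradients; all hooks fire now
    optimizer.step()                   # Adam reads param.grad and updates the parameters;
                                       # it does not modify the gradients themselves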
# Backward pass
dz3 = softmax(fc3)           # fc3 is the output (logits) of the 3rd FC layer
m = drop2.shape[0]           # batch size
dz3[range(m), target] -= 1   # gradient of nll_loss(log_softmax(z)) w.r.t. the logits: softmax - one_hot
dz3 /= m                     # nll_loss averages over the batch
dW3 = np.dot(dz3.T, drop2) # drop2 is the output of 2nd dropout layer
db3 = np.sum(dz3, axis=0, keepdims=True)
# derivative of loss wrt derivative of input of fc3
dX3 = np.dot(dz3, fc3_weights)
dDropout2 = dX3 * (mask2/0.6)  # dropout backward: mask2 scaled by the keep probability 1 - 0.4 = 0.6
dRelu2 = dDropout2 * np.where(bn2 > 0, 1, 0) #bn2 is the output of 2nd batchnorm layer
def batchnorm_backward(dout, x, gamma, beta, epsilon=1e-05):
    N, D = x.shape
    x_minus_mean = x - np.mean(x, axis=0)
    print(f'x_minus_mean shape is :{x_minus_mean.shape}')
    var = np.var(x, axis=0)
    sqrt_var_plus_eps = np.sqrt(var + epsilon)
    inv_sqrt_var_plus_eps = 1.0 / sqrt_var_plus_eps
    dx_normalized = dout * gamma
    print(f'dx_normalized shape is :{dx_normalized.shape}')
    dvar = np.sum(dx_normalized * x_minus_mean, axis=0) * -0.5 * inv_sqrt_var_plus_eps**3
    dmean = np.sum(dx_normalized * -inv_sqrt_var_plus_eps, axis=0) + dvar * np.sum(-2.0 * x_minus_mean, axis=0) / N
    dx = dx_normalized * inv_sqrt_var_plus_eps + dvar * 2.0 * x_minus_mean / N + dmean / N
    dgamma = np.sum(dout * x_minus_mean / sqrt_var_plus_eps, axis=0)
    dbeta = np.sum(dout, axis=0)
    return dx, dgamma, dbeta
dbn2_x, dbn2_w, dbn2_b = batchnorm_backward(dRelu2, fc2, bn2_weights, bn2_bias, epsilon=1e-5)  # fc2 is the output of the 2nd FC layer
dW2 = np.dot(dbn2_x.T, drop1)  # drop1 is the output of the 1st dropout layer
db2 = np.sum(dbn2_x, axis=0, keepdims=True)
dX2 = np.dot(dbn2_x, fc2_weights)
Up to this point everything matches except the value of db2 (the bias gradient of the 2nd FC layer), and I cannot understand what I did wrong. I am facing a similar problem with the bias gradient in the backpropagation of a convolution layer too.
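For the db2 comparison, this is the kind of sanity check I run (a simplified sketch with made-up sizes and a random input, not my actual data): isolate a Linear -> BatchNorm1d -> ReLU block, let autograd produce the bias gradient, and compare it with the batch-sum of the gradient arriving at the linear layer's output, which is the same formula used for db2 above.

import torch
import numpy as np

torch.manual_seed(0)
fc = torch.nn.Linear(512, 256)
bn = torch.nn.BatchNorm1d(256)   # training mode, as during my forward pass
x = torch.randn(32, 512)

z = fc(x)
z.retain_grad()                  # keep the gradient at the linear layer's output
out = torch.relu(bn(z))
out.sum().backward()

db2_manual = z.grad.sum(dim=0).detach().numpy()     # db2 = sum over the batch of the gradient at fc2's output
db2_autograd = fc.bias.grad.detach().numpy()
print(np.abs(db2_manual - db2_autograd).max())      # should be ~0 up to floating-point error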