Neural network:
self.fc1 = nn.Linear(1024, 512)
self.bn1 = nn.BatchNorm1d(512)
self.drop1 = nn.Dropout(0.4)
self.fc2 = nn.Linear(512, 256)
self.bn2 = nn.BatchNorm1d(256)
self.drop2 = nn.Dropout(0.4)
self.fc3 = nn.Linear(256, 10)  # 10 output classes
Forward function looks like this:
def fc1_hook_fn(grad):
    fc1_x.grad = grad.clone()
def bn1_hook_fn(grad):
    bn1_x.grad = grad.clone()
def relu1_hook_fn(grad):
    relu1_x.grad = grad.clone()
def drop1_hook_fn(grad):
    drop1_x.grad = grad.clone()
fc1_x = self.fc1(x)
fc1_x.register_hook(fc1_hook_fn)
bn1_x = self.bn1(fc1_x)
bn1_x.register_hook(bn1_hook_fn)
relu1_x = F.relu(bn1_x)
relu1_x.register_hook(relu1_hook_fn)
drop1_x, mask1 = self.dropout_layer(relu1_x, 0.4)
drop1_x.register_hook(drop1_hook_fn)
def fc2_hook_fn(grad):
    fc2_x.grad = grad.clone()
def bn2_hook_fn(grad):
    bn2_x.grad = grad.clone()
def relu2_hook_fn(grad):
    relu2_x.grad = grad.clone()
def drop2_hook_fn(grad):
    drop2_x.grad = grad.clone()
fc2_x = self.fc2(drop1_x)
fc2_x.register_hook(fc2_hook_fn)
bn2_x = self.bn2(fc2_x)
bn2_x.register_hook(bn2_hook_fn)
relu2_x = F.relu(bn2_x)
relu2_x.register_hook(relu2_hook_fn)
drop2_x, mask2 = self.dropout_layer(relu2_x, 0.4)
drop2_x.register_hook(drop2_hook_fn)
def fc3_hook_fn(grad):
    fc3_x.grad = grad.clone()
fc3_x = self.fc3(drop2_x)
fc3_x.register_hook(fc3_hook_fn)
x = F.log_softmax(fc3_x, -1)
Then I apply nll_loss to the log_softmax output. I used the following lines to print the weight and bias gradients directly:
for name, param in classifier.named_parameters():
    print(f'parameters name is :{name}')
    print(f'parameters shape is :{param.shape}')
    param.register_hook(lambda grad, name=name: print(name, grad))  # register a hook to print gradients
Now I write the backpropagation as follows to match the weight and bias gradients against my manual calculations. The Adam optimizer is used during training, but to my knowledge Adam only updates the weights and biases and has nothing to do with the gradients computed and printed by this step [param.register_hook(lambda grad, name=name: print(name, grad))]. If I am wrong here, please let me know.
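For reference, here is a simplified sketch of the training step (classifier, optimizer, and loader are placeholder names, not my exact code). It shows the order of events: all registered hooks fire while loss.backward() computes the gradients, and optimizer.step() only reads param.grad afterwards to update the parameters.

import torch
import torch.nn.functional as F

optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)

for data, target in loader:
    optimizer.zero_grad()
    output = classifier(data)          # forward pass; the tensor hooks above get registered here
    loss = F.nll_loss(output, target)  # log_softmax is already applied inside forward
    loss.backward()                    # autograd computes the gradients; all hooks fire now
    optimizer.step()                   # Adam reads param.grad and updates the parameters;
                                       # it does not modify the gradients themselves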
# Backward pass
dz3 = softmax(fc3)           # fc3 is the output (logits) of the 3rd FC layer
m = drop2.shape[0]           # batch size
dz3[range(m), target] -= 1   # gradient of nll_loss(log_softmax(z)) w.r.t. the logits: softmax - one_hot
dz3 /= m                     # nll_loss averages over the batch
dW3 = np.dot(dz3.T, drop2) # drop2 is the output of 2nd dropout layer
db3 = np.sum(dz3, axis=0, keepdims=True)
# derivative of loss wrt derivative of input of fc3
dX3 = np.dot(dz3, fc3_weights)
dDropout2 = dX3 * (mask2/0.6)  # dropout backward: mask2 scaled by the keep probability 1 - 0.4 = 0.6
dRelu2 = dDropout2 * np.where(bn2 > 0, 1, 0) #bn2 is the output of 2nd batchnorm layer
def batchnorm_backward(dout, x, gamma, beta, epsilon=1e-05):
    N, D = x.shape
    x_minus_mean = x - np.mean(x, axis=0)
    print(f'x_minus_mean shape is :{x_minus_mean.shape}')
    var = np.var(x, axis=0)
    sqrt_var_plus_eps = np.sqrt(var + epsilon)
    inv_sqrt_var_plus_eps = 1.0 / sqrt_var_plus_eps
    dx_normalized = dout * gamma
    print(f'dx_normalized shape is :{dx_normalized.shape}')
    dvar = np.sum(dx_normalized * x_minus_mean, axis=0) * -0.5 * inv_sqrt_var_plus_eps**3
    dmean = np.sum(dx_normalized * -inv_sqrt_var_plus_eps, axis=0) + dvar * np.sum(-2.0 * x_minus_mean, axis=0) / N
    dx = dx_normalized * inv_sqrt_var_plus_eps + dvar * 2.0 * x_minus_mean / N + dmean / N
    dgamma = np.sum(dout * x_minus_mean / sqrt_var_plus_eps, axis=0)
    dbeta = np.sum(dout, axis=0)
    return dx, dgamma, dbeta
dbn2_x, dbn2_w, dbn2_b = batchnorm_backward(dRelu2, fc2, bn2_weights, bn2_bias, epsilon=1e-5)  # fc2 is the output of the 2nd FC layer
dW2 = np.dot(dbn2_x.T, drop1)  # drop1 is the output of the 1st dropout layer
db2 = np.sum(dbn2_x, axis=0, keepdims=True)
dX2 = np.dot(dbn2_x, fc2_weights)
Up to this point everything matches except the value of db2 (the bias gradient of the 2nd FC layer), and I cannot understand what I did wrong. I am facing a similar problem with the bias gradient in the backpropagation of a convolution layer too.
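For the db2 comparison, this is the kind of sanity check I run (a simplified sketch with made-up sizes and a random input, not my actual data): isolate a Linear -> BatchNorm1d -> ReLU block, let autograd produce the bias gradient, and compare it with the batch-sum of the gradient arriving at the linear layer's output, which is the same formula used for db2 above.

import torch
import numpy as np

torch.manual_seed(0)
fc = torch.nn.Linear(512, 256)
bn = torch.nn.BatchNorm1d(256)   # training mode, as during my forward pass
x = torch.randn(32, 512)

z = fc(x)
z.retain_grad()                  # keep the gradient at the linear layer's output
out = torch.relu(bn(z))
out.sum().backward()

db2_manual = z.grad.sum(dim=0).detach().numpy()     # db2 = sum over the batch of the gradient at fc2's output
db2_autograd = fc.bias.grad.detach().numpy()
print(np.abs(db2_manual - db2_autograd).max())      # should be ~0 up to floating-point error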