I have been learning the calculus behind neural networks and have attempted to implement it using the MNIST database. However, I am encountering challenges with fluctuating cost values, even after experimenting with various learning rates. In an attempt to debug, I inserted print statements to check if changes were occurring with the weights matrix and if different results are being obtained, and indeed, they were. I initially initialized the weights normally and did not normalize the image values. Subsequently, I normalized them and initialized the weights using the Xavier/Glorot initialization. The cost function employed here is the Mean Squared Error. Despite these adjustments, I find myself stuck and seeking guidance.
import numpy as np
class Layer:
def __init__(self, m, n, activation):
self.input = m
self.weights = np.random.rand(m.shape[1], n) * np.sqrt(2 / (m.shape[1] + n))
self.bias = np.zeros((1, n))
self.activation = activation.lower()
self.res = self.getValues()
def apply_activation(self, values):
if self.activation == 'relu':
return np.maximum(0, values)
elif self.activation == 'sigmoid':
return 1 / (1 + np.exp(-values))
elif self.activation == 'softmax':
exp_values = np.exp(values - np.max(values, axis=-1, keepdims=True))
return exp_values / np.sum(exp_values, axis=-1, keepdims=True)
else:
return values
def getValues(self):
vals = np.dot(self.input, self.weights)
neuron_vals = vals + self.bias
return self.apply_activation(np.array(neuron_vals))
def update(self, learningRate, n_vals, grad):
self.weights -= learningRate * np.dot(n_vals.T, grad)
self.bias -= learningRate * grad
The above code snippet initializes the layers
class Model:
def __init__(self, learningRate=0.01):
self.epoch = 0
self.al = learningRate
self.inpLay = None
self.hidLay_1 = None
self.hidLay_2 = None
self.outLay = None
self.initiateLayers()
@staticmethod
def flat(img):
img = np.array(img) / 255
return np.matrix(np.concatenate(img).flatten())
@staticmethod
def softmax_derivative(z):
s = z.reshape(-1, z.shape[-1])
exp_values = np.exp(s - np.max(s, axis=-1, keepdims=True))
softmax_output = exp_values / np.sum(exp_values, axis=-1, keepdims=True)
jacobian_matrix = np.diagflat(softmax_output) - np.einsum('ij,ik->ijk', softmax_output, softmax_output)
return np.sum(jacobian_matrix, axis=-1)
def initiateLayers(self):
img = x_train[self.epoch]
self.inpLay = self.flat(img)
self.hidLay_1 = Layer(self.inpLay, 16, "Sigmoid")
self.hidLay_2 = Layer(self.hidLay_1.res, 16, "Sigmoid")
self.outLay = Layer(self.hidLay_2.res, 10, "SoftMax")
self.epoch += 1
def costFunc(self, output, label):
cost = np.sum(np.square((output - label)))
J_wb = cost / 2
return J_wb
def gd_OH(self, n_vals, label):
softmax_del = self.softmax_derivative(n_vals)
return np.multiply((n_vals - label), softmax_del)
def gd(self, n_vals, grad, w):
return np.multiply(np.dot(w, grad.T).T, (n_vals * (1 - n_vals)))
def feedForward(self, img):
self.inpLay = self.flat(img)
self.hidLay_1.input = self.inpLay
self.hidLay_2.input = self.hidLay_1.res
self.outLay.input = self.hidLay_2.res
return self.outLay.getValues()
def backPropagation(self, label):
grad = self.gd_OH(self.outLay.res, label)
self.outLay.update(self.al, self.hidLay_2.res, grad)
grad_2 = self.gd(self.hidLay_2.res, grad, self.outLay.weights)
self.hidLay_2.update(self.al, self.hidLay_1.res, grad_2)
grad_3 = self.gd(self.hidLay_1.res, grad_2, self.hidLay_2.weights)
self.hidLay_1.update(self.al, self.inpLay, grad_3)
def train(self):
label = y_train[self.epoch - 1]
out = self.outLay.res
while True:
labels = np.zeros(10)
labels[label] += 1
labels = np.matrix(labels)
cost = self.costFunc(out, labels)
print(f"Epoch: {self.epoch} :: Cost: {cost}")
self.backPropagation(labels)
if self.epoch == 100:
break
img, label = x_train[self.epoch], y_train[self.epoch]
out = self.feedForward(img)
self.epoch += 1
def predict(self, img):
return self.feedForward(img)
The above code defines the model.
nn = Model()
nn.train()
import random
r = random.randint(0, len(x_test))
testImg, testLabel = x_test[r], y_test[r]
prediction = nn.predict(testImg)
print(prediction, testLabel)
The provided code initializes a model object, allowing for training and testing.
The Cost Values over the epochs reveals a consistent pattern without improvement, even after running the code for approximately 10,000 epochs.
What modifications can be implemented to enhance the model's performance?
Observing cost fluctuations from the initial epochs, I attempted to find solutions on online forums and consulted Michael Nielsen's Neural Network book without success. I used ChatGPT as well, as a final measure, but the solutions provided by it was repetitive and did not address the issue, even after implementing suggested changes. I cross-checked the mathematics alongside the code and everything appeared to be in order. I considered the possibility that the weight initialization using the Xavier/Glorot method might be causing problems, but even after removing it, the issue persisted. I am struggling to identify the root cause.