Neural Network backpropagation algorithm only partially training in python

31 Views Asked by At

I am writing a neural network to identify digits from the MNIST database. It's primarily based on this code https://github.com/SebLague/Neural-Network-Experiments

My neural network appears to be encountering issues with backpropagation. While it succeeded in learning certain digits like 0, it fails at all the other digits. I've made a graph of the accuracy of the model based on some testing data, and it seems to get stuck around 40% accuracy while simultaneously fluctuating a large amount. It looks something like this:

Graph of accuracy based on pass #:

Graph of accuracy based on pass #

Here is the layer code and network code:

import numpy as np

def ReLU_calculate(inputs):
    return np.maximum(0, inputs)


def Softmax_calculate(inputs):
    exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
    probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
    return probabilities

def ReLU_derivative(input):
    if input > 0:
        return 1
    return 0


def Softmax_derivative(inputs, index):
    expSum = 0
    for input in inputs:
        expSum = expSum + np.exp(input)

    ex = np.exp(inputs[index])

    return (ex * expSum - ex * ex) / (expSum * expSum)

def cost_derivative(output, y):
    if output == 0 or output == 1:
        return 0
    else:
        return (-1 * output + y) / (output * (output - 1))

class Layer:
    def __init__(self, numNodesIn, numNodes):
        self.weights = 0.1 * np.random.randn(numNodesIn, numNodes)
        self.biases = np.zeros((1, numNodes))
        self.inputs = None
        self.weighted_inputs = None
        self.activations = None
        self.nodes_in = numNodesIn
        self.nodes_out = numNodes
        self.cost_gradientW = None
        self.cost_gradientB = None

    def forward(self, inputs):
        self.inputs = inputs
        self.weighted_inputs = np.dot(inputs, self.weights) + self.biases

    def hidden_activation(self):
        self.activations = ReLU_calculate(self.weighted_inputs)

    def output_activation(self):
        self.activations = Softmax_calculate(self.weighted_inputs)
    def CalculateOutputLayerNodeValues(self, data, expectedOutputs):
        nodeValues = []
        for i in range(data.batch_size):
            current_nodeValues = []
            for x in range(len(expectedOutputs[i])):
                costDerivative = cost_derivative(self.activations[i][x], expectedOutputs[i][x])
                activationDerivative = Softmax_derivative(self.weighted_inputs[i], x)
                current_nodeValues.append(costDerivative * activationDerivative)
            nodeValues.append(current_nodeValues)
        return nodeValues

    def UpdateGradients(self, data, nodeValues):
        cost_gradientW = [[0 for x in range(self.nodes_out)] for j in range(self.nodes_in)]
        cost_gradientB = [0 for x in range(self.nodes_out)]
        for i in range(data.batch_size):
            for nodeOut in range(self.nodes_out):
                nodeValue = nodeValues[i][nodeOut]
                for nodeIn in range(self.nodes_in):
                    derivativeCostWrtWeight = self.inputs[i][nodeIn] * nodeValue
                    cost_gradientW[nodeIn][nodeOut] += derivativeCostWrtWeight

                derivativeCostWrtBias = 1 * nodeValues[i][nodeOut]
                cost_gradientB[nodeOut] += derivativeCostWrtBias

        self.cost_gradientW = cost_gradientW
        self.cost_gradientB = cost_gradientB

    def CalculateHiddenLayerNodeValues(self, data, oldLayer, oldNodeValues):
        newNodeValues = []
        for batch in range(data.batch_size):
            smallerNodeValues = []
            for i in range(self.nodes_out):
                newNodeValue = 0
                for j in range(len(oldNodeValues)):
                    newNodeValue += oldLayer.weights[i][j] * oldNodeValues[batch][j]

                newNodeValue = newNodeValue * ReLU_derivative(self.weighted_inputs[batch][i])
                smallerNodeValues.append(newNodeValue)
            newNodeValues.append(smallerNodeValues)

        return newNodeValues

    def ApplyGradients(self, learnRate):
        for nodeOut in range(self.nodes_out):
            for nodeIn in range(self.nodes_in):
                self.weights[nodeIn][nodeOut] -= self.cost_gradientW[nodeIn][nodeOut] * learnRate

            self.biases[0][nodeOut] -= self.cost_gradientB[nodeOut] * learnRate

        self.cost_gradientB = None
        self.cost_gradientW = None

Layer code:

import numpy as np
import matplotlib.pyplot as plt

from Layer import Layer
from keras.datasets import mnist


class Loss:
    def calculate(self, output, y):
        sample_losses = self.forward(output, y)
        data_loss = np.mean(sample_losses)
        return data_loss

class Loss_CategoricalCrossentropy(Loss):
    def forward(self, y_pred, y_true):
        samples = len(y_pred)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)

        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[range(samples), y_true]

        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(y_pred_clipped*y_true, axis=1)

        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

class Data:
    def __init__(self):
        self.batch_size = 0
        self.inputs = []
        self.activations = []
        self.weighted_inputs = []
        self.nodeValues = []


class Network:
    def __init__(self, size):
        self.length = len(size) - 1
        self.network = []
        self.data = Data()
        for i in range(len(size) - 1):
            layer = Layer(size[i], size[i + 1])
            self.network.append(layer)

    # assuming that there must be at least 2 layers in the network
    def forward(self, X):
        # create new data object to store data for the current pass
        self.data = Data()
        self.data.batch_size = len(X)
        # set first output to the input values passed into forward
        current_output = X
        self.data.activations.append(current_output)
        # Pass forward through the neural network
        for i in range(self.length - 1):
            self.network[i].forward(current_output)
            self.data.weighted_inputs.append(self.network[i].weighted_inputs)
            self.network[i].hidden_activation()
            current_output = (self.network[i].activations)
            self.data.activations.append(current_output)

        self.network[self.length - 1].forward(current_output)
        self.data.weighted_inputs.append(self.network[self.length - 1].weighted_inputs)
        self.network[self.length - 1].output_activation()
        final_output = self.network[self.length - 1].activations
        self.data.activations.append(final_output)
        return final_output

    def UpdateAllGradients(self, inputs, expectedOutputs, learnRate):
        self.forward(inputs)
        outputLayer = self.network[self.length - 1]
        nodeValues = outputLayer.CalculateOutputLayerNodeValues(self.data, expectedOutputs)
        outputLayer.UpdateGradients(self.data, nodeValues)

        for i in reversed(range(self.length - 1)):
            hiddenLayer = self.network[i]
            nodeValues = hiddenLayer.CalculateHiddenLayerNodeValues(self.data, self.network[i + 1], nodeValues)
            hiddenLayer.UpdateGradients(self.data, nodeValues)

        for layer in network.network:
            layer.ApplyGradients(learnRate)

    def test_accuracy(self, inputs, expected):
        total = 0
        size = len(inputs)
        inputs_flat = []
        for x in range(len(inputs)):
            inputs_flat.append(inputs[x].flatten())
        outputs = self.forward(inputs_flat)
        for sample in range(len(outputs)):
            index = np.where(outputs[sample] == max(outputs[sample]))[0][0]
            if index == expected[sample]:
                total += 1
        return total / size

(train_X, y_train), (test_X, y_test) = mnist.load_data()

print('X_train: ' + str(train_X.shape))
print('Y_train: ' + str(y_train.shape))
print('X_test:  ' + str(test_X.shape))
print('Y_test:  ' + str(y_test.shape))

x_train = train_X.astype("float32") / 255
x_test = test_X.astype("float32") / 255

network = Network([784, 50, 16, 10])

cost_function = Loss_CategoricalCrossentropy()
costs = []
accuracy = []
for i in range(10000):
    print("Pass: ", i, "\n")
    inputs = [x_train[i].flatten()]
    outputs = np.zeros((1, 10))
    outputs[0][y_train[i]] = 1
    network.UpdateAllGradients(inputs, outputs, 0.1)
    print(np.array(network.data.activations[-1]), outputs)
    accuracy.append(network.test_accuracy(x_test, y_test))
    costs.append(cost_function.calculate(np.array(network.data.activations[-1]),
                                outputs))
    print("\n")

plt.plot(accuracy)
plt.show()

Not exactly sure what is going wrong with the backpropagation. Any input is appreciated

So far, I've gone over the code for any errors and tried modifying details related to the neural network like the size of the network and layers. In addition, I tried modifying the random weights generation, but received similar results.

0

There are 0 best solutions below