I'm tasked with writing an ANN using only NumPy (no TensorFlow, PyTorch, etc.) on the iris dataset. I'm running 2000 epochs, and by around epoch 40 the accuracy of the network gets stuck at 0.66. Also, while debugging I see that the parameters become either extremely high or extremely low (for example, for self.layers[0], the self.output attribute is [-59.2447737, -79.13719157, -57.27055739, 117.26796309, 127.71775426] at epoch 400).
My network has 4 input nodes, a single hidden layer with 5 nodes, and an output layer with 3 nodes corresponding to the 3 types of irises.
I'm confused as to why this is happening. The learning rate is low (0.01), the weight and bias vectors are initialized with small values, and I standardized the input data.
Any help with this would be highly appreciated. My code:
main.py:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from network import NeuralNetwork
from layer import Layer
if __name__ == "__main__":
    iris = load_iris()
    data, target, target_names = iris.data, iris.target, iris.target_names
    scaler = StandardScaler()

    # One-hot encode the target array to match the 3-neuron output structure
    one_hot_targets = []
    for i in range(len(target)):
        vec = np.zeros(len(target_names))
        vec[target[i]] = 1
        one_hot_targets.append(vec)
    one_hot_targets = np.array(one_hot_targets)

    X_train, X_test, Y_train, Y_test = train_test_split(data, one_hot_targets, test_size=0.33, shuffle=True)
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    learning_rate = 0.01

    # Init a network and add its layers. The input layer is represented by the input itself, not by an actual layer
    network = NeuralNetwork(learning_rate)
    network.add_layer(Layer(4, 5))  # hidden layer 1
    network.add_layer(Layer(5, 3))  # output layer

    # Train the network for a number of epochs
    network.train(X_train_scaled, Y_train, epochs=2000)

    # Test on the test data separated earlier
    output, accuracy = network.test(X_test_scaled, Y_test)

    # Print testing output
    for i in range(len(output)):
        prediction = target_names[np.argmax(output[i])]
        answer = target_names[np.argmax(Y_test[i])]
        print(f"For testing row: {X_test[i]}, the prediction was {prediction} and the answer was {answer}")
    print(f"Network test accuracy: {accuracy:.4f}")
network.py:
import numpy as np
from utils import calc_error
np.random.seed(10)
class NeuralNetwork:
    def __init__(self, learning_rate=0.1):
        self.layers = []
        self.learning_rate = learning_rate

    def add_layer(self, layer):
        # Layers must be added in order
        self.layers.append(layer)

    def forward_propagate(self, input):
        output = input
        for layer in self.layers:
            output = layer.forward_propagate(output)
        return output

    def back_propagate(self, error):
        for layer in reversed(self.layers):
            error = layer.back_propagate(error)

    def train_iteration(self, input, target):
        output = self.forward_propagate(input)
        # Calculate the error between the output and the target value
        error = output - target
        # Backpropagate the error through the network
        self.back_propagate(error)
        # Update the weights and biases of the layers
        for layer in self.layers:
            layer.weights -= self.learning_rate * layer.d_weights
            layer.biases -= self.learning_rate * layer.d_biases

    def train_epoch(self, inputs, targets):
        for i in range(len(inputs)):
            x = inputs[i]
            y = targets[i]
            self.train_iteration(x, y)

    def train(self, inputs, targets, epochs=4000):
        for epoch in range(epochs):
            self.train_epoch(inputs, targets)
            if epoch % (epochs / 100) == 0:
                _, accuracy = self.test(inputs, targets)
                print(f"Epoch {epoch} --> Training Accuracy:{accuracy}")

    def predict(self, input):
        output = self.forward_propagate(input)
        return output

    def test(self, inputs, targets):
        output, correct = [], 0
        for i in range(len(inputs)):
            x, y = inputs[i], targets[i]
            guess = self.predict(x)
            is_correct = y[guess.argmax()] == 1
            correct += is_correct
            output.append(guess)
        return output, (correct / len(inputs))
layer.py:
import numpy as np
from utils import sigmoid, deriv_sigmoid
np.random.seed(10)
class Layer:
    def __init__(self, num_inputs, num_neurons, activation_function=sigmoid, derivative_activation_function=deriv_sigmoid):
        self.weights = np.random.randn(num_inputs, num_neurons) * 0.01
        self.biases = np.zeros((1, num_neurons))
        self.activation_function = activation_function
        self.derivative_activation_function = derivative_activation_function

    def forward_propagate(self, input):
        self.input = input
        self.output = np.dot(input, self.weights) + self.biases
        self.activated_output = self.activation_function(self.output)
        return self.activated_output

    def back_propagate(self, error):
        error = self.derivative_activation_function(error)
        reshaped_input = self.input.T.reshape((np.max(self.input.shape), 1))  # ensures the dot product always works
        self.d_weights = np.dot(reshaped_input, error)
        self.d_biases = np.sum(error, axis=0, keepdims=True)
        self.d_input = np.dot(error, self.weights.T)
        return self.d_input
utils.py:
import numpy as np
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def deriv_sigmoid(x):
    return np.multiply(x, 1 - x)
Your output layer should use softmax activation, since you have three classes, and your hidden layer should use ReLU or leaky ReLU. You need to supply their respective derivative functions as well.
Sigmoid is really only appropriate for a binary problem with a single output neuron, and as a hidden-layer activation it tends to saturate and slow down learning, so ReLU is the usual choice there.
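For instance, here is a minimal sketch of ReLU / leaky ReLU and their derivatives in the style of your existing utils.py (these helper names are mine, not from your posted code):

    import numpy as np

    def relu(x):
        # Element-wise max(0, x)
        return np.maximum(0, x)

    def deriv_relu(x):
        # Gradient w.r.t. the pre-activation: 1 where x > 0, else 0
        return (x > 0).astype(float)

    def leaky_relu(x, alpha=0.01):
        # Lets a small gradient through for negative inputs
        return np.where(x > 0, x, alpha * x)

    def deriv_leaky_relu(x, alpha=0.01):
        return np.where(x > 0, 1.0, alpha)

Note that these derivatives are taken with respect to the pre-activation (your self.output), not with respect to the error, so they would be plugged into the backward pass accordingly.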
To make the point clearer: you have 3 neurons in the output layer, so you get a signal from 3 neurons from which you decide the predicted class of one record. That signal comes in the form of logits. When the logits pass through a softmax activation they are converted to probability values, one per class, e.g. [0.1, 0.6, 0.3]. Since index 1 has the highest probability, the predicted class is the second one (index 1).
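A rough sketch of a numerically stable softmax in NumPy (again, my own helper, not part of your code):

    import numpy as np

    def softmax(logits):
        # Subtract the max before exponentiating for numerical stability
        shifted = logits - np.max(logits, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=-1, keepdims=True)

    probs = softmax(np.array([0.2, 1.9, 0.7]))
    print(probs)            # probabilities, one per class, summing to 1
    print(probs.argmax())   # index of the predicted class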
Now, coming to the problem you will face while implementing softmax: its derivative is a Jacobian matrix, i.e. it involves the partial derivatives of every output with respect to every logit.
Implementing that full Jacobian is overkill for the problem at hand. If you train with cross-entropy loss, you can safely use the difference between the predictions and the one-hot encoded target as the gradient at the softmax output layer.
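A minimal sketch of that shortcut, assuming cross-entropy loss on top of a softmax output layer (the variable names are illustrative):

    import numpy as np

    def softmax(logits):
        # Numerically stable softmax, same idea as the sketch above
        shifted = logits - np.max(logits, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=-1, keepdims=True)

    # Illustrative forward pass of the output layer
    logits = np.array([[1.0, 2.5, 0.3]])   # pre-activations of the 3 output neurons
    probs = softmax(logits)                # predicted class probabilities
    target = np.array([[0.0, 1.0, 0.0]])   # one-hot encoded true class

    # With softmax + cross-entropy, the gradient w.r.t. the logits collapses to:
    d_logits = probs - target
    # d_logits can then be fed into the rest of the backward pass in place of
    # applying an activation derivative at the output layer.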
All the best in learning ML.