import numpy as np
from scipy.special import expit as sigmoid
from scipy.special import softmax as sm
import pandas as pd
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from math import sqrt
from math import log
class NeuralNet:
    def __init__(self, num_features, num_hidden1, alpha, alpha1, max_epochs, num_output, _EPSILON):
        super().__init__()
        self.num_features = num_features   # number of input nodes (features)
        self.num_hidden1 = num_hidden1     # number of hidden nodes for 1st hidden layer
        self.alpha = alpha                 # learning rate
        self.alpha1 = alpha1               # alpha for parametric ReLU
        self.max_epochs = max_epochs       # maximum number of epochs
        self.num_output = num_output       # number of output nodes
        self._EPSILON = _EPSILON
        self.loss = []                     # list to store losses per 100 epochs
        self.trainingaccur = []            # list to store training accuracy per 100 epochs
        self.devaccur = []
        self.Weights_Input_to_H1 = np.random.randn(self.num_hidden1, self.num_features) * 0.1
        self.Bias_Input_to_H1 = np.zeros([self.num_hidden1, 1])
        self.Weights_H1_to_output = np.random.randn(self.num_output, self.num_hidden1) * 0.1
        self.Bias_H1_to_output = np.zeros([self.num_output, 1])
        self.dWeights_Input_to_H1 = np.zeros([self.num_hidden1, self.num_features])
        self.dBias_Input_to_H1 = np.zeros([self.num_hidden1, 1])
        self.dWeights_H1_to_output = np.zeros([self.num_output, self.num_hidden1])
        self.dBias_H1_to_output = np.zeros([self.num_output, 1])
    def relU(self, X):
        return np.maximum(X, 0)
    def Para_relU(self, alpha, X):
        return np.maximum(X, alpha * X)
    def Para_deriv_wrt_X(self, alpha, X):
        X[X > 0] = 1
        X[X <= 0] = alpha
        return X
    def Para_deriv_wrt_alpha(self, alpha, X):
        return np.where(X <= 0, alpha * X, 0)
    def deriv(self, X):
        X[X > 0] = 1
        X[X <= 0] = 0
        return X
    def softmax(self, x):
        e = np.exp(x)
        for i in range(e.shape[1]):
            e[:, i] = e[:, i] / np.sum(e[:, i])
        return e
    # TODO: complete implementation for forward pass
    def forward(self, X):
        self.z1 = np.dot(self.Weights_Input_to_H1, X) + self.Bias_Input_to_H1               # hidden pre-activation
        self.a1 = self.Para_relU(self.alpha1, self.z1)                                      # hidden activation (parametric ReLU)
        self.z2 = np.dot(self.Weights_H1_to_output, self.a1) + self.Bias_H1_to_output       # output pre-activation
        self.a2 = self.softmax(self.z2)                                                     # class probabilities
        return self.a2
    # TODO: complete implementation for backpropagation
    # the following Numpy functions may be useful: np.dot, np.sum, np.tanh, numpy.ndarray.T
    def backprop(self, X, t):
        # error term at the output layer
        self.dz2 = (self.a2.reshape(self.num_output, -1) - t.reshape(self.num_output, -1)) / (self.num_output * X.shape[1])
        self.dBias_H1_to_output = np.sum(self.dz2, axis=1, keepdims=True)
        self.dWeights_H1_to_output = np.dot(self.dz2, self.a1.T)
        # error term at the hidden layer and gradient for the PReLU slope alpha1
        self.dz1 = np.dot(self.Weights_H1_to_output.T, self.dz2) * self.Para_deriv_wrt_X(self.alpha1, self.z1)
        self.dalpha1 = np.dot(self.Weights_H1_to_output.T, self.dz2) * self.Para_deriv_wrt_alpha(self.alpha1, self.z1)
        self.dalpha1 = np.sum(self.dalpha1)
        self.dBias_Input_to_H1 = np.sum(self.dz1, axis=1, keepdims=True)
        self.dWeights_Input_to_H1 = np.dot(self.dz1, X.T)
    # TODO: complete implementation for fitting data, and change the existing code if needed
    def fit(self, x_train_data, y_train_data, x_dev_data, y_dev_data):
        for step in range(self.max_epochs):
            self.forward(x_train_data)
            self.backprop(x_train_data, y_train_data)
            self.Bias_H1_to_output = self.Bias_H1_to_output - (self.alpha * self.dBias_H1_to_output)
            self.Weights_H1_to_output = self.Weights_H1_to_output - (self.alpha * self.dWeights_H1_to_output)
            self.Bias_Input_to_H1 = self.Bias_Input_to_H1 - (self.alpha * self.dBias_Input_to_H1)
            self.Weights_Input_to_H1 = self.Weights_Input_to_H1 - (self.alpha * self.dWeights_Input_to_H1)
            self.alpha1 = self.alpha1 - (self.alpha * self.dalpha1)
            if step % 100 == 0:
                self.CCloss = log_loss(np.transpose(y_train_data), np.transpose(self.a2), eps=self._EPSILON, normalize=True)
                self.trainingaccuracy = accuracy_score(np.argmax(y_train_data, axis=0), np.argmax(self.forward(x_train_data), axis=0))
                self.devaccuracy = accuracy_score(np.argmax(y_dev_data, axis=0), np.argmax(self.forward(x_dev_data), axis=0))
                print(f'step: {step}, loss: {self.CCloss:3.150f}')
                print(accuracy_score(np.argmax(y_train_data, axis=0), np.argmax(self.forward(x_train_data), axis=0)))
                print(accuracy_score(np.argmax(y_dev_data, axis=0), np.argmax(self.forward(x_dev_data), axis=0)))
                print(self.dalpha1)
                print(self.alpha1)
                self.loss.append(self.CCloss)
                self.trainingaccur.append(self.trainingaccuracy)
                self.devaccur.append(self.devaccuracy)
    def predict(self, X, y=None):
        self.forward(X)
        if self.num_output > 1:
            y_hat = np.argmax(self.a2, axis=0)
            temp = accuracy_score(y_hat, y)
        else:
            y_hat = np.where(self.a2 > 0.5, 1, 0)
            temp = accuracy_score(y_hat, y)
        return temp, y_hat
I tried to implement the gradient for the parameter of the parametric ReLU, which is dalpha1. However, not only is the value 0 when I print it out, I also get a value very close to 0 when I unit-test the code by debugging it in external cells.
I am not sure where the error is: when I checked the dimensions of each term in the self.dalpha1 expression, everything is as expected. Can someone suggest things I may have overlooked, to point me in the right direction?
I also tried replacing self.dalpha1 in the fit method with the entire np.sum expression, yet the alpha1 parameter is still not updated.
The issue seems to be that Para_deriv_wrt_X also modifies the z1 array that is passed into it: the masked assignments X[X > 0] = 1 and X[X <= 0] = alpha write directly into z1, so by the time Para_deriv_wrt_alpha(self.alpha1, self.z1) is called on the next line, z1 contains only 1s and alpha1s. For any positive alpha1 the condition X <= 0 is then never true, the derivative array is all zeros, and dalpha1 sums to 0.
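A small standalone sketch of that side effect (made-up values, with a typical positive slope alpha1 = 0.01):

import numpy as np

z1 = np.array([[-2.0, 0.5],
               [1.5, -0.3]])
alpha1 = 0.01

# same in-place masking as Para_deriv_wrt_X: X is the same object as z1,
# so the assignments overwrite z1 itself
X = z1
X[X > 0] = 1
X[X <= 0] = alpha1

print(z1)                                  # [[0.01 1.  ] [1.   0.01]] -- the original z1 is gone
print(np.where(z1 <= 0, alpha1 * z1, 0))   # all zeros, so np.sum(...) gives dalpha1 == 0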
By changing the function to use np.where, the z1 array is no longer modified, while the correct array of parametric derivatives with respect to x is still returned.
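A minimal sketch of that np.where version, written as a drop-in replacement for the method in the class above:

    def Para_deriv_wrt_X(self, alpha, X):
        # 1 where x > 0, alpha elsewhere; np.where allocates a new array,
        # so the z1 passed in from backprop is left untouched
        return np.where(X > 0, 1.0, alpha)

Note that the plain deriv helper uses the same in-place masking, so it would mutate its argument in exactly the same way if it were ever called on z1.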