I have created a neural network with one hidden layer, using parametric ReLU as the activation for the hidden layer.
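
For reference, the standard parametric ReLU and its derivatives with respect to x and alpha are (assuming 0 < alpha1 < 1, so that max(x, alpha*x) matches the piecewise form):

$$
\mathrm{PReLU}_\alpha(x) = \max(x,\ \alpha x) = \begin{cases} x, & x > 0 \\ \alpha x, & x \le 0 \end{cases}
\qquad
\frac{\partial}{\partial x}\mathrm{PReLU}_\alpha(x) = \begin{cases} 1, & x > 0 \\ \alpha, & x \le 0 \end{cases}
\qquad
\frac{\partial}{\partial \alpha}\mathrm{PReLU}_\alpha(x) = \begin{cases} 0, & x > 0 \\ x, & x \le 0 \end{cases}
$$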

import numpy as np
from scipy.special import expit as sigmoid
from scipy.special import softmax as sm
import pandas as pd
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from math import sqrt
from math import log

class NeuralNet:
    def __init__(self, num_features, num_hidden1 ,alpha,alpha1, max_epochs, num_output, _EPSILON):
        super().__init__()
        self.num_features=num_features  # number of input nodes (features)
        self.num_hidden1=num_hidden1  # number of hidden nodes for 1st hidden layer
        self.alpha=alpha  # learning rate
        self.alpha1=alpha1 # alpha for para RelU
        self.max_epochs=max_epochs # maximum number of epochs
        self.num_output=num_output # number of output nodes
        self._EPSILON=_EPSILON
        self.loss = [] #list to store losses per 100 epochs 
        self.trainingaccur=[] # list to store training accuracy per 100 epochs 
        self.devaccur=[]
        self.Weights_Input_to_H1=np.random.randn(self.num_hidden1, self.num_features)*(0.1)
        self.Bias_Input_to_H1=np.zeros([self.num_hidden1,1])
        self.Weights_H1_to_output=np.random.randn(self.num_output, self.num_hidden1)*(0.1)
        self.Bias_H1_to_output=np.zeros([self.num_output,1])
        self.dWeights_Input_to_H1=np.zeros([self.num_hidden1, self.num_features])
        self.dBias_Input_to_H1=np.zeros([self.num_hidden1,1])
        self.dWeights_H1_to_output=np.zeros([self.num_output, self.num_hidden1])
        self.dBias_H1_to_output=np.zeros([self.num_output,1])
        
        

        
    
    def relU(self,X):
        return np.maximum(X, 0)

    def Para_relU(self,alpha,X):
        return np.maximum(X,alpha*X)

    def Para_deriv_wrt_X(self,alpha,X):
        X[X>0]=1
        X[X<=0]=alpha

        return X
        

    def Para_deriv_wrt_alpha(self,alpha,X):
        return np.where(X<=0,alpha*X,0)

        
        

    def deriv(self,X):
        X[X>0]=1
        X[X<=0]=0
        
        return X
        
        


    
    def softmax(self,x):
        e=np.exp(x)
        for i in range(e.shape[1]):
            e[:,i]=e[:,i]/np.sum(e[:,i])
        return e

    
    

    
        
    # TODO: complete implementation for forward pass
    def forward(self, X):
        self.z1=np.dot((self.Weights_Input_to_H1),(X))+self.Bias_Input_to_H1
        self.a1=self.Para_relU(self.alpha1,self.z1)
        self.z2=np.dot((self.Weights_H1_to_output),(self.a1))+self.Bias_H1_to_output
        self.a2=self.softmax((self.z2))
        return self.a2
        
        
        
    
    # TODO: complete implementation for backpropagation
    # the following Numpy functions may be useful: np.dot, np.sum, np.tanh, numpy.ndarray.T
    def backprop(self, X, t):
      
        self.dz2=(self.a2.reshape(self.num_output,-1)-t.reshape(self.num_output,-1))/((self.num_output)*(X.shape[1]))
        self.dBias_H1_to_output=np.sum(self.dz2,axis=1,keepdims=True)
        self.dWeights_H1_to_output=np.dot((self.dz2),self.a1.T)
        self.dz1=(np.dot(self.Weights_H1_to_output.T,self.dz2)) * (self.Para_deriv_wrt_X(self.alpha1,self.z1))
        self.dalpha1=(np.dot(self.Weights_H1_to_output.T,self.dz2)) * (self.Para_deriv_wrt_alpha(self.alpha1,self.z1))
        self.dalpha1=np.sum(self.dalpha1)
        self.dBias_Input_to_H1=np.sum(self.dz1,axis=1,keepdims=True)
        self.dWeights_Input_to_H1=np.dot((self.dz1),X.T)
    
    #TODO: complete implementation for fitting data, and change the existing code if needed
    def fit(self, x_train_data, y_train_data,x_dev_data,y_dev_data):
       
        
        
        for step in range(self.max_epochs):
            self.forward(x_train_data)
            self.backprop(x_train_data, y_train_data)
            self.Bias_H1_to_output=self.Bias_H1_to_output-((self.alpha)*(self.dBias_H1_to_output))
            self.Weights_H1_to_output=self.Weights_H1_to_output-((self.alpha)*(self.dWeights_H1_to_output))
            self.Bias_Input_to_H1=self.Bias_Input_to_H1-((self.alpha)*(self.dBias_Input_to_H1))
            self.Weights_Input_to_H1=self.Weights_Input_to_H1-((self.alpha)*(self.dWeights_Input_to_H1))
            self.alpha1=self.alpha1-((self.alpha)*(self.dalpha1))
            


            if step % 100 == 0:
                self.CCloss=log_loss(np.transpose(y_train_data),np.transpose(self.a2),eps=self._EPSILON,normalize=True)
                self.trainingaccuracy=accuracy_score(np.argmax(y_train_data,axis=0),np.argmax(self.forward(x_train_data),axis=0))
                self.devaccuracy=accuracy_score(np.argmax(y_dev_data,axis=0),np.argmax(self.forward(x_dev_data),axis=0))
                print(f'step: {step},  loss: {self.CCloss:3.150f}') 
                print(accuracy_score(np.argmax(y_train_data,axis=0),np.argmax(self.forward(x_train_data),axis=0)))
                print(accuracy_score(np.argmax(y_dev_data,axis=0),np.argmax(self.forward(x_dev_data),axis=0)))
                print(self.dalpha1)
                print(self.alpha1)
                self.loss.append(self.CCloss)
                self.trainingaccur.append(self.trainingaccuracy)
                self.devaccur.append(self.devaccuracy)
                
              
            
            
    def predict(self,X,y=None):
        self.forward(X)
        if(self.num_output>1):
            y_hat=np.argmax(self.a2, axis=0)
            temp=accuracy_score(y_hat,y)
        else:
            y_hat=np.where(self.a2>0.5,1,0)
            temp=accuracy_score(y_hat,y)
        return temp,y_hat

I tried to implement the gradient for the parameter of the parametric ReLU, which is dalpha1. However, not only is the value 0 when I print it out, I am also getting a value very close to 0 when I unit test the code by debugging in external cells.

I am not sure where the error is: when I checked the dimensions of each term in the self.dalpha1 expression, everything was as expected. Can someone suggest what I may have overlooked, to point me in the right direction?

I also tried replacing self.dalpha1 in the fit method with the entire np.sum expression, yet the alpha1 parameter is still not updated.
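
For context, a rough way to sanity-check dalpha1 numerically is a central finite difference on alpha1. The sketch below is only a check to run in an external cell (a hypothetical helper, not part of the class); it assumes the loss being optimised is the mean categorical cross-entropy implied by the (a2 - t)/(num_output * num_samples) term in backprop, with X laid out as (features x samples) and t as one-hot (outputs x samples):

import numpy as np

def check_dalpha1(net, X, t, eps=1e-6):
    """Compare the analytic dalpha1 from backprop against a numerical estimate.
    net is a NeuralNet instance, X is (num_features, num_samples), t is one-hot (num_output, num_samples)."""
    net.forward(X)
    net.backprop(X, t)
    analytic = net.dalpha1            # scalar computed in backprop

    def loss_at(alpha_value):
        # evaluate the mean cross-entropy with alpha1 temporarily set to alpha_value
        old = net.alpha1
        net.alpha1 = alpha_value
        p = net.forward(X)
        net.alpha1 = old
        return -np.sum(t * np.log(p + 1e-12)) / (net.num_output * X.shape[1])

    centre = net.alpha1
    numeric = (loss_at(centre + eps) - loss_at(centre - eps)) / (2 * eps)
    return analytic, numeric          # these should roughly agree if dalpha1 is correct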


There is 1 answer below.

Answer by Yuan Zhi Lee:

The issue seems to be that the Para_deriv_wrt_X function also modifies the z1 array that is passed into it: the boolean-mask assignments overwrite z1 in place.

By changing the function to use np.where, z1 is left unchanged while the function still returns the correct array of derivatives of the parametric ReLU with respect to x.
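
A minimal sketch of the difference, with made-up values rather than anything from the original code:

import numpy as np

alpha = 0.01                                  # illustrative value for alpha1
z1 = np.array([[-2.0, 3.0], [0.5, -1.0]])     # pretend pre-activations

# original approach: the boolean-mask assignments rewrite the array that was passed in
deriv = z1                                    # the method receives z1 by reference
deriv[deriv > 0] = 1
deriv[deriv <= 0] = alpha
print(z1)   # [[0.01 1.  ]
            #  [1.   0.01]]   -> the real pre-activations are gone

# np.where approach: a new array is returned and z1 stays intact
z1 = np.array([[-2.0, 3.0], [0.5, -1.0]])
deriv = np.where(z1 <= 0, alpha, 1)
print(z1)   # [[-2.   3. ]
            #  [ 0.5 -1. ]]

Since backprop calls Para_deriv_wrt_X before Para_deriv_wrt_alpha, the overwritten z1 contains only 1s and alphas, which are all positive for a positive alpha1, so the X<=0 branch of Para_deriv_wrt_alpha never fires and dalpha1 comes out as exactly 0, matching what the question observed.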



import numpy as np
from scipy.special import expit as sigmoid
from scipy.special import softmax as sm
import pandas as pd
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from math import sqrt
from math import log

class NeuralNet:
    def __init__(self, num_features, num_hidden1 ,alpha,alpha1, max_epochs, num_output, _EPSILON):
        super().__init__()
        self.num_features=num_features  # number of input nodes (features)
        self.num_hidden1=num_hidden1  # number of hidden nodes for 1st hidden layer
        self.alpha=alpha  # learning rate
        self.alpha1=alpha1 # alpha for para RelU
        self.max_epochs=max_epochs # maximum number of epochs
        self.num_output=num_output # number of output nodes
        self._EPSILON=_EPSILON
        self.loss = [] #list to store losses per 100 epochs 
        self.trainingaccur=[] # list to store training accuracy per 100 epochs 
        self.devaccur=[]
        self.Weights_Input_to_H1=np.random.randn(self.num_hidden1, self.num_features)*(0.1)
        self.Bias_Input_to_H1=np.zeros([self.num_hidden1,1])
        self.Weights_H1_to_output=np.random.randn(self.num_output, self.num_hidden1)*(0.1)
        self.Bias_H1_to_output=np.zeros([self.num_output,1])
        self.dWeights_Input_to_H1=np.zeros([self.num_hidden1, self.num_features])
        self.dBias_Input_to_H1=np.zeros([self.num_hidden1,1])
        self.dWeights_H1_to_output=np.zeros([self.num_output, self.num_hidden1])
        self.dBias_H1_to_output=np.zeros([self.num_output,1])
        
        

        
    
    def relU(self,X):
        return np.maximum(X, 0)

    def Para_relU(self,alpha,X):
        return np.maximum(X,alpha*X)

    def Para_deriv_wrt_X(self,alpha,X):
        # derivative of PReLU w.r.t. its input: alpha where x <= 0, else 1
        # np.where returns a new array, so the z1 passed in is not modified
        return np.where(X<=0,alpha,1)


    def Para_deriv_wrt_alpha(self,alpha,X):
        # derivative of PReLU w.r.t. alpha: x where x <= 0, else 0
        return np.where(X<=0,X,0)

        
        

    def deriv(self,X):
        return np.where(X<=0,0,1)
        
        
        


    
    def softmax(self,x):
        e=np.exp(x)
        for i in range(e.shape[1]):
            e[:,i]=e[:,i]/np.sum(e[:,i])
        return e

    
    

    
        
    # TODO: complete implementation for forward pass
    def forward(self, X):
        self.z1=np.dot((self.Weights_Input_to_H1),(X))+self.Bias_Input_to_H1
        self.a1=self.Para_relU(self.alpha1,self.z1)
        self.z2=np.dot((self.Weights_H1_to_output),(self.a1))+self.Bias_H1_to_output
        self.a2=self.softmax((self.z2))
        return self.a2
        
        
        
    
    # TODO: complete implementation for backpropagation
    # the following Numpy functions may be useful: np.dot, np.sum, np.tanh, numpy.ndarray.T
    def backprop(self, X, t):
      
        self.dz2=(self.a2.reshape(self.num_output,-1)-t.reshape(self.num_output,-1))/((self.num_output)*(X.shape[1]))
        self.dBias_H1_to_output=np.sum(self.dz2,axis=1,keepdims=True)
        self.dWeights_H1_to_output=np.dot((self.dz2),self.a1.T)
        self.dz1=(np.dot(self.Weights_H1_to_output.T,self.dz2)) * (self.Para_deriv_wrt_X(self.alpha1,self.z1))
        # per-element contributions to dL/dalpha1, summed over all hidden units and samples into a scalar
        self.dalpha1=(np.dot(self.Weights_H1_to_output.T,self.dz2)) * (self.Para_deriv_wrt_alpha(self.alpha1,self.z1))
        self.dalpha1_scalar=np.sum(self.dalpha1)
        self.dBias_Input_to_H1=np.sum(self.dz1,axis=1,keepdims=True)
        self.dWeights_Input_to_H1=np.dot((self.dz1),X.T)
    
    #TODO: complete implementation for fitting data, and change the existing code if needed
    def fit(self, x_train_data, y_train_data,x_dev_data,y_dev_data):
       
        
        
        for step in range(self.max_epochs):
            self.forward(x_train_data)
            self.backprop(x_train_data, y_train_data)
            self.Bias_H1_to_output=self.Bias_H1_to_output-((self.alpha)*(self.dBias_H1_to_output))
            self.Weights_H1_to_output=self.Weights_H1_to_output-((self.alpha)*(self.dWeights_H1_to_output))
            self.Bias_Input_to_H1=self.Bias_Input_to_H1-((self.alpha)*(self.dBias_Input_to_H1))
            self.Weights_Input_to_H1=self.Weights_Input_to_H1-((self.alpha)*(self.dWeights_Input_to_H1))
            self.alpha1=self.alpha1-((self.alpha)*(self.dalpha1_scalar))
            

            if step % 100 == 0:
                self.CCloss=log_loss(np.transpose(y_train_data),np.transpose(self.a2),eps=self._EPSILON,normalize=True)
                self.trainingaccuracy=accuracy_score(np.argmax(y_train_data,axis=0),np.argmax(self.forward(x_train_data),axis=0))
                self.devaccuracy=accuracy_score(np.argmax(y_dev_data,axis=0),np.argmax(self.forward(x_dev_data),axis=0))
                print(f'step: {step},  loss: {self.CCloss:3.150f}') 
                print(accuracy_score(np.argmax(y_train_data,axis=0),np.argmax(self.forward(x_train_data),axis=0)))
                print(accuracy_score(np.argmax(y_dev_data,axis=0),np.argmax(self.forward(x_dev_data),axis=0)))
                self.loss.append(self.CCloss)
                self.trainingaccur.append(self.trainingaccuracy)
                self.devaccur.append(self.devaccuracy)
                print(self.dalpha1)
                print(self.dalpha1_scalar)
                print(self.alpha1)

    def predict(self,X,y=None):
        self.forward(X)
        if(self.num_output>1):
            y_hat=np.argmax(self.a2, axis=0)
            temp=accuracy_score(y_hat,y)
        else:
            y_hat=np.where(self.a2>0.5,1,0)
            temp=accuracy_score(y_hat,y)
        return temp,y_hat
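
For completeness, a minimal usage sketch (the data, shapes, and hyperparameter values here are illustrative assumptions, not from the original post). The class expects X as a (num_features, num_samples) array and the labels as a one-hot (num_output, num_samples) matrix, since forward multiplies the weights by X from the left and fit/predict take argmax along axis 0. Note that the eps argument of log_loss in fit assumes an older scikit-learn release, as it was removed in recent versions.

import numpy as np

rng = np.random.default_rng(0)

# toy data: 4 features, 3 classes, 200 training and 50 dev samples (illustrative only)
X_train = rng.standard_normal((4, 200))
X_dev = rng.standard_normal((4, 50))

def one_hot(labels, num_classes):
    out = np.zeros((num_classes, labels.size))
    out[labels, np.arange(labels.size)] = 1
    return out

y_train = one_hot(rng.integers(0, 3, 200), 3)
y_dev = one_hot(rng.integers(0, 3, 50), 3)

net = NeuralNet(num_features=4, num_hidden1=16, alpha=0.1, alpha1=0.01,
                max_epochs=1000, num_output=3, _EPSILON=1e-15)
net.fit(X_train, y_train, X_dev, y_dev)

dev_acc, dev_pred = net.predict(X_dev, np.argmax(y_dev, axis=0))
print(dev_acc)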