My first neural network used the sigmoid activation function and worked fine. Now I want to switch to a more advanced activation function (ReLU), but with ReLU my network doesn't work at all: it gets 90% errors, whereas with sigmoid it got 4%. I can't find the bug in my code. Help me.
import numpy as np

class NeuralNetwork:
    def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate = 0.1):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate
        self.weights_ih = np.random.normal(0.0, pow(input_nodes, -0.5), (hidden_nodes, input_nodes))
        self.weights_ho = np.random.normal(0.0, pow(hidden_nodes, -0.5), (output_nodes, hidden_nodes))
        self.bias_h = np.random.normal(0.0, pow(1, -0.5), (hidden_nodes, 1))
        self.bias_o = np.random.normal(0.0, pow(1, -0.5), (output_nodes, 1))
    def activation_function(self, x):
        # ReLU: max(0, x)
        return x * (x > 0)
    def activation_function_d(self, x):
        # ReLU derivative: 1 for x >= 0, 0 otherwise
        return 1 * (x >= 0)
    def train(self, inputs_list, targets_list):
        inputs = np.array(inputs_list, ndmin=2).T
        targets = np.array(targets_list, ndmin=2).T
        # Feedforward
        hidden_inputs = np.dot(self.weights_ih, inputs) + self.bias_h
        hidden = self.activation_function(hidden_inputs)
        output_inputs = np.dot(self.weights_ho, hidden) + self.bias_o
        outputs = self.activation_function(output_inputs)
        # Calculate errors
        output_errors = targets - outputs
        hidden_errors = np.dot(self.weights_ho.T, output_errors)
        # Calculate gradients
        output_gradient = output_errors * self.activation_function_d(output_inputs) * self.learning_rate
        hidden_gradient = hidden_errors * self.activation_function_d(hidden_inputs) * self.learning_rate
        # Calculate deltas
        output_deltas = np.dot(output_gradient, hidden.T)
        hidden_deltas = np.dot(hidden_gradient, inputs.T)
        # Adjust weights and biases by deltas and gradients
        self.weights_ho += output_deltas
        self.weights_ih += hidden_deltas
        self.bias_o     += output_gradient
        self.bias_h     += hidden_gradient
    def predict(self, inputs_list):
        inputs = np.array(inputs_list, ndmin=2).T
        hidden = self.activation_function(np.dot(self.weights_ih, inputs) + self.bias_h)
        outputs = self.activation_function(np.dot(self.weights_ho, hidden) + self.bias_o)
        return outputs.flatten().tolist()
And the training code:
with open('mnist_train.csv') as train_file:
    for line in train_file:
        data = [int(value) for value in line.split(',')]
        inputs = data[1:]
        targets = [1 if i == data[0] else 0 for i in range(10)]
        nn.train(inputs, targets)
The last layer should always use a sigmoid (in the binary case), regardless of what activation you use elsewhere in the network.
The sigmoid is used to estimate the probability that an example belongs to a given class, and the prediction for an example is the class it has the highest probability of being in.
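For illustration, here is a minimal, self-contained sketch of that idea (the sigmoid helper and the example scores are mine, not from the question's code):

import numpy as np

def sigmoid(x):
    # Squash raw scores into (0, 1) so they can be read as per-class probabilities
    return 1.0 / (1.0 + np.exp(-x))

scores = np.array([-2.0, 0.5, 3.0])      # raw outputs of the last layer (assumed values)
probs = sigmoid(scores)                  # roughly [0.12, 0.62, 0.95]
predicted_class = int(np.argmax(probs))  # the class with the highest probability -> 2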
To conclude: keep ReLU in the hidden layer, but apply a sigmoid to the output layer. Concretely, in train() compute outputs with a sigmoid instead of self.activation_function(output_inputs), use the sigmoid derivative instead of self.activation_function_d(output_inputs) when computing output_gradient, and apply the sigmoid to the last layer in predict() as well. A sketch of what one training step looks like after that change follows.
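As a rough, self-contained sketch of one training step with ReLU in the hidden layer and a sigmoid on the output layer (the helper names, layer sizes, and random data below are my own assumptions, mirroring the structure of the question's train() method):

import numpy as np

def relu(x):
    return x * (x > 0)

def relu_d(x):
    return 1.0 * (x >= 0)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_d(x):
    s = sigmoid(x)
    return s * (1.0 - s)

rng = np.random.default_rng(0)
inputs = rng.random((784, 1))                    # one fake "image", values in [0, 1)
targets = np.zeros((10, 1)); targets[3] = 1.0    # one-hot target for class 3
weights_ih = rng.normal(0.0, 784 ** -0.5, (100, 784))
weights_ho = rng.normal(0.0, 100 ** -0.5, (10, 100))
bias_h = np.zeros((100, 1))
bias_o = np.zeros((10, 1))
learning_rate = 0.1

# Feedforward: ReLU in the hidden layer, sigmoid on the output layer
hidden_inputs = weights_ih @ inputs + bias_h
hidden = relu(hidden_inputs)
output_inputs = weights_ho @ hidden + bias_o
outputs = sigmoid(output_inputs)

# Backpropagation: each layer uses the derivative of its own activation
output_errors = targets - outputs
hidden_errors = weights_ho.T @ output_errors
output_gradient = output_errors * sigmoid_d(output_inputs) * learning_rate
hidden_gradient = hidden_errors * relu_d(hidden_inputs) * learning_rate

weights_ho += output_gradient @ hidden.T
weights_ih += hidden_gradient @ inputs.T
bias_o += output_gradient
bias_h += hidden_gradient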