I'm trying to implement the following neural network to approximate an XOR gate, using binary cross-entropy as the cost function. The cost always saturates at around 0.69 (≈ ln 2, the cross-entropy of a constant 0.5 prediction), and the network outputs 0.5 for every input. I've tried varying the number of epochs, the learning rate, and the number of layers, but nothing changes. Where am I going wrong?
import numpy as np
import matplotlib.pyplot as plt

epsilon = 1e-15  # small constant to avoid log(0) and division by zero

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def relu(x):
    return np.maximum(0, x)

def sigmoid_backwards(A):
    # derivative of sigmoid, expressed in terms of the activation A = sigmoid(Z)
    return A * (1 - A)

def relu_backwards(A):
    # derivative mask for ReLU, computed from the activation A
    # (boolean array, which broadcasts as 0/1 when multiplied)
    return A >= 0
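# Illustrative sanity check (my own addition, not part of the training code):
# sigmoid_backwards takes the *activation* A, so evaluated at sigmoid(z0) it
# should match a finite-difference derivative of sigmoid w.r.t. its input.
# z0 and h are made-up values for this check.
z0, h = 0.3, 1e-6
numeric = (sigmoid(z0 + h) - sigmoid(z0 - h)) / (2 * h)
analytic = sigmoid_backwards(sigmoid(z0))
assert abs(numeric - analytic) < 1e-8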
def init_parameters(layer_dims):
    parameters = {}
    L = len(layer_dims)
    for i in range(1, L):
        # small random weights, zero biases
        parameters['W' + str(i)] = np.random.randn(layer_dims[i], layer_dims[i-1]) * 0.001
        parameters['b' + str(i)] = np.zeros((layer_dims[i], 1))
    return parameters
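# Illustrative note: for layer_dims = (2, 3, 3, 1), init_parameters yields
#   W1: (3, 2), b1: (3, 1); W2: (3, 3), b2: (3, 1); W3: (1, 3), b3: (1, 1)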
def forward_pass(X, parameters, g):
    # layer_vals[0] is the input; layer_vals[i] is the activation of layer i
    layer_vals = [X]
    A = X
    L = len(g)
    for i in range(1, L):
        A_prev = A
        Z = np.dot(parameters['W' + str(i)], A_prev) + parameters['b' + str(i)]
        A = activations[g[i]](Z)
        layer_vals.append(A)
    return layer_vals
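# Shape check (illustrative): with X of shape (2, 4) and layer_dims (2, 3, 3, 1),
# layer_vals holds arrays of shapes (2, 4), (3, 4), (3, 4), (1, 4).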
def predict(X, parameters, g):
    # same computation as forward_pass; returns the output layer's activations
    # as a 1-D array of probabilities, one per input column
    return forward_pass(X, parameters, g)[-1][0]
def backward_pass(y_true, layer_vals, parameters, g, learning_rate=0.01):
    m = y_true.shape[1]
    # gradient of binary cross-entropy w.r.t. the output activation
    dA = -y_true / (layer_vals[-1] + epsilon) + (1 - y_true) / (1 - layer_vals[-1] + epsilon)
    for i in range(len(layer_vals) - 1, 0, -1):
        dZ = dA * activations_backwards[g[i]](layer_vals[i])
        dA_prev = np.dot(parameters['W' + str(i)].T, dZ)  # uses W before the update below
        dW = 1/m * np.dot(dZ, layer_vals[i-1].T)
        db = 1/m * np.sum(dZ, axis=1, keepdims=True)
        dA = dA_prev
        parameters['W' + str(i)] -= learning_rate * dW
        parameters['b' + str(i)] -= learning_rate * db
    return parameters
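# Illustrative check with made-up values (A_demo, y_demo): for the sigmoid
# output layer, dA * sigmoid_backwards(A) should reduce algebraically to
# A - y, up to the epsilons.
A_demo = np.array([[0.3, 0.8]])
y_demo = np.array([[0.0, 1.0]])
dA_demo = -y_demo / (A_demo + epsilon) + (1 - y_demo) / (1 - A_demo + epsilon)
assert np.allclose(dA_demo * sigmoid_backwards(A_demo), A_demo - y_demo)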
def compute_cost(y, output):
    # binary cross-entropy, averaged over the m training examples
    m = y.shape[1]
    return -1/m * np.sum(y * np.log(output + epsilon) + (1 - y) * np.log(1 - output + epsilon))
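# Illustrative check: a constant 0.5 output gives cost -log(0.5) = ln 2 ≈ 0.6931,
# which is exactly the plateau value described above.
assert np.isclose(compute_cost(np.array([[0.0, 1.0, 1.0, 0.0]]), np.full((1, 4), 0.5)),
                  np.log(2), atol=1e-3)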
activations = {
    'sigmoid': sigmoid,
    'relu': relu
}
activations_backwards = {
    'sigmoid': sigmoid_backwards,
    'relu': relu_backwards
}
# XOR truth table: inputs as columns, targets as a row vector
X = np.array([[0.0, 0.0, 1.0, 1.0],
              [0.0, 1.0, 0.0, 1.0]], dtype=float)
y = np.array([[0.0, 1.0, 1.0, 0.0]], dtype=float)

layer_dims = (2, 3, 3, 1)
# g = ['linear', 'sigmoid', 'sigmoid', 'sigmoid']
g = ['linear', 'relu', 'relu', 'sigmoid']
epochs = 1000
learning_rate = 0.01
parameters = init_parameters(layer_dims)
layer_vals = forward_pass(X, parameters, g)
costs = []
for i in range(epochs):
    parameters = backward_pass(y, layer_vals, parameters, g, learning_rate=learning_rate)
    layer_vals = forward_pass(X, parameters, g)
    cost = compute_cost(y, layer_vals[-1])
    costs.append(cost)
    if (i + 1) % 10 == 0:
        print(f"After {i+1} epochs at learning rate {learning_rate:.4f}, cost: ", cost)

plt.plot(costs)
plt.show()
print(predict(X, parameters, g))
Here is the cost curve:

[Cost Curve plot]
And the output is always
[0.5 0.5 0.5 0.5]