I'm implementing a fully connected neural network for MNIST (not convolutional!) and I'm running into a problem. After a few forward and backward passes, the values grow so large that Python overflows and can no longer compute them. It seems to me that I implemented backward_pass incorrectly. Could you help me with this? Here are the network settings:
import numpy as np
from sklearn.metrics import accuracy_score

w_1 = np.random.uniform(-0.5, 0.5, (128, 784))
b_1 = np.random.uniform(-0.5, 0.5, (128, 1))
w_2 = np.random.uniform(-0.5, 0.5, (10, 128))
b_2 = np.random.uniform(-0.5, 0.5, (10, 1))
X_train shape: (784, 31500)
y_train shape: (31500,)
X_test shape: (784, 10500)
y_test shape: (10500,)
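For context, 31500 + 10500 = 42000 samples, i.e. a 75/25 split. A minimal sketch of preprocessing that produces these shapes (placeholder arrays stand in for the actual loading, so this is illustrative, not my exact code):

import numpy as np
from sklearn.model_selection import train_test_split

# Placeholder data: a (42000, 784) pixel array and a (42000,) label vector.
X_all = np.random.randint(0, 256, (42000, 784)).astype(np.float64)
y_all = np.random.randint(0, 10, 42000)

X_all /= 255.0  # scale pixels to [0, 1]
X_tr, X_te, y_train, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=0)

X_train = X_tr.T  # (784, 31500): one column per sample
X_test = X_te.T   # (784, 10500)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)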
def sigmoid(x, alpha):
    return 1 / (1 + np.exp(-alpha * x))

def dx_sigmoid(x, alpha):
    exp_neg_x = np.exp(-alpha * x)
    return alpha * exp_neg_x / ((1 + exp_neg_x)**2)

def ReLU(x):
    return np.maximum(0, x)

def dx_ReLU(x):
    return np.where(x > 0, 1, 0)

def one_hot(y):
    one_hot_y = np.zeros((y.size, y.max() + 1))
    one_hot_y[np.arange(y.size), y] = 1
    one_hot_y = one_hot_y.T
    return one_hot_y
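For reference, one_hot turns a label vector into one column per sample:

y_demo = np.array([3, 0, 9])
print(one_hot(y_demo))        # 1s at rows 3, 0, 9 of the respective columns
print(one_hot(y_demo).shape)  # (10, 3): y_demo.max() + 1 rows, one column per label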
def forward_pass(X, w_1, b_1, w_2, b_2):
    layer_1 = np.dot(w_1, X) + b_1
    layer_1_act = ReLU(layer_1)
    layer_2 = np.dot(w_2, layer_1_act) + b_2
    layer_2_act = sigmoid(layer_2, 0.01)
    return layer_1, layer_1_act, layer_2, layer_2_act
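With the settings above, the shapes coming out of forward_pass are:

l1, l1a, l2, l2a = forward_pass(X_train, w_1, b_1, w_2, b_2)
print(l1.shape, l1a.shape)  # (128, 31500) for both hidden-layer arrays
print(l2.shape, l2a.shape)  # (10, 31500) for both output-layer arrays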
def backward_pass(layer_1, layer_1_act, layer_2, layer_2_act, X, y, w_2):
    one_hot_y = one_hot(y)
    n_samples = one_hot_y.shape[1]
    d_loss_by_layer_2_act = (2 / n_samples) * np.sum(one_hot_y - layer_2_act, axis=1).reshape(-1, 1)
    d_layer_2_act_by_layer_2 = dx_sigmoid(layer_2, 0.01)
    d_loss_by_layer_2 = d_loss_by_layer_2_act * d_layer_2_act_by_layer_2
    d_layer_2_by_w_2 = layer_1_act.T
    d_loss_by_w_2 = np.dot(d_loss_by_layer_2, d_layer_2_by_w_2)
    d_loss_by_b_2 = np.sum(d_loss_by_layer_2, axis=1).reshape(-1, 1)
    d_layer_2_by_layer_1_act = w_2.T
    d_loss_by_layer_1_act = np.dot(d_layer_2_by_layer_1_act, d_loss_by_layer_2)
    d_layer_1_act_by_layer_1 = dx_ReLU(layer_1)
    d_loss_by_layer_1 = d_loss_by_layer_1_act * d_layer_1_act_by_layer_1
    d_layer_1_by_w_1 = X.T
    d_loss_by_w_1 = np.dot(d_loss_by_layer_1, d_layer_1_by_w_1)
    d_loss_by_b_1 = np.sum(d_loss_by_layer_1, axis=1).reshape(-1, 1)
    return d_loss_by_w_1, d_loss_by_b_1, d_loss_by_w_2, d_loss_by_b_2
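A quick check that the gradients come back with the same shapes as the parameters they update:

layer_1, layer_1_act, layer_2, layer_2_act = forward_pass(X_train, w_1, b_1, w_2, b_2)
grads = backward_pass(layer_1, layer_1_act, layer_2, layer_2_act, X_train, y_train, w_2)
for g, p in zip(grads, (w_1, b_1, w_2, b_2)):
    print(g.shape, p.shape)  # (128, 784), (128, 1), (10, 128), (10, 1), each matching its parameter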
for epoch in range(epochs):
    layer_1, layer_1_act, layer_2, layer_2_act = forward_pass(X_train, w_1, b_1, w_2, b_2)
    d_loss_by_w_1, d_loss_by_b_1, d_loss_by_w_2, d_loss_by_b_2 = backward_pass(layer_1, layer_1_act,
                                                                               layer_2, layer_2_act,
                                                                               X_train, y_train,
                                                                               w_2)
    w_1 -= learning_rate * d_loss_by_w_1
    b_1 -= learning_rate * d_loss_by_b_1
    w_2 -= learning_rate * d_loss_by_w_2
    b_2 -= learning_rate * d_loss_by_b_2

    _, _, _, predictions = forward_pass(X_train, w_1, b_1, w_2, b_2)
    predictions = predictions.argmax(axis=0)
    accuracy = accuracy_score(predictions, y_train)
    print(f"epoch: {epoch} / accuracy: {accuracy}")
My loss is MSE: (1 / n_samples) * np.sum((one_hot_y - layer_2_act)**2, axis=0), and the derivatives in backward_pass are my calculations of its gradient.
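Written out with the arrays above, that loss evaluates like this (the final sum is only there to get a single number):

_, _, _, layer_2_act = forward_pass(X_train, w_1, b_1, w_2, b_2)
one_hot_y = one_hot(y_train)
n_samples = one_hot_y.shape[1]
loss_per_sample = (1 / n_samples) * np.sum((one_hot_y - layer_2_act)**2, axis=0)
print(loss_per_sample.shape)  # (31500,), one value per training sample
print(loss_per_sample.sum())  # scalar value for tracking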
I have tried decreasing the learning rate, adding the alpha coefficient to the sigmoid's exponent (e^(-alpha * x)), and dividing the whole dataset by 255, but the program still cannot learn because the numbers become too large.
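To illustrate how the numbers explode, this diagnostic (not part of my training code, just for illustration) prints the largest magnitudes after each backward pass:

# inside the epoch loop, right after the call to backward_pass:
for name, arr in [("layer_2", layer_2),
                  ("d_loss_by_w_1", d_loss_by_w_1),
                  ("d_loss_by_w_2", d_loss_by_w_2)]:
    print(f"{name}: max abs = {np.abs(arr).max():.3e}")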