I am trying to build a neural network from scratch, which means I will only be using numpy and pandas to define my neural network class. I tried adapting some code that ChatGPT gave me, but it was so buggy that I decided to start over. I have some preliminary questions regarding which activation functions to use. The Iris dataset has 5 columns: the first four are numerical values and the last is the iris classification. My neural network will have 3 layers. According to ChatGPT, since the classification task is restricted to 3 possible types, the activation function between the hidden and output layers should be softmax. So my two activation functions will be sigmoid (between input and hidden) and softmax (between hidden and output). Here is the code I've been using so far:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
class NeuralNet():
    def __init__(self, i_dim, h_dim, o_dim, lr):
        self.i_dim = i_dim
        self.h_dim = h_dim
        self.o_dim = o_dim
        self.lr = lr
        self.weights1 = np.random.randn(self.i_dim, self.h_dim) / np.sqrt(self.i_dim)
        self.bias1 = np.zeros((1, self.h_dim))
        self.weights2 = np.random.randn(self.h_dim, self.o_dim) / np.sqrt(self.h_dim)
        self.bias2 = np.zeros((1, self.o_dim))

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def softmax(self, x):
        exps = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exps / np.sum(exps, axis=1, keepdims=True)

    def forward(self, X):
        self.layer1 = self.sigmoid(np.dot(X, self.weights1) + self.bias1)
        self.layer2 = self.softmax(np.dot(self.layer1, self.weights2) + self.bias2)
        return self.layer2

    def sigmoid_derivative(self, x):
        return x * (1 - x)

    def softmax_derivative(self, x):
        s = x.reshape(-1, 1)
        return np.diagflat(s) - np.dot(s, s.T)

    def cross_ent_loss(self, y, y_hat):
        y_reshaped = np.zeros((y.size, y.max() + 1))
        y_reshaped[np.arange(y.size), y] = 1
        sample_losses = -y_reshaped * np.log(y_hat)
        loss = np.mean(sample_losses)
        return loss

    def backward(self, X, y, y_hat):
        y_hat = self.forward(X)
        loss = self.cross_ent_loss(y, y_hat)
        d_softmax = self.softmax_derivative(y_hat)
        d_sigmoid = self.sigmoid_derivative(self.layer1)
        d_weights2 = np.dot(self.layer1.T, (2 * (y - y_hat)[:, np.newaxis, :] * d_softmax))
        d_bias2 = np.sum(2 * (y - y_hat)[:, np.newaxis, :] * d_softmax, axis=0, keepdims=True)
        d_weights1 = np.dot(X.T, (np.dot(2 * (y - y_hat)[:, np.newaxis, :] * d_softmax, self.weights2.T) * d_sigmoid))
        d_bias1 = np.sum(np.dot(2 * (y - y_hat)[:, np.newaxis, :] * d_softmax, self.weights2.T) * d_sigmoid, axis=0)
        self.weights1 -= self.lr * d_weights1
        self.bias1 -= self.lr * d_bias1
        self.weights2 -= self.lr * d_weights2
        self.bias2 -= self.lr * d_bias2

    def train(self, X, y, epochs):
        for epoch in range(epochs):
            y_hat = self.forward(X)
            self.backward(X, y, y_hat)
            loss = self.cross_ent_loss(y, y_hat)
            print(f"Epoch {epoch + 1}: loss = {loss:.4f}")

    def predict(self, X):
        return self.forward(X)
df = pd.read_csv('/Users/eliasbrasildesouza/Downloads/iris.data', header=None)
X_train = df.iloc[:, :4].values
y_train = df.iloc[:, -1].values
le = LabelEncoder()
y_train = le.fit_transform(y_train)
nn = NeuralNet(4, 5, 3, 0.1)
nn.train(X_train, y_train, epochs=1000)
y_pred = nn.predict(X_train)
y_pred_labels = np.argmax(y_pred, axis=1)
print(y_pred)
The error I'm getting is:
d_weights2 = np.dot(self.layer1.T, (2 * (y - y_hat)[:, np.newaxis, :] * d_softmax))
~~^~~~~~~
ValueError: operands could not be broadcast together with shapes (150,) (150,3)
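To try to narrow it down, I added a quick shape check right before the nn.train(...) call in the script above (a minimal sketch, assuming the standard 150-row iris.data file from UCI, so X_train should be (150, 4)):

# Sketch of a shape check inserted before nn.train(X_train, y_train, epochs=1000)
print(X_train.shape)               # expecting (150, 4)
print(y_train.shape)               # expecting (150,)  -- LabelEncoder returns a 1-D integer array
print(nn.forward(X_train).shape)   # expecting (150, 3) -- softmax over the 3 classes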
I'm not sure why y and y_hat end up with incompatible shapes. I have tried working around this in several ways via reshaping, but I'm still getting dimension errors.
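One idea I've been considering is one-hot encoding y up front so it has the same (150, 3) shape as y_hat (just a sketch of the idea; I'm not sure it's the right fix, since cross_ent_loss already builds its own one-hot array internally):

# Sketch: one-hot encode the integer labels so y matches y_hat's (150, 3) shape.
# np.eye(3) picks out one row per label, giving a (150, 3) array of 0s and 1s.
y_onehot = np.eye(3)[y_train]
# Then (y_onehot - y_hat) would at least broadcast, e.g.:
# nn.train(X_train, y_onehot, epochs=1000)

Is this the right direction, or does the backward pass itself need to change?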