I'm a big fan of the YouTube channel 3Blue1Brown, and his series on neural networks really got me excited about the subject. I decided to create my own neural network in Python from scratch, engaging deeply with the mathematics. With the help of the MNIST database of handwritten digits I got started, and after two weeks I succeeded. Since then I have been developing the code further so that the number of neurons and hidden layers can be adjusted neatly within the code, and I have also experimented with different activation functions. The best accuracy I've reached is about 95%, with 2 hidden layers of 16 neurons each and 5 minutes of training.
Now, my question is fairly vague, but I'm looking for the next challenge in this area. Do you have any suggestions?
I now have the framework set up, so I'd love a new type of problem with a bigger dataset, or perhaps I should keep working on my existing problem to push the output accuracy further.
What do you think?
Yours, Emil
(Here's the code if anyone is interested)
import pickle
import gzip
import numpy as np
import random
import time
class mnistClass:
def __init__(self, inputAmount=784, layers=2, layerSize=16, outputSize=10, loops=1, sampleSize=100):
with gzip.open('mnist.pkl.gz', 'rb') as f:
train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
self.A, self.y = train_set
self.V, self.v2 = valid_set
self.dataSize = len(self.A)
self.inputAmount = inputAmount
self.layers = layers
self.layerSize = layerSize
self.outputSize = outputSize
self.loops = loops
self.sampleSize = sampleSize
self.iterations = int(self.dataSize/self.sampleSize)
self.clock = time.time()
self.Weights = []
self.Biases = []
self.initializeArrays()
self.initializeTraining()
print("Accuracy: " + str(self.getAccuracy()) + "%")
def initializeArrays(self):
for i in range(self.layers):
if self.layers - i > 2: #Adding middle layers
self.Weights.append(np.random.rand(self.layerSize, self.layerSize)-0.5)
if self.layers - i > 1:
self.Biases.append(np.random.rand(self.layerSize)-0.5)
if self.layers > 1:
self.Weights.insert(0, np.random.rand(self.layerSize, self.inputAmount)-0.5)
self.Weights.insert(len(self.Weights), np.random.rand(self.outputSize, self.layerSize)-0.5)
else:
self.Weights.insert(len(self.Weights), np.random.rand(self.outputSize, self.inputAmount)-0.5)
self.Biases.insert(len(self.Biases), np.random.rand(self.outputSize)-0.5)
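    # shiftType 0 is the standard logistic sigmoid with range (0, 1);
    # shiftType 1 rescales it to (-1, 1), which is equivalent to tanh(x/2).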
def sigmoid(self, x, shiftType):
if shiftType == 0:
result = 1/(1+np.exp(-x))
elif shiftType == 1:
result = 2 * (1/(1+np.exp(-x))) - 1
return result
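    # Derivatives of the two sigmoid variants above, used when backpropagating through Z + bias.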
def sigmoidPrime(self, x, shiftType):
if shiftType == 0:
            result = self.sigmoid(x, 0) * (1 - self.sigmoid(x, 0)) #sigmoid'(x) = sigmoid(x)(1 - sigmoid(x))
elif shiftType == 1:
result = 2*np.exp(-x)/(1+np.exp(-x))**2
return result
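    # Jacobian of the activations at layer1 with respect to those at layer2,
    # built by chaining the per-layer Jacobians (the chain rule).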
    def Rdependance(self, Z, layer1, layer2): #How R depends on a preceding R
        multi = layer1 - layer2 > 1 #the old 'multi' parameter was always overwritten here, so it is computed locally instead
if not multi:
if layer1 == self.layers-1:
shiftType = 0
else:
shiftType = 1
R1_R2_differential = np.multiply(self.Weights[layer1], self.sigmoidPrime(Z[layer1]+self.Biases[layer1], shiftType)[:, np.newaxis])
result = R1_R2_differential
else:
chainRule = []
for i in reversed(range(layer2, layer1)):
chainRule.append(self.Rdependance(Z, i+1, i))
result = chainRule[0]
for i in range(len(chainRule)-1):
result = np.dot(result, chainRule[i+1])
return result
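    # Jacobian of a layer's activations with respect to its incoming weights: rows are
    # scaled by sigmoidPrime, columns by the previous layer's activations (or the input image for layer 0).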
def RWdependance(self, R, Z, dataCaseNo, layer): #How R depends on connecting Weights
if layer == self.layers-1:
shiftType = 0
else:
shiftType = 1
        R_W_differential = np.ones_like(self.Weights[layer]) #ones matrix; Weights/Weights would divide by zero for any zero-valued weight
mergeW_Z = np.multiply(R_W_differential, self.sigmoidPrime(Z[layer]+self.Biases[layer], shiftType)[:, np.newaxis])
if layer == 0:
R_W_differential = np.multiply(mergeW_Z.T, self.A[dataCaseNo][:, np.newaxis]).T
else:
R_W_differential = np.multiply(mergeW_Z.T, R[layer-1][:, np.newaxis]).T
return R_W_differential
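    # Jacobian of the output activations with respect to the biases at 'layer'.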
def RBdependance(self, Z, layer): #How R depends on internal Biases
if layer == self.layers-1:
shiftType = 0
else:
shiftType = 1
R_B_differential = np.multiply(self.Rdependance(Z, self.layers-1, layer).T, self.sigmoidPrime(Z[layer]+self.Biases[layer], shiftType)[:, np.newaxis]).T
return R_B_differential
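    # Gradient of the quadratic cost with respect to the weights of 'layer' (dC/dW).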
def integralWeightCost(self, R, Z, dataCaseNo, quadDifferential, layer): # Cost of system for weights
if layer == self.layers-1:
nodes = np.identity(self.outputSize)
else:
nodes = self.Rdependance(Z, self.layers-1, layer)
cost_differential = np.multiply(nodes, quadDifferential[:, np.newaxis])
cost_differential = np.sum(cost_differential, 0)
result = np.multiply(self.RWdependance(R, Z, dataCaseNo, layer), cost_differential[:, np.newaxis])
return result
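    # Gradient of the quadratic cost with respect to the biases of 'layer' (dC/dB).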
def integralBiasCost(self, Z, quadDifferential, layer): # Cost of system for biases
if layer == self.layers-1:
nodes = np.identity(self.outputSize)
else:
nodes = self.RBdependance(Z, layer)
cost_differential = np.multiply(nodes, quadDifferential[:, np.newaxis])
result = np.sum(cost_differential, 0)
return result
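    # Mini-batch gradient descent: gradients are averaged over sampleSize examples
    # and applied with an implicit learning rate of 1.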
def initializeTraining(self):
for loop in range(self.loops):
for iteration in range(self.iterations):
avg_cost = 0
avg_deltaWeights = []
avg_deltaBiases = []
                for i in range(len(self.Weights)): #Zero-initialized gradient accumulators
                    avg_deltaWeights.append(np.zeros_like(self.Weights[i]))
                for i in range(len(self.Biases)):
                    avg_deltaBiases.append(np.zeros_like(self.Biases[i]))
for dataCaseNo in range(iteration*self.sampleSize, iteration*self.sampleSize + self.sampleSize):
if self.layers == 1:
shiftType = 0
else:
shiftType = 1
Y1 = np.zeros(self.outputSize)
Y1[self.y[dataCaseNo]] = 1
Z = []
Z.append(np.dot(self.Weights[0], self.A[dataCaseNo]))
R = []
R.append(self.sigmoid(Z[0]+self.Biases[0], shiftType))
for i in range(1, self.layers):
if i == self.layers-1:
shiftType = 0
else:
shiftType = 1
Z.append(np.dot(self.Weights[i], R[i-1]))
R.append(self.sigmoid(Z[i]+self.Biases[i], shiftType))
C = np.sum((R[-1] - Y1)**2)
avg_cost += C
quadDifferential = 2 * (R[-1]-Y1)
for i in range(self.layers):
avg_deltaWeights[i] += self.integralWeightCost(R, Z, dataCaseNo, quadDifferential, i)
avg_deltaBiases[i] += self.integralBiasCost(Z, quadDifferential, i)
avg_cost = avg_cost/self.sampleSize
for i in range(self.layers):
self.Weights[i] = self.Weights[i] - avg_deltaWeights[i]/self.sampleSize
self.Biases[i] = self.Biases[i] - avg_deltaBiases[i]/self.sampleSize
print("Average cost: " + str(round(avg_cost, 4)))
print("\n" + "*"*25 + " " + str(loop+1) +" " + "*"*25 + "\n")
executionEndTime = round((time.time() - self.clock), 2)
print("Completed " + str(self.loops) + " rounds of " + str(self.sampleSize*self.iterations) + " samples (sampleSize: " + str(self.sampleSize) + "), " + " in " + str(executionEndTime) + " seconds..")
print("Layers: " + str(self.layers))
print("Middle layer nodes: " + str(self.layerSize))
print("Input amount: " + str(self.inputAmount))
amountVariables = 0
for i in range(self.layers):
amountVariables += self.Weights[i].size
amountVariables += self.Biases[i].size
print("Variables: " + str(amountVariables))
print("Output size: " + str(self.outputSize))
time.sleep(2)
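    # Forward-propagate 10000 validation images and count correct argmax predictions.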
def getAccuracy(self):
runs = 10000
correct = 0
print("Testing validation set accuracy over " + str(runs) + " samples...\n")
for i in range(runs):
if self.layers == 1:
shiftType = 0
else:
shiftType = 1
            ran = i #deterministic index into the validation set
Y1 = np.zeros(self.outputSize)
Y1[self.v2[ran]] = 1
Z = []
Z.append(np.dot(self.Weights[0], self.V[ran]))
R = []
R.append(self.sigmoid(Z[0]+self.Biases[0], shiftType))
            for j in range(1, self.layers): #'j' avoids shadowing the outer loop variable 'i'
                if j == self.layers-1:
                    shiftType = 0
                else:
                    shiftType = 1
                Z.append(np.dot(self.Weights[j], R[j-1]))
                R.append(self.sigmoid(Z[j]+self.Biases[j], shiftType))
            maxNum = np.argmax(R[-1]) #index of the most activated output neuron
if int(self.v2[ran]) == int(maxNum):
correct += 1
accuracy = correct*100/runs
return accuracy
instance = mnistClass(784, 3, 16, 10, 2, 100)
#(input, layers, layer size, output, loops, sample subsize)
#input - number of nodes in the input data
#layers - number of layers, including the final output layer but not the input layer
#layer size - number of nodes in each hidden layer
#output - number of nodes in the output layer
#loops - how many times to train through the entire data set
#sample subsize - how many samples to average each gradient step over (the mini-batch size)
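#The constructor also makes it easy to try other shapes. For example, a network
#with a single hidden layer of 32 neurons, trained for one pass over the data,
#would be (illustrative values, same constructor signature):
#instance = mnistClass(784, 2, 32, 10, 1, 100)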