
I have a computing project where I want my program to recognise handwritten letters and numbers. I started off by creating a program that can recognise numbers from the MNIST dataset, and it works just fine. I used a neural network from Michael Nielsen's book (see the code below for the network). I was wondering how I can now implement EMNIST into this, since MNIST is already part of the NIST and EMNIST data and is laid out in a similar format as well (I think).

import random
import json
import numpy as np
import matplotlib.pyplot as plt
def load(filename):
    """Load a neural network from the file ``filename``.  Returns an
    instance of Network.

    """
    print("loading dataset...")
    f = open(filename, "r")
    data = json.load(f)
    f.close()
    net = Network(data["sizes"])
    net.weights = [np.array(w) for w in data["weights"]]
    net.biases = [np.array(b) for b in data["biases"]]
    return net
class Network(object):

    def __init__(self, sizes):
        """The list ``sizes`` contains the number of neurons in the
        respective layers of the network.  For example, if the list
        was [2, 3, 1] then it would be a three-layer network, with the
        first layer containing 2 neurons, the second layer 3 neurons,
        and the third layer 1 neuron.  The biases and weights for the
        network are initialized randomly, using a Gaussian
        distribution with mean 0, and variance 1.  Note that the first
        layer is assumed to be an input layer, and by convention we
        won't set any biases for those neurons, since biases are only
        ever used in computing the outputs from later layers."""
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                    for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the output of the network if ``a`` is input."""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            test_data=None):
        """Train the neural network using mini-batch stochastic
        gradient descent.  The ``training_data`` is a list of tuples
        ``(x, y)`` representing the training inputs and the desired
        outputs.  The other non-optional parameters are
        self-explanatory.  If ``test_data`` is provided then the
        network will be evaluated against the test data after each
        epoch, and partial progress printed out.  This is useful for
        tracking progress, but slows things down substantially."""

        training_data = list(training_data)
        n = len(training_data)

        if test_data:
            test_data = list(test_data)
            n_test = len(test_data)

        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print("Epoch {} : {} / {}".format(j,self.evaluate(test_data),n_test));
                if j == epochs-1:
                    self.save("model1")
            else:
                print("Epoch {} complete".format(j))

    def update_mini_batch(self, mini_batch, eta):
        """Update the network's weights and biases by applying
        gradient descent using backpropagation to a single mini batch.
        The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta``
        is the learning rate."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [w-(eta/len(mini_batch))*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return a tuple ``(nabla_b, nabla_w)`` representing the
        gradient for the cost function C_x.  ``nabla_b`` and
        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar
        to ``self.biases`` and ``self.weights``."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        # Note that the variable l in the loop below is used a little
        # differently to the notation in Chapter 2 of the book.  Here,
        # l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on.  It's a renumbering of the
        # scheme in the book, used here to take advantage of the fact
        # that Python can use negative indices in lists.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        """Return the number of test inputs for which the neural
        network outputs the correct result. Note that the neural
        network's output is assumed to be the index of whichever
        neuron in the final layer has the highest activation."""
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        r"""Return the vector of partial derivatives \partial C_x /
        \partial a for the output activations."""
        return (output_activations-y)

    def save(self, filename):
        """Save the neural network to the file ``filename``."""
        data = {"sizes": self.sizes,
                "weights": [w.tolist() for w in self.weights],
                "biases": [b.tolist() for b in self.biases]}
        with open(filename, "w") as f:
            json.dump(data, f)

    def test(self, img):
        saveImg = cv2.resize(img, (28, 28))
        gray = rgb2gray(saveImg)
        gray2 = inverte(gray)
        cv2.imshow("orig 28x28", saveImg)
        cv2.imshow("gray 28x28", gray)
        cv2.imshow("inverte 28x28", gray2)
        cv2.imwrite("28x28.jpg", gray2)
        imgInput = np.reshape(gray2, (784, 1))
        result = np.argmax(self.feedforward(imgInput))
        return result
#### Miscellaneous functions
def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    return sigmoid(z)*(1-sigmoid(z))

def rgb2gray(rgb):
    return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])

def inverte(imagem):
    return (255-imagem)

def getImg(img, net):
    result = net.test(img)
    print("Output is: " + str(result))
import mnist_loader #loading the mnist dataset
import cv2
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
import os
if not os.path.exists("model1"):
    print("model1 doesn't exist, so training a new network")
    net = Network([784, 30, 10])
    net.SGD(training_data, 44, 10, 0.5, test_data=test_data)
else:
    print("model1 already exists")
    net = load("model1")    
    print("completed loading")
for x in range(1, 2):
    name = "thick1.jpg"
    print(name)
    image = cv2.imread(name)
    image = cv2.resize(image, (28, 28))  # assign the result; cv2.resize returns a new image rather than resizing in place
    getImg(image, net)
  • The only thing that should realistically change is the number of output classes and the mapping from output classes to letter. – erip Jan 21 '22 at 15:51
  • Thank you for the advice, I thought about this as well, I am a bit confused about how to implement it though. Anyways, thanks – sahilGaikwad Jan 24 '22 at 14:08
  • Please trim your code to make it easier to find your problem. Follow these guidelines to create a [minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example). – Faisal Nazik Jan 31 '22 at 15:49

1 Answer


I think I have managed to solve this problem now. As erip pointed out, the only thing that should really have to change is the number of output classes (plus the mapping from class index to character), and that is true. However, making that change to Michael Nielsen's network gave me an accuracy of less than 10%. This might be down to several things, such as the labels not being mapped correctly or the data not being normalised. Because of all the accuracy problems I kept running into, I resorted to using TensorFlow, which works effortlessly for me, reaching about 80% accuracy on the test set and recognising my own handwriting reliably.
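
For reference, the minimal change to the network from the question would just be widening the output layer from 10 to 47 neurons (EMNIST Balanced has 47 classes) and one-hot encoding the labels to match. Here is a rough sketch of that idea, reusing the Network class above; the variable emnist_training_pairs is only a placeholder for EMNIST images that have already been loaded as (784, 1) column vectors paired with integer labels:

# Sketch only: adapt the Nielsen-style network above to EMNIST Balanced.
# ``emnist_training_pairs`` is assumed to already hold (x, y) tuples where
# x is a (784, 1) column vector of pixel values in [0, 1] and y is an
# integer class label between 0 and 46.

def vectorized_result(j, num_classes=47):
    """Return a one-hot (num_classes, 1) column vector for label ``j``."""
    e = np.zeros((num_classes, 1))
    e[int(j)] = 1.0
    return e

training_pairs = [(x, vectorized_result(y)) for x, y in emnist_training_pairs]
net = Network([784, 30, 47])  # 47 output neurons instead of 10
net.SGD(training_pairs, 30, 10, 0.5)

In practice, though, this is the TensorFlow implementation I went with: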

import pandas as pd
import numpy as np
training_data = pd.read_csv("emnist-balanced-train.csv") # reading training data from csv file using pandas module
testing_data = pd.read_csv("emnist-balanced-test.csv") # reading testing data from csv file using pandas module
#size of training_data is (88799, 785)
#size of testing_data is (14799, 785)
train_data = np.array(training_data, dtype = np.float64) # convert the training DataFrame to a NumPy array so individual rows and values can be indexed directly
test_data = np.array(testing_data, dtype = np.float64) # convert the testing DataFrame to a NumPy array so individual rows and values can be indexed directly
train_labels = []
for x in range(0, len(train_data)):
    train_labels.append(train_data[x][0]) # I add the first element of train_data[x] (the label) to train_labels
test_labels = []
for x in range(0, len(test_data)):
    test_labels.append(test_data[x][0]) # I add the first element of test_data[x] (the label) to test_labels
train_data = train_data/255.0 # scale the pixel values to [0, 1]; the label column is divided too, but the labels have already been copied out above
test_data = test_data/255.0
new_train_data = []
for x in range(0, len(train_data)):
    img = np.resize(train_data[x][1:], (28,28)) # reshape the EMNIST row from (784,) to (28, 28) so np.fliplr() and np.rot90() can be applied
    img_flip = np.fliplr(img) # flip the image horizontally
    img_rotate = np.rot90(img_flip) # then rotate it 90 degrees; together these correct the orientation, since EMNIST stores images transposed compared to MNIST
    new_train_data.append(img_rotate) # append the corrected image to new_train_data
new_test_data = []
for x in range(0, len(test_data)):
    img = np.resize(test_data[x][1:], (28,28)) # reshape the EMNIST row from (784,) to (28, 28) so np.fliplr() and np.rot90() can be applied
    img_flip = np.fliplr(img) # flip the image horizontally
    img_rotate = np.rot90(img_flip) # then rotate it 90 degrees to correct the orientation, as above
    new_test_data.append(img_rotate) # append the corrected image to new_test_data
import tensorflow as tf
model = tf.keras.Sequential([ 
    tf.keras.layers.Conv2D(32,3, input_shape=(28,28, 1)),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Flatten(input_shape=(28,28, 1)),
    tf.keras.layers.Dense(512,activation='relu'),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dense(47,activation='softmax')
])
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
new_train_data = np.array(new_train_data, dtype = np.float64) # convert new_train_data from a list to a NumPy array, which is what model.fit expects
new_train_data = np.resize(new_train_data, (len(new_train_data), 28, 28, 1)) # the CNN's input shape is (28, 28, 1), so add the trailing channel dimension to the training data
train_labels = np.resize(train_labels, (len(train_labels), 1)) # reshape the training labels from a flat list into a column of single-number labels
new_test_data = np.array(new_test_data, dtype = np.float64) # convert new_test_data from a list to a NumPy array as well
new_test_data = np.resize(new_test_data, (len(new_test_data), 28, 28, 1)) # add the trailing channel dimension to the test data too
test_labels = np.resize(test_labels, (len(test_labels), 1)) # reshape the testing labels into a column of single-number labels
train_labels = np.array(train_labels, np.uint8)
test_labels = np.array(test_labels, np.uint8)
history = model.fit(new_train_data, train_labels,  epochs = 50, validation_data=(new_test_data, test_labels))
model.save("final_CNN")
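
To actually use the saved model afterwards, you can load it back and map the predicted class index to a character. Below is a minimal sketch; the class-to-character string is my assumption of the usual EMNIST Balanced ordering (digits, uppercase letters, then the unmerged lowercase letters), so check it against the emnist-balanced-mapping.txt file that comes with the dataset:

import numpy as np
import tensorflow as tf

# Assumed EMNIST Balanced class order; verify against emnist-balanced-mapping.txt
BALANCED_CLASSES = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabdefghnqrt"

model = tf.keras.models.load_model("final_CNN")

def predict_character(img):
    """img: a (28, 28) grayscale array scaled to [0, 1] and oriented the
    same way as the flipped/rotated training images above."""
    x = np.array(img, dtype=np.float64).reshape(1, 28, 28, 1)
    probs = model.predict(x)
    return BALANCED_CLASSES[int(np.argmax(probs))]

For example, predict_character(new_test_data[0].reshape(28, 28)) should give the character whose class index is stored in test_labels[0].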

I have uploaded the csv files "emnist-balanced-train.csv" and "emnist-balanced-test.csv" to my GitHub: https://github.com/SahilG25/MachineLearningHandwritingRecognitionTensorflow

You can also check out the GUI I made on my GitHub, which demonstrates the neural network recognising handwriting drawn by the user.