
I need your help. I am trying to modify the Python-based neural network for MNIST data classification developed by M. Nielsen [http://neuralnetworksanddeeplearning.com/index.html]. In particular, I am using the network3.py script, which employs the Theano library.

The last layer in this network is a softmax layer, but in the future I want to use this program for regression purposes, so I need to change the last layer to a sigmoid one.

When I simply change the

activation_fn=softmax

to

activation_fn=sigmoid

the program no longer works properly.

The important parts of the code are provided below.

# Initialization of the neural network
net = Network([
               ConvPoolLayer(input_shape=(mini_batch_size, 1, 28, 28),
                             filter_shape=(20, 1, 5, 5),
                             poolsize=(2, 2),
                             activation_fn=ReLU),
               ConvPoolLayer(input_shape=(mini_batch_size, 20, 12, 12),
                             filter_shape=(40, 20, 5, 5),
                             poolsize=(2, 2),
                             activation_fn=ReLU),
               FullyConnectedLayer(n_in=40*4*4, n_out=100, activation_fn=ReLU, p_dropout=0.0),
               SoftmaxLayer(n_in=100, n_out=10, activation_fn=softmax, p_dropout=0.0)],
               mini_batch_size)

...

# Softmax layer
class SoftmaxLayer(object):

    def __init__(self, n_in, n_out, activation_fn, p_dropout):
        self.n_in = n_in
        self.n_out = n_out
        self.activation_fn = activation_fn
        self.p_dropout = p_dropout
        # Initialize weights and biases
        self.w = theano.shared(np.asarray(np.random.normal(loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)),
                                          dtype=theano.config.floatX), name='w', borrow=True)
        self.b = theano.shared(np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)),
                                          dtype=theano.config.floatX), name='b', borrow=True)
        self.params = [self.w, self.b]

    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        self.inpt = inpt.reshape((mini_batch_size, self.n_in))
        self.output = self.activation_fn((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
        self.y_out = T.argmax(self.output, axis=1)  # ??? Change
        self.inpt_dropout = dropout_layer(inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
        self.output_dropout = self.activation_fn(T.dot(self.inpt_dropout, self.w) + self.b)

    # Return the log-likelihood cost
    def cost(self, net):
        return -T.mean(T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y])

    # Return the accuracy for the mini-batch
    def accuracy(self, y):
        return T.mean(T.eq(y, self.y_out))


1 Answer


I made the following modifications:

1) Changed how the targets are represented. Before, each target was a single digit (0, 5, 8, or whatever number the picture should be classified as). Now each target is a vector with 10 elements: 0 becomes [1,0,0,...,0], 5 becomes [0,0,0,0,0,1,0,...,0], and so on. The rest of the code has to be adapted to this one-hot format (minor modifications, e.g. the symbolic target self.y in the Network class becomes a matrix instead of an ivector); a short sketch of the conversion is given after the code below.

2) Changed the layer definition (code is below). The main changes are in cost and accuracy; the corresponding network initialization is also sketched after the code.

# Fully connected layer
class FullyConnectedLayer(object):

    def __init__(self, n_in, n_out, activation_fn, p_dropout):
        self.n_in = n_in
        self.n_out = n_out
        self.activation_fn = activation_fn
        self.p_dropout = p_dropout
        # Initialize weights and biases
        self.w = theano.shared(np.asarray(np.random.normal(loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)),
                                          dtype=theano.config.floatX), name='w', borrow=True)
        self.b = theano.shared(np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)),
                                          dtype=theano.config.floatX), name='b', borrow=True)
        self.params = [self.w, self.b]

    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        self.inpt = inpt.reshape((mini_batch_size, self.n_in))
        self.output = self.activation_fn((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
        self.y_out = self.output  # changed: was T.argmax(self.output, axis=1)
        self.inpt_dropout = dropout_layer(inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
        self.output_dropout = self.activation_fn(T.dot(self.inpt_dropout, self.w) + self.b)

    # Return the cross-entropy cost (changed)
    def cost(self, net):
        xent = -net.y*T.log(self.output_dropout) - (1-net.y)*T.log(1-self.output_dropout)
        return T.mean(xent)

    # Accuracy for the mini-batch
    def accuracy(self, y):
        y_pred = T.argmax(self.y_out, axis=1)
        y_targ = T.argmax(y, axis=1)
        return T.mean(T.eq(y_targ, y_pred))
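
For 1), here is a minimal sketch of the one-hot conversion, assuming the targets start out as a plain array of integer labels (the helper name vectorize_targets and its n_classes parameter are my own, not part of network3.py):

import numpy as np

def vectorize_targets(y, n_classes=10):
    # Convert integer labels (0, 5, 8, ...) into one-hot vectors,
    # e.g. 5 -> [0, 0, 0, 0, 0, 1, 0, 0, 0, 0].
    one_hot = np.zeros((len(y), n_classes), dtype=np.float32)
    one_hot[np.arange(len(y)), y] = 1.0
    return one_hot

# Example: vectorize_targets([0, 5]) returns two 10-element rows with a
# single 1.0 at positions 0 and 5 respectively.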

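With these changes, the last layer in the network initialization becomes an ordinary FullyConnectedLayer with a sigmoid activation instead of the SoftmaxLayer. Roughly (sigmoid being the element-wise Theano sigmoid that network3.py already imports):

net = Network([
               ConvPoolLayer(input_shape=(mini_batch_size, 1, 28, 28),
                             filter_shape=(20, 1, 5, 5),
                             poolsize=(2, 2),
                             activation_fn=ReLU),
               ConvPoolLayer(input_shape=(mini_batch_size, 20, 12, 12),
                             filter_shape=(40, 20, 5, 5),
                             poolsize=(2, 2),
                             activation_fn=ReLU),
               FullyConnectedLayer(n_in=40*4*4, n_out=100, activation_fn=ReLU, p_dropout=0.0),
               FullyConnectedLayer(n_in=100, n_out=10, activation_fn=sigmoid, p_dropout=0.0)],
               mini_batch_size)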