I've been working with CNNs lately and am having a hard time with what I believe is overfitting. Specifically, even though my training error converges to a minimum, my validation error refuses to drop. The input data I'm using is 512 x 650 x 1 x 4000 (2D data, 4000 samples), and there are only two classes I'm trying to distinguish between (class A and class B). I'm aware that I'll eventually need many more samples, but for now I'd just like to see my validation error decline even a little before I invest in generating more data.
My networks have all been around 60-70 layers deep and include the following types of layers:
Block Example
Convolutional Layers: [3 x 3] filter size, stride [1 x 1], padding [1 1 1 1]
ReLU Layers (non-linearity)
Batch Normalization Layers (a tremendous help to training convergence and training speed)
Max Pooling Layers: [2 x 2] pool size, stride [2 x 2], padding [0 0 0 0]
I then repeat this "block" until my input data is 1 x 1 x N in size, at which point I run it through a few fully connected layers and then into a softmax.
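As a rough sanity check (not part of the network code, and based on the layer parameters in the code below): the 3 x 3 / stride-1 / pad-1 convolutions preserve the spatial size, so only the pooling layers shrink it. Using the standard pooling output size floor((n - pool)/stride) + 1, the nine blocks take 512 x 650 down to 1 x 1:

sz = [512 650];                        % input height x width
for k = 1:7
    sz = floor((sz - 2) ./ [2 2]) + 1; % pools 1-7: 2x2, stride [2 2], pad 0
end
sz = floor((sz - 2) ./ [1 2]) + 1;     % pool 8: 2x2, stride [1 2]
sz = floor((sz - 2) ./ [2 2]) + 1;     % pool 9: 2x2, stride [2 2]
disp(sz)                               % -> [1 1], i.e. 1 x 1 x 512 going into fc1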
My actual MatConvNet code (from my most recent network) is below for inspection, and the output plots are attached; in the plots, blue represents my training error and orange represents my validation error.
My Questions:
1) How does one know what filter sizes to use for their data? I know it's an empirical process, but surely there is some kind of intuition behind it? I've read papers (VGGNet, among others) on using small [3 x 3] filters and a lot of them, but even after designing a 70-layer network with this in mind, there is still no decline in validation error.
2) I have tried dropout layers, given their popularity for reducing overfitting. I first placed dropout layers throughout my network, after the ReLU and pooling layers in the "block" shown above, i.e. between all the convolutional layers. Unfortunately this had no effect on my validation error; it stayed the same. Next I tried using dropout only after the fully connected layers, since that is where the most neurons (or feature maps) are created in my architecture, and still no luck (the placement I mean is sketched below the questions). I've read the dropout paper. Should I give up on using it? Is there once again "a trick" to this?
3) If I try a smaller network (I've read that's a decent way to deal with overfitting), how do I effectively reduce the size of my data? Just max pooling? (A sketch of what I mean is below.)
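For reference on question 2, the placement I tried after the fully connected layer looked roughly like the following. This is a sketch only: the 0.5 rate and the layer names around it are illustrative, not my exact code; dagnn.DropOut is MatConvNet's dropout block.

% Illustrative only: dropout (rate 0.5) inserted after the first fully
% connected layer, before the classifier; layer names are placeholders.
net.addLayer('drop1', dagnn.DropOut('rate', 0.5), {'bn10'}, {'drop1'}, {});
% ...the classifier layer then takes 'drop1' instead of 'bn10' as its input.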
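And for question 3, this is the kind of thing I mean by reducing the size of the data up front, rather than relying only on pooling inside the network. Again a sketch only, assuming imresize from the Image Processing Toolbox is applied to each image before it is handed to the network; the 0.25 scale is arbitrary.

% Sketch only: downsample a single 512 x 650 input before it enters the
% network; 'im' stands for one image as stored in my imdb.
im = imresize(im, 0.25, 'bilinear');   % roughly 512 x 650 -> 128 x 163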
ANY suggestions would be wonderful.
Again, thank you all for reading this long question. I assure you I've done my research, and found that asking here might help me more in the long run.
MatConvNet Code (Matlab Toolbox for CNN Design)
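% Training and solver options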
opts.train.batchSize = 25;
opts.train.numEpochs = 200 ;
opts.train.continue = true ;
opts.train.gpus = [1] ;
opts.train.learningRate = 1e-3;
opts.train.weightDecay = 0.04;
opts.train.momentum = 0.9;
opts.train.expDir = 'epoch_data';
opts.train.numSubBatches = 1;
bopts.useGpu = numel(opts.train.gpus) > 0 ;
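% Load the image database (imdb) containing the 4000 samples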
load('imdb4k.mat');
net = dagnn.DagNN() ;
% Block #1
net.addLayer('conv1', dagnn.Conv('size', [3 3 1 64], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'input'}, {'conv1'}, {'conv1f' 'conv1b'});
net.addLayer('relu1', dagnn.ReLU(), {'conv1'}, {'relu1'}, {});
net.addLayer('bn1', dagnn.BatchNorm('numChannels', 64), {'relu1'}, {'bn1'}, {'bn1f', 'bn1b', 'bn1m'});
net.addLayer('pool1', dagnn.Pooling('method', 'max', 'poolSize', [2, 2], 'stride', [2 2], 'pad', [0 0 0 0]), {'bn1'}, {'pool1'}, {});
% Block #2
net.addLayer('conv2', dagnn.Conv('size', [3 3 64 64], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'pool1'}, {'conv2'}, {'conv2f' 'conv2b'});
net.addLayer('relu2', dagnn.ReLU(), {'conv2'}, {'relu2'}, {});
net.addLayer('bn2', dagnn.BatchNorm('numChannels', 64), {'relu2'}, {'bn2'}, {'bn2f', 'bn2b', 'bn2m'});
net.addLayer('pool2', dagnn.Pooling('method', 'max', 'poolSize', [2, 2], 'stride', [2 2], 'pad', [0 0 0 0]), {'bn2'}, {'pool2'}, {});
% Block #3
net.addLayer('conv3', dagnn.Conv('size', [3 3 64 128], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'pool2'}, {'conv3'}, {'conv3f' 'conv3b'});
net.addLayer('relu3', dagnn.ReLU(), {'conv3'}, {'relu3'}, {});
net.addLayer('bn3', dagnn.BatchNorm('numChannels', 128), {'relu3'}, {'bn3'}, {'bn3f', 'bn3b', 'bn3m'});
net.addLayer('pool3', dagnn.Pooling('method', 'max', 'poolSize', [2, 2], 'stride', [2 2], 'pad', [0 0 0 0]), {'bn3'}, {'pool3'}, {});
% Block #4
net.addLayer('conv4', dagnn.Conv('size', [3 3 128 128], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'pool3'}, {'conv4'}, {'conv4f' 'conv4b'});
net.addLayer('relu4', dagnn.ReLU(), {'conv4'}, {'relu4'}, {});
net.addLayer('bn4', dagnn.BatchNorm('numChannels', 128), {'relu4'}, {'bn4'}, {'bn4f', 'bn4b', 'bn4m'});
net.addLayer('pool4', dagnn.Pooling('method', 'max', 'poolSize', [2, 2], 'stride', [2 2], 'pad', [0 0 0 0]), {'bn4'}, {'pool4'}, {});
% Block #5
net.addLayer('conv5', dagnn.Conv('size', [3 3 128 256], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'pool4'}, {'conv5'}, {'conv5f' 'conv5b'});
net.addLayer('relu5', dagnn.ReLU(), {'conv5'}, {'relu5'}, {});
net.addLayer('bn5', dagnn.BatchNorm('numChannels', 256), {'relu5'}, {'bn5'}, {'bn5f', 'bn5b', 'bn5m'});
net.addLayer('pool5', dagnn.Pooling('method', 'max', 'poolSize', [2, 2], 'stride', [2 2], 'pad', [0 0 0 0]), {'bn5'}, {'pool5'}, {});
% Block #6
net.addLayer('conv6', dagnn.Conv('size', [3 3 256 256], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'pool5'}, {'conv6'}, {'conv6f' 'conv6b'});
net.addLayer('relu6', dagnn.ReLU(), {'conv6'}, {'relu6'}, {});
net.addLayer('bn6', dagnn.BatchNorm('numChannels', 256), {'relu6'}, {'bn6'}, {'bn6f', 'bn6b', 'bn6m'});
net.addLayer('pool6', dagnn.Pooling('method', 'max', 'poolSize', [2, 2], 'stride', [2 2], 'pad', [0 0 0 0]), {'bn6'}, {'pool6'}, {});
% Block #7
net.addLayer('conv7', dagnn.Conv('size', [3 3 256 512], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'pool6'}, {'conv7'}, {'conv7f' 'conv7b'});
net.addLayer('relu7', dagnn.ReLU(), {'conv7'}, {'relu7'}, {});
net.addLayer('bn7', dagnn.BatchNorm('numChannels', 512), {'relu7'}, {'bn7'}, {'bn7f', 'bn7b', 'bn7m'});
net.addLayer('pool7', dagnn.Pooling('method', 'max', 'poolSize', [2, 2], 'stride', [2 2], 'pad', [0 0 0 0]), {'bn7'}, {'pool7'}, {});
% Block #8
net.addLayer('conv8', dagnn.Conv('size', [3 3 512 512], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'pool7'}, {'conv8'}, {'conv8f' 'conv8b'});
net.addLayer('relu8', dagnn.ReLU(), {'conv8'}, {'relu8'}, {});
net.addLayer('bn8', dagnn.BatchNorm('numChannels', 512), {'relu8'}, {'bn8'}, {'bn8f', 'bn8b', 'bn8m'});
net.addLayer('pool8', dagnn.Pooling('method', 'max', 'poolSize', [2, 2], 'stride', [1 2], 'pad', [0 0 0 0]), {'bn8'}, {'pool8'}, {});
% Block #9
net.addLayer('conv9', dagnn.Conv('size', [3 3 512 512], 'hasBias', true, 'stride', [1, 1], 'pad', [1 1 1 1]), {'pool8'}, {'conv9'}, {'conv9f' 'conv9b'});
net.addLayer('relu9', dagnn.ReLU(), {'conv9'}, {'relu9'}, {});
net.addLayer('bn9', dagnn.BatchNorm('numChannels', 512), {'relu9'}, {'bn9'}, {'bn9f', 'bn9b', 'bn9m'});
net.addLayer('pool9', dagnn.Pooling('method', 'max', 'poolSize', [2, 2], 'stride', [2 2], 'pad', [0 0 0 0]), {'bn9'}, {'pool9'}, {});
% Incorporate MLP (fully connected layers implemented as 1 x 1 convolutions)
net.addLayer('fc1', dagnn.Conv('size', [1 1 512 1000], 'hasBias', true, 'stride', [1, 1], 'pad', [0 0 0 0]), {'pool9'}, {'fc1'}, {'conv15f' 'conv15b'});
net.addLayer('relu10', dagnn.ReLU(), {'fc1'}, {'relu10'}, {});
net.addLayer('bn10', dagnn.BatchNorm('numChannels', 1000), {'relu10'}, {'bn10'}, {'bn10f', 'bn10b', 'bn10m'});
net.addLayer('classifier', dagnn.Conv('size', [1 1 1000 2], 'hasBias', true, 'stride', [1, 1], 'pad', [0 0 0 0]), {'bn10'}, {'classifier'}, {'conv16f' 'conv16b'});
net.addLayer('prob', dagnn.SoftMax(), {'classifier'}, {'prob'}, {});
% The dagnn.Loss computes the loss incurred by the prediction scores X given the categorical labels
net.addLayer('objective', dagnn.Loss('loss', 'softmaxlog'), {'prob', 'label'}, {'objective'}, {});
net.addLayer('error', dagnn.Loss('loss', 'classerror'), {'prob', 'label'}, {'error'}, {});