I am trying to build a 5-layer neural network to classify a dataset with 3 classes, 178 instances, and 13 features. I followed the guideline given here, wrote my own code in Matlab, and it runs without errors. However, the training result is very bad: the model keeps predicting the same class for every input. I cannot find what is wrong with my code, or whether the model simply does not fit the data. Could someone help me find where the problem is? Thank you very much.
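For context, the training data X0 and label vector Y0 referenced below are assumed to be arranged like this (an illustrative sketch; only the shapes matter):
% X0: r-by-13 matrix, one training instance per row
% Y0: r-by-1 vector of class labels in {1,2,3}
[r, c] = size(X0); % r = number of training instances (used in the loop below)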
My Matlab training code is shown below:
%% Initialization
numclass = 3; % num of class
c = 13; % num of feature
% for each layer, initialize each weight w and each bias b
% to a small random value near zero
w1 = normrnd(0,0.01,[c,10]); % Input layer -> layer 2 (10 nodes)
b1 = normrnd(0,0.01,[1,10]);
w2 = normrnd(0,0.01,[10,6]); % layer 2 -> layer 3 (6 nodes)
b2 = normrnd(0,0.01,[1,6]);
w3 = normrnd(0,0.01,[6,4]); % layer 3 -> layer 4 (4 nodes)
b3 = normrnd(0,0.01,[1,4]);
w4 = normrnd(0,0.01,[4,numclass]); % layer 4 -> Output layer (3 nodes/class label)
b4 = normrnd(0,0.01,[1,numclass]);
Iter = 0;
lambda = 0.5; % regularization coefficient
%% Batch Training
while Iter < 200
Iter = Iter+1
d_w1 = 0; d_w2 = 0; d_w3 = 0; d_b1 = 0; d_b2 = 0; d_b3 = 0;
d_w4 = 0; d_b4 = 0;
for i = 1:r % r is the number of training instances (rows of X0)
% Forward propagation
a1 = X0(i,:); % X0 is the training data; each row is an instance with 13 features
% Input layer -> Layer 2
z2 = a1*w1+b1;
a2 = sigmoid(z2);
% Layer 2 -> Layer 3
z3 = a2*w2+b2;
a3 = sigmoid(z3);
% Layer 3 -> Layer 4
z4 = a3*w3+b3;
a4 = sigmoid(z4);
% Layer 4 -> Output Layer
z5 = a4*w4+b4;
a5 = sigmoid(z5);
% Backward propagation
y = zeros(1,numclass);
y(Y0(i)) = 1; % Y0 is the training label ({1,2,3} in this case), each element indicates which class the instance belongs to
% Error at the output layer
delta5 = (-(y-a5).*d_sigmoid(z5))';
% Output layer -> Layer 4
delta4 = (w4*delta5).*d_sigmoid(z4');
% Layer 4 -> Layer 3
delta3 = (w3*delta4).*d_sigmoid(z3');
% Layer 3 -> Layer 2
delta2 = (w2*delta3).*d_sigmoid(z2');
% Compute the desired partial derivatives
d_w1 = d_w1 + (delta2*a1)';
d_b1 = d_b1 + delta2';
d_w2 = d_w2 + (delta3*a2)';
d_b2 = d_b2 + delta3';
d_w3 = d_w3 + (delta4*a3)';
d_b3 = d_b3 + delta4';
d_w4 = d_w4 + (delta5*a4)';
d_b4 = d_b4 + delta5';
end
eta = 0.8; % learning rate
% weights and bias updating
w1 = w1 - eta*((1/r*d_w1)+ lambda*w1);
b1 = b1 - eta*1/r*d_b1;
w2 = w2 - eta*((1/r*d_w2)+ lambda*w2);
b2 = b2 - eta*1/r*d_b2;
w3 = w3 - eta*((1/r*d_w3)+ lambda*w3);
b3 = b3 - eta*1/r*d_b3;
w4 = w4 - eta*((1/r*d_w4)+ lambda*w4);
b4 = b4 - eta*1/r*d_b4;
end
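To see whether training makes any progress, the average squared error over the training set can be logged once per iteration; a minimal sketch (my addition for illustration, reusing the variables above, placed at the end of the while loop body):
% Illustrative monitoring: mean squared error over the training set
cost = 0;
for i = 1:r
    out = sigmoid(sigmoid(sigmoid(sigmoid(X0(i,:)*w1+b1)*w2+b2)*w3+b3)*w4+b4);
    t = zeros(1,numclass); t(Y0(i)) = 1; % one-hot target, as in the training loop
    cost = cost + 0.5*sum((t-out).^2);
end
fprintf('Iter %d: avg cost = %f\n', Iter, cost/r);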
The sigmoid and d_sigmoid functions are shown below:
function y = sigmoid(x)
L = 1;  % maximum value
k = 10; % steepness
x0 = 0; % midpoint
y = L./(1+exp(-k*(x-x0)));
end
function y = d_sigmoid(x)
tmp = sigmoid(x);
y = tmp.*(1-tmp);
end
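As a quick correctness check, d_sigmoid can be compared against a centered finite difference (an illustrative sketch, not part of my training code):
% Finite-difference check of d_sigmoid at a sample point
x = 0.3; h = 1e-6;
numeric  = (sigmoid(x+h) - sigmoid(x-h)) / (2*h);
analytic = d_sigmoid(x);
fprintf('numeric: %g, analytic: %g\n', numeric, analytic);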
The prediction code is shown below:
%% Prediction: X1 is testing data, and Y1 is a vector of testing label
[r,c] = size(X1);
for i = 1:r
A1 = X1(i,:);
% Input layer -> Layer 2
Z2 = A1*w1+b1;
A2 = sigmoid(Z2);
% Layer 2 -> Layer 3
Z3 = A2*w2+b2;
A3 = sigmoid(Z3);
% Layer 3 -> Layer 4
Z4 = A3*w3+b3;
A4 = sigmoid(Z4);
% Layer 4 -> Output Layer
Z5 = A4*w4+b4;
A5 = sigmoid(Z5);
[~, pred(i)] = max(A5); % index of the largest output activation
end
num_errors = sum(pred' ~= Y1) % count misclassified instances (avoids shadowing the built-in "error")
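Tabulating the predictions shows the symptom directly: essentially all instances land in a single class (an illustrative check):
% Count how many times each class is predicted
class_counts = accumarray(pred(:), 1, [numclass, 1])'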