I tried to implement backpropagation through time (BPTT) manually, but in the end the network isn't converging. I looked around the net for descriptions and courses on BPTT, and as far as I can tell the code does everything accordingly:
- Forward propagation
- Error propagation backwards
- Gradient calculation based on the expected values
- Updating the weights based on the gradient and a learning rate
The way I understand recurrent derivatives is that, in the case of recurrent neural networks, the input from a previous step cannot be treated as a constant. So, for example, the derivative with respect to w1 at a later step depends not only on the input of the current step, but on the previous steps as well. That's why dw1[1] = net_inputs_train[first_sample_index + 1][0]; is incorrect; it needs to be dw1[1] = net_inputs_train[first_sample_index + 1][0] + dw1[0] * w3;.
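To make that explicit in the notation of the code below (x1[t] and x2[t] are just my shorthand for the two inputs at step t, and s[t] for neuron_data[t]), the neuron's output at step t is

s[t] = w1 * x1[t] + w2 * x2[t] + w3 * s[t-1] + b

so differentiating with respect to w1 gives the recurrence

ds[t]/dw1 = x1[t] + w3 * ds[t-1]/dw1

which is what the dw1[...] lines try to accumulate; dw2, dw3 and derb follow the same pattern.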
Everything else is supposed to be "just" backpropagation in the unfolded network. Unfortunately this program simply doesn't work: the error jumps around without the net converging. I don't know what else I could do to make this work; maybe I misunderstood the concept completely...
#include <iostream>
#include <vector>
#include <cmath>
#include <cstdlib> /* srand, rand */
#include <ctime>   /* time */
using namespace std;
int main(int argc, char *argv[]){
srand(time(nullptr));
/* Manual BPTT with one custom implemented Neuron */
double number_of_samples = 3; /* Binary addition dataset */
vector<vector<double>> net_inputs_train = { /* 2 inputs in each step */
{1,1}, {0,0}, {0,0}, /* 100 + 100 = 110 */
{1,0}, {0,1}, {1,0}, /* 101 + 010 = 111*/
{1,0}, {1,1}, {0,0}, /* 110 + 010 = 111 */
};
vector<vector<double>> expected_output = { /* 1 output in each step */
{1}, {1}, {0}, /* 110 */
{1}, {1}, {1}, /* 111 */
{1}, {1}, {1}, /* 111 */
};
double w1 = 0.5;
double w2 = 0.5;
double w3 = 0.5;
double b = 0.0;
vector<double> neuron_data(3,0);
vector<double> neuron_deriv(3,0); /* Neuron error value (partial derivative of the error w.r.t. the output) */
vector<double> dw1(3,0); /* weight derivatives for each unrolled step */
vector<double> dw2(3,0);
vector<double> dw3(3,0);
vector<double> derb(3,0);
int first_sample_index;
double manual_error = 1.0;
double learning_rate = 1e-2;
while(manual_error > learning_rate){
for(int mbIter = 0; mbIter < 4; ++mbIter){
first_sample_index = (rand()%(static_cast<int>(number_of_samples)));
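/* rows first_sample_index .. first_sample_index+2 are used below as one unrolled 3-step sequence */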
/* Fill in the data and derivatives */
neuron_data[0] = (
net_inputs_train[first_sample_index][0] * w1
+ net_inputs_train[first_sample_index][1] * w2
+ b
);
dw1[0] = net_inputs_train[first_sample_index][0];
dw2[0] = net_inputs_train[first_sample_index][1];
dw3[0] = 0;
derb[0] = 1;
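/* step 0 has no recurrent contribution yet, so dw3[0] is just 0 */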
neuron_data[1] = (
net_inputs_train[first_sample_index + 1][0] * w1
+ net_inputs_train[first_sample_index + 1][1] * w2
+ neuron_data[0] * w3
+ b
);
dw1[1] = net_inputs_train[first_sample_index + 1][0] + dw1[0] * w3;
dw2[1] = net_inputs_train[first_sample_index + 1][1] + dw2[0] * w3;
dw3[1] = neuron_data[0] + w3 * dw3[0];
derb[1] = 1 + derb[0] * w3;
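/* step 1: the previous step's derivatives are carried forward through the recurrent weight w3 (chain rule) */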
neuron_data[2] = (
net_inputs_train[first_sample_index + 2][0] * w1
+ net_inputs_train[first_sample_index + 2][1] * w2
+ neuron_data[1] * w3
+ b
);
dw1[2] = net_inputs_train[first_sample_index + 2][0] + dw1[1] * w3;
dw2[2] = net_inputs_train[first_sample_index + 2][1] + dw2[1] * w3;
dw3[2] = neuron_data[1] + w3 * dw3[1];
derb[2] = 1 + derb[1] * w3;
/* Calculate the error and the gradients */
manual_error = (
pow((neuron_data[2] - expected_output[first_sample_index + 2][0]),2)/2.0
+pow((neuron_data[1] - expected_output[first_sample_index + 1][0]),2)/2.0
+pow((neuron_data[0] - expected_output[first_sample_index + 0][0]),2)/2.0
);
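/* total error: halved squared error summed over the three unrolled steps */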
neuron_deriv[2] = (
(-(neuron_data[2] - expected_output[first_sample_index + 2][0])/2.0)
);
neuron_deriv[1] = (
(-(neuron_data[1] - expected_output[first_sample_index + 1][0])/2.0)
+ (w3 * neuron_deriv[2])
);
neuron_deriv[0] = (
(-(neuron_data[0] - expected_output[first_sample_index + 0][0])/2.0)
+ (w3 * neuron_deriv[1])
);
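/* per-step error derivatives; the error of later steps flows back to earlier ones through w3 */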
w1 += (learning_rate * (
neuron_deriv[2] * dw1[2]
+ neuron_deriv[1] * dw1[1]
+ neuron_deriv[0] * dw1[0]
) / number_of_samples);
w2 += (learning_rate * (
neuron_deriv[2] * dw2[2]
+ neuron_deriv[1] * dw2[1]
+ neuron_deriv[0] * dw2[0]
) / number_of_samples);
w3 += (learning_rate * (
neuron_deriv[2] * dw3[2]
+ neuron_deriv[1] * dw3[1]
+ neuron_deriv[0] * dw3[0]
) / number_of_samples);
b += (learning_rate * (
neuron_deriv[2] * derb[2]
+ neuron_deriv[1] * derb[1]
+ neuron_deriv[0] * derb[0]
) / number_of_samples);
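/* apply the accumulated per-step gradients, scaled by the learning rate and divided by number_of_samples */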
std::cout << "\r Error: " << manual_error << " \n";
}
}
return 0;
}
Edit: One interesting thing is that the training converges if w1 += (learning_rate * (...) / number_of_samples);
is switched to w1 += ((...) / number_of_samples);