I was attempting to perform multiple linear regression using GSL. The estimator for beta is beta=(X'X)^(-1)X'y. Three approaches were used -
- direct, i.e. calculate the right-hand side of this formula using required operations;
- solve the X'Xb=X'y, i.e. solve the system Ax=b, where A=X'X, b=X'y, and x=b;
- use function gsl_multifit_linear. The test data are from here.
All three approaches gave same wrong result:
beta1=0.0365505, beta2=0.0435827, alpha=0.645627.
The same data file produces correct output in R:
beta1=0.086409, beta2=0.087602, alpha=-4.1.
Results for another dataset (WHO life expectancy data from here) also differed between GSL and R.
Most likely I have a mistake somewhere. But it would be good if at least somebody can confirm that GSL multiple linear regression works correctly.
Edit1. The language is C++. The code for option1, i.e. beta=(X'X)^(-1)X'y
void mols(vector <double> predictandVector,vector <vector <double> > predictorMatrix) {
//the function accepts vector of predictant values and matrix of predictands
//convert input vectors into gsl_vector
int n=predictandVector.size();
int m=predictorMatrix[0].size();
gsl_vector*y=gsl_vector_alloc(n);
gsl_matrix*x=gsl_matrix_alloc(n,m);
for(int i=0;i<n;i++) {
gsl_vector_set(y,i,predictandVector[i]);
for(int j=0;j<m;j++) {
gsl_matrix_set(x,i,j,predictorMatrix[i][j]);
}
}
//multiply X' by X
gsl_matrix*xxTranspose=gsl_matrix_alloc(m,m);
gsl_blas_dgemm(CblasTrans,CblasNoTrans,1.0,x,x,0.0,xxTranspose);
/////////////////////////////////
//perform LU decomposition of xxTranspose to find it's inverse
gsl_permutation*p=gsl_permutation_alloc(m);
int s;
gsl_linalg_LU_decomp(xxTranspose,p,&s);
//////////////////////////////
//compute inverse of xxTranspose matrix
gsl_matrix*xxTransposeInverse=gsl_matrix_alloc(m,m);
gsl_linalg_LU_invert(xxTranspose,p,xxTransposeInverse);
//////////////////////////////
//multiply inverse of xxTranspose matrix by xTranspose
gsl_matrix*xxTransposeInverseXtranspose=gsl_matrix_alloc(m,n);
gsl_blas_dgemm(CblasNoTrans,CblasTrans,1.0,xxTransposeInverse,x,0.0,xxTransposeInverseXtranspose);
//////////////////////////////////////
//multiply matrix (X'X)^(-1)X' and vector y; this must give values of beta
gsl_vector*b=gsl_vector_alloc(m);
gsl_blas_dgemv(CblasNoTrans,1.0,xxTransposeInverseXtranspose,y,0.0,b);
//write beta values to file
FILE*f;
f=fopen("molsResult.dat","w");
gsl_vector_fprintf(f,b,"%g");
////////////////////
}
The code for option2, i.e. solve the system X'Xbeta=X'y
oid mols(vector <double> predictandVector,vector <vector <double> > predictorMatrix) {
//convert input vectors into gsl_vector
int n=predictandVector.size();
int m=predictorMatrix[0].size();
gsl_vector*y=gsl_vector_alloc(n);
gsl_matrix*x=gsl_matrix_alloc(n,m);
for(int i=0;i<n;i++) {
gsl_vector_set(y,i,predictandVector[i]);
for(int j=0;j<m;j++) {
gsl_matrix_set(x,i,j,predictorMatrix[i][j]);
}
}
/////////////////////
//compute X'X
gsl_matrix*xxTranspose=gsl_matrix_alloc(m,m);
gsl_blas_dgemm(CblasTrans,CblasNoTrans,1.0,x,x,0.0,xxTranspose);
/////////////////////////////////
//compute X'y
gsl_vector*XtransposeY=gsl_vector_alloc(m);
gsl_blas_dgemv(CblasTrans,1.0,x,y,0.0,XtransposeY);
//////////////////////////////////////
//solve the linear system X'Xb=X'y to find coefficients beta
gsl_vector*b=gsl_vector_alloc(m);
int k=n;
if(m<n) {
k=m;
}
gsl_vector*tau=gsl_vector_alloc(k);
gsl_linalg_QR_decomp(xxTranspose,tau);
gsl_linalg_QR_solve(xxTranspose,tau,XtransposeY,b);
//write beta values into file
FILE*f;
f=fopen("molsResult.dat","w");
gsl_vector_fprintf(f,b,"%g");
////////////////////////////////
}
The code for option3, i.e. using gsl_multifit_linear.
void mols(vector <double> predictandVector,vector <vector <double> > predictorMatrix) {
//convert input vectors into gsl_vector
int n=predictandVector.size();
int m=predictorMatrix[0].size();
gsl_vector*y=gsl_vector_alloc(n);
gsl_matrix*x=gsl_matrix_alloc(n,m);
for(int i=0;i<n;i++) {
gsl_vector_set(y,i,predictandVector[i]);
for(int j=0;j<m;j++) {
gsl_matrix_set(x,i,j,predictorMatrix[i][j]);
}
}
///////////////////////////
//Use gsl_multifit_linear
gsl_multifit_linear_workspace*w=gsl_multifit_linear_alloc(n,2);
double chisq;
gsl_matrix*cov=gsl_matrix_alloc(m,m);
gsl_vector*b=gsl_vector_alloc(m);
gsl_multifit_linear(x,y,b,cov,&chisq,w);
//////////////////////
//write beta values into file
FILE*f;
f=fopen("molsResult.dat","w");
gsl_vector_fprintf(f,b,"%g");
//////////////////////////
}
Edit2. Also to absolutely assure that input data is read in correctly the following code as added to the option3 code:
FILE*xyWrite;
xyWrite=fopen("molsInputData.dat","w");
for(int i=0;i<n;i++) {
fputs(to_string(gsl_vector_get(y,i)).c_str(),xyWrite);
fputs(" ",xyWrite);
fputs(to_string(gsl_matrix_get(x,i,0)).c_str(),xyWrite);
fputs(" ",xyWrite);
fputs(to_string(gsl_matrix_get(x,i,1)).c_str(),xyWrite);
fputs("\n",xyWrite);
}
This code writes gsl_vector(s) x and y (used as input) into file. Then this file was loaded into R and multiple linear regression performed. The results were correct. This proves that input data is read in correctly.