for some reason when I use cblas_ddot for dot product between vectors, it takes more time than my own inner product (in the comments you can see my inner product).
My lecturer said that the library should be implemented faster at the CPU level.
My code:
#include <iostream>
#include <iomanip>
#include <cmath>
#include <cblas.h>
#define TYPE double
using namespace std;
TYPE sqrt_root(TYPE val){
TYPE guess = val/2;
for (int i=0;i <=3;i++){
guess = (0.5f)*(guess + val/guess);
}
return guess;
}
template<size_t size>
float L2_Norm(TYPE (&A)[size]){
TYPE sum = 0;
// cout << (sizeof(A)/sizeof(A[0])) << endl;
// v(a,b,c) = sqrt(a^2+b^2+c^2)
for (int i=0;i<=size - 1;i++){
sum += A[i]*A[i];
}
return sqrt_root(sum);
}
template <size_t rows, size_t cols>
void Transpose_Matrix(TYPE (&A)[rows][cols],TYPE (&A_T)[cols][rows]){
for (int r = 0;r<=rows-1;r++){
for(int c=0;c<=cols-1;c++)
A_T[c][r] = A[r][c];
}
}
template <size_t rows, size_t cols>
void QR_Decomp_32f(TYPE (&A)[rows][cols],TYPE (&Q)[rows][cols],TYPE (&R)[cols][cols])
{
TYPE Q_T[cols][rows]; // cols,rows instead rows,cols for transpose
TYPE R_T[cols][cols];
// for (int i =0;i<=rows-1;i++)
// for (int j = 0; j < cols - 1; j++) {
// Q_T[j][i] = 0;
// R_T[j][i] = 0;
// }
TYPE A_T[cols][rows];
Transpose_Matrix(A,A_T);
TYPE a0_norm = L2_Norm(A_T[0]);
// Set q0 = a0/R[0,0] = a0/||a0||
for(int i = 0; i<=rows-1;i++){
Q_T[0][i] = A_T[0][i]/a0_norm;
}
//R[0,0] = ||a0||
R_T[0][0] = a0_norm;
// run on the columns of A and Q but instead of columns we threat them as a rows because of the transpose
for (int i = 1; i<= cols-1;i++){
// qi = ai
for (int k=0; k <= rows-1;k++){
Q_T[i][k] = A_T[i][k];
}
for (int j=0; j <= i-1;j++){
// R[j][i] = qj_t * ai
R_T[j][i] = cblas_ddot(rows,Q_T[j],1,Q_T[i],1);
//Inner product
// for (int s = 0; s<=rows-1;s++){
// R_T[j][i] = R_T[j][i] + Q_T[j][s]*Q_T[i][s];
// }
// qi = qi - R[j][i]*qj
for (int s=0;s<=rows-1;s++){
Q_T[i][s] = Q_T[i][s] - R_T[j][i]*Q_T[j][s];
}
}
// R[i][i] = ||qi||
R_T[i][i] = L2_Norm(Q_T[i]);
// qi = qi/R[i][i]
for (int s=0; s<=rows-1;s++){
Q_T[i][s] = Q_T[i][s]/R_T[i][i];
}
}
Transpose_Matrix(Q_T,Q);
Transpose_Matrix(R_T,R);
}
int main() {
TYPE A[4][3] = {{2,1,2},
{1,-2,1},
{1,2,3},
{1,1,1}};
TYPE Q[4][3];
TYPE R[3][3];
QR_Decomp_32f(A,Q,R);
cout << Q << endl;
for (int r = 0; r<=4-1;r++){
for (int c = 0; c<=3-1;c++){
cout << Q[r][c] << ' ';}
cout<< "" << endl;
}
}
While my code is around 0.001, when I use the cblas_ddot is around 0.019.
I have downloaded the openblas with brew install -vd openblas
and added the cmake -DCMAKE_CXX_FLAGS=-I/usr/local/opt/openblas/include
.
I use Clion to compile and run my code on a MacBook Pro 2019 (Intel CPU i9)
any idea why?