cblas_ddot is slower than my own dot product implementation

Question

for some reason when I use cblas_ddot for dot product between vectors, it takes more time than my own inner product (in the comments you can see my inner product).

My lecturer said that the library should be implemented faster at the CPU level.

My code:

#include <iostream>
#include <iomanip>
#include <cmath>
#include <cblas.h>
#define TYPE double
using namespace std;

TYPE sqrt_root(TYPE val){
    TYPE guess = val/2;
    for (int i=0;i <=3;i++){
        guess = (0.5f)*(guess + val/guess);
    }
    return guess;
}

template<size_t size>
float L2_Norm(TYPE (&A)[size]){
    TYPE sum = 0;
//    cout << (sizeof(A)/sizeof(A[0])) << endl;
    // v(a,b,c) = sqrt(a^2+b^2+c^2)
    for (int i=0;i<=size - 1;i++){
        sum += A[i]*A[i];
    }
    return sqrt_root(sum);
}
template <size_t rows, size_t cols>
void Transpose_Matrix(TYPE (&A)[rows][cols],TYPE (&A_T)[cols][rows]){
    for (int r = 0;r<=rows-1;r++){
        for(int c=0;c<=cols-1;c++)
            A_T[c][r] = A[r][c];
    }
}

template <size_t rows, size_t cols>
void QR_Decomp_32f(TYPE (&A)[rows][cols],TYPE (&Q)[rows][cols],TYPE (&R)[cols][cols])
{
    TYPE Q_T[cols][rows]; // cols,rows instead rows,cols for transpose
    TYPE R_T[cols][cols];
//    for (int i =0;i<=rows-1;i++)
//        for (int j = 0; j < cols - 1; j++) {
//            Q_T[j][i] = 0;
//            R_T[j][i] = 0;
//        }

    TYPE A_T[cols][rows];
    Transpose_Matrix(A,A_T);
    TYPE a0_norm = L2_Norm(A_T[0]);
    //    Set q0 = a0/R[0,0] = a0/||a0||
    for(int i = 0; i<=rows-1;i++){
        Q_T[0][i] = A_T[0][i]/a0_norm;
    }
    //R[0,0] = ||a0||
    R_T[0][0] = a0_norm;
    // run on the columns of A and Q but instead of columns we threat them as a rows because of the transpose
    for (int i = 1; i<= cols-1;i++){
        // qi = ai
        for (int k=0; k <= rows-1;k++){
            Q_T[i][k] = A_T[i][k];
        }

        for (int j=0; j <= i-1;j++){
            // R[j][i] = qj_t * ai
            R_T[j][i] = cblas_ddot(rows,Q_T[j],1,Q_T[i],1);
            
            //Inner product
//            for (int s = 0; s<=rows-1;s++){
//                R_T[j][i] = R_T[j][i] + Q_T[j][s]*Q_T[i][s];
//            }

            // qi = qi - R[j][i]*qj
            for (int s=0;s<=rows-1;s++){
                Q_T[i][s] = Q_T[i][s] - R_T[j][i]*Q_T[j][s];
            }
        }
        // R[i][i] = ||qi||
        R_T[i][i] = L2_Norm(Q_T[i]);
//         qi = qi/R[i][i]
        for (int s=0; s<=rows-1;s++){
            Q_T[i][s] = Q_T[i][s]/R_T[i][i];
        }
    }
    Transpose_Matrix(Q_T,Q);
    Transpose_Matrix(R_T,R);

}



int main() {


    TYPE A[4][3] = {{2,1,2},
                         {1,-2,1},
                         {1,2,3},
                         {1,1,1}};


    TYPE Q[4][3];
    TYPE R[3][3];

    QR_Decomp_32f(A,Q,R);


     cout << Q << endl;

    for (int r = 0; r<=4-1;r++){
         for (int c = 0; c<=3-1;c++){
             cout << Q[r][c] << ' ';}
         cout<< "" << endl;
}
}

While my code is around 0.001, when I use the cblas_ddot is around 0.019.

I have downloaded the openblas with brew install -vd openblas and added the cmake -DCMAKE_CXX_FLAGS=-I/usr/local/opt/openblas/include.

I use Clion to compile and run my code on a MacBook Pro 2019 (Intel CPU i9)

any idea why?

@Frank you right, I will edit the code in the question but still, it is slower even when I fix it.. — Roy Amoyal, Apr 12 '22 at 17:09
@Frank it is not the same vertor. one is Q_T[i] and the another one is Q_T[j] — Roy Amoyal, Apr 12 '22 at 17:18

cblas_ddot is slower than my own dot product implementation

0 Answers0