I am trying to compute the inverses of many matrices, ultimately in order to solve the corresponding linear systems.
I am doing this via the following sequence of CUBLAS calls:
/*
Aims to solve Ax = RHS;
x = Ainv * RHS;
*/
cublas<t>getrfBatched // to find the LU decomposition of A
cublas<t>getriBatched // to get the inverse, Ainv
cublas<t>gemmBatched  // to multiply Ainv by RHS, giving the solution
While cublas<t>getrfBatched
reports whether the given matrix is exactly singular, it seems that it does not detect a nearly singular matrix (one that is mathematically singular but, due to truncation and rounding, not exactly singular on a computer).
The direct example is
A = [[1, 2, 3],
     [4, 5, 6],
     [7, 8, 9]];
CUBLAS detects singularity for transpose(A)
, but it does not detect singularity for A
itself. In fact, NumPy in Python also fails to detect singularity for A
, which I expect is because it uses similar algorithms for the LU decomposition and so on.
How would I be able to detect singular matrices in a robust way on CUDA?
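For reference, the only heuristic I can think of so far is to inspect the pivots myself (a minimal sketch, my own idea and not a cuBLAS facility; the helper name isNearlySingular and the tolerance tol are made up). After cublas<t>getrfBatched, each factored matrix stores U on and above its diagonal, so the pivot magnitudes can be copied back and compared on the host:
#include <math.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Heuristic sketch (not a cuBLAS feature): after cublasDgetrfBatched the
// factored matrix stores U on and above the diagonal, so a tiny pivot can
// be spotted on the host. The relative tolerance 'tol' (e.g. 1e-13) is a
// made-up threshold, not anything cuBLAS prescribes.
int isNearlySingular(const double *d_LU, int n, double tol)
{
    double *h_LU = (double *)malloc(n * n * sizeof(double));
    cudaMemcpy(h_LU, d_LU, n * n * sizeof(double), cudaMemcpyDeviceToHost);
    double maxPivot = 0.0, minPivot = INFINITY;
    for (int j = 0; j < n; j++) {
        double p = fabs(h_LU[j * n + j]); // |U(j,j)| in column-major layout, lda = n
        if (p > maxPivot) maxPivot = p;
        if (p < minPivot) minPivot = p;
    }
    free(h_LU);
    // flag near-singularity when the smallest pivot is tiny relative to
    // the largest one (a crude reciprocal-condition estimate)
    return minPivot <= tol * maxPivot;
}
Judging from the ~1e15-scale entries of the computed inverse shown at the bottom, the last pivot for h_A6 must be around 1e-16, so a relative test like this should flag it, but I do not know whether this counts as robust.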
Added
These are the lines of code that I used for this example.
//base - https://stackoverflow.com/questions/22887167/cublas-incorrect-inversion-for-matrix-with-zero-pivot
#include <cstdio>
#include <cstdlib>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#define N 3
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
int main() {
const int numMat = 6;
double h_A1[N*N];
double h_A2[N*N];
double h_A3[N*N];
double h_A4[N*N];
double h_A5[N*N];
double h_A6[N*N];
h_A1[IDX2C(0,0,N)] = 0.5; h_A1[IDX2C(0,1,N)] = 3. ; h_A1[IDX2C(0,2,N)] = 4. ;
h_A1[IDX2C(1,0,N)] = 1. ; h_A1[IDX2C(1,1,N)] = 3. ; h_A1[IDX2C(1,2,N)] = 10.;
h_A1[IDX2C(2,0,N)] = 4. ; h_A1[IDX2C(2,1,N)] = 9. ; h_A1[IDX2C(2,2,N)] = 16.;
h_A2[IDX2C(0,0,N)] = 0. ; h_A2[IDX2C(0,1,N)] = 3. ; h_A2[IDX2C(0,2,N)] = 4. ;
h_A2[IDX2C(1,0,N)] = 1. ; h_A2[IDX2C(1,1,N)] = 3. ; h_A2[IDX2C(1,2,N)] = 10.;
h_A2[IDX2C(2,0,N)] = 4. ; h_A2[IDX2C(2,1,N)] = 9. ; h_A2[IDX2C(2,2,N)] = 16.;
h_A3[IDX2C(0,0,N)] = 0. ; h_A3[IDX2C(0,1,N)] = 1. ; h_A3[IDX2C(0,2,N)] = 4. ;
h_A3[IDX2C(1,0,N)] = 3. ; h_A3[IDX2C(1,1,N)] = 3. ; h_A3[IDX2C(1,2,N)] = 9. ;
h_A3[IDX2C(2,0,N)] = 4. ; h_A3[IDX2C(2,1,N)] = 10.; h_A3[IDX2C(2,2,N)] = 16.;
h_A4[IDX2C(0,0,N)] = 0. ; h_A4[IDX2C(0,1,N)] = 3. ; h_A4[IDX2C(0,2,N)] = 4. ;
h_A4[IDX2C(1,0,N)] = 1. ; h_A4[IDX2C(1,1,N)] = 5. ; h_A4[IDX2C(1,2,N)] = 6. ;
h_A4[IDX2C(2,0,N)] = 9. ; h_A4[IDX2C(2,1,N)] = 8. ; h_A4[IDX2C(2,2,N)] = 2. ;
h_A5[IDX2C(0,0,N)] = 22.; h_A5[IDX2C(0,1,N)] = 3. ; h_A5[IDX2C(0,2,N)] = 4. ;
h_A5[IDX2C(1,0,N)] = 1. ; h_A5[IDX2C(1,1,N)] = 5. ; h_A5[IDX2C(1,2,N)] = 6. ;
h_A5[IDX2C(2,0,N)] = 9. ; h_A5[IDX2C(2,1,N)] = 8. ; h_A5[IDX2C(2,2,N)] = 2. ;
h_A6[IDX2C(0,0,N)] = 1. ; h_A6[IDX2C(0,1,N)] = 2. ; h_A6[IDX2C(0,2,N)] = 3. ;
h_A6[IDX2C(1,0,N)] = 4. ; h_A6[IDX2C(1,1,N)] = 5. ; h_A6[IDX2C(1,2,N)] = 6. ;
h_A6[IDX2C(2,0,N)] = 7. ; h_A6[IDX2C(2,1,N)] = 8. ; h_A6[IDX2C(2,2,N)] = 9. ;
double h_rhs[N*N];
h_rhs[IDX2C(0,0,N)] = 1.; h_rhs[IDX2C(0,1,N)] = 1.; h_rhs[IDX2C(0,2,N)] = 1.;
h_rhs[IDX2C(1,0,N)] = 2.; h_rhs[IDX2C(1,1,N)] = 2.; h_rhs[IDX2C(1,2,N)] = 2.;
h_rhs[IDX2C(2,0,N)] = 3.; h_rhs[IDX2C(2,1,N)] = 3.; h_rhs[IDX2C(2,2,N)] = 3.;
for (int i=0; i<N*N; i++) { printf("%f ", h_rhs[i]); }
printf("\n");
double h_sol1[N*N];
double h_sol2[N*N];
double h_sol3[N*N];
double h_sol4[N*N];
double h_sol5[N*N];
double h_sol6[N*N];
double h_inv1[N*N];
double h_inv2[N*N];
double h_inv3[N*N];
double h_inv4[N*N];
double h_inv5[N*N];
double h_inv6[N*N];
double *h_APtr[numMat], *h_AinvPtr[numMat], *h_SolPtr[numMat];
h_APtr[0] = &(h_A1[0]);
h_APtr[1] = &(h_A2[0]);
h_APtr[2] = &(h_A3[0]);
h_APtr[3] = &(h_A4[0]);
h_APtr[4] = &(h_A5[0]);
h_APtr[5] = &(h_A6[0]);
h_AinvPtr[0] = &(h_inv1[0]);
h_AinvPtr[1] = &(h_inv2[0]);
h_AinvPtr[2] = &(h_inv3[0]);
h_AinvPtr[3] = &(h_inv4[0]);
h_AinvPtr[4] = &(h_inv5[0]);
h_AinvPtr[5] = &(h_inv6[0]);
h_SolPtr[0] = &(h_sol1[0]);
h_SolPtr[1] = &(h_sol2[0]);
h_SolPtr[2] = &(h_sol3[0]);
h_SolPtr[3] = &(h_sol4[0]);
h_SolPtr[4] = &(h_sol5[0]);
h_SolPtr[5] = &(h_sol6[0]);
// allocation
double *d_A[numMat], *d_Ainv[numMat]; // arrays of device pointers, stored on the host
double **d_APtr, **d_AinvPtr; // device-side copies of those pointer arrays
for (int i=0; i<numMat; i++) {
cudaMalloc((void**)&d_A[i], N*N*sizeof(double)); // each matrix buffer is allocated on the device
cudaMalloc((void**)&d_Ainv[i], N*N*sizeof(double));
}
cudaMalloc((void**)&d_APtr, numMat*sizeof(double *));
cudaMalloc((void**)&d_AinvPtr, numMat*sizeof(double *));
double *d_Rhs[numMat], *d_Sol[numMat];
double **d_RhsPtr, **d_SolPtr;
for (int i=0; i<numMat; i++) {
cudaMalloc((void**)&d_Rhs[i], N*N*sizeof(double));
cudaMalloc((void**)&d_Sol[i], N*N*sizeof(double));
}
cudaMalloc((void**)&d_RhsPtr, numMat*sizeof(double *));
cudaMalloc((void**)&d_SolPtr, numMat*sizeof(double *));
int *P, *INFO; // specific to cublas<t>getrfBatched
cudaMalloc(&P, N*numMat*sizeof(int));
cudaMalloc(&INFO, numMat*sizeof(int));
cudaCheckErrors("cudaMalloc fail");
// memcpy
for (int i=0; i<numMat; i++) {
cudaMemcpy(d_A[i], h_APtr[i], N*N*sizeof(double), cudaMemcpyHostToDevice);
}
cudaMemcpy(d_APtr, d_A, numMat*sizeof(double *), cudaMemcpyHostToDevice);
cudaMemcpy(d_AinvPtr, d_Ainv, numMat*sizeof(double *), cudaMemcpyHostToDevice);
for (int i=0; i<numMat; i++) {
cudaMemcpy(d_Rhs[i], h_rhs, N*N*sizeof(double), cudaMemcpyHostToDevice);
}
cudaMemcpy(d_RhsPtr, d_Rhs, numMat*sizeof(double *), cudaMemcpyHostToDevice);
cudaMemcpy(d_SolPtr, d_Sol, numMat*sizeof(double *), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy H2D fail");
// CUBLAS
cublasHandle_t myhandle;
cublasStatus_t cublas_result;
cublas_result = cublasCreate_v2(&myhandle);
assert(cublas_result == CUBLAS_STATUS_SUCCESS);
// getrfBatched
cublas_result = cublasDgetrfBatched(myhandle, N, d_APtr, N, P, INFO, numMat);
assert(cublas_result == CUBLAS_STATUS_SUCCESS);
int INFOh[numMat];
cudaMemcpy(INFOh, INFO, numMat*sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy D2H fail (INFO, getrf)");
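// per the cuBLAS docs, a nonzero INFO[i] = j means U(j,j) is exactly zero;
// a merely near-zero pivot (as with h_A6) goes unreported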
for (int i=0; i<numMat; i++) {
if (INFOh[i] != 0) {
fprintf(stderr, "ERROR: getrf INFO is nonzero\n");
fprintf(stderr, "  - batch %d, INFO %d\n", i, INFOh[i]);
}
}
// getriBatched
cublas_result = cublasDgetriBatched(myhandle, N, d_APtr, N, P, d_AinvPtr, N, INFO, numMat);
assert(cublas_result == CUBLAS_STATUS_SUCCESS);
cudaMemcpy(INFOh, INFO, numMat*sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy D2H fail (INFO, getri)");
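// (added check, mirroring the getrf one) getriBatched also reports an
// exactly zero pivot through INFO, but it too stays silent for h_A6
for (int i=0; i<numMat; i++) {
    if (INFOh[i] != 0) {
        fprintf(stderr, "ERROR: getri INFO is nonzero\n");
        fprintf(stderr, "  - batch %d, INFO %d\n", i, INFOh[i]);
    }
}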
// print inverse
printf("\n Inverses \n");
for (int i=0; i<numMat; i++) {
cudaMemcpy(h_AinvPtr[i], d_Ainv[i], N*N*sizeof(double), cudaMemcpyDeviceToHost);
}
cudaCheckErrors("cudaMemcpy D2H fail");
double *tmpptr, *tmpptrInv;
for (int k=0; k<numMat; k++) {
tmpptr = h_APtr[k];
tmpptrInv = h_AinvPtr[k];
for (int i=0; i<N; i++) {
for (int j=0; j<N; j++) {
printf("%f\t", tmpptr[IDX2C(i,j,N)]);
}
for (int j=0; j<N; j++) {
printf("%f\t", tmpptrInv[IDX2C(i,j,N)]);
}
printf("\n");
}
printf("\n");
}
// gemmBatched
double alpha = 1.0;
double beta = 0.0;
cublas_result = cublasDgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N,
N,N,N,
&alpha, d_AinvPtr, N, d_RhsPtr, N,
&beta, d_SolPtr, N,
numMat);
assert(cublas_result == CUBLAS_STATUS_SUCCESS);
cudaFree(P);
cudaFree(INFO);
cublasDestroy_v2(myhandle);
// print solution
printf("\n Solutions \n");
for (int i=0; i<numMat; i++) {
cudaMemcpy(h_SolPtr[i], d_Sol[i], N*N*sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_A[i]);
cudaFree(d_Ainv[i]);
}
cudaFree(d_APtr);
cudaFree(d_AinvPtr);
cudaCheckErrors("cudaMemcpy D2H fail (2)");
for (int k=0; k<numMat; k++) {
tmpptr = h_APtr[k];
tmpptrInv = h_SolPtr[k];
for (int i=0; i<N; i++) {
for (int j=0; j<N; j++) {
printf("%f\t", tmpptr[IDX2C(i,j,N)]);
}
for (int j=0; j<N; j++) {
printf("%f\t", tmpptrInv[IDX2C(i,j,N)]);
}
printf("\n");
}
printf("\n");
}
return 0;
}
This code is messy, and some allocations may not be properly freed by the end (I put more focus on making it run and on understanding the CUBLAS usage). I do not want you to read through this messy code; I just wanted to demonstrate, with a directly compilable and runnable set of code lines, that CUBLAS indeed fails to detect a singular matrix in some cases.
It solves a batch of 6 systems, each with its own 3x3 matrix A, all sharing the same 3x3 RHS.
It is h_A6
that should be detected as singular.
When I run this, the result looks like
...
Inverses
...
1.000000 2.000000 3.000000 3152519739159346.500000 -6305039478318690.000000 3152519739159346.000000
4.000000 5.000000 6.000000 -6305039478318695.000000 12610078956637384.000000 -6305039478318692.000000
7.000000 8.000000 9.000000 3152519739159348.000000 -6305039478318693.000000 3152519739159346.000000
We can see that for this singular matrix it gives an absurd result, even in double precision.
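The only safeguard that looks reliable to me right now is an explicit residual check after the solve (again just a sketch: d_AorigPtr and d_ResPtr are hypothetical pointer arrays that would have to be set up exactly like d_APtr and d_SolPtr above, an untouched copy of A is required because getrfBatched overwrites d_APtr with the LU factors, and the call has to happen before cublasDestroy):
// Residual check sketch (my own addition, hypothetical names): compute
// Res = A * Sol with one more batched GEMM, then compare Res against RHS
// on the host. A huge ||A*x - RHS|| exposes h_A6 even though INFO was 0.
double one = 1.0, zero = 0.0;
cublasDgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N,
                   N, N, N,
                   &one, d_AorigPtr, N, d_SolPtr, N, // d_AorigPtr: untouched copies of A
                   &zero, d_ResPtr, N,               // d_ResPtr: allocated like d_SolPtr
                   numMat);
// then copy each d_Res[i] back and test max|Res(i,j) - RHS(i,j)| against a tolerance
But this doubles the GEMM work, so I am hoping there is a more standard approach.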