I am new to using cuda and the magma libraries. I'm trying out some functions on a test problem, a 2D heat equation. The code I wrote seemed to work perfectly for grid sizes of 32, 64, and 128. But it produced wrong results for grid sizes of 256 or larger. I am only posting part of the code here, just enough to reproduce the error. Transferring the final matrix and looking at it in matlab shows that the second call to magmablas_dgemm introduced errors into the solution.
Is there anyone out there who can see why this code would break for larger grid sizes?
int main(int argc, char* argv[])
{
// Get parameters for problem set up
int side_width = atoi(argv[1]); //assuming square grid, N/32 integer
double dx = 2.0 / (side_width-1);
double dt = 0.25 * dx;
//double Tend = dt*3;// 0.5;
// create memory pointers for derivative operator matrices and solution matrix
double* U;
double* Dleft;
double* Dright;
double* dev_U;
double* dev_Dleft;
double* dev_Dright;
//initialize the MAGMA system
magma_init();
magma_int_t N = side_width;
// temp variables required by MAGMA functions
magma_int_t *piv, info, err;
piv = (magma_int_t*)malloc(N*sizeof(magma_int_t));
// Allocate memory for matrices on host and device
err = magma_dmalloc_cpu(&U, N*N);
err += magma_dmalloc_cpu(&Dleft, N*N);
err += magma_dmalloc_cpu(&Dright, N*N);
err += magma_dmalloc(&dev_U, N*N);
err += magma_dmalloc(&dev_Dleft, N*N);
err += magma_dmalloc(&dev_Dright, N*N);
if (err){
printf("error in allocation. err number = %d\n", err);
exit(1);
}
// zero out matrices (not efficient but correct)
for (int k=0; k<N*N; ++k ){
U[k] = 1.0;
Dleft[k] = 0.0;
Dright[k] = 0.0;
}
//create derivative operator matrices
double a = dt/2.0/dx/dx;
double b = dt/dx/dx;
Dleft[0] = 1.0;
Dleft[N*N-1] = 1.0;
for (int k=1; k<N-1; ++k) {
Dleft[k*N + k-1] = -a;
Dleft[k*N + k] = 1+b;
Dleft[k*N + k+1] = -a;
Dright[k*N + k-1] = a;
Dright[k*N + k] = 1-b;
Dright[k*N + k+1] = a;
}
// Determine block and thread amounts
int grid_dim = ((side_width + 31)/32) ;
int block_dim = 32;
dim3 gridDim(grid_dim, grid_dim);
dim3 blockDim(block_dim, block_dim);
//copy data from host to device
magma_dsetmatrix(N, N, U, N, dev_U, N);
magma_dsetmatrix(N, N, Dleft, N, dev_Dleft, N);
magma_dsetmatrix(N, N, Dright, N, dev_Dright, N);
// LU factorize the left hand operator matrix
magma_dgetrf_gpu(N, N, dev_Dleft, N, piv, &info);
double tn = 0; //time counter
// needed to take first step outside while loop because of some tricky transpose nonsense happening
tn += dt;
// compute explicit step : Uhat=Dright*U^T
magmablas_dgemm(MagmaTrans,MagmaNoTrans, N, N, N, 1.0f, dev_Dright, N, dev_U, N, 0.0f, dev_U, N);
// implicit step solve : Dleft*U=Uhat
magma_dgetrs_gpu(MagmaTrans, N, N, dev_Dleft, N, piv, dev_U, N, &info);
// compute explicit step : Uhat=Dright*U^T
magmablas_dgemm(MagmaTrans, MagmaTrans, N, N, N, 1.0f, dev_Dright, N, dev_U, N, 0.0f, dev_U, N);
printf("GPU matrix U at time %3.3f \n ", tn);
magma_dprint_gpu(16, 16, dev_U, N);
//copy solution from device to host
magma_dgetmatrix(N, N, dev_U, N, U, N);
//write data to file
char filename[256];
char str_t[128];
sprintf(str_t, "%d", N );
sprintf(filename, "ADI_%s.bin", str_t);
FILE* fileID = fopen(filename, "wb");
for (int i=0; i<N*N; ++i){
fwrite(&U[i],sizeof(double),1,fileID);
}
fclose(fileID);
free(U);
free(Dleft);
free(Dright);
magma_free(dev_U);
magma_free(dev_Dleft);
magma_free(dev_Dright);
free(piv);
magma_finalize();
return 0;
}