Recently I started to study the CUDA technology, which allows me to increase the speed of calculations by several times thanks to the paralleling technology. The program I wrote has to calculate the determinant of a square matrix using the Gaussian method. Unfortunately, for some reason it always tells me that the determinant of a matrix is equal to one, although it is not. In addition, the time is also shown incorrectly. Most likely, there is an error in calculate_determinant function, but I haven't been able to find it.
I work in Ubuntu 22.02 and this is how I compile my program:
nvcc test2.cu -o test2
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <malloc.h>
#include <stdint.h>
#include <time.h>
#include <clocale>
#include <CL/cl.h>
#include <cuda_runtime.h>
#include <cuda.h>
#define MAX_SIZE 10000
#define BLOCK_SIZE 512
#define THREAD_SIZE 1024
__global__ void calculate_determinant(double* device_matrix, int size, double* det) {
int j = threadIdx.x + blockDim.x * blockIdx.x;
double ratio;
int i, k;
double det_local;
// Initialize local determinant to 1 for all threads
if (threadIdx.x == 0) {
det_local = 1.0;
}
__syncthreads();
if (j < size) {
for (i = 0; i < size - 1; i++) {
// Synchronize threads to ensure previous row operations are complete
__syncthreads();
// Perform row operations on matrix elements below the diagonal
if (j > i) {
if (device_matrix[i * size + i]) {
ratio = device_matrix[j * size + i] / device_matrix[i * size + i];
} else {
ratio = 1;
}
for (k = i; k < size; k++) {
device_matrix[j * size + k] -= ratio * device_matrix[i * size + k];
}
} else {
break;
}
}
// Multiply local determinant by diagonal element in the last row processed by this thread
det_local *= device_matrix[(j)*size + j];
}
// Thread 0 writes the final determinant to global memory
if (threadIdx.x == 0) {
*det = det_local;
}
}
__host__ double* build_matrix(uint32_t size) {
uint32_t i, j;
double* matrix = (double*)malloc(size * size * sizeof(double));
if (matrix == NULL) {
printf("Memory allocation error in build_matrix");
exit(2);
}
printf("OG matrix:\n");
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
matrix[i * size + j] = rand() % 10;
printf("%.3f ", matrix[i * size + j]);
}
printf("\n");
}
return matrix;
}
__host__ int main() {
uint32_t size, i;
printf("Enter the size of the matrix: ");
scanf("%u", &size);
if (size > MAX_SIZE) {
printf("The matrix is too big");
return 1;
}
srand((unsigned)time(NULL));
double* matrix = build_matrix(size);
double* device_matrix;
double host_det = 1.0;
double* device_det;
// Allocate memory on device
cudaMalloc((void**)&device_matrix, size * size * sizeof(double));
cudaMalloc((void**)&device_det, sizeof(double));
// Copy matrix and determinant from host to device
cudaMemcpy(device_matrix, matrix, size * size * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(device_det, &host_det, sizeof(double), cudaMemcpyHostToDevice);
// Recording time
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//float start2 = clock();
calculate_determinant <<<BLOCK_SIZE+1, THREAD_SIZE >>> (device_matrix, size, device_det);
cudaThreadSynchronize();
// float end = clock();
// Recording time
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
// Copy determinant back from device to host
cudaMemcpy(&host_det, &(device_det[0]), sizeof(double), cudaMemcpyDeviceToHost);
// Free memory on device
cudaFree(device_matrix);
cudaFree(device_det);
free(matrix);
printf("Determinant is : %.3f\n", host_det);
printf("Time elapsed: %.2f\n", elapsedTime);
// printf("Time of execution = %.2f\n", end - start2);
return 0;
}
I've tried different ways, including changing the way I form a matrix, but there is still no result. Maybe there are some experienced CUDA users who can share their experience?