2

Recently I started to study the CUDA technology, which allows me to increase the speed of calculations by several times thanks to the paralleling technology. The program I wrote has to calculate the determinant of a square matrix using the Gaussian method. Unfortunately, for some reason it always tells me that the determinant of a matrix is equal to one, although it is not. In addition, the time is also shown incorrectly. Most likely, there is an error in calculate_determinant function, but I haven't been able to find it.

I work in Ubuntu 22.02 and this is how I compile my program: nvcc test2.cu -o test2

The output of the ./test2

#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <malloc.h>
#include <stdint.h>
#include <time.h>
#include <clocale>
#include <CL/cl.h>
#include <cuda_runtime.h>
#include <cuda.h>

#define MAX_SIZE 10000
#define BLOCK_SIZE 512
#define THREAD_SIZE 1024

__global__ void calculate_determinant(double* device_matrix, int size, double* det) {
     int j = threadIdx.x + blockDim.x * blockIdx.x;
    double ratio;
    int i, k;
    double det_local;

    // Initialize local determinant to 1 for all threads
    if (threadIdx.x == 0) {
        det_local = 1.0;
    }
    __syncthreads();

    if (j < size) {
        for (i = 0; i < size - 1; i++) {
            // Synchronize threads to ensure previous row operations are complete
            __syncthreads();

            // Perform row operations on matrix elements below the diagonal
            if (j > i) {
                if (device_matrix[i * size + i]) {
                    ratio = device_matrix[j * size + i] / device_matrix[i * size + i];
                } else {
                    ratio = 1;
                }
                for (k = i; k < size; k++) {
                    device_matrix[j * size + k] -= ratio * device_matrix[i * size + k];
                }
            } else {
                break;
            }
        }
        // Multiply local determinant by diagonal element in the last row processed by this thread
        det_local *= device_matrix[(j)*size + j];
    }

    // Thread 0 writes the final determinant to global memory
    if (threadIdx.x == 0) {
        *det = det_local;
    }
}


__host__ double* build_matrix(uint32_t size) {
    uint32_t i, j;
    double* matrix = (double*)malloc(size * size * sizeof(double));
    if (matrix == NULL) {
        printf("Memory allocation error in build_matrix");
        exit(2);
    }

    printf("OG matrix:\n");
    for (i = 0; i < size; i++) {
        for (j = 0; j < size; j++) {
            matrix[i * size + j] = rand() % 10;
            printf("%.3f ", matrix[i * size + j]);
        }
        printf("\n");
    }

    return matrix;
}

__host__ int main() {

    uint32_t size, i;
    printf("Enter the size of the matrix: ");
    scanf("%u", &size);
    if (size > MAX_SIZE) {
        printf("The matrix is too big");
        return 1;
    }

    srand((unsigned)time(NULL));

    double* matrix = build_matrix(size);
    double* device_matrix;
    double host_det = 1.0;
    double* device_det;

    // Allocate memory on device
    cudaMalloc((void**)&device_matrix, size * size * sizeof(double));
    cudaMalloc((void**)&device_det, sizeof(double));

    // Copy matrix and determinant from host to device
    cudaMemcpy(device_matrix, matrix, size * size * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(device_det, &host_det, sizeof(double), cudaMemcpyHostToDevice);

    //  Recording time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    //float start2 = clock();


   
    calculate_determinant <<<BLOCK_SIZE+1, THREAD_SIZE >>> (device_matrix, size, device_det);
    cudaThreadSynchronize();

   // float end = clock();


    // Recording time
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);


    // Copy determinant back from device to host
    cudaMemcpy(&host_det, &(device_det[0]), sizeof(double), cudaMemcpyDeviceToHost);

    // Free memory on device
    cudaFree(device_matrix);
    cudaFree(device_det);
    free(matrix);

    printf("Determinant is : %.3f\n", host_det);
    printf("Time elapsed:  %.2f\n", elapsedTime);
   // printf("Time of execution =  %.2f\n", end - start2);

    return 0;
}

I've tried different ways, including changing the way I form a matrix, but there is still no result. Maybe there are some experienced CUDA users who can share their experience?

Fillenbill
  • 21
  • 3
  • 2
    gaussian methods often require synchronization, and yours indicates that (" // Synchronize threads to ensure previous row operations are complete"). The sync method you have (`__syncthreads()`) only synchronizes threads in a block. It does not provide inter-block synchronization, so this cannot work for more than 1 block. You are launching 513 blocks. – Robert Crovella Mar 26 '23 at 20:26
  • I tried to launch `calculate_determinant <<<1, THREAD_SIZE >>> (device_matrix, size, device_det); ` with only one block, but the output still didn't change – Fillenbill Mar 26 '23 at 20:33
  • 4
    threads other than 0 are using `det_local` uninitialized. – Robert Crovella Mar 26 '23 at 20:39
  • 3
    Also, doing a `break` in a loop with `__syncthreads()` is not a good idea as it means not all active threads may participate in the barrier, resulting in a deadlock. – Homer512 Mar 27 '23 at 07:18
  • when I run your code as posted on a V100, and enter a matrix size of 32, I get a result: `Determinant is : 3.000`. So I am not able to reproduce your observation that the determinant reported is always 1. – Robert Crovella Mar 29 '23 at 01:34

1 Answers1

1

It wasn't evident to me what algorithm you were trying to use from your code. Therefore I did a google search, found this and implemented that.

  • The gaussian method has various problems such as handling of zero values. I'm not going to try to cover all that. I imagine there are certain input patterns that would cause this method to blow up. It would require extensive special-casing to work around; I have not done that. I assume this is a learning exercise. You should not use my code below for serious numerical work.
  • I've not tested this code except for matrix sizes of 2 and 3.
  • The synchronization method you have chosen only works for 1 block, so this is limited to matrix sizes of up to 1024.
  • You could extend this to higher than 1024 using a grid-stride loop albeit still using only a single block
  • You could extend this more generally using cooperative groups grid sync to multi-block.

Example:

$ cat t2234.cu
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define THREAD_SIZE 1024 // must be power-of-2, between 2 and 1024
#define MAX_SIZE THREAD_SIZE

__global__ void calculate_determinant(double* device_matrix, int size, double* det) {
    __shared__ double smem[THREAD_SIZE];
    int col = threadIdx.x; // requires changes to work with more than 1 block
    double ratio;
    int row, tcol;

    for (tcol = 0; tcol < size-1; tcol++)
      for (row = tcol+1; row < size; row++) {
        // Perform row operations on matrix elements to zero out elements below main diagonal
        ratio = device_matrix[row * size + tcol] / device_matrix[tcol * size + tcol];
        __syncthreads();
        device_matrix[row * size + col] -= ratio * device_matrix[tcol * size + col];
        // Synchronize threads to ensure previous row operations are complete
        __syncthreads();
        }
        // compute determinant - parallel sweep product reduction
    smem[threadIdx.x] = device_matrix[threadIdx.x*size+threadIdx.x];
    for (int st = THREAD_SIZE/2; st > 0; st>>=1){
      __syncthreads();
      if ((col<st)&&((col+st)<blockDim.x)) smem[col] *= smem[col+st];}
    if (!threadIdx.x) *det = smem[0];
}


__host__ double* build_matrix(uint32_t size) {
    uint32_t i, j;
    double* matrix = (double*)malloc(size * size * sizeof(double));
    if (matrix == NULL) {
        printf("Memory allocation error in build_matrix");
        exit(2);
    }

    printf("OG matrix:\n");
    for (i = 0; i < size; i++) {
        for (j = 0; j < size; j++) {
            matrix[i * size + j] = rand() % 10;
            if ((i ==0) && (matrix[i*size+j] == 0)) matrix[i*size+j] = 1;
            printf("%.3f ", matrix[i * size + j]);
        }
        printf("\n");
    }

    return matrix;
}

__host__ int main() {

    uint32_t size;
    printf("Enter the size of the matrix: ");
    scanf("%u", &size);
    if (size > MAX_SIZE) {
        printf("The matrix is too big");
        return 1;
    }

    srand((unsigned)time(NULL));

    double* matrix = build_matrix(size);
    double* device_matrix;
    double host_det = 1.0;
    double* device_det;

    // Allocate memory on device
    cudaMalloc((void**)&device_matrix, size * size * sizeof(double));
    cudaMalloc((void**)&device_det, sizeof(double));

    // Copy matrix and determinant from host to device
    cudaMemcpy(device_matrix, matrix, size * size * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(device_det, &host_det, sizeof(double), cudaMemcpyHostToDevice);

    //  Recording time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    //float start2 = clock();



    calculate_determinant <<<1, size >>> (device_matrix, size, device_det);
    cudaDeviceSynchronize();

   // float end = clock();


    // Recording time
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);


    // Copy determinant back from device to host
    cudaMemcpy(&host_det, &(device_det[0]), sizeof(double), cudaMemcpyDeviceToHost);

    // Free memory on device
    cudaFree(device_matrix);
    cudaFree(device_det);
    free(matrix);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) printf("CUDA error: %s\n", cudaGetErrorString(err));
    printf("Determinant is : %.3f\n", host_det);
    printf("Time elapsed:  %.2f\n", elapsedTime);
   // printf("Time of execution =  %.2f\n", end - start2);

    return 0;
}
$ nvcc -o t2234 t2234.cu
$ ./t2234
Enter the size of the matrix: 3
OG matrix:
5.000 1.000 8.000
1.000 3.000 2.000
9.000 3.000 2.000
Determinant is : -176.000
Time elapsed:  0.05
$
  • The reported result above seems to match what is reported in online calculators such as this one.
  • If you are not able to get similar results, run your code with compute-sanitizer to see if you have any system/hardware/cuda-setup problems.
  • For development work, as you are learning and trying to debug something, it's probably better not to have your code generate a new random matrix every time you run it, but rather start with a fixed matrix until you get that working.
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257