I'm new to CUDA and sure that I'm doing something that's simple enough to fix, but I'm also not sure what to exactly search to find an answer. I've tried looking around but to no avail.
I have a few functions in my code that I want to perform matrix operations with, so instead of writing the code to allocate the memory multiple times, I want to use a function to do that for me. My issue is that the memory location is not being passed back to the function calling my MatrixInitCUDA function.
If I directly allocate the memory in my matrix functions it works as expected, but the issue I'm running into is that my pointer to device memory is only being assigned to the pointer inside of the MatrixInitCUDA function.
Initially I thought that there might have been some kind of type conversion of the arguments, so I included the typeinfo header and printed out the type of the device argument before and after cudaMalloc (no change - not surprising). I've tried passing in double pointers for the device matrix arguments but that doesn't seem to work either, although I'm not I did it properly either.
// Compile using nvcc <file> -lcublas -o <output>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <typeinfo>
// Define block size for thread allocation
#define BLOCK_DIM 32
#define N 10
typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes
{
unsigned int A_height, A_width, B_height, B_width, C_height, C_width;
} MatrixSize;
void SetMatrixSize(MatrixSize *matrixSize,
unsigned int widthA, unsigned int heightA,
unsigned int widthB, unsigned int heightB,
unsigned int widthC, unsigned int heightC)
{
matrixSize->A_height = heightA;
matrixSize->A_width = widthA;
matrixSize->B_height = heightB;
matrixSize->B_width = widthB;
matrixSize->C_height = heightC;
matrixSize->C_width = widthC;
}
void MatrixInitCUDA(int argc, char **argv, int &devID, MatrixSize *matrixSize,
float *host_matrixA, float *host_matrixB, float *host_matrixC,
float *dev_matrixA, float *dev_matrixB, float *dev_matrixC)
{
// Assign CUDA variables
devID = 0;
cudaGetDevice(&devID);
cudaError_t err;
// Assign size variables
size_t matrixA_size = matrixSize->A_height * matrixSize->A_width * sizeof(float);
printf("Allocation size: %d\tMatrix Size: %d\n", (int) matrixA_size, matrixSize->A_height * matrixSize->A_width);
size_t matrixB_size = matrixSize->B_height * matrixSize->B_width * sizeof(float);
size_t matrixC_size = matrixSize->C_height * matrixSize->C_width * sizeof(float);
printf("PRE ALLOC TYPE: %s\n", typeid(typeof(dev_matrixA)).name());
// Allocate memory on GPU
err = cudaMalloc((void **) &dev_matrixA, matrixA_size);
printf("POST ALLOC TYPE: %s\n", typeid(typeof(dev_matrixA)).name());
printf("DEV A POST ALLOC: %p\n", dev_matrixA);
if (err != cudaSuccess) printf("Allocate matrix A: %s\n", cudaGetErrorString(err));
err = cudaMalloc((void **) &dev_matrixB, matrixB_size);
if (err != cudaSuccess) printf("Allocate matrix B: %s\n", cudaGetErrorString(err));
err = cudaMalloc((void **) &dev_matrixC, matrixC_size);
if (err != cudaSuccess) printf("Allocate matrix C: %s\n", cudaGetErrorString(err));
// Copy data from host PC to GPU
err = cudaMemcpy(dev_matrixA, host_matrixA, matrixA_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) printf("Copy matrix A to GPU: %s\n", cudaGetErrorString(err));
err =cudaMemcpy(dev_matrixB, host_matrixB, matrixB_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) printf("Copy matrix B to GPU: %s\n", cudaGetErrorString(err));
err =cudaMemcpy(dev_matrixC, host_matrixC, matrixC_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) printf("Copy matrix C to GPU: %s\n", cudaGetErrorString(err));
}
int main(int argc, char **argv)
{
// Create memory for Layer 1, Layer 2, Layer 3 vectors
// float *layer1 = malloc(784*sizeof(floats)))
// Create memory for Weight 1->2, Weight 2->3 matrices
// Layer 1 will read from file for input (X) values
// Layer 2 and 3 will be calculated
int devID = 0;
cudaGetDevice(&devID);
// Testing hadamard product, init function, and set matrix size function
float *host_A, *host_B, *host_C, *dev_A = NULL, *dev_B = NULL, *dev_C = NULL;
MatrixSize *mallocTest = (MatrixSize *) calloc(sizeof(MatrixSize), 1);
size_t calcSize = N * N * sizeof(float);
host_A = (float *) calloc(calcSize, 1);
host_B = (float *) calloc(calcSize, 1);
host_C = (float *) calloc(calcSize, 1);
SetMatrixSize(mallocTest, N, N, N, N, N, N);
printf("DEV A PRE ALLOC: %p\n", dev_A);
// Initialize memory on GPU
MatrixInitCUDA(argc, argv, devID, mallocTest,
host_A, host_B, host_C,
dev_A, dev_B, dev_C);
printf("DEV A POST INIT: %p\n", dev_A);
return 0;
}
Here's the output I get if I compile and run this code:
DEV A PRE ALLOC: (nil)
Allocation size: 400 Matrix Size: 100
PRE ALLOC TYPE: Pf
POST ALLOC TYPE: Pf
DEV A POST ALLOC: 0x10208400000
DEV A POST INIT: (nil)