For two matrices X and Q of size 4x3 and 2x3 which in memory look like
x = [0 1 2 3 4 5 6 7 8 9 10 11]
q = [3 4 5 6 7 8]
I tried to use cublas multiplication cublasSgemm, but I couldn't manage to get expected results.
Since they are stored in row-major order so they should be interpreted as 3x4 and 3x2 so it seemed for me that
cublasSgemm(cublas_handle,
CUBLAS_OP_T, CUBLAS_OP_N,
q_rows_num, x_rows_num, dim,
&alpha, // 1
q_device, q_rows_num,
x, x_rows_num,
&beta, // 0
x_q_multiplication, q_rows_num);
where
dim = 3
x_rows_num = 4
q_rows_num = 2
would work but in that case I got error
** On entry to SGEMM parameter number 8 had an illegal value
I also tried shuffling parameters a bit but I couldn't find any setup that would work.
So is it possible to multiply them without changing to column-major order?
EDIT:
So I got exepected results with changes made in this working example:
#include <cublas_v2.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
int main()
{
int x_rows_num = 4;
int q_rows_num = 2;
int dim = 3;
int N = x_rows_num*dim;
int M = q_rows_num*dim;
float *x, *q, *x_q_multiplication;
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&q, M*sizeof(float));
cudaMallocManaged(&x_q_multiplication, q_rows_num*x_rows_num*dim);
for (int i = 0; i< N; i++) x[i] = i*1.0f;
for (int i = 0; i< M; i++) q[i] = (i + 3)*1.0f;
float *q_device;
cudaMallocManaged(&q_device, M*sizeof(float));
cudaMemcpy(q_device, q, M*sizeof(float), cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
float alpha = 1.f;
float beta = 0.f;
cublasSgemm(handle,
CUBLAS_OP_T, CUBLAS_OP_N,
x_rows_num, q_rows_num, dim,
&alpha,
x, dim,
q, dim,
&beta,
x_q_multiplication, x_rows_num);
cudaDeviceSynchronize();
for (int i = 0; i < q_rows_num*x_rows_num; i++) std::cout << x_q_multiplication[i] << " ";
cudaFree(x);
cudaFree(q);
cudaFree(x_q_multiplication);
return 0;
}
However I'am still not sure why dim became leading dimension