Array multiplication in Cuda produce in error

Question

I want to multiply two arrays and I have a problem with Cuda RunTime API. There is an error: cuda runtime API error 11: invalid argument. What is the argument? Help me, please The code:

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <fstream>

// includes, project
#include <cufft.h>
#include <cuda.h>
#include <cutil_inline.h>
#include <shrQATest.h>

#define SIGNAL_SIZE  8192
#define SIGNAL_COUNT 384

// Complex data type
typedef float2 Complex; 

static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __device__ __host__ inline Complex ComplexAdd(Complex );
static __global__ void ComplexPointwiseMA(Complex* , Complex* , Complex* );

// Complex addition 
static __device__ __host__ inline Complex ComplexAdd(Complex a,Complex b)
{
    Complex c;
    c.x = a.x + b.x;
    c.y = a.y + b.y;
    return c;
}

// Complex multiplication
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)
{
    Complex c;
    c.x = a.x * b.x - a.y * b.y;
    c.y = a.x * b.y + a.y * b.x;
    return c;
}

// Complex Mul 'n Add function
static __global__ void ComplexPointwiseMA(Complex* mas1, Complex* mas2, Complex*mas_tmp)
{
    const int numThreads = blockDim.x * gridDim.x;
    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    int j=0, k=0; 
    Complex c;
    int size=SIGNAL_COUNT*SIGNAL_SIZE;
    for (int i = threadID; i <size; i += numThreads)
    {
        if((i%SIGNAL_SIZE==0)&&(j<SIGNAL_COUNT))
            j++;
        c=ComplexMul(mas1[i], mas2[j]);
        mas_tmp[k]=ComplexAdd(mas_tmp[k],c);
        if(k>=SIGNAL_SIZE-1)
            k=0;
        else
            k++;
    }
}

int main(int argc, char** argv)
{
    int threads_per_block=256;
    int blocks_per_grid=16;
    //dim3 dimBlock(SIGNAL_COUNT, SIGNAL_SIZE); // threads 
    //dim3 dimGrid(SIGNAL_COUNT); // 384 blocks in a grid
    cudaEvent_t start, stop;
    float elapsedTime;
    //shrQAStart(argc, argv);
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // For random
    srand(1001);

    // Allocate host memory for the mass
    Complex* mas_tmp=NULL;
    mas_tmp =new Complex [SIGNAL_COUNT*SIGNAL_SIZE];

    Complex* mas1=NULL;
    mas1 =new Complex [SIGNAL_COUNT*SIGNAL_SIZE];
    for(unsigned int i = 0; i <SIGNAL_COUNT*SIGNAL_SIZE; ++i)
    {
        mas1[i].x = rand() / (float)RAND_MAX;
        mas1[i].y = rand() / (float)RAND_MAX;
    }

    Complex* mas2=NULL;
    mas2 =new Complex [SIGNAL_COUNT];
    for(unsigned int i = 0; i < SIGNAL_COUNT; ++i)
    {
        mas2[i].x = rand() / (float)RAND_MAX;
        mas2[i].y = rand() / (float)RAND_MAX;
    }

    // Timer
    cudaEventRecord(start,0);

    // Allocate device memory for mass
    Complex* mastmp_=NULL; // temporary 
    cutilSafeCall(cudaMalloc(&mastmp_, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex)));
    Complex* mas1_=NULL; // signal
    cutilSafeCall(cudaMalloc(&mas1_, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex)));
    Complex* mas2_=NULL; // coefficient
    cutilSafeCall(cudaMalloc(&mas2_, SIGNAL_COUNT*sizeof(Complex)));

    // Copy host data to device
    cutilSafeCall(cudaMemcpy(mas1_, mas1, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex),    cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(mas2_, mas2, SIGNAL_COUNT*sizeof(Complex), cudaMemcpyHostToDevice));

    // Calling  Mul 'n Add function
    ComplexPointwiseMA<<<blocks_per_grid, threads_per_block>>>(mas1_, mas2_, mastmp_);

    // Check if kernel execution generated and error
    cutilCheckMsg("Kernel execution failed [ ComplexPointwiseMA ]");

    // Copy device memory to host
    cutilSafeCall(cudaMemcpy(mastmp_, mas_tmp, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex), cudaMemcpyDeviceToHost));

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start,stop);
    printf("Time %3.10f ms\n", elapsedTime);

    // Free memory  
    free(mas1);
    free(mas2);
    free(mas_tmp);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cutilSafeCall(cudaFree(mas1_));
    cutilSafeCall(cudaFree(mas2_));
    cutilSafeCall(cudaFree(mastmp_));

    cutilDeviceReset();
    system ("pause");
    return 0;
}

Please pay much closer attention to your code formatting in future. You can see tips on the conventions for posting and formatting code in [this meta stackoverflow question](http://meta.stackexchange.com/q/22186/163653). — talonmies, Jan 09 '13 at 08:18
Also, you haven't explained where in the code the error occurs. — talonmies, Jan 09 '13 at 08:36
You could use `cublasCgemv()` to do the matrix-vector multiplication to simplify your code and maximize the performance. — kangshiyin, Jan 09 '13 at 09:08

score 2 · Answer 1 · answered Jan 09 '13 at 09:34

The following line in your code will generate the error of Invalid Argument.

// Copy device memory to host
    cutilSafeCall(cudaMemcpy(mastmp_, mas_tmp, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex), cudaMemcpyDeviceToHost));

In the function cudaMemcpy, the first argument is the destination pointer and the 2nd argument is the source pointer. You have specified the flag cudaMemcpyDeviceToHost but the host(destination) and device(source) pointers are interchanged.

You should be doing this (swap the first 2 arguments):

cutilSafeCall(cudaMemcpy(mas_tmp, mastmp_,SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex), cudaMemcpyDeviceToHost));

Array multiplication in Cuda produce in error

1 Answers1