0

The printed output is always 0 after executing the kernel function. After some testing, cudaMemcpy appears to work correctly, but the kernel does not seem to work: it cannot read the correct data from d_inputs. Could somebody help explain? Thanks!

#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>

#define N 32

// Kernel: every thread whose global index tid is below N reads d_inputs[tid],
// converts it to double, adds 1.0 and stores the result in d_outputs[tid].
//   niters    - only referenced by the commented-out loop below, so currently unused
//   d_inputs  - integer inputs, indexed by tid
//   d_outputs - double outputs, indexed by tid
__global__ void Kernel_double(int niters, int* d_inputs,double* d_outputs)
{
    // Flat global thread index for a 1D grid of 1D blocks.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard: threads with tid >= N do nothing.
    if (tid<N) {
    double val =(double) d_inputs[tid];
    /*for (int iter=0; iter < niters; iter++){
    val = (sqrt(pow(val,2.0)) + 5.0) - 101.0;
    val = (val / 3.0) + 102.0;
    val = (val + 1.07) - 103.0;
    val = (val / 1.037) + 104.0;
    val = (val + 3.00) - 105.0;
    val = (val / 0.22) + 106.0;
    }*/
    val = val + 1.0;
    //printf("This is %f\n",val);
    d_outputs[tid] = val;
}
}

// Host driver as originally posted in the question: fills inputs with 1..N,
// copies them to the GPU, launches Kernel_double with 16 blocks of 2 threads,
// copies the results back and prints the first 10 entries.
// NOTE: this listing is kept exactly as asked; the comments below mark the
// problems it contains. None of the CUDA calls have their return value checked.
int main(int argc, char **argv)
{

    int niters = 10;
    // NOTE: `cpu` is not declared anywhere in this listing, so this line does not compile.
    printf("Iterate %d times with GPU 0 or CPU 1: %d\n", niters, cpu);

    int inputs[N];
    for (int i = 0; i<N; i++){
    inputs[i] = i+1;
    }

    // NOTE: these two are declared as host stack arrays, not as pointers
    // (int* / double*). cudaMalloc needs a modifiable pointer in which to store
    // the device address; with these declarations the cudaMemcpy calls below
    // report "invalid argument".
    int d_inputs[N];
    double d_outputs[N];
    double outputs[N];

    cudaMalloc( (void**)&d_inputs, N*sizeof(int));
    cudaMalloc( (void**)&d_outputs, N*sizeof(double));
    printf("test %d \n", inputs[3]);
    cudaMemcpy(d_inputs, inputs, N*sizeof(int), cudaMemcpyHostToDevice);
    // NOTE: reads d_inputs from host code; once d_inputs is a real device
    // pointer this dereference is not legal.
    printf("test %d \n", d_inputs[1]);
    // 16 blocks x 2 threads = 32 threads, one per element when N is 32.
    Kernel_double<<<16,2>>>(niters, d_inputs,d_outputs);
    //cudaDeviceSynchronize();
    cudaMemcpy(outputs, d_outputs, N*sizeof(double), cudaMemcpyDeviceToHost);
    for(int j =0;j<10; j++){
        // NOTE: d_outputs[j] is read from host code here, the same problem as
        // with d_inputs above; only outputs[j] holds the copied-back result.
        printf("Outputs[%d] is:  %f and %f\n",j, d_outputs[j], outputs[j]);
        }
    cudaFree(d_inputs);
    cudaFree(d_outputs);

    return EXIT_SUCCESS;
}
user45690
  • 1
  • 4
  • 1
    You have absolutely no error checking in your code, so it could be failing at any point and you won't know. Add error checking after every call that can fail and see if that helps to narrow the problem down. – Paul R Nov 30 '16 at 14:16
  • CHECK(cudaMalloc( (void**)&d_inputs, N*sizeof(int))); CHECK(cudaGetLastError()); CHECK(cudaMalloc( (void**)&d_outputs, N*sizeof(double))); CHECK(cudaGetLastError()); printf("test %d \n", inputs[3]); CHECK(cudaMemcpy(d_inputs, inputs, N*sizeof(int), cudaMemcpyHostToDevice)); CHECK(cudaGetLastError()); printf("test %d \n", d_inputs[1]); Kernel_double<<<16,2>>>(niters, d_inputs,d_outputs); CHECK(cudaGetLastError()); – user45690 Nov 30 '16 at 14:36
  • After adding this, I get an error on the cudaMemcpy line that says "invalid argument". – user45690 Nov 30 '16 at 14:37

1 Answers1

5
  1. Any time you are having trouble with a CUDA code, you should use proper cuda error checking and run your code with cuda-memcheck, before asking others for help. Even if you don't understand the error output, it will be useful for others trying to help you. If you had used proper cuda error checking here, you would be informed that your cudaMemcpy operations are reporting an invalid argument, due to item 3 below.
  2. Your code will not compile. cpu is not defined anywhere.
  3. We don't allocate for, or create device pointers like this:

    int d_inputs[N];
    double d_outputs[N];
    

    Those create stack variables (arrays), which the compiler is allowed to treat as if they were constant pointers. Instead you should do it like this:

    int *d_inputs;
    double *d_outputs;
    

    With these declarations, the compiler understands that these are modifiable pointers (which you will modify later with cudaMalloc).

  4. Once you fix the issue in item 3, this will not be legal:

    printf("test %d \n", d_inputs[1]);
    

    as this requires dereferencing a device pointer (d_inputs) in host code, which is illegal in CUDA, at least as you have done so here. You have a similar problem in the printf statement later in your code as well (with d_outputs).

The following code has the above items addressed to some degree, and seems to run correctly for me:

$ cat t44.cu
#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/time.h>
#include <math.h>

#define N 32

// Element-wise kernel: for every global thread index below N, converts the
// integer input to double, adds 1.0 and writes it to the matching output slot.
//
//   niters    - unused at present; it only drives the disabled loop noted below
//   d_inputs  - integer inputs, read at the thread's global index
//   d_outputs - double outputs, written at the thread's global index
//
// Expects a 1D grid of 1D blocks; threads with an index of N or more exit
// without touching memory.
__global__ void Kernel_double(int niters, int* d_inputs,double* d_outputs)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard clause: surplus threads have no element to process.
    if (idx >= N) {
        return;
    }

    // Disabled iterative workload (the only consumer of niters), kept for reference:
    //   for (int iter = 0; iter < niters; iter++) {
    //       val = (sqrt(pow(val,2.0)) + 5.0) - 101.0;
    //       val = (val / 3.0) + 102.0;
    //       val = (val + 1.07) - 103.0;
    //       val = (val / 1.037) + 104.0;
    //       val = (val + 3.00) - 105.0;
    //       val = (val / 0.22) + 106.0;
    //   }
    // Debug aid, also disabled: printf("This is %f\n", val);

    d_outputs[idx] = static_cast<double>(d_inputs[idx]) + 1.0;
}

// Evaluates a CUDA runtime call and, on failure, prints the file, line and
// CUDA error string to stderr and returns EXIT_FAILURE from the enclosing
// function. Intended for use inside main() only (it contains a `return`).
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaCheckErr_ = (call);                                 \
        if (cudaCheckErr_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(cudaCheckErr_));                     \
            return EXIT_FAILURE;                                            \
        }                                                                   \
    } while (0)
#endif

// Host driver: fills a host array with 1..N, copies it to the device, runs
// Kernel_double over it, copies the results back and prints the first entries.
//
// Returns EXIT_SUCCESS when every CUDA call succeeds, EXIT_FAILURE (after
// printing a diagnostic to stderr) as soon as one fails. On an early failure
// the device buffers are not freed explicitly; the process is about to exit.
int main(int argc, char **argv)
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    int niters = 10;
    int cpu = 0;
    printf("Iterate %d times with GPU 0 or CPU 1: %d\n", niters, cpu);

    int inputs[N];
    for (int i = 0; i<N; i++){
        inputs[i] = i+1;
    }

    // Device buffers must be plain pointers so cudaMalloc can store the
    // device address in them.
    int *d_inputs = NULL;
    double *d_outputs = NULL;
    double outputs[N];

    CUDA_CHECK(cudaMalloc( (void**)&d_inputs, N*sizeof(int)));
    CUDA_CHECK(cudaMalloc( (void**)&d_outputs, N*sizeof(double)));
    printf("test %d \n", inputs[3]);
    CUDA_CHECK(cudaMemcpy(d_inputs, inputs, N*sizeof(int), cudaMemcpyHostToDevice));
    // d_inputs and d_outputs are device pointers: never dereference them in
    // host code (e.g. do not printf d_inputs[1] here).

    // Ceil-divide so the grid always covers N elements; for N == 32 this is
    // the same 16 blocks x 2 threads launch as before.
    const int threadsPerBlock = 2;
    const int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    Kernel_double<<<numBlocks, threadsPerBlock>>>(niters, d_inputs,d_outputs);
    // A kernel launch returns no status itself; launch errors are reported here.
    CUDA_CHECK(cudaGetLastError());

    // This blocking copy runs after the kernel in the same (default) stream, so
    // no separate cudaDeviceSynchronize() is needed; errors raised while the
    // kernel executes are reported by this call.
    CUDA_CHECK(cudaMemcpy(outputs, d_outputs, N*sizeof(double), cudaMemcpyDeviceToHost));

    // Print at most the first 10 results without reading past the N entries.
    const int shown = (N < 10) ? N : 10;
    for(int j =0;j<shown; j++){
        printf("Outputs[%d] is: %f\n",j, outputs[j]);
    }

    CUDA_CHECK(cudaFree(d_inputs));
    CUDA_CHECK(cudaFree(d_outputs));

    return EXIT_SUCCESS;
}
$ nvcc -lineinfo -arch=sm_61 -o t44 t44.cu
$ cuda-memcheck ./t44
========= CUDA-MEMCHECK
Iterate 10 times with GPU 0 or CPU 1: 0
test 4
Outputs[0] is: 2.000000
Outputs[1] is: 3.000000
Outputs[2] is: 4.000000
Outputs[3] is: 5.000000
Outputs[4] is: 6.000000
Outputs[5] is: 7.000000
Outputs[6] is: 8.000000
Outputs[7] is: 9.000000
Outputs[8] is: 10.000000
Outputs[9] is: 11.000000
========= ERROR SUMMARY: 0 errors
$
Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257