0

My CUDA code fails with the error "unspecified launch failure", which is reported at the cudaMemcpy call. I run the code on my own computer with Windows 10 and CUDA 8.0. I have checked my code many times and have not found the problem. I do not think I have an out-of-bounds memory access, and I have checked that my kernel function is all right. Could you please help me with my code? Here is my code:

// Computes one entry of a pairwise squared-Euclidean-distance matrix.
//
// `a` is read as a row-major matrix whose rows are 10240 floats long; `dist`
// is written as a row-major matrix whose rows are 1024 floats long. Both
// sizes are hard-coded, so the buffers passed in must be at least that large
// for every (blockIdx.x, threadIdx.x) pair that is launched.
//
// Launch layout: 1-D grid and 1-D block. blockIdx.x selects row i and
// threadIdx.x selects row j; the thread stores
//     dist[i * 1024 + j] = sum_k (a[i][k] - a[j][k])^2.
// There is no bounds check, so the grid and block sizes must not exceed the
// number of rows actually allocated in `a` (and 1024 for threadIdx.x).
//
// NOTE(review): adjacent threads in a block read addresses 10240 floats
// apart (a[j * 10240 + k] with j = threadIdx.x), so the loads of row j are
// strided rather than contiguous. The kernel is correct but slow.
__global__ void Add(float* a,float* dist)
{
    const int i = blockIdx.x;
    const int j = threadIdx.x;

    // Hoist the row base pointers out of the loop; both rows are read-only.
    const float* rowI = a + i * 10240;
    const float* rowJ = a + j * 10240;

    // Accumulate in a register and write the result exactly once; the
    // earlier zero-store to dist was always overwritten and has been removed.
    float sum = 0.0f;
    for (int k = 0; k < 10240; k++)
    {
        const float diff = rowI[k] - rowJ[k];
        sum += diff * diff;
    }
    dist[i * 1024 + j] = sum;
}

// Builds a 1024 x 10240 test matrix on the host, runs the Add kernel to fill
// a 1024 x 1024 matrix of pairwise squared distances, copies the result back
// and prints the elapsed clock() ticks for launch + execution + copy-back.
//
// Returns 0 on success and 1 if any allocation, copy or the kernel fails.
// All host and device buffers are released on every exit path.
int main()
{
    // Problem dimensions; these must match the constants hard-coded in Add.
    const int rows = 1024;
    const int cols = 10240;
    const size_t aBytes = sizeof(float) * rows * cols;
    const size_t distBytes = sizeof(float) * rows * rows;

    // Everything is declared before the first `goto cleanup` so that no jump
    // bypasses an initialization.
    float* a = (float*)malloc(aBytes);
    float* distance = (float*)malloc(distBytes);
    float* dev_a = NULL;
    float* dev_distance = NULL;
    cudaError_t cudaStatus = cudaSuccess;
    unsigned int start = 0;
    unsigned int last = 0;
    int exitCode = 1;

    if (a == NULL || distance == NULL) {
        // Stop here: continuing would dereference a null pointer below.
        printf("error\n");
        goto cleanup;
    }

    // Row i holds the values i + j / 100 (evaluated in double, stored as float).
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            a[i * cols + j] = i + 1.0 / 100 * j;
        }
    }
    for (int i = 0; i < rows * rows; i++)
    {
        distance[i] = 0.0f;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, aBytes);
    if (cudaStatus != cudaSuccess) {
        printf("cudaMalloc: %s\n", cudaGetErrorString(cudaStatus));
        goto cleanup;
    }
    cudaStatus = cudaMalloc((void**)&dev_distance, distBytes);
    if (cudaStatus != cudaSuccess) {
        printf("cudaMalloc: %s\n", cudaGetErrorString(cudaStatus));
        goto cleanup;
    }
    cudaStatus = cudaMemcpy(dev_a, a, aBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("cudaMemcpy: %s\n", cudaGetErrorString(cudaStatus));
        goto cleanup;
    }

    start = clock();
    Add <<<rows, rows>>>(dev_a, dev_distance);

    // cudaGetLastError only reports launch-configuration problems.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        printf( "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto cleanup;
    }

    // The launch is asynchronous: faults that occur while the kernel runs are
    // only reported by the next synchronizing call. Synchronize explicitly so
    // such a fault is attributed to the kernel rather than to the cudaMemcpy
    // below. A kernel that runs for seconds on a Windows display GPU can be
    // killed by the WDDM TDR watchdog, which is reported here as
    // "unspecified launch failure".
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf("addKernel execution failed: %s\n", cudaGetErrorString(cudaStatus));
        goto cleanup;
    }

    cudaStatus = cudaMemcpy(distance, dev_distance, distBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        printf("cudaMemcpy: %s\n", cudaGetErrorString(cudaStatus));
        goto cleanup;
    }

    last = clock() - start;
    printf("%u", last);
    exitCode = 0;

cleanup:
    // cudaFree(NULL) and free(NULL) are no-ops, so this is safe on every path.
    cudaFree(dev_a);
    cudaFree(dev_distance);
    free(a);
    free(distance);
    return exitCode;
}
talonmies
  • 70,661
  • 34
  • 192
  • 269
hjw
  • 1
  • 1
    Your code runs without errors for me. However the kernel will take 2 seconds or more to run, probably. You may be hitting a timeout. Since you are on windows, you may want to google "cuda WDDM TDR" and start reading. – Robert Crovella May 15 '17 at 03:54
  • Thank you for your answer above. The problem was that the kernel runs for more than 2 seconds on the GPU. After I changed my system settings, the code works correctly. Thank you very much. – hjw May 19 '17 at 05:29

0 Answers0