My CUDA code fails with the error "unspecified launch failure", which is reported by cudaMemcpy. I run the code on my own computer with Windows 10 and CUDA 8.0. I have checked the code many times and have not found the problem. I do not think there is an out-of-bounds memory access, and as far as I can tell my kernel function is correct. Could you please help me with my code? Here is my code:
// Computes squared Euclidean distances between rows of `a`.
//
// `a` is read as a row-major matrix of 1024 rows by 10240 columns, and `dist`
// is written as a row-major 1024 x 1024 matrix. The thread with
// blockIdx.x == i and threadIdx.x == j writes
//     dist[i * 1024 + j] = sum over k of (a[i][k] - a[j][k])^2.
//
// Expected launch configuration: Add<<<1024, 1024>>>(a, dist), with both
// pointers referring to device memory of at least the sizes above. Blocks or
// threads with an index of 1024 or more return without touching memory.
//
// Performance note: threads of one block read rows that are 10240 floats
// apart (index j * 10240 + k with j = threadIdx.x), so neighbouring threads
// never touch neighbouring addresses. Each thread also runs 10240 iterations,
// which makes this a long-running kernel.
__global__ void Add(float* a,float* dist)
{
    const int kRows = 1024;   // number of rows in `a`; also the side of `dist`
    const int kCols = 10240;  // number of columns in `a`

    const int i = blockIdx.x;
    const int j = threadIdx.x;

    // Guard against a launch that is larger than the data.
    if (i >= kRows || j >= kRows) {
        return;
    }

    const float* rowI = a + i * kCols;
    const float* rowJ = a + j * kCols;

    // Accumulate in a register and store once; the result slot does not need
    // to be zeroed in global memory first.
    float sum = 0.0f;
    for (int k = 0; k < kCols; k++)
    {
        const float diff = rowI[k] - rowJ[k];
        sum += diff * diff;
    }
    dist[i * kRows + j] = sum;
}
// Prints "<what>: <CUDA error string>" when `status` is not cudaSuccess.
// Returns true when the call succeeded, false otherwise.
static bool checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        printf("%s: %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Host driver: fills a 1024 x 10240 matrix, runs the Add kernel on it with
// 1024 blocks of 1024 threads, copies the 1024 x 1024 result back, and prints
// the elapsed clock() ticks for kernel + device-to-host copy.
//
// Returns 0 on success and 1 if a host allocation or any CUDA call failed.
int main()
{
    const int numRows = 1024;
    const int numCols = 10240;
    const size_t aBytes = sizeof(float) * numRows * numCols;
    const size_t distBytes = sizeof(float) * numRows * numRows;

    float* a = (float*)malloc(aBytes);
    float* distance = (float*)malloc(distBytes);
    if (a == NULL || distance == NULL) {
        // Stop here: continuing would dereference a NULL pointer below.
        printf("error\n");
        free(a);
        free(distance);
        return 1;
    }

    for (int i = 0; i < numRows; i++)
    {
        for (int j = 0; j < numCols; j++)
        {
            a[i * numCols + j] = i + 1.0 / 100 * j;
        }
    }
    for (int i = 0; i < numRows * numRows; i++)
    {
        distance[i] = 0.0f;
    }

    float* dev_a = NULL;
    float* dev_distance = NULL;

    // Each step only runs if every previous CUDA call succeeded.
    bool ok = checkCuda(cudaMalloc((void**)&dev_a, aBytes), "cudaMalloc");
    ok = ok && checkCuda(cudaMalloc((void**)&dev_distance, distBytes), "cudaMalloc");
    ok = ok && checkCuda(cudaMemcpy(dev_a, a, aBytes, cudaMemcpyHostToDevice), "cudaMemcpy");

    if (ok) {
        unsigned int start = clock();

        Add<<<numRows, numRows>>>(dev_a, dev_distance);

        // cudaGetLastError reports launch-configuration problems only.
        ok = checkCuda(cudaGetLastError(), "addKernel launch failed");

        // Wait for the kernel so that a fault during execution is reported
        // here, attributed to the kernel, instead of surfacing from the
        // following cudaMemcpy.
        // NOTE(review): on Windows with a WDDM display driver, a kernel that
        // runs longer than the TDR watchdog limit is terminated and typically
        // shows up as "unspecified launch failure"; this kernel is long
        // enough that this is the likely cause - confirm on the target
        // machine (shorter launches or a raised TDR delay).
        ok = ok && checkCuda(cudaDeviceSynchronize(), "addKernel execution failed");

        ok = ok && checkCuda(cudaMemcpy(distance, dev_distance, distBytes, cudaMemcpyDeviceToHost), "cudaMemcpy");

        if (ok) {
            unsigned int last = clock() - start;
            printf("%u", last);
        }
    }

    // Release everything on every path; cudaFree and free accept NULL.
    cudaFree(dev_a);
    cudaFree(dev_distance);
    free(a);
    free(distance);

    return ok ? 0 : 1;
}