I would like to generate an array of random integers on the GPU. I found some solutions in the NVIDIA documentation and, based on them, wrote the simple code below. It works well, but only if the arraySize variable is less than or equal to 291670. For greater values, calling cudaDeviceSynchronize() returns cudaErrorLaunchFailure (error 4) - "unspecified launch failure".
In my solution I need much longer arrays.
Is this a restriction on the array length, or is it a mistake on my side?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <curand_kernel.h>
#include <helper_cuda.h>
#include <curand.h>
#include <stdio.h>
#include <algorithm>
#include <ctime>
#include <iostream>
#include <cstdlib>
// Launch-configuration limits and problem size.
// These are plain host-side constants: in the code visible here they are only
// read by host functions (the kernel receives its own `arraySize` parameter),
// and a `__device__` variable must not be read directly from host code.
const int MAX_THREADS_PER_BLOCK = 1024;
// Upper bound applied to the number of blocks in a launch.
const int MAX_BLOCKS = 65535;
// Number of generator states to allocate and initialize.
const unsigned int arraySize = 291670;
/**
 * Initializes one cuRAND generator state per array element.
 *
 * Expects a 1D launch (only the .x components of the thread/block indices are
 * used). Every thread starts at its global index and advances by the total
 * number of launched threads, so any 1D grid size covers all entries.
 *
 * @param seed      Seed passed unchanged to every curand_init call.
 * @param states    Array with room for at least `arraySize` states; entry i
 *                  is written by the thread that handles index i.
 * @param arraySize Number of states to initialize.
 *
 * Entry i is initialized with subsequence number i and offset 0.
 *
 * NOTE(review): the run time of this kernel grows with `arraySize`; on systems
 * where a display watchdog limits kernel run time, a long launch may be
 * terminated and reported as a launch failure - confirm by timing the kernel.
 */
__global__ void _rndInit_(unsigned int seed, curandState_t* states, unsigned int arraySize) {
    // Use an explicit 64-bit index and stride: `long` is only 32 bits wide on
    // some platforms, and the products below are otherwise computed in 32 bits.
    const unsigned long long stride =
        static_cast<unsigned long long>(blockDim.x) * gridDim.x;
    unsigned long long tid =
        threadIdx.x + static_cast<unsigned long long>(blockIdx.x) * blockDim.x;

    for (; tid < arraySize; tid += stride) {
        curand_init(seed, tid, 0, &states[tid]);
    }
}
/**
 * Launches the _rndInit_ kernel to initialize `arraySize` cuRAND states.
 *
 * @param seed      Seed forwarded to the kernel.
 * @param states    Device array with room for at least `arraySize` states.
 * @param arraySize Number of states to initialize; values <= 0 are a no-op.
 *
 * The launch is asynchronous: only launch-configuration errors are checked
 * here. Errors raised while the kernel runs surface at the caller's next
 * synchronizing CUDA call.
 */
void rndInit(unsigned int seed, curandState_t* states, int arraySize) {
    // Nothing to do, and a launch with zero blocks is an invalid configuration.
    if (arraySize <= 0) {
        return;
    }

    const int threads = 128;
    // Ceiling division, written so the intermediate value cannot overflow int.
    const int neededBlocks = arraySize / threads + (arraySize % threads != 0 ? 1 : 0);
    // The kernel strides over the array, so capping the grid size is safe.
    const int blocks = std::min(neededBlocks, MAX_BLOCKS);

    // Forward the caller's seed (previously ignored in favor of time(0)).
    _rndInit_ <<< blocks, threads >>>(seed, states, arraySize);
    // Kernel launches return no status; fetch launch-configuration errors explicitly.
    checkCudaErrors(cudaGetLastError());
}
/**
 * Allocates `arraySize` cuRAND states on the device, initializes them with a
 * time-based seed, waits for the kernel to finish and releases the device.
 *
 * @return 0 when every CUDA call succeeded, 1 otherwise.
 */
int main() {
    curandState_t* d_states = NULL;
    int exitCode = 0;

    checkCudaErrors(cudaMalloc((void**)&d_states,
                               static_cast<size_t>(arraySize) * sizeof(curandState_t)));

    rndInit(static_cast<unsigned int>(time(0)), d_states, static_cast<int>(arraySize));

    // Errors raised while the kernel executes are only reported here.
    cudaError_t cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaDeviceSynchronize failed: " << cudaGetErrorString(cudaStatus)
                  << " (" << cudaStatus << ")" << std::endl;
        exitCode = 1;
    }

    cudaStatus = cudaFree(d_states);
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaFree failed: " << cudaGetErrorString(cudaStatus)
                  << " (" << cudaStatus << ")" << std::endl;
        exitCode = 1;
    }

    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaDeviceReset failed: " << cudaGetErrorString(cudaStatus)
                  << " (" << cudaStatus << ")" << std::endl;
        exitCode = 1;
    }

    // Report failure to the shell if any step above failed.
    return exitCode;
}