I am looking at switching from nvidia to amd for my compute card because I want double precision support. Before doing this I decided to learn opencl on my nvidia card to see if I like it. I want to convert the following code from CUDA to OpenCL. I am using the curand library to generate uniformly and normally distributed random numbers. Each thread needs to be able to create a different sequence of random numbers and generate a few million per thread. Here is the code. How would I go about this in OpenCL. Everything I have read online seems to imply that I should generate a buffer of random numbers and then use that on the gpu but this is not practical for me.
template<int NArgs, typename OptimizationFunctor>
__global__
void statistical_solver_kernel(float* args_lbounds,
float* args_ubounds,
int trials,
int initial_temp,
unsigned long long seed,
float* results,
OptimizationFunctor f)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx >= trials)
return;
curandState rand;
curand_init(seed, idx, 0, &rand);
float x[NArgs];
for(int i = 0; i < NArgs; i++)
{
x[i] = curand_uniform(&rand) * (args_ubounds[i]- args_lbounds[i]) + args_lbounds[i];
}
float y = f(x);
for(int t = initial_temp - 1; t > 0; t--)
{
float t_percent = (float)t / initial_temp;
float x_prime[NArgs];
for(int i = 0; i < NArgs; i++)
{
x_prime[i] = curand_normal(&rand) * (args_ubounds[i] - args_lbounds[i]) * t_percent + x[i];
x_prime[i] = fmaxf(args_lbounds[i], x_prime[i]);
x_prime[i] = fminf(args_ubounds[i], x_prime[i]);
}
float y_prime = f(x_prime);
if(y_prime < y || (y_prime - y) / y_prime < t_percent)
{
y = y_prime;
for(int i = 0; i < NArgs; i++)
{
x[i] = x_prime[i];
}
}
}
float* rptr = results + idx * (NArgs + 1);
rptr[0] = y;
for(int i = 1; i <= NArgs; i++)
rptr[i] = x[i - 1];
}