I would like to know how to generate a Cartesian product of more than two lists using CUDA.
How do I make this code work with three or more lists?
It works with two lists but not with three, I tried /, % without success.
Its basic.
#include <thrust/device_vector.h>
#include <thrust/pair.h>
#include <thrust/copy.h>
#include <iterator>
__global__ void cartesian_product(const int *a, size_t a_size,
const int *b, size_t b_size,
const int *c, size_t c_size)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < a_size * b_size * c_size)
{
unsigned int a_idx = idx / a_size;
unsigned int b_idx = idx % a_size;
// ?
unsigned int c_idx = idx % a_size;
printf("a[a_idx] and b[b_idx] and c[c_idx] are: %d %d %d\n\n",a[a_idx], b[b_idx], c[c_idx]);
//1 3 5 , 1 3 6 , 1 4 5 , 1 4 6 , 2 3 5 , 2 3 6 , 2 4 5 , 2 4 6
//0 0 0 , 0 0 1 , 0 1 0 , 0 1 1 , 1 0 0 , 1 0 1 , 1 1 0 , 1 1 1
}
}
int main()
{
// host_vector is stored in host memory while device_vector livesin GPU device memory.
// a has storage for 2 integers
thrust::device_vector<int> a(2);
// initialize individual elements
a[0] = 1;
a[1] = 2;
// b has storage for 2 integers
thrust::device_vector<int> b(2);
// initialize individual elements
b[0] = 3;
b[1] = 4;
// d has storage for 2 integers
thrust::device_vector<int> c(2);
// initialize individual elements
c[0] = 5;
c[1] = 6;
unsigned int block_size = 256;
unsigned int num_blocks = (8 + (block_size - 1)) / block_size;
// raw_pointer_cast creates a "raw" pointer from a pointer-like type, simply returning the wrapped pointer, should it exist.
cartesian_product<<<num_blocks, block_size>>>(thrust::raw_pointer_cast(a.data()), a.size(),
thrust::raw_pointer_cast(b.data()), b.size(),
thrust::raw_pointer_cast(c.data()), c.size());
return 0;
}
How do I get the right c_idx in the kernel and subsequent arrays if I want more than three lists?