A simple idea would be to assign each array to a thread block, which is used to calculate the maximum/minimum using the parallel reduction approach. The problem here is the size of the shared memory, which is why I regard this approach as critical.
There is no problem with shared memory size. You may wish to review Mark Harris' canonical parallel reduction tutorial and look at the later methods to understand how we can use a loop to populate shared memory, reducing values into shared memory as we go. Once the input loop is completed, then we begin the block-sweep phase of the reduction. This doesn't impose any special requirements on the shared memory per block.
Here's a worked example demonstrating both a thrust::reduce_by_key
method (single call) and a CUDA block-segmented method (single kernel call):
$ cat t1535.cu
#include <iostream>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/functional.h>
#include <cstdlib>
#define IMAX(x,y) (x>y)?x:y
#define IMIN(x,y) (x<y)?x:y
typedef int dtype;
const int ncubes = 3;
struct Cube {
dtype* x;
dtype* y;
dtype* z;
int size;
};
struct my_f
{
template <typename T1, typename T2>
__host__ __device__
thrust::tuple<dtype,dtype> operator()(T1 t1, T2 t2){
thrust::tuple<dtype,dtype> r;
thrust::get<0>(r) = IMAX(thrust::get<0>(t1),thrust::get<0>(t2));
thrust::get<1>(r) = IMIN(thrust::get<1>(t1),thrust::get<1>(t2));
return r;
}
};
const int MIN = -1;
const int MAX = 0x7FFFFFFF;
const int BS = 512;
template <typename T>
__global__ void block_segmented_minmax_reduce(const T * __restrict__ in, T * __restrict__ max, T * __restrict__ min, const size_t * __restrict__ slen){
__shared__ T smax[BS];
__shared__ T smin[BS];
size_t my_seg_start = slen[blockIdx.x];
size_t my_seg_size = slen[blockIdx.x+1] - my_seg_start;
smax[threadIdx.x] = MIN;
smin[threadIdx.x] = MAX;
for (size_t idx = my_seg_start+threadIdx.x; idx < my_seg_size; idx += BS){
T my_val = in[idx];
smax[threadIdx.x] = IMAX(my_val, smax[threadIdx.x]);
smin[threadIdx.x] = IMIN(my_val, smin[threadIdx.x]);}
for (int s = BS>>1; s > 0; s>>=1){
__syncthreads();
if (threadIdx.x < s){
smax[threadIdx.x] = IMAX(smax[threadIdx.x], smax[threadIdx.x+s]);
smin[threadIdx.x] = IMIN(smin[threadIdx.x], smin[threadIdx.x+s]);}
}
if (!threadIdx.x){
max[blockIdx.x] = smax[0];
min[blockIdx.x] = smin[0];}
}
int main() {
// data setup
Cube *c = new Cube[ncubes];
thrust::host_vector<size_t> csize(ncubes+1);
csize[0] = 100;
csize[1] = 1047;
csize[2] = 5000;
csize[3] = 0;
c[0].x = new dtype[csize[0]];
c[1].x = new dtype[csize[1]];
c[2].x = new dtype[csize[2]];
size_t ctot = 0;
for (int i = 0; i < ncubes; i++) ctot+=csize[i];
// method 1: thrust
// concatenate
thrust::host_vector<dtype> h_d(ctot);
size_t start = 0;
for (int i = 0; i < ncubes; i++) {thrust::copy_n(c[i].x, csize[i], h_d.begin()+start); start += csize[i];}
for (size_t i = 0; i < ctot; i++) h_d[i] = rand();
thrust::device_vector<dtype> d_d = h_d;
// build flag vector
thrust::device_vector<int> d_f(d_d.size());
thrust::host_vector<size_t> coff(csize.size());
thrust::exclusive_scan(csize.begin(), csize.end(), coff.begin());
thrust::device_vector<size_t> d_coff = coff;
thrust::scatter(thrust::constant_iterator<int>(1), thrust::constant_iterator<int>(1)+ncubes, d_coff.begin(), d_f.begin());
thrust::inclusive_scan(d_f.begin(), d_f.end(), d_f.begin());
// min/max reduction
thrust::device_vector<dtype> d_max(ncubes);
thrust::device_vector<dtype> d_min(ncubes);
thrust::reduce_by_key(d_f.begin(), d_f.end(), thrust::make_zip_iterator(thrust::make_tuple(d_d.begin(), d_d.begin())), thrust::make_discard_iterator(), thrust::make_zip_iterator(thrust::make_tuple(d_max.begin(), d_min.begin())), thrust::equal_to<int>(), my_f());
thrust::host_vector<dtype> h_max = d_max;
thrust::host_vector<dtype> h_min = d_min;
std::cout << "Thrust Maxima: " <<std::endl;
thrust::copy_n(h_max.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
std::cout << std::endl << "Thrust Minima: " << std::endl;
thrust::copy_n(h_min.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
std::cout << std::endl;
// method 2: CUDA kernel (block reduce)
block_segmented_minmax_reduce<<<ncubes, BS>>>(thrust::raw_pointer_cast(d_d.data()), thrust::raw_pointer_cast(d_max.data()), thrust::raw_pointer_cast(d_min.data()), thrust::raw_pointer_cast(d_coff.data()));
thrust::copy_n(d_max.begin(), ncubes, h_max.begin());
thrust::copy_n(d_min.begin(), ncubes, h_min.begin());
std::cout << "CUDA Maxima: " <<std::endl;
thrust::copy_n(h_max.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
std::cout << std::endl << "CUDA Minima: " << std::endl;
thrust::copy_n(h_min.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1535 t1535.cu
$ ./t1535
Thrust Maxima:
2145174067,2147469841,2146753918,
Thrust Minima:
35005211,2416949,100669,
CUDA Maxima:
2145174067,2147469841,2146753918,
CUDA Minima:
35005211,2416949,100669,
$
For a small number of Cube
objects, the thrust method is likely to be faster. It will tend to make better use of medium to large GPUs than the block method will. For a large number of Cube
objects, the block method should also be fairly efficient.