-1

I have a thrust::device_vector <float> vec. Assume that vec.size() = L and that N < L. I want to find the largest N elements in vec along with their indices. How can we do this efficiently using raw CUDA or thrust?

Nathan Tuggy
  • 2,237
  • 27
  • 30
  • 38
Hieu Pham
  • 97
  • 1
  • 8

2 Answers

2

A simple solution is to first sort the values and then select the last N elements.

The following example selects the N=5 largest elements and their original indices from L=18 values.

compile using

nvcc -std=c++11 nlargest.cu -o nlargest


output when running ./nlargest

d_values:   1   2   3   4   5   6   7   8   9   4   5   6   7   8   9   0   1   2   
d_indices:  0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  
d_values:   0   1   1   2   2   3   4   4   5   5   6   6   7   7   8   8   9   9   
d_indices:  15  0   16  1   17  2   3   9   4   10  5   11  6   12  7   13  8   14  
d_values_s: 7   8   8   9   9   
d_indices_s:12  7   13  8   14  

nlargest.cu

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/sequence.h>
#include <iostream>

// PRINTER(x) prints the container `x`, using the spelling of the argument
// itself (stringified with #) as the label.
#define PRINTER(container) print(#container, (container))

// Writes `name`, a colon and a tab, then every element of `v` followed by a
// tab, then ends the line and flushes std::cout.
//
//   name  label printed in front of the elements
//   v     any container type of the form V<T, ...> exposing begin()/end()
//
// NOTE(review): std::ostream_iterator is declared in <iterator>, which this
// file does not include directly.
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    typedef std::ostream_iterator<T> TabSeparatedWriter;

    std::cout << name << ":\t";

    // Each element is written followed by "\t", so the line ends with a tab.
    TabSeparatedWriter sink(std::cout, "\t");
    thrust::copy(v.begin(), v.end(), sink);

    std::cout << std::endl;
}

// Convenience wrapper: bundles any number of iterators into one
// thrust::zip_iterator, so callers can write zip(a, b) instead of
// thrust::make_zip_iterator(thrust::make_tuple(a, b)).
//
//   its  the iterators to combine, taken by value
//
// Returns a zip_iterator over a thrust::tuple of the given iterator types.
template<typename... Iterators>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
    typedef thrust::tuple<Iterators...> IteratorTuple;

    const IteratorTuple bundled = thrust::make_tuple(its...);
    return thrust::make_zip_iterator(bundled);
}


// Demo: select the `select_size` largest of `size` float values together with
// their original positions.
//
// Steps:
//   1. copy the values to the device and create the index sequence 0..size-1,
//   2. sort the values ascending, carrying the indices along,
//   3. copy the last `select_size` (value, index) pairs into separate vectors.
//
// The vectors are printed before the sort, after the sort, and after the
// selection. Returns 0.
int main()
{

    const int size = 18;
    const int select_size = 5;

    // Step 3 computes d_values.end() - select_size; that iterator is only
    // valid when select_size does not exceed the number of elements.
    static_assert(select_size >= 0 && select_size <= size,
                  "select_size must be in the range [0, size]");

    float values[size] = {1,2,3,
                          4,5,6,
                          7,8,9,
                          4,5,6,
                          7,8,9,
                          0,1,2
    };

    thrust::host_vector<float> h_values (values, values+size);
    thrust::device_vector<float> d_values = h_values;
    thrust::device_vector<int> d_indices(size);
    // d_indices[i] = i: remembers where each value originally lived.
    thrust::sequence(d_indices.begin(), d_indices.end());

    PRINTER(d_values);
    PRINTER(d_indices);

    // Sort by value and permute the indices alongside. The stable variant
    // keeps equal values in their incoming order; since the indices start out
    // ascending, ties end up ordered by ascending index, which is the same
    // result as sorting (value, index) tuples lexicographically, without
    // having to compare tuples.
    thrust::stable_sort_by_key(d_values.begin(), d_values.end(), d_indices.begin());
    PRINTER(d_values);
    PRINTER(d_indices);

    thrust::device_vector<float> d_values_s(select_size);
    thrust::device_vector<int> d_indices_s(select_size);

    // After an ascending sort the largest elements are the trailing ones.
    thrust::copy(zip(d_values.end()-select_size, d_indices.end()-select_size),
                zip(d_values.end(), d_indices.end()),
                zip(d_values_s.begin(), d_indices_s.begin())
                );
    PRINTER(d_values_s);
    PRINTER(d_indices_s);

    return 0;
}
m.s.
  • 16,063
  • 7
  • 53
  • 88
1

You should check out this question.

I like Ricky Bobby's answer (if N is much smaller than L).

I also suggest looking at the following paper: "Fast K-selection Algorithms for Graphics Processing Units" by Alabi T et al.

It provides 3 different parallel algorithms for K-selection. The bucket-select they describe performed best. The algorithm has two steps:

Step 1 (pre-processing to split the original data into vectors of size less than 2^21 elements)

Step 2:

1. Choose the bucket containing the kth element.
2. Split that bucket again.
3. Repeat until the kth element is found (the min and max of the new bucket are equal).

Now you'll already have the other k-1 greatest elements partitioned into buckets.

This method is also referred to as distributive partitioning.

Timothy Murphy
  • 1,322
  • 8
  • 16