I have a thrust::device_vector<float> vec. Assume that vec.size() = L and that N < L. I want to find the largest N elements in vec along with their indices. How can we do this efficiently using raw CUDA or Thrust?
2 Answers
A simple solution is to first sort the values in ascending order (carrying their original indices along) and then select the last N elements.
The following example selects the N=5 largest elements and their original indices from L=18 values.
Compile using:
nvcc -std=c++11 nlargest.cu -o nlargest
Output when running ./nlargest:
d_values: 1 2 3 4 5 6 7 8 9 4 5 6 7 8 9 0 1 2
d_indices: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
d_values: 0 1 1 2 2 3 4 4 5 5 6 6 7 7 8 8 9 9
d_indices: 15 0 16 1 17 2 3 9 4 10 5 11 6 12 7 13 8 14
d_values_s: 7 8 8 9 9
d_indices_s: 12 7 13 8 14
nlargest.cu
#include <iostream>
#include <iterator>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>
// Convenience wrapper around print(): the argument is stringified with '#'
// so the container is labelled with the exact expression spelled at the call
// site, and the (parenthesised) expression itself is passed as the container.
#define PRINTER(container) print(#container, (container))
// Writes one labelled, tab-separated line to std::cout:
//   "<name>:\t<elem>\t<elem>\t...\t" followed by std::endl (newline + flush).
//
// name  label printed before the elements.
// v     any container instantiated from a class template V<T, Args...>;
//       its [begin(), end()) range is streamed element by element as T.
//       thrust::copy is used so the range can be handed to Thrust rather
//       than iterated with a plain host-side loop.
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t";

    // Every element written through this iterator is followed by a tab.
    std::ostream_iterator<T> tab_separated_out(std::cout, "\t");
    thrust::copy(v.begin(), v.end(), tab_separated_out);

    std::cout << std::endl;
}
// Bundles any number of iterators into a single thrust::zip_iterator, so the
// underlying ranges can be traversed (and, e.g., sorted or copied) in lockstep.
//
// its      the iterators to combine, taken by value.
// returns  a zip_iterator over a thrust::tuple holding copies of `its`.
template<typename... Iterators>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
    const thrust::tuple<Iterators...> bundled = thrust::make_tuple(its...);
    return thrust::make_zip_iterator(bundled);
}
int main()
{
const int size = 18;
const int select_size = 5;
float values[size] = {1,2,3,
4,5,6,
7,8,9,
4,5,6,
7,8,9,
0,1,2
};
thrust::host_vector<float> h_values (values, values+size);
thrust::device_vector<float> d_values = h_values;
thrust::device_vector<int> d_indices(size);
thrust::sequence(d_indices.begin(), d_indices.end());
PRINTER(d_values);
PRINTER(d_indices);
thrust::sort(zip(d_values.begin(), d_indices.begin()),zip(d_values.end(), d_indices.end()));
PRINTER(d_values);
PRINTER(d_indices);
thrust::device_vector<float> d_values_s(select_size);
thrust::device_vector<int> d_indices_s(select_size);
thrust::copy(zip(d_values.end()-select_size, d_indices.end()-select_size),
zip(d_values.end(), d_indices.end()),
zip(d_values_s.begin(), d_indices_s.begin())
);
PRINTER(d_values_s);
PRINTER(d_indices_s);
return 0;
}

- 16,063
- 7
- 53
- 88
You should check out this question.
I like Ricky Bobby's answer (if N is much smaller than L).
I also suggest looking at the following paper. Fast K-selection Algorithm for Graphics Processing Units by Alabi T et al.
It provides 3 different parallel algorithms for K-selection. The bucket-select they describe performed best. The algorithm has two steps:
Step 1 (pre-processing to split the original data into vectors of size less than 2^21 elements)
Step 2:
1. Choose the bucket containing the Kth element.
2. Split that bucket again.
3. Repeat until the Kth element is found (the min and max of the new bucket are equal).
Now you'll already have the other k-1 greatest elements partitioned into buckets.
This method is also referred to as distributive partitioning.

- 1,322
- 8
- 16
-
Link is dead, please replace – Krupip Jul 10 '19 at 14:03
-
@opa I replaced the link but could no longer find a link to the direct paper, only to its download page from ACM. – Timothy Murphy Jul 24 '19 at 18:23
-
I had to wayback machine it – Krupip Jul 24 '19 at 21:23