One possible approach would be to use a sequence of:
thrust::transform
- to convert the input data to all 1 or 0:
0 27 42 0 18 99 94 91 0 -- input data
0 1 1 0 1 1 1 1 0 -- this will be our "mask vector"
thrust::inclusive_scan
- to convert the mask vector into a progressive sequence:
0 1 1 0 1 1 1 1 0 -- "mask" vector
0 1 2 2 3 4 5 6 6 -- "sequence" vector
Another thrust::transform
to mask off the non-increasing values:
0 1 1 0 1 1 1 1 0 -- "mask" vector
0 1 2 2 3 4 5 6 6 -- "sequence" vector
-------------------------
0 1 2 0 3 4 5 6 0 -- result of "AND" operation
Note that we could combine the first two steps with thrust::transform_inclusive_scan
and then perform the third step as a thrust::transform
with a slightly different transform functor. This modification allows us to dispense with the creation of the temporary "mask" vector.
Here's a fully worked example showing the "modified" approach using thrust::transform_inclusive_scan
:
$ cat t635.cu
#include <iostream>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include <thrust/transform_scan.h>
#include <thrust/generate.h>
#include <thrust/copy.h>
#define DSIZE 20
#define PCT_ZERO 40
struct my_unary_op
{
__host__ __device__
int operator()(const int data) const
{
return (!data) ? 0:1;}
};
struct my_binary_op
{
__host__ __device__
int operator()(const int d1, const int d2) const
{
return (!d1) ? 0:d2;}
};
int main(){
// generate DSIZE random 32-bit integers, PCT_ZERO% are zero
thrust::host_vector<int> h_data(DSIZE);
thrust::generate(h_data.begin(), h_data.end(), rand);
for (int i = 0; i < DSIZE; i++)
if ((rand()%100)< PCT_ZERO) h_data[i] = 0;
else h_data[i] %= 1000;
thrust::device_vector<int> d_data = h_data;
thrust::device_vector<int> d_result(DSIZE);
thrust::transform_inclusive_scan(d_data.begin(), d_data.end(), d_result.begin(), my_unary_op(), thrust::plus<int>());
thrust::transform(d_data.begin(), d_data.end(), d_result.begin(), d_result.begin(), my_binary_op());
thrust::copy(d_data.begin(), d_data.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
thrust::copy(d_result.begin(), d_result.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -o t635 t635.cu
$ ./t635
0,886,777,0,793,0,386,0,649,0,0,0,0,59,763,926,540,426,0,736,
0,1,2,0,3,0,4,0,5,0,0,0,0,6,7,8,9,10,0,11,
$
Responding to the update, this new information makes the problem more difficult to solve, in my opinion. Histogramming techniques come to mind, but without any limits on the occupied range of the 32 bit integers (labels) or any limits on the number of times a particular label may be duplicated within the data set, histogramming techniques seem impractical. This leads me to consider sorting the data.
An approach like this should work:
- Use
thrust::sort
to sort the data.
- Use
thrust::unique
to remove the duplicates.
- The sorted data with duplicates removed now gives us our ordering for our output set [0,1,2, ...]. Let's call this our "map". We can use a parallel binary-search technique to convert each label in the original data set to it's mapped output value.
This process seems pretty "expensive" to me. I would suggest reconsidering the upstream labelling operation to see if it can be re-designed to produce a data set more amenable to efficient downstream processing.
Anyway here is a fully worked example:
$ cat t635.cu
#include <iostream>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/copy.h>
#define DSIZE 20
#define PCT_ZERO 40
#define RNG 10
#define nTPB 256
// sets idx to the index of the first element in a that is
// equal to or larger than key
__device__ void bsearch_range(const int *a, const int key, const unsigned len_a, unsigned *idx){
unsigned lower = 0;
unsigned upper = len_a;
unsigned midpt;
while (lower < upper){
midpt = (lower + upper)>>1;
if (a[midpt] < key) lower = midpt +1;
else upper = midpt;
}
*idx = lower;
return;
}
__global__ void find_my_idx(const int *a, const unsigned len_a, int *my_data, int *my_idx, const unsigned len_data){
unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
if (idx < len_data){
unsigned sp_a;
int val = my_data[idx];
bsearch_range(a, val, len_a, &sp_a);
my_idx[idx] = sp_a;
}
}
int main(){
// generate DSIZE random 32-bit integers, PCT_ZERO% are zero
thrust::host_vector<int> h_data(DSIZE);
thrust::generate(h_data.begin(), h_data.end(), rand);
for (int i = 0; i < DSIZE; i++)
if ((rand()%100)< PCT_ZERO) h_data[i] = 0;
else h_data[i] %= RNG;
thrust::device_vector<int> d_data = h_data;
thrust::device_vector<int> d_result = d_data;
thrust::sort(d_result.begin(), d_result.end());
thrust::device_vector<int> d_unique = d_result;
int unique_size = thrust::unique(d_unique.begin(), d_unique.end()) - d_unique.begin();
find_my_idx<<< (DSIZE+nTPB-1)/nTPB , nTPB >>>(thrust::raw_pointer_cast(d_unique.data()), unique_size, thrust::raw_pointer_cast(d_data.data()), thrust::raw_pointer_cast(d_result.data()), DSIZE);
thrust::copy(d_data.begin(), d_data.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
thrust::copy(d_result.begin(), d_result.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc t635.cu -o t635
$ ./t635
0,6,7,0,3,0,6,0,9,0,0,0,0,9,3,6,0,6,0,6,
0,2,3,0,1,0,2,0,4,0,0,0,0,4,1,2,0,2,0,2,
$