Is there a function to perform nested thrust function?-CodePudding

I'm new to CUDA and the Thrust library. I'm learning and trying to implement a function that calls a Thrust algorithm inside a for loop. Is there a way to replace this loop with another Thrust call? Or should I write a CUDA kernel to achieve this?

I have come up with code like this

// Unary predicate for Thrust algorithms such as thrust::count_if.
// Holds a threshold fixed at construction and reports whether a given
// value is strictly greater than that threshold.
struct GreaterthanX
{
    // Stores the threshold that every later call compares against.
    GreaterthanX(float x)
        : _x(x)
    {
    }

    // Returns true when `a` is strictly greater than the stored threshold.
    // Marked __host__ __device__ so it can be invoked from either side.
    __host__ __device__ bool operator()(const float &a) const
    {
        const bool exceeds_threshold = _x < a;
        return exceeds_threshold;
    }

    // Comparison threshold; immutable after construction.
    const float _x;
};

// Counts, for every row of a small row-major matrix, how many elements are
// greater than 2, then prints the largest count and the row it belongs to.
// Returns 0.
int main(void)
{
    // Row-major matrix contents:
    // 3 2 4 5
    // 0 -2 3 1
    // 9 8 7 6
    int row = 3;
    int col = 4;
    const int host_values[] = {3, 2, 4, 5,
                               0, -2, 3, 1,
                               9, 8, 7, 6};

    // Construct the device vector from the host array in one range
    // construction instead of assigning the twelve elements one by one.
    thrust::device_vector<int> vec(host_values, host_values + row * col);
    thrust::device_vector<int> count(row);

    // Goal: For each row, count the number of elements greater than 2.
    // And then find the row with the max count

    // Row i occupies the half-open index range [i * col, (i + 1) * col), so
    // both ends of the count_if range must be offset by the row index.
    // For the data above this yields count = {3, 1, 4}.
    for (int i = 0; i < row; i++)
    {
        count[i] = thrust::count_if(vec.begin() + i * col,
                                    vec.begin() + (i + 1) * col,
                                    GreaterthanX(2));
    }

    thrust::device_vector<int>::iterator result = thrust::max_element(count.begin(), count.end());
    int max_val = *result;
    // Convert the iterator difference to int so it matches the %d specifier.
    int position = static_cast<int>(result - count.begin());

    printf("result = %d at position %d\r\n", max_val, position);
    // result = 4 at position 2

    return 0;
}

My goal is to find the row that has the most elements greater than 2. I'm struggling with how to do this without a loop. Any suggestions would be much appreciated. Thanks.

CodePudding user response:

Here is an implementation using thrust::reduce_by_key in conjunction with multiple smart iterators.

I also took the liberty of sprinkling in some const, auto and lambdas for elegance and readability. Because of the lambdas, you will need to pass the --extended-lambda flag to nvcc.

thrust::distance is the canonical way of subtracting Thrust iterators.

#include <cassert>
#include <cstdio>

#include <thrust/device_vector.h>
#include <thrust/distance.h>
#include <thrust/extrema.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>

// Counts, for every row of a small row-major matrix, how many elements are
// greater than 2 using a single thrust::reduce_by_key call, then prints the
// largest count and the row it belongs to. Returns 0.
int main(void)
{
    // Row-major matrix contents:
    // 3 2 4 5
    // 0 -2 3 1
    // 9 8 7 6
    int const row = 3;
    int const col = 4;
    int const host_values[] = {3, 2, 4, 5,
                               0, -2, 3, 1,
                               9, 8, 7, 6};

    // Construct the device vector from the host array in one range
    // construction instead of assigning the twelve elements one by one.
    thrust::device_vector<int> vec(host_values, host_values + row * col);
    thrust::device_vector<int> count(row);

    // Goal: For each row, count the number of elements greater than 2.
    // And then find the row with the max count

    // Keys are the flat indices 0 .. row * col - 1 into vec, generated by a
    // counting iterator rather than stored in a vector.
    auto keys_in_begin = thrust::make_counting_iterator(0);
    auto keys_in_end = thrust::make_counting_iterator(row * col);

    // Values are vec's elements mapped on the fly to 1 (greater than 2) or 0,
    // so summing them per row gives the per-row count.
    auto vals_in_begin = thrust::make_transform_iterator(
        vec.cbegin(),
        [] __device__ (int val) { return val > 2 ? 1 : 0; });

    // The reduced keys are not needed, so they go to a discard iterator.
    auto keys_out_begin = thrust::make_discard_iterator();

    auto vals_out_begin = count.begin();

    // Two flat indices count as the same key when they fall in the same row.
    // the divisions are the reason one might rather use MatX for higher dimensional data
    auto binary_predicate = [col] __device__ (int i, int j){ return i / col == j / col; };

    // reduce_by_key returns a pair of end iterators for the key and value
    // outputs, since the number of reduced segments is generally not known
    // beforehand.
    auto new_ends = thrust::reduce_by_key(keys_in_begin, keys_in_end,
                                          vals_in_begin,
                                          keys_out_begin,
                                          vals_out_begin,
                                          binary_predicate);

    // Make sure the number of values written matches the size of count.
    auto const count_end = new_ends.second;
    assert(count_end == count.end());
    (void)count_end; // keeps NDEBUG builds free of an unused-variable warning

    auto result = thrust::max_element(count.begin(), count.end());
    int const max_val = *result;
    auto const position = thrust::distance(count.begin(), result);

    // position has the iterator's difference type, which need not be int;
    // cast it so the argument matches the %d conversion specifier.
    printf("result = %d at position %d\r\n", max_val, static_cast<int>(position));
    // result = 4 at position 2

    return 0;
}