NVidia thrust arbitrary transform with three-dimensional grid-CodePudding

I want to parallelize the following nested for loop on the GPU using NVidia thrust.

// complex multiplication
inline __host__ __device__  float2 operator* (const float2 a, const float2 b) {
    return make_float2(a.x * b.x - a.y * b.y, a.x * b.y   a.y * b.x);
}

int main()
{
    const int M = 100, N = 100, K = 100;
    float2 A[M*N*K], E[M*N*K];

    float vec_M[M], vec_N[N], vec_K[K];

    for (int i_M = 0; i_M < M; i_M  )
    {
        for (int i_N = 0; i_N < N; i_N  )
        {
            for (int i_K = 0; i_K < K; i_K  )
            {
                unsigned int idx = i_M * (N * K)   i_N * K   i_K; // linear index
                float2 a = A[idx];
                float b = vec_M[i_M];
                float c = vec_N[i_N];
                float d = vec_K[i_K];
                float arg1 = (b   c) / d;
                float2 val1 = make_float2(cosf(arg1), sinf(arg1));
                E[idx].x = a * val1; // calls the custom multiplication operator
            }
        }
    }

    return 0;
}

I could implement this as

thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D.begin(), E.begin())),
                 thrust::make_zip_iterator(thrust::make_tuple(A.end(),   B.end(),   C.end(),   D.end(),   E.end())),
                 custom_functor());

similar to the example given in thrust/examples/arbitrary_transformation.cu.

However, to do that, I have to create the matrices B, C, and D, each of size M x N x K, even though three vectors of size M, N, and K are sufficient to represent the corresponding values. Creating these matrices requires (3 * M * N * K) / (M N K) more memory than just using the vectors.

Is there something like MATLAB's meshgrid() function that allows to create a 3D grid using three vectors without actually storing these matrices? In my mind the pseudo code would look like this:

thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), meshgrid(vec_M.begin(), vec_N.begin(), vec_K.begin()), E.begin())),
                 thrust::make_zip_iterator(thrust::make_tuple(A.end(),   meshgrid(vec_M.end(),   vec_N.end(),   vec_K.end()),   E.end())),
                 custom_functor());

CodePudding user response：

You can simply collapse the nested loop into a single loop and use for_each with a counting iterator. In the functor, you need to calculate the three indices from the single loop variable.

#include <iostream>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

struct Op{
    int N;
    int M;
    int K;

    Op(int n, int m, int k) : N(n), M(m), K(k){}

    void operator()(int x){
        const int k = x % K;
        const int m = (x / K) % M;
        const int n = (x / (K * M));
        std::cerr << "x = " << x << " (" << n << ", " << m << ", " << k << ")\n";
    }
};

int main(){
    const int N = 2;
    const int M = 3;
    const int K = 2;

    Op op{N,M,K};

    std::cerr << "with nested loops\n";
    for(int n = 0; n < N; n  ){
        for(int m = 0; m < M; m  ){
            for(int k = 0; k < K; k  ){
                std::cerr << "(" << n << ", " << m << ", " << k << ")\n";        
            }
        }
    }
    std::cerr << "\n";

    std::cerr << "with single loop\n";
    for(int x = 0; x < N * M * K; x  ){
        op(x);
    }
    std::cerr << "\n";

    std::cerr << "with thrust\n";
    auto xIterator = thrust::make_counting_iterator(0);
    thrust::for_each(thrust::seq, xIterator, xIterator   N*M*K, op);
}

Output

with nested loops
(0, 0, 0)
(0, 0, 1)
(0, 1, 0)
(0, 1, 1)
(0, 2, 0)
(0, 2, 1)
(1, 0, 0)
(1, 0, 1)
(1, 1, 0)
(1, 1, 1)
(1, 2, 0)
(1, 2, 1)

with single loop
x = 0 (0, 0, 0)
x = 1 (0, 0, 1)
x = 2 (0, 1, 0)
x = 3 (0, 1, 1)
x = 4 (0, 2, 0)
x = 5 (0, 2, 1)
x = 6 (1, 0, 0)
x = 7 (1, 0, 1)
x = 8 (1, 1, 0)
x = 9 (1, 1, 1)
x = 10 (1, 2, 0)
x = 11 (1, 2, 1)

with thrust
x = 0 (0, 0, 0)
x = 1 (0, 0, 1)
x = 2 (0, 1, 0)
x = 3 (0, 1, 1)
x = 4 (0, 2, 0)
x = 5 (0, 2, 1)
x = 6 (1, 0, 0)
x = 7 (1, 0, 1)
x = 8 (1, 1, 0)
x = 9 (1, 1, 1)
x = 10 (1, 2, 0)
x = 11 (1, 2, 1)

On a side note: CUDA has a function sincosf which may be more efficient than calling both sin and cos with the same argument. https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1g9456ff9df91a3874180d89a94b36fd46