Can a CUDA 'dot product' kernel accelerate bulk RMS computation?

Time:01-13

New to CUDA here, but hoping it can reduce the time that single-threaded CPU code takes to compute root-mean-squares. Specifically, I want the RMS of each of the u contiguous, disjoint sub-arrays of length v contained in an array A of length u*v that holds floats between -1 and 1.

I had hoped to use this sample to do the vast majority of the work*, but I'm finding that even the first step of separating A's sub-arrays with 0s for memory-coalescing purposes (which I'm doing in single-threaded CPU code) takes longer than the entire CPU-based RMS calculation at 'ballpark' values u=200, v=5000!

I'm vaguely aware that there might be a way to apply padding at the same time A is copied to device memory, and that a second kernel could perhaps be used to perform the padding, but I'm not sure whether exploring these approaches is worthwhile. I'm also aware of Thrust and cuBLAS, but the linked sample seems to my simple mind more likely to provide the desired acceleration (given that some care must be taken to prepare kernel input).
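(By "apply padding at the same time A is copied", I mean something like the untested sketch below: cudaMemcpy2D treats each length-v sub-array as a "row" and can give the destination a wider pitch, leaving a zeroed gap after each segment. 'pad' and 'd_padded' are placeholder names, not part of my real code.)

// untested sketch; 'pad' is the number of extra zero floats per segment
const size_t srcPitch = num * sizeof(float);          // one sub-array per "row"
const size_t dstPitch = (num + pad) * sizeof(float);  // sub-array plus zero padding

float* d_padded = nullptr;
cudaMalloc(&d_padded, dstPitch * numSegs);
cudaMemset(d_padded, 0, dstPitch * numSegs);          // zero the gaps once

cudaMemcpy2D(d_padded, dstPitch,   // destination and its pitch (bytes)
             array,    srcPitch,   // source and its pitch (bytes)
             srcPitch, numSegs,    // row width in bytes, number of rows
             cudaMemcpyHostToDevice);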

Are the 'ballpark' values above (which are similar to those in the sample) simply too small to let the GPU come into its own? Neither number is likely to be raisable to the next power of 10, alas. I'd be very grateful for any input from those a little more familiar with GPU computing than I am. FWIW, here's the single-threaded CPU code I'm trying to improve upon (the rms function), with a little context:

const size_t num     = 5000; // might be able to increase by a factor of <2
const size_t numSegs =  200; // might be able to increase by a factor of <5

float rms(const float a[], const size_t origin, const size_t length)
{
    float sumOfSquares = 0.0f;

    for (size_t i = 0; i < length; ++i)
        sumOfSquares += a[origin + i] * a[origin + i];

    return std::sqrt(sumOfSquares / float(length));
}

int main()
{
    ...

    float* array = (float*)malloc(num * numSegs * sizeof(float));
    float* RMSes = (float*)malloc(numSegs * sizeof(float));

    // array fill omitted; A[i] lies between -1 and 1 inclusive

    for (size_t segNum = 0; segNum < numSegs; ++segNum)
    {
        RMSes[segNum] = rms(array, segNum * num, num);
    }
    ...
}

*given that RMS(A)=sqrt(B/C), where B is 'A dot A' and C is A's length

Edit: the CUDA-based approach is certainly working, but it is currently a lot slower than the host code.

CodePudding user response:

If you are targeting x86 CPUs, you should use SSE/AVX acceleration.

Here are some SIMD-accelerated examples you can use for testing.
Both MSVC and GCC do a very good job of optimizing the code below. These could be further optimized for your single-input case (a dot product of a vector with itself).

I won't go into the details of how to detect the capabilities of the target CPU, so the code below is not complete. But it does show what you'll need to test in order to select the right function at run-time.

My cpuinfo is a singleton based on this library https://github.com/steinwurf/cpuid

#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <immintrin.h> // MSVC; emmintrin.h only covers SSE2, but AVX intrinsics are needed too
#endif

#include <algorithm> // std::min
#include <cstddef>   // size_t

float dot_product_plain(const float* a, const float* b, size_t count) noexcept
{
    float result = 0.f;
    while (count--)
        result += *a++ * *b++;
    return result;
}

float dot_product_avx2(const float* a, const float* b, size_t count) noexcept
{
    auto acc = _mm256_set1_ps(0.f);

    const auto do_8 = [&acc](const float* a, const float* b) {
        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b)));
    };

    while (count >= 32)
    {
        do_8(a, b);
        do_8(a + 8, b + 8);
        do_8(a + 16, b + 16);
        do_8(a + 24, b + 24);

        a += 32;
        b += 32;
        count -= 32;
    }
    while (count >= 8)
    {
        do_8(a, b);

        a += 8;
        b += 8;
        count -= 8;
    }
    if (count)
    {
        alignas(32) float buf_a[8];
        alignas(32) float buf_b[8];

        const auto N = std::min(count, size_t{8});

        for (size_t i = 0; i < N; ++i)
        {
            buf_a[i] = a[i];
            buf_b[i] = b[i];
        }
        for (size_t i = N; i < 8; ++i)
        {
            buf_a[i] = 0;
            buf_b[i] = 0;
        }
        do_8(buf_a, buf_b);
    }

    // horizontal sum: fold the upper 128-bit half onto the lower, then reduce 4 lanes to 1
    auto acc2   = _mm256_permutevar8x32_ps(acc, _mm256_set_epi32(0, 0, 0, 0, 7, 6, 5, 4));
    auto acc128 = _mm256_castps256_ps128(_mm256_add_ps(acc, acc2));
    acc128      = _mm_add_ps(acc128, _mm_shuffle_ps(acc128, acc128, _MM_SHUFFLE(0, 3, 0, 1)));
    acc128      = _mm_add_ps(acc128, _mm_shuffle_ps(acc128, acc128, _MM_SHUFFLE(0, 0, 0, 2)));

    float result = 0;
    _mm_store_ss(&result, acc128);
    return result;
}
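If the target CPU also supports FMA (commonly, though not necessarily, available alongside AVX2), the multiply and add inside do_8 can be fused into one instruction. A sketch of the variant, assuming you gate it behind a separate FMA capability check:

// drop-in replacement for do_8; requires FMA support in addition to AVX
const auto do_8_fma = [&acc](const float* a, const float* b) {
    acc = _mm256_fmadd_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b), acc);
};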

You should check which of these to execute at run-time, using a cpuid library such as the one linked above.

Here's what the selection code may look like.

using dot_product_proc_t = float (*)(const float*, const float*, size_t) noexcept;

dot_product_proc_t get_dot_product_proc() noexcept
{
    if (can_dot_product_avx2())
        return dot_product_avx2;
    if (can_dot_product_sse2())
        return dot_product_sse2;
    return dot_product_plain;
}

bool can_dot_product_sse2() noexcept
{
    return cpu_info()->SSE2();
}

bool can_dot_product_avx2() noexcept
{
    // dot_product_avx2 uses _mm256_permutevar8x32_ps, which needs AVX2, not just AVX
    return cpu_info()->SSE() && cpu_info()->AVX() && cpu_info()->AVX2();
}
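For completeness, cpu_info() is just a thin singleton wrapper over the linked library. A sketch of what it might look like (the has_* method names follow the steinwurf/cpuid README and may differ between versions):

#include <cpuid/cpuinfo.hpp>

struct CpuInfo
{
    cpuid::cpuinfo info;

    bool SSE()  const noexcept { return info.has_sse();  }
    bool SSE2() const noexcept { return info.has_sse2(); }
    bool AVX()  const noexcept { return info.has_avx();  }
    bool AVX2() const noexcept { return info.has_avx2(); }
};

const CpuInfo* cpu_info() noexcept
{
    static const CpuInfo instance; // constructed (and CPUID queried) once
    return &instance;
}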

Here is the same using SSE2, which targets older processors:

float dot_product_sse2(const float* a, const float* b, size_t count) noexcept
{
    auto acc           = _mm_setzero_ps();
    const auto fmacc_4 = [&acc](const float* a, const float* b) {
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));
    };

    while (count >= 16)
    {
        fmacc_4(a, b);
        fmacc_4(a   4, b   4);
        fmacc_4(a   8, b   8);
        fmacc_4(a   12, b   12);

        a += 16;
        b += 16;
        count -= 16;
    }
    while (count >= 4)
    {
        fmacc_4(a, b);

        a += 4;
        b += 4;
        count -= 4;
    }

    acc = _mm_add_ps(acc, _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(0, 3, 0, 1)));
    acc = _mm_add_ps(acc, _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(0, 0, 0, 2)));

    float result{};
    _mm_store_ss(&result, acc); // low lane only; _mm_store_ps1 would write four floats past 'result'

    while (count--)
        result += *(a++) * *(b++);

    return result;
}
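To apply any of these to your RMS problem, dot each segment with itself, since RMS(A) = sqrt((A dot A) / length). A minimal sketch (rms_simd is an illustrative name; needs <cmath>):

float rms_simd(const float* a, size_t origin, size_t length) noexcept
{
    static const auto dot = get_dot_product_proc(); // resolve the best implementation once
    return std::sqrt(dot(a + origin, a + origin, length) / float(length));
}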

CodePudding user response:

Here is an implementation using Thrust/CUB. It should serve as a reference/sanity check in terms of performance for any CUDA solution you come up with. Since the CUB algorithm does not know about the regularity of your problem, it should certainly be possible in theory to write a faster CUDA implementation than this one; in practice, getting a significant speedup over this reference might be highly non-trivial.

This problem is well suited to GPU computing, but your problem size might still be too small for the GPU to shine.

I decided to use cub::DeviceSegmentedReduce instead of thrust::reduce_by_key (which uses CUB in the backend) for better performance, and because it makes it easier to exclude the overhead of allocating temporary storage from the measurement.

#include <cmath>

#include <iostream>

#include <cub/cub.cuh>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/random.h>
#include <thrust/random/uniform_real_distribution.h>

constexpr size_t num     = 5000; // might be able to increase by a factor of <2
constexpr size_t numSegs =  200; // might be able to increase by a factor of <5

template <typename T>
class ScaleBy
{
    T factor_;

    public:
    ScaleBy(T factor) noexcept : factor_{factor} {}

    __host__ __device__
    T operator()(T val) const noexcept { return factor_ * val; }
};

template <typename T>
struct Square
{
    __host__ __device__
    T operator()(T val) const noexcept { return val * val; }
};

template <typename T>
class RootMean
{
    T norm_;

    public:
    RootMean(T norm) noexcept : norm_{norm} {}

    __host__ __device__
    T operator()(T sum) const noexcept { return sqrt(sum / norm_); }
};

float rms(thrust::host_vector<float> const &a, const size_t origin, const size_t length)
{
    float sumOfSquares = 0.0f;

    for (size_t i = 0; i < length; ++i)
        sumOfSquares += a[origin + i] * a[origin + i];

    return std::sqrt(sumOfSquares / float(length));
}

void segmented_rms_host(thrust::host_vector<float> const &array,
                        thrust::host_vector<float> &RMSes)
{
    for (size_t segNum = 0; segNum < numSegs; ++segNum)
    {
        RMSes[segNum] = rms(array, segNum * num, num);
    }
}

void segmented_rms_device(thrust::device_vector<float> const &d_array,
                          thrust::device_vector<float> &d_RMSes,
                          uint8_t *d_temp_storage,
                          size_t &temp_storage_bytes)
{
    auto seg_size = d_array.size() / d_RMSes.size();
    auto origin_iter = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0ull),
        ScaleBy<size_t>{seg_size});
    auto input_iter = thrust::make_transform_iterator(
        d_array.cbegin(),
        Square<float>{});
    auto output_iter = thrust::make_transform_output_iterator(
        d_RMSes.begin(),
        RootMean<float>{static_cast<float>(seg_size)});

    cub::DeviceSegmentedReduce::Sum(d_temp_storage,
                                    temp_storage_bytes,
                                    input_iter,
                                    output_iter, numSegs,
                                    origin_iter, origin_iter   1);
}

int main()
{
    thrust::default_random_engine rng(123456789);
    thrust::uniform_real_distribution<float> dist(-1.0f, 1.0f); // excludes 1.0f, but ok for testing

    thrust::host_vector<float> array(num * numSegs);
    thrust::host_vector<float> RMSes_ref(numSegs);

    for (size_t i = 0ull; i < array.size(); ++i)
    {
        array[i] = dist(rng);
    }

    segmented_rms_host(array, RMSes_ref);

    thrust::device_vector<float> d_array(array);
    thrust::device_vector<float> d_RMSes(numSegs);

    // Determine temporary device storage requirements
    size_t temp_storage_bytes = 0;
    segmented_rms_device(d_array, d_RMSes, nullptr, temp_storage_bytes);
    // Allocate temporary storage
    thrust::device_vector<uint8_t> d_temp_storage(temp_storage_bytes);

    segmented_rms_device(d_array, d_RMSes,
                         thrust::raw_pointer_cast(d_temp_storage.data()), temp_storage_bytes);

    thrust::host_vector<float> RMSes(d_RMSes);
    for (size_t i = 0ull; i < numSegs; ++i)
    {
        if (std::abs(RMSes_ref[i] - RMSes[i]) / RMSes_ref[i] > 1.0e-4f)
        {
            std::cout << "Big deviation detected at i = " << i
                      << ": RMS_ref = " << RMSes_ref[i]
                      << " while RMS = " << RMSes[i] << '\n';
        }
    }

    return 0;
}
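To compare this against the host loop, time just the device call with CUDA events, e.g. (a sketch; warm-up runs and averaging over many repetitions are omitted):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
segmented_rms_device(d_array, d_RMSes,
                     thrust::raw_pointer_cast(d_temp_storage.data()), temp_storage_bytes);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
std::cout << "segmented RMS on device: " << ms << " ms\n";

cudaEventDestroy(start);
cudaEventDestroy(stop);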