I'm trying to imitate CUDA/OpenCL workflow using vectorized functions like this:
#include <omp.h>
#include <cmath>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
// Fixed-size bundle of `Simd` values of `Type`.
// Every operation is a plain element-wise loop whose trip count is the
// compile-time constant `Simd`.
template<typename Type, int Simd>
struct KernelData
{
    // Storage for the bundle; the array is aligned to 32 bytes.
    alignas(32)
    Type data[Simd];

    // Copies `Simd` consecutive elements starting at `ptr` into this bundle.
    // `ptr` must point to at least `Simd` readable elements; `__restrict__`
    // (a compiler extension) promises that it does not alias `data`.
    inline void readFrom(const Type * const __restrict__ ptr) noexcept
    {
        for(int i=0;i<Simd;++i)
        {
            data[i] = ptr[i];
        }
    }

    // Copies the `Simd` elements of this bundle to consecutive positions
    // starting at `ptr`, which must have room for at least `Simd` elements.
    inline void writeTo(Type * const __restrict__ ptr) const noexcept
    {
        for(int i=0;i<Simd;++i)
        {
            ptr[i] = data[i];
        }
    }

    // Returns a new bundle holding std::sqrt of each element; `*this` is not
    // modified, so calls can be chained.
    inline const KernelData<Type,Simd> sqrt() const noexcept
    {
        KernelData<Type,Simd> result;
        for(int i=0;i<Simd;++i)
        {
            result.data[i] = std::sqrt(data[i]);
        }
        return result;
    }
};
template<int mask>
struct KernelDataFactory
{
KernelDataFactory()
{
}
template<typename Type>
inline
KernelData<Type,mask> generate() const
{
return KernelData<Type,mask>();
}
};
// Runs a user-supplied callable over `n` work items in groups of `SimdWidth`.
//
// The callable receives (first item index, group width, args...). Full groups
// are invoked with width == SimdWidth; the leftover n % SimdWidth items are
// invoked one at a time with width == 1.
template<int SimdWidth, typename... Args>
class Kernel
{
public:
    // Stores the callable that `run` will invoke.
    Kernel(std::function<void(int,int, Args...)> kernelPrm)
        : kernel(std::move(kernelPrm))
    {
    }

    // Invokes the stored callable for the items [0, n).
    // `args` are passed unchanged to every invocation.
    void run(int n, Args... args)
    {
        const int nLoop = (n/SimdWidth);

        // Full-width groups: start indices 0, SimdWidth, 2*SimdWidth, ...
        for(int i=0;i<nLoop;++i)
        {
            kernel(i*SimdWidth,SimdWidth, args...);
        }

        // Tail: the items that do not fill a whole group, one per call.
        const int m = n%SimdWidth;
        for(int i=0;i<m;++i)
        {
            kernel(nLoop*SimdWidth + i,1, args...);
        }
    }
private:
    std::function<void(int,int, Args...)> kernel;
};
// cpu cycles from stackoverflow
#include <stdint.h> // <cstdint> is preferred in C++, but stdint.h works.
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif
// Returns the current value of the time-stamp counter as reported by the
// __rdtsc() intrinsic.
inline
uint64_t readTSC() {
    // Optional: an _mm_lfence() before the read would wait for earlier
    // instructions to retire before the clock is sampled.
    // Optional: an _mm_lfence() after the read would block later instructions
    // until rdtsc retires.
    return __rdtsc();
}
// Benchmark driver: applies 15 chained square roots to each of n floats,
// repeats the whole pass 10000 times, prints the last ten results and the
// measured time-stamp-counter ticks per square root.
int main(int argC, char** argV)
{
    constexpr int simd = 16;
    constexpr int n = 1003; // 62 full groups of 16 plus an 11-element tail

    // The callable is invoked with simdWidth == simd for full groups and
    // simdWidth == 1 for tail elements. The two branches are identical except
    // for the compile-time width given to KernelDataFactory.
    Kernel<simd, float *, float *> kernel([](int simdGroupId, int simdWidth, float * input, float * output){
        const int id = simdGroupId;
        if(simdWidth == simd)
        {
            const KernelDataFactory<simd> factory;
            auto a = factory.generate<float>();
            a.readFrom(input + id);
            const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
                             sqrt().sqrt().sqrt().sqrt().sqrt().
                             sqrt().sqrt().sqrt().sqrt().sqrt();
            b.writeTo(output + id);
        }
        else
        {
            const KernelDataFactory<1> factory;
            auto a = factory.generate<float>();
            a.readFrom(input + id);
            const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
                             sqrt().sqrt().sqrt().sqrt().sqrt().
                             sqrt().sqrt().sqrt().sqrt().sqrt();
            b.writeTo(output + id);
        }
    });

    alignas(32)
    float i[n],o[n];
    for(int j=0;j<n;++j)
        i[j]=j;

    auto t1 = readTSC();
    for(int k=0;k<10000;++k)
        kernel.run(n,i,o);
    auto t2 = readTSC();

    // Show the last ten outputs (index renamed so it no longer shadows the
    // input array `i`).
    for(int idx=n-10;idx<n;++idx)
    {
        std::cout<<"i="<<idx<<" value="<<o[idx]<<std::endl;
    }

    // 0.0001f == 1/10000 (number of passes); 15 square roots per element.
    std::cout<<0.0001f*(t2-t1)/(float)(15*n)<<" cycles per sqrt"<<std::endl;
    return 0;
}
but the part that is given by user has to be duplicated like this:
// The callable is invoked with simdWidth == simd for full groups and with
// simdWidth == 1 for tail elements; both branches carry the same body and
// differ only in the KernelDataFactory width.
Kernel<simd, float *, float *> kernel([](int simdGroupId, int simdWidth, float * input, float * output){
    const int id = simdGroupId;
    if(simdWidth == simd)
    {
        const KernelDataFactory<simd> factory;
        auto a = factory.generate<float>();
        a.readFrom(input + id);
        const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
                         sqrt().sqrt().sqrt().sqrt().sqrt().
                         sqrt().sqrt().sqrt().sqrt().sqrt();
        b.writeTo(output + id);
    }
    else
    {
        const KernelDataFactory<1> factory;
        auto a = factory.generate<float>();
        a.readFrom(input + id);
        const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
                         sqrt().sqrt().sqrt().sqrt().sqrt().
                         sqrt().sqrt().sqrt().sqrt().sqrt();
        b.writeTo(output + id);
    }
});
the only difference is the compile-time known two templates to produce:
KernelDataFactory<1> and KernelDataFactory<simd>
With define macros, it is easy to duplicate just the function body of the lambda. I'm trying to do this without using any define macro. Is there a simple way to do it such that user only gives this:
// Load one bundle starting at element `id`, take 15 chained square roots,
// and store the result at the same offset in the output.
auto a = factory.generate<float>();
a.readFrom(input + id);
const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
                 sqrt().sqrt().sqrt().sqrt().sqrt().
                 sqrt().sqrt().sqrt().sqrt().sqrt();
b.writeTo(output + id);
and it is automatically duplicated by the implementation?
What the current implementation does is:
- takes n
- breaks it into two parts
- runs vectorized code until the `n - (n%simd)` point is reached
- runs scalar code from `n - (n%simd)` to `n`
KernelDataFactory template argument (has to be compile-time known) (1 and simd) is used to let compiler generate vectorized code. (On godbolt.org (avx512), it runs at "0.9 cycles per sqrt" speed and on my system (avx1) it is 3.8 cycles per sqrt.)
CodePudding user response:
You could use a generic lambda (C++14) to achieve something like this. Note that this requires you to change the type of Kernel::kernel and to change the creation of the kernel a bit to allow for automatic type deduction:
Kernel
// Runs a callable of type `F` over `n` work items in groups of `SimdWidth`.
//
// Unlike a std::function-based kernel, the callable is stored by its own
// type and is invoked as kernel(factory, firstIndex, args...), where
// `factory` is a KernelDataFactory<SimdWidth> for full groups and a
// KernelDataFactory<1> for tail elements. `F` must therefore be callable
// with both factory types (e.g. a generic lambda).
template<int SimdWidth, typename F, typename... Args>
class Kernel
{
public:
    // Stores the callable. std::forward keeps this well-formed when F is
    // deduced as an lvalue reference (F&& then collapses to F&), where
    // std::move could not bind to the reference member.
    Kernel(F&& kernelPrm)
        : kernel(std::forward<F>(kernelPrm))
    {
    }

    // Invokes the stored callable for the items [0, n); `args` are passed
    // unchanged to every invocation.
    void run(int n, Args... args)
    {
        const int nLoop = (n / SimdWidth);

        // Full-width groups.
        for (int i = 0; i < nLoop; ++i)
        {
            CallKernel(i * SimdWidth, SimdWidth, args...);
        }

        // Tail: items that do not fill a whole group, one per call.
        const int m = n % SimdWidth;
        for (int i = 0; i < m; ++i)
        {
            CallKernel(nLoop * SimdWidth + i, 1, args...);
        }
    }
private:
    // helper function creating the factory and passing it to kernel
    // The factory width must be a compile-time constant, so the run-time
    // `simdWidth` selects between the two possible instantiations.
    void CallKernel(int simdGroupId, int simdWidth, Args... args)
    {
        if (simdWidth == SimdWidth)
        {
            const KernelDataFactory<SimdWidth> factory;
            kernel(factory, simdGroupId, args...);
        }
        else
        {
            const KernelDataFactory<1> factory;
            kernel(factory, simdGroupId, args...);
        }
    }

    F kernel;
};
Helpers
These helpers are necessary to deduce the second template argument of Kernel.
// Empty tag type: carries a parameter pack in its template arguments so the
// pack can be deduced from a function argument.
template<typename... Args>
struct KernelArgs {};
template<int SimdWidth, typename F, class...Args>
auto CreateKernel(F&& kernelPrm, KernelArgs<Args...> const&)
{
return Kernel<SimdWidth, F, Args...>(std::forward<F>(kernelPrm));
}
main
...
// The body is written once; `auto& factory` makes the lambda generic, so it
// can be called with either KernelDataFactory width. `.template` is needed
// because `factory` has a deduced (dependent) type.
auto kernel = CreateKernel<simd>([](auto& factory, int const id, float* input, float* output)
{
    auto a = factory.template generate<float>();
    a.readFrom(input + id);
    const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
                     sqrt().sqrt().sqrt().sqrt().sqrt().
                     sqrt().sqrt().sqrt().sqrt().sqrt();
    b.writeTo(output + id);
}, KernelArgs<float*, float*>{});
...