I have found some solutions where each AVX2 register holds both the real and the imaginary parts of the complex numbers. I am interested in a solution where each AVX2 register holds either the real or the imaginary parts.
Assume we have 4 AVX2 registers: R1, I1, R2, I2. The registers R1, I1 hold 4 complex numbers, and the same applies to the remaining two registers. Now I want to multiply the 4 complex numbers of R1, I1 with the 4 complex numbers of R2, I2. What would be the most efficient way to do this?
CodePudding user response:
You wrote that you have AVX2; all Intel and AMD AVX2 processors also support FMA3. For this reason, I would do it like this.
#include <immintrin.h>

// 4 FP64 complex numbers stored in 2 AVX vectors,
// de-interleaved into real and imaginary vectors
struct Complex4
{
    __m256d r, i;
};

// Multiply 4 complex numbers by another 4 complex numbers
Complex4 mul4( Complex4 a, Complex4 b )
{
    Complex4 prod;
    prod.r = _mm256_mul_pd( a.r, b.r );
    prod.i = _mm256_mul_pd( a.r, b.i );
    prod.r = _mm256_fnmadd_pd( a.i, b.i, prod.r );
    prod.i = _mm256_fmadd_pd( a.i, b.r, prod.i );
    return prod;
}
Or, if you're targeting that one VIA processor which doesn't have FMA, replace the two FMA intrinsics with the following lines:
prod.r = _mm256_sub_pd( prod.r, _mm256_mul_pd( a.i, b.i ) );
prod.i = _mm256_add_pd( prod.i, _mm256_mul_pd( a.i, b.r ) );
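For completeness, a minimal usage sketch, assuming the 4 real and 4 imaginary parts live in plain double arrays (the array names, the unaligned loads/stores, and the wrapper function are my own, not part of the question):
// Hypothetical wrapper: multiply two de-interleaved groups of 4 complex numbers
// using the Complex4 / mul4 definitions above.
void mul4_arrays( const double* aRe, const double* aIm,
                  const double* bRe, const double* bIm,
                  double* outRe, double* outIm )
{
    const Complex4 a { _mm256_loadu_pd( aRe ), _mm256_loadu_pd( aIm ) };
    const Complex4 b { _mm256_loadu_pd( bRe ), _mm256_loadu_pd( bIm ) };
    const Complex4 p = mul4( a, b );
    _mm256_storeu_pd( outRe, p.r );
    _mm256_storeu_pd( outIm, p.i );
}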
CodePudding user response:
This does the 4x4 multiplications in simple loops, with the 16 results accumulated into 4 accumulator cells:
template<int simd>
__attribute__((always_inline))
inline
void multiply( double * const __restrict__ real1,
               double * const __restrict__ imag1,
               double * const __restrict__ real2,
               double * const __restrict__ imag2,
               double * const __restrict__ outReal,
               double * const __restrict__ outImag)
{
    alignas(64) double tmp1[simd];
    alignas(64) double tmp2[simd];
    alignas(64) double tmp3[simd];
    #pragma GCC ivdep
    for(int j=0;j<simd;j++)
    {
        alignas(32) const double r2 = real2[j];
        alignas(32) const double i2 = imag2[j];
        // real part: real1*r2 - imag1*i2
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            tmp1[i] = real1[i]*r2;
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            tmp2[i] = imag1[i]*i2;
        // imaginary part: real1*i2 + imag1*r2
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            tmp3[i] = real1[i] * i2;
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            tmp3[i] += r2 * imag1[i];
        // accumulate into the 4 output cells
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            outReal[i] += tmp1[i]-tmp2[i];
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            outImag[i] += tmp3[i];
    }
}
#include<iostream>
#include<ctime>
#include<chrono>

// RAII timer: measures nanoseconds between construction and destruction
class Bench
{
public:
    Bench(size_t * targetPtr)
    {
        target=targetPtr;
        t1 = std::chrono::duration_cast< std::chrono::nanoseconds >(std::chrono::high_resolution_clock::now().time_since_epoch());
    }
    ~Bench()
    {
        t2 = std::chrono::duration_cast< std::chrono::nanoseconds >(std::chrono::high_resolution_clock::now().time_since_epoch());
        if(target)
        {
            *target = t2.count() - t1.count();
        }
        else
        {
            std::cout << (t2.count() - t1.count())/1000000000.0 << " seconds" << std::endl;
        }
    }
private:
    size_t * target;
    std::chrono::nanoseconds t1,t2;
};
int main()
{
    constexpr int simd=4;
    constexpr int n=512;
    alignas(64) double real[n];
    alignas(64) double imag[n];
    // 4 values to multiply with the input
    alignas(64) double real2[simd];
    alignas(64) double imag2[simd];
    // 4x4 accumulators
    alignas(64) double realOut[simd*simd] = {};
    alignas(64) double imagOut[simd*simd] = {};
    for(int i=0;i<n;i++)
    {
        std::cin>>real[i];
        std::cin>>imag[i];
    }
    size_t t;
    for(int r=0;r<100;r++)
    {
        {
            Bench b(&t);
            for(int i=0;i<n;i+=simd)
            {
                multiply<simd>(real+i,imag+i,real2,imag2,realOut,imagOut);
            }
        }
        std::cout<<t<<" ns "<<((n/simd)*(simd*simd*8)/(double)t)<<" gflops"<<std::endl;
        // checksum so the compiler cannot drop the computation
        size_t xorv = 0;
        for(int i=0;i<simd*simd;i++)
        {
            xorv ^= (size_t)( realOut[i] * imagOut[i]);
        }
        std::cout<<xorv<<std::endl;
    }
    return 0;
}
With the Skylake AVX2 compiler flags -O3 -march=skylake -ftree-vectorize -ffast-math -mavx2 -mprefer-vector-width=256 -fno-math-errno -std=c++20, the GCC 12.1 compiler produces this:
// preparation of R2,I2
.L17:
call std::chrono::_V2::system_clock::now()
vmovq xmm3, QWORD PTR [rbp-8688]
vmovapd YMMWORD PTR [rbp-8688], ymm3
vmovq xmm3, QWORD PTR [rbp-8656]
vbroadcastsd ymm2, QWORD PTR [rbp-8616]
vmovapd YMMWORD PTR [rbp-8656], ymm3
vbroadcastsd ymm3, QWORD PTR [rbp-8624]
vbroadcastsd ymm5, QWORD PTR [rbp-8600]
vaddpd ymm3, ymm3, ymm2
vbroadcastsd ymm2, QWORD PTR [rbp-8608]
vbroadcastsd ymm4, QWORD PTR [rbp-8560]
vaddpd ymm2, ymm2, ymm5
vbroadcastsd ymm0, QWORD PTR [rbp-8552]
vbroadcastsd ymm1, QWORD PTR [rbp-8544]
vaddpd ymm12, ymm3, ymm2
vbroadcastsd ymm2, QWORD PTR [rbp-8536]
vaddpd ymm0, ymm0, ymm4
vaddpd ymm1, ymm1, ymm2
vmovq xmm6, QWORD PTR [rbp-8752]
vmovq xmm14, QWORD PTR [rbp-8784]
vaddpd ymm11, ymm1, ymm0
vmovq xmm10, QWORD PTR [rbp-8832]
vmovq xmm13, QWORD PTR [rbp-8816]
mov r12, rax
vmovq xmm8, QWORD PTR [rbp-8720]
vmovapd ymm4, ymm6
vmovapd YMMWORD PTR [rbp-8720], ymm14
vmovq xmm9, QWORD PTR [rbp-8824]
lea rdx, [rbp-8240]
mov rax, r13
vmovapd ymm6, ymm10
vmovapd ymm14, ymm13
// actual computation
.L5:
vmovapd ymm7, YMMWORD PTR [rdx]
vmovapd ymm15, YMMWORD PTR [rax+64]
vunpcklpd ymm2, ymm7, YMMWORD PTR [rdx+32]
vunpckhpd ymm0, ymm7, YMMWORD PTR [rdx+32]
vmovapd ymm7, YMMWORD PTR [rdx+64]
vpermpd ymm0, ymm0, 216
vunpckhpd ymm1, ymm7, YMMWORD PTR [rdx+96]
vunpcklpd ymm3, ymm7, YMMWORD PTR [rdx+96]
vpermpd ymm1, ymm1, 216
vunpcklpd ymm5, ymm0, ymm1
vpermpd ymm5, ymm5, 216
vfmadd231pd ymm4, ymm12, ymm5
vpermpd ymm3, ymm3, 216
vpermpd ymm2, ymm2, 216
vunpcklpd ymm7, ymm2, ymm3
vpermpd ymm7, ymm7, 216
vfmadd231pd ymm6, ymm12, ymm7
vunpckhpd ymm0, ymm0, ymm1
vmovapd ymm1, YMMWORD PTR [rax]
vunpckhpd ymm2, ymm2, ymm3
vpermpd ymm3, ymm2, 216
vmovapd YMMWORD PTR [rbp-8752], ymm4
vunpcklpd ymm2, ymm1, YMMWORD PTR [rax+32]
vunpcklpd ymm4, ymm15, YMMWORD PTR [rax+96]
vpermpd ymm2, ymm2, 216
vpermpd ymm4, ymm4, 216
vmovapd YMMWORD PTR [rbp-8784], ymm6
vunpcklpd ymm6, ymm2, ymm4
vunpckhpd ymm2, ymm2, ymm4
vpermpd ymm2, ymm2, 216
vmulpd ymm4, ymm12, ymm2
vunpckhpd ymm10, ymm15, YMMWORD PTR [rax+96]
vunpckhpd ymm1, ymm1, YMMWORD PTR [rax+32]
vpermpd ymm10, ymm10, 216
vpermpd ymm1, ymm1, 216
vmovapd YMMWORD PTR [rbp-8816], ymm4
vunpcklpd ymm4, ymm1, ymm10
vunpckhpd ymm1, ymm1, ymm10
vpermpd ymm6, ymm6, 216
vpermpd ymm1, ymm1, 216
vmulpd ymm15, ymm12, ymm6
vmulpd ymm10, ymm12, ymm1
vpermpd ymm4, ymm4, 216
vmulpd ymm13, ymm12, ymm4
vpermpd ymm0, ymm0, 216
vfmadd231pd ymm9, ymm12, ymm3
vfmadd231pd ymm8, ymm12, ymm0
vfmadd132pd ymm7, ymm15, ymm11
vfmadd213pd ymm3, ymm11, YMMWORD PTR [rbp-8816]
vfmadd132pd ymm0, ymm10, ymm11
vfmadd132pd ymm5, ymm13, ymm11
vfnmadd231pd ymm9, ymm11, ymm2
vaddpd ymm2, ymm7, YMMWORD PTR [rbp-8688]
vaddpd ymm3, ymm3, YMMWORD PTR [rbp-8656]
vaddpd ymm7, ymm0, YMMWORD PTR [rbp-8720]
sub rax, -128
vfnmadd213pd ymm6, ymm11, YMMWORD PTR [rbp-8784]
vfnmadd213pd ymm4, ymm11, YMMWORD PTR [rbp-8752]
vfnmadd231pd ymm8, ymm11, ymm1
vaddpd ymm14, ymm14, ymm5
vmovapd YMMWORD PTR [rbp-8688], ymm2
vmovapd YMMWORD PTR [rbp-8656], ymm3
vmovapd YMMWORD PTR [rbp-8720], ymm7
sub rdx, -128
cmp rax, rbx
jne .L5
So it does the following:
- broadcasts a value to be re-used as a whole register,
- multiplies it with all the other 4 values, using permute instructions,
- repeats until the 16 results are ready (accumulated onto 4 outputs).
It reaches only 45-53 GFLOPS (depending on the godbolt.org server instance) because the input side is memory-bandwidth bound (groups of 4 values are read linearly), and there are not enough registers available to hold both the output accumulators (4 of them) and the temporary computations, so intermediate values go to memory, although at least they come from cache.
When a 256-bit vector length is forced on an AVX512-capable CPU (same godbolt.org server), the extra register capacity reaches 92 GFLOPS with the following assembly:
.L8:
vmovapd ymm1, YMMWORD PTR [rdx]
vmovapd ymm12, YMMWORD PTR [rdx+32]
vmovapd ymm3, YMMWORD PTR [rdx+64]
vmovapd ymm9, YMMWORD PTR [rdx+96]
vmovapd ymm8, ymm1
vpermt2pd ymm8, ymm7, ymm12
vpermt2pd ymm1, ymm6, ymm12
vmovapd ymm12, ymm3
vpermt2pd ymm12, ymm7, ymm9
vpermt2pd ymm3, ymm6, ymm9
vmovapd ymm9, ymm8
vpermt2pd ymm9, ymm7, ymm12
vpermt2pd ymm8, ymm6, ymm12
vmovapd ymm12, ymm1
vpermt2pd ymm1, ymm6, ymm3
vfmadd231pd ymm0, ymm1, ymm5
vfmadd231pd ymm11, ymm9, ymm5
vfmadd231pd ymm2, ymm8, ymm5
vpermt2pd ymm12, ymm7, ymm3
vfmadd231pd ymm10, ymm12, ymm5
vmovapd ymm21, ymm0
vmovapd ymm0, YMMWORD PTR [rax]
vmovapd ymm17, YMMWORD PTR [rax+32]
vmovapd ymm3, YMMWORD PTR [rax+64]
vmovapd ymm18, ymm11
vmovapd ymm19, ymm2
vmovapd ymm11, YMMWORD PTR [rax+96]
vmovapd ymm2, ymm0
vpermt2pd ymm2, ymm7, ymm17
vpermt2pd ymm0, ymm6, ymm17
vmovapd ymm17, ymm3
vpermt2pd ymm17, ymm7, ymm11
vpermt2pd ymm3, ymm6, ymm11
vmovapd ymm20, ymm10
vmovapd ymm11, ymm2
vmovapd ymm10, ymm0
vpermt2pd ymm11, ymm7, ymm17
vpermt2pd ymm2, ymm6, ymm17
vpermt2pd ymm10, ymm7, ymm3
vpermt2pd ymm0, ymm6, ymm3
vmulpd ymm24, ymm11, ymm5
vmulpd ymm22, ymm2, ymm5
vmulpd ymm23, ymm10, ymm5
vmulpd ymm17, ymm0, ymm5
vmovapd ymm3, ymm12
vfmadd132pd ymm9, ymm24, ymm4
vfmadd132pd ymm8, ymm22, ymm4
vfmadd132pd ymm3, ymm23, ymm4
vfmadd132pd ymm1, ymm17, ymm4
sub rax, -128
vfnmadd132pd ymm11, ymm18, ymm4
vfnmadd132pd ymm10, ymm20, ymm4
vfnmadd132pd ymm2, ymm19, ymm4
vfnmadd132pd ymm0, ymm21, ymm4
vaddpd ymm14, ymm14, ymm9
vaddpd ymm15, ymm15, ymm3
vaddpd ymm13, ymm13, ymm8
vaddpd ymm16, ymm16, ymm1
sub rdx, -128
cmp rax, rbx
jne .L8
It accesses memory less because more registers are available.
When AVX512 is enabled (on the same godbolt.org server), both for the vector length and the native architecture tuning, the performance reaches 135 GFLOPS and 25 ZMM registers are used. It looks like too many registers are required if an exact 4x4 tiling is to be made. Maybe you should try other options if the algorithm allows it. Maybe 3x4, 5x4, 2x8 or some other tile shape can give you more performance, if this is actually the register-tiling layer of a larger-scale multiplication (like a complex matrix-matrix multiplication with a cache-tiling layer, a register-tiling layer, etc.).
This may not be the fastest approach, as it uses no intrinsics, only simple loops and not-always-dependable compiler auto-vectorization. Using intrinsics should give better performance; a sketch follows below.
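For reference, a minimal intrinsics sketch of the same 4x4 broadcast-and-FMA accumulation, assuming FMA3 support and 32-byte-aligned, zero-initialized accumulators (the function name and the load/store choices are my own):
#include <immintrin.h>

// Accumulate the products of 4 complex numbers (real1/imag1) with 4 complex
// numbers (real2/imag2) into 4 accumulator cells, like multiply<4> above.
void multiply4x4_fma( const double* real1, const double* imag1,
                      const double* real2, const double* imag2,
                      double* outReal, double* outImag )
{
    const __m256d r1 = _mm256_load_pd( real1 );
    const __m256d i1 = _mm256_load_pd( imag1 );
    __m256d accRe = _mm256_load_pd( outReal );
    __m256d accIm = _mm256_load_pd( outImag );
    for( int j = 0; j < 4; j++ )
    {
        const __m256d r2 = _mm256_broadcast_sd( real2 + j );
        const __m256d i2 = _mm256_broadcast_sd( imag2 + j );
        // accRe += r1*r2 - i1*i2,  accIm += r1*i2 + i1*r2
        accRe = _mm256_fmadd_pd( r1, r2, accRe );
        accRe = _mm256_fnmadd_pd( i1, i2, accRe );
        accIm = _mm256_fmadd_pd( r1, i2, accIm );
        accIm = _mm256_fmadd_pd( i1, r2, accIm );
    }
    _mm256_store_pd( outReal, accRe );
    _mm256_store_pd( outImag, accIm );
}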