I have found some solutions where each AVX2 register holds both the real and the imaginary parts of the complex numbers. I am interested in a solution where each AVX2 register holds either the real or the imaginary parts.
Assume we have 4 AVX2 registers: R1, I1, R2, I2. The registers R1, I1 hold 4 complex numbers, and the same applies to the remaining two registers. Now I want to multiply the 4 complex numbers of R1, I1 with the 4 complex numbers of R2, I2. What would be the most efficient way to do this?
CodePudding user response:
You wrote that you have AVX2; all Intel and AMD AVX2 processors also support FMA3. For this reason, I would do it like this.
#include <immintrin.h>

// 4 FP64 complex numbers stored in 2 AVX vectors,
// de-interleaved into real and imaginary vectors
struct Complex4
{
    __m256d r, i;
};

// Multiply 4 complex numbers by another 4 complex numbers
Complex4 mul4( Complex4 a, Complex4 b )
{
    Complex4 prod;
    prod.r = _mm256_mul_pd( a.r, b.r );
    prod.i = _mm256_mul_pd( a.r, b.i );
    prod.r = _mm256_fnmadd_pd( a.i, b.i, prod.r );
    prod.i = _mm256_fmadd_pd( a.i, b.r, prod.i );
    return prod;
}
Or, if you're targeting that one VIA processor which doesn't have FMA, replace the two FMA intrinsics with the following lines:
prod.r = _mm256_sub_pd( prod.r, _mm256_mul_pd( a.i, b.i ) );
prod.i = _mm256_add_pd( prod.i, _mm256_mul_pd( a.i, b.r ) );
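For completeness, a minimal usage sketch, assuming the 4 real and 4 imaginary parts live in plain double arrays (the array names, the unaligned loads/stores, and the wrapper function are my own, not part of the question):
// Hypothetical wrapper: multiply two de-interleaved groups of 4 complex numbers
// using the Complex4 / mul4 definitions above.
void mul4_arrays( const double* aRe, const double* aIm,
                  const double* bRe, const double* bIm,
                  double* outRe, double* outIm )
{
    const Complex4 a { _mm256_loadu_pd( aRe ), _mm256_loadu_pd( aIm ) };
    const Complex4 b { _mm256_loadu_pd( bRe ), _mm256_loadu_pd( bIm ) };
    const Complex4 p = mul4( a, b );
    _mm256_storeu_pd( outRe, p.r );
    _mm256_storeu_pd( outIm, p.i );
}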
CodePudding user response:
This does the 4x4 multiplications in simple loops, with the 16 results accumulated into 4 accumulator cells:
template<int simd>
__attribute__((always_inline))
inline
void multiply( double * const __restrict__ real1,
               double * const __restrict__ imag1,
               double * const __restrict__ real2,
               double * const __restrict__ imag2,
               double * const __restrict__ outReal,
               double * const __restrict__ outImag)
{
    alignas(64) double tmp1[simd];
    alignas(64) double tmp2[simd];
    alignas(64) double tmp3[simd];
    #pragma GCC ivdep
    for(int j=0;j<simd;j++)
    {
        alignas(32) const double r2 = real2[j];
        alignas(32) const double i2 = imag2[j];
        // real part: real1*r2 - imag1*i2
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            tmp1[i] = real1[i]*r2;
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            tmp2[i] = imag1[i]*i2;
        // imaginary part: real1*i2 + imag1*r2
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            tmp3[i] = real1[i] * i2;
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            tmp3[i] += r2 * imag1[i];
        // accumulate into the 4 output cells
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            outReal[i] += tmp1[i]-tmp2[i];
        #pragma GCC ivdep
        for(int i=0;i<simd;i++)
            outImag[i] += tmp3[i];
    }
}
#include<iostream>
#include<ctime>
#include<chrono>

// RAII timer: measures nanoseconds between construction and destruction
class Bench
{
public:
    Bench(size_t * targetPtr)
    {
        target=targetPtr;
        t1 = std::chrono::duration_cast< std::chrono::nanoseconds >(std::chrono::high_resolution_clock::now().time_since_epoch());
    }
    ~Bench()
    {
        t2 = std::chrono::duration_cast< std::chrono::nanoseconds >(std::chrono::high_resolution_clock::now().time_since_epoch());
        if(target)
        {
            *target = t2.count() - t1.count();
        }
        else
        {
            std::cout << (t2.count() - t1.count())/1000000000.0 << " seconds" << std::endl;
        }
    }
private:
    size_t * target;
    std::chrono::nanoseconds t1,t2;
};
int main()
{
    constexpr int simd=4;
    constexpr int n=512;
    alignas(64) double real[n];
    alignas(64) double imag[n];
    // 4 values to multiply with the input
    alignas(64) double real2[simd];
    alignas(64) double imag2[simd];
    // 4x4 accumulators
    alignas(64) double realOut[simd*simd] = {};
    alignas(64) double imagOut[simd*simd] = {};
    for(int i=0;i<n;i++)
    {
        std::cin>>real[i];
        std::cin>>imag[i];
    }
    size_t t;
    for(int r=0;r<100;r++)
    {
        {
            Bench b(&t);
            for(int i=0;i<n;i+=simd)
            {
                multiply<simd>(real+i,imag+i,real2,imag2,realOut,imagOut);
            }
        }
        std::cout<<t<<" ns "<<((n/simd)*(simd*simd*8)/(double)t)<<" gflops"<<std::endl;
        // checksum so the compiler cannot drop the computation
        size_t xorv = 0;
        for(int i=0;i<simd*simd;i++)
        {
            xorv ^= (size_t)( realOut[i] * imagOut[i]);
        }
        std::cout<<xorv<<std::endl;
    }
    return 0;
}
With the Skylake AVX2 compiler flags -O3 -march=skylake -ftree-vectorize -ffast-math -mavx2 -mprefer-vector-width=256 -fno-math-errno -std=c++20, the GCC 12.1 compiler produces this:
// preparation of R2,I2
.L17:
call std::chrono::_V2::system_clock::now()
vmovq xmm3, QWORD PTR [rbp-8688]
vmovapd YMMWORD PTR [rbp-8688], ymm3
vmovq xmm3, QWORD PTR [rbp-8656]
vbroadcastsd ymm2, QWORD PTR [rbp-8616]
vmovapd YMMWORD PTR [rbp-8656], ymm3
vbroadcastsd ymm3, QWORD PTR [rbp-8624]
vbroadcastsd ymm5, QWORD PTR [rbp-8600]
vaddpd ymm3, ymm3, ymm2
vbroadcastsd ymm2, QWORD PTR [rbp-8608]
vbroadcastsd ymm4, QWORD PTR [rbp-8560]
vaddpd ymm2, ymm2, ymm5
vbroadcastsd ymm0, QWORD PTR [rbp-8552]
vbroadcastsd ymm1, QWORD PTR [rbp-8544]
vaddpd ymm12, ymm3, ymm2
vbroadcastsd ymm2, QWORD PTR [rbp-8536]
vaddpd ymm0, ymm0, ymm4
vaddpd ymm1, ymm1, ymm2
vmovq xmm6, QWORD PTR [rbp-8752]
vmovq xmm14, QWORD PTR [rbp-8784]
vaddpd ymm11, ymm1, ymm0
vmovq xmm10, QWORD PTR [rbp-8832]
vmovq xmm13, QWORD PTR [rbp-8816]
mov r12, rax
vmovq xmm8, QWORD PTR [rbp-8720]
vmovapd ymm4, ymm6
vmovapd YMMWORD PTR [rbp-8720], ymm14
vmovq xmm9, QWORD PTR [rbp-8824]
lea rdx, [rbp-8240]
mov rax, r13
vmovapd ymm6, ymm10
vmovapd ymm14, ymm13
// actual computation
.L5:
vmovapd ymm7, YMMWORD PTR [rdx]
vmovapd ymm15, YMMWORD PTR [rax+64]
vunpcklpd ymm2, ymm7, YMMWORD PTR [rdx+32]
vunpckhpd ymm0, ymm7, YMMWORD PTR [rdx+32]
vmovapd ymm7, YMMWORD PTR [rdx+64]
vpermpd ymm0, ymm0, 216
vunpckhpd ymm1, ymm7, YMMWORD PTR [rdx+96]
vunpcklpd ymm3, ymm7, YMMWORD PTR [rdx+96]
vpermpd ymm1, ymm1, 216
vunpcklpd ymm5, ymm0, ymm1
vpermpd ymm5, ymm5, 216
vfmadd231pd ymm4, ymm12, ymm5
vpermpd ymm3, ymm3, 216
vpermpd ymm2, ymm2, 216
vunpcklpd ymm7, ymm2, ymm3
vpermpd ymm7, ymm7, 216
vfmadd231pd ymm6, ymm12, ymm7
vunpckhpd ymm0, ymm0, ymm1
vmovapd ymm1, YMMWORD PTR [rax]
vunpckhpd ymm2, ymm2, ymm3
vpermpd ymm3, ymm2, 216
vmovapd YMMWORD PTR [rbp-8752], ymm4
vunpcklpd ymm2, ymm1, YMMWORD PTR [rax+32]
vunpcklpd ymm4, ymm15, YMMWORD PTR [rax+96]
vpermpd ymm2, ymm2, 216
vpermpd ymm4, ymm4, 216
vmovapd YMMWORD PTR [rbp-8784], ymm6
vunpcklpd ymm6, ymm2, ymm4
vunpckhpd ymm2, ymm2, ymm4
vpermpd ymm2, ymm2, 216
vmulpd ymm4, ymm12, ymm2
vunpckhpd ymm10, ymm15, YMMWORD PTR [rax+96]
vunpckhpd ymm1, ymm1, YMMWORD PTR [rax+32]
vpermpd ymm10, ymm10, 216
vpermpd ymm1, ymm1, 216
vmovapd YMMWORD PTR [rbp-8816], ymm4
vunpcklpd ymm4, ymm1, ymm10
vunpckhpd ymm1, ymm1, ymm10
vpermpd ymm6, ymm6, 216
vpermpd ymm1, ymm1, 216
vmulpd ymm15, ymm12, ymm6
vmulpd ymm10, ymm12, ymm1
vpermpd ymm4, ymm4, 216
vmulpd ymm13, ymm12, ymm4
vpermpd ymm0, ymm0, 216
vfmadd231pd ymm9, ymm12, ymm3
vfmadd231pd ymm8, ymm12, ymm0
vfmadd132pd ymm7, ymm15, ymm11
vfmadd213pd ymm3, ymm11, YMMWORD PTR [rbp-8816]
vfmadd132pd ymm0, ymm10, ymm11
vfmadd132pd ymm5, ymm13, ymm11
vfnmadd231pd ymm9, ymm11, ymm2
vaddpd ymm2, ymm7, YMMWORD PTR [rbp-8688]
vaddpd ymm3, ymm3, YMMWORD PTR [rbp-8656]
vaddpd ymm7, ymm0, YMMWORD PTR [rbp-8720]
sub rax, -128
vfnmadd213pd ymm6, ymm11, YMMWORD PTR [rbp-8784]
vfnmadd213pd ymm4, ymm11, YMMWORD PTR [rbp-8752]
vfnmadd231pd ymm8, ymm11, ymm1
vaddpd ymm14, ymm14, ymm5
vmovapd YMMWORD PTR [rbp-8688], ymm2
vmovapd YMMWORD PTR [rbp-8656], ymm3
vmovapd YMMWORD PTR [rbp-8720], ymm7
sub rdx, -128
cmp rax, rbx
jne .L5
So it does the following:
- broadcasts a value to be re-used as a whole register,
- multiplies it with all the other 4 values, using permute instructions,
- repeats until the 16 results are ready (accumulated onto 4 outputs).
It reaches only 45-53 GFLOPS (depending on the godbolt.org server instance) because the input side is memory-bandwidth bound (groups of 4 values are read linearly), and there are not enough registers available to hold both the output accumulators (4 of them) and the temporary computations, so intermediate values go to memory, although at least they come from cache.
When a 256-bit vector length is forced on an AVX512-capable CPU (same godbolt.org server), the extra register capacity reaches 92 GFLOPS with the following assembly:
.L8:
vmovapd ymm1, YMMWORD PTR [rdx]
vmovapd ymm12, YMMWORD PTR [rdx+32]
vmovapd ymm3, YMMWORD PTR [rdx+64]
vmovapd ymm9, YMMWORD PTR [rdx+96]
vmovapd ymm8, ymm1
vpermt2pd ymm8, ymm7, ymm12
vpermt2pd ymm1, ymm6, ymm12
vmovapd ymm12, ymm3
vpermt2pd ymm12, ymm7, ymm9
vpermt2pd ymm3, ymm6, ymm9
vmovapd ymm9, ymm8
vpermt2pd ymm9, ymm7, ymm12
vpermt2pd ymm8, ymm6, ymm12
vmovapd ymm12, ymm1
vpermt2pd ymm1, ymm6, ymm3
vfmadd231pd ymm0, ymm1, ymm5
vfmadd231pd ymm11, ymm9, ymm5
vfmadd231pd ymm2, ymm8, ymm5
vpermt2pd ymm12, ymm7, ymm3
vfmadd231pd ymm10, ymm12, ymm5
vmovapd ymm21, ymm0
vmovapd ymm0, YMMWORD PTR [rax]
vmovapd ymm17, YMMWORD PTR [rax+32]
vmovapd ymm3, YMMWORD PTR [rax+64]
vmovapd ymm18, ymm11
vmovapd ymm19, ymm2
vmovapd ymm11, YMMWORD PTR [rax+96]
vmovapd ymm2, ymm0
vpermt2pd ymm2, ymm7, ymm17
vpermt2pd ymm0, ymm6, ymm17
vmovapd ymm17, ymm3
vpermt2pd ymm17, ymm7, ymm11
vpermt2pd ymm3, ymm6, ymm11
vmovapd ymm20, ymm10
vmovapd ymm11, ymm2
vmovapd ymm10, ymm0
vpermt2pd ymm11, ymm7, ymm17
vpermt2pd ymm2, ymm6, ymm17
vpermt2pd ymm10, ymm7, ymm3
vpermt2pd ymm0, ymm6, ymm3
vmulpd ymm24, ymm11, ymm5
vmulpd ymm22, ymm2, ymm5
vmulpd ymm23, ymm10, ymm5
vmulpd ymm17, ymm0, ymm5
vmovapd ymm3, ymm12
vfmadd132pd ymm9, ymm24, ymm4
vfmadd132pd ymm8, ymm22, ymm4
vfmadd132pd ymm3, ymm23, ymm4
vfmadd132pd ymm1, ymm17, ymm4
sub rax, -128
vfnmadd132pd ymm11, ymm18, ymm4
vfnmadd132pd ymm10, ymm20, ymm4
vfnmadd132pd ymm2, ymm19, ymm4
vfnmadd132pd ymm0, ymm21, ymm4
vaddpd ymm14, ymm14, ymm9
vaddpd ymm15, ymm15, ymm3
vaddpd ymm13, ymm13, ymm8
vaddpd ymm16, ymm16, ymm1
sub rdx, -128
cmp rax, rbx
jne .L8
It accesses memory less because more registers are available.
When AVX512 is enabled (on the same godbolt.org server), both for the vector length and the native architecture tuning, the performance reaches 135 GFLOPS and 25 ZMM registers are used. It looks like too many registers are required if an exact 4x4 tiling is to be made. Maybe you should try other options if the algorithm allows it. Maybe 3x4, 5x4, 2x8 or some other tile shape can give you more performance, if this is actually the register-tiling layer of a larger-scale multiplication (like a complex matrix-matrix multiplication with a cache-tiling layer, a register-tiling layer, etc.).
This may not be the fastest approach, as it uses no intrinsics, only simple loops and not-always-dependable compiler auto-vectorization. Using intrinsics should give better performance; a sketch follows below.
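For reference, a minimal intrinsics sketch of the same 4x4 broadcast-and-FMA accumulation, assuming FMA3 support and 32-byte-aligned, zero-initialized accumulators (the function name and the load/store choices are my own):
#include <immintrin.h>

// Accumulate the products of 4 complex numbers (real1/imag1) with 4 complex
// numbers (real2/imag2) into 4 accumulator cells, like multiply<4> above.
void multiply4x4_fma( const double* real1, const double* imag1,
                      const double* real2, const double* imag2,
                      double* outReal, double* outImag )
{
    const __m256d r1 = _mm256_load_pd( real1 );
    const __m256d i1 = _mm256_load_pd( imag1 );
    __m256d accRe = _mm256_load_pd( outReal );
    __m256d accIm = _mm256_load_pd( outImag );
    for( int j = 0; j < 4; j++ )
    {
        const __m256d r2 = _mm256_broadcast_sd( real2 + j );
        const __m256d i2 = _mm256_broadcast_sd( imag2 + j );
        // accRe += r1*r2 - i1*i2,  accIm += r1*i2 + i1*r2
        accRe = _mm256_fmadd_pd( r1, r2, accRe );
        accRe = _mm256_fnmadd_pd( i1, i2, accRe );
        accIm = _mm256_fmadd_pd( r1, i2, accIm );
        accIm = _mm256_fmadd_pd( i1, r2, accIm );
    }
    _mm256_store_pd( outReal, accRe );
    _mm256_store_pd( outImag, accIm );
}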