AVX performance slower for bitwise xor op and popcount


I am new to writing AVX-intrinsics-based code, so I need some help understanding whether my observations are expected. I have two methods implementing distance computations; both take two float arrays and their dimension, and return a float distance. The first method computes a Euclidean distance:

    static float
    compute_l2Square(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
        float *pVect1 = (float *) pVect1v;
        float *pVect2 = (float *) pVect2v;
        size_t qty = *((size_t *) qty_ptr);
        float __attribute__((aligned(32))) TmpRes[8];
        size_t qty16 = qty >> 4;

        const float *pEnd1 = pVect1 + (qty16 << 4);

        __m256 diff, v1, v2;
        __m256 sum = _mm256_set1_ps(0);

        while (pVect1 < pEnd1) {
            v1 = _mm256_loadu_ps(pVect1);
            pVect1 += 8;
            v2 = _mm256_loadu_ps(pVect2);
            pVect2 += 8;
            diff = _mm256_sub_ps(v1, v2);
            sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));

            v1 = _mm256_loadu_ps(pVect1);
            pVect1 += 8;
            v2 = _mm256_loadu_ps(pVect2);
            pVect2 += 8;
            diff = _mm256_sub_ps(v1, v2);
            sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
        }

        _mm256_store_ps(TmpRes, sum);
        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
    }

The second method computes a bitwise XOR and then counts the number of set bits, i.e. the Hamming distance:

static float compute_hamming(const void* __restrict pVect1v,
                     const void* __restrict pVect2v,
                     const void* __restrict qty_ptr) {
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *)qty_ptr);
    uint64_t __attribute__((aligned(32))) TmpRes[4];
    size_t qty16 = qty >> 4;

    const float *pEnd1 = pVect1 + (qty16 << 4);
    int res = 0;
    __m256 diff, v1, v2;
    while (pVect1 < pEnd1) {
        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_xor_ps(v1, v2);
        _mm256_store_si256((__m256i*)TmpRes, _mm256_castps_si256(diff));
        res += __builtin_popcountll(TmpRes[0]) + __builtin_popcountll(TmpRes[1])
             + __builtin_popcountll(TmpRes[2]) + __builtin_popcountll(TmpRes[3]);

        v1 = _mm256_loadu_ps(pVect1);
        pVect1 += 8;
        v2 = _mm256_loadu_ps(pVect2);
        pVect2 += 8;
        diff = _mm256_xor_ps(v1, v2);
        _mm256_store_si256((__m256i*)TmpRes, _mm256_castps_si256(diff));
        res += __builtin_popcountll(TmpRes[0]) + __builtin_popcountll(TmpRes[1])
             + __builtin_popcountll(TmpRes[2]) + __builtin_popcountll(TmpRes[3]);
    }
    return res;
}

For the same number of bits, the L2-squared distance computation is much faster than Hamming, almost 2x-4x (i.e. computing the L2 distance over 512 bits, which is 16 floats, is faster than computing Hamming over the same 16 floats). I am not really sure if this is expected. It seems to me that the popcount and the store to the temporary array are causing the slowness, because when I modify the L2 distance computation to do an XOR operation instead of a subtraction, i.e. replace _mm256_sub_ps with _mm256_xor_ps, the L2 computation becomes faster.

I am benchmarking on macOS, on a machine with AVX instruction support. Another observation is that a non-AVX implementation of Hamming distance using just a loop, sum += popcount(vec_a[i] ^ vec_b[i]), gives similar numbers to the AVX implementation. I also checked that the AVX instructions and methods are actually invoked, just as a sanity check.

The non-vectorized implementation:

static float compute_hamming(const void* __restrict pVect1,
                     const void* __restrict pVect2,
                     const void* __restrict qty_ptr) {
  size_t qty = *((size_t *)qty_ptr);
  int res = 0;

  const float *pVect1LL = (const float *)pVect1;
  const float *pVect2LL = (const float *)pVect2;
  for (unsigned i = 0; i < qty; i += 2) {
    if (i + 1 == qty) {
      unsigned int v1;
      unsigned int v2;
      memcpy(&v1, &pVect1LL[i], sizeof(float));
      memcpy(&v2, &pVect2LL[i], sizeof(float));
      res += __builtin_popcount(v1 ^ v2);
      break;
    }
    uint64_t v1;
    uint64_t v2;
    memcpy(&v1, &pVect1LL[i], sizeof(float) * 2);
    memcpy(&v2, &pVect2LL[i], sizeof(float) * 2);

    res += __builtin_popcountll(v1 ^ v2);
  }

  return res;
}

I need some help and recommendations on improving the performance, since the bottleneck is the distance computation method.

CodePudding user response:

You could speed up your l2Square version more by using _mm256_fmadd_ps, if you're targeting Haswell and newer. (And Piledriver, except you're on a Mac and you probably don't care about AMD Hackintosh machines.)

Equally or more important: use two separate __m256 sum0, sum1 accumulators to hide FP latency, and add them together at the end before reducing. (With an efficient hsum, not just a store and then a scalar add of each element in turn.)
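
For example, the inner loop could look something like this (an untested sketch, assuming FMA is available and <immintrin.h> is included; hsum256_ps is just one common way to reduce a __m256, not anything from your codebase):

// Horizontal sum of the 8 floats in a __m256: reduce 256 -> 128, then within 128.
static inline float hsum256_ps(__m256 v) {
    __m128 lo = _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
    lo = _mm_add_ps(lo, _mm_movehl_ps(lo, lo));   // lanes 0,1 = (0+2), (1+3)
    lo = _mm_add_ss(lo, _mm_movehdup_ps(lo));     // lane 0 += lane 1
    return _mm_cvtss_f32(lo);
}

static float compute_l2Square_fma(const float *pVect1, const float *pVect2, size_t qty) {
    const float *pEnd1 = pVect1 + ((qty >> 4) << 4);   // multiple of 16 floats, like your loop
    __m256 sum0 = _mm256_setzero_ps();
    __m256 sum1 = _mm256_setzero_ps();                 // second accumulator hides FP latency
    while (pVect1 < pEnd1) {
        __m256 d0 = _mm256_sub_ps(_mm256_loadu_ps(pVect1),     _mm256_loadu_ps(pVect2));
        __m256 d1 = _mm256_sub_ps(_mm256_loadu_ps(pVect1 + 8), _mm256_loadu_ps(pVect2 + 8));
        sum0 = _mm256_fmadd_ps(d0, d0, sum0);          // one FMA instead of mul + add
        sum1 = _mm256_fmadd_ps(d1, d1, sum1);
        pVect1 += 16;
        pVect2 += 16;
    }
    return hsum256_ps(_mm256_add_ps(sum0, sum1));
}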


Without hardware SIMD popcount (AVX512 VPOPCOUNTDQ), yes of course it's going to be slower, especially if the compiler doesn't vectorize those per-element __builtin_popcountll(vec[0]) ... into SIMD popcount using a nibble LUT or something (vpshufb).

The way you're doing it is actually making things worse for clang, by getting it to do SIMD XOR but then actually extract to scalar instead of just using scalar XOR and popcnt in the first place; note the vpextrq instructions in the asm. Clang can auto-vectorize __builtin_popcountll in a loop (in a not-terrible but not great way), but not like this. (Actually, SIMD XOR and then scalar extract for popcnt is not nearly as bad as I thought, but only if you use 128-bit vectors; see the "sse-cpu" results from Wojciech Mula's git repo linked below where even SSE for pure loads doesn't slow it down much.)

For example, clang auto-vectorizes this with YMM vectors inside the loop. (Godbolt showing this and your code) Unfortunately it does a bad job with char* arrays, and with unsigned instead of unsigned long it only uses XMM vectors.

float compute_hamming_autovec(const unsigned long* __restrict a,
                              const unsigned long* __restrict b,
                              size_t qty)    // by value to keep it simpler, IDK why you'd want to pass this by reference with a void*
{
    //const unsigned char *__restrict a = pVect1v, *__restrict b = pVect2v;
    unsigned long sum = 0;
    for (size_t i=0 ; i<qty*4 ; i++){
        unsigned long tmp1=a[i], tmp2=b[i];
        //memcpy(&tmp1, a+i, 4);
        //memcpy(&tmp2, b+i, 4);
        sum += __builtin_popcountll(tmp1 ^ tmp2);
    }
    return sum;
}

Using memcpy for unaligned aliasing-safe loads from char* also seemed to defeat vectorization, or some variation on this used a scalar load and xor; you may need typedef uint64_t aliasing_unaligned_u64 __attribute__((aligned(4), may_alias)). (I used aligned(4) on the assumption you're pointing it at aligned floats.)
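
Roughly like this (untested; the function and typedef names are just illustrative, and qty is assumed to be the float count as in your code, and even for simplicity):

typedef uint64_t aliasing_unaligned_u64 __attribute__((aligned(4), may_alias));

static float compute_hamming_typepun(const float* __restrict pVect1,
                                     const float* __restrict pVect2,
                                     size_t qty)        // number of floats, assumed even here
{
    const aliasing_unaligned_u64 *a = (const aliasing_unaligned_u64 *)pVect1;
    const aliasing_unaligned_u64 *b = (const aliasing_unaligned_u64 *)pVect2;
    unsigned long sum = 0;
    for (size_t i = 0; i < qty / 2; i++)                // one u64 covers two floats
        sum += __builtin_popcountll(a[i] ^ b[i]);       // clang can vectorize this loop
    return sum;
}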

However, your best bet is to manually vectorize the SIMD popcount. See https://github.com/WojciechMula/sse-popcount/. That also avoids any futzing with types to make strict-aliasing-safe code that will auto-vectorize nicely over arrays of float data.

For large counts, it's possible to go even faster than a good implementation using just a vpshufb ymm / vertical-sum inner loop / vpsadbw to hsum to qwords before it can overflow. For example, the Harley-Seal SIMD popcount code in that repo is about 20% faster on Skylake than the best "avx-lookup" implementation from the same repo, for arrays of size 4096 bytes. (And twice as fast as "avx2-lookup-original"; I forget what the difference was.) See the results for clang on Skylake.

Changing popcnt_AVX2_lookup to take two pointers and _mm256_xor_si256 is trivial, just replace the __m256i vec = _mm256_loadu with those couple statements. Or do the same with Harley-Seal if your arrays are large enough to warrant it; it shouldn't cause any extra register pressure since it can compile to a load / memory-source-vpxor.
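
To illustrate the shape of that change, here is a rough sketch of one xor-then-lookup step (not the repo's exact code; it assumes AVX2 and <immintrin.h>, and does the vpsadbw reduction every iteration for simplicity, whereas the repo accumulates byte counts across several iterations before reducing):

// Per-byte popcount of a 256-bit vector via a 4-bit lookup table (vpshufb).
static inline __m256i popcount_bytes_avx2(__m256i v) {
    const __m256i lookup = _mm256_setr_epi8(
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    const __m256i low_mask = _mm256_set1_epi8(0x0f);
    __m256i lo = _mm256_and_si256(v, low_mask);
    __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
    return _mm256_add_epi8(_mm256_shuffle_epi8(lookup, lo),
                           _mm256_shuffle_epi8(lookup, hi));
}

// One 32-byte step: load both inputs, XOR, popcount, accumulate into 64-bit lanes.
static inline __m256i xor_popcnt_step(const void *a, const void *b, __m256i acc) {
    __m256i x = _mm256_xor_si256(_mm256_loadu_si256((const __m256i *)a),
                                 _mm256_loadu_si256((const __m256i *)b));
    __m256i bytes = popcount_bytes_avx2(x);
    return _mm256_add_epi64(acc, _mm256_sad_epu8(bytes, _mm256_setzero_si256()));
}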

Also tweak its unroll factor to be good with your typical problem sizes.


Since small size is apparently common for your use-case (which I didn't realize originally):

Another thing to consider with your real use case is how frequently you'll have odd sizes. If AVX2-lookup is only good with a multiple of the unroll factor, and needs unrolling to keep up, you might end up with a lot of your inputs spending a lot of time in its fallback path. So it would either be important to make that efficient, or be a good reason to drop it and just use SSE2 XOR + scalar popcnt, which can easily do 16-byte granularity without a problem.
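
That simple approach could look roughly like this (untested sketch; __builtin_popcountll still needs hardware popcnt to be fast, and any 1..3-float remainder would need a scalar tail):

// 16 bytes (4 floats) per iteration: SIMD XOR, then two scalar popcnts.
static float hamming_sse_scalar_popcnt(const float *pVect1, const float *pVect2, size_t qty) {
    const float *pEnd1 = pVect1 + (qty & ~(size_t)3);   // multiple of 4 floats
    uint64_t res = 0;
    while (pVect1 < pEnd1) {
        __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i *)pVect1),
                                  _mm_loadu_si128((const __m128i *)pVect2));
        res += __builtin_popcountll((uint64_t)_mm_cvtsi128_si64(x));                         // low qword
        res += __builtin_popcountll((uint64_t)_mm_cvtsi128_si64(_mm_unpackhi_epi64(x, x)));  // high qword
        pVect1 += 4;
        pVect2 += 4;
    }
    return (float)res;   // handle any leftover floats with a scalar popcount tail
}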

CodePudding user response:

Yeah, your observations are expected. Your code for Euclidean is more or less OK, but your code for Hamming is very inefficient.

Since you mentioned AVX1 but not AVX2, I assume you don't have AVX2. In that case, I would do it like this (untested).

// Count set bits in every byte,
// add slices of 8 bytes together into a vector of two int64 lanes.
inline __m128i popcntSse( __m128i vec )
{
    // Isolate low and high 4 bit pieces from each byte
    const __m128i lowMask = _mm_set1_epi8( 0xF );
    __m128i a = _mm_and_si128( lowMask, vec );
    __m128i b = _mm_andnot_si128( lowMask, vec );
    b = _mm_srli_epi32( b, 4 );

    // Apply the lookup table
    const __m128i lookup = _mm_setr_epi8(
    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
    );
    a = _mm_shuffle_epi8( lookup, a );
    b = _mm_shuffle_epi8( lookup, b );

    // Add two pieces together; adding 2 numbers [ 0 .. 4 ] each, not gonna overflow
    __m128i res = _mm_add_epi8( a, b );

    // Return horizontal sum of bytes
    return _mm_sad_epu8( res, _mm_setzero_si128() );
}

static float computeHammingDistance( const float* p1, const float* p2, size_t count )
{
    const float* const p1EndAligned = p1 + ( count / 4 ) * 4;
    const size_t remainder = ( count % 4 );

    __m128i acc = _mm_setzero_si128();
    // Process most of these values doing 4 of them per iteration
    while( p1 < p1EndAligned )
    {
        __m128i v1 = _mm_loadu_si128( ( const __m128i* )p1 );
        __m128i v2 = _mm_loadu_si128( ( const __m128i* )p2 );
        p1 += 4;
        p2 += 4;
        __m128i xx = _mm_xor_si128( v1, v2 );
        acc = _mm_add_epi64( acc, popcntSse( xx ) );
    }

    if( remainder != 0 )
    {
        __m128i v1, v2;
        // Load 1 .. 3 32-bit values into vectors
        switch( remainder )
        {
        case 1:
            v1 = _mm_cvtsi32_si128( *(const int*)( p1 ) );
            v2 = _mm_cvtsi32_si128( *(const int*)( p2 ) );
            break;
        case 2:
            v1 = _mm_cvtsi64_si128( *(const int64_t*)( p1 ) );
            v2 = _mm_cvtsi64_si128( *(const int64_t*)( p2 ) );
            break;
        case 3:
            v1 = _mm_cvtsi64_si128( *(const int64_t*)( p1 ) );
            v2 = _mm_cvtsi64_si128( *(const int64_t*)( p2 ) );
            v1 = _mm_insert_epi32( v1, *(const int*)( p1 + 2 ), 2 );
            v2 = _mm_insert_epi32( v2, *(const int*)( p2 + 2 ), 2 );
            break;
        }

        __m128i xx = _mm_xor_si128( v1, v2 );
        acc = _mm_add_epi64( acc, popcntSse( xx ) );
    }

    // Horizontally add both lanes in the accumulator
    uint64_t result = (uint64_t)_mm_cvtsi128_si64( acc ) + (uint64_t)_mm_extract_epi64( acc, 1 );

    // Convert to float;
    // note you will lose precision after about 16.7 million set bits due to FP32 on output.
    return (float)result;
}
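
If you need to keep your original void*-based signature, a thin wrapper over this is enough, e.g.:

static float compute_hamming(const void* __restrict pVect1v,
                             const void* __restrict pVect2v,
                             const void* __restrict qty_ptr) {
    return computeHammingDistance((const float *)pVect1v,
                                  (const float *)pVect2v,
                                  *(const size_t *)qty_ptr);
}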