Fast pyrDown image with AVX instructions


I have two pyrDown implementations, one using the SSE2 and one using the AVX instruction set. They differ, and the AVX implementation produces a wrong image result. The AVX implementation is also slower than the SSE2 one, which is strange. What is wrong with the AVX implementation, and how can I make it faster?

#include <immintrin.h>   // SSE2/SSSE3/AVX2 intrinsics
#include <stdint.h>
#include <stddef.h>

// SSE2 implementation
static __inline __m128i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
  __m128i v0 = _mm_load_si128((const __m128i *)src);
  __m128i v1 = _mm_load_si128((const __m128i *)&src[srcStep]);
  return _mm_avg_epu8(v0, v1);
}

// SSSE3 version
// I used `__restrict__` to give the compiler more flexibility in unrolling
void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t*__restrict__ dst,
                  size_t srcStep,
                  size_t size)
{
    const __m128i vk1 = _mm_set1_epi8(1);
    const __m128i add2 = _mm_set1_epi16(2);
    size_t dstsize = size/2;
    for (size_t i = 0; i < dstsize - 15; i += 16)
    {
        const size_t ii = i*2;
        // based on https://stackoverflow.com/a/45564565/820795
        __m128i left  = average2RowsSingle(src + ii, srcStep);
        __m128i right = average2RowsSingle(src + ii + 16, srcStep);
        
        __m128i w0 = _mm_maddubs_epi16(left, vk1);        // unpack and horizontal add
        __m128i w1 = _mm_maddubs_epi16(right, vk1);
        w0 = _mm_srli_epi16(w0, 1);                     // divide by 2
        w1 = _mm_srli_epi16(w1, 1);
        w0 = _mm_packus_epi16(w0, w1);                  // pack
        
        _mm_storeu_si128((__m128i *)&dst[i], w0);
    }
}
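
For reference, here is a plain scalar version of the same 2×2 averaging. It is only a sketch (the function name and the truncating division are my own choices, not from the original code), but it is handy for verifying the SIMD output and for the tail pixels that the vector loop skips when size is not a multiple of 32:

// Scalar reference / tail handler (illustrative, not part of the original post).
// Note: the SIMD code averages the two rows with rounding (_mm_avg_epu8) and then
// truncates the horizontal average, so results here may differ by at most 1.
static void average2RowsScalar(const uint8_t* __restrict__ src,
                               uint8_t* __restrict__ dst,
                               size_t srcStep,
                               size_t size)
{
    const size_t dstSize = size / 2;
    for (size_t i = 0; i < dstSize; ++i)
    {
        const size_t ii = i * 2;
        unsigned sum = src[ii] + src[ii + 1] + src[ii + srcStep] + src[ii + srcStep + 1];
        dst[i] = (uint8_t)(sum / 4);
    }
}
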
// AVX implementation
static __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
  auto v0 = _mm256_load_si256((const __m256i*)src);
  auto v1 = _mm256_load_si256((const __m256i*)&src[srcStep]);
  return _mm256_avg_epu8(v0, v1);
}

void average2Rows(const uint8_t* __restrict__ src,
                     uint8_t*__restrict__ dst,
                     size_t srcStep,
                     size_t size) {
  const __m128i vk1 = _mm_set1_epi8(1);
  size_t dstsize = size/2;
  const signed char o = -1; // make shuffle zero
  const __m256i vec_r_i16 = _mm256_set_epi8(o,30, o,28, o,26, o,24, o,22, o,20, o,18, o,16,
                                            o,14, o,12, o,10, o, 8, o, 6, o, 4, o, 2, o, 0);
  const __m256i vec_l_i16 = _mm256_set_epi8(o,31, o,29, o,27, o,25, o,23, o,21, o,19, o,17,
                                            o,15, o,13, o,11, o, 9, o, 7, o, 5, o, 3, o, 1);
  for (size_t i = 0; i < dstsize - 31; i += 32)
  {
    const size_t ii = i * 2;
    auto left = average2RowsSingle(src + ii, srcStep);
    auto right = average2RowsSingle(src + ii + 32, srcStep);

    auto w0 = _mm256_shuffle_epi8(left, vec_r_i16);
    auto w1 = _mm256_shuffle_epi8(left, vec_l_i16);
    left = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);

    w0 = _mm256_shuffle_epi8(right, vec_r_i16);
    w1 = _mm256_shuffle_epi8(right, vec_l_i16);
    right = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);

    left = _mm256_packus_epi16(left, right);

    _mm256_storeu_si256((__m256i *) &dst[i], left);
  }
}

Wrong result from the AVX implementation: wrong_result_img

CodePudding user response:

With the help of @chtz I came up with this code:

inline __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
  auto v0 = _mm256_loadu_si256((const __m256i *)src);
  auto v1 = _mm256_loadu_si256((const __m256i *)&src[srcStep]);
  return _mm256_avg_epu8(v0, v1);
}

void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t*__restrict__ dst,
                  size_t srcStep,
                  size_t size) {
  const auto vk1 = _mm256_set1_epi8(1);
  const size_t dstSize = size/2;
  for (size_t i = 0; i < dstSize - 31; i += 32)
  {
    const size_t ii = i * 2;
    // based on https://stackoverflow.com/a/45564565/820795
    auto left = average2RowsSingle(src + ii, srcStep);
    auto right = average2RowsSingle(src + ii + 32, srcStep);

    auto w0 = _mm256_maddubs_epi16(left, vk1);        // unpack and horizontal add
    auto w1 = _mm256_maddubs_epi16(right, vk1);
    w0 = _mm256_srli_epi16(w0, 1);                     // divide by 2
    w1 = _mm256_srli_epi16(w1, 1);
    w0 = _mm256_packus_epi16(w0, w1);                  // pack
    w0 = _mm256_permute4x64_epi64(w0, 0xd8);           // shuffle to get correct order

    _mm256_storeu_si256((__m256i *)&dst[i], w0);
  }
}
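
The _mm256_permute4x64_epi64(w0, 0xd8) step is the key difference from a straight port of the SSE code: _mm256_packus_epi16 packs each 128-bit lane independently, so without the permute the output bytes come out in the order low(left), low(right), high(left), high(right). The small standalone program below (my own illustration, not from the answer) demonstrates that lane behavior:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    // 16-bit values 0..15 in 'a' and 16..31 in 'b'
    __m256i a = _mm256_setr_epi16( 0,  1,  2,  3,  4,  5,  6,  7,
                                   8,  9, 10, 11, 12, 13, 14, 15);
    __m256i b = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23,
                                  24, 25, 26, 27, 28, 29, 30, 31);

    __m256i packed  = _mm256_packus_epi16(a, b);               // per-lane: 0..7, 16..23, 8..15, 24..31
    __m256i ordered = _mm256_permute4x64_epi64(packed, 0xd8);  // reorder 64-bit quarters 0,2,1,3

    uint8_t out[32];
    _mm256_storeu_si256((__m256i *)out, ordered);
    for (int i = 0; i < 32; ++i)
        printf("%d ", out[i]);                                 // prints 0 1 2 ... 31 in order
    printf("\n");
    return 0;
}

The other change from the question's AVX code is _mm256_loadu_si256 instead of _mm256_load_si256: the aligned load requires src + ii to be 32-byte aligned, which is not guaranteed here.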

Result image: correct_image
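
For completeness, a caller that builds one pyramid level with this routine could look roughly like the sketch below. It is hypothetical (the driver name and parameters are mine) and assumes width is a multiple of 64 and height is even, so the vector loop covers every output pixel:

// Hypothetical driver, not from the original post: average every 2x2 block of
// src into dst (half width, half height), one pair of source rows at a time.
void pyrDownLevel(const uint8_t* __restrict__ src, uint8_t* __restrict__ dst,
                  size_t width, size_t height,
                  size_t srcStep, size_t dstStep)
{
    for (size_t y = 0; y + 1 < height; y += 2)
        average2Rows(src + y * srcStep, dst + (y / 2) * dstStep, srcStep, width);
}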
