I have two pyrDown implementations, one using the SSE2 and one using the AVX instruction set. They differ, and the AVX implementation produces a wrong image. The AVX implementation is also slower than the SSE2 one, which is strange. What is wrong with the AVX implementation, and how can I make it faster?
// SSE2 implementation
static __inline __m128i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
    __m128i v0 = _mm_load_si128((const __m128i *)src);
    __m128i v1 = _mm_load_si128((const __m128i *)&src[srcStep]);
    return _mm_avg_epu8(v0, v1); // vertical average of two rows, rounding up
}
// SSSE3 version
// I used `__restrict__` to give the compiler more flexibility in unrolling
void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t* __restrict__ dst,
                  size_t srcStep,
                  size_t size)
{
    const __m128i vk1 = _mm_set1_epi8(1);
    const __m128i add2 = _mm_set1_epi16(2);
    size_t dstsize = size/2;
    for (size_t i = 0; i < dstsize - 15; i += 16)
    {
        const size_t ii = i*2;
        // based on https://stackoverflow.com/a/45564565/820795
        __m128i left  = average2RowsSingle(src + ii, srcStep);
        __m128i right = average2RowsSingle(src + ii + 16, srcStep);
        __m128i w0 = _mm_maddubs_epi16(left, vk1);  // unpack and horizontal add
        __m128i w1 = _mm_maddubs_epi16(right, vk1);
        w0 = _mm_srli_epi16(w0, 1);                 // divide by 2
        w1 = _mm_srli_epi16(w1, 1);
        w0 = _mm_packus_epi16(w0, w1);              // pack
        _mm_storeu_si128((__m128i *)&dst[i], w0);
    }
}
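For anyone trying to follow the intrinsics, here is a plain scalar version of the same 2x2 reduction (the name average2RowsScalar and the function itself are mine, added just for illustration). Note the rounding: _mm_avg_epu8 rounds up, while the final shift truncates, and the scalar code reproduces exactly that:

#include <cstdint>
#include <cstddef>

// Scalar sketch of the reduction above: each output pixel is the average of a
// 2x2 block, computed as two round-up vertical averages followed by a
// truncating horizontal average, matching avg_epu8 + maddubs + srli.
void average2RowsScalar(const uint8_t* src, uint8_t* dst,
                        size_t srcStep, size_t size)
{
    for (size_t i = 0; i < size / 2; ++i)
    {
        const size_t ii = i * 2;
        unsigned a = (src[ii]     + src[ii + srcStep]     + 1) >> 1; // like _mm_avg_epu8
        unsigned b = (src[ii + 1] + src[ii + srcStep + 1] + 1) >> 1;
        dst[i] = (uint8_t)((a + b) >> 1);                            // like maddubs + srli
    }
}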
// AVX implementation
static __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
    auto v0 = _mm256_load_si256((const __m256i*)src);
    auto v1 = _mm256_load_si256((const __m256i*)&src[srcStep]);
    return _mm256_avg_epu8(v0, v1);
}

void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t* __restrict__ dst,
                  size_t srcStep,
                  size_t size) {
    const __m128i vk1 = _mm_set1_epi8(1);
    size_t dstsize = size/2;
    const signed char o = -1; // make shuffle zero
    const __m256i vec_r_i16 = _mm256_set_epi8(o,30, o,28, o,26, o,24, o,22, o,20, o,18, o,16,
                                              o,14, o,12, o,10, o, 8, o, 6, o, 4, o, 2, o, 0);
    const __m256i vec_l_i16 = _mm256_set_epi8(o,31, o,29, o,27, o,25, o,23, o,21, o,19, o,17,
                                              o,15, o,13, o,11, o, 9, o, 7, o, 5, o, 3, o, 1);
    for (size_t i = 0; i < dstsize - 31; i += 32)
    {
        const size_t ii = i * 2;
        auto left  = average2RowsSingle(src + ii, srcStep);
        auto right = average2RowsSingle(src + ii + 32, srcStep);
        auto w0 = _mm256_shuffle_epi8(left, vec_r_i16);
        auto w1 = _mm256_shuffle_epi8(left, vec_l_i16);
        left = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);
        w0 = _mm256_shuffle_epi8(right, vec_r_i16);
        w1 = _mm256_shuffle_epi8(right, vec_l_i16);
        right = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);
        left = _mm256_packus_epi16(left, right);
        _mm256_storeu_si256((__m256i *) &dst[i], left);
    }
}
Wrong result from the AVX implementation:
Answer:
With the help of @chtz I came up with the code below. Two changes matter: the loads are now unaligned (_mm256_loadu_si256), so the function no longer faults when src is not 32-byte aligned, and since _mm256_packus_epi16 packs within each 128-bit lane, a final _mm256_permute4x64_epi64 is needed to restore the correct byte order:
inline __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
    auto v0 = _mm256_loadu_si256((const __m256i *)src);
    auto v1 = _mm256_loadu_si256((const __m256i *)&src[srcStep]);
    return _mm256_avg_epu8(v0, v1);
}

void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t* __restrict__ dst,
                  size_t srcStep,
                  size_t size) {
    const auto vk1 = _mm256_set1_epi8(1);
    const size_t dstSize = size/2;
    for (size_t i = 0; i < dstSize - 31; i += 32)
    {
        const size_t ii = i * 2;
        // based on https://stackoverflow.com/a/45564565/820795
        auto left  = average2RowsSingle(src + ii, srcStep);
        auto right = average2RowsSingle(src + ii + 32, srcStep);
        auto w0 = _mm256_maddubs_epi16(left, vk1);  // unpack and horizontal add
        auto w1 = _mm256_maddubs_epi16(right, vk1);
        w0 = _mm256_srli_epi16(w0, 1);              // divide by 2
        w1 = _mm256_srli_epi16(w1, 1);
        w0 = _mm256_packus_epi16(w0, w1);           // pack (per 128-bit lane)
        w0 = _mm256_permute4x64_epi64(w0, 0xd8);    // fix lane order after packus
        _mm256_storeu_si256((__m256i *)&dst[i], w0);
    }
}
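To see why the final _mm256_permute4x64_epi64 with 0xd8 is needed: _mm256_packus_epi16 packs within each 128-bit lane, so without the permute the four 64-bit quarters of the result come out in the order 0, 2, 1, 3. A small standalone demo (the test values and main are mine, just to show the lane behavior; compile with -mavx2):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
    // words 0..15 in a, 16..31 in b
    alignas(32) int16_t a[16], b[16];
    for (int i = 0; i < 16; ++i) { a[i] = i; b[i] = 16 + i; }

    __m256i va = _mm256_load_si256((const __m256i*)a);
    __m256i vb = _mm256_load_si256((const __m256i*)b);

    // packus works per 128-bit lane: the low lane gets a[0..7] then b[0..7],
    // the high lane gets a[8..15] then b[8..15] -> quarters ordered 0,2,1,3
    __m256i packed = _mm256_packus_epi16(va, vb);

    alignas(32) uint8_t out[32];
    _mm256_store_si256((__m256i*)out, packed);
    for (int i = 0; i < 32; ++i) printf("%d ", out[i]);
    printf("\n"); // prints 0..7 16..23 8..15 24..31

    // 0xd8 = 0b11011000 selects qwords 0,2,1,3 -> swaps the middle quarters back
    __m256i fixed = _mm256_permute4x64_epi64(packed, 0xd8);
    _mm256_store_si256((__m256i*)out, fixed);
    for (int i = 0; i < 32; ++i) printf("%d ", out[i]);
    printf("\n"); // prints 0..31 in order
}

As for the speed question: the first AVX version spent two _mm256_shuffle_epi8 plus an add and a shift on each input vector, while _mm256_maddubs_epi16 does the pairwise add in a single instruction, which plausibly explains why the maddubs form is also the faster one (only profiling can confirm). Note, too, that both loops leave a tail of up to 31 output pixels when dstSize is not a multiple of 32; a scalar cleanup loop, like the reference version sketched above, can finish the remainder.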