What's the best way to transpose a matrix consisting of RGB values (24 bits) using AVX/SIMD instructions?

I'm currently trying to transpose large matrices which store RGB values. I'm new to AVX2 programming and I've found many examples of how to transpose matrices with 32bit values. However, I'm unsure of the fastest way to transpose when I have 24bit values.

Wim's answer to how to transpose a matrix consisting of 8bit values is promising for my situation. However, it relies heavily on 32bit operations which don't work in my case since they move 1.33 pixels rather than 1.

One main approach I'm considering is converting the RGB values into RGBX values by adding a padding byte to each pixel and then using a fast 32bit transpose like Z Boson's and Peter Cordes' implementation here.

However, the issue with this approach is I need to store the final values in RGB format with no padding. This means I'd have to do unaligned loads of 8 RGB pixels each, shuffle in the extra 0 bytes, transpose it using the 32bit approach, then shuffle out the extra 0 bytes and do unaligned stores. Does the time added by the extra shuffle instructions and unaligned loads/stores make this not worth implementing over a serial approach? I'm using a haswell CPU which doesn't have AVX512.

In case it's unclear I need a fast way to convert from this:

R0G0B0 R1G1B1 R2G2B2 R3G3B3 
R4G4B4 R5G5B5 R6G6B6 R7G7B7 
R8G8B8 R9G9B9 RaGaBa RbGbBb 
RcGcBc RdGdBd ReGeBe RfGfBf

To this: (but instead of 4x4 I need to do this for matrices over 200x200)

R0G0B0 R4G4B4 R8G8B8 RcGcBc 
R1G1B1 R5G5B5 R9G9B9 RdGdBd 
R2G2B2 R6G6B6 RaGaBa ReGeBe 
R3G3B3 R7G7B7 RbGbBb RfGfBf

Any advice would be greatly appreciated.

CodePudding user response：

Here’s a function which transposes an 8x8 block of that image.
Call that function in a loop, transposing these blocks one at a time and writing each one to its transposed location in the output image.

Note that unless your image size is a multiple of 8, you are going to need another version of that function to handle the remainder areas.

// Load 12 bytes (4 packed RGB24 pixels) from memory into the vector.
// Bytes [ 0 .. 11 ] of the result hold the loaded data, bytes [ 12 .. 15 ] are zero.
// Reads exactly 12 bytes starting at rsi; the pointer needs no particular alignment.
static inline __m128i load12( const uint8_t* rsi )
{
    // Bytes [ 0 .. 7 ]; the upper half of the vector is zeroed by the load
    const __m128i head = _mm_loadu_si64( rsi );
    // Bytes [ 4 .. 11 ]: an overlapping 8-byte load fetches the tail without
    // dereferencing a misaligned int pointer and without touching byte 12
    __m128i tail = _mm_loadu_si64( rsi + 4 );
    // Drop the 4 bytes already present in `head`: leaves bytes [ 8 .. 11 ] followed by zeros
    tail = _mm_srli_epi64( tail, 32 );
    // Low 8 bytes of `head`, then low 8 bytes of `tail`
    return _mm_unpacklo_epi64( head, tail );
}
// Load 12 bytes from memory, upcast to AVX vector.
// The data lands in the lower 16-byte lane. The cast does not initialize the upper
// lane, so its content is undefined until something is inserted there.
// `static` gives the helper internal linkage, so a call the compiler decides not
// to inline still has a definition to link against when this is built as C.
static inline __m256i load12Low( const uint8_t* rsi )
{
    return _mm256_castsi128_si256( load12( rsi ) );
}
// Load 12 bytes from memory into upper half of the AVX vector.
// Returns `low` with its upper 16-byte lane replaced by the loaded data;
// the lower lane of `low` is passed through unchanged.
// `static` gives the helper internal linkage, so a call the compiler decides not
// to inline still has a definition to link against when this is built as C.
static inline __m256i load12High( __m256i low, const uint8_t* rsi )
{
    return _mm256_inserti128_si256( low, load12( rsi ), 1 );
}
// Store bytes [ 0 .. 11 ] and [ 16 .. 27 ] of the AVX vector
// as 24 contiguous bytes at rdi. Writes exactly 24 bytes; rdi needs no particular alignment.
static inline void store24( __m256i vec, uint8_t* rdi )
{
    // Full 16-byte store of the lower lane; the 4 surplus bytes written to
    // rdi[ 12 .. 15 ] are overwritten by the next store
    _mm_storeu_si128( ( __m128i* )rdi, _mm256_castsi256_si128( vec ) );
    const __m128i high = _mm256_extracti128_si256( vec, 1 );
    // Bytes [ 0 .. 7 ] of the upper lane go to rdi[ 12 .. 19 ]
    _mm_storeu_si64( rdi + 12, high );
    // Bytes [ 4 .. 11 ] of the upper lane go to rdi[ 16 .. 23 ]. The overlap rewrites
    // rdi[ 16 .. 19 ] with identical values, which avoids a store through a misaligned int pointer
    _mm_storeu_si64( rdi + 16, _mm_srli_si128( high, 4 ) );
}

// Transpose dense 8x8 block of RGB24 pixels
void transpose8x8( uint8_t* rdi, size_t destStride, const uint8_t* rsi, size_t sourceStride )
{
    // Load top half of the matrix into lower 4 lanes of 8 vectors
    __m256i vectors[ 8 ];
    vectors[ 0 ] = load12Low( rsi );
    vectors[ 1 ] = load12Low( rsi   12 );
    rsi  = sourceStride;
    vectors[ 2 ] = load12Low( rsi );
    vectors[ 3 ] = load12Low( rsi   12 );
    rsi  = sourceStride;
    vectors[ 4 ] = load12Low( rsi );
    vectors[ 5 ] = load12Low( rsi   12 );
    rsi  = sourceStride;
    vectors[ 6 ] = load12Low( rsi );
    vectors[ 7 ] = load12Low( rsi   12 );
    rsi  = sourceStride;
    // Load bottom half of the matrix into upper half of these vectors
    vectors[ 0 ] = load12High( vectors[ 0 ], rsi );
    vectors[ 1 ] = load12High( vectors[ 1 ], rsi   12 );
    rsi  = sourceStride;
    vectors[ 2 ] = load12High( vectors[ 2 ], rsi );
    vectors[ 3 ] = load12High( vectors[ 3 ], rsi   12 );
    rsi  = sourceStride;
    vectors[ 4 ] = load12High( vectors[ 4 ], rsi );
    vectors[ 5 ] = load12High( vectors[ 5 ], rsi   12 );
    rsi  = sourceStride;
    vectors[ 6 ] = load12High( vectors[ 6 ], rsi );
    vectors[ 7 ] = load12High( vectors[ 7 ], rsi   12 );

    // Expand 3 byte values into 4 byte ones
    __m128i perm16 = _mm_setr_epi8( 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 );
    __m256i perm = _mm256_broadcastsi128_si256( perm16 );
    vectors[ 0 ] = _mm256_shuffle_epi8( vectors[ 0 ], perm );
    vectors[ 1 ] = _mm256_shuffle_epi8( vectors[ 1 ], perm );
    vectors[ 2 ] = _mm256_shuffle_epi8( vectors[ 2 ], perm );
    vectors[ 3 ] = _mm256_shuffle_epi8( vectors[ 3 ], perm );
    vectors[ 4 ] = _mm256_shuffle_epi8( vectors[ 4 ], perm );
    vectors[ 5 ] = _mm256_shuffle_epi8( vectors[ 5 ], perm );
    vectors[ 6 ] = _mm256_shuffle_epi8( vectors[ 6 ], perm );
    vectors[ 7 ] = _mm256_shuffle_epi8( vectors[ 7 ], perm );

    // https://randombit.net/bitbashing/posts/integer_matrix_transpose_in_sse2.html

    // Transpose 2x2 blocks
    __m256i a = vectors[ 0 ];
    __m256i b = vectors[ 2 ];
    vectors[ 0 ] = _mm256_unpacklo_epi32( a, vectors[ 1 ] );
    vectors[ 2 ] = _mm256_unpackhi_epi32( a, vectors[ 1 ] );
    vectors[ 1 ] = _mm256_unpacklo_epi32( b, vectors[ 3 ] );
    vectors[ 3 ] = _mm256_unpackhi_epi32( b, vectors[ 3 ] );

    a = vectors[ 4 ];
    b = vectors[ 6 ];
    vectors[ 4 ] = _mm256_unpacklo_epi32( a, vectors[ 5 ] );
    vectors[ 6 ] = _mm256_unpackhi_epi32( a, vectors[ 5 ] );
    vectors[ 5 ] = _mm256_unpacklo_epi32( b, vectors[ 7 ] );
    vectors[ 7 ] = _mm256_unpackhi_epi32( b, vectors[ 7 ] );

    // Transpose 4x4 blocks
    a = vectors[ 0 ];
    vectors[ 0 ] = _mm256_unpacklo_epi64( a, vectors[ 1 ] );
    vectors[ 1 ] = _mm256_unpackhi_epi64( a, vectors[ 1 ] );
    a = vectors[ 2 ];
    vectors[ 2 ] = _mm256_unpacklo_epi64( a, vectors[ 3 ] );
    vectors[ 3 ] = _mm256_unpackhi_epi64( a, vectors[ 3 ] );
    a = vectors[ 4 ];
    vectors[ 4 ] = _mm256_unpacklo_epi64( a, vectors[ 5 ] );
    vectors[ 5 ] = _mm256_unpackhi_epi64( a, vectors[ 5 ] );
    a = vectors[ 6 ];
    vectors[ 6 ] = _mm256_unpacklo_epi64( a, vectors[ 7 ] );
    vectors[ 7 ] = _mm256_unpackhi_epi64( a, vectors[ 7 ] );

    // Gather RGB values across 16-byte lanes
    perm16 = _mm_setr_epi8( 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1 );
    perm = _mm256_broadcastsi128_si256( perm16 );
    vectors[ 0 ] = _mm256_shuffle_epi8( vectors[ 0 ], perm );
    vectors[ 1 ] = _mm256_shuffle_epi8( vectors[ 1 ], perm );
    vectors[ 2 ] = _mm256_shuffle_epi8( vectors[ 2 ], perm );
    vectors[ 3 ] = _mm256_shuffle_epi8( vectors[ 3 ], perm );
    vectors[ 4 ] = _mm256_shuffle_epi8( vectors[ 4 ], perm );
    vectors[ 5 ] = _mm256_shuffle_epi8( vectors[ 5 ], perm );
    vectors[ 6 ] = _mm256_shuffle_epi8( vectors[ 6 ], perm );
    vectors[ 7 ] = _mm256_shuffle_epi8( vectors[ 7 ], perm );

    // Store these pixels back
    store24( vectors[ 0 ], rdi );
    rdi  = destStride;
    store24( vectors[ 1 ], rdi );
    rdi  = destStride;
    store24( vectors[ 2 ], rdi );
    rdi  = destStride;
    store24( vectors[ 3 ], rdi );
    rdi  = destStride;
    store24( vectors[ 4 ], rdi );
    rdi  = destStride;
    store24( vectors[ 5 ], rdi );
    rdi  = destStride;
    store24( vectors[ 6 ], rdi );
    rdi  = destStride;
    store24( vectors[ 7 ], rdi );
}

I’d like to add that modern software and hardware tend to avoid dealing with RGB24 images in memory. For instance, Windows graphics dropped support for them in Vista, with D3D10. The hardware and drivers only support texture formats with 8/16/32/64/128 bits per pixel, despite the 33% VRAM overhead this causes for uncompressed RGB24 bitmaps.