neon spreading load with zero-fill

I've got an incoming bytestream of blocks of 16 uint8_t that I need to expand into 4x uint32x4_t NEON registers for further processing. This is going to run on a core based on the Cortex-A55. Here's an example block: {0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF}.

Here's what I've got so far:

#include <stdint.h>
#if defined(__aarch64__)
    #include <arm_neon.h>
#else
    typedef unsigned int uint32x4_t __attribute__ ((vector_size (16)));
    typedef unsigned char uint8x16_t __attribute__ ((vector_size (16)));
#endif

#if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    #define select_u8x4_from_u8x16( a, b, c, d) {255,255,255,(a),255,255,255,(b),255,255,255,(c),255,255,255,(d)}
#else
    #define select_u8x4_from_u8x16( a, b, c, d) {(a),255,255,255,(b),255,255,255,(c),255,255,255,(d),255,255,255}
#endif

//Wrapper around vqtbl1q_u8()
static inline uint8x16_t table_16u8(uint8x16_t mat, uint8x16_t indexes)
{
#if defined( __aarch64__ )
    return vqtbl1q_u8(mat, indexes);
#else
    uint8x16_t result;
    for( unsigned i = 0; i < sizeof(mat); ++i )
    {
        result[i] = mat[indexes[i]];
    }
    return result;
#endif
}

uint32_t test_function(const uint8_t * samples, unsigned num_samples/*always divisible by 16*/)
{
static const uint8x16_t idx_a = select_u8x4_from_u8x16(0,1,2,3);
static const uint8x16_t idx_b = select_u8x4_from_u8x16(4,5,6,7);
static const uint8x16_t idx_c = select_u8x4_from_u8x16(8,9,10,11);
static const uint8x16_t idx_d = select_u8x4_from_u8x16(12,13,14,15);

uint32x4_t dummy_accumulator = {0,0,0,0};
for(unsigned x = 0; x < num_samples; x += 16)
    {
    /*Begin section I'd like help with*/
    uint8x16_t pxvect = *((uint8x16_t*)(samples + x));

    uint32x4_t temp_a = (uint32x4_t)table_16u8(pxvect, idx_a);/*holds {0x0,0x1,0x2,0x3}*/
    uint32x4_t temp_b = (uint32x4_t)table_16u8(pxvect, idx_b);/*holds {0x4,0x5,0x6,0x7}*/
    uint32x4_t temp_c = (uint32x4_t)table_16u8(pxvect, idx_c);/*holds {0x8,0x9,0xA,0xB}*/
    uint32x4_t temp_d = (uint32x4_t)table_16u8(pxvect, idx_d);/*holds {0xC,0xD,0xE,0xF}*/
    /*End section I'd like help with.*/

    /*Sum the values to produce a return value*/
    dummy_accumulator += temp_a;
    dummy_accumulator += temp_b;
    dummy_accumulator += temp_c;
    dummy_accumulator += temp_d;
    }

return dummy_accumulator[0] + dummy_accumulator[1] + dummy_accumulator[2] + dummy_accumulator[3];
}

uint32_t test_harness(void)
{
uint8_t test_vec[] = {0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0x8,0x9,0xA,0xB,0xC,0xD,0xE,0xF};
return test_function(test_vec, sizeof(test_vec));
}

I've seen VLD4, but that de-interleaves the bytes into a 4x4-transposed layout, which I don't want. If I do calculate on the transposed layout (I'd prefer not to; there's a cost for the rest of the math, not shown), my first pass was:

uint32_t test_function(const uint8_t * samples, unsigned num_samples/*always divisible by 16*/)
{
#define splat_u32x4(a){(a),(a),(a),(a)}
static const uint32x4_t mask_a = splat_u32x4(0xffUL);
static const uint32x4_t mask_b = splat_u32x4(0xffUL<<8);
static const uint32x4_t mask_c = splat_u32x4(0xffUL<<16);
static const uint32x4_t mask_d = splat_u32x4(0xffUL<<24);

uint32x4_t dummy_accumulator = {0,0,0,0};
for(unsigned x = 0; x < num_samples; x += 16)
    {
    /*Begin section I'd like help with*/
    uint8x16_t pxvect = *((uint8x16_t*)(samples + x));

    uint32x4_t temp_a = ((uint32x4_t)pxvect & mask_a) >> 0; /*holds{0x0,0x4,0x8,0xC}*/
    uint32x4_t temp_b = ((uint32x4_t)pxvect & mask_b) >> 8; /*holds{0x1,0x5,0x9,0xD}*/
    uint32x4_t temp_c = ((uint32x4_t)pxvect & mask_c) >> 16;/*holds{0x2,0x6,0xA,0xE}*/
    uint32x4_t temp_d = ((uint32x4_t)pxvect & mask_d) >> 24;/*holds{0x3,0x7,0xB,0xF}*/
    /*End section I'd like help with.*/

    /*Sum the values to produce a return value*/
    dummy_accumulator += temp_a;
    dummy_accumulator += temp_b;
    dummy_accumulator += temp_c;
    dummy_accumulator += temp_d;
    }

return dummy_accumulator[0] + dummy_accumulator[1] + dummy_accumulator[2] + dummy_accumulator[3];
}

I'd like to do this operation of loading 16 bytes and spreading them into 4x zero-extended uint32x4_t registers as quickly as possible, ideally in linear order rather than 4x4 transposed. Is there a better way to do so?

CodePudding user response:

Alternatively - use widening adds with zero:

// Widen 16x U8 to 2x 8x U16 values
vaddl_u8()
vaddl_high_u8()

// Widen the low 8x U16 to 2x 4x U32 values
vaddl_u16()
vaddl_high_u16()

// Widen the high 8x U16 to 2x 4x U32 values
vaddl_u16()
vaddl_high_u16()

This still ends up as 6 ops, but because each of these instructions writes only a single result register, it may be faster than the vzip approach.
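For reference, a minimal sketch of how those widening adds could be strung together as AArch64 intrinsics inside the loop from the question (pxvect and temp_a..temp_d follow the question's names; zero8/zero16 and lo16/hi16 are names I've made up, and the zero constants can be hoisted out of the loop):

uint8x16_t zero8  = vdupq_n_u8(0);
uint16x8_t zero16 = vdupq_n_u16(0);

/* Widen 16x U8 to 2x 8x U16 values */
uint16x8_t lo16 = vaddl_u8(vget_low_u8(pxvect), vget_low_u8(zero8));
uint16x8_t hi16 = vaddl_high_u8(pxvect, zero8);

/* Widen the low 8x U16 to 2x 4x U32 values */
uint32x4_t temp_a = vaddl_u16(vget_low_u16(lo16), vget_low_u16(zero16));
uint32x4_t temp_b = vaddl_high_u16(lo16, zero16);

/* Widen the high 8x U16 to 2x 4x U32 values */
uint32x4_t temp_c = vaddl_u16(vget_low_u16(hi16), vget_low_u16(zero16));
uint32x4_t temp_d = vaddl_high_u16(hi16, zero16);

That's the six widening adds per 16-byte block; the vget_low selects just pick the lower half of a register and should mostly disappear in the generated code.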

CodePudding user response:

I'd use vzip for this one:

q0 = 16 packed byte values
q1 = Zeros
q2 = Zeros
q3 = Zeros

vzip.u8  q0, q2 // Interleave u8 and zeros to get u16 values
vzip.u16 q0, q1 // Interleave u16 and zeros to get u32 values
vzip.u16 q2, q3 // Interleave u16 and zeros to get u32 values

Values end up in linear order in q0, q1, q2, q3. The downside of vzip is that it clobbers the zero registers, so q1/q2/q3 need to be reloaded with zeros on every iteration.
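Since the target here is AArch64 (Cortex-A55), where the two-destination VZIP form doesn't exist, the same idea maps onto ZIP1/ZIP2. A rough intrinsics sketch, assuming little-endian and reusing pxvect/temp_a..temp_d from the question (the other names are mine):

uint8x16_t zero8  = vdupq_n_u8(0);
uint16x8_t zero16 = vdupq_n_u16(0);

/* Interleave u8 values with zeros to get u16 values */
uint16x8_t lo16 = vreinterpretq_u16_u8(vzip1q_u8(pxvect, zero8)); /* bytes 0..7  */
uint16x8_t hi16 = vreinterpretq_u16_u8(vzip2q_u8(pxvect, zero8)); /* bytes 8..15 */

/* Interleave u16 values with zeros to get u32 values */
uint32x4_t temp_a = vreinterpretq_u32_u16(vzip1q_u16(lo16, zero16)); /* bytes 0..3   */
uint32x4_t temp_b = vreinterpretq_u32_u16(vzip2q_u16(lo16, zero16)); /* bytes 4..7   */
uint32x4_t temp_c = vreinterpretq_u32_u16(vzip1q_u16(hi16, zero16)); /* bytes 8..11  */
uint32x4_t temp_d = vreinterpretq_u32_u16(vzip2q_u16(hi16, zero16)); /* bytes 12..15 */

Because each ZIP1/ZIP2 writes a single destination register, the zero vectors aren't clobbered in this form and can stay live across loop iterations.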
