Say I have a 256-bit-wide vector like this:
00000000 00000000 11100110 00000000
00000000 00000000 00000000 00000000
00000000 00000000 10000101 00000000
00000000 00000000 01111110 00000000
00000000 00000000 00000000 00000000
00000000 00000000 00000000 00000000
00000000 00000000 00001100 00000000
00000000 00000000 00000000 00000000
What would be the most efficient way to get an 8-bit mask
that looks something like this: 10110010,
where every set bit represents a 32-bit integer lane that is > 0,
using AVX2 and only instructions that both AMD and Intel support?
CodePudding user response:
Assuming signed integer lanes:
// Build an 8-bit mask from a vector of eight signed 32-bit lanes:
// bit i of the result is set when lane i of `vec` is strictly greater than 0
// (bit 0 corresponds to the lowest lane). Requires AVX2.
// Declared `static inline` because in C99/C11 a plain `inline` definition
// emits no external symbol, so a call that is not inlined fails to link.
static inline uint8_t positiveMask_epi32( __m256i vec )
{
    // Signed compare: a lane becomes all ones where lane > 0, all zeros otherwise
    const __m256i isPositive = _mm256_cmpgt_epi32( vec, _mm256_setzero_si256() );
    // movemask_ps gathers the top bit of each 32-bit lane into bits 0..7
    const int mask = _mm256_movemask_ps( _mm256_castsi256_ps( isPositive ) );
    // Only the low 8 bits can ever be set, so the narrowing cast loses nothing
    return (uint8_t)mask;
}
If they’re unsigned integers, "> 0" is the same as "!= 0", so test for equality with zero and invert the result:
// Build an 8-bit mask from a vector of eight unsigned 32-bit lanes:
// bit i of the result is set when lane i of `vec` is non-zero
// (bit 0 corresponds to the lowest lane). Requires AVX2.
// AVX2 has no unsigned 32-bit greater-than compare, so the function tests
// for equality with zero and inverts the collected bits instead.
// Declared `static inline` because in C99/C11 a plain `inline` definition
// emits no external symbol, so a call that is not inlined fails to link.
static inline uint8_t nonZeroMask_epu32( __m256i vec )
{
    // A lane becomes all ones where lane == 0, all zeros otherwise
    const __m256i isZero = _mm256_cmpeq_epi32( vec, _mm256_setzero_si256() );
    // movemask_ps gathers the top bit of each 32-bit lane into bits 0..7
    const int zeroMask = _mm256_movemask_ps( _mm256_castsi256_ps( isZero ) );
    // Flip the low 8 bits so that a set bit means "lane is non-zero"
    return (uint8_t)( zeroMask ^ 0xFF );
}