Is there a fast way to count the number of unique elements in a simd vector (AVX and any SSE) without converting to array? I want to use it in a specific bruteforcer as an optimization so I want it ideally to be as fast as possible.
Currently Iam doing:
// count the number of unique elements
int uniqueCount(v16n a) {
alignas(16) unsigned char v[16];
_mm_store_si128((v16n*)v, a);
int count = 1;
for(int i = 1; i < 16; i ) {
int j;
for(j = 0; j < i; j )
if(v[i] == v[j])
break;
if(i == j) count ;
}
return count;
}
CodePudding user response:
Here’s one possible implementation. The code requires SSSE3, SSE 4.1, and slightly benefits from AVX2 when available.
// Count unique bytes in the vector
size_t countUniqueBytes( __m128i vec )
{
size_t result = 1;
// Accumulator for the bytes encountered so far, initialize with broadcasted first byte
#ifdef __AVX2__
__m128i partial = _mm_broadcastb_epi8( vec );
#else
__m128i partial = _mm_shuffle_epi8( vec, _mm_setzero_si128() );
#endif
// Permutation vector to broadcast these bytes
const __m128i one = _mm_set1_epi8( 1 );
__m128i perm = one;
// If you use GCC, uncomment following line and benchmark, may or may not help:
// #pragma GCC unroll 1
for( int i = 1; i < 16; i )
{
// Broadcast i-th byte from the source vector
__m128i bc = _mm_shuffle_epi8( vec, perm );
perm = _mm_add_epi8( perm, one );
// Compare bytes with the partial vector
__m128i eq = _mm_cmpeq_epi8( bc, partial );
// Append current byte to the partial vector
partial = _mm_alignr_epi8( bc, partial, 1 );
// Increment result if the byte was not yet in the partial vector
// Compilers are smart enough to do that with `sete` instruction, no branches
int isUnique = _mm_testz_si128( eq, eq );
result = ( isUnique ? (size_t)1 : (size_t)0 );
}
return result;
}