Please tell me how I can add values held in SIMD vectors of the same type (__m128i) when the values themselves occupy a different number of bytes in each vector.
Here's an example:
// Question's example: loads 16 ints and a 16-byte mask into SSE registers.
// NOTE(review): no #include is shown; __m128i and _mm_loadu_si128 presumably
// come from <emmintrin.h> (SSE2) - confirm.
// None of the loaded vectors is used further in this snippet.
int main()
{
//--------------------------------------------------------------
// 16 ints do not fit into one 128-bit register (assuming 4-byte int), so the
// array is loaded as four vectors of 4 ints each, starting at indices 0, 4, 8, 12.
int my_int_sequence[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
// _mm_loadu_si128 is the unaligned load; the array has no alignment specifier.
__m128i my_int_sequence_m128i_1 = _mm_loadu_si128((__m128i*) & my_int_sequence[0]);
__m128i my_int_sequence_m128i_2 = _mm_loadu_si128((__m128i*) & my_int_sequence[4]);
__m128i my_int_sequence_m128i_3 = _mm_loadu_si128((__m128i*) & my_int_sequence[8]);
__m128i my_int_sequence_m128i_4 = _mm_loadu_si128((__m128i*) & my_int_sequence[12]);
//--------------------------------------------------------------
//-----------------------------------------------------------------------
// 16 one-byte values: all of them fit into a single 128-bit register.
char my_char_mask[16] = { 1,0,1,1,0,1,0,1,1,1,0,1,0,1,0,1 };
__m128i my_char_mask_m128i = _mm_loadu_si128((__m128i*) &my_char_mask[0]);
//-----------------------------------------------------------------------
}
That is, I have 16 int values in the my_int_sequence array, and since 16 int values do not fit in one __m128i vector, I load them four at a time into four __m128i vectors.
I also have an array of 16 bytes, which I loaded into the my_char_mask_m128i vector.
Now I want to add, to each 4-byte value in the my_int_sequence_m128i_x vectors, the corresponding one-byte value from the my_char_mask_m128i vector.
The obvious problem is that I need to add elements of different widths. Is it possible?
Perhaps I need to widen each byte of the my_char_mask_m128i vector to 4 bytes - how can I do that?
CodePudding user response:
Perhaps I need to widen each byte of the my_char_mask_m128i vector to 4 bytes - how can I do that?
You're looking for the SSE4.1 intrinsic _mm_cvtepi8_epi32(), which takes the first 4 (signed) 8-bit integers in the SSE vector and sign-extends them into 32-bit integers. Combine that with some shifting to move the next 4 into place for the next extension, and you get something like:
#include <iostream>
#include <cstdint>
#include <emmintrin.h>
#include <smmintrin.h>
// Writes the four signed 32-bit lanes of `vec` to std::cout in the form
// "[a, b, c, d]", lowest lane first, with no trailing newline.
void print_int4(__m128i vec) {
  // _mm_store_si128 needs a 16-byte-aligned destination, hence alignas(16).
  alignas(16) std::int32_t lanes[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(lanes), vec);

  std::cout << '[';
  for (int lane = 0; lane < 4; ++lane) {
    if (lane != 0) {
      std::cout << ", ";
    }
    std::cout << lanes[lane];
  }
  std::cout << ']';
}
// Demo: adds each 8-bit value of my_char_mask to the corresponding 32-bit
// value of my_int_sequence, four lanes at a time, printing every step as
// "[ints] [widened chars] = [sums]".
// Requires SSE4.1 for _mm_cvtepi8_epi32 (build with e.g. -msse4.1).
int main(void) {
  // Both arrays are 16-byte aligned so the aligned load intrinsics are valid.
  alignas(16) std::int32_t
      my_int_sequence[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  alignas(16) std::int8_t
      my_char_mask[16] = { 1,0,1,1,0,1,0,1,1,1,0,1,0,1,0,1 };

  // All 16 mask bytes fit in a single 128-bit register.
  __m128i char_mask =
      _mm_load_si128(reinterpret_cast<const __m128i*>(my_char_mask));

  // Loop through the 32-bit int array 4 at a time
  for (int n = 0; n < 16; n += 4) {
    // Load the next 4 ints; n is a multiple of 4, so the address stays
    // 16-byte aligned.
    __m128i vec =
        _mm_load_si128(reinterpret_cast<const __m128i*>(my_int_sequence + n));
    // Convert the next 4 chars (the low 4 bytes of char_mask) to ints
    __m128i chars_to_add = _mm_cvtepi8_epi32(char_mask);
    // Shift out those 4 chars: a 4-byte right shift of the whole register
    // moves the next 4 chars into the low bytes for the following iteration.
    char_mask = _mm_srli_si128(char_mask, 4);
    // And add together
    __m128i sum = _mm_add_epi32(vec, chars_to_add);

    print_int4(vec);
    std::cout << " ";
    print_int4(chars_to_add);
    std::cout << " = ";
    print_int4(sum);
    std::cout << '\n';
  }
  return 0;
}
Example (note that you usually have to tell your compiler to generate SSE4.1 instructions - with g++ and clang++, use the appropriate -march=XXXX option or -msse4.1):
$ g++ -O -Wall -Wextra -std=gnu++11 -msse4.1 demo.cc
$ ./a.out
[0, 1, 2, 3] [1, 0, 1, 1] = [1, 1, 3, 4]
[4, 5, 6, 7] [0, 1, 0, 1] = [4, 6, 6, 8]
[8, 9, 10, 11] [1, 1, 0, 1] = [9, 10, 10, 12]
[12, 13, 14, 15] [0, 1, 0, 1] = [12, 14, 14, 16]
Alternative version suggested by Peter Cordes, if your compiler is new enough to have _mm_loadu_si32():
// Loop through the 32-bit int array 4 at a time
for (int n = 0; n < 16; n += 4) {
  // Load the next 4 ints
  __m128i vec =
      _mm_load_si128(reinterpret_cast<const __m128i*>(my_int_sequence + n));
  // Load the next 4 chars: a 32-bit unaligned load straight from the byte
  // array, so no register shifting is needed between iterations.
  __m128i char_mask = _mm_loadu_si32(my_char_mask + n);
  // Convert them to ints (sign-extends the low 4 bytes to 4 x 32-bit)
  __m128i chars_to_add = _mm_cvtepi8_epi32(char_mask);
  // And add together
  __m128i sum = _mm_add_epi32(vec, chars_to_add);
  // Do more stuff
}