#include <stdio.h>
#include <iostream>
#include <random>
#include <cstdlib> // for rand()
using namespace std;
volatile int res = 0;
void copy(char* __restrict__ dst, char* __restrict__ src) {
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[2];
    dst[3] = src[3];
}
void copyOffset(char* __restrict__ dst, char* __restrict__ src, size_t offset) {
    dst[0] = src[offset + 0];
    dst[1] = src[offset + 1];
    dst[2] = src[offset + 2];
    dst[3] = src[offset + 3];
}
void copyAsInt(char *dst, char *src) {
    *((int*)dst) = *((int*)src);
}
//----
void copy16(char* __restrict__ dst, char* __restrict__ src) {
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[2];
    dst[3] = src[3];
    dst[4] = src[4];
    dst[5] = src[5];
    dst[6] = src[6];
    dst[7] = src[7];
    dst[8] = src[8];
    dst[9] = src[9];
    dst[10] = src[10];
    dst[11] = src[11];
    dst[12] = src[12];
    dst[13] = src[13];
    dst[14] = src[14];
    dst[15] = src[15];
}
void copyOffset16(char* __restrict__ dst, char* __restrict__ src, size_t offset) {
    dst[0] = src[offset + 0];
    dst[1] = src[offset + 1];
    dst[2] = src[offset + 2];
    dst[3] = src[offset + 3];
    dst[4] = src[offset + 4];
    dst[5] = src[offset + 5];
    dst[6] = src[offset + 6];
    dst[7] = src[offset + 7];
    dst[8] = src[offset + 8];
    dst[9] = src[offset + 9];
    dst[10] = src[offset + 10];
    dst[11] = src[offset + 11];
    dst[12] = src[offset + 12];
    dst[13] = src[offset + 13];
    dst[14] = src[offset + 14];
    dst[15] = src[offset + 15];
}
int main() {
    char *a = new char[1001], *b = new char[16];
    //--- which pairs of statements below are unsafe or not equivalent to each other?
    copyOffset(b, a, 20);
    res = b[rand() % 4]; // use b[] for something to prevent optimization
    copy(b, &a[20]);
    res = b[rand() % 4];
    //--- not 4-byte aligned
    copyOffset(b, a, 18);
    res = b[rand() % 4];
    copy(b, &a[18]);
    res = b[rand() % 4];
    //---
    copyOffset16(b, a, 26);
    res = b[rand() % 16];
    copy(b, &a[26]);
    res = b[rand() % 16];
    return 1;
}
I'm trying to copy 4 bytes (both source and destination are guaranteed to be allocated). However, the source address might not be 4-byte aligned. To copy the 4 bytes, I expect the compiler to emit a single DWORD copy, like in copyAsInt(). I'm compiling with the -O3 -mavx flags and using godbolt with gcc 11.2 to inspect the assembly.
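As an aside (an illustrative sketch, not part of the code I tested above), the same 4-byte copy can also be written with std::memcpy, which avoids the pointer cast in copyAsInt() and which GCC and Clang typically lower to a single 4-byte load and store when the size is a compile-time constant:

#include <cstring>
// illustrative only: a fixed-size memcpy expresses the same 4-byte copy
void copyOffsetMemcpy(char* __restrict__ dst, const char* __restrict__ src, std::size_t offset) {
    std::memcpy(dst, src + offset, 4); // usually compiled to one dword load + one dword store
}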
The function copy() is translated to the same code as copyAsInt(), as expected. However, for some reason, copyOffset() is translated into copying each byte separately.
copy(char*, char*):
        mov     eax, DWORD PTR [rsi]
        mov     DWORD PTR [rdi], eax
        ret
copyOffset(char*, char*, unsigned long):
        movzx   eax, BYTE PTR [rsi+rdx]
        mov     BYTE PTR [rdi], al
        movzx   eax, BYTE PTR [rsi+1+rdx]
        mov     BYTE PTR [rdi+1], al
        movzx   eax, BYTE PTR [rsi+2+rdx]
        mov     BYTE PTR [rdi+2], al
        movzx   eax, BYTE PTR [rsi+3+rdx]
        mov     BYTE PTR [rdi+3], al
        ret
Meanwhile, the functions copy16() and copyOffset16() are both vectorized as expected.
copy16(char*, char*):
        vmovdqu xmm0, XMMWORD PTR [rsi]
        vmovdqu XMMWORD PTR [rdi], xmm0
        ret
copyOffset16(char*, char*, unsigned long):
        vmovdqu xmm0, XMMWORD PTR [rsi+rdx]
        vmovdqu XMMWORD PTR [rdi], xmm0
        ret
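Incidentally (again just an illustrative sketch, not something I had on godbolt), a fixed-size 16-byte memcpy is typically lowered to the same pair of unaligned vector moves:

#include <cstring>
// illustrative only: the 16-byte case expressed through memcpy
void copyOffset16Memcpy(char* __restrict__ dst, const char* __restrict__ src, std::size_t offset) {
    std::memcpy(dst, src + offset, 16); // usually an unaligned 16-byte load + store (e.g. vmovdqu)
}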
So why isn't copyOffset() optimized by the compiler to use a DWORD mov? Also, is there any pair of statements in main() that is unsafe or might behave unexpectedly?
Edit: switching to x86-64 gcc (trunk) makes gcc emit the expected instruction, so I guess this behavior is just down to compiler heuristics.
CodePudding user response:
Because it is more efficient that way. XMM (SSE) and other SIMD instructions in general tend to be heavyweight and as such have fairly high latency, and the compiler takes this into account.
This obsession with SSE/AVX is really overblown. SSE/AVX can be useful in very specific optimizations, but in 99% of the places where compilers use them they are actually ineffective.
Anecdotal evidence, but I once recompiled a very large, optimized binary with -march=x86-64, which disables almost all SIMD extensions, and the performance was actually better than with -march=native.
A simple MOVB (byte move) operation has a latency of 1 cycle, and up to 4 such operations can execute at the same time on Intel Skylake, so the 8 instructions there can effectively execute in 2 cycles.
The MOVUPS instruction has a latency of about 5 cycles with a maximum throughput of 2 at a time, depending on the platform, and you need two of them: one from memory into %xmm0 and one from %xmm0 back to memory, so roughly 5 to 10 cycles in total.
Of course, things are far more complex than this because of pipeline behavior, micro-ops, ports, the L1/L2/L3 caches, and so on, but this is a first-order attempt to explain what the compiler is doing.
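A rough way to sanity-check this (just a sketch under assumptions, not a measurement made for this answer): time the byte-wise copy against a memcpy-based 4-byte copy. The __attribute__((noinline)), the volatile sink, and the iteration count are arbitrary choices meant to keep the compiler from optimizing the loops away; a serious measurement would also pin the core and the CPU frequency.

#include <cstring>
#include <cstdio>
#include <chrono>

// sketch: noinline is GCC/Clang-specific and only here so the calls are not folded away
__attribute__((noinline))
void copyBytes(char* __restrict__ dst, const char* __restrict__ src, std::size_t offset) {
    dst[0] = src[offset + 0];
    dst[1] = src[offset + 1];
    dst[2] = src[offset + 2];
    dst[3] = src[offset + 3];
}

__attribute__((noinline))
void copyDword(char* __restrict__ dst, const char* __restrict__ src, std::size_t offset) {
    std::memcpy(dst, src + offset, 4); // typically one 4-byte load + store
}

int main() {
    static char src[1024] = {};
    char dst[4] = {};
    volatile char sink = 0;
    const long iters = 100000000L; // arbitrary iteration count

    auto bench = [&](auto fn, const char* name) {
        auto t0 = std::chrono::steady_clock::now();
        for (long i = 0; i < iters; ++i)
            fn(dst, src, (std::size_t)(i & 15) + 1); // misaligned offsets, as in the question
        sink = dst[0]; // keep the result observable
        auto t1 = std::chrono::steady_clock::now();
        long long ms = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
        std::printf("%s: %lld ms\n", name, ms);
    };

    bench(copyBytes, "byte-by-byte");
    bench(copyDword, "memcpy(4)");
    return sink ? 1 : 0;
}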
Reference: https://www.agner.org/optimize/instruction_tables.pdf