I've met a strange bug of g 9.3.0 -O2 on linux
The code below is converted from my code of the SJT algorithm.
If I keep the last line init
in generate
, the time cost is 1200 ms
.
if I delete it, the time cost is 600 ms
.
This bug appears on ubuntu20.04 with g 9.3.0. I've tested it on win10 and macOS with g 9.3.0, the bug doesn't appear. I've also tested it on linux with g 8 and g 10, the bug doesn't appear, either.
Here is the code. The original question is 69468547.
I want to know what causes this strange "time cost double" behavior?
20211008: I reproduce this bug in another way. Here is the whole code.I execute the strange_func
(SJT algorithm) twice in generate
, the first one's time cost is 653ms
and the second one's is 1322ms
. You can reproduce the bug with gcc9.3.0 on linux. I've also tried gcc10, there is no bug.
#include <cstdio>
#include <cstring>
#include <chrono>
using namespace std::chrono;
#define MAXN 100
struct Permutation {
int N;
char s[2*MAXN];
int r[MAXN];
inline void init() {
memset(s, 0, sizeof(s));
memset(r, 0, sizeof(r));
}
void generate(int n) {
N = n;
init();
auto start = steady_clock::now();
strange_func();
auto end = steady_clock::now();
auto duration = duration_cast<milliseconds>(end - start);
printf("time cost(ms): %ld\n", duration.count());
init();
}
void strange_func() {
int k = N, t = -1;
while (true) {
r[N] = 1;
if (r[N] < N) {
char c = s[k]; s[k] = s[k t]; s[k t] = c;
k = t;
} else {
int i = N;
while (r[i] == i)
r[i] = 0, r[--i] = 1;
if (i == 0) break;
t = 0;
}
}
}
} perm;
int main() {
int n;
scanf("%d", &n);
perm.generate(n);
return 0;
}
CodePudding user response:
The fact that init()
is called after the strange_func()
function call change the generated assembly code of the variable swap (between s[k]
and s[k t]
) in the loop in strange_func()
! The apparent minor assembly change has a huge impact on the performance as the loop is very sensitive to micro-optimizations and the generated code with the init()
is clearly less efficient. Such a change is likely due to fragile compiler heuristics (with a clear chaotic behaviour in this specific case) and the fact the strange_func()
function call is inlined.
To understand what is going on, let us analyse the assembly generated by the two variants.
Here is the assembly code of the hot loop without (left) and with (right) init()
:
.L2: | .L2:
add ecx, 1 | add esi, 1
mov DWORD PTR 12[rbx rdx*4], ecx | mov DWORD PTR 12[r12 rdx*4], esi
cmp r8d, ecx | cmp ecx, esi
jle .L3 | jle .L3
|
.L13: | .L13:
movsx r9, eax | movsx r9, eax
add eax, esi | add eax, edi
add ecx, 1 | add esi, 1
movsx rdi, eax | movzx r11d, BYTE PTR 4[r12 r9]
movzx r11d, BYTE PTR 4[rbx r9] | movsx r8, eax
mov DWORD PTR 12[rbx rdx*4], ecx | mov DWORD PTR 12[r12 rdx*4], esi
movzx r14d, BYTE PTR 4[rbx rdi] | mov BYTE PTR 15[rsp], r11b
| movzx r11d, BYTE PTR 4[r12 r8]
mov BYTE PTR 4[rbx r9], r14b | mov BYTE PTR 4[r12 r9], r11b
| movzx r9d, BYTE PTR 15[rsp]
mov BYTE PTR 4[rbx rdi], r11b | mov BYTE PTR 4[r12 r8], r9b
cmp r8d, ecx | cmp ecx, esi
jg .L13 | jg .L13
|
.L3: | .L3:
jne .L9 | jne .L9
mov rsi, r10 | mov rdi, r10
mov ecx, r8d | mov esi, ecx
.p2align 4,,10 | .p2align 4,,10
.p2align 3 | .p2align 3
|
.L6: | .L6:
mov edi, DWORD PTR 200[rsi] | mov r11d, DWORD PTR 200[rdi]
sub ecx, 1 | sub esi, 1
sub rsi, 4 | sub rdi, 4
mov DWORD PTR 208[rsi], 0 | mov DWORD PTR 208[rdi], 0
add edi, 1 | lea r8d, 1[r11]
mov DWORD PTR 204[rsi], edi | mov DWORD PTR 204[rdi], r8d
cmp ecx, edi | cmp esi, r8d
je .L6 | je .L6
test ecx, ecx | test esi, esi
je .L14 | je .L14
|
.L7: | .L7:
mov ecx, DWORD PTR 12[rbx rdx*4] | mov esi, DWORD PTR 12[r12 rdx*4]
xor esi, esi | xor edi, edi
jmp .L2 | jmp .L2
.p2align 4,,10 | .p2align 4,,10
.p2align 3 | .p2align 3
|
.L9: | .L9:
mov ecx, r8d | mov esi, ecx
test ecx, ecx | test esi, esi
jne .L7 | jne .L7
.p2align 4,,10 | .p2align 4,,10
.p2align 3 | .p2align 3
As we can see, the L13 block contains more instructions with the init()
call. The rest of the blocks look similar.
Here is a detailed analysis of the blocks without init()
:
movsx r9, eax
add eax, esi
add ecx, 1
movsx rdi, eax
movzx r11d, BYTE PTR 4[rbx r9] ; Perform r11b=s[k]
mov DWORD PTR 12[rbx rdx*4], ecx ; Perform r[N] =1 (r[N] was stored in ecx previously)
movzx r14d, BYTE PTR 4[rbx rdi] ; Perform r14b=s[k t]
mov BYTE PTR 4[rbx r9], r14b ; Perform s[k]=r14b
mov BYTE PTR 4[rbx rdi], r11b ; Perform s[k t]=r11b
cmp r8d, ecx
jg .L13
Here is a detailed analysis of the blocks with init()
:
movsx r9, eax
add eax, edi
add esi, 1
movzx r11d, BYTE PTR 4[r12 r9]
movsx r8, eax
mov DWORD PTR 12[r12 rdx*4], esi ; Perform r[N] =1 (r[N] was stored in ecx previously)
mov BYTE PTR 15[rsp], r11b ; Perform c = s[k] (c is stored in memory)
movzx r11d, BYTE PTR 4[r12 r8]
mov BYTE PTR 4[r12 r9], r11b ; Perform s[k]=s[k t]
movzx r9d, BYTE PTR 15[rsp]
mov BYTE PTR 4[r12 r8], r9b ; Perform s[k t]=c
cmp ecx, esi
jg .L13
We can see that in the first case, GCC is able to swap s[k]
and s[k t]
efficiently while in the second case, GCC use store one value in a temporary location in the stack which is clearly less efficient. An in-memory swap is clearly less efficient because of the data dependency and L1 cache latency (generally about 3-4 cycles on modern x86 AMD/Intel processors).
Whether this is a bug or just a missing optimization of GCC 9.3.0 is still unclear. However, this is very hard to tell without delving into an old version of the GCC code not actively maintained anymore (since March 12, 2020).
A quick workaround solution to this issue is to tell GCC not to inline the function using __attribute__((noinline))
. Alternatively, it should be possible to tune GCC heuristic parameters (using the GCC command line) so that this does not happen. Another solution would be to optimize the loop to compute several permutations at once so that such micro-optimizations do not matter so much.