A strange bug of g++ 9.3.0 -O2 on Linux-CodePudding

I've run into a strange bug with g++ 9.3.0 -O2 on Linux.

The code below is reduced from my implementation of the SJT algorithm. If I keep the last line, init(), in generate, the time cost is 1200 ms. If I delete it, the time cost is 600 ms.

This bug appears on Ubuntu 20.04 with g++ 9.3.0. I've tested it on Windows 10 and macOS with g++ 9.3.0, and the bug doesn't appear. I've also tested it on Linux with g++ 8 and g++ 10, and the bug doesn't appear there either.

Here is the code. The original question is 69468547.

I want to know what causes this strange "time cost double" behavior?

20211008: I reproduced this bug in another way. Here is the whole code. I execute strange_func (the SJT algorithm) twice in generate; the first call's time cost is 653 ms and the second one's is 1322 ms. You can reproduce the bug with gcc 9.3.0 on Linux. I've also tried gcc 10, and there is no bug.

#include <cstdio>
#include <cstring>
#include <chrono>

using namespace std::chrono;
#define MAXN 100

// Minimal reproduction reduced from an SJT permutation generator.
// A single global instance (`perm`) holds the working state.
//
// Precondition: 0 <= N < MAXN, because r[N] and s[N] are indexed directly.
struct Permutation {
    int N;              // problem size; highest index used in r[] and s[]
    char s[2*MAXN];     // element buffer; adjacent entries are swapped in place
    int r[MAXN];        // per-position counters; r[i] wraps to 0 when it reaches i

    // Zero both working buffers.
    inline void init() {
        memset(s, 0, sizeof(s));
        memset(r, 0, sizeof(r));
    }

    // Run strange_func() for size n and print its wall-clock time in ms.
    // The trailing init() is intentional: it is the call whose presence
    // changes the code GCC 9.3.0 generates for the inlined hot loop.
    void generate(int n) {
        N = n;
        init();
        auto start = steady_clock::now();
        strange_func();
        auto end = steady_clock::now();
        auto duration = duration_cast<milliseconds>(end - start);
        // The tick type is implementation-defined (long on LP64 Linux,
        // long long on Windows), so cast explicitly to match the format.
        printf("time cost(ms): %lld\n", static_cast<long long>(duration.count()));
        init();
    }

    // Hot loop. r[] behaves as a mixed-radix counter (digit i counts up to i),
    // so the loop runs on the order of N! iterations. While r[N] < N the
    // element at s[k] is swapped with its neighbour s[k+t]; otherwise the
    // carry is propagated towards r[0], and the loop ends once the carry
    // reaches index 0.
    void strange_func() {
        int k = N, t = -1;
        while (true) {
            r[N] += 1;
            if (r[N] < N) {
                char c = s[k]; s[k] = s[k + t]; s[k + t] = c;
                k += t;
            } else {
                int i = N;
                // r[i-1] is incremented before it is tested, so the scan
                // always stops at i == 0 at the latest.
                while (r[i] == i)
                    r[i] = 0, r[--i] += 1;
                if (i == 0) break;
                t = 0;
            }
        }
    }
} perm;

int main() {
    int n;
    scanf("%d", &n);
    perm.generate(n);
    return 0;
}

CodePudding user response：

The fact that init() is called after the strange_func() call changes the assembly code generated for the variable swap (between s[k] and s[k+t]) in the loop in strange_func()! This apparently minor assembly change has a huge impact on performance, because the loop is very sensitive to micro-optimizations and the code generated with the init() call is clearly less efficient. Such a change is likely due to fragile compiler heuristics (with clearly chaotic behaviour in this specific case) and the fact that the strange_func() call is inlined.

To understand what is going on, let us analyse the assembly generated by the two variants.

Here is the assembly code of the hot loop without (left) and with (right) init():

.L2:                                            |  .L2:
        add     ecx, 1                          |          add     esi, 1
        mov     DWORD PTR 12[rbx+rdx*4], ecx    |          mov     DWORD PTR 12[r12+rdx*4], esi
        cmp     r8d, ecx                        |          cmp     ecx, esi
        jle     .L3                             |          jle     .L3
                                                |   
.L13:                                           |  .L13:
        movsx   r9, eax                         |          movsx   r9, eax
        add     eax, esi                        |          add     eax, edi
        add     ecx, 1                          |          add     esi, 1
        movsx   rdi, eax                        |          movzx   r11d, BYTE PTR 4[r12+r9]
        movzx   r11d, BYTE PTR 4[rbx+r9]        |          movsx   r8, eax
        mov     DWORD PTR 12[rbx+rdx*4], ecx    |          mov     DWORD PTR 12[r12+rdx*4], esi
        movzx   r14d, BYTE PTR 4[rbx+rdi]       |          mov     BYTE PTR 15[rsp], r11b
                                                |          movzx   r11d, BYTE PTR 4[r12+r8]
        mov     BYTE PTR 4[rbx+r9], r14b        |          mov     BYTE PTR 4[r12+r9], r11b
                                                |          movzx   r9d, BYTE PTR 15[rsp]
        mov     BYTE PTR 4[rbx+rdi], r11b       |          mov     BYTE PTR 4[r12+r8], r9b
        cmp     r8d, ecx                        |          cmp     ecx, esi
        jg      .L13                            |          jg      .L13
                                                |   
.L3:                                            |  .L3:
        jne     .L9                             |          jne     .L9
        mov     rsi, r10                        |          mov     rdi, r10
        mov     ecx, r8d                        |          mov     esi, ecx
        .p2align 4,,10                          |          .p2align 4,,10
        .p2align 3                              |          .p2align 3
                                                |   
.L6:                                            |  .L6:
        mov     edi, DWORD PTR 200[rsi]         |          mov     r11d, DWORD PTR 200[rdi]
        sub     ecx, 1                          |          sub     esi, 1
        sub     rsi, 4                          |          sub     rdi, 4
        mov     DWORD PTR 208[rsi], 0           |          mov     DWORD PTR 208[rdi], 0
        add     edi, 1                          |          lea     r8d, 1[r11]
        mov     DWORD PTR 204[rsi], edi         |          mov     DWORD PTR 204[rdi], r8d
        cmp     ecx, edi                        |          cmp     esi, r8d
        je      .L6                             |          je      .L6
        test    ecx, ecx                        |          test    esi, esi
        je      .L14                            |          je      .L14
                                                |   
.L7:                                            |  .L7:
        mov     ecx, DWORD PTR 12[rbx+rdx*4]    |          mov     esi, DWORD PTR 12[r12+rdx*4]
        xor     esi, esi                        |          xor     edi, edi
        jmp     .L2                             |          jmp     .L2
        .p2align 4,,10                          |          .p2align 4,,10
        .p2align 3                              |          .p2align 3
                                                |   
.L9:                                            |  .L9:
        mov     ecx, r8d                        |          mov     esi, ecx
        test    ecx, ecx                        |          test    esi, esi
        jne     .L7                             |          jne     .L7
        .p2align 4,,10                          |          .p2align 4,,10
        .p2align 3                              |          .p2align 3

As we can see, the L13 block contains more instructions with the init() call. The rest of the blocks look similar.

Here is a detailed analysis of the blocks without init():

movsx   r9, eax
add     eax, esi
add     ecx, 1
movsx   rdi, eax
movzx   r11d, BYTE PTR 4[rbx+r9]                ; Perform r11b=s[k]
mov     DWORD PTR 12[rbx+rdx*4], ecx            ; Perform r[N]+=1 (r[N] was stored in ecx previously)
movzx   r14d, BYTE PTR 4[rbx+rdi]               ; Perform r14b=s[k+t]
mov     BYTE PTR 4[rbx+r9], r14b                ; Perform s[k]=r14b
mov     BYTE PTR 4[rbx+rdi], r11b               ; Perform s[k+t]=r11b
cmp     r8d, ecx
jg      .L13

Here is a detailed analysis of the blocks with init():

movsx   r9, eax
add     eax, edi
add     esi, 1
movzx   r11d, BYTE PTR 4[r12+r9]
movsx   r8, eax
mov     DWORD PTR 12[r12+rdx*4], esi            ; Perform r[N]+=1 (r[N] was stored in esi previously)
mov     BYTE PTR 15[rsp], r11b                  ; Perform c = s[k] (c is stored in memory)
movzx   r11d, BYTE PTR 4[r12+r8]
mov     BYTE PTR 4[r12+r9], r11b                ; Perform s[k]=s[k+t]
movzx   r9d, BYTE PTR 15[rsp]
mov     BYTE PTR 4[r12+r8], r9b                 ; Perform s[k+t]=c
cmp     ecx, esi
jg      .L13

We can see that in the first case, GCC is able to swap s[k] and s[k+t] efficiently, while in the second case, GCC stores one value in a temporary location on the stack, which is clearly less efficient. An in-memory swap is slower because of the data dependency and the L1 cache latency (generally about 3-4 cycles on modern x86 AMD/Intel processors).

Whether this is a bug or just a missing optimization of GCC 9.3.0 is still unclear. However, this is very hard to tell without delving into an old version of the GCC code not actively maintained anymore (since March 12, 2020).

A quick workaround solution to this issue is to tell GCC not to inline the function using __attribute__((noinline)). Alternatively, it should be possible to tune GCC heuristic parameters (using the GCC command line) so that this does not happen. Another solution would be to optimize the loop to compute several permutations at once so that such micro-optimizations do not matter so much.