I am trying to compile the following code, using GCC's loop-specific pragma to tell the compiler how many times to unroll a counted loop.
#include <vector>
// Fills a 16 x 512 table (stored row-major in a flat vector) with i * j.
// The pragma applies to the loop that immediately follows it, i.e. the outer
// i-loop, and asks GCC to unroll that loop by the given factor.
int main() {
    std::vector<int> v(8192);  // 16 rows * 512 columns
#pragma GCC unroll 8 // the question also tries a factor of 16 (the full trip count)
    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 512; j++) {
            v[i*512 + j] = i*j;
        }
    }
    return 0;
}
When I place the #pragma GCC unroll 8
before the outer for
loop, the compiler doesn't unroll it:
# GCC output (Intel syntax) for the `#pragma GCC unroll 8` case: the outer loop
# is NOT unrolled. ecx is the outer counter i; the inner loop is vectorized and
# writes 4 ints (16 bytes) per iteration.
.L3:                                    # outer loop, one pass per i
        movd    xmm7, ecx               # xmm7[0] = i
        mov     rax, rsi                # rax = store cursor, starts at this row's base
        movdqa  xmm2, xmm6              # reset the j vector; xmm6 is set before this excerpt (presumably {0,1,2,3})
        pshufd  xmm3, xmm7, 0           # xmm3 = {i,i,i,i}
        movdqa  xmm4, xmm3
        psrlq   xmm4, 32                # copy of i positioned for the odd-lane multiply
.L4:                                    # inner loop
        movdqa  xmm0, xmm2
        movdqa  xmm1, xmm3
        paddd   xmm2, xmm5              # advance the j vector; xmm5 is set before this excerpt (presumably {4,4,4,4})
        add     rax, 16
        pmuludq xmm1, xmm0              # 64-bit products of dword lanes 0 and 2
        psrlq   xmm0, 32                # bring lanes 1 and 3 down into lanes 0 and 2
        pmuludq xmm0, xmm4              # 64-bit products of original lanes 1 and 3
        pshufd  xmm1, xmm1, 8           # keep the low 32 bits of each product in lanes 0,1
        pshufd  xmm0, xmm0, 8
        punpckldq xmm1, xmm0            # interleave -> four 32-bit products i*j
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx                # rdx = end of the current row
        jne     .L4
        add     ecx, 1                  # ++i
        add     rsi, 2048               # next row base: 512 ints * 4 bytes
        lea     rdx, [rax+2048]         # end of the next row (rax == end of current row here)
        cmp     ecx, 16
        jne     .L3
        mov     rdi, rbp                # pointer argument for operator delete
        mov     esi, 16384              # size argument for sized operator delete
                                        # NOTE(review): 16384 bytes is 4096 ints, but the C++ snippet declares
                                        # v(8192); the listing may come from a different vector size - confirm
                                        # against the godbolt link.
        call    _ZdlPvm                 # operator delete(void*, unsigned long)
        xor     eax, eax                # return 0
        pop     rbp
        ret
But when I place the #pragma GCC unroll 16
before the outer for
loop, the compiler unrolls the outer loop successfully:
# GCC output (Intel syntax) for the `#pragma GCC unroll 16` case: the outer loop
# is fully unrolled into one vectorized inner loop per value of i. Each row is
# 2048 bytes (512 ints). Because i is a compile-time constant in every copy,
# the multiply i*j is strength-reduced to shifts and adds/subtracts.
# xmm2 holds the initial j vector loaded from .LC0 and xmm0 the per-iteration
# step loaded from .LC1 (presumably {0,1,2,3} and {4,4,4,4}; the constants are
# not shown in this excerpt).
.L2:                                    # i = 0: the row is all zeros, so it is zero-filled
        lea     rdi, [rbp+8]
        mov     rcx, rbp
        movdqa  xmm2, XMMWORD PTR .LC0[rip]
        xor     eax, eax                # fill value for rep stosq
        and     rdi, -8                 # rdi = first 8-byte-aligned address above rbp
        movdqa  xmm0, XMMWORD PTR .LC1[rip]
        mov     QWORD PTR [rbp+0], 0    # first qword of the row, stored separately
        lea     rdx, [rbp+4096]         # end of row i = 1
        sub     rcx, rdi
        movdqa  xmm1, xmm2
        mov     QWORD PTR [rbp+2040], 0 # last qword of the row, stored separately
        add     ecx, 2048
        shr     ecx, 3                  # number of qwords left for rep stosq
        rep stosq
        lea     rax, [rbp+2048]         # store cursor = start of row i = 1
.L3:                                    # i = 1: store j
        movdqa  xmm3, xmm1
        add     rax, 16
        paddd   xmm1, xmm0
        movups  XMMWORD PTR [rax-16], xmm3
        cmp     rax, rdx
        jne     .L3
        lea     rdx, [rbp+6144]         # end of row i = 2
        movdqa  xmm3, xmm2              # reset the j vector
.L4:                                    # i = 2: j << 1
        movdqa  xmm1, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        pslld   xmm1, 1
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L4
        lea     rdx, [rbp+8192]         # end of row i = 3
        movdqa  xmm3, xmm2
.L5:                                    # i = 3: (j << 1) + j
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 1
        paddd   xmm1, xmm4
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L5
        mov     rax, rdx
        movdqa  xmm3, xmm2
        lea     rdx, [rbp+10240]        # end of row i = 4
.L6:                                    # i = 4: j << 2
        movdqa  xmm1, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        pslld   xmm1, 2
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L6
        mov     rdx, rax                # rdx becomes the store cursor for the next loop
        movdqa  xmm3, xmm2
        lea     rax, [rbp+12288]        # end of row i = 5
.L7:                                    # i = 5: (j << 2) + j
        movdqa  xmm4, xmm3
        add     rdx, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 2
        paddd   xmm1, xmm4
        movups  XMMWORD PTR [rdx-16], xmm1
        cmp     rax, rdx
        jne     .L7
        lea     rdx, [rbp+14336]        # end of row i = 6
        movdqa  xmm3, xmm2
.L8:                                    # i = 6: ((j << 1) + j) << 1
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 1
        paddd   xmm1, xmm4
        pslld   xmm1, 1
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L8
        movdqa  xmm3, xmm2
.L9:                                    # i = 7: (j << 3) - j
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 3
        psubd   xmm1, xmm4
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rbx                # rbx is set before this excerpt (presumably rbp+16384, the end of this row)
        jne     .L9
        lea     rdx, [rbp+18432]        # end of row i = 8
        movdqa  xmm3, xmm2
.L10:                                   # i = 8: j << 3
        movdqa  xmm1, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        pslld   xmm1, 3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L10
        lea     rdx, [rbp+20480]        # end of row i = 9
        movdqa  xmm3, xmm2
.L11:                                   # i = 9: (j << 3) + j
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 3
        paddd   xmm1, xmm4
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L11
        lea     rax, [rbp+22528]        # end of row i = 10 (rdx is the cursor in the next loop)
        movdqa  xmm3, xmm2
.L12:                                   # i = 10: ((j << 2) + j) << 1
        movdqa  xmm4, xmm3
        add     rdx, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 2
        paddd   xmm1, xmm4
        pslld   xmm1, 1
        movups  XMMWORD PTR [rdx-16], xmm1
        cmp     rax, rdx
        jne     .L12
        lea     rdx, [rbp+24576]        # end of row i = 11
        movdqa  xmm4, xmm2
.L13:                                   # i = 11: (((j << 1) + j) << 2) - j
        movdqa  xmm3, xmm4
        add     rax, 16
        paddd   xmm4, xmm0
        movdqa  xmm1, xmm3
        pslld   xmm1, 1
        paddd   xmm1, xmm3
        pslld   xmm1, 2
        psubd   xmm1, xmm3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L13
        lea     rdx, [rbp+26624]        # end of row i = 12
        movdqa  xmm3, xmm2
.L14:                                   # i = 12: ((j << 1) + j) << 2
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 1
        paddd   xmm1, xmm4
        pslld   xmm1, 2
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L14
        lea     rdx, [rbp+28672]        # end of row i = 13
        movdqa  xmm4, xmm2
.L15:                                   # i = 13: (((j << 1) + j) << 2) + j
        movdqa  xmm3, xmm4
        add     rax, 16
        paddd   xmm4, xmm0
        movdqa  xmm1, xmm3
        pslld   xmm1, 1
        paddd   xmm1, xmm3
        pslld   xmm1, 2
        paddd   xmm1, xmm3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L15
        lea     rdx, [rbp+30720]        # end of row i = 14
        movdqa  xmm3, xmm2
.L16:                                   # i = 14: ((j << 3) - j) << 1
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 3
        psubd   xmm1, xmm4
        pslld   xmm1, 1
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L16
        mov     rax, rdx
        lea     rdx, [rbp+32768]        # end of row i = 15
.L17:                                   # i = 15: (j << 4) - j; xmm2 itself is the j vector here
        movdqa  xmm3, xmm2
        add     rax, 16
        paddd   xmm2, xmm0
        movdqa  xmm1, xmm3
        pslld   xmm1, 4
        psubd   xmm1, xmm3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L17
        mov     rdi, rbp                # pointer argument for operator delete
        mov     esi, 16384              # size argument for sized operator delete
        call    _ZdlPvm                 # operator delete(void*, unsigned long)
        add     rsp, 8
        xor     eax, eax                # return 0
        pop     rbx
        pop     rbp
        ret
So does the compiler unroll the outer loop only when it can unroll it completely?
GCC version: g++ (Compiler-Explorer-Build-gcc-b8ef019ab938471f7f877a1eee3a6374fd8a6ae9-binutils-2.36.1) 12.0.0 20211029 (experimental)
Option: -O2
godbolt: https://godbolt.org/z/zq7TWesY9
CodePudding user response:
https://godbolt.org/z/PT6T1691W it seems that -O2 -funroll-loops
does the trick, apparently that option needs to be on for the pragma to tell GCC how much to unroll. (Update: Or at least makes it have some effect. See comments, this doesn't seem to be a complete answer yet.)
(-funroll-loops
is not on by default unless you use -fprofile-use
, after doing a -fprofile-generate
run and running the program with representative input. It used to be on by default at -O3 a long time ago, but code bloat + I-cache pressure usually made that worse for loops that aren't hot. This leads to bass-ackwards situations where the loop where GCC spends most of its time is a few instructions long with SIMD, but the fully-unrolled scalar prologue / epilogue are 10x the number of instructions, especially with wider vectors. Even with AVX-512, GCC usually just uses scalar for odd numbers of elements, not creating a mask. :/)
Fully unrolling loops is something GCC will do even at -O2
, at least for very small trip-counts. (e.g. up to 3 for an int
array p[i] = 1;
, with -O2 -fno-tree-vectorize
). https://godbolt.org/z/P5rvjYj1b
Fully-unrolling larger loops or higher trip counts (when the static code size would increase from doing so, perhaps) is not on by default at -O2
it seems. (GCC calls this peeling a loop in their tuning options/parameters, i.e. peeling all the iterations out of the loop so it goes away. -fpeel-loops
is on with -O3
, but not -O2
. Since GCC11, -fverbose-asm
no longer prints a list of optimization options enabled as asm comments.)
And BTW, it seems auto-vectorization is on by default at -O2
now in GCC trunk. Previously it was only on at -O3
, so that's interesting.