I have these two source files:
const ARR_LEN: usize = 128 * 1024;
pub fn plain_mod_test(x: &[u64; ARR_LEN], m: u64, result: &mut [u64; ARR_LEN]) {
    for i in 0..ARR_LEN {
        result[i] = x[i] % m;
    }
}
and
#include <stdint.h>
#define ARR_LEN (128 * 1024)
void plain_mod_test(uint64_t *x, uint64_t m, uint64_t *result) {
    for (int i = 0; i < ARR_LEN; i++) {
        result[i] = x[i] % m;
    }
}
Is my C code a good approximation to the Rust code?
When I compile the C code on godbolt.org with x86-64 gcc 12.2 and -O3, I get the sensible:
plain_mod_test:
        mov rax, rdx
        mov r8, rdx
        xor ecx, ecx
.L2:
        mov rax, QWORD PTR [rdi+rcx]
        xor edx, edx
        div rsi
        mov QWORD PTR [r8+rcx], rdx
        add rcx, 8
        cmp rcx, 1048576
        jne .L2
        ret
But when I do the same for rustc 1.66 with -C opt-level=3, I get this verbose output:
example::plain_mod_test:
        push rax
        test rsi, rsi
        je .LBB0_10
        mov r8, rdx
        xor ecx, ecx
        jmp .LBB0_2
.LBB0_7:
        xor edx, edx
        div rsi
        mov qword ptr [r8 + 8*rcx + 8], rdx
        mov rcx, r9
        cmp r9, 131072
        je .LBB0_9
.LBB0_2:
        mov rax, qword ptr [rdi + 8*rcx]
        mov rdx, rax
        or rdx, rsi
        shr rdx, 32
        je .LBB0_3
        xor edx, edx
        div rsi
        jmp .LBB0_5
.LBB0_3:
        xor edx, edx
        div esi
.LBB0_5:
        mov qword ptr [r8 + 8*rcx], rdx
        mov rax, qword ptr [rdi + 8*rcx + 8]
        lea r9, [rcx + 2]
        mov rdx, rax
        or rdx, rsi
        shr rdx, 32
        jne .LBB0_7
        xor edx, edx
        div esi
        mov qword ptr [r8 + 8*rcx + 8], rdx
        mov rcx, r9
        cmp r9, 131072
        jne .LBB0_2
.LBB0_9:
        pop rax
        ret
.LBB0_10:
        lea rdi, [rip + str.0]
        lea rdx, [rip + .L__unnamed_1]
        mov esi, 57
        call qword ptr [rip + core::panicking::panic@GOTPCREL]
        ud2
How do I write Rust code which compiles to assembly similar to that produced by gcc for C?
Update: When I compile the C code with clang 12.0.0 and -O3, I get output which looks far more like the Rust assembly than the GCC/C assembly.
i.e. This looks like a GCC vs clang issue, rather than a C vs Rust difference.
plain_mod_test: # @plain_mod_test
        mov r8, rdx
        xor ecx, ecx
        jmp .LBB0_1
.LBB0_6: # in Loop: Header=BB0_1 Depth=1
        xor edx, edx
        div rsi
        mov qword ptr [r8 + 8*rcx + 8], rdx
        add rcx, 2
        cmp rcx, 131072
        je .LBB0_8
.LBB0_1: # =>This Inner Loop Header: Depth=1
        mov rax, qword ptr [rdi + 8*rcx]
        mov rdx, rax
        or rdx, rsi
        shr rdx, 32
        je .LBB0_2
        xor edx, edx
        div rsi
        jmp .LBB0_4
.LBB0_2: # in Loop: Header=BB0_1 Depth=1
        xor edx, edx
        div esi
.LBB0_4: # in Loop: Header=BB0_1 Depth=1
        mov qword ptr [r8 + 8*rcx], rdx
        mov rax, qword ptr [rdi + 8*rcx + 8]
        mov rdx, rax
        or rdx, rsi
        shr rdx, 32
        jne .LBB0_6
        xor edx, edx
        div esi
        mov qword ptr [r8 + 8*rcx + 8], rdx
        add rcx, 2
        cmp rcx, 131072
        jne .LBB0_1
.LBB0_8:
        ret
CodePudding user response:
Don’t compare apples to crabs.
Most of the difference between the assembly outputs is due to loop unrolling, which the LLVM code generator used by rustc does much more aggressively than GCC’s. When you compile the same C code with Clang 15, it outputs:
        mov r8, rdx
        xor ecx, ecx
        jmp .LBB0_1
.LBB0_6:
        xor edx, edx
        div rsi
        mov qword ptr [r8 + 8*rcx + 8], rdx
        add rcx, 2
        cmp rcx, 131072
        je .LBB0_8
.LBB0_1:
        mov rax, qword ptr [rdi + 8*rcx]
        mov rdx, rax
        or rdx, rsi
        shr rdx, 32
        je .LBB0_2
        xor edx, edx
        div rsi
        jmp .LBB0_4
.LBB0_2:
        xor edx, edx
        div esi
.LBB0_4:
        mov qword ptr [r8 + 8*rcx], rdx
        mov rax, qword ptr [rdi + 8*rcx + 8]
        mov rdx, rax
        or rdx, rsi
        shr rdx, 32
        jne .LBB0_6
        xor edx, edx
        div esi
        mov qword ptr [r8 + 8*rcx + 8], rdx
        add rcx, 2
        cmp rcx, 131072
        jne .LBB0_1
.LBB0_8:
        ret
which is pretty much the same as the Rust version.
Using Clang with -Os results in assembly much closer to that of GCC:
        mov r8, rdx
        xor ecx, ecx
.LBB0_1:
        mov rax, qword ptr [rdi + 8*rcx]
        xor edx, edx
        div rsi
        mov qword ptr [r8 + 8*rcx], rdx
        inc rcx
        cmp rcx, 131072
        jne .LBB0_1
        ret
Likewise, rustc with -C opt-level=s produces:
        push rax
        test rsi, rsi
        je .LBB6_4
        mov r8, rdx
        xor ecx, ecx
.LBB6_2:
        mov rax, qword ptr [rdi + 8*rcx]
        xor edx, edx
        div rsi
        mov qword ptr [r8 + 8*rcx], rdx
        lea rax, [rcx + 1]
        mov rcx, rax
        cmp rax, 131072
        jne .LBB6_2
        pop rax
        ret
.LBB6_4:
        lea rdi, [rip + str.0]
        lea rdx, [rip + .L__unnamed_1]
        mov esi, 57
        call qword ptr [rip + core::panicking::panic@GOTPCREL]
        ud2
Of course, there is still the check whether m is zero, leading to a panicking branch. To eliminate that branch without using unstable intrinsics, you can use std::hint::unreachable_unchecked():
const ARR_LEN: usize = 128 * 1024;
pub unsafe fn plain_mod_test(x: &[u64; ARR_LEN], m: u64, result: &mut [u64; ARR_LEN]) {
    if m == 0 { std::hint::unreachable_unchecked() }
    for i in 0..ARR_LEN {
        result[i] = x[i] % m;
    }
}
Now the function compiles to assembly identical to Clang's. This of course makes the function unsafe, but such is the price of imitating C.
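If you want to keep the function safe, another option (just a sketch, not from the answer above, and assuming Rust 1.51+, where u64 % NonZeroU64 is implemented in the standard library) is to take the divisor as std::num::NonZeroU64, so the type system guarantees m is non-zero and no panicking branch is needed inside the function:
use std::num::NonZeroU64;
const ARR_LEN: usize = 128 * 1024;
// The NonZeroU64 divisor cannot be zero, so the remainder cannot panic
// and no unsafe is needed inside the function.
pub fn plain_mod_test_nonzero(x: &[u64; ARR_LEN], m: NonZeroU64, result: &mut [u64; ARR_LEN]) {
    for i in 0..ARR_LEN {
        result[i] = x[i] % m;
    }
}
The caller performs the zero check once, e.g. via NonZeroU64::new(m), outside the hot loop.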
CodePudding user response:
rustc uses the LLVM back-end optimizer, so compare against clang. LLVM unrolls small loops by default.
Recent LLVM is also tuning for Intel CPUs before Ice Lake, where div r64 is much slower than div r32, so much slower that it's worth branching on.
It's checking whether a uint64_t actually fits in a uint32_t, and using 32-bit operand-size for div when it does. The shr/je pair implements if ((dividend|divisor)>>32 == 0) use 32-bit, i.e. it checks that the high halves of both operands are all zero. If it checked the high half of m once and made two versions of the loop, the test would be simpler, but this code will bottleneck on division throughput anyway.
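Written out as source, that branchy code-gen is doing roughly the following per element (an illustrative sketch, not code from the answer; rem_opportunistic is a hypothetical name):
// If both the dividend and the divisor fit in 32 bits, use the cheap
// 32-bit div; otherwise fall back to the slower 64-bit div.
fn rem_opportunistic(dividend: u64, divisor: u64) -> u64 {
    if (dividend | divisor) >> 32 == 0 {
        ((dividend as u32) % (divisor as u32)) as u64
    } else {
        dividend % divisor
    }
}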
This opportunistic div r32 code-gen will eventually become obsolete: Ice Lake's integer divider is wide enough that 64-bit operand-size no longer needs far more micro-ops, so performance just depends on the actual values, regardless of whether there are an extra 32 bits of zeros above them. AMD has been like that for a while.
But Intel sold a lot of CPUs based on re-spins of Skylake (including Cascade Lake servers, and client CPUs up to Comet Lake); while those are still in widespread use, LLVM's -mtune=generic should probably keep doing this.
For more details:
- Trial-division code runs 2x faster as 32-bit on Windows than 64-bit on Linux (a case where we know at compile-time the 64-bit integers will only hold small values, and rewriting the binary to use 32-bit operand-size with zero other changes to alignment or machine code makes it 3x faster on my Skylake CPU.)
- uops for integer DIV instruction
- Can 128bit/64bit hardware unsigned division be faster in some cases than 64bit/32bit division on x86-64 Intel/AMD CPUs?