I was working on mandelbrot algorithm to learn Rust and I found out that empty 25mil(approx 6k image) loop takes 0.5s. I found it quite slow. So I went to test it in python and found out, it takes almost the same time. Has really python's for loop almost zero cost abstraction? Is this really the best I can get with intel i7?
Rust:
use std::time::Instant;
fn main() {
let before = Instant::now();
for i in 0..5000 {
for j in 0..5000 {}
}
println!("Elapsed time: {:.2?}", before.elapsed());
}
>>> Elapsed time: 406.90ms
Python:
import time
s = time.time()
for i in range(5000):
for j in range(5000):
pass
print(time.time()-s)
>>> 0.5715351104736328
UPDATE: If I use initialized tuple instead range, python is even faster than rust -> 0.33s
CodePudding user response:
If you're doing performance testing always build with --release
. By default Cargo builds with debugging information enabled and optimizations disabled. The optimizer will completely eliminate these loops. On the Playground it drops from 975ms to 1.25µs.
Let's take a look at the assembly on Godbolt for just the loops, no timer:
pub fn main() {
for i in 0..5000 {
for j in 0..5000 {}
}
}
<i32 as core::iter::range::Step>::forward_unchecked:
push rax
mov eax, esi
add edi, eax
mov dword ptr [rsp 4], edi
mov eax, dword ptr [rsp 4]
mov dword ptr [rsp], eax
mov eax, dword ptr [rsp]
pop rcx
ret
core::intrinsics::copy_nonoverlapping:
push rax
mov qword ptr [rsp], rsi
mov rsi, rdi
mov rdi, qword ptr [rsp]
shl rdx, 2
call memcpy@PLT
pop rax
ret
core::cmp::impls::<impl core::cmp::PartialOrd for i32>::lt:
mov eax, dword ptr [rdi]
cmp eax, dword ptr [rsi]
setl al
and al, 1
movzx eax, al
ret
core::mem::replace:
sub rsp, 40
mov qword ptr [rsp], rdi
mov dword ptr [rsp 12], esi
mov byte ptr [rsp 23], 0
mov byte ptr [rsp 23], 1
mov rax, qword ptr [rip core::ptr::read@GOTPCREL]
call rax
mov ecx, eax
mov dword ptr [rsp 16], ecx
jmp .LBB3_1
.LBB3_1:
mov esi, dword ptr [rsp 12]
mov rdi, qword ptr [rsp]
mov byte ptr [rsp 23], 0
mov rcx, qword ptr [rip core::ptr::write@GOTPCREL]
call rcx
jmp .LBB3_4
.LBB3_2:
test byte ptr [rsp 23], 1
jne .LBB3_8
jmp .LBB3_7
mov rcx, rax
mov eax, edx
mov qword ptr [rsp 24], rcx
mov dword ptr [rsp 32], eax
jmp .LBB3_2
.LBB3_4:
mov eax, dword ptr [rsp 16]
add rsp, 40
ret
.LBB3_5:
jmp .LBB3_2
mov rcx, rax
mov eax, edx
mov qword ptr [rsp 24], rcx
mov dword ptr [rsp 32], eax
jmp .LBB3_5
.LBB3_7:
mov rdi, qword ptr [rsp 24]
call _Unwind_Resume@PLT
ud2
.LBB3_8:
jmp .LBB3_7
core::ptr::read:
sub rsp, 24
mov qword ptr [rsp 8], rdi
mov eax, dword ptr [rsp 20]
mov dword ptr [rsp 16], eax
jmp .LBB4_2
.LBB4_2:
mov rdi, qword ptr [rsp 8]
lea rsi, [rsp 16]
mov edx, 1
call qword ptr [rip core::intrinsics::copy_nonoverlapping@GOTPCREL]
mov eax, dword ptr [rsp 16]
mov dword ptr [rsp 4], eax
mov eax, dword ptr [rsp 4]
add rsp, 24
ret
core::ptr::write:
sub rsp, 4
mov dword ptr [rsp], esi
mov eax, dword ptr [rsp]
mov dword ptr [rdi], eax
add rsp, 4
ret
core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next:
push rax
call qword ptr [rip <core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next@GOTPCREL]
mov dword ptr [rsp], eax
mov dword ptr [rsp 4], edx
mov edx, dword ptr [rsp 4]
mov eax, dword ptr [rsp]
pop rcx
ret
core::clone::impls::<impl core::clone::Clone for i32>::clone:
mov eax, dword ptr [rdi]
ret
<I as core::iter::traits::collect::IntoIterator>::into_iter:
mov edx, esi
mov eax, edi
ret
<core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next:
sub rsp, 40
mov rsi, rdi
mov qword ptr [rsp 16], rsi
mov rdi, rsi
add rsi, 4
call core::cmp::impls::<impl core::cmp::PartialOrd for i32>::lt
mov byte ptr [rsp 31], al
mov al, byte ptr [rsp 31]
test al, 1
jne .LBB9_3
jmp .LBB9_2
.LBB9_2:
mov dword ptr [rsp 32], 0
jmp .LBB9_7
.LBB9_3:
mov rdi, qword ptr [rsp 16]
call core::clone::impls::<impl core::clone::Clone for i32>::clone
mov dword ptr [rsp 12], eax
mov edi, dword ptr [rsp 12]
mov esi, 1
call <i32 as core::iter::range::Step>::forward_unchecked
mov dword ptr [rsp 8], eax
mov esi, dword ptr [rsp 8]
mov rdi, qword ptr [rsp 16]
call qword ptr [rip core::mem::replace@GOTPCREL]
mov dword ptr [rsp 4], eax
mov eax, dword ptr [rsp 4]
mov dword ptr [rsp 36], eax
mov dword ptr [rsp 32], 1
.LBB9_7:
mov eax, dword ptr [rsp 32]
mov edx, dword ptr [rsp 36]
add rsp, 40
ret
example::main:
sub rsp, 72
mov dword ptr [rsp 24], 0
mov dword ptr [rsp 28], 5000
mov edi, dword ptr [rsp 24]
mov esi, dword ptr [rsp 28]
call qword ptr [rip <I as core::iter::traits::collect::IntoIterator>::into_iter@GOTPCREL]
mov dword ptr [rsp 16], eax
mov dword ptr [rsp 20], edx
mov eax, dword ptr [rsp 20]
mov ecx, dword ptr [rsp 16]
mov dword ptr [rsp 32], ecx
mov dword ptr [rsp 36], eax
.LBB10_2:
mov rax, qword ptr [rip core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next@GOTPCREL]
lea rdi, [rsp 32]
call rax
mov dword ptr [rsp 44], edx
mov dword ptr [rsp 40], eax
mov eax, dword ptr [rsp 40]
test rax, rax
je .LBB10_5
jmp .LBB10_13
.LBB10_13:
jmp .LBB10_6
ud2
.LBB10_5:
add rsp, 72
ret
.LBB10_6:
mov dword ptr [rsp 48], 0
mov dword ptr [rsp 52], 5000
mov edi, dword ptr [rsp 48]
mov esi, dword ptr [rsp 52]
call qword ptr [rip <I as core::iter::traits::collect::IntoIterator>::into_iter@GOTPCREL]
mov dword ptr [rsp 8], eax
mov dword ptr [rsp 12], edx
mov eax, dword ptr [rsp 12]
mov ecx, dword ptr [rsp 8]
mov dword ptr [rsp 56], ecx
mov dword ptr [rsp 60], eax
.LBB10_8:
mov rax, qword ptr [rip core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next@GOTPCREL]
lea rdi, [rsp 56]
call rax
mov dword ptr [rsp 68], edx
mov dword ptr [rsp 64], eax
mov eax, dword ptr [rsp 64]
test rax, rax
je .LBB10_11
jmp .LBB10_14
.LBB10_14:
jmp .LBB10_12
ud2
.LBB10_11:
jmp .LBB10_2
.LBB10_12:
jmp .LBB10_8
__rustc_debug_gdb_scripts_section__:
.asciz "\001gdb_load_rust_pretty_printers.py"
DW.ref.rust_eh_personality:
.quad rust_eh_personality
example::main:
ret