Standard performance disclaimer: as always, you should measure your own code in your own situation and understand what the trade-offs are. Start with clear code and speed it up only if necessary.
Here are the functions, broken out and marked so they are never inlined. I also reduced the iteration count:
static ITERATION: uint = 10000u;

#[inline(never)]
fn add_manual<R>(rng: &mut R) -> i64 where R: Rng {
    let mut num = 0i64;
    let mut i = 0;
    while i < ITERATION {
        num += rng.gen();
        i += 1;
    }
    num
}

#[inline(never)]
fn add_range<R>(rng: &mut R) -> i64 where R: Rng {
    let mut num = 0i64;
    for _ in range(0, ITERATION) {
        num += rng.gen();
    }
    num
}

#[inline(never)]
fn add_fold<R>(rng: &mut R) -> i64 where R: Rng {
    rng.gen_iter().take(ITERATION).fold(0i64, |x, y| x + y)
}

fn main() {
    let rng = &mut rand::task_rng();
    let a = add_manual(rng);
    let b = add_range(rng);
    let c = add_fold(rng);
    println!("{}, {}, {}", a, b, c);
}
And the corresponding LLVM IR, from rustc -O --emit ir:
define internal fastcc i64 @_ZN10add_manual20h7014841169593652668E(%"struct.std::rand::TaskRng[#1]"* noalias nocapture dereferenceable(8)) unnamed_addr #3 {
entry-block:
  br label %while_body

while_exit:                              ; preds = %while_body
  %.lcssa = phi i64 [ %2, %while_body ]
  ret i64 %.lcssa

while_body:                              ; preds = %while_body, %entry-block
  %num.06 = phi i64 [ 0, %entry-block ], [ %2, %while_body ]
  %i.05 = phi i64 [ 0, %entry-block ], [ %3, %while_body ]
  %1 = tail call i64 @_ZN4rand11TaskRng.Rng8next_u6420ha021c9365b1d84f2mqmE(%"struct.std::rand::TaskRng[#1]"* noalias dereferenceable(8) %0)
  %2 = add i64 %1, %num.06
  %3 = add i64 %i.05, 1
  %exitcond = icmp eq i64 %3, 10000
  br i1 %exitcond, label %while_exit, label %while_body
}

define internal fastcc i64 @_ZN9add_range20h9502859027349728313E(%"struct.std::rand::TaskRng[#1]"* noalias nocapture dereferenceable(8)) unnamed_addr #3 {
entry-block:
  br label %for_body

for_exit:                                ; preds = %for_body
  %.lcssa = phi i64 [ %4, %for_body ]
  ret i64 %.lcssa

for_body:                                ; preds = %for_body, %entry-block
  %num.011 = phi i64 [ 0, %entry-block ], [ %4, %for_body ]
  %1 = phi i64 [ 0, %entry-block ], [ %2, %for_body ]
  %2 = add i64 %1, 1
  %3 = tail call i64 @_ZN4rand11TaskRng.Rng8next_u6420ha021c9365b1d84f2mqmE(%"struct.std::rand::TaskRng[#1]"* noalias dereferenceable(8) %0)
  %4 = add i64 %3, %num.011
  %exitcond = icmp eq i64 %2, 10000
  br i1 %exitcond, label %for_exit, label %for_body
}

define internal fastcc i64 @_ZN8add_fold20h4641190056208329853E(%"struct.std::rand::TaskRng[#1]"* noalias nocapture dereferenceable(8)) unnamed_addr #3 {
for_body.lr.ph.i:
  br label %for_body.i

for_body.i:                              ; preds = %for_body.i, %for_body.lr.ph.i
  %1 = phi i64 [ 10000, %for_body.lr.ph.i ], [ %2, %for_body.i ]
  %accum.06.i = phi i64 [ 0, %for_body.lr.ph.i ], [ %4, %for_body.i ]
  %2 = add i64 %1, -1
  %3 = tail call i64 @_ZN4rand11TaskRng.Rng8next_u6420ha021c9365b1d84f2mqmE(%"struct.std::rand::TaskRng[#1]"* noalias dereferenceable(8) %0), !noalias !122
  %4 = add i64 %3, %accum.06.i
  %5 = icmp eq i64 %2, 0
  br i1 %5, label %_ZN4iter11IteratorExt4fold20h2099293433551620075E.exit, label %for_body.i

_ZN4iter11IteratorExt4fold20h2099293433551620075E.exit: ; preds = %for_body.i
  %.lcssa = phi i64 [ %4, %for_body.i ]
  ret i64 %.lcssa
}
You can see that add_manual and add_range are basically the same, differing only in where the adds happen. add_fold is also similar, except that it counts down from 10000 instead of counting up. In this case, the optimizer really can make all three essentially the same; a hand-written loop with that count-down shape is sketched below.
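To make the count-down transformation concrete without reading IR, here is roughly what the optimized add_fold loop corresponds to back in Rust. This is a sketch for illustration only, not compiler output, and the name add_fold_shape is mine:

#[inline(never)]
fn add_fold_shape<R>(rng: &mut R) -> i64 where R: Rng {
    // Same body as add_manual, but the induction variable counts down
    // from ITERATION to 0 instead of up, matching the IR for add_fold.
    let mut num = 0i64;
    let mut remaining = ITERATION;
    while remaining > 0 {
        num += rng.gen();
        remaining -= 1;
    }
    num
}

Each of the three loops boils down to one call to next_u64 and one add per iteration; only the direction of the counter differs.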
Now let's measure them with the built-in benchmark harness:

#[bench]
fn bench_add_manual(b: &mut Bencher) {
    let rng = &mut rand::task_rng();
    b.iter(|| add_manual(rng));
}

#[bench]
fn bench_add_range(b: &mut Bencher) {
    let rng = &mut rand::task_rng();
    b.iter(|| add_range(rng));
}

#[bench]
fn bench_add_fold(b: &mut Bencher) {
    let rng = &mut rand::task_rng();
    b.iter(|| add_fold(rng));
}
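These #[bench] functions need the usual scaffolding to compile; here is a sketch of it (the single-file layout and the file name bench.rs are my assumptions, not part of the original code):

// Harness sketch: on this pre-1.0 toolchain, the #[bench] runner
// comes from the unstable `test` crate.
extern crate test;

use std::rand;
use std::rand::Rng;
use test::Bencher;

// add_manual, add_range and add_fold exactly as defined earlier,
// followed by the three #[bench] functions above.

Build it as a test crate and run only the benchmarks, for example with rustc -O --test bench.rs && ./bench --bench; depending on the exact nightly, a feature gate for the unstable test crate may also be needed.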
Results:
test bench_add_fold   ... bench:    540468 ns/iter (+/- 72181)
test bench_add_manual ... bench:    543172 ns/iter (+/- 56946)
test bench_add_range  ... bench:    634156 ns/iter (+/- 96143)
That looks about even to me. I would say that, in this case, at this point in time, there is no significant difference in performance. However, this will not necessarily hold for every possible piece of code written in a functional style.