Is functional programming in Rust zero-cost?

I have done some functional style programming tests in Rust.

// NOTE(review): this is pre-1.0 Rust (`10000000u`, `rand::task_rng()`,
// `range(..)`, `Duration::span`) and will not compile on a modern toolchain.
let iteration = 10000000u;
let mut rng = rand::task_rng();

// Variant 1: hand-written `while` loop. Measured: PT0.38S
println!("while: {}", std::time::Duration::span(|| {
    let mut i = 0;
    let mut num = 0i64;
    while i < iteration {
        num += rng.gen::<i64>();
        i += 1;
    }
}));

// Variant 2: imperative `for` loop over a range. Measured: PT0.37S
println!("for: {}", std::time::Duration::span(|| {
    let mut num = 0i64;
    for _ in range(0, iteration) {
        num += rng.gen::<i64>();
    }
}));

// Variant 3: functional style — fold over an RNG iterator. Measured: PT0.37S
println!("fold: {}", std::time::Duration::span(|| {
    rng.gen_iter::<i64>().take(iteration).fold(0, |x, y| x + y);
}));

I compiled it with the optimization flag enabled. All three cases took almost the same time. Does this mean that functional programming in Rust is truly zero-cost?

+6
source share
2 answers

Standard performance disclaimer: as always, you should benchmark your code in your own situation and understand the trade-offs involved. Start with clear code and optimize it only if necessary.

Here are the three variants broken out into separate functions and marked so they are never inlined. I also reduced the number of iterations:

// NOTE(review): pre-1.0 Rust (`uint`, `rand::task_rng()`, `range(..)`);
// shown here so each variant can be compared in the emitted LLVM IR.
static ITERATION: uint = 10000u;

// Variant 1: hand-written `while` loop summing ITERATION random i64s.
// `#[inline(never)]` keeps each variant a distinct function in the IR.
#[inline(never)]
fn add_manual<R>(rng: &mut R) -> i64 where R: Rng {
    let mut num = 0i64;
    let mut i = 0;
    while i < ITERATION {
        num += rng.gen();
        i += 1;
    }
    num
}

// Variant 2: imperative `for` loop over a range.
#[inline(never)]
fn add_range<R>(rng: &mut R) -> i64 where R: Rng {
    let mut num = 0i64;
    for _ in range(0, ITERATION) {
        num += rng.gen();
    }
    num
}

// Variant 3: functional style — fold over an infinite RNG iterator,
// truncated to ITERATION elements.
#[inline(never)]
fn add_fold<R>(rng: &mut R) -> i64 where R: Rng {
    rng.gen_iter().take(ITERATION).fold(0i64, |x, y| x + y)
}

fn main() {
    let rng = &mut rand::task_rng();
    let a = add_manual(rng);
    let b = add_range(rng);
    let c = add_fold(rng);
    // Print the sums so the optimizer cannot eliminate the work.
    println!("{}, {}, {}", a, b, c);
}

And the corresponding LLVM IR, from rustc -O --emit ir :

 define internal fastcc i64 @_ZN10add_manual20h7014841169593652668E(%"struct.std::rand::TaskRng[#1]"* noalias nocapture dereferenceable(8)) unnamed_addr #3 { entry-block: br label %while_body while_exit: ; preds = %while_body %.lcssa = phi i64 [ %2, %while_body ] ret i64 %.lcssa while_body: ; preds = %while_body, %entry-block %num.06 = phi i64 [ 0, %entry-block ], [ %2, %while_body ] %i.05 = phi i64 [ 0, %entry-block ], [ %3, %while_body ] %1 = tail call i64 @_ZN4rand11TaskRng.Rng8next_u6420ha021c9365b1d84f2mqmE(%"struct.std::rand::TaskRng[#1]"* noalias dereferenceable(8) %0) %2 = add i64 %1, %num.06 %3 = add i64 %i.05, 1 %exitcond = icmp eq i64 %3, 10000 br i1 %exitcond, label %while_exit, label %while_body } define internal fastcc i64 @_ZN9add_range20h9502859027349728313E(%"struct.std::rand::TaskRng[#1]"* noalias nocapture dereferenceable(8)) unnamed_addr #3 { entry-block: br label %for_body for_exit: ; preds = %for_body %.lcssa = phi i64 [ %4, %for_body ] ret i64 %.lcssa for_body: ; preds = %for_body, %entry-block %num.011 = phi i64 [ 0, %entry-block ], [ %4, %for_body ] %1 = phi i64 [ 0, %entry-block ], [ %2, %for_body ] %2 = add i64 %1, 1 %3 = tail call i64 @_ZN4rand11TaskRng.Rng8next_u6420ha021c9365b1d84f2mqmE(%"struct.std::rand::TaskRng[#1]"* noalias dereferenceable(8) %0) %4 = add i64 %3, %num.011 %exitcond = icmp eq i64 %2, 10000 br i1 %exitcond, label %for_exit, label %for_body } define internal fastcc i64 @_ZN8add_fold20h4641190056208329853E(%"struct.std::rand::TaskRng[#1]"* noalias nocapture dereferenceable(8)) unnamed_addr #3 { for_body.lr.ph.i: br label %for_body.i for_body.i: ; preds = %for_body.i, %for_body.lr.ph.i %1 = phi i64 [ 10000, %for_body.lr.ph.i ], [ %2, %for_body.i ] %accum.06.i = phi i64 [ 0, %for_body.lr.ph.i ], [ %4, %for_body.i ] %2 = add i64 %1, -1 %3 = tail call i64 @_ZN4rand11TaskRng.Rng8next_u6420ha021c9365b1d84f2mqmE(%"struct.std::rand::TaskRng[#1]"* noalias dereferenceable(8) %0), !noalias !122 %4 = add i64 %3, %accum.06.i %5 = 
icmp eq i64 %2, 0 br i1 %5, label %_ZN4iter11IteratorExt4fold20h2099293433551620075E.exit, label %for_body.i _ZN4iter11IteratorExt4fold20h2099293433551620075E.exit: ; preds = %for_body.i %.lcssa = phi i64 [ %4, %for_body.i ] ret i64 %.lcssa } 

You can see that add_manual and add_range are basically the same, except for the position of the add. add_fold is also similar, but it counts down from 10000 instead of counting up. In this case, the optimizer really can make them essentially identical. Now let us use the built-in benchmarking:

// Built-in libtest benchmarks: one `#[bench]` function per variant,
// each measured via `Bencher::iter`.
#[bench]
fn bench_add_manual(b: &mut Bencher) {
    let rng = &mut rand::task_rng();
    b.iter(|| add_manual(rng));
}

#[bench]
fn bench_add_range(b: &mut Bencher) {
    let rng = &mut rand::task_rng();
    b.iter(|| add_range(rng));
}

#[bench]
fn bench_add_fold(b: &mut Bencher) {
    let rng = &mut rand::task_rng();
    b.iter(|| add_fold(rng));
}

Results:

 test bench_add_fold ... bench: 540468 ns/iter (+/- 72181) test bench_add_manual ... bench: 543172 ns/iter (+/- 56946) test bench_add_range ... bench: 634156 ns/iter (+/- 96143) 

That looks like noise to me. I would say that, in this case, at the moment, there is no significant difference in performance. However, this will not necessarily hold for every possible bit of functional-style code.

+7
source

As a rule of thumb, fold (reduce) can be compiled into code equivalent to an efficient hand-written loop, thereby saving the programmer's time. Notably, the recursion in a fold is in tail position, so it is really just another way of writing a loop.

This does not apply to all programs that you write in a functional style.

+1
source

Source: https://habr.com/ru/post/980115/


All Articles