0

Consider the following example:

use rayon::prelude::*;

fn do_something_expensive_returning_lots_of_data(x: u32) -> u32 {
    x // stand-in for the real computation: returns the input unchanged
}

fn main() {
    let very_large_array = [1, 2, 3, 4];
    let mut h = std::collections::HashSet::new();
    very_large_array.par_iter().for_each({
        |x| {
            let c = do_something_expensive_returning_lots_of_data(*x);
            h.insert(c); // E0596 is reported here: `h` is captured by a `Fn` closure, so it cannot be borrowed mutably
        }
    });
}

I'm getting the following error:

error[E0596]: cannot borrow `h` as mutable, as it is a captured variable in a `Fn` closure
  --> src/main.rs:13:13
   |
13 |             h.insert(c);
   |             ^ cannot borrow as mutable

My intention is to only have do_something_expensive_returning_lots_of_data executed in a multithreaded fashion, then once it executes, have a single-threaded iterator with results of the calls so that I could then safely mutate h. Is this possible with Rayon?

Aiden4
  • 2,504
  • 1
  • 7
  • 24
d33tah
  • 10,999
  • 13
  • 68
  • 158

2 Answers

1

If you want to mutate the `HashSet` from parallelized code, you need to do the `Arc<Mutex<T>>` dance. Why not just collect the results into a `HashSet` instead, like this:

use rayon::prelude::*;

/// Placeholder for the expensive computation; this stub simply returns `x` unchanged.
fn do_something_expensive_returning_lots_of_data(x: u32) -> u32 {
    x
}

/// Runs the expensive function over every element in parallel and gathers the
/// results straight into a `HashSet`, so no shared mutable state is needed.
fn main() {
    let very_large_array = [1, 2, 3, 4];
    // The annotated binding tells `collect` which container to build.
    let h: std::collections::HashSet<_> = very_large_array
        .par_iter()
        .map(|&x| do_something_expensive_returning_lots_of_data(x))
        .collect();
}

(Edited after comment) If you want to process the values as they arrive I suggest using an mpsc channel like this:

use rayon::prelude::*;
use std::sync::mpsc::sync_channel;

/// Placeholder for the expensive computation; this stub simply returns `x` unchanged.
fn do_something_expensive_returning_lots_of_data(x: u32) -> u32 {
    x
}

/// Computes results in parallel and streams them through a bounded channel to a
/// single consumer, which handles them one at a time as they arrive.
fn main() {
    // Bounded channel: `send` blocks once 1024 results are queued and not yet received.
    let (sender, receiver) = sync_channel(1024); // choose appropriate buffer size
    let very_large_array = [1, 2, 3, 4];
    // `rayon::join` is given two closures: a producer and a consumer.
    rayon::join(
        // Producer. `move` transfers `sender` into this closure, so it is dropped when the
        // closure returns; once no sender is left, `recv` in the consumer returns `Err`.
        move || {
            very_large_array.par_iter().for_each(|x| {
                sender
                    .send(do_something_expensive_returning_lots_of_data(*x))
                    .unwrap() // `send` only fails if the receiver has already been dropped
            })
        },
        // Consumer. Receives results sequentially until the channel is closed.
        // NOTE(review): this assumes the consumer really runs alongside the producer. If it
        // does not, the producer would block in `send` once the buffer is full — confirm that
        // the buffer size and thread pool rule this out for the real workload.
        move || {
            while let Ok(x) = receiver.recv() {
                println!("{}", x)
            }
        },
    );
}

(Playground)

HHK
  • 4,852
  • 1
  • 23
  • 40
  • Thanks! That would work, but in my case the structure is a bit more nested: it's actually a `HashMap<_, HashSet<_>>`. Because of that, I need to be able to post-process the data as it arrives, outside of the parallel iterator but not all at once. – d33tah Jul 08 '21 at 15:53
  • I have edited my answer to match this requirement. – HHK Jul 08 '21 at 16:43
0

This one is pretty simple: instead of adding elements to the HashSet one at a time during computation, map the data with your computation and then collect it into a HashSet.

use rayon::prelude::*;

/// Placeholder for the expensive computation; this stub simply returns `x` unchanged.
fn do_something_expensive_returning_lots_of_data(x: u32) -> u32 {
    x
}

/// Maps the expensive function over the array in parallel, collects the results
/// into a `HashSet`, and prints the set.
fn main() {
    let very_large_array = [1u32, 2, 3, 4];
    // Building the pipeline is lazy; the work happens when `collect` drives it.
    let computed = very_large_array
        .into_par_iter()
        .map(do_something_expensive_returning_lots_of_data);
    let h: std::collections::HashSet<_> = computed.collect();
    println!("{:?}", h);
}

Playground

It is important to realize that Rayon has a parallel analog of all of std::iter's components. If you need to extend an existing HashSet you can use the par_extend method.

use rayon::prelude::*;

/// Placeholder for the expensive computation; this stub simply returns `x` unchanged.
fn do_something_expensive_returning_lots_of_data(x: u32) -> u32 {
    x
}

/// Extends an already existing `HashSet` with results computed in parallel,
/// then prints the set.
fn main() {
    let very_large_array = [1u32, 2, 3, 4];
    let mut h = std::collections::HashSet::new();
    // The parallel iterator is lazy; `par_extend` is what actually runs it.
    let computed = very_large_array
        .into_par_iter()
        .map(do_something_expensive_returning_lots_of_data);
    h.par_extend(computed);
    println!("{:?}", h);
}

playground

Aiden4
  • 2,504
  • 1
  • 7
  • 24