0

I'm currently trying to run the code below and I'm getting a strange error.

Join sample

// Column dtypes to force when scanning person.csv.
let mut person_schema = Schema::new();
person_schema.with_column("name".into(), DataType::Utf8);
person_schema.with_column("email".into(), DataType::Utf8);
person_schema.with_column("age".into(), DataType::UInt32);

let person_lf = LazyCsvReader::new("person.csv")
    .with_dtype_overwrite(Some(&SchemaRef::from(person_schema)))
    .finish()?;

// Column dtypes to force when scanning account.csv.
let mut account_schema = Schema::new();
account_schema.with_column("email".into(), DataType::Utf8);
account_schema.with_column("account_no".into(), DataType::Utf8);

// `account_lf` is only moved into the join below, so it does not need to be `mut`.
let account_lf = LazyCsvReader::new("account.csv")
    .with_dtype_overwrite(Some(&SchemaRef::from(account_schema)))
    .finish()?;

// Inner join of both lazy frames on their `email` column.
let join_lf = person_lf.join(
    account_lf,
    [col("email")],
    [col("email")],
    JoinType::Inner,
);

// Request the streaming engine before collecting; the join result is then printed.
println!("{}", join_lf.with_streaming(true).collect()?);

Error

thread 'thread 'thread 'thread 'thread '<unnamed>thread 'thread '<unnamed><unnamed>thread '<unnamed><unnamed>' 
panicked at '<unnamed><unnamed>' panicked at '' 
panicked at '<unnamed>assertion failed: buf.is_empty()' 
panicked at 'assertion failed: buf.is_empty()' 
panicked at '' panicked at '', 
' panicked at 'assertion failed: buf.is_empty()', 
assertion failed: buf.is_empty()' 
panicked at 'assertion failed: buf.is_empty()assertion failed: buf.is_empty()
assertion failed: buf.is_empty()assertion failed: buf.is_empty()', ', ', C:\Users\MSI.cargo\registry\src\github.com-1ecc6299db9ec823\polars-pipe-0.29.0\src\executors\sinks\utils.rs', 
C:\Users\MSI.cargo\registry\src\github.com-1ecc6299db9ec823\polars-pipe-0.29.0\src\executors\sinks\utils.rs', 
', 
C:\Users\MSI.cargo\registry\src\github.com-1ecc6299db9ec823\polars-pipe-0.29.0\src\executors\sinks\utils.rs
C:\Users\MSI.cargo\registry\src\github.com-1ecc6299db9ec823\polars-pipe-0.29.0\src\executors\sinks\utils.rs
C:\Users\MSI.cargo\registry\src\github.com-1ecc6299db9ec823\polars-pipe-0.29.0\src\executors\sinks\utils.rs:
C:\Users\MSI.cargo\registry\src\github.com-1ecc6299db9ec823\polars-pipe-0.29.0\src\executors\sinks\utils.rs
C:\Users\MSI.cargo\registry\src\github.com-1ecc6299db9ec823\polars-pipe-0.29.0\src\executors\sinks\utils.rs:
C:\Users\MSI.cargo\registry\src\github.com-1ecc6299db9ec823\polars-pipe-0.29.0\src\executors\sinks\utils.rs::5:5:5:5::5:5:5:55:5:5:5
:5
5

The file person.csv contains 400K entries and the file account.csv contains 20M entries. The only columns are the ones defined in the schemas. This error only occurs when I use `with_streaming` after joining those datasets.

The entries were generated by another piece of code for sampling purposes, and the error seems to occur regardless of which column or data type I use for the join.

If I try to collect without `with_streaming`, I run into a memory allocation issue.

Code for dataset generation

fn prepare_large_dataset() -> Result<(), csv::Error>{

    let mut person_writer = csv::Writer::from_path("person.csv")?;
    let mut accounts_writer = csv::Writer::from_path("account.csv")?;

    let num_records = 20_000_000;

    for i in 0..num_records {

        let email: String = SafeEmail().fake();

        if i < 450_000 {

            let name: String = Name().fake();
            let age: u32 = rand::thread_rng().gen_range(18..65);

            let person = Person {
                // guid: guid.clone(),
                name: name.clone(),
                email: email.clone(),
                age,
            };

            person_writer.write_record(&[person.name, person.email, person.age.to_string()])?;
        }

        let account = Account {
            // guid,
            email,
            account_no: generate_account_number(),
        };

        accounts_writer.write_record(&[account.email, account.account_no])?;
    }

    person_writer.flush()?;
    accounts_writer.flush()?;


    Ok(())
}

My main objective is to join files that are larger than memory in order to try out the streaming feature of Polars.

Don
  • 101
  • 2
  • 1
    I find stack**overflow** / StackExchange questions to work best with an explicit answerable question near the bottom of a question post. – greybeard May 29 '23 at 07:40

0 Answers0