-1

How do you read a parquet file into polars in Rust?

I am looking to read a parquet file into a Polars DataFrame in Rust and then iterate over each row.

Test
  • 962
  • 9
  • 26

2 Answers

1
fn main() {
let df = LazyFrame::scan_parquet("table.parquet", Default::default());
let out = df
    .unwrap()
    .filter( col("A").is_in( lit("CHINA" ) ) ) //filter data
    .tail(20) // last 20
    .sort("B", Default::default()) //order column B
    .groupby_stable([col("A")])       //use group by to speed up iteration
    .agg([as_struct(&[col("B"), col("C")])
            .apply(|s| {
                let ca = s.struct_()?;    // Row
                let sa = &ca.fields()[1]; // column using index
                
                //iter over Rows
                let mut index = 0;
                ca.into_iter().for_each(|x|{
                    println!("index {} {:?}\n",index, x);
                    index += 1
                });
                
                //iter over Column and do something with:
                let _result =
                sa.f64()?
                .into_iter()
                .map(|v| match v {
                    v => Some(v)
                    }
                ).collect();
                Ok(_result)                                                
            },
            GetOutput::from_type(DataType::Utf8),
        )
        .alias("OUT")]);

// yield result of the agregation       
println!("{}", out.collect().unwrap());
}
0

The source repository has examples for eager, lazy, and sql approaches to reading parquet:

eager

    use std::io::Cursor;

    use polars_core::df;
    use polars_core::prelude::*;

    use crate::prelude::*;

    #[test]
    fn test_parquet() {
        // In CI: This test will be skipped because the file does not exist.
        let Ok(file) = polars_utils::open_file("data/simple.parquet") else {
            return;
        };
        // Eagerly read the whole file into a DataFrame and verify its layout.
        let frame = ParquetReader::new(file).finish().unwrap();
        assert_eq!(frame.get_column_names(), ["a", "b"]);
        assert_eq!(frame.shape(), (3, 2));
    }

lazy (enables reading files that are larger than memory)

use polars::prelude::*;

fn main() -> PolarsResult<()> {
    // Build the lazy scan first; only metadata is touched at this point.
    let lazy = LazyFrame::scan_parquet("../datasets/foods1.parquet", ScanArgsParquet::default())?;

    // The projection: every column, plus summed copies of two of them.
    let projection = [
        // select all columns
        all(),
        // and do some aggregations
        cols(["fats_g", "sugars_g"]).sum().suffix("_summed"),
    ];

    // Materialize the query into an in-memory DataFrame.
    let df = lazy.select(projection).collect()?;

    dbg!(df);
    Ok(())
}

sql (ubiquitous, can be eager or lazy)

use polars_core::prelude::*;
use polars_lazy::prelude::*;
use polars_sql::*;

#[test]
#[cfg(feature = "parquet")]
fn read_parquet_tbl() {
    let mut context = SQLContext::new();

    // Register a table backed by the parquet file, entirely through SQL.
    let create_sql = r#"
            CREATE TABLE foods1 AS
            SELECT *
            FROM read_parquet('../../examples/datasets/foods1.parquet')"#;
    let df_sql = context.execute(create_sql).unwrap().collect().unwrap();

    // CREATE TABLE yields a one-row confirmation frame.
    let expected = df! {
        "Response" => ["Create Table"]
    }
    .unwrap();
    assert!(df_sql.frame_equal(&expected));

    // Query the registered table back out and check its dimensions.
    let query = context.execute(r#"SELECT * FROM foods1"#).unwrap();
    let df_2 = query.collect().unwrap();
    assert_eq!(df_2.height(), 27);
    assert_eq!(df_2.width(), 4);
}
ecoe
  • 4,994
  • 7
  • 54
  • 72