How do you read a parquet file into polars in Rust?
I want to read a Parquet file into a Polars DataFrame in Rust and then iterate over each row.
How do you read a parquet file into polars in Rust?
I want to read a Parquet file into a Polars DataFrame in Rust and then iterate over each row.
/// Lazily scans `table.parquet`, keeps the rows where column "A" is in the
/// literal "CHINA", takes the last 20 of them, sorts by column "B", groups by
/// "A", and prints the collected aggregation.
///
/// Inside the aggregation, columns "B" and "C" are packed into a struct so the
/// closure can visit them together: it prints every row with its position and
/// then builds the output from the second struct field.
///
/// # Panics
///
/// Panics if `scan_parquet` or the final `collect` returns an error.
fn main() {
    let out = LazyFrame::scan_parquet("table.parquet", Default::default())
        .unwrap()
        .filter(col("A").is_in(lit("CHINA"))) // filter data
        .tail(20) // last 20
        .sort("B", Default::default()) // order by column B
        .groupby_stable([col("A")]) // use group by to speed up iteration
        .agg([as_struct(&[col("B"), col("C")])
            .apply(
                |s| {
                    let ca = s.struct_()?; // rows, as a struct of (B, C)
                    let sa = &ca.fields()[1]; // second struct field, i.e. column "C"

                    // Iterate over the rows; `enumerate` supplies the running
                    // position instead of a hand-maintained counter.
                    ca.into_iter()
                        .enumerate()
                        .for_each(|(index, x)| println!("index {} {:?}\n", index, x));

                    // Iterate over the column and do something with each value;
                    // here every value is passed through unchanged, wrapped in `Some`.
                    Ok(sa.f64()?.into_iter().map(Some).collect())
                },
                // NOTE(review): the closure reads the field via `f64()`, yet the
                // declared output dtype is Utf8 — confirm whether Float64 was intended.
                GetOutput::from_type(DataType::Utf8),
            )
            .alias("OUT")]);

    // yield the result of the aggregation
    println!("{}", out.collect().unwrap());
}
The Polars source repository has examples of the eager, lazy, and SQL approaches to reading Parquet files. Eager:
use std::io::Cursor;
use polars_core::df;
use polars_core::prelude::*;
use crate::prelude::*;
/// Eagerly reads `data/simple.parquet` into a `DataFrame` and checks its
/// column names and shape. Returns without asserting when the file cannot be
/// opened.
#[test]
fn test_parquet() {
    // In CI: This test will be skipped because the file does not exist.
    let file = match polars_utils::open_file("data/simple.parquet") {
        Ok(file) => file,
        Err(_) => return,
    };

    let frame = ParquetReader::new(file).finish().unwrap();
    assert_eq!(frame.get_column_names(), ["a", "b"]);
    assert_eq!(frame.shape(), (3, 2));
}
Lazy (enables reading "larger-than-memory" files):
use polars::prelude::*;
/// Lazily scans `../datasets/foods1.parquet`, selects every column plus the
/// sums of `fats_g` and `sugars_g` (suffixed with `_summed`), collects the
/// query, and dumps the resulting frame with `dbg!`.
///
/// # Errors
///
/// Propagates any error returned by the scan or by `collect`.
fn main() -> PolarsResult<()> {
    let scan = LazyFrame::scan_parquet("../datasets/foods1.parquet", ScanArgsParquet::default())?;

    let projection = [
        // select all columns
        all(),
        // and do some aggregations
        cols(["fats_g", "sugars_g"]).sum().suffix("_summed"),
    ];

    let df = scan.select(projection).collect()?;
    dbg!(df);
    Ok(())
}
SQL (ubiquitous; can be eager or lazy):
use polars_core::prelude::*;
use polars_lazy::prelude::*;
use polars_sql::*;
/// Creates the table `foods1` from a Parquet file through the SQL context,
/// checks the "Create Table" response frame, then selects everything back and
/// checks the resulting height and width.
#[test]
#[cfg(feature = "parquet")]
fn read_parquet_tbl() {
    let mut ctx = SQLContext::new();

    // Statement that registers the Parquet file as table `foods1`.
    let create_stmt = r#"
CREATE TABLE foods1 AS
SELECT *
FROM read_parquet('../../examples/datasets/foods1.parquet')"#;
    let created_lazy = ctx.execute(create_stmt).unwrap();
    let created = created_lazy.collect().unwrap();

    // The CREATE TABLE statement is expected to answer with a one-cell frame.
    let expected = df! {
        "Response" => ["Create Table"]
    }
    .unwrap();
    assert!(created.frame_equal(&expected));

    // Read the freshly created table back and check its dimensions.
    let selected_lazy = ctx.execute("SELECT * FROM foods1").unwrap();
    let selected = selected_lazy.collect().unwrap();
    assert_eq!(selected.height(), 27);
    assert_eq!(selected.width(), 4);
}