I know how to read a Parquet file into a `Vec<Row>`:
extern crate parquet;
use parquet::file::reader::{FileReader, SerializedFileReader};
use std::{fs, sync::Arc};
use parquet::column::writer::ColumnWriter;
use parquet::{
file::{
properties::WriterProperties,
writer::{FileWriter, SerializedFileWriter},
},
schema::parser::parse_message_type,
schema::types::TypePtr
};
use parquet::record::Row;
use parquet::record::RowAccessor;
use std::fs::File;
use std::io::prelude::*;
use std::path::Path;
use std::path::PathBuf;
/// Loads every row of the Parquet file at `in_path` into memory.
///
/// Returns the collected rows together with the root schema pointer taken from the
/// file metadata. The row count recorded in the file metadata is printed to stdout
/// once the rows have been collected.
///
/// # Panics
///
/// Panics if the file cannot be opened, if `SerializedFileReader::new` fails, or if
/// the row iterator cannot be created.
fn read_parquet(in_path: &Path) -> (Vec<Row>, TypePtr) {
    let reader = SerializedFileReader::new(File::open(in_path).unwrap()).unwrap();

    // Materialise all rows up front; `None` means no column projection is requested.
    let rows = reader.get_row_iter(None).unwrap().collect::<Vec<Row>>();

    // The printed count comes from the file metadata, not from `rows.len()`.
    println!(
        "num rows: {}",
        reader.metadata().file_metadata().num_rows()
    );

    let root_schema = reader
        .metadata()
        .file_metadata()
        .schema_descr()
        .root_schema_ptr();

    (rows, root_schema)
}
Now, how do I write the same data back out to a Parquet file? I'm using the `parquet` crate.
fn to_parquet(data: Vec<Row>, schema: TypePtr, out_path: &Path) {
let props = Arc::new(WriterProperties::builder().build());
let file = fs::File::create(&out_path).unwrap();
let mut writer = SerializedFileWriter::new(file, schema, props).unwrap();
// Now what?
}