2

I'm really new to Polars.

After running this code

/// Loads the headerless CSV, splits `column_15` and `column_16` on `|` into
/// list columns named `origin` and `destination`, then prints the resulting
/// schema and frame.
///
/// # Errors
/// Propagates any `PolarsError` produced while opening or parsing the CSV
/// file, or while collecting the lazy query.
fn read_csv() -> Result<(), PolarsError> {
    println!("Hello, polars! ");
    let df = CsvReader::from_path("./test_data/tar-data/csv/unziped/body.csv")?
        .has_header(false)
        .finish()?;

    // Build the lazy projection and materialize it in one step; the collected
    // frame is only read afterwards, so the binding does not need to be `mut`.
    let df = df
        .lazy()
        .select([
            col("column_15").str().split("|").alias("origin"),
            col("column_16").str().split("|").alias("destination"),
        ])
        .collect()?;

    println!("Schema {:?}", df.schema());

    println!("{:?}", df);
    Ok(())
}

I have two columns with lists like below. Each column is a List of utf8.

Schema Schema:
name: origin, data type: List(Utf8)
name: destination, data type: List(Utf8)

shape: (10, 2)
┌───────────────────────┬───────────────────────┐
│ origin                ┆ destination           │
│ ---                   ┆ ---                   │
│ list[str]             ┆ list[str]             │
╞═══════════════════════╪═══════════════════════╡
│ ["JOI", "GRU", "DFW"] ┆ ["VCP", "DFW", "SLC"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ["JOI", "GRU", "ATL"] ┆ ["GRU", "ATL", "SLC"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ["JOI", "GRU", "MEX"] ┆ ["GRU", "MEX", "SLC"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ["JOI", "GRU", "MCO"] ┆ ["GRU", "MCO", "SLC"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ...                   ┆ ...                   │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ["JOI", "GRU", "IAH"] ┆ ["VCP", "IAH", "SLC"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ["JOI", "GRU", "ORD"] ┆ ["VCP", "ORD", "SLC"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ["JOI", "GRU", "JFK"] ┆ ["GRU", "JFK", "SLC"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ["JOI", "GRU", "EWR"] ┆ ["GRU", "EWR", "SLC"] │
└───────────────────────┴───────────────────────┘

Now I need to zip this data into a single column (or Series in Polars terms) in the form of a list of structs.

My goal is to save the DataFrame as JSON with the following structure:

{
    "MyData": [
        [
            {
                "Origin": "JOI",
                "Destination": "VCP"
            },
            {
                "Origin": "GRU",
                "Destination": "DFW"
            },
            {
                "Origin": "DFW",
                "Destination": "SLC"
            }
        ],
        [
            {
                "Origin": "JOI",
                "Destination": "GRU"
            },
            {
                "Origin": "GRU",
                "Destination": "ATL"
            },
            {
                "Origin": "ATL",
                "Destination": "SLC"
            }
        ],
        ......
    ]
}

Which is actually a kind of

name: MyData, data type: List(List(Struct([Field { name: "origin", dtype: Utf8 }, Field { name: "destination", dtype: Utf8 }])))

I was trying to approach this with apply, fold_exprs, etc, but with no luck.

So my actual question is: how do I create a column containing a list of predefined structs from N columns that each contain a list of data?

isaactfa
  • 5,461
  • 1
  • 10
  • 24
Alexey
  • 645
  • 6
  • 21

1 Answer

1

Unfortunately I think Polars is not the best tool for this job. It makes handling list columns somewhat awkward, especially when you want to include structs in the mix.

Polars's CsvReader is useful, and then I think standard Rust can handle this just fine.

data.csv:

one,two
JOI|GRU|DFW,VCP|DFW|SLC
JOI|GRU|ATL,GRU|ATL|SLC

main.rs:

use polars::prelude::*;
use serde::Serialize;

/// One origin/destination pair, serialized as a JSON object.
///
/// The `rename_all = "PascalCase"` attribute makes serde emit the field names
/// as `Origin` and `Destination` instead of their snake_case Rust spellings.
#[derive(Debug, Serialize)]
#[serde(rename_all = "PascalCase")]
struct Flight {
    /// Where this leg starts.
    origin: String,
    /// Where this leg ends.
    destination: String,
}

/// Reads `src/data.csv` and, row by row, pairs up the `|`-separated entries of
/// the `one` and `two` columns into `Flight` values.
///
/// Rows where either cell is null are skipped. Within a row, pairing stops at
/// the end of the shorter of the two split sequences.
///
/// # Errors
/// Propagates any `PolarsError` from reading the CSV, from looking up either
/// column, or from the `utf8()` conversions.
fn read_csv() -> Result<Vec<Vec<Flight>>, PolarsError> {
    let frame = CsvReader::from_path("src/data.csv")?
        .has_header(true)
        .finish()?;

    let origins = frame.column("one")?.utf8()?;
    let destinations = frame.column("two")?.utf8()?;

    let mut itineraries = Vec::new();
    for cells in origins.into_iter().zip(destinations) {
        // A null in either column leaves nothing to pair for this row.
        if let (Some(from), Some(to)) = cells {
            let legs: Vec<Flight> = from
                .split('|')
                .zip(to.split('|'))
                .map(|(origin, destination)| Flight {
                    origin: origin.into(),
                    destination: destination.into(),
                })
                .collect();
            itineraries.push(legs);
        }
    }

    Ok(itineraries)
}

/// Entry point: loads the flights from the CSV and prints them to stdout as
/// pretty-printed JSON.
///
/// # Panics
/// Panics if the CSV cannot be read or the data cannot be serialized.
fn main() {
    let flights = read_csv().expect("problem reading dataframe");
    let rendered = serde_json::to_string_pretty(&flights).expect("problem serializing");
    println!("{}", rendered);
}

Result:

[
  [
    {
      "Origin": "JOI",
      "Destination": "VCP"
    },
    {
      "Origin": "GRU",
      "Destination": "DFW"
    },
    {
      "Origin": "DFW",
      "Destination": "SLC"
    }
  ],
  [
    {
      "Origin": "JOI",
      "Destination": "GRU"
    },
    {
      "Origin": "GRU",
      "Destination": "ATL"
    },
    {
      "Origin": "ATL",
      "Destination": "SLC"
    }
  ]
]
BallpointBen
  • 9,406
  • 1
  • 32
  • 62