0

I want to efficiently find the distance from the current row to the previous occurrence. I know polars doesn't have indexes, but the formula would roughly be:

if prior_occurrence {
  (current_row_index - prior_occurrence_index - 1)
} else {
  -1
}

This is the input dataframe:

// Sample input: an integer column "a" and a string column "b", six rows each.
let df_a = df![
    "a" => [1, 2, 2, 1, 4, 1],
    "b" => ["c","a", "b", "c", "c","a"]
].unwrap();

// Print the frame so the input can be compared with the wanted output.
println!("{}", df_a);
a - i32 b - str
1 c
2 a
2 b
1 c
4 c
1 a

Wanted output:

a - i32 b - str b_dist - i32
1 c -1
2 a -1
2 b -1
1 c 2
4 c 0
1 a 3

What's the most efficient way to go about this?

Carbocarde
  • 23
  • 3
  • [Similar question using Pandas](https://stackoverflow.com/questions/68750108/get-the-index-of-previous-occurrence-of-a-value-in-a-pandas-series) – Carbocarde Jan 10 '23 at 00:28
  • Due to how `Series` are stored I doubt you can do better than using a solution based on `Iterator::enumerate`. – cafce25 Jan 10 '23 at 01:06

1 Answer

1

python

# For every row, the number of rows between it and the previous row holding
# the same value in "a"; -1 when that value has not occurred before.
(df
 .with_row_count("idx")  # add a row-number column named "idx"
 .with_columns([
      # Row number minus the previous row number within the same "a" group.
      # The first row of a group has no predecessor, so the shifted value is
      # null; it is filled with 0 and the trailing "- 1" turns it into -1.
      ((pl.col("idx") - pl.col("idx").shift()).cast(pl.Int32).fill_null(0) - 1)
      .over("a").alias("a_distance_to_a")
 ])
)

rust


fn func1() -> PolarsResult<()> {
    let df_a = df![
    "a" => [1, 2, 2, 1, 4, 1],
    "b" => ["c","a", "b", "c", "c","a"]
    ]?;

    let out = df_a
        .lazy()
        .with_row_count("idx", None)
        .with_columns([((col("idx") - col("idx").shift(1))
            .cast(DataType::Int32)
            .fill_null(0)
            - lit(1))
        .over("a")
        .alias("a_distance_to_a")])
        .collect()?;

    Ok(())

output

shape: (6, 4)
┌─────┬─────┬─────┬─────────────────┐
│ idx ┆ a   ┆ b   ┆ a_distance_to_a │
│ --- ┆ --- ┆ --- ┆ ---             │
│ u32 ┆ i64 ┆ str ┆ i32             │
╞═════╪═════╪═════╪═════════════════╡
│ 0   ┆ 1   ┆ c   ┆ -1              │
│ 1   ┆ 2   ┆ a   ┆ -1              │
│ 2   ┆ 2   ┆ b   ┆ 0               │
│ 3   ┆ 1   ┆ c   ┆ 2               │
│ 4   ┆ 4   ┆ c   ┆ -1              │
│ 5   ┆ 1   ┆ a   ┆ 1               │
└─────┴─────┴─────┴─────────────────┘
ritchie46
  • 10,405
  • 1
  • 24
  • 43
  • 1
    Wow! The index combined with a `.over()` expression is incredibly clever. Thank you! There are a few minor syntax errors in the Rust version: namely the unnecessary borrow in `.cast(&DataType::Int32)`, the missing `lit()` wrapper around the 1, and the missing brackets around "a" in the `.over()` expression (it should be `.over(["a"])`). I'll try submitting an edit later once the [edit queue](https://meta.stackexchange.com/questions/84362/why-does-the-suggested-edit-queue-have-a-fixed-size-and-what-is-this-size-on-ea) has space. – Carbocarde Jan 10 '23 at 16:42