Is there an equivalent way to to df.groupby().shift in polars? Use pandas.shift() within a group
Asked
Active
Viewed 2,342 times
1 Answers
13
You can use the over
expression to accomplish this in Polars. Using the example from the link...
import polars as pl
df = pl.DataFrame({
'object': [1, 1, 1, 2, 2],
'period': [1, 2, 4, 4, 23],
'value': [24, 67, 89, 5, 23],
})
df.with_column(
pl.col('value').shift().over('object').alias('prev_value')
)
shape: (5, 4)
┌────────┬────────┬───────┬────────────┐
│ object ┆ period ┆ value ┆ prev_value │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞════════╪════════╪═══════╪════════════╡
│ 1 ┆ 1 ┆ 24 ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2 ┆ 67 ┆ 24 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 4 ┆ 89 ┆ 67 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 4 ┆ 5 ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 23 ┆ 23 ┆ 5 │
└────────┴────────┴───────┴────────────┘
To perform this on more than one column, you can specify the columns in the pl.col
expression, and then use a prefix/suffix to name the new columns. For example:
df.with_columns(
pl.col(['period', 'value']).shift().over('object').prefix("prev_")
)
shape: (5, 5)
┌────────┬────────┬───────┬─────────────┬────────────┐
│ object ┆ period ┆ value ┆ prev_period ┆ prev_value │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞════════╪════════╪═══════╪═════════════╪════════════╡
│ 1 ┆ 1 ┆ 24 ┆ null ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2 ┆ 67 ┆ 1 ┆ 24 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 4 ┆ 89 ┆ 2 ┆ 67 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 4 ┆ 5 ┆ null ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 23 ┆ 23 ┆ 4 ┆ 5 │
└────────┴────────┴───────┴─────────────┴────────────┘
Using multiple values with over
Let's use this data.
df = pl.DataFrame(
{
"id": [1] * 5 + [2] * 5,
"date": ["2020-01-01", "2020-01-01", "2020-02-01", "2020-02-01", "2020-02-01"] * 2,
"value1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"value2": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
}
).with_column(pl.col('date').str.strptime(pl.Date))
df
shape: (10, 4)
┌─────┬────────────┬────────┬────────┐
│ id ┆ date ┆ value1 ┆ value2 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ date ┆ i64 ┆ i64 │
╞═════╪════════════╪════════╪════════╡
│ 1 ┆ 2020-01-01 ┆ 1 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2020-01-01 ┆ 2 ┆ 20 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2020-02-01 ┆ 3 ┆ 30 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2020-02-01 ┆ 4 ┆ 40 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2020-02-01 ┆ 5 ┆ 50 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2020-01-01 ┆ 6 ┆ 60 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2020-01-01 ┆ 7 ┆ 70 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2020-02-01 ┆ 8 ┆ 80 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2020-02-01 ┆ 9 ┆ 90 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2020-02-01 ┆ 10 ┆ 100 │
└─────┴────────────┴────────┴────────┘
We can place a list of our grouping variables in the over
expression (as well as a list in our pl.col
expression). Polars will run them all in parallel.
df.with_columns([
pl.col(["value1", "value2"]).shift().over(['id','date']).prefix("prev_"),
pl.col(["value1", "value2"]).diff().over(['id','date']).suffix("_diff"),
])
shape: (10, 8)
┌─────┬────────────┬────────┬────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ id ┆ date ┆ value1 ┆ value2 ┆ prev_value1 ┆ prev_value2 ┆ value1_diff ┆ value2_diff │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ date ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪════════════╪════════╪════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ 1 ┆ 2020-01-01 ┆ 1 ┆ 10 ┆ null ┆ null ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2020-01-01 ┆ 2 ┆ 20 ┆ 1 ┆ 10 ┆ 1 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2020-02-01 ┆ 3 ┆ 30 ┆ null ┆ null ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2020-02-01 ┆ 4 ┆ 40 ┆ 3 ┆ 30 ┆ 1 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2020-02-01 ┆ 5 ┆ 50 ┆ 4 ┆ 40 ┆ 1 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2020-01-01 ┆ 6 ┆ 60 ┆ null ┆ null ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2020-01-01 ┆ 7 ┆ 70 ┆ 6 ┆ 60 ┆ 1 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2020-02-01 ┆ 8 ┆ 80 ┆ null ┆ null ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2020-02-01 ┆ 9 ┆ 90 ┆ 8 ┆ 80 ┆ 1 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2020-02-01 ┆ 10 ┆ 100 ┆ 9 ┆ 90 ┆ 1 ┆ 10 │
└─────┴────────────┴────────┴────────┴─────────────┴─────────────┴─────────────┴─────────────┘
-
Does it matter for performance for order? can i do df.with_columns( with over a few times) – Michael WS Jul 24 '22 at 20:17
-
1You can do more than one expression with `over` in a `with_columns` context. I'll edit my answer to show an easy way. – Jul 24 '22 at 20:23
-
Also How do you do two columns? I have in my data: date, id – Michael WS Jul 24 '22 at 20:24
-
I would do stuff like groupby(["date", "id"]).diff() and groupby(["date", "id"]).shift() – Michael WS Jul 24 '22 at 20:24
-
1Sure, let me add that to my answer... – Jul 24 '22 at 20:27
-
fantastic, I am testing a process in polars and wasn't sure on a few of these – Michael WS Jul 24 '22 at 20:57