One way to perform this given your data is to use the groupby_rolling
:
Let's start with your data and create the list of concatenated integers.
import polars as pl
df = pl.DataFrame(
{
"id": [1, 2, 3, 4, 5],
"x": [10, 20, 30, 40, 50],
"y": [100, 200, 300, 400, 500],
}
).with_column(pl.concat_list(pl.all()).alias('cat_list'))
df
shape: (5, 4)
┌─────┬─────┬─────┬──────────────┐
│ id ┆ x ┆ y ┆ cat_list │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ list[i64] │
╞═════╪═════╪═════╪══════════════╡
│ 1 ┆ 10 ┆ 100 ┆ [1, 10, 100] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 20 ┆ 200 ┆ [2, 20, 200] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 30 ┆ 300 ┆ [3, 30, 300] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 40 ┆ 400 ┆ [4, 40, 400] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 50 ┆ 500 ┆ [5, 50, 500] │
└─────┴─────┴─────┴──────────────┘
From here, we can use a groupby_rolling
and an index of 3i
against your id
column. In the agg
, if no summary expression is used, then items in the group are automatically concatenated into a list -- in this case, a list of lists.
group_result = (
df
.groupby_rolling(index_column='id', period='3i', closed='right')
.agg(pl.col('cat_list').alias('result'))
)
group_result
shape: (5, 2)
┌─────┬─────────────────────────────────────┐
│ id ┆ result │
│ --- ┆ --- │
│ i64 ┆ list[list[i64]] │
╞═════╪═════════════════════════════════════╡
│ 1 ┆ [[1, 10, 100]] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ [[1, 10, 100], [2, 20, 200]] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ [[1, 10, 100], [2, 20, 200], [3,... │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ [[2, 20, 200], [3, 30, 300], [4,... │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ [[3, 30, 300], [4, 40, 400], [5,... │
└─────┴─────────────────────────────────────┘
From this point, you can simply add the column to your existing dataset.
df = df.select(
[
pl.all(),
group_result.get_column("result"),
]
)
df
shape: (5, 5)
┌─────┬─────┬─────┬──────────────┬─────────────────────────────────────┐
│ id ┆ x ┆ y ┆ cat_list ┆ result │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ list[i64] ┆ list[list[i64]] │
╞═════╪═════╪═════╪══════════════╪═════════════════════════════════════╡
│ 1 ┆ 10 ┆ 100 ┆ [1, 10, 100] ┆ [[1, 10, 100]] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 20 ┆ 200 ┆ [2, 20, 200] ┆ [[1, 10, 100], [2, 20, 200]] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 30 ┆ 300 ┆ [3, 30, 300] ┆ [[1, 10, 100], [2, 20, 200], [3,... │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 40 ┆ 400 ┆ [4, 40, 400] ┆ [[2, 20, 200], [3, 30, 300], [4,... │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 50 ┆ 500 ┆ [5, 50, 500] ┆ [[3, 30, 300], [4, 40, 400], [5,... │
└─────┴─────┴─────┴──────────────┴─────────────────────────────────────┘
If you need to adhere to a minimum window size, you can use a when/then/otherwise
and arr.lengths
to set values to None.
df.with_column(
pl.when(pl.col('result').arr.lengths() < 3)
.then(None)
.otherwise(pl.col('result'))
.keep_name()
)
shape: (5, 5)
┌─────┬─────┬─────┬──────────────┬─────────────────────────────────────┐
│ id ┆ x ┆ y ┆ cat_list ┆ result │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ list[i64] ┆ list[list[i64]] │
╞═════╪═════╪═════╪══════════════╪═════════════════════════════════════╡
│ 1 ┆ 10 ┆ 100 ┆ [1, 10, 100] ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 20 ┆ 200 ┆ [2, 20, 200] ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 30 ┆ 300 ┆ [3, 30, 300] ┆ [[1, 10, 100], [2, 20, 200], [3,... │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 40 ┆ 400 ┆ [4, 40, 400] ┆ [[2, 20, 200], [3, 30, 300], [4,... │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 50 ┆ 500 ┆ [5, 50, 500] ┆ [[3, 30, 300], [4, 40, 400], [5,... │
└─────┴─────┴─────┴──────────────┴─────────────────────────────────────┘