Starting with this data:
import polars as pl
from datetime import date
df = pl.DataFrame(
{
"id": ["123", "abc", "456"],
"start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
"end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
"value": [10, 3, 4],
}
)
df
shape: (3, 4)
┌─────┬────────────┬────────────┬───────┐
│ id ┆ start ┆ end ┆ value │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 │
╞═════╪════════════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴────────────┴───────┘
The Algorithm
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
.explode("date")
.with_column(pl.col("date").cast(pl.Date))
.select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
In steps
Normally, we create a range of dates using the date_range
expression. However, date_range
does not take an Expression as its low
and high
parameters.
However, arange
does allow Expressions as its low
and high
parameters. We can (implicitly) cast the start
and end
dates to integers, which represent the number of days since the UNIX epoch.
The result is a list of integers which represents the days between (and including) the start
and end
dates (expressed as days since the UNIX epoch)..
Notice that we have to add 1 to the high
parameter to make sure we capture the end date.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
)
shape: (3, 5)
┌─────┬────────────┬────────────┬───────┬───────────────────────────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ list[i64] │
╞═════╪════════════╪════════════╪═══════╪═══════════════════════════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ [18993, 18994, ... 18996] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ [19055] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ [19123, 19124, ... 19128] │
└─────┴────────────┴────────────┴───────┴───────────────────────────┘
Next we can use explode
to place each of the integers on a separate row.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
.explode("date")
)
shape: (11, 5)
┌─────┬────────────┬────────────┬───────┬───────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ i64 │
╞═════╪════════════╪════════════╪═══════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18993 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18994 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18995 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18996 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ 19055 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19123 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19124 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19125 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19126 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19127 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19128 │
└─────┴────────────┴────────────┴───────┴───────┘
The final step is to cast the date
column back to a pl.Date
, and then select only the columns that we want.
(
df.with_columns(
[pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
.explode("date")
.with_column(pl.col("date").cast(pl.Date))
.select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘