(I'm going to assume that your intervals in df_1
do not overlap for a particular id - otherwise, there may not be a unique value
that we can assign to the id/dates combinations in df_2
.)
One way to do this is with join_asof
.
The Algorithm
(
df_2
.sort("dates")
.join_asof(
df_1.sort("start"),
by="id",
left_on="dates",
right_on="start",
strategy="backward",
)
.with_column(
pl.when(pl.col('dates') <= pl.col('end'))
.then(pl.col('value'))
.otherwise(None)
)
.select(['id', 'dates', 'value'])
)
shape: (6, 3)
┌─────┬────────────┬───────┐
│ id ┆ dates ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ abc ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-07 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
└─────┴────────────┴───────┘
In Steps
First, let's append some additional rows to df_2
, to show what will happen if a particular row is not contained in an interval in df_1
. I'll also add a row number, for easier inspection.
df_2 = pl.DataFrame(
{
"id": ["abc", "abc", "456", "abc", "abc", "456", "abc", "abc", "abc"],
"dates": [
date(2022, 1, 2),
date(2022, 3, 4),
date(2022, 5, 11),
date(2022, 1, 4),
date(2022, 3, 7),
date(2022, 5, 13),
date(2021, 12, 31),
date(2022, 3, 1),
date(2023, 1, 1),
],
}
).with_row_count()
df_2
shape: (9, 3)
┌────────┬─────┬────────────┐
│ row_nr ┆ id ┆ dates │
│ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ date │
╞════════╪═════╪════════════╡
│ 0 ┆ abc ┆ 2022-01-02 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ abc ┆ 2022-03-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 456 ┆ 2022-05-11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ abc ┆ 2022-01-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ abc ┆ 2022-03-07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 456 ┆ 2022-05-13 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ abc ┆ 2021-12-31 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ abc ┆ 2022-03-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ abc ┆ 2023-01-01 │
└────────┴─────┴────────────┘
The join_asof
step finds the latest start
date that is on or before the dates
date. Since intervals do not overlap, this is the only interval that might contain the dates
date.
For our purposes, I'll make a copy of the start
column so that we can inspect the results. (The start
column will not be in the results of the join_asof
.)
Note that for a join_asof
, both DataFrames must be sorted by the asof
columns (dates
and start
in this case).
(
df_2
.sort("dates")
.join_asof(
df_1.sort("start").with_column(pl.col("start").alias("start_df1")),
by="id",
left_on="dates",
right_on="start",
strategy="backward",
)
.sort("row_nr")
)
shape: (9, 6)
┌────────┬─────┬────────────┬────────────┬───────┬────────────┐
│ row_nr ┆ id ┆ dates ┆ end ┆ value ┆ start_df1 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ date ┆ date ┆ i64 ┆ date │
╞════════╪═════╪════════════╪════════════╪═══════╪════════════╡
│ 0 ┆ abc ┆ 2022-01-02 ┆ 2022-02-04 ┆ 10 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ abc ┆ 2022-03-04 ┆ 2022-03-10 ┆ 3 ┆ 2022-03-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 2022-05-11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ abc ┆ 2022-01-04 ┆ 2022-02-04 ┆ 10 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ abc ┆ 2022-03-07 ┆ 2022-03-10 ┆ 3 ┆ 2022-03-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 456 ┆ 2022-05-13 ┆ 2022-05-16 ┆ 4 ┆ 2022-05-11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ abc ┆ 2021-12-31 ┆ null ┆ null ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ abc ┆ 2022-03-01 ┆ 2022-02-04 ┆ 10 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ abc ┆ 2023-01-01 ┆ 2022-03-10 ┆ 3 ┆ 2022-03-04 │
└────────┴─────┴────────────┴────────────┴───────┴────────────┘
The last three rows are the ones that I added.
In the last step, we'll inspect the end
date, and null out any values where dates
is beyond end
.
(
df_2
.sort("dates")
.join_asof(
df_1.sort("start").with_column(pl.col("start").alias("start_df1")),
by="id",
left_on="dates",
right_on="start",
strategy="backward",
)
.with_column(
pl.when(pl.col('dates') <= pl.col('end'))
.then(pl.col('value'))
.otherwise(None)
)
.sort("row_nr")
)
shape: (9, 6)
┌────────┬─────┬────────────┬────────────┬───────┬────────────┐
│ row_nr ┆ id ┆ dates ┆ end ┆ value ┆ start_df1 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ date ┆ date ┆ i64 ┆ date │
╞════════╪═════╪════════════╪════════════╪═══════╪════════════╡
│ 0 ┆ abc ┆ 2022-01-02 ┆ 2022-02-04 ┆ 10 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ abc ┆ 2022-03-04 ┆ 2022-03-10 ┆ 3 ┆ 2022-03-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 2022-05-11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ abc ┆ 2022-01-04 ┆ 2022-02-04 ┆ 10 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ abc ┆ 2022-03-07 ┆ 2022-03-10 ┆ 3 ┆ 2022-03-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 456 ┆ 2022-05-13 ┆ 2022-05-16 ┆ 4 ┆ 2022-05-11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ abc ┆ 2021-12-31 ┆ null ┆ null ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ abc ┆ 2022-03-01 ┆ 2022-02-04 ┆ null ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ abc ┆ 2023-01-01 ┆ 2022-03-10 ┆ null ┆ 2022-03-04 │
└────────┴─────┴────────────┴────────────┴───────┴────────────┘
You can see that the last three rows that I added (which purposely don't match any intervals in df_1
) have null
as value
.
Instead of using when/then/otherwise
to set value
to null, you can filter these out, if that's what you need.