I would like to subtract two date columns
and get their difference in months unit
instead of days but haven't been able to.
Data:
import polars as pl
from datetime import datetime
test_df = pl.DataFrame(
{
"dt_end": [datetime(2022, 1, 1).date(), datetime(2022, 1, 2).date()],
"dt_start": [datetime(2021, 5, 7).date(), datetime(2020, 7, 8).date()],
}
)
This gives difference in Days
test_df.with_columns(
diff_months = pl.col('dt_end') - pl.col('dt_start')
)
output:
dt_end dt_start diff_months
date date duration[ms]
2022-01-01 2021-05-07 239d
2022-01-02 2020-07-08 543d
I have tried below code for months but that doesn't work
test_df.with_columns(
diff_months = pl.duration(months = pl.col('dt_end') - pl.col('dt_start'))
)
UPDATE
Have also tried below code but that also gives me a different value from the excel.
test_df.with_columns(
diff_months = (pl.col('dt_end').dt.year() - pl.col('dt_start').dt.year()) * 12 +
(pl.col('dt_end').dt.month().cast(pl.Int32) - pl.col('dt_start').dt.month().cast(pl.Int32))
)
output:
dt_end dt_start diff_months
date date i32
2022-01-01 2021-05-07 8
2022-01-02 2020-07-08 18
UPDATE 2:
I think this is the manual way of doing it by using 30.42 as the avg number of days in month
test_df.with_columns(
diff_months = pl.col('dt_end') - pl.col('dt_start')
).with_columns(
diff_months = (pl.col('diff_months').dt.days()/30.42).floor().cast(pl.Int8)
)
output
dt_end dt_start diff_months
date date i8
2022-01-01 2021-05-07 7
2022-01-02 2020-07-08 17
UPDATE 3: Snapshot of actual data with desired Reference excel value
# Data
snapshot_df = pl.DataFrame(
{
"Reporting Month": [datetime(2000, 7, 20).date(), datetime(2000, 8, 20).date(), datetime(2000, 9, 20).date()],
"Origination date": [datetime(1999, 12, 19).date(), datetime(1999, 12, 19).date(), datetime(1999, 12, 19).date()],
"Excel_Reference_Age": [7,8,9]
}
)
comparison
snapshot_df.with_columns(
diff_months = pl.col('Reporting Month') - pl.col('Origination date')
).with_columns(
diff_months = (pl.col('diff_months').dt.days()/30.42).floor().cast(pl.Int16),
diff_months_1 = (pl.col('Reporting Month').dt.year() - pl.col('Origination date').dt.year()) * 12 +
(pl.col('Reporting Month').dt.month().cast(pl.Int32) - pl.col('Origination date').dt.month().cast(pl.Int32)) ,
diff_months_2 = pl.date_range(pl.col("Origination date"), pl.col("Reporting Month"), "1mo").list.lengths()
).select('Excel_Reference_Age','diff_months','diff_months_1','diff_months_2')
output
Excel_Reference_Age diff_months diff_months_1 diff_months_2
i64 i16 i32 u32
7 7 7 8
8 8 8 9
9 9 9 10