Here's my implementation: generate a table of all the time periods for the current year,
cross join that table with all the transactions, and compute the flags against each period.
The final output assumes you want the status per customer for each period.
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
from pyspark.sql import Window
from datetime import date, datetime
from dateutil.relativedelta import relativedelta
year_start = date(datetime.today().year, 1, 1)
n_months = 12
date_list = [year_start + relativedelta(months=i) for i in range(n_months)]
dates_formatted = [(d.strftime("%b %Y"),) for d in date_list]
period_df = spark.createDataFrame(dates_formatted, ["period"])
period_df.show()
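# Alternative (a sketch, assuming Spark 2.4+ for F.sequence): build the same period
# table entirely in Spark instead of on the driver.
period_df_alt = (
    spark.range(1)
    .select(
        F.explode(
            F.sequence(
                F.trunc(F.current_date(), "year"),                    # Jan 1 of the current year
                F.add_months(F.trunc(F.current_date(), "year"), 11),  # Dec 1 of the current year
                F.expr("interval 1 month"),
            )
        ).alias("period_start")
    )
    .select(F.date_format("period_start", "MMM yyyy").alias("period"))
)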
current_time_period = "SEP 2022"  # example period referenced in the comments below (not used in the code)
df = spark.createDataFrame(
[
("1", "1", "2019-JAN-10"),
("1", "2", "2019-DEC-15"),
("1", "3", "2022-SEP-07"),
],
["customer_id", "transaction_id", "transaction_date"],
)
cols = [
"customer_id",
"txn_id",
"txn_date",
"period",
"txn_current",
"txn_prev_12m",
"txn_prior_prev_12m"
]
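# cross join every transaction with every period so the flags below can be evaluated per period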
txn_period_df = df.crossJoin(period_df)
txn_period_df.show(n=100,truncate=False)
#intermediate:
df = (
txn_period_df
.withColumnRenamed("transaction_id", "txn_id")
.withColumnRenamed("transaction_date", "txn_date")
.withColumn("txn_date_formatted", F.to_date(F.col("txn_date"),"yyyy-MMM-dd"))
# parse the period and add one month to get an exclusive upper bound, so transactions after the 1st of the period month are included
.withColumn("current_date", F.add_months(F.to_date(F.col("period"), "MMM yyyy"), 1))
# e.g. for the Sep 2022 period, any purchase from October 2021 to September 2022 falls into the 'current' window
.withColumn('txn_current_date', F.add_months(F.col("current_date"), -12))
.withColumn(
"txn_current",
F.when(
(F.col("txn_date_formatted") >= F.col("txn_current_date"))
& (F.col("txn_date_formatted") < F.col("current_date")) , F.lit(1)
).otherwise(F.lit(0)),
)
# e.g. for the Sep 2022 period, purchases from October 2020 to September 2021 fall into the 'previous 12 months'
.withColumn('txn_prev_12m_date', F.add_months(F.col("txn_current_date"), -12))
.withColumn(
"txn_prev_12m",
F.when(
(F.col("txn_date_formatted") >= F.col("txn_prev_12m_date"))
& (F.col("txn_date_formatted") < F.col("txn_current_date")) , F.lit(1)
).otherwise(F.lit(0)),
)
# e.g. for the Sep 2022 period, purchases from September 2020 and earlier are 'prior to the previous 12 months'
.withColumn(
"txn_prior_prev_12m",
F.when(
(F.col("txn_date_formatted") < F.col("txn_prev_12m_date")) , F.lit(1)
).otherwise(F.lit(0)),
)
.select(cols)
)
df.show()
cols = [
"customer_id",
"txn_period",
"txn_current",
"txn_prev_12m",
"txn_prior_prev_12m",
"status",
]
txn_agg_window = Window.partitionBy(
"customer_id",
"txn_period",
).orderBy(F.col("customer_id"))
#final:
final_df = (
df
.withColumnRenamed("period", "txn_period")
.withColumn(
"txn_current",
F.sum("txn_current").over(txn_agg_window),
)
.withColumn(
"txn_prev_12m",
F.sum("txn_prev_12m").over(txn_agg_window),
)
.withColumn(
"txn_prior_prev_12m",
F.sum("txn_prior_prev_12m").over(txn_agg_window),
)
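# the sums above are already per (customer_id, txn_period); number the rows so only one row per group is kept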
.withColumn(
"row_num",
F.row_number().over(txn_agg_window),
)
.filter(F.col("row_num") == 1)
.drop("row_num")
.withColumn(
"status",
F.when(
(F.col("txn_prior_prev_12m") > 0)
& (F.col("txn_prev_12m") == 0)
& (F.col("txn_current") > 0), F.lit("Reactivated")
)
.when(
(F.col("txn_prev_12m") > 0)
& (F.col("txn_current") > 0), F.lit("Retained")
)
.when(
# first-ever purchase falls inside the current 12-month window
(F.col("txn_prior_prev_12m") == 0)
& (F.col("txn_prev_12m") == 0)
& (F.col("txn_current") > 0), F.lit("New")
)
.otherwise(F.lit("NA").cast(StringType())),
)
.select(cols)
.orderBy(F.to_date(F.col("txn_period"),"MMM yyyy").asc())
)
final_df.show()
periods:
+--------+
| period|
+--------+
|Jan 2022|
|Feb 2022|
|Mar 2022|
|Apr 2022|
|May 2022|
|Jun 2022|
|Jul 2022|
|Aug 2022|
|Sep 2022|
|Oct 2022|
|Nov 2022|
|Dec 2022|
+--------+
intermediate:
+-----------+------+-----------+--------+-----------+------------+------------------+
|customer_id|txn_id| txn_date| period|txn_current|txn_prev_12m|txn_prior_prev_12m|
+-----------+------+-----------+--------+-----------+------------+------------------+
| 1| 1|2019-JAN-10|Jan 2022| 0| 0| 1|
| 1| 1|2019-JAN-10|Feb 2022| 0| 0| 1|
| 1| 1|2019-JAN-10|Mar 2022| 0| 0| 1|
| 1| 1|2019-JAN-10|Apr 2022| 0| 0| 1|
| 1| 1|2019-JAN-10|May 2022| 0| 0| 1|
| 1| 1|2019-JAN-10|Jun 2022| 0| 0| 1|
| 1| 1|2019-JAN-10|Jul 2022| 0| 0| 1|
| 1| 1|2019-JAN-10|Aug 2022| 0| 0| 1|
| 1| 1|2019-JAN-10|Sep 2022| 0| 0| 1|
| 1| 1|2019-JAN-10|Oct 2022| 0| 0| 1|
| 1| 1|2019-JAN-10|Nov 2022| 0| 0| 1|
| 1| 1|2019-JAN-10|Dec 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|Jan 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|Feb 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|Mar 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|Apr 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|May 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|Jun 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|Jul 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|Aug 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|Sep 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|Oct 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|Nov 2022| 0| 0| 1|
| 1| 2|2019-DEC-15|Dec 2022| 0| 0| 1|
| 1| 3|2022-SEP-07|Jan 2022| 0| 0| 0|
| 1| 3|2022-SEP-07|Feb 2022| 0| 0| 0|
| 1| 3|2022-SEP-07|Mar 2022| 0| 0| 0|
| 1| 3|2022-SEP-07|Apr 2022| 0| 0| 0|
| 1| 3|2022-SEP-07|May 2022| 0| 0| 0|
| 1| 3|2022-SEP-07|Jun 2022| 0| 0| 0|
| 1| 3|2022-SEP-07|Jul 2022| 0| 0| 0|
| 1| 3|2022-SEP-07|Aug 2022| 0| 0| 0|
| 1| 3|2022-SEP-07|Sep 2022| 1| 0| 0|
| 1| 3|2022-SEP-07|Oct 2022| 1| 0| 0|
| 1| 3|2022-SEP-07|Nov 2022| 1| 0| 0|
| 1| 3|2022-SEP-07|Dec 2022| 1| 0| 0|
+-----------+------+-----------+--------+-----------+------------+------------------+
final:
+-----------+----------+-----------+------------+------------------+-----------+
|customer_id|txn_period|txn_current|txn_prev_12m|txn_prior_prev_12m| status|
+-----------+----------+-----------+------------+------------------+-----------+
| 1| Jan 2022| 0| 0| 2| NA|
| 1| Feb 2022| 0| 0| 2| NA|
| 1| Mar 2022| 0| 0| 2| NA|
| 1| Apr 2022| 0| 0| 2| NA|
| 1| May 2022| 0| 0| 2| NA|
| 1| Jun 2022| 0| 0| 2| NA|
| 1| Jul 2022| 0| 0| 2| NA|
| 1| Aug 2022| 0| 0| 2| NA|
| 1| Sep 2022| 1| 0| 2|Reactivated|
| 1| Oct 2022| 1| 0| 2|Reactivated|
| 1| Nov 2022| 1| 0| 2|Reactivated|
| 1| Dec 2022| 1| 0| 2|Reactivated|
+-----------+----------+-----------+------------+------------------+-----------+
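Side note: since only the per-customer totals matter for the status, the window + row_number
de-duplication in the final step could also be written as a plain groupBy aggregation over the
intermediate DataFrame (a sketch reusing the intermediate df and column names from above; the
status expression would then be applied to agg_df exactly as in final_df):

agg_df = (
    df
    .withColumnRenamed("period", "txn_period")
    .groupBy("customer_id", "txn_period")
    .agg(
        F.sum("txn_current").alias("txn_current"),
        F.sum("txn_prev_12m").alias("txn_prev_12m"),
        F.sum("txn_prior_prev_12m").alias("txn_prior_prev_12m"),
    )
)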