You can achieve this by using `lead` on the `mode` column and checking for changes from 'x' --> 'y' and vice versa -
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from functools import reduce
import pyspark.sql.functions as F
from pyspark.sql import Window

# SQLContext is deprecated since Spark 2.0; SparkSession is the unified
# entry point for DataFrame creation.
spark = SparkSession.builder.getOrCreate()

# Sample (Driver_ID, Timestamp, mode) events for a single driver.
input_list = [('A', 1000, 'x'),
              ('A', 1010, 'x'),
              ('A', 1020, 'x'),
              ('A', 1023, 'y'),
              ('A', 1033, 'y'),
              ('A', 1037, 'x'),
              ('A', 1047, 'x'),
              ]
sparkDF = spark.createDataFrame(input_list, ['Driver_ID', 'Timestamp', 'mode'])

# Events ordered in time, per driver.
window = Window.partitionBy('Driver_ID').orderBy('Timestamp')

# Next row's mode, so each row can see whether the mode is about to change
# (null on the last row of each partition).
sparkDF = sparkDF.withColumn('mode_lead', F.lead(F.col('mode')).over(window))

### Condition to identify a mode change from 'x' --> 'y'
x_change_cond = ((F.col('mode') == 'x')
                 & (F.col('mode') != F.col('mode_lead'))
                 )

### Condition to identify a mode change from 'y' --> 'x'
### Equality is used here because the lag timestamp is required when the
### mode changes from y to x.
y_change_cond = ((F.col('mode') == 'y')
                 & (F.col('mode') == F.col('mode_lead'))
                 )

# Gap to the next event (null on the last row of each partition).
sparkDF = sparkDF.withColumn('timestamp_diff_lead',
                             F.lead(F.col('Timestamp')).over(window) - F.col('Timestamp'))

# Running total of gaps up to the x->y change row, 0 elsewhere.
# NOTE: an ordered window's default frame runs from unboundedPreceding to
# the current row, so this sums every gap seen so far.
sparkDF = sparkDF.withColumn('timestamp_sum_x_change',
                             F.when(x_change_cond,
                                    F.sum(F.col('timestamp_diff_lead')).over(window)
                                    ).otherwise(0)
                             )

# Only the current row's gap on 'y' rows whose successor is also 'y', 0
# elsewhere: rowsBetween(0, Window.currentRow) restricts the frame to the
# current row itself.
sparkDF = sparkDF.withColumn('timestamp_sum_y_change',
                             F.when(y_change_cond,
                                    F.sum(F.col('timestamp_diff_lead')).over(window.rowsBetween(0, Window.currentRow))
                                    ).otherwise(0))

cols = ['Timestamp', 'mode', 'mode_lead', 'timestamp_diff_lead',
        'timestamp_sum_x_change', 'timestamp_sum_y_change']
sparkDF.select(*cols).show(truncate=False)
+---------+----+---------+-------------------+----------------------+----------------------+
|Timestamp|mode|mode_lead|timestamp_diff_lead|timestamp_sum_x_change|timestamp_sum_y_change|
+---------+----+---------+-------------------+----------------------+----------------------+
|1000 |x |x |10 |0 |0 |
|1010 |x |x |10 |0 |0 |
|1020 |x |y |3 |23 |0 |
|1023 |y |y |10 |0 |10 |
|1033 |y |x |4 |0 |0 |
|1037 |x |x |10 |0 |0 |
|1047 |x |null |null |0 |0 |
+---------+----+---------+-------------------+----------------------+----------------------+
Using the above result set, you can filter for the rows matching either (`|`) of the change conditions -
# A row is a "change row" when either accumulated-time column is populated.
x_changed = F.col('timestamp_sum_x_change') != 0
y_changed = F.col('timestamp_sum_y_change') != 0
filter_cond = x_changed | y_changed
sparkDF.filter(filter_cond).select(*cols).show()
+---------+----+---------+-------------------+----------------------+----------------------+
|Timestamp|mode|mode_lead|timestamp_diff_lead|timestamp_sum_x_change|timestamp_sum_y_change|
+---------+----+---------+-------------------+----------------------+----------------------+
| 1020| x| y| 3| 23| 0|
| 1023| y| y| 10| 0| 10|
+---------+----+---------+-------------------+----------------------+----------------------+
Finally, you can aggregate the filtered result set to generate the output -
# Aggregate the change rows per driver: count of activations plus the
# accumulated times for each direction of change.
sparkDF.filter(filter_cond).groupby('Driver_ID').agg(
    F.count(F.col('Driver_ID')).alias('activation(x)'),
    F.max(F.col('timestamp_sum_x_change')).alias('time1'),
    F.max(F.col('timestamp_sum_y_change')).alias('time2'),
).show()
+---------+-------------+-----+-----+
|Driver_ID|activation(x)|time1|time2|
+---------+-------------+-----+-----+
| A| 2| 23| 10|
+---------+-------------+-----+-----+