You can use a `udf` function to generate the `grp` column:
/**
 * UDF that builds the `grp` label for a row: the digit 2 when `t1 - t0` is greater
 * than 50, otherwise the digit 1, immediately followed by the value of `col1`.
 */
def testUdf = udf { (col1: String, t0: Int, t1: Int) =>
  val prefix = if (t1 - t0 > 50) 2 else 1
  s"$prefix$col1"
}
Call the `udf` function as follows:
// Add a "grp" column computed by applying testUdf to Col1, t0 and t1 of every row.
// NOTE: withColumn returns a new DataFrame; assign the result to keep the column.
df.withColumn("grp", testUdf($"Col1", $"t0", $"t1"))
The `udf` function above won't work properly when `t0` contains `null` values, so replace them with `0` first:
// Replace null values in the numeric columns with 0.
// NOTE: this returns a new DataFrame; assign the result to keep the filled values.
df.na.fill(0)
I hope this is the answer you are searching for.
Edited
Here's the complete solution using a `udaf`. The process is more complex. You've already got an easy answer, but this might help somebody else who needs it.
First, define the `udaf`:
/**
 * Aggregates the rows of one group into a single encoded string.
 *
 * Every input row whose `Col1` is not null contributes one entry of the form
 * `<rank>;<value><col1>`; entries are joined with `::`. `value` is 1 when
 * `t1 - t0 < 50`, otherwise `(t1 - t0) / 50`, and it never drops below the largest
 * value already recorded in the same buffer.
 *
 * NOTE(review): the running maximum depends on the order in which rows reach
 * `update`; presumably callers feed rows ordered by `t1` — confirm at the call site.
 */
class Boendal extends UserDefinedAggregateFunction {

  /** Input columns, in order: Col1 (string), then t0, t1 and rank (integers). */
  def inputSchema: StructType = new StructType()
    .add("Col1", StringType)
    .add("t0", IntegerType)
    .add("t1", IntegerType)
    .add("rank", IntegerType)

  /** Slot 0 holds the encoded entries so far, slot 1 the largest value seen so far. */
  def bufferSchema: StructType = new StructType()
    .add("buff", StringType)
    .add("buffer1", IntegerType)

  /** The aggregate result is the encoded string. */
  def dataType = StringType

  def deterministic: Boolean = true

  /** Starts with no entries and a running maximum of 0. */
  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer.update(0, "")
    buffer.update(1, 0)
  }

  /** Appends the entry for one input row; rows with a null Col1 are ignored. */
  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      val encoded = buffer.getString(0)
      val col1 = input.getString(0)
      val t0 = input.getInt(1)
      val t1 = input.getInt(2)
      val rank = input.getInt(3)

      val gap = t1 - t0
      val bucket = if (gap < 50) 1 else gap / 50
      // The value may only grow within a buffer: clamp to the previous maximum.
      val value = math.max(bucket, buffer.getInt(1))
      buffer.update(1, value)

      val entry = s"$rank;$value$col1"
      buffer.update(0, if (encoded.isEmpty) entry else s"$encoded::$entry")
    }
  }

  /**
   * Combines two partial buffers.
   *
   * The encoded strings are joined with the same `::` separator that `update` uses,
   * so entries from different partial aggregates are not fused together, and the
   * larger running maximum is kept. Entries already encoded in `buffer2` are not
   * re-clamped against `buffer1`'s maximum.
   */
  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val left = buffer1.getString(0)
    val right = buffer2.getString(0)
    val merged =
      if (left.isEmpty) right
      else if (right.isEmpty) left
      else s"$left::$right"
    buffer1.update(0, merged)
    buffer1.update(1, math.max(buffer1.getInt(1), buffer2.getInt(1)))
  }

  /** Returns the encoded entries accumulated for the group. */
  def evaluate(buffer: Row): String = buffer.getString(0)
}
Then define some helper `udf`s:
/** UDF returning the part of an encoded entry that precedes the first ";" (the rank). */
def rankUdf = udf { (grp: String) =>
  val fields = grp.split(";")
  fields(0)
}
/**
 * UDF returning everything after the first ";" of an encoded entry, i.e. the entry
 * with its leading rank removed.
 *
 * The split is limited to two parts so that a payload which itself contains ";" is
 * returned whole instead of being cut at the second ";", and an entry that ends in
 * ";" yields an empty string rather than failing.
 */
def removeRankUdf = udf((grp: String) => grp.split(";", 2)(1))
Finally, call the `udaf` and the `udf`s:
// Rows are windowed per Col1 value and ordered by t1 inside each window.
val windowSpec = Window.partitionBy($"Col1").orderBy($"t1")

// t0 is the t1 of the previous row in the window; rank is the row's rank in the window.
df = df
  .withColumn("t0", lag("t1", 1).over(windowSpec))
  .withColumn("rank", rank().over(windowSpec))

// lag leaves null where there is no previous row; replace nulls with 0.
df = df.na.fill(0)

val boendal = new Boendal()

// Aggregate every Col1 group into one encoded string, then explode it back into one
// row per entry and separate the rank from the rest of the entry.
val df2 = df
  .groupBy("Col1")
  .agg(boendal($"Col1", $"t0", $"t1", $"rank").as("grp2"))
  .withColumnRenamed("Col1", "Col2")
  .withColumn("grp2", explode(split($"grp2", "::")))
  .withColumn("rank2", rankUdf($"grp2"))
  .withColumn("grp2", removeRankUdf($"grp2"))

// Attach each entry to its source row via (Col1, rank), then drop the helper columns.
df = df
  .join(df2, df("Col1") === df2("Col2") && df("rank") === df2("rank2"))
  .drop("Col2", "rank", "rank2")

df.show(false)
Hope it helps