I have data split into multiple files. I want to load and join the files. I'd like to build a dynamic function that 1. will join n data files into a single data frame 2. given the input of file location and join column (e.g., pk)
I think this can be done with foldLeft, but I am not quite sure how:
Here is my code so far:
@throws
def dataJoin(path:String, fileNames:String*): DataFrame=
{
try
{
val dfList:ArrayBuffer[DataFrame]=new ArrayBuffer
for(fileName <- fileNames)
{
val df:DataFrame=DataFrameUtils.openFile(spark, s"$path${File.separator}$fileName")
dfList += df
}
dfList.foldLeft
{
(df,df1) => joinDataFrames(df,df1, "UID")
}
}
catch
{
case e:Exception => throw new Exception(e)
}
}
def joinDataFrames(df:DataFrame,df1:DataFrame, joinColum:String): Unit =
{
df.join(df1, Seq(joinColum))
}