I am reading Excel files using PySpark. All the resulting DataFrames are stored in a list, and while merging them into a single DataFrame I get an out-of-memory error. The code looks like this:
from functools import reduce

import openpyxl

def union_spark_dfs(*dfs):
    # Fold all the DataFrames pairwise into a single union.
    return reduce(lambda df1, df2: df1.unionAll(df2), dfs)
def createExceldataFrame(path):
    """
    This function accepts the file path as input and returns one
    DataFrame containing the same cell range read from every sheet.
    """
    # openpyxl is only used here to list the sheet names.
    wb = openpyxl.load_workbook(f"/dbfs/mnt/raw/{path}")
    sheets = wb.sheetnames
    firstsheet = sheets[0]
    df = (
        spark.read.format("com.crealytics.spark.excel")
        .option("dataAddress", f"{firstsheet}!A11:Y36")
        .schema(schema)
        .option("header", "true")
        .load(f"dbfs:/mnt/raw/{path}")
    )
    # Read the remaining sheets and union them onto the first one.
    for sheet in sheets[1:]:
        sheetdf = (
            spark.read.format("com.crealytics.spark.excel")
            .option("dataAddress", f"{sheet}!A11:Y36")
            .schema(schema)
            .option("header", "true")
            .load(f"dbfs:/mnt/raw/{path}")
        )
        df = df.unionAll(sheetdf)
    return df
latest_files = [
    'file1.xlsx',
    'file2.xlsx',
    'file3.xlsx',
    'file4.xlsx',
    'file5.xlsx',
    'file6.xlsx',
]
df_list = [createExceldataFrame(file) for file in latest_files]
final_df = union_spark_dfs(*df_list)
So, is the union operation performed on the driver node? And what exactly is the issue that is causing the driver to stop?
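In case it is useful, here is a minimal sketch (using only the variables defined above and standard DataFrame methods) of how I can inspect what Spark builds for the chained unions:

# Print the logical and physical plan for the combined DataFrame.
# The plan is assembled on the driver; the scans and unions themselves
# only run on the executors once an action is triggered.
final_df.explain(extended=True)

# Partition count after the unions (the sum of the inputs' partitions).
print(final_df.rdd.getNumPartitions())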