
I am reading Excel files using PySpark. All the DataFrames are stored inside a list. While merging all the DataFrames I am getting an out-of-memory error. The code looks like this:

from functools import reduce

import openpyxl


def union_spark_dfs(*dfs):
    # Fold the list of DataFrames into one DataFrame via pairwise unions
    return reduce(lambda df1, df2: df1.unionAll(df2), dfs)


def createExceldataFrame(path):
    """
    Accept the file path as input and return a DataFrame
    covering all sheets in the workbook.
    """
    # Use openpyxl only to enumerate the sheet names
    wb = openpyxl.load_workbook(f"/dbfs/mnt/raw/{path}")
    sheets = wb.sheetnames
    firstsheet = sheets[0]
    df = (
        spark.read.format("com.crealytics.spark.excel")
        .option("dataAddress", f"{firstsheet}!A11:Y36")
        .schema(schema)  # `schema` is defined elsewhere in the notebook
        .option("header", "true")
        .load(f"dbfs:/mnt/raw/{path}")
    )
    # Read each remaining sheet and union it onto the running DataFrame
    for sheet in sheets[1:]:
        sheetdf = (
            spark.read.format("com.crealytics.spark.excel")
            .option("dataAddress", f"{sheet}!A11:Y36")
            .schema(schema)
            .option("header", "true")
            .load(f"dbfs:/mnt/raw/{path}")
        )
        df = df.unionAll(sheetdf)
    return df

latest_files = [
    "file1.xlsx",
    "file2.xlsx",
    "file3.xlsx",
    "file4.xlsx",
    "file5.xlsx",
    "file6.xlsx",
]

df_list = [createExceldataFrame(file) for file in latest_files]

final_df = union_spark_dfs(*df_list)
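
For reference, my understanding is that the reduce above simply chains the unionAll calls, so final_df accumulates one union step per file (and per sheet inside each file) in its logical plan. Expanded for the six files above, it is equivalent to:

# Equivalent to union_spark_dfs(*df_list); each unionAll only adds
# another node to the logical plan, no data is moved until an action runs
final_df = (
    df_list[0]
    .unionAll(df_list[1])
    .unionAll(df_list[2])
    .unionAll(df_list[3])
    .unionAll(df_list[4])
    .unionAll(df_list[5])
)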

So is the union operation performed on the driver node? What is the issue that is causing the driver to stop?

