I am new to PySpark. My job builds up a chain of dataframes, several of which are intermediate results that are never referenced again later in the code. How should I handle (release) them? I am hitting "GC overhead limit exceeded" and other memory errors. Any help is appreciated.
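For context, the only cleanup mechanisms I have come across so far are unpersist() and checkpointing. Here is a minimal, self-contained sketch of what I mean (the dataframes are placeholders, not from my actual job), though I am not sure either applies when nothing is explicitly cached:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Placeholder data, not from my job.
df_intermediate = spark.range(1000).withColumnRenamed('id', 'value')

df_intermediate = df_intermediate.cache()         # only helps if the result is reused
result = df_intermediate.groupBy().sum('value')   # downstream work that reuses it
result.collect()
df_intermediate.unpersist()                       # release the cached blocks
del df_intermediate                               # drop the Python reference too

# Alternative: truncate the lineage with a checkpoint.
spark.sparkContext.setCheckpointDir('/tmp/spark-checkpoints')
df_checkpointed = spark.range(1000).checkpoint()  # materialises the data and cuts the query plan

My actual pipeline is below; note that nothing in it is ever cached: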
from pyspark.sql import functions as func
from pyspark.sql.functions import when, lit

# Columns that identify a node; reused by every aggregation below.
group_cols = ['circle', 'vendor', 'zone', 'nss_id', 'category',
              'sub_category', 'node_model', 'node_name']
metric_cols = ['temperature_event_count', 'temperature_weight',
               'power_event_count', 'power_weight',
               'hardware_event_count', 'hardware_weight',
               'other_event_count', 'other_weight',
               'housekeeping_event_count', 'housekeeping_weight']

# Two-stage aggregation: collapse to one row per (node, OAF), then count
# those rows to get the distinct card (OAF) count per node.
df_total_ch = df_agg.groupby(group_cols + ['OAF']).agg(func.count('OAF').alias('total_Card_count'))
df_total_ch = df_total_ch.groupby(group_cols).agg(func.count('*').alias('total_Card_count'))

# Attach the card count back onto the detail rows.
df_new = df_agg.join(df_total_ch, on=group_cols, how='left_outer')

base_df = df_new.select('eventtime', *group_cols, 'OAF', *metric_cols)

# Sum every event count / weight per (eventtime, node, OAF).
base_df_final = base_df.groupby('eventtime', *group_cols, 'OAF').agg(
    *[func.sum(c).alias(c) for c in metric_cols])

# Yes/No flag per category, depending on whether any events were counted.
base_df_final = base_df_final.withColumn('hardware_flag', when(func.col('hardware_event_count') > 0, lit('Yes')).otherwise(lit('No')))
base_df_final = base_df_final.withColumn('power_flag', when(func.col('power_event_count') > 0, lit('Yes')).otherwise(lit('No')))
base_df_final = base_df_final.withColumn('temperature_flag', when(func.col('temperature_event_count') > 0, lit('Yes')).otherwise(lit('No')))
base_df_final = base_df_final.withColumn('others_flag', when(func.col('other_event_count') > 0, lit('Yes')).otherwise(lit('No')))
base_df_final = base_df_final.withColumn('housekeeping_flag', when(func.col('housekeeping_event_count') > 0, lit('Yes')).otherwise(lit('No')))

base_df_final.write.mode("append").saveAsTable("DFT.TBL_TX")
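Since df_total_ch, df_new and base_df are never referenced again once base_df_final exists, my naive idea was to drop them before the write, for example:

del df_total_ch, df_new, base_df  # drop the Python references; nothing was cached, so unpersist() seems moot

but given Spark's lazy evaluation I doubt this actually frees anything on the driver or executors, which is why I am asking.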
When the final write runs, the job fails with the error below:
base_df_final.write.mode("append").saveAsTable("DFT.TBL_TX")
File "/opt/cloudera/parcels/CDH-6.3.3-1.cdh6.3.3.p0.1796617/lib/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 775, in saveAsTable
File "/opt/cloudera/parcels/CDH-6.3.3-1.cdh6.3.3.p0.1796617/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/opt/cloudera/parcels/CDH-6.3.3-1.cdh6.3.3.p0.1796617/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/opt/cloudera/parcels/CDH-6.3.3-1.cdh6.3.3.p0.1796617/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o482.saveAsTable.
: org.spark_project.guava.util.concurrent.ExecutionError: java.lang.OutOfMemoryError: GC overhead limit exceeded
at org.spark_project.guava.cache.LocalCache$Segment.get(LocalCache.java:2261)
at org.spark_project.guava.cache.LocalCache.get(LocalCache.java:4000)
at org.spark_project.guava.cache.LocalCache$LocalManualCache.get(LocalCache.java:4789)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.getCachedPlan(SessionCatalog.scala:141)