- You can try to increase the instance type or the number of worker nodes, if possible.
ThreadPoolExecutor runs each of your workers in a separate thread within the main process. You can find sample code here.
Code snippet :
def writeS3(curr_date):
    """Write one day's slice of the module-level ``data_df`` to S3 as parquet.

    Relies on names defined elsewhere in the notebook/module:
    ``data_df`` (a Spark DataFrame with an ``LD_TS`` timestamp column) and
    ``f`` (presumably ``pyspark.sql.functions`` — confirm against imports).

    Args:
        curr_date: Date string in ``yyyy-MM-dd`` form selecting the slice.

    Returns:
        The ``curr_date`` that was written, so callers can track completion
        via ``future.result()``.
    """
    print(f"Starting S3 write for date - {curr_date}")
    # Derive a yyyy-MM-dd string column from the LD_TS timestamp so we can
    # filter one day's worth of rows.
    data_df1 = data_df.withColumn(
        "date1", f.from_unixtime(f.unix_timestamp(data_df.LD_TS), "yyyy-MM-dd")
    )
    # NOTE: removed display(data_df1) — it is Databricks-notebook-only and
    # forces materialization of the full DataFrame inside every worker thread.
    save_df = data_df1.filter(f"date1='{curr_date}'").drop('date1')
    # BUG FIX: the original wrote every date to the same prefix
    # (s3://location), so 30 concurrent writers clobbered one another.
    # Partition the output path by date and make the write idempotent.
    save_df.write.mode("overwrite").parquet(f"s3://location/{curr_date}")
    return curr_date
# Fan out one S3-write job per day across a thread pool. Threads are the
# right tool here: the work is I/O-bound (Spark job submission + S3 writes),
# so the GIL is not a bottleneck.
jobs = []
results_done = []
total_days = 30
# One worker per day; +1 kept from the original sizing.
with futures.ThreadPoolExecutor(max_workers=total_days + 1) as e:
    print(f"{raw_bucket}/{db}/{table}/")
    for curr_date in date_range:
        print(f"Starting S3 write for date - {curr_date}")
        jobs.append(e.submit(writeS3, curr_date))
    # BUG FIX: the original never called .result(), so any exception raised
    # inside a worker was silently swallowed and results_done stayed empty.
    # as_completed yields each future as it finishes; .result() re-raises
    # any worker exception here in the main thread.
    for job in futures.as_completed(jobs):
        result_done = job.result()
        results_done.append(result_done)
        print(f"Job Completed - {result_done}")
print("Task complete")