I'm creating a data pipeline in Azure Synapse.
Basic flow:
grab some CSV files of 837 EDI data. Put those data files on Azure Data Lake (Gen2). Foreach file put data into tabular database table format in Spark DB, named claims. See my flow
My issue: long runtimes. It seems like each file has to create a new Spark session and the overhead is too much (3 min each). I want to "declare" a session via appName and use that throughout. I have 3 test files with 10 rows in one, 2 rows in another, 10 rows in a third. Total time for 22 rows 12 minutes.
In my flow you can see the Foreach loops each have 2 activities (one is a notebook, the other a sproc) based on whether the it's an 837i or 837p.
My notebook code:
'''python
import re
from pyspark.sql.functions import desc, row_number, monotonically_increasing_id
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
# create Spark session with necessary configuration
spark = (SparkSession
.builder
.appName("837App")
.config("spark.network.timeout", "600s")
.config("spark.executor.heartbeatInterval", "10s")
.getOrCreate());
# prepping the variables for the source FileName
srcPath = "abfss://folder@server.dfs.core.windows.net";
srcFQFN = f"{srcPath}/{srcFilesDirectory}/{srcFileName}";
dstTableName = "raw_837i";
# read Flat file into a data frame
df = spark.read.load(f"{srcFQFN}",
format = 'csv',
delimiter = f"{srcFileDelimeter}",
header = True
);
# add autoid
adf = df.withColumn('AutoID', row_number().over(Window.orderBy(monotonically_increasing_id())));
# clean up column names
adf = adf.toDF(*(re.sub(r"[\.\s\(\)\-]+", "_", c) for c in adf.columns));
# now the Spark database side...
# create the destination database if it did not exist
spark.sql(f"CREATE DATABASE IF NOT EXISTS {sparkDbName}");
# write that dataframe to a Spark table. update mode from overwrite to append if we just want to insert
adf.write.mode("overwrite").saveAsTable(f"{sparkDbName}.{dstTableName}");
Thanks @Sequinex and @Bendemann
What I've tried:
Added a notebook at the beginning of the pipeline to set the session; see Set 837 env in my flow. The intent is that if a session with that appName doesn't exist it will create it, then use it later. That way I spend the 3 minutes spin up time at the front of the pipeline instead of each file.
'''python
from pyspark.sql import SparkSession
# create Spark session with necessary configuration
spark = (SparkSession
.builder
.appName("837App")
.config("spark.network.timeout", "600s")
.config("spark.executor.heartbeatInterval", "10s")
.getOrCreate());
sc = spark.sparkContext;
I cannot prove that it's actually using this appName (so if someone could help with that too).
I've tried:
'''python
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("837App").getOrCreate()
sc
Results:
<SparkContext master=yarn appName=Synapse_sparkPrimary_1618935780>'
Shouldn't appName=837App?
I've also tried to stop the existing session and start mine
'''python
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
sc.stop()
spark = SparkSession.builder.appName("837App").getOrCreate()
sc
But I get the following errors:
Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.IllegalStateException: Promise already completed.
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:157)
at scala.concurrent.Promise$class.success(Promise.scala:86)
at scala.concurrent.impl.Promise$DefaultPromise.success(Promise.scala:157)
at org.apache.spark.deploy.yarn.ApplicationMaster.org$apache$spark$deploy$yarn$ApplicationMaster$$sparkContextInitialized(ApplicationMaster.scala:392)
at org.apache.spark.deploy.yarn.ApplicationMaster$.sparkContextInitialized(ApplicationMaster.scala:808)
at org.apache.spark.scheduler.cluster.YarnClusterScheduler.postStartHook(YarnClusterScheduler.scala:32)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:566)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Traceback (most recent call last):
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py", line 173, in getOrCreate
sc = SparkContext.getOrCreate(sparkConf)
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 367, in getOrCreate
SparkContext(conf=conf or SparkConf())
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 136, in __init__
conf, jsc, profiler_cls)
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 198, in _do_init
self._jsc = jsc or self._initialize_context(self._conf._jconf)
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 306, in _initialize_context
return self._jvm.JavaSparkContext(jconf)
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1525, in __call__
answer, self._gateway_client, None, self._fqn)
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 69, in deco
return f(*a, **kw)
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.IllegalStateException: Promise already completed.
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:157)
at scala.concurrent.Promise$class.success(Promise.scala:86)
at scala.concurrent.impl.Promise$DefaultPromise.success(Promise.scala:157)
at org.apache.spark.deploy.yarn.ApplicationMaster.org$apache$spark$deploy$yarn$ApplicationMaster$$sparkContextInitialized(ApplicationMaster.scala:392)
at org.apache.spark.deploy.yarn.ApplicationMaster$.sparkContextInitialized(ApplicationMaster.scala:808)
at org.apache.spark.scheduler.cluster.YarnClusterScheduler.postStartHook(YarnClusterScheduler.scala:32)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:566)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
TIA