I am submitting a PySpark job with all of its modules packaged in a zip file, like so:
$SPARK_HOME/bin/spark-submit \
--master local[*] \
--deploy-mode client \
--name spark-python \
--conf spark.driver.memory=4g \
--files "https://raw.githubusercontent.com/config.yml" \
--py-files "https://github.com/jobs.zip?raw=true" \
"https://raw.githubusercontent.com/main.py"
But the job throws the following exception:
Traceback (most recent call last):
File "/tmp/spark-637f7cfd-ff09-4784-9aa6-110779426bd4/main_kubernetes_new.py", line 62, in <module>
main()
File "/tmp/spark-637f7cfd-ff09-4784-9aa6-110779426bd4/main_kubernetes_new.py", line 27, in main
spark = SparkSession.builder.getOrCreate()
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py", line 228, in getOrCreate
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 384, in getOrCreate
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 146, in __init__
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 209, in _do_init
File "/opt/spark/python/lib/pyspark.zip/pyspark/context.py", line 321, in _initialize_context
File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1568, in __call__
File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.io.FileNotFoundException: https://github.com/LorenzWackenhut/py-files-public/blob/main/jobs.zip%3Fraw=true
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1896)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.getInputStream(HttpsURLConnectionImpl.java:268)
at org.apache.spark.util.Utils$.doFetchFile(Utils.scala:765)
at org.apache.spark.util.Utils$.fetchFile(Utils.scala:541)
at org.apache.spark.SparkContext.addFile(SparkContext.scala:1633)
at org.apache.spark.SparkContext.$anonfun$new$13(SparkContext.scala:508)
at org.apache.spark.SparkContext.$anonfun$new$13$adapted(SparkContext.scala:508)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:508)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Is the way I supply the .zip file the problem, or is there something else I have overlooked? Thank you in advance!