I'm running into problems when using Spark together with Livy. Specifically, I get exceptions whenever my jobs need imports from other modules:
If I don't use the pyFiles attribute, I get a "file not found" error. If I do use pyFiles, I get an "Access denied" error. The latter is expected, since the file isn't located where the system expects it to be.
Below are the command I run and the error I get:
$SPARK_HOME/bin/spark-submit --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" --deploy-mode client "$@"
/usr/bin/tini -s -- /opt/spark/bin/spark-submit --conf spark.driver.bindAddress=10.42.2.113 --deploy-mode client --properties-file /opt/spark/conf/spark.properties --class org.apache.spark.deploy.PythonRunner local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/provider/explode_provider_keyword_variations.py
Files: local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/
From: /app/soluce_spark_jobs/cassandra_clickhouse_transformation
To: /app/./cassandra_clickhouse_transformation
Error: Exception in thread "main" java.nio.file.AccessDeniedException: ./cassandra_clickhouse_transformation
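As far as I know, pyFiles accepts individual .py files or .zip/.egg archives rather than a bare directory, which may be why pointing it at the folder fails. One workaround I'm considering is zipping the package before submitting; below is a minimal sketch (the paths match my layout above, and zip_package is just an illustrative helper I wrote, not part of any library):

import os
import zipfile

def zip_package(package_dir: str, zip_path: str) -> None:
    """Zip package_dir so that imports like soluce_spark_jobs.core.* still
    resolve once the archive lands on the executors' sys.path."""
    root = os.path.dirname(package_dir.rstrip("/"))
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for dirpath, _, filenames in os.walk(package_dir):
            for name in filenames:
                if name.endswith(".py"):
                    full = os.path.join(dirpath, name)
                    # keep the top-level package folder in the archive path
                    zf.write(full, os.path.relpath(full, root))

zip_package("/app/soluce_spark_jobs", "/app/soluce_spark_jobs.zip")
# and then in the payload: "pyFiles": ["local:///app/soluce_spark_jobs.zip"]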
This is how I call Spark through Livy:
import json
import random
import string
import requests
from spark_client.core.settings import get_settings
app_settings = get_settings()

# Random suffix so every submitted job gets a unique name
length = random.randint(5, 15)
result = ''.join(random.choice(string.ascii_letters) for _ in range(length))
# Define the Spark job
payload = {
    "name": f"spark_{result}",
    "numExecutors": 2,
    "file": "local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/provider/explode_provider_keyword_variations.py",
    "pyFiles": [
        # earlier attempts, kept for reference:
        # "local:///app/soluce_spark_jobs/core/cassandra_table_loader.py",
        # "local:///app/soluce_spark_jobs/core/db_transformation_cluster_config.py",
        # "local:///app/soluce_spark_jobs/core/data_loader.py",
        # "local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/provider/loader/az_keyword_variations_loader.py",
        # "local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/provider/loader/az_keyword_variations_suffixes_loader.py",
        # "/home/somedocker@euro.adroot/PycharmProjects/soluce_spark_jobs",
        # "local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/provider/loader/*.py",
        # "local:///app/soluce_spark_jobs/core/*.py",
        "local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/"
    ],
    "conf": {
        "spark.yarn.dist.pyFiles": "/app/soluce_spark_jobs",
        "spark.submit.deployMode": "cluster",
        "spark.soluce_cassandra.keyspace": 'provider',
        "spark.soluce_cassandra.table_name": 'provider_related_keywords',
        "spark.soluce_cassandra.host": app_settings.CASSANDRA_STAGING_HOST,
        "spark.soluce_cassandra.username": app_settings.CASSANDRA_STAGING_USERNAME,
        # assumed setting name: the staging password, not a host, belongs here
        "spark.soluce_cassandra.password": app_settings.CASSANDRA_STAGING_PASSWORD,
        "spark.soluce_clickhouse.host": app_settings.CLICKHOUSE_STAGING_HOST,
        "spark.soluce_clickhouse.database": 'provider_new',
        "spark.soluce_clickhouse.username": app_settings.CLICKHOUSE_STAGING_USERNAME,
        "spark.soluce_clickhouse.password": app_settings.CLICKHOUSE_STAGING_PASSWORD,
        "spark.kubernetes.container.image": "somedocker/soluce_spark_livy_jobs:5099",
        "spark.kubernetes.driver.container.image": "somedocker/soluce_spark_livy_jobs:5099",
        "spark.kubernetes.namespace": "soluce-batch",
        "spark.kubernetes.authenticate.driver.serviceAccountName": "livy",
    }
}
headers = {
    "Content-Type": "application/json"
}
# Submit the job to Livy's batches endpoint
def submit_jobs():
    response = requests.post(app_settings.LIVY_URL, data=json.dumps(payload), headers=headers, verify=False)
    # Check the response
    if response.status_code == 201:
        print("Job submitted successfully!")
        print("Response:", response.json())
    else:
        print("Failed to submit the job. Status code:", response.status_code)
        print("Response:", response.text)

submit_jobs()
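After the POST I also poll the batch until it reaches a terminal state; a rough sketch against Livy's GET /batches/{id}/state endpoint (this assumes LIVY_URL is the .../batches URL used in the POST above, and wait_for_batch is my own helper):

import time

def wait_for_batch(batch_id: int, timeout: int = 600) -> str:
    """Poll GET /batches/{id}/state; batch_id is the "id" field
    of the JSON returned by the POST above."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        resp = requests.get(f"{app_settings.LIVY_URL}/{batch_id}/state", verify=False)
        state = resp.json()["state"]
        if state in ("success", "dead", "killed", "error"):
            return state
        time.sleep(5)
    return "timeout"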
The Job Definition
from pyspark.sql import SparkSession

from soluce_spark_jobs.core.cassandra_table_loader import CassandraTableLoader
from soluce_spark_jobs.core.db_transformation_cluster_config import DbTransformationConfig
from soluce_spark_jobs.cassandra_clickhouse_transformation.provider.loader.az_keyword_variations_loader import \
    AZ_ProductKeywordVariationsLoader
from soluce_spark_jobs.cassandra_clickhouse_transformation.provider.loader.az_keyword_variations_suffixes_loader import \
    AZ_ProductKeywordVariationsSuffixesLoader


def main():
    spark = SparkSession.builder \
        .appName("explode_az_keyword_variations") \
        .getOrCreate()