I'm running into problems when using Spark together with Livy. Specifically, I get exceptions whenever my jobs need imports from other modules:
If I don't use the pyFiles attribute, I get a "file not found" error. If I do use pyFiles, I get an "Access denied" error. The latter is expected, since the file isn't located where the system expects it to be.
Below are the command I run and the error I get:
$SPARK_HOME/bin/spark-submit --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" --deploy-mode client "$@"
/usr/bin/tini -s -- /opt/spark/bin/spark-submit --conf spark.driver.bindAddress=10.42.2.113 --deploy-mode client --properties-file /opt/spark/conf/spark.properties --class org.apache.spark.deploy.PythonRunner local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/provider/explode_provider_keyword_variations.py
Files: local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/
From: /app/soluce_spark_jobs/cassandra_clickhouse_transformation
To: /app/./cassandra_clickhouse_transformation
Error: Exception in thread "main" java.nio.file.AccessDeniedException: ./cassandra_clickhouse_transformation
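As far as I know, pyFiles accepts individual .py files or .zip/.egg archives rather than a bare directory, which may be why pointing it at the folder fails. One workaround I'm considering is zipping the package before submitting; below is a minimal sketch (the paths match my layout above, and zip_package is just an illustrative helper I wrote, not part of any library):

import os
import zipfile

def zip_package(package_dir: str, zip_path: str) -> None:
    """Zip package_dir so that imports like soluce_spark_jobs.core.* still
    resolve once the archive lands on the executors' sys.path."""
    root = os.path.dirname(package_dir.rstrip("/"))
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for dirpath, _, filenames in os.walk(package_dir):
            for name in filenames:
                if name.endswith(".py"):
                    full = os.path.join(dirpath, name)
                    # keep the top-level package folder in the archive path
                    zf.write(full, os.path.relpath(full, root))

zip_package("/app/soluce_spark_jobs", "/app/soluce_spark_jobs.zip")
# and then in the payload: "pyFiles": ["local:///app/soluce_spark_jobs.zip"]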
This is how I call Spark through Livy:
import json
import random
import string
import requests
from spark_client.core.settings import get_settings
app_settings = get_settings()

# Random suffix so every submitted job gets a unique name
length = random.randint(5, 15)
result = ''.join(random.choice(string.ascii_letters) for _ in range(length))
# Define the Spark job
payload = {
    "name": f"spark_{result}",
    "numExecutors": 2,
    "file": "local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/provider/explode_provider_keyword_variations.py",
    "pyFiles": [
        # earlier attempts, kept for reference:
        # "local:///app/soluce_spark_jobs/core/cassandra_table_loader.py",
        # "local:///app/soluce_spark_jobs/core/db_transformation_cluster_config.py",
        # "local:///app/soluce_spark_jobs/core/data_loader.py",
        # "local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/provider/loader/az_keyword_variations_loader.py",
        # "local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/provider/loader/az_keyword_variations_suffixes_loader.py",
        # "/home/somedocker@euro.adroot/PycharmProjects/soluce_spark_jobs",
        # "local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/provider/loader/*.py",
        # "local:///app/soluce_spark_jobs/core/*.py",
        "local:///app/soluce_spark_jobs/cassandra_clickhouse_transformation/"
    ],
    "conf": {
        "spark.yarn.dist.pyFiles": "/app/soluce_spark_jobs",
        "spark.submit.deployMode": "cluster",
        "spark.soluce_cassandra.keyspace": 'provider',
        "spark.soluce_cassandra.table_name": 'provider_related_keywords',
        "spark.soluce_cassandra.host": app_settings.CASSANDRA_STAGING_HOST,
        "spark.soluce_cassandra.username": app_settings.CASSANDRA_STAGING_USERNAME,
        # assumed setting name: the staging password, not a host, belongs here
        "spark.soluce_cassandra.password": app_settings.CASSANDRA_STAGING_PASSWORD,
        "spark.soluce_clickhouse.host": app_settings.CLICKHOUSE_STAGING_HOST,
        "spark.soluce_clickhouse.database": 'provider_new',
        "spark.soluce_clickhouse.username": app_settings.CLICKHOUSE_STAGING_USERNAME,
        "spark.soluce_clickhouse.password": app_settings.CLICKHOUSE_STAGING_PASSWORD,
        "spark.kubernetes.container.image": "somedocker/soluce_spark_livy_jobs:5099",
        "spark.kubernetes.driver.container.image": "somedocker/soluce_spark_livy_jobs:5099",
        "spark.kubernetes.namespace": "soluce-batch",
        "spark.kubernetes.authenticate.driver.serviceAccountName": "livy",
    }
}
headers = {
    "Content-Type": "application/json"
}
# Submit the job to Livy's batches endpoint
def submit_jobs():
    response = requests.post(app_settings.LIVY_URL, data=json.dumps(payload), headers=headers, verify=False)
    # Check the response
    if response.status_code == 201:
        print("Job submitted successfully!")
        print("Response:", response.json())
    else:
        print("Failed to submit the job. Status code:", response.status_code)
        print("Response:", response.text)

submit_jobs()
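After the POST I also poll the batch until it reaches a terminal state; a rough sketch against Livy's GET /batches/{id}/state endpoint (this assumes LIVY_URL is the .../batches URL used in the POST above, and wait_for_batch is my own helper):

import time

def wait_for_batch(batch_id: int, timeout: int = 600) -> str:
    """Poll GET /batches/{id}/state; batch_id is the "id" field
    of the JSON returned by the POST above."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        resp = requests.get(f"{app_settings.LIVY_URL}/{batch_id}/state", verify=False)
        state = resp.json()["state"]
        if state in ("success", "dead", "killed", "error"):
            return state
        time.sleep(5)
    return "timeout"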
The Job Definition
from pyspark.sql import SparkSession

from soluce_spark_jobs.core.cassandra_table_loader import CassandraTableLoader
from soluce_spark_jobs.core.db_transformation_cluster_config import DbTransformationConfig
from soluce_spark_jobs.cassandra_clickhouse_transformation.provider.loader.az_keyword_variations_loader import \
    AZ_ProductKeywordVariationsLoader
from soluce_spark_jobs.cassandra_clickhouse_transformation.provider.loader.az_keyword_variations_suffixes_loader import \
    AZ_ProductKeywordVariationsSuffixesLoader


def main():
    spark = SparkSession.builder \
        .appName("explode_az_keyword_variations") \
        .getOrCreate()