Hello fellow Stack Overflowers,
I have been trying to use the DataprocInstantiateInlineWorkflowTemplateOperator to run a PySpark job. Sadly, after following the documentation, I am getting this error in Composer:
ValueError: Protocol message OrderedJob has no "stepID" field.
Here is the entire Python code; the template that I am using is embedded as JSON_CONTENT.
import json
from datetime import datetime, timedelta

from airflow import DAG
from airflow.utils.trigger_rule import TriggerRule
from airflow.providers.google.cloud.operators.dataproc import DataprocInstantiateInlineWorkflowTemplateOperator
from airflow.operators.dummy import DummyOperator

DAG_ID = 'Dataproc_Instantiate_Inline_Workflow_TemplateOper_example'
JSON_CONTENT = """{
    "id": "my-workflow-template",
    "jobs": [
        {
            "stepID": "123456dfgy",
            "pysparkJob": {
                "mainPythonFileUri": "gs://my-bucket/app.py"
            }
        }
    ],
    "name": "My Workflow Template",
    "placement": {
        "managedCluster": {
            "clusterName": "my-managed-cluster",
            "config": {
                "master_config": {
                    "disk_config": {
                        "boot_disk_size_gb": 1024,
                        "boot_disk_type": "pd-standard"
                    },
                    "machine_type_uri": "n1-standard-4",
                    "num_instances": 1
                },
                "worker_config": {
                    "disk_config": {
                        "boot_disk_size_gb": 1024,
                        "boot_disk_type": "pd-standard"
                    },
                    "machine_type_uri": "n1-standard-4",
                    "num_instances": 2
                }
            }
        }
    }
}"""
template_dict = json.loads(JSON_CONTENT)
default_args = {
    'start_date': datetime(2023, 6, 29),
    'retries': 1,
    'retry_delay': timedelta(minutes=2),
}

dag = DAG(
    dag_id=DAG_ID,
    default_args=default_args,
    schedule_interval=None,
)
start = DummyOperator(
    task_id='start',
    dag=dag,
)

create_dataproc_template = DataprocInstantiateInlineWorkflowTemplateOperator(
    template=template_dict,
    task_id='create_dataproc_template',
    project_id='my-project',
    region='us-central1',
    gcp_conn_id='google_cloud_default',
    dag=dag,
)

complete = DummyOperator(
    task_id='complete',
    trigger_rule=TriggerRule.NONE_FAILED,
    dag=dag,
)

start >> create_dataproc_template >> complete
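As far as I can tell, the operator eventually hands this dict to the Dataproc client, which builds a WorkflowTemplate protobuf message from it. A quick local check that seems to reproduce the same error without deploying the DAG (this is just my sanity-check snippet, assuming the google-cloud-dataproc package is installed):

from google.cloud.dataproc_v1 import WorkflowTemplate

# Constructing the message directly from the dict raises the same
# ValueError for me, which suggests the problem is the dict keys
# themselves rather than anything in Airflow or Composer.
template = WorkflowTemplate(template_dict)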
Strangely, when I was not using the stepID field at all, the error was ValueError: Protocol message OrderedJob has no "pysparkJob" field instead.
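One pattern I notice: the snake_case keys in the template (master_config, disk_config, boot_disk_size_gb, ...) are accepted without complaint, while the camelCase ones (stepID, pysparkJob) are the ones rejected. So my working guess is that the dict is matched against the protobuf field names (snake_case), not the REST/JSON names, and the jobs section would need to look something like this (my assumption only, I have not found this spelled out in the docs):

"jobs": [
    {
        "step_id": "123456dfgy",
        "pyspark_job": {
            "main_python_file_uri": "gs://my-bucket/app.py"
        }
    }
]

Presumably managedCluster and clusterName would then need the same treatment (managed_cluster, cluster_name). Is that right, or is there a documented way to pass the camelCase REST representation?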
Any help is appreciated.