0

I am trying to connect a Managed Airflow instance to an already provisioned EMR cluster using the following code:

import os
from datetime import datetime
from airflow import DAG
from airflow.models.baseoperator import chain
from airflow.models import Variable
from airflow.contrib.sensors.emr_job_flow_sensor import EmrJobFlowSensor
from airflow.contrib.sensors.emr_step_sensor import EmrStepSensor
from airflow.contrib.operators.emr_add_steps_operator import EmrAddStepsOperator
from airflow.contrib.operators.emr_terminate_job_flow_operator import EmrTerminateJobFlowOperator

JOB_FLOW_ROLE = os.getenv('EMR_JOB_FLOW_ROLE', 'EMR_EC2_DefaultRole')
SERVICE_ROLE = os.getenv('EMR_SERVICE_ROLE', 'EMR_DefaultRole')

s3_bucket_dlc = Variable.get("s3_bucket_dlc")
dlc_cluster_id = Variable.get("dlc_cluster_id")
arn = Variable.get("dlc_role_arn")

SPARK_STEPS = [
    {
        'Name': 'calculate_pi',
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': ['/usr/lib/spark/bin/run-example', 'SparkPi', '10'],
        },
    }
]

with DAG(
    dag_id='dlc_emroperator_example',
    start_date=datetime(2022, 9, 5),
    tags=['example'],
    catchup=False,
) as dag:

    job_flow_id = dlc_cluster_id
    
    job_sensor = EmrJobFlowSensor(task_id='check_job_flow', job_flow_id=job_flow_id)

    step_adder = EmrAddStepsOperator(
        task_id='add_steps',
        job_flow_id=job_flow_id,
        steps=SPARK_STEPS,
    )
    
    step_checker = EmrStepSensor(
        task_id='watch_step',
        job_flow_id=job_flow_id,
        step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
    )

    cluster_remover = EmrTerminateJobFlowOperator(
        task_id='remove_cluster',
        job_flow_id=job_flow_id,
    )

    chain(
        job_sensor,
        step_adder,
        step_checker,
        cluster_remover,
    )

However, the connection keeps timing out - it seems to me like I'm not passing the cluster credentials properly, but I can't figure out where I'm going wrong. The above code is a modification of some tutorials I have found online - only that in these tutorials, they are creating the clusters, then running the jobs (I already have a cluster ID passed to me).

Can anybody give me any pointers on where I may be making a mistake?

15b3m
  • 15
  • 2

0 Answers0