0

I need a Slack notification, including the exact reason for the failure, whenever my PySpark code or bootstrap action fails on EMR.

I am using Airflow 2.0 and pass on_failure_callback=task_fail_slack_alert to the tasks to notify Slack, but it is not notifying. Is my parameter correct?

# Task 'create_job_flow', configured from JOB_FLOW_OVERRIDES.
cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
)

# Task 'add_steps': job_flow_id is templated from the XCom return value of
# 'create_job_flow'; failures are routed to task_fail_slack_alert.
step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_STEPS,
    on_failure_callback=task_fail_slack_alert,
)

# Task 'watch_step': step_id is the first element of the XCom return value
# of 'add_steps'; failures are routed to task_fail_slack_alert.
step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    on_failure_callback=task_fail_slack_alert,
)

# Task 'remove_cluster': targets the same job flow id pulled from
# 'create_job_flow'. This is the only task here that passes dag=dag.
cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag,
)

I am getting failure notifications, but not the exact reason for the failure in EMR. How do I get it?

Thanks, Xi

Xi12
  • 939
  • 2
  • 14
  • 27

1 Answer

0

exception=context.get('exception') is the expression that gives the exact reason for the EMR failure: the context passed to the failure callback carries the exception that failed the task.

Example of on_failure_callback using slack:

 # Sensor task 'watch_step'; on failure Airflow invokes task_fail_slack_alert.
 # The job_flow_id template is kept on one line: a double-quoted string
 # literal cannot span lines.
 step_checker = EmrStepSensor(
     task_id='watch_step',
     job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
     step_id="{{task_instance.xcom_pull(task_ids='add_steps',key='return_value')[0] }}",
     aws_conn_id='aws_default',
     on_failure_callback=task_fail_slack_alert,
 )
    

def task_fail_slack_alert(context):
    """Send a Slack message describing a failed task.

    Written to be passed as ``on_failure_callback`` to an operator or sensor.

    Args:
        context: The context dict handed to the callback. This function
            reads its ``'task_instance'``, ``'execution_date'`` and
            ``'exception'`` entries.

    Returns:
        Whatever ``SlackWebhookOperator.execute`` returns.
    """
    SLACK_CONN_ID = 'slack'
    # The webhook token is read from the password field of the connection.
    slack_webhook_token = BaseHook.get_connection(SLACK_CONN_ID).password

    # Look the task instance up once instead of once per message field.
    task_instance = context.get('task_instance')
    slack_msg = """
                :red_circle: Task Failed. 
                *Task*: {task}  
                *Dag*: {dag} 
                *Execution Time*: {exec_date}  
                *Log Url*: {log_url} 
                *Error*:{exception}
                """.format(
        task=task_instance.task_id,
        dag=task_instance.dag_id,
        exec_date=context.get('execution_date'),
        log_url=task_instance.log_url,
        # The exception that failed the task; this is the "exact reason".
        exception=context.get('exception'),
    )

    # The operator is only executed directly, never scheduled, so it is not
    # attached to a DAG. This also removes the dependency on a module-level
    # ``dag`` global being defined wherever the callback lives.
    failed_alert = SlackWebhookOperator(
        task_id='slack_test',
        http_conn_id=SLACK_CONN_ID,
        webhook_token=slack_webhook_token,
        message=slack_msg,
        username='airflow',
    )
    return failed_alert.execute(context=context)
Xi12
  • 939
  • 2
  • 14
  • 27