0

Airflow novice here

Setup

  • env: MWAA
  • db: Redshift

Question: I am trying to write a task that loops through a list, uses each list value in a SQL query, and then uses an if statement to decide the appropriate action based on the results. Something like the code below, but I am struggling a bit. If this is possible, what's the best way to achieve it?

# Asker's sketch — kept as posted. NOTE(review): this cannot work as written:
# an operator instantiated inside a plain Python callable is only constructed,
# never scheduled/executed by Airflow, and the operator object itself is not
# the query result (results are pushed to XCom by an executed task).
def duplicate_check():
    ls = ['table1','table2','table3','table4']
    
    for tables in ls:
        # NOTE(review): this binds an operator INSTANCE, not a row count.
        number_of_records = RedshiftSQLOperator(
                                        """
                                            SQL which returns me one record with a int output. Number of order ID duplicates. I'd like to use pass {{tables}} though?
                                        """
                                            )
        # NOTE(review): comparing an operator object to 0 — presumably meant
        # to compare the query's returned count; would not behave as intended.
        if number_of_records > 0:
            send_email = EmailOperator(
                ''
            )
        else:
            None

start >> PythonOperator call above function >> end

AIViz
  • 82
  • 9
  • This is not going to work. From your code it seems that you expect coding in Airflow to be like you code a Python script. it's not the same. Operators push values to xcom. you will need to pull the value in downstream task. – Elad Kalif Mar 09 '23 at 15:06

1 Answer

0

For those who are trying to do a similar thing on their end, this is how I was able to do it.

NOTE: I'm using an HTTP request to send notifications to a Slack channel.

def table_duplication_call(tbl_name, sql):
    """Post a duplicate-records alert for *tbl_name* to a Slack channel.

    Reads the bot token from the password field of the ``slack_default``
    Airflow connection and calls Slack's ``chat.postMessage`` HTTP API.

    Args:
        tbl_name: Name of the table in which duplicates were found.
        sql: The SQL statement that detected them (echoed in the message).

    Returns:
        dict: The decoded JSON response from Slack.

    Raises:
        requests.HTTPError: If Slack returns a non-2xx status code.
    """
    slack_token = BaseHook.get_connection('slack_default').password
    url = 'https://slack.com/api/chat.postMessage'
    headers = {
        'Authorization': f'Bearer {slack_token}',
        'Content-Type': 'application/json'
    }

    data = {'channel': '#your_channel_name', 
            'mrkdwn':True,
            'text': f""":red_circle: Duplicates in {tbl_name} Table.
                    ```{sql}```
                """}
    response = requests.post(url, headers=headers, json=data)
    # BUG FIX: the original returned before calling raise_for_status(), so the
    # status check (and the debug print after it) were unreachable and HTTP
    # errors from Slack were silently ignored. Check first, then return.
    response.raise_for_status()
    return response.json()


# Task-level defaults applied to every task in the DAG (individual tasks may
# override them). One retry after a 20-second delay; no failure e-mails.
default_args = dict(
    start_date=datetime(2023, 3, 2),
    retries=1,
    retry_delay=timedelta(seconds=20),
    email_on_failure=False,
)

# DAG definition. NOTE(review): start_date is given both here and in
# default_args; the DAG-level value takes precedence for scheduling.
with DAG(
    dag_id="load_dag", 
    start_date=datetime(2023, 3, 2), 
    schedule_interval="15 */4 * * *",   # minute 15 of every 4th hour
    tags=['some_tag'],
    default_args=default_args,
    catchup = False,  # do not backfill runs between start_date and now
    description='Some description',
    default_view='graph', #grid, graph, duration, gantt, landing_times
    on_failure_callback=None,
    on_success_callback=None,
    # on_failure_callback=send_email
    ) as dag:


    # No-op boundary tasks marking the start and end of the DAG graph.
    # NOTE(review): DummyOperator is deprecated in newer Airflow releases
    # in favor of EmptyOperator — confirm against the MWAA Airflow version.
    start = DummyOperator(
        task_id = 'start',
        dag=dag,
    )
    
    end = DummyOperator(
        task_id = 'end',
        dag=dag
    )


# Tables in the datawarehouse schema to scan for duplicated <table>_id values.
tbl_ls = ['table1','table2','table3']


def duplication_check_func():
    """Check every table in ``tbl_ls`` for duplicated ids and alert Slack.

    For each table, runs a GROUP BY/HAVING query against Redshift (via the
    ``redshift_default`` Postgres connection) returning one row per
    duplicated ``<table>_id``; if any rows come back, sends a Slack
    notification through table_duplication_call().
    """
    hook = PostgresHook(postgres_conn_id='redshift_default')
    for tbl in tbl_ls:
        # NOTE: tbl is interpolated directly into the SQL; acceptable only
        # because tbl_ls is a hard-coded internal list, never user input.
        sql = f"""
                                    select
                                            {tbl}.{tbl}_id duplicated_id,
                                            count({tbl}.{tbl}_id ) as cnt
                                        from
                                            datawarehouse.{tbl} 
                                        group by
                                            {tbl}_id 
                                        having
                                            cnt > 1;
                                    """
        results = hook.get_records(sql)
        # BUG FIX: the original passed tbl_name={tbl} — a one-element SET —
        # so the Slack message rendered as "Duplicates in {'table1'} Table."
        # Pass the plain string instead. (The no-op `else: None` is dropped.)
        if results:
            table_duplication_call(tbl_name=tbl, sql=sql)
        

# One task that loops over all tables, queries for duplicates, and notifies
# Slack — a single graph node rather than one task per table.
db_duplication_notification = PythonOperator(
    task_id='db_duplication_notification',
    python_callable=duplication_check_func,
    dag=dag,
)

start >> db_duplication_notification >> end
AIViz
  • 82
  • 9