I'm trying to write ETL using airflow with asynchronous functionality.
I saw two examples in airflow official repo that have implemented ETL but didn't saw any async example.
Simple ETL https://github.com/apache/airflow/blob/master/airflow/example_dags/tutorial_etl_dag.py
TaskFlow API ETL: https://github.com/apache/airflow/blob/master/airflow/example_dags/tutorial_taskflow_api_etl.py
since TaskFlow API is much more better way to make the code readable I have tried to implement some async tasks using eventloop like in this question but without any success.
for example :
import json
import asyncio
from textwrap import dedent
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
default_args = {
'owner': 'airflow',
}
with DAG(
'tutorial_etl_dag',
default_args=default_args,
description='ETL DAG tutorial',
schedule_interval=None,
start_date=days_ago(2),
tags=['example'],
) as dag:
def extract(**kwargs):
ti = kwargs['ti']
data_string = '{"1001": 301.27, "1002": 433.21, "1003": 502.22}'
ti.xcom_push('order_data', data_string)
def transform(**kwargs):
ti = kwargs['ti']
extract_data_string = ti.xcom_pull(task_ids='extract', key='order_data')
order_data = json.loads(extract_data_string)
total_order_value = 0
for value in order_data.values():
total_order_value += value
total_value = {"total_order_value": total_order_value}
total_value_json_string = json.dumps(total_value)
ti.xcom_push('total_order_value', total_value_json_string)
async def load(**kwargs):
ti = kwargs['ti']
total_value_string = ti.xcom_pull(task_ids='transform', key='total_order_value')
total_order_value = json.loads(total_value_string)
# upload total_order_value to cloud storage asynchronously
# await upload_to_s3(total_order_value)
return total_order_value
def run_async():
loop = asyncio.get_event_loop()
result = loop.run_until_complete(load()) # kwargs ????
return result
extract_task = PythonOperator(
task_id='extract',
python_callable=extract,
)
transform_task = PythonOperator(
task_id='transform',
python_callable=transform,
)
load_task = PythonOperator(
task_id='load',
python_callable=run_async,
)
extract_task >> transform_task >> load_task
in the example above the task "load_task" expects to receive kwargs argument (xcom data), how do I pass it from run_async method?
any ideas how to make it work?
Thanks!