I have a DAG which has step read_date_information_file which reads a file and returns the list of queries (which I can access from output). I then want to loop through this and execute the queries on Athena using AWSAthenaOperator for each query in this list.
def get_date_information(ti):
s3 = boto3.client('s3')
data = s3.get_object(Bucket=output_bucket, Key=key)
contents = data['Body'].read().decode("utf-8")
print('Date information is: ', contents)
events_list = contents.split(',')
return events_list
with DAG(
dag_id='adserver_split_job_emr_job_dag',
default_args={
'owner': 'adserver_airflow',
'depends_on_past': False,
'email': ['airflow@example.com'],
'email_on_failure': False,
'email_on_retry': False,
},
dagrun_timeout=timedelta(hours=2),
start_date=datetime(2021, 9, 22, 9),
schedule_interval='20 * * * *',
) as dag:
read_date_information_file = PythonOperator(
task_id="read_date_information_file",
python_callable=get_date_information
)
query_list = read_date_information_file.output
for i, event in enumerate(query_list):
run_query = AWSAthenaOperator(
task_id=f'run_query_{i}',
query=event,
output_location=config.ATHENA_OUTPUT_LOCATION,
database=config.ATHENA_DATABASE_NAME,
aws_conn_id='aws_default'
)
read_date_information_file >> run_query
I get the error such as:
Broken DAG: [/opt/airflow/dags/test.py] Traceback (most recent call last):
File "/home/airflow/.local/lib/python3.6/site-packages/airflow/models/baseoperator.py", line 593, in __setattr__
super().__setattr__(key, value)
File "/home/airflow/.local/lib/python3.6/site-packages/airflow/utils/timeout.py", line 37, in handle_timeout
raise AirflowTaskTimeout(self.error_message)
airflow.exceptions.AirflowTaskTimeout: DagBag import timeout for /opt/airflow/dags/test.py after 30.0s, PID: 10056
But if I set query_list to hard coded one then it works fine, like:
query_list = ["SELECT 1;", "SELECT 2;", "SELECT 3;"]
Any help in this regard. I am following this looping approach from the solution mentioned here. The difference is that I am looping on the dynamic list which is the output from the previous step.