I'm running Airflow inside a Docker container, using the puckel/docker-airflow:latest image from Docker Hub. I can access the Airflow UI at localhost:8080, but the DAG never executes and I keep getting the error mentioned in the subject above. I even added a pip command to install apache-airflow in my Dockerfile. Here is what my Dockerfile, docker-compose.yml and dag.py look like:
Dockerfile:
FROM puckel/docker-airflow:latest
RUN pip install requests
RUN pip install pandas
RUN pip install 'apache-airflow'
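For reference, here is a quick way to confirm which Airflow version actually ends up in the built image, since the base image pins its own release and the extra pip install may or may not override it (a minimal sketch, run inside the container, e.g. via docker exec):

import airflow

# Print the version the container is really running; puckel/docker-airflow
# is (as far as I know) an Airflow 1.10.x image, so this is worth checking
# after the extra `pip install apache-airflow` layer.
print(airflow.__version__)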
docker-compose.yml:
version: '3.7'
services:
    redis:
        image: redis:5.0.5
        environment:
            REDIS_HOST: redis
            REDIS_PORT: 6379
        ports:
            - 6379:6379

    postgres:
        image: postgres:9.6
        environment:
            - POSTGRES_USER=airflow
            - POSTGRES_PASSWORD=airflow
            - POSTGRES_DB=airflow
            - PGDATA=/var/lib/postgresql/data/pgdata
        volumes:
            - ./pgdata:/var/lib/postgresql/data/pgdata
        logging:
            options:
                max-size: 10m
                max-file: "3"

    webserver:
        build: ./dockerfiles
        restart: always
        depends_on:
            - postgres
            - redis
        environment:
            - LOAD_EX=n
            - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
            - EXECUTOR=Celery
        logging:
            options:
                max-size: 10m
                max-file: "3"
        volumes:
            - ./dags:/usr/local/airflow/dags
            - ./config/airflow.cfg:/usr/local/airflow/airflow.cfg
        ports:
            - "8080:8080"
        command: webserver
        healthcheck:
            test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"]
            interval: 30s
            timeout: 30s
            retries: 3

    flower:
        build: ./dockerfiles
        restart: always
        depends_on:
            - redis
        environment:
            - EXECUTOR=Celery
        ports:
            - "5555:5555"
        command: flower

    scheduler:
        build: ./dockerfiles
        restart: always
        depends_on:
            - webserver
        volumes:
            - ./dags:/usr/local/airflow/dags
        environment:
            - LOAD_EX=n
            - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
            - EXECUTOR=Celery
        command: scheduler

    worker:
        build: ./dockerfiles
        restart: always
        depends_on:
            - scheduler
        volumes:
            - ./dags:/usr/local/airflow/dags
        environment:
            - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
            - EXECUTOR=Celery
        command: worker
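To rule out networking between the services, a quick connectivity check like the one below can be run from inside any of the containers (a standard-library-only sketch; the hostnames are just the compose service names defined above):

import socket

# Service names and ports as declared in docker-compose.yml
for host, port in [("redis", 6379), ("postgres", 5432)]:
    try:
        with socket.create_connection((host, port), timeout=5):
            print(f"{host}:{port} reachable")
    except OSError as exc:
        print(f"{host}:{port} NOT reachable: {exc}")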
dag.py:
from airflow import DAG
from airflow.operators.subdag import SubDagOperator
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.bash import BashOperator
from datetime import datetime
from random import randint
def _choosing_best_model(ti):
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A',
        'training_model_B',
        'training_model_C'
    ])
    if max(accuracies) > 8:
        return 'accurate'
    return 'inaccurate'


def _training_model(model):
    return randint(1, 10)


with DAG("test",
         start_date=datetime(2021, 1, 1),
         schedule_interval='@daily',
         catchup=False) as dag:

    training_model_tasks = [
        PythonOperator(
            task_id=f"training_model_{model_id}",
            python_callable=_training_model,
            op_kwargs={
                "model": model_id
            }
        ) for model_id in ['A', 'B', 'C']
    ]

    choosing_best_model = BranchPythonOperator(
        task_id="choosing_best_model",
        python_callable=_choosing_best_model
    )

    accurate = BashOperator(
        task_id="accurate",
        bash_command="echo 'accurate'"
    )

    inaccurate = BashOperator(
        task_id="inaccurate",
        bash_command="echo 'inaccurate'"
    )

    training_model_tasks >> choosing_best_model >> [accurate, inaccurate]
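For what it's worth, the file can be sanity-checked for import errors with a minimal DagBag sketch like this (run inside a container where airflow is importable; the dag_folder path assumes the volume mounts from docker-compose.yml above):

from airflow.models import DagBag

# Load only this project's DAGs and report anything that failed to import.
bag = DagBag(dag_folder="/usr/local/airflow/dags", include_examples=False)
print(bag.import_errors)  # a non-empty dict means the scheduler can't load the DAG
print(bag.dag_ids)        # 'test' should be listed if the file parsed cleanly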
Am I missing something here? Please let me know if you spot anything. Thanks :)