Hello, good afternoon. I have run into a curious issue in Airflow 2.3.0.
I have a pipeline that is in charge of deleting old log entries from the Airflow metadata database (the database runs on Postgres on AWS).
This is the DAG:
from datetime import datetime, timezone, timedelta
from pathlib import Path
from shutil import rmtree
from airflow.models import DAG, Log, DagRun, TaskInstance, TaskReschedule, Variable
from airflow.jobs.base_job import BaseJob
from airflow.utils.dates import days_ago
from airflow.operators.python import PythonOperator
from airflow.utils.state import State
from airflow import settings
from dags.documents.DEV.config import ENV, BASIC_CONFIG_FACTORY
DAG_NAME = "cleanup_metadata_pipeline"
EXPIRATION_WEEKS = 4
def delete_old_database_entries_by_model(table, date_col):
    """
    Delete old database entries where the date is older than EXPIRATION_WEEKS.

    Args:
        table: SQLAlchemy model class of the table to delete from.
        date_col: date column to compare against the expiration date.
    """
    expiration_date = datetime.now(timezone.utc) - timedelta(weeks=EXPIRATION_WEEKS)
    print(f"Deleting old database entries from {table} older than {expiration_date}...")
    session = settings.Session()
    query = session.query(table).filter(date_col < expiration_date)
    print(f"Session is: {session}")
    if "state" in dir(table):
        # Never delete rows that are still in the RUNNING state.
        query = query.filter(table.state != State.RUNNING)
    print(query)
    result = query.delete(synchronize_session=False)
    session.commit()
    print(f"Deleted {result} rows from {table}")
def delete_old_database_entries():
    """
    Delete old database entries.
    """
    if Variable.get("ENABLE_DB_TRUNCATION", "") != "True":
        print(f"This DAG will delete all data older than {EXPIRATION_WEEKS} weeks.")
        print("To enable this, create an Airflow Variable called ENABLE_DB_TRUNCATION set to 'True'.")
        print("Skipping truncation until explicitly enabled.")
        return

    delete_old_database_entries_by_model(TaskInstance, TaskInstance.end_date)
    delete_old_database_entries_by_model(DagRun, DagRun.end_date)
    delete_old_database_entries_by_model(BaseJob, BaseJob.end_date)
    delete_old_database_entries_by_model(Log, Log.dttm)
    delete_old_database_entries_by_model(TaskReschedule, TaskReschedule.end_date)
with DAG(
    dag_id=DAG_NAME,
    start_date=days_ago(1),
    max_active_runs=1,
    dagrun_timeout=timedelta(minutes=180),
    schedule_interval="@daily",
    tags=['cleanup_metadata_pipeline', 'tools', 'cleanup'],
    catchup=False,
) as dag:

    cleanup_old_database_entries = PythonOperator(
        task_id="cleanup_old_database_entries",
        python_callable=delete_old_database_entries,
    )

    cleanup_old_database_entries
The thing is, when I go to look at the logs stored on the persistent volume claim, I have noticed that all the logs older than that same date (4 weeks back from today) have been deleted as well, but there are also some pipelines that have lost their logs from the 26th backwards.
Does anyone know whether deleting the log entries in the DB is somehow synchronised with the log files on the persistent volume claim? Or is there something wrong in my pipeline that has caused some DAGs to have their logs deleted back to 26 May?
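For context, my understanding was that deleting rows from the metadata DB would not touch the log files on the volume, and that a separate filesystem cleanup would be needed, something along these lines (just a sketch of what I had in mind, not something that is deployed; the /opt/airflow/logs path, the dag_id=*/run_id=* directory layout and reusing the 4-week window are my assumptions):

from datetime import datetime, timezone, timedelta
from pathlib import Path
from shutil import rmtree

EXPIRATION_WEEKS = 4  # same window as in the DAG above

def delete_old_log_files(base_log_folder="/opt/airflow/logs"):
    """Sketch: remove per-run log directories older than EXPIRATION_WEEKS."""
    expiration_date = datetime.now(timezone.utc) - timedelta(weeks=EXPIRATION_WEEKS)
    # Assumes the Airflow 2.3 default log layout: dag_id=<dag>/run_id=<run>/...
    for run_dir in Path(base_log_folder).glob("dag_id=*/run_id=*"):
        # Use the directory's modification time as a rough proxy for the run date.
        mtime = datetime.fromtimestamp(run_dir.stat().st_mtime, tz=timezone.utc)
        if mtime < expiration_date:
            print(f"Removing {run_dir} (last modified {mtime})")
            rmtree(run_dir, ignore_errors=True)

Nothing like this is running on the cluster at the moment, which is why the disappearing files on the volume surprised me.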