Let's say I have an ML model stored in the MLflow server artifacts. I want to run this model from an Airflow DAG, and after the run the metric logs should be visible in MLflow. How can I achieve this? Airflow has connections, but I couldn't find any connection type for MLflow.
1 Answer
First, you should run the Airflow and MLflow servers and set the artifact paths and backend databases for both. You can do this locally or on the cloud; there are many tutorials on YouTube on how to set this up.
I will show only the coding part of how you can use Airflow and MLflow together. The code below is a partial and simplified version, but it explains how to do it:
# Import libraries
from numpy import loadtxt
import xgboost as xgb
import mlflow
import mlflow.xgboost
from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from sklearn.metrics import mean_squared_error
from typing import Any
import pickle
from datetime import datetime, timedelta
import logging
artifact_path = "models_mlflow"
local_path = "/PATH/"
tag = "some tag"
model_name = "my_model"
metric = "metrics.rmse ASC"
def find_best_params(ti: Any, metric: str, max_results: int) -> None:
    # Some code here to find the best params (see the sketch after the DAG definition below)
    best_params = {'param_key_1': "param_value_1", 'param_key_2': "param_value_2"}

    # Push the best params to XCom
    ti.xcom_push(key='best_params', value=best_params)
def run_best_model(ti: Any, tag: str) -> None:
    """
    Runs the model with the best parameters searched and found
    at the earlier phase. Then saves the model and info in the artifacts
    folder or bucket.
    """
    best_params = ti.xcom_pull(key='best_params', task_ids=['find_best_params'])
    best_params = best_params[0]
    logging.info(f"Best params '{best_params}' are retrieved from XCom.")

    # Load data from the local disk.
    X_train = loadtxt(f"{local_path}/data/X_train.csv", delimiter=',')
    X_val = loadtxt(f"{local_path}/data/X_val.csv", delimiter=',')
    y_train = loadtxt(f"{local_path}/data/y_train.csv", delimiter=',')
    y_val = loadtxt(f"{local_path}/data/y_val.csv", delimiter=',')
    logging.info("Training and validation datasets are retrieved from the local storage.")

    # Convert to DMatrix data structure for XGBoost.
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)
    logging.info("Training and validation matrix datasets are created for XGBoost.")
    with mlflow.start_run() as run:
        # Get the run_id of the run that trains the best model
        best_run_id = run.info.run_id
        logging.info(f"Best run id: '{best_run_id}'")

        mlflow.set_tag("model", tag)
        mlflow.log_params(best_params)

        # Train the XGBoost model with the best parameters
        booster = xgb.train(
            params=best_params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

        # Save the model (xgboost_model.bin) locally in the folder "{local_path}/models/" (in case we want a local copy)
        with open(f"{local_path}/models/xgboost_model.bin", "wb") as f_out:
            pickle.dump(booster, f_out)
        logging.info(f"XGBoost model is saved on the path '{local_path}/models/xgboost_model.bin' of the local machine.")

        # Save the model (xgboost_model.bin) using 'log_artifact' in the defined artifacts folder/bucket (in case we want it there)
        # This is defined on the CLI and as an artifact path parameter on AWS Parameter Store:
        # s3://bucket/mlflow/ ... /models_mlflow/
        mlflow.log_artifact(local_path=f"{local_path}/models/xgboost_model.bin", artifact_path=artifact_path)
        logging.info(f"Artifacts are saved on the artifact path '{artifact_path}'.")

        # Save the model (booster) using 'log_model' in the defined artifacts folder/bucket
        # This is defined on the CLI and as an artifact path parameter on AWS Parameter Store:
        # s3://bucket/mlflow/ ... /models_mlflow/
        mlflow.xgboost.log_model(booster, artifact_path=artifact_path)
        logging.info(f"XGBoost model is saved on the artifact path '{artifact_path}'.")
        logging.info(f"Default artifacts URI: '{mlflow.get_artifact_uri()}'")

    # Push the best run id to XCom
    ti.xcom_push(key='best_run_id', value=best_run_id)
    logging.info(f"The best run id '{best_run_id}' of the model '{model_name}' is pushed to XCom.")
default_args = {
    'owner': 'me',
    'start_date': datetime(2022, 8, 25, 2),
    'end_date': datetime(2022, 12, 25, 2),
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=10),
}
with DAG(
    dag_id="my_dag_v1",
    default_args=default_args,
    description="Training dag",
    schedule_interval="@monthly",
    catchup=False,
) as dag:

    task_start_dag = DummyOperator(
        task_id="start_dag",
    )

    task_find_best_params = PythonOperator(
        task_id='find_best_params',
        python_callable=find_best_params,
        op_kwargs={"metric": metric, "max_results": 5000},
    )

    task_run_best_model = PythonOperator(
        task_id='run_best_model',
        python_callable=run_best_model,
        op_kwargs={"tag": tag},
    )

    task_end_dag = DummyOperator(
        task_id="end_dag",
    )

    task_start_dag >> task_find_best_params >> task_run_best_model >> task_end_dag
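The find_best_params stub above only pushes hard-coded parameters. If your hyperparameter-search runs were already logged to MLflow, the placeholder could instead query the tracking server for the best run, for example like this (a minimal sketch; the tracking URI and the experiment name "my_experiment" are assumptions, not part of the original code):

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

def find_best_params(ti, metric, max_results):
    """Query the MLflow tracking server for the run with the best metric value."""
    client = MlflowClient(tracking_uri="http://0.0.0.0:5000")  # assumed server address
    experiment = client.get_experiment_by_name("my_experiment")  # assumed experiment name
    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=max_results,
        order_by=[metric],  # e.g. "metrics.rmse ASC"
    )
    # Note: params come back as strings, so numeric ones may need casting before xgb.train.
    best_params = runs[0].data.params
    ti.xcom_push(key='best_params', value=best_params)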
If you run the servers locally (Airflow on http://0.0.0.0:8080 and MLflow on http://0.0.0.0:5000), you can see the results in both web UIs. The code above assumes the artifact path is on the cloud; set a local path instead if you want the artifacts stored on the local machine.
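As for the missing connection type: there is no built-in Airflow connection for MLflow, and none is needed. The MLflow client only has to know the tracking URI, so one common approach (shown here as an assumption, using the local server address from above) is to set it at the top of the DAG file, or to export MLFLOW_TRACKING_URI in the Airflow worker environment, before any mlflow.start_run() call. The metrics and artifacts logged inside the task then appear in the MLflow UI:

import mlflow

# Point the MLflow client at the tracking server (assumed to run at http://0.0.0.0:5000).
# Alternatively, export MLFLOW_TRACKING_URI=http://0.0.0.0:5000 in the Airflow worker
# environment and skip the explicit call.
mlflow.set_tracking_uri("http://0.0.0.0:5000")

# Optional: group the runs under an experiment; "my_experiment" is an assumed name.
mlflow.set_experiment("my_experiment")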

hsaltan