The ManyModelLocalInferencing class provides methods for downloading, and running local inference on, the many forecasting models trained with ParallelRunStep.
Constructor
- __init__(self, experiment, training_run_id, cv_output_path, inference_output_path): initializes the class with the following arguments:
- experiment: the experiment object that the training run is associated with.
- training_run_id: the ID of the training run associated with the many models.
- cv_output_path: the path where the cross-validation results and downloaded models will be saved.
- inference_output_path: the path where the inference results will be saved.
Methods
- download_best_models(self): downloads the best models for each time series object based on the cross-validation results. The best models are saved in the cv_output_path directory. This method also saves a summary of the downloaded models in the cv_output_path directory.
- local_inferencing(self, test_set: pd.DataFrame): runs inference on the downloaded models using the input test set. The resulting forecast, quantile and rolling forecast results are saved in the inference_output_path directory.
Attributes
- models_downloaded: a boolean value indicating whether the models have been downloaded.
External Dependencies
- os: Python built-in module for interacting with the file system.
- json: Python built-in module for working with JSON data.
- pandas: an open-source data analysis and manipulation tool.
- azureml.pipeline.core.PipelineRun: Azure Machine Learning Python SDK module for managing pipeline runs.
- sklearn.externals.joblib: a set of tools to provide lightweight pipelining in Python, particularly for large numpy arrays.
Note
The documentation assumes that the class is used in a larger context, where the following variables are defined:
- time_series_id_column_names: a string that represents the name of the column containing the time series IDs.
- label_column_name: a string that represents the name of the column containing the label.
import os
import json
import pandas as pd
from azureml.pipeline.core import PipelineRun
from sklearn.externals import joblib
class ManyModelLocalInferencing:
    """Download the best "many models" forecasting models and score them locally.

    The class looks up the ``many-models-train`` step of a training pipeline
    run, downloads the best child model of every child run into
    ``cv_output_path`` (one sub-directory per time series) and can then run
    forecast, quantile-forecast and rolling-forecast inference on a test set,
    writing the results as CSV files into ``inference_output_path``.

    Attributes:
        models_downloaded (bool): True once ``download_best_models`` has completed.
        results (dict): Result frames of the last ``local_inferencing`` call,
            keyed by ``forecast_results``, ``quantile_results`` and
            ``rolling_results``.
    """

    def __init__(
        self,
        experiment,
        training_run_id,
        cv_output_path,
        inference_output_path,
        time_series_id_column_names=None,
        label_column_name=None,
    ):
        """
        Initializes the ManyModelLocalInferencing class.

        Parameters:
            experiment (azureml.core.Experiment): The experiment object that contains the training run.
            training_run_id (str): The ID of the training run.
            cv_output_path (str): The path to the directory where the best models will be saved.
            inference_output_path (str): The path to the directory where the results of local inferencing will be saved.
            time_series_id_column_names (str, optional): Name of the column holding the time series ID.
                When omitted, the module-level variable of the same name is used.
            label_column_name (str, optional): Name of the label column.
                When omitted, the module-level variable of the same name is used.
        """
        self.experiment = experiment
        self.training_run_id = training_run_id
        self.cv_output_path = cv_output_path
        self.inference_output_path = inference_output_path
        self.time_series_id_column_names = time_series_id_column_names
        self.label_column_name = label_column_name
        self.models_downloaded = False
        self.results = dict()

    def _resolve_column(self, name):
        """Return the column name configured on the instance, else the module-level global.

        Raises:
            NameError: If the name is configured neither on the instance nor
                as a module-level variable.
        """
        configured = getattr(self, name, None)
        if configured is not None:
            return configured
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                f"name '{name}' is not defined; pass it to the constructor "
                "or define it at module level"
            ) from None

    @staticmethod
    def _concat(frames, **kwargs):
        """Concatenate ``frames``; an empty list yields an empty DataFrame instead of raising."""
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, **kwargs)

    def download_best_models(self):
        """
        Downloads the best models for each time series object from the Azure ML run and saves them locally.

        For every child run of the ``many-models-train`` step, the best child's
        ``forecast_table`` is downloaded to identify the time series, and its
        files are then downloaded into ``<cv_output_path>/<series id>`` when no
        better-scoring model has been stored for that series yet. A
        ``summary.csv`` with the preprocessor, algorithm and score of the model
        kept for each series is written to ``cv_output_path``.

        Raises:
            IndexError: If the pipeline run has no ``many-models-train`` step run.
            NameError: If the time series ID column name is not configured.
        """
        # Fail fast on missing configuration, before any download starts.
        id_column = self._resolve_column("time_series_id_column_names")

        # get the many models training pipeline run
        pipeline = PipelineRun(experiment=self.experiment, run_id=self.training_run_id)

        # find the child runs for the many models training step
        step_runs = pipeline.find_step_run("many-models-train")
        if not step_runs:
            raise IndexError(
                f"no 'many-models-train' step run found in pipeline run {self.training_run_id}"
            )
        many_models_runs = list(step_runs[0].get_children())

        # create the output directories if they don't exist
        for path in (self.cv_output_path, self.inference_output_path):
            if not os.path.isdir(path):
                os.makedirs(path, exist_ok=True)
                print(f"a new directory '{path}' is created!")

        # The forecast table of every run is downloaded to this same scratch
        # file, read immediately, and removed once all runs are processed.
        table_path = os.path.join(self.cv_output_path, "forecast_table")

        # One record per time series, always describing the model on disk.
        best_by_series = {}
        for run in many_models_runs:
            best_model = run.get_best_child()
            try:
                best_model.download_file("forecast_table", output_file_path=self.cv_output_path)
                with open(table_path, "r") as f:
                    data = json.load(f)
            except Exception as e:
                print(f"Error downloading for run {run.id}: {str(e)}")
                continue

            try:
                grain_values = data["data"][0]["grain_value_list"][0]
                grain_names = "_".join(str(value) for value in grain_values)
            except (KeyError, IndexError, TypeError) as e:
                print(f"Error reading forecast table for run {run.id}: {str(e)}")
                continue

            run_preprocessor = best_model.properties["run_preprocessor"]
            run_algorithm = best_model.properties["run_algorithm"]
            try:
                # Run properties are typically stored as strings; compare
                # numerically rather than lexicographically.
                score = float(best_model.properties["score"])
            except (TypeError, ValueError) as e:
                print(f"Error reading score for run {run.id}: {str(e)}")
                continue

            model_dir = os.path.join(self.cv_output_path, grain_names)
            previous = best_by_series.get(grain_names)
            # NOTE(review): a lower score is treated as better, as in the
            # original comparison - confirm this matches the primary metric.
            is_better = previous is None or score < previous["score"]
            if not is_better and os.path.exists(model_dir):
                continue

            # download the best model if it is better than the previous best model
            try:
                best_model.download_files(output_directory=model_dir)
            except Exception as e:
                print(f"Error downloading model files for run {run.id}: {str(e)}")
                continue

            best_by_series[grain_names] = {
                id_column: grain_names,
                "preprocessor": run_preprocessor,
                "algorithm": run_algorithm,
                "score": score,
            }

        # save the summary to a file
        summary_df = (
            pd.DataFrame(
                list(best_by_series.values()),
                columns=[id_column, "preprocessor", "algorithm", "score"],
            )
            .set_index(id_column)
            .sort_index()
        )
        summary_df.to_csv(os.path.join(self.cv_output_path, "summary.csv"))

        # The scratch file does not exist when no forecast table was downloaded.
        try:
            os.remove(table_path)
        except FileNotFoundError:
            pass

        self.models_downloaded = True

    def local_inferencing(self, test_set: pd.DataFrame):
        """
        Runs forecast, quantile and rolling-forecast inference with the downloaded models.

        The test set is split by time series ID; for each ID the model stored at
        ``<cv_output_path>/<id>/outputs/model.pkl`` is loaded and scored. The
        combined results are kept in ``self.results`` and written to
        ``<inference_output_path>/<result name>.csv``. If the models have not
        been downloaded yet, ``download_best_models`` is called first.

        Parameters:
            test_set (pd.DataFrame): Test data containing the time series ID
                column and the label column.

        Raises:
            NameError: If the ID or label column name is not configured.
        """
        if not self.models_downloaded:
            print(
                "Models have not been downloaded. Calling download_best_models first."
            )
            self.download_best_models()

        id_column = self._resolve_column("time_series_id_column_names")
        label_column = self._resolve_column("label_column_name")
        os.makedirs(self.inference_output_path, exist_ok=True)

        forecast_frames = []
        quantile_frames = []
        rolling_frames = []
        for sku in test_set[id_column].unique():
            series_mask = test_set[id_column] == sku
            series_rows = test_set[series_mask]

            # NOTE: model files are pickles; only load artifacts from a trusted run.
            model_path = os.path.join(self.cv_output_path, str(sku), "outputs", "model.pkl")
            fitted_model = joblib.load(model_path)

            # The second element of the forecast result is used as the response frame.
            model_response = fitted_model.forecast(X_pred=series_rows)[1]
            model_response[label_column] = series_rows[label_column].values
            forecast_frames.append(model_response)

            # Fresh selection so the features never alias the frame given to forecast().
            X_test = test_set[series_mask].drop(columns=[label_column])
            y_test = series_rows[label_column].values

            fitted_model.quantiles = [0.05, 0.5, 0.95]
            quantile_frames.append(fitted_model.forecast_quantiles(X_test))

            # Make a rolling forecast, advancing the forecast origin by 1 period on each iteration through the test set
            rolling_frames.append(
                fitted_model.rolling_forecast(X_test, y_test, step=1, ignore_data_errors=True)
            )

        self.results["forecast_results"] = self._concat(forecast_frames)
        self.results["quantile_results"] = self._concat(quantile_frames, sort=False, ignore_index=True)
        self.results["rolling_results"] = self._concat(rolling_frames, sort=False, ignore_index=True)

        for key, value in self.results.items():
            print(f'saving {key}')
            value.to_csv(os.path.join(self.inference_output_path, f"{key}.csv"))