Downloaded artifact from MLFlow output error

Question

The model was first logged to MLflow using PyCaret:

from pycaret.regression import *
s = setup(df_cleaned, 
      target = 'SalePrice',
      use_gpu = True,
      categorical_features = categorical_features_cleaned,
      handle_unknown_categorical = True,
      unknown_categorical_method = 'most_frequent',
      normalize = True,
      normalize_method = 'robust',
      transformation = True,
      feature_selection = True,
      fold = 5,
      log_experiment = True,
      experiment_name = 'project-2',
      silent = True
     )

best = compare_models()

While running the setup code block, the following log output briefly flashed before being replaced by the PyCaret processing message. I managed to capture it by interrupting the kernel.

2022/09/24 18:01:17 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '34dfd8d701a64a18b5043bb0529620bb', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2022/09/24 18:01:17 WARNING mlflow.utils: Truncated the value of the key `steps`. Truncated value: `[('dtypes', DataTypes_Auto_infer(categorical_features=['MS SubClass', 'MS Zoning', 'Street',
                                           'Alley', 'Lot Shape', 'Land Contour',
                                           'Utilities', 'Lot Config',
                                           'Land Slope', 'Neighborhood',
                                           'Condition 1', 'Condition 2',
                                           'Bldg Type', 'House Style',
                                    ...`
2022/09/24 18:01:17 WARNING mlflow.utils: Truncated the value of the key `dtypes`. Truncated value: `DataTypes_Auto_infer(categorical_features=['MS SubClass', 'MS Zoning', 'Street',
                                           'Alley', 'Lot Shape', 'Land Contour',
                                           'Utilities', 'Lot Config',
                                           'Land Slope', 'Neighborhood',
                                           'Condition 1', 'Condition 2',
                                           'Bldg Type', 'House Style',
                                           'Roof...`
2022/09/24 18:01:17 WARNING mlflow.utils: Truncated the value of the key `dtypes__categorical_features`. Truncated value: `['MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functional', 'Garage Type', 'Garage Finish', 'Garage C...`

Ignoring the warnings, I downloaded the artifact from MLflow using the following code and tried running it.

from mlflow.artifacts import download_artifacts
import mlflow.pyfunc

full_path = './mlruns/1/6ff4f982fb7446b6a61b1ddf51feedc6/artifacts/model'
download_artifacts(full_path, dst_path='.') 
    
model = mlflow.pyfunc.load_model(model_uri="./model")
model.predict(df_submit_cleaned)

The error shown is as follows:

---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
Cell In [45], line 5, in predict_submit(model, test_df, test_df_original, filename)
      3 try:
      4     # we run this for gridsearchcv or normal sklearn regression
----> 5     predictions = model.predict(test_df)
      6     #zip the predictions with respective id and save to designated filename

File ~\anaconda3\envs\dsi-sg\lib\site-packages\mlflow\pyfunc\__init__.py:373, in PyFuncModel.predict(self, data)
    372     data = _enforce_schema(data, input_schema)
--> 373 return self._predict_fn(data)

File ~\anaconda3\envs\dsi-sg\lib\site-packages\mlflow\sklearn\__init__.py:1686, in _autolog.<locals>._apply_sklearn_descriptor_unbound_method_call_fix.<locals>.patched_IffHasAttrDescriptor__get__.<locals>.out(*args, **kwargs)
   1685 def out(*args, **kwargs):
-> 1686     return self.fn(obj, *args, **kwargs)

File ~\anaconda3\envs\dsi-sg\lib\site-packages\mlflow\utils\autologging_utils\safety.py:435, in safe_patch.<locals>.safe_patch_function(*args, **kwargs)
    431     with set_non_mlflow_warnings_behavior_for_current_thread(
    432         disable_warnings=False,
    433         reroute_warnings=False,
    434     ):
--> 435         return original(*args, **kwargs)
    437 # Whether or not the original / underlying function has been called during the
    438 # execution of patched code

File ~\AppData\Roaming\Python\Python38\site-packages\sklearn\pipeline.py:408, in Pipeline.predict(self, X, **predict_params)
    407     Xt = transform.transform(Xt)
--> 408 return self.steps[-1][-1].predict(Xt, **predict_params)

File ~\anaconda3\envs\dsi-sg\lib\site-packages\mlflow\utils\autologging_utils\safety.py:435, in safe_patch.<locals>.safe_patch_function(*args, **kwargs)
    431     with set_non_mlflow_warnings_behavior_for_current_thread(
    432         disable_warnings=False,
    433         reroute_warnings=False,
    434     ):
--> 435         return original(*args, **kwargs)
    437 # Whether or not the original / underlying function has been called during the
    438 # execution of patched code

File ~\AppData\Roaming\Python\Python38\site-packages\sklearn\linear_model\_bayes.py:316, in BayesianRidge.predict(self, X, return_std)
    295 """Predict using the linear model.
    296 
    297 In addition to the mean of the predictive distribution, also its
   (...)
    314     Standard deviation of predictive distribution of query points.
    315 """
--> 316 y_mean = self._decision_function(X)
    317 if return_std is False:

File ~\AppData\Roaming\Python\Python38\site-packages\sklearn\linear_model\_base.py:216, in LinearModel._decision_function(self, X)
    215 def _decision_function(self, X):
--> 216     check_is_fitted(self)
    218     X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

File ~\AppData\Roaming\Python\Python38\site-packages\sklearn\utils\validation.py:72, in _deprecate_positional_args.<locals>.inner_f(*args, **kwargs)
     71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)

File ~\AppData\Roaming\Python\Python38\site-packages\sklearn\utils\validation.py:1019, in check_is_fitted(estimator, attributes, msg, all_or_any)
   1018 if not attrs:
-> 1019     raise NotFittedError(msg % {'name': type(estimator).__name__})

NotFittedError: This BayesianRidge instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
Cell In [58], line 2
      1 # Run predictions as per normal with 'model' object
----> 2 predict_submit(model, df_submit_cleaned, df_submit_org, 'downloaded_model')

Cell In [45], line 10, in predict_submit(model, test_df, test_df_original, filename)
      7     df_to_submit = zip(test_df_original['Id'],predictions)
      8 except:
      9     #we run this for pycaret model
---> 10     predictions = predict_model(model, data=test_df)
     11     #zip the predictions with respective id and save to designated filename
     12     df_to_submit = zip(test_df_original['Id'],predictions['Label'])

File ~\AppData\Roaming\Python\Python38\site-packages\pycaret\regression.py:1854, in predict_model(estimator, data, drift_report, round, verbose, drift_kwargs)
   1793 def predict_model(
   1794     estimator,
   1795     data: Optional[pd.DataFrame] = None,
   (...)
   1799     drift_kwargs: Optional[dict] = None,
   1800 ) -> pd.DataFrame:
   1802     """
   1803     This function predicts ``Label`` using a trained model. When ``data`` is
   1804     None, it predicts label on the holdout set.
   (...)
   1851 
   1852     """
-> 1854     return pycaret.internal.tabular.predict_model(
   1855         estimator=estimator,
   1856         data=data,
   1857         drift_report=drift_report,
   1858         probability_threshold=None,
   1859         encoded_labels=True,
   1860         round=round,
   1861         verbose=verbose,
   1862         ml_usecase=MLUsecase.REGRESSION,
   1863         drift_kwargs=drift_kwargs,
   1864     )

File ~\AppData\Roaming\Python\Python38\site-packages\pycaret\internal\tabular.py:9116, in predict_model(estimator, data, probability_threshold, encoded_labels, drift_report, raw_score, round, verbose, ml_usecase, display, drift_kwargs)
   9113         probability_threshold = estimator.probability_threshold
   9114     estimator = get_estimator_from_meta_estimator(estimator)
-> 9116 pred = np.nan_to_num(estimator.predict(X_test_))
   9118 try:
   9119     score = estimator.predict_proba(X_test_)

File ~\anaconda3\envs\dsi-sg\lib\site-packages\mlflow\sklearn\__init__.py:1686, in _autolog.<locals>._apply_sklearn_descriptor_unbound_method_call_fix.<locals>.patched_IffHasAttrDescriptor__get__.<locals>.out(*args, **kwargs)
   1685 def out(*args, **kwargs):
-> 1686     return self.fn(obj, *args, **kwargs)

File ~\anaconda3\envs\dsi-sg\lib\site-packages\mlflow\utils\autologging_utils\safety.py:435, in safe_patch.<locals>.safe_patch_function(*args, **kwargs)
    420 if (
    421     active_session_failed
    422     or autologging_is_disabled(autologging_integration)
   (...)
    429     # warning behavior during original function execution, since autologging is being
    430     # skipped
    431     with set_non_mlflow_warnings_behavior_for_current_thread(
    432         disable_warnings=False,
    433         reroute_warnings=False,
    434     ):
--> 435         return original(*args, **kwargs)
    437 # Whether or not the original / underlying function has been called during the
    438 # execution of patched code
    439 original_has_been_called = False

File ~\AppData\Roaming\Python\Python38\site-packages\sklearn\pipeline.py:408, in Pipeline.predict(self, X, **predict_params)
    406 for _, name, transform in self._iter(with_final=False):
    407     Xt = transform.transform(Xt)
--> 408 return self.steps[-1][-1].predict(Xt, **predict_params)

File ~\anaconda3\envs\dsi-sg\lib\site-packages\mlflow\pyfunc\__init__.py:373, in PyFuncModel.predict(self, data)
    371 if input_schema is not None:
    372     data = _enforce_schema(data, input_schema)
--> 373 return self._predict_fn(data)

File ~\anaconda3\envs\dsi-sg\lib\site-packages\mlflow\sklearn\__init__.py:1686, in _autolog.<locals>._apply_sklearn_descriptor_unbound_method_call_fix.<locals>.patched_IffHasAttrDescriptor__get__.<locals>.out(*args, **kwargs)
   1685 def out(*args, **kwargs):
-> 1686     return self.fn(obj, *args, **kwargs)

File ~\anaconda3\envs\dsi-sg\lib\site-packages\mlflow\utils\autologging_utils\safety.py:435, in safe_patch.<locals>.safe_patch_function(*args, **kwargs)
    420 if (
    421     active_session_failed
    422     or autologging_is_disabled(autologging_integration)
   (...)
    429     # warning behavior during original function execution, since autologging is being
    430     # skipped
    431     with set_non_mlflow_warnings_behavior_for_current_thread(
    432         disable_warnings=False,
    433         reroute_warnings=False,
    434     ):
--> 435         return original(*args, **kwargs)
    437 # Whether or not the original / underlying function has been called during the
    438 # execution of patched code
    439 original_has_been_called = False

File ~\AppData\Roaming\Python\Python38\site-packages\sklearn\pipeline.py:407, in Pipeline.predict(self, X, **predict_params)
    405 Xt = X
    406 for _, name, transform in self._iter(with_final=False):
--> 407     Xt = transform.transform(Xt)
    408 return self.steps[-1][-1].predict(Xt, **predict_params)

File ~\AppData\Roaming\Python\Python38\site-packages\pycaret\internal\preprocess.py:364, in DataTypes_Auto_infer.transform(self, dataset, y)
    362 # drop any columns that were asked to drop
    363 data.drop(columns=self.features_todrop, errors="ignore", inplace=True)
--> 364 data = data[self.final_training_columns]
    366 # also make sure that all the column names are string
    367 data.columns = [str(i) for i in data.columns]

File ~\anaconda3\envs\dsi-sg\lib\site-packages\pandas\core\frame.py:3511, in DataFrame.__getitem__(self, key)
   3509     if is_iterator(key):
   3510         key = list(key)
-> 3511     indexer = self.columns._get_indexer_strict(key, "columns")[1]
   3513 # take() does not accept boolean indexers
   3514 if getattr(indexer, "dtype", None) == bool:

File ~\anaconda3\envs\dsi-sg\lib\site-packages\pandas\core\indexes\base.py:5796, in Index._get_indexer_strict(self, key, axis_name)
   5793 else:
   5794     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 5796 self._raise_if_missing(keyarr, indexer, axis_name)
   5798 keyarr = self.take(indexer)
   5799 if isinstance(key, Index):
   5800     # GH 42790 - Preserve name from an Index

File ~\anaconda3\envs\dsi-sg\lib\site-packages\pandas\core\indexes\base.py:5859, in Index._raise_if_missing(self, key, indexer, axis_name)
   5856     raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   5858 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 5859 raise KeyError(f"{not_found} not in index")

KeyError: "['MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functional', 'Garage Type', 'Garage Finish', 'Garage Cond', 'Paved Drive', 'Fence', 'Misc Feature', 'Sale Type'] not in index"

I am using MLflow 1.27.0.

I would really appreciate any advice on resolving this. Thanks!

Downloaded artifact from MLFlow output error

0 Answers0