
I'm trying to write a custom version of CrossValidator that evaluates a list of metrics (averaged over the folds) and, as usual, picks the best model according to a single chosen one (e.g. the first in the list). In doing so, I'd like it to autolog (with mlflow) all of the metrics it evaluates, rather than just the one it optimises.

Inspecting pyspark.ml.tuning.CrossValidator, I don't see any mlflow code at all, yet Databricks somehow knows what to log. Does anyone know how it determines what needs to be logged, and how to make it autolog what you want when the feature is enabled?
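
For what it's worth, the only mlflow knob I'm aware of here is the pyspark.ml autologging flavour, which Databricks seems to turn on by default. A minimal sketch of enabling it explicitly, assuming that is indeed the mechanism involved:

import mlflow
import mlflow.pyspark.ml

# Assumption: this is the feature Databricks enables by default; with it on,
# fitting my cross-validator below gets its params and avg/std metrics logged
# even though the estimator itself contains no mlflow code.
mlflow.pyspark.ml.autolog()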

Thanks in advance!

The following seems to work, but only VerboseCrossValidatorModel.avgMetrics and VerboseCrossValidatorModel.stdMetrics are autologged. I'd like VerboseCrossValidatorModel.verboseAvgMetrics and VerboseCrossValidatorModel.verboseStdMetrics to be autologged as well.

from pyspark.sql.functions import col, when
from pyspark.sql import DataFrame
from pyspark.ml import Estimator, Model, Transformer
from pyspark.ml.evaluation import Evaluator
from pyspark.ml.param import Param, Params
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from typing import List, Sequence, Callable, Tuple, Optional, cast
from multiprocessing.pool import ThreadPool
from pyspark import keyword_only, inheritable_thread_target
import numpy as np

def _parallelFitTasksVerbose(
  est: Estimator, train: DataFrame, evas: List[Evaluator], validation: DataFrame, epm: Sequence["ParamMap"], collectSubModel: bool
) -> List[Callable[[], Tuple[int, List[float], Optional[Transformer]]]]:
  # Same idea as pyspark.ml.tuning._parallelFitTasks, except each task returns one metric per evaluator.
  modelIter = est.fitMultiple(train, epm)
  def singleTask() -> Tuple[int, List[float], Optional[Transformer]]:
    index, model = next(modelIter)
    pred = model.transform(validation, epm[index])
    metrics = [eva.evaluate(pred) for eva in evas]
    return index, metrics, model if collectSubModel else None
  return [singleTask] * len(epm)

def _gen_avg_and_std_metrics_verbose(metrics_all: List[List[List[float]]]) -> Tuple[List[List[float]], List[List[float]]]:
  # metrics_all has shape (nFolds, numModels, numEvaluators); average/std over the folds.
  return np.mean(metrics_all, axis=0).tolist(), np.std(metrics_all, axis=0).tolist()

class VerboseCrossValidatorModel(CrossValidatorModel):
  def __init__(self, bestModel: Model, avgMetrics: Optional[List[List[float]]] = None, subModels: Optional[List[List[Model]]] = None, stdMetrics: Optional[List[List[float]]] = None):
    super(CrossValidatorModel, self).__init__()
    self.bestModel = bestModel
    # avgMetrics/stdMetrics have shape (numModels, numEvaluators); keep the base-class
    # attributes as flat per-model lists for the first (optimised) evaluator, as CrossValidatorModel expects.
    self.avgMetrics = [m[0] for m in avgMetrics] if avgMetrics else []
    self.stdMetrics = [m[0] for m in stdMetrics] if stdMetrics else []
    self.subModels = subModels

    # Full per-evaluator metrics that I'd like to see autologged as well.
    self.verboseAvgMetrics = avgMetrics or []
    self.verboseStdMetrics = stdMetrics or []

class VerboseCrossValidator(CrossValidator):
  evaluators: Param[List[Evaluator]] = Param(Params._dummy(), "evaluators", "evaluators used to compute performance metrics; the best model is selected using the first one")

  @keyword_only
  def __init__(self,
    *,
    estimator: Optional[Estimator] = None,
    estimatorParamMaps: Optional[List["ParamMap"]] = None,
    evaluator: Optional[Evaluator] = None,
    numFolds: int = 3,
    seed: Optional[int] = None,
    parallelism: int = 1,
    collectSubModels: bool = False,
    foldCol: str = "",
    evaluators: Optional[List[Evaluator]] = None,
  ) -> None:
    # capture kwargs before the base __init__ (also @keyword_only) resets _input_kwargs
    kwargs = self._input_kwargs
    super(VerboseCrossValidator, self).__init__()
    self._set(**kwargs)

  def getEvaluators(self) -> List[Evaluator]:
    return self.getOrDefault(self.evaluators)

  def setEvaluators(self, value: List[Evaluator]) -> "VerboseCrossValidator":
    return self._set(evaluators=value)

  def _fit(self, dataset: DataFrame) -> "CrossValidatorModel":
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluators)
    nFolds = self.getOrDefault(self.numFolds)
    # one list of metrics per (fold, model); build with comprehensions so inner lists aren't shared references
    metrics_all = [[[0.0] * len(eva) for _ in range(numModels)] for _ in range(nFolds)]
    pool = ThreadPool(processes=min(self.getParallelism(), numModels))
    subModels = None
    collectSubModelsParam = self.getCollectSubModels()
    if collectSubModelsParam:
      subModels = [[None for j in range(numModels)] for i in range(nFolds)]
    datasets = self._kFold(dataset)
    for i in range(nFolds):
      validation = datasets[i][1].cache()
      train = datasets[i][0].cache()
      tasks = map(inheritable_thread_target, _parallelFitTasksVerbose(est, train, eva, validation, epm, collectSubModelsParam))
      for j, metrics, subModel in pool.imap_unordered(lambda f: f(), tasks):
        metrics_all[i][j] = metrics
        if collectSubModelsParam:
          assert subModels is not None
          subModels[i][j] = subModel
      validation.unpersist()
      train.unpersist()
    
    metrics, std_metrics = _gen_avg_and_std_metrics_verbose(metrics_all)
    # metrics has shape (numModels, numEvaluators); the best model is chosen on the first evaluator only
    bestIndex = np.argmax(metrics, axis=0)[0] if eva[0].isLargerBetter() else np.argmin(metrics, axis=0)[0]
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(VerboseCrossValidatorModel(bestModel, metrics, cast(List[List[Model]], subModels), std_metrics))
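
For completeness, this is roughly how I exercise it (a sketch only; the LogisticRegression stage, the two evaluators and train_df are placeholders for illustration):

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

lr = LogisticRegression(featuresCol="features", labelCol="label")
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()

cv = VerboseCrossValidator(
  estimator=lr,
  estimatorParamMaps=grid,
  evaluators=[
    BinaryClassificationEvaluator(metricName="areaUnderROC"),  # metric the CV optimises
    MulticlassClassificationEvaluator(metricName="f1"),        # extra metric I'd like logged too
  ],
  numFolds=3,
)

# With autologging enabled as above, avgMetrics/stdMetrics of the fitted model
# show up in the MLflow run, but verboseAvgMetrics/verboseStdMetrics do not.
cvModel = cv.fit(train_df)  # train_df: placeholder DataFrame with "features"/"label" columns
print(cvModel.verboseAvgMetrics)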