I'm trying to write a custom version of CrossValidator that evaluates (and averages over the folds) a list of metrics while, as usual, only picking the best model according to one chosen metric (e.g. the first in the list). In doing so, I'd like it to autolog (with MLflow) all of the metrics it evaluates, rather than just the one it optimises.
Inspecting pyspark.ml.tuning.CrossValidator, I don't see any MLflow code at all, yet Databricks somehow knows what to log. Does anyone know how it determines what to log, and how to make it autolog the metrics you want when the feature is enabled?
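For context, autologging is enabled roughly like this (a minimal sketch; on Databricks ML clusters it may already be on by default):

import mlflow
import mlflow.pyspark.ml

# Enables MLflow autologging for pyspark.ml estimators, including CrossValidator.
mlflow.pyspark.ml.autolog()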
Thanks in advance!
The following seems to work, but only VerboseCrossValidatorModel.avgMetrics and VerboseCrossValidatorModel.stdMetrics are autologged. I'd also like VerboseCrossValidatorModel.verboseAvgMetrics and VerboseCrossValidatorModel.verboseStdMetrics to be autologged (a usage sketch follows the code).
import numpy as np
from multiprocessing.pool import ThreadPool
from typing import List, Sequence, Callable, Tuple, Optional, cast
from pyspark import inheritable_thread_target, keyword_only
from pyspark.ml import Estimator, Model, Transformer
from pyspark.ml.evaluation import Evaluator
from pyspark.ml.param import Param, Params
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when
def _parallelFitTasksVerbose(
    est: Estimator,
    train: DataFrame,
    evas: List[Evaluator],
    validation: DataFrame,
    epm: Sequence["ParamMap"],
    collectSubModel: bool,
) -> List[Callable[[], Tuple[int, List[float], Optional[Transformer]]]]:
    # Like pyspark.ml.tuning._parallelFitTasks, but returns one metric per evaluator.
    modelIter = est.fitMultiple(train, epm)

    def singleTask() -> Tuple[int, List[float], Optional[Transformer]]:
        index, model = next(modelIter)
        pred = model.transform(validation, epm[index])
        # Score the same predictions with every evaluator, preserving evaluator order.
        metrics = [eva.evaluate(pred) for eva in evas]
        return index, metrics, model if collectSubModel else None

    return [singleTask] * len(epm)
def _gen_avg_and_std_metrics_verbose(metrics_all: List[List[List[float]]]) -> Tuple[List[List[float]], List[List[float]]]:
    # metrics_all has shape [numFolds][numModels][numEvaluators]; average and std over the folds.
    return np.mean(metrics_all, axis=0).tolist(), np.std(metrics_all, axis=0).tolist()
class VerboseCrossValidatorModel(CrossValidatorModel):
    def __init__(self, bestModel: Model, avgMetrics: Optional[List[List[float]]] = None, subModels: Optional[List[List[Model]]] = None, stdMetrics: Optional[List[List[float]]] = None):
        super(CrossValidatorModel, self).__init__()
        self.bestModel = bestModel
        # avgMetrics/stdMetrics have shape [numModels][numEvaluators]; keep the base-class
        # fields as "first evaluator only" (one value per param map) so existing tooling still works.
        self.avgMetrics = [m[0] for m in avgMetrics] if avgMetrics else []
        self.stdMetrics = [m[0] for m in stdMetrics] if stdMetrics else []
        self.subModels = subModels
        # Full per-evaluator metrics, which I'd also like autologged.
        self.verboseAvgMetrics = avgMetrics or []
        self.verboseStdMetrics = stdMetrics or []
class VerboseCrossValidator(CrossValidator):
    # Extra Param holding the full list of evaluators; the first one drives model selection.
    evaluators: Param[List[Evaluator]] = Param(
        Params._dummy(),
        "evaluators",
        "evaluators used to get performance metrics, of which the first one will be optimised",
    )

    @keyword_only
    def __init__(
        self,
        *,
        estimator: Optional[Estimator] = None,
        estimatorParamMaps: Optional[List["ParamMap"]] = None,
        evaluator: Optional[Evaluator] = None,
        numFolds: int = 3,
        seed: Optional[int] = None,
        parallelism: int = 1,
        collectSubModels: bool = False,
        foldCol: str = "",
        evaluators: Optional[List[Evaluator]] = None,
    ) -> None:
        # Capture the caller's kwargs before super().__init__() resets _input_kwargs.
        kwargs = self._input_kwargs
        super(VerboseCrossValidator, self).__init__()
        self._set(**kwargs)

    def getEvaluators(self) -> List[Evaluator]:
        return self.getOrDefault(self.evaluators)

    def setEvaluators(self, value: List[Evaluator]) -> "VerboseCrossValidator":
        return self._set(evaluators=value)
    def _fit(self, dataset: DataFrame) -> "CrossValidatorModel":
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        evas = self.getOrDefault(self.evaluators)
        nFolds = self.getOrDefault(self.numFolds)

        # metrics_all[fold][paramMap][evaluator]
        metrics_all = [[[0.0] * len(evas) for _ in range(numModels)] for _ in range(nFolds)]

        pool = ThreadPool(processes=min(self.getParallelism(), numModels))
        subModels = None
        collectSubModelsParam = self.getCollectSubModels()
        if collectSubModelsParam:
            subModels = [[None for j in range(numModels)] for i in range(nFolds)]

        datasets = self._kFold(dataset)
        for i in range(nFolds):
            validation = datasets[i][1].cache()
            train = datasets[i][0].cache()

            tasks = map(
                inheritable_thread_target,
                _parallelFitTasksVerbose(est, train, evas, validation, epm, collectSubModelsParam),
            )
            for j, metrics, subModel in pool.imap_unordered(lambda f: f(), tasks):
                metrics_all[i][j] = metrics
                if collectSubModelsParam:
                    assert subModels is not None
                    subModels[i][j] = subModel

            validation.unpersist()
            train.unpersist()

        # Average and std of every metric over the folds: both have shape [numModels][numEvaluators].
        metrics, std_metrics = _gen_avg_and_std_metrics_verbose(metrics_all)

        # Pick the best param map according to the first evaluator only.
        if evas[0].isLargerBetter():
            bestIndex = int(np.argmax(metrics, axis=0)[0])
        else:
            bestIndex = int(np.argmin(metrics, axis=0)[0])
        bestModel = est.fit(dataset, epm[bestIndex])

        return self._copyValues(
            VerboseCrossValidatorModel(bestModel, metrics, cast(List[List[Model]], subModels), std_metrics)
        )
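For completeness, this is roughly how I use it (a minimal sketch; the LogisticRegression estimator, the two evaluators and the train_df name are placeholders for my real setup):

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

lr = LogisticRegression(featuresCol="features", labelCol="label")
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
evaluators = [
    BinaryClassificationEvaluator(metricName="areaUnderROC"),   # optimised, and autologged via avgMetrics
    MulticlassClassificationEvaluator(metricName="f1"),         # evaluated, but currently not autologged
]

cv = VerboseCrossValidator(estimator=lr, estimatorParamMaps=grid, evaluators=evaluators, numFolds=3)
cvModel = cv.fit(train_df)  # train_df is a placeholder for my training DataFrame

print(cvModel.avgMetrics)         # this gets autologged
print(cvModel.verboseAvgMetrics)  # shape [numModels][numEvaluators]; this is what I'd also like autologged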