Does anyone have a good example of integrate Hyperopt into Spark's MlLib? I have been trying to do so on Databricks and continue to get the same error. I am not sure if this is an issue with my objective function or instead if it's something with Spark ML on pyspark and how it hooks into Databricks.
import itertools
from pyspark.sql import functions as f
from pyspark.sql import DataFrame
from pyspark.sql.types import *
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import OneHotEncoder, Imputer, VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.ml.classification import GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np
from itertools import product
from hyperopt import fmin, hp, tpe, STATUS_OK, SparkTrials
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
search_space ={'maxDepth' : hp.choice("maxDepth", np.arange(3, 8, dtype=int)),
'maxIter' : hp.uniform("maxIter", 200,800),
'featureSubsetStrategy' : str(hp.quniform("featureSubsetStrategy", .5,1,.1)),
'minInstancesPerNode' : hp.uniform("min_child_weight", 1,10),
'stepSize' : hp.loguniform('stepSize', np.log(0.01), np.log(0.1)),
'subsamplingRate' : hp.quniform("featureSubsetStrategy", .5,1,.1)
}
evaluator = BinaryClassificationEvaluator(labelCol="positive")
def train(params):
gbtModel = GBTClassifier(labelCol="positive", featuresCol="features").fit(train)
predictions_val = gbtModel.predict(val.map(lambda x: x.features))
labelsAndPredictions = val.map(lambda lp: lp.label).zip(predictions_val)
ROC = evaluator.evaluate(predictions_val, {evaluator.metricName: "areaUnderROC"})
return {'ROC': ROC, 'status': STATUS_OK}
N_HYPEROPT_PROBES = 1000 #can increase, keep small for testing
EARLY_STOPPING = 50
HYPEROPT_ALGO = tpe.suggest
NB_CV_FOLDS = 5 # for testing, can increase
obj_call_count = 0
cur_best_score = 1000000
spark_trials = SparkTrials(parallelism=4)
best = fmin(fn=train,
space=search_space,
algo=HYPEROPT_ALGO,
max_evals=N_HYPEROPT_PROBES,
trials=spark_trials,
verbose=1)
After this runs I get the below error:
Total Trials: 0: 0 succeeded, 0 failed, 0 cancelled. py4j.Py4JException: Method __getstate__([]) does not exist