3

Does anyone have a good example of integrate Hyperopt into Spark's MlLib? I have been trying to do so on Databricks and continue to get the same error. I am not sure if this is an issue with my objective function or instead if it's something with Spark ML on pyspark and how it hooks into Databricks.

import itertools
from pyspark.sql import functions as f
from pyspark.sql import DataFrame
from pyspark.sql.types import *

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import OneHotEncoder, Imputer, VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.ml.classification import GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np
from itertools import product
from hyperopt import fmin, hp, tpe, STATUS_OK, SparkTrials
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

search_space ={'maxDepth'   : hp.choice("maxDepth", np.arange(3, 8, dtype=int)),
        'maxIter'       : hp.uniform("maxIter", 200,800),
        'featureSubsetStrategy' : str(hp.quniform("featureSubsetStrategy", .5,1,.1)),
        'minInstancesPerNode' : hp.uniform("min_child_weight", 1,10),
        'stepSize'    : hp.loguniform('stepSize', np.log(0.01), np.log(0.1)),
        'subsamplingRate'    : hp.quniform("featureSubsetStrategy", .5,1,.1)   
    }
evaluator = BinaryClassificationEvaluator(labelCol="positive")

def train(params):
  gbtModel = GBTClassifier(labelCol="positive", featuresCol="features").fit(train)
  predictions_val = gbtModel.predict(val.map(lambda x: x.features))
  labelsAndPredictions = val.map(lambda lp: lp.label).zip(predictions_val)
  ROC = evaluator.evaluate(predictions_val, {evaluator.metricName: "areaUnderROC"})

  return {'ROC': ROC, 'status': STATUS_OK}



N_HYPEROPT_PROBES = 1000 #can increase, keep small for testing
EARLY_STOPPING = 50
HYPEROPT_ALGO = tpe.suggest
NB_CV_FOLDS = 5 # for testing, can increase

obj_call_count = 0
cur_best_score = 1000000
spark_trials = SparkTrials(parallelism=4)
best = fmin(fn=train,
             space=search_space,
              algo=HYPEROPT_ALGO,
                     max_evals=N_HYPEROPT_PROBES,
                     trials=spark_trials,
                     verbose=1) 

After this runs I get the below error:

Total Trials: 0: 0 succeeded, 0 failed, 0 cancelled. py4j.Py4JException: Method __getstate__([]) does not exist

JD Haddon
  • 31
  • 2
  • I have the same question but have difficulty understand your code. the "params" was not used in your train function, and train, val tables were not defined. If you had eventually get this to work, please share. thanks – PaulDong Apr 14 '21 at 13:03

1 Answers1

1

Not sure if this is too late but SparkTrials only work with single-machine ML models like the ones found in the scikit-learn library. In the case of Spark MLib you should use Trials (you don't need to pass the trials parameter to the fmin function)

You can find more details here: http://hyperopt.github.io/hyperopt/scaleout/spark/

Since SparkTrials fits and evaluates each model on one Spark worker, it is limited to tuning single-machine ML models and workflows, such as scikit-learn or single-machine TensorFlow. For distributed ML algorithms such as Apache Spark MLlib or Horovod, you can use Hyperopt’s default Trials class.

nescobar
  • 81
  • 1
  • 5