I'm getting started with Raytune and trying to set up a HyperOptSearch
with imbalanced data.
Fitting a pipeline without RandomOverSampler
works fine, but when I add that in, I get the error:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough'
Code sample here, and works fine without the RandomOverSampler step:
cfg_hgb = {
'clf__learning_rate' : tune.loguniform(0.001, 0.8),
'clf__max_leaf_nodes' : tune.randint(2,20),
'clf__min_samples_leaf' : tune.randint(50,500),
'clf__max_depth' : tune.randint(2,15),
'clf__l2_regularization' : tune.loguniform(0.001, 1000),
'clf__max_iter' : tune.choice([800]),
}
hyperopt = HyperOptSearch(
# space=cfg_hgb,
metric="mean_auc",
mode="max",
points_to_evaluate=None,
n_initial_points=20,
random_state_seed=RANDOM_STATE,
gamma=0.25,
)
def train_hgb(config):
# LOAD DATA
X, y, nominal, ordinal, numeric = load_clean_data()
# LOAD TRANSFORMERS
prep = Preprocessor(nominal, ordinal, numeric)
# CHOOSE CV STRATEGY
splitter = StratifiedKFold(CV, random_state=RANDOM_STATE, shuffle=True)
# TRAIN
scores = []
for train_ind, val_ind in splitter.split(X,y):
hgb_os = Pipeline(steps=[
('coltrans', prep.transformer_ord),
('ros', RandomOverSampler(random_state=RANDOM_STATE)), # if I comment out, works fine
('clf', HistGradientBoostingClassifier(
categorical_features=prep.cat_feature_mask,
random_state=RANDOM_STATE))
])
hgb_os.set_params(**config)
hgb_os.fit(X.iloc[train_ind], y[train_ind])
y_pred = hgb_os.predict(X.iloc[val_ind])
scores.append(roc_auc_score(y_true=y[val_ind], y_score=y_pred, average="macro"))
# REPORT SCORES
session.report({
'mean_auc' : np.array(scores).mean(),
'std_auc' : np.array(scores).std(),
})
tuner = tune.Tuner(
trainable=train_hgb,
param_space=cfg_hgb,
tune_config=tune.TuneConfig(
num_samples=10,
search_alg=hyperopt,
),
run_config=RunConfig(
name="experiment_name",
local_dir="./results/hgb",
)
)
results = tuner.fit()
Whereas if using ray.tune.sklearn.TuneSearchCV, RandomOverSampler works fine in the pipeline:
hgb_tune = {
'learning_rate' : tune.loguniform(0.001, 0.15),
'max_leaf_nodes' : tune.randint(2,4),
'min_samples_leaf' : tune.randint(160,300),
'max_depth' : tune.randint(2,7),
'l2_regularization' : tune.loguniform(5, 1000),
'max_iter' : tune.choice([400]),
}
hgb_os = Pipeline(steps=[
('trans', prep.transformer_ord),
('ros', RandomOverSampler(random_state=RANDOM_STATE)),
('clf', TuneSearchCV(
HistGradientBoostingClassifier(
categorical_features=prep.cat_feature_mask,
random_state=RANDOM_STATE),
param_distributions=hgb_tune,
cv=CV, scoring=SCORER,
verbose=VERBOSE, search_optimization="bayesian",
n_trials=N_TRIALS, )) # local_dir='~/rayresults/hgbtune'
])
results, params = fit_eval(hgb_os, X_train, X_test, y_train, y_test)
I understand that probably tune is expecting .fit_transform
for intermediate steps whereas RandomOverSampler uses .fit_resample
. Also note that RandomOverSampler
requires imblearn.pipeline.Pipeline
rather than sklearn.pipeline.Pipeline
so perhaps therein lies the problem.
Is there a way to add any form of resampling with the current Tuner API? Or do I need to part out the pipeline and resample it first outside of this loop?
Thanks in advance.