0

I'm getting started with Raytune and trying to set up a HyperOptSearch with imbalanced data. Fitting a pipeline without RandomOverSampler works fine, but when I add that in, I get the error:

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough'

Code sample here, and works fine without the RandomOverSampler step:

# Hyperparameter search space for the gradient-boosting classifier.
# Keys use sklearn's "step__param" convention to target the 'clf' step.
cfg_hgb = dict(
    clf__learning_rate=tune.loguniform(0.001, 0.8),
    clf__max_leaf_nodes=tune.randint(2, 20),
    clf__min_samples_leaf=tune.randint(50, 500),
    clf__max_depth=tune.randint(2, 15),
    clf__l2_regularization=tune.loguniform(0.001, 1000),
    clf__max_iter=tune.choice([800]),  # fixed: single choice
)
# TPE (Bayesian) searcher; the search space itself is supplied later
# through Tuner's param_space, so no `space` argument is given here.
hyperopt = HyperOptSearch(
    metric="mean_auc",
    mode="max",
    points_to_evaluate=None,        # no warm-start configurations
    n_initial_points=20,            # random trials before TPE takes over
    random_state_seed=RANDOM_STATE,
    gamma=0.25,
)

def train_hgb(config):
    """Ray Tune trainable: cross-validated HistGradientBoosting with oversampling.

    Builds a (preprocess -> oversample -> classify) pipeline, applies the
    sampled hyperparameters from ``config`` via ``set_params``, scores each
    stratified fold with macro ROC AUC, and reports mean/std through
    ``session.report`` so the searcher can optimize "mean_auc".

    Parameters
    ----------
    config : dict
        Hyperparameters sampled by Ray Tune, keyed "clf__<param>".
    """
    # FIX: samplers such as RandomOverSampler implement fit_resample, not
    # fit/transform, so they are only valid inside imbalanced-learn's
    # Pipeline. With sklearn.pipeline.Pipeline they raise:
    #   TypeError: All intermediate steps should be transformers and
    #   implement fit and transform or be the string 'passthrough'
    from imblearn.pipeline import Pipeline as ImbPipeline

    # LOAD DATA
    X, y, nominal, ordinal, numeric = load_clean_data()

    # LOAD TRANSFORMERS
    prep = Preprocessor(nominal, ordinal, numeric)

    # CHOOSE CV STRATEGY
    splitter = StratifiedKFold(CV, random_state=RANDOM_STATE, shuffle=True)

    # TRAIN
    scores = []
    for train_ind, val_ind in splitter.split(X, y):
        # Rebuilt per fold so each fold fits a fresh estimator.
        hgb_os = ImbPipeline(steps=[
            ('coltrans', prep.transformer_ord),
            ('ros', RandomOverSampler(random_state=RANDOM_STATE)),
            ('clf', HistGradientBoostingClassifier(
                        categorical_features=prep.cat_feature_mask,
                        random_state=RANDOM_STATE))
        ])
        hgb_os.set_params(**config)
        hgb_os.fit(X.iloc[train_ind], y[train_ind])

        # NOTE(review): AUC computed from hard predict() labels rather than
        # predict_proba(...)[:, 1] underestimates ROC AUC — confirm intent.
        y_pred = hgb_os.predict(X.iloc[val_ind])
        scores.append(roc_auc_score(y_true=y[val_ind], y_score=y_pred, average="macro"))

    # REPORT SCORES
    scores = np.asarray(scores)
    session.report({
        'mean_auc': scores.mean(),
        'std_auc': scores.std(),
    })

# Run the experiment: 10 hyperparameter samples drawn by HyperOptSearch
# from cfg_hgb; artifacts land under ./results/hgb/experiment_name.
tuner = tune.Tuner(
    trainable=train_hgb,
    param_space=cfg_hgb,
    tune_config=tune.TuneConfig(num_samples=10, search_alg=hyperopt),
    run_config=RunConfig(name="experiment_name", local_dir="./results/hgb"),
)
results = tuner.fit()

Whereas if using ray.tune.sklearn.TuneSearchCV, RandomOverSampler works fine in the pipeline:

# Narrower search space for the TuneSearchCV variant; parameter names are
# bare (no "clf__" prefix) because they apply directly to the estimator.
hgb_tune = dict(
    learning_rate=tune.loguniform(0.001, 0.15),
    max_leaf_nodes=tune.randint(2, 4),
    min_samples_leaf=tune.randint(160, 300),
    max_depth=tune.randint(2, 7),
    l2_regularization=tune.loguniform(5, 1000),
    max_iter=tune.choice([400]),  # fixed: single choice
)
# Nested-search variant: TuneSearchCV wraps only the final classifier, so
# the earlier pipeline steps are fitted once per outer fit() call.
search = TuneSearchCV(
    HistGradientBoostingClassifier(
        categorical_features=prep.cat_feature_mask,
        random_state=RANDOM_STATE),
    param_distributions=hgb_tune,
    cv=CV,
    scoring=SCORER,
    verbose=VERBOSE,
    search_optimization="bayesian",
    n_trials=N_TRIALS,
)  # local_dir='~/rayresults/hgbtune'

hgb_os = Pipeline(steps=[
    ('trans', prep.transformer_ord),
    ('ros', RandomOverSampler(random_state=RANDOM_STATE)),
    ('clf', search),
])

results, params = fit_eval(hgb_os, X_train, X_test, y_train, y_test)

I understand that sklearn's Pipeline expects intermediate steps to implement .fit and .transform, whereas RandomOverSampler only provides .fit_resample. Also note that RandomOverSampler requires imblearn.pipeline.Pipeline rather than sklearn.pipeline.Pipeline, so perhaps therein lies the problem.

Is there a way to add any form of resampling with the current Tuner API? Or do I need to part out the pipeline and resample it first outside of this loop?

Thanks in advance.

0 Answers