I have found a workaround that relies on duck typing and doesn't get too much in the way.
It relies on passing complete estimators as parameters to the pipeline. We first sample the kind of model, and then its parameters. For that, we define two classes that can be sampled, i.e. that expose an `rvs` method:
from sklearn.model_selection import ParameterSampler
class EstimatorSampler:
    """
    Class that holds a model and its parameters distribution.
    When sampled, the parameters are first sampled and set on a fresh
    copy of the model, which is returned. The stored model itself is
    never modified.

    # Arguments
    ===========
    model : sklearn.base.BaseEstimator
        Template estimator; it is cloned on every draw.
    param_distributions : dict
        Input to ParameterSampler

    # Returns
    =========
    sampled : sklearn.base.BaseEstimator
        A new, unfitted estimator carrying the sampled parameters.
    """

    def __init__(self, model, param_distributions):
        self.model = model
        self.param_distributions = param_distributions

    def rvs(self, random_state=None):
        """Draw one parameter setting and return a configured model copy.

        `random_state` is forwarded unchanged to ParameterSampler.
        """
        # Imported locally so the class does not depend on an extra
        # module-level import being present in the file.
        from sklearn.base import clone

        sampler = ParameterSampler(self.param_distributions,
                                   n_iter=1,
                                   random_state=random_state)
        sampled_params = next(iter(sampler))
        # Clone before setting the parameters: `set_params` mutates in place
        # and returns the same object, so without the clone every draw would
        # hand out (and overwrite) the single shared `self.model` instance.
        return clone(self.model).set_params(**sampled_params)
class ListSampler:
    """
    List container that when sampled, returns one of its items,
    with probabilities defined by `probs`.
    If the drawn item itself exposes an `rvs` method, it is sampled in turn
    and the result of that nested draw is returned instead.

    # Arguments
    ===========
    items : 1-D array-like
    probs : 1-D array-like of floats
        If not None, it should be the same length as `items`
        and sum to 1. If None, items are drawn uniformly.

    # Returns
    =========
    sampled item
    """

    def __init__(self, items, probs=None):
        self.items = items
        self.probs = probs

    def rvs(self, random_state=None):
        """Draw one item, honouring `random_state`.

        `random_state` may be None (use numpy's global random state), an
        integer seed, or an object exposing a `choice` method such as
        `np.random.RandomState`.
        """
        if random_state is None:
            # The numpy.random module exposes `choice` on the global state.
            rng = np.random
        elif isinstance(random_state, (int, np.integer)):
            rng = np.random.RandomState(random_state)
        else:
            rng = random_state

        # Draw an index rather than the item itself, so that the items are
        # never coerced into a numpy array and are returned exactly as stored.
        idx = rng.choice(len(self.items), p=self.probs)
        item = self.items[idx]

        if hasattr(item, 'rvs'):
            # Share the resolved generator with the nested sampler; keep None
            # as None so the nested sampler applies its own default.
            nested_state = None if random_state is None else rng
            return item.rvs(random_state=nested_state)
        return item
And the rest of the code is defined below.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
# Both pipeline stages start empty; the randomized search fills them in with
# the estimators drawn from `param_dist` below.
pipe = Pipeline([('reduce_dim', None), ('classify', None)])

# Candidate sizes for the dimensionality-reduction stage.
N_FEATURES_OPTIONS = [2, 4, 8]
# One of the three reducers is drawn first, then its size parameter.
dim_reducers = ListSampler([
    EstimatorSampler(PCA(iterated_power=7),
                     {'n_components': N_FEATURES_OPTIONS}),
    EstimatorSampler(NMF(),
                     {'n_components': N_FEATURES_OPTIONS}),
    EstimatorSampler(SelectKBest(chi2),
                     {'k': N_FEATURES_OPTIONS}),
])

# Candidate regularisation strengths for the classifier stage.
C_OPTIONS = [1, 10, 100, 1000]
classifiers = EstimatorSampler(LinearSVC(), {'C': C_OPTIONS})

# Each pipeline step name maps to an object exposing `rvs`.
param_dist = {'reduce_dim': dim_reducers, 'classify': classifiers}

grid = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    scoring='accuracy',
    cv=3,
    n_jobs=2,
)

digits = load_digits()
grid.fit(digits.data, digits.target)