Happy Friday.
I am trying to create a pipeline for multiple classifiers.
I started off by finding this
Unfortunately it is a little above my skill level right now and I could not get it to work properly, so I ended up going the more verbose and lengthy route:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

def multi_tester(X_train, y_train):
    # tokenize is my own tokenizer function, defined elsewhere
    pipe_1 = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    pipe_2 = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', ExtraTreesClassifier())
    ])
    pipe_3 = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', AdaBoostClassifier())
    ])
    pipe_4 = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', GradientBoostingClassifier())
    ])
    pars = [
        {'clf__estimator': [MultiOutputClassifier()]},
        {'clf__estimator': [ExtraTreesClassifier()]},
        {'clf__estimator': [AdaBoostClassifier()]},
        {'clf__estimator': [GradientBoostingClassifier()]}
    ]
    pips = [pipe_1, pipe_2, pipe_3, pipe_4]
    pip_names = ['MultiOutputClassifier', 'ExtraTreesClassifier',
                 'AdaBoostClassifier', 'GradientBoostingClassifier']

    scoring = {'AUC': 'roc_auc',
               'F1': metrics.make_scorer(metrics.f1_score),
               'recall': metrics.make_scorer(metrics.recall_score),
               'precision': metrics.make_scorer(metrics.precision_score)}

    print("starting Gridsearch")
    for i in range(len(pars)):
        gs = GridSearchCV(pips[i], pars[i], scoring=scoring,
                          cv=5, verbose=2, refit=False, n_jobs=-1,
                          return_train_score=True)
        gs = gs.fit(X_train, y_train)
        print("finished Gridsearch for: ", pip_names[i])
        print(gs.best_score_)
Unfortunately, I think I declared either the estimators or the params incorrectly, because when I run this:
multi_tester(X_train, y_train)
I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-32-fd713f1ef4da> in <module>
----> 1 multi_tester(X_train, y_train)
<ipython-input-30-287aec48dbda> in multi_tester(X_train, y_train)
25
26 pars = [
---> 27 {'clf__estimator': [MultiOutputClassifier()]},
28 {'clf__estimator': [ExtraTreesClassifier()]},
29 {'clf__estimator': [AdaBoostClassifier()]},
TypeError: __init__() missing 1 required positional argument: 'estimator'
I've gone over the documentation and thought the way I instantiated it would pick up the default params, but I clearly got it wrong.
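From the docs as I read them, MultiOutputClassifier takes the wrapped estimator as a required first positional argument, so I suspect the bare MultiOutputClassifier() calls inside pars are what raise the TypeError. On its own, an instantiation along these lines should be valid:

from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# The wrapped estimator is a required positional argument; a bare
# MultiOutputClassifier() has nothing to delegate to, hence the TypeError.
clf = MultiOutputClassifier(RandomForestClassifier())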
If you have any suggestions or input on how I could deal with this, it would be greatly appreciated.
-#####################################################################################################-
Also, for future reference in case you find this on Stack Overflow and want to attempt the David Batista code, this is how far I got:
and here are some SO questions that I thought were really helpful:
For now I can't troubleshoot it further, but it seems like restructuring the params so that each value is a list will fix the issue (see my untested guess at the very end of this post).
from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier

class ClfSwitcher(BaseEstimator):

    def __init__(self, estimator=SGDClassifier()):
        """
        A custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
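On its own the switcher seems straightforward to use; a minimal sanity check (with made-up toy data, purely for illustration and untested against my real pipeline) would look something like this:

import numpy as np

# Made-up toy data, just to exercise ClfSwitcher outside the pipeline.
X_toy = np.random.rand(20, 5)
y_toy = np.random.randint(0, 2, size=20)

switcher = ClfSwitcher(estimator=SGDClassifier())
switcher.fit(X_toy, y_toy)
print(switcher.score(X_toy, y_toy))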
And then I defined some of the parameters for the models I want to look at like this:
search_space = [{
    'ExtraTreesClassifier': {'n_estimators': [200]},
    'RandomForestClassifier': {'n_estimators': [200]},
    'AdaBoostClassifier': {'n_estimators': [200]},
    'GradientBoostingClassifier': {'n_estimators': [200], 'learning_rate': [0.8, 1.0]},
    'SVC': [{'kernel': ['linear'], 'C': [1, 10]},
            {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]}],
    'MultiOutputClassifier': {'n_estimators': [200]}
}]
Finally, the pipeline was defined in this way:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', ClfSwitcher())
])
and the corresponding scoring:
scoring = {'AUC': 'roc_auc',
           'F1': metrics.make_scorer(metrics.f1_score),
           'recall': metrics.make_scorer(metrics.recall_score),
           'precision': metrics.make_scorer(metrics.precision_score)}
but when I instantiate the grid search, it raises an error:
grid = GridSearchCV(estimator = pipeline, param_grid = search_space, cv = 10, scoring = scoring, return_train_score = True,
n_jobs = -1, refit = 'AUC')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-26-7b76d42489a9> in <module>
1 grid = GridSearchCV(estimator = pipeline, param_grid = search_space, cv = 10, scoring = scoring, return_train_score = True,
----> 2 n_jobs = -1, refit = 'AUC')
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in __init__(self, estimator, param_grid, scoring, n_jobs, iid, refit, cv, verbose, pre_dispatch, error_score, return_train_score)
1143 return_train_score=return_train_score)
1144 self.param_grid = param_grid
-> 1145 _check_param_grid(param_grid)
1146
1147 def _run_search(self, evaluate_candidates):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _check_param_grid(param_grid)
369 raise ValueError("Parameter values for parameter ({0}) need "
370 "to be a sequence(but not a string) or"
--> 371 " np.ndarray.".format(name))
372
373 if len(v) == 0:
ValueError: Parameter values for parameter (ExtraTreesClassifier) need to be a sequence(but not a string) or np.ndarray.
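For completeness, here is my current untested guess at what the search space needs to look like, based on that ValueError and on the clf__estimator convention: every key routed through the 'clf' pipeline step into the switcher's estimator attribute, and every value a list so GridSearchCV accepts it.

from sklearn.svm import SVC

# Untested guess: keys are routed through the pipeline step name ('clf') and
# the ClfSwitcher attribute ('estimator'); every value is a list/sequence.
search_space = [
    {'clf__estimator': [MultiOutputClassifier(RandomForestClassifier())],
     # one more '__estimator' hop to reach the wrapped forest's params
     'clf__estimator__estimator__n_estimators': [200]},
    {'clf__estimator': [ExtraTreesClassifier()],
     'clf__estimator__n_estimators': [200]},
    {'clf__estimator': [RandomForestClassifier()],
     'clf__estimator__n_estimators': [200]},
    {'clf__estimator': [AdaBoostClassifier()],
     'clf__estimator__n_estimators': [200]},
    {'clf__estimator': [GradientBoostingClassifier()],
     'clf__estimator__n_estimators': [200],
     'clf__estimator__learning_rate': [0.8, 1.0]},
    {'clf__estimator': [SVC(kernel='linear')],
     'clf__estimator__C': [1, 10]},
    {'clf__estimator': [SVC(kernel='rbf')],
     'clf__estimator__C': [1, 10],
     'clf__estimator__gamma': [0.001, 0.0001]},
]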