I want a process that returns a list of machine learning models together with their accuracy scores, where each model is scored only with the set of parameters that gives the best result for that type of model.
As an example, here is the cross-validation for XGBoost only:
dataset:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load iris and put the features and the label side by side in one frame.
iris = load_iris()
data = pd.DataFrame(
    data=np.column_stack((iris.data, iris.target)),
    columns=[*iris.feature_names, 'target'],
)

# Separate the label column from the features, then hold out a third of the
# rows for testing (fixed seed so the split is reproducible).
y = data['target']
X = data.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
Function for finding the best params:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer
# Accuracy-based scorer object, passed to GridSearchCV as its `scoring` argument.
# NOTE: the plan is to replace this with an F1-based scorer later.
accu = make_scorer(accuracy_score) # I will be using f1 in future
def predict_for_best_params(alg, X_train, y_train, X_test, params=None):
    """Grid-search `alg` and predict `X_test` with the best estimator found.

    Args:
        alg: Unfitted scikit-learn compatible estimator to tune.
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict once the search has finished.
        params: Parameter grid handed to ``GridSearchCV``. When omitted, the
            grid ``{'n_estimators': [200, 300, 500]}`` is used.

    Returns:
        The predictions for `X_test` produced by the fitted search object.
    """
    if params is None:
        params = {'n_estimators': [200, 300, 500]}
    clf = GridSearchCV(alg, params, scoring=accu, cv=2)
    clf.fit(X_train, y_train)
    print(clf.best_estimator_)
    # With GridSearchCV's default refit=True, predict() uses best_estimator_.
    return clf.predict(X_test)
Using it on one model:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Tune the only model in the list, then report its accuracy on the test split.
alg = [XGBClassifier()]
y_pred = predict_for_best_params(alg[0], X_train, y_train, X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
What I want to achieve is something like:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
# Candidate models to tune; add further estimators here.
alg = [XGBClassifier(), RandomForestClassifier()]  # list of many of them
# One parameter grid per model. Keys are the estimator class names, and the
# parameter names must match the estimator's constructor arguments exactly
# (no stray whitespace), otherwise the grid search rejects them.
alg_params = {
    'XGBClassifier': [{'n_estimators': [200, 300, 500]}],
    'RandomForestClassifier': [{'max_depth': [1, 2, 3, 4]}],
}
def predict_for_best_params(alg, X_train, y_train, X_test, params):
    """Tune `alg` over `params` with a 2-fold grid search and predict `X_test`.

    The best estimator found is printed before predicting.

    Args:
        alg: Estimator to tune.
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict after fitting.
        params: Parameter grid forwarded to ``GridSearchCV``.

    Returns:
        The predictions of the fitted search object for `X_test`.
    """
    search = GridSearchCV(alg, params, scoring=accu, cv=2)
    search.fit(X_train, y_train)
    print(search.best_estimator_)
    return search.predict(X_test)
# Tune every candidate model with its own grid and report test accuracy.
for algo in alg:
    # str(algo) is the estimator's repr (parentheses and possibly constructor
    # arguments), so it cannot be used as a lookup key; the bare class name
    # is stable regardless of how the estimator was configured.
    params = alg_params[type(algo).__name__][0]
    y_pred = predict_for_best_params(algo, X_train, y_train, X_test, params)
    print('{} accuracy is: {}'.format(algo, accuracy_score(y_test, y_pred)))
Is this a good way to achieve it?