I am working on workflows using Pipeline and GridSearchCV.
A minimal working example (MWE) for RandomForest is below:
#################################################################
# Libraries
#################################################################
import time
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
#################################################################
# Data loading
#################################################################
# Training and test sets are read from CSV files in the working directory
train, test = pd.read_csv("data_train.csv"), pd.read_csv("data_test.csv")
#################################################################
# Train / validation split
#################################################################
# Target: the fault_severity column of the training data
y = train['fault_severity']
# Features: every other column of the training data
X = train.drop(columns=['fault_severity'])
# Test data is used as-is (all of its columns are fed to the model later)
x = test
# Hold out 20% of the training rows as a validation set; the fixed
# random_state keeps the split reproducible between runs
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, train_size=0.8, random_state=0
)
#################################################################
# Pipeline
#################################################################
# Single-step pipeline; the step name 'clf' is the prefix that the
# grid keys below use to address the classifier's parameters
pipe_rf = Pipeline(steps=[('clf', RandomForestClassifier(random_state=0))])
# Hyper-parameter grid: 2 x 1 x 2 x 2 = 8 candidate combinations
parameters_rf = dict(
    clf__n_estimators=[30, 40],
    clf__criterion=['entropy'],
    clf__min_samples_split=[15, 20],
    clf__min_samples_leaf=[3, 4],
)
# 5-fold cross-validated search scored by negated mean absolute error;
# refit=True refits the best candidate once the search has finished
grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=parameters_rf,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
)
#################################################################
# Modeling
#################################################################
start_time = time.time()
grid_rf.fit(X_train, y_train)
# The search is scored with 'neg_mean_absolute_error', so both score() and
# best_score_ are the *negated* MAE (scikit-learn scorers follow a
# "greater is better" convention). Flip the sign so the numbers reported
# under the "MAE" labels are real, non-negative errors.
# Calculate the validation score once and reuse it when needed
mae = -grid_rf.score(X_valid, y_valid)
print("Best params : %s" % grid_rf.best_params_)
# best_score_ is the mean cross-validated score of the best candidate
print("Best training data MAE score : %s" % -grid_rf.best_score_)
print("Best validation data MAE score (*) : %s" % mae)
# Elapsed wall-clock time since start_time, formatted as HH:MM:SS
print("Modeling time : %s" % time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
#################################################################
# Prediction
#################################################################
# Predict on the test data and wrap the resulting array in a
# one-column dataframe named 'prediction'
y_pred = pd.DataFrame(grid_rf.predict(x), columns=['prediction'])
# Put the test-set 'id' column in front of the prediction
y_pred.insert(0, 'id', x['id'])
# Write the result with a header row and without the index column
y_pred.to_csv("data_predict.csv", header=True, index=False)
# Output
# id,prediction
# 11066,0
# 18000,2
# 16964,0
# ...., ....
I also have an MWE for XGBoost, as below:
#################################################################
# Libraries
#################################################################
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
#################################################################
# Data loading
#################################################################
# Training and test sets are read from CSV files in the working directory
train, test = pd.read_csv("data_train.csv"), pd.read_csv("data_test.csv")
#################################################################
# Train / validation split
#################################################################
# Target: the fault_severity column of the training data
y = train['fault_severity']
# Features: every other column of the training data
X = train.drop(columns=['fault_severity'])
# Test data is used as-is
x = test
# Hold out 20% of the training rows as a validation set with a fixed seed.
# NOTE: X_valid / y_valid are not used in the visible part of this script.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, train_size=0.8, random_state=0
)
#################################################################
# DMatrix
#################################################################
# Booster settings for a 3-class problem
params = dict(
    max_depth=6,
    objective='multi:softprob',  # multiclass objective that outputs per-class probabilities
    num_class=3,
    # NOTE(review): 'n_gpus' may not be recognised by recent XGBoost
    # releases - confirm against the installed version
    n_gpus=0,
)
# Wrap the data in XGBoost's DMatrix container; only the training
# matrix carries labels
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(test)
#################################################################
# Modeling
#################################################################
start_time = time.time()
# No num_boost_round is passed, so xgb.train uses its default number of rounds
bst = xgb.train(params, dtrain)
# Report the elapsed training time (HH:MM:SS), as the RandomForest example
# does; start_time was otherwise assigned but never used
print("Modeling time : %s" % time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
#################################################################
# Prediction
#################################################################
# Predict on the test matrix and wrap the result in a dataframe with
# one named column per class
y_pred = pd.DataFrame(
    bst.predict(dtest),
    columns=['prediction_0', 'prediction_1', 'prediction_2'],
)
# Put the test-set 'id' column in front of the predictions
y_pred.insert(0, 'id', x['id'])
# Write the result with a header row and without the index column
y_pred.to_csv("data_predict_xgb.csv", header=True, index=False)
# Expected Output:
# id,prediction_0,prediction_1,prediction_2
# 11066,0.4674369,0.46609518,0.06646795
# 18000,0.7578633,0.19379888,0.048337903
# 16964,0.9296321,0.04505246,0.025315404
# ...., ...., ...., ....
Questions:
How can the XGBoost MWE be converted to use the Pipeline and GridSearchCV approach shown in the RandomForest MWE? I need to set 'num_class', which XGBRegressor() does not support.
How can I get multi-class prediction output from RandomForest like the XGBoost output (i.e. prediction_0, prediction_1, prediction_2)? The sample outputs are given in the MWEs above. I found that num_class is not supported by RandomForestClassifier.
I have spent several days working on this and am still blocked. I would appreciate some pointers to move forward.
Data: