I am trying to predict the 'Full_Time_Home_Goals' column (the target).
I have followed the Kaggle example, and the code works with train and test sets of different sizes, as in my case (419 rows in the test data and 892 rows in the train data). Here is my code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
# Set option to display all the rows and columns in the dataset. If there are more rows, adjust number accordingly.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Files
data_train = pd.read_csv(r"C:\Users\harsh\Documents\My Dream\Desktop\Machine Learning\Attempt 3\train.csv")
data_test = pd.read_csv(r"C:\Users\harsh\Documents\My Dream\Desktop\Machine Learning\Attempt 3\test.csv")
columns = ['Id', 'HomeTeam', 'AwayTeam', 'Full_Time_Home_Goals']
col = ['Id', 'HomeTeam', 'AwayTeam']
data_test = data_test[col]
data_train = data_train[columns]
data_train = data_train.dropna()
data_test = data_test.dropna()
data_train['Full_Time_Home_Goals'] = data_train['Full_Time_Home_Goals'].astype(int)
from sklearn import preprocessing
def encode_features(df_train, df_test):
    features = ['HomeTeam', 'AwayTeam']
    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
data_train, data_test = encode_features(data_train, data_test)
print(data_train.head())
print(data_test.head())
# X_all contains all the columns used for prediction and y_all holds the one column we want to predict
X_all = data_train
y_all = data_train['Full_Time_Home_Goals']
from sklearn.model_selection import train_test_split
num_test = 0.20 # 80-20 split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
# Use a Random Forest with the parameter grid defined below
clf = RandomForestClassifier()
parameters = {'n_estimators': [4, 6, 9],
              'max_features': ['log2', 'sqrt', 'auto'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
              }
acc_scorer = make_scorer(accuracy_score)
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
clf = grid_obj.best_estimator_
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
The errors I am getting are:

With predictions = clf.predict(data_test.drop('Id', axis=1)) (line 98 of my script):

Traceback (most recent call last):
  File "C:/Users/harsh/PycharmProjects/Kaggle-Machine Learning from Start to Finish with Scikit-Learn/EPL Predicting.py", line 98, in <module>
    predictions = clf.predict(data_test.drop('Id', axis=1))
  File "C:\Users\harsh\PycharmProjects\GitHub\venv\lib\site-packages\sklearn\ensemble\_forest.py", line 629, in predict
ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 2
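If it helps, here is a small sanity check I added myself (it is not part of the Kaggle example); it seems to show where the 4-vs-2 mismatch comes from: X_all is all of data_train, so the model is fitted on 'Id' and 'Full_Time_Home_Goals' as well, while the test frame only has the two team columns once 'Id' is dropped.

# My own diagnostic: compare the columns the classifier was fitted on
# with the columns I pass in at predict time
print(list(X_train.columns))                       # ['Id', 'HomeTeam', 'AwayTeam', 'Full_Time_Home_Goals'] -> 4 features
print(list(data_test.drop('Id', axis=1).columns))  # ['HomeTeam', 'AwayTeam'] -> 2 features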
With that line changed from
predictions = clf.predict(data_test.drop('Id', axis=1))
to
predictions = clf.predict(X_test)
the error is:

    raise ValueError(msg)
ValueError: array length 37921 does not match index length 380
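For context, the Kaggle example ends by pairing the test ids with the predictions to build a submission file; a rough sketch of that step is below (paraphrased from the example, with a placeholder column label and file name, not my exact code). I assume this is where the lengths collide in my script, because predictions now comes from X_test, which is a split of the training data, not from data_test.

# Submission step, paraphrased from the Kaggle example I followed
# (column label and output file name are placeholders)
ids = data_test['Id']
print(len(ids), len(predictions))  # check the two lengths before building the frame
output = pd.DataFrame({'Id': ids, 'Full_Time_Home_Goals': predictions})
output.to_csv('predictions.csv', index=False)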
How do I resolve this issue?