X has 3 features, but RandomForestClassifier is expecting 23 features as input

Question

I want to predict the result of an esports game using Python, but my prediction stops working once I add more features to the data, and I don't know why.

This is the code I use to predict the outcome of a game:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import tabulate
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import tabulate

# Load the match history; the first CSV column is used as the index.
matches = pd.read_csv("CURRENT_data.csv", index_col=0)

# Encode each team name as an integer code. The two columns are encoded
# independently, so the same team may get a different code in each column.
matches["blue_code"] = matches["blue_team"].astype("category").cat.codes
matches["orange_code"] = matches["orange_team"].astype("category").cat.codes

#matches["minutes"] = matches["duration"].str.replace(":", "", regex=True).astype("int")
# Parse the dates so rows can be sorted and a day-of-week feature derived.
matches["date"] = pd.to_datetime(matches["date"])
matches["day_code"] = matches["date"].dt.dayofweek
rf = RandomForestClassifier(n_estimators=500, min_samples_split=10, random_state=1)

# Split by row order: the first 80% of rows train, the last 20% test.
train = matches.head(int(matches.shape[0]/100*80))
test = matches.tail(int(matches.shape[0]/100*20))
#matches.head()
# Notebook-style lookup: the blue_code of the earliest "Aogiri" blue-side row.
matches[matches["blue_team"] == "Aogiri"].sort_values("date")["blue_code"].tolist()[0]
predictors = ["blue_code", "orange_code", "day_code"]
# Fit on the three predictor columns; "winner" is the target.
rf.fit(train[predictors], train["winner"])

RandomForestClassifier
RandomForestClassifier(min_samples_split=10, n_estimators=500, random_state=1)
# Predict on the held-out rows using the same three predictor columns.
preds = rf.predict(test[predictors])

# NOTE: despite its name, `error` holds the accuracy score.
error = accuracy_score(test["winner"], preds)
error

When I predict after this, using the code below, it works perfectly:

# Look up each team's code in the column the model was trained on for that
# side: the blue team via blue_code, the orange team via orange_code. The two
# columns are encoded independently, so a blue_code must not be reused as the
# orange feature.
blue_code0 = matches[matches["blue_team"] == "Quadrant"].sort_values("date")["blue_code"].tolist()[0]
orange_code0 = matches[matches["orange_team"] == "Karmine Corp"].sort_values("date")["orange_code"].tolist()[0]
# The row passed to predict must contain exactly the columns `rf` was most
# recently fitted on, in the same order. day_code is an integer (0-6), so it
# is passed as 6 rather than the string "6".
preds0 = rf.predict([[blue_code0, orange_code0, 6]])
print(preds0)

It gives me back an output of 0 or 1, for example:

[0.]

After this I add rolling-average columns to the data loaded from "CURRENT_data.csv":

# Exploratory: pull one team's blue-side matches and order them by date.
grouped_matches = matches.groupby("blue_team")
group = grouped_matches.get_group("Moist Esports").sort_values("date")

def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means of the preceding rows to one group's matches.

    Args:
        group: DataFrame with a ``date`` column plus every column in ``cols``.
        cols: Names of the columns to average.
        new_cols: Names for the resulting columns, in the same order as
            ``cols``.
        window: Number of preceding rows to average over. Defaults to 3.

    Returns:
        A date-sorted copy of ``group`` with ``new_cols`` added. Rows with a
        missing value in any of ``new_cols`` (for example, rows that have
        fewer than ``window`` earlier rows) are dropped. The input frame is
        not modified.
    """
    ordered = group.sort_values("date")
    # closed="left" leaves the current row out of its own window, so each
    # average is built only from earlier rows.
    rolling_stats = ordered[cols].rolling(window, closed="left").mean()
    ordered[new_cols] = rolling_stats
    return ordered.dropna(subset=new_cols)

# Stat columns to average, and the names of the derived rolling columns.
cols = ["B_pT", "O_pT", "B_bpm", "O_bpm", "B_amC", "O_amC", "B_tZBoost", "O_tZBoost", "B_amS", "O_amS", "B_totalD", "O_totalD", "B_tSS", "O_tSS", "B_tDT", "O_tDT", "B_tOT", "O_tOT", "B_dI", "O_dI"]
new_cols = [f"{c}_rolling" for c in cols]

# Compute the rolling averages separately for each blue-side team.
matches_rolling = matches.groupby("blue_team").apply(lambda x: rolling_averages(x, cols, new_cols))

# Drop the group level added by groupby/apply, then renumber the rows from 0.
matches_rolling = matches_rolling.droplevel('blue_team')
matches_rolling.index = range(matches_rolling.shape[0])

print(predictors)
def make_predictions(data, predictors):
    """Refit the shared model on ``data`` and score it on the trailing rows.

    The first 80% of rows (by position) are used for fitting and the last
    20% for evaluation.

    NOTE: this refits the module-level ``rf`` in place, so afterwards ``rf``
    expects exactly the columns listed in ``predictors``.

    Returns:
        A tuple ``(combined, error)``: a DataFrame of actual vs. predicted
        ``winner`` values indexed like the evaluation rows, and the
        precision score of those predictions.
    """
    n_rows = data.shape[0]
    train_rows = data.head(int(n_rows/100*80))
    test_rows = data.tail(int(n_rows/100*20))

    rf.fit(train_rows[predictors], train_rows["winner"])
    predicted = rf.predict(test_rows[predictors])

    actual = test_rows["winner"]
    combined = pd.DataFrame(
        {"actual": actual, "predicted": predicted},
        index=test_rows.index,
    )
    return combined, precision_score(actual, predicted)

# Fit and score on the 3 base predictors plus the 20 rolling columns (23
# features). make_predictions refits the global `rf`, so any later
# rf.predict call must supply all 23 features.
combined, error = make_predictions(matches_rolling, predictors + new_cols)
# Attach match details to the predictions by row index.
combined = combined.merge(matches_rolling[["date", "blue_team", "orange_team", "winner"]], left_index=True, right_index=True)

But if I try to predict the same way after adding the rolling-average values to my data, I get the error below:

ValueError                                Traceback (most recent call last)
Input In [145], in <cell line: 3>()
      1 blue_code0 = matches[matches["blue_team"] == "Quadrant"].sort_values("date")["blue_code"].tolist()[0]
      2 orange_code0 = matches[matches["blue_team"] == "Karmine Corp"].sort_values("date")["blue_code"].tolist()[0]
----> 3 preds0 = rf.predict([[blue_code0, orange_code0, "6"]])
      4 print(preds0)
      8 match0 = matches[matches["blue_team"] == "Moist Esports"]

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py:832, in ForestClassifier.predict(self, X)
    811 def predict(self, X):
    812     """
    813     Predict class for X.
    814 
   (...)
    830         The predicted classes.
    831     """
--> 832     proba = self.predict_proba(X)
    834     if self.n_outputs_ == 1:
    835         return self.classes_.take(np.argmax(proba, axis=1), axis=0)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py:874, in ForestClassifier.predict_proba(self, X)
    872 check_is_fitted(self)
    873 # Check data
--> 874 X = self._validate_X_predict(X)
    876 # Assign chunk of trees to jobs
    877 n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py:605, in BaseForest._validate_X_predict(self, X)
    602 """
    603 Validate X whenever one tries to predict, apply, predict_proba."""
    604 check_is_fitted(self)
--> 605 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
    606 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):
    607     raise ValueError("No support for np.int64 index based sparse matrices")

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:600, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    597     out = X, y
    599 if not no_val_X and check_params.get("ensure_2d", True):
--> 600     self._check_n_features(X, reset=reset)
    602 return out

File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:400, in BaseEstimator._check_n_features(self, X, reset)
    397     return
    399 if n_features != self.n_features_in_:
--> 400     raise ValueError(
    401         f"X has {n_features} features, but {self.__class__.__name__} "
    402         f"is expecting {self.n_features_in_} features as input."
    403     )

ValueError: X has 3 features, but RandomForestClassifier is expecting 23 features as input.

I don't understand why it asks for 23 values when I use "train[predictors]" and "test[predictors]", which both have only 3 columns.

X has 3 features, but RandomForestClassifier is expecting 23 features as input

0 Answers0