I want to predict the result of an esports game using Python, but the prediction stops working once I add more feature columns to the data, and I don't know why.
This is the code I use to predict the outcome of a game:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import tabulate
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import tabulate
# Load the match history; the first CSV column is used as the index.
matches = pd.read_csv("CURRENT_data.csv", index_col=0)
# Encode team names as integer category codes. Each column is encoded on its
# own, so the same team may get a different code as blue side vs. orange side.
matches["blue_code"] = matches["blue_team"].astype("category").cat.codes
matches["orange_code"] = matches["orange_team"].astype("category").cat.codes
#matches["minutes"] = matches["duration"].str.replace(":", "", regex=True).astype("int")
matches["date"] = pd.to_datetime(matches["date"])
# Day of the week as an integer (Monday=0 ... Sunday=6).
matches["day_code"] = matches["date"].dt.dayofweek
rf = RandomForestClassifier(n_estimators=500, min_samples_split=10, random_state=1)
# Positional split in the current row order: roughly the first 80% of rows
# for training and the last 20% for testing.
train = matches.head(int(matches.shape[0]/100*80))
test = matches.tail(int(matches.shape[0]/100*20))
#matches.head()
# Bare expression: looks up the blue-side code for "Aogiri"; it only displays
# a value when run as the last line of a notebook cell.
matches[matches["blue_team"] == "Aogiri"].sort_values("date")["blue_code"].tolist()[0]
predictors = ["blue_code", "orange_code", "day_code"]
# Fit on the three basic predictors; from here on `rf` expects 3 features.
rf.fit(train[predictors], train["winner"])
# NOTE(review): the next two lines look like pasted notebook output (the
# estimator's repr) rather than intended code - confirm.
RandomForestClassifier
RandomForestClassifier(min_samples_split=10, n_estimators=500, random_state=1)
preds = rf.predict(test[predictors])
# NOTE: despite its name, `error` holds the accuracy score of the predictions.
error = accuracy_score(test["winner"], preds)
error
When I predict after this, using the code below, it works perfectly:
# Predict a single match: Quadrant (blue side) vs. Karmine Corp (orange side).
# A team's code is constant across its rows, so the first matching row suffices.
blue_code0 = matches.loc[matches["blue_team"] == "Quadrant", "blue_code"].iloc[0]
# The orange team must be looked up in the orange columns: "blue_code" and
# "orange_code" are encoded independently, so the same team can have a
# different number in each of them.
orange_code0 = matches.loc[matches["orange_team"] == "Karmine Corp", "orange_code"].iloc[0]
# Build the input with the training column names and a numeric day code
# (6 = Sunday). The row must contain every column the model was last fitted
# on, in the same order; once `rf` has been refitted with additional columns
# (e.g. the rolling averages), these three values alone are no longer enough.
match0 = pd.DataFrame(
    [[blue_code0, orange_code0, 6]],
    columns=["blue_code", "orange_code", "day_code"],
)
preds0 = rf.predict(match0)
print(preds0)
It gives me back an output of 0 or 1, for example:
[0.]
After this, I add rolling averages to the data loaded from "CURRENT_data.csv":
# Bucket the matches by blue-side team, then take one team's rows in
# chronological order as a sample group.
grouped_matches = matches.groupby("blue_team")
group = (
    grouped_matches
    .get_group("Moist Esports")
    .sort_values("date")
)
def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means of the preceding matches to one team's rows.

    The rows are ordered by ``date`` and, for every row, the mean of the
    previous ``window`` rows is computed for each column in ``cols``.
    ``closed="left"`` excludes the current row from its own window, so a
    match never sees its own statistics.

    Args:
        group: DataFrame with a ``date`` column and every column in ``cols``.
        cols: Names of the numeric columns to average.
        new_cols: Output column names, one per entry of ``cols``, same order.
        window: Number of preceding rows in each average. Defaults to 3.

    Returns:
        A new DataFrame sorted by ``date`` with ``new_cols`` added. Rows
        whose rolling value is missing (for example the first ``window``
        rows, which lack enough history) are dropped. The input frame is
        not modified.

    Raises:
        ValueError: If ``cols`` and ``new_cols`` differ in length.
    """
    if len(cols) != len(new_cols):
        raise ValueError("cols and new_cols must have the same length")
    # sort_values returns a copy, so the caller's frame stays untouched.
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(window, closed="left").mean()
    # Columns are paired by position: cols[i] -> new_cols[i].
    group[new_cols] = rolling_stats
    return group.dropna(subset=new_cols)
# Per-match statistics, listed as blue-side / orange-side pairs.
cols = [
    "B_pT", "O_pT",
    "B_bpm", "O_bpm",
    "B_amC", "O_amC",
    "B_tZBoost", "O_tZBoost",
    "B_amS", "O_amS",
    "B_totalD", "O_totalD",
    "B_tSS", "O_tSS",
    "B_tDT", "O_tDT",
    "B_tOT", "O_tOT",
    "B_dI", "O_dI",
]
new_cols = [f"{name}_rolling" for name in cols]
# Compute the rolling columns per blue-side team, then flatten the result
# back to a plain 0..n-1 index.
matches_rolling = (
    matches.groupby("blue_team")
    .apply(rolling_averages, cols, new_cols)
    .droplevel("blue_team")
    .reset_index(drop=True)
)
print(predictors)
def make_predictions(data, predictors, model=None):
    """Fit a classifier on the first 80% of ``data`` and score it on the rest.

    The split is positional (current row order) and contiguous: every row
    lands in either the training or the test part.

    Note that fitting replaces whatever the model learned before. After this
    call, ``model.predict`` must be given exactly the columns listed in
    ``predictors`` (same number, same order); passing fewer raises
    ``ValueError: X has ... features, but ... is expecting ...``.

    Args:
        data: DataFrame containing the ``predictors`` columns and ``winner``.
        predictors: List of feature column names to train and predict on.
        model: Classifier with ``fit``/``predict``. Defaults to the
            module-level ``rf``, which is refitted in place; pass a separate
            estimator to leave ``rf`` untouched.

    Returns:
        A tuple ``(combined, precision)``: ``combined`` is a DataFrame
        indexed like the test rows with ``actual`` and ``predicted``
        columns, and ``precision`` is the precision score on the test rows.
    """
    if model is None:
        # Backward-compatible default: reuse (and refit) the shared forest.
        model = rf
    # One split point instead of two independent truncations, so no row
    # can fall between the training and the test part.
    split = int(data.shape[0] / 100 * 80)
    train = data.iloc[:split]
    test = data.iloc[split:]
    model.fit(train[predictors], train["winner"])
    preds = model.predict(test[predictors])
    combined = pd.DataFrame(
        dict(actual=test["winner"], predicted=preds), index=test.index
    )
    precision = precision_score(test["winner"], preds)
    return combined, precision
# Train and evaluate on the basic predictors plus the rolling averages, then
# attach the match details to each prediction by row index.
combined, error = make_predictions(matches_rolling, [*predictors, *new_cols])
combined = pd.merge(
    combined,
    matches_rolling.loc[:, ["date", "blue_team", "orange_team", "winner"]],
    left_index=True,
    right_index=True,
)
But if I try to predict the same way after adding the rolling-average values to my data, it gives me the error below:
ValueError Traceback (most recent call last)
Input In [145], in <cell line: 3>()
1 blue_code0 = matches[matches["blue_team"] == "Quadrant"].sort_values("date")["blue_code"].tolist()[0]
2 orange_code0 = matches[matches["blue_team"] == "Karmine Corp"].sort_values("date")["blue_code"].tolist()[0]
----> 3 preds0 = rf.predict([[blue_code0, orange_code0, "6"]])
4 print(preds0)
8 match0 = matches[matches["blue_team"] == "Moist Esports"]
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py:832, in ForestClassifier.predict(self, X)
811 def predict(self, X):
812 """
813 Predict class for X.
814
(...)
830 The predicted classes.
831 """
--> 832 proba = self.predict_proba(X)
834 if self.n_outputs_ == 1:
835 return self.classes_.take(np.argmax(proba, axis=1), axis=0)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py:874, in ForestClassifier.predict_proba(self, X)
872 check_is_fitted(self)
873 # Check data
--> 874 X = self._validate_X_predict(X)
876 # Assign chunk of trees to jobs
877 n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py:605, in BaseForest._validate_X_predict(self, X)
602 """
603 Validate X whenever one tries to predict, apply, predict_proba."""
604 check_is_fitted(self)
--> 605 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
606 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):
607 raise ValueError("No support for np.int64 index based sparse matrices")
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:600, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
597 out = X, y
599 if not no_val_X and check_params.get("ensure_2d", True):
--> 600 self._check_n_features(X, reset=reset)
602 return out
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:400, in BaseEstimator._check_n_features(self, X, reset)
397 return
399 if n_features != self.n_features_in_:
--> 400 raise ValueError(
401 f"X has {n_features} features, but {self.__class__.__name__} "
402 f"is expecting {self.n_features_in_} features as input."
403 )
ValueError: X has 3 features, but RandomForestClassifier is expecting 23 features as input.
I don't understand why it asks for 23 values when I use "train[predictors]" and "test[predictors]", which both have only 3 columns.