I am trying to train a meta classifier on different features from a pandas dataframe.
The features are either text or categorical in nature.
I am having issues with fitting the model, with the following error 'Found input variables with inconsistent numbers of samples: [1, 48678]'. I understand what the error means, but not how to fix it. Help much appreciated!
The code I am using is as follows:
import pandas as pd
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
# set target label
target_label = ['target']
features = ['cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5',
'text_1']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cleansed_data[features],
cleansed_data[target_label], test_size=0.2, random_state=0)
text_features = ['text_1']
categorical_features = ['cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5']
# encoder
le = preprocessing.LabelEncoder()
# vectoriser
vectoriser = TfidfVectorizer()
# classifiers
mlp_clf = MLPClassifier()
rf_clf = RandomForestClassifier()
from sklearn.base import TransformerMixin, BaseEstimator
class SelectColumnsTransfomer(BaseEstimator, TransformerMixin):
def __init__(self, columns=[]):
self.columns = columns
def transform(self, X, **transform_params):
trans = X[self.columns].copy()
return trans
def fit(self, X, y=None, **fit_params):
return self
# text pipeline
text_steps = [('feature extractor', SelectColumnsTransfomer(text_features)),
('tf-idf', vectoriser),
('classifier', mlp_clf)]
# categorical pipeline
categorical_steps = [('feature extractor',
SelectColumnsTransfomer(categorical_features)),
('label encode', le),
('classifier', rf_clf)]
pl_text = Pipeline(text_steps)
pl_categorical = Pipeline(categorical_steps)
pl_text.fit(X_train, y_train)
from mlxtend.classifier import StackingCVClassifier
sclf = StackingCVClassifier(classifiers=[pl_text, pl_categorical],
use_probas=True,
meta_classifier=LogisticRegression())
EDIT: Here is some code that recreates the issue. 'ValueError: Found input variables with inconsistent numbers of samples: [1, 3]'
d = {'cat_1': ['A', 'A', 'B'], 'cat_2': [1, 2, 3],
'cat_2': ['G', 'H', 'I'], 'cat_3': ['AA', 'DD', 'PP'],
'cat_4': ['X', 'B', 'V'],
'text_1': ['the cat sat on the mat', 'the mat sat on the cat', 'sat on the cat mat']}
features = pd.DataFrame(data=d)
t = [0, 1, 0]
target = pd.DataFrame(data=t)
text_features = ['text_1']
categorical_features = ['cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5']
# text pipeline
text_steps = [('feature extractor', SelectColumnsTransfomer(text_features)),
('tf-idf', vectoriser),
('classifier', mlp_clf)]
# categorical pipeline
categorical_steps = [('feature extractor',
SelectColumnsTransfomer(categorical_features)),
('label encode', le),
('classifier', rf_clf)]
pl_text = Pipeline(text_steps)
pl_categorical = Pipeline(categorical_steps)
pl_text.fit(features, target)
from mlxtend.classifier import StackingCVClassifier
sclf = StackingCVClassifier(classifiers=[pl_text, pl_categorical],
use_probas=True,
meta_classifier=LogisticRegression())
sclf.fit(features, target)