i don't know how to use LabelEncoder in pipeline! i repeatedly get an error
OK! this is my code: import pandas as pd import numpy as np from sklearn.model_selection import train_test_split
bank = pd.read_csv('bankmarketing.csv')
bank.columns = bank.columns.astype('str') bank.columns = bank.columns.str.strip()
train_set, test_set = train_test_split(bank, test_size = 0.3, random_state = 100) train_set.shape
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import FeatureUnion
class DataFrameSelector(BaseEstimator, TransformerMixin):
def __init__(self, attribute_names):
self.attribute_names = attribute_names
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.attribute_names].values
df = train_set.copy()
df_label = df['y'].copy()
df = df.drop('y', axis = 1)
df_num_needless = df[['age', 'day', 'compaign', 'previous']].copy()
df_num = df.drop(['age', 'day', 'compaign', 'previous'], axis = 1)
df_num = df[['balance', 'duration', 'pdays']].copy()
df_cat = df[['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']].copy()
df_cat1 = df['job']
df_cat2 = df['marital']
df_cat3 = df['education']
df_cat4 = df['default']
df_cat5 = df['housing']
df_cat6 = df['loan']
df_cat7 = df['contact']
df_cat8 = df['month']
df_cat9 = df['poutcome']
num_attrs = list(df_num)
cat_attr1 = list(df_cat1)
cat_attr2 = list(df_cat2)
cat_attr3 = list(df_cat3)
cat_attr4 = list(df_cat4)
cat_attr5 = list(df_cat5)
cat_attr6 = list(df_cat6)
cat_attr7 = list(df_cat7)
cat_attr8 = list(df_cat8)
cat_attr9 = list(df_cat9)
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attrs)),
('scaler', StandardScaler())
])
*: i got error when i wanted to use LabelEncoder so i had to make different pipeline for categorical data:
cat_pipeline1 = Pipeline([
('selector', DataFrameSelector(cat_attr1)),
('encoder', LabelEncoder())
])
cat_pipeline2 = Pipeline([
('selector', DataFrameSelector(cat_attr2)),
('encoder', LabelEncoder())
])
cat_pipeline3 = Pipeline([
('selector', DataFrameSelector(cat_attr3)),
('encoder', LabelEncoder())
])
cat_pipeline4 = Pipeline([
('selector', DataFrameSelector(cat_attr4)),
('encoder', LabelEncoder())
])
cat_pipeline5 = Pipeline([
('selector', DataFrameSelector(cat_attr5)),
('encoder', LabelEncoder())
])
cat_pipeline6 = Pipeline([
('selector', DataFrameSelector(cat_attr6)),
('encoder', LabelEncoder())
])
cat_pipeline7 = Pipeline([
('selector', DataFrameSelector(cat_attr7)),
('encoder', LabelEncoder())
])
cat_pipeline8 = Pipeline([
('selector', DataFrameSelector(cat_attr8)),
('encoder', LabelEncoder())
])
cat_pipeline9 = Pipeline([
('selector', DataFrameSelector(cat_attr9)),
('encoder', LabelEncoder())
])
*: but now i don't know how can i use these pipelines????
full_pipeline = FeatureUnion(transformer_list = [
('num_pipeline', num_pipeline),
('cat_pipeline', ????????????)
])
bank_prepared_df = pd.DataFrame(full_pipeline.fit_transform(df), columns = ['age', 'job', 'marital', 'education', 'default', 'balance',
'housing', 'loan', 'contact', 'day', 'month', 'duration',
'compaign', 'pdays', 'previous', 'poutcome'])
bank_prepared_df.head()