I have a problem where I have to predict a buyer using machine learning (created a dummy dataset). I need to transform the data first before I can use it for machine learning. I am aggregating information per id,visit level which gives me a list of food and cloths bought. This list needs to be one hot encoded before applying classifier model.
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
def preprocess(df):
# Only keep rows till buyer=1
df = df.groupby(["id1", "visit"], group_keys=False).apply(
lambda g: g.loc[: g["Buyer"].idxmax()]
)
# Form lists on each id1,visit level
df1 = df.groupby(["id1", "visit"], as_index=False).agg(
is_Pax=("Buyer", "max"),
fruits=("fruits", lambda x: x.dropna().unique().tolist()),
cloths=("cloths", lambda x: x.dropna().unique().tolist()),
)
col = ["fruits", "cloths"]
df_transformed = onehot(df1, col)
return df_transformed
def onehot(df, col):
"""
This function does one hot encoding of a list column.
"""
onehot_list_encoder = MultiLabelBinarizer()
for cl in col:
print("One hot encoding ", cl)
newd = pd.DataFrame(
onehot_list_encoder.fit_transform(df[cl]),
columns=onehot_list_encoder.classes_,
).add_prefix(cl + "_")
df = df.join(newd)
return df
df = pd.DataFrame(np.array([['a', 'a', 'b', 'b','a','a'], [1, 2, 2, 2,1,1],
['Apple', 'Apple', 'Banana', None,'Orange','Pear'],[1,2,1,3,4,5],
[0, 0, 1, 0,1,0]]).T,
columns=['id1', 'visit', 'fruits','cloths','Buyer'])
df['Buyer'] = df['Buyer'].astype('int')
How to create a simple ML model now that does this preprocessing to data (both fit and predict) since in test data, I want the same transformation (i.e. 0 for all columns not present in the test rows), Can pipeline solve this? I am not so good with writing pipelines and am getting errors.
droplist=['id1', 'visit', 'fruits','cloths']
pipe=Pipeline(steps=[
("preprocess",preprocess(df)),
("coltrans",ColumnTransformer([("drop",'drop',droplist)])),
("model",GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)),
])
Can someone help?