I am trying to do clustering as a step in a Pipeline so that I can use the cluster as an additional feature. I have used this post as a guide but I am getting an error on the call to fit_transform()
within the pipeline. My original transformer
is working well so the problem comes when trying to cluster on its output in transformer_cluster
and join them together with feature_union
to get the original features plus the binarized cluster label. What is going wrong here?
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import FeatureUnion
df = pd.DataFrame({'text': [
'Here is an example of some text data',
'here is another sentence of text',
'yet another random string'],
'label': [1, 1, 0]
})
X, y = df['text'], df['label']
class KMeans_foo(KMeans):
def fit_transform(self, X, y = None):
return self.fit_predict(X)
class ModelTransformer(TransformerMixin):
def __init__(self, model):
self.model = model
def fit(self, *args, **kwargs):
self.model.fit(*args, **kwargs)
return self
def transform(self, X, **transform_params):
return pd.DataFrame(self.model.predict(X))
tf_idf_pipe = Pipeline(steps = [
('tf-idf', TfidfVectorizer(min_df = 0.10, stop_words = 'english'))
])
transformer = ColumnTransformer([
('text_pipe', tf_idf_pipe, 'text')],
sparse_threshold = 0,
remainder = 'drop'
)
transformer_cluster = Pipeline([
('orig', transformer),
('cluster', ModelTransformer(KMeans_foo(3))),
('binarize', LabelBinarizer())
])
feature_union = FeatureUnion([
('orig', transformer),
('cluster', transformer_cluster)
])
print('\nTransformed Training Data:')
print(pd.DataFrame(feature_union.fit_transform(pd.DataFrame(X))))
# print(pd.DataFrame(transformer.fit_transform(pd.DataFrame(X))))
# Transform test data
X_test = pd.DataFrame({'text': [
'Here we have some test data',
'and another one with text'
]})
print('\nTransformed Test Data:')
print(pd.DataFrame(feature_union.transform(X_test)))
# print(pd.DataFrame(transformer.transform(X_test)))
Traceback (most recent call last):
File "/Home/test.py", line 59, in <module>
print(pd.DataFrame(feature_union.fit_transform(pd.DataFrame(X))))
File "/Home/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 1154, in fit_transform
results = self._parallel_func(X, y, fit_params, _fit_transform_one)
File "/Home/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 1176, in _parallel_func
return Parallel(n_jobs=self.n_jobs)(
File "/Home/venv/lib/python3.9/site-packages/joblib/parallel.py", line 1046, in __call__
while self.dispatch_one_batch(iterator):
File "/Home/venv/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
self._dispatch(tasks)
File "/Home/venv/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/Home/venv/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/Home/venv/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/Home/venv/lib/python3.9/site-packages/joblib/parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "/Home/venv/lib/python3.9/site-packages/joblib/parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "/Home/venv/lib/python3.9/site-packages/sklearn/utils/fixes.py", line 117, in __call__
return self.function(*args, **kwargs)
File "/Home/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 870, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/Home/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 422, in fit_transform
return last_step.fit_transform(Xt, y, **fit_params_last_step)
TypeError: fit_transform() takes 2 positional arguments but 3 were given
This is what the desired output looks like, minus the additional cluster features:
Transformed Training Data:
0 1 2 3 4 5
0 0.622766 0.622766 0.000000 0.000000 0.000000 0.473630
1 0.000000 0.000000 0.000000 0.795961 0.000000 0.605349
2 0.000000 0.000000 0.707107 0.000000 0.707107 0.000000
Transformed Test Data:
0 1 2 3 4 5
0 1.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 1.0