sklearn transformation pipeline and featureunion

Question

I got a problem when trying run the following code. It's the machine learning problem of housing price.

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin

num_attributes=list(housing_num)
cat_attributes=['ocean_proximity']
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names=attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        return X[self.attribute_names].values

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room 
    def fit(self, X,y=None):
        return self # nothing else to do 
    def transform(self, X,y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix] 
        population_per_household = X[:, population_ix] / X[:, household_ix] 
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix] 
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


num_pipeline=Pipeline([
    ('selector',DataFrameSelector(num_attributes)),
    ('imputer',Imputer(strategy="median")),
    ('attribs_adder',CombinedAttributesAdder()),
    ('std_scalar',StandardScaler()),
    ])
cat_pipeline=Pipeline([
    ('selector',DataFrameSelector(cat_attributes)),
    ('label_binarizer',LabelBinarizer()),
    ])
full_pipeline=FeatureUnion(transformer_list=[
    ("num_pipeline",num_pipeline),
    ("cat_pipeline",cat_pipeline),
    ])

There comes bug when I trying to run:

housing_prepared = full_pipeline.fit_transform(housing)

And the error is shown as:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-141-acd0fd68117b> in <module>()
----> 1 housing_prepared = full_pipeline.fit_transform(housing)

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self, X, y, **fit_params)
    744             delayed(_fit_transform_one)(trans, weight, X, y,
    745                                         **fit_params)
--> 746             for name, trans, weight in self._iter())
    747 
    748         if not result:

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in _fit_transform_one(transformer, weight, X, y, **fit_params)
    587                        **fit_params):
    588     if hasattr(transformer, 'fit_transform'):
--> 589         res = transformer.fit_transform(X, y, **fit_params)
    590     else:
    591         res = transformer.fit(X, y, **fit_params).transform(X)

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self, X, y, **fit_params)
    290         Xt, fit_params = self._fit(X, y, **fit_params)
    291         if hasattr(last_step, 'fit_transform'):
--> 292             return last_step.fit_transform(Xt, y, **fit_params)
    293         elif last_step is None:
    294             return Xt

TypeError: fit_transform() takes exactly 2 arguments (3 given)

So my first question is what causes this bug?

After getting this bug, I've tried to figure out why so I run the above transformers one by one as this:

DFS=DataFrameSelector(num_attributes)
a1=DFS.fit_transform(housing)
imputer=Imputer(strategy='median')
a2=imputer.fit_transform(a1)
CAA=CombinedAttributesAdder()
a3=CAA.fit_transform(a2)
SS=StandardScaler()
a4=SS.fit_transform(a3)

DFS2=DataFrameSelector(cat_attributes)
b1=DFS2.fit_transform(housing)
LB=LabelBinarizer()
b2=LB.fit_transform(b1)

result=np.concatenate((a4,b2),axis=1)

These can be executed correctly except that the result I got is a numpy.ndarray with size (16512, 16) while the expected result of housing_prepared = full_pipeline.fit_transform(housing) should be a bumpy.ndarray of size (16512,17). So this is my second question Why causes the difference?

Housing is a DataFrame with size of (16512, 9), only 1 categorical feature and 8 numerical feature.

Thank you in advance.

The first error is due to `LabelBinarizer`. It requires only a single input y, but due to pipeline both X and y will be sent to it. Please share the data and I can help. — Vivek Kumar, Sep 08 '17 at 02:10
@VivekKumar Here's the link, it's the data of housing: https://drive.google.com/file/d/0B12I2_fMO94pVHZhQlVrSlFtZEk/view?usp=sharing — talentcat, Sep 08 '17 at 02:50
Why do you think that the result should have 17 columns instead of 16? — Vivek Kumar, Sep 08 '17 at 05:54
@VivekKumar Actually I also think it should be 16 columns. But this is actually an example on a textbook. The code is theirs. They can successfully run the code that I cannot and they get a 17 columns result which I cannot understand. — talentcat, Sep 08 '17 at 14:58

score 0 · Answer 1 · answered Oct 17 '17 at 14:18

0

Looks like sklearn identifies datatypes in another way than you expect. Make sure numbers are identified as int. Easiest way: Use the data provided by the author of 'your' posted coded. Aurelien Geron Hands on Machine Learning

answered Oct 17 '17 at 14:18

mrtaste

13
4

score 0 · Answer 2 · answered May 20 '18 at 12:39

I had this problem when going through this book. After trying a bunch of workarounds (which I feel was a waste of my time), I gave in and installed the scikit-learn v0.20 dev. Download the wheel here and install it using pip. This should allow you to use the CategoricalEncoder class that has been designed to handle these problems.

score 0 · Answer 3 · answered May 13 '20 at 19:49

0

I ran into this same problem, it was caused by an indentation problem that won't always throw an error (see https://stackoverflow.com/a/14046894/3665886).

If you copied the code from the book directly make sure the code is correctly indented.

answered May 13 '20 at 19:49

user40653

45
1
6

Dr Nisha Arora · Answer 4 · 2020-09-10T06:47:37.243

TypeError: fit_transform() takes exactly 2 arguments (3 given)

Why this bug?

Answer: because you're using LabelBinarizer() which is ideally suitable for the response variable.

What to do?: You have got a few options:

Use OneHotEncoder() instead
Write custom transformer for LabelBinarizer
Use the older version of sklean which supports your code

Difference in the shape of housing_prepared

If you're using this data, then you've 9 predictors (8 numerical & 1 categorical). CombinedAttributesAdder() adds 3 more columns and LabelBinarizer() adds 5 more, so it becomes 17 columns
Remember, sklearn.pipeline.FeatureUnion concatenates the results of multiple transformer objects

When you do it manually, you don't add the original 'ocean_proximity' variables.

Let's see it in action:

print("housing_shape: ", housing.shape)

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

DFS=DataFrameSelector(num_attribs)
a1=DFS.fit_transform(housing)

print('Numerical variables_shape: ', a1.shape)

imputer=SimpleImputer(strategy='median')
a2=imputer.fit_transform(a1)

a2.shape

Same as a1.shape

CAA=CombinedAttributesAdder()
a3=CAA.fit_transform(a2)
SS=StandardScaler()
a4=SS.fit_transform(a3) # added 3 variables
print('Numerical variable shape after CAA: ', a4.shape, '\n')

DFS2=DataFrameSelector(cat_attribs)
b1=DFS2.fit_transform(housing)

print("Categorical variables_shape: ", b1.shape)

LB=LabelBinarizer()
b2=LB.fit_transform(b1) # instead of one column now we have 5 columns
print('categorical variable shape after LabelBinarization: ', b2.shape)

4 columns increased

print(b2)

result=np.concatenate((a4,b2),axis=1)
print('final shape: ', result.shape, '\n') # Final shape

NOTE: transformed columns (results of a4) and binarized columns (result of b2) are not yet added to original dataframe. To do so, you need to convert numpy array b2 to a data frame

new_features = pd.DataFrame(a4)
new_features.shape

ocean_cat = ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND']
ocean_LabelBinarize = pd.DataFrame(b2, columns=[ocean_cat[i] for i in 
range(len(ocean_cat))])

ocean_LabelBinarize

housing_prepared_new = pd.concat([new_features, ocean_LabelBinarize], 
axis=1)

print('Shape of new data prepared by above steps', 
housing_prepared_new.shape)

When we use pipeline it keep the original (ocean_proximity) variable too and the newly created binarize columns too

sklearn transformation pipeline and featureunion

4 Answers4