I got a problem when trying run the following code. It's the machine learning problem of housing price.
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
num_attributes=list(housing_num)
cat_attributes=['ocean_proximity']
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class DataFrameSelector(BaseEstimator,TransformerMixin):
def __init__(self,attribute_names):
self.attribute_names=attribute_names
def fit(self,X,y=None):
return self
def transform(self,X,y=None):
return X[self.attribute_names].values
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
self.add_bedrooms_per_room = add_bedrooms_per_room
def fit(self, X,y=None):
return self # nothing else to do
def transform(self, X,y=None):
rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
population_per_household = X[:, population_ix] / X[:, household_ix]
if self.add_bedrooms_per_room:
bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
else:
return np.c_[X, rooms_per_household, population_per_household]
num_pipeline=Pipeline([
('selector',DataFrameSelector(num_attributes)),
('imputer',Imputer(strategy="median")),
('attribs_adder',CombinedAttributesAdder()),
('std_scalar',StandardScaler()),
])
cat_pipeline=Pipeline([
('selector',DataFrameSelector(cat_attributes)),
('label_binarizer',LabelBinarizer()),
])
full_pipeline=FeatureUnion(transformer_list=[
("num_pipeline",num_pipeline),
("cat_pipeline",cat_pipeline),
])
There comes bug when I trying to run:
housing_prepared = full_pipeline.fit_transform(housing)
And the error is shown as:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-141-acd0fd68117b> in <module>()
----> 1 housing_prepared = full_pipeline.fit_transform(housing)
/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self, X, y, **fit_params)
744 delayed(_fit_transform_one)(trans, weight, X, y,
745 **fit_params)
--> 746 for name, trans, weight in self._iter())
747
748 if not result:
/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in _fit_transform_one(transformer, weight, X, y, **fit_params)
587 **fit_params):
588 if hasattr(transformer, 'fit_transform'):
--> 589 res = transformer.fit_transform(X, y, **fit_params)
590 else:
591 res = transformer.fit(X, y, **fit_params).transform(X)
/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self, X, y, **fit_params)
290 Xt, fit_params = self._fit(X, y, **fit_params)
291 if hasattr(last_step, 'fit_transform'):
--> 292 return last_step.fit_transform(Xt, y, **fit_params)
293 elif last_step is None:
294 return Xt
TypeError: fit_transform() takes exactly 2 arguments (3 given)
So my first question is what causes this bug?
After getting this bug, I've tried to figure out why so I run the above transformers one by one as this:
DFS=DataFrameSelector(num_attributes)
a1=DFS.fit_transform(housing)
imputer=Imputer(strategy='median')
a2=imputer.fit_transform(a1)
CAA=CombinedAttributesAdder()
a3=CAA.fit_transform(a2)
SS=StandardScaler()
a4=SS.fit_transform(a3)
DFS2=DataFrameSelector(cat_attributes)
b1=DFS2.fit_transform(housing)
LB=LabelBinarizer()
b2=LB.fit_transform(b1)
result=np.concatenate((a4,b2),axis=1)
These can be executed correctly except that the result I got is a numpy.ndarray with size (16512, 16) while the expected result of housing_prepared = full_pipeline.fit_transform(housing)
should be a bumpy.ndarray of size (16512,17). So this is my second question Why causes the difference?
Housing is a DataFrame with size of (16512, 9), only 1 categorical feature and 8 numerical feature.
Thank you in advance.