I'm creating a data pipeline using scikit-learn's Pipeline. My goal is to add a SimpleImputer that replaces all NaN values with the most frequent value in each column, using the 'most_frequent' strategy. Whenever I run it, I get the following ValueError:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
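For context, my understanding of what SimpleImputer(strategy='most_frequent') should do on its own is roughly this (a minimal, made-up example, not from my actual data):

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Toy column with one missing value; 1.0 is the most frequent value.
df = pd.DataFrame({'beds': [1.0, 1.0, 2.0, np.nan]})
imputer = SimpleImputer(strategy='most_frequent')
print(imputer.fit_transform(df))  # I expect the NaN to be replaced by 1.0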
I'm working in a Jupyter notebook; the relevant code is as follows:
import pandas as pd
all_data = pd.read_csv('hospitals_by_county.csv')
# Homemade train/test split function; returns two pandas DataFrame subsets of the original.
(train, test) = data_splitter(all_data)
X_train = train
y_train = train['icu_available_beds']
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])
numeric_features = all_data.select_dtypes(
    include=['int64', 'float64']).columns
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])
from sklearn.neural_network import MLPClassifier
rf = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', MLPClassifier())])
# Error occurs here.
rf.fit(X_train, y_train)
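To sanity-check the imputation step on its own, I can run the preprocessor in isolation and look for leftover NaN values in its output (this is just a quick check, not part of the notebook above):

import numpy as np

# Fit and apply only the preprocessing step to the training features.
X_checked = preprocessor.fit_transform(X_train)
print(np.isnan(X_checked).any())  # I would expect False if the imputer removed every NaN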
The full error message is coming from the MLPClassifier, but to me it seems to indicate that there are still NaN values in the data, which should not be the case after the imputer has run.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-13-168a6fc83696> in <module>
----> 1 rf.fit(X_train, y_train)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
352 self._log_message(len(self.steps) - 1)):
353 if self._final_estimator != 'passthrough':
--> 354 self._final_estimator.fit(Xt, y, **fit_params)
355 return self
356
/opt/anaconda3/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py in fit(self, X, y)
993 """
994 return self._fit(X, y, incremental=(self.warm_start and
--> 995 hasattr(self, "classes_")))
996
997 @property
/opt/anaconda3/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py in _fit(self, X, y, incremental)
323 hidden_layer_sizes)
324
--> 325 X, y = self._validate_input(X, y, incremental)
326 n_samples, n_features = X.shape
327
/opt/anaconda3/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py in _validate_input(self, X, y, incremental)
930 def _validate_input(self, X, y, incremental):
931 X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
--> 932 multi_output=True)
933 if y.ndim == 2 and y.shape[1] == 1:
934 y = column_or_1d(y, warn=True)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
756 if multi_output:
757 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
--> 758 dtype=None)
759 else:
760 y = column_or_1d(y, warn=True)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
576 if force_all_finite:
577 _assert_all_finite(array,
--> 578 allow_nan=force_all_finite == 'allow-nan')
579
580 if ensure_min_samples > 0:
/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
58 msg_err.format
59 (type_err,
---> 60 msg_dtype if msg_dtype is not None else X.dtype)
61 )
62 # for object dtype data, we only check for NaNs (GH-13254)
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
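Reading the traceback, the failure is raised from check_X_y while it validates y (the check_array call on y with force_all_finite=True), so I'm also wondering whether the target column itself could contain NaN values, since the pipeline only transforms the features. A quick check I plan to run on both sides, using plain pandas:

# Count missing values in the features and in the target separately.
print(X_train.isna().sum().sum())  # total NaNs across all feature columns
print(y_train.isna().sum())        # NaNs in the target column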