1

The error in the title always occurs when I do GridSearch or RandomizedSearch in sklearn in Python. But I checked my DataFrame (both X and y) and couldn't find any NaNs or infs. When using the dataset for training/testing a normal regressor/model, it works without any problem. So it occurs only when I do parameter optimization, but I don't understand why.

# Third-party imports: numpy for the NaN sentinel, sklearn for model
# selection, the candidate regressors, and preprocessing.
import numpy as np  # required below: SimpleImputer(missing_values=np.nan, ...)

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer

# Mean-imputer for NaN cells. NOTE: it handles only the single
# `missing_values` token given here — it does NOT treat +/-inf as missing.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Candidate estimators and their hyperparameter search spaces.
#
# NOTE(review): several grids include extreme values such as
# learning_rate=1000 / eta0=1000 / alpha=1000. Those can make SGD-style
# optimizers diverge so the model predicts inf/NaN, and the scorer then
# raises "Input contains NaN, infinity or a value too large" even though
# X and y themselves are clean — presumably the error seen during the
# search; confirm by narrowing the ranges.

mlp = MLPRegressor(max_iter=100)
par_mlp = {
    # BUGFIX: (100) is just the int 100 in Python; a single-layer spec
    # must be the one-element tuple (100,).
    'hidden_layer_sizes': [(50, 50, 50), (500, 500, 500), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.001, 1000],
    'learning_rate': ['constant', 'adaptive'],
}

gb = GradientBoostingRegressor()
par_gb = {
    # "ls"/"lad" and criterion "mse"/"mae" are the legacy sklearn names;
    # newer releases rename them ("squared_error", ...) — TODO confirm
    # against the installed sklearn version.
    "loss": ["ls", "lad", "huber", "quantile"],
    "learning_rate": [0.001, 1000],
    "n_estimators": [100, 1000],
    "criterion": ["friedman_mse", "mse", "mae"],
}

rf = RandomForestRegressor()
par_rf = {
    'criterion': ['mse', 'mae'],
    'n_estimators': [100, 1000],
}

et = ExtraTreesRegressor()
par_et = {
    'criterion': ['mae', 'mse'],
    'n_estimators': [100, 1000],
}

sv = SVR()
par_sv = {
    'C': [0.001, 1000],
    'gamma': [1000, 0.001],
    'kernel': ["linear", "poly", "rbf", "sigmoid"],
    "epsilon": [0.001, 1000],
    "degree": [1, 6],
}

sgd = SGDRegressor()
par_sgd = {
    "loss": ["squared_loss", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"],
    "penalty": ["l2", "l1", "elasticnet"],
    "alpha": [0.001, 1000],
    "l1_ratio": [0, 1],
    "max_iter": [100, 1000],
    "epsilon": [0.001, 1000],
    "learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
    # eta0=1000 with SGD very easily overflows to inf predictions.
    "eta0": [0.001, 1000],
    "power_t": [0.001, 1000],
}

# Estimators and grids are paired positionally; keep these two lists in sync.
regressors = [gb, rf, et, sv, sgd]
parameterlist = [par_gb, par_rf, par_et, par_sv, par_sgd]

#%%
# Accumulators for the best model/params/score found per estimator.
bestmodellist = []
bestparameterlist = []
bestscorelist = []

# Standard-scale the features. StandardScaler.fit_transform returns a plain
# ndarray, so wrap the result back into a DataFrame; reset_index first so row
# positions stay aligned with y.
X = pd.DataFrame(X).reset_index(drop=True)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X))
# NOTE(review): SimpleImputer accepts exactly one `missing_values` token per
# instance, so it cannot mean-impute +inf and -inf alongside NaN. To treat
# infs as missing, first run X.replace([np.inf, -np.inf], np.nan) and then
# impute the NaNs in a single pass.
#%%
# Randomized hyperparameter search for every (regressor, grid) pair.
# error_score=np.nan records a failed fit (e.g. a diverging SGD that predicts
# inf, crashing the scorer) as NaN instead of aborting the whole search.
scoring = ["r2", "neg_root_mean_squared_error", "max_error"]
cv = ShuffleSplit(n_splits=3, random_state=0, test_size=0.2)

for i, (regressor, params) in enumerate(zip(regressors, parameterlist)):
    clf = RandomizedSearchCV(
        regressor,
        params,
        n_jobs=1,
        scoring=scoring[0],  # optimize R^2 only
        cv=cv,
        n_iter=10,
        error_score=np.nan,
    )
    clf.fit(X, y)
    bestmodellist.append(clf.best_estimator_)
    bestparameterlist.append(clf.best_params_)
    bestscorelist.append(clf.best_score_)
    print(i)  # progress marker, same output (0..4) as the original loop
    
Here is the result of the NaN and inf check; the code is below.
0
0
0
0
TEST
0
0
0
0
0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.000000
6     0.000000
7     0.000000
8     1.437549
9     0.795685
10    0.115410
11    0.815063
12    0.000000
13    0.000000
14    0.000000
15    0.000000
16    1.437549
17    0.000000
18    0.000000
19    0.000000
dtype: float64
1.7976931348623157e+308
(array([], dtype=int64),)
0
0
0
0
TEST
0
0
0
0
23346469.0
1.7976931348623157e+308
(array([], dtype=int64),)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1225 entries, 0 to 1224
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1225 non-null   float64
 1   1       1225 non-null   float64
 2   2       1225 non-null   float64
 3   3       1225 non-null   float64
 4   4       1225 non-null   float64
 5   5       1225 non-null   float64
 6   6       1225 non-null   float64
 7   7       1225 non-null   float64
 8   8       1225 non-null   float64
 9   9       1225 non-null   float64
 10  10      1225 non-null   float64
 11  11      1225 non-null   float64
 12  12      1225 non-null   float64
 13  13      1225 non-null   float64
 14  14      1225 non-null   float64
 15  15      1225 non-null   float64
 16  16      1225 non-null   float64
 17  17      1225 non-null   float64
 18  18      1225 non-null   float64
 19  19      1225 non-null   float64
dtypes: float64(20)
memory usage: 191.5 KB
None
#%%
# Diagnostic dump: confirm X and y contain no NaN / +inf / -inf values and
# that no value reaches the float64 limit. For each of X and y the output is:
# four per-column .all() counts, "TEST", four per-column .any() counts, the
# max value(s), the float64 maximum, and the indices (if any) where the max
# meets that limit. Finally, X.info() summarizes dtypes and null counts.
X = pd.DataFrame(X)

FLOAT64_MAX = np.finfo(np.float64).max

for data in (X, y):
    # Columns that are ENTIRELY inf/-inf/+inf/NaN (expect all zeros).
    for check in (np.isinf, np.isneginf, np.isposinf, np.isnan):
        print(check(data).all().sum())
    print("TEST")
    # Columns containing ANY such value (expect all zeros).
    for check in (np.isnan, np.isinf, np.isposinf, np.isneginf):
        print(check(data).any().sum())
    print(np.max(data))
    print(FLOAT64_MAX)
    print(np.where(np.max(data) >= FLOAT64_MAX))

print(X.info())
Alsen57
  • 133
  • 1
  • 8

1 Answers1

1

Try removing sgd from the grid. If that solves your problem, it's explained here: MLPRegressor error when solver sgd is used

Bilal Dadanlar
  • 820
  • 7
  • 14