I have a list of variables with values encoded in a way which throws Pandas off. For example: I have a column named "Alley" and it has a list of values, one of which is NA
, which stands for "No Alley". However, Pandas interprets this as NaN
. To come across this problem, I am encoding all NaN
values with an arbitrary symbol like XX
. These variables don't actuall have null/missing values. These are just variables whose values are being misinterpreted by Pandas. I am gathering them in a list:
na_data = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
And replacing each NaN
reading with XX
:
for i in na_data:
df[i] = df[i].fillna('XX')
This was the old error I was getting:
Traceback (most recent call last):
File "C:\Users\security\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexes\base.py", line 2657, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 129, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index_class_helper.pxi", line 91, in pandas._libs.index.Int64Engine._check_type
KeyError: 'Alley'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/security/Downloads/AP/Boston-Kaggle/Model.py", line 67, in <module>
print(feature_encoding(train, categorical_columns))
File "C:/Users/security/Downloads/AP/Boston-Kaggle/Model.py", line 50, in feature_encoding
df[i] = df[i].fillna('XX')
File "C:\Users\security\AppData\Roaming\Python\Python37\site-packages\pandas\core\frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\security\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexes\base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 129, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index_class_helper.pxi", line 91, in pandas._libs.index.Int64Engine._check_type
KeyError: 'Alley'
The variable Alley
definitely exists in the dataset! I copy/pasta the name from the dataset just for good measure.
This is my entire code (updated):
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
train = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/oo92/Boston-Kaggle/master/test.csv")
categorical_columns = ['MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1',
'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
'Foundation', 'Heating', 'Electrical', 'Functional', 'GarageType', 'PavedDrive', 'Fence',
'MiscFeature', 'SaleType', 'SaleCondition', 'Street', 'CentralAir', 'Utilities', 'ExterQual',
'LandSlope', 'ExterCond', 'HeatingQC', 'KitchenQual']
ranked_columns = ['Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond',
'PoolQC', 'OverallQual', 'OverallCond']
numerical_columns = ['LotArea', 'LotFrontage', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
'BsmtUnfSF','TotalBsmtSF', '1stFlrSF', '2ndFlrSf', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
'BsmtHalfBath', 'FullBath', 'HalfBath', 'Bedroom', 'Kitchen', 'TotRmsAbvGrd', 'Fireplaces',
'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
'3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
na_data = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
for i in na_data:
train[i] = train[i].fillna('XX')
#Replaced the NaN values of LotFrontage and MasVnrArea with the mean of their column
train['LotFrontage'] = train['LotFrontage'].fillna(train['LotFrontage'].mean())
train['MasVnrArea'] = train['MasVnrArea'].fillna(train['MasVnrArea'].mean())
concatenated_list = categorical_columns + na_data
# take one-hot encoding
OHE_sdf = pd.get_dummies(train[concatenated_list])
# drop the old categorical column from original df
train.drop(columns = categorical_columns, axis = 1, inplace = True)
# attach one-hot encoded columns to original data frame
train = pd.concat([train, OHE_sdf], axis = 1, ignore_index = False)
x_train, x_test, y_train, y_test = train_test_split(train, train['SalePrice'], test_size = 0.3, random_state = 42)
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100), threshold = 300 * "mean")
sel.fit(x_train, y_train)
sel.get_support()
selected_feat = x_train.columns[sel.get_support()]
print(selected_feat())
This is the new error:
Traceback (most recent call last):
File "/home/onur/Documents/Boston-Kaggle/Model.py", line 49, in <module>
sel.fit(x_train, y_train)
File "/opt/anaconda/envs/lib/python3.7/site-packages/sklearn/feature_selection/from_model.py", line 196, in fit
self.estimator_.fit(X, y, **fit_params)
File "/opt/anaconda/envs/lib/python3.7/site-packages/sklearn/ensemble/forest.py", line 249, in fit
X = check_array(X, accept_sparse="csc", dtype=DTYPE)
File "/opt/anaconda/envs/lib/python3.7/site-packages/sklearn/utils/validation.py", line 496, in check_array
array = np.asarray(array, dtype=dtype, order=order)
File "/opt/anaconda/envs/lib/python3.7/site-packages/numpy/core/_asarray.py", line 85, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: 'XX'