I am trying to build a predictive model for a bank that needs to decide who will be approved for a loan based on the applicant's demographic and socio-economic profiles. The dataset has columns that detail an applicant's age, employment, credit history, etc. (a mix of objects and integers).
Below is what I've done up until the Valuerror:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
credit = pd.read_csv('german_credit_risk.csv')
credit.head(10)
credit.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Checking.Account 1000 non-null object
1 Duration 1000 non-null int64
2 Credit.Hist 1000 non-null object
3 Purpose 1000 non-null object
4 Credit.Amount 1000 non-null int64
5 Saving.Account 1000 non-null object
6 Employmnet 1000 non-null object
7 Disposable.Income 1000 non-null int64
8 Personal.Status 1000 non-null object
9 Other.Debtors 1000 non-null object
10 Present.Residence 1000 non-null int64
11 Property 1000 non-null object
12 Age 1000 non-null int64
13 Other.Loans 1000 non-null object
14 Housing 1000 non-null object
15 Existing.Credits 1000 non-null int64
16 Job 1000 non-null object
17 Number.Liable 1000 non-null int64
18 Telephone 1000 non-null object
19 Foreign.Worker 1000 non-null object
20 Cost.Matrix 1000 non-null int64
dtypes: int64(8), object(13)
memory usage: 164.2+ KB
credit.describe(include='all')
_ = sns.heatmap(credit.corr())
col_names = ['Checking.Account', 'Duration', 'Credit.Hist', 'Purpose', 'Credit.Amount', 'Saving.Account', 'Employment', 'Disposable.Income', 'Personal.Status', 'Other.Debtors', 'Present.Residence', 'Property', 'Age', 'Other.Loans', 'Housing', 'Existing.Credits', 'Job', 'Number.Liable', 'Telephone', 'Foreign.Worker', 'Cost.Matrix']
credit.columns = col_names
cols = credit.columns
prediction_col = 'Credit.Hist'
feature_cols = [c for c in cols if c !=prediction_col]
x = credit[feature_cols].values
y = credit[prediction_col].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=41)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=41)
from sklearn import tree
tree_model = tree.DecisionTreeClassifier()
tree_model.fit(x_train, y_train)
ValueError below:
ValueError Traceback (most recent call last)
/tmp/ipykernel_30064/311858196.py in <module>
2
3 tree_model = tree.DecisionTreeClassifier()
----> 4 tree_model.fit(x_train, y_train)
~/anaconda3/lib/python3.7/site-packages/sklearn/tree/_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
940 sample_weight=sample_weight,
941 check_input=check_input,
--> 942 X_idx_sorted=X_idx_sorted,
943 )
944 return self
~/anaconda3/lib/python3.7/site-packages/sklearn/tree/_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
164 check_y_params = dict(ensure_2d=False, dtype=None)
165 X, y = self._validate_data(
--> 166 X, y, validate_separately=(check_X_params, check_y_params)
167 )
168 if issparse(X):
~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
571 # :(
572 check_X_params, check_y_params = validate_separately
--> 573 X = check_array(X, **check_X_params)
574 y = check_array(y, **check_y_params)
575 else:
~/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
736 array = array.astype(dtype, casting="unsafe", copy=False)
737 else:
--> 738 array = np.asarray(array, order=order, dtype=dtype)
739 except ComplexWarning as complex_warning:
740 raise ValueError(
ValueError: could not convert string to float: 'A12'
I think I understand why the error came up, but I'm still lost. It's because I'm dealing with both integers and objects, correct? Where I'm lost is 'A12'. A12 is varaiable that comes up in multiple columns. So which one? And why that particular one as opposed to A11 or A13 or one of the other variables?