0

I am new to Python and machine learning. I got an error when trying to draw a decision-regions plot with `plot_decision_regions`. I am not sure I understand the problem, so I really need help solving it.

I think the problem may be that the target is a string, but I am not sure. I do not know how to fix this problem, so please help me fix it.

 # Load the ARFF training data and wrap the records in a pandas DataFrame.
 # NOTE(review): the indentation of this snippet was lost when it was pasted;
 # the traceback below shows that fit / subplot / plot_decision_regions / title
 # were all inside the for-loop body in the code that actually ran.

data = arff.loadarff('Run1/Tr.arff') 
df = pd.DataFrame(data[0]) 
data =pd.DataFrame(df) 
# Keep only the feature columns, selected by label from 'ATT1' through 'ATT576'.
data = data.loc[:,'ATT1':'ATT576'] 
target = df['Class'] 
# Cast the class labels to str.
# NOTE(review): the traceback shows mlxtend calling check_Xy(X, y, y_int=True),
# so string labels will presumably be rejected too once X is fixed -- confirm.
target=target.astype(str)


# Split the data into training and testing sets (30% held out, fixed seed 0).
data_train, data_test, target_train, target_test = train_test_split(data, target,test_size=0.30, random_state=0) 



 # Base learner for boosting: an entropy-criterion decision tree of depth 1.
 model1 = DecisionTreeClassifier(criterion='entropy', max_depth=1)

# Ensemble sizes to compare, one subplot per entry.
# NOTE(review): the last label says n_est=20 but the last ensemble size is 10.
num_est = [1, 2, 3, 10] 
label = ['AdaBoost (n_est=1)', 'AdaBoost (n_est=2)', 'AdaBoost (n_est=3)', 'AdaBoost (n_est=20)']

# 2x2 subplot layout; `grid` yields the (row, col) index pairs of that layout.
fig = plt.figure(figsize=(10,8)) 
gs = gridspec.GridSpec(2,2) 
grid = itertools.product([0,1],repeat=2)

 # Fit one AdaBoost model per ensemble size and plot its decision regions.
 # NOTE: the loop variable `label` rebinds the name of the `label` list above.
 for n_est, label, grd in zip(num_est, label, grid):   
    boosting = AdaBoostClassifier(base_estimator=model1,n_estimators=n_est)    boosting.fit(data_train,target_train)
ax = plt.subplot(gs[grd[0], grd[1]])
# This is the call that raises in the traceback: data_train is a pandas
# DataFrame, and mlxtend's check_Xy raises ValueError unless X (and y) are
# NumPy arrays.
fig = plot_decision_regions(data_train , target_train, clf=boosting, legend=2)  

plt.title(label)

plt.show();

------------------------------------------------------------------ ValueError                                Traceback (most recent call
> last) <ipython-input-18-646828965d5c> in <module>
>       7     boosting.fit(data_train,target_train)
>       8     ax = plt.subplot(gs[grd[0], grd[1]])
> ----> 9     fig = plot_decision_regions(data_train , target_train, clf=boosting, legend=2)  # clf cannot be change because it's a
> parameter
>      10     plt.title(label)
>      11
> 
> /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/mlxtend/plotting/decision_regions.py
> in plot_decision_regions(X, y, clf, feature_index,
> filler_feature_values, filler_feature_ranges, ax, X_highlight, res,
> legend, hide_spines, markers, colors, scatter_kwargs, contourf_kwargs,
> scatter_highlight_kwargs)
>     127     """
>     128 
> --> 129     check_Xy(X, y, y_int=True)  # Validate X and y arrays
>     130     dim = X.shape[1]
>     131 
> 
> /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/mlxtend/utils/checking.py
> in check_Xy(X, y, y_int)
>      14     # check types
>      15     if not isinstance(X, np.ndarray):
> ---> 16         raise ValueError('X must be a NumPy array. Found %s' % type(X))
>      17     if not isinstance(y, np.ndarray):
>      18         raise ValueError('y must be a NumPy array. Found %s' % type(y))
> 
> ValueError: X must be a NumPy array. Found <class
> 'pandas.core.frame.DataFrame'>`enter code here`
yatu
  • 86,083
  • 12
  • 84
  • 139
Rateel
  • 13
  • 1
  • 3

2 Answers2

0

Convert your data into an array then pass it to the function.

# Convert the DataFrame to a NumPy array before passing it to
# plot_decision_regions. DataFrame.as_matrix() was deprecated and later removed
# from pandas; to_numpy() is the supported replacement (on very old pandas
# releases that lack it, `data.values` gives the same array).
numpy_matrix = data.to_numpy()

Gautam Shahi
  • 455
  • 4
  • 14
0

I have used another, similar dataset. In your code you are trying to plot with more than 2 features, which is not possible with `plot_decision_regions`; for that you have to use the different methods discussed in the linked question, "Plotting decision boundary for High Dimension Data". But if you want to use only two features, then you can use the code below.

from scipy.io import arff
import pandas as pd
import itertools
from matplotlib import gridspec
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from matplotlib import pyplot as plt

# Load the ARFF file; loadarff returns a (records, metadata) pair and only the
# records are needed to build the DataFrame.
data = arff.loadarff('TR.arff')
data = pd.DataFrame(data[0])

# Keep two feature columns plus the label column. The explicit copy makes sure
# the column re-encoding below never writes into a slice of `data`.
df = data.loc[:, ['att1', 'att2', 'class']].copy()

# Replace every object-typed column with its integer category codes, so that
# the class labels end up as integers rather than raw objects.
for col_name in df.columns:
    if df[col_name].dtype == 'object':
        df[col_name] = df[col_name].astype('category').cat.codes

# Separate the label from the features, then hold out 30% for testing.
target = df['class']
df = df.drop(['class'], axis=1)
data_train, data_test, target_train, target_test = train_test_split(
    df, target, test_size=0.30, random_state=0)

# plot_decision_regions validates that X and y are NumPy arrays, so convert
# once and use the same arrays for both fitting and plotting.
X_train = data_train.values
y_train = target_train.values

# Base learner for boosting: an entropy-criterion decision tree of depth 1.
model1 = DecisionTreeClassifier(criterion='entropy', max_depth=1)

# Ensemble sizes to compare; the subplot titles are generated from them so the
# two can never disagree.
num_est = [1, 2, 3, 10]
labels = [f'AdaBoost (n_est={n})' for n in num_est]

# 2x2 subplot layout; `grid` yields the (row, col) index pairs of that layout.
fig = plt.figure(figsize=(10, 8))
gs = gridspec.GridSpec(2, 2)
grid = itertools.product([0, 1], repeat=2)

# Fit one AdaBoost model per ensemble size and draw it in its own subplot.
# Everything that depends on the current model must stay inside the loop.
for n_est, label, grd in zip(num_est, labels, grid):
    # NOTE(review): recent scikit-learn releases rename `base_estimator` to
    # `estimator`; switch the keyword if your installed version requires it.
    boosting = AdaBoostClassifier(base_estimator=model1, n_estimators=n_est)
    boosting.fit(X_train, y_train)

    ax = plt.subplot(gs[grd[0], grd[1]])
    plot_decision_regions(X_train, y_train, clf=boosting, legend=2, ax=ax)
    ax.set_title(label)

plt.show()

output

Rahul Verma
  • 2,988
  • 2
  • 11
  • 26