I ran Logistic Regression on a very small, simple, linearly separable dataset, but the model cannot find the optimal decision boundary. Where is my mistake?
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
# Build a tiny, linearly separable toy dataset: each point gets a boolean
# "Bad_data" label, alternating True/False.
sm_df = pd.DataFrame()
sm_df['x'] = [0.5, 4.0, 1.0, 2.5, 2.0, 3.5, 1.0, 3.0, 1.0, 2.0]
sm_df['y'] = [1.0, 3.5, 1.0, 3.5, 1.0, 4.5, 2.0, 3.0, 0.0, 2.5]
sm_df['Bad_data'] = [True, False, True, False, True, False, True, False, True, False]

# FIX (this is the mistake asked about): scikit-learn's LogisticRegression
# applies L2 regularization with C=1.0 by default, shrinking the weights
# toward zero.  On a 10-point dataset the penalty term dominates the
# likelihood, so the fitted boundary can misclassify points even though the
# data are perfectly separable.  A large C effectively disables the penalty
# and lets the solver find a separating line.
log = linear_model.LogisticRegression(C=1e6)
log.fit(sm_df[['x', 'y']], sm_df['Bad_data'])
# NOTE: this scores on the training data, so label it as such.
train_score = log.score(sm_df[['x', 'y']], sm_df['Bad_data'])
print("train score: ", train_score)
# Scatterplot of the toy data, colored by class.
# FIX: seaborn >= 0.12 removed positional x/y in lmplot; pass them as keywords.
sns.lmplot(x='x',                        # Horizontal axis
           y='y',                        # Vertical axis
           data=sm_df,                   # Data source
           fit_reg=False,                # Don't fit a regression line
           hue="Bad_data",              # Color by class label
           scatter_kws={"marker": "D",  # Marker style
                        "s": 100})      # Marker size
plt.xlabel('x')
plt.ylabel('y')

# Decision boundary: w0 + w1*x + w2*y = 0  =>  y = -(w0 + w1*x) / w2,
# evaluated at x = 0 and x = 4 to draw the line across the plot.
w0 = log.intercept_[0]  # intercept_ is a 1-element array; take the scalar
w1, w2 = log.coef_[0]
X = np.array([0, 4])
x2 = np.array([-w0 / w2, -w0 / w2 - w1 * 4 / w2])
plt.plot(X, x2)

# Classify a single test point.
# FIX: predict() expects a 2-D array of shape (n_samples, n_features);
# the flat list [1.5, 1.8] raises a ValueError in modern scikit-learn.
t_x = [1.5]
t_y = [1.8]
pr = log.predict([[1.5, 1.8]])
plt.scatter(t_x,           # Horizontal axis
            t_y, c='r')    # Mark the test point in red
plt.annotate(pr, (1.5, 1.9))
# FIX: without show() the figure never appears in a non-interactive run.
plt.show()