I'm learning about support vector machines and trying to write a simple Python implementation (I'm aware of the sklearn package; this is just to understand the concepts better) that does simple linear classification. This is the main material I'm referencing.
I'm trying to solve the SVM in the primal form, by minimizing this:

$$J = \frac{C}{m}\sum_{i=1}^{m}\max\left(0,\ 1-y_i\,\mathbf{w}^\top\mathbf{x}_i\right)+\frac{1}{2}\lVert\mathbf{w}\rVert^2$$
The derivative of J w.r.t. w is (according to the reference above):

$$\frac{\partial J}{\partial\mathbf{w}} = \frac{C}{m}\sum_{i:\ y_i\mathbf{w}^\top\mathbf{x}_i<1}(-y_i\,\mathbf{x}_i)+\mathbf{w}$$
So this is using the "hinge" loss, and C is the penalty parameter. If I understand correctly, setting a larger C forces the SVM toward a harder margin.
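For example, the hinge term only penalizes samples that are inside the margin or misclassified (a minimal illustration with made-up numbers):

    import numpy

    w=numpy.array([0.5,0.5])          # made-up weight vector
    x=numpy.array([[2.,1.],           # comfortably outside the margin
                   [1.,1.],           # exactly on the margin
                   [0.5,0.2]])        # inside the margin
    y=numpy.array([1.,1.,1.])

    margins=y*x.dot(w)                # y_i * w.x_i -> [1.5, 1., 0.35]
    print(numpy.maximum(0.,1.-margins))  # [0.  0.  0.65]: only the last point is penalized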
Below is my code:
import numpy
from scipy import optimize

class SVM2C(object):
    def __init__(self,xdata,ydata,c=200.,learning_rate=0.01,
            n_iter=5000,method='GD'):

        self.values=numpy.unique(ydata)
        self.xdata=xdata
        self.ydata=numpy.where(ydata==self.values[-1],1,-1)
        self.c=c
        self.lr=learning_rate
        self.n_iter=n_iter
        self.method=method
        self.m=len(xdata)
        self.theta=numpy.random.random(xdata.shape[1])-0.5

    def costFunc(self,theta,x,y):
        zs=numpy.dot(x,theta)
        j=numpy.maximum(0.,1.-y*zs).mean()*self.c+0.5*numpy.sum(theta**2)
        return j

    def jac(self,theta,x,y):
        '''Derivative of cost function'''
        zs=numpy.dot(x,theta)
        ee=numpy.where(y*zs>=1.,0.,-y)[:,None]
        # multiply rows by ee
        dj=(ee*x).mean(axis=0)*self.c+theta
        return dj

    def train(self):

        #----------Optimize using scipy.optimize----------
        if self.method=='optimize':
            opt=optimize.minimize(self.costFunc,self.theta,args=(self.xdata,self.ydata),\
                    jac=self.jac,method='BFGS')
            self.theta=opt.x

        #---------Optimize using Gradient descent---------
        elif self.method=='GD':

            costs=[]
            lr=self.lr

            for ii in range(self.n_iter):
                dj=self.jac(self.theta,self.xdata,self.ydata)
                self.theta=self.theta-lr*dj
                cii=self.costFunc(self.theta,self.xdata,self.ydata)
                costs.append(cii)

            self.costs=numpy.array(costs)

        return self

    def predict(self,xdata):

        yhats=[]
        for ii in range(len(xdata)):
            xii=xdata[ii]
            yhatii=xii.dot(self.theta)
            yhatii=1 if yhatii>=0 else 0
            yhats.append(yhatii)
        yhats=numpy.array(yhats)

        return yhats
#-------------Main---------------------------------
if __name__=='__main__':

    xdata = numpy.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    ydata = numpy.array([1, 1, 2, 2])

    mysvm=SVM2C(xdata,ydata,method='GD')
    mysvm.train()

    from sklearn import svm
    clf=svm.SVC(C=50,kernel='linear')
    clf.fit(xdata,ydata)

    print(mysvm.theta)
    print(clf.coef_)

    #-------------------Plot------------------------
    import matplotlib.pyplot as plt
    figure=plt.figure(figsize=(12,10),dpi=100)
    ax=figure.add_subplot(111)
    cmap=plt.cm.jet
    nclasses=numpy.unique(ydata).tolist()
    colors=[cmap(float(ii)/len(nclasses)) for ii in nclasses]

    #----------------Plot training data----------------
    for ii in range(len(ydata)):
        xii=xdata[ii][0]
        yii=xdata[ii][1]
        colorii=colors[nclasses.index(ydata[ii])]
        ax.plot(xii,yii,color=colorii,marker='o')

    plt.show(block=False)
So it is really a toy example with only 4 linearly separable training samples, and I've dropped the bias term b. The expected result for w is [0.5, 0.5] (the sklearn result), while my implementation tends to give something larger than 0.5 (e.g. [1.4650, 1.4650]), whether using gradient descent or scipy.optimize. This only happens when the C parameter is set to > 1; when C == 1 it gives me [0.5, 0.5]. Also, when C > 1 scipy.optimize fails (I've tried a few different methods, e.g. Newton-CG and BFGS), although its final result is close to the gradient descent result.
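Evaluating the cost by hand at both solutions makes this even more confusing, because [0.5, 0.5] actually has the lower cost (a quick sketch duplicating the objective above; [1.465, 1.465] is a typical output from one of my runs):

    import numpy

    def costFunc(theta,x,y,c=200.):
        # same objective as SVM2C.costFunc above
        zs=numpy.dot(x,theta)
        return numpy.maximum(0.,1.-y*zs).mean()*c+0.5*numpy.sum(theta**2)

    xdata=numpy.array([[-1,-1],[-2,-1],[1,1],[2,1]])
    ydata=numpy.array([-1,-1,1,1])   # labels already mapped to -1/+1

    print(costFunc(numpy.array([0.5,0.5]),xdata,ydata))       # 0.25
    print(costFunc(numpy.array([1.465,1.465]),xdata,ydata))   # ~2.146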
I'm a bit confused about why the w vector stops shrinking. I thought that once all the data are correctly classified with margin, the slack penalties stop contributing to the total cost, so the optimizer would keep decreasing J by shrinking the size of w. Did I get the derivatives wrong?
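One way to sanity-check this is to compare jac against a finite-difference approximation using scipy.optimize.check_grad (a minimal sketch; note the hinge is non-differentiable exactly at y*w.x == 1, so the comparison is only meaningful away from those kinks):

    import numpy
    from scipy import optimize

    # Assumes the SVM2C class defined above is in scope.
    xdata=numpy.array([[-1,-1],[-2,-1],[1,1],[2,1]])
    ydata=numpy.array([1,1,2,2])
    mysvm=SVM2C(xdata,ydata)

    # check_grad returns the norm of the difference between the analytic
    # gradient and a finite-difference approximation of costFunc.
    theta0=numpy.random.random(2)-0.5
    err=optimize.check_grad(mysvm.costFunc,mysvm.jac,theta0,
            mysvm.xdata,mysvm.ydata)
    print('gradient check error:',err)   # should be small if jac matches costFunc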
I know this might be a newbie question and I'm pasting some dirty code; this has been puzzling me for a few days and I have no one around me who can offer help, so any support will be much appreciated!
UPDATE:
Thanks for all the help. I'm updating the code to deal with a slightly more complicated sample. This time I included the bias term, and used the following to update it:

$$\frac{\partial J}{\partial b} = \frac{C}{m}\sum_{i:\ y_i(\mathbf{w}^\top\mathbf{x}_i+b)<1}(-y_i)$$

Following the feedback I got, I tried Nelder-Mead for scipy.optimize, and tried 2 adaptive gradient-descent approaches. Code below:
import numpy
from scipy import optimize

class SVM2C(object):
    def __init__(self,xdata,ydata,c=9000,learning_rate=0.001,
            n_iter=600,method='GD'):

        self.values=numpy.unique(ydata)
        # Add 1 dimension for bias
        self.xdata=numpy.hstack([xdata,numpy.ones([xdata.shape[0],1])])
        self.ydata=numpy.where(ydata==self.values[-1],1,-1)
        self.c=c
        self.lr=learning_rate
        self.n_iter=n_iter
        self.method=method
        self.m=len(xdata)
        self.theta=numpy.random.random(self.xdata.shape[1])-0.5

    def costFunc(self,theta,x,y):
        zs=numpy.dot(x,theta)
        j=numpy.maximum(0.,1.-y*zs).mean()*self.c+0.5*numpy.sum(theta[:-1]**2)
        return j

    def jac(self,theta,x,y):
        '''Derivative of cost function'''
        zs=numpy.dot(x,theta)
        ee=numpy.where(y*zs>=1.,0.,-y)[:,None]
        dj=numpy.zeros(self.theta.shape)
        dj[:-1]=(ee*x[:,:-1]).mean(axis=0)*self.c+theta[:-1]  # weights
        dj[-1]=(ee*self.c).mean(axis=0)                       # bias
        return dj

    def train(self):

        #----------Optimize using scipy.optimize----------
        if self.method=='optimize':
            # NOTE: Nelder-Mead is derivative-free, so jac is ignored here.
            opt=optimize.minimize(self.costFunc,self.theta,args=(self.xdata,self.ydata),\
                    jac=self.jac,method='Nelder-Mead')
            self.theta=opt.x

        #---------Optimize using Gradient descent---------
        elif self.method=='GD':

            costs=[]
            lr=self.lr
            # ADAM parameters
            beta1=0.9
            beta2=0.999
            epsilon=1e-8
            mt_1=0
            vt_1=0

            for ii in range(self.n_iter):
                t=ii+1
                dj=self.jac(self.theta,self.xdata,self.ydata)
                '''
                mt=beta1*mt_1+(1-beta1)*dj
                vt=beta2*vt_1+(1-beta2)*dj**2
                mt=mt/(1-beta1**t)
                vt=vt/(1-beta2**t)
                self.theta=self.theta-lr*mt/(numpy.sqrt(vt)+epsilon)
                mt_1=mt
                vt_1=vt
                '''
                old_theta=self.theta
                self.theta=self.theta-lr*dj
                cii=self.costFunc(self.theta,self.xdata,self.ydata)
                # shrink the step and revert if the cost went up
                if ii>0 and cii>costs[-1]:
                    lr=lr*0.9
                    self.theta=old_theta
                costs.append(cii)

            self.costs=numpy.array(costs)

        self.b=self.theta[-1]
        self.theta=self.theta[:-1]

        return self

    def predict(self,xdata):

        yhats=[]
        for ii in range(len(xdata)):
            xii=xdata[ii]
            yhatii=xii.dot(self.theta)+self.b
            yhatii=self.values[-1] if yhatii>=0 else self.values[0]
            yhats.append(yhatii)
        yhats=numpy.array(yhats)

        return yhats
#-------------Main---------------------------------
if __name__=='__main__':

    #------------------Sample case 1------------------
    #xdata = numpy.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    #ydata = numpy.array([1, 1, 2, 2])

    #------------------Sample case 2------------------
    from sklearn import datasets
    iris=datasets.load_iris()
    xdata=iris.data[20:,:2]
    ydata=numpy.where(iris.target[20:]>0,1,0)

    #----------------------Train----------------------
    mysvm=SVM2C(xdata,ydata,method='GD')
    mysvm.train()

    ntest=20
    xtest=2*(numpy.random.random([ntest,2])-0.5)+xdata.mean(axis=0)

    from sklearn import svm
    clf=svm.SVC(C=50,kernel='linear')
    clf.fit(xdata,ydata)

    yhats=mysvm.predict(xtest)
    yhats2=clf.predict(xtest)

    print('mysvm weights:', mysvm.theta, 'intercept:', mysvm.b)
    print('sklearn weights:', clf.coef_, 'intercept:', clf.intercept_)
    print('mysvm predict:', yhats)
    print('sklearn predict:', yhats2)

    #-------------------Plot------------------------
    import matplotlib.pyplot as plt
    figure=plt.figure(figsize=(12,10),dpi=100)
    ax=figure.add_subplot(111)
    cmap=plt.cm.jet
    nclasses=numpy.unique(ydata).tolist()
    colors=[cmap(float(ii)/len(nclasses)) for ii in nclasses]

    #----------------Plot training data----------------
    for ii in range(len(ydata)):
        xii=xdata[ii][0]
        yii=xdata[ii][1]
        colorii=colors[nclasses.index(ydata[ii])]
        ax.plot(xii,yii,color=colorii,marker='o',markersize=15)

    #------------------Plot test data------------------
    for ii in range(ntest):
        colorii=colors[nclasses.index(yhats2[ii])]
        ax.plot(xtest[ii][0],xtest[ii][1],color=colorii,marker='^',markersize=5)

    #--------------------Plot line--------------------
    x1=xdata[:,0].min()
    x2=xdata[:,0].max()

    y1=(-clf.intercept_-clf.coef_[0][0]*x1)/clf.coef_[0][1]
    y2=(-clf.intercept_-clf.coef_[0][0]*x2)/clf.coef_[0][1]

    y3=(-mysvm.b-mysvm.theta[0]*x1)/mysvm.theta[1]
    y4=(-mysvm.b-mysvm.theta[0]*x2)/mysvm.theta[1]

    ax.plot([x1,x2],[y1,y2],'-k',label='sklearn line')
    ax.plot([x1,x2],[y3,y4],':k',label='mysvm line')
    ax.legend(loc=0)

    plt.show(block=False)
The new problems I got:

- It is unstable: depending on the initial random parameters, the results can be quite different, and about half the time it mis-classifies 1 sample in the training set even if I've set C to quite a large value. This happens with both scipy.optimize and GD.
- The ADAM approach tends to give infs for vt, as for large C, vt grows very fast. Am I getting the gradients wrong? (A sketch of the standard Adam update is included after this list for reference.)
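For reference, this is the update scheme from the Adam paper (Kingma & Ba, 2014) as I understand it, with the raw moments mt_1/vt_1 carried between iterations and bias-corrected copies used only for the step (names mirror my commented-out block above):

    import numpy

    def adam_step(theta,dj,mt_1,vt_1,t,lr=0.001,
            beta1=0.9,beta2=0.999,epsilon=1e-8):
        # raw moments are carried over between iterations...
        mt=beta1*mt_1+(1-beta1)*dj
        vt=beta2*vt_1+(1-beta2)*dj**2
        # ...and the bias correction goes into temporary copies
        # that are used only for this step
        mhat=mt/(1-beta1**t)
        vhat=vt/(1-beta2**t)
        theta=theta-lr*mhat/(numpy.sqrt(vhat)+epsilon)
        return theta,mt,vt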
Tons of thanks in advance!