I am setting up a very simple logistic regression problem in scikit-learn and in spark.ml, and the results diverge: the models they learn are different, but I can't figure out why (data is the same, model type is the same, regularization is the same...).
No doubt I am missing some setting on one side or the other. Which setting? How should I set up either scikit or spark.ml to find the same model as its counterpart?
I give the sklearn code and spark.ml code below. Both should be ready to cut-and-paste and run.
scikit-learn code:
import numpy as np
from sklearn.linear_model import LogisticRegression, Ridge
X = np.array([
[-0.7306653538519616, 0.0],
[0.6750417712898752, -0.4232874171873786],
[0.1863463229359709, -0.8163423997075965],
[-0.6719842051493347, 0.0],
[0.9699938346531928, 0.0],
[0.22759406190283604, 0.0],
[0.9688721028330911, 0.0],
[0.5993795346650845, 0.0],
[0.9219423508390701, -0.8972778242305388],
[0.7006904841584055, -0.5607635619919824]
])
y = np.array([
0.0,
1.0,
1.0,
0.0,
1.0,
1.0,
1.0,
0.0,
0.0,
0.0
])
m, n = X.shape
# Add intercept term to simulate inputs to GameEstimator
X_with_intercept = np.hstack((X, np.ones(m)[:,np.newaxis]))
l = 0.3
e = LogisticRegression(
fit_intercept=False,
penalty='l2',
C=1/l,
max_iter=100,
tol=1e-11)
e.fit(X_with_intercept, y)
print e.coef_
# => [[ 0.98662189 0.45571052 -0.23467255]]
# Linear regression is called Ridge in sklearn
e = Ridge(
fit_intercept=False,
alpha=l,
max_iter=100,
tol=1e-11)
e.fit(X_with_intercept, y)
print e.coef_
# =>[ 0.32155545 0.17904355 0.41222418]
spark.ml code:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SQLContext
object TestSparkRegression {
def main(args: Array[String]): Unit = {
import org.apache.log4j.{Level, Logger}
Logger.getLogger("org").setLevel(Level.OFF)
Logger.getLogger("akka").setLevel(Level.OFF)
val conf = new SparkConf().setAppName("test").setMaster("local")
val sc = new SparkContext(conf)
val sparkTrainingData = new SQLContext(sc)
.createDataFrame(Seq(
LabeledPoint(0.0, Vectors.dense(-0.7306653538519616, 0.0)),
LabeledPoint(1.0, Vectors.dense(0.6750417712898752, -0.4232874171873786)),
LabeledPoint(1.0, Vectors.dense(0.1863463229359709, -0.8163423997075965)),
LabeledPoint(0.0, Vectors.dense(-0.6719842051493347, 0.0)),
LabeledPoint(1.0, Vectors.dense(0.9699938346531928, 0.0)),
LabeledPoint(1.0, Vectors.dense(0.22759406190283604, 0.0)),
LabeledPoint(1.0, Vectors.dense(0.9688721028330911, 0.0)),
LabeledPoint(0.0, Vectors.dense(0.5993795346650845, 0.0)),
LabeledPoint(0.0, Vectors.dense(0.9219423508390701, -0.8972778242305388)),
LabeledPoint(0.0, Vectors.dense(0.7006904841584055, -0.5607635619919824))))
.toDF("label", "features")
val logisticModel = new LogisticRegression()
.setRegParam(0.3)
.setLabelCol("label")
.setFeaturesCol("features")
.fit(sparkTrainingData)
println(s"Spark logistic model coefficients: ${logisticModel.coefficients} Intercept: ${logisticModel.intercept}")
// Spark logistic model coefficients: [0.5451588538376263,0.26740606573584713] Intercept: -0.13897955358689987
val linearModel = new LinearRegression()
.setRegParam(0.3)
.setLabelCol("label")
.setFeaturesCol("features")
.setSolver("l-bfgs")
.fit(sparkTrainingData)
println(s"Spark linear model coefficients: ${linearModel.coefficients} Intercept: ${linearModel.intercept}")
// Spark linear model coefficients: [0.19852664861346023,0.11501200541407802] Intercept: 0.45464906876832323
sc.stop()
}
}