
I am simulating data and comparing glm.fit, bigglm, speedglm, glmnet, and LiblineaR for a binary logit model.

testGLMResults_and_speed <- function(N, p, chunk=NULL, Ifsample=FALSE, size=NULL, reps=5){

  library(LiblineaR)
  library(speedglm)
  library(biglm)
  library(glmnet)

  # simulate dataset
  X = scale(matrix(rnorm(N*p), nrow=N, ncol=p))
  X1 = cbind(rep(1, N), X)
  q = as.integer(p/2)
  b = c(rnorm(q+1), rnorm(p-q)*10)   # intercept + q small slopes, then p-q large slopes
  eta = X1 %*% b

  # simulate Y (prob renamed from p to avoid shadowing the dimension p)
  simy <- function(x){
    prob = 1/(1 + exp(-eta[x]))
    u = runif(1, 0, 1)
    return(ifelse(u <= prob, 1, 0))
  }

  Y = sapply(1:N, simy)
  XYData = as.data.frame(cbind(y=Y, X))

  # draw a random subsample of rows (used when Ifsample=TRUE)
  getSample <- function(X, Y=NULL, size){
    ix = sample(dim(X)[1], size, replace=FALSE)    
    return(list(X=X[ix,], Y=Y[ix]))
  }

  #LiblineaR function; the self-referential defaults (Ifsample=Ifsample, size=size)
  #are dropped here and below: they error if the argument is ever omitted,
  #and every caller passes them explicitly anyway
  fL <- function(X, Y, type, Ifsample, size){
    if(Ifsample){
      res = getSample(X, Y, size)
      X = res$X; Y = res$Y;
    }
    resL = LiblineaR(data=X, labels=Y, type=type)
    return(resL$W)    
  }

  #glmnet
  fNet <- function(X, Y, Ifsample, size){
    if(Ifsample){
      res = getSample(X, Y, size)
      X = res$X; Y = res$Y;
    }
    resNGLM <- glmnet(x=X, y=Y, family="binomial", standardize=FALSE, type.logistic="modified.Newton")
    # coefficients at the smallest lambda on the path; note that the fitted
    # intercept lives in resNGLM$a0, so the appended 0 makes the glmnet
    # "Bias" column a placeholder rather than a real estimate
    return(c(resNGLM$beta[, resNGLM$dim[2]], 0))
  }

  #glm.fit
  fglmfit <- function(X1, Y, Ifsample, size){
    if(Ifsample){
      res = getSample(X1, Y, size)      
      X1 = res$X; Y=res$Y;
    }
    resGLM = glm.fit(x=X1, y=Y, family=binomial(link=logit))
    # reorder to (slopes, intercept) to match LiblineaR's (W, Bias) layout
    return(c(resGLM$coefficients[2:(p+1)], resGLM$coefficients[1]))
  }

  #speedglm
  fspeedglm <- function(X1, Y, Ifsample, size){
    if(Ifsample){
      res = getSample(X1, Y, size)  
      X1 = res$X; Y=res$Y;
    }
    resSGLM = speedglm.wfit(y=Y, X=X1, family=binomial(link=logit), row.chunk=chunk)
    return(c(resSGLM$coefficients[2:(p+1)], resSGLM$coefficients[1]))
  }

  #bigglm
  fbigglm <- function(form, XYdf, Ifsample, size){
    if(Ifsample){
      res = getSample(XYdf, Y=NULL, size)  
      XYdf = res$X; 
    }
    resBGLM <- bigglm(formula=form, data=XYdf, family = binomial(link=logit), maxit=20)
    if(resBGLM$converged){
      resBGLM = summary(resBGLM)$mat[,1]
    } else {
      resBGLM = rep(-99, p+1)   # sentinel values flag non-convergence
    }    
    return(c(resBGLM[2:(p+1)], resBGLM[1]))
  }

  ## benchmarking function
  ## calls reps times and averages parameter values and times over reps runs
  bench_mark <- function(func, args, reps){
    oneRun <- function(x, func, args){
      times = system.time(res <- do.call(func, args))[c("user.self", "sys.self", "elapsed")]
      return(list(times=times, res=res))
    }    
    out = lapply(1:reps, oneRun, func, args)
    out.times = colMeans(t(sapply(1:reps, function(x) return(out[[x]]$times))))
    out.betas = colMeans(t(sapply(1:reps, function(x) return(out[[x]]$res))))
    return(list(times=out.times, betas=out.betas))
  }

  #benchmark LiblineaR
  func="fL"
  args = list(X=X, Y=Y, type=0, Ifsample=Ifsample, size=size)
  res_L0 = bench_mark(func, args, reps)
  args = list(X=X, Y=Y, type=6, Ifsample=Ifsample, size=size)
  res_L6 = bench_mark(func, args, reps)

  #benchmark glmnet 
  func = "fNet"
  args = list(X=X, Y=Y, Ifsample=Ifsample, size=size)
  res_GLMNet = bench_mark(func, args, reps)

  func="fglmfit"
  args = list(X1=X1, Y=Y, Ifsample=Ifsample, size=size)
  res_GLM = bench_mark(func, args, reps)

  func="fspeedglm"
  args = list(X1=X1, Y=Y, Ifsample=Ifsample, size=size)
  res_SGLM = bench_mark(func, args, reps)  

  func = "fbigglm"
  # create formula for bigglm
  xvarname = paste("V", 2:dim(XYData)[2], sep="")
  f  = as.formula(paste("y ~ ", paste(xvarname, collapse="+")))
  args = list(form=f, XYdf=XYData, Ifsample=Ifsample, size=size)
  res_BIGGLM = bench_mark(func, args, reps)

  summarize <- function(var){
    return(rbind(L0=res_L0[[var]], L6=res_L6[[var]], 
                GLMNet=res_GLMNet[[var]], GLMfit=res_GLM[[var]], 
                speedGLM=res_SGLM[[var]], bigGLM=res_BIGGLM[[var]]))
  }

  times = summarize("times")
  betas = rbind(summarize("betas"), betaTRUE = c(b[2:(p+1)], b[1]))
  colnames(betas)[dim(betas)[2]] = "Bias"   # "Bias" = the intercept, in LiblineaR's terminology
  # element-wise ratio of each method's estimates to the true beta (last row)
  betacompare = t(sapply(1:dim(betas)[1], function(x) betas[x,]/betas[dim(betas)[1],]))


  print(paste("Run times averaged over", reps, "runs"))
  print(times)

  print(paste("Beta estimates averaged over", reps, "runs"))
  print(betas)

  print(paste("Ratio Beta estimates averaged over", reps, "runs for all methods (reference is true beta)"))
  print(betacompare)
}

A sample run looks as follows:

> testGLMResults_and_speed(10000, 5, 500, FALSE, 5000, 5)
[1] "Run times averaged over 5 runs"
         user.self sys.self elapsed
L0          0.0152   0.0032  0.0106
L6          0.0108   0.0002  0.0108
GLMNet      0.5660   0.0002  0.5666
GLMfit      0.0836   0.0262  0.0576
speedGLM    0.0438   0.0122  0.0298
bigGLM      0.2414   0.0956  0.1822
[1] "Beta estimates averaged over 5 runs"
                 V1         V2        V3        V4        V5       Bias
L0        0.4611973  0.7113034 -7.373351  4.890485  2.699251  0.3026018
L6        0.4516369  0.7002171 -7.284820  4.829202  2.659993  0.2972566
GLMNet   -0.4873854 -0.7512806  7.839316 -5.200533 -2.868255  0.0000000
GLMfit   -0.5064564 -0.7773168  8.088214 -5.367488 -2.961743 -0.3326801
speedGLM -0.5064564 -0.7773168  8.088214 -5.367488 -2.961743 -0.3326801
bigGLM   -0.5064564 -0.7773168  8.088214 -5.367488 -2.961743 -0.3326801
betaTRUE -0.4377651 -0.7692994  8.290677 -5.442880 -3.088798 -0.3901147
[1] "Ratio Beta estimates averaged over 5 runs for all methods (reference is true beta)"
            V1         V2         V3         V4         V5       Bias
[1,] -1.053527 -0.9246119 -0.8893544 -0.8985106 -0.8738838 -0.7756739
[2,] -1.031688 -0.9102009 -0.8786760 -0.8872513 -0.8611741 -0.7619724
[3,]  1.113349  0.9765776  0.9455579  0.9554745  0.9285990  0.0000000
[4,]  1.156914  1.0104217  0.9755794  0.9861486  0.9588659  0.8527751
[5,]  1.156914  1.0104217  0.9755794  0.9861486  0.9588659  0.8527751
[6,]  1.156914  1.0104217  0.9755794  0.9861486  0.9588659  0.8527751
[7,]  1.000000  1.0000000  1.0000000  1.0000000  1.0000000  1.0000000
Warning messages:
1: glm.fit: fitted probabilities numerically 0 or 1 occurred 
2: glm.fit: fitted probabilities numerically 0 or 1 occurred 
3: glm.fit: fitted probabilities numerically 0 or 1 occurred 
4: glm.fit: fitted probabilities numerically 0 or 1 occurred 
5: glm.fit: fitted probabilities numerically 0 or 1 occurred 

I often see that the estimates from LiblineaR are close but have the signs reversed! Does anybody know why this happens? It is often the fastest method, and I have seen it pull even further ahead on larger datasets.

I understand that the values will not be the same due to regularization; I am more curious about the signs.
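
A quick check, shown below as a hypothetical snippet (it assumes the function above is modified to end with return(list(times=times, betas=betas)) instead of only printing), is to correlate the LiblineaR row of the betas matrix with the true coefficients; a correlation near -1 would point to a coordinated sign flip rather than numerical instability:

# hypothetical: assumes testGLMResults_and_speed() returns list(times, betas)
out <- testGLMResults_and_speed(10000, 5, 500, FALSE, 5000, 5)
# correlation near -1 => coordinated sign flip; near 0 => genuine instability
cor(out$betas["L0", ], out$betas["betaTRUE", ])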

If somebody (rep > 1500) could please add the #LiblineaR and #speedglm tags, I would appreciate it.


1 Answer


While I haven't used LiblineaR, this kind of thing happens when a logistic regression estimates the probability of the opposite label to the one you expect. Everything else here estimates the probability that Y = 1, so my guess is that LiblineaR is predicting the probability that Y = 0.
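
If that is what is going on, here is a minimal sketch of a corrected wrapper, assuming the installed LiblineaR version exposes the fitted class order as resL$ClassNames (the class its weights are oriented towards):

# sketch, not the package's documented contract: in the binary case the
# decision values from resL$W score resL$ClassNames[1], which is whichever
# label LiblineaR happens to see first in Y; flip the weights when that
# label is 0 so they always score P(Y = 1)
fL_aligned <- function(X, Y, type){
  resL <- LiblineaR(data=X, labels=Y, type=type)
  w <- resL$W
  if (resL$ClassNames[1] == 0) w <- -w
  return(w)
}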

– Hong Ooi
  • OK. But the weird part is that sometimes it is of the opposite sign, sometimes of the correct sign. I was hoping somebody might point me to some inherent instability issues in the algorithm. – user1971988 Oct 23 '13 at 12:40
  • Probably it assigns "1" to the first case in the label vector and "0" to the first occurrence of the other possible value, no matter whether it actually is "1" or "0". Does that make sense? (See the sketch after these comments.) – fabiangebert Dec 12 '13 at 08:40
  • So it should be OK to take the absolute value of its output for this benchmarking? – smci Jan 31 '17 at 06:48
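
If fabiangebert's label-order explanation is correct, reordering the rows so that a different label comes first should flip the signs. A hypothetical check, using the simulated X and Y from inside the question's function:

# hypothetical: fit the same model with a 1 in the first row, then a 0,
# and compare the signs of the returned weights
ix1 <- order(-Y)   # rows with Y == 1 first
ix0 <- order(Y)    # rows with Y == 0 first
w1 <- LiblineaR(data=X[ix1, ], labels=Y[ix1], type=0)$W
w0 <- LiblineaR(data=X[ix0, ], labels=Y[ix0], type=0)$W
sign(w1) == sign(w0)   # all FALSE would support the label-order explanation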