
I am using h2o and R for a binary classification problem. I was wondering if there is any way to create a learning curve in h2o?

I coded some splits myself and I am plotting the curve alright, but I'd like to know if there is a quick recipe provided by h2o. H2O is known for making things easier for data people.

Here's what I've been doing, based on the answer here: Plot learning curves with caret package and R.

I customized the loop:

library(data.table)
library(dplyr)
library(h2o)
#
sizes <- seq(0.05, 1.0, by = 0.05)  # fractions of the full data set to train on
#
# pre-allocate the results frame; stringsAsFactors = FALSE so the
# model ids are stored as plain strings
learnCurve <- data.frame(mid = character(length(sizes)),
                         m = integer(length(sizes)),
                         n1 = integer(length(sizes)),
                         n0 = integer(length(sizes)),
                         trainRMSE = double(length(sizes)),
                         cvRMSE = double(length(sizes)),
                         trainAUC = double(length(sizes)),
                         cvAUC = double(length(sizes)),
                         trainTPR = double(length(sizes)),
                         cvTPR = double(length(sizes)),
                         trainFPR = double(length(sizes)),
                         cvFPR = double(length(sizes)),
                         stringsAsFactors = FALSE)
#
h2o.init()
for (i in seq_along(sizes)) {
  set.seed(3)
  # ddf1s, ddf1, response, predictors and newNames come from my data
  # prep, which I left out (see the note below the code)
  ind <- sample(nrow(ddf1s), floor(sizes[i] * nrow(ddf1s)))
  tr <- ddf1s[ind, ]
  dd <- ddf1 %>% filter(id %in% tr$id)
  #
  setDT(dd)
  #
  dmh2o <- dd[, c(response, predictors), with = FALSE]  # already a data.table
  setnames(dmh2o, predictors, newNames)
  #
  ddhex <- as.h2o(dmh2o)
  #
  splitsx <- h2o.splitFrame(data = ddhex, ratios = 0.7, seed = 1234)
  trainx <- splitsx[[1]]
  validx <- splitsx[[2]]
  #
  gbmTreex <- h2o.gbm(x = newNames, y = response,
                      training_frame = trainx,
                      validation_frame = validx,
                      max_depth = 8,
                      seed = 1234)
  #
  learnCurve$mid[i] <- gbmTreex@model_id
  learnCurve$m[i] <- nrow(dmh2o)
  learnCurve$n0[i] <- sum(dmh2o[[response]] == 0)
  learnCurve$n1[i] <- sum(dmh2o[[response]] == 1)
  learnCurve$trainRMSE[i] <- gbmTreex@model$training_metrics@metrics$RMSE
  learnCurve$cvRMSE[i] <- gbmTreex@model$validation_metrics@metrics$RMSE
  learnCurve$trainAUC[i] <- gbmTreex@model$training_metrics@metrics$AUC
  learnCurve$cvAUC[i] <- gbmTreex@model$validation_metrics@metrics$AUC
  # confusion matrix rows are actual classes, columns predicted classes:
  # TPR = 1 - Error of the class-1 row, FPR = Error of the class-0 row
  trainCM <- gbmTreex@model$training_metrics@metrics$cm$table
  validCM <- gbmTreex@model$validation_metrics@metrics$cm$table
  learnCurve$trainTPR[i] <- 1 - trainCM$Error[[2]]
  learnCurve$cvTPR[i] <- 1 - validCM$Error[[2]]
  learnCurve$trainFPR[i] <- trainCM$Error[[1]]
  learnCurve$cvFPR[i] <- validCM$Error[[1]]
  #
  # collect the per-model summaries across runs
  mstemp <- gbmTreex@model$model_summary
  ms <- if (i == 1) mstemp else rbind(ms, mstemp)
  #
  print(i)
  rm(gbmTreex, dmh2o, ddhex, splitsx, trainx, validx, trainCM, validCM)
  gc()
}
#
h2o.shutdown()
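
And this is roughly how I plot it afterwards; a minimal sketch with ggplot2 against the learnCurve frame built above (AUC only, the other metrics work the same way):

library(ggplot2)
#
# learning curve: training vs. validation AUC as the sample size grows
ggplot(learnCurve, aes(x = m)) +
  geom_line(aes(y = trainAUC, colour = "train")) +
  geom_line(aes(y = cvAUC, colour = "validation")) +
  labs(x = "number of rows used", y = "AUC", colour = NULL)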

I did not include the data, since the main question is about obtaining the model metrics while varying the size of the training data. I'd like to know if H2O provides another (quicker) way to do this.
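
Side note: I'm aware the metric accessors make the extraction part tidier than digging through the S4 slots, e.g. something like this for a fitted model like gbmTreex inside the loop:

# RMSE and AUC on the training and validation frames in one call each
h2o.rmse(gbmTreex, train = TRUE, valid = TRUE)
h2o.auc(gbmTreex, train = TRUE, valid = TRUE)
#
# confusion matrix on the validation frame
h2o.confusionMatrix(gbmTreex, valid = TRUE)

But that only cleans up the bookkeeping; it still isn't a one-call learning curve.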

