0

i am trying to implement the minCases-argument into my tuning process of a c5.0 model. As i am using the caret package i am trying to get that argument into the "tuneGrid". For that purpose i found the following Tutorial. https://www.euclidean.com/machine-learning-in-practice/2015/6/12/r-caret-and-parameter-tuning-c50

After implementing the code into my syntax i get the following error:

**Error: The tuning parameter grid should have columns NA, NA, NA, splits**

Anyone knows where there is a mistake? The error occurs as soon as i am building my model "mdl" in the last line of the code.

With regard to the Tutorial mentionend above my current code is the following:

library(datasets)
data(iris)

library('gmodels')
library("RcppCNPy")
library("class")
library("C50")
library('caret')
library('mlbench')


####Customizing the C5.0

C5CustomSort <- function(x) {
  
  x$model <- factor(as.character(x$model), levels = c("rules","tree"))
  x[order(x$trials, x$model, x$splits, !x$winnow),]
  
}

C5CustomLoop <- function (grid) 
{
  loop <- ddply(grid, c("model", "winnow","splits"), function(x) c(trials = max(x$trials)))
  submodels <- vector(mode = "list", length = nrow(loop))
  for (i in seq(along = loop$trials)) {
    index <- which(grid$model == loop$model[i] & grid$winnow == loop$winnow[i] & grid$splits == loop$splits[i])
    trials <- grid[index, "trials"]
    submodels[[i]] <- data.frame(trials = trials[trials != loop$trials[i]])
  }
  list(loop = loop, submodels = submodels)
}

C5CustomGrid <- function(x, y, len = NULL) {
  c5seq <- if(len == 1)  1 else  c(1, 10*((2:min(len, 11)) - 1))
  expand.grid(trials = c5seq, splits = c(2,10,20,50), winnow = c(TRUE, FALSE), model = c("tree","rules"))
}

C5CustomFit <- function(x, y, wts, param, lev, last, classProbs, ...) {
  # add the splits parameter to the fit function
  # minCases is a function of splits
  
  theDots <- list(...)
  
  splits   <- param$splits
  minCases <- floor( length(y)/splits ) - 1
  
  if(any(names(theDots) == "control"))
  {
    theDots$control$winnow        <- param$winnow
    theDots$control$minCases      <- minCases
    theDots$control$earlyStopping <- FALSE
  }
  else
    theDots$control <- C5.0Control(winnow = param$winnow, minCases = minCases, earlyStopping=FALSE )
  
  argList <- list(x = x, y = y, weights = wts, trials = param$trials, rules = param$model == "rules")
  
  argList <- c(argList, theDots)
  
  do.call("C5.0.default", argList)
  
}

GetC5Info <- function() {
  
  # get the default C5.0 model functions
  c5ModelInfo <- getModelInfo(model = "C5.0", regex = FALSE)[[1]]
  
  # modify the parameters data frame so that it includes splits
  c5ModelInfo$parameters$parameter <- factor(c5ModelInfo$parameters$parameter,levels=c(levels(c5ModelInfo$parameters$parameter),'splits'))
  c5ModelInfo$parameters$label <- factor(c5ModelInfo$parameters$label,levels=c(levels(c5ModelInfo$parameters$label),'Splits'))
  c5ModelInfo$parameters <- rbind(c5ModelInfo$parameters,c('splits','numeric','Splits'))
  
  # replace the default c5.0 functions with ones that are aware of the splits parameter
  c5ModelInfo$fit  <- C5CustomFit
  c5ModelInfo$loop <- C5CustomLoop
  c5ModelInfo$grid <- C5CustomGrid
  c5ModelInfo$sort <- C5CustomSort
  
  return (c5ModelInfo)
  
}

c5info <- GetC5Info()

#Building the actual model
x_a <- iris[c("Sepal.Length","Sepal.Width","Petal.Length","Petal.Width")]
y_a <-as.factor(iris[,c("Species")])

fitControl <- trainControl(method = "cv", number = 10)
grida <- expand.grid( .winnow = "FALSE", .trials=c(1,5,10,15,20), .model="tree", .splits=c(2,5,10,15,20,25,50,100) )
mdl<- train(x=x_a,y=y_a,tuneGrid=grida,trControl=fitControl,method=c5info)
Lars
  • 1
  • 2
  • 1
    Please trim your code to make it easier to find your problem. Follow these guidelines to create a [minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example). – Community Sep 04 '21 at 02:06

1 Answers1

0

the problem seems to be in some of the Custom functions, i have this other version that works for me:

library(caret)
library(C50)
library(mlbench)
library(tidyverse)
library(plyr)
C5CustomSort <- function(x) {
  
  x$model <- factor(as.character(x$model), levels = c("rules","tree"))
  x[order(x$trials, x$model, x$splits, !x$winnow),]

}

C5CustomLoop <- function (grid) 
{
    loop <- ddply(grid, .(winnow,model, splits,trials), function(x) c(trials = max(x$trials)))
    submodels <- vector(mode = "list", length = nrow(loop))
    for (i in seq(along = loop$trials)) {
        index <- which(grid$model == loop$model[i] & grid$winnow == 
            loop$winnow[i] & grid$splits == loop$splits[i])
        trials <- grid[index, "trials"]
        submodels[[i]] <- data.frame(trials = trials[trials != 
            loop$trials[i]],winnow = loop$winnow[i], model=loop$model[i],splits=loop$splits[i])
    }
    list(loop = loop, submodels = submodels)
}

C5CustomGrid <- function(x, y, len = NULL) {
  c5seq <- if(len == 1)  1 else  c(1, 10*((2:min(len, 11)) - 1))
  expand.grid(trials = c5seq, splits = c(2,10,20,50), winnow = c(TRUE, FALSE), model = c("tree","rules"))
}

C5CustomFit <- function(x, y, wts, param, lev, last, classProbs, ...) {

  
  theDots <- list(...)

  splits   <- loop$splits
  minCases <- floor( length(y)/splits ) - 1

  if(any(names(theDots) == "control"))
    {
    theDots$control$winnow        <- param$winnow
    theDots$control$minCases      <- minCases
    theDots$control$earlyStopping <- FALSE
  }
  else
  theDots$control <- C5.0Control(winnow = param$winnow, minCases = minCases, earlyStopping=FALSE )

  argList <- list(x = x, y = y, weights = wts, trials = param$trials, rules = param$model == "rules")

  argList <- c(argList, theDots)

  do.call("C5.0.default", argList)

}

GetC5Info <- function() {

  c5ModelInfo <- getModelInfo(model = "C5.0", regex = FALSE)[[1]]

  c5ModelInfo$parameters$parameter <- factor(c5ModelInfo$parameters$parameter,levels=c(c5ModelInfo$parameters$parameter,'splits'))
  c5ModelInfo$parameters$label <- factor(c5ModelInfo$parameters$label,levels=c(c5ModelInfo$parameters$label,'Splits'))
  c5ModelInfo$parameters <- rbind(c5ModelInfo$parameters,c('splits','numeric','Splits'))parameter
  c5ModelInfo$fit  <- C5CustomFit
  c5ModelInfo$loop <- C5CustomLoop
  c5ModelInfo$sort <- C5CustomSort

  return (c5ModelInfo)

}

c5info <- GetC5Info()

fitControl <- trainControl(method = "repeatedcv", number = 10,  repeats = 10)
 splits<-c(5,25,100)

grid <- expand.grid( winnow = c(FALSE), trials=c(5,6), model=c("tree"), splits=c(5,25,100) )
data(PimaIndiansDiabetes2)
x <- PimaIndiansDiabetes2[c("age","glucose","insulin","mass","pedigree","pregnant","pressure","triceps")]
y <- PimaIndiansDiabetes2$diabetes
mdl<- train(x=x,y=y,tuneGrid=grid,trControl=fitControl,method=c5info,verbose=FALSE)
  • Please post an answer (only) if it successfully identifies both the concrete source of the problem and a concrete solution. –  May 27 '22 at 07:10