0

I am working with the R programming language. I defined the following function and I am trying to perform the "random search" algorithm on this function.

First, I loaded the library:

#load library : https://cran.r-project.org/web/packages/randomsearch/index.html 

library(randomsearch)

Then, I defined the function:

# Simulate the example data: two normal predictors (mean 100, sd 10) and one
# integer column drawn with replacement from 1..1000, 1000 rows in total.
a1 <- rnorm(n = 1000, mean = 100, sd = 10)
b1 <- rnorm(n = 1000, mean = 100, sd = 10)
c1 <- sample.int(n = 1000, size = 1000, replace = TRUE)
train_data <- data.frame(a1 = a1, b1 = b1, c1 = c1)


#define function (4 inputs x[1], x[2], x[3], x[4] and 4 outputs f1, f2, f3, f4)

# Objective function passed to randomsearch(): bins the rows of train_data
# into categories "a"/"b"/"c" using four thresholds and computes, per bin, the
# share of rows whose c1 exceeds a fixed cut-off (f1, f2, f3) plus the overall
# share (f4).
#
# NOTE(review): the only parameter is `i`; `x` is neither a parameter nor
#   created inside the function, so `x[i, 1]` etc. are looked up outside it.
# NOTE(review): `train_data` and the dplyr verbs / `%>%` are also taken from
#   the surrounding session and must exist before this is called.
# NOTE(review): f1..f4 are computed but never stored in `f`; the value that is
#   returned is `f`, i.e. numeric(4) (all zeros) or all NaN when the
#   constraint check below fires.
fn <- function(i) {
    # read the four thresholds from row i of the (externally defined) x
    x1 <- x[i,1]; x2 <- x[i,2]; x3 <- x[i,3] ; x4 <- x[i,4]
    # placeholder for the four outputs; initialised to zeros
    f <- numeric(4)
    # bin data: "a" if a1 <= x1 and b1 <= x3, else "b" if a1 <= x2 and
    # b1 <= x4, else "c" (the assignment creates a local copy of train_data)
    train_data <- train_data %>%
        mutate(cat = ifelse(a1 <= x1 & b1 <= x3, "a",
                            ifelse(a1 <= x2 & b1 <= x4, "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    # split into one table per category
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1, b1, c1, cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1, b1, c1, cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1, b1, c1, cat)
    
    
    # add a 0/1 indicator "quant" per bin: 1 when c1 exceeds that bin's
    # cut-off (150 for "a", 300 for "b", 400 for "c")
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = ifelse(c1 > 150,1,0 )))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = ifelse(c1 > 300,1,0 )))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = ifelse(c1 > 400,1,0 )))
    
    # share of flagged rows in each bin
    f1 = mean(table_a$quant)
    f2 = mean(table_b$quant)
    f3 = mean(table_c$quant)
    
    
    # stack the three bins back together
    
    final_table = rbind(table_a, table_b, table_c)
    # calculate the total mean : this is what needs to be optimized
    
    f4 = mean(final_table$quant)
    
    # constraints: if x3 < x1 or x4 < x2, mark every output as NaN
    if((x3 - x1) < 0. | (x4 - x2) < 0.) {
        f[1] <- NaN
        f[2] <- NaN
        f[3] <- NaN
        f[4] <- NaN
        
    }
    
    # returns f, not f1..f4 (see NOTE(review) in the header)
    return (f)
}

Finally, I tried to run the "random search" algorithm on this function:

# Run the random search: four inputs with box bounds, four objectives that are
# all minimised, and a budget of 30 evaluations; then summarise the result.
res <- randomsearch(
  fn,
  lower = c(80, 80, 80, 80),
  upper = c(100, 120, 100, 120),
  minimize = c(TRUE, TRUE, TRUE, TRUE),
  max.evals = 30
)
rs <- summary(res)

But this resulted in the following error:

Error in fun(x, ...) : unused argument (x)

Does anyone know why this error is being produced? Is it related to the way I have defined the function "fn"?

Thanks

UseR10085
  • 7,120
  • 3
  • 24
  • 54
stats_noob
  • 5,401
  • 4
  • 27
  • 83
  • 1
    One immediate problem with your function is that it won't know what `x` is, and hence what `x[i,j]` is. `x` is not supplied as an input, but neither is it created within the function, so your function is going to fail on the first line. – Sam Rogers Aug 05 '21 at 03:54
  • @Sam Rogers: thank you for your reply! I have written different versions of the same function in hopes that one of the definitions would work. I can show you some other ways I have defined this function if you would like? – stats_noob Aug 05 '21 at 03:56
  • 1
    On top of this, the way `randomsearch()` is running will make it difficult to debug problems with your function. I suggest ensuring your function works with the expected inputs from `randomsearch()` first, before using it within `randomsearch()`. – Sam Rogers Aug 05 '21 at 03:57
  • 1
    Yes you're welcome to add some more examples of the functions if you want, however I suspect they will suffer from the same issues. I'll add a more detailed answer that may help you, and feel free to post a follow-up question. – Sam Rogers Aug 05 '21 at 04:03

2 Answers

1

The following code works, but I couldn't get the constraints to work:

library(randomsearch)
# fun_1 below relies on %>%, mutate(), filter(), select() and group_by();
# attach the tidyverse so the script also runs in a fresh R session.
library(tidyverse)

# Create some data for this example: two normal predictors (mean 100, sd 10)
# and one integer column sampled with replacement from 1..1000.
a1 <- rnorm(1000, 100, 10)
b1 <- rnorm(1000, 100, 10)
c1 <- sample.int(1000, 1000, replace = TRUE)
train_data <- data.frame(a1, b1, c1)

# Objective function for randomsearch().
#
# x: numeric vector whose first four elements are used as thresholds.
# Reads `train_data` from the surrounding session (columns a1, b1, c1).
# Returns a numeric vector of length 3: for each bin "a", "b", "c", the share
# of rows whose c1 exceeds that bin's cut-off (200, 300, 400).
fun_1 <- function(x) {
  # Unpack the four thresholds
  x1 <- x[1]
  x2 <- x[2]
  x3 <- x[3]
  x4 <- x[4]

  # Assign every row to a bin; this works on a local copy of train_data
  train_data <- train_data %>%
    mutate(
      cat = ifelse(
        a1 <= x1 & b1 <= x3,
        "a",
        ifelse(a1 <= x2 & b1 <= x4, "b", "c")
      )
    )
  train_data$cat <- as.factor(train_data$cat)

  # One table per bin
  a_table <- train_data %>% filter(cat == "a") %>% select(a1, b1, c1, cat)
  b_table <- train_data %>% filter(cat == "b") %>% select(a1, b1, c1, cat)
  c_table <- train_data %>% filter(cat == "c") %>% select(a1, b1, c1, cat)

  # 0/1 indicator "quant": does c1 exceed the bin-specific cut-off?
  table_a <- data.frame(
    a_table %>% group_by(cat) %>% mutate(quant = ifelse(c1 > 200, 1, 0))
  )
  table_b <- data.frame(
    b_table %>% group_by(cat) %>% mutate(quant = ifelse(c1 > 300, 1, 0))
  )
  table_c <- data.frame(
    c_table %>% group_by(cat) %>% mutate(quant = ifelse(c1 > 400, 1, 0))
  )

  # Share of flagged rows within each bin
  f1 <- mean(table_a$quant)
  f2 <- mean(table_b$quant)
  f3 <- mean(table_c$quant)

  # Overall share across all bins (computed but not part of the return value)
  final_table <- rbind(table_a, table_b, table_c)
  f4 <- mean(final_table$quant)

  c(f1, f2, f3)
}
    

# Random search over the four thresholds: three objectives, all minimised,
# with a budget of 30 evaluations; then summarise the result.
res <- randomsearch(
  fun_1,
  lower = c(90, 100, 90, 100),
  upper = c(100, 120, 100, 120),
  minimize = c(TRUE, TRUE, TRUE),
  max.evals = 30
)
rs <- summary(res)

Now, view the results:

> head(rs)
$pareto.front
         y_1       y_2       y_3       x1       x2       x3       x4
1  0.7605634 0.6851628 0.6400000 91.12101 114.1228 96.77341 117.0649
4  0.7611940 0.6974249 0.5867238 90.16010 110.6879 99.06183 103.1964
5  0.7631579 0.6996337 0.5863570 93.49183 103.2529 90.54579 100.0828
8  0.7804878 0.7196653 0.5791667 93.37388 101.6645 91.03374 100.0605
9  0.7878788 0.6862745 0.5936842 92.85005 106.4595 94.16650 105.3454
14 0.7884615 0.6828423 0.6010782 94.17298 106.6873 91.62018 109.2036

@Sam Rogers: I would be curious to see what you had in mind?

Thanks!

stats_noob
  • 5,401
  • 4
  • 27
  • 83
  • 1
    Nice work solving your own problem! Just be careful with the `train_data` as well. If that ever changes, or isn't available, your function will fail. It would be safer to add it as an explicit input to the function, or if it's always going to be the same thing, then generate it within the function. – Sam Rogers Aug 06 '21 at 07:36
  • 1
    Sorry I didn't get to post an answer - I got caught up with other things. If what you've posted above does what you need, then that's all that matters really. One other thing to think about, though, is that there are a couple of variables at the end of your function (`final_table` and `f4`) that are not returned, and so probably don't need to be there? – Sam Rogers Aug 06 '21 at 07:46
  • @Sam Rogers: Thank you for your kind reply! You are correct - "f4" is not needed. In a different rendition of this code, I was using f4 as well. In the question I posted, I decided not to use it. – stats_noob Aug 06 '21 at 14:22
  • @ Sam Rogers : If you have time later, I would be curious to see what you had in mind. I am always trying to learn new ways to solve my problems! :) – stats_noob Aug 06 '21 at 14:24
  • 1
    Ok, I'll have a go when I get some time. Could you please add a bit more detail about the aim of the function? That will just make it a bit quicker for me to work out what you're aiming for. E.g. the function takes ___ as input (is `train_data` always the same setup of random values? Does `x` only ever have 4 values? Etc.), does ___, and returns ___. Thanks :) – Sam Rogers Aug 06 '21 at 21:32
  • 1
    I.e. what I'm after is a thorough description of the problem you are trying to solve, rather than the error your function is producing. That way I can attempt to write a function that will solve your problem, rather than just edit what you've written. – Sam Rogers Aug 06 '21 at 22:09
  • @Sam Rogers : thank you for your reply! I will reply in a few hours (a bit busy atm) – stats_noob Aug 06 '21 at 23:28
  • (I have also been working on this question over here: https://stackoverflow.com/questions/68660559/r-converting-c50-models-to-rpart-models haha) – stats_noob Aug 06 '21 at 23:29
0

As I mentioned in the comments, I'm not entirely sure what you're aiming for, or what problem you're attempting to solve, so all I've done with this really is optimise the code that you've already written. It's entirely possible that I would write very similar code if I was doing this myself from scratch, or it may be very different.

I have added a commented version so you can see what I've done, and also a reduced version with the unneeded code removed. Depending on your ultimate objectives, it's probably possible to still reduce the number of lines of the minimal version, and likely possible to speed it up too, but they may not be important considerations.

One basic principle I've used in this case is that there's no point running code that is not necessary or saving things that are not used.

Commented version

library(randomsearch)
library(tidyverse)

# Example data: 1000 rows with two normal predictors (mean 100, sd 10) and an
# integer column sampled with replacement from 1..1000.
a1 <- rnorm(n = 1000, mean = 100, sd = 10)
b1 <- rnorm(n = 1000, mean = 100, sd = 10)
c1 <- sample.int(n = 1000, size = 1000, replace = TRUE)
train_data <- data.frame(a1 = a1, b1 = b1, c1 = c1)

# Annotated rewrite of fun_1: the commented-out lines show the original code
# that was dropped, with a note on why it is not needed.
#
# x: numeric vector; elements 1..4 are used as binning thresholds.
# Reads `train_data` (columns a1, b1, c1) from the surrounding session.
# Returns a length-3 numeric vector: for bins "a", "b", "c", the mean of a 0/1
# flag marking rows whose c1 exceeds 200, 300 and 400 respectively.
fun_2  <- function(x) {
    # x1 <- x[1]    # Storing these as additional variables doesn't help at all
    # x2 <- x[2]    # They are only used to bin the data
    # x3 <- x[3] 
    # x4 <- x[4] 
    
    # bin data: "a" if a1 <= x[1] and b1 <= x[3], else "b" if a1 <= x[2] and
    # b1 <= x[4], else "c" (the assignment creates a local copy of train_data)
    train_data <- train_data %>%
        mutate(cat = ifelse(a1 <= x[1] & b1 <= x[3], "a",
                            ifelse(a1 <= x[2] & b1 <= x[4], "b", "c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    # split into one table per bin
    a_table = train_data %>%
        filter(cat == "a") #%>%
    # select(a1, b1, c1, cat) # There are no other columns to select, so this is not needed
    
    b_table = train_data %>%
        filter(cat == "b") #%>%
    # select(a1, b1, c1, cat) 
    
    c_table = train_data %>%
        filter(cat == "c") #%>%
    # select(a1, b1, c1, cat)
    
    # add the 0/1 flag ("quant") to each bin's table
    # table_a = data.frame(a_table %>% group_by(cat)    # We don't need another variable for this
                         # %>% mutate(quant = ifelse(c1 > 200,1,0 )))   
    a_table$quant = ifelse(a_table$c1 > 200, 1, 0)  # It can also be simplified
    
    # table_b = data.frame(b_table%>% group_by(cat) %>%
                             # mutate(quant = ifelse(c1 > 300,1,0 )))
    b_table$quant = ifelse(b_table$c1 > 300, 1, 0)
    
    # table_c = data.frame(c_table%>% group_by(cat) %>%
                             # mutate(quant = ifelse(c1 > 400,1,0 )))
    c_table$quant = ifelse(c_table$c1 > 400, 1, 0)
    
    # f1 = mean(a_table$quant)
    # f2 = mean(b_table$quant)
    # f3 = mean(c_table$quant)

    #group all tables
    
    # final_table = rbind(table_a, table_b, table_c)    # This is not used
    # calculate the total mean : this is what needs to be optimized
    
    # f4 = mean(final_table$quant)  # This is not used
    
    # per-bin means are computed directly in the return value
    return(c(mean(a_table$quant), mean(b_table$quant), mean(c_table$quant)))
}

Reduced version

# Objective function for randomsearch() (reduced version).
#
# x: numeric vector; elements 1..4 are used as binning thresholds.
# Reads `train_data` (columns a1, b1, c1) from the surrounding session.
# Returns a length-3 numeric vector: for bins "a", "b", "c", the share of rows
# whose c1 exceeds 200, 300 and 400 respectively (NaN for an empty bin).
fun_2 <- function(x) {
  # Bin the rows. factor() already yields a factor column, so no separate
  # as.factor() pass is needed. The assignment works on a local copy.
  train_data <- train_data %>%
    mutate(cat = factor(ifelse(a1 <= x[1] & b1 <= x[3], "a",
                        ifelse(a1 <= x[2] & b1 <= x[4], "b", "c"))))

  # One table per bin
  a_table <- train_data %>% filter(cat == "a")
  b_table <- train_data %>% filter(cat == "b")
  c_table <- train_data %>% filter(cat == "c")

  # 0/1 flag: does c1 exceed the bin-specific cut-off?
  a_table$quant <- ifelse(a_table$c1 > 200, 1, 0)
  b_table$quant <- ifelse(b_table$c1 > 300, 1, 0)
  c_table$quant <- ifelse(c_table$c1 > 400, 1, 0)

  c(mean(a_table$quant), mean(b_table$quant), mean(c_table$quant))
}

They unfortunately don't produce identical results, but I believe this is due to the random nature of the search, rather than an error in the code.

Hope that helps. I'll have a look at your other question when I get some time.

Sam Rogers
  • 787
  • 1
  • 8
  • 19