0

Objective

Automate the process of finding the best fit distribution using gamlss package and generating random numbers from this distribution

Example

My actual data has several variables. So, I will use 2 variables from iris dataset in this example. Say I want to generate random numbers from best fit distribution on sepal length and petal length. I can do this as follows:

library(gamlss)

# Load data-------
data("iris")


# Define a function that finds the best fit distribution
find_dist <- function(x){
  
  m1 <- fitDist(x, k = 2, type = "realAll", trace = FALSE, try.gamlss = TRUE)

  m1
}


# Best fit distribution for Sepal.Length---------
dist_Sepal.Length <- find_dist(iris$Sepal.Length)

family_Sepal.Length <- dist_Sepal.Length$family[1] # "SEP4"

dist_Sepal.Length$Allpar
# eta.mu eta.sigma    eta.nu   eta.tau 
# 5.8269404 0.3019834 1.8481415 0.8684266 

dist_Sepal.Length$mu.link #identity
dist_Sepal.Length$sigma.link #log
dist_Sepal.Length$nu.link #log
dist_Sepal.Length$tau.link #log

## Generate a random number:
rSEP4(1, mu = 5.827, sigma = exp(0.302), nu = exp(1.848), tau = exp(0.8684))


# Best fit distribution for Petal.Length---------
dist_Petal.Length <- find_dist(iris$Petal.Length)

family_Petal.Length <- dist_Petal.Length$family[1] # ""SEP2"

dist_Petal.Length$Allpar
# eta.mu  eta.sigma     eta.nu    eta.tau 
# 4.248646   1.057717 -26.546283   3.594178 

dist_Petal.Length$mu.link #identity
dist_Petal.Length$sigma.link #log
dist_Petal.Length$nu.link #identity
dist_Petal.Length$tau.link #log

## Generate a random number:
rSEP2(1, mu = 4.249, sigma = exp(1.058), nu = -26.546, tau = exp(3.594))  

Challenges in Creating a Function to Automate Generating Random Numbers

I can extract the distribution from the family attribute and all parameter values from the Allpar attribute. The challenge is that each distribution has different parameters and link functions. Otherwise, I can directly provide Allpar to the random number function.

Please guide me how to automate this process?

umair durrani
  • 5,597
  • 8
  • 45
  • 85

1 Answers1

0

This may not be elegant but achieves my goal to automate the random number generation after getting best fit distribution:

 create_command_to_generate_random_nums <- function(varr){
  
  distt <- fitDist(varr, k = 2, type = "realAll", 
                   trace = FALSE, try.gamlss = TRUE)
  
  family_varr <- distt$family[1] 
  
  family_func <- paste0(family_varr, "()")
  
  x <- eval(parse(text=family_func))
  
  x_namez <- names(x$parameters)
  
  list_of_link_commands <- vector(mode="character", length = length(x_namez))
  list_of_links <- vector(mode="character", length = length(x_namez))
  final_values <- vector(mode="character", length = length(x_namez))
  
  Allpar_values <- distt$Allpar
  
  for (i in 1:length(x_namez)){
    list_of_link_commands[i] <- paste0(family_func, "$", x_namez[i], ".link")
    list_of_links[i] <- eval(parse(text=list_of_link_commands[i]))
    
    if (list_of_links[i] == "log"){
      final_values[i] <- exp(Allpar_values[i])
    } else{
      final_values[i] <- Allpar_values[i]
      
    }
    
  }
  
  command_to_run <- paste0("r", gsub("[()]", "", family_func), "(1, ",
                           paste(na.omit(final_values), collapse = ", "), ")")
  
  return(command_to_run)
  
  
}

Here's the command to run to get a random number from SEP4 distribution (best fit for iris$Sepal.Length):

foo <- create_command_to_generate_random_nums(iris$Sepal.Length)
foo
[1] "rSEP4(1, 5.82694036054071, 1.35253873727413, 6.34801087562196, 2.3831581940834)"

Now, I can use it:

eval(parse(text=foo))
[1] 6.69158
umair durrani
  • 5,597
  • 8
  • 45
  • 85