2

I have a huge data frame but I made a reproducible example with only 100 rows.

The objective of my code is to calculate per value of a column (here V2), the mean and standard deviation given by the function fitdistr. The density function I use as argument is dtruncnorm with a=0 and b=1 as boundaries.

I guess it is just a problem of writing but I cannot fix it.

After that, I would like to apply the function to the whole data frame, using tapply for example. With the wrong density function I indeed managed to execute a for-loop but as expected, it is quite slow.

Here is the code :

df <- as.data.frame(matrix(0,100,2))
df$V1 <- runif(nrow(df),0,1)
df$V2 <- sample(c(1:10),nrow(df), replace=TRUE)

library(MASS)
library(truncnorm)

fitdistr(df[which(df$V2=="1"),"V1"],dtruncnorm, start=list(mean=mean(df[which(df$V2=="1"),"V1"]), sd=sd(df[which(df$V2=="1"),"V1"]))) # dtruncnorm without boundaries a and b specified

fitdistr(df[which(df$V2=="1"),"V1"],dtruncnorm(a=0, b=1), start=list(mean=mean(df[which(df$V2=="1"),"V1"]), sd=sd(df[which(df$V2=="1"),"V1"]))) # this line does not work because dtruncnorm does not have all the correct arguments

# For-loop with the wrong density function
# For loop
p = matrix(0,length(unique(df$V2)),3)
p[,1]=unique(df$V2)
for (i in 1:length(unique(df$V2))){
p[i,2]=fitdistr(df[which(df$V2==unique(df$V2)[i]),"V1"],dtruncnorm, start=list(mean=mean(df[which(df$V2==unique(df$V2)[i]),"V1"]), sd=sd(df[which(df$V2==unique(df$V2)[i]),"V1"])))[1]$estimate[[1]]
p[i,3]=fitdistr(df[which(df$V2==unique(df$V2)[i]),"V1"],dtruncnorm, start=list(mean=mean(df[which(df$V2==unique(df$V2)[i]),"V1"]), sd=sd(df[which(df$V2==unique(df$V2)[i]),"V1"])))[1]$estimate[[2]]
}
# works but slow

res = tapply(df$V1, df$V2, function(x) fitdistr(x, dtruncnorm, start=list(mean=mean(df[which(df$V2==x),"V1"]), sd=sd(df[which(df$V2==x),"V1"]))))
 # problem
user3443183
  • 115
  • 6

0 Answers0