0

I have a dataframe with two grouping variables grp1 and grp2 and a continuous variable val. I would like to trim the lower and upper deciles within the two grouping variables. I have here a solution which is a little cumbersome:

# data
library(slqdf)
x0 <- data.frame(grp1 = rep(c("A","B","C"), c(20,30,50))
                 , grp2 = rep(c("f","m"),50)
                 , val = rnorm(100)
)
head(x0)
  grp1 grp2          val
1    A    f -0.006799051
2    A    m -0.554560465
3    A    f  3.254289174
4    A    m  0.609900847
5    A    f -0.554915077
6    A    m -0.010012371

Add a counter within the grouping variables:

library(dplyr)
# sort within grouping factors
x0 <- sqldf("SELECT * FROM x0 ORDER BY grp1,grp2, val")
# counter within grouping factors
x0 <- x0 %>% group_by(paste(grp1,grp2)) %>% mutate(counter = row_number(paste(grp1,grp2)))

Add the number of observation within grouping variables:

x0 <- sqldf(paste("SELECT a.*"
                  ,"      , (SELECT COUNT(*)"
                  ,"         FROM x0 b"
                  ,"         WHERE a.grp1 = b.grp1"
                  ,"               AND a.grp2 = b.grp2"
                  ,"         ) n"
                  ," FROM x0 a"
                  )
            )

Compute the percentiles:

x0$p <- with(x0, counter/n)

Remove first and last deciles:

x1 <- subset(x0, p > 0.1 & p < 0.9)

sqldf("SELECT * FROM x1 ORDER BY grp1,grp2,counter LIMIT 10")
   grp1 grp2         val paste(grp1, grp2) counter  n   p
1     A    f -1.20616577               A f       2 10 0.2
2     A    f -0.56528613               A f       3 10 0.3
3     A    f -0.38902717               A f       4 10 0.4
4     A    f  0.07112568               A f       5 10 0.5
5     A    f  0.21885681               A f       6 10 0.6
6     A    f  0.29956892               A f       7 10 0.7
7     A    f  0.58522900               A f       8 10 0.8
8     A    m -1.37907878               A m       2 10 0.2
9     A    m -0.53148055               A m       3 10 0.3
10    A    m -0.26385376               A m       4 10 0.4

# control
nrow(x0)
[1] 100
nrow(x1)
[1] 78

Is there a function which can do that in one line or has someone an idea how it could be done in a more elegant way?

rene
  • 41,474
  • 78
  • 114
  • 152
giordano
  • 2,954
  • 7
  • 35
  • 57

1 Answers1

4

Perhaps this is what you want or at least close. Reproducible data first:

set.seed(2015)
x0 <- data.frame(grp1 = rep(c("A","B","C"), c(20,30,50))
                 , grp2 = rep(c("f","m"),50)
                 , val = rnorm(100)
)

Now we could use dplyr's ntile function to compute the deciles and remove the first and last:

library(dplyr)

x0 %>%
  group_by(grp1, grp2) %>%
  mutate(n = ntile(val, 10)) %>%
  filter(!n %in% c(1, 10))

#Source: local data frame [80 x 4]
#Groups: grp1, grp2
#
#   grp1 grp2          val n
#1     A    f -1.545448388 2
#2     A    m -0.528393243 3
#3     A    f -1.086758791 4
#4     A    m -0.000111512 4
#5     A    f  0.388953783 8
#6     A    m  0.532631272 6
#7     A    m  0.650598453 7
#8     A    f -0.624245435 5
#9     A    m  0.891555027 9
#10    A    f  0.501271527 9
#..  ...  ...          ... 

Or shorter (without creating the column n):

x0 %>%
  group_by(grp1, grp2) %>%
  filter(!ntile(val, 10) %in% c(1, 10))

The description of dplyr::ntile is:

ntile: a rough rank, which breaks the input vector into n buckets.

talat
  • 68,970
  • 21
  • 126
  • 157