1

My goal is to write a function take_by_rank that

  • can operate on arbitrary selection of numeric columns within a data frame;
  • uses non-standard evaluation like base::subset or dplyr verbs;
  • understands the minus sign naturally, so that -foo means "the largest value of foo gets the lowest rank";
  • returns n top or bottom rows by the total rank, which is the sum of ranks computed for each of the selected variables.

I'm interested in both learning the newest dplyr way and looking for alternative approaches, i.e. there is no restriction on package selection (pure base or data.table maybe?).

My current solution is

library(data.table)
library(dplyr)
library(rlang)

take_by_rank <- function(df, ..., n = 100) {
    selected_vars <- quos(...)
    if (!length(selected_vars))
        stop("No variables to rank!")
    prefix <- ".rank_"
    for (i in seq_along(selected_vars)) {
        rank_name <- paste0(prefix, quo_name(selected_vars[[i]]))
        df <- df %>%
            mutate(!!rank_name := frankv(!!selected_vars[[i]]))
    }
    df %>%
        mutate(TotalRank = rowSums(select(df, starts_with(prefix)))) %>%
        arrange(TotalRank) %>%
        top_n(n, -TotalRank)
}

It seems to be okay, but maybe I'm missing something more straightforward. If there's a way to replace the for loop, that would also be nice.

Usage examples (for reference)

take_by_rank(mtcars, mpg, qsec, n = 3)
   mpg cyl disp  hp drat   wt  qsec vs am gear carb .rank_mpg .rank_qsec TotalRank
1 13.3   8  350 245 3.73 3.84 15.41  0  0    3    4         3          3         6
2 15.0   8  301 335 3.54 3.57 14.60  0  1    5    8         6          2         8
3 14.3   8  360 245 3.21 3.57 15.84  0  0    3    4         4          5         9

take_by_rank(mtcars, mpg, qsec, n = -3)
   mpg cyl  disp hp drat    wt  qsec vs am gear carb .rank_mpg .rank_qsec TotalRank
1 22.8   4 140.8 95 3.92 3.150 22.90  1  0    4    2      24.5         32      56.5
2 32.4   4  78.7 66 4.08 2.200 19.47  1  1    4    1      31.0         27      58.0
3 33.9   4  71.1 65 4.22 1.835 19.90  1  1    4    1      32.0         28      60.0

take_by_rank(mtcars, mpg, -qsec, n = 3)
   mpg cyl disp  hp drat    wt  qsec vs am gear carb .rank_mpg .rank_-qsec TotalRank
1 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1      14.0           2      16.0
2 10.4   8  472 205 2.93 5.250 17.98  0  0    3    4       1.5          15      16.5
3 10.4   8  460 215 3.00 5.424 17.82  0  0    3    4       1.5          16      17.5
tonytonov
  • 25,060
  • 16
  • 82
  • 98
  • maybe you could replace the for loop with dplyr::mutate_at() – Alex P Jul 26 '17 at 19:31
  • `mutate_at` does have a lot of potential here to replace the loop if you are willing to switch to adding `rank` as a suffix on the new columns instead of the prefix. – aosmith Jul 26 '17 at 20:14
  • @aosmith @AlexP I thought so as well, but `mutate_at` does not treat `-foo` correclty (example 3 and my comment below), so I cannot use it without manual intervention that strips out the minus sign. – tonytonov Jul 27 '17 at 08:49

3 Answers3

2

You can pass the dots to vars() before passing them to mutate_at()

mutate_at(df, vars(...), myfuns)

This is equivalent to passing the dots to tidyselect::vars_select() and then to mutate_at():

vars <- tidyselect::vars_select(tbl_vars(df), ...)
mutate_at(df, vars, myfuns)
Lionel Henry
  • 6,652
  • 27
  • 33
  • Good to know, thanks. However the `-qsec` semantics will have different meaning then (see my other comments). – tonytonov Jul 27 '17 at 13:15
1

As Alex P also suggest you can use mutate_at() to remove the for loop, we can then rewrite the function as:

take_by_rank <- function(df, ..., n = 100) {
  selected_vars <- quos(...)
  if (!length(selected_vars))
    stop("No variables to rank!")
  df <- df %>% 
    mutate_at(selected_vars,  funs(rank = frankv)) %>%  
    mutate(TotalRank = rowSums(select(., ends_with("_rank")))) %>%
    arrange(TotalRank) %>%
    top_n(n, -TotalRank)
}

This will apply frankv to all selected vars and add new columns with the suffix _rank. I also changed the select statement to reference the piped data.frame. If you want to have complete variable name matching for the Totalrank calculation this will work.

 take_by_rank_matching <- function(df, ..., n = 100) {
      selected_vars <- quos(...)
      if (!length(selected_vars))
        stop("No variables to rank!")
      df <- df %>% 
        mutate_at(selected_vars,  funs(rank = frankv)) %>%  
        mutate(TotalRank = rowSums(
          select_at(., unlist(lapply(selected_vars,
                                     function(x)
                                       paste0(quo_label(x), "_rank")))))) %>%
        arrange(TotalRank) %>%
        top_n(n, -TotalRank)
    }

Although I think there might be a cleaner way.

0

As you noted the use of mutate_at make it impossible (or very hard) to work with the -foo behavior.

I propose you this solution. It's not extremely different from what you did.
I changed the for-loop with purrr::map, and streamlined the creation of total_rank.

library(tidyverse)
# ....
library(rlang)
# ....

take_by_rank <- function(df, ..., n = 100) {
    # original quosures
    selected_vars <- quos(...)

    if (!length(selected_vars))
        stop("No variables to rank!")

    suffixed_vars <- map(selected_vars, ~ {
        paste0(quo_name(.x), '_rank') %>%
            as.name() %>%
            as_quosure()
    })

    selected_vars %>%
        map( ~ {
            rank_name <- paste0(quo_name(.x), '_rank')
            df %>%                   # or whatever rank function you want
                mutate(!!rank_name := dense_rank(!!.x))
        }) %>%
        reduce(full_join) %>%
        mutate(total_rank = '+'(!!!suffixed_vars)) %>% # !!! = unquote and splice
        top_n(n, -total_rank)

}

take_by_rank(mtcars, mpg, qsec, n = 3)
#> Joining, by = c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb")
#>    mpg cyl disp  hp drat   wt  qsec vs am gear carb mpg_rank qsec_rank
#> 1 14.3   8  360 245 3.21 3.57 15.84  0  0    3    4        3         5
#> 2 13.3   8  350 245 3.73 3.84 15.41  0  0    3    4        2         3
#> 3 15.0   8  301 335 3.54 3.57 14.60  0  1    5    8        5         2
#>   total_rank
#> 1          8
#> 2          5
#> 3          7

take_by_rank(mtcars, mpg, qsec, n = -3)
#> Joining, by = c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb")
#>    mpg cyl  disp hp drat    wt  qsec vs am gear carb mpg_rank qsec_rank
#> 1 22.8   4 140.8 95 3.92 3.150 22.90  1  0    4    2       19        30
#> 2 32.4   4  78.7 66 4.08 2.200 19.47  1  1    4    1       24        25
#> 3 33.9   4  71.1 65 4.22 1.835 19.90  1  1    4    1       25        26
#>   total_rank
#> 1         49
#> 2         49
#> 3         51

take_by_rank(mtcars, mpg, -qsec, n = 3)
#> Joining, by = c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb")
#>    mpg cyl disp  hp drat    wt  qsec vs am gear carb mpg_rank -qsec_rank
#> 1 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1       12          2
#> 2 10.4   8  472 205 2.93 5.250 17.98  0  0    3    4        1         14
#> 3 10.4   8  460 215 3.00 5.424 17.82  0  0    3    4        1         15
#>   total_rank
#> 1         14
#> 2         15
#> 3         16

take_by_rank(mtcars, mpg,  n = 3)
#>    mpg cyl disp  hp drat    wt  qsec vs am gear carb mpg_rank total_rank
#> 1 10.4   8  472 205 2.93 5.250 17.98  0  0    3    4        1          1
#> 2 10.4   8  460 215 3.00 5.424 17.82  0  0    3    4        1          1
#> 3 13.3   8  350 245 3.73 3.840 15.41  0  0    3    4        2          2
GGamba
  • 13,140
  • 3
  • 38
  • 47