Group by multiple non-numeric columns in dplyr

Question

I am trying to group_by multiple, only non-numeric columns, in dplyr. The aim is then to create two columns with the percentage and absolute changes.

Right now, I can achieve this by listing all non-numeric column names by hand, and then mutate:

df=df %>%
    group_by(CPTY_CLASS, SCENARIO, INSTITUTE, METHOD_DEC, Deaf) %>%
    mutate(PERC_CH = EUR / EUR[which.min(TIME)]-1,
           ABS_CH = EUR - EUR[which.min(TIME)]
           )

However, if there were more non-numeric columns, listing 900 variables by hand, or choosing the column numbers would not work.

Here is the reproducible dataset:

df=structure(list(SCENARIO = c("AC", "AC", "AC", "AC", "AC", "AC", 
    "AC", "AC", "AC", "AC", "AC", "AC"), INSTITUTE = c("BCR", 
    "BCR", "BCR", "BCR", "BCR", "BCR", "BCR", "BCR", "BCR", "BCR", 
    "BCR", "BCR"), METHOD_DEC = c("BIL", "BIL", "BIL", 
    "BIL", "BIL", "BIL", "CRL", "CRL", "CRL", "CRL", "CRL", 
    "CRL"), CPTY_CLASS = c("SME", "SME", "SME", "SME", "SME", 
    "SME", "BANK", "BANK", "BANK", "BANK", "BANK", "BANK"), Deaf = c("Y", 
    "Y", "Y", "N", "N", "N", "Y", "Y", "Y", "N", "N", "N"), TIME = c(2021L, 
    2022L, 2023L, 2021L, 2022L, 2023L, 2021L, 2022L, 2023L, 2021L, 
    2022L, 2023L), EUR = c(13446.7, 16460.6727685, 19510.2132, 
    79951.1120192, 80847.03547, 84940.1414854, 0, 1047.150372256, 
    302.308772901, 104609.421568, 107719.773397, 103466.689156
    )), row.names = c(NA, -12L), class = c("tbl_df", "tbl", "data.frame"
    ))

`df %>% group_by(across(where(is.character)))` is how you can group by non-numeric columns. If you factor columns too, perhaps `where(!is.numeric)` could be better. — Chamkrai, Aug 30 '22 at 09:02
`df %>% group_by(across(where(!is.numeric)))` could account for all other non numeric columns — Maël, Aug 30 '22 at 09:03
@TomHoel AFAIK, this is not a duplicate question, so I'd argue for putting your comment as an answer — Maël, Aug 30 '22 at 09:07

score 0 · Answer 1 · edited Sep 01 '22 at 07:28

I solved my problem by creating a character vector with the names of the group-columns in it. In order to use it within group_by(), I had to use the syntax !!!syms().

group_v <- names(df)[rev(seq(1,ncol(df))[!as.logical(sapply(df, is.numeric))])]
        
df = df %>% group_by(!!!syms(group_v)) %>%
        mutate(PERC_CH = EUR / EUR[which.min(TIME)]-1
               ,
               ABS_CH = EUR - EUR[which.min(TIME)]
        )

Other equivalent solutions, thanks to @Tom Hoel's and @Maël's comments:

library(dplyr)
library(magrittr)
# combine use of across() and where() to select columuns
df %>% 
  group_by(across(where(is.character))) %>% 
  mutate(PERC_CH = EUR / EUR[which.min(TIME)]-1, ABS_CH = EUR - EUR[which.min(TIME)])
#> # A tibble: 12 × 9
#> # Groups:   SCENARIO, INSTITUTE, METHOD_DEC, CPTY_CLASS, Deaf [4]
#>    SCENARIO INSTITUTE METHOD_DEC CPTY_CLASS Deaf   TIME     EUR  PERC_CH ABS_CH
#>    <chr>    <chr>     <chr>      <chr>      <chr> <int>   <dbl>    <dbl>  <dbl>
#>  1 AC       BCR       BIL        SME        Y      2021  13447.   0          0 
#>  2 AC       BCR       BIL        SME        Y      2022  16461.   0.224   3014.
#>  3 AC       BCR       BIL        SME        Y      2023  19510.   0.451   6064.
#>  4 AC       BCR       BIL        SME        N      2021  79951.   0          0 
#>  5 AC       BCR       BIL        SME        N      2022  80847.   0.0112   896.
#>  6 AC       BCR       BIL        SME        N      2023  84940.   0.0624  4989.
#>  7 AC       BCR       CRL        BANK       Y      2021      0  NaN          0 
#>  8 AC       BCR       CRL        BANK       Y      2022   1047. Inf       1047.
#>  9 AC       BCR       CRL        BANK       Y      2023    302. Inf        302.
#> 10 AC       BCR       CRL        BANK       N      2021 104609.   0          0 
#> 11 AC       BCR       CRL        BANK       N      2022 107720.   0.0297  3110.
#> 12 AC       BCR       CRL        BANK       N      2023 103467.  -0.0109 -1143.

df %>% 
  group_by(across(!where(is.numeric))) %>% 
  mutate(PERC_CH = EUR / EUR[which.min(TIME)]-1, ABS_CH = EUR - EUR[which.min(TIME)])
#> # A tibble: 12 × 9
#> # Groups:   SCENARIO, INSTITUTE, METHOD_DEC, CPTY_CLASS, Deaf [4]
#>    SCENARIO INSTITUTE METHOD_DEC CPTY_CLASS Deaf   TIME     EUR  PERC_CH ABS_CH
#>    <chr>    <chr>     <chr>      <chr>      <chr> <int>   <dbl>    <dbl>  <dbl>
#>  1 AC       BCR       BIL        SME        Y      2021  13447.   0          0 
#>  2 AC       BCR       BIL        SME        Y      2022  16461.   0.224   3014.
#>  3 AC       BCR       BIL        SME        Y      2023  19510.   0.451   6064.
#>  4 AC       BCR       BIL        SME        N      2021  79951.   0          0 
#>  5 AC       BCR       BIL        SME        N      2022  80847.   0.0112   896.
#>  6 AC       BCR       BIL        SME        N      2023  84940.   0.0624  4989.
#>  7 AC       BCR       CRL        BANK       Y      2021      0  NaN          0 
#>  8 AC       BCR       CRL        BANK       Y      2022   1047. Inf       1047.
#>  9 AC       BCR       CRL        BANK       Y      2023    302. Inf        302.
#> 10 AC       BCR       CRL        BANK       N      2021 104609.   0          0 
#> 11 AC       BCR       CRL        BANK       N      2022 107720.   0.0297  3110.
#> 12 AC       BCR       CRL        BANK       N      2023 103467.  -0.0109 -1143.
# create a vector of character containing the names of the columns
# and then use it to refer to the columns of df in group_by() using the syntax !!!syms()
grps <- names(df)[!sapply(df, is.numeric)]
df %>% 
  group_by(!!!syms(grps)) %>% 
  mutate(PERC_CH = EUR / EUR[which.min(TIME)]-1, ABS_CH = EUR - EUR[which.min(TIME)])
#> # A tibble: 12 × 9
#> # Groups:   SCENARIO, INSTITUTE, METHOD_DEC, CPTY_CLASS, Deaf [4]
#>    SCENARIO INSTITUTE METHOD_DEC CPTY_CLASS Deaf   TIME     EUR  PERC_CH ABS_CH
#>    <chr>    <chr>     <chr>      <chr>      <chr> <int>   <dbl>    <dbl>  <dbl>
#>  1 AC       BCR       BIL        SME        Y      2021  13447.   0          0 
#>  2 AC       BCR       BIL        SME        Y      2022  16461.   0.224   3014.
#>  3 AC       BCR       BIL        SME        Y      2023  19510.   0.451   6064.
#>  4 AC       BCR       BIL        SME        N      2021  79951.   0          0 
#>  5 AC       BCR       BIL        SME        N      2022  80847.   0.0112   896.
#>  6 AC       BCR       BIL        SME        N      2023  84940.   0.0624  4989.
#>  7 AC       BCR       CRL        BANK       Y      2021      0  NaN          0 
#>  8 AC       BCR       CRL        BANK       Y      2022   1047. Inf       1047.
#>  9 AC       BCR       CRL        BANK       Y      2023    302. Inf        302.
#> 10 AC       BCR       CRL        BANK       N      2021 104609.   0          0 
#> 11 AC       BCR       CRL        BANK       N      2022 107720.   0.0297  3110.
#> 12 AC       BCR       CRL        BANK       N      2023 103467.  -0.0109 -1143.

^{Created on 2022-09-01 with reprex v2.0.2}

Reproducible dataset:

df=structure(list(SCENARIO = c("AC", "AC", "AC", "AC", "AC", "AC", 
                               "AC", "AC", "AC", "AC", "AC", "AC"), INSTITUTE = c("BCR", 
                                                                                  "BCR", "BCR", "BCR", "BCR", "BCR", "BCR", "BCR", "BCR", "BCR", 
                                                                                  "BCR", "BCR"), METHOD_DEC = c("BIL", "BIL", "BIL", 
                                                                                                                "BIL", "BIL", "BIL", "CRL", "CRL", "CRL", "CRL", "CRL", 
                                                                                                                "CRL"), CPTY_CLASS = c("SME", "SME", "SME", "SME", "SME", 
                                                                                                                                       "SME", "BANK", "BANK", "BANK", "BANK", "BANK", "BANK"), Deaf = c("Y", 
                                                                                                                                                                                                        "Y", "Y", "N", "N", "N", "Y", "Y", "Y", "N", "N", "N"), TIME = c(2021L, 
                                                                                                                                                                                                                                                                         2022L, 2023L, 2021L, 2022L, 2023L, 2021L, 2022L, 2023L, 2021L, 
                                                                                                                                                                                                                                                                         2022L, 2023L), EUR = c(13446.7, 16460.6727685, 19510.2132, 
                                                                                                                                                                                                                                                                                                79951.1120192, 80847.03547, 84940.1414854, 0, 1047.150372256, 
                                                                                                                                                                                                                                                                                                302.308772901, 104609.421568, 107719.773397, 103466.689156
                                                                                                                                                                                                                                                                         )), row.names = c(NA, -12L), class = c("tbl_df", "tbl", "data.frame"
                                                                                                                                                                                                                                                                         ))

Group by multiple non-numeric columns in dplyr

1 Answers1