9

I'm given data that comes in a wide format. Each row pertains to a variable external to the current table, and possible values relevant for that variable. I'm trying to: (1) pivot to long format, and (2) nest pivoted values.

Example

library(tibble)

df_1 <-
  tribble(~key, ~values.male, ~values.female, ~values.red, ~values.green, ~value,
        "gender", 0.5, 0.5, NA, NA, NA,
        "age", NA, NA, NA, NA, "50",
        "color", NA, NA, TRUE, FALSE, NA,
        "time_of_day", NA, NA, NA, NA, "noon")

## # A tibble: 4 x 6
##   key         values.male values.female values.red values.green value
##   <chr>             <dbl>         <dbl> <lgl>      <lgl>        <chr>
## 1 gender              0.5           0.5 NA         NA           NA   
## 2 age                NA            NA   NA         NA           50   
## 3 color              NA            NA   TRUE       FALSE        NA   
## 4 time_of_day        NA            NA   NA         NA           noon 

In this example, we see that gender can have either female = 0.5 and male = 0.5. On the other hand, age can have only a single value of 50. From row #3 we learn that color can have values of red = TRUE and green = FALSE, and time_of_day = noon.

Thus, a pivoted table should take the nested form of:

my_pivoted_df <-
  structure(
    list(
      var_name = c("gender", "age", "color", "time_of_day"),
      vals = list(
        structure(
          list(
            level = c("male", "female"),
            value = c(0.5,
                      0.5)
          ),
          row.names = c(NA, -2L),
          class = c("tbl_df", "tbl", "data.frame")
        ),
        "50",
        structure(
          list(
            level = c("red", "green"),
            value = c(TRUE,
                      FALSE)
          ),
          row.names = c(NA, -2L),
          class = c("tbl_df", "tbl", "data.frame")
        ),
        "noon"
      )
    ),
    row.names = c(NA, -4L),
    class = c("tbl_df", "tbl",
              "data.frame")
  )


## # A tibble: 4 x 2
##   var_name    vals            
##   <chr>       <list>          
## 1 gender      <tibble [2 x 2]>
## 2 age         <chr [1]>       
## 3 color       <tibble [2 x 2]>
## 4 time_of_day <chr [1]>

My attempt to solve this

There are a couple of problems with df_1. First, the current naming of columns is inconvenient. Headers such as value aren't ideal because they conflict with pivot_longer()'s ".value" mechanism. Second, df_1 has values (in plural) when the key has more than one option (e.g., "red" and "green" for color), but value (singular) when there's just one option for key (such as with age). Below is my unsuccessful code, inspired by this answer.

library(tidyr)
library(dplyr)

df_1 %>%
  rename_with( ~ paste(.x, "single", sep = "."), .cols = value) %>% ## changed the header because otherwise it breaks
  pivot_longer(cols = starts_with("val"),
               names_to = c("whatevs", ".value"), names_sep = "\\.")


## # A tibble: 8 x 7
##   key         whatevs  male female red   green single
##   <chr>       <chr>   <dbl>  <dbl> <lgl> <lgl> <chr> 
## 1 gender      values    0.5    0.5 NA    NA    NA    
## 2 gender      value    NA     NA   NA    NA    NA    
## 3 age         values   NA     NA   NA    NA    NA    
## 4 age         value    NA     NA   NA    NA    50    
## 5 color       values   NA     NA   TRUE  FALSE NA    
## 6 color       value    NA     NA   NA    NA    NA    
## 7 time_of_day values   NA     NA   NA    NA    NA    
## 8 time_of_day value    NA     NA   NA    NA    noon  

I lack some wrangling tricks to solve this.

Emman
  • 3,695
  • 2
  • 20
  • 44

3 Answers3

4

A tidyverse approach to achieve your desired result may look like so:

library(tibble)

df_1 <-
  tribble(~key, ~values.male, ~values.female, ~values.red, ~values.green, ~value,
          "gender", 0.5, 0.5, NA, NA, NA,
          "age", NA, NA, NA, NA, "50",
          "color", NA, NA, TRUE, FALSE, NA,
          "time_of_day", NA, NA, NA, NA, "noon")

library(tidyr)
library(dplyr)
library(purrr)

df_pivoted <- df_1 %>% 
  mutate(across(everything(), as.character)) %>% 
  pivot_longer(-key, names_to = "level", names_prefix = "^values\\.", values_drop_na = TRUE) %>% 
  group_by(key) %>% 
  nest() %>% 
  mutate(data = map(data, ~ if (all(.x$level == "value")) deframe(.x) else .x))
df_pivoted
#> # A tibble: 4 x 2
#> # Groups:   key [4]
#>   key         data            
#>   <chr>       <list>          
#> 1 gender      <tibble [2 × 2]>
#> 2 age         <chr [1]>       
#> 3 color       <tibble [2 × 2]>
#> 4 time_of_day <chr [1]>

EDIT Following the clarification in your comments on the desired result we could simply get rid of the map statement as the end (which basically was meant for converting the tibbles for categories without levels to a vector) and add a mutate statement before nesting to replace the level with NA for categories without a level:

pivot_nest <- function(x) {
  mutate(x, across(everything(), as.character)) %>% 
    pivot_longer(-key, names_to = "level", names_prefix = "^values\\.", values_drop_na = TRUE) %>% 
    group_by(key) %>% 
    mutate(level = ifelse(all(level == "value"), NA_character_, level)) %>% 
    nest() 
}

df_pivoted <- df_1 %>% 
  pivot_nest()
df_pivoted
#> # A tibble: 4 x 2
#> # Groups:   key [4]
#>   key         data            
#>   <chr>       <list>          
#> 1 gender      <tibble [2 × 2]>
#> 2 age         <tibble [1 × 2]>
#> 3 color       <tibble [2 × 2]>
#> 4 time_of_day <tibble [1 × 2]>
df_pivoted$data
#> [[1]]
#> # A tibble: 2 x 2
#>   level value
#>   <chr> <chr>
#> 1 male  0.5  
#> 2 male  0.5  
#> 
#> [[2]]
#> # A tibble: 1 x 2
#>   level value
#>   <chr> <chr>
#> 1 <NA>  50   
#> 
#> [[3]]
#> # A tibble: 2 x 2
#>   level value
#>   <chr> <chr>
#> 1 red   TRUE 
#> 2 red   FALSE
#> 
#> [[4]]
#> # A tibble: 1 x 2
#>   level value
#>   <chr> <chr>
#> 1 <NA>  noon

df_2 <- tribble(~key, ~value, "age", "50", "income", "100000", "time_of_day", "noon")

df_pivoted2 <- df_2 %>% 
  pivot_nest()
df_pivoted2
#> # A tibble: 3 x 2
#> # Groups:   key [3]
#>   key         data            
#>   <chr>       <list>          
#> 1 age         <tibble [1 × 2]>
#> 2 income      <tibble [1 × 2]>
#> 3 time_of_day <tibble [1 × 2]>
df_pivoted2$data
#> [[1]]
#> # A tibble: 1 x 2
#>   level value
#>   <chr> <chr>
#> 1 <NA>  50   
#> 
#> [[2]]
#> # A tibble: 1 x 2
#>   level value 
#>   <chr> <chr> 
#> 1 <NA>  100000
#> 
#> [[3]]
#> # A tibble: 1 x 2
#>   level value
#>   <chr> <chr>
#> 1 <NA>  noon
stefan
  • 90,330
  • 6
  • 25
  • 51
  • Thanks! Is there a way to organize the output (`df_pivoted`) such that its `data` column would not exist? Rather, values under `data` would be in the `value` column. I was thinking that maybe using `dplyr::coalesce()` as the final step could do the trick, but am hesitant. This could break if I had only single values such as `df_2 <- tribble(~key, ~value, "age", "50", "income", "100000", "time_of_day", "noon")` – Emman Jan 04 '21 at 09:13
  • My comment above also reveals a situation I didn't address in the post. What if all values are single such as in `df_2` in the comment? In my real data, this often happens. Then the output would be different in terms of column names (`key` and `data`) as opposed to the `df_1` scenario (`key`, `level`, `value`, `data` in the unnested output format). How can I ensure that the output will *always* have only `key` and `value` columns, and if needed, additional `level` column? – Emman Jan 04 '21 at 09:19
  • I found somewhat of a solution: `df_pivoted %>% unnest(data) %>% {if(all(c("data", "value") %in% colnames(.))) (mutate(., value = coalesce(data, value)) %>% select(-data)) else .} %>% nest()`. But I think it's not so readable and maybe not best coding practice. I'll be happy if there's a simpler/cleaner solution. Thanks! – Emman Jan 04 '21 at 09:49
  • Hi Emman. Not sure whether I got you right. But have a look at my edit. Basically I don't think that we need unnest + ... + nest to get your desired result. – stefan Jan 04 '21 at 12:02
3

One option that will return the same type of output as the supplied input:

df_1 %>%
 group_split(key) %>%
 map_dfr(~ select(., where(~ !all(is.na(.)))) %>%
          pivot_longer(-key, names_to = "level", names_prefix = "^values\\.") %>%
          summarise(key = first(key),
                    vals = if(n() == 1) list(value) else list(tibble(level, value))))

  key         vals            
  <chr>       <list>          
1 age         <chr [1]>       
2 color       <tibble [2 × 2]>
3 gender      <tibble [2 × 2]>
4 time_of_day <chr [1]>  

The structure of output:

$ key : chr [1:4] "age" "color" "gender" "time_of_day"
 $ vals:List of 4
  ..$ : chr "50"
  ..$ : tibble [2 × 2] (S3: tbl_df/tbl/data.frame)
  .. ..$ level: chr [1:2] "red" "green"
  .. ..$ value: logi [1:2] TRUE FALSE
  ..$ : tibble [2 × 2] (S3: tbl_df/tbl/data.frame)
  .. ..$ level: chr [1:2] "male" "female"
  .. ..$ value: num [1:2] 0.5 0.5
  ..$ : chr "noon"
tmfmnk
  • 38,881
  • 4
  • 47
  • 67
1

Here is a data.table solution, because I am more comfortable with the melt and dcast, but should be easily transferable to dplyr:

library(data.table)
df <- setDT(df_1)

plouf <- melt(df,measure.vars = patterns("value")) %>%
  .[!is.na(value),.(key,level = gsub("values.","",variable),value)] 

this gives:

           key  level value
1:      gender   male   0.5
2:      gender female   0.5
3:       color    red  TRUE
4:       color  green FALSE
5:         age  value    50
6: time_of_day  value  noon

You can now just loop over the unique key values to output what you want:

keylist <- unique(plouf$key)
result <- tibble(varname = keylist,
               vals = lapply(keylist,function(x){
                 if(plouf[x == key,level[1]] != "value"){
                   plouf[x == key,.(level,value)]
                 }else{
                   plouf[x == key,value]
                 }
               })
               
)

Here you obtain your nested tibble (with data.tables and characters inside)

denis
  • 5,580
  • 1
  • 13
  • 40