Compute proportions across multiple columns of character values in grouped data frame

Question

In data of this kind, where size is a grouping variable:

df <- data.frame(
  size = c(3,3,4,4,5,5),
  w1 = c("A","B","B","C","B","A"),
  w2 = c("C","A","A","C","C", "B"),
  w3 = c("C","A","B","C","B","B"),
  w4 = c(NA, NA,"C", "B", "A","B"),
  w5 = c(NA, NA,NA, NA, "A","C")
   )

I want to compute the within-column proportions of the character values in the w* columns. This code gets close but seems to factor-in the NA values as well:

df %>%
  group_by(size) %>%
  # deselect columns with only NA:
  select(where(~!all(is.na(.)))) %>%
  pivot_longer(matches("w\\d+$")) %>% 
  count(value, name) %>% 
  group_by(name) %>%
  mutate(n = n / sum(n) * 100) %>% 
  pivot_wider(names_from = name, values_from = n, values_fill = 0)
# A tibble: 11 × 7
    size value    w1    w2    w3    w4    w5
   <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     3 A      16.7  16.7  16.7   0     0  
 2     3 B      16.7   0     0     0     0  
 3     3 C       0    16.7  16.7   0     0  
 4     3 NA      0     0     0    33.3  33.3  # <---
 5     4 A       0    16.7   0     0     0  
 6     4 B      16.7   0    16.7  16.7   0  
 7     4 C      16.7  16.7  16.7  16.7   0  
 8     4 NA      0     0     0     0    33.3  # <---
 9     5 A      16.7   0     0    16.7  16.7
10     5 B      16.7  16.7  33.3  16.7   0  
11     5 C       0    16.7   0     0    16.7

Not only are there rows that should not be there (marked with # <---); the proportions are also incorrect. The desired output is this:

  size value  w1  w2  w3   w4   w5
1    3  A     50  50  50   ΝΑ   ΝΑ
2    3  B     50   0   0   ΝΑ   ΝΑ
3    3  C      0  50  50   ΝΑ   ΝΑ
4    4  A      0  50   0    0   ΝΑ
5    4  B     50   0  50   50   ΝΑ
6    4  C     50  50  50   50   ΝΑ
7    5  A      0   0   0   50   50
8    5  B  B 100  50 100   50    0
9    5  C  A   0  50   0    0   50

score 1 · Answer 1 · answered Aug 01 '23 at 12:30

Extending a previous answer, you can convert to factor and use prop.table by group:

df %>%
  mutate(across(contains("w"), \(x) factor(x, levels = unique(w1)))) %>% 
  group_by(size) %>% 
  reframe(value = unique(df$w1),
          across(-value, \(x) c(prop.table(table(x))*100)))

output

# A tibble: 9 × 7
   size value    w1    w2    w3    w4    w5
  <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1     3 A        50    50    50   NaN   NaN
2     3 B        50     0     0   NaN   NaN
3     3 C         0    50    50   NaN   NaN
4     4 A         0    50     0     0   NaN
5     4 B        50     0    50    50   NaN
6     4 C        50    50    50    50   NaN
7     5 A        50     0     0    50    50
8     5 B        50    50   100    50     0
9     5 C         0    50     0     0    50

score 1 · Accepted Answer · answered Aug 01 '23 at 12:42

I modify your code a little bit:

library(tidyverse)

df %>%
  pivot_longer(w1:w5, values_drop_na = TRUE) %>% 
  group_by(size, name) %>%
  count(value = factor(value), .drop = FALSE) %>%
  mutate(n = n / sum(n) * 100) %>%
  ungroup() %>%
  pivot_wider(names_from = name, values_from = n)

# # A tibble: 9 × 7
#    size value    w1    w2    w3    w4    w5
#   <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1     3 A        50    50    50    NA    NA
# 2     3 B        50     0     0    NA    NA
# 3     3 C         0    50    50    NA    NA
# 4     4 A         0    50     0     0    NA
# 5     4 B        50     0    50    50    NA
# 6     4 C        50    50    50    50    NA
# 7     5 A        50     0     0    50    50
# 8     5 B        50    50   100    50     0
# 9     5 C         0    50     0     0    50

Compute proportions across multiple columns of character values in grouped data frame

2 Answers2