2

I am working with the R programming language.

I have the following dataset:

library(dplyr)

df = structure(list(ethnicity = c("c", "c", "c", "b", "c", "b", "b", 
"b", "c", "a", "b", "b", "a", "b", "c", "a", "c", "c", "a", "a", 
"a", "a", "c", "b", "c", "b", "a", "b", "c", "b", "a", "c", "c", 
"a", "c", "b", "a", "c", "a", "a", "b", "c", "c", "a", "c", "a", 
"c", "b", "a", "b", "a", "a", "c", "a", "b", "a", "a", "c", "a", 
"b", "a", "c", "a", "c", "b", "c", "b", "b", "c", "b", "b", "c", 
"c", "a", "b", "b", "a", "b", "a", "a", "b", "c", "c", "a", "b", 
"a", "b", "a", "c", "c", "b", "c", "a", "b", "b", "c", "b", "a", 
"c", "c"), number_of_degrees = c(3L, 2L, 2L, 3L, 1L, 1L, 3L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 
3L, 1L, 3L, 3L, 3L, 1L, 3L, 3L, 2L, 2L, 2L, 3L, 3L, 3L, 2L, 1L, 
2L, 1L, 3L, 3L, 2L, 1L, 3L, 1L, 3L, 2L, 2L, 1L, 3L, 2L, 1L, 3L, 
3L, 3L, 1L, 2L, 2L, 1L, 2L, 3L, 3L, 1L, 2L, 1L, 2L, 3L, 3L, 1L, 
3L, 2L, 1L, 1L, 2L, 3L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 3L, 
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 3L, 3L, 2L, 1L, 2L)), class = "data.frame", row.names = c(NA, 
-100L))


df %>%
    # Group the data by number_of_degrees
    group_by(number_of_degrees) %>%
    # Calculate the percentage of each ethnicity within each group
    summarize(
        percent_a = mean(ethnicity == "a") * 100,
        percent_b = mean(ethnicity == "b") * 100,
        percent_c = mean(ethnicity == "c") * 100
    )

This produces the following output:

# A tibble: 3 x 4
  number_of_degrees percent_a percent_b percent_c
              <int>     <dbl>     <dbl>     <dbl>
1                 1      33.3      36.7      30  
2                 2      31.6      21.1      47.4
3                 3      34.4      40.6      25  

My Question: Is there a more "compact" way to write this code such that I don't have to manually write "percent_a","percent_b", etc.? This way, it would be much faster and automatically do it for all values of ethnicity.

ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81
stats_noob
  • 5,401
  • 4
  • 27
  • 83

5 Answers5

2

Probably you can try this base R option (the column names might be a bit different from the desired output)

> aggregate(. ~ number_of_degrees, df, \(x) proportions(table(x)))
  number_of_degrees ethnicity.a ethnicity.b ethnicity.c
1                 1   0.3333333   0.3666667   0.3000000
2                 2   0.3157895   0.2105263   0.4736842
3                 3   0.3437500   0.4062500   0.2500000

or

reshape(
    as.data.frame(proportions(table(df), 2)),
    direction = "wide",
    idvar = "number_of_degrees",
    timevar = "ethnicity"
)

which gives

  number_of_degrees    Freq.a    Freq.b    Freq.c
1                 1 0.3333333 0.3666667 0.3000000
4                 2 0.3157895 0.2105263 0.4736842
7                 3 0.3437500 0.4062500 0.2500000

Or, a less compact option with dplyr (sorry for my limited tidyverse knowledge)

table(rev(df)) %>%
    proportions(1) %>%
    as.data.frame.matrix() %>%
    rownames_to_column(var = "number_of_degrees") %>%
    mutate(number_of_degrees = as.integer(number_of_degrees))

which gives

  number_of_degrees         a         b         c
1                 1 0.3333333 0.3666667 0.3000000
2                 2 0.3157895 0.2105263 0.4736842
3                 3 0.3437500 0.4062500 0.2500000
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81
1

This is one option:

df |>
  summarise(n = n(),.by = c(number_of_degrees,ethnicity)) |>
  mutate(pct = n / sum(n),.by = number_of_degrees) |>
  select(-n) |>
  pivot_wider(names_from = ethnicity,
              values_from = pct,
              names_prefix = 'percent_',
              names_sort = TRUE)
  
joran
  • 169,992
  • 32
  • 429
  • 468
1

Here is a way. It is not more compact but doesn't hard code ethnicity values.

suppressPackageStartupMessages({
  library(dplyr)
  library(tidyr)
})

df %>%
  count(ethnicity, number_of_degrees) %>%
  group_by(number_of_degrees) %>%
  mutate(n = 100*prop.table(n)) %>%
  pivot_wider(number_of_degrees, 
              names_from = ethnicity, 
              names_glue = "percent_{ethnicity}",
              values_from = n)
#> # A tibble: 3 × 4
#> # Groups:   number_of_degrees [3]
#>   number_of_degrees percent_a percent_b percent_c
#>               <int>     <dbl>     <dbl>     <dbl>
#> 1                 1      33.3      36.7      30  
#> 2                 2      31.6      21.1      47.4
#> 3                 3      34.4      40.6      25

Created on 2023-06-02 with reprex v2.0.2

Rui Barradas
  • 70,273
  • 8
  • 34
  • 66
1

Here is a one-liner to get a table out using this data. No packages are used.

100*proportions(table(df[2:1]), 1)

giving:

                 ethnicity
number_of_degrees        a        b        c
                1 33.33333 36.66667 30.00000
                2 31.57895 21.05263 47.36842
                3 34.37500 40.62500 25.00000

You could consider adding a margin to make it clear that the rows add to 100.

addmargins(100*proportions(table(df[2:1]), 1), 2)

giving:

                 ethnicity
number_of_degrees         a         b         c       Sum
                1  33.33333  36.66667  30.00000 100.00000
                2  31.57895  21.05263  47.36842 100.00000
                3  34.37500  40.62500  25.00000 100.00000
G. Grothendieck
  • 254,981
  • 17
  • 203
  • 341
0

using mosaic::percs for percentages. You can use counts(..., format = 'percent')

mosaic::percs(ethnicity~number_of_degrees, df)[-1]

  number_of_degrees   perc_a   perc_b   perc_c
1                 1 33.33333 36.66667 30.00000
2                 2 31.57895 21.05263 47.36842
3                 3 34.37500 40.62500 25.00000
Onyambu
  • 67,392
  • 3
  • 24
  • 53