0

Here is a minimal example: a reduced version of my dataset of books, with one word per row.

structure(list(word = c("in", "großer", "erregung", "umstehen", 
"bauersleute", "knechte", "und", "mägde", "das", "gehöft", 
"des", "servaz", "amareller", "bauers", "im", "hemmernmoos", 
"und", "besprechen", "den", "einleitung", "lieber", "leser", 
"weißt", "du", "was", "das", "wort", "greenhorn", "bedeutet", 
"eine", "höchst", "ärgerliche", "und", "despektierliche", "bezeichnung", 
"für", "denjenigen", "auf", "zum", "alm", "öhi", "hinauf", 
"vom", "freundlichen", "dorfe", "maienfeld", "führt", "ein", 
"fußweg", "durch", "grüne", "baumreiche", "fluren", "bis", 
"zum", "fuße", "der"), word_id = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 
16L, 17L, 18L, 19L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 
11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L), book = c("bergrichters", 
"bergrichters", "bergrichters", "bergrichters", "bergrichters", 
"bergrichters", "bergrichters", "bergrichters", "bergrichters", 
"bergrichters", "bergrichters", "bergrichters", "bergrichters", 
"bergrichters", "bergrichters", "bergrichters", "bergrichters", 
"bergrichters", "bergrichters", "winnetou", "winnetou", "winnetou", 
"winnetou", "winnetou", "winnetou", "winnetou", "winnetou", "winnetou", 
"winnetou", "winnetou", "winnetou", "winnetou", "winnetou", "winnetou", 
"winnetou", "winnetou", "winnetou", "winnetou", "heidilehr", 
"heidilehr", "heidilehr", "heidilehr", "heidilehr", "heidilehr", 
"heidilehr", "heidilehr", "heidilehr", "heidilehr", "heidilehr", 
"heidilehr", "heidilehr", "heidilehr", "heidilehr", "heidilehr", 
"heidilehr", "heidilehr", "heidilehr")), row.names = c(NA, -57L
), groups = structure(list(word = c("alm", "amareller", "ärgerliche", 
"auf", "bauers", "bauersleute", "baumreiche", "bedeutet", "besprechen", 
"bezeichnung", "bis", "das", "den", "denjenigen", "der", "des", 
"despektierliche", "dorfe", "du", "durch", "ein", "eine", "einleitung", 
"erregung", "fluren", "freundlichen", "führt", "für", "fuße", 
"fußweg", "gehöft", "greenhorn", "großer", "grüne", "hemmernmoos", 
"hinauf", "höchst", "im", "in", "knechte", "leser", "lieber", 
"mägde", "maienfeld", "öhi", "servaz", "umstehen", "und", "vom", 
"was", "weißt", "wort", "zum"), .rows = structure(list(40L, 
    13L, 32L, 38L, 14L, 5L, 52L, 29L, 18L, 35L, 54L, c(9L, 26L
    ), 19L, 37L, 57L, 11L, 34L, 45L, 24L, 50L, 48L, 30L, 20L, 
    3L, 53L, 44L, 47L, 36L, 56L, 49L, 10L, 28L, 2L, 51L, 16L, 
    42L, 31L, 15L, 1L, 6L, 22L, 21L, 8L, 46L, 41L, 12L, 4L, c(7L, 
    17L, 33L), 43L, 25L, 23L, 27L, c(39L, 55L)), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, 53L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

These books have no sections, so for my analysis I want to add an artificial column called `section` that splits each book (i.e. grouped by `book`) into 10 consecutive chunks of roughly equal size, based on the actual number of rows in that book, and numbers the chunks sequentially from 1 to 10.

With dplyr I could not find a solution and I don't know how to achieve this. Any suggestions? Thanks!

Grig
  • 109
  • 1
  • 10

1 Answer

1

One way is to use `cut()` on the row positions within each book to divide every book into 10 sections.

library(dplyr)

# Assign every row to one of 10 consecutive sections within its book.
# group_by(book) makes row_number() restart at 1 for each book; cut() with
# breaks = 10 divides that range of positions into 10 equal-width intervals,
# and labels = FALSE returns the interval index (1-10) instead of a factor.
# Note: a book with fewer than 10 rows cannot fill all 10 sections.
df %>%
  group_by(book) %>%
  mutate(section = cut(row_number(), breaks = 10, labels = FALSE)) %>%
  # Drop the grouping so later verbs are not silently applied per book.
  ungroup()

#   word        word_id book         section
#   <chr>         <int> <chr>          <int>
# 1 in                1 bergrichters       1
# 2 großer            2 bergrichters       1
# 3 erregung          3 bergrichters       2
# 4 umstehen          4 bergrichters       2
# 5 bauersleute       5 bergrichters       3
# 6 knechte           6 bergrichters       3
# 7 und               7 bergrichters       4
# 8 mägde             8 bergrichters       4
# 9 das               9 bergrichters       5
#10 gehöft           10 bergrichters       5
# … with 47 more rows

This can also be done in base R using `ave()`:

# Base R version: ave() applies the function to the rows of each book and
# returns the results aligned with the original row order.
df[["section"]] <- ave(
  df[["word_id"]],
  df[["book"]],
  FUN = function(ids) {
    # Bin the positions 1..length(ids) into 10 equal-width intervals;
    # labels = FALSE yields the integer interval index rather than a factor.
    cut(seq_along(ids), breaks = 10, labels = FALSE)
  }
)
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213