Operate on columns based on a variable

Question

I have the following data

# Example data (dput() output): a 52-row tibble, 26 rows for 2015 followed by
# 26 rows for 2016, with columns year, newly_engaged, qualification, subject,
# grade, c, e and m. Every row has subject "Physics" (factor code 7).
# NOTE(review): m looks like c / e (e.g. 2032 / 17662 = 0.11505), i.e. the
# proportion of each grade within its group -- confirm with the author.
df <- structure(list(year = c(2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
2015L, 2015L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 
2016L), newly_engaged = c(FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), qualification = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L), .Label = c("A2", "AS"), class = "factor"), subject = structure(c(7L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L), .Label = c("Biology", "Chemistry", "Mathematics", 
"Mathematics (Further)", "Mathematics (Pure)", "Mathematics (Statistics)", 
"Physics"), class = "factor"), grade = structure(c(1L, 2L, 3L, 
4L, 5L, 6L, 7L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 2L, 3L, 
4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 2L, 3L, 4L, 5L, 6L, 
7L), .Label = c("S", "A", "B", "C", "D", "E", "No.results"), class = "factor"), 
    c = c(2032L, 3871L, 3728L, 3130L, 2514L, 1796L, 591L, 7694L, 
    5486L, 4885L, 3790L, 2493L, 2734L, 1079L, 2142L, 2082L, 1703L, 
    1273L, 779L, 219L, 4096L, 2880L, 2366L, 1700L, 1139L, 1051L, 
    1807L, 3961L, 3921L, 3237L, 2521L, 1760L, 609L, 8160L, 6661L, 
    7035L, 5934L, 4811L, 6155L, 1009L, 2022L, 2127L, 1664L, 1224L, 
    779L, 192L, 4214L, 3350L, 3336L, 2701L, 2044L, 2280L), e = c(17662L, 
    17662L, 17662L, 17662L, 17662L, 17662L, 17662L, 27082L, 27082L, 
    27082L, 27082L, 27082L, 27082L, 9277L, 9277L, 9277L, 9277L, 
    9277L, 9277L, 9277L, 13232L, 13232L, 13232L, 13232L, 13232L, 
    13232L, 17816L, 17816L, 17816L, 17816L, 17816L, 17816L, 17816L, 
    38756L, 38756L, 38756L, 38756L, 38756L, 38756L, 9017L, 9017L, 
    9017L, 9017L, 9017L, 9017L, 9017L, 17925L, 17925L, 17925L, 
    17925L, 17925L, 17925L), m = c(0.115049258294644, 0.219171101800476, 
    0.211074623485449, 0.177216623258974, 0.142339485901936, 
    0.101687238138376, 0.0334616691201449, 0.2841001403146, 0.202569972675578, 
    0.180378110922384, 0.139945351155749, 0.0920537626467765, 
    0.100952662284912, 0.116309151665409, 0.230893607847364, 
    0.224425999784413, 0.183572275520103, 0.137221084402285, 
    0.0839711113506521, 0.0236067694297726, 0.309552599758162, 
    0.217654171704958, 0.178808948004837, 0.128476420798065, 
    0.0860792019347038, 0.0794286577992745, 0.101425684777728, 
    0.222328244274809, 0.220083071396498, 0.181690615177369, 
    0.14150202065559, 0.0987876066457117, 0.0341827570722946, 
    0.210548044173805, 0.171870162039426, 0.181520280730726, 
    0.153111776241098, 0.124135617710806, 0.158814119104139, 
    0.11189974492625, 0.224243096373517, 0.235887767550183, 0.184540312742597, 
    0.135743595430853, 0.0863923699678385, 0.0212931130087612, 
    0.235090655509066, 0.186889818688982, 0.186108786610879, 
    0.15068340306834, 0.114030683403068, 0.127196652719665)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -52L), .Names = c("year", 
"newly_engaged", "qualification", "subject", "grade", "c", "e", 
"m"))

and I need to take the difference of the corresponding values of m for 2015 and 2016, to show the difference in the proportions of grades assigned from 2015 to 2016. I thought I could use reshape2::dcast to reshape this and dplyr::summarise to compute the difference, but I'm not sure how to use dcast in the first place.

Did you mean `df$m[df$year==2016] - df$m[df$year==2015]`, or with `dplyr`, i.e. `df %>% group_by(year) %>% mutate(n = row_number()) %>% group_by(n) %>% summarise(m = diff(m))`? — akrun, Apr 20 '17 at 15:47
I think that's what I meant, but I get an `Error in rank(x, ties.method = "first", na.last = "keep") : argument "x" is missing, with no default` with your `dplyr` solution — Morpheu5, Apr 20 '17 at 15:55
Is that error with the example you showed? It is working fine for me using dplyr 0.5.0 — akrun, Apr 20 '17 at 15:56

score 1 · Answer 1 · answered Apr 20 '17 at 15:58

The error will happen if we load the plyr library along with dplyr, because some function names are the same in both packages and those functions can get masked by the other package.

# Reproduces the error shown below: mutate() is deliberately taken from plyr
# here instead of dplyr, which is what happens implicitly when plyr masks dplyr.
# NOTE(review): presumably plyr::mutate() evaluates row_number() outside
# dplyr's data context, so its `x` argument is never supplied -- confirm.
df %>%
   group_by(year) %>%
   plyr::mutate(n = row_number()) %>% 
   group_by(n) %>% 
   summarise(m = diff(m))

Error in rank(x, ties.method = "first", na.last = "keep") :
argument "x" is missing, with no default

In that case, specify the `dplyr::` prefix explicitly

# Change in m from 2015 to 2016 for each row position n.
# Rows are sorted first so that position n refers to the same
# (newly_engaged, qualification, subject, grade) combination in both years,
# and so that diff() always computes 2016 - 2015 whatever the input row order.
# Assumes both years contain the same set of combinations.
# Every verb carries the dplyr:: prefix so that a same-named plyr function
# cannot be picked up when plyr is attached as well.
df %>%
  dplyr::arrange(year, newly_engaged, qualification, subject, grade) %>%
  dplyr::group_by(year) %>%
  dplyr::mutate(n = dplyr::row_number()) %>%
  dplyr::group_by(n) %>%
  dplyr::summarise(m = diff(m))
# A tibble: 26 × 2
#      n             m
#   <int>         <dbl>
#1      1 -0.0136235735
#2      2  0.0031571425
#3      3  0.0090084479
#4      4  0.0044739919
#5      5 -0.0008374652
#6      6 -0.0028996315
#7      7  0.0007210880
#8      8 -0.0735520961
#9      9 -0.0306998106
#10    10  0.0011421698
# ... with 16 more rows

I'm pretty sure I don't have plyr loaded, and I still get the error… weird. — Morpheu5, Apr 20 '17 at 16:02
@Morpheu5 You can check the `sessionInfo()` or try the method I suggested with `dplyr::` — akrun, Apr 20 '17 at 16:03

score 1 · Accepted Answer · answered Apr 20 '17 at 15:59

Using dplyr and tidyr you can easily recast your dataframe to give the values of m for 2015 and 2016 alongside each other, and then calculate the difference

library(dplyr)
library(tidyr)
# Build a wide table with one column of m per year, then take 2016 minus 2015.
df2 <- df %>%
  # drop the columns c and e so that they do not split the rows when spreading
  select(-c, -e) %>%
  # one column per year, filled with that year's m
  spread(key = year, value = m) %>%
  # year-on-year change in m
  mutate(diff = `2016` - `2015`)

df2
# A tibble: 26 × 7
   newly_engaged qualification subject      grade     `2015`     `2016`          diff
           <lgl>        <fctr>  <fctr>     <fctr>      <dbl>      <dbl>         <dbl>
1          FALSE            A2 Physics          S 0.11504926 0.10142568 -0.0136235735
2          FALSE            A2 Physics          A 0.21917110 0.22232824  0.0031571425
3          FALSE            A2 Physics          B 0.21107462 0.22008307  0.0090084479
4          FALSE            A2 Physics          C 0.17721662 0.18169062  0.0044739919
5          FALSE            A2 Physics          D 0.14233949 0.14150202 -0.0008374652
6          FALSE            A2 Physics          E 0.10168724 0.09878761 -0.0028996315
7          FALSE            A2 Physics No.results 0.03346167 0.03418276  0.0007210880
8          FALSE            AS Physics          A 0.28410014 0.21054804 -0.0735520961
9          FALSE            AS Physics          B 0.20256997 0.17187016 -0.0306998106
10         FALSE            AS Physics          C 0.18037811 0.18152028  0.0011421698
# ... with 16 more rows

Operate on columns based on a variable

2 Answers