0

I have the following data

# Example data: a 52-row tibble with the columns year, newly_engaged,
# qualification, subject, grade, c, e and m.
# - year is 2015 for the first 26 rows and 2016 for the last 26.
# - subject is "Physics" (factor level 7) in every row.
# - grade has no "S" level for the "AS" qualification rows, so each
#   year/newly_engaged pair has 7 A2 rows followed by 6 AS rows.
# NOTE(review): m looks like c / e (e.g. 2032 / 17662 = 0.11505) -- confirm.
df <- structure(list(year = c(2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
2015L, 2015L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 
2016L), newly_engaged = c(FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), qualification = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L), .Label = c("A2", "AS"), class = "factor"), subject = structure(c(7L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L), .Label = c("Biology", "Chemistry", "Mathematics", 
"Mathematics (Further)", "Mathematics (Pure)", "Mathematics (Statistics)", 
"Physics"), class = "factor"), grade = structure(c(1L, 2L, 3L, 
4L, 5L, 6L, 7L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 2L, 3L, 
4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 2L, 3L, 4L, 5L, 6L, 
7L), .Label = c("S", "A", "B", "C", "D", "E", "No.results"), class = "factor"), 
    c = c(2032L, 3871L, 3728L, 3130L, 2514L, 1796L, 591L, 7694L, 
    5486L, 4885L, 3790L, 2493L, 2734L, 1079L, 2142L, 2082L, 1703L, 
    1273L, 779L, 219L, 4096L, 2880L, 2366L, 1700L, 1139L, 1051L, 
    1807L, 3961L, 3921L, 3237L, 2521L, 1760L, 609L, 8160L, 6661L, 
    7035L, 5934L, 4811L, 6155L, 1009L, 2022L, 2127L, 1664L, 1224L, 
    779L, 192L, 4214L, 3350L, 3336L, 2701L, 2044L, 2280L), e = c(17662L, 
    17662L, 17662L, 17662L, 17662L, 17662L, 17662L, 27082L, 27082L, 
    27082L, 27082L, 27082L, 27082L, 9277L, 9277L, 9277L, 9277L, 
    9277L, 9277L, 9277L, 13232L, 13232L, 13232L, 13232L, 13232L, 
    13232L, 17816L, 17816L, 17816L, 17816L, 17816L, 17816L, 17816L, 
    38756L, 38756L, 38756L, 38756L, 38756L, 38756L, 9017L, 9017L, 
    9017L, 9017L, 9017L, 9017L, 9017L, 17925L, 17925L, 17925L, 
    17925L, 17925L, 17925L), m = c(0.115049258294644, 0.219171101800476, 
    0.211074623485449, 0.177216623258974, 0.142339485901936, 
    0.101687238138376, 0.0334616691201449, 0.2841001403146, 0.202569972675578, 
    0.180378110922384, 0.139945351155749, 0.0920537626467765, 
    0.100952662284912, 0.116309151665409, 0.230893607847364, 
    0.224425999784413, 0.183572275520103, 0.137221084402285, 
    0.0839711113506521, 0.0236067694297726, 0.309552599758162, 
    0.217654171704958, 0.178808948004837, 0.128476420798065, 
    0.0860792019347038, 0.0794286577992745, 0.101425684777728, 
    0.222328244274809, 0.220083071396498, 0.181690615177369, 
    0.14150202065559, 0.0987876066457117, 0.0341827570722946, 
    0.210548044173805, 0.171870162039426, 0.181520280730726, 
    0.153111776241098, 0.124135617710806, 0.158814119104139, 
    0.11189974492625, 0.224243096373517, 0.235887767550183, 0.184540312742597, 
    0.135743595430853, 0.0863923699678385, 0.0212931130087612, 
    0.235090655509066, 0.186889818688982, 0.186108786610879, 
    0.15068340306834, 0.114030683403068, 0.127196652719665)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -52L), .Names = c("year", 
"newly_engaged", "qualification", "subject", "grade", "c", "e", 
"m"))

and I need to take the difference of the corresponding values of m for 2015 and 2016, to show the difference in the proportions of grades assigned from 2015 to 2016. I thought I could reshape2::cast this and dplyr::summarise to compute the difference, but I'm not sure how to use cast in the first place.

Morpheu5
  • 2,610
  • 6
  • 39
  • 72
  • Did you mean `df$m[df$year==2016] - df$m[df$year==2015]`, or with `dplyr`, i.e. `df %>% group_by(year) %>% mutate(n = row_number()) %>% group_by(n) %>% summarise(m = diff(m))`? – akrun Apr 20 '17 at 15:47
  • I think that's what I meant, but I get an `Error in rank(x, ties.method = "first", na.last = "keep") : argument "x" is missing, with no default` with your `dplyr` solution – Morpheu5 Apr 20 '17 at 15:55
  • Is that error with the example you showed. It is working fine with me using dplyr 0.5.0 – akrun Apr 20 '17 at 15:56

2 Answers

1

The error happens if the plyr library is loaded along with dplyr: several function names are the same in both packages, so one package's functions can get masked by the other's. Calling plyr's mutate instead of dplyr's reproduces it:

# Deliberately failing example: the mutate step below is plyr's, not
# dplyr's, which reproduces the rank() error quoted after this snippet.
df %>%
   group_by(year) %>%
   # plyr::mutate() stands in for what happens when plyr masks dplyr::mutate()
   plyr::mutate(n = row_number()) %>% 
   group_by(n) %>% 
   summarise(m = diff(m)) 

Error in rank(x, ties.method = "first", na.last = "keep") :
argument "x" is missing, with no default

In that case, specify the dplyr:: namespace explicitly

# Working version: every dplyr verb is called through dplyr:: so that a
# same-named function from another attached package (such as plyr) cannot
# mask it.
df %>%
  dplyr::group_by(year) %>%
  # number the rows 1, 2, ... within each year
  dplyr::mutate(n = row_number()) %>%
  # pair up the rows that sit at the same position in each year
  dplyr::group_by(n) %>%
  # diff() is the later row's m minus the earlier row's; the 2015 rows come
  # before the 2016 rows in df, so this is the 2016 - 2015 change
  dplyr::summarise(m = diff(m))
# A tibble: 26 × 2
#      n             m
#   <int>         <dbl>
#1      1 -0.0136235735
#2      2  0.0031571425
#3      3  0.0090084479
#4      4  0.0044739919
#5      5 -0.0008374652
#6      6 -0.0028996315
#7      7  0.0007210880
#8      8 -0.0735520961
#9      9 -0.0306998106
#10    10  0.0011421698
# ... with 16 more rows
akrun
  • 874,273
  • 37
  • 540
  • 662
1

Using dplyr and tidyr you can easily recast your dataframe to give the values of m for 2015 and 2016 alongside each other, and then calculate the difference

library(dplyr)
library(tidyr)
# Put each year's m side by side and compute the 2016 - 2015 change.
df2 <- df %>%
  # c and e differ between the two years, so they must be dropped; otherwise
  # the 2015 and 2016 rows of a group could not collapse into a single row
  select(-c(c, e)) %>%
  # one row per newly_engaged / qualification / subject / grade combination,
  # with the m values in the columns `2015` and `2016`
  # (pivot_wider() is the replacement for the superseded spread())
  pivot_wider(names_from = year, values_from = m) %>%
  mutate(diff = `2016` - `2015`)

df2
# A tibble: 26 × 7
   newly_engaged qualification subject      grade     `2015`     `2016`          diff
           <lgl>        <fctr>  <fctr>     <fctr>      <dbl>      <dbl>         <dbl>
1          FALSE            A2 Physics          S 0.11504926 0.10142568 -0.0136235735
2          FALSE            A2 Physics          A 0.21917110 0.22232824  0.0031571425
3          FALSE            A2 Physics          B 0.21107462 0.22008307  0.0090084479
4          FALSE            A2 Physics          C 0.17721662 0.18169062  0.0044739919
5          FALSE            A2 Physics          D 0.14233949 0.14150202 -0.0008374652
6          FALSE            A2 Physics          E 0.10168724 0.09878761 -0.0028996315
7          FALSE            A2 Physics No.results 0.03346167 0.03418276  0.0007210880
8          FALSE            AS Physics          A 0.28410014 0.21054804 -0.0735520961
9          FALSE            AS Physics          B 0.20256997 0.17187016 -0.0306998106
10         FALSE            AS Physics          C 0.18037811 0.18152028  0.0011421698
# ... with 16 more rows
Andrew Gustar
  • 17,295
  • 1
  • 22
  • 32