1

I have a series of soccer results and wish to find out how many points a team has scored in a particular number of games

Here is the head of a subset with the cumulative points scored during a season since the latest result

I have been wrist=-slapped a couple of times for not using dput so bear with length

allData <- structure(list(team = c("Arsenal", "Tottenham H", "Tottenham H", 
"Arsenal", "Arsenal", "Tottenham H"), venue = c("H", "A", "H", 
"A", "H", "A"), result = c("W", "D", "W", "L", "W", "D"), GF = c(1L, 
0L, 3L, 1L, 3L, 0L), GA = c(0L, 0L, 1L, 2L, 0L, 0L), gameDate = structure(c(1333868400, 
1333782000, 1333263600, 1333177200, 1332572400, 1332572400), class = c("POSIXct", 
"POSIXt"), tzone = ""), season = structure(c(2L, 2L, 2L, 2L, 
2L, 2L), .Label = c("2010/2011", "2011/2012"), class = "factor"), 
 points = c(3, 1, 3, 0, 3, 1), GD = c(1L, 0L, 2L, -1L, 3L, 
0L), cumpts = c(3, 1, 4, 3, 6, 5)), .Names = c("team", "venue", 
"result", "GF", "GA", "gameDate", "season", "points", "GD", "cumpts"
), row.names = c(NA, 6L), class = "data.frame")

and here is the data for one team during one season

spurs <- structure(list(team = c("Tottenham H", "Tottenham H", "Tottenham H", 
"Tottenham H", "Tottenham H", "Tottenham H", "Tottenham H", "Tottenham H", 
"Tottenham H", "Tottenham H", "Tottenham H", "Tottenham H", "Tottenham H", 
"Tottenham H", "Tottenham H", "Tottenham H", "Tottenham H", "Tottenham H", 
"Tottenham H", "Tottenham H", "Tottenham H", "Tottenham H", "Tottenham H", 
"Tottenham H", "Tottenham H", "Tottenham H", "Tottenham H", "Tottenham H", 
"Tottenham H", "Tottenham H", "Tottenham H", "Tottenham H"), 
    venue = c("A", "H", "A", "H", "A", "H", "A", "H", "A", "H", 
    "A", "H", "H", "H", "A", "A", "H", "H", "A", "H", "A", "H", 
    "A", "H", "A", "A", "H", "A", "H", "A", "H", "A"), result = c("D", 
    "W", "D", "D", "L", "L", "L", "W", "D", "W", "L", "D", "W", 
    "W", "D", "W", "D", "W", "L", "W", "W", "W", "W", "W", "W", 
    "D", "W", "W", "W", "W", "L", "L"), GF = c(0L, 3L, 0L, 1L, 
    0L, 1L, 2L, 5L, 0L, 3L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 
    3L, 3L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 4L, 2L, 1L, 0L), GA = c(0L, 
    1L, 0L, 1L, 1L, 3L, 5L, 0L, 0L, 1L, 3L, 1L, 0L, 0L, 1L, 0L, 
    1L, 0L, 2L, 0L, 1L, 0L, 1L, 1L, 1L, 2L, 1L, 1L, 0L, 0L, 5L, 
    3L), gameDate = structure(c(1333782000, 1333263600, 1332572400, 
    1332313200, 1331366400, 1330848000, 1330243200, 1328947200, 
    1328515200, 1327996800, 1327219200, 1326528000, 1326268800, 
    1325577600, 1325318400, 1324972800, 1324540800, 1324281600, 
    1323590400, 1322899200, 1322294400, 1321862400, 1320562800, 
    1319958000, 1319353200, 1318748400, 1317538800, 1316847600, 
    1316329200, 1315638000, 1314514800, 1313996400), class = c("POSIXct", 
    "POSIXt"), tzone = ""), season = structure(c(2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2010/2011", 
    "2011/2012"), class = "factor"), points = c(1, 3, 1, 1, 0, 
    0, 0, 3, 1, 3, 0, 1, 3, 3, 1, 3, 1, 3, 0, 3, 3, 3, 3, 3, 
    3, 1, 3, 3, 3, 3, 0, 0), GD = c(0L, 2L, 0L, 0L, -1L, -2L, 
    -3L, 5L, 0L, 2L, -1L, 0L, 2L, 1L, 0L, 2L, 0L, 1L, -1L, 3L, 
    2L, 2L, 2L, 2L, 1L, 0L, 1L, 1L, 4L, 2L, -4L, -3L), cumpts = c(1, 
    4, 5, 6, 6, 6, 6, 9, 10, 13, 13, 14, 17, 20, 21, 24, 25, 
    28, 28, 31, 34, 37, 40, 43, 46, 47, 50, 53, 56, 59, 59, 59
    )), .Names = c("team", "venue", "result", "GF", "GA", "gameDate", 
"season", "points", "GD", "cumpts"), row.names = c(NA, -32L), class = "data.frame")

I then have this code on the spurs dataframe to calculate points scored in specific game lengths(here 5)

gameLength <- 5
seasonLength <- nrow(spurs)
cumPoints <- c()
cumPoints[1] <- spurs[gameLength,]$cumpts
for (i in gameLength+1:seasonLength) {
cumPoints[i-(gameLength-1)] <- ((spurs[i,]$cumpts)- 
 (spurs[i-gameLength,]$cumpts))
}
cumPoints <- cumPoints[!is.na(cumPoints)] # not sure why throws up NAs

This produces the correct output

 [1]  6  5  2  4  4  7  7  8  8 10  8 11 11 11  8 10 10 12 12 15 15
[22] 13 13 13 13 13 12  9

but I need to be able to transform the allData with a column containing this data for every season and team in the dataframe.

I am assuming I should use ddply somehow, unless there is a better alternative

joran
  • 169,992
  • 32
  • 429
  • 468
pssguy
  • 3,455
  • 7
  • 38
  • 68
  • What does the line `for (i in gameLength+1:seasonLength) {` have to do with game length? It is just acting on all of the games except for the first 4. Where is the game length in the data – David Robinson Apr 10 '12 at 18:38
  • gameLength is the run of games I am interested in. So here I am interested in points gained by a team in a 5 game stretch. Spurs most recent five games have garnered them 1,3,1,1,0 points totalling 6. The previous one they lost so 0 replaces the 1 and cumPoints[2] is 5 etc. – pssguy Apr 10 '12 at 19:06
  • The operator precedence of ":" versus "+" will trip you up every time with code like this `for (i in gameLength+1:seasonLength)`. You need to get int the practice of using paired parens to avoid that problem. – IRTFM Apr 10 '12 at 21:43

1 Answers1

1

To replicate your output:

library(zoo)
rollapply(spurs$GD, gamelength, sum)

And if you have allData that looks like the spurs data.frame...

rollsum <- function(df, gamelen=gamelength) {
  require(zoo)
  out <- rollapply(df$points, gamelen, sum)

  return(out)
}

ddply(allData, .(team), rollsum)
Justin
  • 42,475
  • 9
  • 93
  • 111
  • Thanks Justin. This appears to have set me on correct path. I want to split by both team and season and this produces an error "Error in list_to_dataframe(res, attr(.data, "split_labels")) : Results do not have equal lengths" This is correct as season length varies. The solution appears to be to use dlply and do some hacking to create a dataframe. May be a better way but seems to work – pssguy Apr 11 '12 at 03:36