0

Objective: From a time-series df, make a plot of each occurrence of a particular state (or factor level) with x timepoints before, and y timepoints after, the onset (i.e. first row) of that state. The graph should be centered on zero (on the x-axis), such that the x timepoints before the event are negative values, and the y timepoints after the event are positive values. This is the same principal as a peristimulus time histogram.

The data: I have time-series data where different states can occur for variable amounts of time. First I use run length encoding (rle) to determine the start and stop of each run of each state (not shown). Second, I use a function, similar to the one described here, to return, say one row above and two rows below the onset of a particular state (state "A" in the example below). Here’s what that data looks like.

df <- data.frame(
  state =      c("A","A","A","A","A","B","A","A","X","Y","Z","A","A","A","B","A","A"),
  start =      c("start","NA","NA","NA","NA","NA","start","NA","NA","NA","NA","start","NA","NA","NA","start","NA"),
  rleGroup =   c("1","1","1","1","1","2","3","3","4","5","6","7","7","7","8","9","9"),
  data = runif(17)
)
df <- df %>% tidyr::unite(stateStart, c(state,start), sep = ".", remove = FALSE)

   stateStart state start rleGroup       data
1     A.start     A start        1 0.85118187
2        A.NA     A    NA        1 0.23502147
3        A.NA     A    NA        1 0.97435662
4        A.NA     A    NA        1 0.45669042
5        A.NA     A    NA        1 0.48271803
6        B.NA     B    NA        2 0.80561653
7     A.start     A start        3 0.27228361
8        A.NA     A    NA        3 0.07008506
9        X.NA     X    NA        4 0.44101076
10       Y.NA     Y    NA        5 0.95173954
11       Z.NA     Z    NA        6 0.65693316
12    A.start     A start        7 0.45831802
13       A.NA     A    NA        7 0.83629347
14       A.NA     A    NA        7 0.62107270
15       B.NA     B    NA        8 0.53294588
16    A.start     A start        9 0.08533221
17       A.NA     A    NA        9 0.28805362

extract.with.context <- function(x, colname, rows, after = 0, before = 0) {
  match.idx  <- which(x[[colname]] %in% rows)
  span       <- seq(from = -before, to = after)
  extend.idx <- c(outer(match.idx, span, `+`))
  extend.idx <- Filter(function(i) i > 0 & i <= nrow(x), extend.idx)
  extend.idx <- sort(unique(extend.idx))
  return(x[extend.idx, , drop = FALSE]) 
}
extracted = extract.with.context(x=df, colname="stateStart", rows=c("A.start"), after = 2, before = 1)

   stateStart state start rleGroup       data
1     A.start     A start        1 0.85118187
2        A.NA     A    NA        1 0.23502147
3        A.NA     A    NA        1 0.97435662
6        B.NA     B    NA        2 0.80561653
7     A.start     A start        3 0.27228361
8        A.NA     A    NA        3 0.07008506
9        X.NA     X    NA        4 0.44101076
11       Z.NA     Z    NA        6 0.65693316
12    A.start     A start        7 0.45831802
13       A.NA     A    NA        7 0.83629347
14       A.NA     A    NA        7 0.62107270
15       B.NA     B    NA        8 0.53294588
16    A.start     A start        9 0.08533221
17       A.NA     A    NA        9 0.28805362

The problem: I want to plot each extracted run of state A (i.e., one timepoint before and two timepoints after A.start). One thought is to make a unique identifier for each run grouping of state "A" (Question 1), and then make a time sequence counter that reflects the desired rows before and after the onset of state A (Question 2).

QUESTION NUMBER 1. Create a unique identifier for each "plotGroup", defined as each run of state A (i.e., a row before A.start and two rows after A.start) I tried this, but it's not quite working

extracted %>% mutate(plotGroup = cumsum(lag(state) == "A" & state != "A"))

it should look like this

extracted$plotGroup <- c("0","0","0","1","1","1","1","2","2","2","2","3","3","3")

   stateStart state start rleGroup       data plotGroup
1     A.start     A start        1 0.85118187         0
2        A.NA     A    NA        1 0.23502147         0
3        A.NA     A    NA        1 0.97435662         0
6        B.NA     B    NA        2 0.80561653         1
7     A.start     A start        3 0.27228361         1
8        A.NA     A    NA        3 0.07008506         1
9        X.NA     X    NA        4 0.44101076         1
11       Z.NA     Z    NA        6 0.65693316         2
12    A.start     A start        7 0.45831802         2
13       A.NA     A    NA        7 0.83629347         2
14       A.NA     A    NA        7 0.62107270         2
15       B.NA     B    NA        8 0.53294588         3
16    A.start     A start        9 0.08533221         3
17       A.NA     A    NA        9 0.28805362         3

QUESTION NUMBER 2. Create a "counter," centered on zero, of one row above and two rows after the A.start This I have no idea how to do! But presumably can make use of "span" in the function This is my desired output

extracted$span <- c("0","1","2","-1","0","1","2","-1","0","1","2","-1","0","1")

   stateStart state start rleGroup       data plotGroup span
1     A.start     A start        1 0.85118187         0    0
2        A.NA     A    NA        1 0.23502147         0    1
3        A.NA     A    NA        1 0.97435662         0    2
6        B.NA     B    NA        2 0.80561653         1   -1
7     A.start     A start        3 0.27228361         1    0
8        A.NA     A    NA        3 0.07008506         1    1
9        X.NA     X    NA        4 0.44101076         1    2
11       Z.NA     Z    NA        6 0.65693316         2   -1
12    A.start     A start        7 0.45831802         2    0
13       A.NA     A    NA        7 0.83629347         2    1
14       A.NA     A    NA        7 0.62107270         2    2
15       B.NA     B    NA        8 0.53294588         3   -1
16    A.start     A start        9 0.08533221         3    0
17       A.NA     A    NA        9 0.28805362         3    1

Ultimate objective: plot data by span for each individual plotgroup

ggplot(data=extracted, aes(x=span, y = data, group = plotGroup)) + geom_line()

enter image description here

redatoms
  • 23
  • 4
  • It's not clear to me from your question or example data what you want to show. What does "onset of each state" mean? I would have thought that meant the first appearance of a given state value, but then I don't know what would come before. Are you imagining the "B" plot would have row 4 at x0 and the prior A values preceding it? – Jon Spring Nov 08 '22 at 19:40
  • Question edited for clarity. I only want to plot state "A", such that the onset (i.e. first row) of a run of state "A" is centered on zero, and one row before and two rows after the onset are also plotted in the line. – redatoms Nov 08 '22 at 19:56
  • Do you potentially want data points to be counted in multiple series, both before one onset and after others? – Jon Spring Nov 09 '22 at 06:20

2 Answers2

0

This approach subdivides the data into plotGroups where each group starts one step before each new A (except for the first grp), and the counter is set at zero for each group's first A. The division point prior is determined by the n in lead(), and we could add a filter to limit the points after.

# edit to fix first group counting
df %>%
  mutate(start = state == "A" & lag(state, default = "") != "A") %>%
  mutate(plotGroup = cumsum(lead(start, n = 1, default = FALSE))) %>%
  group_by(plotGroup) %>%
  mutate(counter = row_number() - row_number()[start]) %>%
  ungroup() %>%
  filter(counter <= 2) %>%
  ggplot(aes(counter, data, group = plotGroup)) +
  geom_line()

Result before plotting:

# A tibble: 14 × 6
   state start rleGroup   data plotGroup counter
   <chr> <lgl> <chr>     <dbl>     <int>   <int>
 1 A     TRUE  1        0.0198         0       0
 2 A     FALSE 1        0.338          0       1
 3 A     FALSE 1        0.635          0       2
 4 B     FALSE 2        0.0138         1      -1
 5 A     TRUE  3        0.218          1       0
 6 A     FALSE 3        0.208          1       1
 7 X     FALSE 4        0.0934         1       2
 8 Z     FALSE 6        0.499          2      -1
 9 A     TRUE  7        0.0417         2       0
10 A     FALSE 7        0.934          2       1
11 A     FALSE 7        0.507          2       2
12 B     FALSE 8        0.555          3      -1
13 A     TRUE  9        0.158          3       0
14 A     FALSE 9        0.437          3       1
Jon Spring
  • 55,165
  • 4
  • 35
  • 53
  • Thanks, but sol’n is not quite right. Compare df2 in OP with df created here (i.e. first five lines). — “counter” needs to start with zero, because first row is start of a run of As. 
— Can counter be made generic (instead of coded as “-2”)? In contrast to this toy example with one timepoint before and two timepoints after the start of each run of As, in the real data it’s something like 50 timepoints before and 100 timepoints after start of each run of As. —Same applies to “plotGroup.” Could this be made generic for any number of rows before/after the onset of state A? – redatoms Nov 09 '22 at 05:15
  • Thanks for explaining, I see now the issue with the first group's numbering. One thing I'm not clear on -- is the data you want to show in your chart a subset of the whole data, the whole data but subdivided, or longer than the original data because points could be part of multiple groups (if more than one timepoint before is included)? – Jon Spring Nov 09 '22 at 06:27
  • Updated answer to align zero with first A per group. – Jon Spring Nov 09 '22 at 07:13
  • Jon, thanks for the comments. I was not posing the question right, so I decided to move up the pipeline and re-write the question. Hope you dont mind. – redatoms Nov 09 '22 at 23:56
  • As far as I can tell my output matches your desired output, right? – Jon Spring Nov 10 '22 at 00:12
  • Yes, it did match my desired output. However, the solution did not generalize to my actual data. So that’s my bad, and why I decided it was necessary to reformulate the question – redatoms Nov 10 '22 at 01:24
0
#Define number of rows you want before and after the zero-centered graph
after <- 2
before <- 1

#made up data 
df <- data.frame(
  state =      c("A","A","A","A","A","B","A","A","X","Y","Z","A","A","A","B","A","A"),
  start =      c("start","NA","NA","NA","NA","NA","start","NA","NA","NA","NA","start","NA","NA","NA","start","NA"),
  rleGroup =   c("1","1","1","1","1","2","3","3","4","5","6","7","7","7","8","9","9"),
  data = runif(17)
)
df <- df %>% tidyr::unite(stateStart, c(state,start), sep = ".", remove = FALSE)

#extract the rows before and after the onset of a particular state
extract.with.context <- function(x, colname, rows, after = 0, before = 0) {
  match.idx  <- which(x[[colname]] %in% rows)
  span       <- seq(from = -before, to = after)
  extend.idx <- c(outer(match.idx, span, `+`))
  extend.idx <- Filter(function(i) i > 0 & i <= nrow(x), extend.idx)
  extend.idx <- sort(unique(extend.idx))
  return(x[extend.idx, , drop = FALSE]) 
}

extracted.df = extract.with.context(x=df, colname="stateStart", rows=c("A.start"), after = after, before = before)

# Create plotGroup
# if we go off starting cue = T/F, and start counting when lead (by "before") is T, 
# then we should get correct plotGroup, regardless whether the desired state is in first row or not 
boo <- extracted.df %>%
  dplyr::mutate(start2 = state == "A" & lag(state, default = "") != "A") %>%
  mutate(plotGroup = cumsum(lead(start2, n = before, default = FALSE)))

#create the counter/sequence to zero the graph 
counter <- rep(NA, times = length(boo$start)) # make an empty counter
starts <- which(boo$start == "start") # find the start positions
counter[starts] <- 0

for(i in 1:after){ # for every position after a start, up to "after"
  indexes <- starts + i # index of positions "i" after the start
  indexes_1 <- indexes[which(indexes %in% 1:length(counter))] # indexes can run over the length of the counter - we only want indexes that are within the length of the counter
  counter[indexes_1] <- i # for those indexes, put in the count, i
}
for(i in 1:before){ # same as for "after", but in reverse for "before"
  indexes <- starts - i
  indexes_1 <- indexes[which(indexes %in% 1:length(counter))]
  counter[indexes_1] <- -i
}

boo$span <- counter
boo
   stateStart state start rleGroup       data start2 plotGroup span
1     A.start     A start        1 0.22771277   TRUE         0    0
2        A.NA     A    NA        1 0.39769158  FALSE         0    1
3        A.NA     A    NA        1 0.42416120  FALSE         0    2
6        B.NA     B    NA        2 0.06402964  FALSE         1   -1
7     A.start     A start        3 0.22233942   TRUE         1    0
8        A.NA     A    NA        3 0.77667057  FALSE         1    1
9        X.NA     X    NA        4 0.36675437  FALSE         1    2
11       Z.NA     Z    NA        6 0.49100719  FALSE         2   -1
12    A.start     A start        7 0.26012695   TRUE         2    0
13       A.NA     A    NA        7 0.88900224  FALSE         2    1
14       A.NA     A    NA        7 0.59714172  FALSE         2    2
15       B.NA     B    NA        8 0.15040234  FALSE         3   -1
16    A.start     A start        9 0.85581300   TRUE         3    0
17       A.NA     A    NA        9 0.15780435  FALSE         3    1

# plot 
ggplot(data=boo, aes(x=span, y = data, group = plotGroup)) + 
  geom_line()
redatoms
  • 23
  • 4
  • Your answer could be improved with additional supporting information. Please [edit] to add further details, such as citations or documentation, so that others can confirm that your answer is correct. You can find more information on how to write good answers [in the help center](/help/how-to-answer). – Community Nov 15 '22 at 04:55