2

I have a string where I would like to extract spells from a sequence for example,

   A<- c('000001111000', '0110011', '110001')

I would like to get the continuous spell lengths of 0 and 1 in a sequence format. Then using the lengths of the spells I would like to calculate the descriptive statistics like mean, mode, sd etc., (spell_0 and spell_1 are the sequences from the A vector.

For example,

    spell_0  spell_1   mean_spell_0   mean_spell_1

       5-3        4           4               4
       1-2        2-2         1.5             2
        3         2-1         3               1.5

Any suggestions?

user3570187
  • 1,743
  • 3
  • 17
  • 34

5 Answers5

4

Your question includes actually several questions.

From your orignal vector, you first need to get the different sequences, after splitting your strings into characters. This can be achieve with rle as pointed out in comments. Then, for each value ("0" and "1") in your example, you need to get the lengths of each sequence corresponding to the value. You then need to put them in the format you want (though this may not be the most appropriate.

Here is my proposition to do all this:

seqA <- lapply(strsplit(A, ""), rle)

do.call(cbind,lapply(c("0", "1"), # this can be made more general, for example using unique(unlist(strsplit(A, "")))
       function(i){
         do.call(rbind, lapply(seqA, 
                function(x){
                lesSeq <- x$lengths[x$values==i]
                  res <- data.frame(paste(lesSeq, collapse="-"), mean(lesSeq))
                  colnames(res) <- paste(c("spell", "mean_spell"), i, sep="_")
                return(res)
            }))
       }))[, c(1, 3, 2, 4)] # this rearrangment may not be needed...
#  spell_0 spell_1 mean_spell_0 mean_spell_1
#1     5-3       4          4.0          4.0
#2     1-2     2-2          1.5          2.0
#3       3     2-1          3.0          1.5
Cath
  • 23,906
  • 5
  • 52
  • 86
2

You could try something like this:

do.call(rbind,
  lapply(strsplit(A, ""), 
         function(x) {
           lengths <- rle(x)$lengths
           values  <- rle(x)$values
           data.frame(spell_0      = paste(lengths[values == "0"], collapse = "-"),
                      spell_1      = paste(lengths[values == "1"], collapse = "-"),
                      mean_spell_0 = mean(lengths[values == "0"]),
                      mean_spell_1 = mean(lengths[values == "1"]))
           }))

#>   spell_0 spell_1 mean_spell_0 mean_spell_1
#> 1     5-3       4          4.0          4.0
#> 2     1-2     2-2          1.5          2.0
#> 3       3     2-1          3.0          1.5
Allan Cameron
  • 147,086
  • 7
  • 49
  • 87
2

First we extract and count the 0s and 1s:

library(stringr)
spell_0a <- sapply(str_extract_all(A, "0+"), function(x) str_count(x, "0"))
spell_1a <- sapply(str_extract_all(A, "1+"), function(x) str_count(x, "1"))

Then we collapse the results and do the mathematical operations:

df <- data.frame(
# collapse results:
  spell_0 = unlist(lapply(spell_0a, function(x) paste0(x, collapse = "-"))),
  spell_1 = unlist(lapply(spell_1a, function(x) paste0(x, collapse = "-"))),
# calculate means:
  mean_spell_0 = unlist(lapply(spell_0a, function(x) ifelse(length(x)==1, x[1], sum(x[1]+x[2])/2))),
  mean_spell_1 = unlist(lapply(spell_1a, function(x) ifelse(length(x)==1, x[1],sum(x[1]+x[2])/2)))
)

Result:

df
  spell_0 spell_1 mean_spell_0 mean_spell_1
1     5-3       4          4.0          4.0
2     1-2     2-2          1.5          2.0
3       3     2-1          3.0          1.5
Chris Ruehlemann
  • 20,321
  • 4
  • 12
  • 34
1

Here is a tidyverse-friendly solution that avoids the apply functions.

library(tidyverse)
library(stringr)

A <- c('000001111000', '0110011', '110001')

data.frame(A) %>% 
  mutate(A = str_replace_all(A, "01", "0-1"), 
         A = str_replace_all(A, "10", "1-0")) %>% 
  mutate(A_split = str_split(A, "-")) %>% 
  unnest(A_split) %>% 
  mutate(n_0 = str_count(A_split, "0"), 
         n_0 = ifelse(n_0 == 0, NA, n_0), 
         n_1 = str_count(A_split, "1"), 
         n_1 = ifelse(n_1 == 0, NA, n_1)) %>% 
  group_by(A) %>% 
  summarise(spell_0 = paste(na.omit(n_0), collapse = "-"), 
            spell_1 = paste(na.omit(n_1), collapse = "-"), 
            mean_spell_0 = mean(n_0, na.rm = T), 
            mean_spell_1 = mean(n_1, na.rm = T)) 

Result:

#>                A spell_0 spell_1 mean_spell_0 mean_spell_1
#> 1     0-11-00-11     1-2     2-2          1.5          2.0
#> 2 00000-1111-000     5-3       4          4.0          4.0
#> 3       11-000-1       3     2-1          3.0          1.5

Created on 2021-10-25 by the reprex package (v2.0.1)

Kene David Nwosu
  • 828
  • 6
  • 12
1
get_spells <- function(x, char){
  s <- sapply(gregexpr(paste0(char, "+"), x), attr, "match")
  u <- sapply(s, paste0, collapse = "-")
  v <- sapply(s, mean)
  nms <- paste0("spell_", c(char, paste0("mean_", char)))
  setNames(data.frame(u, v) ,nms)
}

do.call(cbind, lapply(0:1, get_spells, x = A))
  spell_0 spell_mean_0 spell_1 spell_mean_1
1     5-3          4.0       4          4.0
2     1-2          1.5     2-2          2.0
3       3          3.0     2-1          1.5

Another way could be:

a <- strsplit(A, "(?<=(.))(?!\\1)", perl=TRUE)


b <- lapply(a, function(x)
  unlist(tapply(nchar(x),sub("(.)+", "\\1", x), function(x)
    c(setNames(paste(x, collapse = '-'), "spell"),
      setNames(mean(x), "mean_spell")))))

d <- type.convert(data.frame(do.call(rbind, b)), as.is = TRUE)
d
  X0.spell X0.mean_spell X1.spell X1.mean_spell
1      5-3           4.0        4           4.0
2      1-2           1.5      2-2           2.0
3        3           3.0      2-1           1.5
Onyambu
  • 67,392
  • 3
  • 24
  • 53