Another base way.
s <- c("ACTTTA", "TTGATG", "CTTACG", "GTACGT")
dna <- c("A", "C", "G", "T")
lup <- setNames(asplit(diag(length(dna)), 1), dna)
lapply(strsplit(s, "", TRUE), \(x) unlist(lup[x], FALSE, FALSE))
#[[1]]
# [1] 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0
#
#[[2]]
# [1] 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0
#
#[[3]]
# [1] 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0
#
#[[4]]
# [1] 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1
Or using gsub
. <- gsub("A", "1000", s)
. <- gsub("C", "0100", .)
. <- gsub("G", "0010", .)
. <- gsub("T", "0001", .)
cbind(s, .)
# s .
#[1,] "ACTTTA" "100001000001000100011000"
#[2,] "TTGATG" "000100010010100000010010"
#[3,] "CTTACG" "010000010001100001000010"
#[4,] "GTACGT" "001000011000010000100001"
lapply(strsplit(., "", TRUE), as.integer)
#[[1]]
# [1] 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0
#
#[[2]]
# [1] 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0
#
#[[3]]
# [1] 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0
#
#[[4]]
# [1] 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1
Or
lapply(strsplit(s, "", TRUE), \(x) c(diag(length(dna))[,match(x, dna)]))
Or
matrix(diag(4)[,unlist(lapply(strsplit(s, "", TRUE), match, dna), FALSE, FALSE)], length(s), byrow = TRUE)
Or
. <- chartr("ACGT", "1-4", s)
. <- strsplit(., "", TRUE)
matrix(diag(4)[, as.integer(unlist(.))], length(s), byrow = TRUE)
Or
. <- paste(s, collapse="")
. <- chartr("ACGT", "1-4", .)
matrix(diag(1L, 4L)[, utf8ToInt(.) - utf8ToInt("0")], length(s), byrow = TRUE)
Or
matrix(diag(1L, 4L)[,rep(1:4, utf8ToInt("ACGT") - 64)[utf8ToInt(paste(s, collapse="")) - 64]], length(s), byrow = TRUE)
Benchmark
library(magrittr) #For Konrad
s <- c("ACTTTA", "TTGATG", "CTTACG", "GTACGT")
dna <- c("A", "C", "G", "T")
bench::mark(check=FALSE,
GKi1 = {lup <- setNames(asplit(diag(length(dna)), 1), dna)
lapply(strsplit(s, "", TRUE), \(x) unlist(lup[x], FALSE, FALSE))},
GKi2 = {. <- gsub("A", "1000", s, fixed=TRUE)
. <- gsub("C", "0100", ., fixed=TRUE)
. <- gsub("G", "0010", ., fixed=TRUE)
gsub("T", "0001", ., fixed=TRUE)},
GKi3 = lapply(strsplit(s, "", TRUE), \(x) c(diag(length(dna))[,match(x, dna)])),
GKi4 = matrix(diag(4)[,unlist(lapply(strsplit(s, "", TRUE), match, dna), FALSE, FALSE)], length(s), byrow = TRUE),
GKi5 = matrix(diag(4)[, as.integer(unlist(strsplit(chartr("ACGT", "1-4", s), "", TRUE)))], length(s), byrow = TRUE),
GKi6 = matrix(diag(1L, 4L)[, utf8ToInt(chartr("ACGT", "1-4", paste(s, collapse=""))) - utf8ToInt("0")], length(s), byrow = TRUE),
GKi7 = matrix(diag(1L, 4L)[,rep(1:4, utf8ToInt("ACGT") - 64)[utf8ToInt(paste(s, collapse="")) - 64]], length(s), byrow = TRUE),
Konrad = {
strsplit(s, '') |>
lapply(match, table = c('A', 'C', 'G', 'T')) |>
unlist() %>%
{replace(diag(0L, 4L, length(.)), cbind(., seq_along(.)), 1L)}
},
Thomas = matrix(t(diag(4)[match(unlist(lapply(s, utf8ToInt)), utf8ToInt("ACGT")), ]), nrow = length(dna), byrow = TRUE)
)
Result
expression min median itr/s…¹ mem_al…² gc/se…³ n_itr n_gc total…⁴ result
<bch:expr> <bch:t> <bch:t> <dbl> <bch:by> <dbl> <int> <dbl> <bch:t> <list>
1 GKi1 23µs 26.31µs 33203. 3.96MB 69.9 9979 21 300.5ms <NULL>
2 GKi2 6.97µs 8.27µs 109392. 0B 98.5 9991 9 91.3ms <NULL>
3 GKi3 12.39µs 15.41µs 51404. 169.84KB 87.5 9983 17 194.2ms <NULL>
4 GKi4 8.21µs 10.67µs 80760. 1.59KB 121. 9985 15 123.6ms <NULL>
5 GKi5 7.03µs 8.43µs 108751. 6.2KB 76.2 9993 7 91.9ms <NULL>
6 GKi6 6.2µs 7.62µs 118818. 864B 95.1 9992 8 84.1ms <NULL>
7 GKi7 5.64µs 7.46µs 113596. 1.08KB 90.9 9992 8 88ms <NULL>
8 Konrad 11.18µs 13.91µs 60428. 8.92KB 103. 9983 17 165.2ms <NULL>
9 Thomas 8.4µs 10.75µs 77019. 6.34KB 69.4 9991 9 129.7ms <NULL>