Goodmorning everyone,
I am trying to implement my own Smith-Waterman algorithm using R programming, but I am having some trouble in the traceback part, that is, given a matrix m storing the scores, I have to select the cell with the maximum value and then going left, up or diagonal in order to select the cells with the highest values, until we met zero.
Given two strings a and b, a match value, a mismatch value and a gap value:
a_splitted <- unlist(strsplit(a, ""))
b_splitted <- unlist(strsplit(b, ""))
cols <- nchar(a) + 1
rows <- nchar(b) + 1
m <- matrix(0, rows, cols)
coln <- c("", a_splitted)
rown<- c("", b_splitted)
colnames(m) <- coln
rownames(m) <- rown
m1 <- m
for (i in 2:nrow(m)){
for (j in 2:ncol(m)){
score1 <- m[i,j-1] + gap
score2 <- m[i-1,j] + gap
if(colnames(m)[j] == rownames(m)[i]){
score3 <- m[i-1,j-1] + match
} else {
score3 <- m[i-1,j-1] + mismatch
}
scores <- c(score1, score2, score3)
m[i,j] <- max(scores)
m1[i,j] <-which(scores == m[i,j])[1]
if (m[i,j]<0){
m[i,j] <- 0
}
}
}
' This is what I have done for the traceback part, but it takes hours to run and I still don't know if it really works.
Traceback:
i <- length(a_splitted2)
j <- length(b_splitted2)
ax <- character()
ay <- character()
while (m1[i -1,j -1] != 0){
## case 1: best was seq.x[i] aligned to seq.y[j]
s <- m1[i-1,j-1]
if (a_splitted2[i] == b_splitted2[j]) {
s <- s + match
} else {
s <- s + mismatch
}
if (s == m1[i,j]) {
ax <- c(a_splitted2[i], ax)
ay <- c(b_splitted2[j], ay)
i <- i - 1
j <- j - 1
next
}
## case 2: best was seq.x[i] aligned to -
if ((m1[i-1,j] + gap) == m1[i,j]) {
ax <- c(a_splitted2[i], ax)
ay <- c("-", ay)
i <- i-1
next
}
## case 3: best was seq.y[j] aligned to -
if ((m1[i,j-1] + gap) == m1[i,j]) {
ax <- c("-", ax)
ay <- c(b_splitted2[j], ay)
j <- j-1
next
}
}
Any hint is appreciated. Thank You a lot,
Davide