Hi I'm trying to implement a pairwise alignment algorithm for protein sequences. I have this the exact same algorithm working for DNA sequences but for some reason when I copy it over to and implement with protein it doesn't The problem with the traceback portion which in will post below.
Prot1_Seq = "AAGG";
Prot2_Seq = "AAGG";
cout << "\nSequence 1: " << Prot1_Seq << endl;
int Prot1_len = Prot1_Seq.length();
cout << "\nSequence 2: " << Prot2_Seq << endl;
int Prot2_len = Prot2_Seq.length();
int matrix[Prot1_len+1][Prot2_len+1] {0};
// applying the gap penalty to the outer border columns
for (int i {1}; i <= Prot1_len; i++)
{
matrix[i][0] = gap_pen * i;
}
for (int j {1}; j <= Prot2_len; j++)
{
matrix[0][j] = gap_pen * j;
}
fstream myfile;
myfile.open("100pam.txt", ios::in);
if (!myfile.is_open())
{
cerr << "Error can not open file" << endl;
exit(1);
}
const int len = 20;
int pam_matrix[len][len];
string line;
getline(myfile,line); // get ing rid of the first line of pam matrix file, by using getline to grab it
cout << "\n" << endl;
for (int i = 0; i < len; i++)
{
char Row_first_letter;
myfile >> Row_first_letter; // using the >> to grab the first char of each row leaving the pam matrix with just numbers
for(int j = 0; j < len; j++)
{
myfile >> pam_matrix[i][j];
cout << pam_matrix[i][j] << "\t";
}
cout << endl;
}
cout << endl;
int match_score {};
int mismatch {};
char row_labels [20] {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
char col_labels [20] {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
for(int i {0}; i <= Prot1_len; i++)
{
for(int j {0}; j <= Prot2_len; j++)
{
if (i > 0 && j > 0)
{
if (Prot1_Seq[i-1] == Prot2_Seq[j-1])
{
int row = distance(row_labels, find(row_labels, row_labels + 20, Prot1_Seq[i-1]));
int col = distance(col_labels, find(col_labels, col_labels + 20, Prot2_Seq[j-1]));
match_score = pam_matrix[row][col];
matrix[i][j] = matrix[i-1][j-1] + match_score;
}
else if (Prot1_Seq[i-1] != Prot2_Seq[j-1])
{
//row_labels[i-1] = Prot1_Seq[i-1];
//col_labels[i-1] = Prot2_Seq[i-1];
int row = distance(row_labels, find(row_labels, row_labels + 20, Prot1_Seq[i-1]));
int col = distance(col_labels, find(col_labels, col_labels + 20, Prot2_Seq[j-1]));
mismatch = pam_matrix[row][col];
matrix[i][j] = max({matrix[i-1][j-1] + mismatch,
matrix[i-1][j] + gap_pen ,
matrix[i][j-1] + gap_pen});
}
}
cout << matrix[i][j] << "\t";
}
cout << endl;
}
int terminal = matrix[Prot1_len+1-1][Prot2_len+1-1]; // rows and colums are length+1 long. and to get the edge value it's col/row length -1.
cout << terminal << endl;
int temp = terminal;
int decrement_X = Prot1_len+1-1;
int decrement_Y = Prot2_len+1-1;
string Prot1_aligned = "";
string Prot2_aligned = "";
int inc_seq1 = Prot1_len-1;
int inc_seq2 = Prot2_len-1;
while (!(decrement_X == 0 || decrement_Y == 0))
{//while
if (terminal == matrix[decrement_X][decrement_Y-1] + gap_pen) {
temp = matrix[decrement_X][decrement_Y-1];
cout << temp << endl;
terminal = temp;
decrement_Y--;
Prot1_aligned += "_";
Prot2_aligned += Prot2_Seq[inc_seq2];
inc_seq2--;
}
else if (terminal == matrix[decrement_X-1][decrement_Y] + gap_pen) {
temp = matrix[decrement_X-1][decrement_Y];
cout << temp << endl;
terminal = temp;
decrement_X--;
Prot1_aligned += Prot1_Seq[inc_seq1];
Prot2_aligned += "_";
inc_seq1--;
}
else if (terminal == matrix[decrement_X-1][decrement_Y-1] + match_score) {
temp = matrix[decrement_X-1][decrement_Y-1];
cout << temp << endl;
terminal = temp;
decrement_X--;
decrement_Y--;
Prot1_aligned += Prot1_Seq[inc_seq1];
Prot2_aligned += Prot2_Seq[inc_seq2];
inc_seq1--;
inc_seq2--;
}
else if (terminal == matrix[decrement_X-1][decrement_Y-1] + mismatch) {
temp = matrix[decrement_X-1][decrement_Y-1];
cout << temp << endl;
terminal = temp;
decrement_X--;
decrement_Y--;
Prot1_aligned += Prot1_Seq[inc_seq1];
Prot2_aligned += Prot2_Seq[inc_seq2];
inc_seq1--;
inc_seq2--;
}
} //while
reverse(Prot1_aligned.begin(), Prot1_aligned.end());
reverse(Prot2_aligned.begin(), Prot2_aligned.end());
cout << "Performing alignment" << " .................................................. 100%" << endl;
cout << "Alignment complete! Your aligned sequences are:" << endl;
cout << "Sequence 1: " << Prot1_aligned << endl;
cout << "Sequence 2: " << Prot2_aligned << endl;
}
Let's say I have 2 protein sequences AAGG. This would generate a matrix:
0 -5 -10 -15 -20
-5 4 -1 -6 -11
-10 -1 8 3 -2
-15 -6 3 13 8
-20 -11 -2 8 18
using the traceback code I would expect the values: 18, 13, 8, 4, 0. But for some reason my program just stops at 8 and does not else. Not even seg faulting or quit the program. Any solutions?