I am having trouble with both runtime and memory usage when using Python to build a suffix tree via Ukkonen's algorithm (linear time and space).
Currently, I want to create a suffix tree for a bacterial genome of roughly 1.6 million characters. The alphabet consists of 11 different characters (12 including the terminator '#'). The file itself is 1.6 MB, but the tree in Python uses more than 2 GB of RAM and takes about one minute to build. I am storing child nodes in a fixed-length list with one slot per symbol (12 slots in the code below), where each position corresponds to a character of the alphabet. I have tried using dictionaries instead, since the list approach stores a lot of None values. However, this results in even larger memory usage and takes a little longer. I am not even storing the sequence itself in the nodes, only the corresponding intervals. I am no expert: is this overhead produced by all of the objects? Can it be reduced?
Any suggestions appreciated.
Below is the constructor of a SuffixNode in the tree. As you can see, children are stored in a list. The edge that connects a node to its parent is stored only as _start and _end.
class SuffixNode:
    """One node of the suffix tree.

    The edge leading from the parent into this node is not stored as a
    string but as the interval ``(_start, _end)``.

    NOTE(review): with millions of nodes, declaring ``__slots__`` would drop
    the per-instance ``__dict__`` and noticeably reduce memory. Doing so
    requires removing the class-level ``_index`` default below, because a
    slot may not share its name with a class variable.
    """

    _index = -1

    def __init__(self, start, end, suffixlink=None):
        """Create a node whose incoming edge is the interval (start, end).

        start      -- first position of the incoming edge label
        end        -- last position of the incoming edge label
        suffixlink -- node this node's suffix link points to (default None)
        """
        self._start = start
        self._end = end
        self._index = -1
        # Previously the argument was accepted but silently discarded and the
        # link was always initialised to None.
        self._suffixlink = suffixlink
        # One slot per symbol: getChildPosition() maps symbols to 0..11.
        self._children = [None] * 12
The algorithm itself, although this might not be very helpful:
class SuffixTree(object):
def __init__(self, sequence):
    """Build the suffix tree of ``sequence`` followed by the terminator '#'.

    sequence -- the text to index; only the symbols known to
                getChildPosition() are supported.

    Positions handed to the helper methods are 1-based: position ``i``
    refers to ``self.sequence[i - 1]``.
    """
    self.end = len(sequence) + 1
    self.sequence = sequence + '#'

    # Auxiliary node above the root: it has the root as its child for every
    # symbol, and the root's suffix link points to it.
    self.virt_root = SuffixNode(-1, -1)
    self.root = SuffixNode(0, 0)
    for symbol in 'ACGTMNSYKRW#':
        self.virt_root.setChild(self.root, self.getChildPosition(symbol))
    self.root.setSuffixLink(self.virt_root)

    # Read the first symbol from the terminated string so that an empty
    # ``sequence`` yields the tree of '#' instead of raising IndexError.
    first_char = self.sequence[0]
    self.root.setChild(SuffixNode(1, self.end, self.root),
                       self.getChildPosition(first_char))

    # Insert the remaining symbols one at a time, carrying (s, k) over from
    # one step to the next.
    s = self.root
    k = 2
    bar = progressbar.ProgressBar(maxval=len(self.sequence))
    bar.start()
    for i in range(2, len(self.sequence) + 1):
        bar.update(i)
        s, k = self._update(s, k, i - 1, i)
    bar.finish()

    self.index(self.root, self.end)
def getChildPosition(self, char):
    """Return the child-list slot (0..11) that belongs to ``char``.

    The slot order is A, C, G, T, M, N, S, Y, K, R, W, '#'.
    Returns None when ``char`` is not exactly one of these symbols.
    """
    # The guard keeps multi-character and empty strings (which str.find
    # would happily locate) mapped to None, like any other unknown input.
    if not isinstance(char, str) or len(char) != 1:
        return None
    slot = 'ACGTMNSYKRW#'.find(char)
    return slot if slot != -1 else None
def _update(self, s, k, p, pos):
    """Insert the symbol at 1-based position ``pos`` into the tree.

    s, k, p -- the current point, given as node ``s`` plus the positions
               ``k..p`` of the text that still have to be walked from it
    pos     -- 1-based position of the symbol being inserted

    Returns the pair ``(s, k)`` to continue from on the next call.
    """
    symbol = self.sequence[pos - 1]
    slot = self.getChildPosition(symbol)
    previous = self.root

    while True:
        s, k = self._canonize(s, k, p)
        finished, branch = self._testAndSplit(s, k, p, symbol)
        if finished:
            break
        # Hang a new leaf for ``symbol`` below the branching node.
        branch.setChild(SuffixNode(pos, self.end, self.root), slot)
        # Chain the branching node created one step earlier to this one.
        if previous != self.root:
            previous.setSuffixLink(branch)
        previous = branch
        s = s.getSuffixLink()

    # The last branching node created in this call links to the node where
    # the loop stopped.
    if previous != self.root:
        previous.setSuffixLink(s)
    return s, k
def _canonize(self, s, k, p):
length = p-k+1
while length > 0:
key = self.sequence[k-1]
child = s.getChild(self.getChildPosition(key))
edgeLength = child.getEdgeLength()
if edgeLength > length or child.isLeaf():
break
k = k+edgeLength
length = p-k+1
s = child
return s, k
def _testAndSplit(self, s, k, p, current_char):
    """Check whether ``current_char`` already continues the point (s, k..p).

    Returns ``(True, s)`` when it does. Otherwise returns ``(False, node)``,
    where ``node`` is the node that has to receive a new child: ``s``
    itself when the point is exactly at ``s``, or a node freshly inserted
    into the edge below ``s``.
    """
    walked = p - k + 1

    if walked == 0:
        # The point is exactly at ``s``: only its child slot decides.
        existing = s.getChild(self.getChildPosition(current_char))
        return existing is not None, s

    # The point lies ``walked`` symbols into the edge chosen by ``edge_char``.
    edge_char = self.sequence[k - 1]
    edge_slot = self.getChildPosition(edge_char)
    child = s.getChild(edge_slot)
    edge_start = child.getStart()
    next_on_edge = self.sequence[edge_start + walked - 1]
    if current_char == next_on_edge:
        return True, s

    # Cut the edge: the new node takes over its first ``walked`` symbols and
    # the old child keeps the rest.
    middle = SuffixNode(edge_start, edge_start + walked - 1, self.root)
    middle.setChild(child, self.getChildPosition(next_on_edge))
    child.setStart(edge_start + walked)
    s.setChild(middle, edge_slot)
    return False, middle
def index(self, node, offset):
    """Assign an index to every leaf in the subtree below ``node``.

    node   -- node whose descendants are visited
    offset -- value from which the edge lengths on the way down are
              subtracted; a leaf receives ``offset`` minus the summed edge
              lengths on the path from ``node`` to it

    Returns None.

    The traversal uses an explicit stack rather than recursion, so a deep
    tree cannot exhaust Python's recursion limit.
    """
    pending = [(node, offset)]
    while pending:
        parent, parent_offset = pending.pop()
        for child in parent.getChildren():
            if child is None:
                continue
            child_offset = parent_offset - child.getEdgeLength()
            if child.isLeaf():
                child.setIndex(child_offset)
            else:
                # The recursive version stored its own return value here,
                # which was always None; that result is kept unchanged.
                # NOTE(review): confirm whether internal nodes were meant
                # to receive a real index instead.
                child.setIndex(None)
                pending.append((child, child_offset))
_update: updates the tree when inserting the next character.
_canonize: walks towards the leaves as far as possible to reach the nearest node where a branch point could be.
_testAndSplit: tests whether an edge must be split and, if so, creates a new node.
index: generates the index for the leaves of the tree.