Both the edit1 and edit2 function above are within 1 or 2 edit distance not exactly 1 or 2 edit distance. I made these 3 functions where all_strings_editx
will return strings (random order) of exactly x edit distance away from the input string.
def all_strings_within_edit1(sequence, bases='ATCG'):
"""
All edits that are one edit away from `sequence`
using a dictionary of bases.
Parameters
----------
sequence: str
bases: str
Returns
-------
sequences: list of str
"""
splits = [(sequence[:i], sequence[i:]) for i in range(len(sequence) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
# In the original code, transpose counts one edit distance
# We count it as two edit distances, so it's not included here
# transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
replaces = [L + c + R[1:] for L, R in splits if R for c in bases]
inserts = [L + c + R for L, R in splits for c in bases]
return deletes + replaces + inserts
def all_strings_within_editx(sequence, bases='ATCG', edit_distance=1):
"""
Return all strings with a give edit distance away from
Parameters
----------
sequence: str
bases: str
edit_distance: int
Returns
-------
sequences: set of str
"""
if edit_distance == 0:
return {sequence}
elif edit_distance == 1:
return set(all_strings_within_edit1(sequence, bases=bases))
else:
return set(
e2 for e1 in all_strings_within_editx(
sequence, bases=bases, edit_distance=edit_distance-1)
for e2 in all_strings_within_edit1(e1, bases=bases)
)
def all_strings_editx(sequence, bases='ATCG', edit_distance=1):
"""
Return all strings of a give edit distance away from `sequence`
Parameters
----------
sequence: str
bases: str
edit_distance: int
Returns
-------
result: generator of str
"""
if edit_distance == 0:
return [sequence]
all_editx_minus1 = all_strings_within_editx(
sequence, bases=bases, edit_distance=edit_distance-1)
return (
e2 for e1 in all_editx_minus1
for e2 in all_strings_within_edit1(e1, bases=bases)
if e2 not in all_editx_minus1
)