After the answer to @stef I thought I had a better direction. I'm pretty sure this can be improved on but it's much faster than the original and probably O(n log n).
import numpy as np
from pydivsufsort import divsufsort, kasai
from intervaltree import Interval, IntervalTree
def update_ranges(ranges_tree: IntervalTree, new_range: tuple):
"""
This function updates the ranges in the given interval tree.
It removes any intervals from the tree that are fully contained within the new interval.
If the new interval is not fully covered by any existing intervals in the tree, it adds the new interval.
:param ranges_tree: The interval tree to update.
:param new_range: The new interval.
:return: The updated interval tree.
"""
new_interval = Interval(new_range[0], new_range[1])
overlapping_intervals = ranges_tree[new_interval.begin:new_interval.end] # Find overlapping intervals.
# Remove intervals from the IntervalTree that are fully contained within new_interval.
for interval in overlapping_intervals:
if interval.begin >= new_interval.begin and interval.end <= new_interval.end:
ranges_tree.remove(interval)
# If new_interval is not fully covered by any existing intervals, add it to the IntervalTree.
if not any(interval.begin <= new_interval.begin and interval.end >= new_interval.end for interval in overlapping_intervals):
ranges_tree.add(new_interval)
return ranges_tree # Return updated IntervalTree.
def find_unique_repeat_substrings(s: bytes, min_length: int = 4, min_repeats: int = 2):
"""
This function finds unique repeated substrings of a specified minimum length and number of repetitions in the given string.
:param s: The input string.
:param min_length: The minimum length of substrings to consider.
:param min_repeats: The minimum number of repetitions of substrings.
:return: A list of unique repeated substrings that meet the conditions.
"""
if len(s) <= 1:
return []
sa = divsufsort(s)
lcp = kasai(s, sa)
zip_array = np.column_stack((sa, np.roll(lcp, 1, axis=0))) # Roll lcp 1 position and stack with SA.
active_string = False
cnt = 1
string_ranges = IntervalTree()
max_current_str_len = 0
for row in zip_array:
# Evaluating active string; if lcp value is descending, exit active substring.
if active_string and row[1] < max_current_str_len:
if cnt >= min_repeats: # If substring repeated enough times, update string ranges.
string_ranges = update_ranges(string_ranges, (start_str, start_str + str_len ))
active_string = False
cnt = 1
max_current_str_len = 0
# Evaluating active substring; if lcp value is non-descending, increment repeat count.
elif active_string and row[1] >= max_current_str_len:
cnt += 1
max_current_str_len = max(max_current_str_len,row[1])
# New substring found of at least size min_length; start tracking it.
elif row[1] >= min_length and not active_string:
start_str = row[0]
str_len = row[1]
active_string = True
cnt += 1
max_current_str_len = row[1]
# If a new substring was being tracked at the end of the SA/LCP, update string ranges.
if cnt >= min_repeats:
string_ranges = update_ranges(string_ranges, (start_str, start_str + str_len ))
# Convert ranges to strings
# Sorted is used to maintain a consistent order of the strings based on their starting position
strings = sorted([s[x.begin:x.end] for x in string_ranges])
return strings # Return a list of unique repeated substrings that meet the conditions.
import unittest
class TestFindUniqueRepeatSubstrings(unittest.TestCase):
def test_find_unique_repeat_substrings(self):
test_cases = [
{
's': "this string is repeated three times in this sentence. string string .".encode(),
'min_length': 4,
'min_repeats': 2,
'expected_result': [b' string ', b'this s'],
},
{
's': "s st str stri strin string".encode(),
'min_length': 4,
'min_repeats': 2,
'expected_result': [b' str', b'stri', b'trin'],
},
{
's': "banana".encode(),
'min_length': 2,
'min_repeats': 2,
'expected_result': [b'ana'],
},
{
's': "".encode(),
'min_length': 3,
'min_repeats': 2,
'expected_result': [],
},
{
's': "a".encode(),
'min_length': 1,
'min_repeats': 1,
'expected_result': [],
},
{
's': "aa".encode(),
'min_length': 1,
'min_repeats': 2,
'expected_result': [b'a'],
},
]
for case in test_cases:
s = case['s']
min_length = case['min_length']
min_repeats = case['min_repeats']
expected_result = case['expected_result']
try:
result = find_unique_repeat_substrings(s, min_length, min_repeats)
self.assertEqual(result, expected_result)
except AssertionError as e:
error_msg = f"AssertionError in test case:\n" \
f"s: {s}\n" \
f"min_length: {min_length}\n" \
f"min_repeats: {min_repeats}\n" \
f"Expected: {expected_result}\n" \
f"Result: {result}\n"
raise AssertionError(error_msg) from e
suite = unittest.TestLoader().loadTestsFromTestCase(TestFindUniqueRepeatSubstrings)
runner = unittest.TextTestRunner()
runner.run(suite)
.
----------------------------------------------------------------------
Ran 1 test in 0.007s
OK