If the number of different values in the array is not too large, you may consider storing arrays as long as the input array, one per unique value, counting the number of appearances of the value until each point. Then you just need to subtract the end values from the beginning values to find how many frequency matches are there:
def range_freq_queries(seq, k, queries):
n = len(seq)
c = freq_counts(seq)
result = [0] * len(queries)
offset = 0
for i, (l, r) in enumerate(queries):
result[i] = range_freq_matches(c, offset, l, r, k, n)
offset = result[i]
return result
def freq_counts(seq):
s = {v: i for i, v in enumerate(set(seq))}
counts = [None] * (len(seq) + 1)
counts[0] = [0] * len(s)
for i, v in enumerate(seq, 1):
counts[i] = list(counts[i - 1])
j = s[v]
counts[i][j] += 1
return counts
def range_freq_matches(counts, offset, start, end, k, n):
start, end = sorted(((start + offset) % n, (end + offset) % n))
num = 0
return sum(1 for cs, ce in zip(counts[start], counts[end + 1]) if ce - cs == k)
seq = [7, 6, 6, 5, 5]
k = 2
queries = [(0, 4), (3, 0), (4, 1)]
print(range_freq_queries(seq, k, queries))
# [2, 1, 1]
You can do it faster with NumPy, too. Since each result depends on the previous one, you will have to loop in any case, but you can use Numba to really accelerate things up:
import numpy as np
import numba as nb
def range_freq_queries_np(seq, k, queries):
seq = np.asarray(seq)
c = freq_counts_np(seq)
return _range_freq_queries_np_nb(seq, k, queries, c)
@nb.njit # This is not necessary but will make things faster
def _range_freq_queries_np_nb(seq, k, queries, c):
n = len(seq)
offset = np.int32(0)
out = np.empty(len(queries), dtype=np.int32)
for i, (l, r) in enumerate(queries):
l = (l + offset) % n
r = (r + offset) % n
l, r = min(l, r), max(l, r)
out[i] = np.sum(c[r + 1] - c[l] == k)
offset = out[i]
return out
def freq_counts_np(seq):
uniq = np.unique(seq)
seq_pad = np.concatenate([[uniq.max() + 1], seq])
comp = seq_pad[:, np.newaxis] == uniq
return np.cumsum(comp, axis=0)
seq = np.array([7, 6, 6, 5, 5])
k = 2
queries = [(0, 4), (3, 0), (4, 1)]
print(range_freq_queries_np(seq, k, queries))
# [2 1 2]
Let's compare it with the original algorithm:
from collections import Counter
def range_freq_queries_orig(seq, k, queries):
n = len(seq)
ans = 0
counter = Counter()
out = [0] * len(queries)
for i, (l, r) in enumerate(queries):
l += ans
l %= n
r += ans
r %= n
if l > r:
l, r = r, l
counter.clear()
counter.update(seq[l:r+1])
ans = sum(1 for v in counter.values() if v == k)
out[i] = ans
return out
Here is a quick test and timing:
import random
import numpy
# Make random input
random.seed(0)
seq = random.choices(range(1000), k=5000)
queries = [(random.choice(range(len(seq))), random.choice(range(len(seq))))
for _ in range(20000)]
k = 20
# Input as array for NumPy version
seq_arr = np.asarray(seq)
# Check all functions return the same result
res1 = range_freq_queries_orig(seq, k, queries)
res2 = range_freq_queries(seq, k, queries)
print(all(r1 == r2 for r1, r2 in zip(res1, res2)))
# True
res3 = range_freq_queries_np(seq_arr, k, queries)
print(all(r1 == r3 for r1, r3 in zip(res1, res3)))
# True
# Timings
%timeit range_freq_queries_orig(seq, k, queries)
# 3.07 s ± 1.11 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit range_freq_queries(seq, k, queries)
# 1.1 s ± 307 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit range_freq_queries_np(seq_arr, k, queries)
# 265 ms ± 726 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
Obviously the effectiveness of this depends on the characteristics of the data. In particular, if there are fewer repeated values the time and memory cost to construct the counts table will approach O(n2).