I had an earlier problem where I was searching for a substring by iterating over the string and comparing slices. It turns out that this is a really bad idea for performance: `str.find`
is much faster. But I don't understand why.
import random
import string
import timeit
# Haystack to search: one million random lowercase ASCII letters (~1 MB).
haystack = "".join(
    random.choices(population=string.ascii_lowercase, k=1_000_000)
)
def f():
    """Return every index in ``haystack`` where ``needle`` occurs (slicing).

    For each start offset, a slice of ``len(needle)`` characters is taken
    from ``haystack`` and compared against ``needle``. Both names are read
    from module scope.
    """
    return [
        start
        for start in range(len(haystack))
        if haystack[start : start + len(needle)] == needle
    ]
def g():
    """Return every index in ``haystack`` where ``needle`` occurs (startswith).

    Uses ``str.startswith`` with a start offset, so the comparison is made
    at each position without taking a slice in Python code. ``haystack`` and
    ``needle`` are read from module scope.
    """
    return [
        start
        for start in range(len(haystack))
        if haystack.startswith(needle, start)
    ]
def h():
    """Return every index in ``haystack`` where ``needle`` occurs (str.find).

    Repeatedly calls ``haystack.find`` and resumes one character after each
    hit, so overlapping occurrences are reported as well. ``haystack`` and
    ``needle`` are read from module scope.
    """
    hits = []
    offset = haystack.find(needle, 0)
    # str.find returns -1 once there is no further occurrence.
    while offset >= 0:
        hits.append(offset)
        offset = haystack.find(needle, offset + 1)
    return hits
# Benchmark configuration: every candidate is timed over `number` calls while
# searching the module-level `haystack` for `needle`.
number = 100
needle = "abcd"

# The slicing implementation provides the reference result.
expectation = f()

# Iterate over the function objects themselves instead of building source
# strings for eval(); timeit accepts a zero-argument callable directly.
for func in (f, g, h):
    # Sanity check: all implementations must report the same positions.
    # Raised explicitly so the check still runs under `python -O`.
    if func() != expectation:
        raise AssertionError(f"{func.__name__}() disagrees with f()")
    # Total wall-clock seconds for `number` calls of this candidate.
    t = timeit.timeit(func, number=number)
    print(func.__name__, t)
Results (total seconds for 100 calls of each function):
f 26.46937609199813
g 16.11952730899793
h 0.07721933699940564