This is my example so far:
import io
import mmap
import os
import time

# 20 MB test file
filename = "random.bin"
if not os.path.isfile(filename):
    with open(filename, "wb") as f:
        for _ in range(20):
            f.write(os.urandom(1_000_000))

signature = b"\x01\x02\x03\x04"

print("Method 1:")
start_time = time.time()
offsets = []
with open(filename, "rb") as f:
buf = b"\x00" + f.read(len(signature) - 1)
for offset, byte in enumerate(iter(lambda: f.read(1), b"")):
buf = buf[1:] + byte
if buf == signature:
offsets.append(offset)
print(f"{time.time() - start_time:.2f} seconds")
print(offsets)
print("Method 2:")
start_time = time.time()
offsets = []
with open(filename, "rb") as f, mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
for offset in range(len(mm)):
if mm[offset : offset + len(signature)] == signature:
offsets.append(offset)
print(f"{time.time() - start_time:.2f} seconds")
print(offsets)
print("Method 3:")
start_time = time.time()
offsets = []
with open(filename, "rb") as f:
offset = 0
chunk = f.read(len(signature) - 1)
while True:
chunk = chunk[-(len(signature) - 1) :] + f.read1(io.DEFAULT_BUFFER_SIZE)
if len(chunk) < len(signature):
# EOF
break
for i in range(len(chunk) - (len(signature) - 1)):
if chunk[i : i + len(signature)] == signature:
offsets.append(offset + i)
offset += len(chunk) - (len(signature) - 1)
print(f"{time.time() - start_time:.2f} seconds")
print(offsets)
I'm searching through a 20 MB test file, looking for a 4-byte signature. Switching to method 2 with mmap.mmap saved 50% of the runtime, but it's still really slow, especially since my actual target file will be between 1 and 10 GB (which is why I don't just load the whole file into memory first). It is orders of magnitude slower than md5sum random.bin.
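For scale, this is roughly the baseline I'm comparing against. It's only a sketch: it hashes the file with hashlib.md5 from inside Python instead of shelling out to md5sum, which I'm assuming gives a comparable number since both just read the whole file once and do the work in C:

import hashlib

start_time = time.time()
md5 = hashlib.md5()
with open(filename, "rb") as f:
    # Read and hash in 1 MiB blocks.
    for block in iter(lambda: f.read(1 << 20), b""):
        md5.update(block)
print(f"{time.time() - start_time:.2f} seconds (md5: {md5.hexdigest()})")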
Edit: I've added another method (method 3 above), which doesn't use f.read(1) but reads chunks of io.DEFAULT_BUFFER_SIZE bytes, and also uses read1 to prevent any blocking. But it's still not faster.
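One variant I'm considering but haven't benchmarked yet (an untested sketch, labelled "method 4" here only for illustration) would keep the chunked reading but replace the inner byte-by-byte comparison loop with bytes.find, on the assumption that this pushes the scan itself into C:

print("Method 4 (untested sketch):")
start_time = time.time()
offsets = []
with open(filename, "rb") as f:
    offset = 0  # file position of the start of the current chunk
    tail = b""  # last len(signature) - 1 bytes of the previous chunk
    while True:
        chunk = f.read(io.DEFAULT_BUFFER_SIZE)
        if not chunk:
            break
        buf = tail + chunk
        pos = buf.find(signature)
        while pos != -1:
            # buf starts len(tail) bytes before the current chunk.
            offsets.append(offset - len(tail) + pos)
            pos = buf.find(signature, pos + 1)
        tail = buf[-(len(signature) - 1):]
        offset += len(chunk)
print(f"{time.time() - start_time:.2f} seconds")
print(offsets)

The tail carry-over is meant to catch matches that span a chunk boundary, the same way method 3 does.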