I have been trying to understand the tradeoff between read
and seek
. For small "jumps" reading unneeded data is faster than skipping it with seek
.
While timing different read/seek chunk sizes to find the tipping point, I came across a odd phenomenon: read(1)
is about 20 times slower than read(2)
, read(3)
, etc. This effect is the same for different read methods, e.g. read()
and readinto()
.
Why is this the case?
Search in the timing results for the following line 2/3 of the way through:
2 x buffered 1 byte readinto bytearray
Environment:
Python 3.5.2 |Continuum Analytics, Inc.| (default, Jul 5 2016, 11:45:57) [MSC v.1900 32 bit (Intel)]
Timing results:
Non-cachable binary data ingestion (file object blk_size = 8192):
- 2 x buffered 0 byte readinto bytearray:
robust mean: 6.01 µs +/- 377 ns
min: 3.59 µs
- Buffered 0 byte seek followed by 0 byte readinto:
robust mean: 9.31 µs +/- 506 ns
min: 6.16 µs
- 2 x buffered 4 byte readinto bytearray:
robust mean: 14.4 µs +/- 6.82 µs
min: 2.57 µs
- 2 x buffered 7 byte readinto bytearray:
robust mean: 14.5 µs +/- 6.76 µs
min: 3.08 µs
- 2 x buffered 2 byte readinto bytearray:
robust mean: 14.5 µs +/- 6.77 µs
min: 3.08 µs
- 2 x buffered 5 byte readinto bytearray:
robust mean: 14.5 µs +/- 6.76 µs
min: 3.08 µs
- 2 x buffered 3 byte readinto bytearray:
robust mean: 14.5 µs +/- 6.73 µs
min: 2.57 µs
- 2 x buffered 49 byte readinto bytearray:
robust mean: 14.5 µs +/- 6.72 µs
min: 2.57 µs
- 2 x buffered 6 byte readinto bytearray:
robust mean: 14.6 µs +/- 6.76 µs
min: 3.08 µs
- 2 x buffered 343 byte readinto bytearray:
robust mean: 15.3 µs +/- 6.43 µs
min: 3.08 µs
- 2 x buffered 2401 byte readinto bytearray:
robust mean: 138 µs +/- 247 µs
min: 4.11 µs
- Buffered 7 byte seek followed by 7 byte readinto:
robust mean: 278 µs +/- 333 µs
min: 15.4 µs
- Buffered 3 byte seek followed by 3 byte readinto:
robust mean: 279 µs +/- 333 µs
min: 14.9 µs
- Buffered 1 byte seek followed by 1 byte readinto:
robust mean: 279 µs +/- 334 µs
min: 15.4 µs
- Buffered 2 byte seek followed by 2 byte readinto:
robust mean: 279 µs +/- 334 µs
min: 15.4 µs
- Buffered 4 byte seek followed by 4 byte readinto:
robust mean: 279 µs +/- 334 µs
min: 15.4 µs
- Buffered 49 byte seek followed by 49 byte readinto:
robust mean: 281 µs +/- 336 µs
min: 14.9 µs
- Buffered 6 byte seek followed by 6 byte readinto:
robust mean: 281 µs +/- 337 µs
min: 15.4 µs
- 2 x buffered 1 byte readinto bytearray:
robust mean: 282 µs +/- 334 µs
min: 17.5 µs
- Buffered 5 byte seek followed by 5 byte readinto:
robust mean: 282 µs +/- 338 µs
min: 15.4 µs
- Buffered 343 byte seek followed by 343 byte readinto:
robust mean: 283 µs +/- 340 µs
min: 15.4 µs
- Buffered 2401 byte seek followed by 2401 byte readinto:
robust mean: 309 µs +/- 373 µs
min: 15.4 µs
- Buffered 16807 byte seek followed by 16807 byte readinto:
robust mean: 325 µs +/- 423 µs
min: 15.4 µs
- 2 x buffered 16807 byte readinto bytearray:
robust mean: 457 µs +/- 558 µs
min: 16.9 µs
- Buffered 117649 byte seek followed by 117649 byte readinto:
robust mean: 851 µs +/- 1.08 ms
min: 15.9 µs
- 2 x buffered 117649 byte readinto bytearray:
robust mean: 1.29 ms +/- 1.63 ms
min: 18 µs
Benchmarking code:
from _utils import BenchmarkResults
from timeit import timeit, repeat
import gc
import os
from contextlib import suppress
from math import floor
from random import randint
### Configuration
FILE_NAME = 'test.bin'
r = 5000
n = 100
reps = 1
chunk_sizes = list(range(7)) + [7**x for x in range(1,7)]
results = BenchmarkResults(description = 'Non-cachable binary data ingestion')
### Setup
FILE_SIZE = int(100e6)
# remove left over test file
with suppress(FileNotFoundError):
os.unlink(FILE_NAME)
# determine how large a file needs to be to not fit in memory
gc.collect()
try:
while True:
data = bytearray(FILE_SIZE)
del data
FILE_SIZE *= 2
gc.collect()
except MemoryError:
FILE_SIZE *= 2
print('Using file with {} GB'.format(FILE_SIZE / 1024**3))
# check enough data in file
required_size = sum(chunk_sizes)*2*2*reps*r
print('File size used: {} GB'.format(required_size / 1024**3))
assert required_size <= FILE_SIZE
# create test file
with open(FILE_NAME, 'wb') as file:
buffer_size = int(10e6)
data = bytearray(buffer_size)
for i in range(int(FILE_SIZE / buffer_size)):
file.write(data)
# read file once to try to force it into system cache as much as possible
from io import DEFAULT_BUFFER_SIZE
buffer_size = 10*DEFAULT_BUFFER_SIZE
buffer = bytearray(buffer_size)
with open(FILE_NAME, 'rb') as file:
bytes_read = True
while bytes_read:
bytes_read = file.readinto(buffer)
blk_size = file.raw._blksize
results.description += ' (file object blk_size = {})'.format(blk_size)
file = open(FILE_NAME, 'rb')
### Benchmarks
setup = \
"""
# random seek to avoid advantageous starting position biasing results
file.seek(randint(0, file.raw._blksize), 1)
"""
read_read = \
"""
file.read(chunk_size)
file.read(chunk_size)
"""
seek_seek = \
"""
file.seek(buffer_size, 1)
file.seek(buffer_size, 1)
"""
seek_read = \
"""
file.seek(buffer_size, 1)
file.read(chunk_size)
"""
read_read_timings = {}
seek_seek_timings = {}
seek_read_timings = {}
for chunk_size in chunk_sizes:
read_read_timings[chunk_size] = []
seek_seek_timings[chunk_size] = []
seek_read_timings[chunk_size] = []
for j in range(r):
#file.seek(0)
for chunk_size in chunk_sizes:
buffer = bytearray(chunk_size)
read_read_timings[chunk_size].append(timeit(read_read, setup, number=reps, globals=globals()))
#seek_seek_timings[chunk_size].append(timeit(seek_seek, setup, number=reps, globals=globals()))
seek_read_timings[chunk_size].append(timeit(seek_read, setup, number=reps, globals=globals()))
for chunk_size in chunk_sizes:
results['2 x buffered {} byte readinto bytearray'.format(chunk_size)] = read_read_timings[chunk_size]
#results['2 x buffered {} byte seek'.format(chunk_size)] = seek_seek_timings[chunk_size]
results['Buffered {} byte seek followed by {} byte readinto'.format(chunk_size, chunk_size)] = seek_read_timings[chunk_size]
### Cleanup
file.close()
os.unlink(FILE_NAME)
results.show()
results.save()
Edit 2020-02-24:
@finefoot requested the _utils
package to be able to run above code.
from collections import OrderedDict
from math import ceil
from statistics import mean, stdev
from contextlib import suppress
import os
import inspect
class BenchmarkResults(OrderedDict):
def __init__(self, *args, description='Benchmark Description', **kwArgs):
self.description = description
return super(BenchmarkResults, self).__init__(*args, **kwArgs)
def __repr__(self):
"""Shows the results for the benchmarks in order of ascending duration"""
characteristic_durations = []
for name, timings in self.items():
try:
characteristic_durations.append(_robust_stats(timings)[0])
except ValueError:
if len(timings) > 1:
characteristic_durations.append(mean(timings))
else:
characteristic_durations.append(timings[0])
indx = _argsort(characteristic_durations)
repr = '{}:\n'.format(self.description)
items = list(self.items())
for i in indx:
name, timings = items[i]
repr += '- {}:\n'.format(name)
try:
stats = _robust_stats(timings)
repr += ' robust mean: {} +/- {}\n'.format(_units(stats[0]), _units(stats[1]))
except ValueError:
repr += ' timings: {}\n'.format(', '.join(map(_units, timings)))
if len(timings) > 1:
repr += ' min: {}\n'.format(_units(min(timings)))
return repr
def show(self):
print(self)
def save(self):
caller = inspect.stack()[1]
filename = os.path.splitext(caller.filename)[0] + '.log'
with open(filename, 'w') as logfile:
logfile.write(repr(self))
def _units(seconds, significant_figures=3):
fmt = '{:.%sg} {}' % significant_figures
if seconds > 1:
return fmt.format(seconds, 's')
elif seconds > 1e-3:
return fmt.format(seconds*1e3, 'ms')
elif seconds > 1e-6:
return fmt.format(seconds*1e6, 'µs')
elif seconds < 1e-6:
return fmt.format(seconds*1e9, 'ns')
elif seconds > 60:
return fmt.format(seconds/60, 'min')
else:
return fmt.format(seconds/3600, 'hrs')
raise ValueError()
def _robust_stats(timings, fraction_to_use=0.8):
if len(timings) < 5:
raise ValueError('To calculate a robust mean, you need at least 5 timing results')
elts_to_prune = int(len(timings) * (1 - fraction_to_use))
# prune at least the highest and the lowest result
elts_to_prune = elts_to_prune if elts_to_prune > 2 else 2
# round to even number --> symmetic pruning
offset = ceil(elts_to_prune / 2)
# sort the timings
timings.sort()
# prune the required fraction of the elements
timings = timings[offset:-offset]
return mean(timings), stdev(timings)
def _argsort(seq):
# http://stackoverflow.com/questions/3071415/efficient-method-to-calculate-the-rank-vector-of-a-list-in-python
return sorted(range(len(seq)), key=seq.__getitem__)
if __name__ == '__main__':
pass