Here is an example of a DoFn that processes multiple items concurrently:
class MultiThreadedDoFn(beam.DoFn):
    """DoFn that applies ``func`` to elements on a pool of worker threads.

    Elements handed to ``process`` are wrapped in a ``WindowedValue`` and
    placed on a small bounded input queue. Worker threads take them off,
    call ``func`` on the element value and put the result (carrying the
    original timestamp and window) on an output queue. Finished results are
    emitted from ``process`` when available and the remainder is flushed in
    ``finish_bundle``, so output order is not guaranteed to match input order.

    Args:
        func: Callable taking one element value and returning the new value.
        num_threads: Number of worker threads started in ``setup``.
    """

    def __init__(self, func, num_threads=10):
        self.func = func
        self.num_threads = num_threads

    def setup(self):
        """Create the queues and start the worker threads."""
        # NOTE(review): queues and threads are created here rather than in
        # __init__, presumably because they cannot be pickled with the DoFn
        # -- confirm against the runner in use.
        self.done = False
        # Bounded, so process() blocks (back-pressure) when workers fall behind.
        self.input_queue = queue.Queue(2)
        # Holds (result, error) pairs; exactly one of the two is not None.
        self.output_queue = queue.Queue()
        self.threads = [
            threading.Thread(target=self.work, daemon=True)
            for _ in range(self.num_threads)
        ]
        for t in self.threads:
            t.start()

    def work(self):
        """Worker loop: apply ``self.func`` to queued values until ``done``."""
        while not self.done:
            try:
                windowed_value = self.input_queue.get(timeout=0.1)
            except queue.Empty:
                continue  # timed out; re-check self.done
            try:
                # Use the function given to the constructor, not a global
                # that merely happens to be called ``func``.
                outcome = (
                    windowed_value.with_value(self.func(windowed_value.value)),
                    None,
                )
            except Exception as exc:
                # Hand the failure to the consuming thread. Letting it escape
                # would kill this worker and leave finish_bundle() waiting
                # forever for a result that never arrives.
                outcome = (None, exc)
            self.output_queue.put(outcome)

    def start_bundle(self):
        """Reset the count of elements submitted but not yet emitted."""
        self.pending = 0

    def process(self, element,
                timestamp=beam.DoFn.TimestampParam,
                window=beam.DoFn.WindowParam):
        """Queue ``element`` for the workers and emit any finished results.

        Yields:
            Results that are already available; never waits for a result.

        Raises:
            Exception: Whatever ``func`` raised for an earlier element.
        """
        self.pending += 1
        # Blocks while the input queue is full.
        self.input_queue.put(
            beam.transforms.window.WindowedValue(
                element, timestamp, (window,)))
        while True:
            try:
                outcome = self.output_queue.get(block=False)
            except queue.Empty:
                break
            yield self._unwrap(outcome)

    def finish_bundle(self):
        """Wait for and emit every result still outstanding for the bundle.

        Raises:
            Exception: Whatever ``func`` raised for one of the elements.
        """
        while self.pending > 0:
            yield self._unwrap(self.output_queue.get())

    def teardown(self):
        """Signal the workers to stop and wait for them to exit."""
        self.done = True
        for t in self.threads:
            t.join()

    def _unwrap(self, outcome):
        """Account for one finished element; return its result or re-raise."""
        self.pending -= 1
        result, error = outcome
        if error is not None:
            # NOTE(review): assumes the runner abandons this bundle once an
            # error is raised; results still queued are not drained here.
            raise error
        return result
It can be used as follows:
def func(n):
    """Simulate slow work: sleep for ``n / 10`` seconds, then return ``n + 1``."""
    delay_seconds = n / 10
    time.sleep(delay_seconds)
    incremented = n + 1
    return incremented
# Build and run the example pipeline; leaving the ``with`` block runs it.
with beam.Pipeline() as p:
    # 41 inputs: ten repetitions of 1, 3, 5, 7 followed by a single 9.
    (
        p
        | beam.Create([1, 3, 5, 7] * 10 + [9])
        | beam.ParDo(MultiThreadedDoFn(func))
        # Each result is passed to logging.error, so it appears in the log.
        | beam.Map(logging.error)
    )