An alternative approach that uses multi-processing instead of multi-threading is shown below.
We found empirically that multi-threading does not yield particularly large performance gains compared with multi-processing, probably because of Python's global interpreter lock (GIL).
This code assumes a file that lists TypeQL queries, one per line, which are independent of each other and can therefore be parallelised freely.
from typedb.client import TypeDB, TypeDBClient, SessionType, TransactionType
import multiprocessing as mp
import queue
def batch_writer(database, kill_event, batch_queue):
    """Worker loop: write batches of insert queries taken from a queue.

    Opens a client to ``localhost:1729`` and a DATA session on ``database``,
    then repeatedly pulls a batch from ``batch_queue`` and runs every query
    of that batch inside one WRITE transaction, committing once per batch.

    Args:
        database: Name of the database to open the session on.
        kill_event: Event set by the producer once it has finished enqueuing.
        batch_queue: Queue of batches; each batch is an iterable of query
            strings.

    The worker only exits when ``kill_event`` is set AND a ``get`` has timed
    out on an empty queue, so batches that are still queued when the event
    is set are written rather than dropped.
    """
    # Context managers close the session and the client even if a
    # transaction raises.
    with TypeDB.core_client("localhost:1729") as client:
        with client.session(database, SessionType.DATA) as session:
            while True:
                try:
                    batch = batch_queue.get(block=True, timeout=1)
                except queue.Empty:
                    # Nothing arrived within the timeout: stop if the
                    # producer is done, otherwise keep waiting.
                    if kill_event.is_set():
                        break
                    continue
                with session.transaction(TransactionType.WRITE) as tx:
                    for query in batch:
                        tx.query().insert(query)
                    tx.commit()
    print("Received kill event, exiting worker.")
def start_writers(database, kill_event, batch_queue, parallelism=4):
    """Spawn ``parallelism`` writer processes and return them.

    Every process runs ``batch_writer`` with the same ``database``,
    ``kill_event`` and ``batch_queue`` arguments. The processes are already
    started when the list is returned.
    """
    worker_args = (database, kill_event, batch_queue)
    workers = [
        mp.Process(target=batch_writer, args=worker_args)
        for _ in range(parallelism)
    ]
    for worker in workers:
        worker.start()
    return workers
def batch(iterable, n=1000):
    """Yield successive chunks of at most ``n`` items from ``iterable``.

    Sized, sliceable inputs (list, tuple, str, ...) are chunked by slicing,
    so each chunk has the same type as the input. Inputs without ``len()``
    (generators, file objects, ...) are consumed lazily and each chunk is
    yielded as a list.

    Args:
        iterable: The items to split into chunks.
        n: Maximum chunk size; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised on first iteration,
            because this is a generator).
    """
    from itertools import islice

    if n < 1:
        raise ValueError("n must be a positive integer")
    try:
        total = len(iterable)
    except TypeError:
        # No len(): fall back to pulling n items at a time.
        iterator = iter(iterable)
        while True:
            chunk = list(islice(iterator, n))
            if not chunk:
                return
            yield chunk
    else:
        # Slicing clamps at the end of the sequence, so the last chunk may
        # be shorter than n without any explicit bound check.
        for start in range(0, total, n):
            yield iterable[start:start + n]
if __name__ == '__main__':
    # Script entry point: read the queries file, split it into batches and
    # feed the batches to a pool of writer processes through a queue.
    batch_size = 100
    parallelism = 1
    database = "<database name>"
    file_path = "<PATH TO QUERIES FILE - ONE QUERY PER NEW LINE>"

    with open(file_path, "r") as file:
        # Skip blank lines so that no empty query is sent to the server.
        statements = [line for line in file.read().splitlines() if line.strip()]

    batch_statements = batch(statements, n=batch_size)
    # Ceiling division: a final, partially filled batch still counts as one.
    total_batches = (len(statements) + batch_size - 1) // batch_size

    # Bounded queue: put() blocks once parallelism * 4 batches are pending.
    batch_queue = mp.Queue(parallelism * 4)
    kill_event = mp.Event()
    writers = start_writers(database, kill_event, batch_queue, parallelism=parallelism)

    # `query_batch` (not `batch`) so the batch() function is not shadowed.
    for i, query_batch in enumerate(batch_statements):
        batch_queue.put(query_batch, block=True)
        if i*batch_size % 10000 == 0:
            print("Loaded: {0}/{1}".format(i*batch_size, total_batches*batch_size))

    # Flush everything that was put() into the underlying pipe while the
    # writers are still consuming, and only then tell them to stop.
    batch_queue.close()
    batch_queue.join_thread()
    kill_event.set()
    for proc in writers:
        proc.join()
    print("Done loading")