The Python driver provides an event/callback approach for handling large result sets:
https://datastax.github.io/python-driver/query_paging.html
There is also a BatchQuery class for use with the ORM, which is quite handy:
https://datastax.github.io/python-driver/cqlengine/batches.html?highlight=batchquery
Now I need to execute a BatchQuery inside the callback handlers of the paged result object, but the script just gets stuck while iterating over the current page.
I guess this is because Cassandra sessions cannot be shared between threads, while both BatchQuery and the "paged result" approach use threading to manage event setting and callback calls.
Any idea on how to magically sort this situation out? Below you can find some code:
# paged.py
class PagedQuery:
    """
    Run a query asynchronously and pass every page of results to a handler.

    The query is started as soon as the object is created. Pages are handed
    to ``handler`` one at a time; when the last page has been handled, or as
    soon as anything fails, ``finished_event`` is set. Failures are never
    raised to the waiting caller directly: they are stored in ``error``, so
    always check it after waiting.

    >>> query = "SELECT * FROM ks.my_table WHERE collectionid=123 AND ttype='collected'"  # define query
    >>> def handler(page):  # define result page handler function
    ...     for t in page:
    ...         print(t)
    >>> pq = PagedQuery(query, handler)  # instantiate a PagedQuery object
    >>> pq.finished_event.wait()  # wait for the PagedQuery to handle all results
    >>> if pq.error:
    ...     raise pq.error

    Attributes:
        count: Total number of rows passed to the handler so far.
        error: The exception that stopped processing, or ``None``.
        finished_event: Set once processing has completed or failed.
        query: The query string that was executed.
        session: The session the query runs on.
        handler: The page handler callable (may be ``None``).
        future: The future returned by ``session.execute_async``.
    """

    def __init__(self, query, handler=None, fetch_size=500):
        """
        Start executing ``query`` and register the page callbacks.

        Args:
            query: Query string to execute.
            handler: Callable invoked with each page of rows. If it is
                missing, the first page is reported as a ``RuntimeError``
                through ``error``.
            fetch_size: Number of rows requested per page.
        """
        # All state is initialised before the callbacks are registered so
        # that the callbacks never observe a half-built object.
        self.count = 0
        self.error = None
        self.finished_event = Event()
        self.query = query
        self.handler = handler

        self.session = new_cassandra_session()
        self.session.row_factory = named_tuple_factory
        statement = SimpleStatement(query, fetch_size=fetch_size)
        self.future = self.session.execute_async(statement)
        self.future.add_callbacks(
            callback=self.handle_page,
            errback=self.handle_error,
        )

    def handle_page(self, page):
        """
        Count the rows of ``page``, pass it to the handler, then continue.

        Any exception (missing handler, handler failure, failure to request
        the next page) is routed to ``handle_error`` instead of being raised.
        An exception escaping this callback would leave ``finished_event``
        unset and block every caller waiting on it forever.
        """
        try:
            if not self.handler:
                raise RuntimeError('A page handler function was not defined for the query')
            if page is not None:
                if not hasattr(page, '__len__'):
                    # Materialise one-shot iterables so that counting the
                    # rows does not consume them before the handler runs.
                    page = list(page)
                self.count += len(page)
            self.handler(page)
            if self.future.has_more_pages:
                # This callback is expected to run again for the next page.
                self.future.start_fetching_next_page()
                return
        except Exception as exc:  # boundary: report through error + event
            self.handle_error(exc)
            return
        self.finished_event.set()

    def handle_error(self, exc):
        """Record ``exc`` as the failure and wake up any waiting caller."""
        self.error = exc
        self.finished_event.set()
# main.py
# script using class above
def main():
    """
    Update every row matched by the query, one unlogged batch per page.

    Runs the query through ``PagedQuery``, waits until all pages have been
    handled, and re-raises any error the paged query recorded. Prints a hint
    when no rows were counted.

    Raises:
        Exception: Whatever error ``PagedQuery`` stored in ``pq.error``.
    """
    query = "SELECT * FROM ks.my_table WHERE collectionid=10 AND ttype='collected'"

    def handle_page(page):
        """Apply ``process`` to each row and save all rows in one batch."""
        b = BatchQuery(batch_type=BatchType.Unlogged)
        for obj in page:
            process(obj)  # some updates on obj...
            # NOTE(review): PagedQuery sets row_factory to
            # named_tuple_factory, so rows are presumably plain named tuples
            # rather than ORM model instances; confirm that obj really
            # provides .batch()/.save() here.
            obj.batch(b).save()
        b.execute()

    pq = PagedQuery(query, handle_page)
    pq.finished_event.wait()
    # A failed query must not be misreported as an empty result set.
    if pq.error:
        raise pq.error
    if not pq.count:
        print('Empty queryset. Please, check parameters')
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()