I'm pulling data from Elasticsearch with the Python client, using the scroll API as follows:

import pandas as pd
from elasticsearch import Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
index_columns = ['a', 'b', ...]  # full list of source fields truncated here
message_body = {"size": 1000, "_source": index_columns, "query": {"match_all": {}}}
# initial search opens the scroll context and returns the first page
elastic_data = es.search(index="data", body=message_body, scroll='1m')
at_data = pd.DataFrame([a['_source'] for a in elastic_data['hits']['hits']])
sid = elastic_data['_scroll_id']
scroll_size = len(elastic_data['hits']['hits'])

# keep scrolling until a page comes back empty
while scroll_size > 0:
    elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m')
    at_data_rest = pd.DataFrame([a['_source'] for a in elastic_data_rest['hits']['hits']])
    sid = elastic_data_rest['_scroll_id']
    scroll_size = len(elastic_data_rest['hits']['hits'])
    at_data = at_data.append(at_data_rest, ignore_index=True, sort=False)
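
For reference, I believe the same non-sliced pull can also be written with the scan helper from elasticsearch.helpers, which hides the scroll id handling (a minimal sketch of my understanding, using the same index and columns as above):

from elasticsearch.helpers import scan

scan_query = {"_source": index_columns, "query": {"match_all": {}}}
# scan() manages the scroll id internally and clears the scroll context when done
hits = scan(es, query=scan_query, index="data", scroll='1m', size=1000)
at_data = pd.DataFrame([hit['_source'] for hit in hits])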

May I know what a sliced scroll is, and whether it helps to pull the data faster?

I went through this example in https://github.com/elastic/elasticsearch-dsl-py/issues/817:

from multiprocessing import Pool
from elasticsearch_dsl import Search

SLICES = 5

def dump_slice(slice_no):
    # each worker process scans one slice of the index
    s = Search()
    s = s.extra(slice={"id": slice_no, "max": SLICES})
    for d in s.scan():
        print(d.meta.id)

pool = Pool(SLICES)
pool.map(dump_slice, range(SLICES))

If I slice it, how is the scroll id supposed to be used?
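
My reading of the sliced scroll documentation is that the slice object goes into the initial search body, and each slice then gets back its own _scroll_id, which is scrolled on its own. Roughly like this untested sketch (2 slices used just for illustration, not my real setup):

# slice 0 of 2 -- slice 1 would be a second, independent search and scroll
body_slice0 = {"slice": {"id": 0, "max": 2},
               "size": 1000,
               "_source": index_columns,
               "query": {"match_all": {}}}
resp0 = es.search(index="data", body=body_slice0, scroll='1m')
sid0 = resp0['_scroll_id']                      # scroll id for slice 0 only
resp0 = es.scroll(scroll_id=sid0, scroll='1m')  # continue slice 0 with its own id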

I tried this first method:

def dump_slice(scroll_size):                                                       
    elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m') 
    at_data_rest = pd.DataFrame([a['_source'] for a in elastic_data_rest['hits']['hits']])  
    sid = elastic_data_rest['_scroll_id']        
    scroll_size = len(elastic_data_rest['hits']['hits'])                                                    
    elastic_data_rest = elastic_data_rest.extra(slice={"id": sid, "max": SLICES})                          
    for at_data_rest in elastic_data_rest.scan():                                                          
        at_data = at_data.append(at_data_rest, ignore_index=True, sort=False)

It uses the full CPU but runs endlessly.

I tried this second method, but I couldn't call get() on the result because it is not iterable; I'm not sure of my mistake:

def poolData(elastic_data_rest):
    at_data_rest = pd.DataFrame([a['_source'] for a in elastic_data_rest['hits']['hits']])
    at_data = at_data.append(at_data_rest, ignore_index=True, sort=False)
    return at_data
pool = Pool(processes=4)
while scroll_size > 0:
    elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m')
    sid = elastic_data_rest['_scroll_id']
    scroll_size = len(elastic_data_rest['hits']['hits'])
    at_data = pool.apply_async(poolData, elastic_data_rest)
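
For reference, my understanding is that apply_async returns an AsyncResult per submitted page, and the frames would normally be collected afterwards by calling get() on each of those objects, something like this minimal sketch (page_to_frame is just an illustrative name):

def page_to_frame(page):
    # turn one scroll page (a plain dict) into a DataFrame
    return pd.DataFrame([h['_source'] for h in page['hits']['hits']])

with Pool(processes=4) as pool:
    async_results = []
    while scroll_size > 0:
        elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m')
        sid = elastic_data_rest['_scroll_id']
        scroll_size = len(elastic_data_rest['hits']['hits'])
        if elastic_data_rest['hits']['hits']:
            # args must be passed as a tuple; each call returns an AsyncResult
            async_results.append(pool.apply_async(page_to_frame, (elastic_data_rest,)))
    frames = [r.get() for r in async_results]   # get() blocks until each worker is done
at_data = pd.concat([at_data] + frames, ignore_index=True, sort=False)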

I tried a third method, from https://www.codestudyblog.com/cnb2010/1006124017.html:

import gc
import multiprocessing

NJOBS = 4
def es_scroll(index, slice_no):
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    print(slice_no)
    message_body = {"slice": {"id": slice_no,"max": NJOBS},
                    "size": 1000,
                    "_source": index_columns,
                    "query": {"match_all": {}}}
    elastic_data = es.search(index=lc_training_ID, body=message_body, scroll='1m')
    sid = elastic_data['_scroll_id']
    scroll_size = len(elastic_data['hits']['hits'])

    # Start scrolling
    df = pd.DataFrame()
    appended_data = []

    while (scroll_size > 0):
        at_data_rest = pd.DataFrame.from_dict([a['_source'] for a in elastic_data['hits']['hits']])
        appended_data.append(at_data_rest)
        elastic_data = es.scroll(scroll_id = sid, scroll = '1m', request_timeout = 30)
        sid = elastic_data['_scroll_id']
        scroll_size = len(elastic_data['hits']['hits'])
    if len(appended_data) > 0: 
        df = pd.concat(appended_data, ignore_index=True, sort = False)
    del appended_data
    gc.collect() 
    es.clear_scroll(body={'scroll_id': sid})
    return df  

def build_parameters(index):
    parameters =[]
    for num in range(0, NJOBS): 
        tuple_parameter = (index, num)
        parameters.append(tuple_parameter)
    return parameters

parameters = build_parameters(lc_training_ID)

with multiprocessing.Pool(processes = NJOBS) as pool:
    result = pool.starmap(es_scroll, parameters)
frame = pd.concat(result, ignore_index=True, sort = False)

This also runs indefinitely.
