I'm pulling data from Elasticsearch with the Python client, using the scroll API as follows:

import pandas as pd
from elasticsearch import Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
index_columns = ['a', 'b', ...]  # full list of source fields truncated here
message_body = {"size": 1000, "_source": index_columns, "query": {"match_all": {}}}
# initial search opens the scroll context and returns the first page
elastic_data = es.search(index="data", body=message_body, scroll='1m')
at_data = pd.DataFrame([a['_source'] for a in elastic_data['hits']['hits']])
sid = elastic_data['_scroll_id']
scroll_size = len(elastic_data['hits']['hits'])

# keep scrolling until a page comes back empty
while scroll_size > 0:
    elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m')
    at_data_rest = pd.DataFrame([a['_source'] for a in elastic_data_rest['hits']['hits']])
    sid = elastic_data_rest['_scroll_id']
    scroll_size = len(elastic_data_rest['hits']['hits'])
    at_data = at_data.append(at_data_rest, ignore_index=True, sort=False)
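
For reference, I believe the same non-sliced pull can also be written with the scan helper from elasticsearch.helpers, which hides the scroll id handling (a minimal sketch of my understanding, using the same index and columns as above):

from elasticsearch.helpers import scan

scan_query = {"_source": index_columns, "query": {"match_all": {}}}
# scan() manages the scroll id internally and clears the scroll context when done
hits = scan(es, query=scan_query, index="data", scroll='1m', size=1000)
at_data = pd.DataFrame([hit['_source'] for hit in hits])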

May I know what a sliced scroll is, and whether it helps to pull the data faster?

I went through this example in https://github.com/elastic/elasticsearch-dsl-py/issues/817:

from multiprocessing import Pool
from elasticsearch_dsl import Search

SLICES = 5

def dump_slice(slice_no):
    # each worker process scans one slice of the index
    s = Search()
    s = s.extra(slice={"id": slice_no, "max": SLICES})
    for d in s.scan():
        print(d.meta.id)

pool = Pool(SLICES)
pool.map(dump_slice, range(SLICES))

If I slice it, how is the scroll id supposed to be used?
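
My reading of the sliced scroll documentation is that the slice object goes into the initial search body, and each slice then gets back its own _scroll_id, which is scrolled on its own. Roughly like this untested sketch (2 slices used just for illustration, not my real setup):

# slice 0 of 2 -- slice 1 would be a second, independent search and scroll
body_slice0 = {"slice": {"id": 0, "max": 2},
               "size": 1000,
               "_source": index_columns,
               "query": {"match_all": {}}}
resp0 = es.search(index="data", body=body_slice0, scroll='1m')
sid0 = resp0['_scroll_id']                      # scroll id for slice 0 only
resp0 = es.scroll(scroll_id=sid0, scroll='1m')  # continue slice 0 with its own id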

I tried this first method:

def dump_slice(scroll_size):                                                       
    elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m') 
    at_data_rest = pd.DataFrame([a['_source'] for a in elastic_data_rest['hits']['hits']])  
    sid = elastic_data_rest['_scroll_id']        
    scroll_size = len(elastic_data_rest['hits']['hits'])                                                    
    elastic_data_rest = elastic_data_rest.extra(slice={"id": sid, "max": SLICES})                          
    for at_data_rest in elastic_data_rest.scan():                                                          
        at_data = at_data.append(at_data_rest, ignore_index=True, sort=False)

It uses the full CPU but runs endlessly.

I tried this second method, but I couldn't call get() on the result because it is not iterable; I'm not sure of my mistake:

def poolData(elastic_data_rest):
    at_data_rest = pd.DataFrame([a['_source'] for a in elastic_data_rest['hits']['hits']])
    at_data = at_data.append(at_data_rest, ignore_index=True, sort=False)
    return at_data
pool = Pool(processes=4)
while scroll_size > 0:
    elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m')
    sid = elastic_data_rest['_scroll_id']
    scroll_size = len(elastic_data_rest['hits']['hits'])
    at_data = pool.apply_async(poolData, elastic_data_rest)
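
For reference, my understanding is that apply_async returns an AsyncResult per submitted page, and the frames would normally be collected afterwards by calling get() on each of those objects, something like this minimal sketch (page_to_frame is just an illustrative name):

def page_to_frame(page):
    # turn one scroll page (a plain dict) into a DataFrame
    return pd.DataFrame([h['_source'] for h in page['hits']['hits']])

with Pool(processes=4) as pool:
    async_results = []
    while scroll_size > 0:
        elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m')
        sid = elastic_data_rest['_scroll_id']
        scroll_size = len(elastic_data_rest['hits']['hits'])
        if elastic_data_rest['hits']['hits']:
            # args must be passed as a tuple; each call returns an AsyncResult
            async_results.append(pool.apply_async(page_to_frame, (elastic_data_rest,)))
    frames = [r.get() for r in async_results]   # get() blocks until each worker is done
at_data = pd.concat([at_data] + frames, ignore_index=True, sort=False)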

I tried a third method, from https://www.codestudyblog.com/cnb2010/1006124017.html:

import gc
import multiprocessing

NJOBS = 4
def es_scroll(index, slice_no):
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    print(slice_no)
    message_body = {"slice": {"id": slice_no,"max": NJOBS},
                    "size": 1000,
                    "_source": index_columns,
                    "query": {"match_all": {}}}
    elastic_data = es.search(index=lc_training_ID, body=message_body, scroll='1m')
    sid = elastic_data['_scroll_id']
    scroll_size = len(elastic_data['hits']['hits'])

    # Start scrolling
    df = pd.DataFrame()
    appended_data = []

    while (scroll_size > 0):
        at_data_rest = pd.DataFrame.from_dict([a['_source'] for a in elastic_data['hits']['hits']])
        appended_data.append(at_data_rest)
        elastic_data = es.scroll(scroll_id = sid, scroll = '1m', request_timeout = 30)
        sid = elastic_data['_scroll_id']
        scroll_size = len(elastic_data['hits']['hits'])
    if len(appended_data) > 0: 
        df = pd.concat(appended_data, ignore_index=True, sort = False)
    del appended_data
    gc.collect() 
    es.clear_scroll(body={'scroll_id': sid})
    return df  

def build_parameters(index):
    parameters =[]
    for num in range(0, NJOBS): 
        tuple_parameter = (index, num)
        parameters.append(tuple_parameter)
    return parameters

parameters = build_parameters(lc_training_ID)

with multiprocessing.Pool(processes = NJOBS) as pool:
    result = pool.starmap(es_scroll, parameters)
frame = pd.concat(result, ignore_index=True, sort = False)

This also runs indefinitely.
