I'm pulling data from Elasticsearch using the Python client and a scroll ID, as follows:
import pandas as pd
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

index_columns = ['a', 'b', ...]
message_body = {"size": 1000, "_source": index_columns, "query": {"match_all": {}}}

# initial search opens the scroll context and returns the first page
elastic_data = es.search(index="data", body=message_body, scroll='1m')
at_data = pd.DataFrame([a['_source'] for a in elastic_data['hits']['hits']])
sid = elastic_data['_scroll_id']
scroll_size = len(elastic_data['hits']['hits'])

# keep scrolling until a page comes back empty
while scroll_size > 0:
    elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m')
    at_data_rest = pd.DataFrame([a['_source'] for a in elastic_data_rest['hits']['hits']])
    sid = elastic_data_rest['_scroll_id']
    scroll_size = len(elastic_data_rest['hits']['hits'])
    at_data = at_data.append(at_data_rest, ignore_index=True, sort=False)
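For reference, the loop above does roughly what helpers.scan from the elasticsearch package would do for me; a minimal sketch, assuming the same "data" index and index_columns as above:

from elasticsearch import Elasticsearch, helpers
import pandas as pd

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
# helpers.scan wraps the search/scroll calls and yields one hit at a time
hits = helpers.scan(
    es,
    index="data",
    query={"_source": index_columns, "query": {"match_all": {}}},
    scroll='1m',
    size=1000,
)
at_data = pd.DataFrame([hit['_source'] for hit in hits])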
May I know what sliced scroll is, and whether it helps to pull the data faster?
I went through this in https://github.com/elastic/elasticsearch-dsl-py/issues/817:
from multiprocessing import Pool
from elasticsearch_dsl import Search

SLICES = 5

def dump_slice(slice_no):
    s = Search()
    s = s.extra(slice={"id": slice_no, "max": SLICES})
    for d in s.scan():
        print(d.meta.id)

pool = Pool(SLICES)
pool.map(dump_slice, range(SLICES))
If I slice it, how is the scroll ID supposed to be used?
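My current understanding (just a sketch of how I think it is meant to work, not something I have verified) is that each slice is its own independent scroll: the slice body goes into the initial search, and the _scroll_id returned there is only used for that slice, e.g.:

SLICES = 5

def dump_slice(slice_no):
    # the slice definition only appears in the initial search request
    body = {"slice": {"id": slice_no, "max": SLICES},
            "size": 1000,
            "_source": index_columns,
            "query": {"match_all": {}}}
    page = es.search(index="data", body=body, scroll='1m')
    sid = page['_scroll_id']  # this scroll ID belongs to this slice only
    frames = []
    while page['hits']['hits']:
        frames.append(pd.DataFrame([a['_source'] for a in page['hits']['hits']]))
        page = es.scroll(scroll_id=sid, scroll='1m')
        sid = page['_scroll_id']
    es.clear_scroll(body={'scroll_id': sid})
    return pd.concat(frames, ignore_index=True, sort=False) if frames else pd.DataFrame()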
I tried this first method:
def dump_slice(scroll_size):
    elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m')
    at_data_rest = pd.DataFrame([a['_source'] for a in elastic_data_rest['hits']['hits']])
    sid = elastic_data_rest['_scroll_id']
    scroll_size = len(elastic_data_rest['hits']['hits'])
    elastic_data_rest = elastic_data_rest.extra(slice={"id": sid, "max": SLICES})
    for at_data_rest in elastic_data_rest.scan():
        at_data = at_data.append(at_data_rest, ignore_index=True, sort=False)
It uses the full CPU but runs endlessly.
I tried the second method, but couldn't do the get() on the result as it is not iterable; I'm not sure of my mistake.
def poolData(elastic_data_rest):
    at_data_rest = pd.DataFrame([a['_source'] for a in elastic_data_rest['hits']['hits']])
    at_data = at_data.append(at_data_rest, ignore_index=True, sort=False)
    return at_data

pool = Pool(processes=4)
while scroll_size > 0:
    elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m')
    sid = elastic_data_rest['_scroll_id']
    scroll_size = len(elastic_data_rest['hits']['hits'])
    at_data = pool.apply_async(poolData, elastic_data_rest)
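For what it's worth, my understanding of multiprocessing is that apply_async wants the arguments as a tuple and returns an AsyncResult that has to be collected with get(); this sketch is what I was aiming for, so the mistake may be in how I call it above:

from multiprocessing import Pool

def poolData(page):
    # build a DataFrame from one scroll page
    return pd.DataFrame([a['_source'] for a in page['hits']['hits']])

pool = Pool(processes=4)
async_results = []
while scroll_size > 0:
    elastic_data_rest = es.scroll(scroll_id=sid, scroll='1m')
    sid = elastic_data_rest['_scroll_id']
    scroll_size = len(elastic_data_rest['hits']['hits'])
    if scroll_size:
        # arguments go in as a tuple; the return value is an AsyncResult
        async_results.append(pool.apply_async(poolData, (elastic_data_rest,)))
pool.close()
pool.join()
at_data = pd.concat([r.get() for r in async_results], ignore_index=True, sort=False)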
I tried method 3 from https://www.codestudyblog.com/cnb2010/1006124017.html:
import gc
import multiprocessing

NJOBS = 4

def es_scroll(index, slice_no):
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    print(slice_no)
    message_body = {"slice": {"id": slice_no, "max": NJOBS},
                    "size": 1000,
                    "_source": index_columns,
                    "query": {"match_all": {}}}
    elastic_data = es.search(index=lc_training_ID, body=message_body, scroll='1m')
    sid = elastic_data['_scroll_id']
    scroll_size = len(elastic_data['hits']['hits'])
    # Start scrolling
    df = pd.DataFrame()
    appended_data = []
    while scroll_size > 0:
        at_data_rest = pd.DataFrame.from_dict([a['_source'] for a in elastic_data['hits']['hits']])
        appended_data.append(at_data_rest)
        elastic_data = es.scroll(scroll_id=sid, scroll='1m', request_timeout=30)
        sid = elastic_data['_scroll_id']
        scroll_size = len(elastic_data['hits']['hits'])
    if len(appended_data) > 0:
        df = pd.concat(appended_data, ignore_index=True, sort=False)
    del appended_data
    gc.collect()
    es.clear_scroll(body={'scroll_id': sid})
    return df

def build_parameters(index):
    parameters = []
    for num in range(0, NJOBS):
        tuple_parameter = (index, num)
        parameters.append(tuple_parameter)
    return parameters

parameters = build_parameters(lc_training_ID)
with multiprocessing.Pool(processes=NJOBS) as pool:
    result = pool.starmap(es_scroll, parameters)
frame = pd.concat(result, ignore_index=True, sort=False)
It also runs indefinitely.