0

I'm converting .avro files to JSON, then extracting specific fields to index into my Elasticsearch cluster. Each chunk contains roughly 1.8 GB of data, and there are about 500 chunks. The script runs out of memory fairly quickly, but I thought avoiding exactly that was the purpose of the bulk helpers in the Elasticsearch Python library.

Am I missing some key detail here?

def connect_elasticsearch(address):
    """Build an Elasticsearch client for ``address`` and check that it answers.

    Args:
        address: Host URL passed (as a one-element list) to ``Elasticsearch``.
            Certificate verification is disabled for the connection.

    Returns:
        The client object, returned only after ``ping()`` reports success.

    Raises:
        ValueError: If ``ping()`` returns a falsy value.
    """
    client = Elasticsearch([address], verify_certs=False)
    reachable = client.ping()
    if reachable:
        return client
    raise ValueError("Connection failed")

# Helper for running a command line from Python.
def run_cmd(args_list):
    """Run ``args_list`` as a child process and wait for it to finish.

    A timestamped line naming the command is printed before it is started.

    Args:
        args_list: The program and its arguments, as a list of strings.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple holding the exit status and
        the complete captured output of both streams.
    """
    timestamp = str(datetime.datetime.now())[:19]
    command_text = ' '.join(args_list)
    print(timestamp + ': Running system command: {0}'.format(command_text))
    child = subprocess.Popen(args_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes and returns only once the child exited.
    captured_out, captured_err = child.communicate()
    return child.returncode, captured_out, captured_err
def gendata(path):
    """Yield one bulk-index action for every Avro record in the file at ``path``.

    The file content is obtained by piping ``hdfs dfs -cat <path>`` into
    ``reader``. The child process is always reaped and its pipe closed when
    the generator finishes, so processing many files does not accumulate
    unreaped processes or open file descriptors.

    Args:
        path: HDFS path of the Avro file. It is also stored in the ``source``
            field of every yielded document.

    Yields:
        dict: An action targeting the ``beta_homepage_survey_2`` index with
        the record's ``hostname``, ``ts`` (stored as ``age``), ``text`` and
        ``metadata`` values plus ``source``.
    """
    cat = subprocess.Popen(["hdfs", "dfs", "-cat", path], stdout=subprocess.PIPE)
    try:
        # NOTE(review): presumably `reader` decodes records lazily from the
        # pipe rather than loading the whole file - confirm for the library used.
        for record in reader(cat.stdout):
            yield {
                '_index': 'beta_homepage_survey_2',
                "hostname": record['hostname'],
                "age": record['ts'],
                "text": record['text'],
                "metadata": record['metadata'],
                "source": path,
            }
    finally:
        # Runs on normal exhaustion, on a decoding error, and when the
        # consumer abandons the generator early. Closing our end of the pipe
        # first lets a still-running `hdfs` exit instead of blocking on a
        # full pipe, so wait() cannot hang.
        cat.stdout.close()
        cat.wait()
es = connect_elasticsearch('http://myurl:9200/')
# Find all avro files for the homepage survey via the command line (hdfs commands).
(ret, out, err) = run_cmd(['hdfs', 'dfs', '-ls', '/homepage_survey/chunks/*/*.avro'])
# On Python 3 the captured output is bytes; decode it so the text handling
# below works. On Python 2 the output is already `str` and is left untouched.
if not isinstance(out, str):
    out = out.decode('utf-8')
if ret != 0:
    # Keep going: a partially failed listing may still contain usable paths.
    print(str(datetime.datetime.now())[:19] + ": *** hdfs ls exited with status " + str(ret))
for line in out.splitlines():
    # A listing row carries the absolute path after the first " /". Header
    # and blank lines have no such separator and are skipped instead of
    # being reported as indexing errors.
    _, separator, remainder = line.partition(" /")
    if not separator:
        continue
    path = '/' + remainder
    print(str(datetime.datetime.now())[:19] + ": Indexing File: " + path)
    try:
        helpers.bulk(es, gendata(path))
    except Exception as ex:
        # Best effort: one failing chunk must not stop the remaining ones.
        # The message is included (truncated, since bulk errors can be very
        # long) so the cause of the failure is visible in the log.
        print(str(datetime.datetime.now())[:19] + ": *** Error indexing chunk:- "
              + type(ex).__name__ + ": " + str(ex)[:500])
print(str(datetime.datetime.now())[:19] + ": Indexing Complete...")
Remixt
  • 597
  • 6
  • 28
  • Have you checked what is the threshold? I mean what is the smallest number of chunks that you are able to insert without error? Also what is the heap size on es? – Developer Oct 19 '19 at 14:09
  • Where do you get the error: on the Python side or on the Elasticsearch server side? – Wonka Oct 22 '19 at 16:33

0 Answers0