
I have written Python code to migrate data from Elasticsearch 2.4.6 to Elasticsearch 7.9.2.

But for some fields whose data type is geohash I am getting this error:

"parse exception", "reason":"empty geohash","type":"mapper_parsing_exception","caused_by":{"type":"illegal_argument_exception","reason":"empty geohash"}


from elasticsearch import Elasticsearch, helpers

#############################
# Initialization
#############################

try:
    # Instance creation for different versions of elasticsearch

    es2 = Elasticsearch("http://source_cluster_ip:9200")
    es7 = Elasticsearch("http://destination_cluster_ip:9200")

    # Check their connection status
    elastic_info2 = es2.info()
    elastic_info7 = es7.info()
    print(elastic_info2)
    print(elastic_info7)
except Exception as err:
    print(f"Connection failed: {err}")
    es2 = None
    es7 = None


def data_migration(es2, es7):
        """Migrate all documents from ES 2.4.6 to ES 7.9.2."""

        # Maximum number of records to fetch per scroll page

        MaxRecordsToFetch = 10000

        # Query to fetch up to MaxRecordsToFetch documents per page

        query = {'size': MaxRecordsToFetch, 'query': {'match_all': {}}}

        # Fetch the first page from ES 2.4.6, where the mapping type is data_collection

        data = es2.search(index='source_index_name', body=query, scroll='1m',
                          ignore=400, doc_type='data_collection')
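
        # (Note: mapping types such as data_collection exist in ES 2.x but were
        # removed in ES 7, which is why the ingest below uses doc_type='_doc'.)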
        while True:

            # Stop once the current page has no more hits

            hits = data['hits']['hits']
            if not hits:
                break
            for doc in hits:

                    # The field names are the same in both versions. The main difference
                    # is in the mapping data types: the ES 2.4.6 "string" type was later
                    # split into "text" and "keyword" (in 2.x you get keyword-like
                    # behaviour by adding "index": "not_analyzed").
                    # My data has a primary field, articleId, which is always unique, so
                    # I use it to manage duplication as well: I assign it to a variable
                    # and pass it as the _id at ingestion time, so even if a duplicate
                    # document is ingested by chance it only overwrites the existing one.
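                    # For illustration (a sketch, not my actual mapping):
                    #   ES 2.4.6: "articleId": {"type": "string", "index": "not_analyzed"}
                    #   ES 7.9.2: "articleId": {"type": "keyword"}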
                    
                    articleId = doc['_source']['articleId']

                    # The body to ingest is the _source of the ES 2.4.6 document
                    
                    res = doc['_source']
                    
                    # Query to count existing documents with this articleId
                    query = {
                            "query": {
                                "bool": {
                                    "must": [
                                        {
                                            "match": {
                                                "articleId": {
                                                    "query": articleId
                                                }
                                            }
                                        }
                                    ]
                                }
                            }
                        }

                    # Count the documents in destination_index with this articleId so
                    # I can avoid repeated overwrites: if a document already exists it
                    # is left alone, and ingestion happens only when doc_count == 0.
                    
                    doc_count=es7.count(index="destination_index", body = query)['count']
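                    # (Assumption: since articleId is also used as the _id below, the
                    # same dedup check could be done with the document APIs, e.g.
                    #   if not es7.exists(index="destination_index", id=articleId): ...
                    # or by passing op_type='create' to es7.index().)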
                    
                    # print(res)
                    # print(doc_count)
                    # Ingest only if no document with this articleId exists yet
                    if doc_count == 0:
                        
                        # print(res)

                        # Pass the _source from ES 2.4.6 as the body, with doc_type='_doc'
                        
                        status = es7.index(index='destination_index', body=res,
                                           doc_type='_doc', id=articleId, ignore=400)
                        print(f"Data ingested: {articleId}, status: {status}")
                    


            # Scroll to the next page; the scroll id avoids refetching documents
            # that were already returned from source_index

            data = es2.scroll(scroll_id=data['_scroll_id'], scroll='1m')
            
if es2 is not None and es7 is not None:
    data_migration(es2, es7)
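
For what it's worth, since helpers is already imported, I think the same migration could also be sketched with helpers.scan plus helpers.bulk (bulk_migration is a hypothetical name, and this assumes the same client can scan the 2.4.6 cluster, as the scroll calls above already do; _op_type 'create' makes existing ids fail instead of being overwritten):

def bulk_migration(es2, es7):
    # Stream every source document and bulk-index it under its articleId
    actions = (
        {
            '_op_type': 'create',   # reject duplicate ids instead of overwriting
            '_index': 'destination_index',
            '_id': doc['_source']['articleId'],
            '_source': doc['_source'],
        }
        for doc in helpers.scan(es2, index='source_index_name',
                                query={'query': {'match_all': {}}}, scroll='1m')
    )
    # raise_on_error=False collects create-conflicts instead of raising
    success, errors = helpers.bulk(es7, actions, raise_on_error=False)
    print(f"indexed: {success}, errors: {len(errors)}")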
