I have written Python code to migrate data from Elasticsearch 2.4.6 to Elasticsearch 7.9.2.
But for some fields whose data type is geohash, I am getting this error:
"parse exception", "reason":"empty geohash","type":"mapper_parsing_exception","caused_by":{"type":"illegal_argument_exception","reason":"empty geohash"}
from elasticsearch import Elasticsearch
#############################
# Initialization
#############################
try:
    # Client instances for the two Elasticsearch versions
    es2 = Elasticsearch("http://source_cluster_ip:9200")
    es7 = Elasticsearch("http://destination_cluster_ip:9200")
    # Check their connection status
    print(es2.info())
    print(es7.info())
except Exception as err:
    print(err)
    es2 = None
    es7 = None

def data_migration(es2, es7):
    # Migrate data from ES 2.4.6 to ES 7.9.2
    # Maximum number of records to fetch per scroll page
    MaxRecordsToFetch = 10000
    # Query to fetch up to MaxRecordsToFetch documents per page
    query = {'size': MaxRecordsToFetch, 'query': {'match_all': {}}}
    # Fetch data from ES 2.4.6 where the doc type is "data_collection"
    data = es2.search(index='source_index_name', body=query, scroll='1m',
                      ignore=400, doc_type='data_collection')
    while True:
        hits = data['hits']['hits']
        # Stop once the scroll returns no more documents
        if not hits:
            break
        for doc in hits:
            # Field names are the same in both versions. The main difference
            # is in the mapping data types: the ES 2.4.6 "string" type was
            # later split into "text" and "keyword", and keyword-like
            # behaviour in 2.x needs "index": "not_analyzed" (see the mapping
            # sketch after the code).
            # "articleId" is a primary field that is always unique, so I pass
            # it as the _id at ingestion time, which also manages duplication:
            # even if a duplicate document is ingested by chance, it only
            # overwrites the existing one.
            articleId = doc['_source']['articleId']
            # The _source of the old document becomes the body of the new one
            res = doc['_source']
            # Query to count existing documents with this articleId
            query = {
                "query": {
                    "bool": {
                        "must": [
                            {
                                "match": {
                                    "articleId": {
                                        "query": articleId
                                    }
                                }
                            }
                        ]
                    }
                }
            }
            # Count the docs with this articleId in destination_index so an
            # existing document is never overwritten: ingest only when
            # doc_count == 0.
            doc_count = es7.count(index="destination_index", body=query)['count']
            # Ingest the document only if it is not already present
            if doc_count == 0:
                # Pass the _source from ES 2.4.6 as the body, with doc_type='_doc'
                status = es7.index(index='destination_index', body=res,
                                   doc_type='_doc', id=articleId, ignore=400)
                print("Data Ingested:", articleId, f"status: {status}")
        # Continue the scroll so the same page is not fetched again
        data = es2.scroll(scroll_id=data['_scroll_id'], scroll='1m')

if es2 and es7:
    data_migration(es2, es7)
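For reference on the string vs. text/keyword comment above, this is roughly how the equivalent field mapping changes between the two versions; a sketch using my articleId field (the mapping bodies are illustrative, not my exact mappings):

# ES 2.4.6: "string" with "index": "not_analyzed" behaves like a keyword
mapping_es2 = {"properties": {"articleId": {"type": "string", "index": "not_analyzed"}}}

# ES 7.9.2: the same field is declared directly as "keyword"
mapping_es7 = {"properties": {"articleId": {"type": "keyword"}}}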