I am using elasticsearch-py to index tweets (originally in JSON format). In order to preserve special characters like hashtags, user targets and emoticons, I specified a special mapping while creating the index. This is what it looks like:
from elasticsearch import Elasticsearch
import sys,json
es = Elasticsearch()
es.indices.create(
index='ecommercetweets',
body={
"settings" : {
"index" : {
"number_of_shards" : 1,
"number_of_replicas" : 1
},
"analysis" : {
"filter" : {
"tweet_filter" : {
"type" : "word_delimiter",
"type_table": ["# => ALPHA", "@ => ALPHA", ":) => ALPHA", ":( => ALPHA"]
}
},
"analyzer" : {
"tweet_analyzer" : {
"type" : "custom",
"tokenizer" : "whitespace",
"filter" : ["lowercase", "tweet_filter"]
}
}
}
},
"mappings" : {
"tweet" : {
"properties" : {
"text" : {
"analyzer" : "tweet_analyzer"
}
}
}
}
},
ignore=400
)
fin = open(sys.argv[1],"r")
count = 0
for line in fin:
jsonLine = json.loads(line)
doc = {
'tweetId' : jsonLine["id"],
'text' : jsonLine["text"],
'userId' : jsonLine["user"]["id"],
'favorite_count' : jsonLine["favorite_count"],
'retweet_count' :jsonLine["retweet_count"],
'language': jsonLine["lang"],
'dateTime':jsonLine["created_at"],
'location':jsonLine["place"]
}
es.index(index='ecommercetweets', doc_type='tweet', id=count, body=doc)
count+=1
I am searching using this command:
results1 = es.search(index='ecommercetweets',q="text:delivery")
results2 = es.search(index='ecommercetweets',q="text:#delivery")
Both returns the same number of hits, although I am pretty sure this should not be the case for the data I am using.
Am I going wrong with the search command?