Even though I have listed ["is", "it", "possible"] as a stop words filter, those words are still being matched in the search output. Could someone explain why Elasticsearch is not removing them from the input documents while indexing? My data and code are below.
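As a sanity check, my understanding is that the _analyze API can show how a field actually tokenizes text. Here is a sketch of the check I have in mind (it assumes the sample index created by the code below already exists); if my_stop were active on the content field, "is", "it" and "possible" should be missing from the printed tokens:

from elasticsearch import Elasticsearch

es = Elasticsearch('localhost:9200')

# Ask the index how it analyzes text destined for the 'content' field.
# If the my_stop filter were applied to this field, "is", "it" and
# "possible" should not appear among the tokens printed here.
res = es.indices.analyze(index='sample',
                         body={'field': 'content',
                               'text': 'Is it possible to sleep without dreaming?'})
for token in res['tokens']:
    print(token['token'])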
The contents of issue_with_stop_word.csv are as follows:
id,qid1,qid2,question1
5,11,12,How do I recover my Facebook login password?
7,15,16,Is it possible to sleep without dreaming?
11,23,24,How easy is it to hack the login password of a Macbook Air?
12,25,26,How easy is it to hack the login password of a Macbook Air?
13,27,28,Is it possible to know who visited my Facebook profile?
15,31,32,Is it possible to know who visited my Facebook profile?
16,33,34,Is it possible to know who visited my Facebook profile?
18,37,38,Is it possible to hack someone's Facebook messages?
20,41,42,Is it possible to know who visited my Facebook profile?
29,59,60,How do I recover my Facebook password without having to reset it?
31,63,64,What are some special cares for someone with a nose that gets stuffy during the night?
32,65,66,What Game of Thrones villain would be the most likely to give you mercy?
The code is below:
from elasticsearch import Elasticsearch
from elasticsearch import helpers

query = 'Is it possible ?'
index_name = 'sample'
doc_type = 'dummy'
content = 'content'
document = 'question'
identity = 'id'
def main():
    es = Elasticsearch('localhost:9200')
    create_indices(es, index_name)
    res = es.search(index=index_name, doc_type=doc_type,
                    body={
                        "query": {
                            "match": {
                                "content": "is it possible"
                            }
                        }
                    })
    print("%d documents found:" % len(res['hits']['hits']))
    for doc in res['hits']['hits']:
        print("%s) %s %s" % (doc['_id'], doc['_source']['content'], str(doc['_score'])))
def create_indices(es, index_name):
    bulk_data = []
    with open('issue_with_stop_word.csv', 'rb') as tsvin:
        tsvin.next()  # skip the CSV header row
        for row in tsvin:
            row = unicode(row, errors='replace')
            doc = str(row.split(',')[3]).strip()
            int_id = int(row.split(',')[1])
            value = dict()
            value[content] = doc
            value[identity] = int_id
            bulk_data.append(value)

    if es.indices.exists(index_name):
        print("deleting '%s' index..." % (index_name))
        res = es.indices.delete(index=index_name)
        print(" response: '%s'" % (res))
    # since we are running locally, use one shard and no replicas
    request_body = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "filter": {
                    "my_stop": {
                        "type": "stop",
                        "stopwords": ["is", "it", "possible"]
                    }
                }
            }
        }
    }
print("creating '%s' index..." % (index_name))
res = es.indices.create(index=index_name, body=request_body)
print(" response: '%s'" % (res))
# bulk index the data
print("bulk indexing...")
actions = [
{
"_index": index_name,
"_type" : doc_type,
"_id": val[identity],
content:val[content]
}
for val in bulk_data
]
res = helpers.bulk(es, actions, refresh = True)

if __name__ == '__main__':
    main()
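From reading the analysis documentation, my current suspicion is that defining my_stop under "analysis.filter" is not enough on its own: the filter also has to be referenced by an analyzer, and that analyzer has to be attached to the content field in the mappings, otherwise the field keeps using the default standard analyzer (which does not remove these words). Below is a sketch of the request body I think would be needed; the analyzer name my_analyzer is my own invention, and "type": "text" assumes Elasticsearch 5.x or later ("string" on 2.x). Is this the correct way to wire it up?

request_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "filter": {
                "my_stop": {
                    "type": "stop",
                    "stopwords": ["is", "it", "possible"]
                }
            },
            "analyzer": {
                # hypothetical custom analyzer that actually uses my_stop;
                # lowercase comes first because the stop filter is
                # case-sensitive by default ("Is" would not match "is")
                "my_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "my_stop"]
                }
            }
        }
    },
    "mappings": {
        doc_type: {
            "properties": {
                "content": {
                    # attach the analyzer to the field; without this the
                    # field falls back to the standard analyzer
                    "type": "text",
                    "analyzer": "my_analyzer"
                }
            }
        }
    }
}

If I understand the docs correctly, the field's analyzer is also applied to the match query at search time, so after this change a query consisting only of "is it possible" should produce no tokens and therefore return no hits.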