So I used this "tutorial" to start working with special characters in elasticsearch: https://www.elastic.co/guide/en/elasticsearch/guide/current/case-folding.html
I installed the elasticsearch-analysis-icu version 2.7.0 for my elasticsearch version 1.7.x
Creating an index "sonderzeichen" with the icu_tokenizer worked fine (I use nodejs):
var http = require('http');
var body = JSON.stringify(
{
"settings": {
"analysis": {
"analyzer": {
"my_lowercaser": {
"tokenizer": "icu_tokenizer",
"filter": [ "icu_normalizer" ]
}
}
}
}
}
);
var options = {
host: 'localhost',
path: '/sonderzeichen',
port: 9200,
method: "PUT",
headers: {
'Content-Type': 'application/json',
'Content-Length': body.length
}
};
callback = function(response) {
var str = '';
response.on('data', function(chunk){
str += chunk;
});
response.on('end', function(){
console.log(str);
});
};
http.request(options, callback).end(body);
I used the two analyzers as described in the tutorial:
/_analyze?analyzer=my_lowercaser
and
/sonderzeichen/_analyze?analyzer=my_lowercaser
In node it looks like this:
var http = require('http');
var body = decodeURIComponent("Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ");
var options = {
host: 'localhost',
path: '/_analyze?analyzer=standard',
port: 9200,
method: "GET",
headers: {
'Content-Type': 'application/json',
'Content-Length': body.length
}
};
callback = function(response) {
var str = '';
response.on('data', function(chunk){
str += chunk;
});
response.on('end', function(){
console.log(str);
});
};
http.request(options, callback).end(body);
Both return the exact same token, as follows (no matter if I use decodeURIComponent or not):
{
"tokens": [
{
"token": "wei",
"start_offset": 0,
"end_offset": 3,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "kopfseeadler",
"start_offset": 4,
"end_offset": 16,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "weisskopfseeadler",
"start_offset": 17,
"end_offset": 34,
"type": "<ALPHANUM>",
"position": 3
}
]
}
Elastic still can't seem to handle any special characters, so where did I go wrong?