0

So I used this "tutorial" to start working with special characters in elasticsearch: https://www.elastic.co/guide/en/elasticsearch/guide/current/case-folding.html

I installed the elasticsearch-analysis-icu plugin, version 2.7.0, for my Elasticsearch 1.7.x.

Creating an index "sonderzeichen" with the icu_tokenizer worked fine (I use nodejs):

    var http = require('http');
    // index settings: custom analyzer = icu_tokenizer + icu_normalizer filter
    var body = JSON.stringify(
            {
                "settings": {
                    "analysis": {
                        "analyzer": {
                            "my_lowercaser": {
                                "tokenizer": "icu_tokenizer",
                                "filter":  [ "icu_normalizer" ] 
                            }
                        }
                    }
                }
            }
    );

    var options = {
            host: 'localhost',
            path: '/sonderzeichen',
            port: 9200,
            method: "PUT",
            headers: {
                'Content-Type': 'application/json',
                'Content-Length': body.length
            }
    };

    // collect the response body and print it once complete
    var callback = function(response) {
        var str = '';
        response.on('data', function(chunk){
            str += chunk;
        });

        response.on('end', function(){
            console.log(str);
        });
    };


    http.request(options, callback).end(body);

I called the _analyze API in the two ways described in the tutorial:

/_analyze?analyzer=my_lowercaser

and

/sonderzeichen/_analyze?analyzer=my_lowercaser

In Node.js it looks like this:

    var http = require('http');

    // test string with ß, umlauts, and accented characters
    var body = decodeURIComponent("Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ");

    var options = {
            host: 'localhost',
            path: '/_analyze?analyzer=standard',
            port: 9200,
            method: "GET",
            headers: {
                'Content-Type': 'application/json',
                'Content-Length': body.length
            }
    };

    var callback = function(response) {
        var str = '';
        response.on('data', function(chunk){
            str += chunk;
        });

        response.on('end', function(){
            console.log(str);
        });
    };


    http.request(options, callback).end(body);

Both return the exact same tokens, as follows (no matter whether I use decodeURIComponent or not):

    {
      "tokens": [
        {
          "token": "wei",
          "start_offset": 0,
          "end_offset": 3,
          "type": "<ALPHANUM>",
          "position": 1
        },
        {
          "token": "kopfseeadler",
          "start_offset": 4,
          "end_offset": 16,
          "type": "<ALPHANUM>",
          "position": 2
        },
        {
          "token": "weisskopfseeadler",
          "start_offset": 17,
          "end_offset": 34,
          "type": "<ALPHANUM>",
          "position": 3
        }
      ]
    }

Elasticsearch still doesn't seem to handle any special characters, so where did I go wrong?

JTR
  • 23
  • 5
  • If you're only dealing with German, I'm pretty confident that the [`asciifolding`](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-asciifolding-tokenfilter.html) token filter is more than enough. You should try it before leveraging the ICU plugin. – Val Jan 15 '16 at 12:29
  • I tried and I must be using it wrong: dropped the index and created it again, replacing `"analysis":{"analyzer":{"my_lowercaser":{"tokenizer":"icu_tokenizer","filter":["icu_normalizer"]}` with `"analysis":{"analyzer":{"default":{"tokenizer":"standard","filter":["standard","asciifolding"]}`, and for analyzing I used `/sonderzeichen/_analyze?analyzer=default`. Still get the same tokens as an answer – JTR Jan 15 '16 at 12:44

2 Answers

0

I've created the sonderzeichen index with the settings below:

    curl -XPUT localhost:9200/sonderzeichen -d '{
      "settings": {
        "analysis": {
          "analyzer": {
            "default": {
              "tokenizer": "standard",
              "filter": [
                "standard",
                "asciifolding"
              ]
            }
          }
        }
      }
    }'

Once that was done, I analyzed the input string you mention in your question like this:

    curl -XGET 'localhost:9200/sonderzeichen/_analyze?analyzer=default&pretty' -d 'Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ'

And the output I get is the one below, which looks correct to me:

    {
      "tokens" : [ {
        "token" : "Weisskopfseeadler",
        "start_offset" : 0,
        "end_offset" : 16,
        "type" : "<ALPHANUM>",
        "position" : 1
      }, {
        "token" : "WEISSKOPFSEEADLER",
        "start_offset" : 17,
        "end_offset" : 34,
        "type" : "<ALPHANUM>",
        "position" : 2
      }, {
        "token" : "aAoOuUssaAeEiIoOuUaAeEiIoOuU",
        "start_offset" : 35,
        "end_offset" : 62,
        "type" : "<ALPHANUM>",
        "position" : 3
      } ]
    }
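If you want to run the same check from the Node.js code in your question, only the `options` object needs to change, so that the request hits the `sonderzeichen` index and the `default` analyzer defined above. A minimal sketch (the rest of your request code can stay as it is):

    var options = {
            host: 'localhost',
            // index-scoped _analyze, so the `default` analyzer
            // from the index settings is found
            path: '/sonderzeichen/_analyze?analyzer=default',
            port: 9200,
            method: "GET",
            headers: {
                'Content-Type': 'application/json'
                // no Content-Length: Node falls back to chunked transfer encoding
            }
    };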
Val
  • 207,596
  • 13
  • 358
  • 360
  • Copied your code exactly into my nodejs code again and still get the wrong token. I will have someone look at it next week and see if they can find the error. I will post an update when we find it. Thanks AGAIN – JTR Jan 15 '16 at 13:17
  • Note that in your Node.js code you're not hitting the `sonderzeichen` index and you're using the `standard` analyzer. You should replace `path` with `/sonderzeichen/_analyze?analyzer=default` – Val Jan 15 '16 at 13:30
0

We fixed it using the elasticsearch API client for Node.js. For some reason, posting a raw HTTP request via Node returns a MapperParsingException, while curl works and the elasticsearch.Client works as well. So the following version for writing special characters (in this case without even using any tokenizer or analyzer) works in our Elasticsearch environment:

    var elasticsearch = require('elasticsearch');

    // official elasticsearch client for Node.js
    var client = new elasticsearch.Client({
        host: 'localhost:9200',
        log: ''
    });

    // index the raw string as the document body (no custom analyzer involved)
    client.index({
        index: "sonderzeichen",
        type: "randomType",
        id: "randomId",
        body: "Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ"
    }, function (err, resp) {
        if (err) {
            console.error("error in method writeDB: " + err);
            return;
        }
        console.log("callback from db request: " + JSON.stringify(resp));
    });
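For what it's worth, a plausible explanation for the raw-HTTP failure (an assumption on our side, not something we verified): the original request set `Content-Length` to `body.length`, which counts UTF-16 code units, while characters like ß and ä each take two bytes in UTF-8. The server then reads fewer bytes than were actually sent, the JSON arrives truncated, and Elasticsearch responds with a parse error. Sizing the header with `Buffer.byteLength` should let the plain `http` module work as well; a sketch (the field name `doc` is made up for illustration):

    var http = require('http');

    // the document must be a JSON object, so wrap the test string in one
    var body = JSON.stringify({
        doc: "Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ"
    });

    var options = {
            host: 'localhost',
            port: 9200,
            path: '/sonderzeichen/randomType/randomId',
            method: 'PUT',
            headers: {
                'Content-Type': 'application/json',
                // Buffer.byteLength counts UTF-8 bytes; String#length does not
                'Content-Length': Buffer.byteLength(body)
            }
    };

    var callback = function(response) {
        var str = '';
        response.on('data', function(chunk){
            str += chunk;
        });
        response.on('end', function(){
            console.log(str);
        });
    };

    http.request(options, callback).end(body);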
JTR
  • 23
  • 5