0

I have an autocomplete analyzer for a field ("keywords"). This field is an array of strings. When I query with a search string, I want to show first the documents where a single element of the "keywords" array matches best. The problem is that if part of the search string matches several elements of the "keywords" array, that document is ranked above another document that has fewer but better matches. For example, if I query for "gas station", the keywords of the returned documents are these:

"hits": [
  {
    "_index": "locali_v3",
    "_type": "categories",
    "_id": "5810767ddc536a03b4761acd",
    "_score": 3.1974547,
    "_source": {
      "keywords": [
        "Radio Station",
        "Radio Station"
      ]
    }
  },
  {
    "_index": "locali_v3",
    "_type": "categories",
    "_id": "581076d8dc536a03b4761cc3",
    "_score": 3.0407648,
    "_source": {
      "keywords": [
        "Stationery Store",
        "Stationery Store"
      ]
    }
  },
  {
    "_index": "locali_v3",
    "_type": "categories",
    "_id": "5810767ddc536a03b4761ace",
    "_score": 2.903595,
    "_source": {
      "keywords": [
        "TV Station",
        "TV Station"
      ]
    }
  },
  {
    "_index": "locali_v3",
    "_type": "categories",
    "_id": "581076cddc536a03b4761c87",
    "_score": 2.517158,
    "_source": {
      "keywords": [
        "Praktoreio Ugrwn Kausimwn/Gkaraz",
        "Praktoreio Ygrwn Kaysimwn/Gkaraz",
        "Praktoreio Ugron Kausimon/Gkaraz",
        "Praktoreio Ygron Kaysimon/Gkaraz",
        "Πρακτορείο Υγρών Καυσίμων/Γκαράζ",
        "Gas Station"
      ]
    }
  }
]

The "Gas Station" document is fourth, although it has the best single-element match. Is there a way to tell Elasticsearch that I do not care how many times "gas" or "station" appears in keywords? I want the score to be determined by the best-matching single element of the "keywords" array.

My settings are:

{
  "locali": {
    "settings": {
      "index": {
        "creation_date": "1480937810266",
    "analysis": {
      "filter": {
        "autocomplete_filter": {
          "type": "edge_ngram",
          "min_gram": "1",
          "max_gram": "20"
        }
      },
      "analyzer": {
        "keywords": {
          "filter": [
            "lowercase",
            "autocomplete_filter"
          ],
          "char_filter": [
            "my_char_filter"
          ],
          "type": "custom",
          "tokenizer": "standard"
        }
      },
      "char_filter": {
        "my_char_filter": {
          "type": "mapping",
          "mappings": [
            "ί => ι",
            "Ί => Ι",
            "ή => η",
            "Ή => Η",
            "ύ => υ",
            "Ύ => Υ",
            "ά => α",
            "Ά => Α",
            "έ => ε",
            "Έ => Ε",
            "ό => ο",
            "Ό => Ο",
            "ώ => ω",
            "Ώ => Ω",
            "ϊ => ι",
            "ϋ => υ",
            "ΐ => ι",
            "ΰ => υ"
          ]
        }
      }
    },
    "number_of_shards": "1",
    "number_of_replicas": "1",
    "uuid": "TJjOt9L9QE2HrsUFHM6zJg",
    "version": {
      "created": "2040099"
    }
  }
}
  }
}

And the mappings:

{
  "locali": {
"mappings": {
  "places": {
    "properties": {
      "formattedCategories": {
        "properties": {
          "english": {
            "type": "string"
          },
          "greek": {
            "type": "string"
          }
        }
      },
      "keywords": {
        "type": "string",
        "analyzer": "keywords"
      },
      "loc": {
        "properties": {
          "coordinates": {
            "type": "geo_point"
          }
        }
      },
      "location": {
        "properties": {
          "formattedAddress": {
            "properties": {
              "english": {
                "type": "string"
              },
              "greek": {
                "type": "string"
              }
            }
          },
          "locality": {
            "properties": {
              "english": {
                "type": "string"
              },
              "greek": {
                "type": "string"
              }
            }
          },
          "neighbourhood": {
            "properties": {
              "english": {
                "type": "string"
              },
              "greek": {
                "type": "string"
              }
            }
          }
        }
      },
      "name": {
        "properties": {
          "english": {
            "type": "string"
          },
          "greek": {
            "type": "string"
          }
        }
      },
      "rating": {
        "properties": {
          "rating": {
            "type": "long"
          }
        }
      },
      "seenDetails": {
        "type": "long"
      },
      "verified": {
        "type": "long"
      }
    }
  },
  "regions": {
    "properties": {
      "keywords": {
        "type": "string",
        "analyzer": "keywords"
      },
      "loc": {
        "properties": {
          "coordinates": {
            "type": "geo_point"
          }
        }
      },
      "name": {
        "properties": {
          "english": {
            "type": "string"
          },
          "greek": {
            "type": "string"
          }
        }
      },
      "type": {
        "type": "long"
      },
      "weight": {
        "type": "long"
      }
    }
  },
  "categories": {
    "properties": {
      "keywords": {
        "type": "string",
        "analyzer": "keywords"
      },
      "name": {
        "properties": {
          "english": {
            "type": "string"
          },
          "greek": {
            "type": "string"
          }
        }
      },
      "weight": {
        "type": "long"
      }
    }
  }
}
  }
}
Manolis Karamanis
  • 758
  • 1
  • 10
  • 27

1 Answer

0

Can you also post the query you are trying? I tried your example with the following query:

{
  "query": {"match": {
    "keywords": "gas station"
    }
  }
}

And I got your desired result:

{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 3,
    "max_score": 0.081366636,
    "hits": [
      {
        "_index": "stack",
        "_type": "type",
        "_id": "AVjP6QnpdNp-z_ybGd-L",
        "_score": 0.081366636,
        "_source": {
          "keywords": [
            "Praktoreio Ugrwn Kausimwn/Gkaraz",
            "Praktoreio Ygrwn Kaysimwn/Gkaraz",
            "Praktoreio Ugron Kausimon/Gkaraz",
            "Praktoreio Ygron Kaysimon/Gkaraz",
            "Πρακτορείο Υγρών Καυσίμων/Γκαράζ",
            "Gas Station"
          ]
        }
      },
      {
        "_index": "stack",
        "_type": "type",
        "_id": "AVjP5-u5dNp-z_ybGd-I",
        "_score": 0.03182549,
        "_source": {
          "keywords": [
            "Radio Station",
            "Radio Station"
          ]
        }
      },
      {
        "_index": "stack",
        "_type": "type",
        "_id": "AVjP6KiKdNp-z_ybGd-K",
        "_score": 0.03182549,
        "_source": {
          "keywords": [
            "TV Station",
            "TV Station"
          ]
        }
      }
    ]
  }
}

Try this query to see if you get the desired result. If this doesn't work for you, please reply with your mappings, query, and ES version.

Hope this solves your problem. Thanks

user3775217
  • 4,675
  • 1
  • 22
  • 33
  • I have an autocomplete analyzer named "keywords". I am using the same query but the results are the ones I described above. – Manolis Karamanis Dec 05 '16 at 17:34
  • OK. Try the explain API to learn why it scores differently with the analyzer, and also check the analyzed terms that this analyzer produces and stores in the inverted index. – user3775217 Dec 05 '16 at 18:04
  • I looked around a bit: it is caused by different TF/IDF values in the inverted index, since you used an edge n-gram token filter in your analyzer. These TF and IDF factors are responsible for the higher scores of the first 3 documents. – user3775217 Dec 05 '16 at 18:17
  • Try these: http://stackoverflow.com/questions/33208587/elasticsearch-score-disable-idf and https://www.elastic.co/guide/en/elasticsearch/guide/current/scoring-theory.html. You can either ignore these scoring factors or remove the edge n-gram filter. – user3775217 Dec 05 '16 at 18:20