3

Here is my query -

{
    "from": 0,
    "size": 100,
    "query": {
        "bool": {
            "filter": [
                {
                    "terms": {
                        "folderId.keyword": [
                            "ff98505e-cdff-43aa-8b05-197bc3f3265e"
                        ],
                        "boost": 1
                    }
                },
                {
                    "terms": {
                        "objectType.keyword": [
                            "File"
                        ],
                        "boost": 1
                    }
                },
                {
                    "term": {
                        "tenantId": {
                            "value": "34202",
                            "boost": 1
                        }
                    }
                }
            ],
            "adjust_pure_negative": true,
            "boost": 1
        }
    },
      "aggs":{
        "_byformat":{
            "terms":{
               "field":"format.keyword",
               "min_doc_count":0,
               "size":200
            }
        }
   }
}

result with min_doc_count = 0 -

  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 3,
    "max_score" : 0.0,
    "hits" : [
      {
        "_index" : "plnesdv1-34202-1",
        "_type" : "_doc",
        "_id" : "6adbda83-53ad-457f-a2ab-d5b04c643005",
        "_score" : 0.0,
        "_source" : {
          "format" : "vnd.openxmlformats-officedocument.spreadsheetml.sheet",
          "externalSharing" : "N",
          "description" : null,
          "dateModified" : null,
          "type" : "application",
          "folderId" : "ff98505e-cdff-43aa-8b05-197bc3f3265e",
          "tags" : [ ],
          "objectType" : "File",
          "dateCreated" : null,
          "name" : "New XLSX file",
          "tenantId" : "34202",
          "modifiedBy" : "rdt001",
          "id" : "6adbda83-53ad-457f-a2ab-d5b04c643005",
          "status" : "active",
          "expirationDate" : null
        }
      },
      {
        "_index" : "plnesdv1-34202-1",
        "_type" : "_doc",
        "_id" : "b1000a15-2d80-41f4-a5df-ba5c27f8e9c6",
        "_score" : 0.0,
        "_source" : {
          "format" : "vnd.ms-excel",
          "externalSharing" : "N",
          "description" : null,
          "dateModified" : null,
          "type" : "application",
          "folderId" : "ff98505e-cdff-43aa-8b05-197bc3f3265e",
          "tags" : [ ],
          "objectType" : "File",
          "dateCreated" : null,
          "name" : "New XLS file",
          "tenantId" : "34202",
          "modifiedBy" : "rdt001",
          "id" : "b1000a15-2d80-41f4-a5df-ba5c27f8e9c6",
          "status" : "active",
          "expirationDate" : null
        }
      },
      {
        "_index" : "plnesdv1-34202-1",
        "_type" : "_doc",
        "_id" : "630e9f49-3368-408d-a091-03f253127004",
        "_score" : 0.0,
        "_source" : {
          "format" : "msword",
          "externalSharing" : "N",
          "description" : null,
          "dateModified" : null,
          "type" : "application",
          "folderId" : "ff98505e-cdff-43aa-8b05-197bc3f3265e",
          "tags" : [ ],
          "objectType" : "File",
          "dateCreated" : null,
          "name" : "New DOC file",
          "tenantId" : "34202",
          "modifiedBy" : "rdt001",
          "id" : "630e9f49-3368-408d-a091-03f253127004",
          "status" : "active",
          "expirationDate" : null
        }
      }
    ]
  },
  "aggregations" : {
    "_byformat" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "msword",
          "doc_count" : 1
        },
        {
          "key" : "vnd.ms-excel",
          "doc_count" : 1
        },
        {
          "key" : "vnd.openxmlformats-officedocument.spreadsheetml.sheet",
          "doc_count" : 1
        },
        {
          "key" : "bmp",
          "doc_count" : 0
        },
        {
          "key" : "gif",
          "doc_count" : 0
        },
        {
          "key" : "html",
          "doc_count" : 0
        }       
      ]
    }
  }
}

result with min_doc_count = 1 -

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 3,
    "max_score" : 0.0,
    "hits" : [
      {
        "_index" : "plnesdv1-34202-1",
        "_type" : "_doc",
        "_id" : "6adbda83-53ad-457f-a2ab-d5b04c643005",
        "_score" : 0.0,
        "_source" : {
          "format" : "vnd.openxmlformats-officedocument.spreadsheetml.sheet",
          "externalSharing" : "N",
          "description" : null,
          "dateModified" : null,
          "type" : "application",
          "folderId" : "ff98505e-cdff-43aa-8b05-197bc3f3265e",
          "tags" : [ ],
          "objectType" : "File",
          "dateCreated" : null,
          "name" : "New XLSX file",
          "tenantId" : "34202",
          "modifiedBy" : "rdt001",
          "id" : "6adbda83-53ad-457f-a2ab-d5b04c643005",
          "status" : "active",
          "expirationDate" : null
        }
      },
      {
        "_index" : "plnesdv1-34202-1",
        "_type" : "_doc",
        "_id" : "b1000a15-2d80-41f4-a5df-ba5c27f8e9c6",
        "_score" : 0.0,
        "_source" : {
          "format" : "vnd.ms-excel",
          "externalSharing" : "N",
          "description" : null,
          "dateModified" : null,
          "type" : "application",
          "folderId" : "ff98505e-cdff-43aa-8b05-197bc3f3265e",
          "tags" : [ ],
          "objectType" : "File",
          "dateCreated" : null,
          "name" : "New XLS file",
          "tenantId" : "34202",
          "modifiedBy" : "rdt001",
          "id" : "b1000a15-2d80-41f4-a5df-ba5c27f8e9c6",
          "status" : "active",
          "expirationDate" : null
        }
      },
      {
        "_index" : "plnesdv1-34202-1",
        "_type" : "_doc",
        "_id" : "630e9f49-3368-408d-a091-03f253127004",
        "_score" : 0.0,
        "_source" : {
          "format" : "msword",
          "externalSharing" : "N",
          "description" : null,
          "dateModified" : null,
          "type" : "application",
          "folderId" : "ff98505e-cdff-43aa-8b05-197bc3f3265e",
          "tags" : [ ],
          "objectType" : "File",
          "dateCreated" : null,
          "name" : "New DOC file",
          "tenantId" : "34202",
          "modifiedBy" : "rdt001",
          "id" : "630e9f49-3368-408d-a091-03f253127004",
          "status" : "active",
          "expirationDate" : null
        }
      }
    ]
  },
  "aggregations" : {
    "_byformat" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "msword",
          "doc_count" : 1
        },
        {
          "key" : "vnd.ms-excel",
          "doc_count" : 1
        },
        {
          "key" : "vnd.openxmlformats-officedocument.spreadsheetml.sheet",
          "doc_count" : 1
        }
      ]
    }
  }
}

Aggregations are correct when min_doc_count = 1 and only buckets relevant to hits are fetched.

Could anyone tell me why aggregation is fetching all buckets when min_doc_count = 0 is set. I have gone through the elastic search documentation, it states that this behavior is by design, any other way to get aggregation buckets only for hits and with zero count as well.

Kiran Pawar
  • 79
  • 1
  • 8

2 Answers2

1

First of all you need to understand what is the meaning of seeing buckets with zero counts.

Below is an excerpt from the Terms Aggregation link:

Setting min_doc_count=0 will also return buckets for terms that didn’t match any hit. However, some of the returned terms which have a document count of zero might only belong to deleted documents or documents from other types, so there is no warranty that a match_all query would find a positive document count for those terms.

So most likely it appears to be the count for the deleted documents.

Note that the aggregation would only get calculated on the documents that get filtered by the query.

However you need to keep in mind that while ES keeps merging the segments of indexes behind the scenes(that happens during deletion process), the results for count with 0 may not be consistent and over a period of time may, eventually (if no further dos are deleted from that point onwards) you may not get any terms with 0 count at all once the merging process is completed.

So in a way it is safe to say to your business leads, that they are the counts for deleted docs and you can push the above argument to them. And if they say they need count of docs/terms of deleted docs, it is like finding a document/terms which does not exist in the index and it does not even make sense right.

As per why does this still shows, that is probably due to the segment merging process that happens in ES and it is by design.

So no, you cannot apply query/filter on deleted documents (take a step back and imagine that) and hence you cannot control the data related to docs not available in first place.

Aggregation Query:

You can make use of the below aggregation which would give you as per your requirement mentioned in the comment:

POST <your_index_name>/_search
{
  "size": 0,
  "aggs": {
    "myaggs_count_zero": {                       <--- Agg for count 0
      "terms": {
        "field": "format.keyword"
      },
      "aggs": {
        "document_counts": {
          "value_count" : {
            "field" : "format.keyword"
          }
        },
        "by_account_filtered": {
          "bucket_selector": {
            "buckets_path": {
              "totalDocs": "document_counts"
            },
            "script": "params.totalDocs == 0"
          }
        }
      }
    },
    "myaggs_count_not_zero": {                  <--- Agg for normal count
      "terms": {
        "field": "format.keyword",
        "min_doc_count": 1
      }
    }
  }
}

Note that I've made use of Terms, Value Count and Bucket Selector Aggregations

This may not be what you are looking for but I hope that helps!

Kamal Kunjapur
  • 8,547
  • 2
  • 22
  • 32
  • It helps to understand the documentation to a great depth, thank you. And yes, looks like either i have to use multiple queries to deduce buckets with zero count or pass on the above design mentioned by you two product manager. Still will wait for any other hints and opinions. – Kiran Pawar Jun 26 '20 at 16:36
  • Thanks Kamal! yes, it's one part of my question and I was looking out for both together with single query, i.e. buckets with zero count and buckets with actual count on returned results. – Kiran Pawar Jun 26 '20 at 18:18
  • @KiranPawar I've added the aggregation query as per your requirements. Let me know if that helps. Feel free to accept and/or upvote the answer if you think that would help you. – Kamal Kunjapur Jun 27 '20 at 05:52
  • Thanks Kamal, I tried with above query and did not expected results along with zero count. – Kiran Pawar Jun 29 '20 at 05:19
  • @KiranPawar Are you saying that issue still persists? – Kamal Kunjapur Jun 29 '20 at 05:46
  • Yes, I am not getting buckets with zero count for hits. – Kiran Pawar Jun 29 '20 at 06:13
  • @KiranPawar I guess in that case you do not have the means to control the docs with 0 count. – Kamal Kunjapur Jun 29 '20 at 07:15
  • Yes Kamal, so now will try with two different queries and will merge results to get desired output. – Kiran Pawar Jun 29 '20 at 07:24
  • Solution was to do with two queries. – Kiran Pawar Nov 18 '22 at 06:48
0

Seems you slightly misunderstand the min_doc_count on aggregation,

Setting min_doc_count=0 will also return buckets for terms that didn’t match any hit

It is possible to only return terms that match more than a configured number of hits using the min_doc_count option:

 "aggs":{
        "_byformat":{
            "terms":{
               "field":"format.keyword",
               "min_doc_count":0,
               "size":200
            }
        }
   }

The above aggregation would only return keyword which has been found in 0 hits or more. 0 means all with/without, 1 means minimum 1 document should have that keyword.

A l w a y s S u n n y
  • 36,497
  • 8
  • 60
  • 103
  • Yes, you are right. I was looking out for a way to fetch buckets with zero count but on query results or hits. Is there any way? – Kiran Pawar Jun 26 '20 at 12:06
  • 1
    More specifically, I only need all buckets falling under filter mentioned in above query but with buckets of zero count as well. Currently, it is returning all buckets from whole data under that index with zero count. – Kiran Pawar Jun 26 '20 at 12:20