0

Hi I need to write a specific query that will aggregate data by work shifts in selected time range during days. The issue is I do not want to specify all the ranges directly in date_range aggregation, just want to specify the from -> to time range for the specific day of the aggregation. Is there any possibility how to do this the easy way ?

I have this kind of query:

{
    "_source": false,
    "size": 10000,
    "query": {
        "bool": {
            "must": [
                {
                    "terms": {
                        "streamId": [
                            "ENRG_0054"
                        ]
                    }
                },
                {
                    "range": {
                        "timestamp": {
                            "gte": "2021-02-01T00:00:00Z",
                            "lte": "2021-02-10T01:00:00Z"
                        }
                    }
                }
            ]
        }
    },
    "sort": [
        {
            "timestamp": {
                "order": "asc"
            }
        },
        {
            "_score": {
                "order": "asc"
            }
        }
    ],
    "aggs": {
        "streamId": {
            "terms": {
                "field": "streamId",
                "size": 10000
            },
            "aggs": {
                "days": {
                    "date_histogram": {
                        "field": "timestamp",
                        "interval": "1d"
                    },
                    "aggs": {
                        "shifts": {
                            "date_range": {
                                "field": "timestamp",
                                "format": "HH:mm",
                                "ranges": [
                                    {
                                        "key": "MORNING",
                                        "from": "06:00",
                                        "to": "14:00"
                                    },
                                    {
                                        "key": "AFTERNOON",
                                        "from": "14:00",
                                        "to": "22:00"
                                    }
                                ],
                                "keyed": true
                            },
                            "aggs": {
                                "MAX": {
                                    "max": {
                                        "field": "@floatMessage.value.value"
                                    }
                                },
                                "MIN": {
                                    "min": {
                                        "field": "@floatMessage.value.value"
                                    }
                                },
                                "DIFF": {
                                    "bucket_script": {
                                        "buckets_path": {
                                            "min": "MIN",
                                            "max": "MAX"
                                        },
                                        "script": {
                                            "source": "return (params.max-params.min)"
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}

but in the result I am getting null for the values as the time ranges are not specified with date.

  "aggregations": {
        "streamId": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {
                    "key": "ENRG_0054",
                    "doc_count": 13343,
                    "days": {
                        "buckets": [
                            {
                                "key_as_string": "2021-02-01T00:00:00.000Z",
                                "key": 1612137600000,
                                "doc_count": 2763,
                                "shifts": {
                                    "buckets": {
                                        "MORNING": {
                                            "from": 2.16E7,
                                            "from_as_string": "06:00",
                                            "to": 5.04E7,
                                            "to_as_string": "14:00",
                                            "doc_count": 0,
                                            "MIN": {
                                                "value": null
                                            },
                                            "MAX": {
                                                "value": null
                                            }
                                        },
                                        "AFTERNOON": {
                                            "from": 5.04E7,
                                            "from_as_string": "14:00",
                                            "to": 7.92E7,
                                            "to_as_string": "22:00",
                                            "doc_count": 0,
                                            "MIN": {
                                                "value": null
                                            },
                                            "MAX": {
                                                "value": null
                                            }
                                        }
                                    }
                                }
                            },

example doc:

{
    "streamId": "ENRG_0054",
    "created": "2021-02-01T00:19:42.905Z",
    "extra": {},
    "location": null,
    "model": "floatMessage",
    "id": "6017491eb112b21488f6c843",
    "value": {
      "unit": "°C",
      "value": 18.94,
      "messageProcessed": "2021-02-01T00:19:41.595Z"
    },
    "timestamp": "2021-02-01T00:19:39.161Z",
    "tags": []
  }


When I generate all the date_ranges for desired timestamp range for the whole query the result is ok, is this the only way how to get the desired result or somebody can suggest how to update the query to meet my requirements ? thx

Joe - GMapsBook.com
  • 15,787
  • 4
  • 23
  • 68

1 Answers1

2

The reason you're not seeing any buckets inside of the data_range aggregation has to do with the datetime vs date inference -- similar to the one I discussed here a while ago.

In short, the date_range aggregation appears confusing when handling time values (HH:mm) as opposed to full datetime values (MM-dd-yyyy HH:mm) because:

  • if no year is provided, it'll default to 1970
  • if no month is provided, it'll default to Jan
  • if no day is provided, it'll default to the 1st of the month (if no month is provided, it'll default to Jan)
  • and so on.

You see, if you added just the year component:

"date_range": {
  "field": "timestamp",
  "format": "HH:mm yyyy",    <---
  "ranges": [
    {
      "key": "MORNING",
      "from": "06:00 2021",  <---
      "to": "14:00 2021"     <---
    }
  ],
  "keyed": true
}

Elasticsearch would return:

"MORNING" : {
  "from" : 2.16E7,
  "from_as_string" : "06:00 1970",   <--- 
  "to" : 5.04E7,
  "to_as_string" : "14:00 1970",     <--- 
  ...
}

Adding a month would solve this particular point-in-time problem but would of course introduce the problem of just being able to aggregate on one single month of one concrete year.

So I'd propose the following

  1. Add one more date field, called time, to your mapping:
{
  "mappings": {
    "properties": {
      "streamId": {
        "type": "keyword"
      },
      ...
      "time": {
        "type": "date",             <---
        "format": "HH:mm:ss.SSSz"
      }
    }
  }
}
  1. Add this new field to each doc (or use an ingest pipeline, or a scripted _update_by_query call):
{
  "streamId": "ENRG_0054",
  ...
  "timestamp": "2021-02-01T00:19:39.161Z",
  "time": "00:19:39.161Z",                 <---
  "tags": []
}
  1. Use the same query as above but aggregate on the time field instead
"days": {
  "date_histogram": {
    "field": "timestamp",     <---
    "interval": "1d"
  },
  "aggs": {
    "shifts": {
      "date_range": {
        "field": "time",      <---
        "format": "HH:mm",
        "ranges": [

That's all there's to it!

P.S. Under the hood, the time values will be auto-assigned to 1970 but that's fine because you're only interested in the time values.

Joe - GMapsBook.com
  • 15,787
  • 4
  • 23
  • 68