I have a single collection (observation) in a DocumentDB cluster. The collection was recently purged, but it often grows to more than 200 GB of documents. Below is an example document.
Example document
{
    "_id" : ObjectId("5edfe9eb8b9b6d37ffc2b9ec"),
    "deviceId" : "5e86371746e0fb0001cbbf9b",
    "data" : {
        "type" : "HUMIDITY",
        "reading" : 20.1,
        "units" : "rh"
    },
    "timestamp" : ISODate("2020-06-10T05:00:00.000Z")
}
The poorly performing query is:
db.observation.aggregate([
    {
        // Restrict to one device and a timestamp window (req.params values come from the HTTP request)
        "$match": {
            "deviceId": req.params.deviceId,
            "timestamp": { "$gte": new Date(req.params.timestamp), "$lt": new Date() }
        }
    },
    {
        "$group": {
            // Bucket each reading into an aggMins-minute window by rounding the timestamp
            // down to the nearest multiple of (1000 * 60 * aggMins) milliseconds since the epoch
            "_id": {
                "$add": [
                    { "$subtract": [
                        { "$subtract": [ "$timestamp", new Date(0) ] },
                        { "$mod": [ { "$subtract": [ "$timestamp", new Date(0) ] }, 1000 * 60 * aggMins ] }
                    ] },
                    new Date(0)
                ]
            },
            "timestamp": { "$first": "$timestamp" },
            "units": { "$first": "$data.units" },
            "avg": { "$avg": "$data.reading" }
        }
    },
    {
        "$project": {
            "_id": 0,
            "timestamp": 1,
            "avg": 1,
            "units": 1
        }
    }
])
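For clarity, the $group key is meant to bucket readings into aggMins-minute windows: the inner $subtract converts the timestamp to milliseconds since the epoch, the $mod/$subtract pair rounds that down to the nearest bucket boundary, and the outer $add converts the result back into a date. A minimal plain-JavaScript sketch of the same calculation (the aggMins value and sample timestamp below are illustrative, not taken from the application):

// Equivalent bucket calculation in plain JavaScript (illustrative values)
const aggMins = 15;                                  // assumed bucket width in minutes
const bucketMs = 1000 * 60 * aggMins;                // bucket width in milliseconds
const ts = new Date("2020-06-10T05:07:42.000Z");     // sample reading timestamp
const ms = ts.getTime();                             // $subtract: [ "$timestamp", new Date(0) ]
const bucketStart = new Date(ms - (ms % bucketMs));  // floor to the bucket boundary
console.log(bucketStart.toISOString());              // "2020-06-10T05:00:00.000Z"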
Running explain() on the query results in:
{
    "queryPlanner" : {
        "plannerVersion" : 1,
        "namespace" : "observation",
        "winningPlan" : {
            "stage" : "HASH_AGGREGATE",
            "inputStage" : {
                "stage" : "IXSCAN",
                "indexName" : "deviceId_1",
                "direction" : "forward"
            }
        }
    },
    "executionStats" : {
        "executionSuccess" : true,
        "executionTimeMillis" : "13092.126",
        "planningTimeMillis" : "8.470",
        "executionStages" : {
            "stage" : "HASH_AGGREGATE",
            "nReturned" : "1",
            "executionTimeMillisEstimate" : "13083.523",
            "inputStage" : {
                "stage" : "IXSCAN",
                "nReturned" : "240266",
                "executionTimeMillisEstimate" : "12915.796",
                "indexName" : "deviceId_1",
                "direction" : "forward"
            }
        }
    },
    "ok" : 1
}
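For reference, the only index the winning plan touches is deviceId_1, which (going by the standard index-naming convention) would correspond to a single-field index created with something like:

db.observation.createIndex({ "deviceId": 1 })

If I read the plan correctly, the timestamp range from the $match cannot be used to narrow the index scan itself, which may explain the 240,266 documents returned by the IXSCAN stage.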
I am seeking advice on how to optimize the above query.