0

In the post index, postid is the primary key and userid is a foreign key.

I want to fetch posts, but only one post per userid, so that each user appears with exactly one post in the results, sorted by postdate (optionally latest first).

//Actual Result
[
    {
        userid: "u1",
        postid: "p1"
    },
    {
        userid: "u1",
        postid: "p2"
    },
    {
        userid: "u2",
        postid: "p3"
    },
    {
        userid: "u3",
        postid: "p4"
    },
    {
        userid: "u3",
        postid: "p5"
    },
    {
        userid: "u3",
        postid: "p6"
    }
]

needed as below

//Expecting Result
[
    {
        userid: "u1",
        postid: "p1"
    },
    {
        userid: "u2",
        postid: "p3"
    },
    {
        userid: "u3",
        postid: "p4"
    }
]
Ratan Uday Kumar
  • 5,738
  • 6
  • 35
  • 54

3 Answers

1

Assuming an index mapping of the form:

PUT user_posts
{
  "mappings": {
    "properties": {
      "userid": {
        "type": "keyword"
      },
      "postid": {
        "type": "keyword"
      },
      "postdate": {
        "type": "date"
      }
    }
  }
}

You could:

  1. aggregate on the userid and order the IDs alphabetically
  2. sub-aggregate on the postid and sort the posts by postdate descending via a max aggregation (named latest_posttime below).
  3. filter the response through the filter_path option to only retrieve what you need
POST user_posts/_search?filter_path=aggregations.*.buckets.key,aggregations.*.buckets.*.buckets.key
{
  "size": 0,
  "aggs": {
    "by_userid": {
      "terms": {
        "field": "userid",
        "order": {
          "_key": "asc"
        }, 
        "size": 100
      },
      "aggs": {
        "by_latest_postid": {
          "terms": {
            "field": "postid",
            "size": 1,
            "order": {
              "latest_posttime": "desc"
            }
          },
          "aggs": {
            "latest_posttime": {
              "max": {
                "field": "postdate"
              }
            }
          }
        }
      }
    }
  }
}

Yielding:

{
  "aggregations" : {
    "by_userid" : {
      "buckets" : [
        {
          "key" : "u1",
          "by_latest_postid" : {
            "buckets" : [
              {
                "key" : "p1"
              }
            ]
          }
        },
        {
          "key" : "u2",
          "by_latest_postid" : {
            "buckets" : [
              {
                "key" : "p3"
              }
            ]
          }
        },
        {
          "key" : "u3",
          "by_latest_postid" : {
            "buckets" : [
              {
                "key" : "p4"
              }
            ]
          }
        }
      ]
    }
  }
}

which you can then post-process as you normally would:

...
const response = await ...; // transform the above request for use in the ES JS lib of your choice

// Flatten each per-user bucket into a { userid, postid } pair, taking the
// first post bucket returned by the `by_latest_postid` sub-aggregation.
const result = response.aggregations.by_userid.buckets.map(b => {
    // An empty `buckets` array is truthy, so guard the first element itself
    // (not the array) to avoid reading `.key` of undefined.
    const [latest] = b.by_latest_postid.buckets || [];
    return {
        userid: b.key,
        postid: latest && latest.key
    };
});
Joe - GMapsBook.com
  • 15,787
  • 4
  • 23
  • 68
1

You can use the top hits sub-aggregation. So first do a terms aggregation by userId, then you can use top-hits with a sort by post-date to get the latest post by each user.

I should say that if you have many userIds and you want the top hit for each one, you should probably use composite aggregation as your top-level agg, and not terms.

Doron Yaacoby
  • 9,412
  • 8
  • 48
  • 59
1

I think you can use the top hits aggregation for this. Here is a sample:

DELETE my-index-000001

PUT my-index-000001
{
  "mappings": {
    "properties": {
      "userid": {
        "type": "keyword"
      },
      "postid": {
        "type": "keyword"
      },
      "postdate": {
        "type": "date"
      }
    }
  }
}

PUT my-index-000001/_doc/1
{"userid": "u1", "postid": "p1", "postdate": "2021-03-01"}

PUT my-index-000001/_doc/2
{"userid": "u1", "postid": "p2", "postdate": "2021-03-02"}

PUT my-index-000001/_doc/3
{"userid": "u2", "postid": "p3", "postdate": "2021-03-03"}

PUT my-index-000001/_doc/4
{"userid": "u3", "postid": "p4", "postdate": "2021-03-04"}

PUT my-index-000001/_doc/5
{"userid": "u3", "postid": "p5", "postdate": "2021-03-05"}

PUT my-index-000001/_doc/6
{"userid": "u3", "postid": "p6", "postdate": "2021-03-06"}

These are the steps to create the sample index. And here is the query:

GET my-index-000001/_search
{
  "size": 0,
  "aggs": {
    "top_users": {
      "terms": {
        "field": "userid",
        "size": 100
      },
      "aggs": {
        "top": {
          "top_hits": {
            "sort": [
              {
                "postdate": {
                  "order": "desc"
                }
              }
            ],
            "_source": {
              "includes": [ "postdate", "postid" ]
            },
            "size": 1
          }
        }
      }
    }
  }
}

In the result set, you can see the top post for every user inside the aggregations:

{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 6,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "top_users" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "u3",
          "doc_count" : 3,
          "top" : {
            "hits" : {
              "total" : {
                "value" : 3,
                "relation" : "eq"
              },
              "max_score" : null,
              "hits" : [
                {
                  "_index" : "my-index-000001",
                  "_type" : "_doc",
                  "_id" : "6",
                  "_score" : null,
                  "_source" : {
                    "postdate" : "2021-03-06",
                    "postid" : "p6"
                  },
                  "sort" : [
                    1614988800000
                  ]
                }
              ]
            }
          }
        },
        {
          "key" : "u1",
          "doc_count" : 2,
          "top" : {
            "hits" : {
              "total" : {
                "value" : 2,
                "relation" : "eq"
              },
              "max_score" : null,
              "hits" : [
                {
                  "_index" : "my-index-000001",
                  "_type" : "_doc",
                  "_id" : "2",
                  "_score" : null,
                  "_source" : {
                    "postdate" : "2021-03-02",
                    "postid" : "p2"
                  },
                  "sort" : [
                    1614643200000
                  ]
                }
              ]
            }
          }
        },
        {
          "key" : "u2",
          "doc_count" : 1,
          "top" : {
            "hits" : {
              "total" : {
                "value" : 1,
                "relation" : "eq"
              },
              "max_score" : null,
              "hits" : [
                {
                  "_index" : "my-index-000001",
                  "_type" : "_doc",
                  "_id" : "3",
                  "_score" : null,
                  "_source" : {
                    "postdate" : "2021-03-03",
                    "postid" : "p3"
                  },
                  "sort" : [
                    1614729600000
                  ]
                }
              ]
            }
          }
        }
      ]
    }
  }
}
hkulekci
  • 1,894
  • 15
  • 27
  • Thanks for your effort. It works, but I have millions of records; how can I use pagination inside the aggregation, like `from`? – Ratan Uday Kumar Mar 19 '21 at 04:55
  • https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-composite-aggregation.html The composite aggregation will solve pagination for aggs – hkulekci Apr 04 '21 at 21:15