Home > Back-end >  Find average aggregation of the top N buckets
Find average aggregation of the top N buckets

Time:10-04

I'm trying to determine the average value of the "count" column for the top 5 documents ordered by "timestamp" (in descending order).

Sample Index data:

{"index":{}}
{"title": "foo", "count": 1, "timestamp":"2022-09-15T09:46:43.958Z"}
{"index":{}}
{"title": "foo", "count": 2, "timestamp":"2022-09-14T09:46:43.958Z"}
{"index":{}}
{"title": "foo", "count": 3, "timestamp":"2022-09-16T09:46:43.958Z"}
{"index":{}}
{"title": "foo", "count": 4, "timestamp":"2022-09-13T09:46:43.958Z"}
{"index":{}}
{"title": "foo", "count": 5, "timestamp":"2022-09-12T09:46:43.958Z"}
{"index":{}}
{"title": "foo", "count": 6, "timestamp":"2022-09-11T09:46:43.958Z"}
{"index":{}}
{"title": "foo", "count": 7, "timestamp":"2022-09-19T09:46:43.958Z"}
{"index":{}}
{"title": "foo", "count": 8, "timestamp":"2022-09-18T09:46:43.958Z"}
{"index":{}}
{"title": "foo", "count": 9, "timestamp":"2022-09-10T09:46:43.958Z"}
{"index":{}}
{"title": "foo", "count": 10, "timestamp":"2022-09-17T09:46:43.958Z"}
{"index":{}}
{"title": "bar", "count": 20, "timestamp":"2022-09-10T09:46:43.958Z"}
{"index":{}}
{"title": "bar", "count": 21, "timestamp":"2022-09-11T09:46:43.958Z"}
{"index":{}}
{"title": "bar", "count": 22, "timestamp":"2022-09-12T09:46:43.958Z"}
{"index":{}}
{"title": "bar", "count": 23, "timestamp":"2022-09-13T09:46:43.958Z"}
{"index":{}}
{"title": "bar", "count": 24, "timestamp":"2022-09-14T09:46:43.958Z"}
{"index":{}}
{"title": "bar", "count": 25, "timestamp":"2022-09-15T09:46:43.958Z"}
{"index":{}}
{"title": "bar", "count": 26, "timestamp":"2022-09-17T09:46:43.958Z"}
{"index":{}}
{"title": "bar", "count": 27, "timestamp":"2022-09-18T09:46:43.958Z"}


Search Query (using top hits aggregation):

{
    "size": 0,
    "aggs": {
        "group_by_title": {
            "terms": {
                "field": "title.keyword",
                "size": 10
            },
            "aggs": {
                "take-N-tracks": {
                    "top_hits": {
                        "sort": [
                            {
                                "timestamp": {
                                    "order": "desc"
                                }
                            }
                        ],
                        "_source": {
                            "includes": [
                                "count",
                                "title"
                            ]
                        },
                        "size": 5
                    }
                }
            }
        }
    }
}

Search Result:

{
    "took": 32,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 18,
            "relation": "eq"
        },
        "max_score": null,
        "hits": []
    },
    "aggregations": {
        "group_by_index": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {
                    "key": "foo",
                    "doc_count": 10,
                    "take-N-tracks": {
                        "hits": {
                            "total": {
                                "value": 10,
                                "relation": "eq"
                            },
                            "max_score": null,
                            "hits": [
                                {
                                    "_index": "testidx",
                                    "_id": "-jKbjIMB37c_U4Ro89zA",
                                    "_score": null,
                                    "_source": {
                                        "title": "foo",
                                        "count": 7
                                    },
                                    "sort": [
                                        1663580803958
                                    ]
                                },
                                {
                                    "_index": "testidx",
                                    "_id": "-zKbjIMB37c_U4Ro89zA",
                                    "_score": null,
                                    "_source": {
                                        "title": "foo",
                                        "count": 8
                                    },
                                    "sort": [
                                        1663494403958
                                    ]
                                },
                                {
                                    "_index": "testidx",
                                    "_id": "_TKbjIMB37c_U4Ro89zA",
                                    "_score": null,
                                    "_source": {
                                        "title": "foo",
                                        "count": 10
                                    },
                                    "sort": [
                                        1663408003958
                                    ]
                                },
                                {
                                    "_index": "testidx",
                                    "_id": "9jKbjIMB37c_U4Ro89zA",
                                    "_score": null,
                                    "_source": {
                                        "title": "foo",
                                        "count": 3
                                    },
                                    "sort": [
                                        1663321603958
                                    ]
                                },
                                {
                                    "_index": "testidx",
                                    "_id": "9DKbjIMB37c_U4Ro89y7",
                                    "_score": null,
                                    "_source": {
                                        "title": "foo",
                                        "count": 1
                                    },
                                    "sort": [
                                        1663235203958
                                    ]
                                }
                            ]
                        }
                    }
                },
                {
                    "key": "bar",
                    "doc_count": 8,
                    "take-N-tracks": {
                        "hits": {
                            "total": {
                                "value": 8,
                                "relation": "eq"
                            },
                            "max_score": null,
                            "hits": [
                                {
                                    "_index": "testidx",
                                    "_id": "BTKbjIMB37c_U4Ro893A",
                                    "_score": null,
                                    "_source": {
                                        "title": "bar",
                                        "count": 27
                                    },
                                    "sort": [
                                        1663494403958
                                    ]
                                },
                                {
                                    "_index": "testidx",
                                    "_id": "BDKbjIMB37c_U4Ro893A",
                                    "_score": null,
                                    "_source": {
                                        "title": "bar",
                                        "count": 26
                                    },
                                    "sort": [
                                        1663408003958
                                    ]
                                },
                                {
                                    "_index": "testidx",
                                    "_id": "AzKbjIMB37c_U4Ro893A",
                                    "_score": null,
                                    "_source": {
                                        "title": "bar",
                                        "count": 25
                                    },
                                    "sort": [
                                        1663235203958
                                    ]
                                },
                                {
                                    "_index": "testidx",
                                    "_id": "AjKbjIMB37c_U4Ro893A",
                                    "_score": null,
                                    "_source": {
                                        "title": "bar",
                                        "count": 24
                                    },
                                    "sort": [
                                        1663148803958
                                    ]
                                },
                                {
                                    "_index": "testidx",
                                    "_id": "ATKbjIMB37c_U4Ro893A",
                                    "_score": null,
                                    "_source": {
                                        "title": "bar",
                                        "count": 23
                                    },
                                    "sort": [
                                        1663062403958
                                    ]
                                }
                            ]
                        }
                    }
                }
            ]
        }
    }
}

Expected Result:

Under the bucket of foo, the average value should come equal to 5.8 i.e (29/5) Under the bucket of bar, the average value should come equal to 26.6 i.e (133/5)

Use of a metric aggregation with top hits aggregation is required by me. I believe there is a workaround to sort the buckets of top hits aggregation based on this link.

However, is there a method to perform an average aggregation on the response of the top hits?

Update:

If someone could suggest an alternative to using the script provided by @jaspreet, that would be great.

CodePudding user response:

It is not possible to run sub aggregation on top_hits.

You will need to use scripted metric aggregation which will basically iterate through all documents. I have used TreeMap data structure to store timestamp and count combo. By iterating over it we can calculate average.

Query

{
  "size": 0,
  "aggs": {
    "group_by_title": {
      "terms": {
        "field": "title.keyword",
        "size": 10
      },
      "aggs": {
        "top_N_Documents": {
          "scripted_metric": {
            "init_script": "state.count_map=new TreeMap()",
            "map_script": "def date_as_millis = doc['timestamp'].getValue().toInstant().toEpochMilli();state.count_map.put(String.valueOf(date_as_millis), doc['count'].value)",
            "combine_script": "return state",
            "reduce_script": "def sum=0;def count_map_agg=new TreeMap(); for(s in states){count_map_agg.putAll(s.count_map)} def keys = count_map_agg.keySet().toArray(); int size=keys.length;  for(int i=size-1;i>=size-5;i--){sum =count_map_agg.get(keys[i]);} return sum/(float)5;"
          }
        }
      }
    }
  }
}

Results

"aggregations" : {
    "group_by_title" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "foo",
          "doc_count" : 10,
          "top_N_Documents" : {
            "value" : 5.8
          }
        },
        {
          "key" : "bar",
          "doc_count" : 8,
          "top_N_Documents" : {
            "value" : 25.0
          }
        }
      ]
    }
  }

Scripts are slow so if you have performance issues, calculating average at client side might also be a good idea.

  • Related