We are using Elasticsearch 7.9
Our index, called method_info_tree, contains two-level nested fields:
- The top level described a java method,
- the nested level below it describes a thread that ran the method, and
- the third nested level within a thread describes the states of that thread over time.
Below is the mapping of the index in Elasticsearch:
{
"mappings": {
"properties": {
"method_id" : {
"type" : "long"
},
"threads": {
"type": "nested",
"properties": {
"thread_id": {
"type": "long"
},
"states": {
"type": "nested",
"properties": {
"collect_time": {
"type": "date"
},
"state": {
"type": "keyword"
},
"elapsed_time": {
"type" : "integer"
}
}
}
}
}
}
}
}
Here is a sample document in the index:
{
"took" : 13,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5198,
"relation" : "eq"
},
"max_score" : 0.0,
"hits" : [
{
"_index" : "method_info_tree-000001",
"_type" : "_doc",
"_id" : "WiHCCYQBhPdvF01n3kp1",
"_score" : 0.0,
"_routing" : "86163-d5c064d0-55a3-44b9-88fb-c44b7233cfa4",
"_source" : {
"timestamp" : 1666610993800,
"method_id" : 140280075031760,
"threads" : [
{
"thread_id" : 1,
"states_hit" : [
{
"state" : "RUNNABLE",
"collect_time" : 1666610994750,
"elapsed_time" : 50
},
{
"state" : "IO",
"collect_time" : 1666610994800,
"elapsed_time" : 50
},
{
"state" : "IO",
"collect_time" : 1666610994850,
"elapsed_time" : 50
},
{
"state" : "IO",
"collect_time" : 1666610994900,
"elapsed_time" : 50
},
{
"state" : "IO",
"collect_time" : 1666610994950,
"elapsed_time" : 50
},
{
"state" : "IO",
"collect_time" : 1666610995000,
"elapsed_time" : 50
},
{
"state" : "IO",
"collect_time" : 1666610995050,
"elapsed_time" : 50
},
{
"state" : "IO",
"collect_time" : 1666610995100,
"elapsed_time" : 50
},
{
"state" : "IO",
"collect_time" : 1666610995150,
"elapsed_time" : 50
}
]
}
]
}
}
]
}
}
Note that for each method_id we have many documents for various thread_ids.
I would like, for each method_id, to calculate the sum of elapsed_time field per state (for all threads), something like:
method_id ->
[
{
"state" : "IO",
"elapsed_time" : 566622.0
},
{
"state" : "RUNNABLE",
"elapsed_time" : 566572.0
},
{
"state" : "BLOCKED",
"elapsed_time" : 50.0
}
]
Below is my Elasticsearch query:
GET method_info_tree/_search
{
"from": 0,
"size": 0,
"track_total_hits": true,
"query": {
"bool": {
"filter": [
{
"term": {
"session_id": "86163-d5c064d0-55a3-44b9-88fb-c44b7233cfa4"
}
},
{
"nested": {
"path": "threads.states_hit",
"query": {
"bool": {
"filter": [
{
"range": {
"threads.states_hit.collect_time": {
"gte": 0,
"lte": 2000000000000
}
}
}
]
}
}
}
}
]
}
},
"aggs": {
"top_methods_agg": {
"terms": {
"field": "method_id",
"size": 20
},
"aggs": {
"elapsed_time_agg": {
"nested": {
"path": "threads.states_hit"
},
"aggs": {
"states_range": {
"range": {
"field": "threads.states_hit.collect_time",
"ranges": [
{
"from": 0,
"to": 2000000000001
}
]
},
"aggs": {
"elapsed_time_per_state_agg": {
"terms": {
"field": "threads.states_hit.state",
"size": 10
},
"aggs": {
"elapsed_time": {
"sum": {
"field": "threads.states_hit.elapsed_time"
}
}
}
},
"total_self_elapsed_time": {
"sum": {
"field": "threads.states_hit.elapsed_time"
}
},
"wasted_elapsed_time": {
"filter": {
"terms": {
"threads.states_hit.state": [
"BLOCKED",
"IO"
]
}
},
"aggs": {
"total_wasted": {
"sum": {
"field": "threads.states_hit.elapsed_time"
}
}
}
}
}
}
}
}
}
}
}
}
A sample result would be:
{
"took" : 218,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 5727,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"top_methods_agg" : {
"doc_count_error_upper_bound" : 1,
"sum_other_doc_count" : 73,
"buckets" : [
{
"key" : 140280074341584,
"doc_count" : 728,
"elapsed_time_agg" : {
"doc_count" : 21838,
"states_range" : {
"buckets" : [
{
"key" : "1970-01-01T00:00:00.000Z-2033-05-18T03:33:20.001Z",
"from" : 0.0,
"from_as_string" : "1970-01-01T00:00:00.000Z",
"to" : 2.000000000001E12,
"to_as_string" : "2033-05-18T03:33:20.001Z",
"doc_count" : 21838,
"total_self_elapsed_time" : {
"value" : 1133244.0
},
"wasted_elapsed_time" : {
"doc_count" : 1,
"total_wasted" : {
"value" : 50.0
}
},
"elapsed_time_per_state_agg" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "IO",
"doc_count" : 10919,
"elapsed_time" : {
"value" : 566622.0
}
},
{
"key" : "RUNNABLE",
"doc_count" : 10918,
"elapsed_time" : {
"value" : 566572.0
}
},
{
"key" : "BLOCKED",
"doc_count" : 1,
"elapsed_time" : {
"value" : 50.0
}
}
]
}
}
]
}
}
},
{
"key" : 140282650318928,
"doc_count" : 3,
"elapsed_time_agg" : {
"doc_count" : 3,
"states_range" : {
"buckets" : [
{
"key" : "1970-01-01T00:00:00.000Z-2033-05-18T03:33:20.001Z",
"from" : 0.0,
"from_as_string" : "1970-01-01T00:00:00.000Z",
"to" : 2.000000000001E12,
"to_as_string" : "2033-05-18T03:33:20.001Z",
"doc_count" : 3,
"total_self_elapsed_time" : {
"value" : 150.0
},
"wasted_elapsed_time" : {
"doc_count" : 0,
"total_wasted" : {
"value" : 0.0
}
},
"elapsed_time_per_state_agg" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "RUNNABLE",
"doc_count" : 3,
"elapsed_time" : {
"value" : 150.0
}
}
]
}
}
]
}
}
}
]
}
}
}
Note that I deleted some of the result buckets for better coherence of the example.
My problem: I need to sort the results by 'total_self_elapsed_time' and return only the top 5 results. Since the result is nested, I cannot access the calculated field 'total_self_elapsed_time'. Can you please direct me to how I can add to my query sorting by this aggregated field?
CodePudding user response:
I have replaced states_range range aggregation with filter aggregation. Range aggregation will generate buckets for each ranges specified. So you cannot sort terms based on sub multi buckets.
To sort I have used "order" in terms aggregation.
Query
"aggs": {
"top_methods_agg": {
"terms": {
"field": "method_id",
"size": 20,
"order": {
"elapsed_time_agg>states_range>total_self_elapsed_time": "asc"
}
},
"aggs": {
"elapsed_time_agg": {
"nested": {
"path": "threads.states_hit"
},
"aggs": {
"states_range": {
"filter": {
"range": {
"threads.states_hit.collect_time": {
"gte": 0,
"lte": 2000000000000
}
}
},
"aggs": {
"elapsed_time_per_state_agg": {
"terms": {
"field": "threads.states_hit.state",
"size": 10
},
"aggs": {
"elapsed_time": {
"sum": {
"field": "threads.states_hit.elapsed_time"
}
}
}
},
"total_self_elapsed_time": {
"sum": {
"field": "threads.states_hit.elapsed_time"
}
},
"wasted_elapsed_time": {
"filter": {
"terms": {
"threads.states_hit.state": [
"BLOCKED",
"IO"
]
}
},
"aggs": {
"total_wasted": {
"sum": {
"field": "threads.states_hit.elapsed_time"
}
}
}
}
}
}
}
}
}
}
}
Try above , see if it works out for you.