I have a collection with 1.3 million documents and my query is as follows:
db.UserBehaviorOnApp.aggregate([
{ $match: {"accessTime" : { "$gte" : ISODate("2022-12-06T00:00:00Z"), "$lt" : ISODate("2023-01-05T23:59:59Z")}}},
{ $group: { _id: "$appName", count: { $sum: 1 } } },
{ $sort: {count: -1} },
{ $limit : 5 },
])
I want to retrieve the last month of data and group it by the appName field. The query takes about 5s.
I tried adding a compound index on accessTime and appName (accessTime_1_appName_1), but it doesn't seem to help: the query still takes about 5s.
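The index was created along these lines (a sketch; the exact createIndex call is my assumption):

db.UserBehaviorOnApp.createIndex({ accessTime: 1, appName: 1 })

The explain output is as follows: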
{
"explainVersion" : "1",
"stages" : [
{
"$cursor" : {
"queryPlanner" : {
"namespace" : "user_behavior_log.UserBehaviorOnApp",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"accessTime" : {
"$lt" : ISODate("2023-01-06T07:59:59.000 08:00")
}
},
{
"accessTime" : {
"$gte" : ISODate("2022-12-06T08:00:00.000 08:00")
}
}
]
},
"queryHash" : "BDCC37AF",
"planCacheKey" : "F8798B58",
"maxIndexedOrSolutionsReached" : false,
"maxIndexedAndSolutionsReached" : false,
"maxScansToExplodeReached" : false,
"winningPlan" : {
"stage" : "PROJECTION_COVERED",
"transformBy" : {
"appName" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"accessTime" : 1,
"appName" : 1
},
"indexName" : "accessTime_1_appName_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"accessTime" : [ ],
"appName" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"accessTime" : [ "[new Date(1670284800000), new Date(1672963199000))" ],
"appName" : [ "[MinKey, MaxKey]" ]
}
}
},
"rejectedPlans" : [
{
"stage" : "PROJECTION_SIMPLE",
"transformBy" : {
"appName" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"accessTime" : 1,
"deptIdOfOperator" : 1
},
"indexName" : "accessTime_1_deptIdOfOperator_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"accessTime" : [ ],
"deptIdOfOperator" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"accessTime" : [ "[new Date(1670284800000), new Date(1672963199000))" ],
"deptIdOfOperator" : [ "[MinKey, MaxKey]" ]
}
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 1333199,
"executionTimeMillis" : 4925,
"totalKeysExamined" : 1333199,
"totalDocsExamined" : 0,
"executionStages" : {
"stage" : "PROJECTION_COVERED",
"nReturned" : 1333199,
"executionTimeMillisEstimate" : 158,
"works" : 1333200,
"advanced" : 1333199,
"needTime" : 0,
"needYield" : 0,
"saveState" : 1380,
"restoreState" : 1380,
"isEOF" : 1,
"transformBy" : {
"appName" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 1333199,
"executionTimeMillisEstimate" : 96,
"works" : 1333200,
"advanced" : 1333199,
"needTime" : 0,
"needYield" : 0,
"saveState" : 1380,
"restoreState" : 1380,
"isEOF" : 1,
"keyPattern" : {
"accessTime" : 1,
"appName" : 1
},
"indexName" : "accessTime_1_appName_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"accessTime" : [ ],
"appName" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"accessTime" : [ "[new Date(1670284800000), new Date(1672963199000))" ],
"appName" : [ "[MinKey, MaxKey]" ]
},
"keysExamined" : 1333199,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0
}
}
}
},
"nReturned" : 1333199,
"executionTimeMillisEstimate" : 1630
},
{
"$group" : {
"_id" : "$appName",
"count" : {
"$sum" : {
"$const" : 1
}
}
},
"maxAccumulatorMemoryUsageBytes" : {
"count" : 74767392
},
"totalOutputDataSizeBytes" : 237801844,
"usedDisk" : false,
"nReturned" : 1038436,
"executionTimeMillisEstimate" : 4772
},
{
"$sort" : {
"sortKey" : {
"count" : -1
},
"limit" : 5
},
"totalDataSizeSortedBytesEstimate" : 11515,
"usedDisk" : false,
"nReturned" : 5,
"executionTimeMillisEstimate" : 4924
}
],
"serverInfo" : {
"host" : "02a8a2b6c8dc",
"port" : 27017,
"version" : "5.0.5",
"gitVersion" : "d65fd89df3fc039b5c55933c0f71d647a54510ae"
},
"serverParameters" : {
"internalQueryFacetBufferSizeBytes" : 104857600,
"internalQueryFacetMaxOutputDocSizeBytes" : 104857600,
"internalLookupStageIntermediateDocumentMaxSizeBytes" : 104857600,
"internalDocumentSourceGroupMaxMemoryBytes" : 104857600,
"internalQueryMaxBlockingSortMemoryUsageBytes" : 104857600,
"internalQueryProhibitBlockingMergeOnMongoS" : 0,
"internalQueryMaxAddToSetBytes" : 104857600,
"internalDocumentSourceSetWindowFieldsMaxMemoryBytes" : 104857600
},
"command" : {
"aggregate" : "UserBehaviorOnApp",
"pipeline" : [
{
"$match" : {
"accessTime" : {
"$gte" : ISODate("2022-12-06T08:00:00.000 08:00"),
"$lt" : ISODate("2023-01-06T07:59:59.000 08:00")
}
}
},
{
"$group" : {
"_id" : "$appName",
"count" : {
"$sum" : 1
}
}
},
{
"$sort" : {
"count" : -1
}
},
{
"$limit" : 5
}
],
"cursor" : {
},
"$db" : "user_behavior_log"
},
"ok" : 1,
"$clusterTime" : {
"clusterTime" : Timestamp(1673062241, 1),
"signature" : {
"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
"keyId" : 0
}
},
"operationTime" : Timestamp(1673062241, 1)
}
It seems that $group can't use the index? So, how can I optimize the query speed of this aggregation pipeline?
CodePudding user response:
This ended up being a somewhat involved answer, hopefully you find it useful and I didn't waste the last hour :laughs:
As has already been mentioned, I'm not sure you can do much to improve performance here without changing the approach entirely.
The biggest issue you're having is that you're looking at a lot of data: your query matches 1.3 million records (not sure how big your total collection is). Your $group stage returns more than 1 million records, so you have high cardinality across appName. You then sort those results (which can't use an index) just to get the top 5 rows.
However, when you're dealing with this volume of data, and if this query (or something similar) is a common occurrence (e.g., you frequently want to look at the last n days, months, etc.), I'd suggest a totally different approach utilising the $merge pipeline stage. Note that because of your high cardinality on appName, the performance improvement here won't be massive; at best it will reduce the number of documents read from 1.3 million to ~1 million.
For a given minimum frequency that you expect to want to observe (e.g., 1 hour, 1 day, etc.), you can aggregate across UserBehaviorOnApp and $merge the output into a new collection (note: there are optimisations to be made here, but for brevity it was cleanest to leave it like this):
db.UserBehaviorOnApp.aggregate([
// lastTimeIRanThis is a placeholder: the accessTime high-water mark of the previous run
{ $match: { accessTime: { $gt: lastTimeIRanThis } } },
{ $group: {
_id: {
frequency: "hour",
appName: "$appName",
startTime: { $dateTrunc: { date: "$accessTime", unit: "hour" } }
},
count: { $sum: 1 },
}},
{ $merge: {
into: "UserBehaviorOnAppAggregated",
on: "_id",
whenNotMatched: "insert",
whenMatched: [
{ $set: { count: {$add: ["$count", "$$new.count"] } } }
]
}}
])
You run this query on a semi-regular basis, e.g., every 5 minutes; because it's only ever looking at the last 5 minutes of data, it will be very fast. Care needs to be taken not to look at the same records twice (e.g., use a transaction). It also assumes your accessTime field always increases and is synced to your server time (though there are ways around this assumption).
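A minimal sketch of one way to guard against double counting, assuming you persist the high-water mark in a helper collection (the aggregationState name and the scheduling around this are my assumptions; a transaction or a run lock is still advisable):

// Read the high-water mark left by the previous run (epoch if this is the first run).
const state = db.aggregationState.findOne({ _id: "UserBehaviorOnApp" });
const lastTimeIRanThis = state ? state.lastRun : new Date(0);
const runUpTo = new Date();

db.UserBehaviorOnApp.aggregate([
  // Only the slice of data that arrived since the last run.
  { $match: { accessTime: { $gt: lastTimeIRanThis, $lte: runUpTo } } },
  { $group: {
      _id: {
        frequency: "hour",
        appName: "$appName",
        startTime: { $dateTrunc: { date: "$accessTime", unit: "hour" } }
      },
      count: { $sum: 1 }
  }},
  { $merge: {
      into: "UserBehaviorOnAppAggregated",
      on: "_id",
      whenNotMatched: "insert",
      whenMatched: [ { $set: { count: { $add: ["$count", "$$new.count"] } } } ]
  }}
])

// Persist the new mark only after the merge has succeeded.
db.aggregationState.updateOne(
  { _id: "UserBehaviorOnApp" },
  { $set: { lastRun: runUpTo } },
  { upsert: true }
)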
You would then issue your queries against the UserBehaviorOnAppAggregated collection.
A further optimisation is that you can now aggregate over the aggregated data. For example, if you're looking at 30 days of data, you don't care about 1-hour granularity (though you might if you're looking at the last day), so you can then issue the following aggregation:
db.UserBehaviorOnAppAggregated.aggregate([
{ $match: { "_id.frequency": "hour", "_id.startTime": { $gt: lastTimeIRanThisRoundedToNearestHour } } },
{ $group: {
_id: {
frequency: "day",
appName: "$_id.appName",
startTime: { $dateTrunc: { date: "$_id.startTime", unit: "day" } }
},
count: { $sum: "$count" }, // sum the stored hourly counts, not the number of hourly buckets
}},
{ $merge: {
into: "UserBehaviorOnAppAggregated",
on: "_id",
whenNotMatched: "insert",
whenMatched: [
{ $set: { count: {$add: ["$count", "$$new.count"] } } }
]
}}
])
Note that here you're reading from the same collection you're merging into!
Now, when you want to look at the last 30 days of data, you run the aggregation like this:
db.UserBehaviorOnAppAggregated.aggregate([
{ $match: {
"_id.frequency": "day",
"_id.startTime" : { "$gte" : ISODate("2022-12-06T00:00:00Z"), "$lt" : ISODate("2023-01-05T23:59:59Z")}
}},
{ $group: { _id: "$_id.appName", count: { $sum: 1 } } },
{ $sort: {count: -1} },
{ $limit : 5 },
])
Now, rather than having to read 1.3 million documents, you will read (at most) n * 31, where n is the distinct count of appName. As I mentioned before, because of the high cardinality of appName, this alone won't be a huge improvement. But it does give you options if you make certain assumptions. For example, you can now say "only include an appName if it had more than 1 view per hour". Then, with an index that ends with count, you get a vastly more efficient query (this only works if you have at least 5 "hot" apps that are the ones you really care about).
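A sketch of that idea (the index shape, date bounds, and the 1-view-per-hour threshold are all assumptions; verify the winning plan with explain() on your own data):

// Hypothetical index; the trailing count field lets the threshold filter narrow the scan.
db.UserBehaviorOnAppAggregated.createIndex(
  { "_id.frequency": 1, "_id.startTime": 1, count: 1 }
)

// Only consider hours in which an app had more than 1 view.
db.UserBehaviorOnAppAggregated.aggregate([
  { $match: {
      "_id.frequency": "hour",
      "_id.startTime": { $gte: ISODate("2022-12-06T00:00:00Z"), $lt: ISODate("2023-01-06T00:00:00Z") },
      count: { $gt: 1 }
  }},
  { $group: { _id: "$_id.appName", count: { $sum: "$count" } } },
  { $sort: { count: -1 } },
  { $limit: 5 }
])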
There are of course tradeoffs to this approach; it works substantially better when you have lower-cardinality data. For example, we use this approach to track access and performance metrics across a limited number (in the thousands) of servers, where we have millions of accesses per server per day. We then discard the raw data after a short period, which massively reduces the data we retain. It also allows us to have different retention periods for different time windows, e.g., we keep per-minute metrics for 1 day, hourly metrics for 1 week, and 3-hour metrics (used to view more than 1 week of data) for 1 year.
If the cardinality of your data is high, you'll see the opposite: if you see 1 view per day per appName, you'll end up storing 3 records per view (once raw, once for the hourly aggregation, and once for the daily aggregation).
CodePudding user response:
There are a few ways to optimize performance further:
- If the appName field has a small number of distinct values, you may want to consider using the $sortByCount stage instead of the $group and $sort stages. Note that $sortByCount is essentially shorthand for the same $group + $sort combination, so don't expect a large algorithmic win, but it does keep the pipeline simpler:
db.UserBehaviorOnApp.aggregate([
  { $match: { accessTime: { $gte: ISODate("2022-12-06T00:00:00Z"), $lt: ISODate("2023-01-05T23:59:59Z") } } },
  { $sortByCount: "$appName" },
  { $limit: 5 }
])
- If you are running the query on a sharded cluster, make sure that the accessTime field is part of the shard key. This will allow the query to be routed to the appropriate shards and potentially improve performance.
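For instance, something along these lines when first sharding the collection (a sketch; the compound key is my assumption, chosen because a key on accessTime alone increases monotonically and can funnel all inserts to a single shard):

// Run against mongos; assumes sharding is already enabled for the user_behavior_log database.
sh.shardCollection(
  "user_behavior_log.UserBehaviorOnApp",
  { accessTime: 1, appName: 1 }
)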