Home > Blockchain >  Mongodb aggregation to find outliers
Mongodb aggregation to find outliers

Time:10-07

In my mongodb collection documents are stored in the following format:

{ "_id" : ObjectId("62XXXXXX"), "res" : 12, ... }
{ "_id" : ObjectId("63XXXXXX"), "res" : 23, ... }
{ "_id" : ObjectId("64XXXXXX"), "res" : 78, ... }
...

I need to extract id's for the document for which the value of "res" is outlier (i.e. value < Q1 - 1.5 * IQR or value > Q3 1.5 * IQR (Q1, Q3 are percentiles)). I have done this using pandas functionality by retrieving all documents from the collection, which may become slow if the number of documents in collection become too big.

Is there a way to do this using mongodb aggregation pipeline (or just calculating percentiles)?

CodePudding user response:

If I understand how you want to retrieve outliers, here's one way you might be able to do it.

db.collection.aggregate([
  { // partition res into quartiles
    "$bucketAuto": {
      "groupBy": "$res",
      "buckets": 4
    }
  },
  { // get the max of each quartile
    "$group": {
      "_id": "$_id.max"
    }
  },
  { // sort the quartile maxs
    "$sort": {
      "_id": 1
    }
  },
  { // put sorted quartile maxs into array
    "$group": {
      "_id": null,
      "maxs": {"$push": "$_id"}
    }
  },
  { // assign Q1 and Q3
    "$project": {
      "_id": 0,
      "q1": {"$arrayElemAt": ["$maxs", 0]},
      "q3": {"$arrayElemAt": ["$maxs", 2]}
    }
  },
  { // set IQR
    "$set": {
      "iqr": {
        "$subtract": ["$q3", "$q1"]
      }
    }
  },
  { // assign upper/lower outlier thresholds
    "$project": {
      "outlierThresholdLower": {
        "$subtract": [
          "$q1",
          {"$multiply": ["$iqr", 1.5]}
        ]
      },
      "outlierThresholdUpper": {
        "$add": [
          "$q3",
          {"$multiply": ["$iqr", 1.5]}
        ]
      }
    }
  },
  { // get outlier _id's
    "$lookup": {
      "from": "collection",
      "as": "outliers",
      "let": {
        "oTL": "$outlierThresholdLower",
        "oTU": "$outlierThresholdUpper"
      },
      "pipeline": [
        {
          "$match": {
            "$expr": {
              "$or": [
                {"$lt": ["$res", "$$oTL"]},
                {"$gt": ["$res", "$$oTU"]}
              ]
            }
          }
        },
        {
          "$project": {
            "_id": 1
          }
        }
      ]
    }
  }
])

Try it on mongoplayground.net.

CodePudding user response:

One more option based on @rickhg12hs's answer, is to use $setWindowFields:

db.collection.aggregate([
  {$setWindowFields: {
      sortBy: {res: 1},
      output: {
        totalCount: {$count: {}},
        index: {$sum: 1, window: {documents: ["unbounded", "current"]}}
      }
    }
  },
  {$match: {
      $expr: {$lte: [
          {$abs: {$subtract: [
                {$mod: [
                    {$multiply: [
                        {$add: ["$index", {$round: {$divide: ["$totalCount", 4]}}]}, 2]},
                    "$totalCount"
                  ]}, 0]}
          }, 1]}
  }},
  {$group: {_id: null, res: {$push: "$res"}}},
  {$project: {_id: 0, q1: {$first: "$res"}, q3: {$last: "$res"},
      iqr: {"$subtract": [{$last: "$res"}, {$first: "$res"}]}
  }},
  {$project: {
      outlierThresholdLower: {$subtract: ["$q1", {$multiply: ["$iqr", 1.5]}]},
      outlierThresholdUpper: {$add: ["$q3", {$multiply: ["$iqr", 1.5]}]}
    }
  },
  {$lookup: {
      from: "collection",
      as: "outliers",
      let: {oTL: "$outlierThresholdLower", oTU: "$outlierThresholdUpper"},
      pipeline: [
        {$match: {$expr: {$or: [{$lt: ["$res", "$$oTL"]}, {$gt: ["$res", "$$oTU"]}]}}},
        {$project: {_id: 1}}
      ]
    }
  }
])

See how it works on the playground example

  • Related