Home > Software design >  'join' two MongoDB collections on a key, without creating a nested structure
'join' two MongoDB collections on a key, without creating a nested structure

Time:05-19

I'd like to 'join' two MongoDB collections on a key, without creating a nested structure.

Code:

# Init client
client_mongo = pymongo.MongoClient(
    host="mongo", port=27017, username="user", password="pass"
)

# Use db gen
db = client_mongo.gen

# Create collections
db.create_collection("wikipedia")
db.create_collection("twitter")

# Add sample record to collection twitter
db.twitter.insert_one({'_id': "1527275059001274368",
 'name': 'Nike',
 'username_twitter': 'nikestore',
 'user_id_twitter': 17351972,
 'text_twitter': '@sillspb Hey, Paul. What error message are you seeing on your end?'})

# Add sample record to collection wikipedia
db.wikipedia.insert_one({'_id': '6286417074ef92666893148a',
 'name': 'Nike',
 'page_wikipedia': 'Nike,_Inc.',
 'text_wikipedia': 'Nike, Inc. ( or ) is an American multinational corporation that is engaged in the design, development, manufacturing, and worldwide marketing and sales of footwear, apparel, equipment, accessories, and services.'})

I'd like to (do the equivalent of) join on key "name" and create a new collection named "twitter_wikipedia", without a nested structure.

Current solution with nested structure:

db.twitter.aggregate(
    [
        {
            "$lookup": {
                "from": "wikipedia",  # other table name
                "localField": "name",  # key field in collection 2
                "foreignField": "name",  # key field in collection 1
                "as": "linked_collections",  # alias for resulting table
            }
            
        },
        {
        '$out': 'twitter_wikipedia'
    }
    ])

Current output

db.twitter_wikipedia.find_one({})
{'_id': '1527275059001274368',
 'name': 'Nike',
 'username_twitter': 'nikestore',
 'user_id_twitter': 17351972,
 'text_twitter': '@sillspb Hey, Paul. What error message are you seeing on your end?',
 'linked_collections': [{'_id': '6286417074ef92666893148a',
   'name': 'Nike',
   'page_wikipedia': 'Nike,_Inc.',
   'text_wikipedia': 'Nike, Inc. ( or ) is an American multinational corporation that is engaged in the design, development, manufacturing, and worldwide marketing and sales of footwear, apparel, equipment, accessories, and services.'}]}

# Desired output
{'name': 'Nike',
 'username_twitter': 'nikestore',
 'user_id_twitter': 17351972,
 'text_twitter': '@sillspb Hey, Paul. What error message are you seeing on your end?',
 'page_wikipedia': 'Nike,_Inc.',
 'text_wikipedia': 'Nike, Inc. ( or ) is an American multinational corporation that is engaged in the design, development, manufacturing, and worldwide marketing and sales of footwear, apparel, equipment, accessories, and services.'}

CodePudding user response:

Just perform simple $unwind and $project after you do the $lookup

db.twitter.aggregate([
  {
    "$lookup": {
      "from": "wikipedia",
      "localField": "name",
      "foreignField": "name",
      "as": "linked_collections"
    }
  },
  {
    "$unwind": "$linked_collections"
  },
  {
    "$project": {
      "name": 1,
      "username_twitter": 1,
      "user_id_twitter": 1,
      "text_twitter": 1,
      "page_wikipedia": "$linked_collections.page_wikipedia",
      "text_wikipedia": "$linked_collections.text_wikipedia"
    }
  },
  {
    $out: "twitter_wikipedia"
  }
])

Here is the Mongo Playground for your reference.

  • Related