Home > database >  Extract course ids for instructors from json input
Extract course ids for instructors from json input

Time:06-06

Let's say I've got a nested JSON file as below. If I want to print the courses that each instructor teaches, how do I do that?

{
   "info":{
      "source_objects":[
         {
            "type":"sub-category",
            "id":277438897
         }
      ],
      "item_type":"course",
      "items":[
         {
            "_class":"course",
            "id":156173119,
            "is_paid":null,
            "trainer":[
               {
                  "id":257585701,
                  "url":"/user/tania_guerra/"
               }
            ]
         },
         {
            "_class":"course",
            "id":12456,
            "is_paid":null,
            "trainer":[
               {
                  "id":257585701,
                  "url":"/user/tania_guerra/"
               }
            ]
         }
         *************and more data on the same format****************
      ]
   }
}

I'm not sure if there's a simple trick that I'm missing. So far, I've tried the following, which gets the course id and the trainer id for each entry. But how do I then collect all the courses that each trainer teaches?

import json

# alljson is described by the asker as a directory where multiple json files
# exist. NOTE: open() needs the path of a single JSON file, so each file in
# that directory has to be opened separately.
with open(alljson, 'r') as stream:  # do not name the handle `json`: it would hide the module
    read_json = json.load(stream)

for item in read_json['info']['items']:
    cid = item['id']  # gets the course id
    for trainer in item['trainer']:
        trainer_id = trainer['id']  # gets the trainer id

        # then how do I get course id added to trainer id. for example
        # 12456---123456***123457***123454***12454
        # trainer id--- all the courses that this instructor teaches, adding ***

CodePudding user response:

Assuming each trainer has a unique id, you can create a dict of lists, where the keys are trainer ids and the values are lists of course ids:

import json
import os

# Directory tree that is searched recursively for *.json files.
rootdir = 'tmp/test1'

# Maps (trainer id, trainer url) -> list of course ids, stored as strings.
trainers = {}

for dirpath, _subdirs, names in os.walk(rootdir):
    for name in names:
        _base, extension = os.path.splitext(name)
        if extension == '.json':
            with open(os.path.join(dirpath, name)) as source:
                payload = json.load(source)

            for course in payload['info']['items']:
                course_id = str(course['id'])
                for person in course['trainer']:
                    pair = (person['id'], person['url'])
                    # Start an empty list the first time a trainer is seen.
                    trainers.setdefault(pair, []).append(course_id)

output = 'trainers.txt'

# One line per trainer: id, url, then every course id joined by ';;;'.
with open(output, 'w') as sink:
    for (trainer_id, trainer_url), course_ids in sorted(trainers.items()):
        joined = ';;;'.join(course_ids)
        sink.write('%s---%s---%s\n' % (trainer_id, trainer_url, joined))
        

Result:

257585701---/user/tania_guerra/---12456;;;7992450;;;7812756;;;156173119;;;562456
918585703---/user/tania_guerra/---7867833;;;14473169;;;156173119

test.json:

{
  "info": {
    "source_objects": [
      {
        "type": "sub-category",
        "id": 277438897
      }
    ],
    "item_type": "course",
    "items": [
      {
        "_class": "course",
        "id": 156173119,
        "is_paid": null,
        "trainer": [
          {
            "id": 257585701,
            "url": "/user/tania_guerra/"
          }
        ]
      },
      {
        "_class": "course",
        "id": 12456,
        "is_paid": null,
        "trainer": [
          {
            "id": 257585701,
            "url": "/user/tania_guerra/"
          }
        ]
      }
    ]
  }
}

CodePudding user response:

I think it's easiest to use a dict or, better, a defaultdict(list) that maps each trainer id to the list of course ids for that trainer

something like


import json
from collections import defaultdict

# Parse the file before indexing into it: an open file object cannot be
# subscripted, and binding it to the name `json` would hide the json module.
# `alljson` is the input path used in the question; it is not defined here.
with open(alljson, "r") as stream:
    items = json.load(stream)["info"]["items"]

# Maps each trainer id to the ids of the courses that list that trainer.
trainer_course_mapping = defaultdict(list)

for item in items:
    trainers = item["trainer"]
    for trainer in trainers:
        trainer_course_mapping[trainer["id"]].append(item["id"])

  • Related