Home > OS >  Convert multiple level list of nested dictionaries into single list of dictionaries
Convert multiple level list of nested dictionaries into single list of dictionaries

Time:09-10

I want to convert multiple level list of nested dictionaries into a single list of dictionary

input:

list_ = [
 {'Name': 'Paras Jain',
  'Student': [{'Exam': 90,
               'Grade': 'a',
               'class': [{'age': 10, 'subject': 'hindi'},
                         {'age': 11, 'subject': 'maths'}]},
              {'Exam': 99,
               'Grade': 'b',
               'class': [{'age': 14, 'subject': 'evs'},
                         {'age': 15, 'subject': 'science'}]},
              {'Exam': 97,
               'Grade': 'c',
               'class': [{'age': 10, 'subject': 'history'}]}]},
 {'Name': 'Chunky Pandey',
  'Student': [{'Exam': 89,
               'Grade': 'a',
               'class': [{'age': 9, 'subject': 'no'}]},
              {'Exam': 80, 'Grade': 'b', 'class': []}]},
 {'Name': 'abc', 'Student':[]}
]

Required output:

[{'Exam': 90, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 10, 'subject': 'hindi'},
 {'Exam': 90, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 11, 'subject': 'maths'},
 {'Exam': 90, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 14, 'subject': 'evs'},
 {'Exam': 90, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 15, 'subject': 'science'},
 {'Exam': 97, 'Grade': 'c', 'Name': 'Paras Jain', 'age': 10, 'subject': 'history'},
 {'Exam': 89, 'Grade': 'a', 'Name': 'Chunky Pandey', 'age': 9, 'subject': 'no'},
 {'Exam': 89, 'Grade': 'a', 'Name': 'Chunky Pandey', 'age': 'NA', 'subject': 'NA'},
 {'Exam': 'NA', 'Grade': 'NA', 'Name': 'abc', 'age': 'NA', 'subject': 'NA'}]

CodePudding user response:

First, let's fix your empty class list, so pd.json_normalize doesn't ignore it:

for i, x in enumerate(list_):
    for j, y in enumerate(x['Student']):
        if not y['class']:
            list_[i]['Student'][j]['class'] = [{}]

Then we can use pd.json_normalize:

# Mark the deepest level (Student.class), 
# and all the meta levels (Student.Exam, Student.Grade, and Name):
df = pd.json_normalize(list_, ['Student', 'class'], [['Student', 'Exam'], ['Student', 'Grade'], 'Name'])

# Fix up the column names, we don't need the `Student.` prefix here.
df.columns = df.columns.str.replace('Student.', '', regex=False)

# If you explicitly want 'NA' as the nan value:
# df = df.fillna('NA')

# Convert to dictionary, `records` is what your format is known as.
out = df.to_dict('records')
print(out) # pprint(out, width=150)

Output:

[{'Exam': 90, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 10.0, 'subject': 'hindi'},
 {'Exam': 90, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 11.0, 'subject': 'maths'},
 {'Exam': 99, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 14.0, 'subject': 'evs'},
 {'Exam': 99, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 15.0, 'subject': 'science'},
 {'Exam': 97, 'Grade': 'c', 'Name': 'Paras Jain', 'age': 10.0, 'subject': 'history'},
 {'Exam': 89, 'Grade': 'a', 'Name': 'Chunky Pandey', 'age': 9.0, 'subject': 'no'},
 {'Exam': 80, 'Grade': 'b', 'Name': 'Chunky Pandey', 'age': nan, 'subject': nan}]

CodePudding user response:

Let's try

for d in list_:
    if not d.get('Student', []):
        if 'Student' in d:
            d['Student'].append({})
        else:
            d['Student'] = [{}]


df = pd.json_normalize(list_, record_path='Student', meta='Name').explode('class', ignore_index=True)
out = df.join(df.pop('class').apply(pd.Series)).drop(columns=0).to_dict(orient='records')
print(df)

   Exam Grade                              class           Name
0  90.0     a    {'age': 10, 'subject': 'hindi'}     Paras Jain
1  90.0     a    {'age': 11, 'subject': 'maths'}     Paras Jain
2  99.0     b      {'age': 14, 'subject': 'evs'}     Paras Jain
3  99.0     b  {'age': 15, 'subject': 'science'}     Paras Jain
4  97.0     c  {'age': 10, 'subject': 'history'}     Paras Jain
5  89.0     a        {'age': 9, 'subject': 'no'}  Chunky Pandey
6  80.0     b                                NaN  Chunky Pandey
7   NaN   NaN                                NaN            abc

pprint(out, width=150)

[{'Exam': 90.0, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 10.0, 'subject': 'hindi'},
 {'Exam': 90.0, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 11.0, 'subject': 'maths'},
 {'Exam': 99.0, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 14.0, 'subject': 'evs'},
 {'Exam': 99.0, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 15.0, 'subject': 'science'},
 {'Exam': 97.0, 'Grade': 'c', 'Name': 'Paras Jain', 'age': 10.0, 'subject': 'history'},
 {'Exam': 89.0, 'Grade': 'a', 'Name': 'Chunky Pandey', 'age': 9.0, 'subject': 'no'},
 {'Exam': 80.0, 'Grade': 'b', 'Name': 'Chunky Pandey', 'age': nan, 'subject': nan},
 {'Exam': nan, 'Grade': nan, 'Name': 'abc', 'age': nan, 'subject': nan}]

CodePudding user response:

I tried below two ways but looking for a more optimized way: How I tested for 2700000 records. Just multiple [{}] * 900000 1st way

# tested with 2700000 records
import time
start_time = time.time()
rows = []
  
for data in list_:
    if data['Student']:
        for row in data['Student']:
            if row["class"]: 
                for in_row in row['class']:
                    in_row['Exam'] = row['Exam']
                    in_row['Grade'] = row['Grade']
                    in_row['age'] = in_row['age']
                    in_row['subject'] = in_row['subject']
                    in_row['Name'] = data['Name']
                    rows.append(in_row)
            else:
                rows.append({
                    'Exam': row['Exam'],
                    'Grade': row['Grade'],
                    'age': '',
                    'subject': '',
                    'Name': data['Name']
                })
    else:
        rows.append({
            'Exam': '',
            'Grade': '',
            'age': '',
            'subject': '',
            'Name': data['Name']
        })


end_time = time.time()
print(end_time-start_time)
# 5.117190837860107

2nd way

# tested with 2700000 records
import time
start_time = time.time()
rows = []
for data in list_:
    if data['Student']:
        for row in data['Student']:
            row['Exam'] = row['Exam']
            row['Grade'] = row['Grade']
            row['class'] = row['class']
            row['Name'] = data['Name']
            rows.append(row)
    else:
        rows.append({
            'Exam': '',
            'Grade': '',
            'class': [],
            'Name': data['Name']
        })



res = []

for data in rows:
    if data["class"]:
        for in_row in data['class']:
            in_row['Exam'] = data['Exam']
            in_row['Grade'] = data['Grade']
            in_row['age'] = in_row['age']
            in_row['subject'] = in_row['subject']
            in_row['Name'] = data['Name']
            res.append(in_row)
    else:
        res.append({
            'Exam': data['Exam'],
            'Grade': data['Grade'],
            'age': '',
            'subject': '',
            'Name': data['Name']
        })


end_time = time.time()
print(end_time-start_time)
# 15.956519842147827
  • Related