I want to convert multiple level list of nested dictionaries into a single list of dictionary
input:
list_ = [
{'Name': 'Paras Jain',
'Student': [{'Exam': 90,
'Grade': 'a',
'class': [{'age': 10, 'subject': 'hindi'},
{'age': 11, 'subject': 'maths'}]},
{'Exam': 99,
'Grade': 'b',
'class': [{'age': 14, 'subject': 'evs'},
{'age': 15, 'subject': 'science'}]},
{'Exam': 97,
'Grade': 'c',
'class': [{'age': 10, 'subject': 'history'}]}]},
{'Name': 'Chunky Pandey',
'Student': [{'Exam': 89,
'Grade': 'a',
'class': [{'age': 9, 'subject': 'no'}]},
{'Exam': 80, 'Grade': 'b', 'class': []}]},
{'Name': 'abc', 'Student':[]}
]
Required output:
[{'Exam': 90, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 10, 'subject': 'hindi'},
{'Exam': 90, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 11, 'subject': 'maths'},
{'Exam': 90, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 14, 'subject': 'evs'},
{'Exam': 90, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 15, 'subject': 'science'},
{'Exam': 97, 'Grade': 'c', 'Name': 'Paras Jain', 'age': 10, 'subject': 'history'},
{'Exam': 89, 'Grade': 'a', 'Name': 'Chunky Pandey', 'age': 9, 'subject': 'no'},
{'Exam': 89, 'Grade': 'a', 'Name': 'Chunky Pandey', 'age': 'NA', 'subject': 'NA'},
{'Exam': 'NA', 'Grade': 'NA', 'Name': 'abc', 'age': 'NA', 'subject': 'NA'}]
CodePudding user response:
First, let's fix your empty class list, so pd.json_normalize
doesn't ignore it:
for i, x in enumerate(list_):
for j, y in enumerate(x['Student']):
if not y['class']:
list_[i]['Student'][j]['class'] = [{}]
Then we can use pd.json_normalize
:
# Mark the deepest level (Student.class),
# and all the meta levels (Student.Exam, Student.Grade, and Name):
df = pd.json_normalize(list_, ['Student', 'class'], [['Student', 'Exam'], ['Student', 'Grade'], 'Name'])
# Fix up the column names, we don't need the `Student.` prefix here.
df.columns = df.columns.str.replace('Student.', '', regex=False)
# If you explicitly want 'NA' as the nan value:
# df = df.fillna('NA')
# Convert to dictionary, `records` is what your format is known as.
out = df.to_dict('records')
print(out) # pprint(out, width=150)
Output:
[{'Exam': 90, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 10.0, 'subject': 'hindi'},
{'Exam': 90, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 11.0, 'subject': 'maths'},
{'Exam': 99, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 14.0, 'subject': 'evs'},
{'Exam': 99, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 15.0, 'subject': 'science'},
{'Exam': 97, 'Grade': 'c', 'Name': 'Paras Jain', 'age': 10.0, 'subject': 'history'},
{'Exam': 89, 'Grade': 'a', 'Name': 'Chunky Pandey', 'age': 9.0, 'subject': 'no'},
{'Exam': 80, 'Grade': 'b', 'Name': 'Chunky Pandey', 'age': nan, 'subject': nan}]
CodePudding user response:
Let's try
for d in list_:
if not d.get('Student', []):
if 'Student' in d:
d['Student'].append({})
else:
d['Student'] = [{}]
df = pd.json_normalize(list_, record_path='Student', meta='Name').explode('class', ignore_index=True)
out = df.join(df.pop('class').apply(pd.Series)).drop(columns=0).to_dict(orient='records')
print(df)
Exam Grade class Name
0 90.0 a {'age': 10, 'subject': 'hindi'} Paras Jain
1 90.0 a {'age': 11, 'subject': 'maths'} Paras Jain
2 99.0 b {'age': 14, 'subject': 'evs'} Paras Jain
3 99.0 b {'age': 15, 'subject': 'science'} Paras Jain
4 97.0 c {'age': 10, 'subject': 'history'} Paras Jain
5 89.0 a {'age': 9, 'subject': 'no'} Chunky Pandey
6 80.0 b NaN Chunky Pandey
7 NaN NaN NaN abc
pprint(out, width=150)
[{'Exam': 90.0, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 10.0, 'subject': 'hindi'},
{'Exam': 90.0, 'Grade': 'a', 'Name': 'Paras Jain', 'age': 11.0, 'subject': 'maths'},
{'Exam': 99.0, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 14.0, 'subject': 'evs'},
{'Exam': 99.0, 'Grade': 'b', 'Name': 'Paras Jain', 'age': 15.0, 'subject': 'science'},
{'Exam': 97.0, 'Grade': 'c', 'Name': 'Paras Jain', 'age': 10.0, 'subject': 'history'},
{'Exam': 89.0, 'Grade': 'a', 'Name': 'Chunky Pandey', 'age': 9.0, 'subject': 'no'},
{'Exam': 80.0, 'Grade': 'b', 'Name': 'Chunky Pandey', 'age': nan, 'subject': nan},
{'Exam': nan, 'Grade': nan, 'Name': 'abc', 'age': nan, 'subject': nan}]
CodePudding user response:
I tried below two ways but looking for a more optimized way: How I tested for 2700000 records. Just multiple [{}] * 900000 1st way
# tested with 2700000 records
import time
start_time = time.time()
rows = []
for data in list_:
if data['Student']:
for row in data['Student']:
if row["class"]:
for in_row in row['class']:
in_row['Exam'] = row['Exam']
in_row['Grade'] = row['Grade']
in_row['age'] = in_row['age']
in_row['subject'] = in_row['subject']
in_row['Name'] = data['Name']
rows.append(in_row)
else:
rows.append({
'Exam': row['Exam'],
'Grade': row['Grade'],
'age': '',
'subject': '',
'Name': data['Name']
})
else:
rows.append({
'Exam': '',
'Grade': '',
'age': '',
'subject': '',
'Name': data['Name']
})
end_time = time.time()
print(end_time-start_time)
# 5.117190837860107
2nd way
# tested with 2700000 records
import time
start_time = time.time()
rows = []
for data in list_:
if data['Student']:
for row in data['Student']:
row['Exam'] = row['Exam']
row['Grade'] = row['Grade']
row['class'] = row['class']
row['Name'] = data['Name']
rows.append(row)
else:
rows.append({
'Exam': '',
'Grade': '',
'class': [],
'Name': data['Name']
})
res = []
for data in rows:
if data["class"]:
for in_row in data['class']:
in_row['Exam'] = data['Exam']
in_row['Grade'] = data['Grade']
in_row['age'] = in_row['age']
in_row['subject'] = in_row['subject']
in_row['Name'] = data['Name']
res.append(in_row)
else:
res.append({
'Exam': data['Exam'],
'Grade': data['Grade'],
'age': '',
'subject': '',
'Name': data['Name']
})
end_time = time.time()
print(end_time-start_time)
# 15.956519842147827