I have a nested JSON looking like below:
Input JSON:
data= {
"name": "root",
"type": {"sample":"text"},
"teacher": [
{
"name": "properties",
"type": "feature",
"father": [
{
"name": "print",
"type": "feature",
"children": [
{
"name": "graphic print",
"type": "feature",
"inherits": "true"
},
{
"name": "striped print",
"type": "feature",
"inherits": "true",
"sister": [
{
"name": "pinstriped",
"type": "feature",
"inherits": "true"
},
{
"name": "light stripe",
"type": "feature",
"inherits": "true"
},
{
"name": "wide stripe",
"type": "feature",
"inherits": "true"
}
]
}
]
}
]
},
{
"name": "colours",
"type": "colour",
"father": [
{
"name": "main colours",
"type": "colour",
"reacher": [
{
"name": "black",
"type": "colour",
"sister": [
{
"name": "light black",
"type": "colour",
"inherits": "true"
},
{
"name": "blue black",
"type": "colour",
"inherits": "true"
}
]
},
{
"name": "red",
"type": "colour",
"teacher": [
{
"name": "bright red",
"type": "colour",
"inherits": "true"
},
{
"name": "light red",
"type": "colour"
}
]
}
]
}
]
},
{
"name": "genders",
"type": "gender",
"children": [
{
"name": "female",
"type": "gender"
},
{
"name": "male",
"type": "gender"
}
]
}
]
}
By creating tree traversal code i can fetch all the keys and value of the JSON but now cannot group columns as they are too much normalized
I am using below code on my jSON:
import pandas as pd
def nested_parser(indict, pre=None):
pre = pre[:] if pre else []
if isinstance(indict, dict):
for key, value in indict.items():
if isinstance(value, dict):
for d in nested_parser(value, pre [key '_dict']):
yield d
elif isinstance(value, list) or isinstance(value, tuple):
for idx,v in enumerate(value):
for d in nested_parser(v, pre [key '_array_' str(idx)]):
yield d
else:
yield pre [key, value]
else:
yield pre [indict]
lst = [i for i in nested_parser(data)]
max_dept_elem = len(max(lst, key=len))
padded_list = [i[:-1] [None] * (max_dept_elem-len(i)) i[-1:] if len(i)<max_dept_elem else i for i in lst]
df = pd.DataFrame(padded_list)
df_cols = ["root_level_" str(col) for col in df.columns.values]
df.columns = df_cols
print(df)
mainly this is achieved by complex traversal of dictionary that has lots of list and this way it is working fast. I cannot flatten and explode much faster so using traversal for flattening of complex JSON
root_level_0 root_level_1 root_level_2 root_level_3 root_level_4 root_level_5
0 name None None None None root
1 type_dict sample None None None text
2 teacher_array_0 name None None None properties
3 teacher_array_0 type None None None feature
4 teacher_array_0 father_array_0 name None None print
5 teacher_array_0 father_array_0 type None None feature
6 teacher_array_0 father_array_0 children_array_0 name None graphic print
7 teacher_array_0 father_array_0 children_array_0 type None feature
8 teacher_array_0 father_array_0 children_array_0 inherits None true
9 teacher_array_0 father_array_0 children_array_1 name None striped print
10 teacher_array_0 father_array_0 children_array_1 type None feature
11 teacher_array_0 father_array_0 children_array_1 inherits None true
12 teacher_array_0 father_array_0 children_array_1 sister_array_0 name pinstriped
13 teacher_array_0 father_array_0 children_array_1 sister_array_0 type feature
14 teacher_array_0 father_array_0 children_array_1 sister_array_0 inherits true
15 teacher_array_0 father_array_0 children_array_1 sister_array_1 name light stripe
16 teacher_array_0 father_array_0 children_array_1 sister_array_1 type feature
17 teacher_array_0 father_array_0 children_array_1 sister_array_1 inherits true
18 teacher_array_0 father_array_0 children_array_1 sister_array_2 name wide stripe
19 teacher_array_0 father_array_0 children_array_1 sister_array_2 type feature
20 teacher_array_0 father_array_0 children_array_1 sister_array_2 inherits true
21 teacher_array_1 name None None None colours
22 teacher_array_1 type None None None colour
23 teacher_array_1 father_array_0 name None None main colours
24 teacher_array_1 father_array_0 type None None colour
25 teacher_array_1 father_array_0 reacher_array_0 name None black
26 teacher_array_1 father_array_0 reacher_array_0 type None colour
27 teacher_array_1 father_array_0 reacher_array_0 sister_array_0 name light black
28 teacher_array_1 father_array_0 reacher_array_0 sister_array_0 type colour
29 teacher_array_1 father_array_0 reacher_array_0 sister_array_0 inherits true
30 teacher_array_1 father_array_0 reacher_array_0 sister_array_1 name blue black
31 teacher_array_1 father_array_0 reacher_array_0 sister_array_1 type colour
32 teacher_array_1 father_array_0 reacher_array_0 sister_array_1 inherits true
33 teacher_array_1 father_array_0 reacher_array_1 name None red
34 teacher_array_1 father_array_0 reacher_array_1 type None colour
35 teacher_array_1 father_array_0 reacher_array_1 teacher_array_0 name bright red
36 teacher_array_1 father_array_0 reacher_array_1 teacher_array_0 type colour
37 teacher_array_1 father_array_0 reacher_array_1 teacher_array_0 inherits true
38 teacher_array_1 father_array_0 reacher_array_1 teacher_array_1 name light red
39 teacher_array_1 father_array_0 reacher_array_1 teacher_array_1 type colour
40 teacher_array_2 name None None None genders
41 teacher_array_2 type None None None gender
42 teacher_array_2 children_array_0 name None None female
43 teacher_array_2 children_array_0 type None None gender
44 teacher_array_2 children_array_1 name None None male
45 teacher_array_2 children_array_1 type None None gender
I have also tried grouping and collecting as list but dont know how to proceed further to convert these columns to rows.
grouped_df = df.groupby(['root_level_0','root_level_1','root_level_2','root_level_3','root_level_4'])['root_level_5'].apply(list)
print(grouped_df)
root_level_0 root_level_1 root_level_2 root_level_3 root_level_4
teacher_array_0 father_array_0 children_array_1 sister_array_0 inherits [true]
name [pinstriped]
type [feature]
sister_array_1 inherits [true]
name [light stripe]
type [feature]
sister_array_2 inherits [true]
name [wide stripe]
type [feature]
teacher_array_1 father_array_0 reacher_array_0 sister_array_0 inherits [true]
name [light black]
type [colour]
sister_array_1 inherits [true]
name [blue black]
type [colour]
reacher_array_1 teacher_array_0 inherits [true]
name [bright red]
type [colour]
teacher_array_1 name [light red]
type [colour]
I have tried multiple things but not able to achieve data frame like below:
name type.sample teacher.name teacher.type ... teacher.father.reacher.teacher.type teacher.father.reacher.teacher.inherits teacher.children.name teacher.children.type
0 root text properties feature ... NaN NaN NaN NaN
1 root text properties feature ... NaN NaN NaN NaN
2 root text properties feature ... NaN NaN NaN NaN
3 root text properties feature ... NaN NaN NaN NaN
4 root text colours colour ... NaN NaN NaN NaN
5 root text colours colour ... NaN NaN NaN NaN
6 root text colours colour ... colour true NaN NaN
7 root text colours colour ... colour NaN NaN NaN
8 root text genders gender ... NaN NaN female gender
9 root text genders gender ... NaN NaN male gender
Any help will be great.
CodePudding user response:
Updated for child name agnosticism
This should do what you're looking for:
rows = []
def processData(data, prefix = "root", row = dict(), doEmit = True):
#add a column for each new attribute
hasChildren = False
for k in data.keys():
if type(data[k]) == str:
colName = prefix "_" k
row[colName] = data[k]
elif type(data[k]) == dict:
prefix = "_" k
child = data[k]
processData(child,prefix = prefix, row = row, doEmit = False);
for k in data.keys():
if type(data[k]) == list:
hasChildren = True
#update prefix
prefix = "_" k
for child in data[k]:
processData(child,prefix = prefix, row = row.copy());
if not hasChildren and doEmit:
#finished row when there is no child
rows.append(row);
return
processData(data)
df = pd.DataFrame(rows)
print(df)