I've been trying to convert a nested json file to csv. Here is a small example of the json file.
json_data =
{"labels":
{
"longfilename01:png": {
"events": {
"-N8V6uUR__vvB0qv1lPb": {
"t": "2022-08-02T19:54:23.608Z",
"user": "bmEhwNCZT9Wiftgvsopb7vBjO9o1"
}
},
"questions": {
"would-you": {
"-N8V6uUR__vvB0qv1lPb": {
"answer": "no",
"format": 1
}
}
}
},
"longfilename02:png": {
"events": {
"-N8ILnaH-1ylwp2LGvtP": {
"t": "2022-07-31T08:24:23.698Z",
"user": "Qf7C5cXQkXfQanxKPR0rsKW4QzE2"
}
},
"questions": {
"would-you": {
"-N8ILnaH-1ylwp2LGvtP": {
"answer": "yes",
"format": 1
}
}
}
}
}
}
I've tried multiple ways to get this output:
Labels | Event | User | Time | Answer |
---|---|---|---|---|
Long filename 01 | -N8V6uUR__vvB0qv1lPb | bmEhwNCZT9Wiftgvsopb7vBjO9o1 | 2022-08-02T19:54:23.608Z | no |
Long filename 02 | -N8ILnaH-1ylwp2LGvtP | Qf7C5cXQkXfQanxKPR0rsKW4QzE2 | 2022-07-31T08:24:23.698Z | yes |
If I normalise with:
# Load the nested JSON file; the context manager guarantees the file handle
# is closed even if parsing fails.
with open('after_labels.json') as f:
    data = json.load(f)
# json_normalize turns every nested key path into its own column.
df = pd.json_normalize(data)
Or try to flatten the file with multiple functions such as:
def flatten_json(json):
    """Flatten nested dicts/lists into a single-level dict.

    Each leaf value is stored under the path that leads to it, with the path
    components joined by ``'__'``. List positions contribute their index
    (as a string) to the path.

    Args:
        json: The parsed JSON object (a dict) to flatten. The name shadows
            the ``json`` module inside this function; it is kept so that
            existing callers are unaffected.

    Returns:
        A flat dict mapping ``'__'``-joined key paths to leaf values. Empty
        dicts and empty lists contribute no entries.
    """

    def process_value(keys, value, flattened):
        # Recurse through containers; only leaves are written to `flattened`.
        if isinstance(value, dict):
            for key, child in value.items():
                # `keys + [key]` builds a fresh path so sibling branches do
                # not share (and mutate) the same list.
                process_value(keys + [key], child, flattened)
        elif isinstance(value, list):
            for idx, child in enumerate(value):
                process_value(keys + [str(idx)], child, flattened)
        else:
            flattened['__'.join(keys)] = value

    flattened = {}
    for key, value in json.items():
        process_value([key], value, flattened)
    return flattened
# NOTE: flatten_json returns a plain dict (not a DataFrame), despite the `df` name.
df = flatten_json(data)
or
from copy import deepcopy
import pandas
def cross_join(left, right):
    """Return the cross product of two lists of row dicts.

    Every row of ``left`` is combined with every row of ``right``; on a key
    clash the value from the ``right`` row wins.

    Args:
        left: List of dicts (rows built so far).
        right: List of dicts to combine with each row of ``left``.

    Returns:
        A new list of independent (deep-copied) merged rows. If ``right`` is
        empty, ``left`` itself is returned unchanged so that an empty
        container does not wipe out the rows collected so far.
    """
    if not right:
        return left
    # One deep copy of the merged row is enough to detach the result from
    # both inputs.
    return [
        deepcopy({**left_row, **right_row})
        for left_row in left
        for right_row in right
    ]
def flatten_list(data):
    """Yield the non-list elements of an arbitrarily nested list, in order.

    Args:
        data: An iterable whose items may themselves be lists.

    Yields:
        Each element that is not a ``list``, depth-first, left to right.
    """
    for elem in data:
        if isinstance(elem, list):
            # Descend into nested lists instead of yielding them whole.
            yield from flatten_list(elem)
        else:
            yield elem
def json_to_dataframe(data_in):
    """Convert parsed JSON into a pandas DataFrame.

    Nested dict keys are joined with ``'.'`` to form column names. The rows
    produced by the values of one dict are cross-joined, while the items of a
    list each contribute their own rows.

    Relies on ``cross_join`` and ``flatten_list`` defined in this file.

    Args:
        data_in: Parsed JSON data (dict, list, or scalar).

    Returns:
        A ``pandas.DataFrame`` built from the flattened rows.
    """

    def flatten_json(data, prev_heading=''):
        if isinstance(data, dict):
            # Start from a single empty row and widen it key by key.
            rows = [{}]
            for key, value in data.items():
                rows = cross_join(rows, flatten_json(value, prev_heading + '.' + key))
        elif isinstance(data, list):
            rows = []
            for item in data:
                # List items share the parent's heading; each adds rows.
                rows.extend(flatten_list(flatten_json(item, prev_heading)))
        else:
            # Drop the leading '.' that the dict branch prepends to each heading.
            rows = [{prev_heading[1:]: data}]
        return rows

    return pandas.DataFrame(flatten_json(data_in))
# Flatten the loaded JSON into a DataFrame and display it.
df = json_to_dataframe(data)
print(df)
It gives me 292 columns, and I suspect this is because each long, unique filename is a key and therefore ends up in its own set of column names.
I can't change the JSON file before processing. Otherwise the simple solution would be to store the name as a value, e.g. "filename": "longfilename01:png", so that the keys would all be consistent and I wouldn't have this problem.
I would be grateful for any other clever ideas on how to solve this.
CodePudding user response:
Try:
# Sample input: each key under "labels" is a filename that maps to its
# events and to the answers given for each question.
json_data = {
    "labels": {
        "longfilename01:png": {
            "events": {
                "-N8V6uUR__vvB0qv1lPb": {
                    "t": "2022-08-02T19:54:23.608Z",
                    "user": "bmEhwNCZT9Wiftgvsopb7vBjO9o1",
                },
            },
            "questions": {
                "would-you": {
                    "-N8V6uUR__vvB0qv1lPb": {"answer": "no", "format": 1},
                },
            },
        },
        "longfilename02:png": {
            "events": {
                "-N8ILnaH-1ylwp2LGvtP": {
                    "t": "2022-07-31T08:24:23.698Z",
                    "user": "Qf7C5cXQkXfQanxKPR0rsKW4QzE2",
                },
            },
            "questions": {
                "would-you": {
                    "-N8ILnaH-1ylwp2LGvtP": {"answer": "yes", "format": 1},
                },
            },
        },
    },
}


def _label_to_row(label, node):
    """Collapse one entry of ``json_data["labels"]`` into a flat record.

    Only the first event of the label and the first answer of its first
    question are used.
    """
    event_id, event = list(node["events"].items())[0]
    user = event["user"]
    timestamp = event["t"]
    first_question = list(node["questions"].values())[0]
    first_reply = list(first_question.values())[0]
    return {
        "Labels": label,
        "Event": event_id,
        "User": user,
        "Time": timestamp,
        "Answer": first_reply["answer"],
    }


# One row per label, in the order the labels appear in the input.
df = pd.DataFrame(
    [_label_to_row(label, node) for label, node in json_data["labels"].items()]
)
print(df)
Prints:
Labels Event User Time Answer
0 longfilename01:png -N8V6uUR__vvB0qv1lPb bmEhwNCZT9Wiftgvsopb7vBjO9o1 2022-08-02T19:54:23.608Z no
1 longfilename02:png -N8ILnaH-1ylwp2LGvtP Qf7C5cXQkXfQanxKPR0rsKW4QzE2 2022-07-31T08:24:23.698Z yes