I'm trying to extract data in a dataframe. My attempts with pd.json_normalize did not work... I must be doing something wrong.
Exemple :
{
"data": [
{
"date": {
"01_07_2020": [
{
"customerId": "977869f4e181e656d",
"data": [
{
"_id": "5e1c75498de14f0bb5d",
"sensorType": "FLAT",
"external": 0.0,
"stats": {
"min": 19.5,
"max": 20.75,
"avg": 20.0714285714,
"diff": -7.9478021978,
"last": 19.75
}
},
...
}
}
]
},
{
"customerId": "5efaf52b0b26e2ae31816",
"data": [
{
"_id": "5efb44604bd91a7cde4c",
"sensorType": "FLAT",
"external": 0.0,
"stats": {
"min": 23.0,
"max": 23.0,
"avg": 23.0,
"diff": null,
"last": 23.0
}
},
{
"_id": "5efb44604bd9126e2de4d",
"sensorType": "FLAT",
"external": 0.0,
"stats": {
"min": 17.75,
"max": 19.75,
"avg": 18.5833333333,
"diff": null,
"last": 17.75
}
}
]
}
]
},
"year": 2020
},
{
"date": {
"01_07_2021":
etc...
Expected result :
_id | sensorType | extarnal | min | max | avg | diff | last |
---|---|---|---|---|---|---|---|
5e1c75498de14f0bb5d | FLAT | 0.0 | 17.75 | 19.5 | 20.75 | 20.0714285714 | -7.9478021978 |
I don't show my results, I am very far from getting what I want.
CodePudding user response:
You can try:
import json
import pandas as pd
json_data = r"""{
"data": [
{
"date": {
"01_07_2020": [
{
"customerId": "977869f4e181e656d",
"data": [
{
"_id": "5e1c75498de14f0bb5d",
"sensorType": "FLAT",
"external": 0.0,
"stats": {
"min": 19.5,
"max": 20.75,
"avg": 20.0714285714,
"diff": -7.9478021978,
"last": 19.75
}
}
]
},
{
"customerId": "5efaf52b0b26e2ae31816",
"data": [
{
"_id": "5efb44604bd91a7cde4c",
"sensorType": "FLAT",
"external": 0.0,
"stats": {
"min": 23.0,
"max": 23.0,
"avg": 23.0,
"diff": null,
"last": 23.0
}
},
{
"_id": "5efb44604bd9126e2de4d",
"sensorType": "FLAT",
"external": 0.0,
"stats": {
"min": 17.75,
"max": 19.75,
"avg": 18.5833333333,
"diff": null,
"last": 17.75
}
}
]
}
]
}
}
]
}"""
def get_data(o):
if isinstance(o, dict):
if "_id" in o and "stats" in o:
yield o
else:
for v in o.values():
yield from get_data(v)
elif isinstance(o, list):
for v in o:
yield from get_data(v)
data = json.loads(json_data)
all_data = []
for d in get_data(data):
all_data.append(
{"_id": d["_id"], "sensorType": d["sensorType"], **d["stats"]}
)
df = pd.DataFrame(all_data)
print(df)
Prints:
_id sensorType min max avg diff last
0 5e1c75498de14f0bb5d FLAT 19.50 20.75 20.071429 -7.947802 19.75
1 5efb44604bd91a7cde4c FLAT 23.00 23.00 23.000000 NaN 23.00
2 5efb44604bd9126e2de4d FLAT 17.75 19.75 18.583333 NaN 17.75
EDIT: Different method to create the dataframe (with customerId
and date
):
data = json.loads(json_data)
all_data = []
for d in data["data"]:
for dt, dd in d["date"].items():
for ddd in dd:
customer_id = ddd["customerId"]
for dddd in ddd["data"]:
all_data.append(
{
"date": dt,
"customerId": customer_id,
"_id": dddd["_id"],
"sensorType": dddd["sensorType"],
**dddd["stats"],
}
)
df = pd.DataFrame(all_data)
print(df)
Prints:
date customerId _id sensorType min max avg diff last
0 01_07_2020 977869f4e181e656d 5e1c75498de14f0bb5d FLAT 19.50 20.75 20.071429 -7.947802 19.75
1 01_07_2020 5efaf52b0b26e2ae31816 5efb44604bd91a7cde4c FLAT 23.00 23.00 23.000000 NaN 23.00
2 01_07_2020 5efaf52b0b26e2ae31816 5efb44604bd9126e2de4d FLAT 17.75 19.75 18.583333 NaN 17.75