Home > Blockchain >  Extract data from Json into Pandas (Python)
Extract data from Json into Pandas (Python)

Time:12-27

I'm trying to extract data in a dataframe. My attempts with pd.json_normalize did not work... I must be doing something wrong.

Exemple :

{
    "data": [
        {
            "date": {
                "01_07_2020": [
                    {
                        "customerId": "977869f4e181e656d",
                        "data": [
                            {
                                "_id": "5e1c75498de14f0bb5d",
                                "sensorType": "FLAT",
                                "external": 0.0,
                                "stats": {
                                    "min": 19.5,
                                    "max": 20.75,
                                    "avg": 20.0714285714,
                                    "diff": -7.9478021978,
                                    "last": 19.75
                                }
                            },
...
                                }
                            }
                        ]
                    },
                    {
                        "customerId": "5efaf52b0b26e2ae31816",
                        "data": [
                            {
                                "_id": "5efb44604bd91a7cde4c",
                                "sensorType": "FLAT",
                                "external": 0.0,
                                "stats": {
                                    "min": 23.0,
                                    "max": 23.0,
                                    "avg": 23.0,
                                    "diff": null,
                                    "last": 23.0
                                }
                            },
                            {
                                "_id": "5efb44604bd9126e2de4d",
                                "sensorType": "FLAT",
                                "external": 0.0,
                                "stats": {
                                    "min": 17.75,
                                    "max": 19.75,
                                    "avg": 18.5833333333,
                                    "diff": null,
                                    "last": 17.75
                                }
                            }
                        ]
                    }
                ]
            },
            "year": 2020
        },
        {
            "date": {
                "01_07_2021": 
etc...

Expected result :

_id sensorType extarnal min max avg diff last
5e1c75498de14f0bb5d FLAT 0.0 17.75 19.5 20.75 20.0714285714 -7.9478021978

I don't show my results, I am very far from getting what I want.

CodePudding user response:

You can try:

import json
import pandas as pd


json_data = r"""{
    "data": [
        {
            "date": {
                "01_07_2020": [
                    {
                        "customerId": "977869f4e181e656d",
                        "data": [
                            {
                                "_id": "5e1c75498de14f0bb5d",
                                "sensorType": "FLAT",
                                "external": 0.0,
                                "stats": {
                                    "min": 19.5,
                                    "max": 20.75,
                                    "avg": 20.0714285714,
                                    "diff": -7.9478021978,
                                    "last": 19.75
                                }
                            }
                        ]
                    },
                    {
                        "customerId": "5efaf52b0b26e2ae31816",
                        "data": [
                            {
                                "_id": "5efb44604bd91a7cde4c",
                                "sensorType": "FLAT",
                                "external": 0.0,
                                "stats": {
                                    "min": 23.0,
                                    "max": 23.0,
                                    "avg": 23.0,
                                    "diff": null,
                                    "last": 23.0
                                }
                            },
                            {
                                "_id": "5efb44604bd9126e2de4d",
                                "sensorType": "FLAT",
                                "external": 0.0,
                                "stats": {
                                    "min": 17.75,
                                    "max": 19.75,
                                    "avg": 18.5833333333,
                                    "diff": null,
                                    "last": 17.75
                                }
                            }
                        ]
                    }
                ]
            }
        }
    ]
}"""


def get_data(o):
    if isinstance(o, dict):
        if "_id" in o and "stats" in o:
            yield o
        else:
            for v in o.values():
                yield from get_data(v)
    elif isinstance(o, list):
        for v in o:
            yield from get_data(v)


data = json.loads(json_data)

all_data = []
for d in get_data(data):
    all_data.append(
        {"_id": d["_id"], "sensorType": d["sensorType"], **d["stats"]}
    )

df = pd.DataFrame(all_data)
print(df)

Prints:

                     _id sensorType    min    max        avg      diff   last
0    5e1c75498de14f0bb5d       FLAT  19.50  20.75  20.071429 -7.947802  19.75
1   5efb44604bd91a7cde4c       FLAT  23.00  23.00  23.000000       NaN  23.00
2  5efb44604bd9126e2de4d       FLAT  17.75  19.75  18.583333       NaN  17.75

EDIT: Different method to create the dataframe (with customerId and date):

data = json.loads(json_data)

all_data = []
for d in data["data"]:
    for dt, dd in d["date"].items():
        for ddd in dd:
            customer_id = ddd["customerId"]
            for dddd in ddd["data"]:
                all_data.append(
                    {
                        "date": dt,
                        "customerId": customer_id,
                        "_id": dddd["_id"],
                        "sensorType": dddd["sensorType"],
                        **dddd["stats"],
                    }
                )

df = pd.DataFrame(all_data)
print(df)

Prints:

         date             customerId                    _id sensorType    min    max        avg      diff   last
0  01_07_2020      977869f4e181e656d    5e1c75498de14f0bb5d       FLAT  19.50  20.75  20.071429 -7.947802  19.75
1  01_07_2020  5efaf52b0b26e2ae31816   5efb44604bd91a7cde4c       FLAT  23.00  23.00  23.000000       NaN  23.00
2  01_07_2020  5efaf52b0b26e2ae31816  5efb44604bd9126e2de4d       FLAT  17.75  19.75  18.583333       NaN  17.75
  • Related