Convert nested dictionary to pandas dataframe in python-CodePudding

All, I have the following nested dictionary (from a JSON API response). I would like to access the individual items by means of a pandas dataframe.

The dictionary looks as follows:

{'pagination': {'limit': 2, 'offset': 0, 'count': 2, 'total': 1474969}, 'data': [{'flight_date': '2022-10-12', 'flight_status': 'active', 'departure': {'airport': 'Tullamarine', 'timezone': 'Australia/Melbourne', 'iata': 'MEL', 'icao': 'YMML', 'terminal': '2', 'gate': '16', 'delay': 20, 'scheduled': '2022-10-12T00:50:00 00:00', 'estimated': '2022-10-12T00:50:00 00:00', 'actual': '2022-10-12T01:09:00 00:00', 'estimated_runway': '2022-10-12T01:09:00 00:00', 'actual_runway': '2022-10-12T01:09:00 00:00'}, 'arrival': {'airport': 'Hong Kong International', 'timezone': 'Asia/Hong_Kong', 'iata': 'HKG', 'icao': 'VHHH', 'terminal': '1', 'gate': None, 'baggage': None, 'delay': None, 'scheduled': '2022-10-12T06:55:00 00:00', 'estimated': '2022-10-12T06:55:00 00:00', 'actual': None, 'estimated_runway': None, 'actual_runway': None}, 'airline': {'name': 'Finnair', 'iata': 'AY', 'icao': 'FIN'}, 'flight': {'number': '5844', 'iata': 'AY5844', 'icao': 'FIN5844', 'codeshared': {'airline_name': 'cathay pacific', 'airline_iata': 'cx', 'airline_icao': 'cpa', 'flight_number': '178', 'flight_iata': 'cx178', 'flight_icao': 'cpa178'}}, 'aircraft': None, 'live': None}, {'flight_date': '2022-10-12', 'flight_status': 'active', 'departure': {'airport': 'Tullamarine', 'timezone': 'Australia/Melbourne', 'iata': 'MEL', 'icao': 'YMML', 'terminal': '2', 'gate': '5', 'delay': 25, 'scheduled': '2022-10-12T00:30:00 00:00', 'estimated': '2022-10-12T00:30:00 00:00', 'actual': '2022-10-12T00:55:00 00:00', 'estimated_runway': '2022-10-12T00:55:00 00:00', 'actual_runway': '2022-10-12T00:55:00 00:00'}, 'arrival': {'airport': 'Kuala Lumpur International Airport (klia)', 'timezone': 'Asia/Kuala_Lumpur', 'iata': 'KUL', 'icao': 'WMKK', 'terminal': '1', 'gate': None, 'baggage': None, 'delay': 3, 'scheduled': '2022-10-12T06:00:00 00:00', 'estimated': '2022-10-12T06:00:00 00:00', 'actual': None, 'estimated_runway': None, 'actual_runway': None}, 'airline': {'name': 'KLM', 'iata': 'KL', 'icao': 'KLM'}, 'flight': {'number': '4109', 'iata': 'KL4109', 'icao': 'KLM4109', 'codeshared': {'airline_name': 'malaysia airlines', 'airline_iata': 'mh', 'airline_icao': 'mas', 'flight_number': '128', 'flight_iata': 'mh128', 'flight_icao': 'mas128'}}, 'aircraft': None, 'live': None}]}

The dictionary is stored under the variable name api_response. I am using the following code to convert to a dataframe as described in https://sparkbyexamples.com/pandas/pandas-convert-json-to-dataframe/

My code:

import boto3
import json
from datetime import datetime
import calendar
import random
import time
import requests
import pandas as pd

aircraftdata = ''
params = {
'access_key': 'KEY',
'limit': '2',
'flight_status':'active'
  }


url = "http://api.aviationstack.com/v1/flights"


api_result = requests.get('http://api.aviationstack.com/v1/flights', params)
api_statuscode =  api_result.status_code
api_response = api_result.json()


print (type(api_response)) #dictionary
print (api_response)


df = pd.DataFrame.from_dict(api_response, orient = 'index')

This yields the following error:

AttributeError: 'list' object has no attribute 'items'

I would like to obtain a dataframe with for each flight the live data:


flight_iata, live_latitude, live_longitude
AA1004, 36.2, -106.8

CodePudding user response：

Considering the desired output, let's say that the dictionary dic looks like the following

dic = {
    "pagination": {
        "limit": 100,
        "offset": 0,
        "count": 100,
        "total": 1669022
    },
    "data": [
        {
            "flight_date": "2019-12-12",
            "flight_status": "active",
            "departure": {
                "airport": "San Francisco International",
                "timezone": "America/Los_Angeles",
                "iata": "SFO",
                "icao": "KSFO",
                "terminal": "2",
                "gate": "D11",
                "delay": 13,
                "scheduled": "2019-12-12T04:20:00 00:00",
                "estimated": "2019-12-12T04:20:00 00:00",
                "actual": "2019-12-12T04:20:13 00:00",
                "estimated_runway": "2019-12-12T04:20:13 00:00",
                "actual_runway": "2019-12-12T04:20:13 00:00"
            },
            "arrival": {
                "airport": "Dallas/Fort Worth International",
                "timezone": "America/Chicago",
                "iata": "DFW",
                "icao": "KDFW",
                "terminal": "A",
                "gate": "A22",
                "baggage": "A17",
                "delay": 0,
                "scheduled": "2019-12-12T04:20:00 00:00",
                "estimated": "2019-12-12T04:20:00 00:00",
                "actual": None,
                "estimated_runway": None,
                "actual_runway": None
            },
            "airline": {
                "name": "American Airlines",
                "iata": "AA",
                "icao": "AAL"
            },
            "flight": {
                "number": "1004",
                "iata": "AA1004",
                "icao": "AAL1004",
                "codeshared": None
            },
            "aircraft": {
                "registration": "N160AN",
                "iata": "A321",
                "icao": "A321",
                "icao24": "A0F1BB"
            },
            "live": {
                "updated": "2019-12-12T10:00:00 00:00",
                "latitude": 36.28560000,
                "longitude": -106.80700000,
                "altitude": 8846.820,
                "direction": 114.340,
                "speed_horizontal": 894.348,
                "speed_vertical": 1.188,
                "is_ground": False
            }
        }
    ]
}

In order to obtain the desired output, one can start by converting the dictionary to a dataframe with pandas.DataFrame

df = pd.DataFrame(dic['data'], columns=['flight', 'live'])

[Out]:

                                              flight                                               live
0  {'number': '1004', 'iata': 'AA1004', 'icao': '...  {'updated': '2019-12-12T10:00:00 00:00', 'lati...

Then, one can use .apply() with a custom lambda function as follows

df = df[['flight', 'live']].apply(lambda x: pd.Series([x['flight']['iata'], x['live']['latitude'], x['live']['longitude'], x['live']['altitude']]), axis=1)

[Out]:

        0        1        2        3
0  AA1004  36.2856 -106.807  8846.82

Finally, the only thing missing is the name of the columns. And in order to change it, one can do the following

df.columns = ['flight_iata', 'live_latitude', 'live_longitude', 'live_altitude']

[Out]:

  flight_iata  live_latitude  live_longitude  live_altitude
0      AA1004        36.2856        -106.807        8846.82

And that is the desired output.

CodePudding user response：

df = pd.json_normalize(api_response["data"])
df = df[df.loc[:, df.columns.str.contains("live", case=False)].columns]

print(df)

                live.updated  live.latitude  live.longitude  live.altitude  live.direction  live.speed_horizontal  live.speed_vertical  live.is_ground
0  2019-12-12T10:00:00 00:00        36.2856        -106.807        8846.82          114.34                894.348                1.188           False