How to parse JSON data if keys are not same?-CodePudding

I am trying to create a script that parse json to csv format . I have created a script in python that works well only if only all the keys are present in the object. if some keys are not available in some object than code is breaking. if i add try catch than the script skipping the feild value instead of creating empty value for the same key if not present.

The Below script is not working for the below json as in that json first_name and last_name .. etc contains not present in other list or dict only present at first index. json as below on below path

https://github.com/david8981/sample-data/blob/main/data1.json

The below is the python code

import json
import csv


def parse_from_list(json_list, output_path):
    # Grab field names
    field_names = []
    for key in json_list[0].keys():
        current_key_list = set()
        found_list = False
        for obj in json_list:
            if isinstance(obj[key], dict):
                found_list = True
                for subkey in obj[key]:
                    current_key_list.add(subkey)
        if found_list:
            field_names.extend(['%s|||---|||%s' % (key, x) for x in list(current_key_list)])
        else:
            field_names.append(key)

    # Loop through objects and grab out data
    json_data = []
    for obj in json_list:
        new_data = {}
        for field in field_names:
            new_value = ''
            if '|||---|||' in field:
                if field.split('|||---|||')[0] in obj and field.split('|||---|||')[1] in obj[field.split('|||---|||')[0]]:
                    new_value = obj[field.split('|||---|||')[0]][field.split('|||---|||')[1]]
            else:
                if field in obj:
                    new_value = obj[field]
            new_data[field.replace('|||---|||', '_')] = new_value
        json_data.append(new_data)

    keys = json_data[0].keys()

    with open(output_path, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(json_data)
    pass


parse_from_list(json.load(open('data.json', 'rb'))['data']['items'], 'output.csv')

The code is working fine for below json

https://github.com/david8981/sample-data/blob/main/data2.json

CodePudding user response：

If you have a known set of fields in your result file then you can specify them by hand.

data0 = [
    {"name": "a", "age": 10},
    {"name": "b", "age": 12},
]

data1 = [
    {"name": "c", "gender": "F"},
    {"name": "d", "gender": "M"},
]

import csv

expected_fields = [
    "name",
    "age",
    "gender"
]

with open("./test.csv", "w", newline="") as file_out:
    writer = csv.DictWriter(file_out, fieldnames=expected_fields)
    writer.writeheader()
    writer.writerows(data0)
    writer.writerows(data1)

If you need to have fields specified dynamically based on the set of fields in both inputs you can do:

expected_fields = set(data0[0].keys()).union(data1[0].keys())

or even

expected_fields = set(key for row in (data0   data1) for key in row.keys())

Given your specific data in :

https://github.com/david8981/sample-data/blob/main/data1.json

This code:

import csv
import json

def flatten_dictionary(d):
    out = {}
    for key, val in d.items():
        if isinstance(val, dict):
            deeper = flatten_dictionary(val).items()
            out.update({f"{key}_{key2}": val2 for key2, val2 in deeper})
        elif isinstance(val, list):
            for subindex, subdict in enumerate(val):
                ## -----------------------
                ## dive into each item if needed
                ## -----------------------
                ## deeper = flatten_dictionary(subdict).items()
                ## out.update({f"{key}_{str(subindex)}_{key2}": val2 for key2, val2 in deeper})
                ## -----------------------
                out.update({f"{key}_{str(subindex)}": subdict})
        else:
            out[key] = val
    return out

with open("test.json", "r", encoding="utf-8") as file_in:
    users = [flatten_dictionary(row) for row in json.load(file_in)["data"]["items"]]

expected_fields = sorted(set(key for row in users for key in row.keys()))

with open("./test.csv", "w", newline="") as file_out:
    writer = csv.DictWriter(file_out, fieldnames=expected_fields)
    writer.writeheader()
    writer.writerows(users)

will give you a flattened CSV with the following columns:

albums_query
avatar_favicon
avatar_inline
avatar_message
avatar_print
avatar_profile
avatar_type
biography
date_pattern
deleted
email
email_address_privacy
first_name
followers_query
following_query
friendly_date_enabled
friendly_date_max_age
href
id
images_query
kudos_given_query
kudos_received_query
language
last_name
last_visit_time
location
login
mailbox_type
messages_query
metrics_type
online_status
online_status_privacy
personal_info_privacy
public_images_query
rank_bold
rank_icon_left
rank_id
rank_name
rank_position
rank_rank_status
rank_type
registration_data_registration_time
registration_data_type
reviews_query
roles_query
settings_query
solutions_authored_query
threads_participated_query
topics_query
type
user_badges_items_0
user_badges_items_1
user_badges_items_2
user_badges_list_item_type
user_badges_size
videos_query
view_href
web_page_url

CodePudding user response：

Debugging your code was very difficult because your test data is so large coupled with the fact that you provided no explanation for how it works. That said, the problem is because it's using the keys in first object to determine which ones to look for in all the rest of them in the list, which of course causes a KeyError when one isn't present.

A simple way to fix that is to "grab" the field names by examining the keys in all the object in the json_list. Do so will make it work with both of your JSON files.

import json
import csv


def parse_from_list(json_list, output_path):
    # Determine field names from keys in all items in json_list.
    field_names = []
    for obj in json_list:
        for key in obj.keys():
            current_keys = set()
            found_dict = False
            if isinstance(obj[key], dict):
                found_dict = True
                for subkey in obj[key]:
                    current_keys.add(subkey)
            if found_dict:
                field_names.extend([f'{key}|||---|||{x}' for x in current_keys])
            else:
                field_names.append(key)

    # Loop through objects and grab out data
    json_data = []
    for obj in json_list:
        new_data = {}
        for field in field_names:
            new_value = ''
            if '|||---|||' in field:
                if(field.split('|||---|||')[0] in obj and
                   field.split('|||---|||')[1] in obj[field.split('|||---|||')[0]]):
                    new_value = obj[field.split('|||---|||')[0]][field.split('|||---|||')[1]]
            else:
                if field in obj:
                    new_value = obj[field]
            new_data[field.replace('|||---|||', '_')] = new_value
        json_data.append(new_data)

    keys = json_data[0].keys()

    with open(output_path, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(json_data)


with open('data1.json', 'rb') as json_file:
    json_data = json.load(json_file)
    parse_from_list(json_data['data']['items'], 'output.csv')