How to convert nested json to csv with multiple different names?-CodePudding

I've been trying to convert a nested json file to csv. Here is a small example of the json file.

 json_data =   
{"labels":
    {
      "longfilename01:png": {
        "events": {
          "-N8V6uUR__vvB0qv1lPb": {
            "t": "2022-08-02T19:54:23.608Z",
            "user": "bmEhwNCZT9Wiftgvsopb7vBjO9o1"
          }
        },
        "questions": {
          "would-you": {
            "-N8V6uUR__vvB0qv1lPb": {
              "answer": "no",
              "format": 1
            }
          }
        }
      },
      "longfilename02:png": {
        "events": {
          "-N8ILnaH-1ylwp2LGvtP": {
            "t": "2022-07-31T08:24:23.698Z",
            "user": "Qf7C5cXQkXfQanxKPR0rsKW4QzE2"
          }
        },
        "questions": {
          "would-you": {
            "-N8ILnaH-1ylwp2LGvtP": {
              "answer": "yes",
              "format": 1
            }
          }
        }
      }
    }
}

I've tried multiple ways to get this output:

Labels	Event	User	Time	Answer
Long filename 01	-N8V6uUR__vvB0qv1lPb	bmEhwNCZT9Wiftgvsopb7vBjO9o1	2022-08-02T19:54:23.608Z	no
Long filename 02	-N8ILnaH-1ylwp2LGvtP	Qf7C5cXQkXfQanxKPR0rsKW4QzE2	2022-07-31T08:24:23.698Z	yes

If I normalise with:

# Load the JSON document; the context manager closes the file handle even if
# parsing fails (the original left it open).
with open('after_labels.json', encoding='utf-8') as f:
    data = json.load(f)

# json_normalize flattens nested dicts into dotted column names.
df = pd.json_normalize(data)

Or try to flatten the file with multiple functions such as:

def flatten_json(json):
    """Flatten a nested dict/list structure into a single-level dict.

    Every leaf value is stored under the '__'-joined path of keys that leads
    to it. List elements contribute their index (as a string) to the path.
    Empty dicts and empty lists have no leaves, so they produce no entry.

    Args:
        json: The top-level dict to flatten. (The parameter name shadows the
            ``json`` module inside this function; it is kept so existing
            callers are unaffected.)

    Returns:
        A dict mapping '__'-joined key paths to leaf values.
    """
    def process_value(keys, value, flattened):
        if isinstance(value, dict):
            for key, child in value.items():
                # Extend the path with a new list so sibling branches do not
                # see each other's keys.
                process_value(keys + [key], child, flattened)
        elif isinstance(value, list):
            for idx, child in enumerate(value):
                process_value(keys + [str(idx)], child, flattened)
        else:
            flattened['__'.join(keys)] = value

    flattened = {}
    for key, value in json.items():
        process_value([key], value, flattened)
    return flattened

# NOTE: flatten_json returns a plain dict of '__'-joined key paths, so despite
# its name `df` is a dict here, not a DataFrame.
df = flatten_json(data)

from copy import deepcopy
import pandas


def cross_join(left, right):
    """Combine every row of ``left`` with every row of ``right``.

    Each row is a dict. A combined row starts as a deep copy of the left row
    and is then overlaid with the right row's items, so right-hand values win
    on key clashes. When ``right`` is empty, ``left`` itself is returned
    unchanged.
    """
    combined = left if not right else []
    for base_row in left:
        for extra_row in right:
            merged = deepcopy(base_row)
            merged.update(extra_row)
            # Copy again so the stored row shares nothing with extra_row.
            combined.append(deepcopy(merged))
    return combined


def flatten_list(data):
    """Lazily yield the non-list elements of ``data``, depth first.

    Nested lists are expanded in place, in order. Only ``list`` instances are
    descended into; tuples and other containers are yielded as-is.
    """
    # Stack of partially consumed iterators, innermost list on top.
    pending = [iter(data)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend now; the parent iterator resumes after the child.
                pending.append(iter(item))
                break
            yield item
        else:
            # The innermost iterator is exhausted.
            pending.pop()


def json_to_dataframe(data_in):
    """Flatten nested JSON-like data into a pandas DataFrame.

    Dict keys become dotted column names ("outer.inner"). The rows produced
    by each key of a dict are cross-joined with the rows built so far. Each
    item of a list contributes its own rows.

    Args:
        data_in: Nested structure of dicts, lists and scalar values.

    Returns:
        A ``pandas.DataFrame`` built from the flattened rows.
    """
    def flatten_json(data, prev_heading=''):
        if isinstance(data, dict):
            rows = [{}]
            for key, value in data.items():
                rows = cross_join(rows, flatten_json(value, prev_heading + '.' + key))
        elif isinstance(data, list):
            rows = []
            for item in data:
                rows.extend(flatten_list(flatten_json(item, prev_heading)))
        else:
            # Drop the leading '.' that the dict branch prepends to the path.
            rows = [{prev_heading[1:]: data}]
        return rows

    return pandas.DataFrame(flatten_json(data_in))

# Flatten the loaded JSON into a DataFrame and show the result.
df = json_to_dataframe(data)
print(df)

Every one of these attempts gives me 292 columns, and I suspect this is because of the long unique filenames.

I can't change the JSON file before processing, although that would seem like the simple solution: storing the name as "filename": "longfilename01:png" would make all entries consistent, and I wouldn't have this problem.

I would be grateful for any other clever ideas on how to solve this.

CodePudding user response：

Try:

json_data = {
    "labels": {
        "longfilename01:png": {
            "events": {
                "-N8V6uUR__vvB0qv1lPb": {
                    "t": "2022-08-02T19:54:23.608Z",
                    "user": "bmEhwNCZT9Wiftgvsopb7vBjO9o1",
                }
            },
            "questions": {
                "would-you": {
                    "-N8V6uUR__vvB0qv1lPb": {"answer": "no", "format": 1}
                }
            },
        },
        "longfilename02:png": {
            "events": {
                "-N8ILnaH-1ylwp2LGvtP": {
                    "t": "2022-07-31T08:24:23.698Z",
                    "user": "Qf7C5cXQkXfQanxKPR0rsKW4QzE2",
                }
            },
            "questions": {
                "would-you": {
                    "-N8ILnaH-1ylwp2LGvtP": {"answer": "yes", "format": 1}
                }
            },
        },
    }
}


def _first_event_row(label, entry):
    """Build one output row from a label's first event and first answer."""
    events = entry["events"]
    event_id = list(events)[0]
    event = list(events.values())[0]
    row = {
        "Labels": label,
        "Event": event_id,
        "User": event["user"],
        "Time": event["t"],
    }
    # The answer sits two levels down: question name -> event id -> details.
    first_question = list(entry["questions"].values())[0]
    first_response = list(first_question.values())[0]
    row["Answer"] = first_response["answer"]
    return row


# One row per label; the label (file name) becomes a column value, not a
# column name.
df = pd.DataFrame(
    [_first_event_row(label, entry) for label, entry in json_data["labels"].items()]
)
print(df)

Prints:

               Labels                 Event                          User                      Time Answer
0  longfilename01:png  -N8V6uUR__vvB0qv1lPb  bmEhwNCZT9Wiftgvsopb7vBjO9o1  2022-08-02T19:54:23.608Z     no
1  longfilename02:png  -N8ILnaH-1ylwp2LGvtP  Qf7C5cXQkXfQanxKPR0rsKW4QzE2  2022-07-31T08:24:23.698Z    yes