Python Flatten Multilevel JSON (LinkedIn API) to CSV-CodePudding

I am trying to DE-NEST the below JSON in python to create a CSV table, can somebody help?

{
    "paging": { "start": 0, "count": 10, "links": [] },
    "elements": [
        {
            "followerGains": {
                "organicFollowerGain": 2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634169600000, "end": 1634256000000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -1,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634256000000, "end": 1634342400000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634342400000, "end": 1634428800000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": 0,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634428800000, "end": 1634515200000 }
        },

I tried the below code but it flattens everything into a single line. I read in another thread that using json_normalize() will structure data in columns. But can someone please tell me how to do it for this case?

The code I used is as follows Python code

import json
import pandas as pd
from pandas.io.json import json_normalize

data = json.load(open('C:/Users/Muj/Downloads/Linkedin data/follower_statistics_per_day.json'))

def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name   a   '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name   str(i)   '_')
                i  = 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

fd=flatten_json(data)
flat_data=json_normalize(fd)

flat_data.to_csv('C:/Users/Muj/Downloads/Linkedin data/test1.csv', index = False)

Can someone please help

The desired output is as follows -

organicFollowerGain	paidFollowerGain	organizationalEntity
2	0	urn:li:organization:28849398
-1	0	urn:li:organization:28849398

CodePudding user response：

There is no need for your flatten_json function. Just pass the elements portion directly to json_nomalize

flat_data = json_normalize(data['elements'])

That returns

organizationalEntity,followerGains.organicFollowerGain,followerGains.paidFollowerGain,timeRange.start,timeRange.end
urn:li:organization:28849398,2,0,1634169600000,1634256000000
urn:li:organization:28849398,-1,0,1634256000000,1634342400000
urn:li:organization:28849398,-2,0,1634342400000,1634428800000
urn:li:organization:28849398,0,0,1634428800000,1634515200000

Then you just need to rename the column headers and remove any columns you don't want.

# Rename columns to only use the final section in dot name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)

And that returns

organizationalEntity,organicFollowerGain,paidFollowerGain
urn:li:organization:28849398,2,0
urn:li:organization:28849398,-1,0
urn:li:organization:28849398,-2,0
urn:li:organization:28849398,0,0

Putting that all together:

from pandas.io.json import json_normalize

data = {
    "paging": { "start": 0, "count": 10, "links": [] },
    "elements": [
        {
            "followerGains": {
                "organicFollowerGain": 2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634169600000, "end": 1634256000000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -1,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634256000000, "end": 1634342400000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634342400000, "end": 1634428800000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": 0,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634428800000, "end": 1634515200000 }
        }
    ]
}

flat_data = json_normalize(data['elements'])

# Rename columns to only use the final section in dot name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)

# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)

flat_data.to_csv('out.csv', index=False)

CodePudding user response：

Try the below (Not using any external lib - just core python)

import csv
data = {
    "paging": { "start": 0, "count": 10, "links": [] },
    "elements": [
        {
            "followerGains": {
                "organicFollowerGain": 2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634169600000, "end": 1634256000000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -1,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634256000000, "end": 1634342400000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634342400000, "end": 1634428800000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": 0,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634428800000, "end": 1634515200000 }
        },
    ]}
holder = []
for e in data['elements']:
  temp = [str(e['followerGains']['organicFollowerGain'])]
  temp.append(str(e['followerGains']['paidFollowerGain']))
  temp.append(e['organizationalEntity'])
  temp.append(str(e['timeRange']['start']))
  temp.append(str(e['timeRange']['end']))
  holder.append(temp)
with open('out.csv','w') as f:
    f.write('organicFollowerGain,paidFollowerGain,organizationalEntity,start,end\n')
    writer = csv.writer(f)
    writer.writerows(holder)

out.csv

organicFollowerGain,paidFollowerGain,organizationalEntity,start,end
2,0,urn:li:organization:28849398,1634169600000,1634256000000
-1,0,urn:li:organization:28849398,1634256000000,1634342400000
-2,0,urn:li:organization:28849398,1634342400000,1634428800000
0,0,urn:li:organization:28849398,1634428800000,1634515200000