I am trying to DE-NEST the below JSON in python to create a CSV table, can somebody help?
{
"paging": { "start": 0, "count": 10, "links": [] },
"elements": [
{
"followerGains": {
"organicFollowerGain": 2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634169600000, "end": 1634256000000 }
},
{
"followerGains": {
"organicFollowerGain": -1,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634256000000, "end": 1634342400000 }
},
{
"followerGains": {
"organicFollowerGain": -2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634342400000, "end": 1634428800000 }
},
{
"followerGains": {
"organicFollowerGain": 0,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634428800000, "end": 1634515200000 }
},
I tried the below code but it flattens everything into a single line. I read in another thread that using json_normalize() will structure data in columns. But can someone please tell me how to do it for this case?
The code I used is as follows Python code
import json
import pandas as pd
from pandas.io.json import json_normalize
data = json.load(open('C:/Users/Muj/Downloads/Linkedin data/follower_statistics_per_day.json'))
def flatten_json(y):
out = {}
def flatten(x, name=''):
if type(x) is dict:
for a in x:
flatten(x[a], name a '_')
elif type(x) is list:
i = 0
for a in x:
flatten(a, name str(i) '_')
i = 1
else:
out[name[:-1]] = x
flatten(y)
return out
fd=flatten_json(data)
flat_data=json_normalize(fd)
flat_data.to_csv('C:/Users/Muj/Downloads/Linkedin data/test1.csv', index = False)
Can someone please help
The desired output is as follows -
organicFollowerGain | paidFollowerGain | organizationalEntity |
---|---|---|
2 | 0 | urn:li:organization:28849398 |
-1 | 0 | urn:li:organization:28849398 |
CodePudding user response:
There is no need for your flatten_json
function. Just pass the elements
portion directly to json_nomalize
flat_data = json_normalize(data['elements'])
That returns
organizationalEntity,followerGains.organicFollowerGain,followerGains.paidFollowerGain,timeRange.start,timeRange.end
urn:li:organization:28849398,2,0,1634169600000,1634256000000
urn:li:organization:28849398,-1,0,1634256000000,1634342400000
urn:li:organization:28849398,-2,0,1634342400000,1634428800000
urn:li:organization:28849398,0,0,1634428800000,1634515200000
Then you just need to rename the column headers and remove any columns you don't want.
# Rename columns to only use the final section in dot name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)
And that returns
organizationalEntity,organicFollowerGain,paidFollowerGain
urn:li:organization:28849398,2,0
urn:li:organization:28849398,-1,0
urn:li:organization:28849398,-2,0
urn:li:organization:28849398,0,0
Putting that all together:
from pandas.io.json import json_normalize
data = {
"paging": { "start": 0, "count": 10, "links": [] },
"elements": [
{
"followerGains": {
"organicFollowerGain": 2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634169600000, "end": 1634256000000 }
},
{
"followerGains": {
"organicFollowerGain": -1,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634256000000, "end": 1634342400000 }
},
{
"followerGains": {
"organicFollowerGain": -2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634342400000, "end": 1634428800000 }
},
{
"followerGains": {
"organicFollowerGain": 0,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634428800000, "end": 1634515200000 }
}
]
}
flat_data = json_normalize(data['elements'])
# Rename columns to only use the final section in dot name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)
flat_data.to_csv('out.csv', index=False)
CodePudding user response:
Try the below (Not using any external lib - just core python)
import csv
data = {
"paging": { "start": 0, "count": 10, "links": [] },
"elements": [
{
"followerGains": {
"organicFollowerGain": 2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634169600000, "end": 1634256000000 }
},
{
"followerGains": {
"organicFollowerGain": -1,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634256000000, "end": 1634342400000 }
},
{
"followerGains": {
"organicFollowerGain": -2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634342400000, "end": 1634428800000 }
},
{
"followerGains": {
"organicFollowerGain": 0,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634428800000, "end": 1634515200000 }
},
]}
holder = []
for e in data['elements']:
temp = [str(e['followerGains']['organicFollowerGain'])]
temp.append(str(e['followerGains']['paidFollowerGain']))
temp.append(e['organizationalEntity'])
temp.append(str(e['timeRange']['start']))
temp.append(str(e['timeRange']['end']))
holder.append(temp)
with open('out.csv','w') as f:
f.write('organicFollowerGain,paidFollowerGain,organizationalEntity,start,end\n')
writer = csv.writer(f)
writer.writerows(holder)
out.csv
organicFollowerGain,paidFollowerGain,organizationalEntity,start,end
2,0,urn:li:organization:28849398,1634169600000,1634256000000
-1,0,urn:li:organization:28849398,1634256000000,1634342400000
-2,0,urn:li:organization:28849398,1634342400000,1634428800000
0,0,urn:li:organization:28849398,1634428800000,1634515200000