I have about 300,000 rows like the ones below, but all I need is the id and the email address. The dataframe looks like this:
d = {'vid': [1201,1202], 'col2': [[{'vid': 1201, 'saved-at-timestamp': 1638824550030, 'deleted-changed-timestamp': 0,
'identities': [{'type': 'EMAIL', 'value': '[email protected]', 'timestamp': 1548608578090, 'is-primary': True},
{'type': 'LEAD_GUID', 'value': '69c4f6ec-e0e9-4632-8d16-cbc204a57b22', 'timestamp': 1548608578106}]},
{'vid': 314479851, 'saved-at-timestamp': 1638824550030, 'deleted-changed-timestamp': 0, 'identities': []},
{'vid': 183374504, 'saved-at-timestamp': 1638824550030, 'deleted-changed-timestamp': 0, 'identities': []},
{'vid': 17543251, 'saved-at-timestamp': 1638824550030, 'deleted-changed-timestamp': 0, 'identities': []},
{'vid': 99700201, 'saved-at-timestamp': 1638824550030, 'deleted-changed-timestamp': 0, 'identities': []},
{'vid': 65375052, 'saved-at-timestamp': 1638824550030, 'deleted-changed-timestamp': 0, 'identities': []},
{'vid': 17525601, 'saved-at-timestamp': 1638824550030, 'deleted-changed-timestamp': 0, 'identities': []},
{'vid': 238128701, 'saved-at-timestamp': 1638824550030, 'deleted-changed-timestamp': 0, 'identities': []}],
[{'vid': 1202, 'saved-at-timestamp': 1548608578109, 'deleted-changed-timestamp': 0,
'identities': [{'type': 'EMAIL', 'value': '[email protected]', 'timestamp': 1548608578088, 'is-primary': True},
{'type': 'LEAD_GUID', 'value': 'fe6c2628-b1db-47c5-91f6-258e79ea58f0', 'timestamp': 1548608578106}]}]]}
import pandas as pd

df = pd.DataFrame(d)
df
vid col2
1201 [{'vid': 1201, 'saved-at-timestamp': 1638824550030........
1202 [{'vid': 1202, 'saved-at-timestamp': 1548608578109......
Expected output (only the two fields, but for all rows):
vid email
1201 [email protected]
1202 [email protected]
.. ..
I tried to apply the solutions here, but they didn't work.
CodePudding user response:
Here's one way using json_normalize:
out = (pd.concat(pd.json_normalize(lst, ['identities'], 'vid') for lst in d['col2'])
.pipe(lambda x: x[x['type']=='EMAIL'])[['vid','value']]
.rename(columns={'value':'email'}))
Or just use the str accessor repeatedly to pull out only the emails:
df=pd.DataFrame(d)
df['email'] = df['col2'].str[0].str.get('identities').str[0].str.get('value')
df = df.drop(columns='col2')
Output:
vid email
0 1201 [email protected]
0 1202 [email protected]
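If the str-accessor chain feels too position-dependent for ~300,000 rows (it only looks at the first identity of the first record in each list), a plain Python loop is another option. This is just a sketch, assuming the email can sit on any record inside a col2 list:
import pandas as pd

def extract_email(records):
    # Walk every identity of every record and return the first EMAIL value,
    # or None if no record carries an EMAIL identity.
    for record in records:
        for identity in record.get('identities', []):
            if identity.get('type') == 'EMAIL':
                return identity.get('value')
    return None

df = pd.DataFrame(d)
out = pd.DataFrame({'vid': df['vid'], 'email': df['col2'].apply(extract_email)})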
CodePudding user response:
You can use pd.json_normalize:
df = pd.json_normalize([sub for item in d['col2'] for sub in item], record_path='identities', meta='vid')
Output:
>>> df
type value timestamp is-primary vid
0 EMAIL [email protected] 1548608578090 True 1201
1 LEAD_GUID 69c4f6ec-e0e9-4632-8d16-cbc204a57b22 1548608578106 NaN 1201
2 EMAIL [email protected] 1548608578088 True 1202
3 LEAD_GUID fe6c2628-b1db-47c5-91f6-258e79ea58f0 1548608578106 NaN 1202
And now just use .loc to get the data you want:
df = df.loc[df['type'] == 'EMAIL', ['vid', 'value']]
Output:
>>> df
vid value
0 1201 [email protected]
2 1202 [email protected]
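To match the column names in the question's expected output, an optional extra step (not part of the original snippet) is a rename plus an index reset:
df = df.rename(columns={'value': 'email'}).reset_index(drop=True)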
Or you can pivot the dataframe after using json_normalize, instead of using .loc:
df = df.pivot(index='vid', columns='type', values='value').rename_axis(None, axis=1).reset_index()
Output:
>>> df
vid EMAIL LEAD_GUID
0 1201 [email protected] 69c4f6ec-e0e9-4632-8d16-cbc204a57b22
1 1202 [email protected] fe6c2628-b1db-47c5-91f6-258e79ea58f0
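One caveat, hedged because the full 300,000-row data isn't shown: pivot raises a duplicate-entries ValueError if any vid ever has two identities of the same type. A pivot_table sketch with aggfunc='first' keeps one value per type instead of failing:
df = (pd.json_normalize([sub for item in d['col2'] for sub in item], record_path='identities', meta='vid')
        .pivot_table(index='vid', columns='type', values='value', aggfunc='first')
        .rename_axis(None, axis=1)
        .reset_index())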