I have the following scenario in python with Pandas, below is for just an example in fact the real dataframe returns 15k objects for projects and another 15k objects for products
#cur is a connector to snowflake instance
rows = pd.DataFrame(cur.fetchall(), columns=['product', 'project'])
json_data = rows.to_dict(orient='dict')
print('projects_products =', json_data)
projects_products = {
"project": {
'0': 'project0',
'1': 'project0',
'2': 'project1',
'3': 'project1',
'4': 'project2',
'5': 'project2',
},
"product":{
'0': 'product1',
'1': 'product2',
'2': 'product3',
'3': 'product4',
'4': 'product5',
'5': 'product6',
}
}
How in pandas I can convert these data from above format to below?
merged_list = [
{ "key": 'project0', "cat": 'product1' },
{ "key": 'project0', "cat": 'product2' },
{ "key": 'project1', "cat": 'product3' },
{ "key": 'project1', "cat": 'product4' },
{ "key": 'project2', "cat": 'product5' },
{ "key": 'project2', "cat": 'product6' },
]
I tried going the primitive way, but it takes ages
projects_products = {
"response": {
"product": { ...
},
"project": { ...
}
}
}
def convert_dict():
projects_list = []
for project in projects_products['response']['project']:
for product in projects_products['response']['product']:
if projects_products['response']['project'][project] == projects_products['response']['project'][product]:
projects_list.append({ "key": projects_products['response']['product'][product], "cat": projects_products['response']['project'][project] })
return projects_list
start = timeit.default_timer()
convert_dict()
end = timeit.default_timer()
print(end-start)
The conversion took 109 seconds
$ python3 convert.py
109.38614280600001
CodePudding user response:
rename
columns before call to_dict(orient='records')
:
>>> rows.rename(columns={'project': 'key', 'product': 'cat'}) \
.to_dict(orient='records')
[{'key': 'project0', 'cat': 'product1'},
{'key': 'project0', 'cat': 'product2'},
{'key': 'project1', 'cat': 'product3'},
{'key': 'project1', 'cat': 'product4'},
{'key': 'project2', 'cat': 'product5'},
{'key': 'project2', 'cat': 'product6'}]
For 15k rows, the conversion took:
>>> len(rows)
15000
>>> %timeit rows.rename(columns={'project': 'key', 'product': 'cat'}).to_dict(orient='records')
23.5 ms ± 123 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)