I have some data that looks like this...
import pandas as pd
df = pd.DataFrame(
[[1, 2, 3, 4, 5, 6], [5, 6, 7, 8, 9, 10]],
columns=[
['a', 'a', 'b', 'b', 'c', 'c'],
['col1', 'col2', 'col1', 'col2', 'col3', 'col4']
]
)
# a b c
# col1 col2 col1 col2 col3 col4
# 0 1 2 3 4 5 6
# 1 5 6 7 8 9 10
I want to have this formatted as...
formatted_data
# [{'a': {'col1': 1, 'col2': 2},
# 'b': {'col1': 3, 'col2': 4},
# 'c': {'col3': 5, 'col4': 6}},
# {'a': {'col1': 5, 'col2': 6},
# 'b': {'col1': 7, 'col2': 8},
# 'c': {'col3': 9, 'col4': 10}}]
I have indeed achieved what I want, but the solution is either unreadable, poorly efficient, or <insert_random_negative_term>. And so I would like to know if there are some other ways of achieving this in a more readable manner. Preferably staying inside the pandas
api. Efficiency is not important here, as the data is small, and the operation must be performed once.
def get_multi_index_key(data, col_name):
return data[col_name].to_dict("records")
def preformat_data(df, columns):
multi_index_values = zip(*[get_multi_index_key(df, key) for key in columns])
for row in multi_index_values:
yield dict(
zip(columns, row)
)
formatted_data = preformat_data(df, ["a", "b", "c"])
next(formatted_data)
# {'a': {'col1': 1, 'col2': 2}, 'b': {'col1': 3, 'col2': 4}, 'c': {'col3': 5.0, 'col4': 6.0}}
The problem with the solutions that have been proposed is that the columns at level=1
are all shared between the MultiIndexes, meaning that the result is...
[{'a': {'col1': 1.0, 'col2': 2.0, 'col3': nan, 'col4': nan},
'b': {'col1': 3.0, 'col2': 4.0, 'col3': nan, 'col4': nan},
'c': {'col1': nan, 'col2': nan, 'col3': 5.0, 'col4': 6.0}},
{'a': {'col1': 5.0, 'col2': 6.0, 'col3': nan, 'col4': nan},
'b': {'col1': 7.0, 'col2': 8.0, 'col3': nan, 'col4': nan},
'c': {'col1': nan, 'col2': nan, 'col3': 9.0, 'col4': 10.0}}]
CodePudding user response:
You may try with for loop with to_dict
out = [df.loc[[x]].stack().reset_index(level=0,drop=True).to_dict() for x in df.index]
Out[608]:
[{'a': {'col1': 1, 'col2': 2}, 'b': {'col1': 3, 'col2': 4}},
{'a': {'col1': 5, 'col2': 6}, 'b': {'col1': 7, 'col2': 8}}]
CodePudding user response:
Pandas solution
s = df.stack(0)
s['r'] = s.to_dict('r')
s['r'].unstack().to_dict('r')
General pandas solution corresponding to OP's update:
s = df.melt(var_name=['l0', 'l1'], ignore_index=False)
s.groupby([s.index, 'l0']).apply(lambda s: dict(zip(s.l1, s.value))).unstack().to_dict('records')
[{'a': {'col1': 1, 'col2': 2},
'b': {'col1': 3, 'col2': 4},
'c': {'col3': 5, 'col4': 6}},
{'a': {'col1': 5, 'col2': 6},
'b': {'col1': 7, 'col2': 8},
'c': {'col3': 9, 'col4': 10}}]
CodePudding user response:
This would work
def nest(d: dict) -> dict:
result = {}
for key, value in d.items():
target = result
for k in key[:-1]: # traverse all keys but the last
target = target.setdefault(k, {})
target[key[-1]] = value
return result
def df_to_nested_dict(df: pd.DataFrame) -> dict:
d = df.to_dict(orient='index')
return {k: nest(v) for k, v in d.items()}
result = df_to_nested_dict(df)
If you print result
, you will get the desired output -
{
0: {
'a': {'col1': 1, 'col2': 2},
'b': {'col1': 3, 'col2': 4}
},
1: {
'a': {'col1': 5, 'col2': 6},
'b': {'col1': 7, 'col2': 8}
}
}