data = [{'category': 'A', 'maxvalue': '6', 'minvalue': '0'}, {'category': 'B', 'maxvalue': '11', 'minvalue': '6'}, {'category': 'C', 'maxvalue': '21', 'minvalue': '11'}, {'category': 'D', 'maxvalue': '31', 'minvalue': '21'}, {'category': 'E', 'maxvalue': '41', 'minvalue': '31'}, {'category': 'F', 'maxvalue': '9999999999', 'minvalue': '41'}]
this data is inside a data frame and now that dataframe, the format i want is like below along with other columns inside dataframe
A B C D E F
0-6 6-11 11-21 21-31 31-41 41-51
CodePudding user response:
For an example of data as following :
import pandas as pd
import numpy as np
data=[[{'category': 'A', 'maxvalue': '6', 'minvalue': '0'},
{'category': 'B', 'maxvalue': '11', 'minvalue': '6'},
{'category': 'C', 'maxvalue': '21', 'minvalue': '11'},
{'category': 'D', 'maxvalue': '31', 'minvalue': '21'},
{'category': 'E', 'maxvalue': '41', 'minvalue': '31'},
{'category': 'F', 'maxvalue': '9999999999', 'minvalue': '41'}],
[{'category': 'A', 'maxvalue': '5', 'minvalue': '0'},
{'category': 'B', 'maxvalue': '10', 'minvalue': '5'},
{'category': 'C', 'maxvalue': '20', 'minvalue': '10'},
{'category': 'D', 'maxvalue': '30', 'minvalue': '20'},
{'category': 'E', 'maxvalue': '40', 'minvalue': '30'},
{'category': 'F', 'maxvalue': '9999999999', 'minvalue': '41'}],[]]
df=pd.DataFrame(columns=['bucket'])
df['bucket']=data
df
df2=df.bucket.apply(lambda x:pd.DataFrame.from_dict(x))
df2
data=[]
for i in range(len(df2)):
if not(df2.iloc[i].empty):
df2.iloc[i]['value']=df2.iloc[i]['minvalue'] '-' (df2.iloc[i]['maxvalue'])
data.append(df2.iloc[i].T.loc['value'].to_list())
else:
data.append(np.zeros(len(df2.iloc[0].T.loc['category'].to_list())))
dfinal=pd.DataFrame(data,columns=df2.iloc[0].T.loc['category'].to_list())
final
Else if you want it to reconfigure in the same dataframe :
df[dfinal.columns]=data
df
CodePudding user response:
One solution could be as follows:
import pandas as pd
import numpy as np
# data set without duplicates in "category"
lst = [[{'category': 'A', 'maxvalue': '4', 'minvalue': '0'},
{'category': 'B', 'maxvalue': '51', 'minvalue': '41'}],
np.nan,
[{'category': 'A', 'maxvalue': '6', 'minvalue': '0'},
{'category': 'B', 'maxvalue': '21', 'minvalue': '11'}]
]
df = pd.DataFrame(columns=['bucket'])
df['bucket'] = lst
# code
from itertools import chain
# select notna rows
out = df[df['bucket'].notna()]
# get dict keys in separate cols with their associated vals as the vals
# repeat index from df according to len of each list
out = pd.DataFrame(list(chain.from_iterable(out['bucket'])),
index=pd.Index.repeat(out.index, out['bucket'].str.len()))
out['result'] = (out.minvalue.astype(str) '-' out.maxvalue.astype(str))
# =============================================================================
# solution 1: no duplicate categories per row
# =============================================================================
out1 = out.copy()
out1 = out1.pivot(index=None, columns='category', values='result').fillna('')
print(out1)
category A B
0 0-4 41-51
2 0-6 11-21
# =============================================================================
# solution 2: duplicate categories per row, e.g. 2x "A" in row 0
# =============================================================================
lst = [[{'category': 'A', 'maxvalue': '4', 'minvalue': '0'},
{'category': 'A', 'maxvalue': '51', 'minvalue': '41'}],
np.nan,
[{'category': 'A', 'maxvalue': '6', 'minvalue': '0'},
{'category': 'B', 'maxvalue': '21', 'minvalue': '11'}]
]
# continue after `out['result'] = (out.minvalue...)`
out2 = out.copy()
out2['result'] = out2.groupby([out2.index,'category'])['result']\
.transform(lambda x: ','.join(x))
out2.set_index('category', append=True, inplace=True)
out2 = out2[~out2.index.duplicated(keep='first')]['result']
out2 = out2.reset_index().set_index('level_0').rename_axis(None, axis=0)
out2 = out2.pivot(index=None, columns='category', values='result').fillna('')
print(out2)
category A B
0 0-4,41-51
2 0-6 11-21
# =============================================================================
# solution 3: same data as 2, but only keeping first of duplicates
# =============================================================================
# continue after `out['result'] = (out.minvalue...)`
out3 = out.copy()
out3 = out3.reset_index(drop=False)\
.drop_duplicates(['index','category'])\
.set_index('index', drop=True).rename_axis(None, axis=0)
out3 = out3.pivot(index=None, columns='category', values='result').fillna('')
print(out3) # only printing 0-4, not 41-51 for "A" in row 0
category A B
0 0-4
2 0-6 11-21