With a DataFrame like:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'id_1': [33, 33, 33, 33, 22, 22, 88, 100],
    'id_2': [64, 64, 64, 64, 12, 12, 77, 100],
    'col_1': [np.nan, 'dog', np.nan, 'kangaroo', np.nan, np.nan, np.nan, np.nan],
    'col_2': ['bike', 'car', np.nan, np.nan, 'train', np.nan, 'horse', np.nan],
    'col_3': [np.nan, np.nan, 'star', 'meteor', np.nan, 'rock', np.nan, np.nan]
})
"""
id_1 id_2 col_1 col_2 col_3
0 33 64 NaN bike NaN
1 33 64 dog car NaN
2 33 64 NaN NaN star
3 33 64 kangaroo NaN meteor
4 22 12 NaN train NaN
5 22 12 NaN NaN rock
6 88 77 NaN horse NaN
7 100 100 NaN NaN NaN
"""
How can it be transformed into the minimum number of rows, without aggregating or losing any data, so that it looks like the following?
   id_1  id_2     col_1  col_2   col_3
0    33    64       dog   bike    star
1    33    64  kangaroo    car  meteor
3    22    12       NaN  train    rock
4    88    77       NaN  horse     NaN
5   100   100       NaN    NaN     NaN
Basically, for each group of id_X columns, the col_X columns' NaN values are replaced with values from other rows of the same group, where applicable.
CodePudding user response:
# melt (wide to long) on id_1, id_2 and sort the values;
# the default na_position='last' pushes the NaN to the bottom of each group
df2 = (df.melt(id_vars=['id_1', 'id_2'], var_name='col')
         .sort_values(['id_1', 'id_2', 'col', 'value']))

# create a seq to make the keys unique, then pivot
df3 = (df2.assign(seq=df2.groupby(['id_1', 'id_2', 'col']).cumcount())
          .pivot(index=['id_1', 'id_2', 'seq'], columns='col', values='value')
          .reset_index())

# for id_1 == 100 every value is NaN and the row should still be kept,
# so only remove all-NaN rows when seq > 0
df3 = df3.loc[~((df3['seq'] > 0) &
                (df3[['col_1', 'col_2', 'col_3']].isna().all(axis=1)))]

# drop the temporary seq column
df3.drop(columns='seq', inplace=True)
df3
col  id_1  id_2     col_1  col_2   col_3
0      22    12       NaN  train    rock
2      33    64       dog   bike  meteor
3      33    64  kangaroo    car    star
6      88    77       NaN  horse     NaN
7     100   100       NaN    NaN     NaN
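If the leftover 'col' columns-axis name and the gappy index bother you, a small optional cleanup (a sketch, not part of the answer above):
# remove the columns-axis name left behind by pivot and renumber the rows
df3.columns.name = None
df3 = df3.reset_index(drop=True)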
CodePudding user response:
Another possible solution:
# push all non-NaN values to the top of each column within each group
df.loc[:, 'col_1':'col_3'] = df.groupby(
    ['id_1', 'id_2'], sort=False).transform(lambda x: sorted(x, key=pd.isnull))

# remove the rows that are now entirely NaN, keeping the first row of each group
df.loc[~(df.duplicated(['id_1', 'id_2']) &
         df.loc[:, 'col_1':'col_3'].isna().all(axis=1))]
Output:
   id_1  id_2     col_1  col_2   col_3
0    33    64       dog   bike    star
1    33    64  kangaroo    car  meteor
4    22    12       NaN  train    rock
6    88    77       NaN  horse     NaN
7   100   100       NaN    NaN     NaN
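The key=pd.isnull trick works because False sorts before True and Python's sort is stable, so non-null values keep their relative order while the NaN sink to the bottom of each group. A minimal illustration:
# pd.isnull maps NaN to True and everything else to False
sorted([np.nan, 'dog', np.nan, 'kangaroo'], key=pd.isnull)
# ['dog', 'kangaroo', nan, nan]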
CodePudding user response:
To avoid illegible Pandas voodoo, after your imports and df instantiation, you can do
def get_max_vals_from_row_sets(row, cols):
    # a group needs as many output rows as its largest set of values
    max_len = 1
    for col in cols:
        max_len = max(max_len, len(row[col]))
    return max_len

def add_id_row(d, row, ids, cols):
    # emit one output row per remaining value, repeating the id columns
    max_vals = get_max_vals_from_row_sets(row, cols)
    for _ in range(max_vals):
        for id_ in ids:
            d[id_].append(row[id_])
        for col in cols:
            if len(row[col]) != 0:
                d[col].append(row[col].pop())
            else:
                d[col].append(np.nan)

def drop_set_nans(row, cols):
    # NaN entries inside the sets are noise; only real values should be unpacked
    for col in cols:
        if np.nan in row[col]:
            row[col].remove(np.nan)
    return row

def squash_out_redundant_nans(df, ids, cols):
    d = {k: [] for k in df.columns}
    for _, row in df.iterrows():
        drop_set_nans(row, cols)
        add_id_row(d, row, ids, cols)
    return pd.DataFrame(d)

ids = ['id_1', 'id_2']
cols = ['col_1', 'col_2', 'col_3']

# collapse each (id_1, id_2) group into one row of value sets, then unpack
df1 = df.groupby(ids).agg(set).reset_index()
df = squash_out_redundant_nans(df1, ids, cols)
print(df)
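Note that set.pop() returns an arbitrary element, so the pairing of values within a group (whether 'dog' lands next to 'bike' or 'car', say) is not guaranteed. If you only care that no data is lost, a quick sanity check could compare the non-null values per group before and after. A sketch, assuming you saved a copy of the original frame as original = df.copy() before the groupby (the name and helper below are illustrative, not part of the answer):
def group_values(frame):
    # sorted list of non-null values per (id_1, id_2) group
    melted = frame.melt(id_vars=ids, value_vars=cols).dropna(subset=['value'])
    return melted.groupby(ids)['value'].apply(sorted).to_dict()

assert group_values(original) == group_values(df)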