I have the following data frame
# Sample input: three rows, a duplicated "day" value, and some empty-string cells.
df1 = pd.DataFrame(
{
"day": ["monday", "monday","Tuesday" ],
"column0": ["xx", "xx", ""],
"column1": ["yy", "aa", "bb"],
"column2": ["cc", "cc", "cc"],
"column3": ["cc", "", "aa"]})
day column0 column1 column2 column3
0 monday xx yy cc cc
1 monday xx aa cc
2 Tuesday bb cc aa
I want to group by the day, join the grouped rows' columns side by side, and also keep the original row indices in an "index" column.
Expected outcome1:
# Expected outcome 1 (illustrative): one row per day; the original row indices
# are joined into an "index" column, and the value columns of grouped rows are
# appended side by side as column4..column6.
df1 = pd.DataFrame(
{
"day": ["monday", "Tuesday" ],
"index": ["0,1", "2" ],
"column0": ["xx", ""],
"column1": ["yy", "bb"],
"column2": ["cc", "cc"],
"column3": ["cc", "aa"],
"column4": ["xx", ""],
"column5": ["aa", ""],
"column6": ["cc", ""]})
day index column0 column1 column2 column3 column4 column5 column6
0 monday 0,1 xx yy cc cc xx aa cc
1 Tuesday 2 bb cc aa
Finally, I want to remove duplicate values within each row and put NaN in the blank cells.
Final outcome:
# Final outcome (illustrative): duplicates within each row removed, blanks
# replaced with NaN, and all-NaN columns dropped.
# NOTE(review): the key "column5" here disagrees with the "column4" header in
# the printed table below — likely a typo in one of the two; confirm intent.
df1 = pd.DataFrame(
{
"day": ["monday", "Tuesday" ],
"index": ["0,1", "2" ],
"column0": ["xx", "NAN"],
"column1": ["yy", "bb"],
"column2": ["cc", "cc"],
"column3": ["NAN", "aa"],
"column5": ["aa", "NAN"]})
day index column0 column1 column2 column3 column4
0 monday 0,1 xx yy cc NAN aa
1 Tuesday 2 NAN bb cc aa NAN
Any ideas?
CodePudding user response:
This is not perfect but it does work.
# Concatenate both DataFrames
# NOTE(review): `df2` is never defined in this answer — presumably it is the
# grouped "Expected outcome1" frame built beforehand; confirm before running.
df_merged = pd.concat([df1,df2],sort=False, axis=0)
day column0 column1 column2 column3 index column4 column5 column6
0 monday xx yy cc cc
1 monday xx aa cc
2 Tuesday bb cc aa
0 monday xx yy cc cc 0,1 xx aa cc
1 Tuesday bb cc aa 2
# Drop rows with NaN in "Index" column
# Per the printout above, rows from the original df1 have no "index" value,
# so dropping NaN here keeps only the grouped/joined rows.
df_merged.dropna(subset=['index'],inplace=True)
day column0 column1 column2 column3 index column4 column5 column6
0 monday xx yy cc cc 0,1 xx aa cc
1 Tuesday bb cc aa 2
CodePudding user response:
You can use numpy to flatten your grouped dataframes. Then store them in a list and make a dataframe from it.
You can finally replace ""
and None
with NaN
, drop NaN
columns and rename your columns:
import pandas as pd
import numpy as np

# Sample input: duplicated "day" values and some empty-string cells.
df1 = pd.DataFrame(
    {
        "day": ["monday", "monday", "Tuesday"],
        "column0": ["xx", "xx", ""],
        "column1": ["yy", "aa", "bb"],
        "column2": ["cc", "cc", "cc"],
        "column3": ["cc", "", "aa"]})

arr_list = []
for d, sub_df in df1.groupby("day"):
    # Flatten every value column (all but "day") row by row into one flat list.
    arr = list(np.array(sub_df.iloc[:, 1:]).flatten())
    # Prepend the group key and the original row indices.
    # (The scraped version was missing the `+` concatenation here.)
    arr = [d, list(sub_df.index)] + arr
    arr_list.append(arr)

df = pd.DataFrame(arr_list)
# Turn empty strings / padding None into NaN, then drop columns that are all NaN.
df = df.replace('', np.nan).fillna(value=np.nan).dropna(axis=1, how='all')
# Rename: first two columns are the key and the index list, the rest column0..N.
# (The scraped version was also missing the `+` here.)
df.columns = ["day", "index"] + [f"column{i}" for i in range(len(df.columns) - 2)]
print(df)
Output:
day index column0 column1 column2 column3 column4 column5 column6
0 Tuesday [2] NaN bb cc aa NaN NaN NaN
1 monday [0, 1] xx yy cc cc xx aa cc
Edit: if you want to remove duplicates in each row, do it after flattening your array:
# Reset the accumulator so re-running this replacement loop does not keep the
# rows appended by the first version of the loop.
arr_list = []
for d, sub_df in df1.groupby("day"):
    arr = list(np.array(sub_df.iloc[:, 1:]).flatten())
    # removing duplicates for this row:
    arr_unique = []
    for x in arr:
        if x not in arr_unique:
            arr_unique.append(x)
        else:  # appending NaN to keep dataframe form
            arr_unique.append(np.nan)
    # Prepend key and row indices (the scraped version dropped the `+` here).
    arr = [d, list(sub_df.index)] + arr_unique
    arr_list.append(arr)
Output:
day index column0 column1 column2 column3 column4
0 Tuesday [2] NaN bb cc aa NaN
1 monday [0, 1] xx yy cc NaN aa