I have a dataframe looks like this:
import pandas as pd
df = pd.DataFrame({'type_a': [1,0,0,0,0,1,0,0,0,1],
'type_b': [0,1,0,0,0,0,0,0,1,1],
'type_c': [0,0,1,1,1,1,0,0,0,0],
'type_d': [1,0,0,0,0,1,1,0,1,0],
})
I wanna create a new column based on those 4 columns, it will return the column names whenever the value in those 4 columns equals to 1, if there are multiple columns equal to 1 at the same time then it will return the list of those columns names, otherwise it will be nan.
The output dataframe will look like this:
df = pd.DataFrame({'type_a': [1,0,0,0,0,1,0,0,0,1],
'type_b': [0,1,0,0,0,0,0,0,1,1],
'type_c': [0,0,1,1,1,1,0,0,0,0],
'type_d': [1,0,0,0,0,1,1,0,1,0],
'type':[['type_a','type_d'], 'type_b', 'type_c', 'type_c','type_c', ['type_a','type_c','type_d'], 'type_d', 'nan', ['type_b','type_d'],['type_a','type_b']]
})
Any help will be really appreciated. Thanks!
CodePudding user response:
This is also another way:
import pandas as pd
df['type'] = (pd.melt(df.reset_index(), id_vars='index')
.query('value == 1')
.groupby('index')['variable']
.apply(lambda x:[str for str in x]))
type_a type_b type_c type_d type
0 1 0 0 1 [type_a, type_d]
1 0 1 0 0 [type_b]
2 0 0 1 0 [type_c]
3 0 0 1 0 [type_c]
4 0 0 1 0 [type_c]
5 1 0 1 1 [type_a, type_c, type_d]
6 0 0 0 1 [type_d]
7 0 0 0 0 NaN
8 0 1 0 1 [type_b, type_d]
9 1 1 0 0 [type_a, type_b]
CodePudding user response:
You can use this answer and adapt to your case.
import pandas as pd
df = pd.DataFrame({'type_a': [1,0,0,0,0,1,0,0,0,1],
'type_b': [0,1,0,0,0,0,0,0,1,1],
'type_c': [0,0,1,1,1,1,0,0,0,0],
'type_d': [1,0,0,0,0,1,1,0,1,0],
})
df['type'] = df.dot(df.columns ',')\
.str.rstrip(',')\
.apply(lambda x: x.split(','))
Where the output is
type_a type_b type_c type_d type
0 1 0 0 1 [type_a, type_d]
1 0 1 0 0 [type_b]
2 0 0 1 0 [type_c]
3 0 0 1 0 [type_c]
4 0 0 1 0 [type_c]
5 1 0 1 1 [type_a, type_c, type_d]
6 0 0 0 1 [type_d]
7 0 0 0 0 []
8 0 1 0 1 [type_b, type_d]
9 1 1 0 0 [type_a, type_b]
Edit 1
The general case will be
df['type'] = df.eq(1).dot(df.columns ',')\
.str.rstrip(',')\
.apply(lambda x: x.split(','))
Edit 2
Eventually you can avoid lambda (in case your dataframe is big)
df['type'] = df.eq(1).dot(df.columns ',')\
.str.rstrip(',')\
.str.split(',')
CodePudding user response:
here is one more way
df.mul(df.columns).apply(lambda x: list(pd.Series(i for i in x if len(i)>0)), axis=1)
0 [type_a, type_d]
1 [type_b]
2 [type_c]
3 [type_c]
4 [type_c]
5 [type_a, type_c, type_d]
6 [type_d]
7 []
8 [type_b, type_d]
9 [type_a, type_b]
CodePudding user response:
You can use
df.apply(lambda x: df.columns[x.eq(1)].tolist(), axis=1)
To create an output like this:
0 [type_a, type_d]
1 [type_b]
2 [type_c]
3 [type_c]
4 [type_c]
5 [type_a, type_c, type_d]
6 [type_d]
7 []
8 [type_b, type_d]
9 [type_a, type_b]
You can then format that into your required output using a custom function:
def f(l):
return l if len(l) > 1 else next(iter(l), np.NaN)
df["type"] = df.apply(lambda x: f(df.columns[x.eq(1)].tolist()), axis=1)
CodePudding user response:
And another one:
import pandas as pd
import numpy as np
df = pd.DataFrame({'type_a': [1,0,0,0,0,1,0,0,0,1],
'type_b': [0,1,0,0,0,0,0,0,1,1],
'type_c': [0,0,1,1,1,1,0,0,0,0],
'type_d': [1,0,0,0,0,1,1,0,1,0],
})
df['type']=''
for i,r in df.iterrows():
t=[k for k in r.keys() if r[k]==1]
if t:
if len(t)==1:
df.at[i,'type']=t[0]
else:
df.at[i,'type']=t
else:
df.at[i,'type']=np.nan