I have a pandas DataFrame like the following:
# Sample data as (id, value) rows, already ordered by id.
df = pd.DataFrame(
    [
        (1, "GC"), (1, "GD"),
        (2, "GD"), (2, "GQ"),
        (3, "GQ"), (3, "GR"),
        (4, "LA"), (4, "LK"),
        (5, "LK"),
        (6, "HA"), (6, "HE"),
        (7, "HE"), (7, "JB"),
        (8, "JB"), (8, "JF"),
        (9, "JF"), (9, "JJ"),
    ],
    columns=['id', 'value'],
)
I want to group this by id, compare the first value of each group with the last value of the previous group, and generate a new column like the one below.
id value status
1 GC na
1 GD different
2 GD same
2 GQ different
3 GQ same
3 GR different
4 LA different
4 LK different
5 LK same
6 HA different
6 HE different
7 HE same
7 JB different
8 JB same
8 JF different
9 JF same
9 JJ na
I have tried the following code, but it seems to compare the first and last values within the same group:
def check_status(group):
    """Keep the first and last rows of ``group`` and label them.

    The two rows are compared with each other (i.e. within the same
    group): both get ``status == 'same'`` when their ``value`` entries are
    equal and ``'different'`` otherwise.  A one-row group yields that single
    row labelled ``'different'``; an empty group yields an empty frame that
    still carries a ``status`` column.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a ``value`` column, e.g. one group from ``groupby``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the selected rows
        with their original index labels plus the ``status`` column.
    """
    size = len(group)
    # A set collapses first/last into one position for single-row groups.
    positions = sorted({0, size - 1}) if size else []
    # Copy explicitly so the column assignment below never writes into
    # (or warns about writing into) a slice of the caller's frame.
    picked = group.iloc[positions].copy()
    picked['status'] = 'different' if picked['value'].is_unique else 'same'
    return picked
# NOTE(review): `last_first` is not defined in this snippet; presumably it is the DataFrame shown above as `df` (confirm).
last_first.groupby('id').apply(check_status).reset_index(drop=True)
I would appreciate any form of help, thank you.
CodePudding user response:
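Try this: flag the first row of each id group, then compare its value with the value of the row just before it (the last row of the previous group):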
# A row opens a new group when its id differs from the previous row's id.
# Comparing against shift() (instead of diff()) also works for non-numeric ids.
starts_new_group = df['id'].ne(df['id'].shift())
# True where the value equals the previous row's value; at a group start the
# previous row is the last row of the preceding group.
matches_previous_row = df['value'].eq(df['value'].shift())
# Only a group's first row can be 'same'; every other row is 'different'.
status = (starts_new_group & matches_previous_row).map(
    {True: 'same', False: 'different'}
)
if len(status):
    # The very first and very last rows have no neighbouring group to compare with.
    status.iloc[[0, -1]] = 'na'
df['status'] = status
print(df)
>>>
id value status
0 1 GC na
1 1 GD different
2 2 GD same
3 2 GQ different
4 3 GQ same
5 3 GR different
6 4 LA different
7 4 LK different
8 5 LK same
9 6 HA different
10 6 HE different
11 7 HE same
12 7 JB different
13 8 JB same
14 8 JF different
15 9 JF same
16 9 JJ na
CodePudding user response:
IIUC, I tried a straightforward loop check. It is not optimized, but it is another way to do it.
import numpy as np
# Walk the rows once, remembering the previous row's id and value, and collect
# the labels in a list. Assigning the whole list at the end avoids positional
# cell writes (which assumed a default 0..n-1 index and a fixed column order)
# and avoids writing strings into a float column.
statuses = []
last_id = None
last_value = None
for row_id, row_value in zip(df['id'], df['value']):
    if not statuses:
        # First row: there is no previous group to compare with.
        statuses.append(np.nan)
    elif row_id != last_id and row_value == last_value:
        # First row of a new group whose value repeats the previous group's last value.
        statuses.append('same')
    else:
        statuses.append('different')
    last_id = row_id
    last_value = row_value
if statuses:
    # The last row is left unlabelled, like the first one.
    statuses[-1] = np.nan
df['status'] = statuses