Pandas DataFrame: comparing the last value of one group with the first value of the next group - CodePudding

I have a pandas DataFrame like the following:

# Sample data, one (id, value) record per row; rows with the same id are
# adjacent and form a group.
df = pd.DataFrame(
    [
        (1, "GC"), (1, "GD"),
        (2, "GD"), (2, "GQ"),
        (3, "GQ"), (3, "GR"),
        (4, "LA"), (4, "LK"),
        (5, "LK"),
        (6, "HA"), (6, "HE"),
        (7, "HE"), (7, "JB"),
        (8, "JB"), (8, "JF"),
        (9, "JF"), (9, "JJ"),
    ],
    columns=['id', 'value'],
)

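I want to group this by id, compare the first value of each group with the last value of the previous group, and generate a new column like the one below. Rows that are not the first row of their group are marked "different", and the very first and very last rows are marked "na".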


id  value   status
1   GC  na
1   GD  different 
2   GD  same 
2   GQ  different 
3   GQ  same 
3   GR  different
4   LA  different
4   LK  different 
5   LK  same 
6   HA  different
6   HE  different 
7   HE  same 
7   JB  different
8   JB  same
8   JF  different
9   JF  same
9   JJ  na

I have tried the following code, but it only compares the first and last values within the same group:

def check_status(group):
    """Keep the first and last rows of ``group`` and label them.

    Both kept rows receive the same ``status``: ``'different'`` when the
    kept ``value`` entries are all distinct, ``'same'`` otherwise. A group
    with a single row keeps just that row, so it is labelled
    ``'different'``.

    Note that the comparison is made *within* one group (its own first and
    last rows), not between neighbouring groups.

    Args:
        group: DataFrame with a ``value`` column.

    Returns:
        A new DataFrame holding the selected rows plus a ``status`` column.
        ``group`` itself is left unmodified.
    """
    if len(group) == 0:
        # Nothing to select; indexing position 0 of an empty mask would raise.
        return group.assign(status=[])

    selected = [False] * len(group)
    selected[0] = selected[-1] = True
    # Copy explicitly so the new column is written to an independent frame
    # rather than to a selection derived from ``group``; this avoids
    # pandas' SettingWithCopyWarning.
    new_group = group[selected].copy()
    new_group['status'] = 'different' if new_group.value.is_unique else 'same'
    return new_group

# Run check_status on every id group of the sample frame `df` and flatten
# the per-group results back into a single frame with a fresh index.
df.groupby('id').apply(check_status).reset_index(drop=True)

I would appreciate any help. Thank you.

CodePudding user response：

try this:

# True on the first row of every run of equal ids. Comparing against the
# shifted column (instead of diff()) also works when ids are not numeric.
mask = df['id'].ne(df['id'].shift())
# True where a row's value equals the value on the row directly above it.
same_as_previous_row = df['value'].eq(df['value'].shift())
# Only the first row of a group can be 'same': its value must repeat the
# last value of the previous group. Every other row is 'different'.
tmp = (mask & same_as_previous_row).map({True: 'same', False: 'different'})
if len(tmp) > 0:
    # The very first and very last rows are reported as 'na'; the guard
    # keeps an empty frame from raising on positional indexing.
    tmp.iloc[[0, -1]] = 'na'
df['status'] = tmp
print(df)
>>>
   id   value   status
0   1   GC      na
1   1   GD      different
2   2   GD      same
3   2   GQ      different
4   3   GQ      same
5   3   GR      different
6   4   LA      different
7   4   LK      different
8   5   LK      same
9   6   HA      different
10  6   HE      different
11  7   HE      same
12  7   JB      different
13  8   JB      same
14  8   JF      different
15  9   JF      same
16  9   JJ      na

CodePudding user response：

IIUC, I tried a straightforward loop-based check. It is not optimized, but it is another way to do it.

import numpy as np
# Walk the rows once, remembering the previous row's id and value, and
# collect one status per row. The statuses are gathered in a plain list and
# assigned as a whole column at the end, so the result does not depend on
# the index labels or on the position of the 'status' column.
last_id = None
last_value = None
statuses = []
for position, (row_id, row_value) in enumerate(zip(df['id'], df['value'])):
    if position == 0:
        # The first row has no previous group to compare with.
        statuses.append(np.nan)
    elif row_id != last_id and row_value == last_value:
        # First row of a new group repeating the previous group's last value.
        statuses.append('same')
    else:
        # Either a later row inside the same group, or a new group whose
        # first value differs from the previous group's last value.
        statuses.append('different')
    last_id, last_value = row_id, row_value

if statuses:
    # The last row is left missing as well (NaN is this snippet's marker
    # for the rows the question labels "na").
    statuses[-1] = np.nan
df['status'] = statuses