import pandas as pd
df = {'Date': ["2011-10-19",
"2013-01-14",
"2014-05-27",
"2014-06-23",
"2014-08-12",
"2014-09-22",
"2014-09-22",
"2014-09-22"
], 'Status': ["Pending",
"Pending",
"Complete",
"Pending",
"Complete",
"Pending",
"Pending",
"Pending"],
'Group': ["a",
"a",
"a",
"a",
"b",
"b",
"b",
"b"]}
df = pd.DataFrame(data=df)
df
I would like to create another variable based on the change in Status over time for each group such that they are considered a "completer" the next row after they have Status = "Complete"
For example I would like to create the "completer" column in the df2 table:
df2 = {'Date': ["2011-10-19",
"2013-01-14",
"2014-05-27",
"2014-06-23",
"2014-08-12",
"2014-09-22",
"2014-09-22",
"2014-09-22"
], 'Status': ["Pending",
"Pending",
"Complete",
"Pending",
"Complete",
"Pending",
"Pending",
"Pending"],
'Group': ["a",
"a",
"a",
"a",
"b",
"b",
"b",
"b"],
'Completer': ["Non-Completer",
"Non-Completer",
"Non-Completer",
"Completer",
"Non-Completer",
"Completer",
"Completer",
"Completer"]}
df2 = pd.DataFrame(data=df2)
df2
Thanks!
CodePudding user response:
I was able to solve it in two steps.
Firstly I created a column which marks the first date that each group has "Completed", and added 1 to it so that we update values starting from the next row.
Secondly, since True
/ False
translates to 1
/ 0
respectively I used a groupby
on Group and cummax
, which would update all following rows per group to True
.
Then lastly I just used replace
and dropped the helping column.
df['first_date_per_group'] = df.index.isin(df.loc[df['Status'].eq('Complete')]['Group'].index 1)
df = df.assign(Completer=df.groupby('Group')['first_date_per_group'].cummax()).replace(
{True:'Completer',False:'Non-Completer'}).drop('first_date_per_group',axis=1)
prints:
Date Status Group Completer
0 2011-10-19 Pending a Non-Completer
1 2013-01-14 Pending a Non-Completer
2 2014-05-27 Complete a Non-Completer
3 2014-06-23 Pending a Completer
4 2014-08-12 Complete b Non-Completer
5 2014-09-22 Pending b Completer
6 2014-09-22 Pending b Completer
7 2014-09-22 Pending b Completer
CodePudding user response:
use transform on the grouped data and assign 1 or nan, then ffill based on the group. this leaves the values from the past as null. Use that to fill the column as completer or non-completer
df['completer']=df.groupby('Group')['Status'].transform(
lambda row: np.where(row.shift(1).eq('Complete'), 1, np.nan ) )
df['completer']=df.groupby('Group')['completer'].ffill()
df['completer'] = np.where(df['completer'].isna(), 'non-completer', 'completer')
df
Date Status Group completer
0 2011-10-19 Pending a non-completer
1 2013-01-14 Pending a non-completer
2 2014-05-27 Complete a non-completer
3 2014-06-23 Pending a completer
4 2014-08-12 Complete b non-completer
5 2014-09-22 Pending b completer
6 2014-09-22 Pending b completer
7 2014-09-22 Pending b completer
CodePudding user response:
You can define a method to ffill
the 'completer' and then change the value to 'non-completer' and then do bfill
def fill_completer(g):
g.loc[g['Status']=='Complete', 'Completer'] = 'Completer'
g['Completer'] = g['Completer'].ffill()
g.loc[g['Status']=='Complete', 'Completer'] = 'Non-Completer'
g['Completer'] = g['Completer'].bfill()
return g
Then apply it to each group as :
df['Completer'] = np.nan
df = df.groupby('Group').apply(fill_completer)
print(df):
Date Status Group Completer
0 2011-10-19 Pending a Non-Completer
1 2013-01-14 Pending a Non-Completer
2 2014-05-27 Complete a Non-Completer
3 2014-06-23 Pending a Completer
4 2014-08-12 Complete b Non-Completer
5 2014-09-22 Pending b Completer
6 2014-09-22 Pending b Completer
7 2014-09-22 Pending b Completer
CodePudding user response:
Late, but I'd still like to add another, quite readable approach I came up with:
df['Completer'] = (df.Status.shift() # shift Status down by one row
.eq("Complete") # mark "Complete" rows
.groupby(df.Group).cumsum() # cumulative sum per group
.map({0: "Non-Completer", 1: "Completer"}) # replace 0s and 1s
)