I have dataframe like as below
cust_id,purchase_date
1,10/01/1998
1,10/12/1999
2,13/05/2016
3,14/02/2018
3,15/03/2019
I would like to do the below
a) display the output in text format as 5 years and 9 months
instead of 5.93244
etc.
I tried the below
from datetime import timedelta
df['purchase_date'] = pd.to_datetime(df['purchase_date'])
gb = df_new.groupby(['unique_key'])
df_cust_age = gb['purchase_date'].agg(min_date=np.min, max_date=np.max).reset_index()
df_cust_age['diff_in_days'] = df_cust_age['max_date'] - df_cust_age['min_date']
df_cust_age['years_diff'] = df_cust_age['diff_in_days']/timedelta(days=365)
but the above code gives the output in decimal numbers
.
I expect my output to be like as below
cust_id,years_diff
1, 1 years and 11 months and 0 day
2, 0 years
3, 1 year and 1 month and 1 day
CodePudding user response:
If possible create 'default'
month with 30 days use this custom function:
#https://stackoverflow.com/a/13756038/2901002
def td_format(td_object):
seconds = int(td_object.total_seconds())
periods = [
('year', 60*60*24*365),
('month', 60*60*24*30),
('day', 60*60*24),
('hour', 60*60),
('minute', 60),
('second', 1)
]
strings=[]
for period_name, period_seconds in periods:
if seconds > period_seconds:
period_value , seconds = divmod(seconds, period_seconds)
has_s = 's' if period_value > 1 else ''
strings.append("%s %s%s" % (period_value, period_name, has_s))
return ", ".join(strings) if len(strings) > 0 else '0 year'
df_cust_age['years_diff'] = df_cust_age['diff_in_days'].apply(td_format)
print (df_cust_age)
cust_id min_date max_date diff_in_days years_diff
0 1 1998-10-01 1999-10-12 376 days 1 year, 11 days
1 2 2016-05-13 2016-05-13 0 days 0 year
2 3 2018-02-14 2019-03-15 394 days 1 year, 29 days
CodePudding user response:
from io import StringIO
import pandas as pd
from dateutil.relativedelta import relativedelta as RD
string_data = '''unique_key,purchase_date
1,10/01/1998
1,10/12/1999
2,13/05/2016
3,14/02/2018
3,15/03/2019'''
## Custom functions
diff_obj = lambda d1,d2:RD(d1, d2) if d1>d2 else RD(d2, d1)
date_tuple = lambda diff:(diff.years,diff.months,diff.days)
pipeline = lambda row:date_tuple(diff_obj(row['min_date'],row['max_date']))
def string_format(date_tuple):
final_string = []
for val,name in zip(date_tuple,['years','months','day']):
if val:
final_string.append(f'{val} {name}')
return ' and '.join(final_string) if final_string else '0 years'
## Custom functions
df = pd.read_csv(StringIO(string_data))
df['purchase_date'] = pd.to_datetime(df['purchase_date'],format='%d/%m/%Y')
gb = df.groupby(['unique_key'])
df_cust_age = gb['purchase_date'].agg(min_date=np.min, max_date=np.max).reset_index()
df_cust_age['years_diff'] = df_cust_age.apply(pipeline,axis=1).apply(string_format)
print(df_cust_age)
unique_key min_date max_date years_diff
0 1 1998-01-10 1999-12-10 1 years and 11 months
1 2 2016-05-13 2016-05-13 0 years
2 3 2018-02-14 2019-03-15 1 years and 1 months and 1 day