I am trying to add missing months for each ID. Added months should have info on ID and year_month, and NaN for product. My code achieves this using apply(), but it is slow -- I am looking for a vectorized version that runs significantly faster. Specifically, df.set_index(df.index).groupby('ID').apply(add_missing_months) takes about 20 seconds on my system with 60 000 rows. I plan to work with data with millions of rows, so I think I need to vectorize the operation. Any help is highly appreciated!
import pandas as pd
df = pd.DataFrame({'ID': [1, 1, 1, 2, 2, 3], 'year_month': ['2020-01-01','2020-08-01','2020-10-01','2020-01-01','2020-07-01','2021-05-01'], 'product':['A','B','C','A','D','C']})
# Enlarge dataset to 60 000 rows
for i in range(9999):
    df2 = df.iloc[-6:].copy()
    df2['ID'] = df2['ID'] + 3
    df = pd.concat([df, df2], axis=0, ignore_index=True)
df['year_month'] = pd.to_datetime(df['year_month'])
df.index = pd.to_datetime(df['year_month'], format = '%Y%m%d')
df = df.drop('year_month', axis = 1)
# The slow function
def add_missing_months(s):
    min_d = s.index.min()
    max_d = s.index.max()
    s = s.reindex(pd.date_range(min_d, max_d, freq='MS'))
    return s
df = df.set_index(df.index).groupby('ID').apply(add_missing_months)
df = df.drop('ID', axis = 1)
df = df.reset_index()
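For reference, this is what the desired output looks like for ID 1 on the small 6-row example (a hand-written sketch of the target, not produced by the code above; column order may differ):
# ID 1 spans 2020-01 through 2020-10, so 7 NaN-product months are added to its 3 rows
expected_id1 = pd.DataFrame({
    'year_month': pd.date_range('2020-01-01', '2020-10-01', freq='MS'),
    'ID': 1,
    'product': ['A', None, None, None, None, None, None, 'B', None, 'C'],
})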
CodePudding user response:
Not sure if faster, but simpler code is:
df = df.sort_index().groupby('ID').apply(lambda x: x.asfreq('MS'))
It returns the same result as the original add_missing_months approach:
df1 = df.groupby('ID').apply(lambda x: x.asfreq('MS'))
df2 = df.set_index(df.index).groupby('ID').apply(add_missing_months)
print (df1.equals(df2))
True
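For intuition, asfreq('MS') on a single group reindexes it to a complete month-start range between its first and last date. A minimal standalone sketch (toy data, not part of the answer's code):
s = pd.Series(['A', 'B'], index=pd.to_datetime(['2020-01-01', '2020-04-01']))
print(s.asfreq('MS'))
# 2020-01-01      A
# 2020-02-01    NaN
# 2020-03-01    NaN
# 2020-04-01      B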
EDIT: To improve performance, create month periods with Series.dt.to_period, aggregate the minimal and maximal period per ID, repeat each ID's row by the difference between them with Index.repeat, add a per-ID counter with GroupBy.cumcount to build the month ranges, convert back to timestamps with Series.dt.to_timestamp, and finally use a left join:
df1 = (df.assign(year_month = df['year_month'].dt.to_period('m'))
         .groupby(['ID'])['year_month']
         .agg(['min', 'max']))

diff = df1['max'].astype('int').sub(df1['min'].astype('int')) + 1
df1 = df1.loc[df1.index.repeat(diff)]

df1 = (df1['min'].add(df1.groupby(level=0).cumcount())
          .dt.to_timestamp()
          .reset_index(name='year_month'))

df = df1.merge(df.rename_axis(None), how='left')
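The core of the speed-up is the repeat-plus-cumcount expansion. A minimal toy illustration of just that step (made-up IDs and months, separate from the code above):
agg = pd.DataFrame({'min': pd.PeriodIndex(['2020-01', '2020-01'], freq='M'),
                    'max': pd.PeriodIndex(['2020-03', '2020-02'], freq='M')},
                   index=pd.Index([1, 2], name='ID'))
# number of months per ID, via the integer ordinals of the periods
n = pd.PeriodIndex(agg['max']).asi8 - pd.PeriodIndex(agg['min']).asi8 + 1
rep = agg.loc[agg.index.repeat(n)]                       # one row per wanted month
months = rep['min'] + rep.groupby(level=0).cumcount()    # min, min+1, min+2, ...
print(months.tolist())
# [Period('2020-01', 'M'), Period('2020-02', 'M'), Period('2020-03', 'M'),
#  Period('2020-01', 'M'), Period('2020-02', 'M')]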
Performance:
In [276]: %timeit jez(df)
126 ms ± 7.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [277]: %timeit vogel(df)
312 ms ± 32.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
df = pd.DataFrame({'ID': [1, 1, 1, 2, 2, 3], 'year_month': ['2020-01-01','2020-08-01','2020-10-01','2020-01-01','2020-07-01','2021-05-01'], 'product':['A','B','C','A','D','C']})
# Enlarge dataset to 60 000 rows
for i in range(9999):
    df2 = df.iloc[-6:].copy()
    df2['ID'] = df2['ID'] + 3
    df = pd.concat([df, df2], axis=0, ignore_index=True)
df['year_month'] = pd.to_datetime(df['year_month'])
df.index = pd.to_datetime(df['year_month'], format = '%Y%m%d')
def jez(df):
    df1 = df.assign(year_month = df['year_month'].dt.to_period('m')).groupby(['ID'])['year_month'].agg(['min', 'max'])
    df1 = df1.loc[df1.index.repeat(df1['max'].astype('int').sub(df1['min'].astype('int')) + 1)]
    df1 = (df1['min'] + df1.groupby(level=0).cumcount()).dt.to_timestamp().reset_index(name='year_month')
    return df1.merge(df.rename_axis(None), how='left')
def vogel(df):
    min_d = df['year_month'].min()
    max_d = df['year_month'].max()
    # generate all possible combinations of date and ID
    df_agg = df.groupby(['ID'])['year_month'].agg(['min', 'max'])
    df_all = pd.DataFrame(
        index=pd.MultiIndex.from_product(
            [pd.date_range(min_d, max_d, freq='MS'), df_agg.index]
        )
    )
    # reduce to only relevant dates
    df_all = df_all.merge(df_agg, left_on='ID', right_index=True)
    df_all = df_all.reset_index().rename(columns={'level_0': 'year_month'})
    df_all = df_all[df_all['year_month'].between(df_all['min'], df_all['max'])]
    df_all = df_all.drop(columns=['min', 'max'])
    # add product information by joining back to the original data
    # (rename_axis(None) avoids 'year_month' being both an index name and a column)
    df_all = df_all.merge(df.rename_axis(None), how='left')
    return df_all
CodePudding user response:
The code is a lot more convoluted and could probably be improved a bit, but it does seem to be much faster (down from 16 to 0.2 seconds on my system, starting from # my code starts here):
import pandas as pd
df = pd.DataFrame({'ID': [1, 1, 1, 2, 2, 3], 'year_month': ['2020-01-01','2020-08-01','2020-10-01','2020-01-01','2020-07-01','2021-05-01'], 'product':['A','B','C','A','D','C']})
# Enlarge dataset to 60 000 rows
for i in range(9999):
    df2 = df.iloc[-6:].copy()
    df2['ID'] = df2['ID'] + 3
    df = pd.concat([df, df2], axis=0, ignore_index=True)
df['year_month'] = pd.to_datetime(df['year_month'])
# my code starts here
# find overall min and max date
min_d = df['year_month'].min()
max_d = df['year_month'].max()
# generate all possible combinations of date and ID
df_agg = df.groupby(['ID'])['year_month'].agg(['min', 'max'])
df_all = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [pd.date_range(min_d, max_d, freq='MS'), df_agg.index]
    )
)
# reduce to only relevant dates
df_all = df_all.merge(df_agg, left_on='ID', right_index=True)
df_all = df_all.reset_index().rename(columns={'level_0': 'year_month'})
df_all = df_all[df_all['year_month'].between(df_all['min'], df_all['max'])]
df_all = df_all.drop(columns=['min', 'max'])
# add product information by joining back to the original data
df = df_all.merge(df, how='left')
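A quick sanity check (a sketch, assuming the snippet above has just run; ID 1 only occurs in the first three input rows, spanning 2020-01 to 2020-10):
id1 = df[df['ID'] == 1].sort_values('year_month')
assert len(id1) == 10                       # Jan 2020 through Oct 2020
assert id1['product'].notna().sum() == 3    # only the original A, B, C rows keep a product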