Assume that I have a Pandas data frame that looks like this:
df = pd.DataFrame({
"YEAR":[2000,2000,2001,2001,2002],
"VISITORS":[100,2000,200,300,250],
"SALES":[5000,2500,23500,1512,3510],
"MONTH":[1,2,1,2,1],
"LOCATION":["Loc1", "Loc2", "Loc1" , "Loc2" , "Loc1"]})
I want to join this data frame on MONTH
, LOCATION
columns with a previous year data of the same Pandas data frame.
I tried this:
def calculate(df):
result_all_years = []
for current_year in df["YEAR"].unique():
df_previous = df.copy()
df_previous = df_previous[df_previous["YEAR"] == current_year - 1]
df_previous.rename(
columns={
"VISITORS": "VISITORS_LAST_YEAR",
"SALES": "SALES_LAST_YEAR",
"YEAR": "PREVIOUS_YEAR",
},
inplace=True,
)
df_current = df[df["YEAR"] == current_year]
df_current = df_current.merge(
df_previous,
how="left",
on=["MONTH", "LOCATION" ]
)
# There are many simular calculations and additional columns to be added like the following:
df_current["SALES_DIFF"] = df_current["SALES"] - df_current["SALES_LAST_YEAR"]
result_all_years.append(df_current)
return pd.concat(result_all_years, ignore_index=True).round(3)
The code in the calculate
function is working fine. But is there any faster method to do that? Possibly faster?
CodePudding user response:
IIUC, you can muse merge
on the dataframe itself with the incremented YEAR:
(df.merge(df.assign(YEAR=df['YEAR'] 1).drop(columns=['MONTH']),
on=['YEAR', 'LOCATION'],
how='left',
suffixes=('', '_LAST_YEAR'))
.assign(SALES_DIFF=lambda d: d['SALES']-d['SALES_LAST_YEAR'],
LAST_YEAR=lambda d: d['YEAR'].sub(1).mask(d['SALES_DIFF'].isna())
)
)
output:
YEAR VISITORS SALES MONTH LOCATION VISITORS_LAST_YEAR SALES_LAST_YEAR SALES_DIFF LAST_YEAR
0 2000 100 5000 1 Loc1 NaN NaN NaN NaN
1 2000 2000 2500 2 Loc2 NaN NaN NaN NaN
2 2001 200 23500 1 Loc1 100.0 5000.0 18500.0 2000.0
3 2001 300 1512 2 Loc2 2000.0 2500.0 -988.0 2000.0
4 2002 250 3510 1 Loc1 200.0 23500.0 -19990.0 2001.0
CodePudding user response:
Try to merge
with the same dataframe and manipulate it accordingly
diff_df = pd.merge(df, df, left_on = [df['YEAR'], df['MONTH'], df['LOCATION']], suffixes=('', '_PREV'),
right_on = [df['YEAR'] 1, df['MONTH'], df['LOCATION']])
diff_df = diff_df[['YEAR', 'YEAR_PREV', 'MONTH', 'LOCATION','VISITORS','VISITORS_PREV','SALES','SALES_PREV']]
diff_df = diff_df.assign(VISITORS_DIFF = (diff_df['VISITORS_PREV'] - diff_df['VISITORS']),
SALES_DIFF = (diff_df['SALES_PREV'] - diff_df['SALES']))
Output
YEAR YEAR_PREV MONTH LOCATION VISITORS VISITORS_PREV SALES SALES_PREV VISITORS_DIFF SALES_DIFF
2001 2000 1 Loc1 200 100 23500 5000 -100 -18500
2001 2000 2 Loc2 300 2000 1512 2500 1700 988
2002 2001 1 Loc1 250 200 3510 23500 -50 19990