Join on previous year with additional calculations-CodePudding

Assume that I have a Pandas data frame that looks like this:

df = pd.DataFrame({
    "YEAR":[2000,2000,2001,2001,2002],
    "VISITORS":[100,2000,200,300,250],
    "SALES":[5000,2500,23500,1512,3510],
    "MONTH":[1,2,1,2,1],
    "LOCATION":["Loc1", "Loc2", "Loc1" , "Loc2" , "Loc1"]})

I want to join this data frame on MONTH, LOCATION columns with a previous year data of the same Pandas data frame.

I tried this:

def calculate(df):

    result_all_years = []
    for current_year in df["YEAR"].unique():
        df_previous = df.copy()
        df_previous = df_previous[df_previous["YEAR"] == current_year - 1]

        df_previous.rename(
            columns={
                "VISITORS": "VISITORS_LAST_YEAR",
                "SALES": "SALES_LAST_YEAR",
                "YEAR": "PREVIOUS_YEAR",
            },
            inplace=True,
        )
        df_current = df[df["YEAR"] == current_year]
        df_current = df_current.merge(
            df_previous,
            how="left",
            on=["MONTH", "LOCATION" ]
        )

        # There are many simular calculations and additional columns to be added like the following:  
        df_current["SALES_DIFF"] = df_current["SALES"] - df_current["SALES_LAST_YEAR"]

        result_all_years.append(df_current)
    return pd.concat(result_all_years, ignore_index=True).round(3)

The code in the calculate function is working fine. But is there any faster method to do that? Possibly faster?

CodePudding user response：

IIUC, you can muse merge on the dataframe itself with the incremented YEAR:

(df.merge(df.assign(YEAR=df['YEAR'] 1).drop(columns=['MONTH']),
          on=['YEAR', 'LOCATION'],
          how='left',
          suffixes=('', '_LAST_YEAR'))
   .assign(SALES_DIFF=lambda d: d['SALES']-d['SALES_LAST_YEAR'],
           LAST_YEAR=lambda d: d['YEAR'].sub(1).mask(d['SALES_DIFF'].isna())
           )
)

output:

   YEAR  VISITORS  SALES  MONTH LOCATION  VISITORS_LAST_YEAR  SALES_LAST_YEAR  SALES_DIFF  LAST_YEAR
0  2000       100   5000      1     Loc1                 NaN              NaN         NaN        NaN
1  2000      2000   2500      2     Loc2                 NaN              NaN         NaN        NaN
2  2001       200  23500      1     Loc1               100.0           5000.0     18500.0     2000.0
3  2001       300   1512      2     Loc2              2000.0           2500.0      -988.0     2000.0
4  2002       250   3510      1     Loc1               200.0          23500.0    -19990.0     2001.0

CodePudding user response：

Try to merge with the same dataframe and manipulate it accordingly

diff_df = pd.merge(df, df, left_on = [df['YEAR'], df['MONTH'], df['LOCATION']], suffixes=('', '_PREV'), 
                   right_on = [df['YEAR'] 1, df['MONTH'], df['LOCATION']])

diff_df = diff_df[['YEAR', 'YEAR_PREV', 'MONTH', 'LOCATION','VISITORS','VISITORS_PREV','SALES','SALES_PREV']]

diff_df = diff_df.assign(VISITORS_DIFF = (diff_df['VISITORS_PREV'] - diff_df['VISITORS']), 
                         SALES_DIFF = (diff_df['SALES_PREV'] - diff_df['SALES']))

Output

YEAR    YEAR_PREV   MONTH   LOCATION    VISITORS    VISITORS_PREV   SALES   SALES_PREV  VISITORS_DIFF   SALES_DIFF
2001    2000        1       Loc1        200         100             23500   5000        -100            -18500
2001    2000        2       Loc2        300         2000            1512    2500         1700           988
2002    2001        1       Loc1        250         200             3510    23500       -50             19990