Home > Software design >  Join on previous year with additional calculations
Join on previous year with additional calculations

Time:03-16

Assume that I have a Pandas data frame that looks like this:

df = pd.DataFrame({
    "YEAR":[2000,2000,2001,2001,2002],
    "VISITORS":[100,2000,200,300,250],
    "SALES":[5000,2500,23500,1512,3510],
    "MONTH":[1,2,1,2,1],
    "LOCATION":["Loc1", "Loc2", "Loc1" , "Loc2" , "Loc1"]})

I want to join this data frame on MONTH, LOCATION columns with a previous year data of the same Pandas data frame.

I tried this:

def calculate(df):

    result_all_years = []
    for current_year in df["YEAR"].unique():
        df_previous = df.copy()
        df_previous = df_previous[df_previous["YEAR"] == current_year - 1]

        df_previous.rename(
            columns={
                "VISITORS": "VISITORS_LAST_YEAR",
                "SALES": "SALES_LAST_YEAR",
                "YEAR": "PREVIOUS_YEAR",
            },
            inplace=True,
        )
        df_current = df[df["YEAR"] == current_year]
        df_current = df_current.merge(
            df_previous,
            how="left",
            on=["MONTH", "LOCATION" ]
        )

        # There are many simular calculations and additional columns to be added like the following:  
        df_current["SALES_DIFF"] = df_current["SALES"] - df_current["SALES_LAST_YEAR"]

        result_all_years.append(df_current)
    return pd.concat(result_all_years, ignore_index=True).round(3)

The code in the calculate function is working fine. But is there any faster method to do that? Possibly faster?

CodePudding user response:

IIUC, you can muse merge on the dataframe itself with the incremented YEAR:

(df.merge(df.assign(YEAR=df['YEAR'] 1).drop(columns=['MONTH']),
          on=['YEAR', 'LOCATION'],
          how='left',
          suffixes=('', '_LAST_YEAR'))
   .assign(SALES_DIFF=lambda d: d['SALES']-d['SALES_LAST_YEAR'],
           LAST_YEAR=lambda d: d['YEAR'].sub(1).mask(d['SALES_DIFF'].isna())
           )
)

output:

   YEAR  VISITORS  SALES  MONTH LOCATION  VISITORS_LAST_YEAR  SALES_LAST_YEAR  SALES_DIFF  LAST_YEAR
0  2000       100   5000      1     Loc1                 NaN              NaN         NaN        NaN
1  2000      2000   2500      2     Loc2                 NaN              NaN         NaN        NaN
2  2001       200  23500      1     Loc1               100.0           5000.0     18500.0     2000.0
3  2001       300   1512      2     Loc2              2000.0           2500.0      -988.0     2000.0
4  2002       250   3510      1     Loc1               200.0          23500.0    -19990.0     2001.0

CodePudding user response:

Try to merge with the same dataframe and manipulate it accordingly

diff_df = pd.merge(df, df, left_on = [df['YEAR'], df['MONTH'], df['LOCATION']], suffixes=('', '_PREV'), 
                   right_on = [df['YEAR'] 1, df['MONTH'], df['LOCATION']])

diff_df = diff_df[['YEAR', 'YEAR_PREV', 'MONTH', 'LOCATION','VISITORS','VISITORS_PREV','SALES','SALES_PREV']]

diff_df = diff_df.assign(VISITORS_DIFF = (diff_df['VISITORS_PREV'] - diff_df['VISITORS']), 
                         SALES_DIFF = (diff_df['SALES_PREV'] - diff_df['SALES']))

Output

YEAR    YEAR_PREV   MONTH   LOCATION    VISITORS    VISITORS_PREV   SALES   SALES_PREV  VISITORS_DIFF   SALES_DIFF
2001    2000        1       Loc1        200         100             23500   5000        -100            -18500
2001    2000        2       Loc2        300         2000            1512    2500         1700           988
2002    2001        1       Loc1        250         200             3510    23500       -50             19990
  • Related