Apply function to dataframe column based on combinations of values from other columns-CodePudding

I have a dataframe that looks like this:

Region	Country	Product	Year	Price
Africa	South Africa	ABC	2016	500
Africa	South Africa	ABC	2017	400
Africa	South Africa	ABC	2018	15
Africa	South Africa	ABC	2019	450
Africa	Uganda	ABC	2016	750
Africa	Uganda	ABC	2017	670
Africa	Uganda	ABC	2018	1300
Africa	Uganda	ABC	2019	890
Asia	Japan	DEF	2016	500
Asia	Japan	DEF	2017	420
Asia	Japan	DEF	2018	415
Asia	Japan	DEF	2019	0

data = {'Region': ['Africa','Africa','Africa','Africa','Africa','Africa','Africa','Africa','Asia','Asia','Asia','Asia'],
         'Country': ['South Africa','South Africa','South Africa','South Africa','Uganda','Uganda','Uganda','Uganda','Japan','Japan','Japan','Japan'],
         'Product': ['ABC','ABC','ABC','ABC','XYZ','XYZ','XYZ','XYZ','DEF','DEF','DEF','DEF'],
         'Year': [2016, 2017, 2018, 2019,2016, 2017, 2018, 2019,2016, 2017, 2018, 2019],
         'Price': [500, 400, 15,450,750,670,1300,890,500,420,415,0]}
df = pd.DataFrame(data)

I want to calculate the Interquartile Range to identify outliers and extract the index positions of potential outliers.

I created a function however, I am having trouble applying the function to the Price column based on combinations of the Region and Product columns.

My function is below:

def tukeys_method(df, variable, iterable1, iterable2):
    itr1 = df[iterable1].unique() #create list of unique values for iterable 1
    itr2 = df[iterable2].unique() #create list of unique values for iterable 2
    for (i,j) in zip(itr1, itr2):
        
        #Takes two parameters: dataframe & variable of interest as string
        q1 = df.groupby([iterable1,iterable2])[variable].quantile(0.25) #calculate quantiles
        q3 = df.groupby([iterable1,iterable2])[variable].quantile(0.75) #calculate quantiles
        iqr = q3-q1
        inner_fence = 1.5*iqr
        outer_fence = 3*iqr
    
        #inner fence lower and upper end
        inner_fence_le = q1-inner_fence
        inner_fence_ue = q3 inner_fence
    
        #outer fence lower and upper end
        outer_fence_le = q1-outer_fence
        outer_fence_ue = q3 outer_fence
    
        outliers_prob = []
        outliers_poss = []
        for index, x in enumerate(df.groupby([iterable1,iterable2])[variable]):
            if x <= outer_fence_le or x >= outer_fence_ue:
                outliers_prob.append(index)
        for index, x in enumerate(df.groupby([iterable1,iterable2])[variable]):
            if x <= inner_fence_le or x >= inner_fence_ue:
                outliers_poss.append(index)
        return outliers_prob, outliers_poss
        


probable_outliers_tm, possible_outliers_tm = tukeys_method(df, "Price",'Region','Product')

I get the following error when I run the function:

ValueError: operands could not be broadcast together with shapes (570,) (2,)

Does anyone know what I need to do to fix this?

CodePudding user response：

I managed to figure it out, if anyone is interested, the solution is below:

# Identify outliers using Tukey's method.
def outliers_tukey(df, variable, iterable1, iterable2):
    outliers_prob = []
    outliers_poss = []
    for (i,j) in itertools.product(df[iterable1].unique(), df[iterable2].unique()):
        
        #Takes two parameters: dataframe & variable of interest as string
        q1 = df.loc[(df[iterable1]==i) & (df[iterable2]==j)][variable].quantile(0.25)
        q3 = df.loc[(df[iterable1]==i) & (df[iterable2]==j)][variable].quantile(0.75)
        iqr = q3-q1
        inner_fence = 1.5*iqr
        outer_fence = 3*iqr
    
        #inner fence lower and upper end
        inner_fence_le = q1-inner_fence
        inner_fence_ue = q3 inner_fence
    
        #outer fence lower and upper end
        outer_fence_le = q1-outer_fence
        outer_fence_ue = q3 outer_fence
    

        for index, x in enumerate(df[variable]):
            if x <= outer_fence_le or x >= outer_fence_ue:
                outliers_prob.append(index)
        for index, x in enumerate(df[variable]):
            if x <= inner_fence_le or x >= inner_fence_ue:
                outliers_poss.append(index)
        return outliers_prob, outliers_poss
        

probable_outliers_tm, possible_outliers_tm = outliers_tukey(df, "Price",'Region','Product')