I have a dataframe that looks like this:
Region | Country | Product | Year | Price |
---|---|---|---|---|
Africa | South Africa | ABC | 2016 | 500 |
Africa | South Africa | ABC | 2017 | 400 |
Africa | South Africa | ABC | 2018 | 15 |
Africa | South Africa | ABC | 2019 | 450 |
Africa | Uganda | ABC | 2016 | 750 |
Africa | Uganda | ABC | 2017 | 670 |
Africa | Uganda | ABC | 2018 | 1300 |
Africa | Uganda | ABC | 2019 | 890 |
Asia | Japan | DEF | 2016 | 500 |
Asia | Japan | DEF | 2017 | 420 |
Asia | Japan | DEF | 2018 | 415 |
Asia | Japan | DEF | 2019 | 0 |
data = {'Region': ['Africa','Africa','Africa','Africa','Africa','Africa','Africa','Africa','Asia','Asia','Asia','Asia'],
'Country': ['South Africa','South Africa','South Africa','South Africa','Uganda','Uganda','Uganda','Uganda','Japan','Japan','Japan','Japan'],
'Product': ['ABC','ABC','ABC','ABC','XYZ','XYZ','XYZ','XYZ','DEF','DEF','DEF','DEF'],
'Year': [2016, 2017, 2018, 2019,2016, 2017, 2018, 2019,2016, 2017, 2018, 2019],
'Price': [500, 400, 15,450,750,670,1300,890,500,420,415,0]}
df = pd.DataFrame(data)
I want to calculate the Interquartile Range to identify outliers and extract the index positions of potential outliers.
I created a function however, I am having trouble applying the function to the Price
column based on combinations of the Region
and Product
columns.
My function is below:
def tukeys_method(df, variable, iterable1, iterable2):
itr1 = df[iterable1].unique() #create list of unique values for iterable 1
itr2 = df[iterable2].unique() #create list of unique values for iterable 2
for (i,j) in zip(itr1, itr2):
#Takes two parameters: dataframe & variable of interest as string
q1 = df.groupby([iterable1,iterable2])[variable].quantile(0.25) #calculate quantiles
q3 = df.groupby([iterable1,iterable2])[variable].quantile(0.75) #calculate quantiles
iqr = q3-q1
inner_fence = 1.5*iqr
outer_fence = 3*iqr
#inner fence lower and upper end
inner_fence_le = q1-inner_fence
inner_fence_ue = q3 inner_fence
#outer fence lower and upper end
outer_fence_le = q1-outer_fence
outer_fence_ue = q3 outer_fence
outliers_prob = []
outliers_poss = []
for index, x in enumerate(df.groupby([iterable1,iterable2])[variable]):
if x <= outer_fence_le or x >= outer_fence_ue:
outliers_prob.append(index)
for index, x in enumerate(df.groupby([iterable1,iterable2])[variable]):
if x <= inner_fence_le or x >= inner_fence_ue:
outliers_poss.append(index)
return outliers_prob, outliers_poss
probable_outliers_tm, possible_outliers_tm = tukeys_method(df, "Price",'Region','Product')
I get the following error when I run the function:
ValueError: operands could not be broadcast together with shapes (570,) (2,)
Does anyone know what I need to do to fix this?
CodePudding user response:
I managed to figure it out, if anyone is interested, the solution is below:
# Identify outliers using Tukey's method.
def outliers_tukey(df, variable, iterable1, iterable2):
outliers_prob = []
outliers_poss = []
for (i,j) in itertools.product(df[iterable1].unique(), df[iterable2].unique()):
#Takes two parameters: dataframe & variable of interest as string
q1 = df.loc[(df[iterable1]==i) & (df[iterable2]==j)][variable].quantile(0.25)
q3 = df.loc[(df[iterable1]==i) & (df[iterable2]==j)][variable].quantile(0.75)
iqr = q3-q1
inner_fence = 1.5*iqr
outer_fence = 3*iqr
#inner fence lower and upper end
inner_fence_le = q1-inner_fence
inner_fence_ue = q3 inner_fence
#outer fence lower and upper end
outer_fence_le = q1-outer_fence
outer_fence_ue = q3 outer_fence
for index, x in enumerate(df[variable]):
if x <= outer_fence_le or x >= outer_fence_ue:
outliers_prob.append(index)
for index, x in enumerate(df[variable]):
if x <= inner_fence_le or x >= inner_fence_ue:
outliers_poss.append(index)
return outliers_prob, outliers_poss
probable_outliers_tm, possible_outliers_tm = outliers_tukey(df, "Price",'Region','Product')