I have the dataframe below and I want to find and remove outliers for every factor of the Area
column of my dataframe. Normally the factors are more than 2. So as a result I want new dataframe without the outliers for every factor.
subs<-structure(list(Sold_Pr = c(6500, 173000, 60000, 73000, 155000,
105000, 140000, 39900, 73500, 46000, 99900, 180000, 164000, 120000,
206000, 160000, 67400, 215000, 145000, 175000, 350000, 425000,
435000, 490000, 545000, 585000), Area = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("411", "415", "981",
"8001", "8002", "8003", "8004", "8005", "8006", "8007", "8008",
"8009", "8010", "8011", "8012", "8013", "8014", "8015", "8016",
"8017", "8018", "8019", "8020", "8021", "8022", "8023", "8024",
"8025", "8026", "8027", "8028", "8029", "8030", "8031", "8034",
"8035", "8037", "8038", "8039", "8040", "8041", "8042", "8043",
"8044", "8045", "8046", "8047", "8048", "8049", "8050", "8051",
"8052", "8053", "8055", "8056", "8057", "8058", "8059", "8060",
"8061", "8062", "8063", "8064", "8065", "8066", "8067", "8068",
"8069", "8070", "8071", "8072", "8073", "8074", "8075", "8076",
"8077"), class = "factor"), Closed_Date = structure(c(18668,
18933, 18716, 18740, 18639, 18845, 18708, 18676, 18733, 18695,
18715, 18709, 18794, 18803, 18750, 18787, 18906, 18810, 18855,
18870, 18626, 18786, 18808, 18864, 18961, 18914), class = "Date")), row.names = c(NA,
-26L), class = c("tbl_df", "tbl", "data.frame"))
remove_outliers <- function(x, na.rm = TRUE, ...) {
qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
H <- 1.5 * IQR(x, na.rm = na.rm)
y <- x
y[x < (qnt[1] - H)] <- NA
y[x > (qnt[2] H)] <- NA
y
}
CodePudding user response:
You can change the function to return logical values and use them in filter
with group_by
-
library(dplyr)
remove_outliers <- function(x, na.rm = TRUE, ...) {
qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
H <- 1.5 * IQR(x, na.rm = na.rm)
!(x < (qnt[1] - H) | x > (qnt[2] H))
}
subs %>%
group_by(Area) %>%
filter(remove_outliers(Sold_Pr)) %>%
ungroup
CodePudding user response:
You could add the grouping to the function, using ave
. And for the calculation of the cutoffs you can use mapply
remove_outliers <- \(x, g, na.rm=TRUE, ...) {
q <- \(z) {
qnt <- quantile(z, probs=c(.25, .75), na.rm=na.rm, ...)
H <- 1.5 * IQR(z, na.rm=na.rm)
r <- mapply(` `, qnt, c(-H, H))
z > r[1] & z < r[2]
}
return(as.logical(ave(x, as.character(g), FUN=q)))
}
subs[26, 1] <- 1e9 ## fabricate outlier
Gives
with(subs, remove_outliers(Sold_Pr, Area))
# [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# [14] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
Applied in subsetting
subs[with(subs, remove_outliers(Sold_Pr, Area)), ]
# Sold_Pr Area Closed_Date
# 1 6500 411 2021-02-10
# 2 173000 411 2021-11-02
# 3 60000 411 2021-03-30
# 4 73000 411 2021-04-23
# 5 155000 411 2021-01-12
# 6 105000 411 2021-08-06
# 7 140000 411 2021-03-22
# 8 39900 411 2021-02-18
# 9 73500 411 2021-04-16
# 10 46000 411 2021-03-09
# 11 99900 411 2021-03-29
# 12 180000 411 2021-03-23
# 13 164000 411 2021-06-16
# 14 120000 411 2021-06-25
# 15 206000 411 2021-05-03
# 16 160000 411 2021-06-09
# 17 67400 411 2021-10-06
# 18 215000 411 2021-07-02
# 19 145000 411 2021-08-16
# 20 175000 411 2021-08-31
# 21 350000 415 2020-12-30
# 22 425000 415 2021-06-08
# 23 435000 415 2021-06-30
# 24 490000 415 2021-08-25
# 25 545000 415 2021-11-30
Note: R version 4.1.2 (2021-11-01)