threshold <- c(0.001, 0.5, 0.1)
df$a df$b df$c - Recode levels whose frequency is less than the first threshold
df$x df$y df$z - Recode levels whose frequency is less than the second threshold
df$d df$e df$f - Recode levels whose frequency is less than the third threshold
CodePudding user response:
You're looking for fct_lump_prop() from the forcats package.
library(forcats)
library(dplyr)
# Toy data with seven rows: "A" three times, "B" twice, "C" and "D" once each.
dat <- data.frame(
  base = rep(c("A", "B", "C", "D"), times = c(3, 2, 1, 1))
)

# Add two lumped copies of `base`, one per proportion cut-off. Levels that
# make up no more than the given share of rows are collapsed into "Other".
mutate(
  dat,
  base0.2 = fct_lump_prop(base, prop = 0.2),
  base0.3 = fct_lump_prop(base, prop = 0.3)
)
Output
#> base base0.2 base0.3
#> 1 A A A
#> 2 A A A
#> 3 A A A
#> 4 B B Other
#> 5 B B Other
#> 6 C Other Other
#> 7 D Other Other
Created on 2022-03-31 by the reprex package (v2.0.0)
CodePudding user response:
There may be an easier tidyverse way of doing this, but you could write a small function that implements it:
# Fix the RNG state so the simulated sample below is reproducible.
set.seed(519)

# Draw 1000 letters from A-E with unequal probabilities (A and B are the
# rare ones at 1% and 10%) and store the result as a factor.
x <- as.factor(
  sample(
    LETTERS[1:5],
    size = 1000,
    replace = TRUE,
    prob = c(.01, .1, .29, .3, .3)
  )
)
#' Lump infrequent levels into a single catch-all level
#'
#' Computes the relative frequency of every level of `x` (missing values are
#' not counted) and replaces each level whose share is strictly below
#' `threshold` with `other_label`.
#'
#' @param x A factor, or any vector that `as.factor()` can convert (for
#'   example a character vector).
#' @param threshold Proportion in `[0, 1]`; levels with a relative frequency
#'   strictly below it are recoded.
#' @param other_label Name of the catch-all level. Defaults to `"other"`.
#' @return A factor the same length as `x`. Its levels are the surviving
#'   original levels in their original order, followed by `other_label` when
#'   at least one value was recoded. Levels that no longer occur are dropped,
#'   and `NA` values stay `NA`.
recode_thresh <- function(x, threshold = .15, other_label = "other") {
  # Coerce first: a plain character vector has no levels, which would
  # otherwise turn every retained value into NA in the final factor() call.
  x <- as.factor(x)
  levs <- levels(x)

  counts <- table(x)
  total <- sum(counts)

  # With no non-missing observations the proportions would be 0 / 0 = NaN and
  # the comparison would yield NA, so treat that case as "nothing is rare".
  rare <- if (total > 0) {
    names(counts)[as.vector(counts / total < threshold)]
  } else {
    character(0)
  }

  values <- as.character(x)
  values[values %in% rare] <- other_label

  # Keep the original level order, append the catch-all level last, and drop
  # every level that is no longer present in the data.
  keep <- intersect(c(levs, other_label), unique(values))
  factor(values, levels = keep)
}
# Recode every level of x whose relative frequency is below 0.15 to "other".
x2 <- recode_thresh(x, threshold=.15)
# x holds 1000 observations, so dividing the counts by 1000 gives proportions.
table(x)/1000
#> x
#> A B C D E
#> 0.014 0.106 0.294 0.276 0.310
# After recoding, A and B (both below 0.15) appear together as "other".
table(x2)/1000
#> x2
#> C D E other
#> 0.294 0.276 0.310 0.120
Created on 2022-03-31 by the reprex package (v2.0.1)