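I want to replace the NA values in every factor column of a data.table with that
column's mode (its most frequent value). I've been trying to reuse code from
Stack Overflow, but without success. Ideally I wouldn't use zoo, but I'm open to
it if it makes things simpler.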
library(data.table)
library(zoo)
#> Warning: package 'zoo' was built under R version 4.2.1
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
# Example data: 100 rows, two character columns drawn at random with replacement.
train_set <- data.table(
  A = sample(c("BOBO", "BABA"), size = 100, replace = TRUE),
  B = sample(c("TOTO", "TATA"), size = 100, replace = TRUE)
)

# Turn every character column into a factor, updating the table by reference.
cols <- names(train_set)[vapply(train_set, is.character, logical(1))]
train_set[, (cols) := lapply(.SD, as.factor), .SDcols = cols]

# Knock out a few values in each column so there is something to impute.
train_set[c(1L, 4L, 9L), A := NA]
train_set[c(2L, 5L, 10L), B := NA]
#' Most frequent value (mode) of a vector
#'
#' Ties are resolved in favour of the value that appears first in `x`.
#'
#' @param x A vector (atomic or factor).
#' @param na.rm If `TRUE` (the default), missing values are dropped before
#'   counting, so `NA` is never reported as the mode unless nothing else is
#'   left. If `FALSE`, `NA` is counted like any other value.
#' @return A length-one vector of the same type as `x` (factor levels are
#'   preserved). When there is nothing to count, a typed `NA` is returned.
calc_mode <- function(x, na.rm = TRUE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  if (length(x) == 0L) {
    # Indexing with NA yields a single NA that keeps x's type and levels,
    # so the result can always be assigned as a scalar.
    return(x[NA_integer_])
  }
  distinct_values <- unique(x)
  distinct_counts <- tabulate(
    match(x, distinct_values),
    nbins = length(distinct_values)
  )
  # which.max() returns the first maximum, which gives the tie-breaking rule.
  distinct_values[which.max(distinct_counts)]
}
# Impute each factor column's missing values with that column's mode.
#
# The earlier call, `lapply(.SD, na.aggregate, 2, calc_mode)`, passed
# `calc_mode` positionally. In `na.aggregate()` the `FUN` argument comes after
# `...`, so it can only be matched by name: the positional value was absorbed
# by `...`, `FUN` stayed at its default `mean`, and R warned
# "argument is not numeric or logical: returning NA" without filling anything.
# Plain subassignment does the same job and does not need zoo.
cols <- names(Filter(is.factor, train_set))
train_set[, (cols) := lapply(.SD, function(x) {
  missing <- is.na(x)
  # Only touch columns that have both gaps to fill and observed values to
  # derive a mode from.
  if (any(missing) && !all(missing)) {
    x[missing] <- calc_mode(x[!missing])
  }
  x
}), .SDcols = cols]
Created on 2022-10-15 with reprex v2.0.2
CodePudding user response:
If you want to take full advantage of data.table's speed, I'd do it like this:
# Fill the NAs of every factor column with that column's mode, by reference.
#
# Each column is handled on its own: collecting the modes with `sapply()`
# would merge the per-column factor results into a single factor carrying the
# union of all columns' levels. Row indices are computed outside of `[` so the
# loop variables cannot be shadowed by identically named columns.
cols <- names(Filter(is.factor, train_set))
for (col in cols) {
  values <- train_set[[col]]
  na_rows <- which(is.na(values))
  if (length(na_rows) > 0L) {
    # set() is data.table's low-overhead, loop-friendly form of `:=`.
    set(train_set, i = na_rows, j = col, value = calc_mode(values))
  }
}