R lapply loop performance with vector subsets


Hello to everyone on SO! I rarely ask new questions since so much has already been said on this forum, but this time I cannot find enough material to get me through my performance issues.

I basically have survey data from which I want to compute various indicators at brand level. The trick is that, for each brand in the loop, I need to build variations of my vectors that exclude all the rows related to the brand being tested. So far I have not found a way to vectorize this, and consequently my lapply loop is tediously slow (by far the slowest part of a bigger script).

My dataset is 8 million rows long and I loop over 70 brands, so performance really starts to matter. See the shorter reproducible example below for your own tests:

(EDIT : Comments added to the script for better understanding.)

# Small sample size to experiment
N <- 200L 

# Table with survey data :
# - each observation is the answer of a person about a brand
# - each observation is associated to a weight, used to compute statistics (frequencies, means...)
# - each person is the described by few socio-demographic variables (country, gender, age)
# - brands are given a grade ('score' variable), ranging from 0 to 10
repex_DT <- data.table (
  country = factor(sample(c("COUNTRY 1", "COUNTRY 2", "COUNTRY 3", "COUNTRY 4"), size = N, replace=TRUE)),
  gender = factor(sample(c("Male", "Female"), size = N, replace=TRUE)),
  age_group = factor(sample(c("Less than 35", "35 and more"), size = N, replace=TRUE)),
  brand = factor(sample(c(toupper(letters[1:26])), size = N, replace=TRUE)),
  score = sample(x = c(0:10), size = N, replace=TRUE),
  weight = sample(x = c(2/(1:10)), size = N, replace=TRUE)
)

# The loop computes for each "country x gender x age_group x brand" combination :
# - the "country x gender x age_group" socio-demographic group size (cases_total, i.e. all brands included)
# - the "country x gender x age_group" group size, excluding the people assessing the current 'brand_' argument
# - Same logic for mean and standard deviation indicators

current_loop <- data.table::rbindlist(l=lapply(unique(repex_DT$brand), function(brand_){
  
  # Calculations done for each 'brand_' of each "country x gender x age_group" combination
  out <- repex_DT[ , .(
    cases_total = sum(x=weight, na.rm=TRUE),
    cases_others = sum(x=weight[brand != brand_], na.rm=TRUE),
    mean_others = expss::w_mean(x=score[brand != brand_], weight=weight[brand != brand_], na.rm=TRUE),
    sd_others  = expss::w_sd(x=score[brand != brand_], weight=weight[brand != brand_], na.rm=TRUE)
  ), by = .(country, gender, age_group)]
  
  out[, brand := brand_]
  data.table::setcolorder(x=out, neworder="brand")
  return(data.table::copy(x=out))})) %>% 
  
  # Sorting at the end for better readability
  .[order(., country, gender, age_group, brand)]

So far I have read plenty of other SO questions like this one, this other one and others on the same topic, so I am well aware that repeatedly extending a data.table in a loop is both memory- and time-consuming. Yet I haven't found another way to get where I want. Hope you R experts can :-) By the way, I use expss to compute weighted means and standard deviations because I also use that package to compute tables here and there, but I certainly could use other packages if that would help performance-wise.
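
For clarity, the weighted indicators I compute are the standard frequency-weighted mean and standard deviation. Written out in base R they look roughly like this (my own sketch; the exact NA handling and denominator conventions of expss::w_mean / expss::w_sd may differ slightly):

# Frequency-weighted mean and sd written out in base R (sketch only)
w_mean_sketch <- function(x, w) {
  keep <- !is.na(x) & !is.na(w)                 # mimic na.rm = TRUE
  sum(w[keep] * x[keep]) / sum(w[keep])
}
w_sd_sketch <- function(x, w) {
  keep <- !is.na(x) & !is.na(w)
  m <- sum(w[keep] * x[keep]) / sum(w[keep])
  sqrt(sum(w[keep] * (x[keep] - m)^2) / (sum(w[keep]) - 1))
}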

CodePudding user response:

Here is a much faster approach

  1. Get the n, mean, sd for the total
cases_total = repex_DT[, .(cases_total = sum(weight, na.rm=T),
                           mean_total = expss::w_mean(score, weight, na.rm=T),
                           sd_total = expss::w_sd(score, weight, na.rm=T)), .(country, gender, age_group)]
  2. Get the n, mean, sd for each brand
cases_brand = repex_DT[, .(cases_brand = sum(weight, na.rm=T),
                           mean_brand = expss::w_mean(score, weight, na.rm=T),
                           sd_brand = expss::w_sd(score, weight, na.rm=T)), .(brand,country, gender, age_group)]
  3. Merge these together
result = cases_brand[cases_total, on=.(country, gender, age_group)][, cases_others:=cases_total-cases_brand]
  4. Very easy to get the mean for the "others" (i.e. non-brand)
result[, mean_others:= (cases_total*mean_total - cases_brand*mean_brand)/cases_others]
  5. Now, a little function to get the sd of the "others" (the algebra behind it is checked in the sketch after these steps)
sd_other <- function(n1,n2,total_sd,sub_sd,total_mean, sub_mean ,other_mean) {
  sqrt(
    (total_sd^2*(n1+n2-1) - (n1-1)*sub_sd^2 - n1*(sub_mean-total_mean)^2 - n2*(other_mean-total_mean)^2)/(n2-1)
  )
}
  6. Apply that function to get the sd of the others
result[, sd_others := sd_other(cases_brand, cases_others, sd_total, sd_brand, mean_total, mean_brand, mean_others)]
  7. Drop unnecessary columns and set the order
result[, `:=`(cases_brand=NULL, mean_brand=NULL, sd_brand=NULL, mean_total=NULL, sd_total=NULL)]
setorder(result, country, gender, age_group, brand)
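
The algebra behind steps 4-6, for anyone who wants to check it: weighted sums are additive across the brand / non-brand split, so cases_total*mean_total = cases_brand*mean_brand + cases_others*mean_others, which gives the mean_others line; and the usual variance decomposition (n1+n2-1)*sd_total^2 = (n1-1)*sd_brand^2 + (n2-1)*sd_others^2 + n1*(mean_brand-mean_total)^2 + n2*(mean_others-mean_total)^2, solved for sd_others, gives sd_other(). A quick numeric check with unit weights (a sketch on made-up data):

# Sanity check of the two identities with unit weights (sketch)
set.seed(42)
x <- rnorm(10)
grp <- rep(c("brand", "others"), each = 5)
n1 <- 5; n2 <- 5
m_tot <- mean(x); m1 <- mean(x[grp == "brand"]); m2 <- mean(x[grp == "others"])

((n1 + n2) * m_tot - n1 * m1) / n2                             # equals m2
sd_other(n1, n2, sd(x), sd(x[grp == "brand"]), m_tot, m1, m2)  # equals sd(x[grp == "others"])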

Comparison:

> microbenchmark::microbenchmark(list=cmds, times=10)
Unit: milliseconds
         expr       min        lq      mean    median        uq       max neval
 current_loop 3684.4233 3700.1134 3775.4322 3729.8387 3855.5353 3938.4605    10
 new_approach  155.9486  158.2265  164.1699  165.9736  167.5279  174.0746    10
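
For reference, a benchmark like this can be set up by passing a named list of unevaluated calls to microbenchmark(); a minimal sketch, where run_current_loop() and run_new_approach() are hypothetical wrappers around the two code blocks above:

# Hypothetical benchmark setup; run_current_loop()/run_new_approach() wrap the two approaches
cmds <- list(
  current_loop = quote(run_current_loop()),
  new_approach = quote(run_new_approach())
)
microbenchmark::microbenchmark(list = cmds, times = 10)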

CodePudding user response:

While not addressing the vectorization issue, using the collapse package can be more efficient and lead to a nice speedup (YMMV, depending on the number of available cores):

invisible(suppressPackageStartupMessages(
    lapply(c("magrittr", "expss", "data.table", "collapse"), require, character.only=TRUE)))
options(datatable.optimize = 3L)
N <- 1E7
repex_DT <- data.table (
  country = factor(sample(c("COUNTRY 1", "COUNTRY 2", "COUNTRY 3", "COUNTRY 4"), size = N, replace=TRUE)),
  gender = factor(sample(c("Male", "Female"), size = N, replace=TRUE)),
  age_group = factor(sample(c("Less than 35", "35 and more"), size = N, replace=TRUE)),
  brand = factor(sample(LETTERS, size = N, replace=TRUE)),
  score = sample(x = c(0:10), size = N, replace=TRUE),
  weight = sample(x = c(2/(1:10)), size = N, replace=TRUE)
)

# your version
system.time({
    current_loop <- data.table::rbindlist(l=lapply(unique(repex_DT$brand), function(brand_){
  # Calculations done for each 'brand_' of each "country x gender x age_group" combination
  out <- repex_DT[ , .(
    cases_total = sum(x=weight, na.rm=TRUE),
    cases_others = sum(x=weight[brand != brand_], na.rm=TRUE),
    mean_others = expss::w_mean(x=score[brand != brand_], weight=weight[brand != brand_], na.rm=TRUE),
    sd_others  = expss::w_sd(x=score[brand != brand_], weight=weight[brand != brand_], na.rm=TRUE)
  ), by = .(country, gender, age_group)]
  
  out[, brand := brand_]
  data.table::setcolorder(x=out, neworder="brand")
  return(data.table::copy(x=out))})) %>% 
  
  # Sorting at the end for better readability
  .[order(., country, gender, age_group, brand)]
})
#>    user  system elapsed 
#>  95.612   1.557  49.309

# version using collapse
system.time({
cols <- c("country", "gender", "age_group")
ot <- repex_DT[ , .(cases_total = sum(weight, na.rm=TRUE)), by = cols]
ot2 <- data.table::setorder(rbindlist(lapply(setNames(levels(repex_DT$brand), levels(repex_DT$brand)), 
    \(i) repex_DT[brand != i][, .(cases_others = fsum(x=weight, na.rm=TRUE),
    mean_others = fmean(score, w=weight, na.rm=TRUE),
    sd_others = fsd(score, w=weight, na.rm=TRUE)), by = cols]), 
    idcol="brand"), country, gender, age_group, brand)
out <- data.table::setcolorder(ot[ot2, on=cols], "brand")
out[, brand := factor(brand)]
})
#>    user  system elapsed 
#>  49.836   3.478  11.543

all.equal(current_loop, out)
#> [1] TRUE

Created on 2022-03-07 by the reprex package (v2.0.1)
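
For readers unfamiliar with the collapse helpers used above: fsum(), fmean() and fsd() are fast replacements for sum(), weighted.mean() and sd() that accept a weight vector directly. A standalone sketch of the weighted calls (my understanding is that the weights are treated as frequency weights, matching expss::w_mean / w_sd):

# Standalone behaviour of the collapse helpers used above (sketch)
x <- c(2, 4, 4, 6)
w <- c(1, 2, 2, 1)
collapse::fsum(w)           # 6 -> the weighted case count used for cases_others
collapse::fmean(x, w = w)   # 4 -> sum(w * x) / sum(w)
collapse::fsd(x, w = w)     # weighted standard deviation of x given weights w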
