How to convert nested for-loops to lapply for code optimization in R-CodePudding

I am trying to convert the for loop into any apply family for code optimization

Here is the sample data

my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

Actual Function (Using for loop) this function gives us the expected output

expand_collapse_compliance <- function(right_table){
  
  sector_list <- unique(right_table$Sector)
  df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
  
  for(s in sector_list){
    df1 = right_table[right_table$Sector==s,]
    sector1 = df1$Sector[1]
    Sector = df1$Sector[1]
    Sub_Sector = ""
    actual = as.character(nrow(df1))
    mainrow = c(sector1,Sector,Sub_Sector,actual)
    df = rbind(df,mainrow)
    Sub_Sector_list <- unique(df1$Sub_Sector)
    
    for(i in Sub_Sector_list){
      df2 = right_table[right_table$Sub_Sector==i,]
      sector1 = df1$Sector[1]
      Sector = ""
      Sub_Sector = df2$Sub_Sector[1]
      actual = nrow(df2)
      subrow = c(sector1,Sector,Sub_Sector,actual)
      df = rbind(df,subrow)
    }
  }
  df = df[2:nrow(df),]
  df$Actual = as.numeric(df$Actual)
  df_total = nrow(right_table)
  df = rbind(df,c("","Total","",df_total))
  return(df)
  
}

DT::datatable(expand_collapse_compliance(mydata1), 
              rownames = F,escape = FALSE,
              selection=list(mode="single",target="row"),
              options = list(pageLength = 50,scrollX = TRUE,
                             dom = 'tp',ordering=F,
                             columnDefs = list(list(visible=FALSE, targets=0),
              list(className = 'dt-left', targets = '_all'))),class='hover cell-border stripe')

i tried to convert inner loop to lapply first while doing that the sub_sector value is not showing in the output table, please let me know how to fix any idea would be appreciated

expand_collapse_compliance <- function(right_table){
  sector_list <- unique(right_table$Sector)
  df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
  
  for(s in sector_list){
    df1 = right_table[right_table$Sector==s,]
    sector1 = df1$Sector[1]
    Sector = df1$Sector[1]
    Sub_Sector = ""
    actual = as.character(nrow(df1))
    mainrow = c(sector1,Sector,Sub_Sector,actual)
    df = rbind(df,mainrow)
    Sub_Sector_list <- unique(df1$Sub_Sector)
    
    #for(i in Sub_Sector_list){
      lapply(Sub_Sector_list, function(x){
      df2 = right_table[right_table$Sub_Sector==Sub_Sector_list,]
      sector1 = df1$Sector[1]
      Sector = ""
      Sub_Sector = df2$Sub_Sector[1]
      actual = nrow(df2)
      subrow = c(sector1,Sector,Sub_Sector,actual)
      df = rbind(df,subrow)
      })
  }
  df = df[2:nrow(df),]
  df$Actual = as.numeric(df$Actual)
  df_total = nrow(right_table)
  df = rbind(df,c("","Total","",df_total))
  return(df)
  
}

CodePudding user response：

Using dplyr and tidyr you could do:

Note: I dropped the DT part.

library(dplyr)
library(tidyr)

expand_collapse_compliance1 <- function(x) {
  x <- x %>% 
    count(Sector, Sub_Sector, name = "Actual") %>% 
    group_split(Sector) %>% 
    lapply(function(x) {
      main <- group_by(x, Sector) %>% summarise(Actual = sum(Actual)) 
      bind_rows(main, x)
    }) %>%
    bind_rows() %>% 
    mutate(Sector1 = Sector) %>%
    select(Sector1, Sector, Sub_Sector, Actual)
  
  total <- x %>%
    filter(is.na(Sub_Sector)) %>% 
    group_by(Sector = "Total") %>% 
    summarise(Actual = sum(Actual))
  
  bind_rows(x, total) %>% 
    mutate(Sector = ifelse(!is.na(Sub_Sector), "", Sector)) %>% 
    replace_na(list(Sub_Sector = "", Sector1 = ""))
}

expand_collapse_compliance1(my_data)
#> # A tibble: 9 × 4
#>   Sector1 Sector  Sub_Sector Actual
#>   <chr>   <chr>   <chr>       <int>
#> 1 "AAA"   "AAA"   ""              4
#> 2 "AAA"   ""      "AAA1"          4
#> 3 "BBB"   "BBB"   ""              2
#> 4 "BBB"   ""      "BBB1"          1
#> 5 "BBB"   ""      "BBB2"          1
#> 6 "CCC"   "CCC"   ""              2
#> 7 "CCC"   ""      "CCC1"          1
#> 8 "CCC"   ""      "CCC2"          1
#> 9 ""      "Total" ""              8

expand_collapse_compliance(my_data)
#>    Sector1 Sector Sub_Sector Actual
#> 2      AAA    AAA                 4
#> 3      AAA              AAA1      4
#> 4      BBB    BBB                 2
#> 5      BBB              BBB1      1
#> 6      BBB              BBB2      1
#> 7      CCC    CCC                 2
#> 8      CCC              CCC1      1
#> 9      CCC              CCC2      1
#> 91          Total                 8

CodePudding user response：

There is no need for any loops nor apply, what we want here is three different group counts and some formatting. With the assumption - and as seen in the sample data - there is no need for a split.

my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

library(data.table)
setDT(my_data)

expand_collapse_compliance <- function(x) {
  x <- rbindlist(list(
    x[, .(Sector1 = Sector, Actual = .N), by = Sector], 
    setnames(x[, .(Actual = .N), by = .(Sector, Sub_Sector)], "Sector", "Sector1"),
    x[, .(Sector = "Total", Actual = .N)]
  ), fill = T)
  setcolorder(x, c("Sector1", "Sector", "Sub_Sector", "Actual"))
  setorder(x, Sector1, Sector, na.last = T)
  x
}

expand_collapse_compliance(my_data)

#    Sector1 Sector Sub_Sector Actual
# 1:     AAA    AAA       <NA>      4
# 2:     AAA   <NA>       AAA1      4
# 3:     BBB    BBB       <NA>      2
# 4:     BBB   <NA>       BBB1      1
# 5:     BBB   <NA>       BBB2      1
# 6:     CCC    CCC       <NA>      2
# 7:     CCC   <NA>       CCC1      1
# 8:     CCC   <NA>       CCC2      1
# 9:    <NA>  Total       <NA>      8

sidenote There is no need to convert NA to "" as in shiny DT will show blancs for it.

Speedtest

As I mentioned that for is often faster on very small data sets as any library solutions use some functions that take some time to load once...

my_data_small = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

library(data.table)
setDT(my_data)

   test replications elapsed relative
2  eccB          150    0.32     1.00
1 eccDT          150    0.72     2.25

# well just make it a milion times bigger :D
my_data_large <- rbindlist(rep(list(my_data_small), 1000000L))

   test replications elapsed relative
2  eccB           50   79.30    5.146
1 eccDT           50   15.41    1.000

CodePudding user response：

The appropriate function of the *apply family could be tapply using a split-apply-combine approach. Since we need tapply only when there are multiple Sub_Sector's, we implement a case handling for sake of speed.

expand_collapse_complianceA <- \(data) {
  r <- do.call(rbind, c(by(data, data$Sector, \(x) {
    if (length(unique(x$Sub_Sector)) != 1L) {
      tt <- t(unname(with(x, tapply(count, list(Sector, Sub_Sector), sum))))
      tt <- cbind(x[!duplicated(x$Sub_Sector), 1:2], foo='', Actual=tt)
    } else {
      tt <- as.data.frame(t(c(unlist(x[!duplicated(x$Sub_Sector), 1:2]), foo='',
                              Actual=sum(x$count))))
    }
    rbind(c(tt[1, 1], '', tt[1, 1], sum(as.numeric(tt[, 4]))), tt)[c(1, 3, 2, 4)]
  }), make.row.names=FALSE))
  rbind(r, c('', 'Total', '', sum(as.numeric(r$Actual[!r$foo %in% ''])))) |>
    setNames(c('Sector1', 'Sector', 'Sub_Sector', 'Actual'))
}

Note: R version 4.1.2 (2021-11-01).

Gives

expand_collapse_compliance(my_data)
#   Sector1 Sector Sub_Sector Actual
# 1     AAA    AAA                 4
# 2     AAA              AAA1      4
# 3     BBB    BBB                 2
# 4     BBB              BBB1      1
# 5     BBB              BBB2      1
# 6     CCC    CCC                 2
# 7     CCC              CCC1      1
# 8     CCC              CCC2      1
# 9          Total                 8


expand_collapse_complianceA(my_data) |> 
  (\(x) DT::datatable(
    x, rownames=F, escape=FALSE, selection=list(mode="single", target="row"), 
    options=list(pageLength=50, scrollX=TRUE, dom='tp', ordering=F, 
                 columnDefs=list(list(visible=FALSE, targets=0),
                                 list(className='dt-left', targets='_all'))), 
    class='hover cell-border stripe'))()

expand_collapse_complianceA now needs just 1/10 of the time as the original for loop. Here a benchmark (tested on 1080 rows).

# Unit: milliseconds
#       expr        min         lq       mean     median         uq       max neval cld
#    ecc_for 304.723781 305.426934 346.878188 308.208294 335.944407 598.94351    10   c
# ecc_tapply  29.768177  29.851975  31.083977  30.611982  32.058980  34.50901    10 a  
#   ecc_tidy 135.326594 135.952068 143.967550 138.475437 149.352409 164.94652    10  b 
#     ecc_DT   3.267969   3.611711   4.610916   3.664493   3.707528  13.48797    10 a

Of course data.table is faster. However, I's like to see performance when the data is about to exhaust the RAM.

Benchmark Code:

microbenchmark::microbenchmark(
  ecc_for=expand_collapse_compliance(dat),
  ecc_tapply=expand_collapse_complianceA(dat),
  ecc_tidy={library(dplyr);library(tidyr);expand_collapse_compliance1(dat)},
  ecc_DT={library(data.table);expand_collapse_complianceDT(as.data.table(dat))},
  times=10L)

Note, that the "tidy" version has some flaws so far (at least with the new data).

res_for <- expand_collapse_compliance(dat)
res_tapply <- expand_collapse_complianceA(dat)
res_tidy <- {library(dplyr);library(tidyr);expand_collapse_compliance1(dat)}


all.equal(res_for, res_tapply, check.attributes=FALSE)
# [1] TRUE
all.equal(res_for, res_tidy, check.attributes=FALSE)
# [1] "Component “Sub_Sector”: 1053 string mismatches"             
# [2] "Component “Actual”: target is character, current is numeric"

Data

dat <- expand.grid(Sector=c("AA", "AB", "AC", "AD", "AE", "AF", "AG", "AH", "AI", "AJ", 
                     "AK", "AL", "AM", "AN", "AO", "AP", "AQ", "AR", "AS", "AT", "AU", 
                     "AV", "AW", "AX", "AY", "AZ", "BA"),
            Sub_Sector=1:40, stringsAsFactors=F)
dat <- transform(dat, Sub_Sector=Reduce(paste0, dat[1:2]), count=1, type='Actual')
dat <- dat[order(dat$Sector), ]