I am trying to convert the for loop into any apply family for code optimization
Here is the sample data
my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
"BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
"CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
"Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
-8L))
Actual Function (Using for loop) this function gives us the expected output
expand_collapse_compliance <- function(right_table){
sector_list <- unique(right_table$Sector)
df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
for(s in sector_list){
df1 = right_table[right_table$Sector==s,]
sector1 = df1$Sector[1]
Sector = df1$Sector[1]
Sub_Sector = ""
actual = as.character(nrow(df1))
mainrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,mainrow)
Sub_Sector_list <- unique(df1$Sub_Sector)
for(i in Sub_Sector_list){
df2 = right_table[right_table$Sub_Sector==i,]
sector1 = df1$Sector[1]
Sector = ""
Sub_Sector = df2$Sub_Sector[1]
actual = nrow(df2)
subrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,subrow)
}
}
df = df[2:nrow(df),]
df$Actual = as.numeric(df$Actual)
df_total = nrow(right_table)
df = rbind(df,c("","Total","",df_total))
return(df)
}
DT::datatable(expand_collapse_compliance(mydata1),
rownames = F,escape = FALSE,
selection=list(mode="single",target="row"),
options = list(pageLength = 50,scrollX = TRUE,
dom = 'tp',ordering=F,
columnDefs = list(list(visible=FALSE, targets=0),
list(className = 'dt-left', targets = '_all'))),class='hover cell-border stripe')
i tried to convert inner loop to lapply first while doing that the sub_sector value is not showing in the output table, please let me know how to fix any idea would be appreciated
expand_collapse_compliance <- function(right_table){
sector_list <- unique(right_table$Sector)
df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
for(s in sector_list){
df1 = right_table[right_table$Sector==s,]
sector1 = df1$Sector[1]
Sector = df1$Sector[1]
Sub_Sector = ""
actual = as.character(nrow(df1))
mainrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,mainrow)
Sub_Sector_list <- unique(df1$Sub_Sector)
#for(i in Sub_Sector_list){
lapply(Sub_Sector_list, function(x){
df2 = right_table[right_table$Sub_Sector==Sub_Sector_list,]
sector1 = df1$Sector[1]
Sector = ""
Sub_Sector = df2$Sub_Sector[1]
actual = nrow(df2)
subrow = c(sector1,Sector,Sub_Sector,actual)
df = rbind(df,subrow)
})
}
df = df[2:nrow(df),]
df$Actual = as.numeric(df$Actual)
df_total = nrow(right_table)
df = rbind(df,c("","Total","",df_total))
return(df)
}
CodePudding user response:
Using dplyr
and tidyr
you could do:
Note: I dropped the DT
part.
library(dplyr)
library(tidyr)
expand_collapse_compliance1 <- function(x) {
x <- x %>%
count(Sector, Sub_Sector, name = "Actual") %>%
group_split(Sector) %>%
lapply(function(x) {
main <- group_by(x, Sector) %>% summarise(Actual = sum(Actual))
bind_rows(main, x)
}) %>%
bind_rows() %>%
mutate(Sector1 = Sector) %>%
select(Sector1, Sector, Sub_Sector, Actual)
total <- x %>%
filter(is.na(Sub_Sector)) %>%
group_by(Sector = "Total") %>%
summarise(Actual = sum(Actual))
bind_rows(x, total) %>%
mutate(Sector = ifelse(!is.na(Sub_Sector), "", Sector)) %>%
replace_na(list(Sub_Sector = "", Sector1 = ""))
}
expand_collapse_compliance1(my_data)
#> # A tibble: 9 × 4
#> Sector1 Sector Sub_Sector Actual
#> <chr> <chr> <chr> <int>
#> 1 "AAA" "AAA" "" 4
#> 2 "AAA" "" "AAA1" 4
#> 3 "BBB" "BBB" "" 2
#> 4 "BBB" "" "BBB1" 1
#> 5 "BBB" "" "BBB2" 1
#> 6 "CCC" "CCC" "" 2
#> 7 "CCC" "" "CCC1" 1
#> 8 "CCC" "" "CCC2" 1
#> 9 "" "Total" "" 8
expand_collapse_compliance(my_data)
#> Sector1 Sector Sub_Sector Actual
#> 2 AAA AAA 4
#> 3 AAA AAA1 4
#> 4 BBB BBB 2
#> 5 BBB BBB1 1
#> 6 BBB BBB2 1
#> 7 CCC CCC 2
#> 8 CCC CCC1 1
#> 9 CCC CCC2 1
#> 91 Total 8
CodePudding user response:
There is no need for any loops nor apply, what we want here is three different group counts and some formatting. With the assumption - and as seen in the sample data - there is no need for a split.
my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
"BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
"CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
"Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
-8L))
library(data.table)
setDT(my_data)
expand_collapse_compliance <- function(x) {
x <- rbindlist(list(
x[, .(Sector1 = Sector, Actual = .N), by = Sector],
setnames(x[, .(Actual = .N), by = .(Sector, Sub_Sector)], "Sector", "Sector1"),
x[, .(Sector = "Total", Actual = .N)]
), fill = T)
setcolorder(x, c("Sector1", "Sector", "Sub_Sector", "Actual"))
setorder(x, Sector1, Sector, na.last = T)
x
}
expand_collapse_compliance(my_data)
# Sector1 Sector Sub_Sector Actual
# 1: AAA AAA <NA> 4
# 2: AAA <NA> AAA1 4
# 3: BBB BBB <NA> 2
# 4: BBB <NA> BBB1 1
# 5: BBB <NA> BBB2 1
# 6: CCC CCC <NA> 2
# 7: CCC <NA> CCC1 1
# 8: CCC <NA> CCC2 1
# 9: <NA> Total <NA> 8
sidenote
There is no need to convert NA
to ""
as in shiny DT will show blancs for it.
Speedtest
As I mentioned that for
is often faster on very small data sets as any library solutions use some functions that take some time to load once...
my_data_small = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
"BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
"CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
"Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
-8L))
library(data.table)
setDT(my_data)
test replications elapsed relative
2 eccB 150 0.32 1.00
1 eccDT 150 0.72 2.25
# well just make it a milion times bigger :D
my_data_large <- rbindlist(rep(list(my_data_small), 1000000L))
test replications elapsed relative
2 eccB 50 79.30 5.146
1 eccDT 50 15.41 1.000
CodePudding user response:
The appropriate function of the *apply
family could be tapply
using a split-apply-combine approach. Since we need tapply
only when there are multiple Sub_Sector
's, we implement a case handling for sake of speed.
expand_collapse_complianceA <- \(data) {
r <- do.call(rbind, c(by(data, data$Sector, \(x) {
if (length(unique(x$Sub_Sector)) != 1L) {
tt <- t(unname(with(x, tapply(count, list(Sector, Sub_Sector), sum))))
tt <- cbind(x[!duplicated(x$Sub_Sector), 1:2], foo='', Actual=tt)
} else {
tt <- as.data.frame(t(c(unlist(x[!duplicated(x$Sub_Sector), 1:2]), foo='',
Actual=sum(x$count))))
}
rbind(c(tt[1, 1], '', tt[1, 1], sum(as.numeric(tt[, 4]))), tt)[c(1, 3, 2, 4)]
}), make.row.names=FALSE))
rbind(r, c('', 'Total', '', sum(as.numeric(r$Actual[!r$foo %in% ''])))) |>
setNames(c('Sector1', 'Sector', 'Sub_Sector', 'Actual'))
}
Note: R version 4.1.2 (2021-11-01)
.
Gives
expand_collapse_compliance(my_data)
# Sector1 Sector Sub_Sector Actual
# 1 AAA AAA 4
# 2 AAA AAA1 4
# 3 BBB BBB 2
# 4 BBB BBB1 1
# 5 BBB BBB2 1
# 6 CCC CCC 2
# 7 CCC CCC1 1
# 8 CCC CCC2 1
# 9 Total 8
expand_collapse_complianceA(my_data) |>
(\(x) DT::datatable(
x, rownames=F, escape=FALSE, selection=list(mode="single", target="row"),
options=list(pageLength=50, scrollX=TRUE, dom='tp', ordering=F,
columnDefs=list(list(visible=FALSE, targets=0),
list(className='dt-left', targets='_all'))),
class='hover cell-border stripe'))()
expand_collapse_complianceA
now needs just 1/10 of the time as the original for
loop. Here a benchmark (tested on 1080 rows).
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# ecc_for 304.723781 305.426934 346.878188 308.208294 335.944407 598.94351 10 c
# ecc_tapply 29.768177 29.851975 31.083977 30.611982 32.058980 34.50901 10 a
# ecc_tidy 135.326594 135.952068 143.967550 138.475437 149.352409 164.94652 10 b
# ecc_DT 3.267969 3.611711 4.610916 3.664493 3.707528 13.48797 10 a
Of course data.table is faster. However, I's like to see performance when the data is about to exhaust the RAM.
Benchmark Code:
microbenchmark::microbenchmark(
ecc_for=expand_collapse_compliance(dat),
ecc_tapply=expand_collapse_complianceA(dat),
ecc_tidy={library(dplyr);library(tidyr);expand_collapse_compliance1(dat)},
ecc_DT={library(data.table);expand_collapse_complianceDT(as.data.table(dat))},
times=10L)
Note, that the "tidy" version has some flaws so far (at least with the new data).
res_for <- expand_collapse_compliance(dat)
res_tapply <- expand_collapse_complianceA(dat)
res_tidy <- {library(dplyr);library(tidyr);expand_collapse_compliance1(dat)}
all.equal(res_for, res_tapply, check.attributes=FALSE)
# [1] TRUE
all.equal(res_for, res_tidy, check.attributes=FALSE)
# [1] "Component “Sub_Sector”: 1053 string mismatches"
# [2] "Component “Actual”: target is character, current is numeric"
Data
dat <- expand.grid(Sector=c("AA", "AB", "AC", "AD", "AE", "AF", "AG", "AH", "AI", "AJ",
"AK", "AL", "AM", "AN", "AO", "AP", "AQ", "AR", "AS", "AT", "AU",
"AV", "AW", "AX", "AY", "AZ", "BA"),
Sub_Sector=1:40, stringsAsFactors=F)
dat <- transform(dat, Sub_Sector=Reduce(paste0, dat[1:2]), count=1, type='Actual')
dat <- dat[order(dat$Sector), ]