R Faster alternative for nested loops-CodePudding

I want to perform a wilcox-Test. I have 2 lists of dataframes. The Datalist contains the number of different observations in a 2 years period. The Varlist contains the Case and Control-Days in different scenarios.

I want to examine now, if there is a difference between the number of each observation and the different scenarios of case and control days. Therefore I use wilcox.test(~).

As output I would like to have a dataframe including the 2 values for Case and Control, the p-Value and of course all list and column-names to match the results correctly.

I have a working solution with an 4 times nested Loop, but its terribly slow (would take 10 days at least). Does anyone know how to solve this with a much faster code?

set.seed(42) 
n <- 365
df1 = data.frame(Date=seq.Date(as.Date("2017-01-01"), as.Date("2018-12-31"), "day"),
                  D1 = sample(18:30, n, replace=TRUE),
                  D2 = sample(0:7, n, replace=TRUE),
                  D3 = sample(0:10, n, replace=TRUE),
                  D4 = sample(0:4, n, replace=TRUE),
                  D5 = sample(0:23, n, replace=TRUE))
set.seed(7) 
n <- 365
df2 = data.frame(Date=seq.Date(as.Date("2017-01-01"), as.Date("2018-12-31"), "day"),
                 D1 = sample(18:30, n, replace=TRUE),
                 D2 = sample(0:7, n, replace=TRUE),
                 D3 = sample(0:10, n, replace=TRUE),
                 D4 = sample(0:4, n, replace=TRUE),
                 D5 = sample(0:23, n, replace=TRUE))

set.seed(9) 
n <- 365
df3 = data.frame(Date=seq.Date(as.Date("2017-01-01"), as.Date("2018-12-31"), "day"),
                 D1 = sample(18:30, n, replace=TRUE),
                 D2 = sample(0:7, n, replace=TRUE),
                 D3 = sample(0:10, n, replace=TRUE),
                 D4 = sample(0:4, n, replace=TRUE),
                 D5 = sample(0:23, n, replace=TRUE))

Datalist = list(df1, df2, df3)

set.seed(2) 
n <- 365
Var1 = data.frame(Date=seq.Date(as.Date("2017-01-01"), as.Date("2018-12-31"), "day"),
                 V1 = sample(c("Case", "Control", NA), n, replace=TRUE),
                 V2 = sample(c(NA, "Case", "Control"), n, replace=TRUE),
                 V3 = sample(c("Control", "Case", NA), n, replace=TRUE))

set.seed(6) 
n <- 365
Var2 = data.frame(Date=seq.Date(as.Date("2017-01-01"), as.Date("2018-12-31"), "day"),
                  V1 = sample(c("Case", "Control", NA), n, replace=TRUE),
                  V2 = sample(c(NA, "Case", "Control"), n, replace=TRUE),
                  V3 = sample(c("Control", "Case", NA), n, replace=TRUE))

set.seed(23) 
n <- 365
Var3 = data.frame(Date=seq.Date(as.Date("2017-01-01"), as.Date("2018-12-31"), "day"),
                  V1 = sample(c("Case", "Control", NA), n, replace=TRUE),
                  V2 = sample(c(NA, "Case", "Control"), n, replace=TRUE),
                  V3 = sample(c("Control", "Case", NA), n, replace=TRUE))

Varlist = list(Var1, Var2, Var3)

EDIT: Here is my Code:

Results = data.frame(matrix(ncol = 7, nrow = 0))
colnames(Results) = c("Code","ICD", "Cond", "Case", "Control", "pValue", "Ver")

for (a in 1:length(Datalist)) {
  print(names(Datalist)[a])
  for (b in 2:length(Datalist[[a]])) {
    for (c in 1:length(Varlist)) {
      for (d in 2:ncol(Varlist[[c]])){ 
        Ill = Datalist[[a]][,b]
        cutpoint = nrow(Datalist[[a]])
        Group = Varlist[[c]][,d]
        Group = Group[1:cutpoint]
        casecontrol = na.omit(data.frame(Ill, Group)) 
        wiltest = wilcox.test(casecontrol$Ill ~ casecontrol$Group)  
        stats = tapply(casecontrol$Ill,casecontrol$Group,mean) 
        
        Code = names(Datalist)[a]
        ICD = colnames(Datalist[[a]])[b]
        Cond = colnames(Varlist[[c]])[d]
        Case = round(stats[1],2)
        Control  = round(stats[2],2)
        pValue  = round(wiltest$p.value, 2)
        Ver  = names(Varlist)[c]
        
        addrow = c(Code, ICD, Case, Control, pValue, Ver)
        
        Results= rbind(Results,addrow)}}}}

CodePudding user response：

Your code has two errors:

addrow = c(Code, ICD, Case, Control, pValue, Ver) only has 6 elements but Results is created with 7 columns;
addrow = c(Code, ICD, Case, Control, pValue, Ver) mixes character and numeric data, coercing everything to character.

The code below solves these errors and speeds up execution by a factor of 2. The results are identical, once the errors above are corrected. The main difference is to reserve memory to store the results before the loops and only create the return data.frame at the end.

g <- function(Datalist, Varlist) {
  ntotal <- length(Datalist) * (length(Datalist[[1]]) - 1L) * length(Varlist) * (ncol(Varlist[[1]]) - 1L)
  Code <- character(ntotal)
  ICD <- character(ntotal)
  Cond <- character(ntotal)
  Case <- numeric(ntotal)
  Control <- numeric(ntotal)
  pValue <- numeric(ntotal)
  Ver <- character(ntotal)
  i <- 0L
  for (a in 1:length(Datalist)) {
    print(names(Datalist)[a])
    for (b in 2:length(Datalist[[a]])) {
      for (c in 1:length(Varlist)) {
        for (d in 2:ncol(Varlist[[c]])){ 
          Ill = Datalist[[a]][,b]
          cutpoint = nrow(Datalist[[a]])
          Group = Varlist[[c]][,d]
          Group = Group[1:cutpoint]
          casecontrol = na.omit(data.frame(Ill, Group)) 
          wiltest = wilcox.test(Ill ~ Group, data = casecontrol)  
          stats = tapply(casecontrol$Ill,casecontrol$Group,mean) 
         
          i <- i   1L 
          Code[i] = names(Datalist)[a]
          ICD[i] = colnames(Datalist[[a]])[b]
          Cond[i] = colnames(Varlist[[c]])[d]
          Case[i] = round(stats[1],2)
          Control[i] = round(stats[2],2)
          pValue[i] = round(wiltest$p.value, 2)
          Ver[i] = names(Varlist)[c]
        }
      }
    }
  }
  data.frame(Code, ICD, Cond, Case, Control, pValue, Ver)
}