Expand for-loop to accommodate list in R?-CodePudding

I've recently been interested in trying to develop a for-loop that would be able to run multiple generalized additive models and then produce results in a table that ranks them based on AIC, p-value of each smooth in the model, deviance explained of the overall model, etc.

I found this related question in stack overflow which is basically what I want and was able to run this well for gam() instead of gamm(), however I want to expand this to include multiple independent variables in the model, not just 1. Ideally, the models would run all possible combinations of independent variables against the dependent variable, and it would test combinations anywhere from 1 independent variable in the model, up to all of the possible covariates in "d_pred" in the model.

I have attempted to do this so far by starting out small and finding all possible combinations of 2 independent variables (df_combinations2), which results in a list of data frames. Then I adjusted the rest of the code to run the for loop such that each iteration will run a different combination of the two variables:

library(mgcv)

## Example data
set.seed(0) 
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])

d_resp <- d[ c("y", "y1")]
d_pred <- d[, !(colnames(d) %in% c("y", "y1"))]

df_combinations2 <- lapply(1:(ncol(combn(1:ncol(d_pred), m = 2))), 
                           function(y) d_pred[, combn(1:ncol(d_pred), m = 2)[,y]]) 

## create a "matrix" list of dimensions i x j
results_m2 <-lapply(1:length(df_combinations2), matrix, data= NA, nrow=ncol(d_resp), ncol=2)

## for-loop
for(k in 1:length(df_combinations2)){
  for(i in 1:ncol(d_resp)){
  for(j in 1:ncol(df_combinations2[[k]])){
    results_m2[i, j][[1]] <- gam(d_resp[, i] ~ s(df_combinations2[[k]][,1]) s(df_combinations2[[k]][,2]))
  }
}}

However, after running the for-loop I get the error "Error in all.vars1(gp$fake.formula[-2]) : can't handle [[ in formula". Anyone know why I am getting this error/ how to fix it?

Any insight is much appreciated. Thanks!

CodePudding user response：

Personally, I would create a data.table() containing all combinations of target variables and combinations of predictors and loop through all rows. See below.

library(data.table)
library(dplyr)

# Example data
set.seed(0) 
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])

#select names of targets and predictors
targets <- c("y", "y1")
predictors <- colnames(d)[!colnames(d) %in% targets]

#create all combinations of predictors
predictor_combinations <- lapply(1:length(predictors), FUN = function(x){
  #create combination
  combination <- combn(predictors, m = x) |> as.data.table()
  
  #add s() to all for gam
  combination <- sapply(combination, FUN = function(y) paste0("s(", y, ")")) |> as.data.table()

  #collapse
  combination <- summarize_all(combination, .funs = paste0, collapse = " ")
  
  #unlist
  combination <- unlist(combination)
  
  #remove names
  names(combination) <- NULL
  
  #return
  return(combination)
})

#merge combinations of predictors as vector
predictor_combinations <- do.call(c, predictor_combinations)

#create folder to save results to
if(!dir.exists("dev")){
  dir.create("dev")
}
if(!dir.exists("dev/models")){
  dir.create("dev/models")
}

#create and save hypergrid (all combinations of targets and predictors combinations)
if(!file.exists("dev/hypergrid.csv")){
  #create hypergrid and save to dev
  hypergrid <- expand.grid(target = targets, predictors = predictor_combinations) |> as.data.table()
  
  #add identifier
  hypergrid[, model := paste0("model", 1:nrow(hypergrid))]
  
  #save to dev
  fwrite(hypergrid, file = "dev/hypergrid.csv")
} else{
  #if file exists read
  hypergrid <- fread("dev/hypergrid.csv")
}


#loop through hypergrid, create GAM models
  #progressbar
pb <- txtProgressBar(min = 1, max = nrow(hypergrid), style = 3)
for(i in 1:nrow(hypergrid)){
  #update progressbar
  setTxtProgressBar(pb, i)
  
  #select target
  target <- hypergrid[i,]$target
  
  #select predictors
  predictors <- hypergrid[i,]$predictors
  
  #create formula
  gam.formula <- as.formula(paste0(target, "~", predictors))
  
  #run gam
  gam.model <- gam(gam.formula, data = d)
  
  #save gam model do dev/model
  saveRDS(gam.model, file = paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
}

#example where you extract model performances
for(i in 1:nrow(hypergrid)){
  #read the right model
  rel.model <- readRDS(paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
  
  #extract model performance, add to hypergrid
  hypergrid[i, R2 := summary(rel.model)[["r.sq"]]]
}

#arrange hypergrid on target and r2
hypergrid <- dplyr::arrange(hypergrid, hypergrid$target, desc(hypergrid$R2))

Which would give

head(hypergrid)
  target                                predictors    model        R2
1:      y             s(x0) s(x1) s(x2) s(x4) s(x5) model319 0.6957242
2:      y       s(x0) s(x1) s(x2) s(x3) s(x4) s(x5) model423 0.6953753
3:      y       s(x0) s(x1) s(x2) s(x4) s(x5) s(x7) model437 0.6942054
4:      y                   s(x0) s(x1) s(x2) s(x5) model175 0.6941025
5:      y       s(x0) s(x1) s(x2) s(x4) s(x5) s(x6) model435 0.6940569
6:      y s(x0) s(x1) s(x2) s(x3) s(x4) s(x5) s(x7) model481 0.6939756

All models are saved to a folder with an identifier (for if you want to use the model or extract more information from the model).

Notably, p-hacking comes to mind using this appraoch and I would be careful by conducting your analysis like this.