Home > Back-end >  Parallelizing codes for efficiency in R
Parallelizing codes for efficiency in R

Time:07-29

I am trying a variable screening using the SIS package in R using different tunings and penalties. I have for loops which will take long for relatively large data. I am trying to parallelize this piece of code for efficiency. But I am running into some errors. Please kindly help if you can. Thanks for your time and help.

#load library
library(parallel)
library(doParallel)
library(foreach)
library(SIS)
library(dplyr)
data('leukemia.train', package = 'SIS') #data for practice
y.train = leukemia.train[,dim(leukemia.train)[2]]
x.train = as.matrix(leukemia.train[,-dim(leukemia.train)[2]])
x.train = standardize(x.train)
#penalties for screening
penalty <- c("lasso", "SCAD", "MCP")
#storeage
RESULT <- NULL 
alldat <- NULL
for(pen in penalty){
  #tuning para
  tune  <- c("aic", "bic", "ebic", "cv")
#storage
  OUT <- NULL
  dat <- NULL
  for(tun in tune){
#SIS model for ultra-high dimensional screening
    mod=SIS(x = x.train, y = y.train, family = 'binomial',
            penalty = pen, tune = tun, varISIS = 'aggr', seed = 21)  #model
    out <- mod$ix
    coff <- mod$coef.est
    x <- x.train %>% as.data.frame()
    dat0 <- x[c(out)]
    if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
    df1 <- coff %>% as.data.frame()
    OUT[[tun]] <- cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL))
    names(OUT[tun]) <- paste(tun)
    dat[[tun]] <- dat0
     #store as list for cases
    names(dat[tun]) <- paste(tun)
    }
#list of all results of coef
  RESULT[[pen]] <- OUT
dat #list of data sets 
  alldat[[pen]] <- 
  names(RESULT[pen]) <- paste(pen)
  names(alldat[pen]) <- paste(pen)
}

#parallelize here
pentune.df <- expand.grid(
  tune  = c("aic", "bic", "ebic", "cv"),
  penalty = c("lasso", "SCAD", "MCP")
)# use expand for to obtain possible combinations

#create and register cluster
n.cores <- parallel::detectCores() - 2
my.cluster <- parallel::makeCluster(n.cores)
doParallel::registerDoParallel(cl = my.cluster)
foreach(
  tun = pentune.df$tun,
  pena = pentune.df$pena,
  .combine = 'list', 
  .packages = "SIS"
) %dopar% {
  
  #fit model
  mod <- SIS(x = x.train, y = y.train, family = 'binomial',
          penalty = pena, tune = tun, varISIS = 'aggr', seed = 21)
  out <- mod$ix
  coff <- mod$coef.est
  x <- as.data.frame(x.train)
  dat0 <- x[c(out)]
  if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
  df1 <- as.data.frame(coff)
  OUT <- return(cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL)))
  }
parallel::stopCluster(cl = my.cluster) #end job

CodePudding user response:

normally it is best if you can narrow in on the error that you are getting it makes it easier to help. The main issue seemed to be simplifying your iterator within the foreach and ensuring the penalty and tune variables for SIS are character. The expand.grid function is exactly what you need but the resulting columns are factors. So these need to be converted back when inserting into the SIS function.

Finally, in your last line of the %dopar% {} don't define a variable and you don't need to return. The last object returns automatically. So you can remove OUT <- return().

I have added some comments in the code below to indicate exactly what I have changed.

foreach(
  i = 1:nrow(pentune.df), # define a simpler iterator
  .combine = 'list', 
  .packages = "SIS"
) %dopar% {
  
  # define loop variables and ensure they are character
  pena <- as.character(pentune.df[i, 'penalty']) 
  tun <- as.character(pentune.df[i, 'tune'])

  #fit model
  mod <- SIS(x = x.train, y = y.train, family = 'binomial',
             penalty = pena, tune = tun, varISIS = 'aggr', seed = 21)
  out <- mod$ix
  coff <- mod$coef.est
  x <- as.data.frame(x.train)
  dat0 <- x[c(out)]
  if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
  df1 <- as.data.frame(coff)

  # don't define a variable here just create the object you want
  cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL))
}
  •  Tags:  
  • r
  • Related