I am trying a variable screening using the SIS package in R using different tunings and penalties. I have for loops which will take long for relatively large data. I am trying to parallelize this piece of code for efficiency. But I am running into some errors. Please kindly help if you can. Thanks for your time and help.
#load library
library(parallel)
library(doParallel)
library(foreach)
library(SIS)
library(dplyr)
data('leukemia.train', package = 'SIS') #data for practice
y.train = leukemia.train[,dim(leukemia.train)[2]]
x.train = as.matrix(leukemia.train[,-dim(leukemia.train)[2]])
x.train = standardize(x.train)
#penalties for screening
penalty <- c("lasso", "SCAD", "MCP")
#storeage
RESULT <- NULL
alldat <- NULL
for(pen in penalty){
#tuning para
tune <- c("aic", "bic", "ebic", "cv")
#storage
OUT <- NULL
dat <- NULL
for(tun in tune){
#SIS model for ultra-high dimensional screening
mod=SIS(x = x.train, y = y.train, family = 'binomial',
penalty = pen, tune = tun, varISIS = 'aggr', seed = 21) #model
out <- mod$ix
coff <- mod$coef.est
x <- x.train %>% as.data.frame()
dat0 <- x[c(out)]
if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
df1 <- coff %>% as.data.frame()
OUT[[tun]] <- cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL))
names(OUT[tun]) <- paste(tun)
dat[[tun]] <- dat0
#store as list for cases
names(dat[tun]) <- paste(tun)
}
#list of all results of coef
RESULT[[pen]] <- OUT
dat #list of data sets
alldat[[pen]] <-
names(RESULT[pen]) <- paste(pen)
names(alldat[pen]) <- paste(pen)
}
#parallelize here
pentune.df <- expand.grid(
tune = c("aic", "bic", "ebic", "cv"),
penalty = c("lasso", "SCAD", "MCP")
)# use expand for to obtain possible combinations
#create and register cluster
n.cores <- parallel::detectCores() - 2
my.cluster <- parallel::makeCluster(n.cores)
doParallel::registerDoParallel(cl = my.cluster)
foreach(
tun = pentune.df$tun,
pena = pentune.df$pena,
.combine = 'list',
.packages = "SIS"
) %dopar% {
#fit model
mod <- SIS(x = x.train, y = y.train, family = 'binomial',
penalty = pena, tune = tun, varISIS = 'aggr', seed = 21)
out <- mod$ix
coff <- mod$coef.est
x <- as.data.frame(x.train)
dat0 <- x[c(out)]
if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
df1 <- as.data.frame(coff)
OUT <- return(cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL)))
}
parallel::stopCluster(cl = my.cluster) #end job
CodePudding user response:
normally it is best if you can narrow in on the error that you are getting it makes it easier to help. The main issue seemed to be simplifying your iterator within the foreach
and ensuring the penalty
and tune
variables for SIS
are character. The expand.grid
function is exactly what you need but the resulting columns are factors. So these need to be converted back when inserting into the SIS
function.
Finally, in your last line of the %dopar% {}
don't define a variable and you don't need to return. The last object returns automatically. So you can remove OUT <- return()
.
I have added some comments in the code below to indicate exactly what I have changed.
foreach(
i = 1:nrow(pentune.df), # define a simpler iterator
.combine = 'list',
.packages = "SIS"
) %dopar% {
# define loop variables and ensure they are character
pena <- as.character(pentune.df[i, 'penalty'])
tun <- as.character(pentune.df[i, 'tune'])
#fit model
mod <- SIS(x = x.train, y = y.train, family = 'binomial',
penalty = pena, tune = tun, varISIS = 'aggr', seed = 21)
out <- mod$ix
coff <- mod$coef.est
x <- as.data.frame(x.train)
dat0 <- x[c(out)]
if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
df1 <- as.data.frame(coff)
# don't define a variable here just create the object you want
cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL))
}