To cross-validate the CCLE (Cancer Cell Line Encyclopedia) drug data, I tried to convert the following code from MATLAB to R, but without success. The MATLAB code works fine and creates both a *cross.mat file, which holds the 10-fold CV groups for each data set, and a *data.mat file, which holds the grouped data from 10 repetitions of CV for each data set.
I would appreciate it if you could help me find my mistake.
# This function groups the data for 10-fold cross-validation.
# Randomly split the non-zero entries of a matrix into 10 folds.
#
# Arguments:
#   MM  A numeric (or complex) matrix. A data frame is coerced with
#       as.matrix() first.
#
# Returns:
#   A list of 10 matrices with the same dimensions as MM. Matrix i is zero
#   everywhere except at the positions assigned to fold i, where it holds the
#   corresponding values of MM. Every non-zero entry of MM appears in exactly
#   one fold, so the 10 matrices sum to MM (entries that are NA are skipped
#   because which() ignores them).
#
# The first 10 * floor(N / 10) shuffled entries are dealt out in equal
# consecutive chunks; the remaining N %% 10 entries go one each to folds
# 1, 2, ... in order.
getcrossMatrixs <- function(MM) {
  if (is.data.frame(MM)) {
    MM <- as.matrix(MM)
  }
  if (!is.matrix(MM) || !(is.numeric(MM) || is.complex(MM))) {
    stop("`MM` must be a numeric matrix (or a data frame of numeric columns)",
         call. = FALSE)
  }

  n_folds <- 10L

  # Linear (column-major) indices of the non-zero cells, in random order.
  # sample.int() on the count is used instead of sample() on the indices
  # because sample(x) with a single number x samples from 1:x.
  nonzero <- which(MM != 0)
  n <- length(nonzero)
  shuffled <- nonzero[sample.int(n)]

  # Fold label for every shuffled entry. seq_len() yields an empty vector when
  # there are no leftovers, unlike 1:k which would count 1, 0.
  fold_size <- n %/% n_folds
  leftover <- n - n_folds * fold_size
  fold_id <- c(rep(seq_len(n_folds), each = fold_size), seq_len(leftover))

  zero_m <- matrix(0, nrow = nrow(MM), ncol = ncol(MM))

  # The list built here is the function's value; the original ended in a
  # `for` loop and therefore returned NULL.
  lapply(seq_len(n_folds), function(fold) {
    picked <- shuffled[fold_id == fold]
    out <- zero_m
    out[picked] <- MM[picked]
    out
  })
}
# The following lines are the main script that calls the function above.
# Driver: build 10 independent 10-fold splits of MM in parallel.
library(foreach)
n.cores <- parallel::detectCores()
my.cluster <- parallel::makeCluster(
  n.cores,
  type = "PSOCK"
)
print(my.cluster)
#> socket cluster with 16 nodes on host 'localhost'
doParallel::registerDoParallel(cl = my.cluster)
foreach::getDoParRegistered()
#> [1] TRUE
# To load the real data, use as.matrix() rather than matrix(): matrix() applied
# to a data frame produces a list-mode matrix, and `rownames.force` is an
# argument of as.matrix(), not of matrix().
#MM <- as.matrix(read_csv("MM.csv", col_names = FALSE, show_col_types = FALSE), rownames.force = NA)
MM <- matrix(seq(0, 4.5, length.out = 11784), nrow = 491) # data matrix shaped like the CCLE drug activity-area sensitivity matrix (491 x 24)
# foreach() returns the value of each iteration as a list element. Assignments
# made inside %dopar% run on the workers and never reach this session, so the
# result of foreach() itself must be assigned.
CCLEdata <- foreach(i = 1:10) %dopar% {
  getcrossMatrixs(MM)
}
parallel::stopCluster(my.cluster)
# NOTE: the list of NULLs shown below is what the original, unassigned loop
# printed; with the result assigned to CCLEdata nothing is printed here.
#> [[1]]
#> NULL
#>
#> [[2]]
#> NULL
#>
#> [[3]]
#> NULL
#>
#> [[4]]
#> NULL
#>
#> [[5]]
#> NULL
#>
#> [[6]]
#> NULL
#>
#> [[7]]
#> NULL
#>
#> [[8]]
#> NULL
#>
#> [[9]]
#> NULL
#>
#> [[10]]
#> NULL
Created on 2022-08-29 with reprex v2.0.2
When I use the original CCLE dataset instead, main.R fails with a different error:
Error in { : task 1 failed - "is.numeric(x) || is.complex(x) is not TRUE"
or
Error in { :
task 1 failed - "attempt to select less than one element in integerOneIndex"
%These are from Matlab
function [crossdata] = getcrossMatrixs(MM)
% GETCROSSMATRIXS  Randomly split the non-zero entries of MM into 10 folds.
%   crossdata = getcrossMatrixs(MM) returns a 1-by-10 cell array. Each cell
%   holds a matrix of the same size as MM that is zero everywhere except at
%   the positions assigned to that fold, where it carries the values of MM.
%   The first 10*floor(N/10) shuffled entries are dealt out in equal
%   consecutive chunks; the remaining entries go one each to folds 1, 2, ...
N = nnz(MM(:));
zeroM = zeros(size(MM));
D = randperm(N);                 % random order over the non-zero entries
first = floor(N/10);             % size of each equal chunk
[nrows,ncols] = find(MM);        % row/column subscripts of the non-zeros
crossdata = cell(1, 10);         % preallocate instead of growing {}
for i = 1:10
    crossdata{i} = zeroM;
end
for i = 1:10
    for j = 1 + (i-1)*first : i*first
        crossdata{i}(nrows(D(j)),ncols(D(j))) = MM(nrows(D(j)),ncols(D(j)));
    end
end
% Distribute the N - 10*first leftover entries, one per fold.
k = N - 10*first;
i = 10*first + 1;
for j = 1:k
    crossdata{j}(nrows(D(i)),ncols(D(i))) = MM(nrows(D(i)),ncols(D(i)));
    i = i + 1;
end
end
% Load MM and build 10 independent 10-fold splits, one per parfor iteration.
load('MM.mat')
parfor rep = 1:10
    CCLEdata{rep} = getcrossMatrixs(MM);
end
CodePudding user response:
I didn't look too closely to figure out what was wrong; instead, I based this function on the MATLAB function supplied. Note that for this particular example, going parallel is more expensive because of the overhead. Parallel execution only pays off with large enough matrices and/or more samples.
library(parallel)
# Example data: a 491 x 24 matrix of evenly spaced values from 0 to 4.5, so
# only the first entry is zero.
MM <- matrix(seq(0, 4.5, length.out = 11784), nrow = 491)
# Randomly split the non-zero entries of a matrix into `parts` folds.
#
# Arguments:
#   MM     A numeric matrix.
#   parts  Number of folds (default 10).
#
# Returns:
#   A list of `parts` matrices with the dimensions of MM. Matrix i is zero
#   except at the positions assigned to fold i, where it holds the values of
#   MM. Fold sizes differ by at most one: the first `length(D) %% parts` folds
#   receive one extra entry.
getcrossMatrixs <- function(MM, parts = 10L) {
  # Shuffle positions rather than calling sample() on the indices directly:
  # sample(x) with a single number x would sample from 1:x instead.
  nz <- which(MM != 0)
  D <- nz[sample.int(length(nz))]
  first <- length(D) %/% parts
  last <- length(D) %% parts
  sizes <- c(rep(first + 1L, last), rep(first, parts - last))
  idx <- c(0L, cumsum(sizes))
  mZero <- matrix(0, nrow(MM), ncol(MM))
  lapply(seq_len(parts), function(i) {
    # seq_len() gives an empty selection for an empty fold, whereas
    # (idx[i] + 1):idx[i + 1] would count downward and index out of range.
    sel <- D[idx[i] + seq_len(sizes[i])]
    m <- mZero
    m[sel] <- MM[sel]
    m
  })
}
# Build `reps` independent fold sets in parallel, then verify them.
reps <- 10L
# Leave one core free, but never ask for fewer than one worker; detectCores()
# may return NA or 1, which would otherwise give an invalid cluster size.
n_workers <- max(1L, min(detectCores() - 1L, reps), na.rm = TRUE)
clust <- makeCluster(n_workers)
# Workers are fresh R sessions, so the function and data must be shipped over.
clusterExport(clust, c("getcrossMatrixs", "MM"))
# tryCatch(..., finally =) makes sure the workers are shut down even if a
# task fails.
CCLEdata <- tryCatch(
  parLapply(clust, 1:reps, function(x) getcrossMatrixs(MM)),
  finally = stopCluster(clust)
)
# check that each set of matrices returned has all elements of MM
identical(rep(list(MM), reps), lapply(1:reps, function(i) Reduce("+", CCLEdata[[i]], matrix(0, nrow(MM), ncol(MM)))))
#> [1] TRUE
And here's a cleaned-up version of the Matlab function:
function [crossdata] = getcrossMatrixs(MM)
% GETCROSSMATRIXS  Randomly split the non-zero entries of MM into 10 folds.
%   crossdata = getcrossMatrixs(MM) returns a 10-by-1 cell array. Each cell
%   holds a matrix of the same size as MM that is zero everywhere except at
%   the positions assigned to that fold, where it carries the values of MM.
idx = find(MM);                  % linear indices of the non-zero entries
N = numel(idx);                  % was length(nrows), an undefined variable
zeroM = zeros(size(MM));
idx = idx(randperm(N));          % shuffle the non-zero positions
first = floor(N/10);             % size of each equal chunk
crossdata = cell(10, 1);
for i = 1:10
    crossdata{i} = zeroM;
end
for i = 1:10
    j = 1 + (i - 1)*first : i*first;
    crossdata{i}(idx(j)) = MM(idx(j));
end
% Distribute the N - 10*first leftover entries, one per fold.
k = N - 10*first;
j = 10*first + 1;
for i = 1:k
    crossdata{i}(idx(j)) = MM(idx(j));
    j = j + 1;
end
end