I have a large banded sparse matrix S with few non-zero elements in its rows and columns. And I have vector b, I want to calculate sum(S*tcrossprod(b)). But because the data is too large, the system can't calculate it, I need to multiply the non-zero parts in S, how can I do that? Thanks! Here is the example data:
S = matrix(rnorm(64),8,8)
bw = 3
pick = (row(S)<(col(S) - bw)) | (row(S)>(col(S) bw))
S[pick] = 0
S.r <- sample(1:8,40,replace = T)
S.c <- sample(1:8,40,replace = T)
for (i in 1:length(S.r)) {
S[S.r[i],S.c[i]] <-0
}
S <- as(S,"sparseMatrix")
rowlab = collab = NULL
for (ii in 1:S@Dim[2]) {
n <- S@i[(S@p[ii] 1):S@p[ii 1]]
rowlab <-c(rowlab, n)
collab <- c(collab, rep(ii,length(n)))
lab.b <- data.frame(rowlab = rowlab 1, collab = collab)
}
b = rnorm(8)
And I want to get
sum(S*tcrossprod(b))
For the real data, the dimension of S is c(20000,20000), and the dimension of b is c(20000,1).
CodePudding user response:
You are running into memory issues due to the dense matrix returned from tcrossprod(b)
. But as many of the terms of this are getting multiplied by zero we can avoid calculating some of the values (avoid generating the full cross-product). You can grab the indices of the non-zero elements of the sparse matrix S
, use these to index the vector b
and multiply. Below are a couple of ways to calculate S * tcrossprod(b)
:
library(Matrix)
# Several functions to calculate the matrix products:
f1 <- function(S, b) S*tcrossprod(b)
f2 <- function(S, b) (S * b) %*% Diagonal(x=b)
f3 <- function(S, b) {indices = summary(S); S@x = S@x * b[indices$i]*b[indices$j]; S}
f4 <- function(S, b) {dp = diff(S@p); j = rep(seq_along(dp),dp); S@x = S@x * b[S@i 1]*b[j]; S}
# Check equality of function results
all.equal(f1(S, b), f2(S, b))
# [1] TRUE
all.equal(f2(S, b), f3(S, b))
# [1] TRUE
all.equal(f1(S, b), f3(S, b))
# [1] TRUE
all.equal(f1(S, b), f4(S, b))
# [1] TRUE
all.equal(f2(S, b), f4(S, b))
# [1] TRUE
all.equal(f3(S, b), f4(S, b))
# [1] TRUE
Benchmark:
rbenchmark::benchmark(f1(S, b), f2(S, b), f3(S, b), f4(S, b), replications=1) # 1 rep as f1() is slow
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 f1(S, b) 1 12.182 12182 7.972 4.932 0 0
# 2 f2(S, b) 1 0.003 3 0.003 0.000 0 0
# 3 f3(S, b) 1 0.002 2 0.002 0.000 0 0
# 4 f4(S, b) 1 0.001 1 0.001 0.000 0 0
# longer benchmark for the other functions, drop f1()
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 f2(S, b) 100 0.363 2.556 0.305 0.042 0 0
# 2 f3(S, b) 100 0.279 1.965 0.255 0.024 0 0
# 3 f4(S, b) 100 0.142 1.000 0.142 0.000 0 0
As you just want the sum
it is enough to do
dp = diff(S@p); j = rep(seq_along(dp),dp); sum(S@x * b[S@i 1]*b[j])
Data:
n = 1e4 # I dont have enough memeory on my laptop for 20k
set.seed(1)
b = rnorm(n)
S = matrix(rnorm(n*n), n, n)
bw = 3
pick = (row(S)<(col(S) - bw)) | (row(S)>(col(S) bw))
S[pick] = 0
S.r <- sample(1:n,40,replace = T)
S.c <- sample(1:n,40,replace = T)
for (i in 1:length(S.r)) {
S[S.r[i],S.c[i]] <-0
}
S <- as(S,"sparseMatrix")