A 340436 x 763717 sparse matrix with 35879680 non-zero elements will result in a very large object (~30 GB). My machine isn't able to hold that object in memory with 16GB RAM. However, the cross product is straightforward to do piecemeal. The function bigcrossprod
below performs the cross product in piecemeal, converts the results to data.table
objects, and then rbinds
the objects. The crossprod
operation is broken into nseg
separate operations.
library(data.table)
library(Matrix)
bigcrossprod <- function(m, nseg) {
jmax <- ncol(m)
# get the indices to split the columns of m into chunks that will have
# approximately the same expense for crossprod
sumj <- cumsum(as.numeric(jmax:1))
dtj <- data.table(
j = 1:jmax,
int = sumj %/% (sumj[jmax]/nseg + 1)
)[
, .(idx1 = min(j), idx2 = max(j)), int
][, idx1:idx2]
rbindlist(
lapply(
1:nseg,
function(seg) {
cat("\r", seg) # user feedback
j1 <- dtj$idx1[seg]
m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg],drop = FALSE],
m[, j1:jmax])), "dgTMatrix")
data.table(
i = attr(m2, "i") + j1,
j = attr(m2, "j") + j1,
v = attr(m2, "x")
)
}
)
)
}
Demonstrating with a somewhat smaller sparse matrix:
idx <- sample(34043*76371, 358796)
dt <- data.table(i = as.integer(((idx - 1) %% 34043L) + 1),
j = as.integer(((idx - 1) %/% 34043L) + 1),
key = "j")[, v := 1/sqrt(.N), j]
m <- sparseMatrix(dt$i, dt$j, x = dt$v)
# calculate crossprod on the full matrix and convert the result to triplet
# form in order to represent as a data.table
m2 <- as(crossprod(m), "dgTMatrix")
dt2 <- data.table(i = attr(m2, "i") + 1L,
j = attr(m2, "j") + 1L,
v = attr(m2, "x"))
# calculate crossprod in 10 chunks
dt3 <- bigcrossprod(m, 10)
#> 1 2 3 4 5 6 7 8 9 10
# convert the result into a symmetric sparse matrix in triplet form (the same as m2)
m3 <- as(forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)), "dgTMatrix")
# remove elements from the data.table objects that are redundant due to symmetry
dt2 <- unique(dt2[i > j, `:=`(i = j, j = i)])
dt3 <- setorder(dt3[i > j, `:=`(i = j, j = i)], i, j)
# check that the dgTMatrix and data.table objects are identical
identical(m2, m3)
#> [1] TRUE
identical(dt2, dt3)
#> [1] TRUE
In order to calculate the cross product of the 340436 x 763717 sparse matrix with 35879680 elements, instead of storing the list of data.table
objects in a list to pass to rbindlist
, save the individual data.table objects for later processing using the fst
package. Instead of returning a single data.table
, the following version of bigcrossprod
returns a character vector of length nseg
containing .fst file paths. Again, demonstrating with the smaller matrix:
library(data.table)
library(Matrix)
library(fst)
bigcrossprod <- function(m, nseg, path) {
jmax <- ncol(m)
# get the indices to split the columns of m into chunks that will have
# approximately the same expense for crossprod
sumj <- cumsum(as.numeric(jmax:1))
dtj <- data.table(
j = 1:jmax,
int = sumj %/% (sumj[jmax]/nseg + 1)
)[
, .(idx1 = min(j), idx2 = max(j)), int
][, idx1:idx2]
vapply(
1:nseg,
function(seg) {
cat("\r", seg) # user feedback
j1 <- dtj$idx1[seg]
m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE],
m[, j1:jmax])), "dgTMatrix")
filepath <- file.path(path, paste0("dt", seg, ".fst"))
write.fst(
data.table(
i = attr(m2, "i") + j1,
j = attr(m2, "j") + j1,
v = attr(m2, "x")),
filepath
)
filepath
},
character(1)
)
}
idx <- sample(34043*76371, 358796)
dt <- data.table(i = as.integer(((idx - 1) %% 34043L) + 1),
j = as.integer(((idx - 1) %/% 34043L) + 1),
key = "j")[, v := 1/sqrt(.N), j]
m <- sparseMatrix(dt$i, dt$j, x = dt$v)
# calculate crossprod on the full matrix and convert the result to triplet
# form in order to represent as a data.table
m2 <- as(crossprod(m), "dgTMatrix")
dt2 <- data.table(i = attr(m2, "i") + 1L,
j = attr(m2, "j") + 1L,
v = attr(m2, "x"))
# calculate crossprod in 10 chunks, read the resulting .fst files,
# and rbindlist into a single data.table
dt3 <- rbindlist(lapply(bigcrossprod(m, 10, "C:/temp"),
function(x) read.fst(x, as.data.table = TRUE)))
#> 1 2 3 4 5 6 7 8 9 10
# convert the result into a symmetric sparse matrix in triplet form (the same as m2)
m3 <- as(forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)), "dgTMatrix")
# remove elements from the data.table objects that are redundant due to symmetry
dt2 <- unique(dt2[i > j, `:=`(i = j, j = i)])
dt3 <- setorder(dt3[i > j, `:=`(i = j, j = i)], i, j)
# check that the dgTMatrix and data.table objects are identical
identical(m2, m3)
#> [1] TRUE
identical(dt2, dt3)
#> [1] TRUE
I was able process a 340436 x 763717 sparse matrix with 35879680 non-zero elements in about 15 minutes with 16GB of RAM.
Explanation:
A walkthrough of the logic in bigcrossprod
using the OP's 5 x 3 example matrix:
idx <- c(1,2,5,7,9,13:15)
dt <- data.table(i = as.integer(((idx - 1) %% 5) + 1),
j = as.integer(((idx - 1) %/% 5) + 1),
key = "j")[, v := 1/sqrt(.N), j]
(m <- sparseMatrix(dt$i, dt$j, x = dt$v))
#> 5 x 3 sparse Matrix of class "dgCMatrix"
#>
#> [1,] 0.5773503 . .
#> [2,] 0.5773503 0.7071068 .
#> [3,] . . 0.5773503
#> [4,] . 0.7071068 0.5773503
#> [5,] 0.5773503 . 0.5773503
nseg <- 2 # process the cross product of m in two chunks
jmax <- ncol(m)
sumj <- cumsum(as.numeric(jmax:1))
# In order to balance the processing between chunks, process column 1 first,
# then columns 2:3 next. Each crossprod call will result in 3 elements.
(dtj <- data.table(j = 1:jmax, int = sumj %/% (sumj[jmax]/nseg + 1))[, .(idx1 = min(j), idx2 = max(j)), int][, idx1:idx2])
#> idx1 idx2
#> 1: 1 1
#> 2: 2 3
# process the first chunk
seg <- 1L
j1 <- dtj$idx1[seg]
# cross product of column 1 and the full matrix
(m2 <- as(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE], m[, j1:jmax]), "dgTMatrix"))
#> 1 x 3 sparse Matrix of class "dgTMatrix"
#>
#> [1,] 1 0.4082483 0.3333333
# The indices of m2 are 0-based. Add j1 to the i,j indices of m2 when converting
# to a data.table.
(dt1 <- data.table(i = attr(m2, "i") + j1, j = attr(m2, "j") + j1, v = attr(m2, "x")))
#> i j v
#> 1: 1 1 1.0000000
#> 2: 1 2 0.4082483
#> 3: 1 3 0.3333333
# process the second chunk
seg <- 2L
j1 <- dtj$idx1[seg] # starts at column 2
# Cross product of columns 2:3 and columns 2:jmax (2:3). The end result
# will be a symmetric matrix, so we need only the upper triangular matrix.
(m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE], m[, j1:jmax])), "dgTMatrix"))
#> 2 x 2 sparse Matrix of class "dgTMatrix"
#>
#> [1,] 1 0.4082483
#> [2,] . 1.0000000
(dt2 <- data.table(i = attr(m2, "i") + j1, j = attr(m2, "j") + j1, v = attr(m2, "x")))
#> i j v
#> 1: 2 2 1.0000000
#> 2: 2 3 0.4082483
#> 3: 3 3 1.0000000
# the final matrix (in data.table form) is the two data.tables row-bound
# together (in bigcrossprod, the lapply returns a list of data.table objects for
# rbindlist)
(dt3 <- rbindlist(list(dt1, dt2)))
#> i j v
#> 1: 1 1 1.0000000
#> 2: 1 2 0.4082483
#> 3: 1 3 0.3333333
#> 4: 2 2 1.0000000
#> 5: 2 3 0.4082483
#> 6: 3 3 1.0000000
# dt3 can be converted to a symmetric sparse matrix
(m3 <- forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)))
#> 3 x 3 sparse Matrix of class "dsCMatrix"
#>
#> [1,] 1.0000000 0.4082483 0.3333333
#> [2,] 0.4082483 1.0000000 0.4082483
#> [3,] 0.3333333 0.4082483 1.0000000
And, finally, a parallel version of bigcrossprod
(for Linux):
library(data.table)
library(Matrix)
library(parallel)
library(fst)
bigcrossprod <- function(m, nseg, path, nthreads = nseg) {
jmax <- ncol(m)
# get the indices to split the columns of m into chunks that will have
# approximately the same expense for crossprod
sumj <- cumsum(as.numeric(jmax:1))
dtj <- data.table(
j = 1:jmax,
int = sumj %/% (sumj[jmax]/nseg + 1)
)[
, .(idx1 = min(j), idx2 = max(j)), int
][, idx1:idx2]
cl <- makeCluster(nthreads, type = "FORK", outfile = "")
on.exit(stopCluster(cl))
unlist(
parLapply(
cl,
1:nseg,
function(seg) {
j1 <- dtj$idx1[seg]
m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg],drop = FALSE],
m[, j1:jmax])), "dgTMatrix")
filepath <- file.path(path, paste0("dt", seg, ".fst"))
write.fst(
data.table(
i = attr(m2, "i") + j1,
j = attr(m2, "j") + j1,
v = attr(m2, "x")),
filepath
)
filepath
}
)
)
}