2

Given a matrix like mat

> set.seed(1)
> mat <- matrix(rbinom(100,1,0.5),10,10)
> rownames(mat) <- paste0(sample(LETTERS[1:2],10,replace=T),c(1:nrow(mat)))
> colnames(mat) <- paste0(sample(LETTERS[1:2],10,replace=T),c(1:ncol(mat)))
> mat
    A1 A2 A3 B4 B5 B6 B7 A8 B9 B10
B1   0  0  1  0  1  0  1  0  0   0
B2   0  0  0  1  1  1  0  1  1   0
B3   1  1  1  0  1  0  0  0  0   1
A4   1  0  0  0  1  0  0  0  0   1
A5   0  1  0  1  1  0  1  0  1   1
A6   1  0  0  1  1  0  0  1  0   1
A7   1  1  0  1  0  0  0  1  1   0
B8   1  1  0  0  0  1  1  0  0   0
A9   1  0  1  1  1  1  0  1  0   1
A10  0  1  0  0  1  0  1  1  0   1

I want to sample submatrices of the form:

   A  B
A  0  1
B  1  0

EDIT: Specifically, the submatrix should contain a 1 in the A-row/B-column, a 1 in the B-row/A-column, and 0s in the other two cells.

I can do this by randomly selecting one A-row and one B-row, then randomly selecting one A-column and one B-column, then checking whether it has the desired pattern. But, I'm trying to find a more efficient method that will work even in large/sparse matrices. Thanks!

Zachary
  • 345
  • 1
  • 8
  • 1
    If the values in the EDIT are fixed, then I don't understand the reason for sampling – akrun Apr 28 '22 at 16:29
  • I intend to locate those submatrices, then swap the 0s and 1s. The goal is to get 1s in [A,A] and [B,B] cells, and 0s in [A,B] and [B,A] cells, while preserving the marginals. – Zachary Apr 28 '22 at 20:57

2 Answers2

3

You can sample on the dimension names:

set.seed(1)
dims <- lapply(dimnames(mat), \(x) c(sample(which(grepl("A", x)), 1), sample(which(grepl("B", x)), 1)))

mat[dims[[1]], dims[[2]]]

   A3 B4
A4  0  0
B8  0  0
Maël
  • 45,206
  • 3
  • 29
  • 67
  • This is an elegant way to pick an A-row, B-row, A-column, and B-column. However, in a large/sparse matrix it's still likely likely that this submatrix won't have the desired pattern of 0s and 1s. – Zachary Apr 28 '22 at 15:03
  • What about when there are no A or B rows/column with 1 or 0? – Maël Apr 28 '22 at 15:14
  • In that case, ideally the sampling function would return NULL, since the set is empty. – Zachary Apr 28 '22 at 15:26
2

One could enumerate all possible pairwise combinations of elements containing the value 1, then eliminate pairs that share a row or column and pairs that would not result in 0 elements for the submatrix main diagonal. The resulting rows and columns from each remaining pair would define all possible submatrices that meet the desired pattern and these would be trivial to sample. This is feasible for matrices with a relatively small number of 1 elements (e.g., <100K--depending on available memory).

For sparse matrices with a large number of 1 elements, a straightforward way to get an efficient, vectorized solution is also with rejection: sample pairs of 1 elements for each submatrix's antidiagonal and reject if the corresponding main diagonal elements are not 0. The below solution assumes more elements are 0 than 1. (If the opposite is true, it should be modified to sample two 0 elements for the main diagonal and reject if the antidiagonal elements are not 1.) The rejection rate will depend mainly on the density of the sparse matrix. The example matrix is rather dense, so it has a relatively high rejection rate.

library(data.table)
library(Matrix)

set.seed(1)
m <- matrix(rbinom(100,1,0.5),10,10)
n <- 20L # sample 20 pairs (before rejection)
m <- as(m, "ngTMatrix")
mIdx <- matrix(sample(length(m@i), 2L*n, TRUE), ncol = 2)
(data.table(
  row1 = m@i[mIdx[,1]],
  col1 = m@j[mIdx[,2]],
  row2 = m@i[mIdx[,2]],
  col2 = m@j[mIdx[,1]]
) + 1L)[
  row1 != row2 & col1 != col2 & !(m[matrix(c(row1, col1), n)] + m[matrix(c(row2, col2), n)])
]
#>    row1 col1 row2 col2
#> 1:    1    4    2    7
#> 2:   10    6    2   10
#> 3:    7    7    8    9

Here it is implemented as a function that returns a specified number of samples either with or without replacement.

sampleSubMat <- function(m, n, replace = FALSE, maxIter = 10L) {
  # convert m to a sparse matrix in triplet format if it's not already
  if (!grepl("TMatrix", class(m))) m <- as(1*m, "dgTMatrix")
  nLeft <- n
  # over-sample based on dimensions and density of the matrix
  mult <- 1.1/(1 - length(m@i)/prod(dim(m)))^2/prod(1 - 1/(dim(m - 1)))
  iter <- 1L
  
  if (replace) { # sampling with replacement (duplicates allowed)
    # more efficient to store individual data.table objects from each
    # iteration in a list, then rbindlist at the end
    lDT <- vector("list", maxIter)
    
    while (nLeft > 0L) {
      if (iter > maxIter) {
        message(sprintf("Max iterations (%i) reached", maxIter))
        # print("Max iterations reached")
        return(rbindlist(lDT[1:(iter - 1L)])[1:n])
      }
      nCurr <- ceiling(nLeft*mult)
      mIdx <- matrix(sample(length(m@i), 2L*nCurr, TRUE), ncol = 2)
      lDT[[iter]] <- (data.table(
        row1 = m@i[mIdx[,1]],
        col1 = m@j[mIdx[,2]],
        row2 = m@i[mIdx[,2]],
        col2 = m@j[mIdx[,1]]
      ) + 1L)[
        row1 != row2 & col1 != col2 & !(m[matrix(c(row1, col1), nCurr)] + m[matrix(c(row2, col2), nCurr)])
      ]
      if (nrow(lDT[[iter]])) {
        mult <- 1.1*mult*nLeft/nrow(lDT[[iter]])
        nLeft <- nLeft - nrow(lDT[[iter]])
        iter <- iter + 1L
      } else {
        # no pattern found, double the samples for the next iteration
        mult <- mult*2
      }
    }
    rbindlist(lDT[1:(iter - 1L)])[1:n]
  } else { # sampling without replacement (no duplicates allowed)
    # rbindlist on each iteration to check for duplicates
    dtOut <- data.table(
      row1 = integer(0), col1 = integer(0),
      row2 = integer(0), col2 = integer(0)
    )
    while (nLeft > 0L) {
      if (iter > maxIter) {
        message(sprintf("Max iterations (%i) reached", maxIter))
        return(dtOut)
      }
      nCurr <- ceiling(nLeft*mult)
      mIdx <- matrix(sample(length(m@i), 2L*nCurr, TRUE), ncol = 2)
      dt <- (data.table(
        row1 = m@i[mIdx[,1]],
        col1 = m@j[mIdx[,2]],
        row2 = m@i[mIdx[,2]],
        col2 = m@j[mIdx[,1]]
      ) + 1L)[
        row1 != row2 & col1 != col2 & !(m[matrix(c(row1, col1), nCurr)] + m[matrix(c(row2, col2), nCurr)])
      ]
      if (nrow(dt)) {
        dtOut <- unique(rbindlist(list(dtOut, dt)))
        mult <- 1.1*mult*nLeft/(nrow(dtOut) - n + nLeft)
        nLeft <- nLeft - nrow(dtOut)
      } else {
        mult <- mult*2
      }
    }
    dtOut[1:n]
  }
}

(dtSamples1 <- sampleSubMat(m, 10L))
#>     row1 col1 row2 col2
#>  1:    3    6    2   10
#>  2:    3    8    7    5
#>  3:    9    7    5    1
#>  4:    5    8   10    9
#>  5:    7    7    1    8
#>  6:    3    8    9    2
#>  7:    5    1    3    9
#>  8:    5    1    8    5
#>  9:    4    7    1    1
#> 10:   10    6    8    8
(dtSamples2 <- sampleSubMat(m, 10L, TRUE))
#>     row1 col1 row2 col2
#>  1:    6    7    5    8
#>  2:    2   10    3    4
#>  3:    7    7   10    1
#>  4:    5    1    4    9
#>  5:    1    8    9    7
#>  6:   10    1    8    8
#>  7:    8    8    7    6
#>  8:    7   10    3    9
#>  9:    2   10    3    9
#> 10:    2    1    3    6
# timing 1k samples from a random 10k-square matrix with 1M elements
idx <- sample(1e8, 1e6)
m <- new("ngTMatrix", i = as.integer(((idx - 1) %% 1e4)), j = as.integer(((idx - 1) %/% 1e4)), Dim = c(1e4L, 1e4L))
system.time(dtSamples3 <- sampleSubMat(m, 1e3L)) # without replacement
#>    user  system elapsed 
#>    1.08    0.31    1.40
system.time(dtSamples4 <- sampleSubMat(m, 1e3L, TRUE)) # with replacement
#>    user  system elapsed 
#>    0.89    0.32    1.21
Created on 2022-04-29 by the reprex package (v2.0.1)
jblood94
  • 10,340
  • 1
  • 10
  • 15
  • This is a really nice way to identify contiguous submatrices, composed of consecutive rows and consecutive columns. But, in my problem, I'm also hoping to identify non-contiguous submatrices that meet the criteria. – Zachary Apr 29 '22 at 12:35
  • Ah. Interesting problem. See the updated answer. – jblood94 Apr 30 '22 at 02:12