Avoid using row combinations when indexing a matrix

I have two inputs in the following formats:

# Protein accession -> character vector of the Pfam domains it contains.
domains <- list(
  O60925 = "PF01920",
  P01130 = c("PF07645", "PF00057", "PF00058"),
  Q14764 = c("PF11978", "PF01505"),
  Q9BX68 = "PF01230",
  P46777 = "PF14204"
)

# 8 x 8 matrix of 0/1 flags for known Pfam-Pfam domain interactions.
# Values are listed one column per line (column-major order); rows and columns
# carry the same Pfam identifiers, in the same order.
interactions <- matrix(
  c(
    1, 0, 0, 0, 0, 0, 1, 0,
    0, 1, 0, 1, 0, 0, 0, 0,
    0, 0, 1, 1, 0, 0, 0, 0,
    0, 1, 1, 1, 0, 0, 0, 0,
    0, 0, 0, 0, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 1, 0, 0,
    1, 0, 0, 0, 0, 0, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0
  ),
  nrow = 8L,
  ncol = 8L,
  dimnames = rep(
    list(c(
      "PF01920", "PF07645", "PF00057", "PF00058",
      "PF11978", "PF01505", "PF01230", "PF14204"
    )),
    2L
  )
)

        PF01920 PF07645 PF00057 PF00058 PF11978 PF01505 PF01230 PF14204
PF01920       1       0       0       0       0       0       1       0
PF07645       0       1       0       1       0       0       0       0
PF00057       0       0       1       1       0       0       0       0
PF00058       0       1       1       1       0       0       0       0
PF11978       0       0       0       0       1       0       0       0
PF01505       0       0       0       0       0       1       0       0
PF01230       1       0       0       0       0       0       1       0
PF14204       0       0       0       0       0       0       0       0

I would like to calculate the following output, where the integer in each cell is the sum of all cells of the matrix `interactions` that correspond to a pair of names in the list `domains`.

       O60925 P01130 Q14764 Q9BX68 P46777
O60925      1      0      0      1      0
P01130      0      7      0      0      0
Q14764      0      0      2      0      0
Q9BX68      1      0      0      1      0
P46777      0      0      0      0      0

The context is that I have a list of proteins (the names of the list `domains`) and their Pfam domains (the entries of the list `domains`), together with a matrix of known Pfam domain-domain interactions (the matrix `interactions`). I would like to summarize the total number of known domain interactions for each pair of proteins.

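My current approach takes every pair of proteins in `domains`, looks up the corresponding cells of `interactions`, and sums them using `apply`: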

# Pairwise protein interaction counts, computed one protein pair at a time.
# Reads the `domains` list and the `interactions` matrix defined earlier.

# Protein identifiers; they label both the rows and the columns of the result.
proteins <- names(domains)
result <- matrix(0,
  nrow = length(proteins), ncol = length(proteins),
  dimnames = list(proteins, proteins)
)

# Every ordered pair of proteins. The two columns are named explicitly so the
# call does not rely on automatic naming of two identical arguments.
combinations <- tidyr::crossing(protein1 = proteins, protein2 = proteins)

# For each protein pair, sum the interaction flags over all pairs of their
# domains. `crossing` is namespace-qualified here as well, so the snippet runs
# without tidyr being attached.
n_interactions <- apply(combinations, 1, function(row) {
  domains1 <- domains[[row[1]]]
  domains2 <- domains[[row[2]]]
  sum(interactions[as.matrix(tidyr::crossing(domains1, domains2))])
})

# A two-column character matrix indexes `result` by (row name, column name).
result[as.matrix(combinations)] <- n_interactions

Is there a faster way to do this that avoids calling `apply` over every combination of rows?

+4
2

One option is to express the whole computation as matrix algebra:

#' Build one 0/1 indicator column.
#'
#' @param m Total length of the column.
#' @param l Number of leading zeros.
#' @param n Number of consecutive ones that follow the leading zeros.
#' @return A numeric vector of length `m`: `l` zeros, then `n` ones, then
#'   zeros for the remaining `m - n - l` positions.
columnBuilder <- function(m, l, n) {
  trailing <- m - n - l
  rep(c(0, 1, 0), times = c(l, n, trailing))
}


#' Build the group-membership (indicator) matrix for a list of groups.
#'
#' The elements of `domainList` are laid out one after another (the order of
#' `unlist(domainList)`); row `i` of the result corresponds to the `i`-th
#' element of that layout and column `j` to the `j`-th group. A cell is 1 when
#' the element belongs to the group and 0 otherwise.
#'
#' @param domainList A list whose entries are vectors; only their lengths are
#'   used.
#' @return A numeric matrix with `sum(lengths(domainList))` rows and
#'   `length(domainList)` columns, without dimnames.
matrixBuilder <- function(domainList) {
  # Use the argument, not a global variable, so the function works for any
  # list that is passed in.
  groupSizes <- lengths(domainList)
  m <- sum(groupSizes)

  # Group index of every row, e.g. sizes c(1, 3, 2) -> 1 2 2 2 3 3.
  membership <- rep.int(seq_along(groupSizes), groupSizes)

  # Fill a preallocated matrix so the result is always a matrix, even when
  # there is a single row or a single group.
  indicator <- matrix(0, nrow = m, ncol = length(groupSizes))
  indicator[cbind(seq_len(m), membership)] <- 1
  indicator
}

#' Sum domain-level interactions into protein-level counts via matrix algebra.
#'
#' With `M` the membership matrix returned by `matrixBuilder(domainL)`, the
#' result is `t(M) %*% interactionsM %*% M`: cell (i, j) is the sum of the
#' cells of `interactionsM` whose row belongs to group i and whose column
#' belongs to group j.
#'
#' @param interactionsM Numeric matrix of domain-by-domain interactions.
#' @param domainL Named list mapping each protein to a vector of domain ids.
#' @return A square numeric matrix with one row and one column per entry of
#'   `domainL`, labelled with `names(domainL)`.
magicFunction <- function(interactionsM, domainL) {
  domainIds <- as.character(unlist(domainL, use.names = FALSE))

  # The membership matrix is laid out in the order of `unlist(domainL)`. When
  # the interaction matrix carries dimnames that contain every domain id,
  # reorder it by name so rows and columns line up with that layout (this also
  # covers a domain that occurs in more than one protein). Otherwise keep the
  # matrix as given and rely on the caller having aligned it by position.
  hasNames <- !is.null(rownames(interactionsM)) &&
    !is.null(colnames(interactionsM))
  if (hasNames && length(domainIds) > 0 &&
    all(domainIds %in% rownames(interactionsM)) &&
    all(domainIds %in% colnames(interactionsM))) {
    interactionsM <- interactionsM[domainIds, domainIds, drop = FALSE]
  }

  magicMatrix <- matrixBuilder(domainL)

  output <- t(magicMatrix) %*% interactionsM %*% magicMatrix
  colnames(output) <- rownames(output) <- names(domainL)
  output
}

magicFunction(interactions, domains)

           O60925 P01130 Q14764 Q9BX68 P46777
O60925      1      0      0      1      0
P01130      0      7      0      0      0
Q14764      0      0      2      0      0
Q9BX68      1      0      0      1      0
P46777      0      0      0      0      0

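The idea is to build an indicator matrix, `magicMatrix`, that records which domain belongs to which protein, and then to multiply the interaction matrix by it on both sides. Benchmarks: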

# Time three ways of computing the protein-by-protein interaction counts.
# Each named expression reads the `domains` list and the `interactions` matrix;
# every expression is evaluated `times = 10` times.
microbenchmark::microbenchmark(
  # Approach from the question: apply() over every pair of proteins.
  OP = {
    proteins = names(domains)
    result = matrix(0, nrow = length(proteins), ncol = length(proteins),
                    dimnames = list(proteins, proteins))
    combinations = tidyr::crossing(proteins, proteins)
    n_interactions = apply(combinations, 1, function(row) {
      domains1 = domains[[row[1]]]
      domains2 = domains[[row[2]]]
      sum(interactions[as.matrix(tidyr::crossing(domains1, domains2))])
    })
    result[as.matrix(combinations)] = n_interactions
  },
  # purrr-based approach: one sum over a sub-matrix per row of the grid.
  privefl = {
    n <- length(domains)
    res <- matrix(nrow = n, ncol = n)
    res[] <- purrr::pmap_dbl(expand.grid(domains, domains),
                             function(Var1,Var2){sum(interactions[Var1, Var2])}) 
    colnames(res) <- rownames(res) <- names(domains)
  },
  # Matrix-algebra approach: a single call to magicFunction().
  matrixAlgebra = {
    magicFunction(interactions, domains)
  },


  times = 10
)



Unit: microseconds
          expr       min        lq       mean    median        uq        max neval
            OP 18996.486 20218.043 33483.5307 21058.912 22152.479 143394.733    10
       privefl   406.579   424.811   467.1096   448.513   475.861    642.503    10
 matrixAlgebra    72.200    95.902   123.1771   111.946   137.471    261.085    10
+2

Another option, using `purrr`:

# One cell per ordered pair of proteins: the sum of the sub-matrix of
# `interactions` selected by the first protein's domains (rows) and the second
# protein's domains (columns).
n <- length(domains)
res <- matrix(nrow = n, ncol = n)
res[] <- purrr::pmap_dbl(
  expand.grid(domains, domains),
  function(Var1, Var2) {
    sum(interactions[Var1, Var2])
  }
)
# Label rows and columns with the protein names.
rownames(res) <- names(domains)
colnames(res) <- names(domains)

, .


Benchmark:

# Time the apply()-based approach from the question against the purrr-based
# one. Both expressions read the `domains` list and the `interactions` matrix;
# each is evaluated `times = 10` times.
microbenchmark::microbenchmark(
  # Approach from the question: apply() over every pair of proteins.
  OP = {
    proteins = names(domains)
    result = matrix(0, nrow = length(proteins), ncol = length(proteins),
                    dimnames = list(proteins, proteins))
    combinations = tidyr::crossing(proteins, proteins)
    n_interactions = apply(combinations, 1, function(row) {
      domains1 = domains[[row[1]]]
      domains2 = domains[[row[2]]]
      # Namespace-qualified like the outer call, so the benchmark runs
      # without tidyr being attached.
      sum(interactions[as.matrix(tidyr::crossing(domains1, domains2))])
    })
    result[as.matrix(combinations)] = n_interactions
  },
  # purrr-based approach: one sum over a sub-matrix per row of the grid.
  privefl = {
    n <- length(domains)
    res <- matrix(nrow = n, ncol = n)
    res[] <- purrr::pmap_dbl(expand.grid(domains, domains),
                             ~ sum(interactions[.x, .y]))
    colnames(res) <- rownames(res) <- names(domains)
  },
  times = 10
)

Result:

Unit: microseconds
    expr        min         lq       mean     median         uq       max neval
      OP 208685.225 209913.891 231506.172 210817.264 213071.475 416724.50    10
 privefl    262.885    281.426   1580.779    306.092    396.975  12842.56    10
+2

Source: https://habr.com/ru/post/1689016/


All Articles