Creating an entry matrix for each char from a large number of rows of equal length

Question

Creating an entry matrix for each char from a large number of rows of equal length

I am trying to create a matrix that gives me the appearance of each element in each position based on the large number of lines in the vector.

I have the following pet example and potential solution:

set.seed(42)
seqs <- sapply(1:10, FUN = function(x) { paste(sample(LETTERS, size = 11, replace = T), collapse = "") })
test <- lapply(seqs, FUN = function(s) {
  do.call(cbind, lapply(LETTERS, FUN = function(ch) {
    grepl(ch, unlist(strsplit(s, split="")))
  }))
})

testR <- Reduce("+", test)
seqs
# [1] "XYHVQNTDRSL" "SYGMYZDMOXD" "ZYCNKXLVTVK" "RAVAFXPJLAZ" "LYXQZQIJKUB" "TREGNRZTOWE" "HVSGBDFMFSA" "JNAPEJQUOGC" "CHRAFYYTINT"
#[10] "QQFFKYZTTNA"
testR
      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23]
 [1,]    0    0    1    0    0    0    0    1    0     1     0     1     0     0     0     0     1     1     1     1     0     0     0
 [2,]    1    0    0    0    0    0    0    1    0     0     0     0     0     1     0     0     1     1     0     0     0     1     0
 [3,]    1    0    1    0    1    1    1    1    0     0     0     0     0     0     0     0     0     1     1     0     0     1     0
 [4,]    2    0    0    0    0    1    2    0    0     0     0     0     1     1     0     1     1     0     0     0     0     1     0
 [5,]    0    1    0    0    1    2    0    0    0     0     2     0     0     1     0     0     1     0     0     0     0     0     0
 [6,]    0    0    0    1    0    0    0    0    0     1     0     0     0     1     0     0     1     1     0     0     0     0     0
 [7,]    0    0    0    1    0    1    0    0    1     0     0     1     0     0     0     1     1     0     0     1     0     0     0
 [8,]    0    0    0    1    0    0    0    0    0     2     0     0     2     0     0     0     0     0     0     3     1     1     0
 [9,]    0    0    0    0    0    1    0    0    1     0     1     1     0     0     3     0     0     1     0     2     0     0     0
[10,]    1    0    0    0    0    0    1    0    0     0     0     0     0     2     0     0     0     0     2     0     1     1     1
[11,]    2    1    1    1    1    0    0    0    0     0     1     1     0     0     0     0     0     0     0     1     0     0     0
      [,24] [,25] [,26]
 [1,]     1     0     1
 [2,]     0     4     0
 [3,]     1     0     0
 [4,]     0     0     0
 [5,]     0     1     1
 [6,]     2     2     1
 [7,]     0     1     2
 [8,]     0     0     0
 [9,]     0     0     0
[10,]     1     0     0
[11,]     0     0     1

I try to force myself not to use loops, but instead use vectorized functions, but I'm not sure if my solution is really a good (efficient) solution or if I got confused somewhere. It is also quite difficult to debug if real-life data is somehow messed up (which, unfortunately, is the case).

So my question is, what is a good way to solve this problem?

EDIT: 989, , .

library(microbenchmark)
set.seed(42)
seqs <- sapply(1:10000, FUN = function(x) { paste(sample(LETTERS, size = 31, replace = T), collapse = "") })

f.posdef=function(){
  test <- lapply(seqs, FUN = function(s) {
    do.call(cbind, lapply(LETTERS, FUN = function(ch) {
      grepl(ch, unlist(strsplit(s, split="")))
    }))
  })
  (testR <- Reduce("+", test))
}

f.989=function() {
  l <- lapply(seqs, function(x) {
    m <- matrix(0, nchar(x), 26)
    replace(m, cbind(seq(nchar(x)),  match(strsplit(x, "")[[1]], LETTERS)), 1)
  })
  Reduce("+",l)
}

f.docendo1=function()
  t(Reduce("+", lapply(strsplit(seqs, "", fixed = TRUE), function(xx) 
    table(factor(xx, levels = LETTERS), 1:31))))

f.docendo2=function()
  t(table(do.call(cbind, strsplit(seqs, "", fixed = TRUE)), rep(1:31, 10000)))

f.akrun=function(){
  strsplit(seqs, "") %>% 
    transpose %>% 
    map(unlist) %>% 
    setNames(seq_len(nchar(seqs[1]))) %>% 
    stack %>%
    select(2:1) %>% 
    table
}

r <- f.posdef()

, 989 .

> all(r==f.989())
[1] TRUE
> all(r==f.docendo1())
[1] TRUE
> all(r==f.docendo2())
[1] TRUE
> all(r==f.akrun())
[1] FALSE
> res <- microbenchmark(f.posdef(), f.989(), f.docendo1(), f.docendo2(), f.akrun())
> autoplot(res)

, akrun , , -, . , . , , , docendo, 989 / m <- matrix(0, nchar(x), 26)

/ (.. seqs) nchar, . , , .

+4

string matrix r

posdef 16 '17 14:46

3

docendo discimus · Answer 1 · 2017-05-16T15:04:52+0000

R, , OP:

t(Reduce("+", lapply(strsplit(seqs, "", fixed = TRUE), function(xx) 
                               table(factor(xx, levels = LETTERS), 1:11))))

#    A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# 1  0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1
# 2  1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 4 0
# 3  1 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0
# 4  2 0 0 0 0 1 2 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 0 0
# 5  0 1 0 0 1 2 0 0 0 0 2 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1
# 6  0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 2 2 1
# 7  0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 2
# 8  0 0 0 1 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 3 1 1 0 0 0 0
# 9  0 0 0 0 0 1 0 0 1 0 1 1 0 0 3 0 0 1 0 2 0 0 0 0 0 0
# 10 1 0 0 0 0 0 1 0 0 0 0 0 0 2 0 0 0 0 2 0 1 1 1 1 0 0
# 11 2 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1

, , :

t(table(do.call(cbind, strsplit(seqs, "", fixed = TRUE)), rep(1:nchar(seqs[1]), length(seqs))))

989 · Answer 2 · 2017-05-16T15:00:11+0000

match R:

l <- lapply(seqs, function(x) {
    m <- matrix(0, nchar(x), 26)
    replace(m, cbind(seq(nchar(x)),  match(strsplit(x, "")[[1]], LETTERS)), 1)
})

all(Reduce("+",l)==testR)
#[1] TRUE

BENCHMARKING ( @akrun, )

library(microbenchmark)
set.seed(42)
seqs <- sapply(1:10, FUN = function(x) { paste(sample(LETTERS, size = 11, replace = T), collapse = "") })

fOP=function(){
test <- lapply(seqs, FUN = function(s) {
   do.call(cbind, lapply(LETTERS, FUN = function(ch) {
    grepl(ch, unlist(strsplit(s, split="")))
   }))
 })

 (testR <- Reduce("+", test))
 }

f989=function() {
    l <- lapply(seqs, function(x) {
    m <- matrix(0, nchar(x), 26)
    replace(m, cbind(seq(nchar(x)),  match(strsplit(x, "")[[1]], LETTERS)), 1)
    })
    Reduce("+",l)
}

fdocendo.discimus=function()
t(Reduce("+", lapply(strsplit(seqs, "", fixed = TRUE), function(xx) 
                               table(factor(xx, levels = LETTERS), 1:11))))

fdocendo.discimus1=function()
t(table(do.call(cbind, strsplit(seqs, "", fixed = TRUE)), rep(1:11, 10)))

r <- fOP()
all(r==f989())
# [1] TRUE
all(r==fdocendo.discimus())
# [1] TRUE
all(r==fdocendo.discimus1())
# [1] TRUE
res <- microbenchmark(fOP(), f989(), fdocendo.discimus(), fdocendo.discimus1())

print(res, order="mean")
# Unit: microseconds
                 # expr      min        lq      mean    median       uq      max neval
               # f989()  135.813  150.8360  205.3294  154.1415  159.700 4968.565   100
 # fdocendo.discimus1()  391.813  405.1845  447.6911  418.2545  445.146 2418.480   100
  # fdocendo.discimus()  943.775  990.9495 1090.9905 1015.5880 1062.311 3996.245   100
                # fOP() 1486.725 1521.4280 1643.1604 1548.9215 1602.104 5782.838   100

akrun · Answer 3 · 2017-05-16T15:14:37+0000

We can also use tableonce

library(tidyverse)
strsplit(seqs, "") %>% 
       transpose %>% 
       map(unlist) %>% 
       setNames(seq_len(nchar(seqs[1]))) %>% 
       stack %>%
       select(2:1) %>% 
       table
#   values
#ind  A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
#  1  0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1
#  2  1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 4 0
#  3  1 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0
#  4  2 0 0 0 0 1 2 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 0 0
#  5  0 1 0 0 1 2 0 0 0 0 2 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1
#  6  0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 2 2 1
#  7  0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 2
#  8  0 0 0 1 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 3 1 1 0 0 0 0
#  9  0 0 0 0 0 1 0 0 1 0 1 1 0 0 3 0 0 1 0 2 0 0 0 0 0 0
#  10 1 0 0 0 0 0 1 0 0 0 0 0 0 2 0 0 0 0 2 0 1 1 1 1 0 0
#  11 2 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1

Or a little more compact, using mtabulatefromqdapTools

library(qdapTools)
strsplit(seqs, "") %>% 
         transpose %>% 
         map(unlist) %>%
         mtabulate
#   A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
#1  0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1
#2  1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 4 0
#3  1 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0
#4  2 0 0 0 0 1 2 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 0 0
#5  0 1 0 0 1 2 0 0 0 0 2 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1
#6  0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 2 2 1
#7  0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 2
#8  0 0 0 1 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 3 1 1 0 0 0 0
#9  0 0 0 0 0 1 0 0 1 0 1 1 0 0 3 0 0 1 0 2 0 0 0 0 0 0
#10 1 0 0 0 0 0 1 0 0 0 0 0 0 2 0 0 0 0 2 0 1 1 1 1 0 0
#11 2 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1

Creating an entry matrix for each char from a large number of rows of equal length

More articles: