I am trying to create a matrix that gives the number of occurrences of each element (here: each letter) at each position, computed over a large vector of strings.
I have the following toy example and a potential solution:
# Toy data: 10 random strings of 11 upper-case letters each.
set.seed(42)
seqs <- vapply(
  seq_len(10),
  FUN = function(x) {
    paste(sample(LETTERS, size = 11, replace = TRUE), collapse = "")
  },
  FUN.VALUE = character(1)
)
# One logical indicator matrix per string: row i is position i in the string,
# column j is LETTERS[j].
test <- lapply(seqs, FUN = function(s) {
  # Split once per string instead of once per letter.
  chars <- strsplit(s, split = "")[[1]]
  do.call(cbind, lapply(LETTERS, FUN = function(ch) {
    # fixed = TRUE: literal comparison, the pattern is not a regex.
    grepl(ch, chars, fixed = TRUE)
  }))
})
# Summing the indicator matrices gives, for every position (row) and letter
# (column), the number of strings having that letter at that position.
testR <- Reduce("+", test)
seqs
# [1] "XYHVQNTDRSL" "SYGMYZDMOXD" "ZYCNKXLVTVK" "RAVAFXPJLAZ" "LYXQZQIJKUB" "TREGNRZTOWE" "HVSGBDFMFSA" "JNAPEJQUOGC" "CHRAFYYTINT"
#[10] "QQFFKYZTTNA"
testR
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23]
[1,] 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 1 1 0 0 0
[2,] 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0
[3,] 1 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0
[4,] 2 0 0 0 0 1 2 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0
[5,] 0 1 0 0 1 2 0 0 0 0 2 0 0 1 0 0 1 0 0 0 0 0 0
[6,] 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0
[7,] 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0
[8,] 0 0 0 1 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 3 1 1 0
[9,] 0 0 0 0 0 1 0 0 1 0 1 1 0 0 3 0 0 1 0 2 0 0 0
[10,] 1 0 0 0 0 0 1 0 0 0 0 0 0 2 0 0 0 0 2 0 1 1 1
[11,] 2 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0
[,24] [,25] [,26]
[1,] 1 0 1
[2,] 0 4 0
[3,] 1 0 0
[4,] 0 0 0
[5,] 0 1 1
[6,] 2 2 1
[7,] 0 1 2
[8,] 0 0 0
[9,] 0 0 0
[10,] 1 0 0
[11,] 0 0 1
I try to force myself not to use loops, but instead use vectorized functions, but I'm not sure if my solution is really a good (efficient) solution or if I got confused somewhere. It is also quite difficult to debug if real-life data is somehow messed up (which, unfortunately, is the case).
So my question is, what is a good way to solve this problem?
EDIT: Below is a benchmark of my original solution against the solutions suggested by 989, docendo, and akrun.
library(microbenchmark)
# Benchmark data: 10,000 random strings of 31 upper-case letters each.
# vapply() guarantees a character vector; TRUE cannot be reassigned like T.
set.seed(42)
seqs <- vapply(
  seq_len(10000),
  FUN = function(x) {
    paste(sample(LETTERS, size = 31, replace = TRUE), collapse = "")
  },
  FUN.VALUE = character(1)
)
# Baseline implementation: per-position letter counts via indicator matrices.
#
# Reads the character vector `seqs` from the enclosing environment. All
# strings must have the same number of characters, otherwise the matrices
# cannot be added.
#
# Returns a matrix with one row per character position and one column per
# element of LETTERS; entry [i, j] is the number of strings in `seqs` whose
# i-th character is LETTERS[j]. (With a single string, Reduce() returns the
# logical indicator matrix itself.)
f.posdef <- function() {
  per_seq <- lapply(seqs, FUN = function(s) {
    # Split once per string instead of once per letter.
    chars <- strsplit(s, split = "")[[1]]
    do.call(cbind, lapply(LETTERS, FUN = function(ch) {
      # fixed = TRUE: literal comparison, the pattern is not a regex.
      grepl(ch, chars, fixed = TRUE)
    }))
  })
  Reduce("+", per_seq)
}
# Per-position letter counts via matrix indexing.
#
# Reads the character vector `seqs` from the enclosing environment. For each
# string a zero matrix (positions x 26) is built and the cell
# (position, index of the character in LETTERS) is set to 1; the matrices are
# then summed. All strings must have the same number of characters.
#
# Returns a numeric matrix with one row per position and 26 columns.
f.989 <- function() {
  per_seq <- lapply(seqs, function(x) {
    n <- nchar(x)
    chars <- strsplit(x, "")[[1]]
    m <- matrix(0, n, 26)
    # seq_len(n), unlike seq(n), is empty for n == 0, so an empty string
    # leaves the zero-row matrix untouched instead of indexing c(1, 0).
    replace(m, cbind(seq_len(n), match(chars, LETTERS)), 1)
  })
  Reduce("+", per_seq)
}
# Per-position letter counts via one contingency table per string.
#
# Reads the character vector `seqs` from the enclosing environment. Each
# string is split into characters and cross-tabulated (letter x position);
# the tables are summed and transposed so that rows are positions and
# columns are LETTERS. All strings must have the same number of characters.
#
# The positions are derived from each string (seq_along) rather than being
# hard-coded, so the function works for any common string length.
f.docendo1 <- function() {
  per_seq <- lapply(strsplit(seqs, "", fixed = TRUE), function(xx) {
    # levels = LETTERS keeps a row for every letter, even if absent in xx.
    table(factor(xx, levels = LETTERS), seq_along(xx))
  })
  t(Reduce("+", per_seq))
}
# Per-position letter counts via a single contingency table.
#
# Reads the character vector `seqs` from the enclosing environment. The split
# strings are bound column-wise into a character matrix (rows = positions,
# columns = strings), which is cross-tabulated against its own row indices
# and transposed so that rows are positions and columns are the letters that
# occur in the data. All strings must have the same number of characters.
#
# The position vector is derived from the matrix instead of being hard-coded
# to 31 positions x 10000 strings.
f.docendo2 <- function() {
  chars <- do.call(cbind, strsplit(seqs, "", fixed = TRUE))
  # as.vector(row(chars)) enumerates positions in the same column-major order
  # in which table() reads `chars`. dnn keeps the dimnames unnamed, as they
  # are when the arguments are passed as unnamed expressions.
  t(table(chars, as.vector(row(chars)), dnn = c("", "")))
}
# Per-position letter counts via a pipeline ending in table().
#
# Reads the character vector `seqs` from the enclosing environment and
# returns the cross-tabulation produced by the final table() call.
#
# NOTE(review): `%>%`, `transpose`, `map` and `select` are not base R and no
# library() call for them is visible in this part of the file; presumably
# they come from magrittr/purrr/dplyr -- confirm and attach before running.
f.akrun=function(){
# Split every string into a vector of single characters.
strsplit(seqs, "") %>%
# Presumably regroups the per-string lists into per-position lists -- verify
# which package's transpose() is in scope.
transpose %>%
# Flatten each element to a plain vector.
map(unlist) %>%
# Name the elements 1..nchar(seqs[1]); only the first string's length is used.
setNames(seq_len(nchar(seqs[1]))) %>%
# Turn the named list into a long two-column data frame.
stack %>%
# Swap the order of the two columns before tabulating.
select(2:1) %>%
table
}
# Reference result from the baseline implementation; the other candidates
# are compared against it.
r <- f.posdef()
, 989 .
> all(r==f.989())
[1] TRUE
> all(r==f.docendo1())
[1] TRUE
> all(r==f.docendo2())
[1] TRUE
> all(r==f.akrun())
[1] FALSE
> res <- microbenchmark(f.posdef(), f.989(), f.docendo1(), f.docendo2(), f.akrun())
> autoplot(res)

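Note that akrun's solution does not pass the equality check against the reference result (`all(r == f.akrun())` is `FALSE`). Also note that the solutions by docendo and 989 (e.g. `m <- matrix(0, nchar(x), 26)`)
assume that all elements of the input (i.e. `seqs`) have the same `nchar`.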