In R: efficiently convert one format (character vector) to another format (number matrix)

Using a piece of software, I can compute molecular fingerprints as follows:

>L
[1]    "1  1:1 2:1 3:1 5:1 6:1 8:1"
[2]    "5  1:1 2:1 4:1"
[3]    "9  1:1 2:1 7:1 10:1"

The first values (1, 5, 9) are the molecule names, and the rest of each line is that molecule's fingerprint, which has a fixed length, say 10. In each pair, the number to the left of ":" is the position and the number to the right is the bit, where 1 indicates that the bit is present; bits equal to 0 are omitted. I would like to restore the original full format, that is, for 10 bits every position must have an explicit value.

I would like L to look like this, so that I can save it in CSV format:

mol 1 2 3 4 5 6 7 8 9 10
1   1 1 1 0 1 1 0 1 0 0
5   1 1 0 1 0 0 0 0 0 0
9   1 1 0 0 0 0 1 0 0 1

L has millions of lines, so what is an efficient way to convert it to the desired format?

Thanks.

+4
3

Update

Instead of read.csv, use strsplit together with splitstackshape:::numMat:

# Tokenise every line on whitespace and ":". The first token is the molecule
# name; the remaining tokens alternate position, bit, position, bit, ...
M <- strsplit(L, "\\s+|:")
cbind(
  mol = as.numeric(vapply(M, `[`, FUN.VALUE = character(1), 1)),
  # Hand numMat only the position tokens (odd elements of the tail). Passing
  # the bit tokens as well would make every "1" bit count as position 1.
  # NOTE(review): numMat is an unexported splitstackshape helper; it appears to
  # size its columns from the observed values -- confirm against the package.
  splitstackshape:::numMat(
    lapply(M, function(tok) as.numeric(tok[-1][c(TRUE, FALSE)])),
    fill = 0
  )
)

2:

....

:

# Sample input: one line per molecule, "<name>  <position>:<bit> ...".
L <- c("1  1:1 2:1 3:1 5:1 6:1 8:1",
       "5  1:1 2:1 4:1",
       "9  1:1 2:1 7:1 10:1")
# Benchmark input: the three sample lines repeated 10000 times as a plain
# character vector (replicate() would build a 3 x 10000 character matrix).
M <- rep(L, times = 10000)

@thelatemail's approach:

# Benchmark wrapper: expand the global character vector `M` into a data.frame
# with a `mol` column followed by one 0/1 column per fingerprint position.
# Takes no arguments; returns a data.frame with one row per element of `M`.
fun1 <- function() {
  # The split pattern consumes whitespace and every ":<bit>" suffix, so each
  # element becomes c(<mol>, <position>, <position>, ...).
  spl <- lapply(strsplit(M, "\\s+|:.? |:.$"), as.numeric)
  vals <- lapply(spl, "[", -1)

  # Fingerprint width = largest position seen in any line. This is invariant
  # across rows, so compute it once instead of re-scanning `vals` per row.
  n_bits <- max(unlist(vals))

  data.frame(
    mol = vapply(spl, "[", FUN.VALUE = numeric(1), 1),
    t(vapply(vals, function(x) {
      out <- rep(0, n_bits)
      out[x] <- 1
      out
    }, FUN.VALUE = numeric(n_bits)))
  )
}

system.time(out_late <- fun1())
#    user  system elapsed 
#   98.36    1.28  100.06
head(out_late)
#   mol X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
# 1   1  1  1  1  0  1  1  0  1  0   0
# 2   5  1  1  0  1  0  0  0  0  0   0
# 3   9  1  1  0  0  0  0  1  0  0   1
# 4   1  1  1  1  0  1  1  0  1  0   0
# 5   5  1  1  0  1  0  0  0  0  0   0
# 6   9  1  1  0  0  0  0  1  0  0   1

:

library(splitstackshape)
# Benchmark wrapper: expand the global character vector `M` into a numeric
# matrix with a `mol` column followed by the columns produced by
# splitstackshape:::numMat. Takes no arguments.
fun2 <- function() {
  # First token of each line is the molecule name; the remaining tokens
  # alternate position, bit, position, bit, ...
  tokens <- strsplit(M, "\\s+|:")
  cbind(
    mol = as.numeric(vapply(tokens, `[`, FUN.VALUE = character(1), 1)),
    # Hand numMat only the position tokens (odd elements of the tail). Passing
    # the bit tokens as well would make every "1" bit count as position 1.
    # NOTE(review): numMat is an unexported splitstackshape helper; it appears
    # to size its columns from the observed values -- confirm against the
    # package source.
    splitstackshape:::numMat(
      lapply(tokens, function(tok) as.numeric(tok[-1][c(TRUE, FALSE)])),
      fill = 0
    )
  )
}

system.time(out_ananda <- fun2())
#    user  system elapsed 
#    0.67    0.00    0.68
head(out_ananda)
#      mol 1 2 3 4 5 6 7 8 9 10
# [1,]   1 1 1 1 0 1 1 0 1 0  0
# [2,]   5 1 1 0 1 0 0 0 0 0  0
# [3,]   9 1 1 0 0 0 0 1 0 0  1
# [4,]   1 1 1 1 0 1 1 0 1 0  0
# [5,]   5 1 1 0 1 0 0 0 0 0  0
# [6,]   9 1 1 0 0 0 0 1 0 0  1

@ . , , "val".

# Benchmark wrapper: expand the global character vector `M` into a numeric
# matrix with a `mol` column followed by columns "1".."n_bits".
#
# n_bits: fingerprint length (number of bit columns); defaults to 10.
# Returns a matrix with one row per element of `M`. A position outside
# 1..n_bits makes a row longer than the template, which vapply() reports as
# an error rather than silently returning a ragged list.
fun3 <- function(n_bits = 10) {
  # Shape and names that every parsed line must have.
  template <- numeric(n_bits + 1)
  names(template) <- c("mol", seq_len(n_bits))

  t(vapply(strsplit(M, "\\s+"), function(l) {
    mol <- as.numeric(l[1])
    names(mol) <- "mol"
    val <- numeric(n_bits)
    names(val) <- seq_len(n_bits)
    # Each remaining field is "position:bit"; the position is used as a
    # character name to select the slot in `val`.
    for (x in strsplit(l[-1], ":")) {
      val[x[1]] <- as.numeric(x[2])
    }
    c(mol, val)
  }, FUN.VALUE = template))
}

system.time(out_matthew <- fun3())
#    user  system elapsed 
#    2.33    0.00    2.34
head(out_matthew)
#      mol 1 2 3 4 5 6 7 8 9 10
# [1,]   1 1 1 1 0 1 1 0 1 0  0
# [2,]   5 1 1 0 1 0 0 0 0 0  0
# [3,]   9 1 1 0 0 0 0 1 0 0  1
# [4,]   1 1 1 1 0 1 1 0 1 0  0
# [5,]   5 1 1 0 1 0 0 0 0 0  0
# [6,]   9 1 1 0 0 0 0 1 0 0  1
+2

R, , L , @Ananda.

# The split pattern consumes whitespace and every ":<bit>" suffix, so each
# element of `spl` is c(<mol>, <position>, <position>, ...).
spl <- lapply(strsplit(L, "\\s+|:.? |:.$"), as.numeric)
vals <- lapply(spl, "[", -1)

# Fingerprint width = largest position seen in any line. Computed once here
# rather than once per row inside the loop below.
n_bits <- max(unlist(vals))

data.frame(
  mol = vapply(spl, "[", FUN.VALUE = numeric(1), 1),
  t(vapply(vals, function(x) {
    out <- rep(0, n_bits)
    out[x] <- 1
    out
  }, FUN.VALUE = numeric(n_bits)))
)

#  mol X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
#1   1  1  1  1  0  1  1  0  1  0   0
#2   5  1  1  0  1  0  0  0  0  0   0
#3   9  1  1  0  0  0  0  1  0  0   1
+2

thelatemail, , . , 1, : for. , .

t(sapply(strsplit(L, "\\s+"), function(fields) {
  # Ten zero-initialised slots labelled "1".."10"; the loop below selects a
  # slot by its label rather than by numeric index
  bits <- setNames(numeric(10), 1:10)

  # Every field after the first is a "position:bit" pair
  for (pair in strsplit(fields[-1], ":")) {
    bits[pair[1]] <- as.numeric(pair[2])
  }

  # The leading field is the molecule id; prepend it under the name "mol"
  c(mol = as.numeric(fields[1]), bits)
}))

##      mol 1 2 3 4 5 6 7 8 9 10
## [1,]   1 1 1 1 0 1 1 0 1 0  0
## [2,]   5 1 1 0 1 0 0 0 0 0  0
## [3,]   9 1 1 0 0 0 0 1 0 0  1
+2
source

Source: https://habr.com/ru/post/1538667/


All Articles