Effective character vector splitting

I have this character vector style:

# Example input: every element is a ";"-separated list of "key value" pairs.
vec <- c(
  "id a; sex m; age 16; type 1;",
  "id a; sex m; age 16;",
  "id a; sex m; age 16; type 3"
)

Each element of `vec` is a ";"-separated list of attributes, where each attribute has the form "key value" (the character ";" can only appear as a separator).

So the attributes of the first element are: id = a, sex = m, age = 16, type = 1.

Please note that different elements of `vec` may have slightly different sets of attributes.

I am looking for an efficient way to split `vec` into a list of lists. Each element of the outer list is a list of all attribute values, in which the element names are the attribute keys. This means that the length of the outer list will be the length of `vec`, and the length of each inner list will be equal to the number of attributes of the corresponding element.

I currently have this implementation that helps me understand the result I need:

# Split every element of `vec` into its "key value" attribute strings.
# strsplit() is vectorised and always returns a list; sapply() would silently
# collapse the result to a matrix whenever all elements happen to have the same
# number of attributes, which breaks the lapply() below. The names (the original
# strings) are kept so the result is labelled exactly as before.
attributes.list <- setNames(
  strsplit(vec, split = "(\\;)(\\s+)?", perl = TRUE),
  vec
)

# Turn each character vector of "key value" strings into a named list:
# names are the keys, elements are the (character) values.
attributes.lol <- lapply(attributes.list, function(x) {
  parts <- strsplit(x, split = " ", fixed = TRUE)
  # vapply() guarantees a character vector even for empty input; a key with no
  # value yields NA instead of an error.
  keys <- vapply(parts, function(p) p[1], character(1))
  values <- vapply(parts, function(p) p[2], character(1))
  setNames(as.list(values), keys)
})

> attributes.lol[[1]]
$id
[1] "a"

$sex
[1] "m"

$age
[1] "16"

$type
[1] "1"

`vec` is actually very long (about a million elements), so I was wondering if there is a more efficient way to achieve this.

+4
source share
4 answers

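This can be done in base R without any packages: convert the strings to DCF format and read them with read.dcf. This gives a character matrix m, whose columns are then converted with type.convert. (If a character matrix is sufficient, the last line can be omitted.)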

# Build DCF input: one "key: value" line per attribute and a blank line after
# every record. Any trailing ";" is stripped first and ";;" appended, so that
# strsplit() emits the blank separator line for every element; otherwise an
# element without a trailing ";" would be merged with the record that follows.
dcf_lines <- sub(
  " ", ": ",
  trimws(unlist(strsplit(paste0(sub(";\\s*$", "", vec), ";;"), ";")))
)

# Keep a handle on the text connection so it can be closed explicitly instead
# of being left open.
dcf_con <- textConnection(dcf_lines)
m <- read.dcf(dcf_con)
close(dcf_con)

# Convert the character matrix to a data frame with per-column types.
# `as.is = TRUE` keeps strings as character and avoids the missing-`as.is`
# warning of type.convert().
as.data.frame(lapply(as.data.frame(m, stringsAsFactors = FALSE),
                     type.convert, as.is = TRUE))

giving:

  id sex age type
1  a   m  16    1
2  a   m  16   NA
3  a   m  16    3
+3

With the "iotools" and "data.table" packages, you can do something like this:

library(iotools)
library(data.table)
# Long form, one row per attribute:
#   1. mstrsplit() splits every element of `vec` on ";" (presumably into a
#      character matrix padded with NA -- confirm against the iotools docs),
#      and trimws() removes the surrounding whitespace of each piece.
#   2. melt() stacks the pieces under the element index `ind`; na.rm = TRUE
#      drops the missing pieces.
#   3. tstrsplit() splits each piece on " " (fixed = TRUE) into `key` and `val`.
#   4. The helper columns `variable` and `value` are removed; the trailing []
#      makes the result print.
melt(data.table(ind = seq_along(vec), trimws(mstrsplit(vec, ";"))),
     "ind", na.rm = TRUE)[
      , c("key", "val") := tstrsplit(value, " ", TRUE)][
        , c("variable", "value") := NULL][]

Or, if you want the "wide" form (for example, like @GGrothendieck's output):

# Wide form: build the long key/val table exactly as above, then dcast() it to
# one row per element index (`ind`) and one column per key, filled from `val`.
dcast(
  melt(data.table(ind = seq_along(vec), trimws(mstrsplit(vec, ";"))),
       "ind", na.rm = TRUE)[
         , c("key", "val") := tstrsplit(value, " ", TRUE)][
           , c("variable", "value") := NULL][], ind ~ key, value.var = "val")

, , . :

Sample data with 3, about 100,000 and about 1 million elements:

# Base sample: three records, with and without a trailing ";" and "type" field.
vec <- c(
  "id a; sex m; age 16; type 1;",
  "id a; sex m; age 16;",
  "id a; sex m; age 16; type 3"
)

# Benchmark inputs: repeat the base sample a whole number of times, rounded up,
# so the results hold at least 100,000 and 1,000,000 elements respectively.
v100k <- rep(vec, times = ceiling(1e5 / length(vec)))
v1M <- rep(vec, times = ceiling(1e6 / length(vec)))

, :

library(iotools)
library(data.table)

# Parse "key value; key value; ..." strings into a long data.table.
#
# invec: character vector of ";"-separated "key value" pairs.
# Returns a data.table with columns `ind` (index of the element in `invec`),
# `key` and `val`, one row per attribute.
funAM_l <- function(invec) {
  # Split on ";" and strip the whitespace around every piece.
  pieces <- trimws(mstrsplit(invec, ";"))

  # One row per piece, tagged with the originating element; NA pieces dropped.
  long <- melt(
    data.table(ind = seq_along(invec), pieces),
    "ind",
    na.rm = TRUE
  )

  # Split each "key value" piece on a literal space, then drop the helpers.
  long[, c("key", "val") := tstrsplit(value, " ", TRUE)]
  long[, c("variable", "value") := NULL]
  long[]
}

# Wide variant of funAM_l(): one row per input element (`ind`) and one column
# per attribute key, filled with the corresponding `val`.
funAM_w <- function(invec) {
  long_form <- funAM_l(invec)
  dcast(long_form, ind ~ key, value.var = "val")
}

# Parse "key value; key value; ..." strings into a list of named lists.
#
# v: character vector of ";"-separated "key value" pairs.
# Returns a list with one element per element of `v`; each element is a list
# whose names are the attribute keys and whose entries are the character values.
funMT <- function(v) {
  # Split on ";" followed by optional whitespace.
  pairs <- strsplit(v, split = "(\\;)(\\s+)?", perl = TRUE)
  lapply(pairs, function(s) {
    # Flatten "key value" strings into key, value, key, value, ...
    tokens <- unlist(strsplit(s, " ", fixed = TRUE))
    # Odd positions are keys, even positions are values. TRUE/FALSE are used
    # instead of T/F, which are ordinary variables and can be reassigned.
    setNames(as.list(tokens[c(FALSE, TRUE)]), tokens[c(TRUE, FALSE)])
  })
}

# Parse "key value; key value; ..." strings with fread(), one call per element.
#
# invec: character vector of ";"-separated "key value" pairs.
# Returns a data.table with an `.id` column (index of the element in `invec`)
# plus `key` and `val` columns, one row per attribute.
funF <- function(invec) {
  parse_one <- function(x) {
    # header = FALSE: without it fread() auto-detects the first "key value"
    # line as column names and that attribute is lost from the data.
    # colClasses keeps every value as character so the per-element tables
    # always have matching column types.
    fread(
      gsub(";", "\n", x),
      header = FALSE,
      col.names = c("key", "val"),
      colClasses = "character"
    )
  }
  rbindlist(lapply(invec, parse_one), idcol = TRUE)
}

# Parse "key value; key value; ..." strings via the DCF reader.
#
# invec: character vector of ";"-separated "key value" pairs.
# Returns the character matrix produced by read.dcf(): one row per element of
# `invec`, one column per attribute key, NA where an element lacks a key.
funGG <- function(invec) {
  # Strip any trailing ";" and append ";;" so that splitting on ";" yields an
  # empty string (a blank DCF line) after every record. Without this, an
  # element lacking a trailing ";" is merged with the element that follows it.
  records <- paste0(sub(";\\s*$", "", invec), ";;")
  dcf_lines <- sub(" ", ": ", trimws(unlist(strsplit(records, ";"))))

  # Close the text connection on exit instead of leaving it open.
  con <- textConnection(dcf_lines)
  on.exit(close(con), add = TRUE)
  read.dcf(con)
}

:

library(microbenchmark)
# Time every implementation on the 3-element sample `vec`
# (100 evaluations each, see the `neval` column; times are in microseconds).
microbenchmark(funAM_l(vec), funAM_w(vec), funF(vec), funGG(vec), funMT(vec))
# Unit: microseconds
#          expr      min        lq       mean    median        uq      max neval
#  funAM_l(vec) 1474.163 1525.3765 1614.28414 1573.6325 1601.3815 2828.481   100
#  funAM_w(vec) 3293.376 3482.9510 3741.30381 3553.7240 3714.1730 6787.863   100
#     funF(vec)  690.761  729.4900  830.61645  756.4610  777.6725 4083.904   100
#    funGG(vec)  182.281  209.8405  220.46376  220.8055  232.1820  280.788   100
#    funMT(vec)   57.288   76.5225   84.81496   83.2755   90.3120  166.352   100

, , :

# Single-run timings of every implementation on the larger input `v100k`.
system.time(funAM_l(v100k))
#    user  system elapsed 
#    0.24    0.00    0.24 
system.time(funAM_w(v100k))
#    user  system elapsed 
#   0.296   0.000   0.296 
system.time(funMT(v100k))
#    user  system elapsed 
#   1.768   0.000   1.768 
system.time(funF(v100k))
#    user  system elapsed 
#  21.960   0.136  22.068 
system.time(funGG(v100k))
#    user  system elapsed 
#  30.968   0.004  30.940 

With 1 million elements:

# Single-run timing of the wide-form implementation on the largest input `v1M`.
system.time(funAM_w(v1M))
#    user  system elapsed 
#   4.316   0.092   4.402 

, cSplit splitstackshape. , @Marat.

With 1 million elements:

library(splitstackshape)
# Timing of the splitstackshape variant on `v1M`: the inner cSplit() splits on
# ";" in "long" direction, the outer cSplit() splits each piece on " " into the
# columns v1M_1 / v1M_2, and dcast() reshapes to one row per `ind`.
system.time(dcast(
  cSplit(cSplit(data.table(ind = seq_along(v1M), v1M), "v1M", ";", "long"), "v1M", " "), 
  ind ~ v1M_1, value.var = "v1M_2"))
#    user  system elapsed 
#  13.744   0.156  13.882
+4

, @alexis_laz:

:

# Base sample of three attribute strings.
vec <- c(
  "id a; sex m; age 16; type 1;",
  "id a; sex m; age 16;",
  "id a; sex m; age 16; type 3"
)

# Test input: the base sample repeated 100,000 times (300,000 elements).
v <- rep(vec, times = 1e5)

:

# Split every element on ";" followed by optional whitespace, giving one
# character vector of "key value" strings per element.
z <- strsplit(v, split = "(\\;)(\\s+)?", perl = TRUE)

# Turn each vector of "key value" strings into a named list (names = keys).
# TRUE/FALSE replace T/F, which are ordinary variables and can be reassigned;
# the inner variable no longer shadows the global `v`.
out <- lapply(z, function(s) {
  tokens <- unlist(strsplit(s, " ", fixed = TRUE))
  setNames(as.list(tokens[c(FALSE, TRUE)]), tokens[c(TRUE, FALSE)])
})
+2

You can replace ";" with a newline and read each element with fread:

require(data.table)
# Read every element of `vec` as a small table: ";" is replaced by a newline so
# that each "key value" pair becomes one input line for fread().
# NOTE(review): fread() is called with header auto-detection, and the printed
# result shows columns named "id" and "a" -- the first pair of each element
# appears to be consumed as the header. Passing header = FALSE would likely
# keep it as data; confirm before relying on this output.
l <- lapply(vec, function(x){
  fread(gsub(";", "\n", x))
})

,

# Stack the per-element tables; idcol = TRUE adds the `.id` column holding the
# position of each table in `l`.
rbindlist(l, idcol = TRUE)

:

   .id   id  a
1:   1  sex  m
2:   1  age 16
3:   1 type  1
4:   2  sex  m
5:   2  age 16
6:   3  sex  m
7:   3  age 16
8:   3 type  3
+1

Source: https://habr.com/ru/post/1624161/


All Articles