Effective calculation or tabulation of the factor by other factors and restructuring in the data frame?

Question

Effective calculation or tabulation of the factor by other factors and restructuring in the data frame?

I am looking for an effective way to calculate the summation (tab) of all vector levels of a vector using data.table.

Problem

The initial /data.table DT dataset consists of four variables, one of which is called experience. A goal is a vector that contains cumulative values of factor levels in the experiment of conditional two other variables: id and cl. It should be noted that factor experience has more factors than is present in the data set (this is a necessary property).

Data look like

    id trial experience cl
 1:  1     1       000A  A
 2:  1     2       000A  A
 3:  1     3       000B  B
 4:  1     4       111A  A
 5:  1     5       001B  B
 6:  2     1       100B  B
 7:  2     2       111A  A
 8:  2     3       100B  B
 9:  2     4       010A  A
10:  2     5       011B  B

Experience experience levels have a value of 16

levels(DT$experience)
#  [1] "000A" "001A" "010A" "011A" "100A" "101A" "110A" "111A"
#  [9] "000B" "001B" "010B" "011B" "100B" "101B" "110B" "111B"

, , - , id cl. : id = 1 000A, c000A = 1. 000A, c000A = 2. 000B c000A 2, c000B = 1, 0 .

, , , :

    id trial experience cl c000A c001A c010A c011A c100A c101A c110A c111A c000B c001B c010B c011B c100B c101B c110B c111B
 1:  1     1       000A  A     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
 2:  1     2       000A  A     2     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
 3:  1     3       000B  B     2     0     0     0     0     0     0     0     1     0     0     0     0     0     0     0
 4:  1     4       111A  A     2     0     0     0     0     0     0     1     1     0     0     0     0     0     0     0
 5:  1     5       001B  B     2     0     0     0     0     0     0     1     1     1     0     0     0     0     0     0
 6:  2     1       100B  B     0     0     0     0     0     0     0     0     0     0     0     0     1     0     0     0
 7:  2     2       111A  A     0     0     0     0     0     0     0     1     0     0     0     0     1     0     0     0
 8:  2     3       100B  B     0     0     0     0     0     0     0     1     0     0     0     0     2     0     0     0
 9:  2     4       010A  A     0     0     1     0     0     0     0     1     0     0     0     0     2     0     0     0
10:  2     5       011B  B     0     0     1     0     0     0     0     1     0     0     0     1     2     0     0     0

. , 16 c000A,..., c111B . , 16 , c000A, c001A,..., c110B, c111B, .

. .

foo <- function(DT){
   # tabulate experience for each trial
   # store in an auxiliary variables <s000A, s001A, ..., s110B, s111B>
   DT[, paste(sep="","s",levels(DT$experience)) := as.list(table(experience)), by = c("id","cl","trial")]
   # sum each of the s____ variables by id
   DT[, "c000A" := cumsum(s000A), by = id] # this is clumsy
   DT[, "c001A" := cumsum(s001A), by = id]
   DT[, "c010A" := cumsum(s010A), by = id]
   DT[, "c011A" := cumsum(s011A), by = id]
   DT[, "c100A" := cumsum(s100A), by = id]
   DT[, "c101A" := cumsum(s101A), by = id]
   DT[, "c110A" := cumsum(s110A), by = id]
   DT[, "c111A" := cumsum(s111A), by = id]
   DT[, "c000B" := cumsum(s000B), by = id]
   DT[, "c001B" := cumsum(s001B), by = id]
   DT[, "c010B" := cumsum(s010B), by = id]
   DT[, "c011B" := cumsum(s011B), by = id]
   DT[, "c100B" := cumsum(s100B), by = id]
   DT[, "c101B" := cumsum(s101B), by = id]
   DT[, "c110B" := cumsum(s110B), by = id]
   DT[, "c111B" := cumsum(s111B), by = id]
}

n = 1e + 4 2 :

system.time(foo(DT))
# User  System verstrichen 
# 9.78    0.00       10.05

library("data.table")
library("R.utils")
# Sample dataframe DF with n=1e+4
n <- 1e+4 #to test change this to n=5
DT <- data.table(id = rep(1:2,each=n), trial = rep(1:n,2), experience = c("000A","000A","000B","111A","001B","100B","111A","100B","010A","011B"), cl = c("A","A","B","A","B","B","A","B","A","B")) # experience needs to be a factor w more levels
DT$experience <- factor(DT$experience, levels = paste(sep="", intToBin(0:7), rep(c("A","B"),each=8)))
setkey(DT,id,trial,cl) #set the data.table keys

?

! Jana

: :

library("microbenchmark")
benchmk <- microbenchmark(
   DT2  <- foo2(DT),
   DT3a <- foo3a(DT),
   DT3b <- foo3b(DT),
   times=100L
   )
print(benchmk)

# with n=1e+4
#
# unit milliseconds
#              expr      min       lq   median        uq      max neval
# DT2   <- foo2(DT) 46.96745 52.17469 74.72479 120.93339 212.7912   100
# DT3a <- foo3a(DT) 25.21907 26.57921 28.84702  34.89401 121.3164   100
# DT3b <- foo3b(DT) 19.82076 20.80570 22.87369  30.83561 148.0520   100 

# with n=1e+5
#
# unit milliseconds
#              expr       min       lq   median       uq       max neval
#   DT2 <- foo2(DT) 386.93890 445.0184 481.4660 534.9619 1160.6151   100
# DT3a <- foo3a(DT) 144.45937 154.5672 170.6048 233.6362  494.8972   100
# DT3b <- foo3b(DT)  95.91988 100.5313 110.4060 125.1678  364.5651   100

foo2 Eddi

foo2 <- function(DT){
    DT[, counter := 1:.N]
    DT[, dummy := 1]
    RE <- dcast.data.table(DT, counter+id ~ experience, value.var = 'dummy', fill = 0)[,lapply(.SD, cumsum), by = id, .SDcols = c(-1,-2)]
    RE[, setdiff(levels(DT$experience), unique(DT$experience)) := 0]
    setcolorder(RE, c("id",levels(DT$experience)))
}

foo3a ,

foo3a <- function(DT){
   ex = levels(DT$experience)
   DT[, c(ex) := 0L]
   tmp = DT[, list(list(.I)), by=experience]
   tmp[, experience := as.character(experience)] ## convert to char
   for(i in seq(nrow(tmp))) {
      set(DT, i=tmp$V1[[i]], j=tmp$experience[i], val=1L)
   }
   DT[, c(ex) := lapply(.SD, cumsum), by=id, .SDcols=ex]
}

foo3b Arun

foo3b <- function(DT){
   ex = levels(DT$experience)
   DT[, c(ex) := 0L]
   tmp = DT[, list(list(.I)), by=experience]
   tmp[, experience := as.character(experience)] ## convert to char
   for(i in seq(nrow(tmp))) {
      set(DT, i=tmp$V1[[i]], j=tmp$experience[i], val=1L)
   }
   ex = as.character(unique(DT$experience)) ## rewrite 'ex'
   DT[, c(ex) := lapply(.SD, cumsum), by=id, .SDcols=ex]
}

+4

performance r data.table

JBJ 24 . '14 14:33

2

- :

# add some extra variables
DT[, counter := 1:.N]
DT[, dummy := 1]

dcast.data.table(DT, counter+id ~ experience, value.var = 'dummy', fill = 0)[,
  lapply(.SD, cumsum), by = id, .SDcols = c(-1,-2)]
#       id 000A 010A 111A 000B 001B 011B 100B
#    1:  1    1    0    0    0    0    0    0
#    2:  1    2    0    0    0    0    0    0
#    3:  1    2    0    0    1    0    0    0
#    4:  1    2    0    1    1    0    0    0
#    5:  1    2    0    1    1    1    0    0
#   ---                                      
#19996:  2 2000  999 1999 1000 1000  999 1999
#19997:  2 2000  999 2000 1000 1000  999 1999
#19998:  2 2000  999 2000 1000 1000  999 2000
#19999:  2 2000 1000 2000 1000 1000  999 2000
#20000:  2 2000 1000 2000 1000 1000 1000 2000

cbind , .

+2

eddi 24 . '14 15:19

Arun · Accepted Answer · 2014-04-24T16:09:08+0000

?

0L.

ex = levels(DT$experience)
DT[, c(ex) := 0L]

, group by experience , experience , :

tmp = DT[, list(list(.I)), by=experience]
tmp[, experience := as.character(experience)] ## convert to char

set ( V1) ( experience) tmp, 1 DT :

for(i in seq(nrow(tmp))) {
    set(DT, i=tmp$V1[[i]], j=tmp$experience[i], val=1L)
}

, a cumsum id:

DT[, c(ex) := lapply(.SD, cumsum), by=id, .SDcols=ex]

0,013 ( dcast.data.table, , 0,027 ).

, , ex as.character(unique(DT$experience)) .. 0, cumsum. :

ex = as.character(unique(DT$experience)) ## rewrite 'ex'
DT[, c(ex) := lapply(.SD, cumsum), by=id, .SDcols=ex]

Effective calculation or tabulation of the factor by other factors and restructuring in the data frame?

Problem

: :

More articles: