I am looking for an effective way to calculate the summation (tab) of all vector levels of a vector using data.table.
Problem
The initial /data.table DT dataset consists of four variables, one of which is called experience. A goal is a vector that contains cumulative values of factor levels in the experiment of conditional two other variables: id and cl. It should be noted that factor experience has more factors than is present in the data set (this is a necessary property).
Data look like
id trial experience cl
1: 1 1 000A A
2: 1 2 000A A
3: 1 3 000B B
4: 1 4 111A A
5: 1 5 001B B
6: 2 1 100B B
7: 2 2 111A A
8: 2 3 100B B
9: 2 4 010A A
10: 2 5 011B B
Experience experience levels have a value of 16
levels(DT$experience)
# [1] "000A" "001A" "010A" "011A" "100A" "101A" "110A" "111A"
# [9] "000B" "001B" "010B" "011B" "100B" "101B" "110B" "111B"
, , - , id cl. : id = 1 000A, c000A = 1. 000A, c000A = 2. 000B c000A 2, c000B = 1, 0 .
, , , :
id trial experience cl c000A c001A c010A c011A c100A c101A c110A c111A c000B c001B c010B c011B c100B c101B c110B c111B
1: 1 1 000A A 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2: 1 2 000A A 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3: 1 3 000B B 2 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
4: 1 4 111A A 2 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
5: 1 5 001B B 2 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0
6: 2 1 100B B 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
7: 2 2 111A A 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0
8: 2 3 100B B 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0
9: 2 4 010A A 0 0 1 0 0 0 0 1 0 0 0 0 2 0 0 0
10: 2 5 011B B 0 0 1 0 0 0 0 1 0 0 0 1 2 0 0 0
. , 16 c000A,..., c111B . , 16 , c000A, c001A,..., c110B, c111B, .
. .
foo <- function(DT){
DT[, paste(sep="","s",levels(DT$experience)) := as.list(table(experience)), by = c("id","cl","trial")]
DT[, "c000A" := cumsum(s000A), by = id]
DT[, "c001A" := cumsum(s001A), by = id]
DT[, "c010A" := cumsum(s010A), by = id]
DT[, "c011A" := cumsum(s011A), by = id]
DT[, "c100A" := cumsum(s100A), by = id]
DT[, "c101A" := cumsum(s101A), by = id]
DT[, "c110A" := cumsum(s110A), by = id]
DT[, "c111A" := cumsum(s111A), by = id]
DT[, "c000B" := cumsum(s000B), by = id]
DT[, "c001B" := cumsum(s001B), by = id]
DT[, "c010B" := cumsum(s010B), by = id]
DT[, "c011B" := cumsum(s011B), by = id]
DT[, "c100B" := cumsum(s100B), by = id]
DT[, "c101B" := cumsum(s101B), by = id]
DT[, "c110B" := cumsum(s110B), by = id]
DT[, "c111B" := cumsum(s111B), by = id]
}
n = 1e + 4 2 :
system.time(foo(DT))
# User System verstrichen
# 9.78 0.00 10.05
library("data.table")
library("R.utils")
n <- 1e+4
DT <- data.table(id = rep(1:2,each=n), trial = rep(1:n,2), experience = c("000A","000A","000B","111A","001B","100B","111A","100B","010A","011B"), cl = c("A","A","B","A","B","B","A","B","A","B"))
DT$experience <- factor(DT$experience, levels = paste(sep="", intToBin(0:7), rep(c("A","B"),each=8)))
setkey(DT,id,trial,cl)
?
!
Jana
: :
library("microbenchmark")
benchmk <- microbenchmark(
DT2 <- foo2(DT),
DT3a <- foo3a(DT),
DT3b <- foo3b(DT),
times=100L
)
print(benchmk)
foo2 Eddi
foo2 <- function(DT){
DT[, counter := 1:.N]
DT[, dummy := 1]
RE <- dcast.data.table(DT, counter+id ~ experience, value.var = 'dummy', fill = 0)[,lapply(.SD, cumsum), by = id, .SDcols = c(-1,-2)]
RE[, setdiff(levels(DT$experience), unique(DT$experience)) := 0]
setcolorder(RE, c("id",levels(DT$experience)))
}
foo3a ,
foo3a <- function(DT){
ex = levels(DT$experience)
DT[, c(ex) := 0L]
tmp = DT[, list(list(.I)), by=experience]
tmp[, experience := as.character(experience)] ## convert to char
for(i in seq(nrow(tmp))) {
set(DT, i=tmp$V1[[i]], j=tmp$experience[i], val=1L)
}
DT[, c(ex) := lapply(.SD, cumsum), by=id, .SDcols=ex]
}
foo3b Arun
foo3b <- function(DT){
ex = levels(DT$experience)
DT[, c(ex) := 0L]
tmp = DT[, list(list(.I)), by=experience]
tmp[, experience := as.character(experience)]
for(i in seq(nrow(tmp))) {
set(DT, i=tmp$V1[[i]], j=tmp$experience[i], val=1L)
}
ex = as.character(unique(DT$experience))
DT[, c(ex) := lapply(.SD, cumsum), by=id, .SDcols=ex]
}