I have data collected at different time steps. Each time step has several value registrations. Each value can occur one or more times within and between time steps.
Some toy data:
# Toy data: two groups (`grp`) of eight registrations each. Both groups share
# the same time pattern (three registrations at time 1, two at time 2, three
# at time 3), which is spelled out for all 16 rows instead of being recycled.
time_pattern <- rep(c(1, 2, 3), times = c(3, 2, 3))
df <- data.frame(
  grp = rep(1:2, each = 8),
  time = rep(time_pattern, times = 2),
  val = c(
    1, 2, 1, 2, 3, 2, 3, 4, # group 1
    1, 2, 3, 1, 1, 1, 2, 3  # group 2
  )
)
df
Goals
I want to do some calculations in an expanding time window: first within time step 1 alone, then within time steps 1 and 2 together, then within time steps 1, 2 and 3 together, and so on. In each window, I want to calculate the number of unique values, the number of values that occurred more than once, and the proportion of values that occurred more than once.
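For example, in group (grp) 1, in the second window (time = 1 and 2 together), there are three unique values (val 1, 2, 3) (n_val = 3), two of which occur more than once (1, 2) (n_re = 2), giving a "re_rate" of 0.67 (i.e. 2 / 3).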
data.table . , base, , , , data.table. data.table , . (), .
, , , , data.table ( , data.table ). , - , data.table-esque. , , , , @Khashaa . , ?
My data.table attempt:
library(data.table)
#' Expanding-window value statistics per group (data.table implementation).
#'
#' For every group in `grp`, walks through the time steps in ascending order.
#' For the window made of all time steps up to and including the current one,
#' it counts the distinct values and the values that occur more than once.
#'
#' @param df A data.frame (or data.table) with columns `grp`, `time` and
#'   `val`. It is copied first, so the caller's object is neither converted
#'   nor re-ordered.
#' @return A data.table with one row per group and time step and the columns
#'   `grp`, `time`, `n_val` (number of distinct values in the window), `n_re`
#'   (number of values occurring more than once in the window) and `re_rate`
#'   (`n_re / n_val`, rounded to two decimals).
f_dt <- function(df) {
  # Work on a keyed copy: setDT() on the argument would turn the caller's
  # data.frame into a sorted data.table by reference as a hidden side effect.
  dt <- as.data.table(df)
  setkeyv(dt, c("grp", "time", "val"))

  dt[, {
    # `time` is part of the key, so the unique time steps come out ascending.
    times <- unique(time)
    d2 <- data.table(
      time = times,
      n_val = integer(length(times)),
      n_re = integer(length(times)),
      re_rate = numeric(length(times))
    )
    for (i in seq_along(times)) {
      # Occurrence count of every value inside the window times[1..i].
      n <- .SD[time %in% times[seq_len(i)], .N, by = val]$N
      set(d2, i = i, j = "n_val", value = length(n))
      set(d2, i = i, j = "n_re", value = sum(n > 1))
    }
    d2[, re_rate := round(n_re / n_val, 2)]
    d2
  }, by = grp]
}
...which, applied to the toy data, gives:
# Apply the data.table implementation to the toy data `df`.
f_dt(df)
My base attempt:
#' Expanding-window value statistics per group (base R implementation).
#'
#' For every group in `grp`, walks through the time steps in ascending order.
#' For the window made of all time steps up to and including the current one,
#' it counts the distinct values and the values that occur more than once.
#'
#' @param df A data.frame with columns `grp`, `time` and `val`. Rows may be in
#'   any order.
#' @return A data.frame with one row per group and time step and the columns
#'   `grp`, `time`, `n_val` (number of distinct values in the window), `n_re`
#'   (number of values occurring more than once in the window) and `re_rate`
#'   (`n_re / n_val`, rounded to two decimals).
f_by <- function(df) {
  # Row indices per group; empty factor levels are dropped so that every
  # group handled below has at least one row.
  rows_by_grp <- split(seq_len(nrow(df)), df$grp, drop = TRUE)

  per_group <- lapply(rows_by_grp, function(rows) {
    time <- df$time[rows]
    val <- df$val[rows]

    # Sort the time steps: unique() alone follows row order, which would make
    # the window grow in order of appearance instead of chronologically.
    times <- sort(unique(time))

    # Collect the counts in preallocated vectors and build the data.frame
    # once, rather than overwriting one data.frame row per iteration.
    n_val <- integer(length(times))
    n_re <- integer(length(times))
    for (i in seq_along(times)) {
      counts <- table(val[time %in% times[seq_len(i)]])
      n_val[i] <- length(counts)
      n_re[i] <- sum(counts > 1)
    }

    data.frame(
      grp = df$grp[rows[1]],
      time = times,
      n_val = n_val,
      n_re = n_re,
      re_rate = round(n_re / n_val, 2)
    )
  })

  do.call(rbind, per_group)
}
:
:
library(microbenchmark)
# Time both implementations on the current `df`, 10 runs each, with timings
# reported relative to each other.
microbenchmark(
  f_by(df),
  f_dt(df),
  times = 10,
  unit = "relative"
)
On a larger data set:
# Larger benchmark data: 100000 rows whose group, time step and value are
# each drawn independently, with replacement, from 1..100. The seed is fixed
# so the draw is reproducible.
set.seed(123)
df <- data.frame(
  grp = sample.int(100L, 100000, replace = TRUE),
  time = sample.int(100L, 100000, replace = TRUE),
  val = sample.int(100L, 100000, replace = TRUE)
)
# Time both implementations on the current `df`, 10 runs each, with timings
# reported relative to each other.
microbenchmark(
  f_by(df),
  f_dt(df),
  times = 10,
  unit = "relative"
)
, , , data.table . ... , , . .