Expanding window (cumulative calculation) in data.table: how to improve performance

I have data collected at different time steps. Each time step has several value registrations. Each value can occur one or more times, both within and between time steps.

Some toy data:

# Toy data: two groups of eight registrations each. Both groups use the same
# time-step layout (three rows at step 1, two at step 2, three at step 3),
# written out explicitly instead of relying on data.frame() recycling.
df <- data.frame(
  grp = rep(1:2, each = 8),
  time = rep(rep(c(1, 2, 3), times = c(3, 2, 3)), times = 2),
  val = c(1, 2, 1, 2, 3, 2, 3, 4,
          1, 2, 3, 1, 1, 1, 2, 3)
)

df
#    grp time val
# 1    1    1   1
# 2    1    1   2
# 3    1    1   1
# 4    1    2   2
# 5    1    2   3
# 6    1    3   2
# 7    1    3   3
# 8    1    3   4
# 9    2    1   1
# 10   2    1   2
# 11   2    1   3
# 12   2    2   1
# 13   2    2   1
# 14   2    3   1
# 15   2    3   2
# 16   2    3   3

Goals

I want to perform some calculations within an expanding time window, that is, within time step 1, within time steps 1 and 2 together, within time steps 1, 2 and 3 together, and so on. In each window, I want to calculate the number of unique values, the number of values that occurred more than once, and the proportion of values that occurred more than once.

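For example, in group (grp) 1, in the second time window (time = 1 and 2 together), three unique values have occurred (val 1, 2, 3) (n_val = 3), two of which (1, 2) occur more than once (n_re = 2), giving a "re_rate" of 0.67 (see the output below).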

data.table . , base, , , , data.table. data.table , . (), .

, , , , data.table ( , data.table ). , - , data.table-esque. , , , , @Khashaa . , ?


My data.table attempt:

library(data.table)

f_dt <- function(df){
  # Expanding-window summary per group (data.table version).
  #
  # For every group in `grp` and every distinct time step, all rows of that
  # group up to and including the time step are pooled and summarised.
  #
  # Args:
  #   df: data.frame or data.table with columns `grp`, `time` and `val`.
  #
  # Returns:
  #   A data.table with one row per (grp, time) and the columns
  #     n_val   - number of distinct `val` in the window
  #     n_re    - number of distinct `val` registered more than once
  #     re_rate - n_re / n_val, rounded to two decimals

  # Work on a keyed copy. Calling setDT() on the argument itself would convert
  # and re-sort the caller's object by reference, so every later use of `df`
  # (including repeated benchmark runs) would see an already keyed data.table.
  dt <- copy(df)
  setDT(dt, key = c("grp", "time", "val"))

  dt[ , {
    # unique time steps of this group (ascending, because of the key)
    times <- unique(time)

    # pre-allocate one result row per time step
    d2 <- data.table(time = times,
                     n_val = integer(length(times)),
                     n_re = integer(length(times)))

    # loop to generate the expanding window
    for(i in seq_along(times)){

      # number of registrations per val within the first i time steps
      n <- .SD[time %in% times[seq_len(i)], .(n = .N), by = val][ , n]

      # number of unique val
      set(x = d2, i = i, j = "n_val", value = length(n))

      # number of val registered more than once
      set(x = d2, i = i, j = "n_re", value = sum(n > 1))
    }

    # proportion of values registered more than once
    d2[ , re_rate := round(n_re / n_val, 2)]
    d2
  }, by = grp]
}

...which gives the desired result:

# Run the data.table implementation on the toy data defined above
f_dt(df)

#    grp time n_val n_re re_rate
# 1:   1    1     2    1    0.50
# 2:   1    2     3    2    0.67
# 3:   1    3     4    3    0.75
# 4:   2    1     3    0    0.00
# 5:   2    2     3    1    0.33
# 6:   2    3     3    3    1.00

The base R alternative:

f_by <- function(df){
  # Expanding-window summary per group, base R only.
  #
  # Args:
  #   df: data.frame with columns `grp`, `time` and `val`.
  #
  # Returns:
  #   A data.frame with one row per group and distinct time step (time steps
  #   in order of first appearance within the group) and the columns
  #   grp, time, n_val (distinct values in the window), n_re (values seen
  #   more than once in the window) and re_rate (n_re / n_val, 2 decimals).

  # Summarise a single group's rows
  summarise_group <- function(d){
    steps <- unique(d$time)

    # 2 x length(steps) integer matrix, one column per window:
    # row 1 = number of distinct values, row 2 = values seen more than once
    counts <- vapply(seq_along(steps), function(k){
      in_window <- d$time %in% steps[seq_len(k)]
      freq <- table(d$val[in_window])
      c(length(freq), sum(freq > 1))
    }, integer(2))

    data.frame(grp = d$grp[1],
               time = steps,
               n_val = counts[1, ],
               n_re = counts[2, ],
               re_rate = round(counts[2, ] / counts[1, ], 2))
  }

  # by() splits on grp; rbind stacks the per-group results
  do.call(rbind, by(data = df, df$grp, summarise_group))
}

:

Benchmark on the toy data:

library(microbenchmark)
# Time both implementations on the current `df`, 10 evaluations each.
# unit = "relative" reports timings relative to one another rather than in
# absolute time units (see the "Unit: relative" header in the output).
microbenchmark(f_by(df),
               f_dt(df),
               times = 10,
               unit = "relative")

# Unit: relative
#     expr      min       lq     mean   median       uq      max neval
# f_by(df) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10
# f_dt(df) 1.481724 1.450203 1.474037 1.452887 1.521378 1.502686    10

On a larger data set:

# Larger test case: 100,000 rows in which grp, time and val are each sampled
# with replacement from 1:100. The seed is fixed so the data are reproducible.
# Rows are in random order, i.e. not sorted by group or time.
set.seed(123)
df <- data.frame(grp = sample(1:100, 100000, replace = TRUE),
                 time = sample(1:100, 100000, replace = TRUE),
                 val = sample(1:100, 100000, replace = TRUE))

# Time both implementations on the large data, 10 evaluations each,
# reported as relative timings
microbenchmark(f_by(df),
               f_dt(df),
               times = 10,
               unit = "relative")

# Unit: relative
#     expr      min       lq     mean   median       uq      max neval
# f_by(df) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10
# f_dt(df) 1.094424 1.099642 1.107821 1.096997 1.097693 1.194983    10

, , , data.table . ... , , . .

+4
2
f <- function(df){
  # Expanding-window summary per group in a single vectorised pass.
  #
  # Args:
  #   df: data.frame or data.table with columns `grp`, `time` and `val`.
  #
  # Returns:
  #   A data.table with one row per (grp, time), ordered by grp and time:
  #     n_val   - number of distinct `val` seen in the group up to that time
  #     n_re    - number of those values seen more than once
  #     re_rate - n_re / n_val, rounded to two decimals

  # Work on a copy so that neither the conversion to data.table nor the
  # helper columns created below leak into the caller's object.
  dt <- copy(df)
  setDT(dt)

  # The cumulative sums run in row order, so rows must be in time order within
  # each group; otherwise the last row of a (grp, time) cell would not reflect
  # "all rows up to and including this time step".
  setorder(dt, grp, time)

  # running number of distinct values within the group
  dt[, n_val := cumsum(!duplicated(val)), by = grp]

  # running occurrence counter of each value within the group
  dt[, occ := seq_len(.N), by = .(grp, val)]

  # running number of values seen exactly once: a value joins this set at its
  # first occurrence (occ == 1) and leaves it at its second (occ == 2)
  dt[, occ1 := cumsum(occ == 1L) - cumsum(occ == 2L), by = grp]

  # values seen more than once = all distinct values - values seen exactly once
  dt[, n_re := n_val - occ1]

  # the last row of each (grp, time) cell holds the state of the whole window
  dt[, .(n_val = n_val[.N],
         n_re = n_re[.N],
         re_rate = round(n_re[.N] / n_val[.N], 2)),
     by = .(grp, time)]
}

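  • cumsum(!duplicated(val)) (computed by grp) gives n_val, the running number of unique values.
  • occ is the running occurrence counter of each value (computed by grp and val).
  • occ1 is the running number of values val that have occurred exactly once. A value starts being counted when it first occurs, i.e. add 1 when occ==1, and stops being counted when it occurs a second time, i.e. subtract 1 when occ==2; hence cumsum(occ == 1) - cumsum(occ == 2).
  • Therefore, the number of values that occurred more than once is n_val-occ1.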

# Same large test case as in the benchmark: 100,000 rows, with grp, time and
# val each sampled with replacement from 1:100 under a fixed seed
set.seed(123)
df <- data.frame(grp = sample(1:100, 100000, replace = TRUE),
                 time = sample(1:100, 100000, replace = TRUE),
                 val = sample(1:100, 100000, replace = TRUE))


# Single-run wall-clock timings of the three implementations on this data;
# the commented lines are the output recorded by the author
system.time(f(df))
# user  system elapsed 
# 0.038   0.000   0.038 

system.time(f_dt(df))
# user  system elapsed 
# 16.617   0.013  16.727

system.time(f_by(df))
# user  system elapsed 
# 16.077   0.040  16.122 

, .

+7

.

, , , ( ). , between.

#expanding group by where groups are duplicated
library(data.table)
setDT(df)
df[, {
  # distinct time steps of this group; each one closes an expanding window
  # that always starts at the first time step
  steps <- unique(time)
  first_step <- steps[1L]

  # 2 x length(steps) integer matrix, one column per window:
  # row 1 = number of distinct values, row 2 = values occurring more than once
  counts <- vapply(steps, function(last_step) {
    freq <- table(val[between(time, first_step, last_step)])
    c(length(freq), sum(freq > 1))
  }, integer(2L), USE.NAMES = FALSE)

  distinct <- counts[1L, ]
  repeated <- counts[2L, ]

  # output the time step alongside the window statistics
  list(time = steps,
       n_val = distinct,
       n_re = repeated,
       re_rate = repeated / distinct)
}, by = grp]

Result:

#    grp time n_val n_re   re_rate
# 1:   1    1     2    1 0.5000000
# 2:   1    2     3    2 0.6666667
# 3:   1    3     4    3 0.7500000
# 4:   2    1     3    0 0.0000000
# 5:   2    2     3    1 0.3333333
# 6:   2    3     3    3 1.0000000

, data.table between, , , , between .

0

Source: https://habr.com/ru/post/1618900/


All Articles