How to group a data table across multiple columns sequentially

Question

How to group a data table across multiple columns sequentially

I want to compute a set of descriptive statistics grouped by several hundred grouping variables. I know from "How to group data.table by multiple columns?" that I can use `list()` in the `by` argument when I need a statistic for each combination of the grouping variables. In my case, however, I want the mean for each level of Y, and then, separately, the mean for each level of Z.

    # Reproducible toy data: X is the measured value, Y and Z are the
    # grouping columns.
    set.seed(007)
    DF <- data.frame(
      X = 1:50000,
      Y = sample(c(0, 1), 50000, TRUE),
      Z = sample(0:5, 50000, TRUE)
    )

    library(data.table)
    DT <- data.table(DF)

    # Attempt 1: list() in `by` -- gives the mean for each combination of
    # Y and Z, which is not what I want.
    DT[, mean(X), by = list(Y, Z)]

    # Attempt 2: a character vector in `by` -- same per-combination result.
    DT[, mean(X), by = c("Y", "Z")]

    # Attempt 3: aggregate by one grouping column at a time, then stack the
    # per-column results. This works ...
    out <- lapply(
      c("Y", "Z"),
      FUN = function(K) {
        DT[, mean(X), by = get(K)]
      }
    )
    out <- do.call(rbind, out)
    # ... but it is really slow.

I have 100 million records and 400+ grouping variables, so I need something reasonably efficient. The lapply approach adds up to several days of additional processing time.

# Show more significant digits when the timing difference is printed.
options(digits = 15)

# Timing 1: the lapply() approach, one grouped mean per grouping column.
start.time <- Sys.time()
out <- lapply(
  c("Y", "Z"),
  FUN = function(K) {
    DT[, mean(X), by = get(K)]
  }
)
end.time <- Sys.time()
time.taken <- end.time - start.time

# Timing 2: the same two aggregations written out as separate calls.
start.time <- Sys.time()
DT[, mean(X), by = "Y"]
DT[, mean(X), by = "Z"]
end.time <- Sys.time()
time.taken2 <- end.time - start.time

# Elapsed-time difference: lapply() approach minus the direct calls.
time.taken - time.taken2

+4

r data.table

MatthewR Jan 31 '18 at 16:49

source share

1 answer

Uwe · Accepted Answer · 2018-01-31T17:28:07+0000

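With development version 1.10.5, data.table has gained Grouping Set aggregation functions, which calculate aggregates at various levels of grouping and produce multiple (sub-)totals. Here, `groupingsets()` can be used with one grouping set per grouping variable.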

library(data.table)
# data.table 1.10.5 IN DEVELOPMENT built 2018-01-31 02:23:45 UTC

# Every column except the measured value X is a grouping variable.
grp_vars <- setdiff(names(DF), "X")

# Convert DF to a data.table by reference, then aggregate once per grouping
# variable: each element of `sets` is a grouping set holding a single column.
setDT(DF)
groupingsets(DF, mean(X), by = grp_vars, sets = as.list(grp_vars))

    Y  Z       V1
1:  1 NA 24960.98
2:  0 NA 25039.96
3: NA  5 24652.44
4: NA  0 25006.61
5: NA  2 25223.83
6: NA  3 24959.26
7: NA  1 25095.58
8: NA  4 25068.84

Benchmark

# Benchmark data: n_rows rows, n_vars value columns (X1, X2, ...) and
# n_grps grouping columns (Z1, Z2, ...).
n_rows <- 1e6L
n_vars <- 5
n_grps <- 1e2L
set.seed(007)
DT <- data.table(rn = seq_len(n_rows))

# Value columns: X<i> is standard-normal noise scaled by i.
for (i in seq_len(n_vars)) {
  set(DT, j = paste0("X", i), value = i * rnorm(n_rows))
}

# Grouping columns: Z<i> is sampled with replacement from 0..i.
for (i in seq_len(n_grps)) {
  set(DT, j = paste0("Z", i), value = sample(0:i, n_rows, TRUE))
}

# Column names of the grouping variables and of the variables to average.
grps <- names(DT)[startsWith(names(DT), "Z")]
vars <- names(DT)[startsWith(names(DT), "X")]

# Compare the three approaches, three repetitions each.
bm <- microbenchmark::microbenchmark(
  # One groupingsets() call with a single-column grouping set per variable.
  gs = groupingsets(
    DT, lapply(.SD, mean),
    by = grps, sets = as.list(grps), .SDcols = vars
  ),
  # Loop over the grouping columns by name; rbindlist() fills the columns
  # that are absent from each piece.
  lapply1 = rbindlist(
    lapply(grps, function(K) DT[, lapply(.SD, mean), by = K, .SDcols = vars]),
    fill = TRUE
  ),
  # Loop using by = get(K), as in the question, and stack with rbind().
  lapply2 = {
    out <- lapply(grps, function(K) {
      DT[, lapply(.SD, mean), by = get(K), .SDcols = vars]
    })
    do.call(rbind, out)
  },
  times = 3L
)
print(bm)

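Even with 1 million rows and 100 grouping variables, there is no notable difference in run time (`groupingsets()` is marginally slower than the two `lapply()` variants):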

Unit: seconds
    expr      min       lq     mean   median       uq      max neval
      gs 3.602689 3.606646 3.608343 3.610603 3.611169 3.611735     3
 lapply1 3.524957 3.546060 3.561130 3.567163 3.579217 3.591270     3
 lapply2 3.562424 3.569284 3.577199 3.576144 3.584586 3.593027     3

How to group a data table across multiple columns sequentially

Benchmark

More articles: