Rolling sums for groups with uneven time intervals

This is a follow-up to my previous question. Here are my data:

set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
                 date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05',
                                      '2016-01-07', '2016-01-10', '2016-01-14',
                                      '2016-01-16'), 2)),
                 value = round(rnorm(14, 15, 5), 1))

user_id       date value
     27 2016-01-01  15.0
     27 2016-01-03  22.4
     27 2016-01-05  13.3
     27 2016-01-07  21.9
     27 2016-01-10  20.6
     27 2016-01-14  18.6
     27 2016-01-16  16.4
     11 2016-01-01   6.8
     11 2016-01-03  21.3
     11 2016-01-05  19.8
     11 2016-01-07  22.0
     11 2016-01-10  19.4
     11 2016-01-14  17.5
     11 2016-01-16  19.3

This time I would like to calculate the rolling sum of value for each user_id over a specified time window, e.g. the last 7 or 14 days. The desired output would be as follows:

user_id       date value v_minus7 v_minus14
     27 2016-01-01  15.0     15.0      15.0
     27 2016-01-03  22.4     37.4      37.4
     27 2016-01-05  13.3     50.7      50.7
     27 2016-01-07  21.9     72.6      72.6
     27 2016-01-10  20.6     78.2      93.2
     27 2016-01-14  18.6     61.1     111.8
     27 2016-01-16  16.4     55.6     113.2
     11 2016-01-01   6.8      6.8       6.8
     11 2016-01-03  21.3     28.1      28.1
     11 2016-01-05  19.8     47.9      47.9
     11 2016-01-07  22.0     69.9      69.9
     11 2016-01-10  19.4     82.5      89.3
     11 2016-01-14  17.5     58.9     106.8
     11 2016-01-16  19.3     56.2     119.3

Ideally, I would like to use dplyr for this, but other packages will be fine.
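For a compact modern take on the same computation (an editor's sketch, not from the original thread; it assumes the slider package is available), slider::slide_index_sum() sums over an irregular Date index directly:

library(dplyr)
library(slider)

DF2 %>%
  group_by(user_id) %>%
  mutate(v_minus7  = slide_index_sum(value, date, before = 7),   # window [date - 7, date]
         v_minus14 = slide_index_sum(value, date, before = 14)) %>%
  ungroup()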

+7
6 answers

Logic: group first by user_id, then by date. For each subset of the data, we check which dates fall between the current date and 7/14 days before it, using between(), which returns a logical vector.

Based on this logical vector, we sum up the value column:

library(data.table)

setDT(DF2)[, `:=`(
  v_minus7  = sum(DF2$value[DF2$user_id == user_id][
                between(DF2$date[DF2$user_id == user_id], date - 7, date, incbounds = TRUE)]),
  v_minus14 = sum(DF2$value[DF2$user_id == user_id][
                between(DF2$date[DF2$user_id == user_id], date - 14, date, incbounds = TRUE)])
), by = c("user_id", "date")][]

#    user_id       date value v_minus7 v_minus14
# 1:      27 2016-01-01  15.0     15.0      15.0
# 2:      27 2016-01-03  22.4     37.4      37.4
# 3:      27 2016-01-05  13.3     50.7      50.7
# 4:      27 2016-01-07  21.9     72.6      72.6
# 5:      27 2016-01-10  20.6     78.2      93.2
# 6:      27 2016-01-14  18.6     61.1     111.8
# 7:      27 2016-01-16  16.4     55.6     113.2
# 8:      11 2016-01-01   6.8      6.8       6.8
# 9:      11 2016-01-03  21.3     28.1      28.1
#10:      11 2016-01-05  19.8     47.9      47.9
#11:      11 2016-01-07  22.0     69.9      69.9
#12:      11 2016-01-10  19.4     82.5      89.3
#13:      11 2016-01-14  17.5     58.9     106.8
#14:      11 2016-01-16  19.3     56.2     119.3
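The same windowed sum can also be written as a data.table non-equi self-join, which avoids the repeated DF2$user_id == user_id scans inside j. This is an editor's sketch on the same data, not part of the original answer:

library(data.table)

setDT(DF2)
win <- DF2[, .(user_id, d_lo = date - 7, d_hi = date)]    # one 7-day window per row
DF2[win, on = .(user_id, date >= d_lo, date <= d_hi),     # rows falling in each window
    .(v_minus7 = sum(value)), by = .EACHI]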

# adapted from alexis_laz's answer below
ff = function(date, value, minus) {
  cs = cumsum(value)
  i = findInterval(date - minus, date, left.open = TRUE)
  w = which(as.logical(i))
  i[w] = cs[i[w]]
  cs - i
}

setDT(DF2)
DF2[, `:=`(v_minus7  = ff(date, value, 7),
           v_minus14 = ff(date, value, 14)), by = c("user_id")]
+6

You can use rollapply from zoo after first filling in the missing dates:

library(dplyr)
library(zoo)

set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
                 date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05',
                                      '2016-01-07', '2016-01-10', '2016-01-14',
                                      '2016-01-16'), 2)),
                 value = round(rnorm(14, 15, 5), 1))

all_combinations <- expand.grid(user_id = unique(DF2$user_id),
                                date = seq(min(DF2$date), max(DF2$date), by = "day"))

res <- DF2 %>%
  merge(all_combinations, by = c('user_id', 'date'), all = TRUE) %>%
  group_by(user_id) %>%
  arrange(date) %>%
  mutate(v_minus7  = rollapply(value, width = 8,  FUN = function(x) sum(x, na.rm = TRUE),
                               partial = TRUE, align = 'right'),
         v_minus14 = rollapply(value, width = 15, FUN = function(x) sum(x, na.rm = TRUE),
                               partial = TRUE, align = 'right')) %>%
  filter(!is.na(value))
+4

Here are some approaches using zoo.

1) Define a sum_last function which, given a zoo object, takes the sum of the values whose times fall within k days of the last day in the series, and define a roll function that applies it to an entire series. Then use ave to apply roll to each user_id, once for k = 7 and once for k = 14.

Note that this uses the coredata argument of rollapply, which was introduced in the most recent version of zoo, so make sure you don't have an earlier version.

library(zoo)

# compute sum of values within k time units of the last time point
sum_last <- function(z, k) {
  tt <- time(z)
  sum(z[tt > tail(tt, 1) - k])
}

# given indexes ix, run rollapplyr on read.zoo(DF2[ix, -1])
roll <- function(ix, k) {
  rollapplyr(read.zoo(DF2[ix, -1]), k, sum_last, coredata = FALSE, partial = TRUE, k = k)
}

nr <- nrow(DF2)
transform(DF2,
          v_minus7  = ave(1:nr, user_id, FUN = function(x) roll(x, 7)),
          v_minus14 = ave(1:nr, user_id, FUN = function(x) roll(x, 14)))

2) An alternative would be to replace roll with the version shown below. This converts DF2[ix, -1] to "zoo", merges it with a zero-width zoo series on a daily grid, and fills the gaps with zeros. rollapply is then applied to the result, and window subsets it back to the original times.

roll <- function(ix, k) {
  z <- read.zoo(DF2[ix, -1])
  g <- zoo(, seq(start(z), end(z), "day"))
  m <- merge(z, g, fill = 0)
  r <- rollapplyr(m, k, sum, partial = TRUE)
  window(r, time(z))
}
+3

Here is another idea, using findInterval to minimize comparisons and operations. First, define a function that handles the main part, ignoring the grouping. The following function computes the cumulative sum and subtracts from it, at each position, the cumulative sum at the last date that falls outside the window:

ff = function(date, value, minus) {
  cs = cumsum(value)
  i = findInterval(date - minus, date, left.open = TRUE)
  w = which(as.logical(i))
  i[w] = cs[i[w]]
  cs - i
}
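To see the bookkeeping concretely, here is a small trace on user 27's rows (added for this writeup; values taken from the data above). findInterval(date - minus, date, left.open = TRUE) returns, for each row, the index of the last date outside the window, so subtracting that position's cumulative sum leaves exactly the in-window total:

date  <- as.Date(c("2016-01-01", "2016-01-03", "2016-01-05", "2016-01-07",
                   "2016-01-10", "2016-01-14", "2016-01-16"))
value <- c(15.0, 22.4, 13.3, 21.9, 20.6, 18.6, 16.4)

cumsum(value)
# 15.0  37.4  50.7  72.6  93.2 111.8 128.2
findInterval(date - 7, date, left.open = TRUE)
# 0 0 0 0 1 3 4    <- last index outside each row's 7-day window
ff(date, value, 7)
# 15.0 37.4 50.7 72.6 78.2 61.1 55.6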

And apply it by group:

do.call(rbind, lapply(split(DF2, DF2$user_id), function(x)
  data.frame(x, minus7 = ff(x$date, x$value, 7), minus14 = ff(x$date, x$value, 14))))

#      user_id       date value minus7 minus14
#11.8       11 2016-01-01   6.8    6.8     6.8
#11.9       11 2016-01-03  21.3   28.1    28.1
#11.10      11 2016-01-05  19.8   47.9    47.9
#11.11      11 2016-01-07  22.0   69.9    69.9
#11.12      11 2016-01-10  19.4   82.5    89.3
#11.13      11 2016-01-14  17.5   58.9   106.8
#11.14      11 2016-01-16  19.3   56.2   119.3
#27.1       27 2016-01-01  15.0   15.0    15.0
#27.2       27 2016-01-03  22.4   37.4    37.4
#27.3       27 2016-01-05  13.3   50.7    50.7
#27.4       27 2016-01-07  21.9   72.6    72.6
#27.5       27 2016-01-10  20.6   78.2    93.2
#27.6       27 2016-01-14  18.6   61.1   111.8
#27.7       27 2016-01-16  16.4   55.6   113.2

The split/lapply step above can, of course, be replaced by any preferred grouping method, as sketched below.
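For instance, the same ff slots straight into a grouped mutate (an editor's sketch assuming dplyr; ff needs dates sorted within each group, hence the arrange):

library(dplyr)

DF2 %>%
  arrange(user_id, date) %>%
  group_by(user_id) %>%
  mutate(v_minus7  = ff(date, value, 7),
         v_minus14 = ff(date, value, 14)) %>%
  ungroup()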

+3

Here is a newer option using dplyr and tbrf:

library(tbrf)
library(dplyr)

set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
                 date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05',
                                      '2016-01-07', '2016-01-10', '2016-01-14',
                                      '2016-01-16'), 2)),
                 value = round(rnorm(14, 15, 5), 1))

DF2 %>%
  group_by(user_id) %>%
  tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
  arrange(user_id, date) %>%
  rename(v_minus7 = sum) %>%
  tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
  rename(v_minus14 = sum)

This creates a tibble:

# A tibble: 14 x 5
#    user_id date       value v_minus7 v_minus14
#      <dbl> <date>     <dbl>    <dbl>     <dbl>
#  1      11 2016-01-01   6.8      6.8      21.8
#  2      27 2016-01-01  15       15        21.8
#  3      11 2016-01-03  21.3     28.1      65.5
#  4      27 2016-01-03  22.4     37.4      65.5
#  5      11 2016-01-05  19.8     47.9      98.6
#  6      27 2016-01-05  13.3     50.7      98.6
#  7      11 2016-01-07  22       69.9     142.
#  8      27 2016-01-07  21.9     72.6     142.
#  9      11 2016-01-10  19.4     82.5     182.
# 10      27 2016-01-10  20.6     78.2     182.
# 11      11 2016-01-14  17.5     58.9     219.
# 12      27 2016-01-14  18.6     61.1     219.
# 13      11 2016-01-16  19.3     56.2     232.
# 14      27 2016-01-16  16.4     55.6     232.
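Note that the v_minus14 column above pools both users (e.g. 21.8 = 6.8 + 15.0 on 2016-01-01), which suggests the first tbr_sum returns an ungrouped tibble, so the second call runs over all rows. Regrouping before the second call should restore per-user sums; an untested sketch of that fix:

DF2 %>%
  group_by(user_id) %>%
  tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
  rename(v_minus7 = sum) %>%
  group_by(user_id) %>%    # regroup: tbr_sum appears to drop the grouping
  tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
  rename(v_minus14 = sum) %>%
  arrange(user_id, date)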

I suspect this is not the fastest solution with large datasets, but it works well in dplyr chains.

+1

Try the runner package if you want to calculate over time/date windows. See the GitHub documentation, in particular the "Windows depending on date" section.

library(runner)
library(dplyr)

DF2 %>%
  group_by(user_id) %>%
  mutate(v_minus7  = sum_run(value, 7, idx = date),
         v_minus14 = sum_run(value, 14, idx = date))

A quick benchmark:

library(data.table)
library(dplyr)
library(zoo)
library(tbrf)
library(runner)

set.seed(3737)
DF2 = data.frame(user_id = c(rep(27, 7), rep(11, 7)),
                 date = as.Date(rep(c('2016-01-01', '2016-01-03', '2016-01-05',
                                      '2016-01-07', '2016-01-10', '2016-01-14',
                                      '2016-01-16'), 2)),
                 value = round(rnorm(14, 15, 5), 1))

# example 1
data_table <- function(DF2) {
  setDT(DF2)[, `:=`(
    v_minus7  = sum(DF2$value[DF2$user_id == user_id][
                  data.table::between(DF2$date[DF2$user_id == user_id], date - 7, date, incbounds = TRUE)]),
    v_minus14 = sum(DF2$value[DF2$user_id == user_id][
                  data.table::between(DF2$date[DF2$user_id == user_id], date - 14, date, incbounds = TRUE)])
  ), by = c("user_id", "date")][]
}

# example 2
dplyr_grid <- function(DF2) {
  all_combinations <- expand.grid(user_id = unique(DF2$user_id),
                                  date = seq(min(DF2$date), max(DF2$date), by = "day"))
  DF2 %>%
    merge(all_combinations, by = c('user_id', 'date'), all = TRUE) %>%
    group_by(user_id) %>%
    arrange(date) %>%
    mutate(v_minus7  = rollapply(value, width = 8,  FUN = function(x) sum(x, na.rm = TRUE),
                                 partial = TRUE, align = 'right'),
           v_minus14 = rollapply(value, width = 15, FUN = function(x) sum(x, na.rm = TRUE),
                                 partial = TRUE, align = 'right')) %>%
    filter(!is.na(value))
}

# example 3
dplyr_tbrf <- function(DF2) {
  DF2 %>%
    group_by(user_id) %>%
    tbrf::tbr_sum(value, date, unit = "days", n = 7) %>%
    arrange(user_id, date) %>%
    rename(v_minus7 = sum) %>%
    tbrf::tbr_sum(value, date, unit = "days", n = 14) %>%
    rename(v_minus14 = sum)
}

# example 4
runner <- function(DF2) {
  DF2 %>%
    group_by(user_id) %>%
    mutate(v_minus7  = sum_run(value, 7, idx = date),
           v_minus14 = sum_run(value, 14, idx = date))
}

microbenchmark::microbenchmark(
  runner     = runner(DF2),
  data.table = data_table(DF2),
  dplyr      = dplyr_grid(DF2),
  dplyr_tbrf = dplyr_tbrf(DF2),
  times = 100L
)
# Unit: milliseconds
#       expr       min        lq      mean    median        uq        max neval
#     runner  1.478331  1.797512  2.350416  2.083680  2.559875   9.181675   100
# data.table  5.432618  5.970619  7.107540  6.424862  7.563405  13.674661   100
#      dplyr 63.841710 73.652023 86.228112 79.861760 92.304231 256.841078   100
# dplyr_tbrf 60.582381 72.511075 90.175891 80.435700 92.865997 307.454643   100
+1

Source: https://habr.com/ru/post/1262991/
