Combining datasets by date range and categorical variable

Suppose I have two datasets. One contains a list of promotions with start and end dates, and the other contains monthly sales data for each program.

# Example promotions: one row per promotion, giving the first and last day it
# ran and the program it belongs to.
promotions <- data.frame(
  start.date = as.Date(c(
    "2012-01-01", "2012-06-14", "2012-02-01", "2012-03-31", "2012-07-13"
  )),
  end.date = as.Date(c(
    "2014-04-05", "2014-11-13", "2014-02-25", "2014-08-02", "2014-09-30"
  )),
  program = c("a", "a", "a", "b", "b")
)

# Example monthly sales: one row per (month, program). The month is stored as
# the Date of its first day.
sales <- data.frame(
  year.month.day = as.Date(c(
    "2013-02-01", "2014-09-01", "2013-08-01", "2013-04-01", "2012-11-01"
  )),
  program = c("a", "b", "a", "a", "b"),
  monthly.sales = c(200, 200, 200, 400, 200)
)

Please note that sales$year.month.day is used to indicate year / month only. The day is included just so that R can treat the column as a vector of Date objects; it has nothing to do with the actual sales.

I need to determine the number of promotions that occurred per month for each program. Here is an example of a loop that produces the output I want:

# For every sales row, count the promotions of the same program whose
# [start.date, end.date] range (inclusive on both ends) contains the sales date.
#
# Compared with the nested-loop version this:
#   * also counts programs that have exactly one promotion (the old
#     `nrow(...) > 1` guard skipped them and left their count at 0);
#   * compares dates directly instead of expanding every range into a
#     day-by-day sequence, which was the dominant cost on large inputs;
#   * works when `sales` has zero rows (`1:nrow(sales)` would iterate over
#     c(1, 0)).
# The result is stored as a numeric column, as before.
sales$count <- vapply(
  seq_len(nrow(sales)),
  function(i) {
    active <- promotions$program == sales$program[i] &
      promotions$start.date <= sales$year.month.day[i] &
      promotions$end.date >= sales$year.month.day[i]
    # na.rm: a promotion with a missing program or date never counts.
    sum(active, na.rm = TRUE)
  },
  numeric(1)
)

Output Example:

 # Expected result: the sales data plus `count`, the number of promotions of
 # the same program that were running on each sales date.
 sales <- data.frame(
   year.month.day = as.Date(c(
     "2013-02-01", "2014-09-01", "2013-08-01", "2013-04-01", "2012-11-01"
   )),
   program = c("a", "b", "a", "a", "b"),
   monthly.sales = c(200, 200, 200, 400, 200),
   count = c(3, 1, 3, 3, 2)
 )

However, since my actual datasets are very large, this loop crashes when I run it in R.

Is there a more efficient (vectorized) way to do this? If so, is there something in dplyr that could help?

+4
4

First, load the required packages:

library(dplyr)
library(lubridate)

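Then floor the promotion start and end dates to the first day of their month, so that they match the date format used in the sales dataframe: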

# Snap both promotion boundaries back to the first day of their month so they
# line up with the month-start dates stored in `sales`.
df <- mutate(
  promotions,
  start.date = floor_date(start.date, unit = "month"),
  end.date = floor_date(end.date, unit = "month")
)

Next, build the sequence of months covered by each promotion:

# List column: for each promotion, the sequence of month-start dates from its
# start.date to its end.date.
# SIMPLIFY = FALSE guarantees a list with one element per row. With the default
# (SIMPLIFY = TRUE) mapply() collapses the result into a matrix/vector whenever
# every sequence has the same length (or there is a single promotion), which
# breaks the list-column assignment.
df$output <- mapply(
  function(x, y) seq(x, y, by = "month"),
  df$start.date,
  df$end.date,
  SIMPLIFY = FALSE
)

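Finally, unnest the sequences, count the promotions per month and program, and merge the counts with sales: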

# Expand the list column to one row per (promotion, month), count promotions
# per month and program, and attach the counts to `sales`.
df %>%
  tidyr::unnest(output) %>%
  group_by(output, program) %>%
  summarise(prom_num = n()) %>%
  ungroup() %>%
  # all.x = TRUE keeps sales rows that have no promotion in that month; a plain
  # inner merge would silently drop them.
  merge(sales, .,
    by.x = c("year.month.day", "program"),
    by.y = c("output", "program"),
    all.x = TRUE
  ) %>%
  # Sales rows without any matching promotion get a count of 0 instead of NA.
  mutate(prom_num = coalesce(prom_num, 0L))

Result:

  year.month.day program monthly.sales prom_num
1     2012-11-01       b           200        2
2     2013-02-01       a           200        3
3     2013-04-01       a           400        3
4     2013-08-01       a           200        3
5     2014-09-01       b           200        1
+3

You can also do this with SQL.

library(sqldf)
# Count, per sales row, the promotions of the same program whose date range
# contains the sales date.
# * Column names containing dots are double-quoted because `.` is an SQL
#   operator; the names used are the ones the data frames actually have.
# * The date condition is part of the ON clause (not WHERE) so the LEFT JOIN
#   keeps sales rows without any promotion; count(p.program) ignores the NULLs
#   of such rows and therefore yields 0 for them.
sqldf('
  select s."year.month.day", s.program, s."monthly.sales",
         count(p.program) as count
  from sales s
  left join promotions p
    on p.program = s.program
   and s."year.month.day" between p."start.date" and p."end.date"
  group by s."year.month.day", s.program, s."monthly.sales"
')

2 , ymd , . ymd . .

+5

data.table:

library(data.table) # v1.9.7+
setDT(promotions) # convert to data.table by reference
setDT(sales)

# Non-equi join: for each row of `sales` (by = .EACHI), match the promotions of
# the same program whose start.date <= sales date <= end.date; .N is the number
# of matching promotions. nomatch = 0L drops sales rows with no match.
ans <- promotions[
  sales,
  .(monthly.sales, .N),
  by = .EACHI,
  allow.cartesian = TRUE,
  on = .(program, start.date <= year.month.day, end.date >= year.month.day),
  nomatch = 0L
]

# Keep only one of the two date join columns and give it the name of the sales
# date column (the previous name "year.month.date" was a typo).
ans[, end.date := NULL]
setnames(ans, "start.date", "year.month.day")
#    program year.month.day monthly.sales N
# 1:       a     2013-02-01           200 3
# 2:       b     2014-09-01           200 1
# 3:       a     2013-08-01           200 3
# 4:       a     2013-04-01           400 3
# 5:       b     2012-11-01           200 2

. .

+5

Another option is an overlap join; see ?data.table::foverlaps

library(data.table)
# foverlaps() needs an interval on both sides, so give every sales date a
# zero-length interval [year.month.day, year.month.day].
setDT(sales)[, c("start.date", "end.date") := year.month.day] # Add overlap cols
setkey(sales, program, start.date, end.date) # Key for join

# Overlap-join promotions onto sales within each program and count the matches.
# * Promotions that overlap no sales date come back with NA sales columns and
#   are filtered out before counting.
# * Counts are grouped and joined back by program AND date; grouping by date
#   alone would merge the counts of different programs that share a sales date.
res <- foverlaps(setDT(promotions), sales)[
  !is.na(year.month.day),
  .N,
  by = .(program, year.month.day)
] # Count joins
sales[res, count := i.N, on = .(program, year.month.day)] # Update `sales` with results
sales[is.na(count), count := 0L] # Sales rows without any promotion
sales
#    year.month.day program monthly.sales start.date   end.date count
# 1:     2013-02-01       a           200 2013-02-01 2013-02-01     3
# 2:     2013-04-01       a           400 2013-04-01 2013-04-01     3
# 3:     2013-08-01       a           200 2013-08-01 2013-08-01     3
# 4:     2012-11-01       b           200 2012-11-01 2012-11-01     2
# 5:     2014-09-01       b           200 2014-09-01 2014-09-01     1

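This adds zero-length interval columns to sales, overlap-joins the promotions on those intervals + program, and then updates sales with the counts. If you don't need the extra columns, you can remove them with sales[, c("start.date", "end.date") := NULL]. For more examples, Google "foverlaps data.table".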

+3

Source: https://habr.com/ru/post/1649551/


All Articles