Count the number of unique dates per identifier between two dates, conditionally

I have a main table that contains the dates of the main events for each person:

# Main events: one row per person and main-event date. `previous` is the date
# of that person's preceding main event (NA for the first one) and `diff` is
# the number of days between `previous` and `last`.
dfMain <- data.frame(
  last = c(
    "2017-08-01", "2017-08-01", "2017-08-05", "2017-09-02", "2017-09-02"
  ),
  previous = c(NA, NA, "2017-08-01", "2017-08-05", "2017-08-01"),
  personid = c(12341, 122345, 12341, 12341, 122345),
  diff = c(NA, NA, 4, 28, 32)
)

(NAs in the "previous" and "diff" variables indicate that this is the person's first main event, so there is no previous date and no time difference.)

I also have a secondary table, which consists of a "secondary event" for each person:

# Secondary events: one row per person and secondary-event date.
dfSecondary <- data.frame(
  date = c(
    "2017-09-01", "2017-08-30", "2017-08-04", "2017-08-02", "2017-08-02"
  ),
  personid = c(122345, 122345, 12341, 122345, 12341)
)

My question is: what is the most efficient way (given the size of my data) to augment my "dfMain" data frame with the number of unique secondary events that fall between the dates of the main events for each person?

In a dummy example, my goal is to get this table:

# Expected result: dfMain plus the number of secondary events per main event
# (NA where there is no previous main event to bound the interval).
Occurances <- c(NA, NA, 2, 0, 3)
dfObjective <- cbind(dfMain, Occurances)
+4
4

Jaap's data.table solution can be written more concisely:

# Non-equi join: for every row of dfMain, count the dfSecondary rows of the
# same person whose date lies within [previous, last]. by = .EACHI evaluates
# .N once per dfMain row, so the counts line up with dfMain's row order.
perEventCount <- dfSecondary[
  dfMain,
  on = .(personid, date <= last, date >= previous),
  .N,
  by = .EACHI
]
# Add the counts as a new column by reference and print the result.
dfMain[, Occurrences := perEventCount[["N"]]][]
         last   previous personid diff Occurrences
1: 2017-08-01       <NA>    12341   NA           0
2: 2017-08-01       <NA>   122345   NA           0
3: 2017-08-05 2017-08-01    12341    4           2
4: 2017-09-02 2017-08-05    12341   28           0
5: 2017-09-02 2017-08-01   122345   32           3

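dfSecondary[dfMain, ...] is a right join, i.e., it keeps all rows of dfMain. With by = .EACHI, the count N is computed separately for each row of dfMain and is then appended to dfMain as the new column Occurrences.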

Non-equi joins are a relatively new feature, available since data.table version 1.9.8 (on CRAN since 25 Nov 2016).

The data must first be converted to data.table class, with the date columns coerced to a date class:

library(data.table)
cols <- c("last", "previous")

# Convert both tables to data.table by reference.
setDT(dfMain)
setDT(dfSecondary)

# Coerce the date columns to IDate so they can be compared in a non-equi join.
dfMain[, (cols) := lapply(.SD, as.IDate), .SDcols = cols]
dfSecondary[, date := as.IDate(date)]

# Show the converted tables.
print(dfMain)
print(dfSecondary)
+3

A data.table solution:

# Load 'data.table' and convert the date columns to a date class. Columns are
# selected by name so the code does not depend on their position.
library(data.table)
dateCols <- c("last", "previous")
setDT(dfMain)[, (dateCols) := lapply(.SD, as.IDate), .SDcols = dateCols][]
setDT(dfSecondary)[, date := as.IDate(date)][]

# Non-equi join: for every main event, list the secondary-event dates of the
# same person that lie strictly between `previous` and `last`. The result goes
# into a new table so that dfSecondary itself is left intact.
secondaryInRange <- dfSecondary[dfMain
                                , on = .(personid, date > previous, date < last)
                                , .(dates = x.date)
                                , by = .EACHI]
# Both join columns come back named 'date' (holding the values of `previous`
# and `last`), so they can only be renamed by position.
setnames(secondaryInRange, 2:3, c("previous", "last"))

# Summarise: drop main events without a previous date, then count the matched
# (non-NA) secondary dates per main event.
occCounts <- na.omit(secondaryInRange, cols = c("personid", "previous", "last"))[
  , .(Occ = sum(!is.na(dates)))
  , by = .(personid, previous, last)]

# Update join: add the counts to dfMain by reference; unmatched rows stay NA.
dfMain[occCounts
       , on = .(personid, previous, last)
       , Occ := i.Occ][]

which gives:

         last   previous personid diff Occ
1: 2017-08-01       <NA>    12341   NA  NA
2: 2017-08-01       <NA>   122345   NA  NA
3: 2017-08-05 2017-08-01    12341    4   2
4: 2017-09-02 2017-08-05    12341   28   0
5: 2017-09-02 2017-08-01   122345   32   3
+5

A solution using dplyr and tidyr:

library(dplyr)
library(tidyr)

# For each main event, count the secondary events of the same person whose
# date falls within [previous, last]. Main events without a previous date get
# NA. Rows come back sorted by the grouping columns, not in dfMain's order.
dfMain %>%
  left_join(dfSecondary, by = "personid") %>%                # pair each main event with the person's secondary events
  mutate_at(c("last", "previous", "date"), as.Date) %>%      # reformat as date
  mutate(is_between = date <= last & date >= previous) %>%   # tests if it is in between
  group_by(last, previous, personid, diff) %>%               # group by columns from initial df
  summarize(Occ = sum(is_between, na.rm = TRUE)) %>%         # na.rm: a person with no secondary events counts as 0, not NA
  ungroup() %>%                                              # ungroup to have regular table
  mutate(Occ = replace(Occ, is.na(previous), NA))            # add NAs where previous was NA

# # A tibble: 5 x 5
#         last   previous personid  diff   Occ
#       <date>     <date>    <dbl> <dbl> <int>
# 1 2017-08-01         NA    12341    NA    NA
# 2 2017-08-01         NA   122345    NA    NA
# 3 2017-08-05 2017-08-01    12341     4     2
# 4 2017-09-02 2017-08-01   122345    32     3
# 5 2017-09-02 2017-08-05    12341    28     0

Note: the order is changed, tell me if this is a problem, and I will fix it.

+2
source

Here is the solution from tidyverse.

library(tidyverse)

# Convert factor/character date columns to Date class and add a row ID that
# is used later to join the counts back on.
# row_number() is used instead of 1:n() so a zero-row dfMain does not fail
# (1:0 expands to c(1, 0)).
dfMain2 <- dfMain %>%
  mutate_if(is.factor, as.character) %>%
  mutate_if(is.character, as.Date) %>%
  mutate(ID = row_number())

# Bring the secondary events into the same shape: factor -> character -> Date,
# then attach a constant Count of 1 per event so events can be summed later.
dfSecondary2 <- mutate_if(dfSecondary, is.factor, as.character)
dfSecondary2 <- mutate_if(dfSecondary2, is.character, as.Date)
dfSecondary2 <- mutate(dfSecondary2, Count = 1)

# Count secondary events per main event:
# 1. keep only main events that have a previous date,
# 2. expand each into one row per calendar day from `previous` to `last`,
# 3. join the secondary events on day ("Period" = "date") and person,
# 4. sum the matched Count values per main-event ID (unmatched days are NA and
#    are ignored via na.rm = TRUE, giving 0 when nothing matched).
dfMain3 <- dfMain2 %>%
  drop_na(previous) %>%
  mutate(Period = map2(previous, last, seq, by = 1)) %>%
  unnest(Period) %>%  # name the list column: bare unnest() is deprecated in tidyr >= 1.0.0
  left_join(dfSecondary2, by = c("Period" = "date", "personid")) %>%
  group_by(ID) %>%
  summarise(Occurances = sum(Count, na.rm = TRUE))

# Attach the per-event counts to the full main table via the helper ID, then
# drop the ID again. Main events without a previous date have no match and
# therefore get NA.
dfObjective <- left_join(dfMain2, dfMain3, by = "ID")
dfObjective <- select(dfObjective, -ID)

dfObjective
        last   previous personid diff Occurances
1 2017-08-01       <NA>    12341   NA         NA
2 2017-08-01       <NA>   122345   NA         NA
3 2017-08-05 2017-08-01    12341    4          2
4 2017-09-02 2017-08-05    12341   28          0
5 2017-09-02 2017-08-01   122345   32          3

Data

# Main events: one row per person and main-event date (`previous` and `diff`
# are NA for a person's first main event).
dfMain <- data.frame(
  last = c(
    "2017-08-01", "2017-08-01", "2017-08-05", "2017-09-02", "2017-09-02"
  ),
  previous = c(NA, NA, "2017-08-01", "2017-08-05", "2017-08-01"),
  personid = c(12341, 122345, 12341, 12341, 122345),
  diff = c(NA, NA, 4, 28, 32)
)


# Secondary events: one row per person and secondary-event date.
dfSecondary <- data.frame(
  date = c(
    "2017-09-01", "2017-08-30", "2017-08-04", "2017-08-02", "2017-08-02"
  ),
  personid = c(122345, 122345, 12341, 122345, 12341)
)
0
source

Source: https://habr.com/ru/post/1685026/


All Articles