R - count how many rows in a data frame have the same value and a date within x days

I have a dataframe 'DFrame' that looks like this:

RecordNo |  Cust_ID  |  Record_Date
1        |  023      |  2014-03-01
2        |  056      |  2014-01-18
3        |  041      |  2014-03-04
4        |  023      |  2014-03-21
5        |  056      |  2014-01-25
6        |  003      |  2014-03-01
7        |  023      |  2014-04-01
8        |  023      |  2014-04-02

I would like to add a column that shows, for each record, how many further records with the same customer ID occur within the 14 days following that record's Record_Date.

RecordNo |  Cust_ID  |  Record_Date  | 14-day_Repeat_Count
1        |  023      |  2014-03-01   | 0
2        |  056      |  2014-01-18   | 1
3        |  041      |  2014-03-04   | 0
4        |  023      |  2014-03-21   | 2
5        |  056      |  2014-01-25   | 0
6        |  003      |  2014-03-01   | 0
7        |  023      |  2014-04-01   | 1
8        |  023      |  2014-04-02   | 0

I am trying to write fast code in R to accomplish this. I found several articles that make counting the rows that meet a condition look easy, but they usually only cover static conditions, or conditions that do not depend on the values of the current record: http://one-line-it.blogspot.ca/2013/01/r-number-of-rows-matching-condition.html

I assume the logic might look like this:

# Sort DFRAME by RECORD_DATE decreasing=FALSE
......
# Count the records below the current record that have a matching Cust_ID
# AND a Record_Date no more than 14 days after the current Record_Date

# put result into DFrame$14-day_Repeat_Count
......

I made this type of logic in DAX:

=calculate(counta([Cust_ID]),
      // Filter argument of CALCULATE: rows for the same customer whose date is
      // strictly after the current row's date and at most 14 days later.
      filter(DFrame,
             [Cust_ID]=Earlier([Cust_ID]) &&
             [Record_Date] > Earlier([Record_Date]) &&
             [Record_Date] <= (Earlier([Record_Date]) + 14)
      )
)

(DAX is a Microsoft formula language.) This is also what Excel's "CountIfs" (another Microsoft tool) does - is there a way to do the same in R?

+4
5

Here is one approach, using data.table:

## Sample data: three customers with twelve records each; Record_Date is a
## plain day number. The table is sorted by customer, then by date.
library("data.table")
RecordNo <- seq_len(36)
Record_Date <- c(31, 33, 38, 41, 44, 59, 68, 69, 75, 78, 85, 88,
                 32, 34, 45, 46, 51, 54, 60, 65, 67, 70, 74, 80,
                 33, 35, 42, 45, 50, 60, 65, 70, 75, 80, 82, 85)
Cust_ID <- rep(c(1, 2, 3), each = 12)
data <- data.table(Cust_ID, Record_Date)
data <- data[order(Cust_ID, Record_Date)]

## Number the customers 1, 2, 3, ... so that Cust_No can index into Ref
data[, Cust_No := .GRP, by = "Cust_ID"]

## One row per customer, holding all of that customer's dates in a list cell
Ref <- data[, list(Compare_Date = list(I(Record_Date))), by = "Cust_ID"]

## For every record, count the customer's dates that lie 1 to 14 days later
system.time(
  data$Roll.Cnt <- vapply(
    seq_len(nrow(data)),
    function(row) {
      gap <- as.numeric(
        Ref$Compare_Date[[data$Cust_No[row]]] - data$Record_Date[row]
      )
      sum(gap > 0 & gap <= 14)
    },
    integer(1)
  )
)

Result:

## Keep only the columns of interest, sort by customer and date, then print
data <- data[, .(Cust_ID, Record_Date, Roll.Cnt)]
data <- data[order(Cust_ID, Record_Date)]
data

     Cust_ID   Record_Date  Roll.Cnt
 1:       1          31        4
 2:       1          33        3
 3:       1          38        2
 4:       1          41        1
 5:       1          44        0
 6:       1          59        2
 7:       1          68        3
 8:       1          69        2
 9:       1          75        3
10:       1          78        2
11:       1          85        1
12:       1          88        0
13:       2          32        3
14:       2          34        2
15:       2          45        3
+3

First, create some sample data.

## Sample data: 36 record dates, given as day offsets from 2010-01-01,
## twelve for each of three customers
Record_Date <- as.Date("2010-01-01") +
  c(31, 33, 38, 41, 44, 59, 68, 69, 75, 78, 85, 88,
    32, 34, 45, 46, 51, 54, 60, 65, 67, 70, 74, 80,
    33, 35, 42, 45, 50, 60, 65, 70, 75, 80, 82, 85)
Cust_ID <- rep(c(1, 2, 3), each = 12)

## Put both vectors into a single data.table
library("data.table")
data <- data.table(Cust_ID, Record_Date)

   Cust_ID Record_Date
 1:       1  2010-02-01
 2:       1  2010-02-03
 3:       1  2010-02-08
 4:       1  2010-02-11
 5:       1  2010-02-14
 6:       1  2010-03-01
 7:       1  2010-03-10
 8:       1  2010-03-11
 9:       1  2010-03-17
10:       1  2010-03-20
11:       1  2010-03-27
12:       1  2010-03-30
13:       2  2010-02-02
14:       2  2010-02-04
15:       2  2010-02-15
16:       2  2010-02-16
17:       2  2010-02-21

Then, in R, the counts can be computed for every record in a single data.table expression.

## For every record, count the later records of the same customer that fall
## within the following 14 days.
## The work is done per customer (by = "Cust_ID") on a copy of `data`, so
## `data` itself is left untouched and no table-wide cross join (one row per
## pair of records, across all customers) has to be materialised.
## Only strictly later dates are counted, so a record never counts itself and
## no "- 1" correction is needed. The result has one row per input record, in
## the original row order, with the integer column Within14 appended.
output <- copy(data)[
  ,
  Within14 := {
    days <- as.numeric(Record_Date)
    vapply(days, function(d) sum(days > d & days <= d + 14), integer(1))
  },
  by = "Cust_ID"
][]  # trailing [] so that the result prints normally after :=

    Cust_ID Record_Date Within14
 1:       1  2010-02-01        4
 2:       1  2010-02-03        3
 3:       1  2010-02-08        2
 4:       1  2010-02-11        1
 5:       1  2010-02-14        0
 6:       1  2010-03-01        2
 7:       1  2010-03-10        3
 8:       1  2010-03-11        2
 9:       1  2010-03-17        3
10:       1  2010-03-20        2
11:       1  2010-03-27        1
12:       1  2010-03-30        0
13:       2  2010-02-02        3
14:       2  2010-02-04        2
15:       2  2010-02-15        3
16:       2  2010-02-16        3
17:       2  2010-02-21        3

The same approach, applied to the data from the question:

## The data from the question, combined into one data.table
library("data.table")
Record_Date <- as.Date(c("2014-03-01", "2014-01-18", "2014-03-04",
                         "2014-03-21", "2014-01-25", "2014-03-01",
                         "2014-04-01", "2014-04-02"))
Cust_ID <- c("023", "056", "041", "023", "056", "003", "023", "023")


data <- data.table(Cust_ID, Record_Date)

## For every record, count the later records of the same customer that fall
## within the following 14 days.
## The work is done per customer (by = "Cust_ID") on a copy of `data`, so
## `data` itself is left untouched and no table-wide cross join (one row per
## pair of records, across all customers) has to be materialised.
## Only strictly later dates are counted, so a record never counts itself and
## no "- 1" correction is needed. The result has one row per input record, in
## the original row order, with the integer column Within14 appended.
output <- copy(data)[
  ,
  Within14 := {
    days <- as.numeric(Record_Date)
    vapply(days, function(d) sum(days > d & days <= d + 14), integer(1))
  },
  by = "Cust_ID"
][]  # trailing [] so that the result prints normally after :=

output

Cust_ID Record_Date Within14
1:     023  2014-03-01        0
2:     056  2014-01-18        1
3:     041  2014-03-04        0
4:     023  2014-03-21        2
5:     056  2014-01-25        0
6:     003  2014-03-01        0
7:     023  2014-04-01        1
8:     023  2014-04-02        0
+3

I do not think you will get much faster than Rcpp.

First, build the data frame.

# Read the sample records from the question
df <- read.table(header = TRUE, stringsAsFactors = FALSE,
                 text = "RecordNo   Cust_ID    Record_Date
1          023        2014-03-01
2          056        2014-01-18
3          041        2014-03-04
4          023        2014-03-21
5          056        2014-01-25
6          003        2014-03-01
7          023        2014-04-01
8          023        2014-04-02")

# Replace each date by the number of days since the earliest date
df$Record_Date <- local({
  dates <- as.Date(df$Record_Date)
  as.numeric(dates) - as.numeric(min(dates))
})

# Sort the rows by ascending day number
df <- df[order(df[["Record_Date"]]), ]

Then calculate the counts using the Rcpp function.

library(Rcpp)
# count_14(id, day): for each position i, counts the positions after i whose
# id equals id[i], scanning forward while day - day[i] <= 14.
#   id  - numeric customer ids, one per record
#   day - numeric day numbers, one per record; the forward scan stops at the
#         first record more than 14 days ahead, so the records must be sorted
#         by ascending day for the counts to be complete
# Returns a numeric vector of counts with the same length as id.
# NOTE(review): a later row of the same id with the same day (difference 0)
# is counted as well - confirm that this is intended.
cppFunction('
  NumericVector count_14( NumericVector id, NumericVector day) {
    unsigned int n = id.size(), i, j;
    NumericVector out(n);

    for (i = 0; i < n; i++) {
      // Test the bound first: evaluating day[i + j] before checking
      // i + j < n reads one element past the end of the vector.
      for (j = 1; i + j < n && day[i + j] - day[i] <= 14; j++) {
        if (id[i] == id[i + j]) out[i]++;
      }
    }
    return out;
  }')
# Store the per-record counts, then show the rows in RecordNo order
df[["count"]] <- count_14(df[["Cust_ID"]], df[["Record_Date"]])
df[order(df[["RecordNo"]]), ]
#   RecordNo Cust_ID Record_Date count
# 1        1      23          42     0
# 2        2      56           0     1
# 3        3      41          45     0
# 4        4      23          62     2
# 5        5      56           7     0
# 6        6       3          42     0
# 7        7      23          73     1
# 8        8      23          74     0
+3
source

You can try using findInterval(), which has O(n * log n) complexity:

DF <- read.csv(text =
'"RecordNo","Cust_ID","Record_Date"
1,"023","2014-03-01"
2,"056","2014-01-18"
3,"041","2014-03-04"
4,"023","2014-03-21"
5,"056","2014-01-25"
6,"003","2014-03-01"
7,"023","2014-04-01"
8,"023","2014-04-02"',
stringsAsFactors = FALSE)

DF$Record_Date <- as.POSIXct(DF$Record_Date, format = "%Y-%m-%d", tz = "GMT")

# sort by ascending date (findInterval() needs a non-decreasing vec)
DF <- DF[order(DF$Record_Date), ]

# for each date D, find the row index of the LAST date <= D + 14 days
# (POSIXct arithmetic is in seconds)
window_secs <- 14 * 24 * 60 * 60
DF$EndIdx <- findInterval(x = DF$Record_Date + window_secs,
                          vec = DF$Record_Date)

# For each row, count the rows in i..EndIdx with the same customer and a
# strictly later date. Requiring a later date excludes the row itself, which
# was previously included and made every count one too high.
DF$Count <- vapply(seq_len(nrow(DF)), function(i) {
  window <- seq.int(i, DF$EndIdx[i])
  sum(DF$Cust_ID[window] == DF$Cust_ID[i] &
        DF$Record_Date[window] > DF$Record_Date[i])
}, integer(1))

DF
#   RecordNo Cust_ID Record_Date EndIdx Count
# 2        2      56  2014-01-18      2     1
# 5        5      56  2014-01-25      2     0
# 1        1      23  2014-03-01      5     0
# 6        6       3  2014-03-01      5     0
# 3        3      41  2014-03-04      5     0
# 4        4      23  2014-03-21      8     2
# 7        7      23  2014-04-01      8     1
# 8        8      23  2014-04-02      8     0
+2
source

This is not a quick way to do this, but it should give someone a start. In my experience, sliding calculations in R can be difficult to do quickly. If you cannot find a smart solution, you may have to take a look at Rcpp.

df <- read.table(text = "RecordNo   Cust_ID    Record_Date
1          023        2014-03-01
2          056        2014-01-18
3          041        2014-03-04
4          023        2014-03-21
5          056        2014-01-25
6          003        2014-03-01
7          023        2014-04-01
8          023        2014-04-02", header = TRUE, stringsAsFactors = FALSE)

# Plain numbers are easier to work with than dates:
# convert to the number of days after the earliest day
df$Record_Date <- as.Date(df$Record_Date)
df$Record_Date <- as.numeric(df$Record_Date - min(df$Record_Date))

# Count, for every record, the records of the same customer that are dated
# strictly later and at most `window` days later.
#   cust_id - customer id of each record
#   day     - numeric day of each record, same length as cust_id
#   window  - length of the look-ahead window in days (inclusive)
# Returns an integer vector with one count per record.
# Slow but simple: every record is compared with all the others, O(n^2).
count_repeats <- function(cust_id, day, window = 14) {
  vapply(seq_along(day), function(i) {
    # "<=" so that a record exactly `window` days later is still counted
    in_window <- day > day[i] & day <= day[i] + window
    sum(cust_id[in_window] == cust_id[i])
  }, integer(1))
}

df$count <- count_repeats(df$Cust_ID, df$Record_Date)

df
#   RecordNo Cust_ID Record_Date count
# 1        1      23          42     0
# 2        2      56           0     1
# 3        3      41          45     0
# 4        4      23          62     2
# 5        5      56           7     0
# 6        6       3          42     0
# 7        7      23          73     1
# 8        8      23          74     0
0
source

Source: https://habr.com/ru/post/1536663/


All Articles