Work with dates dependent on other factors in R

I am having trouble developing the best way to answer this problem. I am worried that this may be due to a fundamental misunderstanding regarding analyzes (more on this later). The problem is that: about 25,000 transactions, I need to find which customers called within two months after the expiration of their subscription.

id = unique customer ID

call = 1 signifies the observation is a call

lapse = 1 signifies the observation is a lapse

Please note that if any client has a call and expiration on the same day, there will be two entries for that client on that day; the client can have several calls on one date (each own observation and own line in df); but any customer can only once a day.

Mini df without solution:

library(lubridate)
df <- data.frame(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4),
             date = dmy(c("01-01-2014", "07-02-2014",   "05-03-2014",   "14-02-2014",   "15-04-2014",   "17-04-2014",   "11-05-2014",   "19-08-2014",   "07-10-2014",   "21-12-2014",   "04-06-2010",   "06-03-2012",   "12-07-2012",   "13-07-2012",   "14-01-2014",   "05-05-2014",   "19-08-2014",   "19-08-2014",   "13-02-2013",   "11-11-2013",   "04-03-2014",   "10-12-2014",   "02-03-2017",   "03-03-2017")), 
             call = c(1,    0,  0,  1,  1,  1,  0,  1,  1,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  1,  1,  1,  1,  0),
             lapse = c(0,   1,  1,  0,  0,  0,  1,  0,  0,  1,  1,  1,  1,  1,  0,  1,  0,  1,  1,  0,  0,  0,  0,  1))

... and the solution vector:

df$call_2months_or_less_before_lapse <- c(1,    0,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0)

, , , Excel. - !

, , - tidyverse. , , . , , R.

+4
3

, R, , , , . dplyr . dplyr R- split() lapply().

# Function that finds time to most recent call before a lapse.
time_to_most_recent_call <- function(x) {
  # Extract vector of dates when the subscription lapsed, and vector of dates when customer called.
  lapse_dates <- x$date[x$lapse == 1]
  call_dates <- x$date[x$call == 1]
  # Get all pairwise time intervals in days between lapse and call.
  time_intervals <- sapply(lapse_dates, function(z) z - call_dates)
  # Find most recent call before each lapse (only look at positive time intervals)
  shortest_intervals <- apply(time_intervals, 2, function(z) min(z[z >= 0]))  
  # Return result (also include flag if it between 0 and 60)
  return(data.frame(lapse_date = lapse_dates, 
                    interval = shortest_intervals, 
                    within2months = shortest_intervals >= 0 & shortest_intervals <= 60))
}

library(dplyr)

df %>%
  group_by(id) %>%
  do(time_to_most_recent_call(.))

. , 60 (2 ). , , .

+2

.

data.table:

library(lubridate)
library(data.table)
setDT(df)[, answer := 
            df[.(id = id, date1 = date, date2 = date %m+% months(2)), 
               on = .(id, date >= date1, date <= date2),
               as.integer(any(lapse == 1)), by = .EACHI]$V1][
                 call == 0, answer := 0][]
    id       date call lapse call_2months_or_less_before_lapse answer
 1:  1 2014-01-01    1     0                                 1      1
 2:  1 2014-02-07    0     1                                 0      0
 3:  1 2014-03-05    0     1                                 0      0
 4:  1 2014-03-14    1     0                                 0      1
 5:  1 2014-04-15    1     0                                 1      1
 6:  1 2014-04-17    1     0                                 1      1
 7:  1 2014-05-11    0     1                                 0      0
 8:  1 2014-08-19    1     0                                 0      0
 9:  1 2014-10-07    1     0                                 0      0
10:  1 2014-12-21    0     1                                 0      0
11:  3 2010-06-04    0     1                                 0      0
12:  3 2012-03-06    0     1                                 0      0
13:  3 2012-07-12    0     1                                 0      0
14:  3 2012-07-13    0     1                                 0      0
15:  3 2014-01-14    1     0                                 0      0
16:  3 2014-05-05    0     1                                 0      0
17:  3 2014-08-19    1     0                                 1      1
18:  3 2014-08-19    0     1                                 0      0
19:  4 2013-02-13    0     1                                 0      0
20:  4 2013-11-11    1     0                                 0      0
21:  4 2014-03-04    1     0                                 0      0
22:  4 2014-12-10    1     0                                 0      0
23:  4 2017-03-02    1     0                                 1      1
24:  4 2017-03-03    0     1                                 0      0
    id       date call lapse call_2months_or_less_before_lapse answer

, 4 OP, .

, ( ), 2 . , - . , 1 0, .

equi:

df[.(id = id, date1 = date, date2 = date %m+% months(2)), 
   on = .(id, date >= date1, date <= date2), 
   as.integer(any(lapse == 1)), by = .EACHI]

df data.table, " ", .(), id, date date %m+% months(2). lubridate OP 2- ( 60 ).

on , , .. id, . (by = .EACHI), any() .

df answer. := df , .

, answer , .

setDT(df) df data.table.

:

library(lubridate)
df <- data.frame(
  id    = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4),
  date  = dmy(c("01-01-2014", "07-02-2014", "05-03-2014", "14-03-2014", "15-04-2014", "17-04-2014", 
                "11-05-2014", "19-08-2014", "07-10-2014", "21-12-2014", "04-06-2010", "06-03-2012", 
                "12-07-2012", "13-07-2012", "14-01-2014", "05-05-2014", "19-08-2014", "19-08-2014",
                "13-02-2013", "11-11-2013", "04-03-2014", "10-12-2014", "02-03-2017", "03-03-2017")), 
  call  = c(1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0),
  lapse = c(0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1))

, 4 OP. 4 "14-02-2014", . , "14-03-2014", .

, call_2months_or_less_before_lapse, OP 0. 1. "14-02-2014" 3 . "14-03-2014" 7 .

+2

, . , (qite) R.

, . , . :

2 , (, 1 ). , , , .

:

1. , 2    df.

# @df_to_proceed is the data frame to be looked up
# @current_df_i is the row index of the precessed lapse
    Find2MonthsEarlier <- function(df_to_proceed, current_df_i) {
    # the customer ID for the given lapse
    given_id <- df_to_proceed$id[current_df_i] 
    # select the entries of the df corresponding 
    # to the 2-month period before the given lapse
    current_date <- df_to_proceed$date[current_df_i]
    # assume 2 month as simply 60 days
    date_2month_earlier <- as_date(current_date - 60)
    period_2month_earlier <- interval(date_2month_earlier, current_date)
    # select a subset for the certain customer and the 2-month period 
    # before the given lapse
    subset_2month_earlier <- df_to_proceed[with(df_to_proceed, 
        (date %within% period_2month_earlier & id == given_id)), ]
    subset_2month_earlier_reordrd <- subset_2month_earlier[order(subset_2month_earlier$date), ]
    # finds the row with the latest call within 2-month period before the given lapse
    i_of_latest_call_within2months <- nrow(subset_2month_earlier_reordrd) - 
        match(table = rev(subset_2month_earlier_reordrd$call), x = 1) +
        1
    date_of_latest_call_within2months <- subset_2month_earlier_reordrd[i_of_latest_call_within2months,
        "date"]
    # extract all the dates between the latest call within 2-month period 
    # before the given lapse (for the certain customer!)
    dates_to_flag <- subset_2month_earlier$date[subset_2month_earlier$date <=
        date_of_latest_call_within2months]  
    return(list(Subset = subset_2month_earlier, 
        LatestDate = as_date(date_of_latest_call_within2months),
        ID = given_id, FlaggedDates = dates_to_flag))
}

2. df

i_of_lapse <- which(df$lapse == 1)

3.

for (i in i_of_lapse) {
    test_list <- Find2MonthsEarlier(df_to_proceed = df, 
        current_df_i = i)
    # duplicated dates are processed differently
    dates_with_dupl <- unique(test_list[["FlaggedDates"]][duplicated(test_list[["FlaggedDates"]])])
    # check length(dates_with_dupl) to prevent loss of the data
    if (length(dates_with_dupl) > 0) {
        dates_without_dupl <- test_list[["FlaggedDates"]][!(test_list$date %in% dates_with_dupl)]
    } else {
        dates_without_dupl <- test_list[["FlaggedDates"]]
        }
    # entries with duplicated dates are flagged only if corresponding call = 1
    df[(df$date %in% dates_with_dupl & 
            df$id == test_list[["ID"]] & df$call == 1),
        "flag_calls_2month_earlier_inR"] <- 1
    df[(df$date %in% dates_without_dupl & 
        df$id == test_list[["ID"]]),
        "flag_calls_2month_earlier_inR"] <- 1   
    }

The only thing I'm not sure about is the values df$call_2months_or_less_before_lapse[c(3, 4)]corresponding to the dates "07-02-2014"and "14-02-2014". It was call == 1for "14-02-2014", and it was lapse == 1for "05-03-2014". It seems like it should be flag == 1for "07-02-2014"and "14-02-2014", but actually it is 0. So, something is wrong both with the formulation of the problem and with examples of meanings. It would be great if you could check and comment on this issue.

+1
source

Source: https://habr.com/ru/post/1691904/


All Articles