Calculate purchase trip length in R

I am currently stuck in R with a very specific problem: I have a dataset of approx. 2.5 million rows containing data on customer contacts related to purchases. The format is as follows (I have excluded most demographic data and some other variables for simplicity):

   UserID   PurchaseID       Time of Contact       Purchase   Age   
     1          1          2015-08-07 19:16:59        0       35
     1          1          2015-08-07 21:17:32        0       35
     1          1          2015-08-07 22:42:51        0       35
     1          1          2015-08-07 23:06:13        0       35
     1          2          2016-05-26 11:01:16        1       35
     1          2          2016-06-02 19:57:25        1       35
     1          2          2016-06-15 15:48:20        1       35
     1          2          2016-06-21 08:39:44        1       35
     2          3          2015-11-14 11:32:10        0       51
     2          3          2015-11-14 11:32:20        0       51
     2          3          2015-11-14 11:33:50        0       51

I want to analyze how the average time between contacts within each individual trip (customer journey) affects the likelihood of a purchase. Therefore, I want to calculate the total length of each customer's trip, i.e. the time between the first and the last contact of each PurchaseID (for example, for PurchaseID 1 that is from 19:16:59 to 23:06:13, which gives 03:49:14). Subsequently, I want to aggregate the data so that it looks like this:

   UserID   PurchaseID    Customer journey length  Purchase   Age   
     1          1                 03:49:14            0       35
     1          2                621:38:28            1       35
     2          3                 00:01:40            0       51

I honestly have no idea where to start, so I hope you can help me! Many thanks!

+4
3

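One option is to use dplyr together with lubridate (group the rows by user, purchase and the constant columns, then take the difference between the latest and the earliest contact time):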

library(dplyr)
library(lubridate)

# Minimal example data: two contacts that belong to the same user and the
# same purchase, two hours apart.
df <- data.frame(
  userID = c(1, 1),
  PurchaseID = c(1, 1),
  Contactime = c(
    ymd_hms("2015-08-07 19:16:59"),
    ymd_hms("2015-08-07 21:16:59")
  ),
  Purchase = c(0, 0),
  Age = c(35, 35)
)

# One row per (userID, PurchaseID, Purchase, Age) combination.
# journeylength is the time between the earliest and the latest contact of
# the group, expressed in seconds as a plain numeric.
timesummary <- df %>%
  group_by(userID, PurchaseID, Purchase, Age) %>%
  summarise(
    journeylength = as.numeric(
      difftime(max(Contactime), min(Contactime), units = "secs")
    )
  ) %>%
  # summarise() only peels off the last grouping level; drop the remaining
  # grouping so later verbs operate on the whole table.
  ungroup()

The result has one row per combination of userID and PurchaseID, with journeylength given in seconds.

+1

# Aggregate the contact-level data in `df` to one row per
# (PurchaseID, UserID). The column positions used below rely on `df` having
# its columns in the order UserID, PurchaseID, TimeofContact, Purchase, Age.

# Journey length: latest minus earliest contact time within each group.
dat1 <- aggregate(. ~ PurchaseID + UserID, data = df[, 1:3],
                  FUN = function(v) max(v) - min(v))
# Purchase is summed within each group.
dat2 <- aggregate(. ~ PurchaseID + UserID, data = df[, c(1:2, 4)], FUN = sum)
# Age is averaged within each group.
dat3 <- aggregate(. ~ PurchaseID + UserID, data = df[, c(1:2, 5)], FUN = mean)

dat <- merge(
  merge(dat1, dat2, by = c("PurchaseID", "UserID")),
  dat3,
  by = c("PurchaseID", "UserID")
)

# Drop groups whose journey length is exactly zero. A logical mask is used
# instead of dat[-which(...), ]: when nothing matches, which() returns
# integer(0) and negative indexing with it would remove every row.
dat <- dat[is.na(dat$TimeofContact) | dat$TimeofContact != 0, ]

# some polishing
names(dat)[names(dat) == "TimeofContact"] <- "CustomerJourneyLength"

# converting time differences in a more suitable format
# Round to whole seconds before splitting, so a fractional remainder such as
# 59.7 carries over into the minutes instead of being printed as "60 seconds".
dat$CustomerJourneyLength <- round(dat$CustomerJourneyLength)
hours <- dat$CustomerJourneyLength %/% 3600
minutes <- (dat$CustomerJourneyLength %% 3600) %/% 60
seconds <- dat$CustomerJourneyLength %% 60
dat$CustomerJourneyLength <- paste0(
  hours, " hours ", minutes, " minutes ", seconds, " seconds"
)

# which yields
> dat
  PurchaseID UserID          CustomerJourneyLength Purchase Age
1          1      1 15 hours 28 minutes 49 seconds        1  27
2          1      2 15 hours 21 minutes 44 seconds        3  31
3          2      1  4 hours 11 minutes 17 seconds        2  27
5          3      1  9 hours 39 minutes 45 seconds        1  27
6          3      2 14 hours 36 minutes 31 seconds        1  31

using the following sample data:

# Random sample data: 20 contacts spread over 2 users and 3 purchase IDs.
# TimeofContact is drawn uniformly from the 20 hours following the current
# time; runif() returns it as a plain number, not as a date-time object.
df <- data.frame(
  UserID = sample(1:2, 20, replace = TRUE),
  PurchaseID = sample(1:3, 20, replace = TRUE),
  TimeofContact = runif(20, Sys.time(), Sys.time() + 20 * 3600),
  Purchase = sample(0:1, 20, replace = TRUE),
  Age = rep(NA_integer_, 20)
)
# Every user gets a single age, shared by all of that user's rows.
df$Age[df$UserID == 1] <- sample(20:40, 1)
df$Age[df$UserID == 2] <- sample(20:40, 1)
0

Using data.table, which will be fast on a dataset of this size.

library(data.table)

Re-creating your data:

# Simulated contact data with 1e5 rows: random user IDs, purchase IDs 1-5,
# contact times within 2e5 seconds after 2017-09-20, a 0/1 purchase flag and
# an age between 15 and 65.
dat <- data.table(
  UserID = round(runif(1e5, 1, 1e5 / 5)),
  PurchaseID = round(runif(1e5, 1, 5)),
  timeOfContact = as.POSIXct(runif(1e5, 0, 2e5), origin = "2017-09-20"),
  Purchase = round(runif(1e5, 0, 1)),
  age = round(runif(1e5, 15, 65))
)

# Make age constant within each user (the maximum drawn for that user).
dat[, age := max(age), by = "UserID"]
# Make the purchase flag constant within each user/purchase pair.
dat[, Purchase := max(Purchase), by = c("UserID", "PurchaseID")]

Single line of code:

# One row per (UserID, PurchaseID, Purchase, age) group; the journey length
# is the span between the earliest and the latest contact, in seconds.
dat[
  ,
  .(
    customerJourneyLength = as.numeric(
      difftime(
        max(timeOfContact),
        min(timeOfContact),
        tz = "GMT",
        units = "secs"
      )
    )
  ),
  by = c("UserID", "PurchaseID", "Purchase", "age")
]

As an aside, avoid column names that contain spaces.

0
source

Source: https://habr.com/ru/post/1695578/


All Articles