R Date Time ranges per hour Sum of duration

I have an R data frame containing start and end times for events that look like this:

             timestamp        endtimestamp 
1  2018-03-27 10:00:27 2018-03-27 10:07:27 
2  2018-03-27 10:27:28 2018-03-27 10:37:58 
3  2018-03-27 10:52:59 2018-03-27 11:01:29 
4  2018-03-27 11:17:59 2018-03-27 11:27:00 
5  2018-03-27 12:03:29 2018-03-27 12:15:59 
6  2018-03-27 12:51:00 2018-03-27 13:01:30 
7  2018-03-27 13:18:31 2018-03-27 13:26:01 
8  2018-03-27 13:42:56 2018-03-27 13:50:56 
9  2018-03-27 14:08:26 2018-03-27 14:21:27 
10 2018-03-27 14:36:02 2018-03-27 14:43:58 

I want to convert the data to have hourly ranges with the sum of the durations of events that occur only during this hour (for example, an event that starts in one hour and ends in the next will only take into account its parts every hour), resulting in:

        starttimestamp        endtimestamp    duration
1  2018-03-27 10:00:00 2018-03-27 11:00:00   1471 secs
2  2018-03-27 11:00:00 2018-03-27 12:00:00    630 secs
3  2018-03-27 12:00:00 2018-03-27 13:00:00   1290 secs
4  2018-03-27 13:00:00 2018-03-27 14:00:00   1020 secs
5  2018-03-27 14:00:00 2018-03-27 15:00:00   1257 secs

I think I can do this with a loop, although it feels awkward, but any solution that I am trying to use with dplyr / magrittr does not seem to work.

Example: the value of 1471 seconds is calculated as follows:

2018-03-27 10:00:27 to 2018-03-27 10:07:27 = 420 seconds

2018-03-27 10:27:28 until 2018-03-27 10:37:58 = 630 seconds

2018-03-27 10:52:59 2018-03-27 11:00:00 = 421 .

420 + 630 + 421 = 1471

, 11:01:29. 01:29 .

.

:

test <- data.frame(IDX = c(1:10),
           timestamp = c(as.POSIXct("2018-03-27T10:00:27Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                         as.POSIXct("2018-03-27T10:27:28Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                         as.POSIXct("2018-03-27T10:52:59Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                         as.POSIXct("2018-03-27T11:17:59Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                         as.POSIXct("2018-03-27T12:03:29Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                         as.POSIXct("2018-03-27T12:51:00Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                         as.POSIXct("2018-03-27T13:18:31Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                         as.POSIXct("2018-03-27T13:42:56Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                         as.POSIXct("2018-03-27T14:08:26Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                         as.POSIXct("2018-03-27T14:36:02Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC")
           ),
           endtimestamp = c(as.POSIXct("2018-03-27T10:07:27Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                            as.POSIXct("2018-03-27T10:37:58Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                            as.POSIXct("2018-03-27T11:01:29Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                            as.POSIXct("2018-03-27T11:27:00Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                            as.POSIXct("2018-03-27T12:15:59Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                            as.POSIXct("2018-03-27T13:01:30Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                            as.POSIXct("2018-03-27T13:26:01Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                            as.POSIXct("2018-03-27T13:50:56Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                            as.POSIXct("2018-03-27T14:21:27Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"),
                            as.POSIXct("2018-03-27T14:43:58Z", format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC")
           ))
+4
2

, ...

library(data.table)
setDT(test)

durDT = test[, {
  hr  = seq(trunc(timestamp, "hour"), trunc(endtimestamp, "hour"), by="hour")
  dur = structure(rep(3600, length(hr)), units="secs", class="difftime")

  n = length(hr)
  if (n==1){
    dur = difftime(endtimestamp, timestamp, unit = "secs")
  } else {
    dur[1] <- difftime(hr[1] + 3600, timestamp, unit = "secs")
    dur[n] <- difftime(endtimestamp, hr[n], unit = "secs")
  }
  .(hr = hr, dur = dur)
}, by=IDX]

durDT[, .(total_dur = sum(dur)), by=hr]

> durDT
    IDX                  hr      dur
 1:   1 2018-03-27 06:00:00 420 secs
 2:   2 2018-03-27 06:00:00 630 secs
 3:   3 2018-03-27 06:00:00 421 secs
 4:   3 2018-03-27 07:00:00  89 secs
 5:   4 2018-03-27 07:00:00 541 secs
 6:   5 2018-03-27 08:00:00 750 secs
 7:   6 2018-03-27 08:00:00 540 secs
 8:   6 2018-03-27 09:00:00  90 secs
 9:   7 2018-03-27 09:00:00 450 secs
10:   8 2018-03-27 09:00:00 480 secs
11:   9 2018-03-27 10:00:00 781 secs
12:  10 2018-03-27 10:00:00 476 secs

> durDT[, .(total_dur = sum(dur)), by=hr]
                    hr total_dur
1: 2018-03-27 06:00:00 1471 secs
2: 2018-03-27 07:00:00  630 secs
3: 2018-03-27 08:00:00 1290 secs
4: 2018-03-27 09:00:00 1020 secs
5: 2018-03-27 10:00:00 1257 secs

, ( OP ).

OP - - .

+1

. , base_time, . lag excess , duration1. excess duration1 duration. duration timestamp_hour endtimestamp_hour .

df %>%
  mutate(timestamp_hour = floor_date(timestamp, unit = 'hours'),
         endtimestamp_hour = timestamp_hour + hours(1)) %>%
  mutate(base_time = round_date(timestamp, unit = 'hours')) %>%
  mutate(excess = ifelse((endtimestamp > base_time) & (timestamp < base_time), difftime(endtimestamp, base_time, units = 'secs'), 0)) %>%
  mutate(duration1 = ifelse((endtimestamp > base_time) & (timestamp < base_time), difftime(base_time, timestamp, unit = 'secs'), difftime(endtimestamp, timestamp, units = 'secs'))) %>%
  mutate_at(vars(excess), lag, default = 0) %>%
  mutate(duration = excess + duration1) %>%
  group_by(timestamp_hour, endtimestamp_hour) %>%
  summarise(duration = sum(duration))

library(tidyverse)
library(lubridate)

tt <- 'timestamp,        endtimestamp 
2018-03-27 10:00:27, 2018-03-27 10:07:27 
2018-03-27 10:27:28, 2018-03-27 10:37:58 
2018-03-27 10:52:59, 2018-03-27 11:01:29 
2018-03-27 11:17:59, 2018-03-27 11:27:00 
2018-03-27 12:03:29, 2018-03-27 12:15:59 
2018-03-27 12:51:00, 2018-03-27 13:01:30 
2018-03-27 13:18:31, 2018-03-27 13:26:01 
2018-03-27 13:42:56, 2018-03-27 13:50:56 
2018-03-27 14:08:26, 2018-03-27 14:21:27 
2018-03-27 14:36:02, 2018-03-27 14:43:58' 


df <- read.table(text = tt, header = T, sep = ',')

df <- df %>% mutate(
  timestamp = as.POSIXct(timestamp),
  endtimestamp = as.POSIXct(endtimestamp)
)

# A tibble: 5 x 3
# Groups:   timestamp_hour [?]
  timestamp_hour      endtimestamp_hour   duration
  <dttm>              <dttm>                 <dbl>
1 2018-03-27 10:00:00.000 2018-03-27 11:00:00.000    1471.
2 2018-03-27 11:00:00.000 2018-03-27 12:00:00.000     630.
3 2018-03-27 12:00:00.000 2018-03-27 13:00:00.000    1290.
4 2018-03-27 13:00:00.000 2018-03-27 14:00:00.000    1020.
5 2018-03-27 14:00:00.000 2018-03-27 15:00:00.000    1257.
+1

Source: https://habr.com/ru/post/1695819/


All Articles