How to average over time by hours?

I am new to R and am running into my first difficulties. I have a dataset of roughly 10,000 observations covering 365 days, in which I record events. These events are recorded only during the first 14 days of each month. I would like to fill in the remaining days of each month with the average of the recorded events of the same month, computed separately for each hour.

The structure is as follows:

                    day           hours      occurrence
                    2000-01-01     1          5
                    2000-01-01     2          6
                    2000-01-01     3          7
                    ...            ...        ...
                    2000-01-01     23         3
                    2000-01-01     24         2
                    ...            ...        ...
                    2000-01-02     1          4
                    2000-01-02     2          2
                    2000-01-02     3          5
                    ...            ...        ...
                    2000-01-02     23         2
                    2000-01-02     24         1
                    ...
                    ...
                    2000-01-15     1          average of the previous 1 hours((5+4+n)/2*k))
                    2000-01-15     2          average of the previous 2 hours ((6+2+n)/2*k))
                    2000-01-15     3          average of the previous 3 hours((7+5+n)/2*k))
                    ...            ...         ...
                    2000-01-15     23         average of the previous 23 hours
                    2000-01-15     24         average of the previous 24 hours
                    ...            ...         ...
                    ...            ...         ...
                    2000-01-30
                    2000-01-30
                    2000-01-30
                    2000-01-30
                    ...            ...         ...
                    ...            ...         ...
                    2000-02-01
                    2000-02-01
                    2000-02-01
                    2000-02-01
                    ...            ...         ...
                    ...
                    ...            ...         ...
                    2000-12-24

I tried

               aggregate( occurences ~ hours, mean) 

but the results were meaningless and I tried

               tapply( X = occurences, INDEX = list(hours), FUN = Mean )

Unfortunately, neither worked as I had hoped. I think the corresponding month needs to be included in the function call. However, my skills are apparently limited.

+3
source share
2 answers

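Here is a `dplyr` approach. To keep the example small, the sample data is restricted to days 1-4 and hours 0-1 of each month; occurrences are recorded on days 1 and 2 and missing on days 3 and 4.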

library(dplyr)

# Build a small demo data set: hourly timestamps from 2000-01-01 to 2000-02-28
set.seed(123) # fixed seed so the sampled values can be reproduced

d1 <- data.frame(
  time = seq(as.POSIXct("2000-01-01"), as.POSIXct("2000-02-28"), by = "hour")
) %>%
  mutate(
    hour = as.integer(format(time, "%H")),
    day = as.integer(format(time, "%d")), # helper column, dropped again below
    month = as.integer(format(time, "%m"))
  ) %>%
  mutate(occurence = sample(1:10, length(time), replace = TRUE)) %>%
  # keep the sampled values on days 1-2 only; every other day becomes NA
  mutate(occurence = ifelse(day %in% 1:2, occurence, NA)) %>%
  # shrink the example to hours 0-1 on days 1-4 of each month
  filter(hour %in% 0:1, day %in% 1:4) %>%
  select(-day)

# Mean occurrence per (month, hour), rounded to one decimal place.
# NA values are ignored; a month/hour pair without any recorded value
# yields NaN, because the mean of an empty vector is NaN.
d2 <- d1 %>%
  group_by(month, hour) %>%
  summarise(mean_occ = round(mean(occurence, na.rm = TRUE), 1)) %>%
  ungroup() # summarise() drops only the last grouping level; clear `month` too
d2
#   month hour mean_occ
# 1     1    0      5.0
# 2     1    1      8.0
# 3     2    0      5.5
# 4     2    1      6.5


# Fill the gaps: wherever occurence is NA, use the month/hour mean instead
d3 <- left_join(d1, d2, by = c("hour", "month"))
d3$occurence2 <- ifelse(is.na(d3$occurence), d3$mean_occ, d3$occurence)
d3 <- select(d3, -month, -mean_occ) # helper columns are no longer needed

d3
#    hour                time occurence occurence2
# 1     0 2000-01-01 00:00:00         3        3.0
# 2     1 2000-01-01 01:00:00         8        8.0
# 3     0 2000-01-02 00:00:00         7        7.0
# 4     1 2000-01-02 01:00:00         8        8.0
# 5     0 2000-01-03 00:00:00        NA        5.0
# 6     1 2000-01-03 01:00:00        NA        8.0
# 7     0 2000-01-04 00:00:00        NA        5.0
# 8     1 2000-01-04 01:00:00        NA        8.0
# 9     0 2000-02-01 00:00:00         4        4.0
# 10    1 2000-02-01 01:00:00         6        6.0
# 11    0 2000-02-02 00:00:00         7        7.0
# 12    1 2000-02-02 01:00:00         7        7.0
# 13    0 2000-02-03 00:00:00        NA        5.5
# 14    1 2000-02-03 01:00:00        NA        6.5
# 15    0 2000-02-04 00:00:00        NA        5.5
# 16    1 2000-02-04 01:00:00        NA        6.5
+4

An alternative to @Henrik's answer, using `data.table`:

library(lubridate)
library(data.table)
##
## Convert Df to a data.table by reference and add calendar helper columns
setDT(Df)
Df[, month := month(days)]
Df[, year := year(days)]
## Split the data: days 1-14 of each month hold recorded values,
## the later days are the rows to be filled in
naDf <- Df[mday(days) > 14, ]
subDf <- Df[mday(days) <= 14, ]
## Mean of the recorded values per month, year AND hour of day, so that
## each filled-in hour gets the average of the same hour in that month
## (NA values among the recorded days are ignored)
avgDf <- subDf[
  ,
  list(occurrence = mean(occurrence, na.rm = TRUE)),
  by = c("month", "year", "hours")]
## Attach the matching hourly mean to every row that has to be filled;
## all.x = TRUE keeps rows without a matching mean (as NA) instead of
## silently dropping them
naDf <- merge(
  x = naDf[, list(days, hours, month, year)],
  y = avgDf,
  by = c("month", "year", "hours"),
  all.x = TRUE)
## Recombine recorded and filled rows and restore chronological order
newDf <- rbind(
  subDf, naDf,
  use.names = TRUE)[order(days, hours), ]

Sample data used for the code above:

# Reproducible sample data: one row per hour (1-24) for 365 consecutive days
# starting at d0, each with a random occurrence count between 1 and 15.
set.seed(123)
d0 <- as.Date("2000-01-01")
Df <- data.frame(
  days = rep(d0 + 0:364, each = 24),
  hours = rep(1:24, times = 365),
  occurrence = sample(1:15, size = 24 * 365, replace = TRUE)
)

Then blank out the unrecorded days in the original data and plot both series for comparison:

# Blank out everything after day 14 of each month in the original data.
# `:=` needs Df to be a data.table already (i.e. setDT(Df) has been run).
Df[mday(days) > 14, occurrence := NA]

# Build a plotting timestamp from day + hour for both tables.
# NOTE(review): the extra 4-hour offset is not explained here -- presumably
# a local-timezone display correction; confirm before reusing.
newDf[, datetime := as.POSIXct(days, tz = "GMT") + 3600 * (4 + hours)]
Df[, datetime := as.POSIXct(days, tz = "GMT") + 3600 * (4 + hours)]

library(ggplot2)

# Filled-in series (red), rows 200-800 only
ggplot(newDf[200:800, ], aes(x = datetime, y = occurrence)) +
  geom_line(color = "red")

# Original series with its gaps, same rows
ggplot(Df[200:800, ], aes(x = datetime, y = occurrence)) +
  geom_line()

enter image description here

enter image description here

Only a subset of the rows (200:800) is shown in the plots.

+1

Source: https://habr.com/ru/post/1619036/


All Articles