R - customizable data table calendar function

I need to calculate the ts value if the value I'm working with is an outlier using the previous 30 values. The data I'm working with is 600 columns per 200,000 rows. Therefore, I want to take advantage of the speed of a data table.

My function:

es_outlier<-function(vect){
  qq =quantile(vect, prob=c(0.25,0.75), na.rm=T)
  q3=qq[2]
  IC=q3-qq[1]
  limSup=q3+IC*1.5
  vector_final=abs(vect)>limSup
  return(vector_final[length(vect)] )
}

Example table:

library(data.table)

dt<-data.table(x1=runif(50000), x2=runif(50000))
dt$x1[555]<-2000
dt$x2[556]<-2000

I can solve this with the zoo package:

zoo::rollapply(dt,30,es_outlier, fill=NA,align='right')

But it takes a lot of time and less than my real data.

I would like something like:

dt[, (nom):=lapply(.SD,function, n=30)]

I tried using Rcpp, but it does not have a quantization function.

Is there a faster way to apply my function?

PS: for a tiny table, the function returns:

x<-data.frame(x1=1:8, x2=c(1:7,2000))
x_dt<-data.table(x)
zoo::rollapply(x_dt,5,es_outlier, fill=NA,align='right')

 x1    x2
 NA    NA
 NA    NA
 NA    NA
 NA    NA
 FALSE FALSE
 FALSE FALSE
 FALSE FALSE
 FALSE  TRUE
+4
source share
1 answer

, 1 . , , ..

set.seed(25L)
N <- 50000
dt <- data.frame(x1=runif(N), x2=runif(N))
dt$x1[555] <- 2000
dt$x2[556] <- 2000
wl <- 30

####################################################################################################
#' Calculate IQR for a sorted vector with 30 observations
#' 
#' @details assume that sorted is sorted. using type 7 in ?quantile.
#' 
#' @param sorted sorted numeric vector
#' 
#' @return the interquartile range
#' 
iqr30obs <- function(sorted) {
    c(sorted[8] + 0.25 * (sorted[9] - sorted[8]), sorted[22] + 0.75 * (sorted[23] - sorted[22]))
} #iqr30obs


es_outlier2 <- function(vect) {
    start <- 1
    end <- start + wl - 1
    sorted <- sort(structure(vect[start:end], names=start:end))
    i <- 0
    res <- rep(NA, nrow(dt))
    while (end < nrow(dt)) {  
        locFirstObs <- which(names(sorted)==start)

        if (!(i > 9 && i < 22 && locFirstObs > 9 && locFirstObs < 22)) {
            #changes in the 8th. 9th, 22th and 23th positions after removing first obs 
            #and adding new observation            
            qt <- iqr30obs(sorted)
            iqr1.5 <- 1.5 * (qt[2] - qt[1])
        }
        res[end] <- sorted[as.character(end)] < qt[1] - iqr1.5 |
               sorted[as.character(end)] > qt[2] + iqr1.5

        #moving to next window ----
        #remove the first observation in the window
        sorted <- sorted[-locFirstObs]

        #create the new observation to add to window
        toAdd <- structure(vect[end+1], names=end+1)

        #insert this new observation into the sorted vector while maintaining order
        for (i in seq_along(sorted)) {
            if (toAdd < sorted[i]) {
                sorted <- c(sorted[seq_len(i-1)], toAdd, sorted[i:(wl-1)])
                break
            }
        }
        if (i == length(sorted)) {
            sorted <- c(sorted, toAdd)
        }

        #increment indices
        start <- start + 1
        end <- end + 1
    } #while

    res
} #es_outlier2

es_outlier<-function(vect){
    qq =quantile(vect, prob=c(0.25,0.75), na.rm=T)
    q3=qq[2]
    IC=q3-qq[1]
    limSup=q3+IC*1.5
    vector_final=abs(vect)>limSup
    return(vector_final[length(vect)] )
}

:

system.time(es_outlier2(dt$x1))
# user  system elapsed 
# 4.62    0.00    4.67 
system.time(es_outlier2(dt$x2))
# user  system elapsed 
# 4.56    0.00    4.83 

system.time(zoo::rollapply(dt, 30, es_outlier, fill=NA, align='right'))
#   user  system elapsed 
#  17.59    0.01   17.69 
+1

Source: https://habr.com/ru/post/1692669/


All Articles