Using a function c++to execute some loops will be fast on a large set.
This function returns a matrix with two columns, the first column gives the "beginning" of a sequence of numbers, the second column gives the "end" of a sequence.
library(Rcpp)
cppFunction('NumericMatrix naSeq(NumericVector myVec) {
int n = myVec.size();
NumericVector starts(n); // pre-allocate
NumericVector ends(n); // pre-allocate
starts.fill(NumericVector::get_na());
ends.fill(NumericVector::get_na());
int startCounter = 0;
int endCounter = 0;
bool firstNumber = !NumericVector::is_na(myVec[0]); // initialise based on first value
// groups are considered sequential numbers without an NA between them
for (int i = 0; i < (n-1); i++) {
if ( !NumericVector::is_na(myVec[i]) && NumericVector::is_na(myVec[i+1]) ) {
if (i == 0 && firstNumber) {
startCounter++;
}
ends[endCounter] = i + 1;
endCounter++;
}
if (NumericVector::is_na(myVec[i]) && !NumericVector::is_na(myVec[i+1]) ) {
if ( i == 0 && !firstNumber){
endCounter++;
}
starts[startCounter] = i + 2;
startCounter++;
}
}
int matSize = startCounter > endCounter ? startCounter : endCounter;
IntegerVector idx = seq(0, matSize);
NumericMatrix m(matSize, 2);
starts = starts[idx];
ends = ends[idx];
m(_, 0) = starts;
m(_, 1) = ends;
return m;
}')
naSeq(xx)
which gives
# [,1] [,2]
# [1,] NA 1
# [2,] 11 13
# [3,] 15 44
# [4,] 61 119
# [5,] 122 NA
Benchmarking
, . , ( ) .
library(microbenchmark)
set.seed(123)
xx <- seq(1:1e6)
naXX <- sample(xx, size = 1e5)
xx[naXX] <- NA
mb <- microbenchmark(
late = { latemail(xx) },
sym = { naSeq(xx) },
www = { www(xx) },
mkr = { mkr(xx) },
times = 5
)
print(mb, order = "median")
latemail <- function(xx) {
nas <- is.na(xx)
by(xx[!nas], cumsum(nas)[!nas], function(x) x[unique(c(1,length(x)))] )
}
www <- function(xx) {
RLE <- rle(is.na(xx))
L <- RLE$lengths
Index <- cumsum(L[-length(L)]) + (1:(length(L) - 1) + 1) %% 2
matrix(c(Index[1], NA, Index[2:length(Index)], NA), ncol = 2, byrow = TRUE)
}
library(dplyr)
mkr <- function(xx) {
df <- data.frame(xx = xx)
df %>% mutate(value = ifelse(is.na(xx), ifelse(!is.na(lag(xx)), lag(xx),
ifelse(!is.na(lead(xx)),lead(xx), NA)), NA)) %>%
select(value) %>%
filter(!is.na(value))
}