First and last value before NA

I am trying to get the first and last value for different segments before the NA value in the vector. Here is an example:

xx = seq(1, 122, by = 1)
xx[c(2:10, 14, 45:60, 120:121)] = NA

In turn, my results would be 1; 11 and 13; 15 and 44; 61 and 119; 122.

+4
source share
4 answers

Using a function c++to execute some loops will be fast on a large set.

This function returns a matrix with two columns, the first column gives the "beginning" of a sequence of numbers, the second column gives the "end" of a sequence.

library(Rcpp)

cppFunction('NumericMatrix naSeq(NumericVector myVec) {

    int n = myVec.size();
    NumericVector starts(n); // pre-allocate
    NumericVector ends(n);   // pre-allocate
    starts.fill(NumericVector::get_na());
    ends.fill(NumericVector::get_na());
    int startCounter = 0;
    int endCounter = 0;
    bool firstNumber = !NumericVector::is_na(myVec[0]); // initialise based on first value

    // groups are considered sequential numbers without an NA between them

    for (int i = 0; i < (n-1); i++) {
        if ( !NumericVector::is_na(myVec[i]) && NumericVector::is_na(myVec[i+1]) ) {
            if (i == 0 && firstNumber) {
                startCounter++;
            }
            ends[endCounter] = i + 1;
            endCounter++;
        }

        if (NumericVector::is_na(myVec[i]) && !NumericVector::is_na(myVec[i+1]) ) {
            if ( i == 0 && !firstNumber){
                endCounter++;
            }
            starts[startCounter] = i + 2;
            startCounter++;
        }
    }


    int matSize = startCounter > endCounter ? startCounter : endCounter; 
    IntegerVector idx = seq(0, matSize);
    NumericMatrix m(matSize, 2);

    starts = starts[idx];
    ends = ends[idx];

    m(_, 0) = starts;
    m(_, 1) = ends;

    return m;

}')

naSeq(xx)

which gives

#      [,1] [,2]
# [1,]   NA    1
# [2,]   11   13
# [3,]   15   44
# [4,]   61  119
# [5,]  122   NA

Benchmarking

, . , ( ) .

library(microbenchmark)

set.seed(123)
xx <- seq(1:1e6)
naXX <- sample(xx, size = 1e5)
xx[naXX] <- NA 

mb <- microbenchmark(
    late = { latemail(xx) },
    sym = { naSeq(xx) },
    www = { www(xx) },
    mkr = { mkr(xx) },
    times = 5
)

print(mb, order = "median")

# Unit: milliseconds
# expr        min         lq       mean     median         uq        max neval
#  sym   22.66139   23.26898   27.18414   23.48402   27.85917   38.64716     5
#  www   45.11008   46.69587   55.73575   56.97421   61.63140   68.26719     5
#  mkr  369.69303  384.15262  427.35080  392.26770  469.59242  521.04821     5
# late 2417.21556 2420.25472 2560.41563 2627.19973 2665.19272 2672.21543     5

latemail <- function(xx) {
    nas <- is.na(xx)
    by(xx[!nas], cumsum(nas)[!nas], function(x) x[unique(c(1,length(x)))] )
}

www <- function(xx) {
    RLE <- rle(is.na(xx))
    L <- RLE$lengths
    Index <- cumsum(L[-length(L)]) + (1:(length(L) - 1) + 1) %% 2

    matrix(c(Index[1], NA, Index[2:length(Index)], NA), ncol = 2, byrow = TRUE)
}

library(dplyr)
mkr <- function(xx) {
    df <- data.frame(xx = xx)
    df %>% mutate(value = ifelse(is.na(xx), ifelse(!is.na(lag(xx)), lag(xx),
                                                                                                 ifelse(!is.na(lead(xx)),lead(xx), NA)), NA)) %>%
        select(value) %>%
        filter(!is.na(value))
}
+4

, NA - cumsum(nas)[!nas], NA:

nas <- is.na(xx)
by(xx[!nas], cumsum(nas)[!nas], function(x) x[unique(c(1,length(x)))] )

#cumsum(nas)[!nas]: 0
#[1] 1
#--------------
#cumsum(nas)[!nas]: 9
#[1] 11 13
#--------------
#cumsum(nas)[!nas]: 10
#[1] 15 44
#--------------
#cumsum(nas)[!nas]: 26
#[1]  61 119
#--------------
#cumsum(nas)[!nas]: 28
#[1] 122

, by , split ting lapply ing:

lapply(split(xx[!nas], cumsum(nas)[!nas]), function(x) x[unique(c(1,length(x)))] )
+4

, , tidyverse. data.frame, OP. (mutate) .

lead lag -NA previous next. , NA, NA, .

library(tidyverse)
xx = seq(1, 122, by = 1)
xx[c(2:10, 14, 45:60, 120:121)] = NA

df <- data.frame(xx = xx)
df %>% mutate(value = ifelse(is.na(xx), ifelse(!is.na(lag(xx)), lag(xx),
                            ifelse(!is.na(lead(xx)),lead(xx), NA)), NA)) %>%
  select(value) %>%
  filter(!is.na(value))

#Result
#  value
#1     1
#2    11
#3    13
#4    44
#5    61
#6   119
#7   122
+2

rle cumsum.

RLE <- rle(is.na(xx))
L <- RLE$lengths
Index <- c(1, cumsum(L) + (1:length(L) + 1) %% 2)

matrix(Index, ncol = 2, byrow = TRUE)
#      [,1] [,2]
# [1,]    1    1
# [2,]   11   13
# [3,]   15   44
# [4,]   61  119
# [5,]  122  122

rle(is.na(xx)) is.na(xx), NA -NA.

RLE <- rle(is.na(xx))
RLE
# Run Length Encoding
#   lengths: int [1:9] 1 9 3 1 30 16 59 2 1
#   values : logi [1:9] FALSE TRUE FALSE TRUE FALSE TRUE ...

L <- RLE$lengths .

L <- RLE$lengths
L
# [1]  1  9  3  1 30 16 59  2  1

cumsum(L) .

cumsum(L)
# [1]   1  10  13  14  44  60 119 121 122 

. (1:length(L) + 1) %% 2, .

(1:(length(L) - 1) + 1) %% 2
# [1] 0 1 0 1 0 1 0 1 0

, .

Index <- c(1, cumsum(L) + (1:length(L) + 1) %% 2)
Index
#  [1]   1   1  11  13  15  44  61 119 122 122

Finally, I used matrix(Index, ncol = 2, byrow = TRUE)to see the results more clearly. Each row represents one group. The first column indicates the starting index of each group, and the second column indicates the end of each group.

matrix(Index, ncol = 2, byrow = TRUE)
#      [,1] [,2]
# [1,]    1    1
# [2,]   11   13
# [3,]   15   44
# [4,]   61  119
# [5,]  122  122
+2
source

Source: https://habr.com/ru/post/1693807/


All Articles