Count the sequence to include NA values

Here is an example data frame that resembles a larger data set:

# Sample data: Day is the recorded study day, with NA where nothing was recorded.
Day <- c(1, 2, NA, 3, 4, NA, NA, NA, NA, NA, 1, 2, 3, NA, NA, NA, NA, 1, 2, NA, NA, 3, 4, 5)
# y is random filler (Poisson, mean 2), so its values differ between runs.
y   <- rpois(length(Day), 2)
# Row identifier 501, 502, ...; seq_along() stays correct for an empty vector,
# unlike 1:length(Day).
z   <- seq_along(Day) + 500
df  <- data.frame(z, Day, y)

If the Day column contains a sequence of 4 or more missing (NA) values, this sequence represents the gap between the cohorts in the study. If the sequence has fewer than 4 NAs, then the missing values are still considered part of the cohort (for example, line 3 is part of cohort 1, but line 8 is not). There are 3 cohorts in the sample data frame (Cohort 1: lines 1-5, Cohort 2: lines 11-13 and Cohort 3: lines 18-24). I would like to add one column indicating the cohort number and another column indicating the day within the cohort study. Here is the code I used:

# library() stops with an error if dplyr is missing; require() would only
# return FALSE and let the script fail later.
library(dplyr)

# Run-length encode the missingness of Day, then flag runs of 4+ NAs as gaps.
CheckNA        <- rle(is.na(df$Day))
CheckNA$values <- CheckNA$lengths >= 4 & CheckNA$values
# Expand the per-run gap flag back to one value per row.
ListNA         <- rep(CheckNA$values, CheckNA$lengths)
# Cohort labels are written out by hand for the sample data
# (cohort, gap, cohort, gap, cohort); other data needs a different vector.
df$Co          <- rep(c(1, NA, 2, NA, 3), rle(ListNA)$lengths) %>% as.factor()

# Number the rows within each cohort. seq_along() counts rows even for a
# one-row group, where seq() would treat the single value as an upper bound.
df <- df %>%
  group_by(Co) %>%
  mutate(CoDay = seq_along(Co)) %>%
  as.data.frame()

# Rows inside a gap belong to no cohort, so they get no cohort day.
df$CoDay <- ifelse(is.na(df$Co), NA, df$CoDay)

? , , 10 . , : c (1, NA, 2, NA, 3).

!

+6
2

# Run-length encode which rows of Day are missing.
CheckNA <- rle(is.na(df[["Day"]]))
# A run is a cohort gap when it consists of NAs and spans at least 4 rows.
CheckNA$values <- CheckNA$values & CheckNA$lengths >= 4
# Gap runs become NA; every other run gets (number of gaps so far) + 1.
CheckNA$values <- ifelse(CheckNA$values, NA, cumsum(CheckNA$values) + 1)
# Expand the per-run labels back to one label per row.
df$Co <- inverse.rle(CheckNA)

, cumsum() . , . inverse.rle , rep(), .

, dplyr

#' Label cohorts that are separated by long runs of missing values
#'
#' A run of at least `min_gap` consecutive `NA`s is treated as a gap between
#' cohorts. Shorter runs of `NA`s stay inside the surrounding cohort.
#'
#' @param x A vector; only its missingness pattern (`is.na(x)`) is used.
#' @param min_gap Minimum number of consecutive `NA`s that forms a gap.
#'   Defaults to 4.
#' @return A vector the same length as `x` holding the cohort number
#'   (1, 2, ...) for each element, and `NA` for elements inside a gap.
id_NA_break <- function(x, min_gap = 4) {
  runs   <- rle(is.na(x))
  is_gap <- runs$values & runs$lengths >= min_gap
  # Each cohort's number is the count of gaps before it plus one. When x opens
  # with a gap, that gap must not be counted, or numbering would start at 2.
  offset <- if (length(is_gap) > 0 && is_gap[1]) 0 else 1
  runs$values <- ifelse(is_gap, NA, cumsum(is_gap) + offset)
  inverse.rle(runs)
}

# Start again from the raw columns so the result does not depend on earlier steps.
df  <- data.frame(z, Day, y)
df %>%
  # Cohort number per row; NA for rows inside a gap.
  mutate(Co = id_NA_break(Day)) %>%
  group_by(Co) %>%
  # Day counter within each cohort; gap rows (Co is NA) get no day.
  # seq_along() counts rows, whereas seq() on a single value would expand it.
  mutate(CoDay = ifelse(is.na(Co), NA_integer_, seq_along(Co))) %>%
  # Drop the grouping so later verbs are not silently applied per cohort.
  ungroup()
+5

data.table. , . . data.table , .

library(data.table)
# Rebuild the sample data (y is random, so its values differ between runs).
Day <- c(1, 2, NA, 3, 4, NA, NA, NA, NA, NA, 1, 2, 3, NA, NA, NA, NA, 1, 2, NA, NA, 3, 4, 5)
y   <- rpois(length(Day), 2)
z   <- seq_along(Day) + 500
df  <- data.frame(z, Day, y)

# Turn df into a data.table so the := assignments below can be used.
setDT(df)

# 1 where Day is missing, 0 otherwise.
df[, isNA := as.numeric(is.na(Day))]
# Length of the NA run each row belongs to (0 where Day is present). The
# run-length encoding is computed once, and its components are read by their
# full names (lengths, values) instead of relying on partial matching.
df[, numNA := {
  runs <- rle(isNA)
  rep(runs$lengths * runs$values, runs$lengths)
}]
# A run of 4 or more NAs is a gap between cohorts.
df[, Gap := ifelse(numNA < 4, 0, 1)]
# Running count of gap rows: constant within a cohort, larger after each gap.
df[, Cohort := cumsum(Gap)]

# Gap rows belong to no cohort.
df[Gap == 1, Cohort := NA]
# Renumber the remaining rows 1, 2, 3, ... in order of appearance.
df[Gap == 0, Cohort := as.double(rleid(Cohort))]

> df
      z Day y isNA numNA Gap Cohort
 1: 501   1 1    0     0   0      1
 2: 502   2 2    0     0   0      1
 3: 503  NA 2    1     1   0      1
 4: 504   3 1    0     0   0      1
 5: 505   4 2    0     0   0      1
 6: 506  NA 2    1     5   1     NA
 7: 507  NA 1    1     5   1     NA
 8: 508  NA 0    1     5   1     NA
 9: 509  NA 4    1     5   1     NA
10: 510  NA 2    1     5   1     NA
11: 511   1 3    0     0   0      2
12: 512   2 3    0     0   0      2
13: 513   3 2    0     0   0      2
14: 514  NA 3    1     4   1     NA
15: 515  NA 1    1     4   1     NA
16: 516  NA 3    1     4   1     NA
17: 517  NA 2    1     4   1     NA
18: 518   1 4    0     0   0      3
19: 519   2 4    0     0   0      3
20: 520  NA 1    1     2   0      3
21: 521  NA 1    1     2   0      3
22: 522   3 3    0     0   0      3
23: 523   4 0    0     0   0      3
24: 524   5 3    0     0   0      3
      z Day y isNA numNA Gap Cohort

# Remove the isNA, numNA and Gap columns from df by assigning NULL to them.
df[ , c("isNA", "numNA", "Gap") := NULL]

MrFlick . .

> microbenchmark(data_table_way(df))
Unit: milliseconds
               expr      min       lq     mean   median       uq      max neval
 data_table_way(df) 2.515004 2.678493 2.879678 2.770054 2.923348 4.917869   100

> microbenchmark(dplyr_way())
Unit: milliseconds
        expr      min       lq     mean   median       uq      max neval
 dplyr_way() 1.564279 1.703792 1.814998 1.765713 1.824615 2.773641   100
+3

Source: https://habr.com/ru/post/1016369/


All Articles