How to return the last value to a vector that satisfies a certain condition

I have a vector (in a data frame) filled with increasing numbers. I would like to find all consecutive numbers and replace them with the first number in the series. Is it possible to do without a cycle?

My input:

V1
1
4
5
7
10
15
16
17
20

What I would like to conclude:

V1    Out
1     1
4     4
5     4
7     7
10    10
15    15
16    15
17    15
20    20

So far, I have been able to calculate the difference between the two lines using diff () and a loop through the vector to replace the correct values.

V1 <- c(1, 4, 5, 7, 10, 15, 16, 17, 20)
df <- data.frame(V1)
df$diff <- c(0, diff(df$V1) == 1)
df$Out <- NA
for (j in 1:(nrow(df))){
    if (df$diff[j] == 0){
        df$Out[j] <- df$V1[j]
    } else {
        df$Out[j] <- df$V1[max(which(df$diff[1:j] == 0))]
    }
}

This does the job, but it is very inefficient. Is there a way to get rid of the loop and quickly make this code?

Many thanks!

+4
source share
4 answers

Using the R base, you can do

with(d1, ave(V1, cumsum(c(1, diff(V1) != 1)), FUN = function(i) i[1]))
#[1]  1  4  4  7 10 15 15 15 20

dplyr

library(dplyr)

d1 %>% 
 group_by(grp = cumsum(c(1, diff(V1) != 1))) %>% 
 mutate(out = first(V1))

data.table

library(data.table)

setDT(d1)[, out := first(V1), by = cumsum(c(1, diff(V1) != 1))]
+8
source

, 3 , zoo:

V2 V1:

df$V2 <- df$V1

( diff 1) NA:

df$V2[c(FALSE, diff(df$V1)==1)] <- NA

, zoo::na.locf NA :

library(zoo)
df$V2 <- na.locf(df$V2)

:

df
#   V1 V2
# 1  1  1
# 2  4  4
# 3  5  4
# 4  7  7
# 5 10 10
# 6 15 15
# 7 16 15
# 8 17 15
# 9 20 20

, , magrittr:

library(magrittr)
df$V2 <- df$V1 %>% replace(c(FALSE, diff(df$V1)==1), NA) %>% na.locf
+5

shift() lag() diff()

, , diff(V1) . , data.table dplyr shift() lag(), , ( @Frank).

, Sotos 'data.table

library(data.table)
setDT(d1)[, out := first(V1), by = cumsum(c(1, diff(V1) != 1))]

setDT(d1)[, out := V1[1], by = cumsum(V1 - shift(V1, fill = V1[1]) != 1)]

dplyr

library(dplyr)
d1 %>% 
  group_by(grp = cumsum(V1 - lag(V1, default = V1[1]) != 1)) %>% 
  mutate(out = first(V1)) 

, R-

library(data.table)
with(d1, ave(V1, cumsum(V1 - shift(V1, fill = V1[1]) != 1), FUN = function(i) i[1]))

-:: na.locf()

library(zoo)
library(magrittr)
library(data.table)
df$V2 <- df$V1 %>% replace(DF$V1 == shift(DF$V1, fill = DF$V1[1]) + 1, NA) %>% na.locf()

Benchmark

, . , , 1, double 1L, , integer. , NA_integer_ NA.

, - , , data.table . .

data.frame 1 M 2 M . , Out data.frame. data.table DF.

library(data.table)
n <- 1e6L
f <- 2L
set.seed(1234L)
DF <- data.frame(V1 = sort(sample.int(f*n, n)),
                 Out = 1:n)
DT <- data.table(DF)
DT

12 , , 24 .

library(magrittr)
library(microbenchmark)
bm <- microbenchmark(
  ave_diff = DF$Out <- with(DF, ave(V1, cumsum(c(1, diff(V1) != 1)), FUN = function(i) i[1])),
  ave_shift = DF$Out <- with(DF, ave(V1, cumsum(V1 - shift(V1, fill = V1[1]) != 1), FUN = function(i) i[1])),
  zoo_diff = {DF$Out <- DF$V1; DF$Out[c(FALSE, diff(DF$V1) == 1)] <- NA; DF$Out <- zoo::na.locf(DF$Out)},
  zoo_pipe = DF$Out <- DF$V1 %>% replace(c(FALSE, diff(DF$V1) == 1), NA) %>% zoo::na.locf(),
  zoo_shift = DF$Out <- DF$V1 %>% replace(DF$V1 == shift(DF$V1, fill = DF$V1[1]) + 1, NA) %>% zoo::na.locf(),
  dp_diff = r2 <- DF %>% 
    dplyr::group_by(grp = cumsum(c(1, diff(V1) != 1))) %>% 
    dplyr::mutate(Out = first(V1)),
  dp_lag = r3 <- DF %>% 
    dplyr::group_by(grp = cumsum(V1 - dplyr::lag(V1, default = V1[1]) != 1)) %>% 
    dplyr::mutate(Out = first(V1)),
  dt_diff = DT[, Out := V1[1], by = cumsum(c(1, diff(V1) != 1))],
  dt_shift1 = DT[, Out := V1[1], by = cumsum(V1 - shift(V1, fill = V1[1]) != 1)],
  dt_shift2 = DT[, Out := V1[1], by = cumsum(V1 != shift(V1, fill = V1[1]) + 1)],
  dt_zoo_diff = DT[, Out := V1][c(FALSE, diff(DF$V1) == 1), Out := NA][, Out := zoo::na.locf(Out)],
  dt_zoo_shift = DT[, Out := V1][V1 == shift(V1, fill = V1[1]) + 1, Out := NA][, Out := zoo::na.locf(Out)],
  ave_diff_L = DF$Out <- with(DF, ave(V1, cumsum(c(1L, diff(V1) != 1L)), FUN = function(i) i[1L])),
  ave_shift_L = DF$Out <- with(DF, ave(V1, cumsum(V1 - shift(V1, fill = V1[1L]) != 1L), FUN = function(i) i[1L])),
  zoo_diff_L = {DF$Out <- DF$V1; DF$Out[c(FALSE, diff(DF$V1) == 1L)] <- NA_integer_; DF$Out <- zoo::na.locf(DF$Out)},
  zoo_pipe_L = DF$Out <- DF$V1 %>% replace(c(FALSE, diff(DF$V1) == 1L), NA_integer_) %>% zoo::na.locf(),
  zoo_shift_L = DF$Out <- DF$V1 %>% replace(DF$V1 == shift(DF$V1, fill = DF$V1[1L]) + 1L, NA_integer_) %>% zoo::na.locf(),
  dp_diff_L = r2 <- DF %>% 
    dplyr::group_by(grp = cumsum(c(1L, diff(V1) != 1L))) %>% 
    dplyr::mutate(Out = first(V1)),
  dp_lag_L = r3 <- DF %>% 
    dplyr::group_by(grp = cumsum(V1 - dplyr::lag(V1, default = V1[1L]) != 1L)) %>% 
    dplyr::mutate(Out = first(V1)),
  dt_diff_L = DT[, Out := V1[1L], by = cumsum(c(1L, diff(V1) != 1L))],
  dt_shift1_L = DT[, Out := V1[1L], by = cumsum(V1 - shift(V1, fill = V1[1L]) != 1L)],
  dt_shift2_L = DT[, Out := V1[1L], by = cumsum(V1 != shift(V1, fill = V1[1L]) + 1L)],
  dt_zoo_diff_L = DT[, Out := V1][c(FALSE, diff(DF$V1) == 1L), Out := NA_integer_][, Out := zoo::na.locf(Out)],
  dt_zoo_shift_L = DT[, Out := V1][V1 == shift(V1, fill = V1[1L]) + 1L, Out := NA_integer_][, Out := zoo::na.locf(Out)],
  times = 20L
)

library(ggplot2)
autoplot(bm)

enter image description here

.

Unit: milliseconds
           expr        min         lq      mean    median        uq       max neval   cld
       ave_diff 2594.89941 2643.32224 2752.9753 2723.7035 2868.6586 3006.0420    20     e
      ave_shift  947.13267 1001.70742 1107.7351 1047.6835 1218.5809 1395.5059    20   c  
       zoo_diff  100.13967  130.23284  197.7273  142.8525  262.1980  428.2976    20 a    
       zoo_pipe  104.98025  112.04101  181.3073  119.5275  185.3215  434.2936    20 a    
      zoo_shift   88.86549   98.49058  177.2143  110.5392  260.1160  416.9985    20 a    
        dp_diff 1148.18227 1219.68396 1303.6350 1290.5575 1344.1400 1628.1786    20    d 
         dp_lag  712.58827  746.77952  804.8908  776.3303  809.8323 1157.2102    20  b   
        dt_diff  226.67524  233.81038  292.0675  241.9369  275.8491  517.1760    20 a    
      dt_shift1  199.64651  207.39276  255.1607  215.7960  223.7947  882.9923    20 a    
      dt_shift2  203.87617  210.06736  260.8550  218.9917  244.7247  499.8797    20 a    
    dt_zoo_diff  109.45194  121.41501  216.3579  159.0960  278.5257  483.1110    20 a    
   dt_zoo_shift   94.59905  109.32432  204.0329  127.0619  373.8622  430.0885    20 a    
     ave_diff_L  992.12820 1041.12873 1127.8128 1071.8525 1217.1493 1457.3166    20   c  
    ave_shift_L  905.41152  973.81932 1063.2237 1015.6805 1170.2522 1323.9317    20   c  
     zoo_diff_L  103.30228  114.63442  227.4359  140.5280  300.3003  822.3366    20 a    
     zoo_pipe_L  103.89433  112.16467  231.3165  133.3362  398.7240  545.7856    20 a    
    zoo_shift_L   91.88764  104.21339  157.6434  138.7488  165.0197  401.3890    20 a    
      dp_diff_L  749.65952  766.00479  851.0737  806.1116  886.6429 1155.3144    20  b   
       dp_lag_L  731.08180  757.95232  823.0169  794.4421  827.7100 1079.2576    20  b   
      dt_diff_L  214.97477  226.80928  241.3575  232.7037  244.8673  323.6259    20 a    
    dt_shift1_L  199.80509  211.20539  277.5616  218.3371  259.9801  513.2925    20 a    
    dt_shift2_L  200.37902  204.23732  224.7275  210.7217  216.6133  470.6335    20 a    
  dt_zoo_diff_L  111.64757  122.62327  162.4947  140.4175  174.0932  409.0788    20 a    
 dt_zoo_shift_L   95.91114  109.24219  164.7059  126.5924  170.2320  388.6558    20 a

:

  • zoo::na.locf() , , na.locf() shift().
  • , data.table .
  • , dplyr.
  • Last is ave(), which is more than 20 times slower than the fastest, and takes up to 3 seconds per turn.
  • Versions shift()/ are lag()always faster than diff().
  • Type conversion matters. Versions with the use of diff(), for example, ave_diff with integer constants are approximately 2.5 times faster than the version with two versions.
+5
source

C dplyrand tidyr:

library(tidyr)
library(dplyr)


> df %>% mutate(
+   diff=c(0,diff(V1))==1,
+   V2=ifelse(diff,NA,V1)
+   ) %>% 
+   fill(V2) %>% 
+   select(-diff)
  V1 V2
1  1  1
2  4  4
3  5  4
4  7  7
5 10 10
6 15 15
7 16 15
8 17 15
9 20 20
+4
source

Source: https://habr.com/ru/post/1684382/


All Articles