R - delete consecutive (ONLY) duplicates

I need to remove rows from a data frame based on repeated values in a given column, but only when the repetitions are consecutive. For example, for the following data frame:

df = data.frame(x=c(1,1,1,2,2,4,2,2,1))
df$y <- c(10,11,30,12,49,13,12,49,30)
df$z <- c(1,2,3,4,5,6,7,8,9)

x  y z
1 10 1
1 11 2
1 30 3
2 12 4
2 49 5
4 13 6
2 12 7
2 49 8
1 30 9

I need to delete the rows with consecutive duplicate values in column x, keeping only the last row of each run of duplicates and preserving the data frame structure:

x  y z
1 30 3
2 49 5
4 13 6
2 49 8
1 30 9

Following the directions in the help and some other posts, I tried to use the function duplicated:

df[ !duplicated(x,fromLast=TRUE), ] # which gives me this:
      x  y  z
1     1 10  1
6     4 13  6
7     2 12  7
9     1 30  9
NA   NA NA NA
NA.1 NA NA NA
NA.2 NA NA NA
NA.3 NA NA NA
NA.4 NA NA NA
NA.5 NA NA NA
NA.6 NA NA NA
NA.7 NA NA NA
NA.8 NA NA NA

I don’t know why I get the NA rows at the end (which was not the case with the same table I tested), but it only works partially on the values.

I also tried using the package data.table as follows:

library(data.table)
dt <- as.data.table(df)           
setkey(dt, x)                    
dt[J(unique(x)), mult ='last'] 

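However, this keeps only the last row for each distinct value of x, rather than the last row of each consecutive run: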

x  y z
1 30 9
2 49 8
4 13 6

, , . , , . .

+4
4

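You just need to check that a value is not followed by a duplicate, i.e. x[i + 1] != x[i], and note that the last row is always kept.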

df[c(df$x[-1] != df$x[-nrow(df)],TRUE),]
  x  y z
3 1 30 3
5 2 49 5
6 4 13 6
8 2 49 8
9 1 30 9
+5

:

df[cumsum(rle(df$x)$lengths),]

:

rle(df$x)

returns the lengths and values of the runs of consecutive equal values in x. The next expression:

rle(df$x)$lengths

extracts just the run lengths. Finally:

cumsum(rle(df$x)$lengths)

gives the row index of the last element of each run, which can then be used to select rows with [.

A microbenchmark comparing the rle method, rle, with the consecutive-comparison method, consec, by @James, and the dplyr method, dp, by @Nik.

#> Unit: microseconds
#>    expr       min         lq       mean     median         uq        max
#>     rle   134.389   145.4220   162.6967   154.4180   172.8370    375.109
#>  consec   111.411   118.9235   136.1893   123.6285   145.5765    314.249
#>      dp 20478.898 20968.8010 23536.1306 21167.1200 22360.8605 179301.213

rle is slightly slower than consec, but both are far faster than the dplyr approach.

+5

dplyr, :

:

library(dplyr)
df %>% 
  mutate(id = lag(x, 1), 
         decision = if_else(x != id, 1, 0), 
         final = lead(decision, 1, default = 1)) %>% 
  filter(final == 1) %>% 
  select(-id, -decision, -final)

:

  x  y z
1 1 30 3
2 2 49 5
3 4 13 6
4 2 49 8
5 1 30 9

, x

:

df2 <- df %>% add_row(x = 1, y = 10, z = 12)
df2

   x  y  z
1  1 10  1
2  1 11  2
3  1 30  3
4  2 12  4
5  2 49  5
6  4 13  6
7  2 12  7
8  2 49  8
9  1 30  9
10 1 10 12

:

df2 %>% 
  mutate(id = lag(x, 1), 
         decision = if_else(x != id, 1, 0), 
         final = lead(decision, 1, default = 1)) %>% 
  filter(final == 1) %>% 
  select(-id, -decision, -final)

:

  x  y  z
1 1 30  3
2 2 49  5
3 4 13  6
4 2 49  8
5 1 10 12
+2

A data.table solution. Here, x is compared with x shifted by one position using shift:

library(data.table)
dattab <- as.data.table(df)
dattab[x != shift(x = x, n = 1, fill = -999, type = "lead")] # edited to add closing )

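A row is kept when its x differs from the x of the next row. The fill value, -999, is what the last row is compared against, so it must be a value that does not occur in x; this ensures the last row is always kept.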

+1
source

Source: https://habr.com/ru/post/1694933/


All Articles