Group R frame by related values

I did not find a solution to this general grouping problem in R:

This is my initial dataset.

ID  State
1   A
2   A
3   B
4   B
5   B
6   A
7   A
8   A
9   C
10  C

This should be my grouped result dataset.

State   min(ID) max(ID)
A       1       2
B       3       5
A       6       8
C       9       10

So the idea is to first sort the data set by the identifier column (or timestamp column). Then all connected states without spaces should be grouped together, and the minimum and maximum ID values ​​should be returned. This is due to the rle method, but it does not allow us to calculate the minimum values ​​for groups.

Any ideas?

+4
source share
4 answers

You can try:

library(dplyr)
df %>%
  mutate(rleid = cumsum(State != lag(State, default = ""))) %>%
  group_by(rleid) %>%
  summarise(State = first(State), min = min(ID), max = max(ID)) %>%
  select(-rleid)

, @alistaire, group_by() , . data.table::rleid() summarise_all() :

df %>% 
  group_by(State, rleid = data.table::rleid(State)) %>% 
  summarise_all(funs(min, max)) %>% 
  select(-rleid)

:

## A tibble: 4 × 3
#   State   min   max
#  <fctr> <int> <int>
#1      A     1     2
#2      B     3     5
#3      A     6     8
#4      C     9    10
+6

, rle R .

# get the run length encoding
temp <- rle(df$State)

# construct the data.frame
newDF <- data.frame(State=temp$values,
                    min.ID=c(1, head(cumsum(temp$lengths) + 1, -1)),
                    max.ID=cumsum(temp$lengths))

newDF
  State min.ID max.ID
1     A      1      2
2     B      3      5
3     A      6      8
4     C      9     10

, rle , , as.is .


@cryo111, , , rle. , as.POSIXct, df <- df[order(df$ID),], :

# get the run length encoding
temp <- rle(df$State)

# construct the data.frame
newDF <- data.frame(State=temp$values,
                    min.ID=df$ID[c(1, head(cumsum(temp$lengths) + 1, -1))],
                    max.ID=df$ID[cumsum(temp$lengths)])

< >

df <- read.table(header=TRUE, as.is=TRUE, text="ID  State
1   A
2   A
3   B
4   B
5   B
6   A
7   A
8   A
9   C
10  C")
+5

data.table:

require(data.table)

dt <- fread("ID  State
1   A
            2   A
            3   B
            4   B
            5   B
            6   A
            7   A
            8   A
            9   C
            10  C")

dt[,rle := rleid(State)]
dt2<-dt[,list(min=min(ID),max=max(ID)),by=c("rle","State")]

:

   rle State min max
1:   1     A   1   2
2:   2     B   3   5
3:   3     A   6   8
4:   4     C   9  10

, rleid, min max of ID rle State.

rle

dt2[,rle:=NULL]

:

 dt2<-dt[,list(min=min(ID),max=max(ID)),by=c("rle","State")][,rle:=NULL]

, rleid by :

dt2 <- dt[, .(min=min(ID),max=max(ID)), by=.(State, rleid(State))][, rleid:=NULL]
+4

Here is another attempt using rleand aggregatefrom the R base:

rl <- rle(df$State)
newdf <- data.frame(ID=df$ID, State=rep(1:length(rl$lengths),rl$lengths))
newdf <- aggregate(ID~State, newdf, FUN = function(x) c(minID=min(x), maxID=max(x)))
newdf$State <- rl$values

  # State ID.minID ID.maxID
# 1     A        1        2
# 2     B        3        5
# 3     A        6        8
# 4     C        9       10

<strong> data

df <- structure(list(ID = 1:10, State = c("A", "A", "B", "B", "B", 
"A", "A", "A", "C", "C")), .Names = c("ID", "State"), class = "data.frame", 
row.names = c(NA, 
    -10L))
+2
source

Source: https://habr.com/ru/post/1654696/


All Articles