Group an R data frame by runs of consecutive values

Question

Group an R data frame by runs of consecutive values

I did not find a solution to this general grouping problem in R:

This is my initial dataset.

ID  State
1   A
2   A
3   B
4   B
5   B
6   A
7   A
8   A
9   C
10  C

This should be my grouped result dataset.

State   min(ID) max(ID)
A       1       2
B       3       5
A       6       8
C       9       10

So the idea is to first sort the data set by the identifier column (or a timestamp column). Then all consecutive rows with the same state, with no gaps in between, should be grouped together, and the minimum and maximum ID values of each group should be returned. This is related to the rle method, but rle alone does not allow calculating the min and max values for the groups.

Any ideas?

+4

r group-by dataframe run-length-encoding

Hanshupe 15 sept. '16 at 12:56

source share

4 answers

Here is a method that uses the rle function in base R.

# Run-length encode State: one entry per stretch of identical neighbours
temp <- rle(df$State)

# Row position where each run ends, and where each run starts
run_end <- cumsum(temp$lengths)
run_start <- c(1, run_end[-length(run_end)] + 1)

# One row per run. Row positions are reported as IDs, which relies on ID
# being equal to the row number (true for the example data: ID is 1..10).
newDF <- data.frame(State = temp$values,
                    min.ID = run_start,
                    max.ID = run_end)

newDF
  State min.ID max.ID
1     A      1      2
2     B      3      5
3     A      6      8
4     C      9     10

Note that rle requires a character vector rather than a factor, so the data below is read with the as.is argument.

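As @cryo111 notes in the comments, the IDs might be unordered timestamps that do not correspond to the row positions calculated from rle. For this method to work in that case, first convert the timestamps to a date-time format with a function like as.POSIXct, sort the data with df <- df[order(df$ID),], and then use a slight alteration of the method above: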

# Run-length encode State. rle() rejects factors, so a factor column is
# converted to character first; any other column type is passed through as is.
state_values <- df$State
if (is.factor(state_values)) {
  state_values <- as.character(state_values)
}
temp <- rle(state_values)

# Row position of the last and first row of every run. Deriving the start
# from end - length + 1 (instead of prepending a literal 1) keeps both index
# vectors the same length, so an empty df yields an empty result, not an error.
run_end <- cumsum(temp$lengths)
run_start <- run_end - temp$lengths + 1L

# Look the IDs up by position, so ID does not have to equal the row number.
# df must already be sorted by ID for min/max to be the run's first/last ID.
newDF <- data.frame(State = temp$values,
                    min.ID = df$ID[run_start],
                    max.ID = df$ID[run_end])

data

# Example data, one input line per element. as.is = TRUE keeps State as a
# character column instead of converting it to a factor.
df <- read.table(
  text = c(
    "ID  State",
    "1   A",
    "2   A",
    "3   B",
    "4   B",
    "5   B",
    "6   A",
    "7   A",
    "8   A",
    "9   C",
    "10  C"
  ),
  header = TRUE,
  as.is = TRUE
)

+5

lmo 15 . '16 13:14

An approach using data.table:

# library() stops with an error when data.table is not installed; require()
# would only warn and return FALSE, deferring the failure to the fread() call.
library(data.table)

# Example data: ID is the ordering column, State the value whose runs we group.
dt <- fread("ID  State
1   A
2   A
3   B
4   B
5   B
6   A
7   A
8   A
9   C
10  C")

# Tag each stretch of consecutive identical State values with its own run id.
dt[, rle := rleid(State)]

# One row per run: smallest and largest ID inside that run.
dt2 <- dt[, list(min = min(ID), max = max(ID)), by = c("rle", "State")]

Which gives:

   rle State min max
1:   1     A   1   2
2:   2     B   3   5
3:   3     A   6   8
4:   4     C   9  10

The idea is to identify the consecutive runs with rleid, and then take the min and max of ID grouped by both rle and State.

You can then remove the rle column with:

# Drop the helper run-id column from dt2; := modifies dt2 in place,
# so no reassignment is needed.
dt2[,rle:=NULL]

Or in a single chained call:

 # Aggregate per run, then drop the helper run-id column in the same chain.
 dt2 <- dt[, .(min = min(ID), max = max(ID)),
           by = .(rle, State)][, rle := NULL]

You can shorten the code above by using rleid directly inside by:

# Compute the run id inside `by` rather than storing it as a column first;
# it appears in the result as a column named rleid.
dt2 <- dt[, list(min = min(ID), max = max(ID)),
          by = list(State, rleid = rleid(State))]

# The run id was only needed for grouping, so remove it from the result.
dt2[, rleid := NULL]

+4

Tensibai 15 . '16 13:12

Here is another attempt using rle and aggregate from base R:

# Run-length encode State; every run becomes one row of the result.
rl <- rle(df$State)

# Label each row with the index of the run it belongs to. seq_along() is used
# instead of 1:length(), which would produce c(1, 0) when there are no runs.
newdf <- data.frame(ID = df$ID,
                    State = rep(seq_along(rl$lengths), rl$lengths))

# Min and max ID per run. Because FUN returns two values, aggregate() stores
# them as one two-column matrix called ID (printed as ID.minID and ID.maxID).
newdf <- aggregate(ID ~ State, newdf,
                   FUN = function(x) c(minID = min(x), maxID = max(x)))

# aggregate() returns the runs in index order, so the state labels line up.
newdf$State <- rl$values

  # State ID.minID ID.maxID
# 1     A        1        2
# 2     B        3        5
# 3     A        6        8
# 4     C        9       10

Data

# Example data: ID is the ordering key, State the label whose runs are grouped.
# stringsAsFactors = FALSE keeps State as character on every R version.
df <- data.frame(
  ID = 1:10,
  State = c("A", "A", "B", "B", "B", "A", "A", "A", "C", "C"),
  stringsAsFactors = FALSE
)

+2

989 15 sept. '16 at 13:59

source share

Steven Beaupré · Accepted Answer · 2016-09-15T13:04:37+0000

You can try:

library(dplyr)

# The run id increases by one each time State differs from the previous row;
# default = "" makes the first row count as a change. The id is created
# directly inside group_by() and dropped again once each run is summarised.
df %>%
  group_by(rleid = cumsum(State != lag(State, default = ""))) %>%
  summarise(
    State = first(State),
    min = min(ID),
    max = max(ID)
  ) %>%
  select(-rleid)

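As mentioned by @alistaire, you can create the grouping variable directly within group_by(), using the same syntax as mutate, which combines the first two steps. Borrowing data.table::rleid() and using summarise_all() to simplify: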

# Group by State and by run id, so separate runs of the same State stay
# separate, then take min and max of the one remaining column (ID).
# A named list replaces funs(), which is deprecated since dplyr 0.8.0.
df %>%
  group_by(State, rleid = data.table::rleid(State)) %>%
  summarise_all(list(min = min, max = max)) %>%
  select(-rleid)

Which gives:

## A tibble: 4 × 3
#   State   min   max
#  <fctr> <int> <int>
#1      A     1     2
#2      B     3     5
#3      A     6     8
#4      C     9    10

Group R frame by related values

More articles: