In R, split a data frame so that each subset also contains the last row of the previous subset and the first row of the next subset

There are many answers to the question of how to split a data frame, for example, How to split a data frame?

However, I would like to split the data frame so that the smaller data frames contain the last row of the previous data frame and the first row of the next frame.

Here is an example

# Example data: nine rows that fall into three consecutive groups of three.
n <- seq_len(9)
group <- rep(letters[1:3], each = 3)
data.frame(n = n, group = group)

  n  group
1 1     a
2 2     a
3 3     a
4 4     b
5 5     b
6 6     b
7 7     c
8 8     c
9 9     c

I would like the result to look like this:

 # Desired result: each piece is one group plus the neighbouring row on either side.
 d1 <- data.frame(n = 1:4, group = rep(c("a", "b"), times = c(3, 1)))
 d2 <- data.frame(n = 3:7, group = rep(c("a", "b", "c"), times = c(1, 3, 1)))
 d3 <- data.frame(n = 6:9, group = rep(c("b", "c"), times = c(1, 3)))
 d <- list(d1, d2, d3)
 d

[[1]]
  n group
1 1     a
2 2     a
3 3     a
4 4     b

[[2]]
  n group
1 3     a
2 4     b
3 5     b
4 6     b
5 7     c

[[3]]
  n group
1 6     b
2 7     c
3 8     c
4 9     c

What is an effective way to accomplish this task?

+4
source share
5 answers

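Suppose DF is the original data.frame, the one with columns n and group. Let n be the number of rows in DF. Define a function extract which, given a sequence of indexes ix, enlarges it to include the index just before the first and the index just after the last, and then extracts those rows of DF. Then split the sequence 1, ..., n by group and apply extract to each component of the split.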

# Number of rows in DF (the example data frame with columns n and group).
n <- nrow(DF)

# Given a vector of row indexes `ix`, widen its range by one row on each side
# (clamped to 1..n) and return that contiguous run of rows from DF.
extract <- function(ix) {
  first <- max(1, min(ix) - 1)
  last <- min(n, max(ix) + 1)
  DF[seq(first, last), ]
}

# Row indexes per group, each expanded to include its neighbouring rows.
rows_by_group <- split(seq_len(n), DF$group)
lapply(rows_by_group, extract)

$a
  n group
1 1     a
2 2     a
3 3     a
4 4     b

$b
  n group
3 3     a
4 4     b
5 5     b
6 6     b
7 7     c

$c
  n group
6 6     b
7 7     c
8 8     c
9 9     c
+5

Here is another approach using good ol' `by`, which "[a]ppl[ies] a Function to a Data Frame Split by Factors [INDICES]".

# For each group, return its rows plus the row just before and just after it.
# Row positions are carried in a helper column `.pos`, so the result does not
# depend on the values stored in column `n` happening to equal the row numbers.
by(data = cbind(df, .pos = seq_len(nrow(df))), INDICES = df$group, function(x) {
  pos <- c(min(x$.pos) - 1L, x$.pos, max(x$.pos) + 1L)
  # Drop the out-of-range neighbours of the first and last group explicitly,
  # rather than with na.omit(), which would also remove real rows holding NA.
  df[pos[pos >= 1L & pos <= nrow(df)], ]
})


# df$group: a
#   n group
# 1 1     a
# 2 2     a
# 3 3     a
# 4 4     b
# -------------------------------------------------------------------------------- 
#   df$group: b
# n group
# 3 3     a
# 4 4     b
# 5 5     b
# 6 6     b
# 7 7     c
# -------------------------------------------------------------------------------- 
#   df$group: c
#   n group
# 6 6     b
# 7 7     c
# 8 8     c
# 9 9     c

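The default print method for `by` objects adds some decoration to the output, but the object is (as usual) a list, which you can check with, e.g., `str` and `names`.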

+4

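Building on @cdetermans' answer: the approach can be generalized by using data.table::shift (or dplyr::lag) to find the indices where the group changes, and then running a simple lapply over the resulting ranges, something like this: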

library(data.table) # v1.9.6+ 
# Row positions at which a new group starts (group differs from the previous row).
# shift() pads the first element with NA by default, so the first comparison is
# NA and which() drops it: row 1 is never reported as a boundary, whatever the
# type of `group`. (A non-NA fill such as TRUE is coerced to the column type and
# can compare unequal to the first value -- NOTE(review): confirm for your
# data.table version.)
indx <- setDT(df)[, which(group != shift(group))]
# Each chunk runs from the row before its group's first row up to the first row
# of the following group; the first chunk starts at 1, the last ends at nrow(df).
lapply(Map(`:`, c(1L, indx - 1L), c(indx, nrow(df))), function(x) df[x, ])
# [[1]]
#    n group
# 1: 1     a
# 2: 2     a
# 3: 3     a
# 4: 4     b
# 
# [[2]]
#    n group
# 1: 3     a
# 2: 4     b
# 3: 5     b
# 4: 6     b
# 5: 7     c
# 
# [[3]]
#    n group
# 1: 6     b
# 2: 7     c
# 3: 8     c
# 4: 9     c
+3

If you are working with a data.frame, why not use a data.table? It can also be combined with parallelism.

library(data.table)
n <- 1:9
group <- rep(c("a", "b", "c"), each = 3)
df <- data.table(n = n, group)
# Fix the factor levels in order of first appearance so that the integer code
# of a group equals its position in `groups` below; plain factor() would sort
# the levels alphabetically and break the `i - 1` / `i + 1` neighbour lookup
# whenever the groups do not appear in sorted order.
df[, group := factor(group, levels = unique(group))]
# group_i: position of the row within its group; group_N: size of the group.
df[, `:=`(group_i = seq_len(.N), group_N = .N), by = "group"]

library(doParallel)
groups <- unique(df$group)
# NOTE: %do% evaluates the iterations sequentially; running them in parallel
# would need a registered backend and %dopar%.
foreach(i = seq_along(groups)) %do% {
  # Keep the rows of group i, the first row of the next group and the last row
  # of the previous group.
  df[
    group == groups[i] |
      (as.integer(group) == i + 1L & group_i == 1L) |
      (as.integer(group) == i - 1L & group_i == group_N),
    c("n", "group"),
    with = FALSE
  ]
}
[[1]]
   n group
1: 1     a
2: 2     a
3: 3     a
4: 4     b
[[2]]
   n group
1: 3     a
2: 4     b
3: 5     b
4: 6     b
5: 7     c
[[3]]
   n group
1: 6     b
2: 7     c
3: 8     c
4: 9     c
+1

dplyr:

library(dplyr)

# `n` and `group` are the example vectors from the question.
# tibble() replaces the deprecated data_frame().
data <-
  tibble(n = n, group = group) %>%
  group_by(group)

# First row of every group, relabelled with the preceding group; the first
# group has no predecessor, so its row is dropped.
firsts <-
  data %>%
  slice(1) %>%
  ungroup() %>%
  mutate(new_group = lag(group)) %>%
  slice(-1)

# Last row of every group, relabelled with the following group; the last
# group has no successor, so its row is dropped.
lasts <-
  data %>%
  slice(n()) %>%
  ungroup() %>%
  mutate(new_group = lead(group)) %>%
  slice(-n())

# Original rows keep their own group (new_group is NA for them); borrowed rows
# take the group they were attached to. coalesce() is type-stable, unlike
# ifelse(), which would turn a factor into its integer codes.
bind_rows(firsts, data, lasts) %>%
  mutate(final_group = coalesce(new_group, group)) %>%
  arrange(final_group, n) %>%
  group_by(final_group)
0
source

Source: https://habr.com/ru/post/1616434/


All Articles