Select the nth value of the aggregated column after the group in R

Considering dfas follows:

  # group value
# 1     A     8
# 2     A     1
# 3     A     7
# 4     B     3
# 5     B     2
# 6     B     6
# 7     C     4
# 8     C     5

df <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L, 
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA, 
-8L))

And the index vector (possibly c NA):

inds <- c(2,1,NA)

How can we get the nth column element valuefor each group, preferably in the R base ?

For example, based on indswe need the second element valuein the group A, the first element in the group B, NAin the group C. Thus, the result will be:

#[1] 1 3 NA
+4
source share
5 answers

Here is a solution with mapplyand split:

mapply("[", with(df, split(value, group)), inds)

which returns a named vector

 A  B  C 
 1  3 NA

with(df, split(value, group)) . mapply "inds" "[" .

+5

levels sapply, :

DF <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L, 
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA, 
-8L))


inds <- c(2,1,NA)

lvls = levels(DF$group)

groupInds = sapply(1:length(lvls),function(x) DF$value[DF$group==lvls[x]][inds[x]]  )

groupInds
#[1]  1  3 NA
+2

Usage again mapply (but not as elegant as IMO answer):

 mapply(function(x, y) subset(df, group == x, value)[y,] ,levels(df$group), inds)
+1
source

I know what you said preferably in the R base, but just for the record, here's the data.tableway

setDT(df)[, .SD[inds[.GRP], value], by=group][,V1]
#[1]  1  3 NA
+1
source

I just came up with another solution:

diag(aggregate(value~group, df, function(x) x[inds])[,-1])
#[1]  1  3 NA

Benchmarking

library(microbenchmark)
library(data.table)
df <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L, 
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA, 
-8L))
inds <- c(2,1,NA)

f_Imo <- function(df) as.vector(mapply("[", with(df, split(value, group)), inds))
f_Osssan <- function(df) {lvls = levels(df$group);sapply(1:length(lvls),function(x) df$value[df$group==lvls[x]][inds[x]])}
f_User2321 <- function(df)  unlist(mapply(function(x, y) subset(df, group == x, value)[y,] ,levels(df$group), inds))
f_dww <- function(df) setDT(df)[, .SD[inds[.GRP], value], by=group][,V1]
f_m0h3n <- function(df) diag(aggregate(value~group, df, function(x) x[inds])[,-1])

all.equal(f_Imo(df), f_Osssan(df), f_User2321(df), f_dww(df), f_m0h3n(df))
# [1] TRUE

microbenchmark(f_Imo(df), f_Osssan(df), f_m0h3n(df), f_User2321(df), f_dww(df))

# Unit: microseconds
           # expr      min        lq       mean   median        uq      max neval
      # f_Imo(df)   71.004   85.1180   91.52996   91.748   96.8810  121.048   100
   # f_Osssan(df)  252.788  276.5265  318.70529  287.648  301.5495 2651.492   100
    # f_m0h3n(df) 1422.627 1555.4365 1643.47184 1618.740 1670.7095 4729.827   100
 # f_User2321(df) 2889.738 3000.3055 3148.44916 3037.945 3118.7860 6013.442   100
      # f_dww(df) 2960.740 3086.2790 3206.02147 3143.381 3250.9545 5976.229   100
0
source

Source: https://habr.com/ru/post/1658112/


All Articles