How to get the nth match of elements between two vectors in R?

match returns the position of the first matches between its first and second arguments:

 match(c("a","c"), c("a", "a", "b", "c", "c", "c")) # 1 4 

What is the best way to indicate matches other than the first? For example, we want the second match for "a" and the third for "c" (so we would get: 2 6 ).

Update: inefficient solution n is looking for:

 value_index_query <- data.frame(value = c("a", "c"), index = c(2, 3)) id <- c("a", "a", "b", "c", "c", "c") apply(value_index_query, 1, function(value_index) { value <- value_index[1] index <- as.integer(value_index[2]) which(id == value)[index] }) 
+5
source share
8 answers

It also uses mapply to start two columns in tandem via operation (.) [N].

 with(value_index_query, mapply( function(target, nth) which(id==target)[nth], target=value, nth=index) ) [1] 2 6 
+7
source

Here is the data.table solution, in which we join the id vector with a mapping table. Then we can use .EACHI to group, capturing the index from .I for each group.

 library(data.table) ## 'dti' would be your 'value_index_query' with the 'value' column renamed dti <- data.table(id = c("a", "c"), index = c(2, 3)) ## join it with 'id' and take 'index' by group data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1 # [1] 2 6 

We can put this in a function:

 viq <- function(id, value, index) { dti <- data.table(id = value, index = index) data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1 } id <- c("a", "a", "b", "c", "c", "c") viq(id, c("a", "c"), 2:3) # [1] 2 6 viq(id, c("a", "c"), c(2, 4)) # [1] 2 NA viq(id, c("a", "b", "c"), c(2, 1, 4)) # [1] 2 3 NA viq(id, c("a", "b", "c"), c(2, 1, 3)) # [1] 2 3 6 
+6
source

One at a time with grep .

 vec <- c("a", "a", "b", "c", "c", "c") aa <-grep("a", vec)[2] #2nd cc <-grep("c", vec)[3] #3rd c(aa,cc) #[1] 2 6 
+3
source

Here is the dplyr way

 library(dplyr) test = data_frame(value = c("a","c"), order = c(2, 3)) original = data_frame(value = c("a", "a", "b", "c", "c", "c")) original %>% mutate(ID = 1:n()) %>% right_join(test) %>% group_by(value) %>% slice(order %>% first) 
+3
source

How about this ?:

 mapply(function(x,y) x[[y]], x = sapply(v1, function(x) which(x == v2)), y = c(2,3)) ac 2 6 
+2
source

For comparison, the (maybe not perfect, I'm still studying) Rcpp solution with some timings with three other basic approaches.

 library(Rcpp) library(microbenchmark) library(data.table) library(dplyr) foo_mapply <- function(value,id,index){ mapply( function(target, nth, id) which(id==target)[nth], target=value, nth=index,MoreArgs = list(id = id)) } foo_dt <- function(dti,id){ data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1 } foo_dplyr <- function(test,original){ original %>% mutate(ID = 1:n()) %>% right_join(test,by = "value") %>% group_by(value) %>% slice(order %>% first) } cppFunction('IntegerVector nmatch(CharacterVector value,CharacterVector id,IntegerVector index){ int nvalue = value.size(); int nid = id.size(); int completed = 0; IntegerVector match_count(nvalue,0); IntegerVector out(nvalue,IntegerVector::get_na()); for (int i = 0; i < nid; ++i){ for (int j = 0; j < nvalue; ++j){ if (value[j] == id[i]){ match_count[j] = match_count[j] + 1; if (match_count[j] == index[j]){ out[j] = i + 1; completed++; } } } if (completed == nvalue){ break; } } return out; }') 

And the synchronization results:

 > #One with all matches relatively early > set.seed(123) > value <- c("a","b", "c") > index <- c(150,50,500) > id <- sample(letters[1:5],10000,replace = TRUE) > dti <- data.table(id = value,index = index) > test = data_frame(value = value, order = index) > original = data_frame(value = id) > > microbenchmark(nmatch(value = value, id = id,index = index), + foo_mapply(value = value,id = id,index = index), + foo_dt(dti = dti,id = id), + foo_dplyr(test = test,original = original)) Unit: microseconds expr min lq mean median uq max neval cld nmatch(value = value, id = id, index = index) 118.326 121.9060 124.2930 122.8535 124.5040 167.713 100 a foo_mapply(value = value, id = id, index = index) 863.281 873.1505 949.8326 878.8535 896.7795 2119.411 100 b foo_dt(dti = dti, id = id) 1860.678 1927.0990 2038.5965 1985.2720 2082.7900 3761.116 100 c foo_dplyr(test = test, original = original) 2862.143 2943.7280 3175.9202 2986.2385 3121.7685 4502.976 100 d > #One with a match that forces us nearer the end of the list > set.seed(123) > value <- c("a","b", "c") > index <- c(150,50,2000) > id <- sample(letters[1:5],10000,replace = TRUE) > dti <- data.table(id = value,index = index) > test = data_frame(value = value, order = index) > original = data_frame(value = id) > > microbenchmark(nmatch(value = value, id = id,index = index), + foo_mapply(value = value,id = id,index = index), + foo_dt(dti = dti,id = id), + foo_dplyr(test = test,original = original)) Unit: microseconds expr min lq mean median uq max neval cld nmatch(value = value, id = id, index = index) 469.208 473.4735 481.0698 475.1040 487.7145 560.031 100 a foo_mapply(value = value, id = id, index = index) 861.797 872.6845 949.6749 882.5335 903.1255 2091.864 100 a foo_dt(dti = dti, id = id) 1821.554 1924.5690 2022.2231 1977.5970 2082.6035 3300.399 100 b foo_dplyr(test = test, original = original) 2875.626 2945.7560 3681.2624 2995.7900 3100.3235 53508.339 100 c 
+2
source

Using this setting

 set.seed(123) id <- sample(letters[1:5], 10000, replace = TRUE) value <- c("a", "b", "c") index <- c(150, 50, 500) 

Index and then split id vector

 index_by_id <- split(seq_along(id), id) 

Match values ​​with their elements in id_by_value

 value_idx <- match(value, names(index_by_id)) 

Choose the i-th element of each match

 mapply(`[`, index_by_id[value_idx], index) 

And as a function:

 f1 <- function(id, value, index) { index_by_id <- split(seq_along(id), id) value_idx <- match(value, names(index_by_id)) mapply(`[`, index_by_id[value_idx], index) } 

It will be fast when value long, but with several levels, for example,

 f0 <- function(id, value, index) mapply(function(target, nth) which(id==target)[nth], value, index) viq <- function(id, value, index) { dti <- data.table(id = value, index = index) data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1 } 

and

 > value <- rep(value, 100) > identical(f0(id, value, index), f1(id, value, index)) [1] TRUE > all.equal(f0(id, value, index), viq(id, value, index), + check.attributes=FALSE) [1] TRUE > microbenchmark(f0(id, value, index), f1(id, value, index), + viq(id, value, index)) Unit: milliseconds expr min lq mean median uq f0(id, value, index) 53.166878 54.909566 56.917717 55.336116 56.503741 f1(id, value, index) 1.682265 1.716843 1.883576 1.755070 1.831189 viq(id, value, index) 4.304148 4.381708 4.667590 4.656087 4.757184 max neval 99.621742 100 3.291769 100 6.590130 100 
+1
source

Option to @ 42-answer

 mapply( function(value, index) which(value == id)[index], value = value_index_query$value, index = value_index_query$index ) 
0
source

Source: https://habr.com/ru/post/1238438/


All Articles