The most efficient way to determine if an element exists in a vector

I have several algorithms that depend on efficiently determining whether an element exists in a vector. It seems to me that %in% (which is equivalent to is.element()) should be the most efficient, since it simply returns a Boolean value. After testing several methods, to my surprise, these methods turned out to be by far the least efficient. Below is my analysis (the results deteriorate as the size of the vectors increases):

# Time five ways of testing, one element at a time, whether each of `n`
# sampled values occurs in a vector of `Lim` values. One system.time() result
# is printed per method, in this order: which(), %in%, is.element(), match(),
# BinVecCheck().
#
# Args:
#   n:   number of lookups to perform; also the size of the sample drawn
#        from 1..Lim.
#   Lim: number of values in the vector that is searched.
#
# Returns (invisibly, via print) the combined sort + search timing of the
# BinVecCheck() method.
EfficiencyTest <- function(n, Lim) {

    samp1 <- sample(Lim, n)
    set1 <- sample(Lim, Lim)

    # seq_len(n) instead of 1:n: when n is 0, 1:n counts down over c(1, 0)
    # and the loops would look up elements that do not exist.
    print(system.time(for(i in seq_len(n)) {which(set1==samp1[i])}))
    print(system.time(for(i in seq_len(n)) {samp1[i] %in% set1}))
    print(system.time(for(i in seq_len(n)) {is.element(samp1[i], set1)}))
    print(system.time(for(i in seq_len(n)) {match(samp1[i], set1)}))
    # The binary search needs sorted input, so the sort is timed as well and
    # added to the search time to keep the comparison fair.
    a <- system.time(set1 <- sort(set1))
    b <- system.time(for (i in seq_len(n)) {BinVecCheck(samp1[i], set1)})
    print(a+b)
}

> EfficiencyTest(10^3, 10^5)
user  system elapsed 
0.29    0.11    0.40 
user  system elapsed 
19.79    0.39   20.21 
user  system elapsed 
19.89    0.53   20.44 
user  system elapsed 
20.04    0.28   20.33 
user  system elapsed 
0.02    0.00    0.03 

where BinVecCheck is the binary search algorithm I wrote that returns TRUE/FALSE. Please note that I include the time needed to sort the vector in the timing for the final method. Here is the code for the binary search:

# Binary search membership test.
#
# Args:
#   tar: a single value to look for.
#   vec: a vector sorted in ascending order and free of NA (e.g. the result
#        of sort()).
#
# Returns TRUE if `tar` occurs in `vec`, FALSE otherwise. An empty `vec` or an
# NA `tar` yields FALSE.
BinVecCheck <- function(tar, vec) {
    size <- length(vec)
    if (size == 0L || is.na(tar)) {
        return(FALSE)
    }
    # Closed search interval [lower, upper]; every step discards the probed
    # position itself, so the interval strictly shrinks and no index is
    # skipped or revisited.
    lower <- 1L
    upper <- size
    while (lower <= upper) {
        mid <- lower + (upper - lower) %/% 2L
        probe <- vec[mid]
        if (probe == tar) {
            return(TRUE)
        }
        if (probe < tar) {
            lower <- mid + 1L
        } else {
            upper <- mid - 1L
        }
    }
    FALSE
}

Session info:

> sessionInfo()
R version 3.2.1 (2015-06-18)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

, R? , R Python set, ? , %in% .. which, ( , )?

+4
4

, (?) ( , , r-devel@r-project.org, , , fastmatch ...)

 # Fixed seed so the sampled inputs are the same on every run.
 set.seed(101)
 # n lookups against a vector of Lim values.
 n <- 10^3
 Lim <- 10^5
 samp1 <- sample(x = Lim, size = n)
 set1 <- sample(x = Lim, size = Lim)
 library("rbenchmark")

 library("fastmatch")
 # Membership operator built on fastmatch::fmatch(), mirroring the shape of
 # base `%in%`.
 #
 # Args:
 #   x:     values to look up.
 #   table: values to search in.
 #
 # Returns a logical vector, TRUE where the element of `x` was found in `table`.
 `%fin%` <- function(x, table) {
     # Call fmatch() through its namespace instead of running require() on
     # every call: the check added overhead to each timed lookup and attached
     # the package as a side effect. A missing package still raises an error.
     # nomatch = 0L maps "not found" to position 0, so any positive position
     # means the element is present.
     fastmatch::fmatch(x, table, nomatch = 0L) > 0L
 }
 # Compare four per-element membership tests on the same samp1/set1 data:
 # each labelled expression is run 20 times and the table reports elapsed and
 # relative time per label.
 benchmark(which=sapply(samp1,function(x) which(set1==x)),
            infun=sapply(samp1,function(x) x %in% set1),
            fin= sapply(samp1,function(x) x %fin% set1),
            # sort() is written inside the timed expression, so the cost of
            # sorting is included in the brc timing.
            brc= sapply(samp1,BinVecCheck,vec=sort(set1)),
            replications=20,
     columns = c("test", "replications", "elapsed", "relative"))

##    test replications elapsed relative
## 4   brc           20   0.871    2.329
## 3   fin           20   0.374    1.000
## 2 infun           20   6.480   17.326
## 1 which           20  10.634   28.433

, %in% , which - BinVecCheck 7 , fastmatch 2 . , Rcpp ... , :

##    user  system elapsed   (which)
##   0.488   0.096   0.586 
##    user  system elapsed   (%in%) 
##   0.184   0.132   0.315 
##    user  system elapsed  (is.element)
##   0.188   0.124   0.313 
##    user  system elapsed  (match)
##   0.148   0.164   0.312 
##    user  system elapsed  (BinVecCheck)
##   0.048   0.008   0.055 

Update: in the latest r-devel (the development version of R), the R NEWS file says:

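match(x, table) is faster, sometimes by an order of magnitude, when x is of length one and incomparables is unchanged, thanks to Peter Haverty (PR#16491).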

sessionInfo()
## R Under development (unstable) (2015-10-23 r69563)
## Platform: i686-pc-linux-gnu (32-bit)
## Running under: Ubuntu precise (12.04.5 LTS)
+8

%in% is just a thin wrapper around match, defined as:

"%in%" <- function(x, table) match(x, table, nomatch = 0) > 0

match which ( C) , .Internal(). pryr:

# One-time setup: install the pryr package, then attach it.
install.packages("pryr")
library(pryr)
# Look up the C code behind the .Internal() entry points of which() and
# match(). NOTE(review): presumably this displays the C implementation of
# each call -- confirm against the pryr documentation.
pryr::show_c_source(.Internal(which(x)))
pryr::show_c_source(.Internal(match(x, table, nomatch, incomparables)))

, . which , .., match. ( ).

+3

, . , @ben-bolker, %fin% . , , ( samp1) (set1). , .

, TRUE, ?

if (tar==vec[1] || tar==vec[size]) {return(TRUE)}

else .

-, , (set1) . , size . .

# Binary search membership test that takes the vector length as an argument,
# so a caller doing many lookups against the same vector can compute the
# length once.
#
# Args:
#   tar:  a single value to look for.
#   vec:  a vector sorted in ascending order and free of NA.
#   size: number of leading elements of `vec` to search; defaults to the
#         whole vector.
#
# Returns TRUE if `tar` occurs among the first `size` elements of `vec`,
# FALSE otherwise. A `size` below 1 or an NA `tar` yields FALSE.
ModifiedBinVecCheck <- function(tar, vec, size = length(vec)) {
    if (size < 1L || is.na(tar)) {
        return(FALSE)
    }
    # Closed search interval [lower, upper]; every step discards the probed
    # position itself, so the interval strictly shrinks and no index is
    # skipped or revisited. The end points are covered by the search, so no
    # separate first/last comparison is needed.
    lower <- 1L
    upper <- size
    while (lower <= upper) {
        mid <- lower + (upper - lower) %/% 2L
        probe <- vec[mid]
        if (probe == tar) {
            return(TRUE)
        }
        if (probe < tar) {
            lower <- mid + 1L
        } else {
            upper <- mid - 1L
        }
    }
    FALSE
}

, , . sort - shell, , ( ) , quick (quick ). quick ( ) , ( ). , fmatch match , , .

n.

Case1 (n = 10^3 Lim = 10^6, n to Lim ratio is 1:1000):

# Case 1 inputs: n = 10^3 values to look up in a vector of Lim = 10^6 values.
n <- 10^3; Lim <- 10^6
# Fixed seed so both samples are the same on every run.
set.seed(101)
samp1 <- sample(Lim,n)
set1 <- sample(Lim,Lim)
# Time three per-element membership tests, 10 replications each:
#   fin    - %fin% against the unsorted vector
#   brc    - ModifiedBinVecCheck on the vector sorted with method = "quick"
#   oldbrc - BinVecCheck on the vector sorted with the default method
# sort() is written inside the timed expressions, so sorting is part of the
# brc and oldbrc timings.
benchmark(fin= sapply(samp1,function(x) x %fin% set1),
            brc= sapply(samp1,ModifiedBinVecCheck,vec=sort(set1, method = "quick"),size=Lim),
            oldbrc= sapply(samp1,BinVecCheck,vec=sort(set1)),
            replications=10,
            columns = c("test", "replications", "elapsed", "relative"))
test replications elapsed relative
2    brc           10    0.97    4.217
1    fin           10    0.23    1.000
3 oldbrc           10    1.45    6.304

Case2 (n = 10^4 Lim = 10^6, n to Lim ratio is 1:100):

# Case 2 inputs: n = 10^4 values to look up in a vector of Lim = 10^6 values.
n <- 10^4; Lim <- 10^6
# Fixed seed so both samples are the same on every run.
set.seed(101)
samp1 <- sample(Lim,n)
set1 <- sample(Lim,Lim)
# Time three per-element membership tests, 10 replications each:
#   fin    - %fin% against the unsorted vector
#   brc    - ModifiedBinVecCheck on the vector sorted with method = "quick"
#   oldbrc - BinVecCheck on the vector sorted with the default method
# sort() is written inside the timed expressions, so sorting is part of the
# brc and oldbrc timings.
benchmark(fin= sapply(samp1,function(x) x %fin% set1),
            brc= sapply(samp1,ModifiedBinVecCheck,vec=sort(set1, method = "quick"),size=Lim),
            oldbrc= sapply(samp1,BinVecCheck,vec=sort(set1)),
            replications=10,
            columns = c("test", "replications", "elapsed", "relative"))
test replications elapsed relative
2    brc           10    2.08    1.000
1    fin           10    2.16    1.038
3 oldbrc           10    2.57    1.236

Case3: (n = 10^5 Lim = 10^6, n to Lim ratio is 1:10):

# Case 3 inputs: n = 10^5 values to look up in a vector of Lim = 10^6 values.
n <- 10^5; Lim <- 10^6
# Fixed seed so both samples are the same on every run.
set.seed(101)
samp1 <- sample(Lim,n)
set1 <- sample(Lim,Lim)
# Time three per-element membership tests, 10 replications each:
#   fin    - %fin% against the unsorted vector
#   brc    - ModifiedBinVecCheck on the vector sorted with method = "quick"
#   oldbrc - BinVecCheck on the vector sorted with the default method
# sort() is written inside the timed expressions, so sorting is part of the
# brc and oldbrc timings.
benchmark(fin= sapply(samp1,function(x) x %fin% set1),
            brc= sapply(samp1,ModifiedBinVecCheck,vec=sort(set1, method = "quick"),size=Lim),
            oldbrc= sapply(samp1,BinVecCheck,vec=sort(set1)),
            replications=10,
            columns = c("test", "replications", "elapsed", "relative"))
    test replications elapsed relative
2    brc           10   13.13    1.000
1    fin           10   21.23    1.617
3 oldbrc           10   13.93    1.061

Case4: (n = 10^6 Lim = 10^6, n to Lim ratio is 1:1):

# Case 4 inputs: n = 10^6 values to look up in a vector of Lim = 10^6 values.
n <- 10^6; Lim <- 10^6
# Fixed seed so both samples are the same on every run.
set.seed(101)
samp1 <- sample(Lim,n)
set1 <- sample(Lim,Lim)
# Time three per-element membership tests, 10 replications each:
#   fin    - %fin% against the unsorted vector
#   brc    - ModifiedBinVecCheck on the vector sorted with method = "quick"
#   oldbrc - BinVecCheck on the vector sorted with the default method
# sort() is written inside the timed expressions, so sorting is part of the
# brc and oldbrc timings.
benchmark(fin= sapply(samp1,function(x) x %fin% set1),
            brc= sapply(samp1,ModifiedBinVecCheck,vec=sort(set1, method = "quick"),size=Lim),
            oldbrc= sapply(samp1,BinVecCheck,vec=sort(set1)),
            replications=10,
            columns = c("test", "replications", "elapsed", "relative"))
   test replications elapsed relative
2    brc           10  124.61    1.000
1    fin           10  214.20    1.719
3 oldbrc           10  127.39    1.022


, n Lim, ( ) . 1, %fin% 4 , , Case2 , 3 , 4 %fin%.

, " ?", . %fin% , ModifiedBinVecCheck .

+2

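any(x == "foo") should be fast enough if you can be sure that x contains no NA values. If x may contain NA, R 3.3 has a speed-up for %in% that will help.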

. findInterval , . , x .

+1

Source: https://habr.com/ru/post/1613973/


All Articles