R how to do this for this operation

In R, how can I best draw this operation?

I have a reference table with a lower limit (A) and an upper (B).

I also have a table of values ​​(X) to search in comparison to the above table.

For each value of X, I need to determine whether it BETWEEN ANY of the values ​​of A and B in the lookup table.

To demonstrate the above, here is a solution using a loop:

#For Reproduceability,
set.seed(1); 

#Set up the Reference and Lookup Tables
nref = 5; nlook = 10
referenceTable <- data.frame(A=runif(nref,min=0.25,max=0.5),
                             B=runif(nref,min=0.50,max=0.75));
lookupTable    <- data.frame(X=runif(nlook),IsIn=0)

#Process for each row in the lookup table
#search for at least one match in the reference table where A <= X < B
for(x in 1:nrow(lookupTable)){
  v   <- lookupTable$X[x]
  tmp <- subset(referenceTable,v >= A & v < B)
  lookupTable[x,'IsIn'] = as.integer(nrow(tmp) > 0)
}

I am trying to remove a component for(x in .... ), since the table in my real life problem is many thousands of records.

+4
source share
2 answers

, data.table::foverlaps. lookupTable, . key referenceTable (, foverlaps ), , ( 0^, )

library(data.table)
setDT(lookupTable)[, Y := X] # Add an additional boundary column
setkey(setDT(referenceTable)) # Key the referenceTable data set
lookupTable[, IsIn := 0 ^ !foverlaps(lookupTable, 
                                     referenceTable, 
                                     by.x = c("X", "Y"),
                                     mult = "first", 
                                     nomatch = 0L, 
                                     which = TRUE)]
#             X IsIn         Y
#  1: 0.2059746    0 0.2059746
#  2: 0.1765568    0 0.1765568
#  3: 0.6870228    1 0.6870228
#  4: 0.3841037    1 0.3841037
#  5: 0.7698414    0 0.7698414
#  6: 0.4976992    1 0.4976992
#  7: 0.7176185    1 0.7176185
#  8: 0.9919061    0 0.9919061
#  9: 0.3800352    1 0.3800352
# 10: 0.7774452    0 0.7774452
+6

findInterval

referenceTable2 = cbind(-Inf, referenceTable)

for(x in 1:nrow(referenceTable2)){
  tmp <- findInterval(lookupTable$X, referenceTable2[x,])
  lookupTable[,'IsIn'] = lookupTable[,'IsIn'] + (tmp == 2)
}
lookupTable[,'IsIn'] = sign(lookupTable[,'IsIn'])

, , , . :

1) :

> microbenchmark(nicholas = {set.seed(1); nref = 5; nlook = 10; referenceTable <- data.frame(A=runif(nref,min=0.25,max=0.5), B=runif(nref,min=0.50,max=0.75)); lookupTable <- data.frame(X=runif(nlook),IsIn=0); for(x in 1:nrow(lookupTable)){v   <- lookupTable$X[x]; tmp <- subset(referenceTable,v >= A & v < B); lookupTable[x,'IsIn'] = as.integer(nrow(tmp) > 0)}}, 
+                mts = {set.seed(1); nref = 5; nlook = 10; referenceTable <- data.frame(A=runif(nref,min=0.25,max=0.5), B=runif(nref,min=0.50,max=0.75)); lookupTable <- data.frame(X=runif(nlook),IsIn=0); referenceTable2 = cbind(-Inf, referenceTable); for(x in 1:nrow(referenceTable2)){tmp <- findInterval(lookupTable$X, referenceTable2[x,]); lookupTable[,'IsIn'] = lookupTable[,'IsIn'] + (tmp == 2);}; lookupTable[,'IsIn'] = sign(lookupTable[,'IsIn'])},
+                david = {set.seed(1); nref = 5; nlook = 10; referenceTable <- data.frame(A=runif(nref,min=0.25,max=0.5), B=runif(nref,min=0.50,max=0.75)); lookupTable <- data.frame(X=runif(nlook),IsIn=0); setDT(lookupTable)[, Y := X]; setkey(setDT(referenceTable)); lookupTable[, IsIn := 0 ^ !foverlaps(lookupTable, referenceTable, by.x = c("X", "Y"), mult = "first", nomatch = 0L, which = TRUE)]},
+                times = 100)
Unit: milliseconds
     expr      min       lq     mean   median       uq       max neval
 nicholas 1.948096 2.091311 2.190386 2.150790 2.254352  4.092121   100
      mts 2.520489 2.711986 2.883299 2.803421 2.885990  5.165999   100
    david 6.466129 7.013798 7.344596 7.197132 7.422916 12.274029   100

2) nref = 10; nlook = 1000

Unit: milliseconds
     expr        min         lq       mean     median         uq        max neval
 nicholas 152.804680 160.681265 164.601443 163.304849 165.387296 243.250708   100
      mts   4.434997   4.720027   5.025555   4.819624   4.991995  11.325172   100
    david   6.505314   6.920032   7.181116   7.111529   7.331950   9.318765   100

3) nref = 200; nlook = 1000

Unit: milliseconds
     expr        min         lq       mean     median         uq       max neval
 nicholas 172.939666 179.672397 183.337253 181.191081 183.694077 264.59672   100
      mts  77.836588  81.071752  83.860281  81.991919  83.484246 168.22290   100
    david   6.870116   7.404256   7.736445   7.587591   7.836234  11.54349   100

, . , . , .

0

Source: https://habr.com/ru/post/1619010/


All Articles