Find a list of values ​​in a list of ranges in R

I have two data frames:

set.seed(123) myData<-data.frame(id=1:10, pos=21:30) refData<-data.frame(id=letters[1:15], pos=sample(10:40,15))

looks like that

> myData 
  id1  pos1
  1    21
  2    22
  3    23
  4    24
  5    25
  6    26
  7    27
  8    28
  9    29
 10    30
> refData
  id2  pos2
  a    18
  b    33
  c    21
  d    34
  e    35
  f    11
  g    23
  h    31
  i    22
  j    20
  k    30
  l    19
  m    32
  n    39
  o    36

I want an extended myData data frame. For each row in myData, I want to check if there is an entry in refData with a distance of less than 2 numbers, and if so, I need the refData identifiers inserted in the new myData column. At the end, my new data frame should look like this:

 id1 pos1     newColumn
   1   21 c, g, i, j, l
   2   22    c, g, i, j
   3   23       c, g, i
   4   24          g, i
   5   25             g
   6   26              
   7   27              
   8   28             k
   9   29          h, k
  10   30       h, k, m

Obviously, I could do this with the following loop, which works just fine:

myData$newColumn<-rep(NA, nrow(myData))
for(i in 1:nrow(myData)){
  ww<-which(abs(refData$pos2 - myData$pos1[i]) <=  2)
  myData$newColumn[i]<-paste(refData[ww,1],collapse=", ")
}

But I'm looking for a really quick way to do this, since my real data has about 10 ^ 6 records, and my real refData has about 10 ^ 7 records.

I really appreciate any help and ideas for a quick way to do this!

+4
source share
3 answers

You can try:

myData$newColumn = lapply(myData$pos, 
                 function(x) {paste(refData$id[abs(refData$pos-x)<3],collapse=', ')})

Conclusion:

   id pos     newColumn
1   1  21 c, g, i, j, l
2   2  22    c, g, i, j
3   3  23       c, g, i
4   4  24          g, i
5   5  25             g
6   6  26              
7   7  27              
8   8  28             k
9   9  29          h, k
10 10  30       h, k, m

, !

+3

:

myData$newColumn <- sapply(myData$pos, function(x) paste(refData$id[refData$pos >= x-2 & refData$pos <= x+2], collapse = ", "))

n = 1000 @Florian :

set.seed(123)
myData<-data.frame(id=1:1000, pos=sample(21:30, 1000, replace = T))
refData<-data.frame(id=sample(letters[1:15], 1000, replace = T), pos=sample(10:40, 1000, replace = T))

myData$newColumn<-rep(NA, nrow(myData))

library(microbenchmark)
microbenchmark(for(i in 1:nrow(myData)){
  ww<-which(abs(refData$pos - myData$pos[i]) <=  2)
  myData$newColumn[i]<-paste(refData[ww, "id"],collapse=", ")
},
myData$newColumn2 <- sapply(myData$pos, function(x) paste(refData$id[refData$pos >= x-2 & refData$pos <= x+2], collapse = ", ")),
myData$newColumn3 <- lapply(myData$pos, function(x) paste(refData$id[abs(refData$pos - x) <  3], collapse = ", ")))

Unit: milliseconds

    expr
 for (i in 1:nrow(myData)) {     ww <- which(abs(refData$pos - myData$pos[i]) <= 2)     myData$newColumn[i] <- paste(refData[ww, "id"], collapse = ", ") }
                 myData$newColumn2 <- sapply(myData$pos, function(x) paste(refData$id[refData$pos >=      x - 2 & refData$pos <= x + 2], collapse = ", "))
                                    myData$newColumn3 <- lapply(myData$pos, function(x) paste(refData$id[abs(refData$pos -      x) < 3], collapse = ", "))
      min       lq     mean   median       uq       max neval cld
 62.97657 64.74155 70.01541 68.81024 71.02023 206.80477   100   c
 46.55872 47.90585 50.75397 50.42333 53.42990  58.01813   100  b 
 36.69362 37.34244 39.70480 38.54905 42.49614  46.27513   100 a  
+2

: 1) nrow(myData) * nrow(refData) 2) refData$id.

, ( myData$pos /) , findInterval , refData$pos myData$pos +/- ( 2). , nrow(refData) * log(nrow(myData)) , , .

:

a = myData$pos
b = refData$pos 

a + 2, b :

i = findInterval(b, a + 2L, all.inside = TRUE, left.open = TRUE)
#> i
# [1] 1 9 1 9 9 1 1 8 1 1 7 1 9 9 9

(lower, upper] 1:(length(a) - 1), , b 2 a:

i1 = ifelse(abs(b - a[i + 1L]) <= 2, i + 1L, NA)
i2 = ifelse(abs(b - a[i]) <= 2, i, NA)
ii = pmin(i1, i2, na.rm = TRUE)
#> ii
# [1] NA NA  1 NA NA NA  1  9  1  1  8  1 10 NA NA

([lower, upper)) a - 2, b , a, b - 2 :

j = findInterval(b, a - 2L, all.inside = TRUE, left.open = FALSE)
j1 = ifelse(abs(b - a[j + 1L]) <= 2, j + 1L, NA)
j2 = ifelse(abs(b - a[j]) <= 2, j, NA)
jj = pmax(j1, j2, na.rm = TRUE)
#> jj
# [1] NA NA  3 NA NA NA  5 10  4  2 10  1 10 NA NA

(ii) (jj) myData$pos (a), refData$pos (b) +/- 2 ( ).

, , .

However, to continue representing concatenated refData$ids as matches , we could probably use the package IRangesfrom here to hope for something effective:

library(IRanges)
nr = 1:nrow(myData)
myrng = IRanges(nr, nr)
refrng = IRanges(ifelse(is.na(ii), 0L, ii), ifelse(is.na(jj), 0L, jj))  ## replace NA with 0
ovrs = findOverlaps(myrng, refrng)
tapply(refData$id[subjectHits(ovrs)], factor(queryHits(ovrs), nr), toString)
#              1               2               3               4               5 
#"c, g, i, j, l"    "c, g, i, j"       "c, g, i"          "g, i"             "g" 
#              6               7               8               9              10 
#             NA              NA             "k"          "h, k"       "h, k, m" 
+1
source

Source: https://habr.com/ru/post/1693912/


All Articles