Effective coordinate matching between data frames in R with sapply

I am trying to get a vector telling me which lines in the data.frame file (transcriptcoords)

              chr  start  end
NONHSAT000001 chr1 11868 14409
NONHSAT000002 chr1 11871 14412
NONHSAT000003 chr1 11873 14409
NONHSAT000004 chr1 12009 13670
NONHSAT000005 chr1 14777 16668
NONHSAT000006 chr1 15602 29370

have start / end coordinates weakly contained (with a tolerance of +/- 10) in another data file. (genecoords)

              chr  start  end
NONHSAG000001 chr1 11869 14412
NONHSAG000002 chr1 14778 29370
NONHSAG000003 chr1 29554 31109
NONHSAG000004 chr1 34554 36081
NONHSAG000005 chr1 36273 50281
NONHSAG000006 chr1 62948 63887

To do this, I cycle through the tiers of the rows of the first data.frame, matching the coordinates with any row in the second data.frame. I have a solution (described below), but it seems pretty slow (about six seconds with a slice of 2000 lines):

   user  system elapsed 
   6.02    0.00    6.04

I am trying to figure out which parts of sapply can be optimized. Is this an if / else block? Or comparison lines (==, <=,> =)? Or, simply put, is this essentially a slow algorithm?

Thanks! The generated code below:

load(url("http://www.giorgilab.org/stuff/data.rda"))

# Pre-vectorize the data frames
g0<-rownames(genecoords)
g1<-genecoords[,1]
g2<-as.integer(genecoords[,2])
g3<-as.integer(genecoords[,3])

t0<-rownames(transcriptcoords)
t1<-transcriptcoords[,1]
t2<-as.integer(transcriptcoords[,2])
t3<-as.integer(transcriptcoords[,3])

system.time(gs<-sapply(1:2000,function(i){
            t<-t0[i]
            chr<-t1[i]
            start<-t2[i]
            end<-t3[i]

            # Find a match (loose boundaries +/- 10)
            right1<-which(g1==chr)
            right2<-which(g2<=start+10)
            right3<-which(g3>=end-10)
            right<-intersect(right3,intersect(right1,right2))
            right<-g0[right]

            if(length(right)==1){
                g<-right
            } else if(length(right)>1){
                # Get the smallest match
                heregenecoords<-genecoords[right,]
                size<-apply(heregenecoords,1,function(x){abs(as.numeric(x[3])-as.numeric(x[2]))})
                g<-names(which.min(size))
            } else {
                g<-t
            }
            return(g)           
        }
))
+4
3

tx0 <- read.table(textConnection("chr  start  end
NONHSAT000001 chr1 11868 14409
NONHSAT000002 chr1 11871 14412
NONHSAT000003 chr1 11873 14409
NONHSAT000004 chr1 12009 13670
NONHSAT000005 chr1 14777 16668
NONHSAT000006 chr1 15602 29370"))

gene0 <- read.table(textConnection("chr  start  end
NONHSAG000001 chr1 11869 14412
NONHSAG000002 chr1 14778 29370
NONHSAG000003 chr1 29554 31109
NONHSAG000004 chr1 34554 36081
NONHSAG000005 chr1 36273 50281
NONHSAG000006 chr1 62948 63887"))

GenomicRanges Bioconductor ( ).

library(GenomicRanges)
tx <- with(tx0, GRanges(chr, IRanges(start, end)))
gene <- with(gene0, GRanges(chr, IRanges(start, end)))

## increase width by 10 on both sides of the center of the gene range
gene <- resize(gene, width=width(gene) + 20, fix="center")
## find overlaps of 'query' tx and 'subject' gene, where query is within subject
olaps <- findOverlaps(tx, gene, type="within")

, , "" (tx) 1, 2, 3 4 "" () 1.

> findOverlaps(tx, gene, type="within")
Hits of length 6
queryLength: 6
subjectLength: 6
  queryHits subjectHits 
   <integer>   <integer> 
 1         1           1 
 2         2           1 
 3         3           1 
 4         4           1 
 5         5           2 
 6         6           2 

1 4 , 2 2.

> table(subjectHits(olaps))

1 2 
4 2 

. . :

tx <- with(transcriptcoords, GRanges(V1, IRanges(V2, V3, names=rownames(tx0))))
gene <- with(genecoords, GRanges(V1, IRanges(V2, V3, names=rownames(gene0))))

:

system.time(gene <- resize(gene, width=width(gene) + 20, fix="center"))
##   user  system elapsed 
##  0.056   0.000   0.057 
system.time(findOverlaps(tx, gene, type="within"))
##   user  system elapsed 
##  2.248   0.000   2.250 

, data.table @danas.zuokos 1000

system.time({
    dt <- genecoords[transcriptcoords, allow.cartesian = TRUE]; 
    res <- dt[start <= start.1 + tol & end >= end.1 - tol, 
         list(gene = gene[which.min(size)]), by = transcript]
})
##    user  system elapsed 
##   2.148   0.244   2.400 
+2

! . , . Martin, .

, , .

:

gs = 1:10*500
genes = data.frame(start=gs, end=gs+400)
rownames(genes) = sprintf('g%05d', 1:nrow(genes))

ts = sample(1:max(genes$end), size=10)
transcripts = data.frame(start=ts, end=ts+60)
rownames(transcripts) = sprintf('t%05d', 1:nrow(transcripts))

, outer, .

overlaps = function(genes, transcripts, min_overlap=1) {
  b1 = outer(genes$end, transcripts$start, min_overlap=min_overlap, 
             function(e,s,min_overlap) e-s+1>min_overlap)
  b2 = outer(genes$start, transcripts$end, min_overlap=min_overlap,
             function(s,e,min_overlap) e-s+1>min_overlap)
  result = b1 & b2
  rownames(result) = rownames(genes)
  colnames(result) = rownames(transcripts)
  return(result)
}

- :

> genes
       start  end
g00001   500  900
g00002  1000 1400
g00003  1500 1900
g00004  2000 2400
g00005  2500 2900
g00006  3000 3400
g00007  3500 3900
g00008  4000 4400
g00009  4500 4900
g00010  5000 5400

> transcripts
       start  end
t00001   535  595
t00002  2502 2562
t00003  4757 4817
t00004  3570 3630
t00005  3094 3154
t00006  1645 1705
t00007  5202 5262
t00008    13   73
t00009   788  848
t00010  4047 4107

o1 = overlaps(genes, transcripts, 1)

, , .

> o1
       t00001 t00002 t00003 t00004 t00005 t00006 t00007 t00008 t00009 t00010
g00001   TRUE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE   TRUE  FALSE
g00002  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE
g00003  FALSE  FALSE  FALSE  FALSE  FALSE   TRUE  FALSE  FALSE  FALSE  FALSE
g00004  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE
g00005  FALSE   TRUE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE
g00006  FALSE  FALSE  FALSE  FALSE   TRUE  FALSE  FALSE  FALSE  FALSE  FALSE
g00007  FALSE  FALSE  FALSE   TRUE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE
g00008  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE   TRUE
g00009  FALSE  FALSE   TRUE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE
g00010  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE   TRUE  FALSE  FALSE  FALSE
+1

data.table.

rm(list = ls())
load(url("http://www.giorgilab.org/stuff/data.rda"))
library(data.table)
tol <- 10 # tolerance
id <- 1:2000 # you can comment this out, but make sure you have big RAM

data.table. ( , abs, , ?).

genecoords <- data.table(genecoords, keep.rownames = TRUE)
setnames(genecoords, c("gene", "chr", "start", "end"))
genecoords[, size := end - start]
transcriptcoords <- data.table(transcriptcoords, keep.rownames = TRUE)[id] # comment out [id] when running on all trascripts
setnames(transcriptcoords, c("transcript", "chr", "start", "end"))

.

setkeyv(genecoords, "chr")
setkeyv(transcriptcoords, "chr")
dt <- genecoords[transcriptcoords, allow.cartesian = TRUE]
res <- dt[start <= start.1 + tol & end >= end.1 - tol, list(gene = gene[which.min(size)]), by = transcript]

, , (g<-t ).

+1
source

Source: https://habr.com/ru/post/1524634/


All Articles