Union and Intersection of Intervals

I have a group of intervals for different identifiers. For instance:

# Example data: nine intervals belonging to three identifiers (a, b, c).
df <- data.frame(
  id = rep(c("a", "b", "c"), times = c(4, 2, 3)),
  start = c(100, 250, 400, 600, 150, 610, 275, 600, 700),
  end = c(200, 300, 550, 650, 275, 640, 325, 675, 725)
)

The intervals of each identifier do not overlap, but intervals of different identifiers may overlap. The following code plots them:

 # Draw every interval as a horizontal segment, one row per interval,
# coloured by identifier.
# Convert the identifier to a factor explicitly: since R 4.0 data.frame() keeps
# strings as character, so as.numeric(df$id) / levels(df$id) would not work.
id_factor <- factor(df$id)
n_rows <- nrow(df)
# First interval is drawn at the top (y = n_rows), the last one at y = 1.
y_pos <- rev(seq_len(n_rows))
plot(
  range(df[, c("start", "end")]), c(1, n_rows),
  type = "n", xlab = "", ylab = "", yaxt = "n"
)
segments(
  df$start, y_pos, df$end, y_pos,
  col = as.integer(id_factor), lwd = 2
)
legend(
  "bottomleft",
  lwd = 2,
  col = seq_along(levels(id_factor)),
  legend = levels(id_factor)
)

(Image: intervals.) I am looking for two functions: 1. A function that takes the union of these intervals. For the example above, it would return this data.frame:

 # Expected result of the union: four merged ranges covering all identifiers.
union.df <- data.frame(
  id = rep("a,b,c", times = 4),
  start = c(100, 400, 600, 700),
  end = c(325, 550, 675, 725)
)
  2. A function that intersects these intervals, keeping a range only if all identifiers overlap over that range. For the example above, it would return this data.frame:

# Expected result of the intersection: the single range shared by a, b and c.
intersection.df <- data.frame(
  id = "a,b,c",
  start = 610,
  end = 640
)

+6
source share
4 answers

This is a bit awkward, but the idea is to expand the data into a series of opening and closing events and then keep track of how many intervals are open at any position. This assumes that the intervals within each group do not overlap.

 df <- data.frame(
  id = c(rep("a", 4), rep("b", 2), rep("c", 3)),
  start = c(100, 250, 400, 600, 150, 610, 275, 600, 700),
  end = c(200, 300, 550, 650, 275, 640, 325, 675, 725)
)

# Find the ranges that are covered by at least `overlap` intervals at once.
#
# Arguments:
#   start, end  Interval endpoints; must have the same length.
#   group       Identifier of each interval. Only used to derive the default
#               value of `overlap`.
#   overlap     Minimum number of simultaneously open intervals required.
#               Defaults to the number of distinct groups (intersection of all
#               groups); use 1 for the union.
#
# Returns a two-column matrix (range start, range end), one row per range,
# or a 0 x 2 matrix when there are no input intervals.
#
# The count of open intervals only equals the number of overlapping groups if
# intervals within one group do not overlap each other.
sets <- function(start, end, group, overlap = length(unique(group))) {
  stopifnot(length(start) == length(end))
  # data.frame()/aggregate() below fail on zero rows, so answer directly.
  if (length(start) == 0L) {
    return(matrix(numeric(0), nrow = 0L, ncol = 2L))
  }

  # Every interval contributes a +1 event at its start and a -1 at its end.
  dd <- rbind(
    data.frame(pos = start, event = 1),
    data.frame(pos = end, event = -1)
  )
  # Net change per position; an interval ending exactly where another one
  # starts cancels out and therefore does not break a run.
  dd <- aggregate(event ~ pos, dd, sum)
  dd <- dd[order(dd$pos), ]
  # Number of intervals open from this position up to the next one.
  dd$open <- cumsum(dd$event)

  # Runs of consecutive positions with enough open intervals.
  r <- rle(dd$open >= overlap)
  ex <- cumsum(r$lengths)      # index of the last position of each run
  sx <- ex - r$lengths + 1     # index of the first position of each run
  # A qualifying range extends to the position right after its run.
  cbind(dd$pos[sx[r$values]], dd$pos[ex[r$values] + 1])
}

# union: ranges covered by at least one interval
with(df, sets(start, end, id, 1))
#      [,1] [,2]
# [1,]  100  325
# [2,]  400  550
# [3,]  600  675
# [4,]  700  725

# overlap: ranges covered by all three identifiers
with(df, sets(start, end, id, 3))
#      [,1] [,2]
# [1,]  610  640
+2
source

The intervals package solves the union part of the question:

 # library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the following lines fail with a less clear message.
library(intervals)
# Build an Intervals object from the start/end columns and merge overlaps.
idf <- Intervals(df[, c("start", "end")])
as.data.frame(interval_union(idf))

And for the intersection part, depending on how the intervals are defined (i.e. whether their endpoints are open or closed):

 # One Intervals_full object per identifier, with the first (left) endpoint
# of every interval marked as not closed.
idl <- lapply(unique(df$id), function(x) {
  ivl <- as(Intervals(df[df$id == x, 2:3]), "Intervals_full")
  closed(ivl)[, 1] <- FALSE
  ivl
})
# Fold all per-identifier sets together, starting from the first one.
idt <- Reduce(interval_intersection, idl, idl[[1]])
res <- as.data.frame(idt)
res
#    V1  V2
# 1 610 640
+4
source

For the intersection, I would start by counting how many intervals cover each range (in this code, ord.dirs$x marks the beginning of a range and ord.dirs$z is the number of intervals covering it):

 # Every interval becomes a +1 event at its start and a -1 event at its end.
dirs <- data.frame(
  x = c(df$start, df$end),
  y = rep(c(1, -1), each = nrow(df))
)
ord.dirs <- dirs[order(dirs$x), ]
# Running number of open intervals after each event.
ord.dirs$z <- cumsum(ord.dirs$y)
# When several events share a position, keep only the last row, whose z is the
# count after all events at that position. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
ord.dirs <- ord.dirs[!duplicated(ord.dirs$x, fromLast = TRUE), ]
ord.dirs
#      x  y z
# 1  100  1 1
# 5  150  1 2
# 10 200 -1 1
# 2  250  1 2
# 14 275 -1 2
# 11 300 -1 1
# 16 325 -1 0
# 3  400  1 1
# 12 550 -1 0
# 8  600  1 2
# 6  610  1 3
# 15 640 -1 2
# 13 650 -1 1
# 17 675 -1 0
# 9  700  1 1
# 18 725 -1 0

Now you just need to capture the ranges where you have the correct number of intervals (in this case 3):

 # Rows where the number of open intervals equals the number of identifiers.
pos.all <- which(ord.dirs$z == length(unique(df$id)))
# Each such range runs from its own position to the next listed position.
with(ord.dirs, data.frame(start = x[pos.all], end = x[pos.all + 1]))
#   start end
# 1   610 640

Similarly, you can use ord.dirs to capture a union of sets:

 # Rows where no interval is open any more: each one closes a merged range.
zero.pos <- which(ord.dirs$z == 0)
# A merged range starts at the first position, or right after the previous
# closing row; it ends at its own closing row.
with(ord.dirs, data.frame(
  start = x[c(1, head(zero.pos, -1) + 1)],
  end = x[zero.pos]
))
#   start end
# 1   100 325
# 2   400 550
# 3   600 675
# 4   700 725
+1
source

The GenomicRanges package provides functions for merging ranges and finding overlaps:

 library(GenomicRanges)
# The biocLite.R script (previously sourced over plain HTTP) is defunct;
# Bioconductor packages are now installed through the BiocManager package.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# Only install Gviz when it is not available yet.
if (!requireNamespace("Gviz", quietly = TRUE)) {
  BiocManager::install("Gviz")
}
library(Gviz)

Create a GRanges object in which all ranges share the same seqnames (this is important):

 # Same example data as in the question.
df <- data.frame(
  id = rep(c("a", "b", "c"), times = c(4, 2, 3)),
  start = c(100, 250, 400, 600, 150, 610, 275, 600, 700),
  end = c(200, 300, 550, 650, 275, 640, 325, 675, 725)
)
# All ranges get the same seqnames value (1).
gr <- GRanges(
  seqnames = rep(1, nrow(df)),
  ranges = IRanges(start = df$start, end = df$end)
)

Now you can plot the ranges with the Gviz package.

 # Axis track plus one annotation track, grouped and filled by identifier.
d0 <- GenomeAxisTrack()
d1 <- AnnotationTrack(
  gr,
  group = df$id,
  fill = df$id
)
plotTracks(c(d0, d1))

The union is computed with reduce(), which merges overlapping intervals:

 # Merge overlapping ranges, then keep only the start and end columns
# (columns 2 and 3 of the resulting data frame).
merged <- as.data.frame(reduce(gr))
merged[, c("start", "end")]

The intersection is done with findOverlaps(); the result is then filtered to the ranges that lie within 3 ranges (one per identifier).

 # For every range, find the ranges (itself included) that it lies within.
OL <- as.data.frame(findOverlaps(gr, type = "within"))
# Number of containing ranges per query range, computed once and reused.
hit_counts <- table(OL[, 1])
hit_counts
# Keep the ranges contained in as many ranges as there are identifiers
# (3 in this example) instead of hard-coding the count.
# NOTE(review): this selects rows of df, so it can only report an intersection
# that coincides with one of the input intervals.
n_ids <- length(unique(df$id))
df[as.numeric(names(which(hit_counts == n_ids))), ]
+1
source

Source: https://habr.com/ru/post/986669/


All Articles