Overlap / interval joins. #528. TODO: add tests.

Rdatatable · Sep 2, 2014 · cb6cace · cb6cace
1 parent b9e632d
commit cb6cace
Show file tree

Hide file tree

Showing 5 changed files with 919 additions and 0 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -15,6 +15,7 @@ export(setNumericRounding, getNumericRounding)
 export(chmatch, "%chin%", chorder, chgroup)
 export(rbindlist)
 export(fread)
+export(foverlaps)
 export(address)
 export(.SD,.N,.I,.GRP,.BY)
 

diff --git a/R/foverlaps.R b/R/foverlaps.R
@@ -0,0 +1,144 @@
+foverlaps <- function(x, y, by.x = NULL, by.y = by.x, maxgap=0L, minoverlap=1L, type=c("any", "within", "start", "end", "equal"), mult=c("all", "first", "last"), nomatch=getOption("datatable.nomatch"), which = FALSE,  verbose=getOption("datatable.verbose")) {
+
+    if (!is.data.table(y) || !is.data.table(x)) stop("y and x must both be data.tables. Use `setDT()` to convert list/data.frames to data.tables by reference or as.data.table() to convert to data.tables by copying.")
+    maxgap = as.integer(maxgap); minoverlap = as.integer(minoverlap); 
+    which = as.logical(which); nomatch = as.integer(nomatch); 
+    if (!length(maxgap) || length(maxgap) != 1L || is.na(maxgap) || maxgap < 0L)
+        stop("maxgap must be a non-negative integer value of length 1")    
+    if (!length(minoverlap) || length(minoverlap) != 1L || is.na(minoverlap) || minoverlap < 1L)
+        stop("minoverlap must be a positive integer value of length 1")
+    if (!length(which) || length(which) != 1L || is.na(which))
+        stop("which must be a logical vector of length 1. Either TRUE/FALSE")
+    if (!length(nomatch) || length(nomatch) != 1L || (!is.na(nomatch) && nomatch!=0L))
+        stop("nomatch must either be NA or 0, or (ideally) NA_integer_ or 0L")
+    type = match.arg(type)
+    mult = match.arg(mult)
+    if (type == "equal")
+        stop("type = 'equal' is not implemented yet. But note that this is just the same as a normal data.table join y[x, ...], unless you are also interested in setting 'minoverlap / maxgap' arguments. But those arguments are not implemented yet as well.")
+    if (maxgap > 0L || minoverlap > 1L)
+        stop("maxgap and minoverlap arguments are not yet implemented.")
+    if (length(by.x) < 2L || length(by.y) < 2L)
+        stop("'by.x' and 'by.y' should contain at least two column names (or numbers) each - corresponding to 'start' and 'end' points of intervals. Please see ?foverlaps and examples for more info.")
+    if (is.numeric(by.x)) {
+        if (any(by.x < 0L) || any(by.x > length(x)))
+            stop("Invalid numeric value for 'by.x'; it should be a vector with values 1 <= by.x <= length(x)")
+        by.x = names(x)[by.x]
+    }
+    if (is.numeric(by.y)) {
+        if (any(by.y < 0L) || any(by.y > length(y)))
+            stop("Invalid numeric value for 'by.x'; it should be a vector with values 1 <= by.y <= length(y)")
+        by.y = names(y)[by.y]
+    }
+    if (!length(by.x) || !is.character(by.x))
+        stop("A non-empty vector of column names is required for by.x")
+    if (!length(by.y) || !is.character(by.y))
+        stop("A non-empty vector of column names is required for by.y")
+    if (any(is.na(chmatch(by.x, names(x)))))
+        stop("Elements listed in 'by.x' must be valid names in data.table 'x'")
+    if (any(is.na(chmatch(by.y, names(y)))))
+        stop("Elements listed in 'by.y' must be valid names in data.table 'y'")
+    if (anyDuplicated(by.x) || anyDuplicated(by.y))
+        stop("Duplicate columns are not allowed in overlap joins. This may change in the future.")
+    if (length(by.x) != length(by.y))
+        stop("length(by.x) != length(by.y). Columns specified in by.x should correspond to columns specified in by.y and should be of same lengths.")
+
+    xnames = by.x; xintervals = tail(xnames, 2L);
+    ynames = by.y; yintervals = tail(ynames, 2L);
+    if (!storage.mode(x[[xintervals[1L]]]) %chin% c("double", "integer") || !storage.mode(x[[xintervals[2L]]]) %chin% c("double", "integer"))
+        stop("The last two columns in by.x should correspond to the 'start' and 'end' intervals in data.table 'x' and must be integer/numeric type.") 
+    if ( any(x[[xintervals[2L]]] - x[[xintervals[1L]]] < 0L) )
+        stop("All entries in column ", xintervals[1L], " should be <= corresponding entries in column ", xintervals[2L], " in data.table 'x'")
+    if (!storage.mode(y[[yintervals[1L]]]) %chin% c("double", "integer") || !storage.mode(y[[yintervals[2L]]]) %chin% c("double", "integer"))
+        stop("The last two columns in by.y should correspond to the 'start' and 'end' intervals in data.table 'y' and must be integer/numeric type.") 
+    if ( any(y[[yintervals[2L]]] - y[[yintervals[1L]]] < 0L) )
+        stop("All entries in column ", yintervals[1L], " should be <= corresponding entries in column ", yintervals[2L], " in data.table 'y'")
+
+    ## hopefully all checks are over. Now onto the actual task at hand.
+    shallow <- function(x, cols) {
+        xx = as.list(x)[cols]
+        setDT(xx)
+    }
+    origx = x; x = shallow(x, by.x)
+    origy = y; y = shallow(y, by.y)
+    if (identical(by.x, key(origx)[seq_along(by.x)]))
+        setattr(x, 'sorted', by.x)
+    if (identical(by.y, key(origy)[seq_along(by.y)]))
+        setattr(y, 'sorted', by.y)
+    roll = switch(type, start=, end=, equal= FALSE, 
+                    any=, within= TRUE)
+    make_call = function(names, fun=NULL) {
+        if (is.character(names))
+            names = lapply(names, as.name)
+        call = c(substitute(fun, list(fun=fun)), names)
+        if (!is.null(fun)) as.call(call) else call
+    }
+    construct = function(icols, mcols, type=type) {
+        icall = make_call(icols)
+        setattr(icall, 'names', icols)
+        mcall = make_call(mcols, quote(c))
+        if (type %chin% c("within", "any")) {
+            mcall[[3L]] = substitute(val+1L, 
+                list(val = mcall[[3L]]))
+        }
+        make_call(c(icall, pos=mcall), quote(list))
+    }
+    uycols = switch(type, start = yintervals[1L], 
+                end = yintervals[2L], any =, 
+                within =, equal = yintervals)
+    call = construct(head(ynames, -2L), uycols, type)
+    if (verbose) {last.started.at=proc.time()[3];cat("unique() + setkey() operations done in ...");flush.console()}
+    uy = unique(y[, eval(call)])
+    setkey(uy)[, `:=`(lookup = list(list(integer(0))), type_lookup = list(list(integer(0))), count=0L, type_count=0L)]
+    if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console}
+    matches <- function(ii, xx, del, ...) {
+        cols = setdiff(names(xx), del)
+        xx = shallow(xx, cols)
+        ii[xx, which=TRUE, ...]
+    }
+    indices = function(x, y, intervals, ...) {
+        if (type == "start") {
+            sidx = eidx = matches(x, y, intervals[2L], ...) ## TODO: eidx can be set to integer(0)
+        } else if (type == "end") {
+            eidx = sidx = matches(x, y, intervals[1L], ...) ## TODO: sidx can be set to integer(0)
+        } else {
+            rollends = ifelse(type == "any", TRUE, FALSE)
+            sidx = matches(x, y, intervals[2L], rollends=rollends, ...)
+            eidx = matches(x, y, intervals[1L], ...)
+        }
+        list(sidx, eidx)
+    }
+    .Call(Clookup, uy, nrow(y), indices(uy, y, yintervals, roll=roll), maxgap, minoverlap, mult, type, verbose)
+    if (maxgap == 0L && minoverlap == 1L) {
+        iintervals = tail(names(x), 2L)
+        if (verbose) {last.started.at=proc.time()[3];cat("binary search(es) done in ...");flush.console()}
+        xmatches = indices(uy, x, xintervals, nomatch=0L, roll=roll)
+        if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console}
+        olaps = .Call(Coverlaps, uy, xmatches, mult, type, nomatch, verbose)
+    } else if (maxgap == 0L && minoverlap > 1L) {
+        stop("Not yet implemented")
+    } else if (maxgap > 0L && minoverlap == 1L) {
+        stop("Not yet implemented")
+    } else if (maxgap > 0L && minoverlap > 1L) {
+        if (maxgap > minoverlap)
+            warning("maxgap > minoverlap. maxgap will have no effect here.")
+        stop("Not yet implemented")
+    }
+    setDT(olaps)
+    setnames(olaps, c("xid", "yid"))
+    # if (type == "any") setorder(olaps) # at times the combine operation may not result in sorted order
+    if (which) {
+        if (mult %chin% c("first", "last"))
+            return (olaps$yid)
+        else if (!is.na(nomatch))
+            return (olaps[yid > 0L])
+        else return (olaps)
+    } else {
+        if (!is.na(nomatch))
+            olaps = olaps[yid > 0L]
+        tmp = origy[olaps$yid, !c(by.y), with=FALSE]
+        if (!identical(tmp, null.data.table()))
+            ans = cbind(origx[olaps$xid], tmp)
+        else ans = origx[olaps$xid]
+        return (ans)
+    }
+}
diff --git a/man/foverlaps.Rd b/man/foverlaps.Rd
@@ -0,0 +1,79 @@
+\name{foverlaps}
+\alias{foverlaps}
+\title{Fast overlap joins}
+\description{
+  A \emph{fast} binary-search based \emph{overlap join} of two \code{data.table}s. This is very much inspired by \code{findOverlaps} function from the bioconductor package \code{IRanges} (see link below under \code{See Also}).
+
+  Usually, \code{x} is a very large data.table with small interval ranges, and \code{y} is much smaller \code{data.table} with relatively larger interval spans. For an usage in \code{genomics}, please look at the examples section.
+
+  NOTE: This is still under development, meaning it's stable, but some features are yet to be implemented. Also, some arguments and/or the function name itself could be changed.
+}
+
+\usage{
+foverlaps(x, y, by.x = NULL, by.y = by.x, 
+    maxgap = 0L, minoverlap = 1L, 
+    type = c("any", "within", "start", "end", "equal"), 
+    mult = c("all", "first", "last"), 
+    nomatch = getOption("datatable.nomatch"), 
+    which = FALSE, verbose = getOption("datatable.verbose"))
+}
+\arguments{
+  \item{x, y}{ \code{data.table}s. \code{x} and \code{y} need not be sorted and therefore setting keys are not an absolute requirement.}
+  \item{by.x, by.y}{A vector of column names (or numbers) to compute the overlap joins. The last two columns in both \code{by.x} and \code{by.y} should each correspond to the \code{start} and \code{end} interval columns in \code{x} and \code{y} respectively. And the \code{start} column should always be <= \code{end} column. If \code{by.y} is not specified, it is by default equal to \code{by.x}. }
+  \item{maxgap}{It should be a non-negative integer value >= 0. Default is 0 (no gap). For intervals \code{[a,b]} and \code{[c,d]}, where \code{a<=b} and \code{c<=d}, when \code{c > b} or \code{d < a}, the two intervals don't overlap. If the gap between these two intervals is \code{<= maxgap}, these two intervals are considered as overlapping. Note: This is not yet implemented.}
+  \item{minoverlap}{ It should be a positive integer value > 0. Default is 1. For intervals \code{[a,b]} and \code{[c,d]}, where \code{a<=b} and \code{c<=d}, when \code{c<=b} and \code{d>=a}, the two intervals overlap. If the length of overlap between these two intervals is \code{>= minoverlap}, then these two intervals are considered to be overlapping. Note: This is not yet implemented.}
+  \item{type}{ Default value is \code{any}. Allowed values are \code{any}, \code{within}, \code{start}, \code{end} and \code{equal}. Note: \code{equal} is not yet implemented. But this is just a normal join of the type \code{y[x, ...]}, unless you require also using \code{maxgap} and \code{minoverlap} arguments. 
+
+      The types shown here are identical in functionality to the function \code{findOverlaps} in the bioconductor package \code{IRanges}. Let \code{[a,b]} and \code{[c,d]} be intervals in \code{x} and \code{y} with \code{a<=b} and \code{c<=d}. For \code{type="start"}, the intervals overlap iff \code{a == c}. For \code{type="end"}, the intervals overlap iff \code{b == d}. For \code{type="within"}, the intervals overlap iff \code{a>=c and b<=d}. For \code{type="equal"}, the intervals overlap iff \code{a==c and b==d}. For \code{type="any"}, as long as \code{c<=b and d<=a}, they overlap. In addition to these requirments, they also have to satisfy the \code{minoverlap} argument as explained above.
+
+      NB: \code{maxgap} argument, when > 0, is to be interpreted according to the type of the overlap. This will be updated once \code{maxgap} is implemented.}
+  \item{mult}{ When multiple rows in \code{y} match to the row in \code{x}, \code{mult=.} controls which values are returned - \code{"all"} (default), \code{first} or \code{"last"}.}
+  \item{nomatch}{ Same as \code{nomatch} in \code{\link{match}}. When a row (with interval say, \code{[a,b]}) in \code{x} has no match in \code{y}, \code{nomatch=NA} (default) means \code{NA} is returned for \code{y}'s non-\code{by.y} columns for that row of \code{x}. \code{0} means no rows will be returned for that row of \code{x}. The default value (used when \code{nomatch} is not supplied) can be changed from \code{NA} to \code{0} using \code{options(datatable.nomatch=0)}. }
+  \item{which}{ When \code{TRUE}, if \code{mult="all"} returns a two column \code{data.table} with the first column corresponding to \code{x}'s row number and the second corresponding to \code{y}'s. when \code{nomatch=NA}, no matches return \code{NA} for \code{y}, and if \code{nomatch=0}, those rows where no match is found will be skipped; if \code{mult="first" or "last"}, a vector of length equal to the number of rows in \code{x} is returned, with no-match entries filled with \code{NA} or \code{0} corresponding to the \code{nomatch} argument. Default is \code{FALSE}, which returns a join with the rows in \code{y}.}
+  \item{verbose}{ \code{TRUE} turns on status and information messages to the console. Turn this on by default using \code{options(datatable.verbose=TRUE)}. The quantity and types of verbosity may be expanded in future.}
+}
+\details{
+Very briefly, \code{foverlaps()} collapses the two-column interval in \code{y} to one-column of \emph{unique} values to generate a \code{lookup} table, and then performs the join depending on the type of \code{overlap}, using the already available \code{binary search} feature of \code{data.table}. The time (and space) required to generate the \code{lookup} is therefore proportional to the number of unique values present in the interval columns of \code{x} when combined together. 
+
+Overlap joins works with or without setting keys on \code{data.table}s. Although note that the rows in \code{y} (that match \code{x}) will be returned in sorted order. The columns in \code{by.x} argument should correspond to the columns specified in \code{by.y}. The last two columns should be the \emph{interval} columns in both \code{by.x} and \code{by.y}. The first interval column in \code{by.x} should always be <= the second interval column in \code{by.x}; similarly in \code{by.y}. The \code{\link{storage.mode}} of the interval columns must be either \code{double} or \code{integer}. It therefore works with \code{bit64::integer64} type as well.
+
+The \code{lookup} generation step is the most time consuming step. But as long as the number of unique values are not too large (ex: in the order of millions), it should be incredibly fast. The algorithm is developed under the consideration that \code{y} will not have too many unique values in most scenarios.
+
+Note that, a \code{range join} is a special case of \code{overlap join} (or interval join) where the \code{start} and \code{end} intervals for \code{data.table x} are exactly the same. 
+
+NB: \code{type="equal"} is not yet implemented, but note that it's just a normal join as long as \code{maxgap} and \code{minoverlap} arguments are not changed from their default values. And so are \code{maxgap} and \code{minoverlap} arguments. We hope to implement this by the next release. 
+}
+\value{
+    A new \code{data.table} by joining over the interval columns (along with other additional identifier columns) specified in \code{by.x} and \code{by.y}. 
+
+    NB: When \code{which=TRUE}: \code{a)} \code{mult="first" or "last"} returns a \code{vector} of matching row numbers in \code{y}, and \code{b)} when \code{mult="all"} returns a data.table with two columns with the first containing row numbers of \code{x} and the second column with corresponding row numbers of \code{y}.
+
+    \code{nomatch=NA or 0} also influences whether non-matching rows are returned or not, as explained above.
+}
+
+\examples{
+require(data.table)
+## simple example:
+x = data.table(start=c(5,31,22,16), end=c(8,50,25,18), val2 = 7:10)
+y = data.table(start=c(10, 20, 30), end=c(15, 35, 45), val1 = 1:3)
+foverlaps(x, y, by.x=c("start", "end"), type="any", which=TRUE) ## return overlap indices
+foverlaps(x, y, by.x=c("start", "end"), type="any") ## return overlap join
+foverlaps(x, y, by.x=c("start", "end"), type="any", mult="first") ## returns only first match
+foverlaps(x, y, by.x=c("start", "end"), type="within") ## matches iff 'x' is fully contained in 'y'
+
+## with extra identifiers, a Genomics example
+x = data.table(chr=c("Chr1", "Chr1", "Chr2", "Chr2", "Chr2"), start=c(5,10, 1, 25, 50), end=c(11,20,4,52,60))
+y = data.table(chr=c("Chr1", "Chr1", "Chr2"), start=c(1, 15,1), end=c(4, 18, 55), val=1:3)
+foverlaps(x, y, by.x=1:3, type="any", which=TRUE)
+foverlaps(x, y, by.x=1:3, type="any")
+foverlaps(x, y, by.x=1:3, type="any", nomatch=0L)
+foverlaps(x, y, by.x=1:3, type="within", which=TRUE)
+foverlaps(x, y, by.x=1:3, type="within")
+foverlaps(x, y, by.x=1:3, type="start")
+
+}
+\seealso{
+  \code{\link{data.table}}, \url{https://r-forge.r-project.org/projects/datatable/} \url{http://www.bioconductor.org/packages/release/bioc/html/IRanges.html}
+}
+\keyword{ data }
+