Skip to content

Commit

Permalink
Overlap / interval joins. #528. TODO: add tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
arunsrinivasan committed Sep 2, 2014
1 parent b9e632d commit cb6cace
Show file tree
Hide file tree
Showing 5 changed files with 919 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ export(setNumericRounding, getNumericRounding)
export(chmatch, "%chin%", chorder, chgroup)
export(rbindlist)
export(fread)
export(foverlaps)
export(address)
export(.SD,.N,.I,.GRP,.BY)

Expand Down
144 changes: 144 additions & 0 deletions R/foverlaps.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
foverlaps <- function(x, y, by.x = NULL, by.y = by.x, maxgap=0L, minoverlap=1L, type=c("any", "within", "start", "end", "equal"), mult=c("all", "first", "last"), nomatch=getOption("datatable.nomatch"), which = FALSE, verbose=getOption("datatable.verbose")) {

if (!is.data.table(y) || !is.data.table(x)) stop("y and x must both be data.tables. Use `setDT()` to convert list/data.frames to data.tables by reference or as.data.table() to convert to data.tables by copying.")
maxgap = as.integer(maxgap); minoverlap = as.integer(minoverlap);
which = as.logical(which); nomatch = as.integer(nomatch);
if (!length(maxgap) || length(maxgap) != 1L || is.na(maxgap) || maxgap < 0L)
stop("maxgap must be a non-negative integer value of length 1")
if (!length(minoverlap) || length(minoverlap) != 1L || is.na(minoverlap) || minoverlap < 1L)
stop("minoverlap must be a positive integer value of length 1")
if (!length(which) || length(which) != 1L || is.na(which))
stop("which must be a logical vector of length 1. Either TRUE/FALSE")
if (!length(nomatch) || length(nomatch) != 1L || (!is.na(nomatch) && nomatch!=0L))
stop("nomatch must either be NA or 0, or (ideally) NA_integer_ or 0L")
type = match.arg(type)
mult = match.arg(mult)
if (type == "equal")
stop("type = 'equal' is not implemented yet. But note that this is just the same as a normal data.table join y[x, ...], unless you are also interested in setting 'minoverlap / maxgap' arguments. But those arguments are not implemented yet as well.")
if (maxgap > 0L || minoverlap > 1L)
stop("maxgap and minoverlap arguments are not yet implemented.")
if (length(by.x) < 2L || length(by.y) < 2L)
stop("'by.x' and 'by.y' should contain at least two column names (or numbers) each - corresponding to 'start' and 'end' points of intervals. Please see ?foverlaps and examples for more info.")
if (is.numeric(by.x)) {
if (any(by.x < 0L) || any(by.x > length(x)))
stop("Invalid numeric value for 'by.x'; it should be a vector with values 1 <= by.x <= length(x)")
by.x = names(x)[by.x]
}
if (is.numeric(by.y)) {
if (any(by.y < 0L) || any(by.y > length(y)))
stop("Invalid numeric value for 'by.x'; it should be a vector with values 1 <= by.y <= length(y)")
by.y = names(y)[by.y]
}
if (!length(by.x) || !is.character(by.x))
stop("A non-empty vector of column names is required for by.x")
if (!length(by.y) || !is.character(by.y))
stop("A non-empty vector of column names is required for by.y")
if (any(is.na(chmatch(by.x, names(x)))))
stop("Elements listed in 'by.x' must be valid names in data.table 'x'")
if (any(is.na(chmatch(by.y, names(y)))))
stop("Elements listed in 'by.y' must be valid names in data.table 'y'")
if (anyDuplicated(by.x) || anyDuplicated(by.y))
stop("Duplicate columns are not allowed in overlap joins. This may change in the future.")
if (length(by.x) != length(by.y))
stop("length(by.x) != length(by.y). Columns specified in by.x should correspond to columns specified in by.y and should be of same lengths.")

xnames = by.x; xintervals = tail(xnames, 2L);
ynames = by.y; yintervals = tail(ynames, 2L);
if (!storage.mode(x[[xintervals[1L]]]) %chin% c("double", "integer") || !storage.mode(x[[xintervals[2L]]]) %chin% c("double", "integer"))
stop("The last two columns in by.x should correspond to the 'start' and 'end' intervals in data.table 'x' and must be integer/numeric type.")
if ( any(x[[xintervals[2L]]] - x[[xintervals[1L]]] < 0L) )
stop("All entries in column ", xintervals[1L], " should be <= corresponding entries in column ", xintervals[2L], " in data.table 'x'")
if (!storage.mode(y[[yintervals[1L]]]) %chin% c("double", "integer") || !storage.mode(y[[yintervals[2L]]]) %chin% c("double", "integer"))
stop("The last two columns in by.y should correspond to the 'start' and 'end' intervals in data.table 'y' and must be integer/numeric type.")
if ( any(y[[yintervals[2L]]] - y[[yintervals[1L]]] < 0L) )
stop("All entries in column ", yintervals[1L], " should be <= corresponding entries in column ", yintervals[2L], " in data.table 'y'")

## hopefully all checks are over. Now onto the actual task at hand.
shallow <- function(x, cols) {
xx = as.list(x)[cols]
setDT(xx)
}
origx = x; x = shallow(x, by.x)
origy = y; y = shallow(y, by.y)
if (identical(by.x, key(origx)[seq_along(by.x)]))
setattr(x, 'sorted', by.x)
if (identical(by.y, key(origy)[seq_along(by.y)]))
setattr(y, 'sorted', by.y)
roll = switch(type, start=, end=, equal= FALSE,
any=, within= TRUE)
make_call = function(names, fun=NULL) {
if (is.character(names))
names = lapply(names, as.name)
call = c(substitute(fun, list(fun=fun)), names)
if (!is.null(fun)) as.call(call) else call
}
construct = function(icols, mcols, type=type) {
icall = make_call(icols)
setattr(icall, 'names', icols)
mcall = make_call(mcols, quote(c))
if (type %chin% c("within", "any")) {
mcall[[3L]] = substitute(val+1L,
list(val = mcall[[3L]]))
}
make_call(c(icall, pos=mcall), quote(list))
}
uycols = switch(type, start = yintervals[1L],
end = yintervals[2L], any =,
within =, equal = yintervals)
call = construct(head(ynames, -2L), uycols, type)
if (verbose) {last.started.at=proc.time()[3];cat("unique() + setkey() operations done in ...");flush.console()}
uy = unique(y[, eval(call)])
setkey(uy)[, `:=`(lookup = list(list(integer(0))), type_lookup = list(list(integer(0))), count=0L, type_count=0L)]
if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console}
matches <- function(ii, xx, del, ...) {
cols = setdiff(names(xx), del)
xx = shallow(xx, cols)
ii[xx, which=TRUE, ...]
}
indices = function(x, y, intervals, ...) {
if (type == "start") {
sidx = eidx = matches(x, y, intervals[2L], ...) ## TODO: eidx can be set to integer(0)
} else if (type == "end") {
eidx = sidx = matches(x, y, intervals[1L], ...) ## TODO: sidx can be set to integer(0)
} else {
rollends = ifelse(type == "any", TRUE, FALSE)
sidx = matches(x, y, intervals[2L], rollends=rollends, ...)
eidx = matches(x, y, intervals[1L], ...)
}
list(sidx, eidx)
}
.Call(Clookup, uy, nrow(y), indices(uy, y, yintervals, roll=roll), maxgap, minoverlap, mult, type, verbose)
if (maxgap == 0L && minoverlap == 1L) {
iintervals = tail(names(x), 2L)
if (verbose) {last.started.at=proc.time()[3];cat("binary search(es) done in ...");flush.console()}
xmatches = indices(uy, x, xintervals, nomatch=0L, roll=roll)
if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console}
olaps = .Call(Coverlaps, uy, xmatches, mult, type, nomatch, verbose)
} else if (maxgap == 0L && minoverlap > 1L) {
stop("Not yet implemented")
} else if (maxgap > 0L && minoverlap == 1L) {
stop("Not yet implemented")
} else if (maxgap > 0L && minoverlap > 1L) {
if (maxgap > minoverlap)
warning("maxgap > minoverlap. maxgap will have no effect here.")
stop("Not yet implemented")
}
setDT(olaps)
setnames(olaps, c("xid", "yid"))
# if (type == "any") setorder(olaps) # at times the combine operation may not result in sorted order
if (which) {
if (mult %chin% c("first", "last"))
return (olaps$yid)
else if (!is.na(nomatch))
return (olaps[yid > 0L])
else return (olaps)
} else {
if (!is.na(nomatch))
olaps = olaps[yid > 0L]
tmp = origy[olaps$yid, !c(by.y), with=FALSE]
if (!identical(tmp, null.data.table()))
ans = cbind(origx[olaps$xid], tmp)
else ans = origx[olaps$xid]
return (ans)
}
}
79 changes: 79 additions & 0 deletions man/foverlaps.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
\name{foverlaps}
\alias{foverlaps}
\title{Fast overlap joins}
\description{
A \emph{fast} binary-search based \emph{overlap join} of two \code{data.table}s. This is very much inspired by \code{findOverlaps} function from the bioconductor package \code{IRanges} (see link below under \code{See Also}).

Usually, \code{x} is a very large data.table with small interval ranges, and \code{y} is much smaller \code{data.table} with relatively larger interval spans. For an usage in \code{genomics}, please look at the examples section.

NOTE: This is still under development, meaning it's stable, but some features are yet to be implemented. Also, some arguments and/or the function name itself could be changed.
}
\usage{
foverlaps(x, y, by.x = NULL, by.y = by.x,
maxgap = 0L, minoverlap = 1L,
type = c("any", "within", "start", "end", "equal"),
mult = c("all", "first", "last"),
nomatch = getOption("datatable.nomatch"),
which = FALSE, verbose = getOption("datatable.verbose"))
}
\arguments{
\item{x, y}{ \code{data.table}s. \code{x} and \code{y} need not be sorted and therefore setting keys are not an absolute requirement.}
\item{by.x, by.y}{A vector of column names (or numbers) to compute the overlap joins. The last two columns in both \code{by.x} and \code{by.y} should each correspond to the \code{start} and \code{end} interval columns in \code{x} and \code{y} respectively. And the \code{start} column should always be <= \code{end} column. If \code{by.y} is not specified, it is by default equal to \code{by.x}. }
\item{maxgap}{It should be a non-negative integer value >= 0. Default is 0 (no gap). For intervals \code{[a,b]} and \code{[c,d]}, where \code{a<=b} and \code{c<=d}, when \code{c > b} or \code{d < a}, the two intervals don't overlap. If the gap between these two intervals is \code{<= maxgap}, these two intervals are considered as overlapping. Note: This is not yet implemented.}
\item{minoverlap}{ It should be a positive integer value > 0. Default is 1. For intervals \code{[a,b]} and \code{[c,d]}, where \code{a<=b} and \code{c<=d}, when \code{c<=b} and \code{d>=a}, the two intervals overlap. If the length of overlap between these two intervals is \code{>= minoverlap}, then these two intervals are considered to be overlapping. Note: This is not yet implemented.}
\item{type}{ Default value is \code{any}. Allowed values are \code{any}, \code{within}, \code{start}, \code{end} and \code{equal}. Note: \code{equal} is not yet implemented. But this is just a normal join of the type \code{y[x, ...]}, unless you require also using \code{maxgap} and \code{minoverlap} arguments.

The types shown here are identical in functionality to the function \code{findOverlaps} in the bioconductor package \code{IRanges}. Let \code{[a,b]} and \code{[c,d]} be intervals in \code{x} and \code{y} with \code{a<=b} and \code{c<=d}. For \code{type="start"}, the intervals overlap iff \code{a == c}. For \code{type="end"}, the intervals overlap iff \code{b == d}. For \code{type="within"}, the intervals overlap iff \code{a>=c and b<=d}. For \code{type="equal"}, the intervals overlap iff \code{a==c and b==d}. For \code{type="any"}, as long as \code{c<=b and d<=a}, they overlap. In addition to these requirments, they also have to satisfy the \code{minoverlap} argument as explained above.

NB: \code{maxgap} argument, when > 0, is to be interpreted according to the type of the overlap. This will be updated once \code{maxgap} is implemented.}
\item{mult}{ When multiple rows in \code{y} match to the row in \code{x}, \code{mult=.} controls which values are returned - \code{"all"} (default), \code{first} or \code{"last"}.}
\item{nomatch}{ Same as \code{nomatch} in \code{\link{match}}. When a row (with interval say, \code{[a,b]}) in \code{x} has no match in \code{y}, \code{nomatch=NA} (default) means \code{NA} is returned for \code{y}'s non-\code{by.y} columns for that row of \code{x}. \code{0} means no rows will be returned for that row of \code{x}. The default value (used when \code{nomatch} is not supplied) can be changed from \code{NA} to \code{0} using \code{options(datatable.nomatch=0)}. }
\item{which}{ When \code{TRUE}, if \code{mult="all"} returns a two column \code{data.table} with the first column corresponding to \code{x}'s row number and the second corresponding to \code{y}'s. when \code{nomatch=NA}, no matches return \code{NA} for \code{y}, and if \code{nomatch=0}, those rows where no match is found will be skipped; if \code{mult="first" or "last"}, a vector of length equal to the number of rows in \code{x} is returned, with no-match entries filled with \code{NA} or \code{0} corresponding to the \code{nomatch} argument. Default is \code{FALSE}, which returns a join with the rows in \code{y}.}
\item{verbose}{ \code{TRUE} turns on status and information messages to the console. Turn this on by default using \code{options(datatable.verbose=TRUE)}. The quantity and types of verbosity may be expanded in future.}
}
\details{
Very briefly, \code{foverlaps()} collapses the two-column interval in \code{y} to one-column of \emph{unique} values to generate a \code{lookup} table, and then performs the join depending on the type of \code{overlap}, using the already available \code{binary search} feature of \code{data.table}. The time (and space) required to generate the \code{lookup} is therefore proportional to the number of unique values present in the interval columns of \code{x} when combined together.
Overlap joins works with or without setting keys on \code{data.table}s. Although note that the rows in \code{y} (that match \code{x}) will be returned in sorted order. The columns in \code{by.x} argument should correspond to the columns specified in \code{by.y}. The last two columns should be the \emph{interval} columns in both \code{by.x} and \code{by.y}. The first interval column in \code{by.x} should always be <= the second interval column in \code{by.x}; similarly in \code{by.y}. The \code{\link{storage.mode}} of the interval columns must be either \code{double} or \code{integer}. It therefore works with \code{bit64::integer64} type as well.
The \code{lookup} generation step is the most time consuming step. But as long as the number of unique values are not too large (ex: in the order of millions), it should be incredibly fast. The algorithm is developed under the consideration that \code{y} will not have too many unique values in most scenarios.
Note that, a \code{range join} is a special case of \code{overlap join} (or interval join) where the \code{start} and \code{end} intervals for \code{data.table x} are exactly the same.
NB: \code{type="equal"} is not yet implemented, but note that it's just a normal join as long as \code{maxgap} and \code{minoverlap} arguments are not changed from their default values. And so are \code{maxgap} and \code{minoverlap} arguments. We hope to implement this by the next release.
}
\value{
A new \code{data.table} by joining over the interval columns (along with other additional identifier columns) specified in \code{by.x} and \code{by.y}.

NB: When \code{which=TRUE}: \code{a)} \code{mult="first" or "last"} returns a \code{vector} of matching row numbers in \code{y}, and \code{b)} when \code{mult="all"} returns a data.table with two columns with the first containing row numbers of \code{x} and the second column with corresponding row numbers of \code{y}.

\code{nomatch=NA or 0} also influences whether non-matching rows are returned or not, as explained above.
}

\examples{
require(data.table)
## simple example:
x = data.table(start=c(5,31,22,16), end=c(8,50,25,18), val2 = 7:10)
y = data.table(start=c(10, 20, 30), end=c(15, 35, 45), val1 = 1:3)
foverlaps(x, y, by.x=c("start", "end"), type="any", which=TRUE) ## return overlap indices
foverlaps(x, y, by.x=c("start", "end"), type="any") ## return overlap join
foverlaps(x, y, by.x=c("start", "end"), type="any", mult="first") ## returns only first match
foverlaps(x, y, by.x=c("start", "end"), type="within") ## matches iff 'x' is fully contained in 'y'

## with extra identifiers, a Genomics example
x = data.table(chr=c("Chr1", "Chr1", "Chr2", "Chr2", "Chr2"), start=c(5,10, 1, 25, 50), end=c(11,20,4,52,60))
y = data.table(chr=c("Chr1", "Chr1", "Chr2"), start=c(1, 15,1), end=c(4, 18, 55), val=1:3)
foverlaps(x, y, by.x=1:3, type="any", which=TRUE)
foverlaps(x, y, by.x=1:3, type="any")
foverlaps(x, y, by.x=1:3, type="any", nomatch=0L)
foverlaps(x, y, by.x=1:3, type="within", which=TRUE)
foverlaps(x, y, by.x=1:3, type="within")
foverlaps(x, y, by.x=1:3, type="start")

}
\seealso{
\code{\link{data.table}}, \url{https://r-forge.r-project.org/projects/datatable/} \url{http://www.bioconductor.org/packages/release/bioc/html/IRanges.html}
}
\keyword{ data }

Loading

0 comments on commit cb6cace

Please sign in to comment.