diff --git a/NAMESPACE b/NAMESPACE index c2c095a1d8..d4c633fb8b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) +export(is.sorted) S3method("[", data.table) S3method("[<-", data.table) diff --git a/NEWS.md b/NEWS.md index 71fd76aa65..ad32b03e2e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -81,6 +81,8 @@ unit = "s") 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. +15. New function `is.sorted` is now exported. It is a fast routine for checking sortedness of data.table type objects or atomic vectors, [#2325](https://github.com/Rdatatable/data.table/issues/2325). Thanks to @franknarf1 for the feature request. For more details about usage see the function manual [`?is.sorted`](https://rdatatable.gitlab.io/data.table/library/data.table/html/is.sorted.html). + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). diff --git a/R/setkey.R b/R/setkey.R index 334ca1e801..71287c46ed 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -155,16 +155,24 @@ setreordervec = function(x, order) .Call(Creorder, x, order) # The others (order, sort.int etc) are turned off to protect ourselves from using them internally, for speed and for # consistency; e.g., consistent twiddling of numeric/integer64, NA at the beginning of integer, locale ordering of character vectors. -is.sorted = function(x, by=seq_along(x)) { +is.sorted = function(x, by=seq_along(x), retOrd=FALSE) { if (is.list(x)) { - warning("Use 'if (length(o <- forderv(DT,by))) ...' 
for efficiency in one step, so you have o as well if not sorted.") + # for efficient use via retOrd argument see note in ?is.sorted # could pass through a flag for forderv to return early on first FALSE. But we don't need that internally # since internally we always then need ordering, an it's better in one step. Don't want inefficiency to creep in. - # This is only here for user/debugging use to check/test valid keys; e.g. data.table:::is.sorted(DT,by) - 0L == length(forderv(x,by,retGrp=FALSE,sort=TRUE)) - } else { + o = forderv(x,by,retGrp=FALSE,sort=TRUE) + ans = 0L == length(o) + if (isTRUE(retOrd)) + ans = setattr(copy(ans), "order", o) + ans + } else if (is.null(x)) { # NULL does not satisfy C isVectorAtomic + NA + } else if (is.atomic(x)) { + if (isTRUE(retOrd)) stop("retOrd works only for data.table/list input") if (!missing(by)) stop("x is vector but 'by' is supplied") .Call(Cfsorted, x) + } else { + stop("'x' argument is of unsupported type") } # Cfsorted could be named CfIsSorted, but since "sorted" is an adjective not verb, it's clear; e.g., Cfsort would sort it ("sort" is verb). # Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7cc6819e8f..fff124e1ea 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -37,7 +37,6 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { guess = data.table:::guess INT = data.table:::INT is_na = data.table:::is_na - is.sorted = data.table:::is.sorted isReallyReal = data.table:::isReallyReal melt.data.table = data.table:::melt.data.table # for test 1953.4 null.data.table = data.table:::null.data.table @@ -68,6 +67,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { # masked by which package? 
# ================================= setattr = data.table::setattr # bit + is.sorted = data.table::is.sorted # bit shift = data.table::shift # IRanges, GenomicRanges between = data.table::between # plm second = data.table::second # S4Vectors @@ -3988,7 +3988,7 @@ y <- copy(x) setreordervec(y, o) test(1161.3, x[o], y) -# tests for setreordervec +# tests for is.sorted DT <- data.table(x=sample(c(NA, -10:10), 2e2, TRUE), y=sample(c(NA, NaN, -Inf, Inf, -10:10), 2e2, TRUE), z=sample(c(NA, letters), 2e2, TRUE)) @@ -4014,8 +4014,24 @@ test(1162.09, length(forderv(DT, by=2:3)), 0L) setkey(DT) # test number 1162.10 skipped because if it fails it confusingly prints out as 1662.1 not 1662.10 test(1162.10, length(forderv(DT, by=1:3)), 0L) -test(1162.11, is.sorted(DT, by=1:3), TRUE, warning="Use.*forderv.*for efficiency in one step, so you have o as well if not sorted") -test(1162.12, is.sorted(DT, by=2:1), FALSE, warning="Use.*forderv.*for efficiency in one step, so you have o as well if not sorted") +test(1162.11, is.sorted(DT, by=1:3), TRUE) +test(1162.12, is.sorted(DT, by=2:1), FALSE) +x = as.data.table(iris) # is.sorted now exported #2325 +test(1162.21, is.sorted(x, by="Species"), TRUE) +ans = is.sorted(x, by="Sepal.Length", retOrd=TRUE) +test(1162.22, identical(ans, FALSE), FALSE) +test(1162.23, isFALSE(ans), TRUE) +test(1162.24, is.integer(attr(ans, "order"))) +o = attr(ans, "order") +test(1162.25, x[o], x[order(Sepal.Length)]) +test(1162.26, is.sorted(x[o], by="Sepal.Length")) +test(1162.31, is.sorted(NULL), NA) +test(1162.32, is.sorted(1:5, retOrd=TRUE), error="retOrd works only for data.table/list input") +test(1162.33, is.sorted(1:5, by="a"), error="but 'by' is supplied") +test(1162.34, is.sorted(expression(1:5)), error="unsupported type") +test(1162.35, is.sorted(as.raw(1:5)), error="is not yet supported") +test(1162.36, is.sorted(c("a",NA,"c")), FALSE) +test(1162.37, is.sorted(c("a","fa\xE7ile","b")), FALSE) # FR #351 - last on length=0 arguments x <- character(0) 
diff --git a/man/is.sorted.Rd b/man/is.sorted.Rd new file mode 100644 index 0000000000..18a391dd4c --- /dev/null +++ b/man/is.sorted.Rd @@ -0,0 +1,50 @@ +\name{is.sorted} +\alias{is.sorted} +\title{ Checks if input is sorted } +\description{ + Checks if input is sorted. +} +\usage{ + is.sorted(x, by=seq_along(x), retOrd=FALSE) +} +\arguments{ + \item{x}{ data.table type object or atomic vector. } + \item{by}{ data.table columns used to check if \code{x} is sorted by those columns. } + \item{retOrd}{ logical, when \code{TRUE} it will set an attribute \code{"order"} on the returned value, providing an order of \code{x}. Works only for data.table type \code{x}, not for atomic vector. } +} +\details{ + Checks if the input object is sorted. Can also check by a subset of columns provided in the \code{by} argument. Can also return the order used in the computation when using the \code{retOrd} argument. +} +\note{ + Checking sortedness is an expensive computation, and most commonly the intermediate computation, the order, could be re-used. + For example the following check + +\preformatted{ + if (!is.sorted(DT, by="Sepal.Length")) + DT = DT[order(Sepal.Length)] +} + + could be written as + +\preformatted{ + if (!(s <- is.sorted(DT, by="Sepal.Length", retOrd=TRUE))) + DT = DT[attr(s, "order")] +} + +so the order is computed only once. Of course for performance it is even better to sort in-place using \code{\link{setkey}}. +} +\value{ + Logical scalar, TRUE or FALSE, or if \code{NULL} is provided, then logical \code{NA}. When \code{retOrd} is set to TRUE, the resulting logical scalar will have an attribute \code{"order"}. The attribute will be an integer vector of the same length as the number of rows of \code{x}, or a length 0 integer vector if \code{x} was already sorted. Missing values are ordered first, unlike \code{\link[base]{order}}. Note that a logical scalar with an attribute attached will fail a test with \code{identical}, although it will work fine with \code{isTRUE} and \code{isFALSE}. 
+} +\seealso{ \code{\link{data.table}} } +\examples{ +x = as.data.table(iris) +is.sorted(x, by="Species") + +ans = is.sorted(x, by="Sepal.Length", retOrd=TRUE) +identical(ans, FALSE) +isFALSE(ans) +o = attr(ans, "order") +x[o] +} +\keyword{ data } \ No newline at end of file diff --git a/src/forder.c b/src/forder.c index ea0be76d04..d7f9b82d55 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1257,49 +1257,64 @@ void radix_r(const int from, const int to, const int radix) { } -SEXP fsorted(SEXP x) -{ +SEXP fsorted(SEXP x) { // Just checks if ordered and returns FALSE early if not. Does not return ordering if so, unlike forder. // Always increasing order with NA's first // Similar to base:is.unsorted but accepts NA at the beginning (standard in data.table and considered sorted) rather than returning NA when NA present. // TODO: test in big steps first to return faster if unsortedness is at the end (a common case of rbind'ing data to end) // These are all sequential access to x, so very quick and cache efficient. Could be parallel by checking continuity at batch boundaries. + if (!isVectorAtomic(x)) + STOP(_("internal error: is.sorted got list/NULL/unsupported type object on input thus should be handled on R level but reached C level fsorted")); // # nocov const int n = length(x); - if (n <= 1) return(ScalarLogical(TRUE)); - if (!isVectorAtomic(x)) STOP(_("is.sorted (R level) and fsorted (C level) only to be used on vectors. 
If needed on a list/data.table, you'll need the order anyway if not sorted, so use if (length(o<-forder(...))) for efficiency in one step, or equivalent at C level")); + if (n <= 1) + return(ScalarLogical(TRUE)); int i=1; switch(TYPEOF(x)) { case INTSXP : case LGLSXP : { int *xd = INTEGER(x); - while (i=xd[i-1]) i++; + while (i=xd[i-1]) + i++; } break; - case REALSXP : - if (inherits(x,"integer64")) { + case REALSXP : { + if (Rinherits(x,char_integer64)) { int64_t *xd = (int64_t *)REAL(x); - while (i=xd[i-1]) i++; + while (i=xd[i-1]) + i++; } else { double *xd = REAL(x); - while (i=dtwiddle(xd,i-1)) i++; + while (i=dtwiddle(xd,i-1)) + i++; } - break; + } break; case STRSXP : { SEXP *xd = STRING_PTR(x); i = 0; - while (i