Rdatatable · jangorecki · Apr 11, 2020 · Apr 11, 2020 · Apr 11, 2020 · Apr 12, 2020
@@ -56,6 +56,7 @@ export(nafill)
 export(setnafill)
 export(.Last.updated)
 export(fcoalesce)
+export(is.sorted)
 
 S3method("[", data.table)
 S3method("[<-", data.table)

@@ -81,6 +81,8 @@ unit = "s")
 
 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR.
 
+15. New function `is.sorted` is now exported. It is a fast routine for checking the sortedness of data.table type objects or atomic vectors, [#2325](https://github.com/Rdatatable/data.table/issues/2325). Thanks to @franknarf1 for the feature request. For more details about usage, see the function manual [`?is.sorted`](https://rdatatable.gitlab.io/data.table/library/data.table/html/is.sorted.html).
+
 ## BUG FIXES
 
 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085).

@@ -155,16 +155,24 @@ setreordervec = function(x, order) .Call(Creorder, x, order)
 # The others (order, sort.int etc) are turned off to protect ourselves from using them internally, for speed and for
 # consistency; e.g., consistent twiddling of numeric/integer64, NA at the beginning of integer, locale ordering of character vectors.
 
-is.sorted = function(x, by=seq_along(x)) {
+is.sorted = function(x, by=seq_along(x), retOrd=FALSE) {
   if (is.list(x)) {
-    warning("Use 'if (length(o <- forderv(DT,by))) ...' for efficiency in one step, so you have o as well if not sorted.")
+    # for efficient use via retOrd argument see note in ?is.sorted
     # could pass through a flag for forderv to return early on first FALSE. But we don't need that internally
     # since internally we always then need ordering, an it's better in one step. Don't want inefficiency to creep in.
-    # This is only here for user/debugging use to check/test valid keys; e.g. data.table:::is.sorted(DT,by)
-    0L == length(forderv(x,by,retGrp=FALSE,sort=TRUE))
-  } else {
+    o = forderv(x,by,retGrp=FALSE,sort=TRUE)
+    ans = 0L == length(o)
+    if (isTRUE(retOrd))
+      ans = setattr(copy(ans), "order", o)
+    ans
+  } else if (is.null(x)) { # NULL does not satisfy C isVectorAtomic
+    NA
+  } else if (is.atomic(x)) {
+    if (isTRUE(retOrd)) stop("retOrd works only for data.table/list input")
     if (!missing(by)) stop("x is vector but 'by' is supplied")
     .Call(Cfsorted, x)
+  } else {
+    stop("'x' argument is of unsupported type")
   }
   # Cfsorted could be named CfIsSorted, but since "sorted" is an adjective not verb, it's clear; e.g., Cfsort would sort it ("sort" is verb).
   # Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1)

@@ -37,7 +37,6 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   guess = data.table:::guess
   INT = data.table:::INT
   is_na = data.table:::is_na
-  is.sorted = data.table:::is.sorted
   isReallyReal = data.table:::isReallyReal
   melt.data.table = data.table:::melt.data.table  # for test 1953.4
   null.data.table = data.table:::null.data.table
@@ -68,6 +67,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
                                         # masked by which package?
                                         # =================================
   setattr = data.table::setattr         # bit
+  is.sorted = data.table::is.sorted     # bit
   shift = data.table::shift             # IRanges, GenomicRanges
   between = data.table::between         # plm
   second = data.table::second           # S4Vectors
@@ -3988,7 +3988,7 @@ y <- copy(x)
 setreordervec(y, o)
 test(1161.3, x[o], y)
 
-# tests for setreordervec
+# tests for is.sorted
 DT <- data.table(x=sample(c(NA, -10:10), 2e2, TRUE),
       y=sample(c(NA, NaN, -Inf, Inf, -10:10), 2e2, TRUE),
       z=sample(c(NA, letters), 2e2, TRUE))
@@ -4014,8 +4014,24 @@ test(1162.09, length(forderv(DT, by=2:3)), 0L)
 setkey(DT)
 # test number 1162.10 skipped because if it fails it confusingly prints out as 1662.1 not 1662.10
 test(1162.10, length(forderv(DT, by=1:3)), 0L)
-test(1162.11, is.sorted(DT, by=1:3), TRUE, warning="Use.*forderv.*for efficiency in one step, so you have o as well if not sorted")
-test(1162.12, is.sorted(DT, by=2:1), FALSE, warning="Use.*forderv.*for efficiency in one step, so you have o as well if not sorted")
+test(1162.11, is.sorted(DT, by=1:3), TRUE)
+test(1162.12, is.sorted(DT, by=2:1), FALSE)
+x = as.data.table(iris) # is.sorted now exported #2325
+test(1162.21, is.sorted(x, by="Species"), TRUE)
+ans = is.sorted(x, by="Sepal.Length", retOrd=TRUE)
+test(1162.22, identical(ans, FALSE), FALSE)
+test(1162.23, isFALSE(ans), TRUE)
+test(1162.24, is.integer(attr(ans, "order")))
+o = attr(ans, "order")
+test(1162.25, x[o], x[order(Sepal.Length)])
+test(1162.26, is.sorted(x[o], by="Sepal.Length"))
+test(1162.31, is.sorted(NULL), NA)
+test(1162.32, is.sorted(1:5, retOrd=TRUE), error="retOrd works only for data.table/list input")
+test(1162.33, is.sorted(1:5, by="a"), error="but 'by' is supplied")
+test(1162.34, is.sorted(expression(1:5)), error="unsupported type")
+test(1162.35, is.sorted(as.raw(1:5)), error="is not yet supported")
+test(1162.36, is.sorted(c("a",NA,"c")), FALSE)
+test(1162.37, is.sorted(c("a","fa\xE7ile","b")), FALSE)
 
 # FR #351 - last on length=0 arguments
 x <- character(0)

@@ -0,0 +1,50 @@
+\name{is.sorted}
+\alias{is.sorted}
+\title{ Checks if input is sorted }
+\description{
+  Checks if input is sorted.
+}
+\usage{
+  is.sorted(x, by=seq_along(x), retOrd=FALSE)
+}
+\arguments{
+  \item{x}{ data.table type object or atomic vector. }
+  \item{by}{ data.table columns used to check if \code{x} is sorted by those columns. }
+  \item{retOrd}{ logical, when \code{TRUE} it will set an attribute \code{"order"} on the returned value, providing an order of \code{x}. Works only for data.table type \code{x}, not for atomic vector. }
+}
+\details{
+  Checks if the input object is sorted. It can also check sortedness by a subset of columns provided in the \code{by} argument, and can return the order used in the computation when the \code{retOrd} argument is used.
+}
+\note{
+  Checking sortedness is an expensive computation, and most commonly the intermediate computation, the order, could be re-used.
+  For example the following check
+
+\preformatted{
+  if (!is.sorted(DT, by="Sepal.Length"))
+    DT = DT[order(Sepal.Length)]
+}
+
+  could be written as
+
+\preformatted{
+  if (!s <- is.sorted(DT, by="Sepal.Length", retOrd=TRUE))
+    DT = DT[attr(s, "order")]
+}
+
+so the order is computed only once. Of course, for performance it is even better to sort in-place using \code{\link{setkey}}.
+}
+\value{
+  Logical scalar, TRUE or FALSE, or logical \code{NA} if \code{NULL} is provided. When \code{retOrd} is set to TRUE, the resulting logical scalar will have an attribute \code{"order"}. The attribute will be an integer vector of the same length as the number of rows of \code{x}, or a length 0 integer vector if \code{x} was already sorted. Any missing values are ordered to the front, unlike \code{\link[base]{order}}. Note that a logical scalar with an attribute attached will fail a test with \code{identical}, although it will work fine with \code{isTRUE} and \code{isFALSE}.
+}
+\seealso{ \code{\link{data.table}} }
+\examples{
+x = as.data.table(iris)
+is.sorted(x, by="Species")
+
+ans = is.sorted(x, by="Sepal.Length", retOrd=TRUE)
+identical(ans, FALSE)
+isFALSE(ans)
+o = attr(ans, "order")
+x[o]
+}
+\keyword{ data }
@@ -1257,49 +1257,64 @@ void radix_r(const int from, const int to, const int radix) {
 }
 
 
-SEXP fsorted(SEXP x)
-{
+SEXP fsorted(SEXP x) {
   // Just checks if ordered and returns FALSE early if not. Does not return ordering if so, unlike forder.
   // Always increasing order with NA's first
   // Similar to base:is.unsorted but accepts NA at the beginning (standard in data.table and considered sorted) rather than returning NA when NA present.
   // TODO: test in big steps first to return faster if unsortedness is at the end (a common case of rbind'ing data to end)
   // These are all sequential access to x, so very quick and cache efficient. Could be parallel by checking continuity at batch boundaries.
+  // Returns a length-1 logical. In every branch below, i advances while consecutive elements are in order;
+  // the result is TRUE if and only if i reaches n.
+  // Raises an error for non-atomic input and for atomic types without a case in the switch below.
+  if (!isVectorAtomic(x))
+    STOP(_("internal error: is.sorted got list/NULL/unsupported type object on input thus should be handled on R level but reached C level fsorted")); // # nocov
   const int n = length(x);
-  if (n <= 1) return(ScalarLogical(TRUE));
-  if (!isVectorAtomic(x)) STOP(_("is.sorted (R level) and fsorted (C level) only to be used on vectors. If needed on a list/data.table, you'll need the order anyway if not sorted, so use if (length(o<-forder(...))) for efficiency in one step, or equivalent at C level"));
+  if (n <= 1)
+    return(ScalarLogical(TRUE));
   int i=1;
   switch(TYPEOF(x)) {
   case INTSXP : case LGLSXP : {
     int *xd = INTEGER(x);
-    while (i<n && xd[i]>=xd[i-1]) i++;
+    while (i<n && xd[i]>=xd[i-1])
+      i++;
   } break;
-  case REALSXP :
-    if (inherits(x,"integer64")) {
+  case REALSXP : {
+    if (Rinherits(x,char_integer64)) {
       int64_t *xd = (int64_t *)REAL(x);
-      while (i<n && xd[i]>=xd[i-1]) i++;
+      while (i<n && xd[i]>=xd[i-1])
+        i++;
     } else {
       double *xd = REAL(x);
-      while (i<n && dtwiddle(xd,i)>=dtwiddle(xd,i-1)) i++;
+      while (i<n && dtwiddle(xd,i)>=dtwiddle(xd,i-1))
+        i++;
     }
-    break;
+  } break;
   case STRSXP : {
     SEXP *xd = STRING_PTR(x);
     i = 0;
-    while (i<n && xd[i]==NA_STRING) i++;
+    while (i<n && xd[i]==NA_STRING)
+      i++;
+    // All elements are NA_STRING: leave the switch with i==n so TRUE is returned.
+    // Without this guard xd[n] would be read below (one past the end) and the i++ that
+    // follows would make i==n+1, wrongly reporting an all-NA vector as unsorted.
+    if (i==n)
+      break;
     bool need = NEED2UTF8(xd[i]);
     i++; // pass over first non-NA_STRING
     while (i<n) {
-      if (xd[i]==xd[i-1]) {i++; continue;}
-      if (xd[i]==NA_STRING) break;
-      if (!need) need = NEED2UTF8(xd[i]);
-      if ((need ? strcmp(CHAR(ENC2UTF8(xd[i])), CHAR(ENC2UTF8(xd[i-1]))) :
-                  strcmp(CHAR(xd[i]), CHAR(xd[i-1]))) < 0) break;
+      if (xd[i]==xd[i-1]) {
+        i++; continue;
+      }
+      // an NA_STRING after a non-NA element means not sorted (NA's must come first)
+      if (xd[i]==NA_STRING)
+        break;
+      // once any element has needed conversion, every later comparison goes through ENC2UTF8
+      if (!need)
+        need = NEED2UTF8(xd[i]);
+      if (need) {
+        if (strcmp(CHAR(ENC2UTF8(xd[i])), CHAR(ENC2UTF8(xd[i-1]))) < 0)
+          break;
+      } else {
+        if (strcmp(CHAR(xd[i]), CHAR(xd[i-1])) < 0)
+          break;
+      }
       i++;
     }
   } break;
-  default :
+  default : {
     STOP(_("type '%s' is not yet supported"), type2char(TYPEOF(x)));
   }
+  }
   return ScalarLogical(i==n);
 }