Rdatatable · 2005m · Jan 17, 2020 · Jan 18, 2020 · Jan 18, 2020 · Jan 18, 2020
@@ -18,6 +18,7 @@ export(chmatch, "%chin%", chorder, chgroup)
 export(rbindlist)
 export(fifelse)
 export(fcase)
+export(topn)
 export(fread)
 export(fwrite)
 export(foverlaps)

@@ -73,6 +73,33 @@ unit = "s")
 
 10. The dimensions of objects in a `list` column are now displayed, [#3671](https://github.com/Rdatatable/data.table/issues/3671). Thanks to @randomgambit for the request, and Tyson Barrett for the PR.
 
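+11. New function `topn`, implemented in C by Morgan Jacob, [#3804](https://github.com/Rdatatable/data.table/issues/3804). It returns the indices of the `n` smallest (or, with `decreasing=TRUE`, the `n` largest) values of a numeric or integer vector `vec`; the result is the same as `order(vec)[1:n]` but is much faster when `n` is small relative to `length(vec)`. Please see `?topn` for more details. Similar in spirit to `dplyr::top_n`.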
+
+```R
+set.seed(123)
+x = rnorm(5e7) # 382 MB
+microbenchmark::microbenchmark(
+  topn(x, 6L),
+  order(x)[1:6], 
+  times = 10L
+)
+# Unit: seconds
+#          expr    min    lq  mean  median    uq   max neval
+# topn(x, 6L)     0.19  0.19  0.20    0.20  0.20  0.22    10
+# order(x)[1:6]   4.56  4.60  4.65    4.62  4.70  4.77    10
+
+microbenchmark::microbenchmark(
+  x[topn(x, 6L)],
+  sort(x, partial = 1:6)[1:6],
+  times = 10L,
+  unit = "s"
+)
+# Unit: seconds
+#                        expr   min    lq  mean  median    uq   max neval
+# x[topn(x, 6L)]               0.19  0.20  0.20    0.20  0.20  0.21    10
+# sort(x, partial = 1:6)[1:6]  1.20  1.22  1.23    1.24  1.25  1.27    10
+```
+
 ## BUG FIXES
 
 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085).

@@ -7,6 +7,7 @@ setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE)
 
 fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na)
 fcase   = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.list(substitute(list(...)))[-1L])
+topn    = function(vec, n=6L, decreasing=FALSE) .Call(CtopnR, vec, n, decreasing)  # positions of the n smallest (largest if decreasing=TRUE) elements of vec; all argument checks are done in C by topnR
 
 colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups)
 coerceFill = function(x) .Call(CcoerceFillR, x)

@@ -16770,6 +16770,88 @@ test(2132.2, fifelse(TRUE, 1, s2),       error = "S4 class objects (except nanot
 test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see https://github.com/Rdatatable/data.table/issues/4131.")
 rm(s1, s2, class2132)
 
+# topn, #3804
+# fixtures: integer and double vectors containing ties and NAs, all-NA vectors, and an unsupported type
+x0 = c(3L, 2L, 10L, NA_integer_, 1L, 1L, NA_integer_,  NA_integer_, 10L, 20L, 20L, 20L, 30L)
+x1 = as.numeric(x0)
+x2 = c(NA_integer_, NA_integer_, NA_integer_)
+x3 = as.numeric(x2)
+x4 = as.raw(c(1,2,3))
+
+class2134 = setClass("class2134", slots=list(x="numeric"))
+s1 = class2134(x=20191231)
+
+# topn must agree with base::order for every valid n, in both directions; ids 2134.001 to 2134.064
+num = 0L
+for (dec in c(FALSE, TRUE)) for (vec in list(x0, x1, x2, x3)) for (k in seq_along(vec)) {
+  num = num + 1L
+  test(2134 + num/1000, topn(vec, k, decreasing=dec), order(vec, decreasing=dec)[seq_len(k)])
+}
+# defaults: n=6L and decreasing=FALSE
+test(2134.065, topn(x0), order(x0)[1:6])
+# argument validation
+test(2134.066, topn(x0, -1L), error = "Please enter a positive integer larger or equal to 1.")
+test(2134.067, topn(x0, 1001L), error = "Function 'topn' is not built for large value of 'n'. The algorithm is made for small values. Please prefer the 'order' if you want to proceed with such large value.")
+test(2134.068, topn(x0, 100L), order(x0)[1:13], warning = "'n' is larger than length of 'vec'. 'n' will be set to length of 'vec'.")
+test(2134.069, topn(x0, 10L, decreasing = NA), error = "Argument 'decreasing' must be TRUE or FALSE and length 1.")
+test(2134.070, topn(s1, 10L, decreasing = NA), error = "S4 class objects (excluding nanotime) are not supported.")
+test(2134.071, topn(x4, 2L), error = "Type raw is not supported.")
+test(2134.072, topn(x4, 2L, decreasing = TRUE), error = "Type raw is not supported.")
+# remove every object created above so nothing leaks into later tests
+rm(s1, class2134, x0, x1, x2, x3, x4, vec, k, dec, num)
 
 ########################
 #  Add new tests here  #

@@ -0,0 +1,35 @@
+\name{topn}
+\alias{topn}
+\title{ Top N values index}
+\description{
+  \code{topn} returns the indices of the \code{n} smallest (or, with \code{decreasing=TRUE}, the \code{n} largest) values of a vector. This is an extension of \code{\link{which.max}}/\code{\link{which.min}} which provide \emph{only} the first such index.
+
+  The output is the same as \code{order(vec, decreasing=decreasing)[1:n]}, but internally optimized not to sort the irrelevant elements of the input (and therefore much faster, for small \code{n} relative to input size).
+}
+\usage{
+  topn(vec, n=6L, decreasing=FALSE)
+}
+\arguments{
+  \item{vec}{ A numeric vector of type numeric or integer. Other types are not supported yet. }
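+  \item{n}{ A positive integer value greater than or equal to 1. Maximum value is 1000. If \code{n} is larger than \code{length(vec)}, it is reduced to \code{length(vec)} with a warning. }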
+  \item{decreasing}{ A logical value (default \code{FALSE}) to indicate whether to sort \code{vec} in decreasing or increasing order. Equivalent to argument \code{decreasing} in function \code{base::order}. }
+}
+\value{
+  \code{integer} vector of indices of the most extreme (according to \code{decreasing}) \code{n} values in vector \code{vec}.
+}
+\examples{
+x = rnorm(1e6)
+
+# Example 1: indices of the 6 smallest values
+topn(x, 6L)
+order(x)[1:6]
+
+# Example 2: indices of the 6 largest values
+topn(x, 6L, decreasing = TRUE)
+order(x, decreasing = TRUE)[1:6]
+
+# Example 3: the 6 smallest values themselves
+x[topn(x, 6L)]
+sort(x)[1:6]
+}
+\keyword{ data }
@@ -242,3 +242,4 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k);
 //fifelse.c
 SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
+SEXP topnR(SEXP vec, SEXP n, SEXP dec);
@@ -343,3 +343,194 @@ SEXP fcaseR(SEXP na, SEXP rho, SEXP args) {
   UNPROTECT(nprotect);
   return ans;
 }
+
+/*
+ * topnR: positions of the 'n' most extreme elements of 'vec'.
+ *
+ *   vec  integer or double vector; any other type is an error.
+ *   n    number of positions wanted, 1..1000; reduced to length(vec), with a
+ *        warning, when larger than that.
+ *   dec  TRUE for the largest values first, FALSE for the smallest first.
+ *
+ * Returns an integer vector of 1-based positions, intended to equal
+ * order(vec, decreasing=dec)[1:n]: missing values (NA, and NaN for doubles) are
+ * placed last and equal values keep their original relative order.
+ *
+ * The first 'n' elements seed a candidate buffer, the remainder of 'vec' is
+ * scanned once, and a candidate is replaced only when a better element is met;
+ * each replacement rescans the 'n' candidates to find the new worst one, which
+ * is why 'n' is capped.
+ *
+ * Raises an R error for S4 input (other than nanotime), n > 1000, n < 1 (this
+ * includes NA, since NA_INTEGER is the smallest int), a 'dec' that is not a
+ * single TRUE/FALSE, an unsupported type, or a long vector.
+ */
+SEXP topnR(SEXP vec, SEXP n, SEXP dec) {
+  int64_t i, j, idx = 0;  // idx: slot of the current worst candidate in pans
+  int len0 = asInteger(n);
+  const int64_t len1 = xlength(vec);
+
+  if (isS4(vec) && !INHERITS(vec, char_nanotime)) {
+    error("S4 class objects (excluding nanotime) are not supported.");
+  }
+  if (len0 > 1000) {
+    error("Function 'topn' is not built for large value of 'n'. The algorithm is made for small values. Please prefer the 'order' if you want to proceed with such large value.");
+  }
+  // Validate the user-supplied 'n' before it is clamped, so that an empty 'vec'
+  // is not reported as an invalid 'n'.
+  if (len0 < 1) {
+    error("Please enter a positive integer larger or equal to 1.");
+  }
+  if (len0 > len1) {
+    warning("'n' is larger than length of 'vec'. 'n' will be set to length of 'vec'.");
+    len0 = (int)len1;  // safe: len1 < len0 <= 1000 here
+  }
+  if (!IS_TRUE_OR_FALSE(dec)) {
+    error("Argument 'decreasing' must be TRUE or FALSE and length 1.");
+  }
+
+  const bool vdec = LOGICAL(dec)[0];
+  const SEXPTYPE tvec = TYPEOF(vec);
+  if (tvec != INTSXP && tvec != REALSXP) {
+    error("Type %s is not supported.", type2char(tvec));
+  }
+  // Positions are stored in an int result; beyond R_LEN_T_MAX they would overflow silently.
+  if (len1 > R_LEN_T_MAX) {
+    error("Long vectors are not supported by 'topn' because positions are returned as integers. Please use 'order' instead.");
+  }
+
+  SEXP ans = PROTECT(allocVector(INTSXP, len0));
+  if (len0 == 0) {  // only possible for an empty 'vec': nothing to select
+    UNPROTECT(1);
+    return ans;
+  }
+  int *restrict pans = INTEGER(ans);  // 0-based candidate positions until the final loop
+  int tmp;
+
+  if (vdec) {
+    switch(tvec) {
+    case INTSXP: {
+      const int *restrict pvec = INTEGER(vec);
+      int worst = pvec[0];  // smallest candidate value; NA_INTEGER compares as the smallest int
+      // seed the candidates with the first len0 positions and locate the worst of them
+      for (i = 0; i < len0; ++i) {
+        pans[i] = (int)i;
+        if (pvec[i] <= worst || pvec[i] == NA_INTEGER) {
+          worst = pvec[i];
+          idx = i;
+        }
+      }
+      // scan the rest; a strictly better value evicts the worst candidate
+      for (i = len0; i < len1; ++i) {
+        if (pvec[i] == NA_INTEGER) {
+          continue;
+        }
+        if (pvec[i] > worst) {
+          worst = pvec[i];
+          pans[idx] = (int)i;
+          // find the new worst candidate; among equal values the later position is worse
+          for (j = 0; j < len0; ++j) {
+            if ((worst > pvec[pans[j]] || (worst == pvec[pans[j]] && pans[idx] < pans[j])) || pvec[pans[j]] == NA_INTEGER) {
+              worst = pvec[pans[j]];
+              idx = j;
+            }
+          }
+        }
+      }
+      // insertion sort of the candidates: larger value first, then smaller position
+      for (i = 0; i < len0; ++i) {
+        tmp = pans[i];
+        for (j = i; j > 0 && (pvec[tmp] > pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1])); --j) {
+          pans[j] = pans[j-1];
+        }
+        pans[j] = tmp;
+      }
+    } break;
+    case REALSXP: {
+      const double *restrict pvec = REAL(vec);
+      double worst = pvec[0];  // smallest candidate value, or NaN once a missing candidate is seen
+      for (i = 0; i < len0; ++i) {
+        pans[i] = (int)i;
+        if (pvec[i] <= worst || ISNAN(pvec[i])) {
+          worst = pvec[i];
+          idx = i;
+        }
+      }
+      for (i = len0; i < len1; ++i) {
+        if (ISNAN(pvec[i])) {
+          continue;
+        }
+        if (pvec[i] > worst || ISNAN(worst)) {
+          worst = pvec[i];
+          pans[idx] = (int)i;
+          for (j = 0; j < len0; ++j) {
+            if ((worst > pvec[pans[j]] || (worst == pvec[pans[j]] && pans[idx] < pans[j])) || ISNAN(pvec[pans[j]])) {
+              worst = pvec[pans[j]];
+              idx = j;
+            }
+          }
+        }
+      }
+      // comparisons with NaN are false, so missing candidates need the explicit last clause
+      for (i = 0; i < len0; ++i) {
+        tmp = pans[i];
+        for (j = i; j > 0 && (pvec[tmp] > pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1]) || (!ISNAN(pvec[tmp]) && ISNAN(pvec[pans[j-1]]))); --j) {
+          pans[j] = pans[j-1];
+        }
+        pans[j] = tmp;
+      }
+    } break;
+    default:
+      break;  // not reached: type validated above
+    }
+  } else {
+    switch(tvec) {
+    case INTSXP: {
+      const int *restrict pvec = INTEGER(vec);
+      int worst = pvec[0];  // largest candidate value, or NA_INTEGER once a missing candidate is seen
+      for (i = 0; i < len0; ++i) {
+        pans[i] = (int)i;
+        if ((pvec[i] >= worst && worst != NA_INTEGER) || pvec[i] == NA_INTEGER) {
+          worst = pvec[i];
+          idx = i;
+        }
+      }
+      for (i = len0; i < len1; ++i) {
+        if (pvec[i] == NA_INTEGER) {
+          continue;
+        }
+        if (pvec[i] < worst || worst == NA_INTEGER) {
+          worst = pvec[i];
+          pans[idx] = (int)i;
+          for (j = 0; j < len0; ++j) {
+            if (((worst < pvec[pans[j]] || (worst == pvec[pans[j]] && pans[idx] < pans[j])) && worst != NA_INTEGER) || pvec[pans[j]] == NA_INTEGER) {
+              worst = pvec[pans[j]];
+              idx = j;
+            }
+          }
+        }
+      }
+      // insertion sort: smaller value first, then smaller position; NA candidates are
+      // never moved forward and every non-NA candidate steps over them
+      for (i = 0; i < len0; ++i) {
+        tmp = pans[i];
+        if (pvec[tmp] == NA_INTEGER) {
+          continue;
+        }
+        for (j = i; j > 0 && (pvec[tmp] < pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1]) || pvec[pans[j-1]] == NA_INTEGER); --j) {
+          pans[j] = pans[j-1];
+        }
+        pans[j] = tmp;
+      }
+    } break;
+    case REALSXP: {
+      const double *restrict pvec = REAL(vec);
+      double worst = pvec[0];  // largest candidate value, or NaN once a missing candidate is seen
+      for (i = 0; i < len0; ++i) {
+        pans[i] = (int)i;
+        if (pvec[i] >= worst || ISNAN(pvec[i])) {
+          worst = pvec[i];
+          idx = i;
+        }
+      }
+      for (i = len0; i < len1; ++i) {
+        if (ISNAN(pvec[i])) {
+          continue;
+        }
+        if (pvec[i] < worst || ISNAN(worst)) {
+          worst = pvec[i];
+          pans[idx] = (int)i;
+          for (j = 0; j < len0; ++j) {
+            if ((worst < pvec[pans[j]] || (worst == pvec[pans[j]] && pans[idx] < pans[j])) || ISNAN(pvec[pans[j]])) {
+              worst = pvec[pans[j]];
+              idx = j;
+            }
+          }
+        }
+      }
+      for (i = 0; i < len0; ++i) {
+        tmp = pans[i];
+        for (j = i; j > 0 && (pvec[tmp] < pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1]) || (!ISNAN(pvec[tmp]) && ISNAN(pvec[pans[j-1]]))); --j) {
+          pans[j] = pans[j-1];
+        }
+        pans[j] = tmp;
+      }
+    } break;
+    default:
+      break;  // not reached: type validated above
+    }
+  }
+  // convert the 0-based C positions to R's 1-based indexing
+  for (i = 0; i < len0; ++i) {
+    pans[i]++;
+  }
+  UNPROTECT(1);
+  return ans;
+}
@@ -53,6 +53,7 @@ SEXP chmatchdup_R();
 SEXP chin_R();
 SEXP fifelseR();
 SEXP fcaseR();
+SEXP topnR();
 SEXP freadR();
 SEXP fwriteR();
 SEXP reorder();
@@ -205,6 +206,7 @@ R_CallMethodDef callMethods[] = {
 {"Ccoalesce", (DL_FUNC) &coalesce, -1},
 {"CfifelseR", (DL_FUNC) &fifelseR, -1},
 {"CfcaseR", (DL_FUNC) &fcaseR, -1},
+{"CtopnR", (DL_FUNC) &topnR, -1},
 {"C_lock", (DL_FUNC) &lock, -1},  // _ for these 3 to avoid Clock as in time
 {"C_unlock", (DL_FUNC) &unlock, -1},
 {"C_islocked", (DL_FUNC) &islockedR, -1},