From 7032d163db88bcc9a2c3d864c22253c973784498 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 13 Feb 2020 20:18:45 +0800 Subject: [PATCH] Add ties.method="last" to frank --- NEWS.md | 2 ++ R/frank.R | 6 +++--- inst/tests/tests.Rraw | 4 ++-- man/frank.Rd | 10 +++++----- src/frank.c | 25 ++++++++++++++++++------- 5 files changed, 30 insertions(+), 17 deletions(-) diff --git a/NEWS.md b/NEWS.md index 53a1c7c2d..7251bace7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -73,6 +73,8 @@ unit = "s") 10. The dimensions of objects in a `list` column are now displayed, [#3671](https://github.com/Rdatatable/data.table/issues/3671). Thanks to @randomgambit for the request, and Tyson Barrett for the PR. +11. `frank` gains `ties.method='last'`, paralleling the same in `base::rank`, which has been available since R 3.3.0 (May 2016), [#1689](https://github.com/Rdatatable/data.table/issues/1689). Thanks @abudis for the encouragement to accommodate this. + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). 
diff --git a/R/frank.R b/R/frank.R index abe8a5a18..05fd7df75 100644 --- a/R/frank.R +++ b/R/frank.R @@ -1,4 +1,4 @@ -frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("average", "first", "random", "max", "min", "dense")) { +frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("average", "first", "last", "random", "max", "min", "dense")) { ties.method = match.arg(ties.method) if (!length(na.last)) stop('length(na.last) = 0') if (length(na.last) != 1L) { @@ -48,7 +48,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a xorder = seq_along(x[[1L]]) } ans = switch(ties.method, - average = , min = , max =, dense = { + average = , min = , max =, dense =, last = { rank = .Call(Cfrank, xorder, xstart, uniqlengths(xstart, length(xorder)), ties.method) }, first = , random = { @@ -65,7 +65,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a ans } -frank = function(x, ..., na.last=TRUE, ties.method=c("average", "first", "random", "max", "min", "dense")) { +frank = function(x, ..., na.last=TRUE, ties.method=c("average", "first", "last", "random", "max", "min", "dense")) { cols = substitute(list(...))[-1L] if (identical(as.character(cols), "NULL")) { cols = NULL diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ae3b91d97..2a42adf7a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5480,7 +5480,7 @@ test_no = 1368.0 for (i in seq_along(dt)) { col = dt[[i]] for (j in list(TRUE, FALSE, "keep")) { - for (k in c("average", "min", "max", "first")) { + for (k in c("average", "min", "max", "first", "last")) { if (k == "random") set.seed(45L) if (class(col) == "integer64") { r1 = rank(as.integer(col), ties.method=k, na.last=j) @@ -5511,7 +5511,7 @@ if (test_bit64) dt[, DD := as.integer64(DD)] test_no = 1369.0 for (i in seq_along(dt)) { col = dt[[i]] - for (k in c("average", "min", "max", "first")) { + for (k in c("average", "min", "max", 
"first", "last")) { if (k == "random") set.seed(45L) if (class(col) == "integer64") { r1 = rank(as.integer(col), ties.method=k, na.last=NA) diff --git a/man/frank.Rd b/man/frank.Rd index 93d5a6d73..5dad0da39 100644 --- a/man/frank.Rd +++ b/man/frank.Rd @@ -4,8 +4,8 @@ \alias{rank} \title{Fast rank} \description{ - Similar to \code{base::rank} but \emph{much faster}. And it accepts vectors, lists, data.frames or data.tables as input. In addition to the \code{ties.method} possibilities provided by \code{base::rank}, it also provides \code{ties.method="dense"}. - + Similar to \code{base::rank} but \emph{much faster}. And it accepts vectors, lists, \code{data.frame}s or \code{data.table}s as input. In addition to the \code{ties.method} possibilities provided by \code{base::rank}, it also provides \code{ties.method="dense"}. + Like \code{\link{forder}}, sorting is done in "C-locale"; in particular, this may affect how capital/lowercase letters are ranked. See Details on \code{forder} for more. \code{bit64::integer64} type is also supported. @@ -13,10 +13,10 @@ \usage{ frank(x, \dots, na.last=TRUE, ties.method=c("average", - "first", "random", "max", "min", "dense")) + "first", "last", "random", "max", "min", "dense")) frankv(x, cols=seq_along(x), order=1L, na.last=TRUE, - ties.method=c("average", "first", "random", + ties.method=c("average", "first", "last", "random", "max", "min", "dense")) } @@ -33,7 +33,7 @@ frankv(x, cols=seq_along(x), order=1L, na.last=TRUE, \code{frank} is not limited to vectors. It accepts \code{data.table}s (and \code{list}s and \code{data.frame}s) as well. It accepts unquoted column names (with names preceded with a \code{-} sign for descending order, even on character vectors), for e.g., \code{frank(DT, a, -b, c, ties.method="first")} where \code{a,b,c} are columns in \code{DT}. The equivalent in \code{frankv} is the \code{order} argument. 
- In addition to the \code{ties.method} values possible using base's \code{\link[base]{rank}}, it also provides another additional argument \emph{"dense"} which returns the ranks without any gaps in the ranking. See examples. + In addition to the \code{ties.method} values possible using base's \code{\link[base]{rank}}, it also provides an additional value, \code{"dense"}, which returns the ranks without any gaps in the ranking. See examples. } \value{ A numeric vector of length equal to \code{NROW(x)} (unless \code{na.last = NA}, when missing values are removed). The vector is of integer type unless \code{ties.method = "average"} when it is of double type (irrespective of ties). diff --git a/src/frank.c b/src/frank.c index 2557d40b3..565154c70 100644 --- a/src/frank.c +++ b/src/frank.c @@ -69,14 +69,16 @@ SEXP dt_na(SEXP x, SEXP cols) { SEXP frank(SEXP xorderArg, SEXP xstartArg, SEXP xlenArg, SEXP ties_method) { const int *xstart = INTEGER(xstartArg), *xlen = INTEGER(xlenArg), *xorder = INTEGER(xorderArg); - enum {MEAN, MAX, MIN, DENSE, SEQUENCE} ties = MEAN; // RUNLENGTH + enum {MEAN, MAX, MIN, DENSE, SEQUENCE, LAST} ties; // RUNLENGTH - if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "average")) ties = MEAN; - else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "max")) ties = MAX; - else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "min")) ties = MIN; - else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "dense")) ties = DENSE; - else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "sequence")) ties = SEQUENCE; - // else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "runlength")) ties = RUNLENGTH; + const char *pties = CHAR(STRING_ELT(ties_method, 0)); + if (!strcmp(pties, "average")) ties = MEAN; + else if (!strcmp(pties, "max")) ties = MAX; + else if (!strcmp(pties, "min")) ties = MIN; + else if (!strcmp(pties, "dense")) ties = DENSE; + else if (!strcmp(pties, "sequence")) ties = SEQUENCE; + else if (!strcmp(pties, "last")) ties = LAST; + // else if 
(!strcmp(pties, "runlength")) ties = RUNLENGTH; else error(_("Internal error: invalid ties.method for frankv(), should have been caught before. please report to data.table issue tracker")); // # nocov const int n = length(xorderArg); SEXP ans = PROTECT(allocVector(ties==MEAN ? REALSXP : INTSXP, n)); @@ -118,6 +120,15 @@ SEXP frank(SEXP xorderArg, SEXP xstartArg, SEXP xlenArg, SEXP ties_method) { ians[xorder[j]-1] = k++; } break; + case LAST : + for (int i=0; i