Rdatatable · mattdowle · May 15, 2019 · May 15, 2019 · May 15, 2019 · May 15, 2019
@@ -28,7 +28,7 @@
     fwrite(DT, "data.csv.gz")   #   2MB; 1.6s
     identical(fread("data.csv.gz"), DT)
     ```
-    
+
     * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature.
 
 4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950).
@@ -91,6 +91,8 @@
 
 7. Grouping by `NULL` on zero rows data.table now behaves consistently to non-zero rows data.table, [#3530](https://github.com/Rdatatable/data.table/issues/3530). Thanks to @SymbolixAU for the report and reproducible example.
 
+8. GForce optimization of `median` did not retain the class; e.g. `median` of `Date` or `POSIXct` would return a raw number rather than retain the date class, [#3079](https://github.com/Rdatatable/data.table/issues/3079). Thanks to @Henrik-P for reporting.
+
 #### NOTES
 
 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.

@@ -7,7 +7,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
            dateTimeAs = c("ISO","squash","epoch","write.csv"),
            buffMB=8, nThread=getDTthreads(verbose),
            showProgress=getOption("datatable.showProgress", interactive()),
-           compress = c("auto", "none", "gzip"), 
+           compress = c("auto", "none", "gzip"),
            yaml = FALSE,
            verbose=getOption("datatable.verbose", FALSE)) {
   na = as.character(na[1L]) # fix for #1725
@@ -48,9 +48,9 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
     length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024,
     length(nThread)==1L && !is.na(nThread) && nThread>=1L
     )
-  
+
   is_gzip <- compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file))
-  
+
   file <- path.expand(file)  # "~/foo/bar"
   if (append && missing(col.names) && (file=="" || file.exists(file)))
     col.names = FALSE  # test 1658.16 checks this
@@ -74,7 +74,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
       return(invisible())
     }
   }
-  
+
   # process YAML after potentially short-circuiting due to irregularities
   if (yaml) {
     if (!requireNamespace('yaml', quietly = TRUE))
@@ -89,12 +89,12 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
       # as.vector strips names
       schema_vec = list(name = names(schema_vec), type = as.vector(schema_vec))
       yaml_header = list(
-        source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite', 
-                         R.version$major, R.version$minor, format(utils::packageVersion('data.table'))),
+        source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite',
+                         R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e)'DEV'))),
         creation_time_utc = format(Sys.time(), tz = 'UTC'),
         schema = list(
           fields = lapply(
-            seq_along(x), 
+            seq_along(x),
             function(i) list(name = schema_vec$name[i], type = schema_vec$type[i])
           )
         ),

@@ -14401,8 +14401,7 @@ if (test_yaml) {  # csvy; #1701
   DT_yaml[ , var2 := as.integer(var2)]
   test(2032.22, fread(f, skip = 'var1,', yaml = TRUE),
        DT_yaml, warning = 'Combining a search.*YAML.*')
-
-
+
   # fwrite csvy: #3534
   tmp = tempfile()
   DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5])
@@ -14413,25 +14412,25 @@ if (test_yaml) {  # csvy; #1701
   test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L]))
   test(2033.03, grepl('creation_time_utc', as_read[3L]))
   test(2033.04, as_read[4:24],
-       c("schema:", "  fields:", "  - name: a", "    type: integer", 
-         "  - name: b", "    type: numeric", "  - name: c", "    type: character", 
-         "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", 
+       c("schema:", "  fields:", "  - name: a", "    type: integer",
+         "  - name: b", "    type: numeric", "  - name: c", "    type: character",
+         "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''",
          # NB: apparently \n is encoded like this in YAML
-         "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", 
+         "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double",
          "logical01: no", ""))
   tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e")
   test(2033.05, as_read[26:31], tbl_body)
-  
+
   # windows eol
   fwrite(DT, tmp, yaml = TRUE, eol = '\r\n')
   test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"')
-       
+
   # multi-class columns
   DT[ , t := .POSIXct(1:5, tz = 'UTC')]
   fwrite(DT, tmp, yaml = TRUE)
   as_read = readLines(tmp)
   test(2033.07, as_read[13L], "    type: POSIXct")
-  
+
   # ~invertibility~
   # fread side needs to be improved for Hugh's colClasses update
   DT[ , t := NULL]
@@ -14440,7 +14439,7 @@ if (test_yaml) {  # csvy; #1701
   # remove metadata to compare
   attr(DT2, 'yaml_metadata') = NULL
   test(2033.08, all.equal(DT, DT2))
-  
+
   # unsupported operations
   test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L],
        warning = 'Skipping yaml writing because append = TRUE')
@@ -14460,7 +14459,7 @@ test(2035.2, fread('A,B\n"foo","ba"r"', quote=FALSE), ans<-data.table(A='"foo"',
 test(2035.3, fread('A,B\n"foo","ba"r"', quote=""), ans)
 
 # source() printing edge case; #2369
-setup = c('library(data.table)', 'DT = data.table(a = 1)')
+setup = c('DT = data.table(a = 1)')
 writeLines(c(setup, 'DT[ , a := 1]'), tmp<-tempfile())
 test(2036.1, !any(grepl("1:     1", capture.output(source(tmp, echo = TRUE)), fixed = TRUE)))
 ## test force-printing still works
@@ -14569,10 +14568,17 @@ if (test_bit64) {
 }
 
 # zero rows table group by NULL #3530
-dt = data.table(x = c("a","b","a","b"), y = c(1,2,3,4))
-test(2040.1, dt[0, .N, by = NULL], data.table(N=0L))
+DT = data.table(x = c("a","b","a","b"), y = c(1,2,3,4))
+test(2040.1, DT[0, .N, by = NULL], data.table(N=0L))
 f = function(...) NULL
-test(2040.2, dt[0, .N, by = f()], data.table(N=0L))
+test(2040.2, DT[0, .N, by = f()], data.table(N=0L))
+
+# gmedian retaining class; #3079
+DT = data.table(date = as.Date(c("2018-01-01", "2018-01-03", "2018-01-08", "2018-01-10", "2018-01-25", "2018-01-30")),
+                g = rep(letters[1:2], each = 3))
+DT[, time:=as.POSIXct(date)]
+test(2041.1, DT[, median(date), by=g], data.table(g=c("a","b"), V1=as.Date(c("2018-01-03","2018-01-25"))))
+test(2041.2, DT[, median(time), by=g], DT[c(2,5),.(g=g, V1=time)])
 
 
 ###################################

@@ -160,8 +160,9 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted,
                 SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg);
 
 // quickselect
-double dquickselect(double *x, int n, int k);
-double iquickselect(int *x, int n, int k);
+double dquickselect(double *x, int n);
+double iquickselect(int *x, int n);
+double i64quickselect(int64_t *x, int n);
 
 // fread.c
 double wallclock();

@@ -20,10 +20,6 @@ static int maxgrpn = 0;
 static int *oo = NULL;
 static int *ff = NULL;
 static int isunsorted = 0;
-static union {
-  double d;
-  long long ll;
-} u;
 
 // from R's src/cov.c (for variance / sd)
 #ifdef HAVE_LONG_DOUBLE
@@ -776,177 +772,56 @@ SEXP gmax(SEXP x, SEXP narm)
 }
 
 // gmedian, always returns numeric type (to avoid as.numeric() wrap..)
-SEXP gmedian(SEXP x, SEXP narm) {
-
-  if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error("na.rm must be TRUE or FALSE");
+SEXP gmedian(SEXP x, SEXP narmArg) {
+  if (!isLogical(narmArg) || LENGTH(narmArg)!=1 || LOGICAL(narmArg)[0]==NA_LOGICAL) error("na.rm must be TRUE or FALSE");
   if (!isVectorAtomic(x)) error("GForce median can only be applied to columns, not .SD or similar. To find median of all items in a list such as .SD, either add the prefix stats::median(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lapply(.SD,median),by=,.SDcols=]'");
   if (inherits(x, "factor")) error("median is not meaningful for factors.");
-  R_len_t i=0, j=0, k=0, imed=0, thisgrpsize=0, medianindex=0, nacount=0;
-  double val = 0.0;
-  Rboolean isna = FALSE, isint64 = FALSE;
-  SEXP ans, sub, klass;
-  void *ptr;
+  const bool isInt64 = INHERITS(x, char_integer64), narm = LOGICAL(narmArg)[0];
   int n = (irowslen == -1) ? length(x) : irowslen;
   if (nrow != n) error("nrow [%d] != length(x) [%d] in gmedian", nrow, n);
+  SEXP ans = PROTECT(allocVector(REALSXP, ngrp));
+  double *ansd = REAL(ans);
   switch(TYPEOF(x)) {
-  case REALSXP:
-    klass = getAttrib(x, R_ClassSymbol);
-    isint64 = (isString(klass) && STRING_ELT(klass, 0) == char_integer64);
-    ans = PROTECT(allocVector(REALSXP, ngrp));
-    sub = PROTECT(allocVector(REALSXP, maxgrpn)); // allocate once upfront
-    ptr = REAL(sub);
-    if (!LOGICAL(narm)[0]) {
-      for (i=0; i<ngrp; i++) {
-        isna = FALSE;
-        thisgrpsize = grpsize[i];
-        SETLENGTH(sub, thisgrpsize);
-        for (j=0; j<thisgrpsize; j++) {
-          k = ff[i]+j-1;
-          if (isunsorted) k = oo[k]-1;
-          k = (irowslen == -1) ? k : irows[k]-1;
-          // TODO: raise this if-statement?
-          if (!isint64) {
-            if (!ISNAN(REAL(x)[k])) {
-              REAL(sub)[j] = REAL(x)[k];
-            } else {
-              REAL(ans)[i] = NA_REAL;
-              isna = TRUE; break;
-            }
-          } else {
-            u.d = REAL(x)[k];
-            if (u.ll != NA_INT64_LL) {
-              REAL(sub)[j] = (double)u.ll;
-            } else {
-              REAL(ans)[i] = NA_REAL;
-              isna = TRUE; break;
-            }
-          }
-        }
-        if (isna) continue;
-        medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
-        REAL(ans)[i] = dquickselect(ptr, thisgrpsize, medianindex-1); // 0-indexed
-        // all elements to the left of thisgrpsize/2 is < the value at that index
-        // we just need to get min of last half
-        if (thisgrpsize % 2 == 0) {
-          val = REAL(sub)[medianindex]; // 0-indexed
-          for (imed=medianindex+1; imed<thisgrpsize; imed++) {
-            val = REAL(sub)[imed] > val ? val : REAL(sub)[imed];
-          }
-          REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
-        }
-      }
-    } else {
-      for (i=0; i<ngrp; i++) {
-        nacount = 0;
-        thisgrpsize = grpsize[i];
-        for (j=0; j<thisgrpsize; j++) {
-          k = ff[i]+j-1;
-          if (isunsorted) k = oo[k]-1;
-          k = (irowslen == -1) ? k : irows[k]-1;
-          // TODO: raise this if-statement?
-          if (!isint64) {
-            if (!ISNAN(REAL(x)[k])) {
-              REAL(sub)[j-nacount] = REAL(x)[k];
-            } else { nacount++; continue; }
-          } else {
-            u.d = REAL(x)[k];
-            if (u.ll != NA_INT64_LL) {
-              REAL(sub)[j-nacount] = (double)u.ll;
-            } else { nacount++; continue; }
-          }
-        }
-        if (nacount == thisgrpsize) {
-          REAL(ans)[i] = NA_REAL; // all NAs
-          continue;
-        }
-        thisgrpsize -= nacount;
-        SETLENGTH(sub, thisgrpsize);
-        medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
-        REAL(ans)[i] = dquickselect(ptr, thisgrpsize, medianindex-1);
-        if (thisgrpsize % 2 == 0) {
-          // all elements to the left of thisgrpsize/2 is < the value at that index
-          // we just need to get min of last half
-          val = REAL(sub)[medianindex]; // 0-indexed
-          for (imed=medianindex+1; imed<thisgrpsize; imed++) {
-            val = REAL(sub)[imed] > val ? val : REAL(sub)[imed];
-          }
-          REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
-        }
+  case REALSXP: {
+    double *subd = REAL(PROTECT(allocVector(REALSXP, maxgrpn))); // allocate once upfront and reuse
+    int64_t *xi64 = (int64_t *)REAL(x);
+    double  *xd = REAL(x);
+    for (int i=0; i<ngrp; ++i) {
+      int thisgrpsize = grpsize[i], nacount=0;
+      for (int j=0; j<thisgrpsize; ++j) {
+        int k = ff[i]+j-1;
+        if (isunsorted) k = oo[k]-1;
+        k = (irowslen == -1) ? k : irows[k]-1;
+        if (isInt64 ? xi64[k]==NA_INTEGER64 : ISNAN(xd[k])) nacount++;
+        else subd[j-nacount] = xd[k];
       }
-    }
-    SETLENGTH(sub, maxgrpn);
+      thisgrpsize -= nacount;  // all-NA is returned as NA_REAL via n==0 case inside *quickselect
+      ansd[i] = (nacount && !narm) ? NA_REAL : (isInt64 ? i64quickselect((void *)subd, thisgrpsize) : dquickselect(subd, thisgrpsize));
+    }}
     break;
-  case LGLSXP: case INTSXP:
-    ans = PROTECT(allocVector(REALSXP, ngrp));
-    sub = PROTECT(allocVector(INTSXP, maxgrpn)); // allocate once upfront
-    ptr = INTEGER(sub);
-    if (!LOGICAL(narm)[0]) {
-      for (i=0; i<ngrp; i++) {
-        isna = FALSE;
-        thisgrpsize = grpsize[i];
-        SETLENGTH(sub, thisgrpsize);
-        for (j=0; j<thisgrpsize; j++) {
-          k = ff[i]+j-1;
-          if (isunsorted) k = oo[k]-1;
-          k = (irowslen == -1) ? k : irows[k]-1;
-          if (INTEGER(x)[k] != NA_INTEGER) {
-            INTEGER(sub)[j] = INTEGER(x)[k];
-          } else {
-            REAL(ans)[i] = NA_REAL;
-            isna = TRUE; break;
-          }
-        }
-        if (isna) continue;
-        medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
-        REAL(ans)[i] = iquickselect(ptr, thisgrpsize, medianindex-1); // 0-indexed
-        // all elements to the left of thisgrpsize/2 is < the value at that index
-        // we just need to get min of last half
-        if (thisgrpsize % 2 == 0) {
-          val = INTEGER(sub)[medianindex]; // 0-indexed
-          for (imed=medianindex+1; imed<thisgrpsize; imed++) {
-            val = INTEGER(sub)[imed] > val ? val : INTEGER(sub)[imed];
-          }
-          REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
-        }
-      }
-    } else {
-      for (i=0; i<ngrp; i++) {
-        nacount = 0;
-        thisgrpsize = grpsize[i];
-        for (j=0; j<thisgrpsize; j++) {
-          k = ff[i]+j-1;
-          if (isunsorted) k = oo[k]-1;
-          k = (irowslen == -1) ? k : irows[k]-1;
-          if (INTEGER(x)[k] != NA_INTEGER) {
-            INTEGER(sub)[j-nacount] = INTEGER(x)[k];
-          } else { nacount++; continue; }
-        }
-        if (nacount == thisgrpsize) {
-          REAL(ans)[i] = NA_REAL; // all NAs
-          continue;
-        }
-        thisgrpsize -= nacount;
-        SETLENGTH(sub, thisgrpsize);
-        medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
-        REAL(ans)[i] = iquickselect(ptr, thisgrpsize, medianindex-1);
-        if (thisgrpsize % 2 == 0) {
-          // all elements to the left of thisgrpsize/2 is < the value at that index
-          // we just need to get min of last half
-          val = INTEGER(sub)[medianindex]; // 0-indexed
-          for (imed=medianindex+1; imed<thisgrpsize; imed++) {
-            val = INTEGER(sub)[imed] > val ? val : INTEGER(sub)[imed];
-          }
-          REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
-        }
+  case LGLSXP: case INTSXP: {
+    int *subi = INTEGER(PROTECT(allocVector(INTSXP, maxgrpn)));
+    int *xi = INTEGER(x);
+    for (int i=0; i<ngrp; i++) {
+      int thisgrpsize = grpsize[i], nacount=0;
+      for (int j=0; j<thisgrpsize; ++j) {
+        int k = ff[i]+j-1;
+        if (isunsorted) k = oo[k]-1;
+        k = (irowslen == -1) ? k : irows[k]-1;
+        if (xi[k]==NA_INTEGER) nacount++;
+        else subi[j-nacount] = xi[k];
       }
-    }
-    SETLENGTH(sub, maxgrpn);
+      ansd[i] = (nacount && !narm) ? NA_REAL : iquickselect(subi, thisgrpsize-nacount);
+    }}
     break;
   default:
-    error("Type '%s' not supported by GForce median (gmedian). Either add the prefix stats::median(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
+    error("Type '%s' not supported by GForce median (gmedian). Either add the prefix stats::median(.) or turn "
+          "off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
   }
-  UNPROTECT(2);
-  return(ans);
+  if (!isInt64) copyMostAttrib(x, ans);
+  // else the integer64 class needs to be dropped since double is always returned by gmedian
+  UNPROTECT(2);  // ans, subd|subi
+  return ans;
 }
 
 SEXP glast(SEXP x) {
@@ -1306,6 +1181,7 @@ SEXP gvarsd1(SEXP x, SEXP narm, Rboolean isSD)
         error("Type '%s' not supported by GForce sd (gsd). Either add the prefix stats::sd(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
       }
   }
+  // no copyMostAttrib(x, ans) since class (e.g. Date) unlikely applicable to sd/var
   UNPROTECT(2);
   return (ans);
 }