Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gmedian retain class #3564

Merged
merged 3 commits into from
May 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
fwrite(DT, "data.csv.gz") # 2MB; 1.6s
identical(fread("data.csv.gz"), DT)
```

* Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature.

4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950).
Expand Down Expand Up @@ -91,6 +91,8 @@

7. Grouping by `NULL` on zero rows data.table now behaves consistently to non-zero rows data.table, [#3530](https://github.com/Rdatatable/data.table/issues/3530). Thanks to @SymbolixAU for the report and reproducible example.

8. GForce optimization of `median` did not retain the class; e.g. `median` of `Date` or `POSIXct` would return a raw number rather than retain the date class, [#3079](https://github.com/Rdatatable/data.table/issues/3079). Thanks to @Henrik-P for reporting.

#### NOTES

1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.
Expand Down
14 changes: 7 additions & 7 deletions R/fwrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
dateTimeAs = c("ISO","squash","epoch","write.csv"),
buffMB=8, nThread=getDTthreads(verbose),
showProgress=getOption("datatable.showProgress", interactive()),
compress = c("auto", "none", "gzip"),
compress = c("auto", "none", "gzip"),
yaml = FALSE,
verbose=getOption("datatable.verbose", FALSE)) {
na = as.character(na[1L]) # fix for #1725
Expand Down Expand Up @@ -48,9 +48,9 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024,
length(nThread)==1L && !is.na(nThread) && nThread>=1L
)

is_gzip <- compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file))

file <- path.expand(file) # "~/foo/bar"
if (append && missing(col.names) && (file=="" || file.exists(file)))
col.names = FALSE # test 1658.16 checks this
Expand All @@ -74,7 +74,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
return(invisible())
}
}

# process YAML after potentially short-circuiting due to irregularities
if (yaml) {
if (!requireNamespace('yaml', quietly = TRUE))
Expand All @@ -89,12 +89,12 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
# as.vector strips names
schema_vec = list(name = names(schema_vec), type = as.vector(schema_vec))
yaml_header = list(
source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite',
R.version$major, R.version$minor, format(utils::packageVersion('data.table'))),
source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite',
R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e)'DEV'))),
creation_time_utc = format(Sys.time(), tz = 'UTC'),
schema = list(
fields = lapply(
seq_along(x),
seq_along(x),
function(i) list(name = schema_vec$name[i], type = schema_vec$type[i])
)
),
Expand Down
34 changes: 20 additions & 14 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -14401,8 +14401,7 @@ if (test_yaml) { # csvy; #1701
DT_yaml[ , var2 := as.integer(var2)]
test(2032.22, fread(f, skip = 'var1,', yaml = TRUE),
DT_yaml, warning = 'Combining a search.*YAML.*')



# fwrite csvy: #3534
tmp = tempfile()
DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5])
Expand All @@ -14413,25 +14412,25 @@ if (test_yaml) { # csvy; #1701
test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L]))
test(2033.03, grepl('creation_time_utc', as_read[3L]))
test(2033.04, as_read[4:24],
c("schema:", " fields:", " - name: a", " type: integer",
" - name: b", " type: numeric", " - name: c", " type: character",
"header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''",
c("schema:", " fields:", " - name: a", " type: integer",
" - name: b", " type: numeric", " - name: c", " type: character",
"header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''",
# NB: apparently \n is encoded like this in YAML
"eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double",
"eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double",
"logical01: no", ""))
tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e")
test(2033.05, as_read[26:31], tbl_body)

# windows eol
fwrite(DT, tmp, yaml = TRUE, eol = '\r\n')
test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"')

# multi-class columns
DT[ , t := .POSIXct(1:5, tz = 'UTC')]
fwrite(DT, tmp, yaml = TRUE)
as_read = readLines(tmp)
test(2033.07, as_read[13L], " type: POSIXct")

# ~invertibility~
# fread side needs to be improved for Hugh's colClasses update
DT[ , t := NULL]
Expand All @@ -14440,7 +14439,7 @@ if (test_yaml) { # csvy; #1701
# remove metadata to compare
attr(DT2, 'yaml_metadata') = NULL
test(2033.08, all.equal(DT, DT2))

# unsupported operations
test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L],
warning = 'Skipping yaml writing because append = TRUE')
Expand All @@ -14460,7 +14459,7 @@ test(2035.2, fread('A,B\n"foo","ba"r"', quote=FALSE), ans<-data.table(A='"foo"',
test(2035.3, fread('A,B\n"foo","ba"r"', quote=""), ans)

# source() printing edge case; #2369
setup = c('library(data.table)', 'DT = data.table(a = 1)')
setup = c('DT = data.table(a = 1)')
writeLines(c(setup, 'DT[ , a := 1]'), tmp<-tempfile())
test(2036.1, !any(grepl("1: 1", capture.output(source(tmp, echo = TRUE)), fixed = TRUE)))
## test force-printing still works
Expand Down Expand Up @@ -14569,10 +14568,17 @@ if (test_bit64) {
}

# zero rows table group by NULL #3530
dt = data.table(x = c("a","b","a","b"), y = c(1,2,3,4))
test(2040.1, dt[0, .N, by = NULL], data.table(N=0L))
DT = data.table(x = c("a","b","a","b"), y = c(1,2,3,4))
test(2040.1, DT[0, .N, by = NULL], data.table(N=0L))
f = function(...) NULL
test(2040.2, dt[0, .N, by = f()], data.table(N=0L))
test(2040.2, DT[0, .N, by = f()], data.table(N=0L))

# gmedian retaining class; #3079
DT = data.table(date = as.Date(c("2018-01-01", "2018-01-03", "2018-01-08", "2018-01-10", "2018-01-25", "2018-01-30")),
g = rep(letters[1:2], each = 3))
DT[, time:=as.POSIXct(date)]
test(2041.1, DT[, median(date), by=g], data.table(g=c("a","b"), V1=as.Date(c("2018-01-03","2018-01-25"))))
test(2041.2, DT[, median(time), by=g], DT[c(2,5),.(g=g, V1=time)])


###################################
Expand Down
5 changes: 3 additions & 2 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,9 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted,
SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg);

// quickselect
double dquickselect(double *x, int n, int k);
double iquickselect(int *x, int n, int k);
double dquickselect(double *x, int n);
double iquickselect(int *x, int n);
double i64quickselect(int64_t *x, int n);

// fread.c
double wallclock();
Expand Down
204 changes: 40 additions & 164 deletions src/gsumm.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@ static int maxgrpn = 0;
static int *oo = NULL;
static int *ff = NULL;
static int isunsorted = 0;
static union {
double d;
long long ll;
} u;

// from R's src/cov.c (for variance / sd)
#ifdef HAVE_LONG_DOUBLE
Expand Down Expand Up @@ -776,177 +772,56 @@ SEXP gmax(SEXP x, SEXP narm)
}

// gmedian, always returns numeric type (to avoid as.numeric() wrap..)
SEXP gmedian(SEXP x, SEXP narm) {

if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error("na.rm must be TRUE or FALSE");
SEXP gmedian(SEXP x, SEXP narmArg) {
if (!isLogical(narmArg) || LENGTH(narmArg)!=1 || LOGICAL(narmArg)[0]==NA_LOGICAL) error("na.rm must be TRUE or FALSE");
if (!isVectorAtomic(x)) error("GForce median can only be applied to columns, not .SD or similar. To find median of all items in a list such as .SD, either add the prefix stats::median(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lapply(.SD,median),by=,.SDcols=]'");
if (inherits(x, "factor")) error("median is not meaningful for factors.");
R_len_t i=0, j=0, k=0, imed=0, thisgrpsize=0, medianindex=0, nacount=0;
double val = 0.0;
Rboolean isna = FALSE, isint64 = FALSE;
SEXP ans, sub, klass;
void *ptr;
const bool isInt64 = INHERITS(x, char_integer64), narm = LOGICAL(narmArg)[0];
int n = (irowslen == -1) ? length(x) : irowslen;
if (nrow != n) error("nrow [%d] != length(x) [%d] in gmedian", nrow, n);
SEXP ans = PROTECT(allocVector(REALSXP, ngrp));
double *ansd = REAL(ans);
switch(TYPEOF(x)) {
case REALSXP:
klass = getAttrib(x, R_ClassSymbol);
isint64 = (isString(klass) && STRING_ELT(klass, 0) == char_integer64);
ans = PROTECT(allocVector(REALSXP, ngrp));
sub = PROTECT(allocVector(REALSXP, maxgrpn)); // allocate once upfront
ptr = REAL(sub);
if (!LOGICAL(narm)[0]) {
for (i=0; i<ngrp; i++) {
isna = FALSE;
thisgrpsize = grpsize[i];
SETLENGTH(sub, thisgrpsize);
for (j=0; j<thisgrpsize; j++) {
k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
// TODO: raise this if-statement?
if (!isint64) {
if (!ISNAN(REAL(x)[k])) {
REAL(sub)[j] = REAL(x)[k];
} else {
REAL(ans)[i] = NA_REAL;
isna = TRUE; break;
}
} else {
u.d = REAL(x)[k];
if (u.ll != NA_INT64_LL) {
REAL(sub)[j] = (double)u.ll;
} else {
REAL(ans)[i] = NA_REAL;
isna = TRUE; break;
}
}
}
if (isna) continue;
medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
REAL(ans)[i] = dquickselect(ptr, thisgrpsize, medianindex-1); // 0-indexed
// all elements to the left of thisgrpsize/2 is < the value at that index
// we just need to get min of last half
if (thisgrpsize % 2 == 0) {
val = REAL(sub)[medianindex]; // 0-indexed
for (imed=medianindex+1; imed<thisgrpsize; imed++) {
val = REAL(sub)[imed] > val ? val : REAL(sub)[imed];
}
REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
}
}
} else {
for (i=0; i<ngrp; i++) {
nacount = 0;
thisgrpsize = grpsize[i];
for (j=0; j<thisgrpsize; j++) {
k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
// TODO: raise this if-statement?
if (!isint64) {
if (!ISNAN(REAL(x)[k])) {
REAL(sub)[j-nacount] = REAL(x)[k];
} else { nacount++; continue; }
} else {
u.d = REAL(x)[k];
if (u.ll != NA_INT64_LL) {
REAL(sub)[j-nacount] = (double)u.ll;
} else { nacount++; continue; }
}
}
if (nacount == thisgrpsize) {
REAL(ans)[i] = NA_REAL; // all NAs
continue;
}
thisgrpsize -= nacount;
SETLENGTH(sub, thisgrpsize);
medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
REAL(ans)[i] = dquickselect(ptr, thisgrpsize, medianindex-1);
if (thisgrpsize % 2 == 0) {
// all elements to the left of thisgrpsize/2 is < the value at that index
// we just need to get min of last half
val = REAL(sub)[medianindex]; // 0-indexed
for (imed=medianindex+1; imed<thisgrpsize; imed++) {
val = REAL(sub)[imed] > val ? val : REAL(sub)[imed];
}
REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
}
case REALSXP: {
double *subd = REAL(PROTECT(allocVector(REALSXP, maxgrpn))); // allocate once upfront and reuse
int64_t *xi64 = (int64_t *)REAL(x);
double *xd = REAL(x);
for (int i=0; i<ngrp; ++i) {
int thisgrpsize = grpsize[i], nacount=0;
for (int j=0; j<thisgrpsize; ++j) {
int k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
if (isInt64 ? xi64[k]==NA_INTEGER64 : ISNAN(xd[k])) nacount++;
jangorecki marked this conversation as resolved.
Show resolved Hide resolved
else subd[j-nacount] = xd[k];
}
}
SETLENGTH(sub, maxgrpn);
thisgrpsize -= nacount; // all-NA is returned as NA_REAL via n==0 case inside *quickselect
ansd[i] = (nacount && !narm) ? NA_REAL : (isInt64 ? i64quickselect((void *)subd, thisgrpsize) : dquickselect(subd, thisgrpsize));
}}
break;
case LGLSXP: case INTSXP:
ans = PROTECT(allocVector(REALSXP, ngrp));
sub = PROTECT(allocVector(INTSXP, maxgrpn)); // allocate once upfront
ptr = INTEGER(sub);
if (!LOGICAL(narm)[0]) {
for (i=0; i<ngrp; i++) {
isna = FALSE;
thisgrpsize = grpsize[i];
SETLENGTH(sub, thisgrpsize);
for (j=0; j<thisgrpsize; j++) {
k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
if (INTEGER(x)[k] != NA_INTEGER) {
INTEGER(sub)[j] = INTEGER(x)[k];
} else {
REAL(ans)[i] = NA_REAL;
isna = TRUE; break;
}
}
if (isna) continue;
medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
REAL(ans)[i] = iquickselect(ptr, thisgrpsize, medianindex-1); // 0-indexed
// all elements to the left of thisgrpsize/2 is < the value at that index
// we just need to get min of last half
if (thisgrpsize % 2 == 0) {
val = INTEGER(sub)[medianindex]; // 0-indexed
for (imed=medianindex+1; imed<thisgrpsize; imed++) {
val = INTEGER(sub)[imed] > val ? val : INTEGER(sub)[imed];
}
REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
}
}
} else {
for (i=0; i<ngrp; i++) {
nacount = 0;
thisgrpsize = grpsize[i];
for (j=0; j<thisgrpsize; j++) {
k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
if (INTEGER(x)[k] != NA_INTEGER) {
INTEGER(sub)[j-nacount] = INTEGER(x)[k];
} else { nacount++; continue; }
}
if (nacount == thisgrpsize) {
REAL(ans)[i] = NA_REAL; // all NAs
continue;
}
thisgrpsize -= nacount;
SETLENGTH(sub, thisgrpsize);
medianindex = (R_len_t)(ceil((double)(thisgrpsize)/2));
REAL(ans)[i] = iquickselect(ptr, thisgrpsize, medianindex-1);
if (thisgrpsize % 2 == 0) {
// all elements to the left of thisgrpsize/2 is < the value at that index
// we just need to get min of last half
val = INTEGER(sub)[medianindex]; // 0-indexed
for (imed=medianindex+1; imed<thisgrpsize; imed++) {
val = INTEGER(sub)[imed] > val ? val : INTEGER(sub)[imed];
}
REAL(ans)[i] = (REAL(ans)[i] + val)/2.0;
}
case LGLSXP: case INTSXP: {
int *subi = INTEGER(PROTECT(allocVector(INTSXP, maxgrpn)));
int *xi = INTEGER(x);
for (int i=0; i<ngrp; i++) {
int thisgrpsize = grpsize[i], nacount=0;
for (int j=0; j<thisgrpsize; ++j) {
int k = ff[i]+j-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
if (xi[k]==NA_INTEGER) nacount++;
else subi[j-nacount] = xi[k];
}
}
SETLENGTH(sub, maxgrpn);
ansd[i] = (nacount && !narm) ? NA_REAL : iquickselect(subi, thisgrpsize-nacount);
}}
break;
default:
error("Type '%s' not supported by GForce median (gmedian). Either add the prefix stats::median(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
error("Type '%s' not supported by GForce median (gmedian). Either add the prefix stats::median(.) or turn "
"off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
}
UNPROTECT(2);
return(ans);
if (!isInt64) copyMostAttrib(x, ans);
// else the integer64 class needs to be dropped since double is always returned by gmedian
UNPROTECT(2); // ans, subd|subi
return ans;
}

SEXP glast(SEXP x) {
Expand Down Expand Up @@ -1306,6 +1181,7 @@ SEXP gvarsd1(SEXP x, SEXP narm, Rboolean isSD)
error("Type '%s' not supported by GForce sd (gsd). Either add the prefix stats::sd(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
}
}
// no copyMostAttrib(x, ans) since class (e.g. Date) unlikely applicable to sd/var
UNPROTECT(2);
return (ans);
}
Expand Down
Loading