Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

port CJ to C #3596

Merged
merged 17 commits into from
Jun 17, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions R/CJ.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
SJ = function(...) {
JDT = as.data.table(list(...))
setkey(JDT)
}
# S for Sorted, usually used in i to sort the i table

# TO DO?: Use the CJ list() replication method for SJ (inside as.data.table.list?, #2109) too to avoid alloc.col

CJ <- function(..., sorted = TRUE, unique = FALSE)
{
# Pass in a list of unique values, e.g. ids and dates
# Cross Join will then produce a join table with the combination of all values (cross product).
# The last vector is varied the quickest in the table, so dates should be last for roll for example
l = list(...)
emptyList <- FALSE ## fix for #2511
if(any(vapply_1i(l, length) == 0L)){
## at least one column is empty The whole thing will be empty in the end
emptyList <- TRUE
l <- lapply(l, "[", 0L)
}
if (unique && !emptyList) l = lapply(l, unique)

dups = FALSE # fix for #1513
ncol = length(l)
if (ncol==1L && !emptyList) {
if (sorted && length(o <- forderv(l[[1L]]))) out = list(l[[1L]][o])
else out = list(l[[1L]])
nrow = length(l[[1L]])
} else if (ncol > 1L && !emptyList) {
# using rep.int instead of rep speeds things up considerably (but attributes are dropped).
n = vapply_1i(l, length) #lengths(l) will work from R 3.2.0 (also above)
nrow = prod(n)
if (nrow > .Machine$integer.max) {
stop("Cross product of elements provided to CJ() would result in ",nrow," rows which exceeds .Machine$integer.max == ",.Machine$integer.max)
}

# apply sorting
if (sorted) l = lapply(l, function(li) {
# fix for #1513
if (length(o <- forderv(li, retGrp=TRUE))) li = li[o]
if (!dups) dups <<- attr(o, 'maxgrpn') > 1L
return(li)
})

# standard [ method destroys attributes, so below
# will keep attributes only for classes with methods that impose so
attrib = lapply(l, attributes)
out = .Call(Ccj, l)
for (jj in 1:ncol) if (!is.null(attributes(l[[jj]]))) attributes(out[[jj]]) = attrib[[jj]]
# ncol == 0 || emptyList
} else {out = l; nrow = length(l[[1L]])}
setattr(out, "row.names", .set_row_names(nrow))
setattr(out, "class", c("data.table", "data.frame"))
if (getOption("datatable.CJ.names", TRUE)) { # added as FALSE in v1.11.6 with NEWS item saying TRUE in v1.12.0. TODO: remove in v1.13.0
vnames = name_dots(...)$vnames
} else {
if (is.null(vnames <- names(out))) vnames = paste0("V", seq_len(ncol))
else if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt))
}
setattr(out, "names", vnames)

alloc.col(out) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency, and it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation.
if (sorted) {
if (!dups) setattr(out, 'sorted', names(out))
else setkey(out) # fix #1513
}
out
}
70 changes: 0 additions & 70 deletions R/setkey.R
Original file line number Diff line number Diff line change
Expand Up @@ -345,73 +345,3 @@ binary = function(x) .Call(Cbinary, x)

setNumericRounding = function(x) {.Call(CsetNumericRounding, as.integer(x)); invisible()}
getNumericRounding = function() .Call(CgetNumericRounding)

SJ = function(...) {
JDT = as.data.table(list(...))
setkey(JDT)
}
# S for Sorted, usually used in i to sort the i table

# TO DO?: Use the CJ list() replication method for SJ (inside as.data.table.list?, #2109) too to avoid alloc.col

CJ = function(..., sorted = TRUE, unique = FALSE)
{
# Pass in a list of unique values, e.g. ids and dates
# Cross Join will then produce a join table with the combination of all values (cross product).
# The last vector is varied the quickest in the table, so dates should be last for roll for example
l = list(...)
emptyList = FALSE ## fix for #2511
if(any(sapply(l, length) == 0L)){
## at least one column is empty The whole thing will be empty in the end
emptyList = TRUE
l = lapply(l, "[", 0L)
}
if (unique && !emptyList) l = lapply(l, unique)

dups = FALSE # fix for #1513
if (length(l)==1L && !emptyList && sorted && length(o <- forderv(l[[1L]])))
l[[1L]] = l[[1L]][o]
else if (length(l) > 1L && !emptyList) {
# using rep.int instead of rep speeds things up considerably (but attributes are dropped).
attribs = lapply(l, attributes) # remember attributes for resetting after rep.int
n = vapply(l, length, 0L) #lengths(l) will work from R 3.2.0
nrow = prod(n)
if (nrow > .Machine$integer.max) {
stop("Cross product of elements provided to CJ() would result in ",nrow," rows which exceeds .Machine$integer.max == ",.Machine$integer.max)
}
x = c(rev( head(cumprod(rev(n)),-1) ), 1L)
for (i in seq_along(x)) {
y = l[[i]]
# fix for #1513
if (sorted) {
if (length(o <- forderv(y, retGrp=TRUE))) y = y[o]
if (!dups) dups = attr(o, 'maxgrpn') > 1L
}
if (i == 1L)
l[[i]] = rep.int(y, times = rep.int(x[i], n[i])) # i.e. rep(y, each=x[i])
else if (i == length(n))
l[[i]] = rep.int(y, times = nrow/(x[i]*n[i]))
else
l[[i]] = rep.int(rep.int(y, times = rep.int(x[i], n[i])), times = nrow/(x[i]*n[i]))
if (!is.null(attribs[[i]])){
attributes(l[[i]]) = attribs[[i]] # reset all attributes that were destroyed by rep.int
}
}
}
setattr(l, "row.names", .set_row_names(length(l[[1L]])))
setattr(l, "class", c("data.table", "data.frame"))
if (getOption("datatable.CJ.names", TRUE)) { # added as FALSE in v1.11.6 with NEWS item saying TRUE in v1.12.0. TODO: remove in v1.13.0
vnames = name_dots(...)$vnames
} else {
if (is.null(vnames <- names(l))) vnames = paste0("V", seq_len(length(l)))
else if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt))
}
setattr(l, "names", vnames)

l = alloc.col(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency, and it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation.
if (sorted) {
if (!dups) setattr(l, 'sorted', names(l))
else setkey(l) # fix #1513
}
l
}
53 changes: 53 additions & 0 deletions src/cj.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "data.table.h"

SEXP cj(SEXP base_list) {
int JJ = LENGTH(base_list), nprotect = 0;
SEXP out = PROTECT(allocVector(VECSXP, JJ)); nprotect++;
int NN = 1;
for (int j = 0; j < JJ; j++) NN *= LENGTH(VECTOR_ELT(base_list, j));
int div = NN, modulo;

for (int j = 0; j < JJ; j++) {
SEXP this_v = VECTOR_ELT(base_list, j);
modulo = div;
div = modulo/LENGTH(VECTOR_ELT(base_list, j));
switch(TYPEOF(this_v)) {
case LGLSXP: {
SEXP this_col = PROTECT(allocVector(LGLSXP, NN)); nprotect++;
for (int i = 0; i < NN; i++) {
LOGICAL(this_col)[i] = LOGICAL(this_v)[(i % modulo) / div];
}
SET_VECTOR_ELT(out, j, this_col);
}
break;
case INTSXP: {
SEXP this_col = PROTECT(allocVector(INTSXP, NN)); nprotect++;
for (int i = 0; i < NN; i++) {
INTEGER(this_col)[i] = INTEGER(this_v)[(i % modulo) / div];
}
SET_VECTOR_ELT(out, j, this_col);
}
break;
case REALSXP: {
SEXP this_col = PROTECT(allocVector(REALSXP, NN)); nprotect++;
for (int i = 0; i < NN; i++) {
REAL(this_col)[i] = REAL(this_v)[(i % modulo) / div];
}
SET_VECTOR_ELT(out, j, this_col);
}
break;
case STRSXP: {
SEXP this_col = PROTECT(allocVector(STRSXP, NN)); nprotect++;
for (int i = 0; i < NN; i++) {
SET_STRING_ELT(this_col, i, STRING_ELT(this_v, (i % modulo) / div));
}
SET_VECTOR_ELT(out, j, this_col);
} break;
default:
error("Type '%s' not supported by CJ.", type2char(TYPEOF(this_v)));
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do I need to UNPROTECT in an error branch?

Copy link
Member

@mattdowle mattdowle May 25, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's one nice thing: R will clean up all objects on error so you don't need UNPROTECT.
Yay -- you're into C! Party!
We've been taking R API usage outside loops recently since R 3.5 added overhead. So take the REAL(), INTEGER() and LOGICAL() calls outside (see other C code in data.table for examples but look at files that have been more recently revised).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only the REAL calls? not INTEGER/LOGICAL?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

REAL, INTEGER and LOGICAL

}
}

UNPROTECT(nprotect);
return(out);
}
3 changes: 3 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ bool GetVerbose();
double NA_INT64_D;
long long NA_INT64_LL;

// cj.c
SEXP cj(SEXP base_list);

// dogroups.c
SEXP keepattr(SEXP to, SEXP from);
SEXP growVector(SEXP x, R_len_t newlen);
Expand Down
2 changes: 2 additions & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ SEXP dllVersion();
SEXP nafillR();
SEXP colnamesInt();
SEXP initLastUpdated();
SEXP cj();

// .Externals
SEXP fastmean();
Expand Down Expand Up @@ -168,6 +169,7 @@ R_CallMethodDef callMethods[] = {
{"CnafillR", (DL_FUNC) &nafillR, -1},
{"CcolnamesInt", (DL_FUNC) &colnamesInt, -1},
{"CinitLastUpdated", (DL_FUNC) &initLastUpdated, -1},
{"Ccj", (DL_FUNC) &cj, -1},
{NULL, NULL, 0}
};

Expand Down