Rdatatable · mattdowle · Jun 17, 2019 · May 25, 2019 · May 25, 2019 · May 25, 2019
diff --git a/R/CJ.R b/R/CJ.R
@@ -0,0 +1,68 @@
+SJ = function(...) {
+  JDT = as.data.table(list(...))
+  setkey(JDT)
+}
+# S for Sorted, usually used in i to sort the i table
+
+# TO DO?: Use the CJ list() replication method for SJ (inside as.data.table.list?, #2109) too to avoid alloc.col
+
+CJ <- function(..., sorted = TRUE, unique = FALSE)
+{
+  # Pass in a list of unique values, e.g. ids and dates
+  # Cross Join will then produce a join table with the combination of all values (cross product).
+  # The last vector is varied the quickest in the table, so dates should be last for roll for example
+  l = list(...)
+  emptyList <- FALSE ## fix for #2511
+  if(any(vapply_1i(l, length) == 0L)){
+    ## at least one column is empty The whole thing will be empty in the end
+    emptyList <- TRUE
+    l <- lapply(l, "[", 0L)
+  }
+  if (unique && !emptyList) l = lapply(l, unique)
+
+  dups = FALSE # fix for #1513
+  ncol = length(l)
+  if (ncol==1L && !emptyList) {
+    if (sorted && length(o <- forderv(l[[1L]]))) out = list(l[[1L]][o])
+    else out = list(l[[1L]])
+    nrow = length(l[[1L]])
+  } else if (ncol > 1L && !emptyList) {
+    # using rep.int instead of rep speeds things up considerably (but attributes are dropped).
+    n = vapply_1i(l, length) #lengths(l) will work from R 3.2.0 (also above)
+    nrow = prod(n)
+    if (nrow > .Machine$integer.max) {
+      stop("Cross product of elements provided to CJ() would result in ",nrow," rows which exceeds .Machine$integer.max == ",.Machine$integer.max)
+    }
+
+    # apply sorting
+    if (sorted) l = lapply(l, function(li) {
+      # fix for #1513
+      if (length(o <- forderv(li, retGrp=TRUE))) li = li[o]
+      if (!dups) dups <<- attr(o, 'maxgrpn') > 1L
+      return(li)
+    })
+
+    # standard [ method destroys attributes, so below
+    #   will keep attributes only for classes with methods that impose so
+    attrib = lapply(l, attributes)
+    out = .Call(Ccj, l)
+    for (jj in 1:ncol) if (!is.null(attributes(l[[jj]]))) attributes(out[[jj]]) = attrib[[jj]]
+  # ncol == 0 || emptyList
+  } else {out = l; nrow = length(l[[1L]])}
+  setattr(out, "row.names", .set_row_names(nrow))
+  setattr(out, "class", c("data.table", "data.frame"))
+  if (getOption("datatable.CJ.names", TRUE)) {  # added as FALSE in v1.11.6 with NEWS item saying TRUE in v1.12.0. TODO: remove in v1.13.0
+    vnames = name_dots(...)$vnames
+  } else {
+    if (is.null(vnames <- names(out))) vnames = paste0("V", seq_len(ncol))
+    else if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt))
+  }
+  setattr(out, "names", vnames)
+
+  alloc.col(out)  # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency, and it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation.
+  if (sorted) {
+    if (!dups) setattr(out, 'sorted', names(out))
+    else setkey(out) # fix #1513
+  }
+  out
+}
@@ -345,73 +345,3 @@ binary = function(x) .Call(Cbinary, x)
 
 setNumericRounding = function(x) {.Call(CsetNumericRounding, as.integer(x)); invisible()}
 getNumericRounding = function() .Call(CgetNumericRounding)
-
-SJ = function(...) {
-  JDT = as.data.table(list(...))
-  setkey(JDT)
-}
-# S for Sorted, usually used in i to sort the i table
-
-# TO DO?: Use the CJ list() replication method for SJ (inside as.data.table.list?, #2109) too to avoid alloc.col
-
-CJ = function(..., sorted = TRUE, unique = FALSE)
-{
-  # Pass in a list of unique values, e.g. ids and dates
-  # Cross Join will then produce a join table with the combination of all values (cross product).
-  # The last vector is varied the quickest in the table, so dates should be last for roll for example
-  l = list(...)
-  emptyList = FALSE ## fix for #2511
-  if(any(sapply(l, length) == 0L)){
-    ## at least one column is empty The whole thing will be empty in the end
-    emptyList = TRUE
-    l = lapply(l, "[", 0L)
-  }
-  if (unique && !emptyList) l = lapply(l, unique)
-
-  dups = FALSE # fix for #1513
-  if (length(l)==1L && !emptyList && sorted && length(o <- forderv(l[[1L]])))
-    l[[1L]] = l[[1L]][o]
-  else if (length(l) > 1L && !emptyList) {
-    # using rep.int instead of rep speeds things up considerably (but attributes are dropped).
-    attribs = lapply(l, attributes)  # remember attributes for resetting after rep.int
-    n = vapply(l, length, 0L) #lengths(l) will work from R 3.2.0
-    nrow = prod(n)
-    if (nrow > .Machine$integer.max) {
-      stop("Cross product of elements provided to CJ() would result in ",nrow," rows which exceeds .Machine$integer.max == ",.Machine$integer.max)
-    }
-    x = c(rev(  head(cumprod(rev(n)),-1)  ), 1L)
-    for (i in seq_along(x)) {
-      y = l[[i]]
-      # fix for #1513
-      if (sorted) {
-        if (length(o <- forderv(y, retGrp=TRUE))) y = y[o]
-        if (!dups) dups = attr(o, 'maxgrpn') > 1L
-      }
-      if (i == 1L)
-        l[[i]] = rep.int(y, times = rep.int(x[i], n[i]))   # i.e. rep(y, each=x[i])
-      else if (i == length(n))
-        l[[i]] = rep.int(y, times = nrow/(x[i]*n[i]))
-      else
-        l[[i]] = rep.int(rep.int(y, times = rep.int(x[i], n[i])), times = nrow/(x[i]*n[i]))
-      if (!is.null(attribs[[i]])){
-        attributes(l[[i]]) = attribs[[i]] # reset all attributes that were destroyed by rep.int
-      }
-    }
-  }
-  setattr(l, "row.names", .set_row_names(length(l[[1L]])))
-  setattr(l, "class", c("data.table", "data.frame"))
-  if (getOption("datatable.CJ.names", TRUE)) {  # added as FALSE in v1.11.6 with NEWS item saying TRUE in v1.12.0. TODO: remove in v1.13.0
-    vnames = name_dots(...)$vnames
-  } else {
-    if (is.null(vnames <- names(l))) vnames = paste0("V", seq_len(length(l)))
-    else if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt))
-  }
-  setattr(l, "names", vnames)
-
-  l = alloc.col(l)  # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency, and it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation.
-  if (sorted) {
-    if (!dups) setattr(l, 'sorted', names(l))
-    else setkey(l) # fix #1513
-  }
-  l
-}
@@ -0,0 +1,53 @@
+#include "data.table.h"
+
+SEXP cj(SEXP base_list) {
+  int JJ = LENGTH(base_list), nprotect = 0;
+  SEXP out = PROTECT(allocVector(VECSXP, JJ)); nprotect++;
+  int NN = 1;
+  for (int j = 0; j < JJ; j++) NN *= LENGTH(VECTOR_ELT(base_list, j));
+  int div = NN, modulo;
+
+  for (int j = 0; j < JJ; j++) {
+    SEXP this_v = VECTOR_ELT(base_list, j);
+    modulo = div;
+    div = modulo/LENGTH(VECTOR_ELT(base_list, j));
+    switch(TYPEOF(this_v)) {
+    case LGLSXP: {
+      SEXP this_col = PROTECT(allocVector(LGLSXP, NN)); nprotect++;
+      for (int i = 0; i < NN; i++) {
+        LOGICAL(this_col)[i] = LOGICAL(this_v)[(i % modulo) / div];
+      }
+      SET_VECTOR_ELT(out, j, this_col);
+    }
+      break;
+    case INTSXP: {
+      SEXP this_col = PROTECT(allocVector(INTSXP, NN)); nprotect++;
+      for (int i = 0; i < NN; i++) {
+        INTEGER(this_col)[i] = INTEGER(this_v)[(i % modulo) / div];
+      }
+      SET_VECTOR_ELT(out, j, this_col);
+    }
+      break;
+    case REALSXP: {
+      SEXP this_col = PROTECT(allocVector(REALSXP, NN)); nprotect++;
+      for (int i = 0; i < NN; i++) {
+        REAL(this_col)[i] = REAL(this_v)[(i % modulo) / div];
+      }
+      SET_VECTOR_ELT(out, j, this_col);
+    }
+      break;
+    case STRSXP: {
+      SEXP this_col = PROTECT(allocVector(STRSXP, NN)); nprotect++;
+      for (int i = 0; i < NN; i++) {
+        SET_STRING_ELT(this_col, i, STRING_ELT(this_v, (i % modulo) / div));
+      }
+      SET_VECTOR_ELT(out, j, this_col);
+    } break;
+    default:
+      error("Type '%s' not supported by CJ.", type2char(TYPEOF(this_v)));
+    }
+  }
+
+  UNPROTECT(nprotect);
+  return(out);
+}
@@ -91,6 +91,9 @@ bool GetVerbose();
 double NA_INT64_D;
 long long NA_INT64_LL;
 
+// cj.c
+SEXP cj(SEXP base_list);
+
 // dogroups.c
 SEXP keepattr(SEXP to, SEXP from);
 SEXP growVector(SEXP x, R_len_t newlen);

@@ -83,6 +83,7 @@ SEXP dllVersion();
 SEXP nafillR();
 SEXP colnamesInt();
 SEXP initLastUpdated();
+SEXP cj();
 
 // .Externals
 SEXP fastmean();
@@ -168,6 +169,7 @@ R_CallMethodDef callMethods[] = {
 {"CnafillR", (DL_FUNC) &nafillR, -1},
 {"CcolnamesInt", (DL_FUNC) &colnamesInt, -1},
 {"CinitLastUpdated", (DL_FUNC) &initLastUpdated, -1},
+{"Ccj", (DL_FUNC) &cj, -1},
 {NULL, NULL, 0}
 };