From 38f7cee9b0cd5c58b221c511cbe2cd294a2673a5 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Thu, 4 Jun 2020 22:12:07 +0100 Subject: [PATCH 01/34] dev --- R/wrappers.R | 6 ++ src/data.table.h | 3 + src/fjoin.c | 219 +++++++++++++++++++++++++++++++++++++++++++++++ src/init.c | 3 + 4 files changed, 231 insertions(+) create mode 100644 src/fjoin.c diff --git a/R/wrappers.R b/R/wrappers.R index 5fec33a92f..c13feb78a7 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -12,3 +12,9 @@ colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, c coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) + +#fsjoin = function(x, y) .Call(CsjoinR, x, y) +fijoin = function(x, y) { + ans = .Call(CijoinR, x, y) + ans[3:4] +} diff --git a/src/data.table.h b/src/data.table.h index 1cf975e68b..0109e1da75 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -243,3 +243,6 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k); //fifelse.c SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); + +// fjoin.c +SEXP ijoinR(SEXP x, SEXP y); diff --git a/src/fjoin.c b/src/fjoin.c new file mode 100644 index 0000000000..3695641643 --- /dev/null +++ b/src/fjoin.c @@ -0,0 +1,219 @@ +#include "data.table.h" + +/* + * as of now these are optimized routines for joining + * - single integer columns + * - sorted (experimental) or indexed + * + * TODO: + * type of join is pushed down here so we compute only what is necessary + * mult!="all" and "unique-key" branches escape extra computation + */ + +typedef enum { + inner, left, right, full, semi, anti, cross +} joinhow; + +SEXP joinOut(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y) { + SEXP ans = PROTECT(allocVector(VECSXP, 4)), ansnames; + setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 4)); + SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, 0)); + SET_STRING_ELT(ansnames, 0, mkChar("starts_x")); + //int *starts_x_p = INTEGER(VECTOR_ELT(ans, 0)); + SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, 0)); + SET_STRING_ELT(ansnames, 1, mkChar("lens_x")); + //int *lens_x_p = INTEGER(VECTOR_ELT(ans, 1)); + SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, matchn)); + SET_STRING_ELT(ansnames, 2, mkChar("starts_y")); + int *starts_y_p = INTEGER(VECTOR_ELT(ans, 2)); + SET_VECTOR_ELT(ans, 3, allocVector(INTSXP, matchn)); + SET_STRING_ELT(ansnames, 3, mkChar("lens_y")); + int *lens_y_p = INTEGER(VECTOR_ELT(ans, 3)); + for (int i=0; iy[j]) { + Rprintf("x[%d]>y[%d]\n", i, j); + j++; + continue; + } + } + } else { + error("not yet implemented"); + } + matchn[0] = imatch; +} +SEXP sjoinR(SEXP x, SEXP y) { + if (!isInteger(x) || !isInteger(y)) + error("must be integer"); + int nx = LENGTH(x), ny = LENGTH(y); + // left: nx + // right: ny + // inner: min(nx, ny) + // full: max(nx, ny) + double t_alloc = omp_get_wtime(); + bool *match = (bool *)R_alloc(nx, sizeof(bool)); // not sure if we need this + int *matchi = (int *)R_alloc(nx, sizeof(int)); + int *matchlens = (int *)R_alloc(nx, sizeof(int)); + int *matchstarts = (int *)R_alloc(nx, sizeof(int)); + int matchn = 0; + t_alloc = omp_get_wtime() - t_alloc; + double t_sjoin = omp_get_wtime(); + sjoin(INTEGER(x), nx, INTEGER(y), ny, left, match, matchi, matchstarts, matchlens, &matchn); + t_sjoin = omp_get_wtime() - t_sjoin; + Rprintf("alloc took %.3fs; sjoin took %.3fs\n", t_alloc, t_sjoin); + return joinOut(matchn, matchi, matchstarts, matchlens); +} +*/ + +/* + * fast indexed join + * sort-merge join + */ +void ijoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts, + int *y, int ny, int *y_o, bool y_ord, int *y_starts, int ny_starts, + joinhow how, + bool *match, int *matchn, + int *starts_x, int *lens_x, + int *starts_y, int *lens_y) { + if (how != left) + error("only left join implemented so far"); + if (!x_ord || !y_ord) + error("support for already unsorted not yet implemented"); + bool unq_x = nx_starts==nx, unq_y = ny_starts==ny; + //int imatch = 0; + + int *x_lens=0, *y_lens=0; + if (!unq_x) { + x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395 + for (int i=0; i y_j) j++; + } + } else if (how==left && unq_y) { + while (is y_j) j++; + } + } else if (how==left && unq_x) { + while (i y_j) js++; + } + } else if (how==left) { + while (is y_j) js++; + } + } else if (how==inner && unq_x && unq_y) { + + } else if (how==full && unq_x && unq_y) { + + } + + matchn[0] = nx; +} + +SEXP ijoinR(SEXP x, SEXP y) { + if (!isInteger(x) || !isInteger(y)) + error("must be integer"); + int nx = LENGTH(x), ny = LENGTH(y); + double t_index = omp_get_wtime(); + SEXP x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); + SEXP y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); + SEXP x_starts = getAttrib(x_idx, sym_starts); + SEXP y_starts = getAttrib(y_idx, sym_starts); + t_index = omp_get_wtime() - t_index; + double t_alloc = omp_get_wtime(); + bool *match = (bool *)R_alloc(0, sizeof(bool)); // not sure if we need this + int matchn = 0; + int *starts_x = (int *)R_alloc(0, sizeof(int)); + int *lens_x = (int *)R_alloc(0, sizeof(int)); + int *starts_y = (int *)R_alloc(nx, sizeof(int)); + int *lens_y = (int *)R_alloc(nx, sizeof(int)); + for (int i=0; i Date: Fri, 5 Jun 2020 00:52:03 +0100 Subject: [PATCH 02/34] devdev --- R/wrappers.R | 6 +- inst/tests/fjoin.Rraw | 57 ++++++++++++++++ src/data.table.h | 2 +- src/fjoin.c | 155 +++++++++++++++++++++++++++--------------- src/init.c | 4 +- 5 files changed, 162 insertions(+), 62 deletions(-) create mode 100644 inst/tests/fjoin.Rraw diff --git a/R/wrappers.R b/R/wrappers.R index c13feb78a7..1c9fa6c351 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -13,8 +13,8 @@ coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) -#fsjoin = function(x, y) .Call(CsjoinR, x, y) -fijoin = function(x, y) { - ans = .Call(CijoinR, x, y) +fjoin = function(x, y) { + ans = .Call(CfjoinR, x, y) + ans[[4L]][is.na(ans[[3L]])] = 1L ans[3:4] } diff --git a/inst/tests/fjoin.Rraw b/inst/tests/fjoin.Rraw new file mode 100644 index 0000000000..d9fabf7664 --- /dev/null +++ b/inst/tests/fjoin.Rraw @@ -0,0 +1,57 @@ +require(methods) + +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + require(data.table) + test = data.table:::test + fjoin = data.table:::fjoin + bmerge = data.table:::bmerge + forderv = data.table:::forderv +} +check = function(x, y) { + a = d(x, y) + b = fjoin(x, y) + na = is.na(a[["starts"]]) + fail = !identical(a[["starts"]][!na], b[["starts_y"]][!na]) + fail = fail || !identical(a[["lens"]][!na], b[["lens_y"]][!na]) + if (fail) { + if (interactive() && length(x) < 20L) { + out = capture.output(print(list(d=a, fj=b))) + stop("starts are not equal: ", out) + } + stop("starts are not equal") + } + invisible(TRUE) +} + + +d = function(x, y) bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)[1:2] + +x = c(1L,2L,3L,4L) +y = c(2L,3L,5L) +test(1.01, fjoin(x, y), d(x, y)) +x = c(1L,2L,3L,3L,4L) +y = c(2L,3L,5L) +test(1.02, fjoin(x, y), d(x, y)) +x = c(1L,2L,3L,4L) +y = c(2L,3L,3L,5L) +test(1.03, fjoin(x, y), d(x, y)) +x = c(1L,2L,3L,3L,4L) +y = c(2L,3L,3L,5L) +test(1.04, fjoin(x, y), d(x, y)) + +# y unsorted +x = c(1L,2L,3L,4L) +y = c(2L,5L,3L) +test(2.01, fjoin(x, y), d(x, y)) +x = c(1L,2L,3L,3L,4L) +y = c(3L,2L,5L) +test(2.02, fjoin(x, y), d(x, y)) +x = c(1L,2L,3L,4L) +y = c(5L,3L,2L,3L) +test(2.03, fjoin(x, y), d(x, y)) +x = c(1L,2L,3L,3L,4L) +y = c(5L,3L,3L,2L) +test(2.04, fjoin(x, y), d(x, y)) diff --git a/src/data.table.h b/src/data.table.h index 0109e1da75..8bca1a7e02 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -245,4 +245,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); // fjoin.c -SEXP ijoinR(SEXP x, SEXP y); +SEXP fjoinR(SEXP x, SEXP y); diff --git a/src/fjoin.c b/src/fjoin.c index 3695641643..92e7725732 100644 --- a/src/fjoin.c +++ b/src/fjoin.c @@ -24,10 +24,10 @@ SEXP joinOut(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y) SET_STRING_ELT(ansnames, 1, mkChar("lens_x")); //int *lens_x_p = INTEGER(VECTOR_ELT(ans, 1)); SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, matchn)); - SET_STRING_ELT(ansnames, 2, mkChar("starts_y")); + SET_STRING_ELT(ansnames, 2, mkChar("starts")); int *starts_y_p = INTEGER(VECTOR_ELT(ans, 2)); SET_VECTOR_ELT(ans, 3, allocVector(INTSXP, matchn)); - SET_STRING_ELT(ansnames, 3, mkChar("lens_y")); + SET_STRING_ELT(ansnames, 3, mkChar("lens")); int *lens_y_p = INTEGER(VECTOR_ELT(ans, 3)); for (int i=0; i y_j) j++; } - else if (x_i < y_j) i++; - else if (x_i > y_j) j++; - } - } else if (how==left && unq_y) { - while (is y_j) j++; } - else if (x_i < y_j) is++; - else if (x_i > y_j) j++; - } - } else if (how==left && unq_x) { - while (i y_j) js++; + } + } else if (how==left) { + while (is y_j) js++; } - else if (x_i < y_j) i++; - else if (x_i > y_j) js++; } - } else if (how==left) { - while (is y_j) j++; + } + } else if (how==left && unq_y) { + while (is y_j) j++; + } + } else if (how==left && unq_x) { + while (i y_j) js++; + } + } else if (how==left) { + while (is y_j) js++; } - else if (x_i < y_j) is++; - else if (x_i > y_j) js++; } - } else if (how==inner && unq_x && unq_y) { - - } else if (how==full && unq_x && unq_y) { - + } else if (y_ord) { + error("dev"); + } else { + error("devVVV"); + //Rprintf("x_i=%d; y[y_o[j]-1]= y[y_o[%d]-1]= y[%d-1]= y[%d]= %d\n", x[i], j, y_o[j], y_o[j]-1, y[y_o[j]-1]); } - matchn[0] = nx; } -SEXP ijoinR(SEXP x, SEXP y) { +SEXP fjoinR(SEXP x, SEXP y) { if (!isInteger(x) || !isInteger(y)) error("must be integer"); int nx = LENGTH(x), ny = LENGTH(y); @@ -208,7 +251,7 @@ SEXP ijoinR(SEXP x, SEXP y) { t_alloc = omp_get_wtime() - t_alloc; double t_ijoin = omp_get_wtime(); // *x, nx, *x_o, x_ord, *x_starts, nx_starts, - ijoin(INTEGER(x), nx, INTEGER(x_idx), LENGTH(x_idx)==0, INTEGER(x_starts), LENGTH(x_starts), + fjoin(INTEGER(x), nx, INTEGER(x_idx), LENGTH(x_idx)==0, INTEGER(x_starts), LENGTH(x_starts), INTEGER(y), ny, INTEGER(y_idx), LENGTH(y_idx)==0, INTEGER(y_starts), LENGTH(y_starts), left, match, &matchn, starts_x, lens_x, starts_y, lens_y); diff --git a/src/init.c b/src/init.c index cb8678cd6e..9f7f45ee8c 100644 --- a/src/init.c +++ b/src/init.c @@ -120,7 +120,7 @@ SEXP unlock(); SEXP islockedR(); SEXP allNAR(); //SEXP sjoinR(); -SEXP ijoinR(); +SEXP fjoinR(); // .Externals SEXP fastmean(); @@ -213,7 +213,7 @@ R_CallMethodDef callMethods[] = { {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1}, {"CtestMsgR", (DL_FUNC) &testMsgR, -1}, {"C_allNAR", (DL_FUNC) &allNAR, -1}, -{"CijoinR", (DL_FUNC) &ijoinR, -1}, +{"CfjoinR", (DL_FUNC) &fjoinR, -1}, {NULL, NULL, 0} }; From 3480d328d492a47d5abb9c19ea4e7edf9da6315a Mon Sep 17 00:00:00 2001 From: jangorecki Date: Fri, 5 Jun 2020 12:05:13 +0100 Subject: [PATCH 03/34] unsorted works as well --- inst/tests/fjoin.Rraw | 5 +++++ src/fjoin.c | 16 ++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/inst/tests/fjoin.Rraw b/inst/tests/fjoin.Rraw index d9fabf7664..0e16f0965e 100644 --- a/inst/tests/fjoin.Rraw +++ b/inst/tests/fjoin.Rraw @@ -55,3 +55,8 @@ test(2.03, fjoin(x, y), d(x, y)) x = c(1L,2L,3L,3L,4L) y = c(5L,3L,3L,2L) test(2.04, fjoin(x, y), d(x, y)) + +# x, y unsorted +x = c(4L,2L,3L,1L) +y = c(2L,5L,3L) +test(2.01, fjoin(x, y), d(x, y)) diff --git a/src/fjoin.c b/src/fjoin.c index 92e7725732..fc5dfbbd6b 100644 --- a/src/fjoin.c +++ b/src/fjoin.c @@ -113,8 +113,6 @@ void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts, int *starts_y, int *lens_y) { if (how != left) error("only left join implemented so far"); - if (!x_ord) - error("support for already unsorted not yet implemented"); bool unq_x = nx_starts==nx, unq_y = ny_starts==ny; //int imatch = 0; @@ -222,9 +220,19 @@ void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts, } } } else if (y_ord) { - error("dev"); + error("y_ord not yet implemented"); } else { - error("devVVV"); + if (how==left && unq_x && unq_y) { + while (i y_j) j++; + } + } else { + error("dev: !x_ord && !y_ord && (!unq_x || !unq_y)"); + } //Rprintf("x_i=%d; y[y_o[j]-1]= y[y_o[%d]-1]= y[%d-1]= y[%d]= %d\n", x[i], j, y_o[j], y_o[j]-1, y[y_o[j]-1]); } matchn[0] = nx; From 99589c5d29027787c7d65e3d803a02cd4e8af55f Mon Sep 17 00:00:00 2001 From: jangorecki Date: Fri, 5 Jun 2020 13:06:04 +0100 Subject: [PATCH 04/34] testing and fixing --- inst/tests/fjoin.Rraw | 63 ++++++++++++++++++++++++++++++++++++++++--- src/fjoin.c | 8 +++--- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/inst/tests/fjoin.Rraw b/inst/tests/fjoin.Rraw index 0e16f0965e..cfdf0d0beb 100644 --- a/inst/tests/fjoin.Rraw +++ b/inst/tests/fjoin.Rraw @@ -29,6 +29,7 @@ check = function(x, y) { d = function(x, y) bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)[1:2] +## x y sorted x = c(1L,2L,3L,4L) y = c(2L,3L,5L) test(1.01, fjoin(x, y), d(x, y)) @@ -42,7 +43,7 @@ x = c(1L,2L,3L,3L,4L) y = c(2L,3L,3L,5L) test(1.04, fjoin(x, y), d(x, y)) -# y unsorted +## y unsorted x = c(1L,2L,3L,4L) y = c(2L,5L,3L) test(2.01, fjoin(x, y), d(x, y)) @@ -56,7 +57,61 @@ x = c(1L,2L,3L,3L,4L) y = c(5L,3L,3L,2L) test(2.04, fjoin(x, y), d(x, y)) -# x, y unsorted -x = c(4L,2L,3L,1L) +#... + +## xy unsorted +x = c(4L,1L,3L,2L) y = c(2L,5L,3L) -test(2.01, fjoin(x, y), d(x, y)) +test(4.01, fjoin(x, y), d(x, y)) + +#... + +## xy sorted +ssa = function(unq_n, size, sort=FALSE) { + stopifnot(unq_n <= size) + unq_sub = seq_len(unq_n) + ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE))) + if (unq_n==size) stopifnot(uniqueN(ans)==length(ans)) + if (sort) sort(ans) else ans +} +N = 1e2 +x = ssa(N, N, sort=TRUE) +y = ssa(N, N, sort=TRUE) +test(11.01, fjoin(x, y), d(x, y)) +x = ssa(N, N*1.1, sort=TRUE) +y = ssa(N, N, sort=TRUE) +test(11.02, fjoin(x, y), d(x, y)) +x = ssa(N, N, sort=TRUE) +y = ssa(N, N*1.1, sort=TRUE) +test(11.03, fjoin(x, y), d(x, y)) +x = ssa(N, N*1.1, sort=TRUE) +y = ssa(N, N*1.1, sort=TRUE) +test(11.04, fjoin(x, y), d(x, y)) + +## y unsorted +x = ssa(N, N, sort=TRUE) +y = ssa(N, N) +test(12.01, fjoin(x, y), d(x, y)) +x = ssa(N, N*1.1, sort=TRUE) +y = ssa(N, N) +test(12.02, fjoin(x, y), d(x, y)) +x = ssa(N, N, sort=TRUE) +y = ssa(N, N*1.1) +test(12.03, fjoin(x, y), d(x, y)) +x = ssa(N, N*1.1, sort=TRUE) +y = ssa(N, N*1.1) +test(12.04, fjoin(x, y), d(x, y)) + +## xy unsorted +x = ssa(N, N) +y = ssa(N, N) +test(14.01, fjoin(x, y), d(x, y)) +#x = ssa(N, N*1.1) +#y = ssa(N, N) +#test(14.02, fjoin(x, y), d(x, y)) +#x = ssa(N, N) +#y = ssa(N, N*1.1) +#test(14.03, fjoin(x, y), d(x, y)) +#x = ssa(N, N*1.1) +#y = ssa(N, N*1.1) +#test(14.04, fjoin(x, y), d(x, y)) diff --git a/src/fjoin.c b/src/fjoin.c index fc5dfbbd6b..96d1fd45f8 100644 --- a/src/fjoin.c +++ b/src/fjoin.c @@ -120,12 +120,14 @@ void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts, if (!unq_x) { x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395 for (int i=0; i y_j) j++; } From 94a0faaa2480bacba658b797031dfb5f3e54916a Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 6 Jun 2020 01:19:25 +0100 Subject: [PATCH 05/34] devdevdev --- R/wrappers.R | 4 +- src/data.table.h | 2 +- src/fjoin.c | 409 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 401 insertions(+), 14 deletions(-) diff --git a/R/wrappers.R b/R/wrappers.R index 1c9fa6c351..6b9da0d799 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -13,8 +13,8 @@ coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) -fjoin = function(x, y) { - ans = .Call(CfjoinR, x, y) +fjoin = function(x, y, how=c("left","inner","full")) { + ans = .Call(CfjoinR, x, y, match.arg(how)) ans[[4L]][is.na(ans[[3L]])] = 1L ans[3:4] } diff --git a/src/data.table.h b/src/data.table.h index 8bca1a7e02..abd993a592 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -245,4 +245,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); // fjoin.c -SEXP fjoinR(SEXP x, SEXP y); +SEXP fjoinR(SEXP x, SEXP y, SEXP how); diff --git a/src/fjoin.c b/src/fjoin.c index 96d1fd45f8..6d7b5dff11 100644 --- a/src/fjoin.c +++ b/src/fjoin.c @@ -14,7 +14,7 @@ typedef enum { inner, left, right, full, semi, anti, cross } joinhow; -SEXP joinOut(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y) { +SEXP joinOut2(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y) { SEXP ans = PROTECT(allocVector(VECSXP, 4)), ansnames; setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 4)); SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, 0)); @@ -105,14 +105,15 @@ SEXP sjoinR(SEXP x, SEXP y) { * split whole 1:nx_starts into numbers of buckets equal to threads * each thread do binary merge x of first and last element, and then sort-merge to a subset of y defined by those matches */ -void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts, +// while loop approach returning results consistent to bmerge +void ijoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts, int *y, int ny, int *y_o, bool y_ord, int *y_starts, int ny_starts, joinhow how, bool *match, int *matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y) { - if (how != left) - error("only left join implemented so far"); + //if (how != left) + // error("only left join implemented so far"); bool unq_x = nx_starts==nx, unq_y = ny_starts==ny; //int imatch = 0; @@ -174,6 +175,14 @@ void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts, is++; } else if (x_i < y_j) is++; else if (x_i > y_j) js++; } + } else if (how==inner && unq_x && unq_y) { + error("dev"); + } else if (how==inner) { + error("not yet implemented"); + } else if (how==full && unq_x && unq_y) { + error("dev"); + } else if (how==full) { + error("not yet implemented"); } } else if (x_ord) { if (how==left && unq_x && unq_y) { @@ -240,7 +249,7 @@ void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts, matchn[0] = nx; } -SEXP fjoinR(SEXP x, SEXP y) { +SEXP ijoinR(SEXP x, SEXP y, SEXP how) { if (!isInteger(x) || !isInteger(y)) error("must be integer"); int nx = LENGTH(x), ny = LENGTH(y); @@ -251,22 +260,400 @@ SEXP fjoinR(SEXP x, SEXP y) { SEXP y_starts = getAttrib(y_idx, sym_starts); t_index = omp_get_wtime() - t_index; double t_alloc = omp_get_wtime(); + int ans_n = 0; + joinhow how2; + // left: nx + // right: ny + // inner: min(nx, ny) + // full: sum(nx, ny) + if (STRING_ELT(how, 0)==mkChar("left")) { + ans_n = nx; how2=left; + } else if (STRING_ELT(how, 0)==mkChar("inner")) { + ans_n = MIN(nx, ny); how2=inner; + } else if (STRING_ELT(how, 0)==mkChar("full")) { + ans_n = nx+ny; how2=full; + } else error("other how not yet implemented"); bool *match = (bool *)R_alloc(0, sizeof(bool)); // not sure if we need this int matchn = 0; int *starts_x = (int *)R_alloc(0, sizeof(int)); int *lens_x = (int *)R_alloc(0, sizeof(int)); - int *starts_y = (int *)R_alloc(nx, sizeof(int)); - int *lens_y = (int *)R_alloc(nx, sizeof(int)); - for (int i=0; i b ? a : b; } +static inline int min_idx(int *x, int x_idx, int *y, int y_idx) { return x[x_idx] < y[y_idx] ? x_idx : y_idx; } +static inline int max_idx(int *x, int x_idx, int *y, int y_idx) { return x[x_idx] > y[y_idx] ? x_idx : y_idx; }*/ +static int max_i_match(int *y, int ny, int *y_o, bool y_ord, int val) { // those need to be binary search obviously + for (int i=0; i val: %d > %d\n", i, y[y_o[i]-1], val); + if (y[y_o[i]-1]>val) { + if (i==0) error("i=0 max_i_match, unhandled"); + return(i-1); + } + } + error("not yet handled in max_i_match"); +} +static int min_i_match(int *y, int ny, int *y_o, bool y_ord, int val) { + for (int i=ny-1; i y_j) j++; + } + } else if (how==left && unq_y) { + error("dev"); + while (is y_j) j++; + } + } else if (how==left && unq_x) { + error("dev"); + while (i y_j) js++; + } + } else if (how==left) { + double t_batch = omp_get_wtime(); + //int min_i = 0, max_i = x_starts[nx_starts-1]-1; + //Rprintf("x[i]=[%d:%d]\n", min_i, max_i); + //int nth = getDTthreads(); + int nBatch=1; + //int b1_min_i = min_i, b1_max_i = max_i; + //int b1_min_i = min_i, b1_max_i = x_starts[nx_starts/2]-1; + //int b2_min_i = x_starts[nx_starts/2+1]-1, b2_max_i = max_i; + //Rprintf("x: 1[%d-%d]\n", b1_min_i, b1_max_i); + //Rprintf("x: 2[%d-%d]\n", b2_min_i, b2_max_i); + t_batch = omp_get_wtime() - t_batch; + + //int min_y = y[y_starts[0]-1]; + //int max_y = y[y_starts[nx_starts-1]-1]; + //Rprintf("min_x=%d; max_x=%d; min_y=%d, max_y=%d\n", min_x, max_x, min_y, max_y); + int *th = (int *)R_alloc(nBatch, sizeof(int)); // debug thread utilization + //bool skip=false; + t_join = omp_get_wtime(); + //#pragma omp parallel for schedule(dynamic) num_threads(getDTthreads()) + for (int b=0; b y_j) js++; + } + /*for (is=0; is y_j) { + while(x[x_starts[is+1]-1] < ) + if (++js>=ny_starts) {skip=true; continue} + } + }*/ + th[b] = omp_get_thread_num(); + } + t_join = omp_get_wtime() - t_join; + for (int b=0; b y_j) js++; + }*/ + } else if (how==inner && unq_x && unq_y) { + error("dev"); + } else if (how==inner) { + error("not yet implemented"); + } else if (how==full && unq_x && unq_y) { + error("dev"); + } else if (how==full) { + /* + * + */ + error("not yet implemented"); + } + } else { + error("input must be ordered!"); + if (false) { + // find bucket ranges + int min_x_i = x_o[0]-1, max_x_i = x_o[nx-1]-1; + int min_y_i = y_o[0]-1, max_y_i = y_o[ny-1]-1; + //int min_x = x[min_x_i], max_x = x[max_x_i]; + //int min_y = y[min_y_i], max_y = y[max_y_i]; + + //bool trim_min_x = min_x < min_y; + //bool trim_min_y = min_x > min_y; + //bool trim_max_x = max_x > max_y; + //bool trim_max_y = max_x < max_y; + // todo trim buckets to join-able subsets of x and y + + //Rprintf("min_x=%d; max_x=%d; min_y=%d, max_y=%d\n", min_x, max_x, min_y, max_y); + //int min_xy = imin(min_x, min_y), max_xy = imin(max_x, max_y); + //if (max_xy - min_xy < 2) error("not handled yet"); + + int n_buckets = 2; + if (n_buckets != 2) error("not handled yet"); + // this needs to be done for every bucket + if (y_ord) error("y must not be sorted"); + + int b1_max_x_o_i = max_i_match(x, nx, x_o, x_ord, x[x_o[nx/2]-1]); + Rprintf("b1_max_x_o_i=%d; x[x_o[b1_max_x_o_i]-1]=%d\n", b1_max_x_o_i, x[x_o[b1_max_x_o_i]-1]); + int b1_min_x_i = min_x_i, b1_max_x_i = x_o[b1_max_x_o_i]-1; // this need to ensure that duplicate goes into same bucket!! fjoin(c(1:3,3L,3L,5L,4L), y), 3 goes into both buckets! + int b2_min_x_i = x_o[b1_max_x_o_i+1]-1, b2_max_x_i = max_x_i; + if (x[b1_max_x_i]==x[b2_min_x_i]) error("value of x=%d fits into multiple buckets", x[b2_min_x_i]); + Rprintf("min(x)=%d; max(x)=%d; len(x)=%d\n", x[min_x_i], x[max_x_i], nx); + Rprintf("b1(min(x))=%d; b1(max(x))=%d; b1(len(x))=%d\n", x[b1_min_x_i], x[b1_max_x_i], b1_max_x_o_i-0); + Rprintf("b2(min(x))=%d; b2(max(x))=%d; b2(len(x))=%d\n", x[b2_min_x_i], x[b2_max_x_i], nx-b1_max_x_o_i+1); + error("dev"); + int b1_max_y_o_i = max_i_match(y, ny, y_o, y_ord, /*value to match to*/x[b1_max_x_i]); + int b1_min_y_i = min_y_i, b1_max_y_i = y_o[b1_max_y_o_i]-1; + int b2_min_y_o_i = min_i_match(y, ny, y_o, y_ord, x[b2_min_x_i]); + int b2_min_y_i = y_o[b2_min_y_o_i]-1, b2_max_y_i = max_y_i; + Rprintf("b1_min_x_i=%d, b1_max_x_i=%d, b2_min_x_i=%d, b2_max_x_i=%d\n", b1_min_x_i, b1_max_x_i, b2_min_x_i, b2_max_x_i); + Rprintf("buckets: 1(x:%d-%d, y:%d-%d); 2(x:%d-%d, y:%d-%d)\n", x[b1_min_x_i], x[b1_max_x_i], y[b1_min_y_i], y[b1_max_y_i], x[b2_min_x_i], x[b2_max_x_i], y[b2_min_y_i], y[b2_max_y_i]); + // openmp here!! + for (int b=0; b Date: Sat, 6 Jun 2020 12:39:20 +0100 Subject: [PATCH 06/34] cleanup --- src/fjoin.c | 524 ++++++++-------------------------------------------- 1 file changed, 74 insertions(+), 450 deletions(-) diff --git a/src/fjoin.c b/src/fjoin.c index 6d7b5dff11..47b4f1cf8e 100644 --- a/src/fjoin.c +++ b/src/fjoin.c @@ -1,310 +1,11 @@ #include "data.table.h" /* - * as of now these are optimized routines for joining - * - single integer columns - * - sorted (experimental) or indexed - * - * TODO: - * type of join is pushed down here so we compute only what is necessary - * mult!="all" and "unique-key" branches escape extra computation - */ - -typedef enum { - inner, left, right, full, semi, anti, cross -} joinhow; - -SEXP joinOut2(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y) { - SEXP ans = PROTECT(allocVector(VECSXP, 4)), ansnames; - setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 4)); - SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, 0)); - SET_STRING_ELT(ansnames, 0, mkChar("starts_x")); - //int *starts_x_p = INTEGER(VECTOR_ELT(ans, 0)); - SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, 0)); - SET_STRING_ELT(ansnames, 1, mkChar("lens_x")); - //int *lens_x_p = INTEGER(VECTOR_ELT(ans, 1)); - SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, matchn)); - SET_STRING_ELT(ansnames, 2, mkChar("starts")); - int *starts_y_p = INTEGER(VECTOR_ELT(ans, 2)); - SET_VECTOR_ELT(ans, 3, allocVector(INTSXP, matchn)); - SET_STRING_ELT(ansnames, 3, mkChar("lens")); - int *lens_y_p = INTEGER(VECTOR_ELT(ans, 3)); - for (int i=0; iy[j]) { - Rprintf("x[%d]>y[%d]\n", i, j); - j++; - continue; - } - } - } else { - error("not yet implemented"); - } - matchn[0] = imatch; -} -SEXP sjoinR(SEXP x, SEXP y) { - if (!isInteger(x) || !isInteger(y)) - error("must be integer"); - int nx = LENGTH(x), ny = LENGTH(y); - // left: nx - // right: ny - // inner: min(nx, ny) - // full: max(nx, ny) - double t_alloc = omp_get_wtime(); - bool *match = (bool *)R_alloc(nx, sizeof(bool)); // not sure if we need this - int *matchi = (int *)R_alloc(nx, sizeof(int)); - int *matchlens = (int *)R_alloc(nx, sizeof(int)); - int *matchstarts = (int *)R_alloc(nx, sizeof(int)); - int matchn = 0; - t_alloc = omp_get_wtime() - t_alloc; - double t_sjoin = omp_get_wtime(); - sjoin(INTEGER(x), nx, INTEGER(y), ny, left, match, matchi, matchstarts, matchlens, &matchn); - t_sjoin = omp_get_wtime() - t_sjoin; - Rprintf("alloc took %.3fs; sjoin took %.3fs\n", t_alloc, t_sjoin); - return joinOut(matchn, matchi, matchstarts, matchlens); -} -*/ - -/* - * fast indexed join * sort-merge join * - * split whole 1:nx_starts into numbers of buckets equal to threads - * each thread do binary merge x of first and last element, and then sort-merge to a subset of y defined by those matches + * split whole 1:nx_starts into numbers of buckets equal to threads*2 + * each thread do binary merge x of first and last element on y, and then sort-merge to a subset of y defined by those matches */ -// while loop approach returning results consistent to bmerge -void ijoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts, - int *y, int ny, int *y_o, bool y_ord, int *y_starts, int ny_starts, - joinhow how, - bool *match, int *matchn, - int *starts_x, int *lens_x, - int *starts_y, int *lens_y) { - //if (how != left) - // error("only left join implemented so far"); - bool unq_x = nx_starts==nx, unq_y = ny_starts==ny; - //int imatch = 0; - - int *x_lens=0, *y_lens=0; - if (!unq_x) { - x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395 - for (int i=0; i y_j) j++; - } - } else if (how==left && unq_y) { - while (is y_j) j++; - } - } else if (how==left && unq_x) { - while (i y_j) js++; - } - } else if (how==left) { - while (is y_j) js++; - } - } else if (how==inner && unq_x && unq_y) { - error("dev"); - } else if (how==inner) { - error("not yet implemented"); - } else if (how==full && unq_x && unq_y) { - error("dev"); - } else if (how==full) { - error("not yet implemented"); - } - } else if (x_ord) { - if (how==left && unq_x && unq_y) { - while (i y_j) j++; - } - } else if (how==left && unq_y) { - while (is y_j) j++; - } - } else if (how==left && unq_x) { - while (i y_j) js++; - } - } else if (how==left) { - while (is y_j) js++; - } - } - } else if (y_ord) { - error("y_ord not yet implemented"); - } else { - if (how==left && unq_x && unq_y) { - while (i y_j) j++; - } - } else { - error("dev: !x_ord && !y_ord && (!unq_x || !unq_y)"); - } - //Rprintf("x_i=%d; y[y_o[j]-1]= y[y_o[%d]-1]= y[%d-1]= y[%d]= %d\n", x[i], j, y_o[j], y_o[j]-1, y[y_o[j]-1]); - } - matchn[0] = nx; -} - -SEXP ijoinR(SEXP x, SEXP y, SEXP how) { - if (!isInteger(x) || !isInteger(y)) - error("must be integer"); - int nx = LENGTH(x), ny = LENGTH(y); - double t_index = omp_get_wtime(); - SEXP x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); - SEXP y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); - SEXP x_starts = getAttrib(x_idx, sym_starts); - SEXP y_starts = getAttrib(y_idx, sym_starts); - t_index = omp_get_wtime() - t_index; - double t_alloc = omp_get_wtime(); - int ans_n = 0; - joinhow how2; - // left: nx - // right: ny - // inner: min(nx, ny) - // full: sum(nx, ny) - if (STRING_ELT(how, 0)==mkChar("left")) { - ans_n = nx; how2=left; - } else if (STRING_ELT(how, 0)==mkChar("inner")) { - ans_n = MIN(nx, ny); how2=inner; - } else if (STRING_ELT(how, 0)==mkChar("full")) { - ans_n = nx+ny; how2=full; - } else error("other how not yet implemented"); - bool *match = (bool *)R_alloc(0, sizeof(bool)); // not sure if we need this - int matchn = 0; - int *starts_x = (int *)R_alloc(0, sizeof(int)); - int *lens_x = (int *)R_alloc(0, sizeof(int)); - int *starts_y = (int *)R_alloc(ans_n, sizeof(int)); - int *lens_y = (int *)R_alloc(ans_n, sizeof(int)); - for (int i=0; i b ? a : b; } -static inline int min_idx(int *x, int x_idx, int *y, int y_idx) { return x[x_idx] < y[y_idx] ? x_idx : y_idx; } -static inline int max_idx(int *x, int x_idx, int *y, int y_idx) { return x[x_idx] > y[y_idx] ? x_idx : y_idx; }*/ static int max_i_match(int *y, int ny, int *y_o, bool y_ord, int val) { // those need to be binary search obviously for (int i=0; i val: %d > %d\n", i, y[y_o[i]-1], val); @@ -325,6 +26,16 @@ static int min_i_match(int *y, int ny, int *y_o, bool y_ord, int val) { error("not yet handled in min_i_match"); } +int uniqueN(int *x, int nx) { // x have 0:(nx-1) elements + int ans=0; + uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t)); + memset(seen, 0, nx*sizeof(uint8_t)); + for (int i=0; i y_j) j++; - } - } else if (how==left && unq_y) { - error("dev"); - while (is y_j) j++; - } - } else if (how==left && unq_x) { - error("dev"); - while (i y_j) js++; - } - } else if (how==left) { - double t_batch = omp_get_wtime(); - //int min_i = 0, max_i = x_starts[nx_starts-1]-1; - //Rprintf("x[i]=[%d:%d]\n", min_i, max_i); - //int nth = getDTthreads(); - int nBatch=1; - //int b1_min_i = min_i, b1_max_i = max_i; - //int b1_min_i = min_i, b1_max_i = x_starts[nx_starts/2]-1; - //int b2_min_i = x_starts[nx_starts/2+1]-1, b2_max_i = max_i; - //Rprintf("x: 1[%d-%d]\n", b1_min_i, b1_max_i); - //Rprintf("x: 2[%d-%d]\n", b2_min_i, b2_max_i); - t_batch = omp_get_wtime() - t_batch; - - //int min_y = y[y_starts[0]-1]; - //int max_y = y[y_starts[nx_starts-1]-1]; - //Rprintf("min_x=%d; max_x=%d; min_y=%d, max_y=%d\n", min_x, max_x, min_y, max_y); - int *th = (int *)R_alloc(nBatch, sizeof(int)); // debug thread utilization - //bool skip=false; - t_join = omp_get_wtime(); - //#pragma omp parallel for schedule(dynamic) num_threads(getDTthreads()) - for (int b=0; b y_j) js++; + double t_batch = omp_get_wtime(); + //int min_i = 0, max_i = x_starts[nx_starts-1]-1; + //Rprintf("x[i]=[%d:%d]\n", min_i, max_i); + //int nth = getDTthreads(); + //int nBatch=nth*2; + int nBatch=1; + //int b1_min_i = min_i, b1_max_i = max_i; + //int b1_min_i = min_i, b1_max_i = x_starts[nx_starts/2]-1; + //int b2_min_i = x_starts[nx_starts/2+1]-1, b2_max_i = max_i; + //Rprintf("x: 1[%d-%d]\n", b1_min_i, b1_max_i); + //Rprintf("x: 2[%d-%d]\n", b2_min_i, b2_max_i); + t_batch = omp_get_wtime() - t_batch; + + //int min_y = y[y_starts[0]-1]; + //int max_y = y[y_starts[nx_starts-1]-1]; + //Rprintf("min_x=%d; max_x=%d; min_y=%d, max_y=%d\n", min_x, max_x, min_y, max_y); + int *th = (int *)R_alloc(nBatch, sizeof(int)); // debug thread utilization + //bool skip=false; + double t_join = omp_get_wtime(); + //#pragma omp parallel for schedule(dynamic) num_threads(getDTthreads()) + for (int b=0; b y_j) { - while(x[x_starts[is+1]-1] < ) - if (++js>=ny_starts) {skip=true; continue} - } - }*/ - th[b] = omp_get_thread_num(); - } - t_join = omp_get_wtime() - t_join; - for (int b=0; b y_j) js++; - }*/ - } else if (how==inner && unq_x && unq_y) { - error("dev"); - } else if (how==inner) { - error("not yet implemented"); - } else if (how==full && unq_x && unq_y) { - error("dev"); - } else if (how==full) { - /* - * - */ - error("not yet implemented"); + is++; + } else if (x_i < y_j) is++; else if (x_i > y_j) js++; } - } else { + th[b] = omp_get_thread_num(); + } + t_join = omp_get_wtime() - t_join; + //for (int b=0; b Date: Sat, 6 Jun 2020 13:24:09 +0100 Subject: [PATCH 07/34] prepare for batching --- src/fjoin.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/src/fjoin.c b/src/fjoin.c index 47b4f1cf8e..6971f7de2d 100644 --- a/src/fjoin.c +++ b/src/fjoin.c @@ -26,12 +26,13 @@ static int min_i_match(int *y, int ny, int *y_o, bool y_ord, int val) { error("not yet handled in min_i_match"); } -int uniqueN(int *x, int nx) { // x have 0:(nx-1) elements - int ans=0; +// helper to count how many threads were used +int unqN(int *x, int nx) { // x have 0:(nx-1) values + int ans = 0; uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t)); memset(seen, 0, nx*sizeof(uint8_t)); for (int i=0; i y_j) js++; } + /*while (is y_j) js++; + }*/ th[b] = omp_get_thread_num(); } t_join = omp_get_wtime() - t_join; @@ -176,7 +193,7 @@ void fjoinC(int *x, int nx, int *x_starts, int nx_starts, } //Rprintf("x_i=%d; y[y_o[j]-1]= y[y_o[%d]-1]= y[%d-1]= y[%d]= %d\n", x[i], j, y_o[j], y_o[j]-1, y[y_o[j]-1]); } - Rprintf("fjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %d th\n", t_lens, t_batch, t_join, uniqueN(th, nBatch)); + Rprintf("fjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %d th\n", t_lens, t_batch, t_join, unqN(th, nBatch)); matchn[0] = nx; } From ffb521aec9079ae12e4ce9621f9ccc336a5ed9d6 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 6 Jun 2020 19:02:11 +0100 Subject: [PATCH 08/34] finally parallel --- inst/tests/fjoin.Rraw | 36 ++----- src/fjoin.c | 212 ++++++++++++++++-------------------------- 2 files changed, 90 insertions(+), 158 deletions(-) diff --git a/inst/tests/fjoin.Rraw b/inst/tests/fjoin.Rraw index cfdf0d0beb..1537cdd988 100644 --- a/inst/tests/fjoin.Rraw +++ b/inst/tests/fjoin.Rraw @@ -10,22 +10,6 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { bmerge = data.table:::bmerge forderv = data.table:::forderv } -check = function(x, y) { - a = d(x, y) - b = fjoin(x, y) - na = is.na(a[["starts"]]) - fail = !identical(a[["starts"]][!na], b[["starts_y"]][!na]) - fail = fail || !identical(a[["lens"]][!na], b[["lens_y"]][!na]) - if (fail) { - if (interactive() && length(x) < 20L) { - out = capture.output(print(list(d=a, fj=b))) - stop("starts are not equal: ", out) - } - stop("starts are not equal") - } - invisible(TRUE) -} - d = function(x, y) bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)[1:2] @@ -74,7 +58,7 @@ ssa = function(unq_n, size, sort=FALSE) { if (unq_n==size) stopifnot(uniqueN(ans)==length(ans)) if (sort) sort(ans) else ans } -N = 1e2 +N = 1e2L x = ssa(N, N, sort=TRUE) y = ssa(N, N, sort=TRUE) test(11.01, fjoin(x, y), d(x, y)) @@ -106,12 +90,12 @@ test(12.04, fjoin(x, y), d(x, y)) x = ssa(N, N) y = ssa(N, N) test(14.01, fjoin(x, y), d(x, y)) -#x = ssa(N, N*1.1) -#y = ssa(N, N) -#test(14.02, fjoin(x, y), d(x, y)) -#x = ssa(N, N) -#y = ssa(N, N*1.1) -#test(14.03, fjoin(x, y), d(x, y)) -#x = ssa(N, N*1.1) -#y = ssa(N, N*1.1) -#test(14.04, fjoin(x, y), d(x, y)) +x = ssa(N, N*1.1) +y = ssa(N, N) +test(14.02, fjoin(x, y), d(x, y)) +x = ssa(N, N) +y = ssa(N, N*1.1) +test(14.03, fjoin(x, y), d(x, y)) +x = ssa(N, N*1.1) +y = ssa(N, N*1.1) +test(14.04, fjoin(x, y), d(x, y)) diff --git a/src/fjoin.c b/src/fjoin.c index 6971f7de2d..1b3627259f 100644 --- a/src/fjoin.c +++ b/src/fjoin.c @@ -6,24 +6,23 @@ * split whole 1:nx_starts into numbers of buckets equal to threads*2 * each thread do binary merge x of first and last element on y, and then sort-merge to a subset of y defined by those matches */ -static int max_i_match(int *y, int ny, int *y_o, bool y_ord, int val) { // those need to be binary search obviously - for (int i=0; i val: %d > %d\n", i, y[y_o[i]-1], val); - if (y[y_o[i]-1]>val) { - if (i==0) error("i=0 max_i_match, unhandled"); - return(i-1); - } +static int max_i_match(int *y, int *y_starts, int ny_starts, int val) { // those need to be binary search obviously + int i; + for (i=0; i= val: %d >= %d\n", i, y[y_starts[i]-1], val); + if (y[y_starts[i]-1]==val) return(i); + else if (y[y_starts[i]-1]>val) return(i-1); } - error("not yet handled in max_i_match"); + return(i-1); } -static int min_i_match(int *y, int ny, int *y_o, bool y_ord, int val) { - for (int i=ny-1; i=0; --i) { + //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val); + if (y[y_starts[i]-1]==val) return(i); + else if (y[y_starts[i]-1] y_j) js++; + } + return; +} + void fjoinC(int *x, int nx, int *x_starts, int nx_starts, int *y, int ny, int *y_starts, int ny_starts, //joinhow how, bool *match, int *matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y) { - bool x_ord=true, y_ord=true; - int *x_o=0, *y_o=0; - bool unq_x = nx_starts==nx, unq_y = ny_starts==ny; int *x_lens=0, *y_lens=0; double t_lens = omp_get_wtime(); - if (!unq_x || true) { + if (!unq_x || true) { // else true as we dont yet have branches for unq cases x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395 grpLens(x_starts, nx_starts, nx, x_lens); //for (int i=0; i y_j) js++; + if (b==0) { + xs = b1_xs; nxs = b1_nxs; ys = b1_ys; nys = b1_nys, xl = b1_xl, yl = b1_yl; + } else if (b==1) { + xs = b2_xs; nxs = b2_nxs; ys = b2_ys; nys = b2_nys, xl = b2_xl, yl = b2_yl; } - /*while (is y_j) js++; - }*/ + fjoin1(xs, xl, nxs, ys, yl, nys, /*batch specific input: shifted pointers and size*/ + x, y, /*common input*/ + starts_y, lens_y); /*common output*/ th[b] = omp_get_thread_num(); } + for (int b=0; b min_y; - //bool trim_max_x = max_x > max_y; - //bool trim_max_y = max_x < max_y; - // todo trim buckets to join-able subsets of x and y - - //Rprintf("min_x=%d; max_x=%d; min_y=%d, max_y=%d\n", min_x, max_x, min_y, max_y); - //int min_xy = imin(min_x, min_y), max_xy = imin(max_x, max_y); - //if (max_xy - min_xy < 2) error("not handled yet"); - - int n_buckets = 2; - if (n_buckets != 2) error("not handled yet"); - // this needs to be done for every bucket - if (y_ord) error("y must not be sorted"); - - int b1_max_x_o_i = max_i_match(x, nx, x_o, x_ord, x[x_o[nx/2]-1]); - Rprintf("b1_max_x_o_i=%d; x[x_o[b1_max_x_o_i]-1]=%d\n", b1_max_x_o_i, x[x_o[b1_max_x_o_i]-1]); - int b1_min_x_i = min_x_i, b1_max_x_i = x_o[b1_max_x_o_i]-1; // this need to ensure that duplicate goes into same bucket!! fjoin(c(1:3,3L,3L,5L,4L), y), 3 goes into both buckets! - int b2_min_x_i = x_o[b1_max_x_o_i+1]-1, b2_max_x_i = max_x_i; - if (x[b1_max_x_i]==x[b2_min_x_i]) error("value of x=%d fits into multiple buckets", x[b2_min_x_i]); - Rprintf("min(x)=%d; max(x)=%d; len(x)=%d\n", x[min_x_i], x[max_x_i], nx); - Rprintf("b1(min(x))=%d; b1(max(x))=%d; b1(len(x))=%d\n", x[b1_min_x_i], x[b1_max_x_i], b1_max_x_o_i-0); - Rprintf("b2(min(x))=%d; b2(max(x))=%d; b2(len(x))=%d\n", x[b2_min_x_i], x[b2_max_x_i], nx-b1_max_x_o_i+1); - error("dev"); - int b1_max_y_o_i = max_i_match(y, ny, y_o, y_ord, /*value to match to*/x[b1_max_x_i]); - int b1_min_y_i = min_y_i, b1_max_y_i = y_o[b1_max_y_o_i]-1; - int b2_min_y_o_i = min_i_match(y, ny, y_o, y_ord, x[b2_min_x_i]); - int b2_min_y_i = y_o[b2_min_y_o_i]-1, b2_max_y_i = max_y_i; - Rprintf("b1_min_x_i=%d, b1_max_x_i=%d, b2_min_x_i=%d, b2_max_x_i=%d\n", b1_min_x_i, b1_max_x_i, b2_min_x_i, b2_max_x_i); - Rprintf("buckets: 1(x:%d-%d, y:%d-%d); 2(x:%d-%d, y:%d-%d)\n", x[b1_min_x_i], x[b1_max_x_i], y[b1_min_y_i], y[b1_max_y_i], x[b2_min_x_i], x[b2_max_x_i], y[b2_min_y_i], y[b2_max_y_i]); - // openmp here!! - for (int b=0; b Date: Sat, 6 Jun 2020 21:52:08 +0100 Subject: [PATCH 09/34] parallel --- R/wrappers.R | 4 +- inst/tests/{fjoin.Rraw => smjoin.Rraw} | 44 +++---- src/data.table.h | 4 +- src/init.c | 5 +- src/{fjoin.c => smjoin.c} | 152 ++++++++++++++----------- 5 files changed, 115 insertions(+), 94 deletions(-) rename inst/tests/{fjoin.Rraw => smjoin.Rraw} (70%) rename src/{fjoin.c => smjoin.c} (63%) diff --git a/R/wrappers.R b/R/wrappers.R index 6b9da0d799..29fc5da8d1 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -13,8 +13,8 @@ coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) -fjoin = function(x, y, how=c("left","inner","full")) { - ans = .Call(CfjoinR, x, y, match.arg(how)) +smjoin = function(x, y, how=c("left","inner","full")) { + ans = .Call(CsmjoinR, x, y, match.arg(how)) ans[[4L]][is.na(ans[[3L]])] = 1L ans[3:4] } diff --git a/inst/tests/fjoin.Rraw b/inst/tests/smjoin.Rraw similarity index 70% rename from inst/tests/fjoin.Rraw rename to inst/tests/smjoin.Rraw index 1537cdd988..9bc193264b 100644 --- a/inst/tests/fjoin.Rraw +++ b/inst/tests/smjoin.Rraw @@ -6,7 +6,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { } else { require(data.table) test = data.table:::test - fjoin = data.table:::fjoin + smjoin = data.table:::smjoin bmerge = data.table:::bmerge forderv = data.table:::forderv } @@ -16,37 +16,37 @@ d = function(x, y) bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, roll ## x y sorted x = c(1L,2L,3L,4L) y = c(2L,3L,5L) -test(1.01, fjoin(x, y), d(x, y)) +test(1.01, smjoin(x, y), d(x, y)) x = c(1L,2L,3L,3L,4L) y = c(2L,3L,5L) -test(1.02, fjoin(x, y), d(x, y)) +test(1.02, smjoin(x, y), d(x, y)) x = c(1L,2L,3L,4L) y = c(2L,3L,3L,5L) -test(1.03, fjoin(x, y), d(x, y)) +test(1.03, smjoin(x, y), d(x, y)) x = c(1L,2L,3L,3L,4L) y = c(2L,3L,3L,5L) -test(1.04, fjoin(x, y), d(x, y)) +test(1.04, smjoin(x, y), d(x, y)) ## y unsorted x = c(1L,2L,3L,4L) y = c(2L,5L,3L) -test(2.01, fjoin(x, y), d(x, y)) +test(2.01, smjoin(x, y), d(x, y)) x = c(1L,2L,3L,3L,4L) y = c(3L,2L,5L) -test(2.02, fjoin(x, y), d(x, y)) +test(2.02, smjoin(x, y), d(x, y)) x = c(1L,2L,3L,4L) y = c(5L,3L,2L,3L) -test(2.03, fjoin(x, y), d(x, y)) +test(2.03, smjoin(x, y), d(x, y)) x = c(1L,2L,3L,3L,4L) y = c(5L,3L,3L,2L) -test(2.04, fjoin(x, y), d(x, y)) +test(2.04, smjoin(x, y), d(x, y)) #... ## xy unsorted x = c(4L,1L,3L,2L) y = c(2L,5L,3L) -test(4.01, fjoin(x, y), d(x, y)) +test(4.01, smjoin(x, y), d(x, y)) #... @@ -61,41 +61,41 @@ ssa = function(unq_n, size, sort=FALSE) { N = 1e2L x = ssa(N, N, sort=TRUE) y = ssa(N, N, sort=TRUE) -test(11.01, fjoin(x, y), d(x, y)) +test(11.01, smjoin(x, y), d(x, y)) x = ssa(N, N*1.1, sort=TRUE) y = ssa(N, N, sort=TRUE) -test(11.02, fjoin(x, y), d(x, y)) +test(11.02, smjoin(x, y), d(x, y)) x = ssa(N, N, sort=TRUE) y = ssa(N, N*1.1, sort=TRUE) -test(11.03, fjoin(x, y), d(x, y)) +test(11.03, smjoin(x, y), d(x, y)) x = ssa(N, N*1.1, sort=TRUE) y = ssa(N, N*1.1, sort=TRUE) -test(11.04, fjoin(x, y), d(x, y)) +test(11.04, smjoin(x, y), d(x, y)) ## y unsorted x = ssa(N, N, sort=TRUE) y = ssa(N, N) -test(12.01, fjoin(x, y), d(x, y)) +test(12.01, smjoin(x, y), d(x, y)) x = ssa(N, N*1.1, sort=TRUE) y = ssa(N, N) -test(12.02, fjoin(x, y), d(x, y)) +test(12.02, smjoin(x, y), d(x, y)) x = ssa(N, N, sort=TRUE) y = ssa(N, N*1.1) -test(12.03, fjoin(x, y), d(x, y)) +test(12.03, smjoin(x, y), d(x, y)) x = ssa(N, N*1.1, sort=TRUE) y = ssa(N, N*1.1) -test(12.04, fjoin(x, y), d(x, y)) +test(12.04, smjoin(x, y), d(x, y)) ## xy unsorted x = ssa(N, N) y = ssa(N, N) -test(14.01, fjoin(x, y), d(x, y)) +test(14.01, smjoin(x, y), d(x, y)) x = ssa(N, N*1.1) y = ssa(N, N) -test(14.02, fjoin(x, y), d(x, y)) +test(14.02, smjoin(x, y), d(x, y)) x = ssa(N, N) y = ssa(N, N*1.1) -test(14.03, fjoin(x, y), d(x, y)) +test(14.03, smjoin(x, y), d(x, y)) x = ssa(N, N*1.1) y = ssa(N, N*1.1) -test(14.04, fjoin(x, y), d(x, y)) +test(14.04, smjoin(x, y), d(x, y)) diff --git a/src/data.table.h b/src/data.table.h index abd993a592..fe8091e382 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -244,5 +244,5 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k); SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); -// fjoin.c -SEXP fjoinR(SEXP x, SEXP y, SEXP how); +// smjoin.c +SEXP smjoinR(SEXP x, SEXP y, SEXP how); diff --git a/src/init.c b/src/init.c index 9f7f45ee8c..731f0b765b 100644 --- a/src/init.c +++ b/src/init.c @@ -119,8 +119,7 @@ SEXP lock(); SEXP unlock(); SEXP islockedR(); SEXP allNAR(); -//SEXP sjoinR(); -SEXP fjoinR(); +SEXP smjoinR(); // .Externals SEXP fastmean(); @@ -213,7 +212,7 @@ R_CallMethodDef callMethods[] = { {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1}, {"CtestMsgR", (DL_FUNC) &testMsgR, -1}, {"C_allNAR", (DL_FUNC) &allNAR, -1}, -{"CfjoinR", (DL_FUNC) &fjoinR, -1}, +{"CsmjoinR", (DL_FUNC) &smjoinR, -1}, {NULL, NULL, 0} }; diff --git a/src/fjoin.c b/src/smjoin.c similarity index 63% rename from src/fjoin.c rename to src/smjoin.c index 1b3627259f..222b7f1af1 100644 --- a/src/fjoin.c +++ b/src/smjoin.c @@ -3,10 +3,11 @@ /* * sort-merge join * - * split whole 1:nx_starts into numbers of buckets equal to threads*2 - * each thread do binary merge x of first and last element on y, and then sort-merge to a subset of y defined by those matches + * split whole 1:nx_starts into batches and do sort-merge join on each batch separately */ -static int max_i_match(int *y, int *y_starts, int ny_starts, int val) { // those need to be binary search obviously + +// those helpers need to be binary search! last matching, and first matching index +static int max_i_match(int *y, int *y_starts, int ny_starts, int val) { int i; for (i=0; i= val: %d >= %d\n", i, y[y_starts[i]-1], val); @@ -36,43 +37,42 @@ int unqN(int *x, int nx) { // x have 0:(nx-1) values return ans; } -// materialize counts of duplicated entries -void grpLens(int *starts, int n_starts, int n, int *lens) { // #4395 +// count of duplicated entries +void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395 for (int i=0; i y_j) js++; + } + else if (x_i < y_j) is++; + else if (x_i > y_j) js++; } return; } -void fjoinC(int *x, int nx, int *x_starts, int nx_starts, - int *y, int ny, int *y_starts, int ny_starts, - //joinhow how, - bool *match, int *matchn, - int *starts_x, int *lens_x, - int *starts_y, int *lens_y) { +void smjoinC(int *x, int nx, int *x_starts, int nx_starts, + int *y, int ny, int *y_starts, int ny_starts, + bool *match, int *matchn, + int *starts_x, int *lens_x, + int *starts_y, int *lens_y) { bool unq_x = nx_starts==nx, unq_y = ny_starts==ny; int *x_lens=0, *y_lens=0; + double t_lens = omp_get_wtime(); if (!unq_x || true) { // else true as we dont yet have branches for unq cases x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395 @@ -84,33 +84,51 @@ void fjoinC(int *x, int nx, int *x_starts, int nx_starts, grpLens(y_starts, ny_starts, ny, y_lens); } t_lens = omp_get_wtime() - t_lens; - + double t_batch = omp_get_wtime(); - /* - int min_i = 0, max_i = x_starts[nx_starts-1]-1; - Rprintf("x[i]=[%d:%d]\n", min_i, max_i); - int b1_min_i = min_i, b1_max_i = max_i; - int b1_min_i = min_i, b1_max_i = x_starts[nx_starts/2]-1; - int b2_min_i = x_starts[nx_starts/2+1]-1, b2_max_i = max_i; - Rprintf("x: 1[%d-%d]\n", b1_min_i, b1_max_i); - Rprintf("x: 2[%d-%d]\n", b2_min_i, b2_max_i); - */ - int nth = 2;//getDTthreads(); - //int nBatch=nth*2; - int nBatch=2; - - int *xs, *ys, nxs, nys, *xl, *yl; - /* - xs = x_starts + 0*sizeof(int); - nxs = nx_starts; - ys = y_starts + 0*sizeof(int); - nys = ny_starts; - */ - int b1_nxs = nx_starts/2, b2_nxs = nx_starts-b1_nxs; + int nth = getDTthreads(); + int nBatch = nth*2; + int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used + + size_t batchSize = (nx_starts-1)/nBatch + 1; + size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; + //Rprintf("batchSize=%d; lastBatchSize=%d\n", batchSize, lastBatchSize); + int *nxsB = (int *)R_alloc(nBatch, sizeof(int)); + int **xsB = (int **)R_alloc(nBatch, sizeof(int*)); + int **xlB = (int **)R_alloc(nBatch, sizeof(int*)); + int *nysB = (int *)R_alloc(nBatch, sizeof(int)); + int **ysB = (int **)R_alloc(nBatch, sizeof(int*)); + int **ylB = (int **)R_alloc(nBatch, sizeof(int*)); + for (int b=0; b Date: Sat, 6 Jun 2020 22:52:34 +0100 Subject: [PATCH 10/34] cleanup and speedup --- R/wrappers.R | 8 +- src/data.table.h | 2 +- src/smjoin.c | 194 ++++++++++++++++++----------------------------- 3 files changed, 78 insertions(+), 126 deletions(-) diff --git a/R/wrappers.R b/R/wrappers.R index 29fc5da8d1..f8d51219c5 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -13,8 +13,8 @@ coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) -smjoin = function(x, y, how=c("left","inner","full")) { - ans = .Call(CsmjoinR, x, y, match.arg(how)) - ans[[4L]][is.na(ans[[3L]])] = 1L - ans[3:4] +smjoin = function(x, y) { + ans = .Call(CsmjoinR, x, y, NULL, NULL) + ans[["lens"]][is.na(ans[["starts"]])] = 1L + ans } diff --git a/src/data.table.h b/src/data.table.h index fe8091e382..74d101429d 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -245,4 +245,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); // smjoin.c -SEXP smjoinR(SEXP x, SEXP y, SEXP how); +SEXP smjoinR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx); diff --git a/src/smjoin.c b/src/smjoin.c index 222b7f1af1..1af803a53e 100644 --- a/src/smjoin.c +++ b/src/smjoin.c @@ -3,7 +3,10 @@ /* * sort-merge join * - * split whole 1:nx_starts into batches and do sort-merge join on each batch separately + * join on a single integer column + * sort LHS and RHS + * split LHS into equal batches, RHS into corresponding batches by matching upper-lower bounds of LHS batches + * parallel sort-merge join */ // those helpers need to be binary search! last matching, and first matching index @@ -39,6 +42,7 @@ int unqN(int *x, int nx) { // x have 0:(nx-1) values // count of duplicated entries void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395 + #pragma omp parallel for schedule(static) num_threads(getDTthreads()) for (int i=0; i Date: Sun, 7 Jun 2020 01:22:41 +0100 Subject: [PATCH 11/34] bucketing now uses binary search --- R/wrappers.R | 4 +- inst/tests/smjoin.Rraw | 8 +++ src/smjoin.c | 140 ++++++++++++++++++++++++++++++++--------- 3 files changed, 119 insertions(+), 33 deletions(-) diff --git a/R/wrappers.R b/R/wrappers.R index f8d51219c5..ba43d8cba2 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -13,8 +13,8 @@ coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) -smjoin = function(x, y) { - ans = .Call(CsmjoinR, x, y, NULL, NULL) +smjoin = function(x, y, x.idx=NULL, y.idx=NULL) { + ans = .Call(CsmjoinR, x, y, x.idx, y.idx) ans[["lens"]][is.na(ans[["starts"]])] = 1L ans } diff --git a/inst/tests/smjoin.Rraw b/inst/tests/smjoin.Rraw index 9bc193264b..48a479463c 100644 --- a/inst/tests/smjoin.Rraw +++ b/inst/tests/smjoin.Rraw @@ -99,3 +99,11 @@ test(14.03, smjoin(x, y), d(x, y)) x = ssa(N, N*1.1) y = ssa(N, N*1.1) test(14.04, smjoin(x, y), d(x, y)) + +# sparse +x = sample(2e2L, 1e2L) +y = sample(2e2L, 1e2L) +test(21.01, smjoin(x, y), d(x, y)) +x = sample(2e2L, 1e2L, TRUE) +y = sample(2e2L, 1e2L, TRUE) +test(21.02, smjoin(x, y), d(x, y)) diff --git a/src/smjoin.c b/src/smjoin.c index 1af803a53e..ca80067bdd 100644 --- a/src/smjoin.c +++ b/src/smjoin.c @@ -9,26 +9,6 @@ * parallel sort-merge join */ -// those helpers need to be binary search! last matching, and first matching index -static int max_i_match(int *y, int *y_starts, int ny_starts, int val) { - int i; - for (i=0; i= val: %d >= %d\n", i, y[y_starts[i]-1], val); - if (y[y_starts[i]-1]==val) return(i); - else if (y[y_starts[i]-1]>val) return(i-1); - } - return(i-1); -} -static int min_i_match(int *y, int *y_starts, int ny_starts, int val) { - int i; - for (i=ny_starts-1; i>=0; --i) { - //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val); - if (y[y_starts[i]-1]==val) return(i); - else if (y[y_starts[i]-1]= val: %d >= %d\n", i, y[y_starts[i]-1], val); + if (y[y_starts[i]-1]==val) return(i); + else if (y[y_starts[i]-1]>val) return(i-1); + } + return(i-1); +} +static int min_i_match(int *y, int *y_starts, int ny_starts, int val) { + int i; + for (i=ny_starts-1; i>=0; --i) { + //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val); + if (y[y_starts[i]-1]==val) return(i); + else if (y[y_starts[i]-1] val) { + if (verbose) Rprintf("rollbs: min element %d is bigger than %d\n", x[ix[0]-1], val); + return 0; + } + } else if (side > 0) { // upper bound early stopping + if (x[ix[nix-1]-1] == val) { + if (verbose) Rprintf("rollbs: max elements %d match\n", val); + return nix-1; + } + if (x[ix[0]-1] > val) { + if (verbose) Rprintf("rollbs: min element %d is still bigger than %d\n", x[ix[0]-1], val); + return -1; + } + if (x[ix[nix-1]-1] < val) { + if (verbose) Rprintf("rollbs: max element %d is smaller than %d\n", x[ix[nix-1]-1], val); + return nix-1; + } + } + int lower=0, upper=nix, i=0; + while (lower<=upper) { + i = lower + (upper-lower)/2; + int thisx = x[ix[i]-1]; + //if (verbose) Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val); + if (thisx==val) + return(i); + else if (thisx < val) + lower = i+1; + else if (thisx > val) + upper = i-1; + } + if (verbose) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d\n", i, x[ix[i]-1], lower, upper, side); + if (side < 0) // anyone to stress test this logic? + return lower; + else + return upper; +} + void smjoinC(int *x, int nx, int *x_starts, int nx_starts, int *y, int ny, int *y_starts, int ny_starts, int *matchn, int *starts_y, int *lens_y) { + + bool verbose = true; bool unq_x = nx_starts==nx, unq_y = ny_starts==ny; int *x_lens=0, *y_lens=0; @@ -87,25 +142,45 @@ void smjoinC(int *x, int nx, int *x_starts, int nx_starts, } t_lens = omp_get_wtime() - t_lens; - double t_batch = omp_get_wtime(); + double t_batch = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs! int nth = getDTthreads(); - int nBatch = nth*2; + int nBatch = 0; // for a single threaded or small unq count use single batch + if (nth == 1 || nx_starts < 4) { + nBatch = 1; + } else { + nBatch = nth * 2; + } int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used size_t batchSize = (nx_starts-1)/nBatch + 1; size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; - //Rprintf("batchSize=%d; lastBatchSize=%d\n", batchSize, lastBatchSize); + //Rprintf("nBatch=%d; batchSize=%d; lastBatchSize=%d\n", nBatch, batchSize, lastBatchSize); int *nxsB = (int *)R_alloc(nBatch, sizeof(int)), **xsB = (int **)R_alloc(nBatch, sizeof(int*)), **xlB = (int **)R_alloc(nBatch, sizeof(int*)); int *nysB = (int *)R_alloc(nBatch, sizeof(int)), **ysB = (int **)R_alloc(nBatch, sizeof(int*)), **ylB = (int **)R_alloc(nBatch, sizeof(int*)); + bool do_not_set_to_false = false; int y_min2, y_max2; + //#pragma omp parallel for schedule(dynamic) num_threads(nth) for (int b=0; b Date: Sun, 7 Jun 2020 01:43:10 +0100 Subject: [PATCH 12/34] rename vars --- src/smjoin.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/smjoin.c b/src/smjoin.c index ca80067bdd..d768659ce5 100644 --- a/src/smjoin.c +++ b/src/smjoin.c @@ -30,16 +30,16 @@ void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395 } // workhorse join that runs in parallel on batches -void smjoin1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, int *x, int *y, int *starts_y, int *lens_y) { +void smjoin1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, int *x, int *y, int *starts, int *lens) { int i=0, j=0, is=0, js=0, x_i, y_j; while (is Date: Mon, 8 Jun 2020 00:40:47 +0100 Subject: [PATCH 13/34] getting closer --- R/bmerge.R | 15 +- R/wrappers.R | 6 +- inst/tests/smerge.Rraw | 129 +++++++++++++++++ inst/tests/smjoin.Rraw | 109 --------------- src/bmerge.c | 6 +- src/data.table.h | 7 +- src/init.c | 14 +- src/{smjoin.c => smerge.c} | 280 +++++++++++++++++++++++++++---------- 8 files changed, 369 insertions(+), 197 deletions(-) create mode 100644 inst/tests/smerge.Rraw delete mode 100644 inst/tests/smjoin.Rraw rename src/{smjoin.c => smerge.c} (51%) diff --git a/R/bmerge.R b/R/bmerge.R index 3d6ab028f3..1cf98bcfa8 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -1,6 +1,19 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose) { + if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer + && isTRUE(getOption("datatable.smerge")) ## enable option + && identical(nomatch, NA_integer_) ## for now only outer join + && identical(mult, "all") ## for now only mult='all' + && identical(ops, 1L) ## equi join + && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join + ) { + if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()} + ans = smerge(x=i[[icols]], y=x[[xcols]]) + if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()} + ans$xo = c(ans$xo) ## drop starts and maxgrpn + return(ans[c("starts","lens","indices","allLen1","allGrp1","xo")]) + } callersi = i i = shallow(i) # Just before the call to bmerge() in [.data.table there is a shallow() copy of i to prevent coercions here @@ -180,9 +193,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (verbose) {last.started.at=proc.time();cat("Starting bmerge ...\n");flush.console()} ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) if (verbose) {cat("bmerge done in",timetaken(last.started.at),"\n"); flush.console()} - # TO DO: xo could be moved inside Cbmerge - ans$xo = xo # for further use by [.data.table return(ans) } diff --git a/R/wrappers.R b/R/wrappers.R index ba43d8cba2..06a25924b7 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -13,8 +13,4 @@ coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) -smjoin = function(x, y, x.idx=NULL, y.idx=NULL) { - ans = .Call(CsmjoinR, x, y, x.idx, y.idx) - ans[["lens"]][is.na(ans[["starts"]])] = 1L - ans -} +smerge = function(x, y, x.idx=NULL, y.idx=NULL) .Call(CsmergeR, x, y, x.idx, y.idx) diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw new file mode 100644 index 0000000000..f0f4530a3f --- /dev/null +++ b/inst/tests/smerge.Rraw @@ -0,0 +1,129 @@ +require(methods) + +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + require(data.table) + test = data.table:::test + smerge = data.table:::smerge + bmerge = data.table:::bmerge + forderv = data.table:::forderv +} + +bm = function(x, y) { + ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE) + ## if undefining SMERGE_ALL + #ans[c("starts","lens")] + ans +} +sm = function(x, y) { + ans = smerge(x, y) + ans = ans[c("starts","lens","indices","allLen1","allGrp1","xo")] ## drop extra stats + ans[["xo"]] = c(ans[["xo"]]) ## removes starts and maxgrpn, and possibly others in future + ## if undefining SMERGE_ALL + #ans[c("starts","lens")] + ans +} + +## x y sorted +x = c(1L,2L,3L,4L) +y = c(2L,3L,5L) +test(1.01, sm(x, y), bm(x, y)) +x = c(1L,2L,3L,3L,4L) +y = c(2L,3L,5L) +test(1.02, sm(x, y), bm(x, y)) +x = c(1L,2L,3L,4L) +y = c(2L,3L,3L,5L) +test(1.03, sm(x, y), bm(x, y)) +x = c(1L,2L,3L,3L,4L) +y = c(2L,3L,3L,5L) +test(1.04, sm(x, y), bm(x, y)) + +## y unsorted +x = c(1L,2L,3L,4L) +y = c(2L,5L,3L) +test(2.01, sm(x, y), bm(x, y)) +x = c(1L,2L,3L,3L,4L) +y = c(3L,2L,5L) +test(2.02, sm(x, y), bm(x, y)) +x = c(1L,2L,3L,4L) +y = c(5L,3L,2L,3L) +test(2.03, sm(x, y), bm(x, y)) +x = c(1L,2L,3L,3L,4L) +y = c(5L,3L,3L,2L) +test(2.04, sm(x, y), bm(x, y)) + +#... + +## xy unsorted +x = c(4L,1L,3L,2L) +y = c(2L,5L,3L) +test(4.01, sm(x, y), bm(x, y)) + +#... + +## xy sorted +ssa = function(unq_n, size, sort=FALSE) { + stopifnot(unq_n <= size) + unq_sub = seq_len(unq_n) + ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE))) + if (unq_n==size) stopifnot(uniqueN(ans)==length(ans)) + if (sort) sort(ans) else ans +} +N = 1e2L +x = ssa(N, N, sort=TRUE) +y = ssa(N, N, sort=TRUE) +test(11.01, sm(x, y), bm(x, y)) +x = ssa(N, N*1.1, sort=TRUE) +y = ssa(N, N, sort=TRUE) +test(11.02, sm(x, y), bm(x, y)) +x = ssa(N, N, sort=TRUE) +y = ssa(N, N*1.1, sort=TRUE) +test(11.03, sm(x, y), bm(x, y)) +x = ssa(N, N*1.1, sort=TRUE) +y = ssa(N, N*1.1, sort=TRUE) +test(11.04, sm(x, y), bm(x, y)) + +## y unsorted +x = ssa(N, N, sort=TRUE) +y = ssa(N, N) +test(12.01, sm(x, y), bm(x, y)) +x = ssa(N, N*1.1, sort=TRUE) +y = ssa(N, N) +test(12.02, sm(x, y), bm(x, y)) +x = ssa(N, N, sort=TRUE) +y = ssa(N, N*1.1) +test(12.03, sm(x, y), bm(x, y)) +x = ssa(N, N*1.1, sort=TRUE) +y = ssa(N, N*1.1) +test(12.04, sm(x, y), bm(x, y)) + +## xy unsorted +x = ssa(N, N) +y = ssa(N, N) +test(14.01, sm(x, y), bm(x, y)) +x = ssa(N, N*1.1) +y = ssa(N, N) +test(14.02, sm(x, y), bm(x, y)) +x = ssa(N, N) +y = ssa(N, N*1.1) +test(14.03, sm(x, y), bm(x, y)) +x = ssa(N, N*1.1) +y = ssa(N, N*1.1) +test(14.04, sm(x, y), bm(x, y)) + +# sparse +x = sample(2e2L, 1e2L) ## may fail when no matches! +y = sample(2e2L, 1e2L) +test(21.01, sm(x, y), bm(x, y)) +x = sample(2e2L, 1e2L, TRUE) +y = sample(2e2L, 1e2L, TRUE) +test(21.02, sm(x, y), bm(x, y)) + +d1 = data.table(x=x, v1=seq_along(x)) +d2 = data.table(y=y, v2=seq_along(y)) +options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #4491 +test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge") +options(datatable.smerge=TRUE) +test(101.02, d1[d2, on="x==y"], expected, output="smerge", notOutput="bmerge") diff --git a/inst/tests/smjoin.Rraw b/inst/tests/smjoin.Rraw deleted file mode 100644 index 48a479463c..0000000000 --- a/inst/tests/smjoin.Rraw +++ /dev/null @@ -1,109 +0,0 @@ -require(methods) - -if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { - if ((tt<-compiler::enableJIT(-1))>0) - cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") -} else { - require(data.table) - test = data.table:::test - smjoin = data.table:::smjoin - bmerge = data.table:::bmerge - forderv = data.table:::forderv -} - -d = function(x, y) bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)[1:2] - -## x y sorted -x = c(1L,2L,3L,4L) -y = c(2L,3L,5L) -test(1.01, smjoin(x, y), d(x, y)) -x = c(1L,2L,3L,3L,4L) -y = c(2L,3L,5L) -test(1.02, smjoin(x, y), d(x, y)) -x = c(1L,2L,3L,4L) -y = c(2L,3L,3L,5L) -test(1.03, smjoin(x, y), d(x, y)) -x = c(1L,2L,3L,3L,4L) -y = c(2L,3L,3L,5L) -test(1.04, smjoin(x, y), d(x, y)) - -## y unsorted -x = c(1L,2L,3L,4L) -y = c(2L,5L,3L) -test(2.01, smjoin(x, y), d(x, y)) -x = c(1L,2L,3L,3L,4L) -y = c(3L,2L,5L) -test(2.02, smjoin(x, y), d(x, y)) -x = c(1L,2L,3L,4L) -y = c(5L,3L,2L,3L) -test(2.03, smjoin(x, y), d(x, y)) -x = c(1L,2L,3L,3L,4L) -y = c(5L,3L,3L,2L) -test(2.04, smjoin(x, y), d(x, y)) - -#... - -## xy unsorted -x = c(4L,1L,3L,2L) -y = c(2L,5L,3L) -test(4.01, smjoin(x, y), d(x, y)) - -#... - -## xy sorted -ssa = function(unq_n, size, sort=FALSE) { - stopifnot(unq_n <= size) - unq_sub = seq_len(unq_n) - ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE))) - if (unq_n==size) stopifnot(uniqueN(ans)==length(ans)) - if (sort) sort(ans) else ans -} -N = 1e2L -x = ssa(N, N, sort=TRUE) -y = ssa(N, N, sort=TRUE) -test(11.01, smjoin(x, y), d(x, y)) -x = ssa(N, N*1.1, sort=TRUE) -y = ssa(N, N, sort=TRUE) -test(11.02, smjoin(x, y), d(x, y)) -x = ssa(N, N, sort=TRUE) -y = ssa(N, N*1.1, sort=TRUE) -test(11.03, smjoin(x, y), d(x, y)) -x = ssa(N, N*1.1, sort=TRUE) -y = ssa(N, N*1.1, sort=TRUE) -test(11.04, smjoin(x, y), d(x, y)) - -## y unsorted -x = ssa(N, N, sort=TRUE) -y = ssa(N, N) -test(12.01, smjoin(x, y), d(x, y)) -x = ssa(N, N*1.1, sort=TRUE) -y = ssa(N, N) -test(12.02, smjoin(x, y), d(x, y)) -x = ssa(N, N, sort=TRUE) -y = ssa(N, N*1.1) -test(12.03, smjoin(x, y), d(x, y)) -x = ssa(N, N*1.1, sort=TRUE) -y = ssa(N, N*1.1) -test(12.04, smjoin(x, y), d(x, y)) - -## xy unsorted -x = ssa(N, N) -y = ssa(N, N) -test(14.01, smjoin(x, y), d(x, y)) -x = ssa(N, N*1.1) -y = ssa(N, N) -test(14.02, smjoin(x, y), d(x, y)) -x = ssa(N, N) -y = ssa(N, N*1.1) -test(14.03, smjoin(x, y), d(x, y)) -x = ssa(N, N*1.1) -y = ssa(N, N*1.1) -test(14.04, smjoin(x, y), d(x, y)) - -# sparse -x = sample(2e2L, 1e2L) -y = sample(2e2L, 1e2L) -test(21.01, smjoin(x, y), d(x, y)) -x = sample(2e2L, 1e2L, TRUE) -y = sample(2e2L, 1e2L, TRUE) -test(21.02, smjoin(x, y), d(x, y)) diff --git a/src/bmerge.c b/src/bmerge.c index 5273ae59b9..cd6d2b307d 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -175,18 +175,20 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE memcpy(INTEGER(retLengthArg), retLength, sizeof(int)*ctr); memcpy(INTEGER(retIndexArg), retIndex, sizeof(int)*ctr); } - SEXP ans = PROTECT(allocVector(VECSXP, 5)); protecti++; - SEXP ansnames = PROTECT(allocVector(STRSXP, 5)); protecti++; + SEXP ans = PROTECT(allocVector(VECSXP, 6)); protecti++; + SEXP ansnames = PROTECT(allocVector(STRSXP, 6)); protecti++; SET_VECTOR_ELT(ans, 0, retFirstArg); SET_VECTOR_ELT(ans, 1, retLengthArg); SET_VECTOR_ELT(ans, 2, retIndexArg); SET_VECTOR_ELT(ans, 3, allLen1Arg); SET_VECTOR_ELT(ans, 4, allGrp1Arg); + SET_VECTOR_ELT(ans, 5, xoArg); SET_STRING_ELT(ansnames, 0, char_starts); // changed from mkChar to char_ to pass the grep in CRAN_Release.cmd SET_STRING_ELT(ansnames, 1, char_lens); SET_STRING_ELT(ansnames, 2, char_indices); SET_STRING_ELT(ansnames, 3, char_allLen1); SET_STRING_ELT(ansnames, 4, char_allGrp1); + SET_STRING_ELT(ansnames, 5, char_xo); setAttrib(ans, R_NamesSymbol, ansnames); if (nqmaxgrp > 1 && mult == ALL) { Free(retFirst); diff --git a/src/data.table.h b/src/data.table.h index 74d101429d..0a5fc02ebd 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -79,6 +79,11 @@ extern SEXP char_lens; extern SEXP char_indices; extern SEXP char_allLen1; extern SEXP char_allGrp1; +extern SEXP char_xo; +extern SEXP char_io; +extern SEXP char_lhsLen1; +extern SEXP char_xyLen1; +extern SEXP char_nMatch; extern SEXP char_factor; extern SEXP char_ordered; extern SEXP char_datatable; @@ -245,4 +250,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); // smjoin.c -SEXP smjoinR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx); +SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx); diff --git a/src/init.c b/src/init.c index 731f0b765b..a47cfd99c7 100644 --- a/src/init.c +++ b/src/init.c @@ -15,6 +15,11 @@ SEXP char_lens; SEXP char_indices; SEXP char_allLen1; SEXP char_allGrp1; +SEXP char_xo; +SEXP char_io; +SEXP char_lhsLen1; +SEXP char_xyLen1; +SEXP char_nMatch; SEXP char_factor; SEXP char_ordered; SEXP char_datatable; @@ -119,7 +124,7 @@ SEXP lock(); SEXP unlock(); SEXP islockedR(); SEXP allNAR(); -SEXP smjoinR(); +SEXP smergeR(); // .Externals SEXP fastmean(); @@ -212,7 +217,7 @@ R_CallMethodDef callMethods[] = { {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1}, {"CtestMsgR", (DL_FUNC) &testMsgR, -1}, {"C_allNAR", (DL_FUNC) &allNAR, -1}, -{"CsmjoinR", (DL_FUNC) &smjoinR, -1}, +{"CsmergeR", (DL_FUNC) &smergeR, -1}, {NULL, NULL, 0} }; @@ -319,6 +324,11 @@ void attribute_visible R_init_datatable(DllInfo *info) char_indices = PRINTNAME(install("indices")); char_allLen1 = PRINTNAME(install("allLen1")); char_allGrp1 = PRINTNAME(install("allGrp1")); + char_xo = PRINTNAME(install("xo")); + char_io = PRINTNAME(install("io")); + char_lhsLen1 = PRINTNAME(install("lhsLen1")); + char_xyLen1 = PRINTNAME(install("xyLen1")); + char_nMatch = PRINTNAME(install("nMatch")); char_factor = PRINTNAME(install("factor")); char_ordered = PRINTNAME(install("ordered")); char_datatable = PRINTNAME(install("data.table")); diff --git a/src/smjoin.c b/src/smerge.c similarity index 51% rename from src/smjoin.c rename to src/smerge.c index d768659ce5..25e2e455b8 100644 --- a/src/smjoin.c +++ b/src/smerge.c @@ -2,12 +2,19 @@ /* * sort-merge join - * + * * join on a single integer column * sort LHS and RHS * split LHS into equal batches, RHS into corresponding batches by matching upper-lower bounds of LHS batches * parallel sort-merge join + * + * for a maximum speed collecting of following statistics can be disabled by undefine SMERGE_ALL + * y_len1/y_lens1/allLen1: signals if there where multiple matches in RHS of join, ['s _x_ table + * x_len1/x_lens1/lhsLen1: signals if multiple matches in LHS of join, ['s _i_ table + * xy_len1/xy_lens1/xlLen1: signals if "many to many" matches between LHS and RHS + * cnt/nmatch/n_match/n_matchr: count of matches, taking multiple matches into account, uint64_t */ +#define SMERGE_ALL // helper to count how many threads were used int unqN(int *x, int nx) { // x have 0:(nx-1) values @@ -30,26 +37,105 @@ void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395 } // workhorse join that runs in parallel on batches -void smjoin1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, int *x, int *y, int *starts, int *lens) { +void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, + int *x, bool unq_x, int *y, bool unq_y, + int *starts, int *lens, + uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) { int i=0, j=0, is=0, js=0, x_i, y_j; - while (is y_j) { + j++; + } + } + } else if (unq_x) { //Rprintf("smerge1: unq_x && !unq_y\n"); + while (i1) + ylen1 = false; + cnt += (uint64_t)yl1; +#endif + } else if (x_i < y_j) { + i++; + } else if (x_i > y_j) { + js++; + } + } + } else if (unq_y) { //Rprintf("smerge1: !unq_x && unq_y\n"); + while (is1) + xlen1 = false; + cnt += (uint64_t)xl1; +#endif + } else if (x_i < y_j) { + is++; + } else if (x_i > y_j) { + j++; + } + } + } else { //Rprintf("smerge1: !unq_x && !unq_y\n"); + while (is1) + xlen1 = false; + if (ylen1 && yl1>1) + ylen1 = false; + if (xylen1 && xl1>1 && yl1>1) + xylen1 = false; + cnt += (uint64_t)xl1 * (uint64_t)yl1; +#endif + } else if (x_i < y_j) { + is++; + } else if (x_i > y_j) { + js++; } - is++; } - else if (x_i < y_j) is++; - else if (x_i > y_j) js++; } + xlens1[0] = xlen1; ylens1[0] = ylen1; xylens1[0] = xylen1; nmatch[0] = cnt; return; } -// those helpers need to be binary search! last matching, and first matching index +// those two helpers function may be removed once we will confirm rollbs works as expected, they find last matching, and first matching index static int max_i_match(int *y, int *y_starts, int ny_starts, int val) { int i; for (i=0; i0) + Rprintf("smergeC: grpLens took %.3fs\n", omp_get_wtime() - t); - double t_batch = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs! + t = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs! whole section of breaking batches should probably be isolated into own function int nth = getDTthreads(); int nBatch = 0; // for a single threaded or small unq count use single batch if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024 @@ -156,7 +245,7 @@ void smjoinC(int *x, int nx, int *x_starts, int nx_starts, //Rprintf("nBatch=%d; batchSize=%d; lastBatchSize=%d\n", nBatch, batchSize, lastBatchSize); int *nxsB = (int *)R_alloc(nBatch, sizeof(int)), **xsB = (int **)R_alloc(nBatch, sizeof(int*)), **xlB = (int **)R_alloc(nBatch, sizeof(int*)); int *nysB = (int *)R_alloc(nBatch, sizeof(int)), **ysB = (int **)R_alloc(nBatch, sizeof(int*)), **ylB = (int **)R_alloc(nBatch, sizeof(int*)); - bool do_not_set_to_false = false; int y_min2, y_max2; + bool do_not_set_to_false = true; int y_min2, y_max2; //#pragma omp parallel for schedule(dynamic) num_threads(nth) for (int b=0; b0) + Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t); - double t_join = omp_get_wtime(); - #pragma omp parallel for schedule(dynamic) num_threads(nth) + t = omp_get_wtime(); + bool xlens1 = true, ylens1 = true, xylens1 = true; + uint64_t nmatch = 0; + #pragma omp parallel for schedule(dynamic) reduction(&&:xlens1,ylens1,xylens1) reduction(+:nmatch) num_threads(nth) for (int b=0; b0) + Rprintf("smergeC: %d calls to smerge1 using %d/%d threads took %.3fs\n", nBatch, unqN(th, nBatch), nth, omp_get_wtime() - t); - if (verbose) - Rprintf("smjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %dth in %d batch\n", t_lens, t_batch, t_join, unqN(th, nBatch), nBatch); - matchn[0] = nx; + return; } void sortInt(int *x, int nx, int *idx, int *ans) { @@ -209,61 +302,81 @@ void sortInt(int *x, int nx, int *idx, int *ans) { return; } -SEXP joinOut(int n, int *starts, int *lens, bool x_ord, SEXP out_starts, SEXP out_lens, int *xoop) { - SEXP ans = PROTECT(allocVector(VECSXP, 2)), ansnames; - setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 2)); +// this produce output consistent to bmerge, note that bmerge i,x is smerge x,y +SEXP joinOut(int n, int *starts, int *lens, bool x_ord, + SEXP out_starts, SEXP out_lens, + SEXP x_idx, SEXP y_idx, + SEXP n_match, SEXP x_lens1, SEXP y_lens1, SEXP xy_lens1) { + SEXP ans = PROTECT(allocVector(VECSXP, 10)), ansnames; + setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 10)); SET_STRING_ELT(ansnames, 0, char_starts); SET_STRING_ELT(ansnames, 1, char_lens); + SET_STRING_ELT(ansnames, 2, char_indices); SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, 0)); // const for equi join + SET_STRING_ELT(ansnames, 3, char_allLen1); SET_VECTOR_ELT(ans, 3, y_lens1); + SET_STRING_ELT(ansnames, 4, char_allGrp1); SET_VECTOR_ELT(ans, 4, ScalarLogical(true)); // const for equi join + SET_STRING_ELT(ansnames, 5, char_xo); SET_VECTOR_ELT(ans, 5, y_idx); + SET_STRING_ELT(ansnames, 6, char_io); SET_VECTOR_ELT(ans, 6, x_idx); + SET_STRING_ELT(ansnames, 7, char_lhsLen1); SET_VECTOR_ELT(ans, 7, x_lens1); + SET_STRING_ELT(ansnames, 8, char_xyLen1); SET_VECTOR_ELT(ans, 8, xy_lens1); + SET_STRING_ELT(ansnames, 9, char_nMatch); SET_VECTOR_ELT(ans, 9, n_match); if (x_ord) { SET_VECTOR_ELT(ans, 0, out_starts); SET_VECTOR_ELT(ans, 1, out_lens); } else { SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, n)); SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, n)); - sortInt(starts, n, xoop, INTEGER(VECTOR_ELT(ans, 0))); - sortInt(lens, n, xoop, INTEGER(VECTOR_ELT(ans, 1))); + SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); // verbose=verbose-2L after #4533 + sortInt(starts, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 0))); + sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1))); + UNPROTECT(1); } UNPROTECT(1); return ans; } -SEXP smjoinR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) { +SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) { - bool verbose = true; + const int verbose = GetVerbose()*2; // remove *2 after #4491 if (!isInteger(x) || !isInteger(y)) - error("must be integer"); - int protecti=0, nx = LENGTH(x), ny = LENGTH(y); + error("'x' and 'y' must be integer"); + int protecti = 0, nx = LENGTH(x), ny = LENGTH(y); + double t = 0; - double t_index = omp_get_wtime(); - if (isNull(x_idx)) { - x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; + t = omp_get_wtime(); + bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx); + if (do_x_idx) { + x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533 } - if (isNull(y_idx)) { - y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; + if (do_y_idx) { + y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533 } - SEXP x_starts = getAttrib(x_idx, sym_starts); - SEXP y_starts = getAttrib(y_idx, sym_starts); + if (!isInteger(x_idx) || !isInteger(y_idx)) + error("'x.idx' and 'y.idx' must be integer"); + SEXP x_starts = getAttrib(x_idx, sym_starts); SEXP y_starts = getAttrib(y_idx, sym_starts); if (isNull(x_starts) || isNull(y_starts)) - error("Indices provided to smjoin must carry 'starts' attribute"); - t_index = omp_get_wtime() - t_index; + error("Indices provided to smerge must carry 'starts' attribute"); + if (verbose>0) + Rprintf("smergeR: index took %.3fs\n", omp_get_wtime() - t); - double t_sort = omp_get_wtime(); + t = omp_get_wtime(); bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx); - SEXP xoo = R_NilValue; // order of an order of x - int *xp, *yp, *xoop=0; + int *xp, *yp; if (!x_ord) { xp = (int *)R_alloc(nx, sizeof(int)); sortInt(INTEGER(x), nx, INTEGER(x_idx), xp); - xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; - xoop = INTEGER(xoo); - } else xp = INTEGER(x); + } else { + xp = INTEGER(x); + } if (!y_ord) { yp = (int *)R_alloc(ny, sizeof(int)); sortInt(INTEGER(y), ny, INTEGER(y_idx), yp); - } else yp = INTEGER(y); - t_sort = omp_get_wtime() - t_sort; + } else { + yp = INTEGER(y); + } + if (verbose>0) + Rprintf("smergeR: sort took %.3fs\n", omp_get_wtime() - t); - double t_alloc = omp_get_wtime(); + t = omp_get_wtime(); SEXP out_starts = R_NilValue, out_lens = R_NilValue; int *starts=0, *lens=0; if (x_ord) { // we dont need to reorder results so can save one allocation @@ -276,25 +389,40 @@ SEXP smjoinR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) { lens = (int *)R_alloc(nx, sizeof(int)); } #pragma omp parallel for schedule(static) num_threads(getDTthreads()) - for (int i=0; i0) + Rprintf("smergeR: alloc of size %d took %.3fs\n", nx, omp_get_wtime() - t); - double t_smjoin = omp_get_wtime(); - int matchn = 0; // not yet used - smjoinC( + t = omp_get_wtime(); + smergeC( xp, nx, INTEGER(x_starts), LENGTH(x_starts), yp, ny, INTEGER(y_starts), LENGTH(y_starts), - &matchn, starts, lens + starts, lens, + &n_match, INTEGER(x_lens1), INTEGER(y_lens1), INTEGER(xy_lens1), + verbose-1 ); - t_smjoin = omp_get_wtime() - t_smjoin; + if (n_match > DBL_MAX) { + REAL(n_matchr)[0] = NA_REAL; + warning("count of matches exceeds DBL_MAX, returning NA in 'nMatch' field"); + } else { + REAL(n_matchr)[0] = (double)n_match; + } + if (verbose>0) + Rprintf("smergeR: smergeC of %d x %d => %"PRIu64" took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t); - double t_ans = omp_get_wtime(); - SEXP ans = joinOut(matchn, starts, lens, x_ord, out_starts, out_lens, xoop); // usually also sorts inside - t_ans = omp_get_wtime() - t_ans; + t = omp_get_wtime(); + SEXP ans = joinOut(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1); + if (verbose>0) + Rprintf("smergeR: joinOut (unsort) took %.3fs\n", omp_get_wtime() - t); - if (verbose) - Rprintf("smjoinR: index %.3fs; sort %.3fs; alloc %.3fs; smjoinC %.3fs; ans %.3fs\n", t_index, t_sort, t_alloc, t_smjoin, t_ans); UNPROTECT(protecti); return ans; } From b1f34634b12a51a8b8f915238f14ea9dc84877d3 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 8 Jun 2020 00:49:13 +0100 Subject: [PATCH 14/34] switch magic option --- src/smerge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/smerge.c b/src/smerge.c index 25e2e455b8..3e0213707a 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -245,7 +245,7 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts, //Rprintf("nBatch=%d; batchSize=%d; lastBatchSize=%d\n", nBatch, batchSize, lastBatchSize); int *nxsB = (int *)R_alloc(nBatch, sizeof(int)), **xsB = (int **)R_alloc(nBatch, sizeof(int*)), **xlB = (int **)R_alloc(nBatch, sizeof(int*)); int *nysB = (int *)R_alloc(nBatch, sizeof(int)), **ysB = (int **)R_alloc(nBatch, sizeof(int*)), **ylB = (int **)R_alloc(nBatch, sizeof(int*)); - bool do_not_set_to_false = true; int y_min2, y_max2; + bool do_not_set_to_false = false; int y_min2, y_max2; //#pragma omp parallel for schedule(dynamic) num_threads(nth) for (int b=0; b Date: Mon, 8 Jun 2020 21:33:07 +0100 Subject: [PATCH 15/34] more robust, batching into new function and struct --- R/bmerge.R | 10 +- R/wrappers.R | 2 +- inst/tests/smerge.Rraw | 22 ++-- src/data.table.h | 2 +- src/smerge.c | 271 +++++++++++++++++++++++------------------ 5 files changed, 178 insertions(+), 129 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index 1cf98bcfa8..f9ab80cd26 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -8,11 +8,15 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos && identical(ops, 1L) ## equi join && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join ) { + getIdxGrp = function(x, cols) { ## get index only if retGrp=T + if (is.numeric(cols)) cols = names(x)[cols] + idx = attr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), exact=TRUE) + if (!is.null(attr(idx, "starts", exact=TRUE))) idx + } if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()} - ans = smerge(x=i[[icols]], y=x[[xcols]]) + ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE) if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()} - ans$xo = c(ans$xo) ## drop starts and maxgrpn - return(ans[c("starts","lens","indices","allLen1","allGrp1","xo")]) + return(ans) } callersi = i i = shallow(i) diff --git a/R/wrappers.R b/R/wrappers.R index 06a25924b7..83e3d73417 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -13,4 +13,4 @@ coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) -smerge = function(x, y, x.idx=NULL, y.idx=NULL) .Call(CsmergeR, x, y, x.idx, y.idx) +smerge = function(x, y, x.idx=NULL, y.idx=NULL, out.bmerge=FALSE) .Call(CsmergeR, x, y, x.idx, y.idx, out.bmerge) diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw index f0f4530a3f..3d8cb27b79 100644 --- a/inst/tests/smerge.Rraw +++ b/inst/tests/smerge.Rraw @@ -12,17 +12,15 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { } bm = function(x, y) { + stopifnot(is.integer(x), is.integer(y)) ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE) - ## if undefining SMERGE_ALL - #ans[c("starts","lens")] + ## if undefining SMERGE_STATS then we have to ignore allLen1 as well ans } sm = function(x, y) { - ans = smerge(x, y) - ans = ans[c("starts","lens","indices","allLen1","allGrp1","xo")] ## drop extra stats - ans[["xo"]] = c(ans[["xo"]]) ## removes starts and maxgrpn, and possibly others in future - ## if undefining SMERGE_ALL - #ans[c("starts","lens")] + stopifnot(is.integer(x), is.integer(y)) + ans = smerge(x, y, out.bmerge=TRUE) + ## if undefining SMERGE_STATS then we have to ignore allLen1 as well ans } @@ -127,3 +125,13 @@ options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #449 test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge") options(datatable.smerge=TRUE) test(101.02, d1[d2, on="x==y"], expected, output="smerge", notOutput="bmerge") + +setindexv2 = function(x, cols) { ## pretend we are after #4386 + stopifnot(is.data.table(x), is.character(cols)) + if (is.null(attr(x, "index", TRUE))) setattr(x, "index", integer()) + setattr(attr(x, "index", TRUE), paste0("__", cols, collapse="__"), forderv(x, cols, retGrp=TRUE)) + invisible(x) +} +setindexv2(d1, "x") +setindexv2(d2, "y") +test(101.03, d1[d2, on="x==y"], expected, output="smerge", notOutput="bmerge") diff --git a/src/data.table.h b/src/data.table.h index 0a5fc02ebd..9ceb722cea 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -250,4 +250,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); // smjoin.c -SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx); +SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge); diff --git a/src/smerge.c b/src/smerge.c index 3e0213707a..6dac785b2c 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -8,50 +8,42 @@ * split LHS into equal batches, RHS into corresponding batches by matching upper-lower bounds of LHS batches * parallel sort-merge join * - * for a maximum speed collecting of following statistics can be disabled by undefine SMERGE_ALL + * for a maximum speed collecting of following statistics can be disabled by undefine SMERGE_STATS * y_len1/y_lens1/allLen1: signals if there where multiple matches in RHS of join, ['s _x_ table * x_len1/x_lens1/lhsLen1: signals if multiple matches in LHS of join, ['s _i_ table * xy_len1/xy_lens1/xlLen1: signals if "many to many" matches between LHS and RHS * cnt/nmatch/n_match/n_matchr: count of matches, taking multiple matches into account, uint64_t */ -#define SMERGE_ALL +#define SMERGE_STATS -// helper to count how many threads were used -int unqN(int *x, int nx) { // x have 0:(nx-1) values - int ans = 0; - uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t)); - memset(seen, 0, nx*sizeof(uint8_t)); - for (int i=0; ixs, *xl=batch->xl, nxs=batch->nxs; + int *ys=batch->ys, *yl=batch->yl, nys=batch->nys; int i=0, j=0, is=0, js=0, x_i, y_j; uint64_t cnt=0; bool xlen1 = true, ylen1 = true, xylen1 = true; - if (unq_x && unq_y) { //Rprintf("smerge1: unq_x && unq_y\n"); + if (unq_x && unq_y) { //Rprintf("smerge: unq_x && unq_y\n"); while (i1) ylen1 = false; cnt += (uint64_t)yl1; @@ -80,7 +72,7 @@ void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, js++; } } - } else if (unq_y) { //Rprintf("smerge1: !unq_x && unq_y\n"); + } else if (unq_y) { //Rprintf("smerge: !unq_x && unq_y\n"); while (is1) xlen1 = false; cnt += (uint64_t)xl1; @@ -103,7 +95,7 @@ void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, j++; } } - } else { //Rprintf("smerge1: !unq_x && !unq_y\n"); + } else { //Rprintf("smerge: !unq_x && !unq_y\n"); while (is1) xlen1 = false; if (ylen1 && yl1>1) @@ -135,11 +127,10 @@ void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, return; } -// those two helpers function may be removed once we will confirm rollbs works as expected, they find last matching, and first matching index +// those two helpers needs to be removed once we will confirm rollbs works as expected, they find "first/last within" index static int max_i_match(int *y, int *y_starts, int ny_starts, int val) { int i; - for (i=0; i= val: %d >= %d\n", i, y[y_starts[i]-1], val); + for (i=0; i= val: %d >= %d\n", i, y[y_starts[i]-1], val); if (y[y_starts[i]-1]==val) return(i); else if (y[y_starts[i]-1]>val) return(i-1); } @@ -147,8 +138,7 @@ static int max_i_match(int *y, int *y_starts, int ny_starts, int val) { } static int min_i_match(int *y, int *y_starts, int ny_starts, int val) { int i; - for (i=ny_starts-1; i>=0; --i) { - //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val); + for (i=ny_starts-1; i>=0; --i) { //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val); if (y[y_starts[i]-1]==val) return(i); else if (y[y_starts[i]-1]0) Rprintf("rollbs: side=%d; val=%d\n", side, val); if (side < 0) { // lower bound early stopping if (x[ix[0]-1] == val) { - if (verbose) Rprintf("rollbs: min elements %d match\n", val); + if (verbose>0) Rprintf("rollbs: min elements %d match\n", val); return 0; } if (x[ix[nix-1]-1] < val) { - if (verbose) Rprintf("rollbs: max element %d is still smaller than %d\n", x[ix[nix-1]-1], val); + if (verbose>0) Rprintf("rollbs: max element %d is still smaller than %d\n", x[ix[nix-1]-1], val); return -1; } if (x[ix[0]-1] > val) { - if (verbose) Rprintf("rollbs: min element %d is bigger than %d\n", x[ix[0]-1], val); + if (verbose>0) Rprintf("rollbs: min element %d is bigger than %d\n", x[ix[0]-1], val); return 0; } } else if (side > 0) { // upper bound early stopping if (x[ix[nix-1]-1] == val) { - if (verbose) Rprintf("rollbs: max elements %d match\n", val); + if (verbose>0) Rprintf("rollbs: max elements %d match\n", val); return nix-1; } if (x[ix[0]-1] > val) { - if (verbose) Rprintf("rollbs: min element %d is still bigger than %d\n", x[ix[0]-1], val); + if (verbose>0) Rprintf("rollbs: min element %d is still bigger than %d\n", x[ix[0]-1], val); return -1; } if (x[ix[nix-1]-1] < val) { - if (verbose) Rprintf("rollbs: max element %d is smaller than %d\n", x[ix[nix-1]-1], val); + if (verbose>0) Rprintf("rollbs: max element %d is smaller than %d\n", x[ix[nix-1]-1], val); return nix-1; } } @@ -193,7 +183,7 @@ static int rollbs(int *x, int *ix, int nix, int val, int side) { while (lower<=upper) { i = lower + (upper-lower)/2; int thisx = x[ix[i]-1]; - //if (verbose) Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val); + //if (verbose>0) Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val); if (thisx==val) return(i); else if (thisx < val) @@ -201,28 +191,89 @@ static int rollbs(int *x, int *ix, int nix, int val, int side) { else if (thisx > val) upper = i-1; } - if (verbose) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d\n", i, x[ix[i]-1], lower, upper, side); + if (verbose>0) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d\n", i, x[ix[i]-1], lower, upper, side); if (side < 0) // anyone to stress test this logic? return lower; else return upper; } +// cuts x_starts into equal batches and binary search corresponding y_starts ranges +struct smBatch *batching(int nBatch, + int *x, int nx, int *x_starts, int nx_starts, int *x_lens, + int *y, int ny, int *y_starts, int ny_starts, int *y_lens, + int verbose) { + if (verbose>0) + Rprintf("batching: %d batches of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nBatch); + struct smBatch *B = (struct smBatch *)R_alloc(nBatch, sizeof(struct smBatch)); + size_t batchSize = (nx_starts-1)/nBatch + 1; + size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; + bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV + for (int b=0; b0) { // print smBatch objects, 1-indexed! x y sorted! for debugging and verbose + for (int b=0; b= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max); + } + } + return B; +} + +// helper to count how many threads were used +int unqNth(int *x, int nx) { // x have 0:(nx-1) values + int ans = 0; + uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t)); + memset(seen, 0, nx*sizeof(uint8_t)); + for (int i=0; i0) Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t); @@ -279,18 +300,17 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts, uint64_t nmatch = 0; #pragma omp parallel for schedule(dynamic) reduction(&&:xlens1,ylens1,xylens1) reduction(+:nmatch) num_threads(nth) for (int b=0; b0) - Rprintf("smergeC: %d calls to smerge1 using %d/%d threads took %.3fs\n", nBatch, unqN(th, nBatch), nth, omp_get_wtime() - t); + Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); return; } @@ -302,23 +322,17 @@ void sortInt(int *x, int nx, int *idx, int *ans) { return; } -// this produce output consistent to bmerge, note that bmerge i,x is smerge x,y -SEXP joinOut(int n, int *starts, int *lens, bool x_ord, - SEXP out_starts, SEXP out_lens, - SEXP x_idx, SEXP y_idx, - SEXP n_match, SEXP x_lens1, SEXP y_lens1, SEXP xy_lens1) { - SEXP ans = PROTECT(allocVector(VECSXP, 10)), ansnames; - setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 10)); +// for bmerge=true this produce bmerge consistent output, note that bmerge i,x is smerge x,y +SEXP outSmergeR(int n, int *starts, int *lens, bool x_ord, + SEXP out_starts, SEXP out_lens, // used only when x was sorted, saves one allocation + SEXP x_idx, SEXP y_idx, + SEXP n_match, SEXP x_lens1, SEXP y_lens1, SEXP xy_lens1, + bool bmerge) { + int out_len = bmerge ? 6 : 10; + SEXP ans = PROTECT(allocVector(VECSXP, out_len)), ansnames; + setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, out_len)); SET_STRING_ELT(ansnames, 0, char_starts); SET_STRING_ELT(ansnames, 1, char_lens); - SET_STRING_ELT(ansnames, 2, char_indices); SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, 0)); // const for equi join - SET_STRING_ELT(ansnames, 3, char_allLen1); SET_VECTOR_ELT(ans, 3, y_lens1); - SET_STRING_ELT(ansnames, 4, char_allGrp1); SET_VECTOR_ELT(ans, 4, ScalarLogical(true)); // const for equi join - SET_STRING_ELT(ansnames, 5, char_xo); SET_VECTOR_ELT(ans, 5, y_idx); - SET_STRING_ELT(ansnames, 6, char_io); SET_VECTOR_ELT(ans, 6, x_idx); - SET_STRING_ELT(ansnames, 7, char_lhsLen1); SET_VECTOR_ELT(ans, 7, x_lens1); - SET_STRING_ELT(ansnames, 8, char_xyLen1); SET_VECTOR_ELT(ans, 8, xy_lens1); - SET_STRING_ELT(ansnames, 9, char_nMatch); SET_VECTOR_ELT(ans, 9, n_match); if (x_ord) { SET_VECTOR_ELT(ans, 0, out_starts); SET_VECTOR_ELT(ans, 1, out_lens); @@ -330,19 +344,40 @@ SEXP joinOut(int n, int *starts, int *lens, bool x_ord, sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1))); UNPROTECT(1); } + SET_STRING_ELT(ansnames, 2, char_indices); SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, 0)); // const for equi join + SET_STRING_ELT(ansnames, 3, char_allLen1); SET_VECTOR_ELT(ans, 3, y_lens1); + SET_STRING_ELT(ansnames, 4, char_allGrp1); SET_VECTOR_ELT(ans, 4, ScalarLogical(true)); // const for equi join + if (bmerge) { + y_idx = shallow_duplicate(y_idx); // possibly improve after #4467 + setAttrib(y_idx, sym_starts, R_NilValue); + setAttrib(y_idx, sym_maxgrpn, R_NilValue); + //setAttrib(y_idx, sym_anyna, R_NilValue); // enable after #4386 + //setAttrib(y_idx, sym_anyinfnan, R_NilValue); + //setAttrib(y_idx, sym_anynotascii, R_NilValue); + //setAttrib(y_idx, sym_anynotutf8, R_NilValue); + } + SET_STRING_ELT(ansnames, 5, char_xo); SET_VECTOR_ELT(ans, 5, y_idx); + if (!bmerge) { + SET_STRING_ELT(ansnames, 6, char_io); SET_VECTOR_ELT(ans, 6, x_idx); + SET_STRING_ELT(ansnames, 7, char_lhsLen1); SET_VECTOR_ELT(ans, 7, x_lens1); + SET_STRING_ELT(ansnames, 8, char_xyLen1); SET_VECTOR_ELT(ans, 8, xy_lens1); + SET_STRING_ELT(ansnames, 9, char_nMatch); SET_VECTOR_ELT(ans, 9, n_match); + } UNPROTECT(1); return ans; } -SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) { +SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { - const int verbose = GetVerbose()*2; // remove *2 after #4491 + double t_total = omp_get_wtime(); + const int verbose = GetVerbose()*3; // remove *3 after #4491 if (!isInteger(x) || !isInteger(y)) error("'x' and 'y' must be integer"); + if (!IS_TRUE_OR_FALSE(out_bmerge)) + error("'out.bmerge' must be TRUE or FALSE"); int protecti = 0, nx = LENGTH(x), ny = LENGTH(y); - double t = 0; - t = omp_get_wtime(); + double t = omp_get_wtime(); bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx); if (do_x_idx) { x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533 @@ -409,19 +444,21 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) { &n_match, INTEGER(x_lens1), INTEGER(y_lens1), INTEGER(xy_lens1), verbose-1 ); - if (n_match > DBL_MAX) { + if (n_match > (uint64_t)DBL_MAX) { REAL(n_matchr)[0] = NA_REAL; warning("count of matches exceeds DBL_MAX, returning NA in 'nMatch' field"); } else { REAL(n_matchr)[0] = (double)n_match; } if (verbose>0) - Rprintf("smergeR: smergeC of %d x %d => %"PRIu64" took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t); + Rprintf("smergeR: smergeC of %d x %d = %"PRIu64"; took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t); t = omp_get_wtime(); - SEXP ans = joinOut(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1); + SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, (bool)LOGICAL(out_bmerge)[0]); + if (verbose>0) + Rprintf("smergeR: outSmerge (unsort) took %.3fs\n", omp_get_wtime() - t); if (verbose>0) - Rprintf("smergeR: joinOut (unsort) took %.3fs\n", omp_get_wtime() - t); + Rprintf("smergeR: all took %.3fs\n", omp_get_wtime() - t_total); UNPROTECT(protecti); return ans; From 9b0d195162ae4a6440ef329adcf70055bfeaafda Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 8 Jun 2020 21:42:25 +0100 Subject: [PATCH 16/34] static funs and comments --- src/smerge.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/smerge.c b/src/smerge.c index 6dac785b2c..8b5b2658c3 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -27,11 +27,11 @@ struct smBatch { }; // workhorse join that runs in parallel on batches -void smerge(struct smBatch *batch, - int *x, bool unq_x, int *y, bool unq_y, - int *starts, int *lens, - uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) { - int *xs=batch->xs, *xl=batch->xl, nxs=batch->nxs; +static void smerge(struct smBatch *batch, + int *x, bool unq_x, int *y, bool unq_y, + int *starts, int *lens, + uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) { + int *xs=batch->xs, *xl=batch->xl, nxs=batch->nxs; // unpack smBatch struct int *ys=batch->ys, *yl=batch->yl, nys=batch->nys; int i=0, j=0, is=0, js=0, x_i, y_j; uint64_t cnt=0; @@ -199,16 +199,16 @@ static int rollbs(int *x, int *ix, int nix, int val, int side) { } // cuts x_starts into equal batches and binary search corresponding y_starts ranges -struct smBatch *batching(int nBatch, - int *x, int nx, int *x_starts, int nx_starts, int *x_lens, - int *y, int ny, int *y_starts, int ny_starts, int *y_lens, - int verbose) { +static struct smBatch *batching(int nBatch, + int *x, int nx, int *x_starts, int nx_starts, int *x_lens, + int *y, int ny, int *y_starts, int ny_starts, int *y_lens, + int verbose) { if (verbose>0) Rprintf("batching: %d batches of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nBatch); struct smBatch *B = (struct smBatch *)R_alloc(nBatch, sizeof(struct smBatch)); size_t batchSize = (nx_starts-1)/nBatch + 1; size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; - bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV + bool extra_validate = false; // validates against very slow m[in|ax]_i_match // #DEV for (int b=0; b Date: Mon, 8 Jun 2020 22:55:31 +0100 Subject: [PATCH 17/34] better verbose --- R/bmerge.R | 12 +++++++++--- inst/tests/smerge.Rraw | 8 ++++++-- src/smerge.c | 26 +++++++++++++++++++------- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index f9ab80cd26..3ec080fb18 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -1,6 +1,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose) { + smergeOpt = FALSE if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer && isTRUE(getOption("datatable.smerge")) ## enable option && identical(nomatch, NA_integer_) ## for now only outer join @@ -9,14 +10,15 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join ) { getIdxGrp = function(x, cols) { ## get index only if retGrp=T + if (!isTRUE(getOption("datatable.use.index"))) return() if (is.numeric(cols)) cols = names(x)[cols] idx = attr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), exact=TRUE) if (!is.null(attr(idx, "starts", exact=TRUE))) idx } if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()} - ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE) + smans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE) if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()} - return(ans) + smergeOpt = TRUE } callersi = i i = shallow(i) @@ -147,7 +149,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } else { xo = NULL if (isTRUE(getOption("datatable.use.index"))) { - xo = getindex(x, names(x)[xcols]) + xo = c(getindex(x, names(x)[xcols])) ## c takes care of future #4386 if (verbose && !is.null(xo)) cat("on= matches existing index, using index\n") } if (is.null(xo)) { @@ -198,6 +200,10 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) if (verbose) {cat("bmerge done in",timetaken(last.started.at),"\n"); flush.console()} + if (smergeOpt && !identical(ans, smans)) { + (if (!interactive()) stop else message)("bmerge opt to smerge produced different results") + browser() + } return(ans) } diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw index 3d8cb27b79..cb7bf40661 100644 --- a/inst/tests/smerge.Rraw +++ b/inst/tests/smerge.Rraw @@ -24,6 +24,8 @@ sm = function(x, y) { ans } +#options(datatable.verbose=TRUE) + ## x y sorted x = c(1L,2L,3L,4L) y = c(2L,3L,5L) @@ -124,7 +126,7 @@ d2 = data.table(y=y, v2=seq_along(y)) options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #4491 test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge") options(datatable.smerge=TRUE) -test(101.02, d1[d2, on="x==y"], expected, output="smerge", notOutput="bmerge") +test(101.02, d1[d2, on="x==y"], expected, output="smerge") ## for now extra computation of bmerge is still done so no: #, notOutput="bmerge") setindexv2 = function(x, cols) { ## pretend we are after #4386 stopifnot(is.data.table(x), is.character(cols)) @@ -134,4 +136,6 @@ setindexv2 = function(x, cols) { ## pretend we are after #4386 } setindexv2(d1, "x") setindexv2(d2, "y") -test(101.03, d1[d2, on="x==y"], expected, output="smerge", notOutput="bmerge") +test(101.03, d1[d2, on="x==y"], expected, output="smerge.*already indexed") +options(datatable.use.index=FALSE) +test(101.04, d1[d2, on="x==y"], expected, output="smerge", notOutput="already indexed") diff --git a/src/smerge.c b/src/smerge.c index 8b5b2658c3..5035fe9486 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -208,7 +208,7 @@ static struct smBatch *batching(int nBatch, struct smBatch *B = (struct smBatch *)R_alloc(nBatch, sizeof(struct smBatch)); size_t batchSize = (nx_starts-1)/nBatch + 1; size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; - bool extra_validate = false; // validates against very slow m[in|ax]_i_match // #DEV + bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV for (int b=0; b0) - Rprintf("smergeC: grpLens took %.3fs\n", omp_get_wtime() - t); + Rprintf("smergeC: grpLens %s took %.3fs\n", verboseDone(!unq_x, !unq_y, "(already unique)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t); t = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs! whole section of breaking batches should probably be isolated into own function int nth = getDTthreads(); int nBatch = 0; // for a single threaded or small unq count use single batch - if (nth == 1 || nx_starts < 1024) { // this is 4 only for testing, will be probably 1024 // #DEV + if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024 // #DEV nBatch = 1; } else { nBatch = nth * 2; @@ -310,7 +322,7 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts, } x_lens1[0] = (int)xlens1; y_lens1[0] = (int)ylens1; xy_lens1[0] = (int)xylens1; n_match[0] = nmatch; if (verbose>0) - Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); + Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); // all threads may not always be used bc schedule(dynamic) return; } @@ -392,7 +404,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { if (isNull(x_starts) || isNull(y_starts)) error("Indices provided to smerge must carry 'starts' attribute"); if (verbose>0) - Rprintf("smergeR: index took %.3fs\n", omp_get_wtime() - t); + Rprintf("smergeR: index %s took %.3fs\n", verboseDone(do_x_idx, do_y_idx, "(already indexed)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t); t = omp_get_wtime(); bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx); @@ -410,7 +422,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { yp = INTEGER(y); } if (verbose>0) - Rprintf("smergeR: sort took %.3fs\n", omp_get_wtime() - t); + Rprintf("smergeR: sort %s took %.3fs\n", verboseDone(!x_ord, !y_ord, "(already sorted)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t); t = omp_get_wtime(); SEXP out_starts = R_NilValue, out_lens = R_NilValue; @@ -457,7 +469,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { t = omp_get_wtime(); SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, (bool)LOGICAL(out_bmerge)[0]); if (verbose>0) - Rprintf("smergeR: outSmerge (unsort) took %.3fs\n", omp_get_wtime() - t); + Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(unsort)", omp_get_wtime() - t); if (verbose>0) Rprintf("smergeR: all took %.3fs\n", omp_get_wtime() - t_total); From 19d1df6e0a378f0abbcacdd75891c9805cddda63 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 9 Jun 2020 02:39:48 +0100 Subject: [PATCH 18/34] simpler batching, more strict type defs --- inst/tests/smerge.Rraw | 1 + src/smerge.c | 153 +++++++++++++++++++++-------------------- 2 files changed, 79 insertions(+), 75 deletions(-) diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw index cb7bf40661..3502efa392 100644 --- a/inst/tests/smerge.Rraw +++ b/inst/tests/smerge.Rraw @@ -71,6 +71,7 @@ ssa = function(unq_n, size, sort=FALSE) { if (unq_n==size) stopifnot(uniqueN(ans)==length(ans)) if (sort) sort(ans) else ans } +set.seed(108) N = 1e2L x = ssa(N, N, sort=TRUE) y = ssa(N, N, sort=TRUE) diff --git a/src/smerge.c b/src/smerge.c index 5035fe9486..8ca13dd616 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -16,29 +16,21 @@ */ #define SMERGE_STATS -// shifted pointers so each batch knows where to start and end -struct smBatch { - int *xs; // x's starts: indices on which unique values can be found - int *xl; // x's lens: counts of each unique value - int nxs; // x's starts length - int *ys; // same for y - int *yl; - int nys; -}; - // workhorse join that runs in parallel on batches -static void smerge(struct smBatch *batch, - int *x, bool unq_x, int *y, bool unq_y, - int *starts, int *lens, +static void smerge(const int bx_off, const int bnx, + const int by_off, const int bny, + const int *restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x, + const int *restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y, + int *restrict starts, int *restrict lens, uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) { - int *xs=batch->xs, *xl=batch->xl, nxs=batch->nxs; // unpack smBatch struct - int *ys=batch->ys, *yl=batch->yl, nys=batch->nys; - int i=0, j=0, is=0, js=0, x_i, y_j; - uint64_t cnt=0; + uint64_t cnt = 0; bool xlen1 = true, ylen1 = true, xylen1 = true; - if (unq_x && unq_y) { //Rprintf("smerge: unq_x && unq_y\n"); - while (i= val: %d >= %d\n", i, y[y_starts[i]-1], val); if (y[y_starts[i]-1]==val) return(i); @@ -136,7 +137,7 @@ static int max_i_match(int *y, int *y_starts, int ny_starts, int val) { } return(i-1); } -static int min_i_match(int *y, int *y_starts, int ny_starts, int val) { +static int min_i_match(const int *restrict y, const int *restrict y_starts, const int ny_starts, const int val) { int i; for (i=ny_starts-1; i>=0; --i) { //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val); if (y[y_starts[i]-1]==val) return(i); @@ -149,7 +150,7 @@ static int min_i_match(int *y, int *y_starts, int ny_starts, int val) { * used to find 'y' lower and upper bounds for each batch * side -1: lower bound; side 1: upper bound */ -static int rollbs(int *x, int *ix, int nix, int val, int side) { +static int rollbs(const int *restrict x, const int *restrict ix, const int nix, const int val, const int side) { int verbose = 0; // devel debug only if (verbose>0) Rprintf("rollbs: side=%d; val=%d\n", side, val); if (side < 0) { // lower bound early stopping @@ -199,24 +200,21 @@ static int rollbs(int *x, int *ix, int nix, int val, int side) { } // cuts x_starts into equal batches and binary search corresponding y_starts ranges -static struct smBatch *batching(int nBatch, - int *x, int nx, int *x_starts, int nx_starts, int *x_lens, - int *y, int ny, int *y_starts, int ny_starts, int *y_lens, - int verbose) { +static void batching(const int nBatch, + const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts, + const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts, + int *restrict Bx_off, int *restrict Bnx, int *restrict By_off, int *restrict Bny, + const int verbose) { if (verbose>0) Rprintf("batching: %d batches of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nBatch); - struct smBatch *B = (struct smBatch *)R_alloc(nBatch, sizeof(struct smBatch)); size_t batchSize = (nx_starts-1)/nBatch + 1; size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV for (int b=0; b0) { // print smBatch objects, 1-indexed! x y sorted! for debugging and verbose + if (verbose>0) { // print batches, 1-indexed! x y sorted! for debugging and verbose for (int b=0; b= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max); } } - return B; + return; } // helper to count how many threads were used -static int unqNth(int *x, int nx) { // x have 0:(nx-1) values +static int unqNth(const int *x, const int nx) { // x have 0:(nx-1) values int ans = 0; uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t)); memset(seen, 0, nx*sizeof(uint8_t)); @@ -253,7 +252,7 @@ static int unqNth(int *x, int nx) { // x have 0:(nx-1) values } // helper for verbose messages to print what was actually computed -static const char *verboseDone(bool x, bool y, const char *not_xy, const char *not_x, const char *not_y, const char *xy) { +static const char *verboseDone(const bool x, const bool y, const char *not_xy, const char *not_x, const char *not_y, const char *xy) { if (!x && !y) return(not_xy); else if (!x && y) @@ -265,8 +264,8 @@ static const char *verboseDone(bool x, bool y, const char *not_xy, const char *n } // count of duplicated entries -void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395 -#pragma omp parallel for schedule(static) num_threads(getDTthreads()) +void grpLens(const int *restrict starts, const int n_starts, const int n, int *restrict lens) { // #4395 + #pragma omp parallel for schedule(static) num_threads(getDTthreads()) for (int i=0; i0) Rprintf("smergeC: grpLens %s took %.3fs\n", verboseDone(!unq_x, !unq_y, "(already unique)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t); - t = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs! whole section of breaking batches should probably be isolated into own function + t = omp_get_wtime(); int nth = getDTthreads(); int nBatch = 0; // for a single threaded or small unq count use single batch if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024 // #DEV @@ -303,7 +302,9 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts, nBatch = nth * 2; } int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used - struct smBatch *B = batching(nBatch, x, nx, x_starts, nx_starts, x_lens, y, ny, y_starts, ny_starts, y_lens, verbose-1); + int *restrict Bx_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bnx = (int *)R_alloc(nBatch, sizeof(int)); + int *restrict By_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bny = (int *)R_alloc(nBatch, sizeof(int)); + batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, verbose-1); if (verbose>0) Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t); @@ -313,8 +314,10 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts, #pragma omp parallel for schedule(dynamic) reduction(&&:xlens1,ylens1,xylens1) reduction(+:nmatch) num_threads(nth) for (int b=0; b Date: Tue, 9 Jun 2020 22:39:03 +0100 Subject: [PATCH 19/34] last batch not balanced anymore --- inst/tests/smerge.Rraw | 173 ++++++++++++++++++++++++++++++---------- src/smerge.c | 177 ++++++++++++++++++++++++----------------- 2 files changed, 237 insertions(+), 113 deletions(-) diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw index 3502efa392..99eb2cfef9 100644 --- a/inst/tests/smerge.Rraw +++ b/inst/tests/smerge.Rraw @@ -19,95 +19,186 @@ bm = function(x, y) { } sm = function(x, y) { stopifnot(is.integer(x), is.integer(y)) + #saveRDS(x, "~/git/data.table/x.RDS"); saveRDS(y, "~/git/data.table/y.RDS") ans = smerge(x, y, out.bmerge=TRUE) ## if undefining SMERGE_STATS then we have to ignore allLen1 as well ans } +#setDTthreads(2) #options(datatable.verbose=TRUE) +# unique and sort ## x y sorted -x = c(1L,2L,3L,4L) -y = c(2L,3L,5L) +x = c(1L,2L,3L,4L) # unq +y = c(2L,3L,5L) # unq test(1.01, sm(x, y), bm(x, y)) x = c(1L,2L,3L,3L,4L) -y = c(2L,3L,5L) +y = c(2L,3L,5L) # unq test(1.02, sm(x, y), bm(x, y)) x = c(1L,2L,3L,4L) -y = c(2L,3L,3L,5L) +y = c(2L,3L,3L,5L) # unq test(1.03, sm(x, y), bm(x, y)) x = c(1L,2L,3L,3L,4L) y = c(2L,3L,3L,5L) test(1.04, sm(x, y), bm(x, y)) - ## y unsorted -x = c(1L,2L,3L,4L) -y = c(2L,5L,3L) +x = c(1L,2L,3L,4L) # unq +y = c(2L,5L,3L) # unq test(2.01, sm(x, y), bm(x, y)) x = c(1L,2L,3L,3L,4L) -y = c(3L,2L,5L) +y = c(3L,2L,5L) # unq test(2.02, sm(x, y), bm(x, y)) -x = c(1L,2L,3L,4L) +x = c(1L,2L,3L,4L) # unq y = c(5L,3L,2L,3L) test(2.03, sm(x, y), bm(x, y)) x = c(1L,2L,3L,3L,4L) y = c(5L,3L,3L,2L) test(2.04, sm(x, y), bm(x, y)) - -#... - +## x unsorted +x = c(2L,3L,1L,4L) # unq +y = c(2L,3L,5L) # unq +test(3.01, sm(x, y), bm(x, y)) +x = c(1L,3L,2L,4L,3L) +y = c(2L,3L,5L) # unq +test(3.02, sm(x, y), bm(x, y)) +x = c(4L,2L,3L,1L) +y = c(2L,3L,3L,5L) # unq +test(3.03, sm(x, y), bm(x, y)) +x = c(1L,2L,4L,3L,3L) +y = c(2L,3L,3L,5L) +test(3.04, sm(x, y), bm(x, y)) ## xy unsorted -x = c(4L,1L,3L,2L) +x = c(4L,1L,3L,2L) # unq y = c(2L,5L,3L) test(4.01, sm(x, y), bm(x, y)) +x = c(1L,3L,2L,4L,3L) +y = c(5L,3L,2L) # unq +test(4.02, sm(x, y), bm(x, y)) +x = c(4L,2L,3L,1L) +y = c(3L,3L,2L,5L) # unq +test(4.03, sm(x, y), bm(x, y)) +x = c(1L,2L,4L,3L,3L) +y = c(5L,2L,3L,3L) +test(4.04, sm(x, y), bm(x, y)) -#... +# ties +x = c(1L,2L,3L,4L,5L) +y = c(2L,4L) # within +test(5.01, sm(x, y), bm(x, y)) +x = c(1L,2L,3L,4L,5L) +y = c(-1L,2L,4L) # left tie +test(5.02, sm(x, y), bm(x, y)) +x = c(1L,2L,3L,4L,5L) +y = c(2L,4L,7L) # right tie +test(5.03, sm(x, y), bm(x, y)) +x = c(1L,2L,3L,4L,5L) +y = c(-1L,2L,4L,6L) # both ties +test(5.04, sm(x, y), bm(x, y)) -## xy sorted +# nomatch +x = c(1L,3L,5L) +y = c(2L,4L) # within nomatch +test(6.01, sm(x, y), bm(x, y)) +x = c(1L,2L,3L,4L,5L) +y = c(-1L,6L) # ties nomatch +test(6.02, sm(x, y), bm(x, y)) +x = c(1L,2L,2L,2L,5L) +y = c(2L,4L) # x duplicates single match +test(6.03, sm(x, y), bm(x, y)) +x = c(1L,2L,2L,2L,3L,3L,4L,5L) +y = c(2L,4L) # x duplicates multi, single match +test(6.04, sm(x, y), bm(x, y)) +x = c(1L,2L,2L,2L,3L,4L,4L,5L) +y = c(2L,4L) # x duplicates multi, multi match +test(6.05, sm(x, y), bm(x, y)) +x = c(1L,2L,2L,2L,5L) +y = c(-1L,6L) # x duplicates nomatch +test(6.06, sm(x, y), bm(x, y)) + +# skew +N = 2e3L +x = seq_len(N) +y = c(head(x), tail(x)) +test(7.01, sm(x, y), bm(x, y)) +y = c(1:6, 750L, 1250L, 1995:2000) +test(7.02, sm(x, y), bm(x, y)) + +# custom cases +x=c(39L, 41L, 41L, 37L, 86L, 93L, 20L, 34L, 38L, 21L, 79L, 84L, +2L, 80L, 51L, 58L, 66L, 33L, 32L, 22L, 24L, 4L, 67L, 59L, 89L, +1L, 44L, 62L, 34L, 18L, 93L, 67L, 22L, 42L, 8L, 72L, 45L, 87L, +41L, 85L, 30L, 61L, 5L, 45L, 48L, 41L, 57L, 63L, 68L, 96L, 72L, +62L, 14L, 84L, 57L, 43L, 6L, 49L, 33L, 68L, 2L, 18L, 69L, 41L, +2L, 52L, 69L, 94L, 56L, 72L, 13L, 50L, 86L, 81L, 8L, 28L, 96L, +28L, 87L, 28L, 1L, 27L, 60L, 61L, 99L, 19L, 39L, 99L, 67L, 70L, +53L, 86L, 64L, 49L, 99L, 91L, 36L, 7L, 57L, 63L) +y=c(44L, 50L, 47L, 26L, 44L, 11L, 18L, 60L, 9L, 96L, 25L, 59L, +53L, 82L, 4L, 41L, 65L, 30L, 29L, 34L, 29L, 23L, 12L, 40L, 76L, +40L, 30L, 29L, 98L, 2L, 57L, 13L, 44L, 68L, 72L, 82L, 19L, 88L, +19L, 95L, 22L, 46L, 43L, 36L, 67L, 96L, 34L, 6L, 16L, 20L, 86L, +65L, 89L, 78L, 36L, 95L, 19L, 67L, 65L, 99L, 59L, 77L, 16L, 50L, +99L, 98L, 72L, 26L, 35L, 46L, 52L, 55L, 56L, 1L, 91L, 21L, 52L, +69L, 7L, 87L, 97L, 97L, 71L, 48L, 6L, 35L, 62L, 26L, 44L, 36L, +50L, 75L, 100L, 63L, 39L, 3L, 94L, 85L, 99L, 61L) +test(8.01, sm(x, y), bm(x, y)) + +# scale up ssa = function(unq_n, size, sort=FALSE) { - stopifnot(unq_n <= size) + if (unq_n <= size) return(sample.int(unq_n, size, TRUE)) unq_sub = seq_len(unq_n) ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE))) - if (unq_n==size) stopifnot(uniqueN(ans)==length(ans)) if (sort) sort(ans) else ans } set.seed(108) N = 1e2L -x = ssa(N, N, sort=TRUE) -y = ssa(N, N, sort=TRUE) +## xy sorted +x = ssa(N, N, sort=TRUE) # unq +y = ssa(N, N, sort=TRUE) # unq test(11.01, sm(x, y), bm(x, y)) x = ssa(N, N*1.1, sort=TRUE) -y = ssa(N, N, sort=TRUE) +y = ssa(N, N, sort=TRUE) # unq test(11.02, sm(x, y), bm(x, y)) -x = ssa(N, N, sort=TRUE) +x = ssa(N, N, sort=TRUE) # unq y = ssa(N, N*1.1, sort=TRUE) test(11.03, sm(x, y), bm(x, y)) x = ssa(N, N*1.1, sort=TRUE) y = ssa(N, N*1.1, sort=TRUE) test(11.04, sm(x, y), bm(x, y)) - ## y unsorted -x = ssa(N, N, sort=TRUE) -y = ssa(N, N) +x = ssa(N, N, sort=TRUE) # unq +y = ssa(N, N) # unq test(12.01, sm(x, y), bm(x, y)) x = ssa(N, N*1.1, sort=TRUE) -y = ssa(N, N) +y = ssa(N, N) # unq test(12.02, sm(x, y), bm(x, y)) -x = ssa(N, N, sort=TRUE) +x = ssa(N, N, sort=TRUE) # unq y = ssa(N, N*1.1) test(12.03, sm(x, y), bm(x, y)) x = ssa(N, N*1.1, sort=TRUE) y = ssa(N, N*1.1) test(12.04, sm(x, y), bm(x, y)) - +## x unsorted +x = ssa(N, N) # unq +y = ssa(N, N, sort=TRUE) # unq +test(13.01, sm(x, y), bm(x, y)) +x = ssa(N, N*1.1) +y = ssa(N, N, sort=TRUE) # unq +test(13.02, sm(x, y), bm(x, y)) +x = ssa(N, N) # unq +y = ssa(N, N*1.1, sort=TRUE) +test(13.03, sm(x, y), bm(x, y)) +x = ssa(N, N*1.1) +y = ssa(N, N*1.1, sort=TRUE) +test(13.04, sm(x, y), bm(x, y)) ## xy unsorted -x = ssa(N, N) -y = ssa(N, N) +x = ssa(N, N) # unq +y = ssa(N, N) # unq test(14.01, sm(x, y), bm(x, y)) x = ssa(N, N*1.1) -y = ssa(N, N) +y = ssa(N, N) # unq test(14.02, sm(x, y), bm(x, y)) -x = ssa(N, N) +x = ssa(N, N) # unq y = ssa(N, N*1.1) test(14.03, sm(x, y), bm(x, y)) x = ssa(N, N*1.1) @@ -115,28 +206,30 @@ y = ssa(N, N*1.1) test(14.04, sm(x, y), bm(x, y)) # sparse -x = sample(2e2L, 1e2L) ## may fail when no matches! -y = sample(2e2L, 1e2L) +x = sample.int(2e2L, 1e2L) +y = sample.int(2e2L, 1e2L) test(21.01, sm(x, y), bm(x, y)) -x = sample(2e2L, 1e2L, TRUE) -y = sample(2e2L, 1e2L, TRUE) +x = sample.int(2e2L, 1e2L, TRUE) +y = sample.int(2e2L, 1e2L, TRUE) test(21.02, sm(x, y), bm(x, y)) -d1 = data.table(x=x, v1=seq_along(x)) -d2 = data.table(y=y, v2=seq_along(y)) +# [.data.table join +d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_along(x)) +d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_along(y)) options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #4491 test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge") options(datatable.smerge=TRUE) test(101.02, d1[d2, on="x==y"], expected, output="smerge") ## for now extra computation of bmerge is still done so no: #, notOutput="bmerge") - setindexv2 = function(x, cols) { ## pretend we are after #4386 stopifnot(is.data.table(x), is.character(cols)) if (is.null(attr(x, "index", TRUE))) setattr(x, "index", integer()) setattr(attr(x, "index", TRUE), paste0("__", cols, collapse="__"), forderv(x, cols, retGrp=TRUE)) invisible(x) } -setindexv2(d1, "x") -setindexv2(d2, "y") +options(datatable.verbose=FALSE) +setindexv2(d1, "x"); setindexv2(d2, "y") +options(datatable.use.index=TRUE, datatable.verbose=TRUE) test(101.03, d1[d2, on="x==y"], expected, output="smerge.*already indexed") options(datatable.use.index=FALSE) test(101.04, d1[d2, on="x==y"], expected, output="smerge", notOutput="already indexed") +options(datatable.use.index=TRUE) diff --git a/src/smerge.c b/src/smerge.c index 8ca13dd616..5ed2ef20a9 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -26,14 +26,13 @@ static void smerge(const int bx_off, const int bnx, uint64_t cnt = 0; bool xlen1 = true, ylen1 = true, xylen1 = true; if (unq_x && unq_y) { - int i=0, j=0; - const int *restrict bx = x + bx_off; - const int *restrict by = y + by_off; - while (i0) Rprintf("rollbs: side=%d; val=%d\n", side, val); + if (verbose>0) Rprintf("rollbs: side=%d=%s; val=%d\n", side, side<0?"min":"max", val); + if (x[ix[0]-1] == val) { // common early stopping + if (verbose>0) Rprintf("rollbs: min elements %d match: 0\n", val); + return 0; + } + if (x[ix[nix-1]-1] == val) { + if (verbose>0) Rprintf("rollbs: max elements %d match: nix-1=%d\n", val, nix-1); + return nix-1; + } if (side < 0) { // lower bound early stopping - if (x[ix[0]-1] == val) { - if (verbose>0) Rprintf("rollbs: min elements %d match\n", val); - return 0; - } if (x[ix[nix-1]-1] < val) { - if (verbose>0) Rprintf("rollbs: max element %d is still smaller than %d\n", x[ix[nix-1]-1], val); + if (verbose>0) Rprintf("rollbs: max element %d is still smaller than %d: -1\n", x[ix[nix-1]-1], val); return -1; } if (x[ix[0]-1] > val) { - if (verbose>0) Rprintf("rollbs: min element %d is bigger than %d\n", x[ix[0]-1], val); + if (verbose>0) Rprintf("rollbs: min element %d is bigger than %d: 0\n", x[ix[0]-1], val); return 0; } } else if (side > 0) { // upper bound early stopping - if (x[ix[nix-1]-1] == val) { - if (verbose>0) Rprintf("rollbs: max elements %d match\n", val); - return nix-1; - } if (x[ix[0]-1] > val) { - if (verbose>0) Rprintf("rollbs: min element %d is still bigger than %d\n", x[ix[0]-1], val); + if (verbose>0) Rprintf("rollbs: min element %d is still bigger than %d: -1\n", x[ix[0]-1], val); return -1; } if (x[ix[nix-1]-1] < val) { - if (verbose>0) Rprintf("rollbs: max element %d is smaller than %d\n", x[ix[nix-1]-1], val); + if (verbose>0) Rprintf("rollbs: max element %d is smaller than %d: nix-1=%d\n", x[ix[nix-1]-1], val, nix-1); return nix-1; } } @@ -192,7 +189,7 @@ static int rollbs(const int *restrict x, const int *restrict ix, const int nix, else if (thisx > val) upper = i-1; } - if (verbose>0) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d\n", i, x[ix[i]-1], lower, upper, side); + if (verbose>0) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d: %d\n", i, x[ix[i]-1], lower, upper, side, side<0?lower:upper); if (side < 0) // anyone to stress test this logic? return lower; else @@ -205,10 +202,17 @@ static void batching(const int nBatch, const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts, int *restrict Bx_off, int *restrict Bnx, int *restrict By_off, int *restrict Bny, const int verbose) { + //if (nBatch > nx_starts || (nx_starts/nBatch==1 && nx_starts%nBatch>0)) error("internal error: batching %d input into %d batches, number of batches should have been reduced be now", nx_starts, nBatch); // # nocov + //size_t batchSize = (nx_starts-1)/nBatch + 1; // this is fragile, at least when we hardcode nBatch + //size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; + size_t batchSize = nx_starts / nBatch; + size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; // because of the above line last batch can be anything between 1 and 2*batchSize-1 if (verbose>0) - Rprintf("batching: %d batches of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nBatch); - size_t batchSize = (nx_starts-1)/nBatch + 1; - size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; + Rprintf("batching: input %d into %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, nBatch, batchSize, lastBatchSize); + if (lastBatchSize==0) + error("internal error: lastBatchSize must not be zero"); // # nocov + if ((nBatch-1) * batchSize + lastBatchSize != nx_starts) + error("internal error: batching %d input is attempting to use invalid batches: nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, nBatch, batchSize, lastBatchSize); // # nocov bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV for (int b=0; b= 0 && y_i_max >= 0; + if (extra_validate && y_match) { int y_i_min2 = min_i_match(y, y_starts, ny_starts, x[x_i_min-1]); int y_i_max2 = max_i_match(y, y_starts, ny_starts, x[x_i_max-1]); if (y_i_min!=y_i_min2) @@ -225,16 +230,18 @@ static void batching(const int nBatch, if (y_i_max!=y_i_max2) error("y upper bound different: %d vs %d", y_i_max2, y_i_max); } - By_off[b] = y_i_min; - Bny[b] = y_i_max - y_i_min + 1; + By_off[b] = y_match ? y_i_min : 0; + Bny[b] = y_match ? y_i_max - y_i_min + 1 : 0; } if (verbose>0) { // print batches, 1-indexed! x y sorted! for debugging and verbose for (int b=0; b= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max); + if (Bny[b] > 0) { + int x_i_min = (x_starts + Bx_off[b])[0], x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1]; + int y_i_min = (y_starts + By_off[b])[0], y_i_max = (y_starts + By_off[b])[Bny[b]-1]; + Rprintf("## lower: x[%d]: %d <= %d :y[%d]\n", x_i_min, x[x_i_min-1], y[y_i_min-1], y_i_min); + Rprintf("## upper: x[%d]: %d >= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max); + } } } return; @@ -279,7 +286,9 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, uint64_t *n_match, int *x_lens1, int *y_lens1, int *xy_lens1, const int verbose) { - double t = omp_get_wtime(); + double t = 0; + if (verbose>0) + t = omp_get_wtime(); int *restrict x_lens = 0, *restrict y_lens = 0; const bool unq_x = nx_starts==nx, unq_y = ny_starts==ny; if (!unq_x) { @@ -293,36 +302,51 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, if (verbose>0) Rprintf("smergeC: grpLens %s took %.3fs\n", verboseDone(!unq_x, !unq_y, "(already unique)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t); - t = omp_get_wtime(); - int nth = getDTthreads(); - int nBatch = 0; // for a single threaded or small unq count use single batch - if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024 // #DEV - nBatch = 1; + if (verbose>0) + t = omp_get_wtime(); + int nBatch = 0; + const int nth = getDTthreads(); + if (nth == 1 || nx_starts < -1) { // nx_starts threshold disabled during dev + nBatch = 1; // any hardcoding here is likely to raise internal error in batching or segfault + } else if (nx_starts < nth * 2) { + nBatch = nx_starts; // stress test single row batches, will be usually escaped by branch above } else { nBatch = nth * 2; } - int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used int *restrict Bx_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bnx = (int *)R_alloc(nBatch, sizeof(int)); int *restrict By_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bny = (int *)R_alloc(nBatch, sizeof(int)); batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, verbose-1); + int *restrict th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used if (verbose>0) Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t); - t = omp_get_wtime(); + if (verbose>0) + t = omp_get_wtime(); bool xlens1 = true, ylens1 = true, xylens1 = true; uint64_t nmatch = 0; + //for (int z=0; z0) Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); // all threads may not always be used bc schedule(dynamic) @@ -330,7 +354,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, return; } -void sortInt(int *x, int nx, int *idx, int *ans) { +void sortInt(const int *restrict x, const int nx, const int *restrict idx, int *restrict ans) { #pragma omp parallel for schedule(static) num_threads(getDTthreads()) for (int i=0; i0) + t_total = omp_get_wtime(); if (!isInteger(x) || !isInteger(y)) error("'x' and 'y' must be integer"); if (!IS_TRUE_OR_FALSE(out_bmerge)) error("'out.bmerge' must be TRUE or FALSE"); int protecti = 0, nx = LENGTH(x), ny = LENGTH(y); - double t = omp_get_wtime(); - bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx); + if (verbose>0) + t = omp_get_wtime(); + const bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx); if (do_x_idx) { x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533 } @@ -409,8 +436,9 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { if (verbose>0) Rprintf("smergeR: index %s took %.3fs\n", verboseDone(do_x_idx, do_y_idx, "(already indexed)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t); - t = omp_get_wtime(); - bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx); + if (verbose>0) + t = omp_get_wtime(); + const bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx); int *xp, *yp; if (!x_ord) { xp = (int *)R_alloc(nx, sizeof(int)); @@ -427,7 +455,8 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { if (verbose>0) Rprintf("smergeR: sort %s took %.3fs\n", verboseDone(!x_ord, !y_ord, "(already sorted)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t); - t = omp_get_wtime(); + if (verbose>0) + t = omp_get_wtime(); SEXP out_starts = R_NilValue, out_lens = R_NilValue; int *restrict starts=0, *restrict lens=0; if (x_ord) { // we dont need to reorder results so can save one allocation @@ -452,7 +481,8 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { if (verbose>0) Rprintf("smergeR: alloc of size %d took %.3fs\n", nx, omp_get_wtime() - t); - t = omp_get_wtime(); + if (verbose>0) + t = omp_get_wtime(); smergeC( xp, nx, INTEGER(x_starts), LENGTH(x_starts), yp, ny, INTEGER(y_starts), LENGTH(y_starts), @@ -469,7 +499,8 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { if (verbose>0) Rprintf("smergeR: smergeC of %d x %d = %"PRIu64"; took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t); - t = omp_get_wtime(); + if (verbose>0) + t = omp_get_wtime(); SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, (bool)LOGICAL(out_bmerge)[0]); if (verbose>0) Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(unsort)", omp_get_wtime() - t); From 6423c0b489a1a2917769644b56a2ab8521e59f12 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 9 Jun 2020 23:39:53 +0100 Subject: [PATCH 20/34] batching balanced again --- inst/tests/smerge.Rraw | 5 +++- src/smerge.c | 52 +++++++++++++++++++++++------------------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw index 99eb2cfef9..78ae166982 100644 --- a/inst/tests/smerge.Rraw +++ b/inst/tests/smerge.Rraw @@ -9,12 +9,14 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { smerge = data.table:::smerge bmerge = data.table:::bmerge forderv = data.table:::forderv + vecseq = data.table:::vecseq } bm = function(x, y) { stopifnot(is.integer(x), is.integer(y)) ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE) ## if undefining SMERGE_STATS then we have to ignore allLen1 as well + ans$nMatch = as.numeric(sum(!is.na(vecseq(ans$starts, ans$lens, NULL)))) ans } sm = function(x, y) { @@ -22,6 +24,7 @@ sm = function(x, y) { #saveRDS(x, "~/git/data.table/x.RDS"); saveRDS(y, "~/git/data.table/y.RDS") ans = smerge(x, y, out.bmerge=TRUE) ## if undefining SMERGE_STATS then we have to ignore allLen1 as well + ans$nMatch = smerge(x, y, out.bmerge=FALSE)$nMatch ans } @@ -151,7 +154,7 @@ ssa = function(unq_n, size, sort=FALSE) { if (sort) sort(ans) else ans } set.seed(108) -N = 1e2L +N = 1e4L ## xy sorted x = ssa(N, N, sort=TRUE) # unq y = ssa(N, N, sort=TRUE) # unq diff --git a/src/smerge.c b/src/smerge.c index 5ed2ef20a9..cdfdef57aa 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -13,8 +13,11 @@ * x_len1/x_lens1/lhsLen1: signals if multiple matches in LHS of join, ['s _i_ table * xy_len1/xy_lens1/xlLen1: signals if "many to many" matches between LHS and RHS * cnt/nmatch/n_match/n_matchr: count of matches, taking multiple matches into account, uint64_t + * + * when hardcoding or changing default number of batches it is advised to undefine SMERGE_BATCHING_BALANCED */ #define SMERGE_STATS +#define SMERGE_BATCHING_BALANCED // workhorse join that runs in parallel on batches static void smerge(const int bx_off, const int bnx, @@ -202,18 +205,20 @@ static void batching(const int nBatch, const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts, int *restrict Bx_off, int *restrict Bnx, int *restrict By_off, int *restrict Bny, const int verbose) { +#ifdef SMERGE_BATCHING_BALANCED //if (nBatch > nx_starts || (nx_starts/nBatch==1 && nx_starts%nBatch>0)) error("internal error: batching %d input into %d batches, number of batches should have been reduced be now", nx_starts, nBatch); // # nocov - //size_t batchSize = (nx_starts-1)/nBatch + 1; // this is fragile, at least when we hardcode nBatch - //size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; - size_t batchSize = nx_starts / nBatch; - size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; // because of the above line last batch can be anything between 1 and 2*batchSize-1 + size_t batchSize = (nx_starts-1)/nBatch + 1; // this is fragile for arbitrary nBatch + bool balanced = true; // this is only for verbose message +#else + size_t batchSize = nx_starts / nBatch; // last batch size can be anything between 1 and 2*batchSize-1 + bool balanced = false; +#endif + size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; if (verbose>0) - Rprintf("batching: input %d into %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, nBatch, batchSize, lastBatchSize); - if (lastBatchSize==0) - error("internal error: lastBatchSize must not be zero"); // # nocov - if ((nBatch-1) * batchSize + lastBatchSize != nx_starts) - error("internal error: batching %d input is attempting to use invalid batches: nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, nBatch, batchSize, lastBatchSize); // # nocov - bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV + Rprintf("batching: input %d into %s %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); + if (lastBatchSize==0 || ((nBatch-1) * batchSize + lastBatchSize != nx_starts)) + error("internal error: batching %d input is attempting to use invalid batches: balanced=%d, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov + bool extra_validate = false; // validates against very slow m[in|ax]_i_match // this needs to be removed #DEV for (int b=0; b Date: Tue, 9 Jun 2020 23:51:22 +0100 Subject: [PATCH 21/34] remove extra checks in bmerge to smerge opt --- R/bmerge.R | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index 3ec080fb18..c385eab265 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -1,7 +1,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose) { - smergeOpt = FALSE if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer && isTRUE(getOption("datatable.smerge")) ## enable option && identical(nomatch, NA_integer_) ## for now only outer join @@ -16,9 +15,9 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (!is.null(attr(idx, "starts", exact=TRUE))) idx } if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()} - smans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE) + ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE) if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()} - smergeOpt = TRUE + return(ans) } callersi = i i = shallow(i) @@ -200,10 +199,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) if (verbose) {cat("bmerge done in",timetaken(last.started.at),"\n"); flush.console()} - if (smergeOpt && !identical(ans, smans)) { - (if (!interactive()) stop else message)("bmerge opt to smerge produced different results") - browser() - } return(ans) } From 739a35a4fdc8254a17b666e5426ac3cb9795bd44 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 10 Jun 2020 21:34:57 +0100 Subject: [PATCH 22/34] fix test function --- inst/tests/smerge.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw index 78ae166982..31dcd6e07b 100644 --- a/inst/tests/smerge.Rraw +++ b/inst/tests/smerge.Rraw @@ -148,7 +148,7 @@ test(8.01, sm(x, y), bm(x, y)) # scale up ssa = function(unq_n, size, sort=FALSE) { - if (unq_n <= size) return(sample.int(unq_n, size, TRUE)) + if (unq_n > size) return(sample.int(unq_n, size)) unq_sub = seq_len(unq_n) ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE))) if (sort) sort(ans) else ans From 47cfd8a68df469bf9765956fcf71d1c5a3d6cf94 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 10 Jun 2020 21:35:54 +0100 Subject: [PATCH 23/34] improve verbose msg --- src/smerge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/smerge.c b/src/smerge.c index cdfdef57aa..6028110961 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -509,7 +509,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { t = omp_get_wtime(); SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, (bool)LOGICAL(out_bmerge)[0]); if (verbose>0) - Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(unsort)", omp_get_wtime() - t); + Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(alloc and unsort)", omp_get_wtime() - t); if (verbose>0) Rprintf("smergeR: all took %.3fs\n", omp_get_wtime() - t_total); From 2806017cba78d6def44fee85d09b14bf3629e3f1 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Thu, 11 Jun 2020 00:38:54 +0100 Subject: [PATCH 24/34] cleanup dev mostly --- src/smerge.c | 81 +++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 51 deletions(-) diff --git a/src/smerge.c b/src/smerge.c index 6028110961..2049065087 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -123,28 +123,10 @@ static void smerge(const int bx_off, const int bnx, } } } - //Rprintf("cnt=%"PRIu64"\n", cnt); xlens1[0] = xlen1; ylens1[0] = ylen1; xylens1[0] = xylen1; nmatch[0] = cnt; return; } -// those two helpers needs to be removed once we will confirm rollbs works as expected, they find "first/last within" index -static int max_i_match(const int *restrict y, const int *restrict y_starts, const int ny_starts, const int val) { - int i; - for (i=0; i= val: %d >= %d\n", i, y[y_starts[i]-1], val); - if (y[y_starts[i]-1]==val) return(i); - else if (y[y_starts[i]-1]>val) return(i-1); - } - return(i-1); -} -static int min_i_match(const int *restrict y, const int *restrict y_starts, const int ny_starts, const int val) { - int i; - for (i=ny_starts-1; i>=0; --i) { //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val); - if (y[y_starts[i]-1]==val) return(i); - else if (y[y_starts[i]-1]0) Rprintf("rollbs: side=%d=%s; val=%d\n", side, side<0?"min":"max", val); + if (verbose>0) + Rprintf("rollbs: side=%d=%s; val=%d\n", side, side<0?"min":"max", val); // # nocov if (x[ix[0]-1] == val) { // common early stopping - if (verbose>0) Rprintf("rollbs: min elements %d match: 0\n", val); + if (verbose>0) + Rprintf("rollbs: min elements %d match: 0\n", val); // # nocov return 0; } if (x[ix[nix-1]-1] == val) { - if (verbose>0) Rprintf("rollbs: max elements %d match: nix-1=%d\n", val, nix-1); + if (verbose>0) + Rprintf("rollbs: max elements %d match: nix-1=%d\n", val, nix-1); // # nocov return nix-1; } if (side < 0) { // lower bound early stopping if (x[ix[nix-1]-1] < val) { - if (verbose>0) Rprintf("rollbs: max element %d is still smaller than %d: -1\n", x[ix[nix-1]-1], val); + if (verbose>0) + Rprintf("rollbs: max element %d is still smaller than %d: -1\n", x[ix[nix-1]-1], val); // # nocov return -1; } if (x[ix[0]-1] > val) { - if (verbose>0) Rprintf("rollbs: min element %d is bigger than %d: 0\n", x[ix[0]-1], val); + if (verbose>0) + Rprintf("rollbs: min element %d is bigger than %d: 0\n", x[ix[0]-1], val); // # nocov return 0; } } else if (side > 0) { // upper bound early stopping if (x[ix[0]-1] > val) { - if (verbose>0) Rprintf("rollbs: min element %d is still bigger than %d: -1\n", x[ix[0]-1], val); + if (verbose>0) + Rprintf("rollbs: min element %d is still bigger than %d: -1\n", x[ix[0]-1], val); // # nocov return -1; } if (x[ix[nix-1]-1] < val) { - if (verbose>0) Rprintf("rollbs: max element %d is smaller than %d: nix-1=%d\n", x[ix[nix-1]-1], val, nix-1); + if (verbose>0) + Rprintf("rollbs: max element %d is smaller than %d: nix-1=%d\n", x[ix[nix-1]-1], val, nix-1); // # nocov return nix-1; } } @@ -184,7 +173,7 @@ static int rollbs(const int *restrict x, const int *restrict ix, const int nix, while (lower<=upper) { i = lower + (upper-lower)/2; int thisx = x[ix[i]-1]; - //if (verbose>0) Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val); + //Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val); // if (verbose) here would slow down while loop so need to be commented out if (thisx==val) return(i); else if (thisx < val) @@ -192,7 +181,8 @@ static int rollbs(const int *restrict x, const int *restrict ix, const int nix, else if (thisx > val) upper = i-1; } - if (verbose>0) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d: %d\n", i, x[ix[i]-1], lower, upper, side, side<0?lower:upper); + if (verbose>0) + Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d: %d\n", i, x[ix[i]-1], lower, upper, side, side<0?lower:upper); // # nocov if (side < 0) // anyone to stress test this logic? return lower; else @@ -218,7 +208,6 @@ static void batching(const int nBatch, Rprintf("batching: input %d into %s %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); if (lastBatchSize==0 || ((nBatch-1) * batchSize + lastBatchSize != nx_starts)) error("internal error: batching %d input is attempting to use invalid batches: balanced=%d, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov - bool extra_validate = false; // validates against very slow m[in|ax]_i_match // this needs to be removed #DEV for (int b=0; b= 0 && y_i_max >= 0; - if (extra_validate && y_match) { - int y_i_min2 = min_i_match(y, y_starts, ny_starts, x[x_i_min-1]); - int y_i_max2 = max_i_match(y, y_starts, ny_starts, x[x_i_max-1]); - if (y_i_min!=y_i_min2) - error("y lower bound different: %d vs %d", y_i_min2, y_i_min); - if (y_i_max!=y_i_max2) - error("y upper bound different: %d vs %d", y_i_max2, y_i_max); - } By_off[b] = y_match ? y_i_min : 0; Bny[b] = y_match ? y_i_max - y_i_min + 1 : 0; } @@ -252,7 +233,7 @@ static void batching(const int nBatch, return; } -// helper to count how many threads were used +// helper for verbose messages to count how many threads were used, due to schedule dynamic not all may be used static int unqNth(const int *x, const int nx) { // x have 0:(nx-1) values int ans = 0; uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t)); @@ -266,13 +247,13 @@ static int unqNth(const int *x, const int nx) { // x have 0:(nx-1) values // helper for verbose messages to print what was actually computed static const char *verboseDone(const bool x, const bool y, const char *not_xy, const char *not_x, const char *not_y, const char *xy) { if (!x && !y) - return(not_xy); + return not_xy; else if (!x && y) - return(not_x); + return not_x; else if (x && !y) - return(not_y); + return not_y; else - return(xy); + return xy; } // count of duplicated entries @@ -311,7 +292,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, t = omp_get_wtime(); int nBatch = 0; const int nth = getDTthreads(); - if (nth == 1 || nx_starts < 1024) { // nx_starts threshold disabled during dev + if (nth == 1 || nx_starts < 1024) { #ifdef SMERGE_BATCHING_BALANCED nBatch = 1; // when using balanced lastBatchSize is never bigger than batchSize, any hardcoding here is likely to raise internal error in batching or segfault, testing possible with: for (i in 1:10) cc("smerge.Rraw") #else @@ -333,9 +314,6 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, t = omp_get_wtime(); bool xlens1 = true, ylens1 = true, xylens1 = true; uint64_t nmatch = 0; - //for (int z=0; z0) Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); // all threads may not always be used bc schedule(dynamic) @@ -367,7 +343,7 @@ void sortInt(const int *restrict x, const int nx, const int *restrict idx, int * return; } -// wrap results into list, bmerge=true this produce bmerge consistent output, note that bmerge i,x is smerge x,y +// wrap results into list, bmerge=true produces bmerge replacement output, note that bmerge i,x is smerge x,y (as of now) SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict lens, const bool x_ord, SEXP out_starts, SEXP out_lens, // used only when x was sorted, saves one allocation SEXP x_idx, SEXP y_idx, @@ -423,6 +399,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { error("'x' and 'y' must be integer"); if (!IS_TRUE_OR_FALSE(out_bmerge)) error("'out.bmerge' must be TRUE or FALSE"); + const bool ans_bmerge = (bool)LOGICAL(out_bmerge)[0]; int protecti = 0, nx = LENGTH(x), ny = LENGTH(y); if (verbose>0) @@ -474,10 +451,12 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { starts = (int *)R_alloc(nx, sizeof(int)); lens = (int *)R_alloc(nx, sizeof(int)); } + // this fills default values, bmerge's defaults are tricky (dictated by how they are consumed): nomatch=0 makes starts=0 not NA, lens=0 is fine there; nomatch=NA makes lens=1 not NA, starts=NA is fine there + const int default_lens = 1; //ans_bmerge ? 1 : NA_INTEGER; // for now keep it fixed to bmerge nomatch out: 1 #pragma omp parallel for schedule(static) num_threads(getDTthreads()) for (int i=0; i (uint64_t)DBL_MAX) { + if (n_match > (uint64_t)DBL_MAX) { // 1e9x1e9 cartesian join 1e18 still less than DBL_MAX, should we check against DBL_MAX or something lesser? we cast uinst64_t to double here REAL(n_matchr)[0] = NA_REAL; warning("count of matches exceeds DBL_MAX, returning NA in 'nMatch' field"); } else { @@ -507,7 +486,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { if (verbose>0) t = omp_get_wtime(); - SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, (bool)LOGICAL(out_bmerge)[0]); + SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, ans_bmerge); if (verbose>0) Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(alloc and unsort)", omp_get_wtime() - t); if (verbose>0) From 4e5f56d23b5fb4a49b8aed88f89bdd47e7088caf Mon Sep 17 00:00:00 2001 From: jangorecki Date: Thu, 11 Jun 2020 02:01:04 +0100 Subject: [PATCH 25/34] comment about thread utilization --- src/smerge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/smerge.c b/src/smerge.c index 2049065087..e1a64e313d 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -327,7 +327,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, &bnmatch, &bxlens1, &bylens1, &bxylens1 // reduction output ); nmatch += bnmatch; xlens1 = bxlens1 && xlens1; ylens1 = bylens1 && ylens1; xylens1 = bxylens1 && xylens1; - th[b] = omp_get_thread_num(); + th[b] = omp_get_thread_num(); // only to report thread utilization in verbose message } x_lens1[0] = (int)xlens1; y_lens1[0] = (int)ylens1; xy_lens1[0] = (int)xylens1; n_match[0] = nmatch; if (verbose>0) From 405f55d7bb62e502380321db522fdffc57b234de Mon Sep 17 00:00:00 2001 From: jangorecki Date: Fri, 12 Jun 2020 11:47:01 +0100 Subject: [PATCH 26/34] mult support --- R/wrappers.R | 2 +- inst/tests/smerge.Rraw | 15 ++-- src/data.table.h | 2 +- src/smerge.c | 195 ++++++++++++++++++++++++++++++++--------- 4 files changed, 161 insertions(+), 53 deletions(-) diff --git a/R/wrappers.R b/R/wrappers.R index 83e3d73417..636e6e04ee 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -13,4 +13,4 @@ coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) -smerge = function(x, y, x.idx=NULL, y.idx=NULL, out.bmerge=FALSE) .Call(CsmergeR, x, y, x.idx, y.idx, out.bmerge) +smerge = function(x, y, x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.idx, y.idx, match.arg(mult), out.bmerge) diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw index 31dcd6e07b..3a3bf9812a 100644 --- a/inst/tests/smerge.Rraw +++ b/inst/tests/smerge.Rraw @@ -12,19 +12,18 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { vecseq = data.table:::vecseq } -bm = function(x, y) { +bm = function(x, y, mult="all") { stopifnot(is.integer(x), is.integer(y)) - ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE) + ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=1L, verbose=FALSE) ## if undefining SMERGE_STATS then we have to ignore allLen1 as well ans$nMatch = as.numeric(sum(!is.na(vecseq(ans$starts, ans$lens, NULL)))) ans } -sm = function(x, y) { +sm = function(x, y, mult="all") { stopifnot(is.integer(x), is.integer(y)) - #saveRDS(x, "~/git/data.table/x.RDS"); saveRDS(y, "~/git/data.table/y.RDS") - ans = smerge(x, y, out.bmerge=TRUE) + ans = smerge(x, y, mult=mult, out.bmerge=TRUE) ## if undefining SMERGE_STATS then we have to ignore allLen1 as well - ans$nMatch = smerge(x, y, out.bmerge=FALSE)$nMatch + ans$nMatch = smerge(x, y, mult=mult, out.bmerge=FALSE)$nMatch ans } @@ -39,8 +38,8 @@ test(1.01, sm(x, y), bm(x, y)) x = c(1L,2L,3L,3L,4L) y = c(2L,3L,5L) # unq test(1.02, sm(x, y), bm(x, y)) -x = c(1L,2L,3L,4L) -y = c(2L,3L,3L,5L) # unq +x = c(1L,2L,3L,4L) # unq +y = c(2L,3L,3L,5L) test(1.03, sm(x, y), bm(x, y)) x = c(1L,2L,3L,3L,4L) y = c(2L,3L,3L,5L) diff --git a/src/data.table.h b/src/data.table.h index 9ceb722cea..231f8f28b4 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -250,4 +250,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); // smjoin.c -SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge); +SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge); diff --git a/src/smerge.c b/src/smerge.c index e1a64e313d..41de40d82e 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -19,13 +19,17 @@ #define SMERGE_STATS #define SMERGE_BATCHING_BALANCED +// this only refers to RHS side: y +enum emult{ALL, FIRST, LAST, ERR}; + // workhorse join that runs in parallel on batches static void smerge(const int bx_off, const int bnx, const int by_off, const int bny, const int *restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x, const int *restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y, int *restrict starts, int *restrict lens, - uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) { + uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1, + const enum emult mult) { uint64_t cnt = 0; bool xlen1 = true, ylen1 = true, xylen1 = true; if (unq_x && unq_y) { @@ -49,23 +53,59 @@ static void smerge(const int bx_off, const int bnx, } else if (unq_x) { int i = bx_off, js = by_off; const int ni = bx_off+bnx, njs = by_off+bny; - while (i1) - ylen1 = false; - cnt += (uint64_t)yl1; + if (ylen1 && yl1>1) + ylen1 = false; + cnt += (uint64_t)yl1; #endif - } else if (x_i < y_j) { - i++; - } else if (x_i > y_j) { - js++; + } else if (x_i < y_j) { + i++; + } else if (x_i > y_j) { + js++; + } + } + } else if (mult == FIRST) { + while (i y_j) { + js++; + } + } + } else if (mult == LAST) { + while (i y_j) { + js++; + } } } } else if (unq_y) { @@ -96,30 +136,78 @@ static void smerge(const int bx_off, const int bnx, } else { int is = bx_off, js = by_off; const int nis = bx_off+bnx, njs = by_off+bny; - while (is1) + xlen1 = false; + if (ylen1 && yl1>1) + ylen1 = false; + if (xylen1 && xl1>1 && yl1>1) + xylen1 = false; + cnt += (uint64_t)xl1 * (uint64_t)yl1; + #endif + } else if (x_i < y_j) { + is++; + } else if (x_i > y_j) { + js++; } - is++; + } + } else if (mult==FIRST) { + while (is1) - xlen1 = false; - if (ylen1 && yl1>1) - ylen1 = false; - if (xylen1 && xl1>1 && yl1>1) - xylen1 = false; - cnt += (uint64_t)xl1 * (uint64_t)yl1; + if (xlen1 && xl1>1) + xlen1 = false; + cnt += (uint64_t)xl1; #endif - } else if (x_i < y_j) { - is++; - } else if (x_i > y_j) { - js++; + } else if (x_i < y_j) { + is++; + } else if (x_i > y_j) { + js++; + } + } + } else if (mult==LAST) { + while (is1) + xlen1 = false; + cnt += (uint64_t)xl1; +#endif + } else if (x_i < y_j) { + is++; + } else if (x_i > y_j) { + js++; + } } } } @@ -270,7 +358,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts, int *restrict starts, int *restrict lens, uint64_t *n_match, int *x_lens1, int *y_lens1, int *xy_lens1, - const int verbose) { + const enum emult mult, const int verbose) { double t = 0; if (verbose>0) @@ -281,12 +369,12 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // remove after #4395 grpLens(x_starts, nx_starts, nx, x_lens); } - if (!unq_y) { + if (!(unq_y || mult==FIRST)) { y_lens = (int *)R_alloc(ny_starts, sizeof(int)); grpLens(y_starts, ny_starts, ny, y_lens); } if (verbose>0) - Rprintf("smergeC: grpLens %s took %.3fs\n", verboseDone(!unq_x, !unq_y, "(already unique)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t); + Rprintf("smergeC: grpLens %s took %.3fs\n", verboseDone(!unq_x, !(unq_y || mult==FIRST), "(x already unq, y unq or mult='first')", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t); if (verbose>0) t = omp_get_wtime(); @@ -324,7 +412,8 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, x, x_starts, x_lens, unq_x, // common input x y, y_starts, y_lens, unq_y, // common input y starts, lens, // common output - &bnmatch, &bxlens1, &bylens1, &bxylens1 // reduction output + &bnmatch, &bxlens1, &bylens1, &bxylens1,// reduction output + mult ); nmatch += bnmatch; xlens1 = bxlens1 && xlens1; ylens1 = bylens1 && ylens1; xylens1 = bxylens1 && xylens1; th[b] = omp_get_thread_num(); // only to report thread utilization in verbose message @@ -332,6 +421,8 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, x_lens1[0] = (int)xlens1; y_lens1[0] = (int)ylens1; xy_lens1[0] = (int)xylens1; n_match[0] = nmatch; if (verbose>0) Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); // all threads may not always be used bc schedule(dynamic) + if (mult==ERR && !ylens1) + error("mult='error' and multiple matches during merge"); return; } @@ -388,8 +479,23 @@ SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict len return ans; } +const enum emult matchMultArg(SEXP multArg) { + enum emult mult; + if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) + mult = ALL; + else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first")) + mult = FIRST; + else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last")) + mult = LAST; + else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "error")) + mult = ERR; + else + error(_("Internal error: invalid value for 'mult'. please report to data.table issue tracker")); // # nocov + return mult; +} + // main interface from R -SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { +SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge) { const int verbose = GetVerbose()*3; // remove *3 after #4491 double t_total = 0, t = 0; @@ -397,6 +503,9 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { t_total = omp_get_wtime(); if (!isInteger(x) || !isInteger(y)) error("'x' and 'y' must be integer"); + if (!isString(multArg)) + error("'mult' must be a string"); + const enum emult mult = matchMultArg(multArg); if (!IS_TRUE_OR_FALSE(out_bmerge)) error("'out.bmerge' must be TRUE or FALSE"); const bool ans_bmerge = (bool)LOGICAL(out_bmerge)[0]; @@ -473,7 +582,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) { yp, ny, INTEGER(y_starts), LENGTH(y_starts), starts, lens, &n_match, INTEGER(x_lens1), INTEGER(y_lens1), INTEGER(xy_lens1), - verbose-1 + mult, verbose-1 ); if (n_match > (uint64_t)DBL_MAX) { // 1e9x1e9 cartesian join 1e18 still less than DBL_MAX, should we check against DBL_MAX or something lesser? we cast uinst64_t to double here REAL(n_matchr)[0] = NA_REAL; From 9ee0a7a30121192a25504986662743318475126f Mon Sep 17 00:00:00 2001 From: jangorecki Date: Fri, 12 Jun 2020 15:54:26 +0100 Subject: [PATCH 27/34] avoid lens allocation and unsort for mult=first|last --- R/data.table.R | 1 + src/smerge.c | 81 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 56 insertions(+), 26 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 75b6b290ed..27e27dec2d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -513,6 +513,7 @@ replace_dot_alias = function(e) { } # TODO: when nomatch=NA, len__ need not be allocated / set at all for mult="first"/"last"? # TODO: how about when nomatch=0L, can we avoid allocating then as well? + # if we take nomatch out from [b|s]merge then it should be easier to avoid allocation } if (length(xo) && length(irows)) { irows = xo[irows] # TO DO: fsort here? diff --git a/src/smerge.c b/src/smerge.c index 41de40d82e..7fa907416e 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -39,7 +39,7 @@ static void smerge(const int bx_off, const int bnx, const int x_i = x[i], y_j = y[j]; if (x_i == y_j) { starts[i] = j+1; - //lens[i] = 1; // already filled with 1's we will need this line if we change default alloc, same for unq_y branch + //lens[i] = 1; // already filled with 1's we will need this line if we change default alloc i++; #ifdef SMERGE_STATS cnt++; @@ -53,7 +53,7 @@ static void smerge(const int bx_off, const int bnx, } else if (unq_x) { int i = bx_off, js = by_off; const int ni = bx_off+bnx, njs = by_off+bny; - if (mult == ALL || mult == ERR) { // mult==err is raised based on ylen1 flag outside of parallel region + if (mult == ALL || mult == ERR) { // mult==err is raised based on ylens1 flag outside of parallel region while (i0) t = omp_get_wtime(); - SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, ans_bmerge); + SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, multLen1, ans_bmerge); if (verbose>0) Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(alloc and unsort)", omp_get_wtime() - t); if (verbose>0) From 58165497c456a77a6f87570f73f0851cc9bf6ae5 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Fri, 12 Jun 2020 17:24:33 +0100 Subject: [PATCH 28/34] move R allocs --- src/smerge.c | 61 ++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/src/smerge.c b/src/smerge.c index 7fa907416e..ef908b73f5 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -357,7 +357,7 @@ void grpLens(const int *restrict starts, const int n_starts, const int n, int *r void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts, const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts, int *restrict starts, int *restrict lens, - uint64_t *n_match, int *x_lens1, int *y_lens1, int *xy_lens1, + uint64_t *n_match, bool *x_lens1, bool *y_lens1, bool *xy_lens1, const enum emult mult, const int verbose) { double t = 0; @@ -400,8 +400,8 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, if (verbose>0) t = omp_get_wtime(); - bool xlens1 = true, ylens1 = true, xylens1 = true; uint64_t nmatch = 0; + bool xlens1 = true, ylens1 = true, xylens1 = true; #pragma omp parallel for schedule(dynamic) reduction(&&:xlens1,ylens1,xylens1) reduction(+:nmatch) num_threads(nth) for (int b=0; b0) Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); // all threads may not always be used bc schedule(dynamic) if (mult==ERR && !ylens1) @@ -438,39 +438,41 @@ void sortInt(const int *restrict x, const int nx, const int *restrict idx, int * SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict lens, const bool x_ord, SEXP out_starts, SEXP out_lens, // used only when x was sorted, saves one allocation SEXP x_idx, SEXP y_idx, - SEXP n_match, SEXP x_lens1, SEXP y_lens1, SEXP xy_lens1, + uint64_t n_match, bool x_lens1, bool y_lens1, bool xy_lens1, const bool multLen1, const bool bmerge) { - int out_len = bmerge ? 6 : 10; + const int out_len = bmerge ? 6 : 10; SEXP ans = PROTECT(allocVector(VECSXP, out_len)), ansnames; setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, out_len)); SET_STRING_ELT(ansnames, 0, char_starts); SET_STRING_ELT(ansnames, 1, char_lens); - if (bmerge) { + if (bmerge) { // for bmerge we need still allocate, but not unsort, for !bmerge no alloc and no unsort if (x_ord) { SET_VECTOR_ELT(ans, 0, out_starts); SET_VECTOR_ELT(ans, 1, out_lens); } else { SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, n)); SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, n)); - SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); // verbose=verbose-2L after #4533 + SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(false), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); // verbose=verbose-2L after #4533 sortInt(starts, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 0))); sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1))); UNPROTECT(1); } } else { - const bool skipLens = !multLen1 && LOGICAL(y_lens1)[0]; + const bool skipLens = !multLen1 && y_lens1; if (skipLens) { // compact lens if ylens1, mult=first|last already has compact lens out_lens = PROTECT(allocVector(INTSXP, 0)); } if (x_ord) { + if (y_lens1 && LENGTH(out_lens)) + error("internal error: lens should be already compact 0 length integer"); // # nocov SET_VECTOR_ELT(ans, 0, out_starts); SET_VECTOR_ELT(ans, 1, out_lens); } else { SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, n)); - SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, LOGICAL(y_lens1)[0] ? 0 : n)); - SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); // verbose=verbose-2L after #4533 + SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, y_lens1 ? 0 : n)); + SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(false), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); // verbose=verbose-2L after #4533 sortInt(starts, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 0))); - if (LENGTH(VECTOR_ELT(ans, 1))) // not need to unsort 1-only vector, it is now compact 0 length + if (!y_lens1) // not need to unsort 1-only vector, it is now compact 0 length sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1))); UNPROTECT(1); } @@ -478,7 +480,7 @@ SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict len UNPROTECT(1); } SET_STRING_ELT(ansnames, 2, char_indices); SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, 0)); // const for equi join - SET_STRING_ELT(ansnames, 3, char_allLen1); SET_VECTOR_ELT(ans, 3, y_lens1); + SET_STRING_ELT(ansnames, 3, char_allLen1); SET_VECTOR_ELT(ans, 3, ScalarLogical(y_lens1)); SET_STRING_ELT(ansnames, 4, char_allGrp1); SET_VECTOR_ELT(ans, 4, ScalarLogical(true)); // const for equi join if (bmerge) { y_idx = shallow_duplicate(y_idx); // possibly improve after #4467 @@ -492,9 +494,17 @@ SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict len SET_STRING_ELT(ansnames, 5, char_xo); SET_VECTOR_ELT(ans, 5, y_idx); if (!bmerge) { SET_STRING_ELT(ansnames, 6, char_io); SET_VECTOR_ELT(ans, 6, x_idx); - SET_STRING_ELT(ansnames, 7, char_lhsLen1); SET_VECTOR_ELT(ans, 7, x_lens1); - SET_STRING_ELT(ansnames, 8, char_xyLen1); SET_VECTOR_ELT(ans, 8, xy_lens1); - SET_STRING_ELT(ansnames, 9, char_nMatch); SET_VECTOR_ELT(ans, 9, n_match); + SET_STRING_ELT(ansnames, 7, char_lhsLen1); SET_VECTOR_ELT(ans, 7, ScalarLogical(x_lens1)); + SET_STRING_ELT(ansnames, 8, char_xyLen1); SET_VECTOR_ELT(ans, 8, ScalarLogical(xy_lens1)); + SEXP n_matchr = PROTECT(allocVector(REALSXP, 1)); + if (n_match > (uint64_t)DBL_MAX) { // 1e9 x 1e9 cartesian join results 1e18 still less than DBL_MAX, should we check against DBL_MAX or something lesser? we cast uinst64_t to double here + REAL(n_matchr)[0] = NA_REAL; + warning("count of matches exceeds DBL_MAX, returning NA in 'nMatch' field"); + } else { + REAL(n_matchr)[0] = (double)n_match; + } + SET_STRING_ELT(ansnames, 9, char_nMatch); SET_VECTOR_ELT(ans, 9, n_matchr); + UNPROTECT(1); } UNPROTECT(1); return ans; @@ -537,10 +547,10 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer t = omp_get_wtime(); const bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx); if (do_x_idx) { - x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533 + x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533 } if (do_y_idx) { - y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533 + y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533 } if (!isInteger(x_idx) || !isInteger(y_idx)) error("'x.idx' and 'y.idx' must be integer"); @@ -596,35 +606,26 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer lens[i] = 1; } } - uint64_t n_match = 0; - SEXP x_lens1 = PROTECT(allocVector(LGLSXP, 1)); protecti++; - SEXP y_lens1 = PROTECT(allocVector(LGLSXP, 1)); protecti++; - SEXP xy_lens1 = PROTECT(allocVector(LGLSXP, 1)); protecti++; - SEXP n_matchr = PROTECT(allocVector(REALSXP, 1)); protecti++; if (verbose>0) Rprintf("smergeR: alloc of size %d took %.3fs\n", nx, omp_get_wtime() - t); if (verbose>0) t = omp_get_wtime(); + uint64_t n_match = 0; + bool x_lens1 = true, y_lens1 = true, xy_lens1 = true; smergeC( xp, nx, INTEGER(x_starts), LENGTH(x_starts), yp, ny, INTEGER(y_starts), LENGTH(y_starts), starts, lens, - &n_match, INTEGER(x_lens1), INTEGER(y_lens1), INTEGER(xy_lens1), + &n_match, &x_lens1, &y_lens1, &xy_lens1, mult, verbose-1 ); - if (n_match > (uint64_t)DBL_MAX) { // 1e9x1e9 cartesian join 1e18 still less than DBL_MAX, should we check against DBL_MAX or something lesser? we cast uinst64_t to double here - REAL(n_matchr)[0] = NA_REAL; - warning("count of matches exceeds DBL_MAX, returning NA in 'nMatch' field"); - } else { - REAL(n_matchr)[0] = (double)n_match; - } if (verbose>0) Rprintf("smergeR: smergeC of %d x %d = %"PRIu64"; took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t); if (verbose>0) t = omp_get_wtime(); - SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, multLen1, ans_bmerge); + SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_match, x_lens1, y_lens1, xy_lens1, multLen1, ans_bmerge); if (verbose>0) Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(alloc and unsort)", omp_get_wtime() - t); if (verbose>0) From d7c2b4bad7f5f29daf3be2baf3ee7fe2d9384008 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Fri, 12 Jun 2020 20:21:57 +0100 Subject: [PATCH 29/34] avoid one more unsort --- src/smerge.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/smerge.c b/src/smerge.c index ef908b73f5..de6ce473e2 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -433,6 +433,12 @@ void sortInt(const int *restrict x, const int nx, const int *restrict idx, int * ans[i] = x[idx[i]-1]; return; } +void copyInt(const int *restrict x, const int nx, int *restrict ans) { + #pragma omp parallel for schedule(static) num_threads(getDTthreads()) + for (int i=0; i Date: Sat, 13 Jun 2020 11:40:55 +0100 Subject: [PATCH 30/34] mult already supported --- R/bmerge.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/bmerge.R b/R/bmerge.R index c385eab265..dd2b43fdff 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -4,7 +4,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer && isTRUE(getOption("datatable.smerge")) ## enable option && identical(nomatch, NA_integer_) ## for now only outer join - && identical(mult, "all") ## for now only mult='all' && identical(ops, 1L) ## equi join && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join ) { From 9f1ea21cf06c895b180e6ba3b67691759442768b Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 13 Jun 2020 11:41:54 +0100 Subject: [PATCH 31/34] mult already supported2 --- R/bmerge.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/bmerge.R b/R/bmerge.R index dd2b43fdff..69e9c89e02 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -14,7 +14,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (!is.null(attr(idx, "starts", exact=TRUE))) idx } if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()} - ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE) + ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE) if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()} return(ans) } From 103b9c63cdb10c7ab60dd2a3f185caf20b2eb70d Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 13 Jun 2020 11:49:13 +0100 Subject: [PATCH 32/34] better algo description --- src/smerge.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/smerge.c b/src/smerge.c index de6ce473e2..8918546e88 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -5,7 +5,8 @@ * * join on a single integer column * sort LHS and RHS - * split LHS into equal batches, RHS into corresponding batches by matching upper-lower bounds of LHS batches + * split LHS into equal batches based on its unique values + * split RHS into batches by matching corresponding upper-lower bounds of LHS batches using binary search * parallel sort-merge join * * for a maximum speed collecting of following statistics can be disabled by undefine SMERGE_STATS From 2acaf233c9636656e793125aa5a0052b95b979d5 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 16 Jun 2020 01:54:53 +0100 Subject: [PATCH 33/34] multiple columns support --- R/bmerge.R | 23 +- R/wrappers.R | 3 +- inst/tests/smerge.Rraw | 22 +- src/data.table.h | 2 +- src/smerge.c | 498 +++++++++++++++++++++++++---------------- 5 files changed, 335 insertions(+), 213 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index 69e9c89e02..a2c20f3576 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -1,10 +1,13 @@ +intCols = function(x, cols) all(vapply(cols, function(col, x) is.integer(x[[col]]), NA, x)) bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose) { - if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer - && isTRUE(getOption("datatable.smerge")) ## enable option - && identical(nomatch, NA_integer_) ## for now only outer join - && identical(ops, 1L) ## equi join + if (TRUE + && isTRUE(getOption("datatable.smerge")) ## switch + && length(icols)==length(xcols) ## avoid invalid input + && intCols(i, icols) && intCols(x, xcols) ## all columns integer + #&& identical(nomatch, NA_integer_) ## nomatch=0L is made as post-processing + && all(ops==1L) ## equi join && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join ) { getIdxGrp = function(x, cols) { ## get index only if retGrp=T @@ -14,8 +17,18 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (!is.null(attr(idx, "starts", exact=TRUE))) idx } if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()} - ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE) + ans = smerge(x=i, y=x, x.cols=icols, y.cols=xcols, x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE) + if (identical(nomatch, 0L)) { + nom = is.na(ans$starts) + ans$starts[nom] = 0L + ans$lens[nom] = 0L + } if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()} + if (verbose && !isTRUE(getOption("datatable.smerge"))) { ## just to satisfy existing bmerge unit tests, when switch above commented out + #cat("1\n", file="~/git/smergeOptCount.out", append=TRUE) ## be sure to have this path if you comment out switch of "datatable.smerge" option, 24 times used in main.Rraw + cat("existing index\n") + cat("ad hoc\n") + } return(ans) } callersi = i diff --git a/R/wrappers.R b/R/wrappers.R index 636e6e04ee..a8ee7b9989 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -13,4 +13,5 @@ coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) -smerge = function(x, y, x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.idx, y.idx, match.arg(mult), out.bmerge) +smerge = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.cols, y.cols, x.idx, y.idx, match.arg(mult), out.bmerge) + diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw index 3a3bf9812a..099628a4be 100644 --- a/inst/tests/smerge.Rraw +++ b/inst/tests/smerge.Rraw @@ -12,18 +12,22 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { vecseq = data.table:::vecseq } -bm = function(x, y, mult="all") { - stopifnot(is.integer(x), is.integer(y)) - ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=1L, verbose=FALSE) +bm = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), mult="all") { + if (is.integer(x)) {x = data.table(x=x); x.cols=1L} + if (is.integer(y)) {y = data.table(y=y); y.cols=1L} + stopifnot(is.data.table(x), is.data.table(y)) + ans = bmerge(x, y, x.cols, y.cols, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=rep.int(1L, length(x.cols)), verbose=FALSE) ## if undefining SMERGE_STATS then we have to ignore allLen1 as well ans$nMatch = as.numeric(sum(!is.na(vecseq(ans$starts, ans$lens, NULL)))) ans } -sm = function(x, y, mult="all") { - stopifnot(is.integer(x), is.integer(y)) - ans = smerge(x, y, mult=mult, out.bmerge=TRUE) +sm = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), mult="all") { + if (is.integer(x)) {x = data.table(x=x); x.cols=1L} + if (is.integer(y)) {y = data.table(y=y); y.cols=1L} + stopifnot(is.data.table(x), is.data.table(y)) + ans = smerge(x, y, x.cols, y.cols, mult=mult, out.bmerge=TRUE) ## if undefining SMERGE_STATS then we have to ignore allLen1 as well - ans$nMatch = smerge(x, y, mult=mult, out.bmerge=FALSE)$nMatch + ans$nMatch = smerge(x, y, x.cols, y.cols, mult=mult, out.bmerge=FALSE)$nMatch ans } @@ -216,8 +220,8 @@ y = sample.int(2e2L, 1e2L, TRUE) test(21.02, sm(x, y), bm(x, y)) # [.data.table join -d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_along(x)) -d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_along(y)) +d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_len(1e2L)) +d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_len(1e2L)) options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #4491 test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge") options(datatable.smerge=TRUE) diff --git a/src/data.table.h b/src/data.table.h index 231f8f28b4..afef343eb3 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -250,4 +250,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); // smjoin.c -SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge); +SEXP smergeR(SEXP x, SEXP y,SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge); diff --git a/src/smerge.c b/src/smerge.c index 8918546e88..8dd403d5d7 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -26,29 +26,33 @@ enum emult{ALL, FIRST, LAST, ERR}; // workhorse join that runs in parallel on batches static void smerge(const int bx_off, const int bnx, const int by_off, const int bny, - const int *restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x, - const int *restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y, + const int **restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x, + const int **restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y, int *restrict starts, int *restrict lens, uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1, - const enum emult mult) { + const enum emult mult, const int ncol) { uint64_t cnt = 0; bool xlen1 = true, ylen1 = true, xylen1 = true; if (unq_x && unq_y) { int i = bx_off, j = by_off; const int ni = bx_off+bnx, nj = by_off+bny; while (i y_j) { - j++; + for (int c=0; c y_j) { + j++; break; + } } } } else if (unq_x) { @@ -56,56 +60,68 @@ static void smerge(const int bx_off, const int bnx, const int ni = bx_off+bnx, njs = by_off+bny; if (mult == ALL || mult == ERR) { // mult==err is raised based on ylens1 flag outside of parallel region while (i1) - ylen1 = false; - cnt += (uint64_t)yl1; -#endif - } else if (x_i < y_j) { - i++; - } else if (x_i > y_j) { - js++; + for (int c=0; c1) + ylen1 = false; + cnt += (uint64_t)yl1; + #endif + } else if (x_i < y_j) { + i++; break; + } else if (x_i > y_j) { + js++; break; + } } } } else if (mult == FIRST) { while (i y_j) { - js++; + for (int c=0; c y_j) { + js++; break; + } } } } else if (mult == LAST) { while (i y_j) { - js++; + for (int c=0; c y_j) { + js++; break; + } } } } @@ -113,101 +129,117 @@ static void smerge(const int bx_off, const int bnx, int is = bx_off, j = by_off; const int nis = bx_off+bnx, nj = by_off+bny; while (is1) - xlen1 = false; - cnt += (uint64_t)xl1; -#endif - } else if (x_i < y_j) { - is++; - } else if (x_i > y_j) { - j++; - } - } - } else { - int is = bx_off, js = by_off; - const int nis = bx_off+bnx, njs = by_off+bny; - if (mult==ALL || mult==ERR) { - while (is1) xlen1 = false; - if (ylen1 && yl1>1) - ylen1 = false; - if (xylen1 && xl1>1 && yl1>1) - xylen1 = false; - cnt += (uint64_t)xl1 * (uint64_t)yl1; + cnt += (uint64_t)xl1; #endif } else if (x_i < y_j) { - is++; + is++; break; } else if (x_i > y_j) { - js++; + j++; break; + } + } + } + } else { + int is = bx_off, js = by_off; + const int nis = bx_off+bnx, njs = by_off+bny; + if (mult==ALL || mult==ERR) { + while (is1) + xlen1 = false; + if (ylen1 && yl1>1) + ylen1 = false; + if (xylen1 && xl1>1 && yl1>1) + xylen1 = false; + cnt += (uint64_t)xl1 * (uint64_t)yl1; + #endif + } else if (x_i < y_j) { + is++; break; + } else if (x_i > y_j) { + js++; break; + } } } } else if (mult==FIRST) { while (is1) + xlen1 = false; + cnt += (uint64_t)xl1; + #endif + } else if (x_i < y_j) { + is++; break; + } else if (x_i > y_j) { + js++; break; } - is++; -#ifdef SMERGE_STATS - if (xlen1 && xl1>1) - xlen1 = false; - cnt += (uint64_t)xl1; -#endif - } else if (x_i < y_j) { - is++; - } else if (x_i > y_j) { - js++; } } } else if (mult==LAST) { while (is1) + xlen1 = false; + cnt += (uint64_t)xl1; + #endif + } else if (x_i < y_j) { + is++; break; + } else if (x_i > y_j) { + js++; break; } - is++; -#ifdef SMERGE_STATS - if (xlen1 && xl1>1) - xlen1 = false; - cnt += (uint64_t)xl1; -#endif - } else if (x_i < y_j) { - is++; - } else if (x_i > y_j) { - js++; } } } @@ -216,62 +248,103 @@ static void smerge(const int bx_off, const int bnx, return; } +// helper when printing from batching +void printRow(const int **restrict x, const int row, const int ncol) { + for (int c=0; c thisy) { + return 1; + } else if (thisx < thisy) { + return -1; + } + } + error("rowCmp got ncol=0?"); + return 0; +} + /* * 'rolling nearest' binary search * used to find 'y' lower and upper, 0-based, bounds for each batch * side -1: lower bound; side 1: upper bound */ -static int rollbs(const int *restrict x, const int *restrict ix, const int nix, const int val, const int side) { - int verbose = 0; // devel debug only - if (verbose>0) - Rprintf("rollbs: side=%d=%s; val=%d\n", side, side<0?"min":"max", val); // # nocov - if (x[ix[0]-1] == val) { // common early stopping +static int rollbs(const int **restrict x, const int *restrict ix, const int nix, const int **restrict vals, const int vali, const int ncol, const int side) { + int verbose = 0; // manual devel debug only + if (nix<1) { if (verbose>0) - Rprintf("rollbs: min elements %d match: 0\n", val); // # nocov + Rprintf("rollbs: -1: empty input\n"); + return -1; + } + if (rowCmp(x, ix[0]-1, vals, vali, ncol)==0) { // common early stopping + if (verbose>0) { + Rprintf("rollbs: 0: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") match to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov + } return 0; } - if (x[ix[nix-1]-1] == val) { - if (verbose>0) - Rprintf("rollbs: max elements %d match: nix-1=%d\n", val, nix-1); // # nocov + if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)==0) { + if (verbose>0) { + Rprintf("rollbs: nix-1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") match to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov + } return nix-1; } if (side < 0) { // lower bound early stopping - if (x[ix[nix-1]-1] < val) { - if (verbose>0) - Rprintf("rollbs: max element %d is still smaller than %d: -1\n", x[ix[nix-1]-1], val); // # nocov + if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)<0) { + if (verbose>0) { + Rprintf("rollbs: -1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") is still smaller than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov + } return -1; } - if (x[ix[0]-1] > val) { - if (verbose>0) - Rprintf("rollbs: min element %d is bigger than %d: 0\n", x[ix[0]-1], val); // # nocov + if (rowCmp(x, ix[0]-1, vals, vali, ncol)>0) { + if (verbose>0) { + Rprintf("rollbs: 0: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") is bigger than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov + } return 0; } } else if (side > 0) { // upper bound early stopping - if (x[ix[0]-1] > val) { - if (verbose>0) - Rprintf("rollbs: min element %d is still bigger than %d: -1\n", x[ix[0]-1], val); // # nocov + if (rowCmp(x, ix[0]-1, vals, vali, ncol)>0) { + if (verbose>0) { + Rprintf("rollbs: -1: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") is still bigger than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov + } return -1; } - if (x[ix[nix-1]-1] < val) { - if (verbose>0) - Rprintf("rollbs: max element %d is smaller than %d: nix-1=%d\n", x[ix[nix-1]-1], val, nix-1); // # nocov + if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)<0) { + if (verbose>0) { + Rprintf("rollbs: nix-1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") is smaller than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov + } return nix-1; } } - int lower=0, upper=nix, i=0; - while (lower<=upper) { - i = lower + (upper-lower)/2; - int thisx = x[ix[i]-1]; - //Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val); // if (verbose) here would slow down while loop so need to be commented out - if (thisx==val) - return(i); - else if (thisx < val) - lower = i+1; - else if (thisx > val) - upper = i-1; + int lower=0, upper=nix, i=0; // upper=nix is correct, just in case of wanting to "fix" for nix-1 + for (int c=0; c val) + upper = i-1; + } + } + if (verbose>0) { + Rprintf("rollbs: %d: last iter %d, element (", side<0?lower:upper, i); printRow(x, ix[i]-1, ncol); Rprintf(") nomatch to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov } - if (verbose>0) - Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d: %d\n", i, x[ix[i]-1], lower, upper, side, side<0?lower:upper); // # nocov if (side < 0) // anyone to stress test this logic? return lower; else @@ -280,10 +353,10 @@ static int rollbs(const int *restrict x, const int *restrict ix, const int nix, // cuts x_starts into equal batches and binary search corresponding y_starts ranges static void batching(const int nBatch, - const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts, - const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts, + const int **restrict x, const int nx, const int *restrict x_starts, const int nx_starts, + const int **restrict y, const int ny, const int *restrict y_starts, const int ny_starts, int *restrict Bx_off, int *restrict Bnx, int *restrict By_off, int *restrict Bny, - const int verbose) { + const int ncol, const int verbose) { #ifdef SMERGE_BATCHING_BALANCED //if (nBatch > nx_starts || (nx_starts/nBatch==1 && nx_starts%nBatch>0)) error("internal error: batching %d input into %d batches, number of batches should have been reduced be now", nx_starts, nBatch); // # nocov size_t batchSize = (nx_starts-1)/nBatch + 1; // this is fragile for arbitrary nBatch @@ -296,14 +369,14 @@ static void batching(const int nBatch, if (verbose>0) Rprintf("batching: input %d into %s %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); if (lastBatchSize==0 || ((nBatch-1) * batchSize + lastBatchSize != nx_starts)) - error("internal error: batching %d input is attempting to use invalid batches: balanced=%d, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov + error("internal error: batching %d input is attempting to use invalid batches: balanced=%s, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov for (int b=0; b= 0 && y_i_max >= 0; By_off[b] = y_match ? y_i_min : 0; Bny[b] = y_match ? y_i_max - y_i_min + 1 : 0; @@ -314,8 +387,8 @@ static void batching(const int nBatch, if (Bny[b] > 0) { int x_i_min = (x_starts + Bx_off[b])[0], x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1]; int y_i_min = (y_starts + By_off[b])[0], y_i_max = (y_starts + By_off[b])[Bny[b]-1]; - Rprintf("## lower: x[%d]: %d <= %d :y[%d]\n", x_i_min, x[x_i_min-1], y[y_i_min-1], y_i_min); - Rprintf("## upper: x[%d]: %d >= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max); + Rprintf("## lower: x[%d]: (", x_i_min); printRow(x, x_i_min-1, ncol); Rprintf(") <= ("); printRow(y, y_i_min-1, ncol); Rprintf(") :y[%d]\n", y_i_min); + Rprintf("## upper: x[%d]: (", x_i_max); printRow(x, x_i_max-1, ncol); Rprintf(") >= ("); printRow(y, y_i_max-1, ncol); Rprintf(") :y[%d]\n", y_i_max); } } } @@ -355,12 +428,14 @@ void grpLens(const int *restrict starts, const int n_starts, const int n, int *r } // pure C smerge: takes already sorted input, compute grpLens, prepare batches, smerge in parallel -void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts, - const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts, +void smergeC(const int **restrict x, const int nx, const int *restrict x_starts, const int nx_starts, + const int **restrict y, const int ny, const int *restrict y_starts, const int ny_starts, int *restrict starts, int *restrict lens, uint64_t *n_match, bool *x_lens1, bool *y_lens1, bool *xy_lens1, - const enum emult mult, const int verbose) { + const int ncol, const enum emult mult, const int verbose) { + if (!nx) + return; double t = 0; if (verbose>0) t = omp_get_wtime(); @@ -394,7 +469,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, } int *restrict Bx_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bnx = (int *)R_alloc(nBatch, sizeof(int)); int *restrict By_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bny = (int *)R_alloc(nBatch, sizeof(int)); - batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, verbose-1); + batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, ncol, verbose-1); int *restrict th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used if (verbose>0) Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t); @@ -414,7 +489,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, y, y_starts, y_lens, unq_y, // common input y starts, lens, // common output &bnmatch, &bxlens1, &bylens1, &bxylens1,// reduction output - mult + mult, ncol ); nmatch += bnmatch; xlens1 = bxlens1 && xlens1; ylens1 = bylens1 && ylens1; xylens1 = bxylens1 && xylens1; th[b] = omp_get_thread_num(); // only to report thread utilization in verbose message @@ -535,15 +610,38 @@ const enum emult matchMultArg(SEXP multArg) { return mult; } +SEXP ord1(int n) { + SEXP ans = PROTECT(allocVector(INTSXP, n)); + int *ansp = INTEGER(ans); + for (int i=0; i0) t_total = omp_get_wtime(); - if (!isInteger(x) || !isInteger(y)) - error("'x' and 'y' must be integer"); + if (!isNewList(x) || !isNewList(y)) + error("'x' and 'y' must be data.table"); + if (!isInteger(x_cols) || !isInteger(y_cols)) + error("'x.cols' and 'y.cols' must be integer"); + if (LENGTH(x_cols) != LENGTH(y_cols)) + error("'x.cols' and 'y.cols' must be equal length"); + const int ncol = LENGTH(x_cols), nxcol = length(x), nycol = length(y); + if (!nxcol || !nycol) + error("'x' and 'y' must be non 0 length"); + for (int i=0; i 0 && x_col <= nxcol) || !(y_col > 0 && y_col <= nycol)) + error("'x.cols' and 'y.cols' must specify existing columns"); + if (!isInteger(VECTOR_ELT(x, x_col-1)) || !isInteger(VECTOR_ELT(y, y_col-1))) + error("columns of 'x' and 'y' to join must be integer"); + } if (!isString(multArg)) error("'mult' must be a string"); const enum emult mult = matchMultArg(multArg); @@ -551,16 +649,16 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer if (!IS_TRUE_OR_FALSE(out_bmerge)) error("'out.bmerge' must be TRUE or FALSE"); const bool ans_bmerge = (bool)LOGICAL(out_bmerge)[0]; - int protecti = 0, nx = LENGTH(x), ny = LENGTH(y); + int protecti = 0, nx = LENGTH(VECTOR_ELT(x, 0)), ny = LENGTH(VECTOR_ELT(y, 0)); if (verbose>0) t = omp_get_wtime(); const bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx); if (do_x_idx) { - x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533 + x_idx = PROTECT(forder(x, x_cols, ScalarLogical(true), ScalarLogical(true), ord1(ncol), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533 } if (do_y_idx) { - y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533 + y_idx = PROTECT(forder(y, y_cols, ScalarLogical(true), ScalarLogical(true), ord1(ncol), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533 } if (!isInteger(x_idx) || !isInteger(y_idx)) error("'x.idx' and 'y.idx' must be integer"); @@ -573,18 +671,24 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer if (verbose>0) t = omp_get_wtime(); const bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx); - int *xp, *yp; - if (!x_ord) { - xp = (int *)R_alloc(nx, sizeof(int)); - sortInt(INTEGER(x), nx, INTEGER(x_idx), xp); - } else { - xp = INTEGER(x); - } - if (!y_ord) { - yp = (int *)R_alloc(ny, sizeof(int)); - sortInt(INTEGER(y), ny, INTEGER(y_idx), yp); - } else { - yp = INTEGER(y); + const int **restrict xp=(const int **)R_alloc(ncol, sizeof(int*)), **restrict yp=(const int **)R_alloc(ncol, sizeof(int*)); + for (int i=0; i0) Rprintf("smergeR: sort %s took %.3fs\n", verboseDone(!x_ord, !y_ord, "(already sorted)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t); @@ -628,10 +732,10 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer yp, ny, INTEGER(y_starts), LENGTH(y_starts), starts, lens, &n_match, &x_lens1, &y_lens1, &xy_lens1, - mult, verbose-1 + ncol, mult, verbose-1 ); if (verbose>0) - Rprintf("smergeR: smergeC of %d x %d = %"PRIu64"; took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t); + Rprintf("smergeR: smergeC %d cols of %d x %d = %"PRIu64"; took %.3fs\n", ncol, nx, ny, n_match, omp_get_wtime() - t); if (verbose>0) t = omp_get_wtime(); From e1643c2ce72e8bfe2acc00ead23ac7f7282d20da Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 22 Jun 2020 18:21:07 +0100 Subject: [PATCH 34/34] Revert "multiple columns support" This reverts commit 2acaf233c9636656e793125aa5a0052b95b979d5. --- R/bmerge.R | 23 +- R/wrappers.R | 3 +- inst/tests/smerge.Rraw | 22 +- src/data.table.h | 2 +- src/smerge.c | 498 ++++++++++++++++------------------------- 5 files changed, 213 insertions(+), 335 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index a2c20f3576..69e9c89e02 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -1,13 +1,10 @@ -intCols = function(x, cols) all(vapply(cols, function(col, x) is.integer(x[[col]]), NA, x)) bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose) { - if (TRUE - && isTRUE(getOption("datatable.smerge")) ## switch - && length(icols)==length(xcols) ## avoid invalid input - && intCols(i, icols) && intCols(x, xcols) ## all columns integer - #&& identical(nomatch, NA_integer_) ## nomatch=0L is made as post-processing - && all(ops==1L) ## equi join + if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer + && isTRUE(getOption("datatable.smerge")) ## enable option + && identical(nomatch, NA_integer_) ## for now only outer join + && identical(ops, 1L) ## equi join && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join ) { getIdxGrp = function(x, cols) { ## get index only if retGrp=T @@ -17,18 +14,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (!is.null(attr(idx, "starts", exact=TRUE))) idx } if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()} - ans = smerge(x=i, y=x, x.cols=icols, y.cols=xcols, x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE) - if (identical(nomatch, 0L)) { - nom = is.na(ans$starts) - ans$starts[nom] = 0L - ans$lens[nom] = 0L - } + ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE) if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()} - if (verbose && !isTRUE(getOption("datatable.smerge"))) { ## just to satisfy existing bmerge unit tests, when switch above commented out - #cat("1\n", file="~/git/smergeOptCount.out", append=TRUE) ## be sure to have this path if you comment out switch of "datatable.smerge" option, 24 times used in main.Rraw - cat("existing index\n") - cat("ad hoc\n") - } return(ans) } callersi = i diff --git a/R/wrappers.R b/R/wrappers.R index a8ee7b9989..636e6e04ee 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -13,5 +13,4 @@ coerceFill = function(x) .Call(CcoerceFillR, x) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) -smerge = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.cols, y.cols, x.idx, y.idx, match.arg(mult), out.bmerge) - +smerge = function(x, y, x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.idx, y.idx, match.arg(mult), out.bmerge) diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw index 099628a4be..3a3bf9812a 100644 --- a/inst/tests/smerge.Rraw +++ b/inst/tests/smerge.Rraw @@ -12,22 +12,18 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { vecseq = data.table:::vecseq } -bm = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), mult="all") { - if (is.integer(x)) {x = data.table(x=x); x.cols=1L} - if (is.integer(y)) {y = data.table(y=y); y.cols=1L} - stopifnot(is.data.table(x), is.data.table(y)) - ans = bmerge(x, y, x.cols, y.cols, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=rep.int(1L, length(x.cols)), verbose=FALSE) +bm = function(x, y, mult="all") { + stopifnot(is.integer(x), is.integer(y)) + ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=1L, verbose=FALSE) ## if undefining SMERGE_STATS then we have to ignore allLen1 as well ans$nMatch = as.numeric(sum(!is.na(vecseq(ans$starts, ans$lens, NULL)))) ans } -sm = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), mult="all") { - if (is.integer(x)) {x = data.table(x=x); x.cols=1L} - if (is.integer(y)) {y = data.table(y=y); y.cols=1L} - stopifnot(is.data.table(x), is.data.table(y)) - ans = smerge(x, y, x.cols, y.cols, mult=mult, out.bmerge=TRUE) +sm = function(x, y, mult="all") { + stopifnot(is.integer(x), is.integer(y)) + ans = smerge(x, y, mult=mult, out.bmerge=TRUE) ## if undefining SMERGE_STATS then we have to ignore allLen1 as well - ans$nMatch = smerge(x, y, x.cols, y.cols, mult=mult, out.bmerge=FALSE)$nMatch + ans$nMatch = smerge(x, y, mult=mult, out.bmerge=FALSE)$nMatch ans } @@ -220,8 +216,8 @@ y = sample.int(2e2L, 1e2L, TRUE) test(21.02, sm(x, y), bm(x, y)) # [.data.table join -d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_len(1e2L)) -d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_len(1e2L)) +d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_along(x)) +d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_along(y)) options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #4491 test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge") options(datatable.smerge=TRUE) diff --git a/src/data.table.h b/src/data.table.h index afef343eb3..231f8f28b4 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -250,4 +250,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); SEXP fcaseR(SEXP na, SEXP rho, SEXP args); // smjoin.c -SEXP smergeR(SEXP x, SEXP y,SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge); +SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge); diff --git a/src/smerge.c b/src/smerge.c index 8dd403d5d7..8918546e88 100644 --- a/src/smerge.c +++ b/src/smerge.c @@ -26,33 +26,29 @@ enum emult{ALL, FIRST, LAST, ERR}; // workhorse join that runs in parallel on batches static void smerge(const int bx_off, const int bnx, const int by_off, const int bny, - const int **restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x, - const int **restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y, + const int *restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x, + const int *restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y, int *restrict starts, int *restrict lens, uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1, - const enum emult mult, const int ncol) { + const enum emult mult) { uint64_t cnt = 0; bool xlen1 = true, ylen1 = true, xylen1 = true; if (unq_x && unq_y) { int i = bx_off, j = by_off; const int ni = bx_off+bnx, nj = by_off+bny; while (i y_j) { - j++; break; - } + const int x_i = x[i], y_j = y[j]; + if (x_i == y_j) { + starts[i] = j+1; + //lens[i] = 1; // already filled with 1's we will need this line if we change default alloc + i++; +#ifdef SMERGE_STATS + cnt++; +#endif + } else if (x_i < y_j) { + i++; + } else if (x_i > y_j) { + j++; } } } else if (unq_x) { @@ -60,68 +56,56 @@ static void smerge(const int bx_off, const int bnx, const int ni = bx_off+bnx, njs = by_off+bny; if (mult == ALL || mult == ERR) { // mult==err is raised based on ylens1 flag outside of parallel region while (i1) - ylen1 = false; - cnt += (uint64_t)yl1; - #endif - } else if (x_i < y_j) { - i++; break; - } else if (x_i > y_j) { - js++; break; - } + const int j = y_starts[js]-1; + const int x_i = x[i], y_j = y[j]; + if (x_i == y_j) { + starts[i] = j+1; + const int yl1 = y_lens[js]; + lens[i] = yl1; + i++; +#ifdef SMERGE_STATS + if (ylen1 && yl1>1) + ylen1 = false; + cnt += (uint64_t)yl1; +#endif + } else if (x_i < y_j) { + i++; + } else if (x_i > y_j) { + js++; } } } else if (mult == FIRST) { while (i y_j) { - js++; break; - } + const int j = y_starts[js]-1; + const int x_i = x[i], y_j = y[j]; + if (x_i == y_j) { + starts[i] = j+1; + //lens[i] = 1; + i++; +#ifdef SMERGE_STATS + cnt++; +#endif + } else if (x_i < y_j) { + i++; + } else if (x_i > y_j) { + js++; } } } else if (mult == LAST) { while (i y_j) { - js++; break; - } + const int j = y_starts[js]-1; + const int x_i = x[i], y_j = y[j]; + if (x_i == y_j) { + starts[i] = j+y_lens[js]; + //lens[i] = 1; + i++; +#ifdef SMERGE_STATS + cnt++; +#endif + } else if (x_i < y_j) { + i++; + } else if (x_i > y_j) { + js++; } } } @@ -129,117 +113,101 @@ static void smerge(const int bx_off, const int bnx, int is = bx_off, j = by_off; const int nis = bx_off+bnx, nj = by_off+bny; while (is1) + xlen1 = false; + cnt += (uint64_t)xl1; +#endif + } else if (x_i < y_j) { + is++; + } else if (x_i > y_j) { + j++; + } + } + } else { + int is = bx_off, js = by_off; + const int nis = bx_off+bnx, njs = by_off+bny; + if (mult==ALL || mult==ERR) { + while (is1) xlen1 = false; - cnt += (uint64_t)xl1; + if (ylen1 && yl1>1) + ylen1 = false; + if (xylen1 && xl1>1 && yl1>1) + xylen1 = false; + cnt += (uint64_t)xl1 * (uint64_t)yl1; #endif } else if (x_i < y_j) { - is++; break; + is++; } else if (x_i > y_j) { - j++; break; - } - } - } - } else { - int is = bx_off, js = by_off; - const int nis = bx_off+bnx, njs = by_off+bny; - if (mult==ALL || mult==ERR) { - while (is1) - xlen1 = false; - if (ylen1 && yl1>1) - ylen1 = false; - if (xylen1 && xl1>1 && yl1>1) - xylen1 = false; - cnt += (uint64_t)xl1 * (uint64_t)yl1; - #endif - } else if (x_i < y_j) { - is++; break; - } else if (x_i > y_j) { - js++; break; - } + js++; } } } else if (mult==FIRST) { while (is1) - xlen1 = false; - cnt += (uint64_t)xl1; - #endif - } else if (x_i < y_j) { - is++; break; - } else if (x_i > y_j) { - js++; break; + const int i = x_starts[is]-1, j = y_starts[js]-1; + const int x_i = x[i], y_j = y[j]; + if (x_i == y_j) { + const int j1 = j+1; + const int xl1 = x_lens[is]; + for (int ii=0; ii1) + xlen1 = false; + cnt += (uint64_t)xl1; +#endif + } else if (x_i < y_j) { + is++; + } else if (x_i > y_j) { + js++; } } } else if (mult==LAST) { while (is1) - xlen1 = false; - cnt += (uint64_t)xl1; - #endif - } else if (x_i < y_j) { - is++; break; - } else if (x_i > y_j) { - js++; break; + const int i = x_starts[is]-1, j = y_starts[js]-1; + const int x_i = x[i], y_j = y[j]; + if (x_i == y_j) { + const int j1 = j+y_lens[js]; + const int xl1 = x_lens[is]; + for (int ii=0; ii1) + xlen1 = false; + cnt += (uint64_t)xl1; +#endif + } else if (x_i < y_j) { + is++; + } else if (x_i > y_j) { + js++; } } } @@ -248,103 +216,62 @@ static void smerge(const int bx_off, const int bnx, return; } -// helper when printing from batching -void printRow(const int **restrict x, const int row, const int ncol) { - for (int c=0; c thisy) { - return 1; - } else if (thisx < thisy) { - return -1; - } - } - error("rowCmp got ncol=0?"); - return 0; -} - /* * 'rolling nearest' binary search * used to find 'y' lower and upper, 0-based, bounds for each batch * side -1: lower bound; side 1: upper bound */ -static int rollbs(const int **restrict x, const int *restrict ix, const int nix, const int **restrict vals, const int vali, const int ncol, const int side) { - int verbose = 0; // manual devel debug only - if (nix<1) { +static int rollbs(const int *restrict x, const int *restrict ix, const int nix, const int val, const int side) { + int verbose = 0; // devel debug only + if (verbose>0) + Rprintf("rollbs: side=%d=%s; val=%d\n", side, side<0?"min":"max", val); // # nocov + if (x[ix[0]-1] == val) { // common early stopping if (verbose>0) - Rprintf("rollbs: -1: empty input\n"); - return -1; - } - if (rowCmp(x, ix[0]-1, vals, vali, ncol)==0) { // common early stopping - if (verbose>0) { - Rprintf("rollbs: 0: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") match to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov - } + Rprintf("rollbs: min elements %d match: 0\n", val); // # nocov return 0; } - if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)==0) { - if (verbose>0) { - Rprintf("rollbs: nix-1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") match to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov - } + if (x[ix[nix-1]-1] == val) { + if (verbose>0) + Rprintf("rollbs: max elements %d match: nix-1=%d\n", val, nix-1); // # nocov return nix-1; } if (side < 0) { // lower bound early stopping - if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)<0) { - if (verbose>0) { - Rprintf("rollbs: -1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") is still smaller than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov - } + if (x[ix[nix-1]-1] < val) { + if (verbose>0) + Rprintf("rollbs: max element %d is still smaller than %d: -1\n", x[ix[nix-1]-1], val); // # nocov return -1; } - if (rowCmp(x, ix[0]-1, vals, vali, ncol)>0) { - if (verbose>0) { - Rprintf("rollbs: 0: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") is bigger than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov - } + if (x[ix[0]-1] > val) { + if (verbose>0) + Rprintf("rollbs: min element %d is bigger than %d: 0\n", x[ix[0]-1], val); // # nocov return 0; } } else if (side > 0) { // upper bound early stopping - if (rowCmp(x, ix[0]-1, vals, vali, ncol)>0) { - if (verbose>0) { - Rprintf("rollbs: -1: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") is still bigger than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov - } + if (x[ix[0]-1] > val) { + if (verbose>0) + Rprintf("rollbs: min element %d is still bigger than %d: -1\n", x[ix[0]-1], val); // # nocov return -1; } - if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)<0) { - if (verbose>0) { - Rprintf("rollbs: nix-1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") is smaller than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov - } + if (x[ix[nix-1]-1] < val) { + if (verbose>0) + Rprintf("rollbs: max element %d is smaller than %d: nix-1=%d\n", x[ix[nix-1]-1], val, nix-1); // # nocov return nix-1; } } - int lower=0, upper=nix, i=0; // upper=nix is correct, just in case of wanting to "fix" for nix-1 - for (int c=0; c val) - upper = i-1; - } - } - if (verbose>0) { - Rprintf("rollbs: %d: last iter %d, element (", side<0?lower:upper, i); printRow(x, ix[i]-1, ncol); Rprintf(") nomatch to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov + int lower=0, upper=nix, i=0; + while (lower<=upper) { + i = lower + (upper-lower)/2; + int thisx = x[ix[i]-1]; + //Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val); // if (verbose) here would slow down while loop so need to be commented out + if (thisx==val) + return(i); + else if (thisx < val) + lower = i+1; + else if (thisx > val) + upper = i-1; } + if (verbose>0) + Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d: %d\n", i, x[ix[i]-1], lower, upper, side, side<0?lower:upper); // # nocov if (side < 0) // anyone to stress test this logic? return lower; else @@ -353,10 +280,10 @@ static int rollbs(const int **restrict x, const int *restrict ix, const int nix, // cuts x_starts into equal batches and binary search corresponding y_starts ranges static void batching(const int nBatch, - const int **restrict x, const int nx, const int *restrict x_starts, const int nx_starts, - const int **restrict y, const int ny, const int *restrict y_starts, const int ny_starts, + const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts, + const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts, int *restrict Bx_off, int *restrict Bnx, int *restrict By_off, int *restrict Bny, - const int ncol, const int verbose) { + const int verbose) { #ifdef SMERGE_BATCHING_BALANCED //if (nBatch > nx_starts || (nx_starts/nBatch==1 && nx_starts%nBatch>0)) error("internal error: batching %d input into %d batches, number of batches should have been reduced be now", nx_starts, nBatch); // # nocov size_t batchSize = (nx_starts-1)/nBatch + 1; // this is fragile for arbitrary nBatch @@ -369,14 +296,14 @@ static void batching(const int nBatch, if (verbose>0) Rprintf("batching: input %d into %s %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); if (lastBatchSize==0 || ((nBatch-1) * batchSize + lastBatchSize != nx_starts)) - error("internal error: batching %d input is attempting to use invalid batches: balanced=%s, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov + error("internal error: batching %d input is attempting to use invalid batches: balanced=%d, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov for (int b=0; b= 0 && y_i_max >= 0; By_off[b] = y_match ? y_i_min : 0; Bny[b] = y_match ? y_i_max - y_i_min + 1 : 0; @@ -387,8 +314,8 @@ static void batching(const int nBatch, if (Bny[b] > 0) { int x_i_min = (x_starts + Bx_off[b])[0], x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1]; int y_i_min = (y_starts + By_off[b])[0], y_i_max = (y_starts + By_off[b])[Bny[b]-1]; - Rprintf("## lower: x[%d]: (", x_i_min); printRow(x, x_i_min-1, ncol); Rprintf(") <= ("); printRow(y, y_i_min-1, ncol); Rprintf(") :y[%d]\n", y_i_min); - Rprintf("## upper: x[%d]: (", x_i_max); printRow(x, x_i_max-1, ncol); Rprintf(") >= ("); printRow(y, y_i_max-1, ncol); Rprintf(") :y[%d]\n", y_i_max); + Rprintf("## lower: x[%d]: %d <= %d :y[%d]\n", x_i_min, x[x_i_min-1], y[y_i_min-1], y_i_min); + Rprintf("## upper: x[%d]: %d >= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max); } } } @@ -428,14 +355,12 @@ void grpLens(const int *restrict starts, const int n_starts, const int n, int *r } // pure C smerge: takes already sorted input, compute grpLens, prepare batches, smerge in parallel -void smergeC(const int **restrict x, const int nx, const int *restrict x_starts, const int nx_starts, - const int **restrict y, const int ny, const int *restrict y_starts, const int ny_starts, +void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts, + const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts, int *restrict starts, int *restrict lens, uint64_t *n_match, bool *x_lens1, bool *y_lens1, bool *xy_lens1, - const int ncol, const enum emult mult, const int verbose) { + const enum emult mult, const int verbose) { - if (!nx) - return; double t = 0; if (verbose>0) t = omp_get_wtime(); @@ -469,7 +394,7 @@ void smergeC(const int **restrict x, const int nx, const int *restrict x_starts, } int *restrict Bx_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bnx = (int *)R_alloc(nBatch, sizeof(int)); int *restrict By_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bny = (int *)R_alloc(nBatch, sizeof(int)); - batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, ncol, verbose-1); + batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, verbose-1); int *restrict th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used if (verbose>0) Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t); @@ -489,7 +414,7 @@ void smergeC(const int **restrict x, const int nx, const int *restrict x_starts, y, y_starts, y_lens, unq_y, // common input y starts, lens, // common output &bnmatch, &bxlens1, &bylens1, &bxylens1,// reduction output - mult, ncol + mult ); nmatch += bnmatch; xlens1 = bxlens1 && xlens1; ylens1 = bylens1 && ylens1; xylens1 = bxylens1 && xylens1; th[b] = omp_get_thread_num(); // only to report thread utilization in verbose message @@ -610,38 +535,15 @@ const enum emult matchMultArg(SEXP multArg) { return mult; } -SEXP ord1(int n) { - SEXP ans = PROTECT(allocVector(INTSXP, n)); - int *ansp = INTEGER(ans); - for (int i=0; i0) t_total = omp_get_wtime(); - if (!isNewList(x) || !isNewList(y)) - error("'x' and 'y' must be data.table"); - if (!isInteger(x_cols) || !isInteger(y_cols)) - error("'x.cols' and 'y.cols' must be integer"); - if (LENGTH(x_cols) != LENGTH(y_cols)) - error("'x.cols' and 'y.cols' must be equal length"); - const int ncol = LENGTH(x_cols), nxcol = length(x), nycol = length(y); - if (!nxcol || !nycol) - error("'x' and 'y' must be non 0 length"); - for (int i=0; i 0 && x_col <= nxcol) || !(y_col > 0 && y_col <= nycol)) - error("'x.cols' and 'y.cols' must specify existing columns"); - if (!isInteger(VECTOR_ELT(x, x_col-1)) || !isInteger(VECTOR_ELT(y, y_col-1))) - error("columns of 'x' and 'y' to join must be integer"); - } + if (!isInteger(x) || !isInteger(y)) + error("'x' and 'y' must be integer"); if (!isString(multArg)) error("'mult' must be a string"); const enum emult mult = matchMultArg(multArg); @@ -649,16 +551,16 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, S if (!IS_TRUE_OR_FALSE(out_bmerge)) error("'out.bmerge' must be TRUE or FALSE"); const bool ans_bmerge = (bool)LOGICAL(out_bmerge)[0]; - int protecti = 0, nx = LENGTH(VECTOR_ELT(x, 0)), ny = LENGTH(VECTOR_ELT(y, 0)); + int protecti = 0, nx = LENGTH(x), ny = LENGTH(y); if (verbose>0) t = omp_get_wtime(); const bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx); if (do_x_idx) { - x_idx = PROTECT(forder(x, x_cols, ScalarLogical(true), ScalarLogical(true), ord1(ncol), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533 + x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533 } if (do_y_idx) { - y_idx = PROTECT(forder(y, y_cols, ScalarLogical(true), ScalarLogical(true), ord1(ncol), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533 + y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533 } if (!isInteger(x_idx) || !isInteger(y_idx)) error("'x.idx' and 'y.idx' must be integer"); @@ -671,24 +573,18 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, S if (verbose>0) t = omp_get_wtime(); const bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx); - const int **restrict xp=(const int **)R_alloc(ncol, sizeof(int*)), **restrict yp=(const int **)R_alloc(ncol, sizeof(int*)); - for (int i=0; i0) Rprintf("smergeR: sort %s took %.3fs\n", verboseDone(!x_ord, !y_ord, "(already sorted)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t); @@ -732,10 +628,10 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, S yp, ny, INTEGER(y_starts), LENGTH(y_starts), starts, lens, &n_match, &x_lens1, &y_lens1, &xy_lens1, - ncol, mult, verbose-1 + mult, verbose-1 ); if (verbose>0) - Rprintf("smergeR: smergeC %d cols of %d x %d = %"PRIu64"; took %.3fs\n", ncol, nx, ny, n_match, omp_get_wtime() - t); + Rprintf("smergeR: smergeC of %d x %d = %"PRIu64"; took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t); if (verbose>0) t = omp_get_wtime();