From 38f7cee9b0cd5c58b221c511cbe2cd294a2673a5 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Thu, 4 Jun 2020 22:12:07 +0100
Subject: [PATCH 01/34] dev

---
 R/wrappers.R     |   6 ++
 src/data.table.h |   3 +
 src/fjoin.c      | 219 +++++++++++++++++++++++++++++++++++++++++++++++
 src/init.c       |   3 +
 4 files changed, 231 insertions(+)
 create mode 100644 src/fjoin.c

diff --git a/R/wrappers.R b/R/wrappers.R
index 5fec33a92f..c13feb78a7 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -12,3 +12,9 @@ colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, c
 coerceFill = function(x) .Call(CcoerceFillR, x)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
+
+#fsjoin = function(x, y) .Call(CsjoinR, x, y)
+fijoin = function(x, y) {
+  ans = .Call(CijoinR, x, y)
+  ans[3:4]
+}
diff --git a/src/data.table.h b/src/data.table.h
index 1cf975e68b..0109e1da75 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -243,3 +243,6 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k);
 //fifelse.c
 SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
+
+// fjoin.c
+SEXP ijoinR(SEXP x, SEXP y);
diff --git a/src/fjoin.c b/src/fjoin.c
new file mode 100644
index 0000000000..3695641643
--- /dev/null
+++ b/src/fjoin.c
@@ -0,0 +1,219 @@
+#include "data.table.h"
+
+/*
+ * as of now these are optimized routines for joining
+ * - single integer columns
+ * - sorted (experimental) or indexed
+ * 
+ * TODO:
+ * type of join is pushed down here so we compute only what is necessary
+ * mult!="all" and "unique-key" branches escape extra computation
+ */
+
+typedef enum {
+  inner, left, right, full, semi, anti, cross
+} joinhow;
+
+SEXP joinOut(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y) {
+  SEXP ans = PROTECT(allocVector(VECSXP, 4)), ansnames;
+  setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 4));
+  SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, 0));
+  SET_STRING_ELT(ansnames, 0, mkChar("starts_x"));
+  //int *starts_x_p = INTEGER(VECTOR_ELT(ans, 0));
+  SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, 0));
+  SET_STRING_ELT(ansnames, 1, mkChar("lens_x"));
+  //int *lens_x_p = INTEGER(VECTOR_ELT(ans, 1));
+  SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, matchn));
+  SET_STRING_ELT(ansnames, 2, mkChar("starts_y"));
+  int *starts_y_p = INTEGER(VECTOR_ELT(ans, 2));
+  SET_VECTOR_ELT(ans, 3, allocVector(INTSXP, matchn));
+  SET_STRING_ELT(ansnames, 3, mkChar("lens_y"));
+  int *lens_y_p = INTEGER(VECTOR_ELT(ans, 3));
+  for (int i=0; i<matchn; ++i) {
+    //starts_x_p[i] = starts_x[i];
+    //lens_x_p[i] = lens_x[i];
+    starts_y_p[i] = starts_y[i];
+    lens_y_p[i] = lens_y[i];
+  }
+  UNPROTECT(1);
+  return ans;
+}
+
+/*
+ * fast sorted join
+ */
+/*
+void sjoin(int *x, int nx, int *y, int ny, joinhow how, bool *match, int *matchi, int *matchstarts, int *matchlens, int *matchn) {
+  int imatch = 0;
+  if (how==left) {
+    for (int i=0, j=0; i<nx;) {
+      if (x[i]==y[j]) {
+        Rprintf("x[%d]==y[%d]\n", i, j);
+        match[i] = true;
+        matchi[imatch++] = i;
+        matchstarts[i] = j;
+        matchlens[i] = 1;
+        int jj = j+1; // rather than j++, so duplicates on x side works
+        while(x[i]==y[jj]) { // multiple matches
+          matchlens[i]++; jj++;
+        }
+        i++; // we could have an outer while instead, then we dont have to recalculate matchlens when dups to dups match 2x2
+        continue;
+      } else if (x[i]<y[j]) {
+        Rprintf("x[%d]<y[%d]\n", i, j);
+        match[i] = false;
+        i++;
+        continue;
+      } else if (x[i]>y[j]) {
+        Rprintf("x[%d]>y[%d]\n", i, j);
+        j++;
+        continue;
+      }
+    }
+  } else {
+    error("not yet implemented");
+  }
+  matchn[0] = imatch;
+}
+SEXP sjoinR(SEXP x, SEXP y) {
+  if (!isInteger(x) || !isInteger(y))
+    error("must be integer");
+  int nx = LENGTH(x), ny = LENGTH(y);
+  //  left:  nx
+  // right:  ny
+  // inner:  min(nx, ny)
+  //  full:  max(nx, ny)
+  double t_alloc = omp_get_wtime();
+  bool *match = (bool *)R_alloc(nx, sizeof(bool)); // not sure if we need this
+  int *matchi = (int *)R_alloc(nx, sizeof(int));
+  int *matchlens = (int *)R_alloc(nx, sizeof(int));
+  int *matchstarts = (int *)R_alloc(nx, sizeof(int));
+  int matchn = 0;
+  t_alloc = omp_get_wtime() - t_alloc;
+  double t_sjoin = omp_get_wtime();
+  sjoin(INTEGER(x), nx, INTEGER(y), ny, left, match, matchi, matchstarts, matchlens, &matchn);
+  t_sjoin = omp_get_wtime() - t_sjoin;
+  Rprintf("alloc took %.3fs; sjoin took %.3fs\n", t_alloc, t_sjoin);
+  return joinOut(matchn, matchi, matchstarts, matchlens);
+}
+*/
+
+/*
+ * fast indexed join
+ * sort-merge join
+ */
+void ijoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
+           int *y, int ny, int *y_o, bool y_ord, int *y_starts, int ny_starts,
+           joinhow how,
+           bool *match, int *matchn,
+           int *starts_x, int *lens_x,
+           int *starts_y, int *lens_y) {
+  if (how != left)
+    error("only left join implemented so far");
+  if (!x_ord || !y_ord)
+    error("support for already unsorted not yet implemented");
+  bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
+  //int imatch = 0;
+  
+  int *x_lens=0, *y_lens=0;
+  if (!unq_x) {
+    x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395
+    for (int i=0; i<nx_starts-1; ++i) x_lens[i] = x_starts[i+1]-x_starts[i];
+    x_lens[nx_starts-1] = nx-x_starts[nx_starts-1];
+  }
+  if (!unq_y) {
+    y_lens = (int *)R_alloc(ny_starts, sizeof(int));
+    for (int i=0; i<ny_starts-1; ++i) y_lens[i] = y_starts[i+1]-y_starts[i];
+    y_lens[ny_starts-1] = ny-y_starts[ny_starts-1];
+  }
+
+  int i=0, j=0, is=0, js=0;
+  if (how==left && unq_x && unq_y) {
+    while (i<nx && j<ny) {
+      int x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        starts_y[i] = j+1; lens_y[i] = 1;
+        i++;
+      }
+      else if (x_i < y_j) i++;
+      else if (x_i > y_j) j++;
+    }
+  } else if (how==left && unq_y) {
+    while (is<nx_starts && j<ny) {
+      int i = x_starts[is]-1;
+      int x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        int j1 = j+1;
+        for (int ii=0; ii<x_lens[is]; ++ii) {
+          starts_y[i+ii] = j1; lens_y[i+ii] = 1;
+        }
+        is++;
+      }
+      else if (x_i < y_j) is++;
+      else if (x_i > y_j) j++;
+    }
+  } else if (how==left && unq_x) {
+    while (i<nx && js<ny_starts) {
+      int j = y_starts[js]-1;
+      int x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        starts_y[i] = j+1;
+        lens_y[i] = y_lens[js];
+        i++;
+      }
+      else if (x_i < y_j) i++;
+      else if (x_i > y_j) js++;
+    }
+  } else if (how==left) {
+    while (is<nx_starts && js<ny_starts) {
+      int i = x_starts[is]-1, j = y_starts[js]-1;
+      int x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        int j1 = j+1;
+        for (int ii=0; ii<x_lens[is]; ++ii) {
+          starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
+        }
+        is++;
+      }
+      else if (x_i < y_j) is++;
+      else if (x_i > y_j) js++;
+    }
+  } else if (how==inner && unq_x && unq_y) {
+    
+  } else if (how==full && unq_x && unq_y) {
+    
+  }
+  
+  matchn[0] = nx;
+}
+
+SEXP ijoinR(SEXP x, SEXP y) {
+  if (!isInteger(x) || !isInteger(y))
+    error("must be integer");
+  int nx = LENGTH(x), ny = LENGTH(y);
+  double t_index = omp_get_wtime();
+  SEXP x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE)));
+  SEXP y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE)));
+  SEXP x_starts = getAttrib(x_idx, sym_starts);
+  SEXP y_starts = getAttrib(y_idx, sym_starts);
+  t_index = omp_get_wtime() - t_index;
+  double t_alloc = omp_get_wtime();
+  bool *match = (bool *)R_alloc(0, sizeof(bool)); // not sure if we need this
+  int matchn = 0;
+  int *starts_x = (int *)R_alloc(0, sizeof(int));
+  int *lens_x = (int *)R_alloc(0, sizeof(int));
+  int *starts_y = (int *)R_alloc(nx, sizeof(int));
+  int *lens_y = (int *)R_alloc(nx, sizeof(int));
+  for (int i=0; i<nx; ++i) /*starts_x[i] = lens_x[i] = */starts_y[i] = lens_y[i] = NA_INTEGER;
+  t_alloc = omp_get_wtime() - t_alloc;
+  double t_ijoin = omp_get_wtime();
+  //            *x, nx,           *x_o,            x_ord,         *x_starts,        nx_starts,
+  ijoin(INTEGER(x), nx, INTEGER(x_idx), LENGTH(x_idx)==0, INTEGER(x_starts), LENGTH(x_starts),
+        INTEGER(y), ny, INTEGER(y_idx), LENGTH(y_idx)==0, INTEGER(y_starts), LENGTH(y_starts),
+        left, match, &matchn,
+        starts_x, lens_x, starts_y, lens_y);
+  t_ijoin = omp_get_wtime() - t_ijoin;
+  Rprintf("index took %.3fs; alloc took %.3fs; ijoin took %.3fs\n", t_index, t_alloc, t_ijoin);
+  UNPROTECT(2);
+  return joinOut(matchn, starts_x, lens_x, starts_y, lens_y);
+}
diff --git a/src/init.c b/src/init.c
index 916db3ab57..cb8678cd6e 100644
--- a/src/init.c
+++ b/src/init.c
@@ -119,6 +119,8 @@ SEXP lock();
 SEXP unlock();
 SEXP islockedR();
 SEXP allNAR();
+//SEXP sjoinR();
+SEXP ijoinR();
 
 // .Externals
 SEXP fastmean();
@@ -211,6 +213,7 @@ R_CallMethodDef callMethods[] = {
 {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1},
 {"CtestMsgR", (DL_FUNC) &testMsgR, -1},
 {"C_allNAR", (DL_FUNC) &allNAR, -1},
+{"CijoinR", (DL_FUNC) &ijoinR, -1},
 {NULL, NULL, 0}
 };
 

From febf07cbcf329ab53ba8c42a2b42b6d15c7064a4 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Fri, 5 Jun 2020 00:52:03 +0100
Subject: [PATCH 02/34] devdev

---
 R/wrappers.R          |   6 +-
 inst/tests/fjoin.Rraw |  57 ++++++++++++++++
 src/data.table.h      |   2 +-
 src/fjoin.c           | 155 +++++++++++++++++++++++++++---------------
 src/init.c            |   4 +-
 5 files changed, 162 insertions(+), 62 deletions(-)
 create mode 100644 inst/tests/fjoin.Rraw

diff --git a/R/wrappers.R b/R/wrappers.R
index c13feb78a7..1c9fa6c351 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -13,8 +13,8 @@ coerceFill = function(x) .Call(CcoerceFillR, x)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
-#fsjoin = function(x, y) .Call(CsjoinR, x, y)
-fijoin = function(x, y) {
-  ans = .Call(CijoinR, x, y)
+fjoin = function(x, y) {
+  ans = .Call(CfjoinR, x, y)
+  ans[[4L]][is.na(ans[[3L]])] = 1L
   ans[3:4]
 }
diff --git a/inst/tests/fjoin.Rraw b/inst/tests/fjoin.Rraw
new file mode 100644
index 0000000000..d9fabf7664
--- /dev/null
+++ b/inst/tests/fjoin.Rraw
@@ -0,0 +1,57 @@
+require(methods)
+
+if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
+  if ((tt<-compiler::enableJIT(-1))>0)
+    cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="")
+} else {
+  require(data.table)
+  test = data.table:::test
+  fjoin = data.table:::fjoin
+  bmerge = data.table:::bmerge
+  forderv = data.table:::forderv
+}
+check = function(x, y) {
+  a = d(x, y)
+  b = fjoin(x, y)
+  na = is.na(a[["starts"]])
+  fail = !identical(a[["starts"]][!na], b[["starts_y"]][!na])
+  fail = fail || !identical(a[["lens"]][!na], b[["lens_y"]][!na])
+  if (fail) {
+    if (interactive() && length(x) < 20L) {
+      out = capture.output(print(list(d=a, fj=b)))
+      stop("starts are not equal: ", out)
+    }
+    stop("starts are not equal")
+  }
+  invisible(TRUE)
+}
+
+
+d = function(x, y) bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)[1:2]
+
+x = c(1L,2L,3L,4L)
+y = c(2L,3L,5L)
+test(1.01, fjoin(x, y), d(x, y))
+x = c(1L,2L,3L,3L,4L)
+y = c(2L,3L,5L)
+test(1.02, fjoin(x, y), d(x, y))
+x = c(1L,2L,3L,4L)
+y = c(2L,3L,3L,5L)
+test(1.03, fjoin(x, y), d(x, y))
+x = c(1L,2L,3L,3L,4L)
+y = c(2L,3L,3L,5L)
+test(1.04, fjoin(x, y), d(x, y))
+
+# y unsorted
+x = c(1L,2L,3L,4L)
+y = c(2L,5L,3L)
+test(2.01, fjoin(x, y), d(x, y))
+x = c(1L,2L,3L,3L,4L)
+y = c(3L,2L,5L)
+test(2.02, fjoin(x, y), d(x, y))
+x = c(1L,2L,3L,4L)
+y = c(5L,3L,2L,3L)
+test(2.03, fjoin(x, y), d(x, y))
+x = c(1L,2L,3L,3L,4L)
+y = c(5L,3L,3L,2L)
+test(2.04, fjoin(x, y), d(x, y))
diff --git a/src/data.table.h b/src/data.table.h
index 0109e1da75..8bca1a7e02 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -245,4 +245,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
 
 // fjoin.c
-SEXP ijoinR(SEXP x, SEXP y);
+SEXP fjoinR(SEXP x, SEXP y);
diff --git a/src/fjoin.c b/src/fjoin.c
index 3695641643..92e7725732 100644
--- a/src/fjoin.c
+++ b/src/fjoin.c
@@ -24,10 +24,10 @@ SEXP joinOut(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y)
   SET_STRING_ELT(ansnames, 1, mkChar("lens_x"));
   //int *lens_x_p = INTEGER(VECTOR_ELT(ans, 1));
   SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, matchn));
-  SET_STRING_ELT(ansnames, 2, mkChar("starts_y"));
+  SET_STRING_ELT(ansnames, 2, mkChar("starts"));
   int *starts_y_p = INTEGER(VECTOR_ELT(ans, 2));
   SET_VECTOR_ELT(ans, 3, allocVector(INTSXP, matchn));
-  SET_STRING_ELT(ansnames, 3, mkChar("lens_y"));
+  SET_STRING_ELT(ansnames, 3, mkChar("lens"));
   int *lens_y_p = INTEGER(VECTOR_ELT(ans, 3));
   for (int i=0; i<matchn; ++i) {
     //starts_x_p[i] = starts_x[i];
@@ -101,8 +101,11 @@ SEXP sjoinR(SEXP x, SEXP y) {
 /*
  * fast indexed join
  * sort-merge join
+ * 
+ * split whole 1:nx_starts into numbers of buckets equal to threads
+ * each thread do binary merge x of first and last element, and then sort-merge to a subset of y defined by those matches
  */
-void ijoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
+void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
            int *y, int ny, int *y_o, bool y_ord, int *y_starts, int ny_starts,
            joinhow how,
            bool *match, int *matchn,
@@ -110,7 +113,7 @@ void ijoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
            int *starts_y, int *lens_y) {
   if (how != left)
     error("only left join implemented so far");
-  if (!x_ord || !y_ord)
+  if (!x_ord)
     error("support for already unsorted not yet implemented");
   bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
   //int imatch = 0;
@@ -127,67 +130,107 @@ void ijoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
     y_lens[ny_starts-1] = ny-y_starts[ny_starts-1];
   }
 
-  int i=0, j=0, is=0, js=0;
-  if (how==left && unq_x && unq_y) {
-    while (i<nx && j<ny) {
-      int x_i = x[i], y_j = y[j];
-      if (x_i == y_j) {
-        starts_y[i] = j+1; lens_y[i] = 1;
-        i++;
+  int i=0, j=0, is=0, js=0, x_i, y_j;
+  if (x_ord && y_ord) {
+    if (how==left && unq_x && unq_y) {
+      while (i<nx && j<ny) {
+        x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          starts_y[i] = j+1; lens_y[i] = 1;
+          i++;
+        } else if (x_i < y_j) i++; else if (x_i > y_j) j++;
       }
-      else if (x_i < y_j) i++;
-      else if (x_i > y_j) j++;
-    }
-  } else if (how==left && unq_y) {
-    while (is<nx_starts && j<ny) {
-      int i = x_starts[is]-1;
-      int x_i = x[i], y_j = y[j];
-      if (x_i == y_j) {
-        int j1 = j+1;
-        for (int ii=0; ii<x_lens[is]; ++ii) {
-          starts_y[i+ii] = j1; lens_y[i+ii] = 1;
-        }
-        is++;
+    } else if (how==left && unq_y) {
+      while (is<nx_starts && j<ny) {
+        i = x_starts[is]-1;
+        x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          int j1 = j+1;
+          for (int ii=0; ii<x_lens[is]; ++ii) {
+            starts_y[i+ii] = j1; lens_y[i+ii] = 1;
+          }
+          is++;
+        } else if (x_i < y_j) is++; else if (x_i > y_j) j++;
       }
-      else if (x_i < y_j) is++;
-      else if (x_i > y_j) j++;
-    }
-  } else if (how==left && unq_x) {
-    while (i<nx && js<ny_starts) {
-      int j = y_starts[js]-1;
-      int x_i = x[i], y_j = y[j];
-      if (x_i == y_j) {
-        starts_y[i] = j+1;
-        lens_y[i] = y_lens[js];
-        i++;
+    } else if (how==left && unq_x) {
+      while (i<nx && js<ny_starts) {
+        j = y_starts[js]-1;
+        x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          starts_y[i] = j+1;
+          lens_y[i] = y_lens[js];
+          i++;
+        } else if (x_i < y_j) i++; else if (x_i > y_j) js++;
+      }
+    } else if (how==left) {
+      while (is<nx_starts && js<ny_starts) {
+        i = x_starts[is]-1, j = y_starts[js]-1;
+        x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          int j1 = j+1;
+          for (int ii=0; ii<x_lens[is]; ++ii) {
+            starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
+          }
+          is++;
+        } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
       }
-      else if (x_i < y_j) i++;
-      else if (x_i > y_j) js++;
     }
-  } else if (how==left) {
-    while (is<nx_starts && js<ny_starts) {
-      int i = x_starts[is]-1, j = y_starts[js]-1;
-      int x_i = x[i], y_j = y[j];
-      if (x_i == y_j) {
-        int j1 = j+1;
-        for (int ii=0; ii<x_lens[is]; ++ii) {
-          starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
-        }
-        is++;
+  } else if (x_ord) {
+    if (how==left && unq_x && unq_y) {
+      while (i<nx && j<ny) {
+        x_i = x[i], y_j = y[y_o[j]-1];
+        if (x_i == y_j) {
+          starts_y[i] = j+1; lens_y[i] = 1;
+          i++;
+        } else if (x_i < y_j) i++; else if (x_i > y_j) j++;
+      }
+    } else if (how==left && unq_y) {
+      while (is<nx_starts && j<ny) {
+        i = x_starts[is]-1;
+        x_i = x[i], y_j = y[y_o[j]-1];
+        //Rprintf("x[x_starts[%d]-1]=%d; y[y_o[%d]-1]=%d\n", is, x[i], j, y[y_o[j]-1]);
+        if (x_i == y_j) {
+          int j1 = j+1;
+          for (int ii=0; ii<x_lens[is]; ++ii) {
+            //Rprintf("starts_y[%d+%d]=%d\n", i, ii, j1);
+            starts_y[i+ii] = j1; lens_y[i+ii] = 1;
+          }
+          is++;
+        } else if (x_i < y_j) is++; else if (x_i > y_j) j++;
+      }
+    } else if (how==left && unq_x) {
+      while (i<nx && js<ny_starts) {
+        j = y_starts[js]-1;
+        x_i = x[i], y_j = y[y_o[j]-1];
+        if (x_i == y_j) {
+          starts_y[i] = j+1;
+          lens_y[i] = y_lens[js];
+          i++;
+        } else if (x_i < y_j) i++; else if (x_i > y_j) js++;
+      }
+    } else if (how==left) {
+      while (is<nx_starts && js<ny_starts) {
+        i = x_starts[is]-1, j = y_starts[js]-1;
+        x_i = x[i], y_j = y[y_o[j]-1];
+        if (x_i == y_j) {
+          int j1 = j+1;
+          for (int ii=0; ii<x_lens[is]; ++ii) {
+            starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
+          }
+          is++;
+        } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
       }
-      else if (x_i < y_j) is++;
-      else if (x_i > y_j) js++;
     }
-  } else if (how==inner && unq_x && unq_y) {
-    
-  } else if (how==full && unq_x && unq_y) {
-    
+  } else if (y_ord) {
+    error("dev");
+  } else {
+    error("devVVV");
+    //Rprintf("x_i=%d; y[y_o[j]-1]= y[y_o[%d]-1]= y[%d-1]= y[%d]= %d\n", x[i], j, y_o[j], y_o[j]-1, y[y_o[j]-1]);
   }
-  
   matchn[0] = nx;
 }
 
-SEXP ijoinR(SEXP x, SEXP y) {
+SEXP fjoinR(SEXP x, SEXP y) {
   if (!isInteger(x) || !isInteger(y))
     error("must be integer");
   int nx = LENGTH(x), ny = LENGTH(y);
@@ -208,7 +251,7 @@ SEXP ijoinR(SEXP x, SEXP y) {
   t_alloc = omp_get_wtime() - t_alloc;
   double t_ijoin = omp_get_wtime();
   //            *x, nx,           *x_o,            x_ord,         *x_starts,        nx_starts,
-  ijoin(INTEGER(x), nx, INTEGER(x_idx), LENGTH(x_idx)==0, INTEGER(x_starts), LENGTH(x_starts),
+  fjoin(INTEGER(x), nx, INTEGER(x_idx), LENGTH(x_idx)==0, INTEGER(x_starts), LENGTH(x_starts),
         INTEGER(y), ny, INTEGER(y_idx), LENGTH(y_idx)==0, INTEGER(y_starts), LENGTH(y_starts),
         left, match, &matchn,
         starts_x, lens_x, starts_y, lens_y);
diff --git a/src/init.c b/src/init.c
index cb8678cd6e..9f7f45ee8c 100644
--- a/src/init.c
+++ b/src/init.c
@@ -120,7 +120,7 @@ SEXP unlock();
 SEXP islockedR();
 SEXP allNAR();
 //SEXP sjoinR();
-SEXP ijoinR();
+SEXP fjoinR();
 
 // .Externals
 SEXP fastmean();
@@ -213,7 +213,7 @@ R_CallMethodDef callMethods[] = {
 {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1},
 {"CtestMsgR", (DL_FUNC) &testMsgR, -1},
 {"C_allNAR", (DL_FUNC) &allNAR, -1},
-{"CijoinR", (DL_FUNC) &ijoinR, -1},
+{"CfjoinR", (DL_FUNC) &fjoinR, -1},
 {NULL, NULL, 0}
 };
 

From 3480d328d492a47d5abb9c19ea4e7edf9da6315a Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Fri, 5 Jun 2020 12:05:13 +0100
Subject: [PATCH 03/34] unsorted works as well

---
 inst/tests/fjoin.Rraw |  5 +++++
 src/fjoin.c           | 16 ++++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/inst/tests/fjoin.Rraw b/inst/tests/fjoin.Rraw
index d9fabf7664..0e16f0965e 100644
--- a/inst/tests/fjoin.Rraw
+++ b/inst/tests/fjoin.Rraw
@@ -55,3 +55,8 @@ test(2.03, fjoin(x, y), d(x, y))
 x = c(1L,2L,3L,3L,4L)
 y = c(5L,3L,3L,2L)
 test(2.04, fjoin(x, y), d(x, y))
+
+# x, y unsorted
+x = c(4L,2L,3L,1L)
+y = c(2L,5L,3L)
+test(2.01, fjoin(x, y), d(x, y))
diff --git a/src/fjoin.c b/src/fjoin.c
index 92e7725732..fc5dfbbd6b 100644
--- a/src/fjoin.c
+++ b/src/fjoin.c
@@ -113,8 +113,6 @@ void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
            int *starts_y, int *lens_y) {
   if (how != left)
     error("only left join implemented so far");
-  if (!x_ord)
-    error("support for already unsorted not yet implemented");
   bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
   //int imatch = 0;
   
@@ -222,9 +220,19 @@ void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
       }
     }
   } else if (y_ord) {
-    error("dev");
+    error("y_ord not yet implemented");
   } else {
-    error("devVVV");
+    if (how==left && unq_x && unq_y) {
+      while (i<nx && j<ny) {
+        x_i = x[x_o[i]-1], y_j = y[y_o[j]-1];
+        if (x_i == y_j) {
+          starts_y[i] = j+1; lens_y[i] = 1;
+          i++;
+        } else if (x_i < y_j) i++; else if (x_i > y_j) j++;
+      }
+    } else {
+      error("dev: !x_ord && !y_ord && (!unq_x || !unq_y)");
+    }
     //Rprintf("x_i=%d; y[y_o[j]-1]= y[y_o[%d]-1]= y[%d-1]= y[%d]= %d\n", x[i], j, y_o[j], y_o[j]-1, y[y_o[j]-1]);
   }
   matchn[0] = nx;

From 99589c5d29027787c7d65e3d803a02cd4e8af55f Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Fri, 5 Jun 2020 13:06:04 +0100
Subject: [PATCH 04/34] testing and fixing

---
 inst/tests/fjoin.Rraw | 63 ++++++++++++++++++++++++++++++++++++++++---
 src/fjoin.c           |  8 +++---
 2 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/inst/tests/fjoin.Rraw b/inst/tests/fjoin.Rraw
index 0e16f0965e..cfdf0d0beb 100644
--- a/inst/tests/fjoin.Rraw
+++ b/inst/tests/fjoin.Rraw
@@ -29,6 +29,7 @@ check = function(x, y) {
 
 d = function(x, y) bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)[1:2]
 
+## x y sorted
 x = c(1L,2L,3L,4L)
 y = c(2L,3L,5L)
 test(1.01, fjoin(x, y), d(x, y))
@@ -42,7 +43,7 @@ x = c(1L,2L,3L,3L,4L)
 y = c(2L,3L,3L,5L)
 test(1.04, fjoin(x, y), d(x, y))
 
-# y unsorted
+## y unsorted
 x = c(1L,2L,3L,4L)
 y = c(2L,5L,3L)
 test(2.01, fjoin(x, y), d(x, y))
@@ -56,7 +57,61 @@ x = c(1L,2L,3L,3L,4L)
 y = c(5L,3L,3L,2L)
 test(2.04, fjoin(x, y), d(x, y))
 
-# x, y unsorted
-x = c(4L,2L,3L,1L)
+#...
+
+## xy unsorted
+x = c(4L,1L,3L,2L)
 y = c(2L,5L,3L)
-test(2.01, fjoin(x, y), d(x, y))
+test(4.01, fjoin(x, y), d(x, y))
+
+#...
+
+## xy sorted
+ssa = function(unq_n, size, sort=FALSE) {
+  stopifnot(unq_n <= size)
+  unq_sub = seq_len(unq_n)
+  ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE)))
+  if (unq_n==size) stopifnot(uniqueN(ans)==length(ans))
+  if (sort) sort(ans) else ans
+}
+N = 1e2
+x = ssa(N, N, sort=TRUE)
+y = ssa(N, N, sort=TRUE)
+test(11.01, fjoin(x, y), d(x, y))
+x = ssa(N, N*1.1, sort=TRUE)
+y = ssa(N, N, sort=TRUE)
+test(11.02, fjoin(x, y), d(x, y))
+x = ssa(N, N, sort=TRUE)
+y = ssa(N, N*1.1, sort=TRUE)
+test(11.03, fjoin(x, y), d(x, y))
+x = ssa(N, N*1.1, sort=TRUE)
+y = ssa(N, N*1.1, sort=TRUE)
+test(11.04, fjoin(x, y), d(x, y))
+
+## y unsorted
+x = ssa(N, N, sort=TRUE)
+y = ssa(N, N)
+test(12.01, fjoin(x, y), d(x, y))
+x = ssa(N, N*1.1, sort=TRUE)
+y = ssa(N, N)
+test(12.02, fjoin(x, y), d(x, y))
+x = ssa(N, N, sort=TRUE)
+y = ssa(N, N*1.1)
+test(12.03, fjoin(x, y), d(x, y))
+x = ssa(N, N*1.1, sort=TRUE)
+y = ssa(N, N*1.1)
+test(12.04, fjoin(x, y), d(x, y))
+
+## xy unsorted
+x = ssa(N, N)
+y = ssa(N, N)
+test(14.01, fjoin(x, y), d(x, y))
+#x = ssa(N, N*1.1)
+#y = ssa(N, N)
+#test(14.02, fjoin(x, y), d(x, y))
+#x = ssa(N, N)
+#y = ssa(N, N*1.1)
+#test(14.03, fjoin(x, y), d(x, y))
+#x = ssa(N, N*1.1)
+#y = ssa(N, N*1.1)
+#test(14.04, fjoin(x, y), d(x, y))
diff --git a/src/fjoin.c b/src/fjoin.c
index fc5dfbbd6b..96d1fd45f8 100644
--- a/src/fjoin.c
+++ b/src/fjoin.c
@@ -120,12 +120,14 @@ void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
   if (!unq_x) {
     x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395
     for (int i=0; i<nx_starts-1; ++i) x_lens[i] = x_starts[i+1]-x_starts[i];
-    x_lens[nx_starts-1] = nx-x_starts[nx_starts-1];
+    //Rprintf("nx=%d; x_starts[nx_starts-1]=%d\n", nx, x_starts[nx_starts-1]);
+    x_lens[nx_starts-1] = nx-x_starts[nx_starts-1]+1;
+    //for (int i=0; i<nx_starts; ++i) Rprintf("x_lens[%d]=%d\n", i, x_lens[i]);
   }
   if (!unq_y) {
     y_lens = (int *)R_alloc(ny_starts, sizeof(int));
     for (int i=0; i<ny_starts-1; ++i) y_lens[i] = y_starts[i+1]-y_starts[i];
-    y_lens[ny_starts-1] = ny-y_starts[ny_starts-1];
+    y_lens[ny_starts-1] = ny-y_starts[ny_starts-1]+1;
   }
 
   int i=0, j=0, is=0, js=0, x_i, y_j;
@@ -226,7 +228,7 @@ void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
       while (i<nx && j<ny) {
         x_i = x[x_o[i]-1], y_j = y[y_o[j]-1];
         if (x_i == y_j) {
-          starts_y[i] = j+1; lens_y[i] = 1;
+          starts_y[x_o[i]-1] = j+1; lens_y[x_o[i]-1] = 1;
           i++;
         } else if (x_i < y_j) i++; else if (x_i > y_j) j++;
       }

From 94a0faaa2480bacba658b797031dfb5f3e54916a Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 6 Jun 2020 01:19:25 +0100
Subject: [PATCH 05/34] devdevdev

---
 R/wrappers.R     |   4 +-
 src/data.table.h |   2 +-
 src/fjoin.c      | 409 +++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 401 insertions(+), 14 deletions(-)

diff --git a/R/wrappers.R b/R/wrappers.R
index 1c9fa6c351..6b9da0d799 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -13,8 +13,8 @@ coerceFill = function(x) .Call(CcoerceFillR, x)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
-fjoin = function(x, y) {
-  ans = .Call(CfjoinR, x, y)
+fjoin = function(x, y, how=c("left","inner","full")) {
+  ans = .Call(CfjoinR, x, y, match.arg(how))
   ans[[4L]][is.na(ans[[3L]])] = 1L
   ans[3:4]
 }
diff --git a/src/data.table.h b/src/data.table.h
index 8bca1a7e02..abd993a592 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -245,4 +245,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
 
 // fjoin.c
-SEXP fjoinR(SEXP x, SEXP y);
+SEXP fjoinR(SEXP x, SEXP y, SEXP how);
diff --git a/src/fjoin.c b/src/fjoin.c
index 96d1fd45f8..6d7b5dff11 100644
--- a/src/fjoin.c
+++ b/src/fjoin.c
@@ -14,7 +14,7 @@ typedef enum {
   inner, left, right, full, semi, anti, cross
 } joinhow;
 
-SEXP joinOut(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y) {
+SEXP joinOut2(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y) {
   SEXP ans = PROTECT(allocVector(VECSXP, 4)), ansnames;
   setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 4));
   SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, 0));
@@ -105,14 +105,15 @@ SEXP sjoinR(SEXP x, SEXP y) {
  * split whole 1:nx_starts into numbers of buckets equal to threads
  * each thread do binary merge x of first and last element, and then sort-merge to a subset of y defined by those matches
  */
-void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
+// while loop approach returning results consistent to bmerge
+void ijoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
            int *y, int ny, int *y_o, bool y_ord, int *y_starts, int ny_starts,
            joinhow how,
            bool *match, int *matchn,
            int *starts_x, int *lens_x,
            int *starts_y, int *lens_y) {
-  if (how != left)
-    error("only left join implemented so far");
+  //if (how != left)
+  //  error("only left join implemented so far");
   bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
   //int imatch = 0;
   
@@ -174,6 +175,14 @@ void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
           is++;
         } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
       }
+    } else if (how==inner && unq_x && unq_y) {
+      error("dev");
+    } else if (how==inner) {
+      error("not yet implemented");
+    } else if (how==full && unq_x && unq_y) {
+      error("dev");
+    } else if (how==full) {
+      error("not yet implemented");
     }
   } else if (x_ord) {
     if (how==left && unq_x && unq_y) {
@@ -240,7 +249,7 @@ void fjoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
   matchn[0] = nx;
 }
 
-SEXP fjoinR(SEXP x, SEXP y) {
+SEXP ijoinR(SEXP x, SEXP y, SEXP how) {
   if (!isInteger(x) || !isInteger(y))
     error("must be integer");
   int nx = LENGTH(x), ny = LENGTH(y);
@@ -251,22 +260,400 @@ SEXP fjoinR(SEXP x, SEXP y) {
   SEXP y_starts = getAttrib(y_idx, sym_starts);
   t_index = omp_get_wtime() - t_index;
   double t_alloc = omp_get_wtime();
+  int ans_n = 0;
+  joinhow how2;
+  //  left:  nx
+  // right:  ny
+  // inner:  min(nx, ny)
+  //  full:  sum(nx, ny)
+  if (STRING_ELT(how, 0)==mkChar("left")) {
+    ans_n = nx; how2=left;
+  } else if (STRING_ELT(how, 0)==mkChar("inner")) {
+    ans_n = MIN(nx, ny); how2=inner;
+  } else if (STRING_ELT(how, 0)==mkChar("full")) {
+    ans_n = nx+ny; how2=full;
+  } else error("other how not yet implemented");
   bool *match = (bool *)R_alloc(0, sizeof(bool)); // not sure if we need this
   int matchn = 0;
   int *starts_x = (int *)R_alloc(0, sizeof(int));
   int *lens_x = (int *)R_alloc(0, sizeof(int));
-  int *starts_y = (int *)R_alloc(nx, sizeof(int));
-  int *lens_y = (int *)R_alloc(nx, sizeof(int));
-  for (int i=0; i<nx; ++i) /*starts_x[i] = lens_x[i] = */starts_y[i] = lens_y[i] = NA_INTEGER;
+  int *starts_y = (int *)R_alloc(ans_n, sizeof(int));
+  int *lens_y = (int *)R_alloc(ans_n, sizeof(int));
+  for (int i=0; i<ans_n; ++i) /*starts_x[i] = lens_x[i] = */starts_y[i] = lens_y[i] = NA_INTEGER;
   t_alloc = omp_get_wtime() - t_alloc;
   double t_ijoin = omp_get_wtime();
   //            *x, nx,           *x_o,            x_ord,         *x_starts,        nx_starts,
-  fjoin(INTEGER(x), nx, INTEGER(x_idx), LENGTH(x_idx)==0, INTEGER(x_starts), LENGTH(x_starts),
+  ijoin(INTEGER(x), nx, INTEGER(x_idx), LENGTH(x_idx)==0, INTEGER(x_starts), LENGTH(x_starts),
         INTEGER(y), ny, INTEGER(y_idx), LENGTH(y_idx)==0, INTEGER(y_starts), LENGTH(y_starts),
-        left, match, &matchn,
+        how2, match, &matchn,
         starts_x, lens_x, starts_y, lens_y);
   t_ijoin = omp_get_wtime() - t_ijoin;
   Rprintf("index took %.3fs; alloc took %.3fs; ijoin took %.3fs\n", t_index, t_alloc, t_ijoin);
   UNPROTECT(2);
-  return joinOut(matchn, starts_x, lens_x, starts_y, lens_y);
+  return joinOut2(matchn, starts_x, lens_x, starts_y, lens_y);
+}
+
+
+
+/*
+ * openmp friendly
+ * get range of x and y
+ * take pmin of it
+ * 
+ */
+/*static inline int imin(int a, int b) { return a < b ? a : b; }
+static inline int imax(int a, int b) { return a > b ? a : b; }
+static inline int min_idx(int *x, int x_idx, int *y, int y_idx) { return x[x_idx] < y[y_idx] ? x_idx : y_idx; }
+static inline int max_idx(int *x, int x_idx, int *y, int y_idx) { return x[x_idx] > y[y_idx] ? x_idx : y_idx; }*/
+static int max_i_match(int *y, int ny, int *y_o, bool y_ord, int val) { // those need to be binary search obviously
+  for (int i=0; i<ny; ++i) {
+    //Rprintf("i=%d: y[y_o[i]] > val: %d > %d\n", i, y[y_o[i]-1], val);
+    if (y[y_o[i]-1]>val) {
+      if (i==0) error("i=0 max_i_match, unhandled");
+      return(i-1);
+    }
+  }
+  error("not yet handled in max_i_match");
+}
+static int min_i_match(int *y, int ny, int *y_o, bool y_ord, int val) {
+  for (int i=ny-1; i<ny; --i) {
+    if (y[y_o[i]-1]<val) {
+      if (i==ny-1) error("i=ny-1 min_i_match, unhandled");
+      return(i+1);
+    }
+  }
+  error("not yet handled in min_i_match");
+}
+
+void grpLens(int *starts, int n_starts, int n, int *lens) { // #4395
+  for (int i=0; i<n_starts-1; ++i)
+    lens[i] = starts[i+1]-starts[i];
+  lens[n_starts-1] = n - starts[n_starts-1]+1;
+  return;
+}
+
+void fjoin(int *x, int nx, int *x_starts, int nx_starts,
+           int *y, int ny, int *y_starts, int ny_starts,
+           joinhow how,
+           bool *match, int *matchn,
+           int *starts_x, int *lens_x,
+           int *starts_y, int *lens_y) {
+  //if (how != left)
+  //  error("only left join implemented so far");
+  bool x_ord=true, y_ord=true;
+  int *x_o=0, *y_o=0;
+  
+  bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
+  int *x_lens=0, *y_lens=0;
+  double t_lens = omp_get_wtime();
+  if (!unq_x) {
+    x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395
+    grpLens(x_starts, nx_starts, nx, x_lens);
+  }
+  if (!unq_y) {
+    y_lens = (int *)R_alloc(ny_starts, sizeof(int));
+    grpLens(y_starts, ny_starts, ny, y_lens);
+  }
+  t_lens = omp_get_wtime() - t_lens;
+  
+  double t_batch=0, t_join=0;
+  int i=0, j=0, is=0, js=0, x_i, y_j;
+  if (x_ord && y_ord) {
+    if (how==left && unq_x && unq_y) {
+      error("dev");
+      while (i<nx && j<ny) {
+        x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          starts_y[i] = j+1; lens_y[i] = 1;
+          i++;
+        } else if (x_i < y_j) i++; else if (x_i > y_j) j++;
+      }
+    } else if (how==left && unq_y) {
+      error("dev");
+      while (is<nx_starts && j<ny) {
+        i = x_starts[is]-1;
+        x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          int j1 = j+1;
+          for (int ii=0; ii<x_lens[is]; ++ii) {
+            starts_y[i+ii] = j1; lens_y[i+ii] = 1;
+          }
+          is++;
+        } else if (x_i < y_j) is++; else if (x_i > y_j) j++;
+      }
+    } else if (how==left && unq_x) {
+      error("dev");
+      while (i<nx && js<ny_starts) {
+        j = y_starts[js]-1;
+        x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          starts_y[i] = j+1;
+          lens_y[i] = y_lens[js];
+          i++;
+        } else if (x_i < y_j) i++; else if (x_i > y_j) js++;
+      }
+    } else if (how==left) {
+      double t_batch = omp_get_wtime();
+      //int min_i = 0, max_i = x_starts[nx_starts-1]-1;
+      //Rprintf("x[i]=[%d:%d]\n", min_i, max_i);
+      //int nth = getDTthreads();
+      int nBatch=1;
+      //int b1_min_i = min_i, b1_max_i = max_i;
+      //int b1_min_i = min_i, b1_max_i = x_starts[nx_starts/2]-1;
+      //int b2_min_i = x_starts[nx_starts/2+1]-1, b2_max_i = max_i;
+      //Rprintf("x: 1[%d-%d]\n", b1_min_i, b1_max_i);
+      //Rprintf("x: 2[%d-%d]\n", b2_min_i, b2_max_i);
+      t_batch = omp_get_wtime() - t_batch;
+      
+      //int min_y = y[y_starts[0]-1];
+      //int max_y = y[y_starts[nx_starts-1]-1];
+      //Rprintf("min_x=%d; max_x=%d; min_y=%d, max_y=%d\n", min_x, max_x, min_y, max_y);
+      int *th = (int *)R_alloc(nBatch, sizeof(int)); // debug thread utilization
+      //bool skip=false;
+      t_join = omp_get_wtime();
+      //#pragma omp parallel for schedule(dynamic) num_threads(getDTthreads())
+      for (int b=0; b<nBatch; ++b) {
+        int i=0, j=0, is=0, js=0, x_i, y_j;
+        while (is<nx_starts && js<ny_starts) {
+          //Rprintf("============= is=%d\n", is);
+          i = x_starts[is]-1, j = y_starts[js]-1;
+          x_i = x[i], y_j = y[j];
+          if (x_i == y_j) {
+            //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
+            int j1 = j+1;
+            for (int ii=0; ii<x_lens[is]; ++ii) {
+              //Rprintf("starts_y[%d]=%d\n", i+ii, j1);
+              starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
+            }
+            is++;
+          } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
+        }
+        /*for (is=0; is<nx_starts; ++is) {
+          Rprintf("============= is=%s\n", is);
+          if (skip) {Rprintf("skip is=%s\n", is); continue;} // we need this to stop on !js<ny_starts
+          i = x_starts[is]-1, j = y_starts[js]-1;
+          x_i = x[i], y_j = y[j];
+          if (x_i == y_j) {
+            Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
+            int j1 = j+1;
+            for (int ii=0; ii<x_lens[is]; ++ii) {
+              Rprintf("starts_y[%d]=%d\n", i+ii, j1);
+              starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
+            }
+            //is++;
+          }
+          else if (x_i < y_j) ;//is++;
+          else if (x_i > y_j) {
+            while(x[x_starts[is+1]-1] < )
+            if (++js>=ny_starts) {skip=true; continue}
+          }
+        }*/
+        th[b] = omp_get_thread_num();
+      }
+      t_join = omp_get_wtime() - t_join;
+      for (int b=0; b<nBatch; ++b) Rprintf("th(batch[%d])=%d\n", b, th[b]);
+      
+      /*while (is<nx_starts && js<ny_starts) {
+        i = x_starts[is]-1, j = y_starts[js]-1;
+        x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          int j1 = j+1;
+          for (int ii=0; ii<x_lens[is]; ++ii) {
+            starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
+          }
+          is++;
+        } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
+      }*/
+    } else if (how==inner && unq_x && unq_y) {
+      error("dev");
+    } else if (how==inner) {
+      error("not yet implemented");
+    } else if (how==full && unq_x && unq_y) {
+      error("dev");
+    } else if (how==full) {
+      /*
+       * 
+       */
+      error("not yet implemented");
+    }
+  } else {
+    error("input must be ordered!");
+    if (false) {
+      // find bucket ranges
+      int min_x_i = x_o[0]-1, max_x_i = x_o[nx-1]-1;
+      int min_y_i = y_o[0]-1, max_y_i = y_o[ny-1]-1;
+      //int min_x = x[min_x_i], max_x = x[max_x_i];
+      //int min_y = y[min_y_i], max_y = y[max_y_i];
+      
+      //bool trim_min_x = min_x < min_y;
+      //bool trim_min_y = min_x > min_y;
+      //bool trim_max_x = max_x > max_y;
+      //bool trim_max_y = max_x < max_y;
+      // todo trim buckets to join-able subsets of x and y
+      
+      //Rprintf("min_x=%d; max_x=%d; min_y=%d, max_y=%d\n", min_x, max_x, min_y, max_y);
+      //int min_xy = imin(min_x, min_y), max_xy = imin(max_x, max_y);
+      //if (max_xy - min_xy < 2) error("not handled yet");
+      
+      int n_buckets = 2;
+      if (n_buckets != 2) error("not handled yet");
+      // this needs to be done for every bucket
+      if (y_ord) error("y must not be sorted");
+      
+      int b1_max_x_o_i = max_i_match(x, nx, x_o, x_ord, x[x_o[nx/2]-1]);
+      Rprintf("b1_max_x_o_i=%d; x[x_o[b1_max_x_o_i]-1]=%d\n", b1_max_x_o_i, x[x_o[b1_max_x_o_i]-1]);
+      int b1_min_x_i = min_x_i, b1_max_x_i = x_o[b1_max_x_o_i]-1; // this need to ensure that duplicate goes into same bucket!! fjoin(c(1:3,3L,3L,5L,4L), y), 3 goes into both buckets!
+      int b2_min_x_i = x_o[b1_max_x_o_i+1]-1, b2_max_x_i = max_x_i;
+      if (x[b1_max_x_i]==x[b2_min_x_i]) error("value of x=%d fits into multiple buckets", x[b2_min_x_i]);
+      Rprintf("min(x)=%d; max(x)=%d; len(x)=%d\n", x[min_x_i], x[max_x_i], nx);
+      Rprintf("b1(min(x))=%d; b1(max(x))=%d; b1(len(x))=%d\n", x[b1_min_x_i], x[b1_max_x_i], b1_max_x_o_i-0);
+      Rprintf("b2(min(x))=%d; b2(max(x))=%d; b2(len(x))=%d\n", x[b2_min_x_i], x[b2_max_x_i], nx-b1_max_x_o_i+1);
+      error("dev");
+      int b1_max_y_o_i = max_i_match(y, ny, y_o, y_ord, /*value to match to*/x[b1_max_x_i]);
+      int b1_min_y_i = min_y_i, b1_max_y_i = y_o[b1_max_y_o_i]-1;
+      int b2_min_y_o_i = min_i_match(y, ny, y_o, y_ord, x[b2_min_x_i]);
+      int b2_min_y_i = y_o[b2_min_y_o_i]-1, b2_max_y_i = max_y_i;
+      Rprintf("b1_min_x_i=%d, b1_max_x_i=%d, b2_min_x_i=%d, b2_max_x_i=%d\n", b1_min_x_i, b1_max_x_i, b2_min_x_i, b2_max_x_i);
+      Rprintf("buckets: 1(x:%d-%d, y:%d-%d); 2(x:%d-%d, y:%d-%d)\n", x[b1_min_x_i], x[b1_max_x_i], y[b1_min_y_i], y[b1_max_y_i], x[b2_min_x_i], x[b2_max_x_i], y[b2_min_y_i], y[b2_max_y_i]);
+      // openmp here!!
+      for (int b=0; b<n_buckets; ++b) {
+        int *x_o_, nx_, *y_o_, ny_;
+        if (b==0) {
+          x_o_ = x_o;
+          nx_ = nx/2;
+          y_o_ = y_o;
+          ny_ = b1_max_y_o_i;
+        } else {
+          x_o_ = x_o + (nx/2+1)*sizeof(int);
+          nx_ = nx-nx/2;
+          y_o_ = y_o + b2_min_y_o_i*sizeof(int);
+          ny_ = ny-b2_min_y_o_i+1;
+        }
+        Rprintf("--- bucket %d content ---", b);
+        Rprintf("\nx (len %d):", nx_);
+        for (int i=0; i<nx_; ++i) Rprintf(" %d", x[x_o_[i]-1]);
+        Rprintf("\ny (len %d):", ny_);
+        for (int i=0; i<ny_; ++i) Rprintf(" %d", y[y_o_[i]-1]);
+        Rprintf("\n");
+        //bucketjoin(x, y, bucket_len_x, bucket_start_x, x_o, x_min, y
+      }
+      error("dev");
+      //error("dev: !x_ord && !y_ord && (!unq_x || !unq_y)");
+    }
+    //Rprintf("x_i=%d; y[y_o[j]-1]= y[y_o[%d]-1]= y[%d-1]= y[%d]= %d\n", x[i], j, y_o[j], y_o[j]-1, y[y_o[j]-1]);
+  }
+  Rprintf("fjoin: lens took %.3fs; batch took %.3fs; join took %.3fs\n", t_lens, t_batch, t_join);
+  matchn[0] = nx;
+}
+
+SEXP sortInt(SEXP x, SEXP idx) {
+  if (!LENGTH(idx))
+    error("already sorted, should be escaped");
+  SEXP ans = PROTECT(allocVector(INTSXP, LENGTH(x)));
+  int *xp = INTEGER(x), *idxp = INTEGER(idx), *ansp = INTEGER(ans);
+  for (int i=0; i<LENGTH(x); ++i)
+    ansp[i] = xp[idxp[i]-1];
+  UNPROTECT(1);
+  return ans;
+}
+
+SEXP joinOut(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y, bool x_ord, int *x_o_idx) {
+  SEXP ans = PROTECT(allocVector(VECSXP, 4)), ansnames;
+  setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 4));
+  SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, 0));
+  SET_STRING_ELT(ansnames, 0, mkChar("starts_x"));
+  //int *starts_x_p = INTEGER(VECTOR_ELT(ans, 0));
+  SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, 0));
+  SET_STRING_ELT(ansnames, 1, mkChar("lens_x"));
+  //int *lens_x_p = INTEGER(VECTOR_ELT(ans, 1));
+  SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, matchn));
+  SET_STRING_ELT(ansnames, 2, mkChar("starts"));
+  int *starts_y_p = INTEGER(VECTOR_ELT(ans, 2));
+  SET_VECTOR_ELT(ans, 3, allocVector(INTSXP, matchn));
+  SET_STRING_ELT(ansnames, 3, mkChar("lens"));
+  int *lens_y_p = INTEGER(VECTOR_ELT(ans, 3));
+  if (!x_ord) {
+    for (int i=0; i<matchn; ++i) {
+      //starts_x_p[i] = starts_x[i];
+      //lens_x_p[i] = lens_x[i];
+      starts_y_p[i] = starts_y[x_o_idx[i]-1];
+      lens_y_p[i] = lens_y[x_o_idx[i]-1];
+    }
+  } else {
+    for (int i=0; i<matchn; ++i) {
+      //starts_x_p[i] = starts_x[i];
+      //lens_x_p[i] = lens_x[i];
+      starts_y_p[i] = starts_y[i];
+      lens_y_p[i] = lens_y[i];
+    }
+  }
+  UNPROTECT(1);
+  return ans;
+}
+
+
+SEXP fjoinR(SEXP x, SEXP y, SEXP how) {
+
+  if (!isInteger(x) || !isInteger(y))
+    error("must be integer");
+  int protecti=0, nx = LENGTH(x), ny = LENGTH(y);
+
+  double t_index = omp_get_wtime();
+  SEXP x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
+  SEXP y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
+  SEXP x_starts = getAttrib(x_idx, sym_starts);
+  SEXP y_starts = getAttrib(y_idx, sym_starts);
+  t_index = omp_get_wtime() - t_index;
+
+  bool x_ord = LENGTH(x_idx)==0, y_ord = LENGTH(y_idx)==0;
+  SEXP x_o_idx = R_NilValue;
+  double t_sort = omp_get_wtime();
+  if (!x_ord) {
+    x = PROTECT(sortInt(x, x_idx)); protecti++;
+    //x_ord = true;
+    x_o_idx = PROTECT(forder(x_idx, R_NilValue, ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
+  }
+  if (!y_ord) {
+    y = PROTECT(sortInt(y, y_idx)); protecti++;
+    //y_ord = true;
+  }
+  t_sort = omp_get_wtime() - t_sort;
+
+  double t_alloc = omp_get_wtime();
+  int ans_n = 0;
+  joinhow how2;
+  //  left:  nx
+  // right:  ny
+  // inner:  min(nx, ny)
+  //  full:  sum(nx, ny)
+  if (STRING_ELT(how, 0)==mkChar("left")) {
+    ans_n = nx; how2=left;
+  } else if (STRING_ELT(how, 0)==mkChar("inner")) {
+    ans_n = MIN(nx, ny); how2=inner;
+  } else if (STRING_ELT(how, 0)==mkChar("full")) {
+    ans_n = nx+ny; how2=full;
+  } else error("other how not yet implemented");
+  bool *match = (bool *)R_alloc(0, sizeof(bool)); // not sure if we need this
+  int matchn = 0;
+  int *starts_x = (int *)R_alloc(0, sizeof(int));
+  int *lens_x = (int *)R_alloc(0, sizeof(int));
+  int *starts_y = (int *)R_alloc(ans_n, sizeof(int));
+  int *lens_y = (int *)R_alloc(ans_n, sizeof(int));
+  for (int i=0; i<ans_n; ++i) /*starts_x[i] = lens_x[i] = */starts_y[i] = lens_y[i] = NA_INTEGER;
+  t_alloc = omp_get_wtime() - t_alloc;
+
+  double t_fjoin = omp_get_wtime();
+  //            *x, nx,         *x_starts,        nx_starts,
+  fjoin(INTEGER(x), nx, INTEGER(x_starts), LENGTH(x_starts),
+        INTEGER(y), ny, INTEGER(y_starts), LENGTH(y_starts),
+        how2, match, &matchn,
+        starts_x, lens_x, starts_y, lens_y);
+  t_fjoin = omp_get_wtime() - t_fjoin;
+
+  double t_ans = omp_get_wtime();
+  SEXP ans = joinOut(matchn, starts_x, lens_x, starts_y, lens_y, x_ord, INTEGER(x_o_idx));
+  t_ans = omp_get_wtime() - t_ans;
+  Rprintf("fjoinR: index %.3fs; sort %.3fs; alloc %.3fs; fjoin %.3fs; ans %.3fs\n", t_index, t_sort, t_alloc, t_fjoin, t_ans);
+  UNPROTECT(protecti);
+  return ans;
 }

From 6df14150729d34f396a14a193759b553ba2d8691 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 6 Jun 2020 12:39:20 +0100
Subject: [PATCH 06/34] cleanup

---
 src/fjoin.c | 524 ++++++++--------------------------------------------
 1 file changed, 74 insertions(+), 450 deletions(-)

diff --git a/src/fjoin.c b/src/fjoin.c
index 6d7b5dff11..47b4f1cf8e 100644
--- a/src/fjoin.c
+++ b/src/fjoin.c
@@ -1,310 +1,11 @@
 #include "data.table.h"
 
 /*
- * as of now these are optimized routines for joining
- * - single integer columns
- * - sorted (experimental) or indexed
- * 
- * TODO:
- * type of join is pushed down here so we compute only what is necessary
- * mult!="all" and "unique-key" branches escape extra computation
- */
-
-typedef enum {
-  inner, left, right, full, semi, anti, cross
-} joinhow;
-
-SEXP joinOut2(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y) {
-  SEXP ans = PROTECT(allocVector(VECSXP, 4)), ansnames;
-  setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 4));
-  SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, 0));
-  SET_STRING_ELT(ansnames, 0, mkChar("starts_x"));
-  //int *starts_x_p = INTEGER(VECTOR_ELT(ans, 0));
-  SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, 0));
-  SET_STRING_ELT(ansnames, 1, mkChar("lens_x"));
-  //int *lens_x_p = INTEGER(VECTOR_ELT(ans, 1));
-  SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, matchn));
-  SET_STRING_ELT(ansnames, 2, mkChar("starts"));
-  int *starts_y_p = INTEGER(VECTOR_ELT(ans, 2));
-  SET_VECTOR_ELT(ans, 3, allocVector(INTSXP, matchn));
-  SET_STRING_ELT(ansnames, 3, mkChar("lens"));
-  int *lens_y_p = INTEGER(VECTOR_ELT(ans, 3));
-  for (int i=0; i<matchn; ++i) {
-    //starts_x_p[i] = starts_x[i];
-    //lens_x_p[i] = lens_x[i];
-    starts_y_p[i] = starts_y[i];
-    lens_y_p[i] = lens_y[i];
-  }
-  UNPROTECT(1);
-  return ans;
-}
-
-/*
- * fast sorted join
- */
-/*
-void sjoin(int *x, int nx, int *y, int ny, joinhow how, bool *match, int *matchi, int *matchstarts, int *matchlens, int *matchn) {
-  int imatch = 0;
-  if (how==left) {
-    for (int i=0, j=0; i<nx;) {
-      if (x[i]==y[j]) {
-        Rprintf("x[%d]==y[%d]\n", i, j);
-        match[i] = true;
-        matchi[imatch++] = i;
-        matchstarts[i] = j;
-        matchlens[i] = 1;
-        int jj = j+1; // rather than j++, so duplicates on x side works
-        while(x[i]==y[jj]) { // multiple matches
-          matchlens[i]++; jj++;
-        }
-        i++; // we could have an outer while instead, then we dont have to recalculate matchlens when dups to dups match 2x2
-        continue;
-      } else if (x[i]<y[j]) {
-        Rprintf("x[%d]<y[%d]\n", i, j);
-        match[i] = false;
-        i++;
-        continue;
-      } else if (x[i]>y[j]) {
-        Rprintf("x[%d]>y[%d]\n", i, j);
-        j++;
-        continue;
-      }
-    }
-  } else {
-    error("not yet implemented");
-  }
-  matchn[0] = imatch;
-}
-SEXP sjoinR(SEXP x, SEXP y) {
-  if (!isInteger(x) || !isInteger(y))
-    error("must be integer");
-  int nx = LENGTH(x), ny = LENGTH(y);
-  //  left:  nx
-  // right:  ny
-  // inner:  min(nx, ny)
-  //  full:  max(nx, ny)
-  double t_alloc = omp_get_wtime();
-  bool *match = (bool *)R_alloc(nx, sizeof(bool)); // not sure if we need this
-  int *matchi = (int *)R_alloc(nx, sizeof(int));
-  int *matchlens = (int *)R_alloc(nx, sizeof(int));
-  int *matchstarts = (int *)R_alloc(nx, sizeof(int));
-  int matchn = 0;
-  t_alloc = omp_get_wtime() - t_alloc;
-  double t_sjoin = omp_get_wtime();
-  sjoin(INTEGER(x), nx, INTEGER(y), ny, left, match, matchi, matchstarts, matchlens, &matchn);
-  t_sjoin = omp_get_wtime() - t_sjoin;
-  Rprintf("alloc took %.3fs; sjoin took %.3fs\n", t_alloc, t_sjoin);
-  return joinOut(matchn, matchi, matchstarts, matchlens);
-}
-*/
-
-/*
- * fast indexed join
  * sort-merge join
  * 
- * split whole 1:nx_starts into numbers of buckets equal to threads
- * each thread do binary merge x of first and last element, and then sort-merge to a subset of y defined by those matches
+ * split whole 1:nx_starts into numbers of buckets equal to threads*2
+ * each thread do binary merge x of first and last element on y, and then sort-merge to a subset of y defined by those matches
  */
-// while loop approach returning results consistent to bmerge
-void ijoin(int *x, int nx, int *x_o, bool x_ord, int *x_starts, int nx_starts,
-           int *y, int ny, int *y_o, bool y_ord, int *y_starts, int ny_starts,
-           joinhow how,
-           bool *match, int *matchn,
-           int *starts_x, int *lens_x,
-           int *starts_y, int *lens_y) {
-  //if (how != left)
-  //  error("only left join implemented so far");
-  bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
-  //int imatch = 0;
-  
-  int *x_lens=0, *y_lens=0;
-  if (!unq_x) {
-    x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395
-    for (int i=0; i<nx_starts-1; ++i) x_lens[i] = x_starts[i+1]-x_starts[i];
-    //Rprintf("nx=%d; x_starts[nx_starts-1]=%d\n", nx, x_starts[nx_starts-1]);
-    x_lens[nx_starts-1] = nx-x_starts[nx_starts-1]+1;
-    //for (int i=0; i<nx_starts; ++i) Rprintf("x_lens[%d]=%d\n", i, x_lens[i]);
-  }
-  if (!unq_y) {
-    y_lens = (int *)R_alloc(ny_starts, sizeof(int));
-    for (int i=0; i<ny_starts-1; ++i) y_lens[i] = y_starts[i+1]-y_starts[i];
-    y_lens[ny_starts-1] = ny-y_starts[ny_starts-1]+1;
-  }
-
-  int i=0, j=0, is=0, js=0, x_i, y_j;
-  if (x_ord && y_ord) {
-    if (how==left && unq_x && unq_y) {
-      while (i<nx && j<ny) {
-        x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          starts_y[i] = j+1; lens_y[i] = 1;
-          i++;
-        } else if (x_i < y_j) i++; else if (x_i > y_j) j++;
-      }
-    } else if (how==left && unq_y) {
-      while (is<nx_starts && j<ny) {
-        i = x_starts[is]-1;
-        x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          int j1 = j+1;
-          for (int ii=0; ii<x_lens[is]; ++ii) {
-            starts_y[i+ii] = j1; lens_y[i+ii] = 1;
-          }
-          is++;
-        } else if (x_i < y_j) is++; else if (x_i > y_j) j++;
-      }
-    } else if (how==left && unq_x) {
-      while (i<nx && js<ny_starts) {
-        j = y_starts[js]-1;
-        x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          starts_y[i] = j+1;
-          lens_y[i] = y_lens[js];
-          i++;
-        } else if (x_i < y_j) i++; else if (x_i > y_j) js++;
-      }
-    } else if (how==left) {
-      while (is<nx_starts && js<ny_starts) {
-        i = x_starts[is]-1, j = y_starts[js]-1;
-        x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          int j1 = j+1;
-          for (int ii=0; ii<x_lens[is]; ++ii) {
-            starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
-          }
-          is++;
-        } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
-      }
-    } else if (how==inner && unq_x && unq_y) {
-      error("dev");
-    } else if (how==inner) {
-      error("not yet implemented");
-    } else if (how==full && unq_x && unq_y) {
-      error("dev");
-    } else if (how==full) {
-      error("not yet implemented");
-    }
-  } else if (x_ord) {
-    if (how==left && unq_x && unq_y) {
-      while (i<nx && j<ny) {
-        x_i = x[i], y_j = y[y_o[j]-1];
-        if (x_i == y_j) {
-          starts_y[i] = j+1; lens_y[i] = 1;
-          i++;
-        } else if (x_i < y_j) i++; else if (x_i > y_j) j++;
-      }
-    } else if (how==left && unq_y) {
-      while (is<nx_starts && j<ny) {
-        i = x_starts[is]-1;
-        x_i = x[i], y_j = y[y_o[j]-1];
-        //Rprintf("x[x_starts[%d]-1]=%d; y[y_o[%d]-1]=%d\n", is, x[i], j, y[y_o[j]-1]);
-        if (x_i == y_j) {
-          int j1 = j+1;
-          for (int ii=0; ii<x_lens[is]; ++ii) {
-            //Rprintf("starts_y[%d+%d]=%d\n", i, ii, j1);
-            starts_y[i+ii] = j1; lens_y[i+ii] = 1;
-          }
-          is++;
-        } else if (x_i < y_j) is++; else if (x_i > y_j) j++;
-      }
-    } else if (how==left && unq_x) {
-      while (i<nx && js<ny_starts) {
-        j = y_starts[js]-1;
-        x_i = x[i], y_j = y[y_o[j]-1];
-        if (x_i == y_j) {
-          starts_y[i] = j+1;
-          lens_y[i] = y_lens[js];
-          i++;
-        } else if (x_i < y_j) i++; else if (x_i > y_j) js++;
-      }
-    } else if (how==left) {
-      while (is<nx_starts && js<ny_starts) {
-        i = x_starts[is]-1, j = y_starts[js]-1;
-        x_i = x[i], y_j = y[y_o[j]-1];
-        if (x_i == y_j) {
-          int j1 = j+1;
-          for (int ii=0; ii<x_lens[is]; ++ii) {
-            starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
-          }
-          is++;
-        } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
-      }
-    }
-  } else if (y_ord) {
-    error("y_ord not yet implemented");
-  } else {
-    if (how==left && unq_x && unq_y) {
-      while (i<nx && j<ny) {
-        x_i = x[x_o[i]-1], y_j = y[y_o[j]-1];
-        if (x_i == y_j) {
-          starts_y[x_o[i]-1] = j+1; lens_y[x_o[i]-1] = 1;
-          i++;
-        } else if (x_i < y_j) i++; else if (x_i > y_j) j++;
-      }
-    } else {
-      error("dev: !x_ord && !y_ord && (!unq_x || !unq_y)");
-    }
-    //Rprintf("x_i=%d; y[y_o[j]-1]= y[y_o[%d]-1]= y[%d-1]= y[%d]= %d\n", x[i], j, y_o[j], y_o[j]-1, y[y_o[j]-1]);
-  }
-  matchn[0] = nx;
-}
-
-SEXP ijoinR(SEXP x, SEXP y, SEXP how) {
-  if (!isInteger(x) || !isInteger(y))
-    error("must be integer");
-  int nx = LENGTH(x), ny = LENGTH(y);
-  double t_index = omp_get_wtime();
-  SEXP x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE)));
-  SEXP y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE)));
-  SEXP x_starts = getAttrib(x_idx, sym_starts);
-  SEXP y_starts = getAttrib(y_idx, sym_starts);
-  t_index = omp_get_wtime() - t_index;
-  double t_alloc = omp_get_wtime();
-  int ans_n = 0;
-  joinhow how2;
-  //  left:  nx
-  // right:  ny
-  // inner:  min(nx, ny)
-  //  full:  sum(nx, ny)
-  if (STRING_ELT(how, 0)==mkChar("left")) {
-    ans_n = nx; how2=left;
-  } else if (STRING_ELT(how, 0)==mkChar("inner")) {
-    ans_n = MIN(nx, ny); how2=inner;
-  } else if (STRING_ELT(how, 0)==mkChar("full")) {
-    ans_n = nx+ny; how2=full;
-  } else error("other how not yet implemented");
-  bool *match = (bool *)R_alloc(0, sizeof(bool)); // not sure if we need this
-  int matchn = 0;
-  int *starts_x = (int *)R_alloc(0, sizeof(int));
-  int *lens_x = (int *)R_alloc(0, sizeof(int));
-  int *starts_y = (int *)R_alloc(ans_n, sizeof(int));
-  int *lens_y = (int *)R_alloc(ans_n, sizeof(int));
-  for (int i=0; i<ans_n; ++i) /*starts_x[i] = lens_x[i] = */starts_y[i] = lens_y[i] = NA_INTEGER;
-  t_alloc = omp_get_wtime() - t_alloc;
-  double t_ijoin = omp_get_wtime();
-  //            *x, nx,           *x_o,            x_ord,         *x_starts,        nx_starts,
-  ijoin(INTEGER(x), nx, INTEGER(x_idx), LENGTH(x_idx)==0, INTEGER(x_starts), LENGTH(x_starts),
-        INTEGER(y), ny, INTEGER(y_idx), LENGTH(y_idx)==0, INTEGER(y_starts), LENGTH(y_starts),
-        how2, match, &matchn,
-        starts_x, lens_x, starts_y, lens_y);
-  t_ijoin = omp_get_wtime() - t_ijoin;
-  Rprintf("index took %.3fs; alloc took %.3fs; ijoin took %.3fs\n", t_index, t_alloc, t_ijoin);
-  UNPROTECT(2);
-  return joinOut2(matchn, starts_x, lens_x, starts_y, lens_y);
-}
-
-
-
-/*
- * openmp friendly
- * get range of x and y
- * take pmin of it
- * 
- */
-/*static inline int imin(int a, int b) { return a < b ? a : b; }
-static inline int imax(int a, int b) { return a > b ? a : b; }
-static inline int min_idx(int *x, int x_idx, int *y, int y_idx) { return x[x_idx] < y[y_idx] ? x_idx : y_idx; }
-static inline int max_idx(int *x, int x_idx, int *y, int y_idx) { return x[x_idx] > y[y_idx] ? x_idx : y_idx; }*/
 static int max_i_match(int *y, int ny, int *y_o, bool y_ord, int val) { // those need to be binary search obviously
   for (int i=0; i<ny; ++i) {
     //Rprintf("i=%d: y[y_o[i]] > val: %d > %d\n", i, y[y_o[i]-1], val);
@@ -325,6 +26,16 @@ static int min_i_match(int *y, int ny, int *y_o, bool y_ord, int val) {
   error("not yet handled in min_i_match");
 }
 
+int uniqueN(int *x, int nx) { // x have 0:(nx-1) elements
+  int ans=0;
+  uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t));
+  memset(seen, 0, nx*sizeof(uint8_t));
+  for (int i=0; i<nx; ++i) {
+    if(!seen[x[i]]++) ans++;
+  }
+  return ans;
+}
+
 void grpLens(int *starts, int n_starts, int n, int *lens) { // #4395
   for (int i=0; i<n_starts-1; ++i)
     lens[i] = starts[i+1]-starts[i];
@@ -332,151 +43,74 @@ void grpLens(int *starts, int n_starts, int n, int *lens) { // #4395
   return;
 }
 
-void fjoin(int *x, int nx, int *x_starts, int nx_starts,
+void fjoinC(int *x, int nx, int *x_starts, int nx_starts,
            int *y, int ny, int *y_starts, int ny_starts,
-           joinhow how,
+           //joinhow how,
            bool *match, int *matchn,
            int *starts_x, int *lens_x,
            int *starts_y, int *lens_y) {
-  //if (how != left)
-  //  error("only left join implemented so far");
   bool x_ord=true, y_ord=true;
   int *x_o=0, *y_o=0;
   
   bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
   int *x_lens=0, *y_lens=0;
   double t_lens = omp_get_wtime();
-  if (!unq_x) {
+  if (!unq_x || true) {
     x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395
     grpLens(x_starts, nx_starts, nx, x_lens);
+    //for (int i=0; i<nx_starts; ++i) Rprintf("x_lens[%d]=%d\n", i, x_lens[i]);
   }
-  if (!unq_y) {
+  if (!unq_y || true) {
     y_lens = (int *)R_alloc(ny_starts, sizeof(int));
     grpLens(y_starts, ny_starts, ny, y_lens);
   }
   t_lens = omp_get_wtime() - t_lens;
   
-  double t_batch=0, t_join=0;
-  int i=0, j=0, is=0, js=0, x_i, y_j;
-  if (x_ord && y_ord) {
-    if (how==left && unq_x && unq_y) {
-      error("dev");
-      while (i<nx && j<ny) {
-        x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          starts_y[i] = j+1; lens_y[i] = 1;
-          i++;
-        } else if (x_i < y_j) i++; else if (x_i > y_j) j++;
-      }
-    } else if (how==left && unq_y) {
-      error("dev");
-      while (is<nx_starts && j<ny) {
-        i = x_starts[is]-1;
-        x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          int j1 = j+1;
-          for (int ii=0; ii<x_lens[is]; ++ii) {
-            starts_y[i+ii] = j1; lens_y[i+ii] = 1;
-          }
-          is++;
-        } else if (x_i < y_j) is++; else if (x_i > y_j) j++;
-      }
-    } else if (how==left && unq_x) {
-      error("dev");
-      while (i<nx && js<ny_starts) {
-        j = y_starts[js]-1;
-        x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          starts_y[i] = j+1;
-          lens_y[i] = y_lens[js];
-          i++;
-        } else if (x_i < y_j) i++; else if (x_i > y_j) js++;
-      }
-    } else if (how==left) {
-      double t_batch = omp_get_wtime();
-      //int min_i = 0, max_i = x_starts[nx_starts-1]-1;
-      //Rprintf("x[i]=[%d:%d]\n", min_i, max_i);
-      //int nth = getDTthreads();
-      int nBatch=1;
-      //int b1_min_i = min_i, b1_max_i = max_i;
-      //int b1_min_i = min_i, b1_max_i = x_starts[nx_starts/2]-1;
-      //int b2_min_i = x_starts[nx_starts/2+1]-1, b2_max_i = max_i;
-      //Rprintf("x: 1[%d-%d]\n", b1_min_i, b1_max_i);
-      //Rprintf("x: 2[%d-%d]\n", b2_min_i, b2_max_i);
-      t_batch = omp_get_wtime() - t_batch;
-      
-      //int min_y = y[y_starts[0]-1];
-      //int max_y = y[y_starts[nx_starts-1]-1];
-      //Rprintf("min_x=%d; max_x=%d; min_y=%d, max_y=%d\n", min_x, max_x, min_y, max_y);
-      int *th = (int *)R_alloc(nBatch, sizeof(int)); // debug thread utilization
-      //bool skip=false;
-      t_join = omp_get_wtime();
-      //#pragma omp parallel for schedule(dynamic) num_threads(getDTthreads())
-      for (int b=0; b<nBatch; ++b) {
-        int i=0, j=0, is=0, js=0, x_i, y_j;
-        while (is<nx_starts && js<ny_starts) {
-          //Rprintf("============= is=%d\n", is);
-          i = x_starts[is]-1, j = y_starts[js]-1;
-          x_i = x[i], y_j = y[j];
-          if (x_i == y_j) {
-            //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
-            int j1 = j+1;
-            for (int ii=0; ii<x_lens[is]; ++ii) {
-              //Rprintf("starts_y[%d]=%d\n", i+ii, j1);
-              starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
-            }
-            is++;
-          } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
+  double t_batch = omp_get_wtime();
+  //int min_i = 0, max_i = x_starts[nx_starts-1]-1;
+  //Rprintf("x[i]=[%d:%d]\n", min_i, max_i);
+  //int nth = getDTthreads();
+  //int nBatch=nth*2;
+  int nBatch=1;
+  //int b1_min_i = min_i, b1_max_i = max_i;
+  //int b1_min_i = min_i, b1_max_i = x_starts[nx_starts/2]-1;
+  //int b2_min_i = x_starts[nx_starts/2+1]-1, b2_max_i = max_i;
+  //Rprintf("x: 1[%d-%d]\n", b1_min_i, b1_max_i);
+  //Rprintf("x: 2[%d-%d]\n", b2_min_i, b2_max_i);
+  t_batch = omp_get_wtime() - t_batch;
+  
+  //int min_y = y[y_starts[0]-1];
+  //int max_y = y[y_starts[nx_starts-1]-1];
+  //Rprintf("min_x=%d; max_x=%d; min_y=%d, max_y=%d\n", min_x, max_x, min_y, max_y);
+  int *th = (int *)R_alloc(nBatch, sizeof(int)); // debug thread utilization
+  //bool skip=false;
+  double t_join = omp_get_wtime();
+  //#pragma omp parallel for schedule(dynamic) num_threads(getDTthreads())
+  for (int b=0; b<nBatch; ++b) {
+    //int *xs, *ys, nxs, nys;
+    //xs = x_starts + 1*sizeof(int);
+    // fjoin1(/*local in*/xs, nxs, ys, nys, /*global in*/x, x_lens, /*out*/start_y, lens_y);
+    int i=0, j=0, is=0, js=0, x_i, y_j;
+    while (is<nx_starts && js<ny_starts) {
+      //Rprintf("============= is=%d\n", is);
+      i = x_starts[is]-1, j = y_starts[js]-1;
+      x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
+        int j1 = j+1;
+        for (int ii=0; ii<x_lens[is]; ++ii) {
+          //Rprintf("starts_y[%d]=%d\n", i+ii, j1);
+          starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
         }
-        /*for (is=0; is<nx_starts; ++is) {
-          Rprintf("============= is=%s\n", is);
-          if (skip) {Rprintf("skip is=%s\n", is); continue;} // we need this to stop on !js<ny_starts
-          i = x_starts[is]-1, j = y_starts[js]-1;
-          x_i = x[i], y_j = y[j];
-          if (x_i == y_j) {
-            Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
-            int j1 = j+1;
-            for (int ii=0; ii<x_lens[is]; ++ii) {
-              Rprintf("starts_y[%d]=%d\n", i+ii, j1);
-              starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
-            }
-            //is++;
-          }
-          else if (x_i < y_j) ;//is++;
-          else if (x_i > y_j) {
-            while(x[x_starts[is+1]-1] < )
-            if (++js>=ny_starts) {skip=true; continue}
-          }
-        }*/
-        th[b] = omp_get_thread_num();
-      }
-      t_join = omp_get_wtime() - t_join;
-      for (int b=0; b<nBatch; ++b) Rprintf("th(batch[%d])=%d\n", b, th[b]);
-      
-      /*while (is<nx_starts && js<ny_starts) {
-        i = x_starts[is]-1, j = y_starts[js]-1;
-        x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          int j1 = j+1;
-          for (int ii=0; ii<x_lens[is]; ++ii) {
-            starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
-          }
-          is++;
-        } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
-      }*/
-    } else if (how==inner && unq_x && unq_y) {
-      error("dev");
-    } else if (how==inner) {
-      error("not yet implemented");
-    } else if (how==full && unq_x && unq_y) {
-      error("dev");
-    } else if (how==full) {
-      /*
-       * 
-       */
-      error("not yet implemented");
+        is++;
+      } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
     }
-  } else {
+    th[b] = omp_get_thread_num();
+  }
+  t_join = omp_get_wtime() - t_join;
+  //for (int b=0; b<nBatch; ++b) Rprintf("th(batch[%d])=%d\n", b, th[b]);
+  
+  if (false) {
     error("input must be ordered!");
     if (false) {
       // find bucket ranges
@@ -542,7 +176,7 @@ void fjoin(int *x, int nx, int *x_starts, int nx_starts,
     }
     //Rprintf("x_i=%d; y[y_o[j]-1]= y[y_o[%d]-1]= y[%d-1]= y[%d]= %d\n", x[i], j, y_o[j], y_o[j]-1, y[y_o[j]-1]);
   }
-  Rprintf("fjoin: lens took %.3fs; batch took %.3fs; join took %.3fs\n", t_lens, t_batch, t_join);
+  Rprintf("fjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %d th\n", t_lens, t_batch, t_join, uniqueN(th, nBatch));
   matchn[0] = nx;
 }
 
@@ -591,7 +225,6 @@ SEXP joinOut(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y,
   return ans;
 }
 
-
 SEXP fjoinR(SEXP x, SEXP y, SEXP how) {
 
   if (!isInteger(x) || !isInteger(y))
@@ -610,29 +243,15 @@ SEXP fjoinR(SEXP x, SEXP y, SEXP how) {
   double t_sort = omp_get_wtime();
   if (!x_ord) {
     x = PROTECT(sortInt(x, x_idx)); protecti++;
-    //x_ord = true;
-    x_o_idx = PROTECT(forder(x_idx, R_NilValue, ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
+    x_o_idx = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
   }
   if (!y_ord) {
     y = PROTECT(sortInt(y, y_idx)); protecti++;
-    //y_ord = true;
   }
   t_sort = omp_get_wtime() - t_sort;
 
   double t_alloc = omp_get_wtime();
-  int ans_n = 0;
-  joinhow how2;
-  //  left:  nx
-  // right:  ny
-  // inner:  min(nx, ny)
-  //  full:  sum(nx, ny)
-  if (STRING_ELT(how, 0)==mkChar("left")) {
-    ans_n = nx; how2=left;
-  } else if (STRING_ELT(how, 0)==mkChar("inner")) {
-    ans_n = MIN(nx, ny); how2=inner;
-  } else if (STRING_ELT(how, 0)==mkChar("full")) {
-    ans_n = nx+ny; how2=full;
-  } else error("other how not yet implemented");
+  int ans_n = nx;
   bool *match = (bool *)R_alloc(0, sizeof(bool)); // not sure if we need this
   int matchn = 0;
   int *starts_x = (int *)R_alloc(0, sizeof(int));
@@ -644,16 +263,21 @@ SEXP fjoinR(SEXP x, SEXP y, SEXP how) {
 
   double t_fjoin = omp_get_wtime();
   //            *x, nx,         *x_starts,        nx_starts,
-  fjoin(INTEGER(x), nx, INTEGER(x_starts), LENGTH(x_starts),
-        INTEGER(y), ny, INTEGER(y_starts), LENGTH(y_starts),
-        how2, match, &matchn,
-        starts_x, lens_x, starts_y, lens_y);
+  fjoinC(INTEGER(x), nx, INTEGER(x_starts), LENGTH(x_starts),
+         INTEGER(y), ny, INTEGER(y_starts), LENGTH(y_starts),
+         match, &matchn,
+         starts_x, lens_x, starts_y, lens_y);
   t_fjoin = omp_get_wtime() - t_fjoin;
 
   double t_ans = omp_get_wtime();
-  SEXP ans = joinOut(matchn, starts_x, lens_x, starts_y, lens_y, x_ord, INTEGER(x_o_idx));
+  SEXP ans = R_NilValue;
+  if (!x_ord) {
+    ans = joinOut(matchn, starts_x, lens_x, starts_y, lens_y, x_ord, INTEGER(x_o_idx));
+  } else {
+    ans = joinOut(matchn, starts_x, lens_x, starts_y, lens_y, x_ord, /*anything*/starts_x);
+  }
   t_ans = omp_get_wtime() - t_ans;
-  Rprintf("fjoinR: index %.3fs; sort %.3fs; alloc %.3fs; fjoin %.3fs; ans %.3fs\n", t_index, t_sort, t_alloc, t_fjoin, t_ans);
+  Rprintf("fjoinR: index %.3fs; sort %.3fs; alloc %.3fs; fjoinC %.3fs; ans %.3fs\n", t_index, t_sort, t_alloc, t_fjoin, t_ans);
   UNPROTECT(protecti);
   return ans;
 }

From c13113a0f3904b84d88740be865fa5065986c0af Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 6 Jun 2020 13:24:09 +0100
Subject: [PATCH 07/34] prepare for batching

---
 src/fjoin.c | 47 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 15 deletions(-)

diff --git a/src/fjoin.c b/src/fjoin.c
index 47b4f1cf8e..6971f7de2d 100644
--- a/src/fjoin.c
+++ b/src/fjoin.c
@@ -26,12 +26,13 @@ static int min_i_match(int *y, int ny, int *y_o, bool y_ord, int val) {
   error("not yet handled in min_i_match");
 }
 
-int uniqueN(int *x, int nx) { // x have 0:(nx-1) elements
-  int ans=0;
+// helper to count how many threads were used
+int unqN(int *x, int nx) { // x have 0:(nx-1) values
+  int ans = 0;
   uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t));
   memset(seen, 0, nx*sizeof(uint8_t));
   for (int i=0; i<nx; ++i) {
-    if(!seen[x[i]]++) ans++;
+    if (!seen[x[i]]++) ans++;
   }
   return ans;
 }
@@ -77,23 +78,25 @@ void fjoinC(int *x, int nx, int *x_starts, int nx_starts,
   //int b2_min_i = x_starts[nx_starts/2+1]-1, b2_max_i = max_i;
   //Rprintf("x: 1[%d-%d]\n", b1_min_i, b1_max_i);
   //Rprintf("x: 2[%d-%d]\n", b2_min_i, b2_max_i);
+  
+  
+  int *xs, *ys, nxs, nys;
+  xs = x_starts + 0*sizeof(int);
+  nxs = nx_starts;
+  ys = y_starts + 0*sizeof(int);
+  nys = ny_starts;
+  
+  int *th = (int *)R_alloc(nBatch, sizeof(int)); // report number of used threads
   t_batch = omp_get_wtime() - t_batch;
   
-  //int min_y = y[y_starts[0]-1];
-  //int max_y = y[y_starts[nx_starts-1]-1];
-  //Rprintf("min_x=%d; max_x=%d; min_y=%d, max_y=%d\n", min_x, max_x, min_y, max_y);
-  int *th = (int *)R_alloc(nBatch, sizeof(int)); // debug thread utilization
-  //bool skip=false;
   double t_join = omp_get_wtime();
-  //#pragma omp parallel for schedule(dynamic) num_threads(getDTthreads())
+  //#pragma omp parallel for schedule(dynamic) num_threads(nth)
   for (int b=0; b<nBatch; ++b) {
-    //int *xs, *ys, nxs, nys;
-    //xs = x_starts + 1*sizeof(int);
-    // fjoin1(/*local in*/xs, nxs, ys, nys, /*global in*/x, x_lens, /*out*/start_y, lens_y);
+    // fjoin1(/*local in*/xs, nxs, ys, nys, /*global in*/x, x_lens, y, y_lens, /*out*/start_y, lens_y);
     int i=0, j=0, is=0, js=0, x_i, y_j;
-    while (is<nx_starts && js<ny_starts) {
+    while (is<nxs && js<nys) {
       //Rprintf("============= is=%d\n", is);
-      i = x_starts[is]-1, j = y_starts[js]-1;
+      i = xs[is]-1, j = ys[js]-1;
       x_i = x[i], y_j = y[j];
       if (x_i == y_j) {
         //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
@@ -105,6 +108,20 @@ void fjoinC(int *x, int nx, int *x_starts, int nx_starts,
         is++;
       } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
     }
+    /*while (is<nx_starts && js<ny_starts) {
+      //Rprintf("============= is=%d\n", is);
+      i = x_starts[is]-1, j = y_starts[js]-1;
+      x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
+        int j1 = j+1;
+        for (int ii=0; ii<x_lens[is]; ++ii) {
+          //Rprintf("starts_y[%d]=%d\n", i+ii, j1);
+          starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
+        }
+        is++;
+      } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
+    }*/
     th[b] = omp_get_thread_num();
   }
   t_join = omp_get_wtime() - t_join;
@@ -176,7 +193,7 @@ void fjoinC(int *x, int nx, int *x_starts, int nx_starts,
     }
     //Rprintf("x_i=%d; y[y_o[j]-1]= y[y_o[%d]-1]= y[%d-1]= y[%d]= %d\n", x[i], j, y_o[j], y_o[j]-1, y[y_o[j]-1]);
   }
-  Rprintf("fjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %d th\n", t_lens, t_batch, t_join, uniqueN(th, nBatch));
+  Rprintf("fjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %d th\n", t_lens, t_batch, t_join, unqN(th, nBatch));
   matchn[0] = nx;
 }
 

From ffb521aec9079ae12e4ce9621f9ccc336a5ed9d6 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 6 Jun 2020 19:02:11 +0100
Subject: [PATCH 08/34] finally parallel

---
 inst/tests/fjoin.Rraw |  36 ++-----
 src/fjoin.c           | 212 ++++++++++++++++--------------------------
 2 files changed, 90 insertions(+), 158 deletions(-)

diff --git a/inst/tests/fjoin.Rraw b/inst/tests/fjoin.Rraw
index cfdf0d0beb..1537cdd988 100644
--- a/inst/tests/fjoin.Rraw
+++ b/inst/tests/fjoin.Rraw
@@ -10,22 +10,6 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   bmerge = data.table:::bmerge
   forderv = data.table:::forderv
 }
-check = function(x, y) {
-  a = d(x, y)
-  b = fjoin(x, y)
-  na = is.na(a[["starts"]])
-  fail = !identical(a[["starts"]][!na], b[["starts_y"]][!na])
-  fail = fail || !identical(a[["lens"]][!na], b[["lens_y"]][!na])
-  if (fail) {
-    if (interactive() && length(x) < 20L) {
-      out = capture.output(print(list(d=a, fj=b)))
-      stop("starts are not equal: ", out)
-    }
-    stop("starts are not equal")
-  }
-  invisible(TRUE)
-}
-
 
 d = function(x, y) bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)[1:2]
 
@@ -74,7 +58,7 @@ ssa = function(unq_n, size, sort=FALSE) {
   if (unq_n==size) stopifnot(uniqueN(ans)==length(ans))
   if (sort) sort(ans) else ans
 }
-N = 1e2
+N = 1e2L
 x = ssa(N, N, sort=TRUE)
 y = ssa(N, N, sort=TRUE)
 test(11.01, fjoin(x, y), d(x, y))
@@ -106,12 +90,12 @@ test(12.04, fjoin(x, y), d(x, y))
 x = ssa(N, N)
 y = ssa(N, N)
 test(14.01, fjoin(x, y), d(x, y))
-#x = ssa(N, N*1.1)
-#y = ssa(N, N)
-#test(14.02, fjoin(x, y), d(x, y))
-#x = ssa(N, N)
-#y = ssa(N, N*1.1)
-#test(14.03, fjoin(x, y), d(x, y))
-#x = ssa(N, N*1.1)
-#y = ssa(N, N*1.1)
-#test(14.04, fjoin(x, y), d(x, y))
+x = ssa(N, N*1.1)
+y = ssa(N, N)
+test(14.02, fjoin(x, y), d(x, y))
+x = ssa(N, N)
+y = ssa(N, N*1.1)
+test(14.03, fjoin(x, y), d(x, y))
+x = ssa(N, N*1.1)
+y = ssa(N, N*1.1)
+test(14.04, fjoin(x, y), d(x, y))
diff --git a/src/fjoin.c b/src/fjoin.c
index 6971f7de2d..1b3627259f 100644
--- a/src/fjoin.c
+++ b/src/fjoin.c
@@ -6,24 +6,23 @@
  * split whole 1:nx_starts into numbers of buckets equal to threads*2
  * each thread do binary merge x of first and last element on y, and then sort-merge to a subset of y defined by those matches
  */
-static int max_i_match(int *y, int ny, int *y_o, bool y_ord, int val) { // those need to be binary search obviously
-  for (int i=0; i<ny; ++i) {
-    //Rprintf("i=%d: y[y_o[i]] > val: %d > %d\n", i, y[y_o[i]-1], val);
-    if (y[y_o[i]-1]>val) {
-      if (i==0) error("i=0 max_i_match, unhandled");
-      return(i-1);
-    }
+static int max_i_match(int *y, int *y_starts, int ny_starts, int val) { // those need to be binary search obviously
+  int i;
+  for (i=0; i<ny_starts; ++i) {
+    //Rprintf("max_i_match: y[y_starts[%d]-1] >= val: %d >= %d\n", i, y[y_starts[i]-1], val);
+    if (y[y_starts[i]-1]==val) return(i);
+    else if (y[y_starts[i]-1]>val) return(i-1);
   }
-  error("not yet handled in max_i_match");
+  return(i-1);
 }
-static int min_i_match(int *y, int ny, int *y_o, bool y_ord, int val) {
-  for (int i=ny-1; i<ny; --i) {
-    if (y[y_o[i]-1]<val) {
-      if (i==ny-1) error("i=ny-1 min_i_match, unhandled");
-      return(i+1);
-    }
+static int min_i_match(int *y, int *y_starts, int ny_starts, int val) {
+  int i;
+  for (i=ny_starts-1; i>=0; --i) {
+    //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val);
+    if (y[y_starts[i]-1]==val) return(i);
+    else if (y[y_starts[i]-1]<val) return(i+1);
   }
-  error("not yet handled in min_i_match");
+  return(i+1);
 }
 
 // helper to count how many threads were used
@@ -37,6 +36,7 @@ int unqN(int *x, int nx) { // x have 0:(nx-1) values
   return ans;
 }
 
+// materialize counts of duplicated entries
 void grpLens(int *starts, int n_starts, int n, int *lens) { // #4395
   for (int i=0; i<n_starts-1; ++i)
     lens[i] = starts[i+1]-starts[i];
@@ -44,19 +44,37 @@ void grpLens(int *starts, int n_starts, int n, int *lens) { // #4395
   return;
 }
 
+// workhorse join that runs in parallel
+void fjoin1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, int *x, int *y, int *starts_y, int *lens_y) {
+  int i=0, j=0, is=0, js=0, x_i, y_j;
+  while (is<nxs && js<nys) {
+    //Rprintf("============= is=%d\n", is);
+    i = xs[is]-1, j = ys[js]-1;
+    x_i = x[i], y_j = y[j];
+    if (x_i == y_j) {
+      //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
+      int j1 = j+1;
+      //Rprintf("x_lens[is]=x_lens[%d]=%d\n",is,xl[is]);
+      for (int ii=0; ii<xl[is]; ++ii) {
+        //Rprintf("starts_y[%d]=%d\n", i+ii, j1);
+        starts_y[i+ii] = j1; lens_y[i+ii] = yl[js];
+      }
+      is++;
+    } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
+  }
+  return;
+}
+
 void fjoinC(int *x, int nx, int *x_starts, int nx_starts,
            int *y, int ny, int *y_starts, int ny_starts,
            //joinhow how,
            bool *match, int *matchn,
            int *starts_x, int *lens_x,
            int *starts_y, int *lens_y) {
-  bool x_ord=true, y_ord=true;
-  int *x_o=0, *y_o=0;
-  
   bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
   int *x_lens=0, *y_lens=0;
   double t_lens = omp_get_wtime();
-  if (!unq_x || true) {
+  if (!unq_x || true) { // else true as we dont yet have branches for unq cases
     x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395
     grpLens(x_starts, nx_starts, nx, x_lens);
     //for (int i=0; i<nx_starts; ++i) Rprintf("x_lens[%d]=%d\n", i, x_lens[i]);
@@ -68,132 +86,62 @@ void fjoinC(int *x, int nx, int *x_starts, int nx_starts,
   t_lens = omp_get_wtime() - t_lens;
   
   double t_batch = omp_get_wtime();
-  //int min_i = 0, max_i = x_starts[nx_starts-1]-1;
-  //Rprintf("x[i]=[%d:%d]\n", min_i, max_i);
-  //int nth = getDTthreads();
+  /*
+  int min_i = 0, max_i = x_starts[nx_starts-1]-1;
+  Rprintf("x[i]=[%d:%d]\n", min_i, max_i);
+  int b1_min_i = min_i, b1_max_i = max_i;
+  int b1_min_i = min_i, b1_max_i = x_starts[nx_starts/2]-1;
+  int b2_min_i = x_starts[nx_starts/2+1]-1, b2_max_i = max_i;
+  Rprintf("x: 1[%d-%d]\n", b1_min_i, b1_max_i);
+  Rprintf("x: 2[%d-%d]\n", b2_min_i, b2_max_i);
+  */
+  int nth = 2;//getDTthreads();
   //int nBatch=nth*2;
-  int nBatch=1;
-  //int b1_min_i = min_i, b1_max_i = max_i;
-  //int b1_min_i = min_i, b1_max_i = x_starts[nx_starts/2]-1;
-  //int b2_min_i = x_starts[nx_starts/2+1]-1, b2_max_i = max_i;
-  //Rprintf("x: 1[%d-%d]\n", b1_min_i, b1_max_i);
-  //Rprintf("x: 2[%d-%d]\n", b2_min_i, b2_max_i);
+  int nBatch=2;
   
-  
-  int *xs, *ys, nxs, nys;
+  int *xs, *ys, nxs, nys, *xl, *yl;
+  /*
   xs = x_starts + 0*sizeof(int);
   nxs = nx_starts;
   ys = y_starts + 0*sizeof(int);
   nys = ny_starts;
-  
-  int *th = (int *)R_alloc(nBatch, sizeof(int)); // report number of used threads
+   */
+  int b1_nxs = nx_starts/2, b2_nxs = nx_starts-b1_nxs;
+  int *b1_xs = x_starts, *b2_xs = x_starts + b1_nxs;
+  int *b1_xl = x_lens, *b2_xl = x_lens + b1_nxs;
+  //Rprintf("batch sizes nx: b1=%d; b2=%d\nidx of first in batch: x_starts[0]: b1=%d; b2=%d\nfirst value in batch: x[x_starts[0]-1]: b1=%d; b2=%d\n\n",
+  //                      b1_nxs, b2_nxs,                                 b1_xs[0], b2_xs[0],                         x[b1_xs[0]-1], x[b2_xs[0]-1]);
+  //Rprintf("finding bounds of b1 y starts indices\n\n");
+  int b1_ylow = min_i_match(y, y_starts, ny_starts, x[b1_xs[0]-1]);
+  int b1_yupp = max_i_match(y, y_starts, ny_starts, x[b1_xs[b1_nxs-1]-1]);
+  //Rprintf("\nb1_ylow=%d; b1_yupp=%d\ny[y_starts[b1_ylow]-1]=%d; y[ny_starts[b1_yupp]-1]=%d\n", b1_ylow, b1_yupp, y[y_starts[b1_ylow]-1], y[y_starts[b1_yupp]-1]);
+  int b2_ylow = min_i_match(y, y_starts, ny_starts, x[b2_xs[0]-1]);
+  int b2_yupp = max_i_match(y, y_starts, ny_starts, x[b2_xs[b2_nxs-1]-1]);
+  //Rprintf("\nb2_ylow=%d; b2_yupp=%d\ny[y_starts[b2_ylow]-1]=%d; y[ny_starts[b2_yupp]-1]=%d\n", b2_ylow, b2_yupp, y[y_starts[b2_ylow]-1], y[y_starts[b2_yupp]-1]);
+  int b1_nys = b1_yupp-b1_ylow+1, b2_nys = b2_yupp-b2_ylow+1;
+  int *b1_ys = y_starts, *b2_ys = y_starts + b1_nys;
+  int *b1_yl = y_lens, *b2_yl = y_lens + b1_nys;
+  //Rprintf("batch sizes ny: b1=%d; b2=%d\nidx of first in batch: y_starts[0]: b1=%d; b2=%d\nfirst value in batch: y[y_starts[0]-1]: b1=%d; b2=%d\n\n",
+  //        b1_nys, b2_nys,                                 b1_ys[0], b2_ys[0],                         y[b1_ys[0]-1], y[b2_ys[0]-1]);
+  int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used
   t_batch = omp_get_wtime() - t_batch;
-  
   double t_join = omp_get_wtime();
-  //#pragma omp parallel for schedule(dynamic) num_threads(nth)
+  #pragma omp parallel for schedule(dynamic) num_threads(nth)
   for (int b=0; b<nBatch; ++b) {
-    // fjoin1(/*local in*/xs, nxs, ys, nys, /*global in*/x, x_lens, y, y_lens, /*out*/start_y, lens_y);
-    int i=0, j=0, is=0, js=0, x_i, y_j;
-    while (is<nxs && js<nys) {
-      //Rprintf("============= is=%d\n", is);
-      i = xs[is]-1, j = ys[js]-1;
-      x_i = x[i], y_j = y[j];
-      if (x_i == y_j) {
-        //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
-        int j1 = j+1;
-        for (int ii=0; ii<x_lens[is]; ++ii) {
-          //Rprintf("starts_y[%d]=%d\n", i+ii, j1);
-          starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
-        }
-        is++;
-      } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
+    if (b==0) {
+      xs = b1_xs; nxs = b1_nxs; ys = b1_ys; nys = b1_nys, xl = b1_xl, yl = b1_yl;
+    } else if (b==1) {
+      xs = b2_xs; nxs = b2_nxs; ys = b2_ys; nys = b2_nys, xl = b2_xl, yl = b2_yl;
     }
-    /*while (is<nx_starts && js<ny_starts) {
-      //Rprintf("============= is=%d\n", is);
-      i = x_starts[is]-1, j = y_starts[js]-1;
-      x_i = x[i], y_j = y[j];
-      if (x_i == y_j) {
-        //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
-        int j1 = j+1;
-        for (int ii=0; ii<x_lens[is]; ++ii) {
-          //Rprintf("starts_y[%d]=%d\n", i+ii, j1);
-          starts_y[i+ii] = j1; lens_y[i+ii] = y_lens[js];
-        }
-        is++;
-      } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
-    }*/
+    fjoin1(xs, xl, nxs, ys, yl, nys, /*batch specific input: shifted pointers and size*/
+           x, y,                     /*common input*/
+           starts_y, lens_y);        /*common output*/
     th[b] = omp_get_thread_num();
   }
+  for (int b=0; b<nBatch; ++b) Rprintf("th[%d]=%d\n", th[b]);
   t_join = omp_get_wtime() - t_join;
-  //for (int b=0; b<nBatch; ++b) Rprintf("th(batch[%d])=%d\n", b, th[b]);
-  
-  if (false) {
-    error("input must be ordered!");
-    if (false) {
-      // find bucket ranges
-      int min_x_i = x_o[0]-1, max_x_i = x_o[nx-1]-1;
-      int min_y_i = y_o[0]-1, max_y_i = y_o[ny-1]-1;
-      //int min_x = x[min_x_i], max_x = x[max_x_i];
-      //int min_y = y[min_y_i], max_y = y[max_y_i];
-      
-      //bool trim_min_x = min_x < min_y;
-      //bool trim_min_y = min_x > min_y;
-      //bool trim_max_x = max_x > max_y;
-      //bool trim_max_y = max_x < max_y;
-      // todo trim buckets to join-able subsets of x and y
-      
-      //Rprintf("min_x=%d; max_x=%d; min_y=%d, max_y=%d\n", min_x, max_x, min_y, max_y);
-      //int min_xy = imin(min_x, min_y), max_xy = imin(max_x, max_y);
-      //if (max_xy - min_xy < 2) error("not handled yet");
-      
-      int n_buckets = 2;
-      if (n_buckets != 2) error("not handled yet");
-      // this needs to be done for every bucket
-      if (y_ord) error("y must not be sorted");
-      
-      int b1_max_x_o_i = max_i_match(x, nx, x_o, x_ord, x[x_o[nx/2]-1]);
-      Rprintf("b1_max_x_o_i=%d; x[x_o[b1_max_x_o_i]-1]=%d\n", b1_max_x_o_i, x[x_o[b1_max_x_o_i]-1]);
-      int b1_min_x_i = min_x_i, b1_max_x_i = x_o[b1_max_x_o_i]-1; // this need to ensure that duplicate goes into same bucket!! fjoin(c(1:3,3L,3L,5L,4L), y), 3 goes into both buckets!
-      int b2_min_x_i = x_o[b1_max_x_o_i+1]-1, b2_max_x_i = max_x_i;
-      if (x[b1_max_x_i]==x[b2_min_x_i]) error("value of x=%d fits into multiple buckets", x[b2_min_x_i]);
-      Rprintf("min(x)=%d; max(x)=%d; len(x)=%d\n", x[min_x_i], x[max_x_i], nx);
-      Rprintf("b1(min(x))=%d; b1(max(x))=%d; b1(len(x))=%d\n", x[b1_min_x_i], x[b1_max_x_i], b1_max_x_o_i-0);
-      Rprintf("b2(min(x))=%d; b2(max(x))=%d; b2(len(x))=%d\n", x[b2_min_x_i], x[b2_max_x_i], nx-b1_max_x_o_i+1);
-      error("dev");
-      int b1_max_y_o_i = max_i_match(y, ny, y_o, y_ord, /*value to match to*/x[b1_max_x_i]);
-      int b1_min_y_i = min_y_i, b1_max_y_i = y_o[b1_max_y_o_i]-1;
-      int b2_min_y_o_i = min_i_match(y, ny, y_o, y_ord, x[b2_min_x_i]);
-      int b2_min_y_i = y_o[b2_min_y_o_i]-1, b2_max_y_i = max_y_i;
-      Rprintf("b1_min_x_i=%d, b1_max_x_i=%d, b2_min_x_i=%d, b2_max_x_i=%d\n", b1_min_x_i, b1_max_x_i, b2_min_x_i, b2_max_x_i);
-      Rprintf("buckets: 1(x:%d-%d, y:%d-%d); 2(x:%d-%d, y:%d-%d)\n", x[b1_min_x_i], x[b1_max_x_i], y[b1_min_y_i], y[b1_max_y_i], x[b2_min_x_i], x[b2_max_x_i], y[b2_min_y_i], y[b2_max_y_i]);
-      // openmp here!!
-      for (int b=0; b<n_buckets; ++b) {
-        int *x_o_, nx_, *y_o_, ny_;
-        if (b==0) {
-          x_o_ = x_o;
-          nx_ = nx/2;
-          y_o_ = y_o;
-          ny_ = b1_max_y_o_i;
-        } else {
-          x_o_ = x_o + (nx/2+1)*sizeof(int);
-          nx_ = nx-nx/2;
-          y_o_ = y_o + b2_min_y_o_i*sizeof(int);
-          ny_ = ny-b2_min_y_o_i+1;
-        }
-        Rprintf("--- bucket %d content ---", b);
-        Rprintf("\nx (len %d):", nx_);
-        for (int i=0; i<nx_; ++i) Rprintf(" %d", x[x_o_[i]-1]);
-        Rprintf("\ny (len %d):", ny_);
-        for (int i=0; i<ny_; ++i) Rprintf(" %d", y[y_o_[i]-1]);
-        Rprintf("\n");
-        //bucketjoin(x, y, bucket_len_x, bucket_start_x, x_o, x_min, y
-      }
-      error("dev");
-      //error("dev: !x_ord && !y_ord && (!unq_x || !unq_y)");
-    }
-    //Rprintf("x_i=%d; y[y_o[j]-1]= y[y_o[%d]-1]= y[%d-1]= y[%d]= %d\n", x[i], j, y_o[j], y_o[j]-1, y[y_o[j]-1]);
-  }
-  Rprintf("fjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %d th\n", t_lens, t_batch, t_join, unqN(th, nBatch));
+
+  Rprintf("fjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %dth\n", t_lens, t_batch, t_join, unqN(th, nBatch));
   matchn[0] = nx;
 }
 

From 518b36cdabfe7cb43d4d6ad00ff6c441ebb2cb93 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 6 Jun 2020 21:52:08 +0100
Subject: [PATCH 09/34] parallel

---
 R/wrappers.R                           |   4 +-
 inst/tests/{fjoin.Rraw => smjoin.Rraw} |  44 +++----
 src/data.table.h                       |   4 +-
 src/init.c                             |   5 +-
 src/{fjoin.c => smjoin.c}              | 152 ++++++++++++++-----------
 5 files changed, 115 insertions(+), 94 deletions(-)
 rename inst/tests/{fjoin.Rraw => smjoin.Rraw} (70%)
 rename src/{fjoin.c => smjoin.c} (63%)

diff --git a/R/wrappers.R b/R/wrappers.R
index 6b9da0d799..29fc5da8d1 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -13,8 +13,8 @@ coerceFill = function(x) .Call(CcoerceFillR, x)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
-fjoin = function(x, y, how=c("left","inner","full")) {
-  ans = .Call(CfjoinR, x, y, match.arg(how))
+smjoin = function(x, y, how=c("left","inner","full")) {
+  ans = .Call(CsmjoinR, x, y, match.arg(how))
   ans[[4L]][is.na(ans[[3L]])] = 1L
   ans[3:4]
 }
diff --git a/inst/tests/fjoin.Rraw b/inst/tests/smjoin.Rraw
similarity index 70%
rename from inst/tests/fjoin.Rraw
rename to inst/tests/smjoin.Rraw
index 1537cdd988..9bc193264b 100644
--- a/inst/tests/fjoin.Rraw
+++ b/inst/tests/smjoin.Rraw
@@ -6,7 +6,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
 } else {
   require(data.table)
   test = data.table:::test
-  fjoin = data.table:::fjoin
+  smjoin = data.table:::smjoin
   bmerge = data.table:::bmerge
   forderv = data.table:::forderv
 }
@@ -16,37 +16,37 @@ d = function(x, y) bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, roll
 ## x y sorted
 x = c(1L,2L,3L,4L)
 y = c(2L,3L,5L)
-test(1.01, fjoin(x, y), d(x, y))
+test(1.01, smjoin(x, y), d(x, y))
 x = c(1L,2L,3L,3L,4L)
 y = c(2L,3L,5L)
-test(1.02, fjoin(x, y), d(x, y))
+test(1.02, smjoin(x, y), d(x, y))
 x = c(1L,2L,3L,4L)
 y = c(2L,3L,3L,5L)
-test(1.03, fjoin(x, y), d(x, y))
+test(1.03, smjoin(x, y), d(x, y))
 x = c(1L,2L,3L,3L,4L)
 y = c(2L,3L,3L,5L)
-test(1.04, fjoin(x, y), d(x, y))
+test(1.04, smjoin(x, y), d(x, y))
 
 ## y unsorted
 x = c(1L,2L,3L,4L)
 y = c(2L,5L,3L)
-test(2.01, fjoin(x, y), d(x, y))
+test(2.01, smjoin(x, y), d(x, y))
 x = c(1L,2L,3L,3L,4L)
 y = c(3L,2L,5L)
-test(2.02, fjoin(x, y), d(x, y))
+test(2.02, smjoin(x, y), d(x, y))
 x = c(1L,2L,3L,4L)
 y = c(5L,3L,2L,3L)
-test(2.03, fjoin(x, y), d(x, y))
+test(2.03, smjoin(x, y), d(x, y))
 x = c(1L,2L,3L,3L,4L)
 y = c(5L,3L,3L,2L)
-test(2.04, fjoin(x, y), d(x, y))
+test(2.04, smjoin(x, y), d(x, y))
 
 #...
 
 ## xy unsorted
 x = c(4L,1L,3L,2L)
 y = c(2L,5L,3L)
-test(4.01, fjoin(x, y), d(x, y))
+test(4.01, smjoin(x, y), d(x, y))
 
 #...
 
@@ -61,41 +61,41 @@ ssa = function(unq_n, size, sort=FALSE) {
 N = 1e2L
 x = ssa(N, N, sort=TRUE)
 y = ssa(N, N, sort=TRUE)
-test(11.01, fjoin(x, y), d(x, y))
+test(11.01, smjoin(x, y), d(x, y))
 x = ssa(N, N*1.1, sort=TRUE)
 y = ssa(N, N, sort=TRUE)
-test(11.02, fjoin(x, y), d(x, y))
+test(11.02, smjoin(x, y), d(x, y))
 x = ssa(N, N, sort=TRUE)
 y = ssa(N, N*1.1, sort=TRUE)
-test(11.03, fjoin(x, y), d(x, y))
+test(11.03, smjoin(x, y), d(x, y))
 x = ssa(N, N*1.1, sort=TRUE)
 y = ssa(N, N*1.1, sort=TRUE)
-test(11.04, fjoin(x, y), d(x, y))
+test(11.04, smjoin(x, y), d(x, y))
 
 ## y unsorted
 x = ssa(N, N, sort=TRUE)
 y = ssa(N, N)
-test(12.01, fjoin(x, y), d(x, y))
+test(12.01, smjoin(x, y), d(x, y))
 x = ssa(N, N*1.1, sort=TRUE)
 y = ssa(N, N)
-test(12.02, fjoin(x, y), d(x, y))
+test(12.02, smjoin(x, y), d(x, y))
 x = ssa(N, N, sort=TRUE)
 y = ssa(N, N*1.1)
-test(12.03, fjoin(x, y), d(x, y))
+test(12.03, smjoin(x, y), d(x, y))
 x = ssa(N, N*1.1, sort=TRUE)
 y = ssa(N, N*1.1)
-test(12.04, fjoin(x, y), d(x, y))
+test(12.04, smjoin(x, y), d(x, y))
 
 ## xy unsorted
 x = ssa(N, N)
 y = ssa(N, N)
-test(14.01, fjoin(x, y), d(x, y))
+test(14.01, smjoin(x, y), d(x, y))
 x = ssa(N, N*1.1)
 y = ssa(N, N)
-test(14.02, fjoin(x, y), d(x, y))
+test(14.02, smjoin(x, y), d(x, y))
 x = ssa(N, N)
 y = ssa(N, N*1.1)
-test(14.03, fjoin(x, y), d(x, y))
+test(14.03, smjoin(x, y), d(x, y))
 x = ssa(N, N*1.1)
 y = ssa(N, N*1.1)
-test(14.04, fjoin(x, y), d(x, y))
+test(14.04, smjoin(x, y), d(x, y))
diff --git a/src/data.table.h b/src/data.table.h
index abd993a592..fe8091e382 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -244,5 +244,5 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k);
 SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
 
-// fjoin.c
-SEXP fjoinR(SEXP x, SEXP y, SEXP how);
+// smjoin.c
+SEXP smjoinR(SEXP x, SEXP y, SEXP how);
diff --git a/src/init.c b/src/init.c
index 9f7f45ee8c..731f0b765b 100644
--- a/src/init.c
+++ b/src/init.c
@@ -119,8 +119,7 @@ SEXP lock();
 SEXP unlock();
 SEXP islockedR();
 SEXP allNAR();
-//SEXP sjoinR();
-SEXP fjoinR();
+SEXP smjoinR();
 
 // .Externals
 SEXP fastmean();
@@ -213,7 +212,7 @@ R_CallMethodDef callMethods[] = {
 {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1},
 {"CtestMsgR", (DL_FUNC) &testMsgR, -1},
 {"C_allNAR", (DL_FUNC) &allNAR, -1},
-{"CfjoinR", (DL_FUNC) &fjoinR, -1},
+{"CsmjoinR", (DL_FUNC) &smjoinR, -1},
 {NULL, NULL, 0}
 };
 
diff --git a/src/fjoin.c b/src/smjoin.c
similarity index 63%
rename from src/fjoin.c
rename to src/smjoin.c
index 1b3627259f..222b7f1af1 100644
--- a/src/fjoin.c
+++ b/src/smjoin.c
@@ -3,10 +3,11 @@
 /*
  * sort-merge join
  * 
- * split whole 1:nx_starts into numbers of buckets equal to threads*2
- * each thread do binary merge x of first and last element on y, and then sort-merge to a subset of y defined by those matches
+ * split whole 1:nx_starts into batches and do sort-merge join on each batch separately
  */
-static int max_i_match(int *y, int *y_starts, int ny_starts, int val) { // those need to be binary search obviously
+
+// those helpers need to be binary search! last matching, and first matching index
+static int max_i_match(int *y, int *y_starts, int ny_starts, int val) {
   int i;
   for (i=0; i<ny_starts; ++i) {
     //Rprintf("max_i_match: y[y_starts[%d]-1] >= val: %d >= %d\n", i, y[y_starts[i]-1], val);
@@ -36,43 +37,42 @@ int unqN(int *x, int nx) { // x have 0:(nx-1) values
   return ans;
 }
 
-// materialize counts of duplicated entries
-void grpLens(int *starts, int n_starts, int n, int *lens) { // #4395
+// count of duplicated entries
+void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395
   for (int i=0; i<n_starts-1; ++i)
     lens[i] = starts[i+1]-starts[i];
   lens[n_starts-1] = n - starts[n_starts-1]+1;
   return;
 }
 
-// workhorse join that runs in parallel
-void fjoin1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, int *x, int *y, int *starts_y, int *lens_y) {
+// workhorse join that runs in parallel on batches
+void smjoin1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, int *x, int *y, int *starts_y, int *lens_y) {
   int i=0, j=0, is=0, js=0, x_i, y_j;
-  while (is<nxs && js<nys) {
-    //Rprintf("============= is=%d\n", is);
+  while (is<nxs && js<nys) { //Rprintf("============= is=%d\n", is);
     i = xs[is]-1, j = ys[js]-1;
     x_i = x[i], y_j = y[j];
-    if (x_i == y_j) {
-      //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
-      int j1 = j+1;
-      //Rprintf("x_lens[is]=x_lens[%d]=%d\n",is,xl[is]);
-      for (int ii=0; ii<xl[is]; ++ii) {
-        //Rprintf("starts_y[%d]=%d\n", i+ii, j1);
-        starts_y[i+ii] = j1; lens_y[i+ii] = yl[js];
+    if (x_i == y_j) { //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
+      int j1 = j+1; //Rprintf("x_lens[is]=x_lens[%d]=%d\n",is,xl[is]);
+      for (int ii=0; ii<xl[is]; ++ii) { //Rprintf("starts_y[%d]=%d\n", i+ii, j1);
+        starts_y[i+ii] = j1;
+        lens_y[i+ii] = yl[js];
       }
       is++;
-    } else if (x_i < y_j) is++; else if (x_i > y_j) js++;
+    }
+    else if (x_i < y_j) is++;
+    else if (x_i > y_j) js++;
   }
   return;
 }
 
-void fjoinC(int *x, int nx, int *x_starts, int nx_starts,
-           int *y, int ny, int *y_starts, int ny_starts,
-           //joinhow how,
-           bool *match, int *matchn,
-           int *starts_x, int *lens_x,
-           int *starts_y, int *lens_y) {
+void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
+            int *y, int ny, int *y_starts, int ny_starts,
+            bool *match, int *matchn,
+            int *starts_x, int *lens_x,
+            int *starts_y, int *lens_y) {
   bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
   int *x_lens=0, *y_lens=0;
+
   double t_lens = omp_get_wtime();
   if (!unq_x || true) { // else true as we dont yet have branches for unq cases
     x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395
@@ -84,33 +84,51 @@ void fjoinC(int *x, int nx, int *x_starts, int nx_starts,
     grpLens(y_starts, ny_starts, ny, y_lens);
   }
   t_lens = omp_get_wtime() - t_lens;
-  
+
   double t_batch = omp_get_wtime();
-  /*
-  int min_i = 0, max_i = x_starts[nx_starts-1]-1;
-  Rprintf("x[i]=[%d:%d]\n", min_i, max_i);
-  int b1_min_i = min_i, b1_max_i = max_i;
-  int b1_min_i = min_i, b1_max_i = x_starts[nx_starts/2]-1;
-  int b2_min_i = x_starts[nx_starts/2+1]-1, b2_max_i = max_i;
-  Rprintf("x: 1[%d-%d]\n", b1_min_i, b1_max_i);
-  Rprintf("x: 2[%d-%d]\n", b2_min_i, b2_max_i);
-  */
-  int nth = 2;//getDTthreads();
-  //int nBatch=nth*2;
-  int nBatch=2;
-  
-  int *xs, *ys, nxs, nys, *xl, *yl;
-  /*
-  xs = x_starts + 0*sizeof(int);
-  nxs = nx_starts;
-  ys = y_starts + 0*sizeof(int);
-  nys = ny_starts;
-   */
-  int b1_nxs = nx_starts/2, b2_nxs = nx_starts-b1_nxs;
+  int nth = getDTthreads();
+  int nBatch = nth*2;
+  int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used
+
+  size_t batchSize = (nx_starts-1)/nBatch + 1;
+  size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
+  //Rprintf("batchSize=%d; lastBatchSize=%d\n", batchSize, lastBatchSize);
+  int *nxsB = (int *)R_alloc(nBatch, sizeof(int));
+  int **xsB = (int **)R_alloc(nBatch, sizeof(int*));
+  int **xlB = (int **)R_alloc(nBatch, sizeof(int*));
+  int *nysB = (int *)R_alloc(nBatch, sizeof(int));
+  int **ysB = (int **)R_alloc(nBatch, sizeof(int*));
+  int **ylB = (int **)R_alloc(nBatch, sizeof(int*));
+  for (int b=0; b<nBatch-1; ++b) {
+    nxsB[b] = batchSize;
+    xsB[b] = x_starts + b*batchSize;
+    xlB[b] = x_lens + b*batchSize;
+    int y_min = min_i_match(y, y_starts, ny_starts, x[xsB[b][0]-1]);
+    int y_max = max_i_match(y, y_starts, ny_starts, x[xsB[b][nxsB[b]-1]-1]);
+    nysB[b] = y_max - y_min + 1;
+    ysB[b] = y_starts + y_min;
+    ylB[b] = y_lens + y_min;
+  }
+  nxsB[nBatch-1] = lastBatchSize;
+  xsB[nBatch-1] = x_starts + nx_starts-lastBatchSize;
+  xlB[nBatch-1] = x_lens + nx_starts-lastBatchSize;
+  int y_min = min_i_match(y, y_starts, ny_starts, x[xsB[nBatch-1][0]-1]);
+  int y_max = max_i_match(y, y_starts, ny_starts, x[xsB[nBatch-1][nxsB[nBatch-1]-1]-1]);
+  nysB[nBatch-1] = y_max - y_min + 1;
+  ysB[nBatch-1] = y_starts + y_min;
+  ylB[nBatch-1] = y_lens + y_min;
+  //for (int b=0; b<nBatch; ++b) Rprintf("b=%d:\n  nx=%d; xs[0]=%d; xl[0]=%d; x[xs[0]-1]=%d\n  ny=%d; ys[0]=%d; yl[0]=%d; y[ys[0]-1]=%d\n", b, nxsB[b], xsB[b][0], xlB[b][0], x[xsB[b][0]-1], nysB[b], ysB[b][0], ylB[b][0], y[ysB[b][0]-1]);
+
+  // these are 2 batches hardcoded logic, replaced by the above, still keep in case debugging needed
+  /*int b1_nxs = batchSize, b2_nxs = nx_starts-b1_nxs;
   int *b1_xs = x_starts, *b2_xs = x_starts + b1_nxs;
   int *b1_xl = x_lens, *b2_xl = x_lens + b1_nxs;
+  //Rprintf("b2_xs[0]: (x_starts + b1_nxs)[0]=%d\n", (x_starts + b1_nxs)[0]);
+  //Rprintf("xsB[1][0]: (x_starts + nx_starts-lastBatchSize)[0]=%d\n", (x_starts + nx_starts-lastBatchSize)[0]);
+  //Rprintf("\nb1_nxs=%d; nx_starts=%d; lastBatchSize=%d\n\n", b1_nxs, nx_starts, lastBatchSize);
   //Rprintf("batch sizes nx: b1=%d; b2=%d\nidx of first in batch: x_starts[0]: b1=%d; b2=%d\nfirst value in batch: x[x_starts[0]-1]: b1=%d; b2=%d\n\n",
   //                      b1_nxs, b2_nxs,                                 b1_xs[0], b2_xs[0],                         x[b1_xs[0]-1], x[b2_xs[0]-1]);
+  
   //Rprintf("finding bounds of b1 y starts indices\n\n");
   int b1_ylow = min_i_match(y, y_starts, ny_starts, x[b1_xs[0]-1]);
   int b1_yupp = max_i_match(y, y_starts, ny_starts, x[b1_xs[b1_nxs-1]-1]);
@@ -118,30 +136,32 @@ void fjoinC(int *x, int nx, int *x_starts, int nx_starts,
   int b2_ylow = min_i_match(y, y_starts, ny_starts, x[b2_xs[0]-1]);
   int b2_yupp = max_i_match(y, y_starts, ny_starts, x[b2_xs[b2_nxs-1]-1]);
   //Rprintf("\nb2_ylow=%d; b2_yupp=%d\ny[y_starts[b2_ylow]-1]=%d; y[ny_starts[b2_yupp]-1]=%d\n", b2_ylow, b2_yupp, y[y_starts[b2_ylow]-1], y[y_starts[b2_yupp]-1]);
+
   int b1_nys = b1_yupp-b1_ylow+1, b2_nys = b2_yupp-b2_ylow+1;
   int *b1_ys = y_starts, *b2_ys = y_starts + b1_nys;
   int *b1_yl = y_lens, *b2_yl = y_lens + b1_nys;
   //Rprintf("batch sizes ny: b1=%d; b2=%d\nidx of first in batch: y_starts[0]: b1=%d; b2=%d\nfirst value in batch: y[y_starts[0]-1]: b1=%d; b2=%d\n\n",
   //        b1_nys, b2_nys,                                 b1_ys[0], b2_ys[0],                         y[b1_ys[0]-1], y[b2_ys[0]-1]);
-  int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used
+   */
   t_batch = omp_get_wtime() - t_batch;
+  //error("dev");
+
   double t_join = omp_get_wtime();
   #pragma omp parallel for schedule(dynamic) num_threads(nth)
   for (int b=0; b<nBatch; ++b) {
-    if (b==0) {
-      xs = b1_xs; nxs = b1_nxs; ys = b1_ys; nys = b1_nys, xl = b1_xl, yl = b1_yl;
-    } else if (b==1) {
-      xs = b2_xs; nxs = b2_nxs; ys = b2_ys; nys = b2_nys, xl = b2_xl, yl = b2_yl;
-    }
-    fjoin1(xs, xl, nxs, ys, yl, nys, /*batch specific input: shifted pointers and size*/
-           x, y,                     /*common input*/
-           starts_y, lens_y);        /*common output*/
+    smjoin1(
+      /*batch specific input: shifted pointers and their size*/
+      xsB[b], xlB[b], nxsB[b], ysB[b], ylB[b], nysB[b],
+      /*common input*/
+      x, y,
+      /*common output*/
+      starts_y, lens_y);
     th[b] = omp_get_thread_num();
   }
-  for (int b=0; b<nBatch; ++b) Rprintf("th[%d]=%d\n", th[b]);
   t_join = omp_get_wtime() - t_join;
+  //for (int b=0; b<nBatch; ++b) Rprintf("th[%d]=%d\n", th[b]);
 
-  Rprintf("fjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %dth\n", t_lens, t_batch, t_join, unqN(th, nBatch));
+  Rprintf("smjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %dth\n", t_lens, t_batch, t_join, unqN(th, nBatch));
   matchn[0] = nx;
 }
 
@@ -150,6 +170,7 @@ SEXP sortInt(SEXP x, SEXP idx) {
     error("already sorted, should be escaped");
   SEXP ans = PROTECT(allocVector(INTSXP, LENGTH(x)));
   int *xp = INTEGER(x), *idxp = INTEGER(idx), *ansp = INTEGER(ans);
+  #pragma omp parallel for schedule(static) num_threads(getDTthreads())
   for (int i=0; i<LENGTH(x); ++i)
     ansp[i] = xp[idxp[i]-1];
   UNPROTECT(1);
@@ -190,7 +211,7 @@ SEXP joinOut(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y,
   return ans;
 }
 
-SEXP fjoinR(SEXP x, SEXP y, SEXP how) {
+SEXP smjoinR(SEXP x, SEXP y, SEXP how) {
 
   if (!isInteger(x) || !isInteger(y))
     error("must be integer");
@@ -226,13 +247,14 @@ SEXP fjoinR(SEXP x, SEXP y, SEXP how) {
   for (int i=0; i<ans_n; ++i) /*starts_x[i] = lens_x[i] = */starts_y[i] = lens_y[i] = NA_INTEGER;
   t_alloc = omp_get_wtime() - t_alloc;
 
-  double t_fjoin = omp_get_wtime();
-  //            *x, nx,         *x_starts,        nx_starts,
-  fjoinC(INTEGER(x), nx, INTEGER(x_starts), LENGTH(x_starts),
-         INTEGER(y), ny, INTEGER(y_starts), LENGTH(y_starts),
-         match, &matchn,
-         starts_x, lens_x, starts_y, lens_y);
-  t_fjoin = omp_get_wtime() - t_fjoin;
+  double t_smjoin = omp_get_wtime();
+  smjoinC(
+    INTEGER(x), nx, INTEGER(x_starts), LENGTH(x_starts),
+    INTEGER(y), ny, INTEGER(y_starts), LENGTH(y_starts),
+    match, &matchn,
+    starts_x, lens_x, starts_y, lens_y
+  );
+  t_smjoin = omp_get_wtime() - t_smjoin;
 
   double t_ans = omp_get_wtime();
   SEXP ans = R_NilValue;
@@ -242,7 +264,7 @@ SEXP fjoinR(SEXP x, SEXP y, SEXP how) {
     ans = joinOut(matchn, starts_x, lens_x, starts_y, lens_y, x_ord, /*anything*/starts_x);
   }
   t_ans = omp_get_wtime() - t_ans;
-  Rprintf("fjoinR: index %.3fs; sort %.3fs; alloc %.3fs; fjoinC %.3fs; ans %.3fs\n", t_index, t_sort, t_alloc, t_fjoin, t_ans);
+  Rprintf("smjoinR: index %.3fs; sort %.3fs; alloc %.3fs; smjoinC %.3fs; ans %.3fs\n", t_index, t_sort, t_alloc, t_smjoin, t_ans);
   UNPROTECT(protecti);
   return ans;
 }

From fd03e045f2e57a1393da910fe7b92ce5b2693f63 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 6 Jun 2020 22:52:34 +0100
Subject: [PATCH 10/34] cleanup and speedup

---
 R/wrappers.R     |   8 +-
 src/data.table.h |   2 +-
 src/smjoin.c     | 194 ++++++++++++++++++-----------------------------
 3 files changed, 78 insertions(+), 126 deletions(-)

diff --git a/R/wrappers.R b/R/wrappers.R
index 29fc5da8d1..f8d51219c5 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -13,8 +13,8 @@ coerceFill = function(x) .Call(CcoerceFillR, x)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
-smjoin = function(x, y, how=c("left","inner","full")) {
-  ans = .Call(CsmjoinR, x, y, match.arg(how))
-  ans[[4L]][is.na(ans[[3L]])] = 1L
-  ans[3:4]
+smjoin = function(x, y) {
+  ans = .Call(CsmjoinR, x, y, NULL, NULL)
+  ans[["lens"]][is.na(ans[["starts"]])] = 1L
+  ans
 }
diff --git a/src/data.table.h b/src/data.table.h
index fe8091e382..74d101429d 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -245,4 +245,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
 
 // smjoin.c
-SEXP smjoinR(SEXP x, SEXP y, SEXP how);
+SEXP smjoinR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx);
diff --git a/src/smjoin.c b/src/smjoin.c
index 222b7f1af1..1af803a53e 100644
--- a/src/smjoin.c
+++ b/src/smjoin.c
@@ -3,7 +3,10 @@
 /*
  * sort-merge join
  * 
- * split whole 1:nx_starts into batches and do sort-merge join on each batch separately
+ * join on a single integer column
+ * sort LHS and RHS
+ * split LHS into equal batches, RHS into corresponding batches by matching upper-lower bounds of LHS batches
+ * parallel sort-merge join
  */
 
 // those helpers need to be binary search! last matching, and first matching index
@@ -39,6 +42,7 @@ int unqN(int *x, int nx) { // x have 0:(nx-1) values
 
 // count of duplicated entries
 void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395
+  #pragma omp parallel for schedule(static) num_threads(getDTthreads())
   for (int i=0; i<n_starts-1; ++i)
     lens[i] = starts[i+1]-starts[i];
   lens[n_starts-1] = n - starts[n_starts-1]+1;
@@ -66,10 +70,8 @@ void smjoin1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, int *x, int *
 }
 
 void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
-            int *y, int ny, int *y_starts, int ny_starts,
-            bool *match, int *matchn,
-            int *starts_x, int *lens_x,
-            int *starts_y, int *lens_y) {
+             int *y, int ny, int *y_starts, int ny_starts,
+             int *matchn, int *starts_y, int *lens_y) {
   bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
   int *x_lens=0, *y_lens=0;
 
@@ -89,73 +91,32 @@ void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
   int nth = getDTthreads();
   int nBatch = nth*2;
   int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used
-
   size_t batchSize = (nx_starts-1)/nBatch + 1;
   size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
   //Rprintf("batchSize=%d; lastBatchSize=%d\n", batchSize, lastBatchSize);
-  int *nxsB = (int *)R_alloc(nBatch, sizeof(int));
-  int **xsB = (int **)R_alloc(nBatch, sizeof(int*));
-  int **xlB = (int **)R_alloc(nBatch, sizeof(int*));
-  int *nysB = (int *)R_alloc(nBatch, sizeof(int));
-  int **ysB = (int **)R_alloc(nBatch, sizeof(int*));
-  int **ylB = (int **)R_alloc(nBatch, sizeof(int*));
+  int *nxsB = (int *)R_alloc(nBatch, sizeof(int)), **xsB = (int **)R_alloc(nBatch, sizeof(int*)), **xlB = (int **)R_alloc(nBatch, sizeof(int*));
+  int *nysB = (int *)R_alloc(nBatch, sizeof(int)), **ysB = (int **)R_alloc(nBatch, sizeof(int*)), **ylB = (int **)R_alloc(nBatch, sizeof(int*));
   for (int b=0; b<nBatch-1; ++b) {
-    nxsB[b] = batchSize;
-    xsB[b] = x_starts + b*batchSize;
-    xlB[b] = x_lens + b*batchSize;
+    nxsB[b] = batchSize; xsB[b] = x_starts + b*batchSize; xlB[b] = x_lens + b*batchSize;
     int y_min = min_i_match(y, y_starts, ny_starts, x[xsB[b][0]-1]);
     int y_max = max_i_match(y, y_starts, ny_starts, x[xsB[b][nxsB[b]-1]-1]);
-    nysB[b] = y_max - y_min + 1;
-    ysB[b] = y_starts + y_min;
-    ylB[b] = y_lens + y_min;
+    nysB[b] = y_max - y_min + 1; ysB[b] = y_starts + y_min; ylB[b] = y_lens + y_min;
   }
-  nxsB[nBatch-1] = lastBatchSize;
-  xsB[nBatch-1] = x_starts + nx_starts-lastBatchSize;
-  xlB[nBatch-1] = x_lens + nx_starts-lastBatchSize;
+  nxsB[nBatch-1] = lastBatchSize; xsB[nBatch-1] = x_starts + nx_starts-lastBatchSize; xlB[nBatch-1] = x_lens + nx_starts-lastBatchSize;
   int y_min = min_i_match(y, y_starts, ny_starts, x[xsB[nBatch-1][0]-1]);
   int y_max = max_i_match(y, y_starts, ny_starts, x[xsB[nBatch-1][nxsB[nBatch-1]-1]-1]);
-  nysB[nBatch-1] = y_max - y_min + 1;
-  ysB[nBatch-1] = y_starts + y_min;
-  ylB[nBatch-1] = y_lens + y_min;
+  nysB[nBatch-1] = y_max - y_min + 1; ysB[nBatch-1] = y_starts + y_min; ylB[nBatch-1] = y_lens + y_min;
   //for (int b=0; b<nBatch; ++b) Rprintf("b=%d:\n  nx=%d; xs[0]=%d; xl[0]=%d; x[xs[0]-1]=%d\n  ny=%d; ys[0]=%d; yl[0]=%d; y[ys[0]-1]=%d\n", b, nxsB[b], xsB[b][0], xlB[b][0], x[xsB[b][0]-1], nysB[b], ysB[b][0], ylB[b][0], y[ysB[b][0]-1]);
-
-  // these are 2 batches hardcoded logic, replaced by the above, still keep in case debugging needed
-  /*int b1_nxs = batchSize, b2_nxs = nx_starts-b1_nxs;
-  int *b1_xs = x_starts, *b2_xs = x_starts + b1_nxs;
-  int *b1_xl = x_lens, *b2_xl = x_lens + b1_nxs;
-  //Rprintf("b2_xs[0]: (x_starts + b1_nxs)[0]=%d\n", (x_starts + b1_nxs)[0]);
-  //Rprintf("xsB[1][0]: (x_starts + nx_starts-lastBatchSize)[0]=%d\n", (x_starts + nx_starts-lastBatchSize)[0]);
-  //Rprintf("\nb1_nxs=%d; nx_starts=%d; lastBatchSize=%d\n\n", b1_nxs, nx_starts, lastBatchSize);
-  //Rprintf("batch sizes nx: b1=%d; b2=%d\nidx of first in batch: x_starts[0]: b1=%d; b2=%d\nfirst value in batch: x[x_starts[0]-1]: b1=%d; b2=%d\n\n",
-  //                      b1_nxs, b2_nxs,                                 b1_xs[0], b2_xs[0],                         x[b1_xs[0]-1], x[b2_xs[0]-1]);
-  
-  //Rprintf("finding bounds of b1 y starts indices\n\n");
-  int b1_ylow = min_i_match(y, y_starts, ny_starts, x[b1_xs[0]-1]);
-  int b1_yupp = max_i_match(y, y_starts, ny_starts, x[b1_xs[b1_nxs-1]-1]);
-  //Rprintf("\nb1_ylow=%d; b1_yupp=%d\ny[y_starts[b1_ylow]-1]=%d; y[ny_starts[b1_yupp]-1]=%d\n", b1_ylow, b1_yupp, y[y_starts[b1_ylow]-1], y[y_starts[b1_yupp]-1]);
-  int b2_ylow = min_i_match(y, y_starts, ny_starts, x[b2_xs[0]-1]);
-  int b2_yupp = max_i_match(y, y_starts, ny_starts, x[b2_xs[b2_nxs-1]-1]);
-  //Rprintf("\nb2_ylow=%d; b2_yupp=%d\ny[y_starts[b2_ylow]-1]=%d; y[ny_starts[b2_yupp]-1]=%d\n", b2_ylow, b2_yupp, y[y_starts[b2_ylow]-1], y[y_starts[b2_yupp]-1]);
-
-  int b1_nys = b1_yupp-b1_ylow+1, b2_nys = b2_yupp-b2_ylow+1;
-  int *b1_ys = y_starts, *b2_ys = y_starts + b1_nys;
-  int *b1_yl = y_lens, *b2_yl = y_lens + b1_nys;
-  //Rprintf("batch sizes ny: b1=%d; b2=%d\nidx of first in batch: y_starts[0]: b1=%d; b2=%d\nfirst value in batch: y[y_starts[0]-1]: b1=%d; b2=%d\n\n",
-  //        b1_nys, b2_nys,                                 b1_ys[0], b2_ys[0],                         y[b1_ys[0]-1], y[b2_ys[0]-1]);
-   */
   t_batch = omp_get_wtime() - t_batch;
-  //error("dev");
 
   double t_join = omp_get_wtime();
   #pragma omp parallel for schedule(dynamic) num_threads(nth)
   for (int b=0; b<nBatch; ++b) {
     smjoin1(
-      /*batch specific input: shifted pointers and their size*/
-      xsB[b], xlB[b], nxsB[b], ysB[b], ylB[b], nysB[b],
-      /*common input*/
-      x, y,
-      /*common output*/
-      starts_y, lens_y);
+      xsB[b], xlB[b], nxsB[b], ysB[b], ylB[b], nysB[b], // batch specific input: shifted pointers and their sizes
+      x, y,                                             // common input
+      starts_y, lens_y                                  // common output
+    );
     th[b] = omp_get_thread_num();
   }
   t_join = omp_get_wtime() - t_join;
@@ -165,105 +126,96 @@ void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
   matchn[0] = nx;
 }
 
-SEXP sortInt(SEXP x, SEXP idx) {
-  if (!LENGTH(idx))
-    error("already sorted, should be escaped");
-  SEXP ans = PROTECT(allocVector(INTSXP, LENGTH(x)));
-  int *xp = INTEGER(x), *idxp = INTEGER(idx), *ansp = INTEGER(ans);
+void sortInt(int *x, int nx, int *idx, int *ans) {
   #pragma omp parallel for schedule(static) num_threads(getDTthreads())
-  for (int i=0; i<LENGTH(x); ++i)
-    ansp[i] = xp[idxp[i]-1];
-  UNPROTECT(1);
-  return ans;
+  for (int i=0; i<nx; ++i)
+    ans[i] = x[idx[i]-1];
+  return;
 }
 
-SEXP joinOut(int matchn, int *starts_x, int *lens_x, int *starts_y, int *lens_y, bool x_ord, int *x_o_idx) {
-  SEXP ans = PROTECT(allocVector(VECSXP, 4)), ansnames;
-  setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 4));
-  SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, 0));
-  SET_STRING_ELT(ansnames, 0, mkChar("starts_x"));
-  //int *starts_x_p = INTEGER(VECTOR_ELT(ans, 0));
-  SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, 0));
-  SET_STRING_ELT(ansnames, 1, mkChar("lens_x"));
-  //int *lens_x_p = INTEGER(VECTOR_ELT(ans, 1));
-  SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, matchn));
-  SET_STRING_ELT(ansnames, 2, mkChar("starts"));
-  int *starts_y_p = INTEGER(VECTOR_ELT(ans, 2));
-  SET_VECTOR_ELT(ans, 3, allocVector(INTSXP, matchn));
-  SET_STRING_ELT(ansnames, 3, mkChar("lens"));
-  int *lens_y_p = INTEGER(VECTOR_ELT(ans, 3));
-  if (!x_ord) {
-    for (int i=0; i<matchn; ++i) {
-      //starts_x_p[i] = starts_x[i];
-      //lens_x_p[i] = lens_x[i];
-      starts_y_p[i] = starts_y[x_o_idx[i]-1];
-      lens_y_p[i] = lens_y[x_o_idx[i]-1];
-    }
+SEXP joinOut(int n, int *starts, int *lens, bool x_ord, SEXP out_starts, SEXP out_lens, int *xoop) {
+  SEXP ans = PROTECT(allocVector(VECSXP, 2)), ansnames;
+  setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 2));
+  SET_STRING_ELT(ansnames, 0, char_starts);
+  SET_STRING_ELT(ansnames, 1, char_lens);
+  if (x_ord) {
+    SET_VECTOR_ELT(ans, 0, out_starts);
+    SET_VECTOR_ELT(ans, 1, out_lens);
   } else {
-    for (int i=0; i<matchn; ++i) {
-      //starts_x_p[i] = starts_x[i];
-      //lens_x_p[i] = lens_x[i];
-      starts_y_p[i] = starts_y[i];
-      lens_y_p[i] = lens_y[i];
-    }
+    SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, n));
+    SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, n));
+    sortInt(starts, n, xoop, INTEGER(VECTOR_ELT(ans, 0)));
+    sortInt(lens, n, xoop, INTEGER(VECTOR_ELT(ans, 1)));
   }
   UNPROTECT(1);
   return ans;
 }
 
-SEXP smjoinR(SEXP x, SEXP y, SEXP how) {
+SEXP smjoinR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) {
 
   if (!isInteger(x) || !isInteger(y))
     error("must be integer");
   int protecti=0, nx = LENGTH(x), ny = LENGTH(y);
 
   double t_index = omp_get_wtime();
-  SEXP x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
-  SEXP y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
+  if (isNull(x_idx)) {
+    x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
+  }
+  if (isNull(y_idx)) {
+    y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
+  }
   SEXP x_starts = getAttrib(x_idx, sym_starts);
   SEXP y_starts = getAttrib(y_idx, sym_starts);
+  if (isNull(x_starts) || isNull(y_starts))
+    error("Indices provided to smjoin must carry 'starts' attribute");
   t_index = omp_get_wtime() - t_index;
 
-  bool x_ord = LENGTH(x_idx)==0, y_ord = LENGTH(y_idx)==0;
-  SEXP x_o_idx = R_NilValue;
   double t_sort = omp_get_wtime();
+  bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx);
+  SEXP xoo = R_NilValue; // order of an order of x
+  int *xp, *yp, *xoop=0;
   if (!x_ord) {
-    x = PROTECT(sortInt(x, x_idx)); protecti++;
-    x_o_idx = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
-  }
+    xp = (int *)R_alloc(nx, sizeof(int));
+    sortInt(INTEGER(x), nx, INTEGER(x_idx), xp);
+    xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
+    xoop = INTEGER(xoo);
+  } else xp = INTEGER(x);
   if (!y_ord) {
-    y = PROTECT(sortInt(y, y_idx)); protecti++;
-  }
+    yp = (int *)R_alloc(ny, sizeof(int));
+    sortInt(INTEGER(y), ny, INTEGER(y_idx), yp);
+  } else yp = INTEGER(y);
   t_sort = omp_get_wtime() - t_sort;
 
   double t_alloc = omp_get_wtime();
-  int ans_n = nx;
-  bool *match = (bool *)R_alloc(0, sizeof(bool)); // not sure if we need this
-  int matchn = 0;
-  int *starts_x = (int *)R_alloc(0, sizeof(int));
-  int *lens_x = (int *)R_alloc(0, sizeof(int));
-  int *starts_y = (int *)R_alloc(ans_n, sizeof(int));
-  int *lens_y = (int *)R_alloc(ans_n, sizeof(int));
-  for (int i=0; i<ans_n; ++i) /*starts_x[i] = lens_x[i] = */starts_y[i] = lens_y[i] = NA_INTEGER;
+  SEXP out_starts = R_NilValue, out_lens = R_NilValue;
+  int *starts=0, *lens=0;
+  if (x_ord) { // we dont need to reorder results so can save one allocation
+    out_starts = PROTECT(allocVector(INTSXP, nx)); protecti++;
+    out_lens = PROTECT(allocVector(INTSXP, nx)); protecti++;
+    starts = INTEGER(out_starts);
+    lens = INTEGER(out_lens);
+  } else {
+    starts = (int *)R_alloc(nx, sizeof(int));
+    lens = (int *)R_alloc(nx, sizeof(int));
+  }
+  #pragma omp parallel for schedule(static) num_threads(getDTthreads())
+  for (int i=0; i<nx; ++i)
+    starts[i] = lens[i] = NA_INTEGER;
   t_alloc = omp_get_wtime() - t_alloc;
 
   double t_smjoin = omp_get_wtime();
+  int matchn = 0; // not yet used
   smjoinC(
-    INTEGER(x), nx, INTEGER(x_starts), LENGTH(x_starts),
-    INTEGER(y), ny, INTEGER(y_starts), LENGTH(y_starts),
-    match, &matchn,
-    starts_x, lens_x, starts_y, lens_y
+    xp, nx, INTEGER(x_starts), LENGTH(x_starts),
+    yp, ny, INTEGER(y_starts), LENGTH(y_starts),
+    &matchn, starts, lens
   );
   t_smjoin = omp_get_wtime() - t_smjoin;
 
   double t_ans = omp_get_wtime();
-  SEXP ans = R_NilValue;
-  if (!x_ord) {
-    ans = joinOut(matchn, starts_x, lens_x, starts_y, lens_y, x_ord, INTEGER(x_o_idx));
-  } else {
-    ans = joinOut(matchn, starts_x, lens_x, starts_y, lens_y, x_ord, /*anything*/starts_x);
-  }
+  SEXP ans = joinOut(matchn, starts, lens, x_ord, out_starts, out_lens, xoop); // usually also sorts inside
   t_ans = omp_get_wtime() - t_ans;
+
   Rprintf("smjoinR: index %.3fs; sort %.3fs; alloc %.3fs; smjoinC %.3fs; ans %.3fs\n", t_index, t_sort, t_alloc, t_smjoin, t_ans);
   UNPROTECT(protecti);
   return ans;

From 71c348b50f6c04066360b2d16bf82d50e042c83b Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sun, 7 Jun 2020 01:22:41 +0100
Subject: [PATCH 11/34] bucketing now uses binary search

---
 R/wrappers.R           |   4 +-
 inst/tests/smjoin.Rraw |   8 +++
 src/smjoin.c           | 140 ++++++++++++++++++++++++++++++++---------
 3 files changed, 119 insertions(+), 33 deletions(-)

diff --git a/R/wrappers.R b/R/wrappers.R
index f8d51219c5..ba43d8cba2 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -13,8 +13,8 @@ coerceFill = function(x) .Call(CcoerceFillR, x)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
-smjoin = function(x, y) {
-  ans = .Call(CsmjoinR, x, y, NULL, NULL)
+smjoin = function(x, y, x.idx=NULL, y.idx=NULL) {
+  ans = .Call(CsmjoinR, x, y, x.idx, y.idx)
   ans[["lens"]][is.na(ans[["starts"]])] = 1L
   ans
 }
diff --git a/inst/tests/smjoin.Rraw b/inst/tests/smjoin.Rraw
index 9bc193264b..48a479463c 100644
--- a/inst/tests/smjoin.Rraw
+++ b/inst/tests/smjoin.Rraw
@@ -99,3 +99,11 @@ test(14.03, smjoin(x, y), d(x, y))
 x = ssa(N, N*1.1)
 y = ssa(N, N*1.1)
 test(14.04, smjoin(x, y), d(x, y))
+
+# sparse
+x = sample(2e2L, 1e2L)
+y = sample(2e2L, 1e2L)
+test(21.01, smjoin(x, y), d(x, y))
+x = sample(2e2L, 1e2L, TRUE)
+y = sample(2e2L, 1e2L, TRUE)
+test(21.02, smjoin(x, y), d(x, y))
diff --git a/src/smjoin.c b/src/smjoin.c
index 1af803a53e..ca80067bdd 100644
--- a/src/smjoin.c
+++ b/src/smjoin.c
@@ -9,26 +9,6 @@
  * parallel sort-merge join
  */
 
-// those helpers need to be binary search! last matching, and first matching index
-static int max_i_match(int *y, int *y_starts, int ny_starts, int val) {
-  int i;
-  for (i=0; i<ny_starts; ++i) {
-    //Rprintf("max_i_match: y[y_starts[%d]-1] >= val: %d >= %d\n", i, y[y_starts[i]-1], val);
-    if (y[y_starts[i]-1]==val) return(i);
-    else if (y[y_starts[i]-1]>val) return(i-1);
-  }
-  return(i-1);
-}
-static int min_i_match(int *y, int *y_starts, int ny_starts, int val) {
-  int i;
-  for (i=ny_starts-1; i>=0; --i) {
-    //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val);
-    if (y[y_starts[i]-1]==val) return(i);
-    else if (y[y_starts[i]-1]<val) return(i+1);
-  }
-  return(i+1);
-}
-
 // helper to count how many threads were used
 int unqN(int *x, int nx) { // x have 0:(nx-1) values
   int ans = 0;
@@ -69,9 +49,84 @@ void smjoin1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, int *x, int *
   return;
 }
 
+// those helpers need to be binary search! last matching, and first matching index
+static int max_i_match(int *y, int *y_starts, int ny_starts, int val) {
+  int i;
+  for (i=0; i<ny_starts; ++i) {
+    //Rprintf("max_i_match: y[y_starts[%d]-1] >= val: %d >= %d\n", i, y[y_starts[i]-1], val);
+    if (y[y_starts[i]-1]==val) return(i);
+    else if (y[y_starts[i]-1]>val) return(i-1);
+  }
+  return(i-1);
+}
+static int min_i_match(int *y, int *y_starts, int ny_starts, int val) {
+  int i;
+  for (i=ny_starts-1; i>=0; --i) {
+    //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val);
+    if (y[y_starts[i]-1]==val) return(i);
+    else if (y[y_starts[i]-1]<val) return(i+1);
+  }
+  return(i+1);
+}
+/*
+ * 'rolling nearest' binary search
+ * used to find 'y' lower and upper bounds for each batch
+ * side -1: lower bound; side 1: upper bound
+ */
+static int rollbs(int *x, int *ix, int nix, int val, int side) {
+  bool verbose = false;
+  if (verbose) Rprintf("rollbs: side=%d; val=%d\n", side, val);
+  if (side < 0) { // lower bound early stopping
+    if (x[ix[0]-1] == val) {
+      if (verbose) Rprintf("rollbs: min elements %d match\n", val);
+      return 0;
+    }
+    if (x[ix[nix-1]-1] < val) {
+      if (verbose) Rprintf("rollbs: max element %d is still smaller than %d\n", x[ix[nix-1]-1], val);
+      return -1;
+    }
+    if (x[ix[0]-1] > val) {
+      if (verbose) Rprintf("rollbs: min element %d is bigger than %d\n", x[ix[0]-1], val);
+      return 0;
+    }
+  } else if (side > 0) { // upper bound early stopping
+    if (x[ix[nix-1]-1] == val) {
+      if (verbose) Rprintf("rollbs: max elements %d match\n", val);
+      return nix-1;
+    }
+    if (x[ix[0]-1] > val) {
+      if (verbose) Rprintf("rollbs: min element %d is still bigger than %d\n", x[ix[0]-1], val);
+      return -1;
+    }
+    if (x[ix[nix-1]-1] < val) {
+      if (verbose) Rprintf("rollbs: max element %d is smaller than %d\n", x[ix[nix-1]-1], val);
+      return nix-1;
+    }
+  }
+  int lower=0, upper=nix, i=0;
+  while (lower<=upper) {
+    i = lower + (upper-lower)/2;
+    int thisx = x[ix[i]-1];
+    //if (verbose) Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val);
+    if (thisx==val)
+      return(i);
+    else if (thisx < val)
+      lower = i+1;
+    else if (thisx > val)
+      upper = i-1;
+  }
+  if (verbose) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d\n", i, x[ix[i]-1], lower, upper, side);
+  if (side < 0) // anyone to stress test this logic?
+    return lower;
+  else
+    return upper;
+}
+
 void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
              int *y, int ny, int *y_starts, int ny_starts,
              int *matchn, int *starts_y, int *lens_y) {
+
+  bool verbose = true;
   bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
   int *x_lens=0, *y_lens=0;
 
@@ -87,25 +142,45 @@ void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
   }
   t_lens = omp_get_wtime() - t_lens;
 
-  double t_batch = omp_get_wtime();
+  double t_batch = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs!
   int nth = getDTthreads();
-  int nBatch = nth*2;
+  int nBatch = 0; // for a single threaded or small unq count use single batch
+  if (nth == 1 || nx_starts < 4) {
+    nBatch = 1;
+  } else {
+    nBatch = nth * 2;
+  }
   int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used
   size_t batchSize = (nx_starts-1)/nBatch + 1;
   size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
-  //Rprintf("batchSize=%d; lastBatchSize=%d\n", batchSize, lastBatchSize);
+  //Rprintf("nBatch=%d; batchSize=%d; lastBatchSize=%d\n", nBatch, batchSize, lastBatchSize);
   int *nxsB = (int *)R_alloc(nBatch, sizeof(int)), **xsB = (int **)R_alloc(nBatch, sizeof(int*)), **xlB = (int **)R_alloc(nBatch, sizeof(int*));
   int *nysB = (int *)R_alloc(nBatch, sizeof(int)), **ysB = (int **)R_alloc(nBatch, sizeof(int*)), **ylB = (int **)R_alloc(nBatch, sizeof(int*));
+  bool do_not_set_to_false = false; int y_min2, y_max2;
+  //#pragma omp parallel for schedule(dynamic) num_threads(nth)
   for (int b=0; b<nBatch-1; ++b) {
     nxsB[b] = batchSize; xsB[b] = x_starts + b*batchSize; xlB[b] = x_lens + b*batchSize;
-    int y_min = min_i_match(y, y_starts, ny_starts, x[xsB[b][0]-1]);
-    int y_max = max_i_match(y, y_starts, ny_starts, x[xsB[b][nxsB[b]-1]-1]);
+    if (do_not_set_to_false) y_min2 = min_i_match(y, y_starts, ny_starts, x[xsB[b][0]-1]);
+    if (do_not_set_to_false) y_max2 = max_i_match(y, y_starts, ny_starts, x[xsB[b][nxsB[b]-1]-1]);
+    //Rprintf("nxsB[b]=%d; y_min=%d; y_max=%d\n", nxsB[b], y_min, y_max);
+    int y_min = rollbs(y, y_starts, ny_starts, x[xsB[b][0]-1], -1);
+    int y_max = rollbs(y, y_starts, ny_starts, x[xsB[b][nxsB[b]-1]-1], 1);
+    //Rprintf("nxsB[b]=%d; y_mn2=%d; y_mx2=%d\n", nxsB[b], y_min2, y_max2);
+    if (do_not_set_to_false && y_min!=y_min2) error("y_min different: %d vs %d", y_min2, y_min);
+    if (do_not_set_to_false && y_max!=y_max2) error("y_max different: %d vs %d", y_max2, y_max);
     nysB[b] = y_max - y_min + 1; ysB[b] = y_starts + y_min; ylB[b] = y_lens + y_min;
   }
-  nxsB[nBatch-1] = lastBatchSize; xsB[nBatch-1] = x_starts + nx_starts-lastBatchSize; xlB[nBatch-1] = x_lens + nx_starts-lastBatchSize;
-  int y_min = min_i_match(y, y_starts, ny_starts, x[xsB[nBatch-1][0]-1]);
-  int y_max = max_i_match(y, y_starts, ny_starts, x[xsB[nBatch-1][nxsB[nBatch-1]-1]-1]);
-  nysB[nBatch-1] = y_max - y_min + 1; ysB[nBatch-1] = y_starts + y_min; ylB[nBatch-1] = y_lens + y_min;
+  int b = nBatch-1;
+  nxsB[b] = lastBatchSize; xsB[b] = x_starts + nx_starts-lastBatchSize; xlB[b] = x_lens + nx_starts-lastBatchSize;
+  if (do_not_set_to_false) y_min2 = min_i_match(y, y_starts, ny_starts, x[xsB[b][0]-1]);
+  if (do_not_set_to_false) y_max2 = max_i_match(y, y_starts, ny_starts, x[xsB[b][nxsB[b]-1]-1]);
+  //Rprintf("nxsB[b]=%d; y_min=%d; y_max=%d\n", nxsB[b], y_min, y_max);
+  int y_min = rollbs(y, y_starts, ny_starts, x[xsB[b][0]-1], -1);
+  int y_max = rollbs(y, y_starts, ny_starts, x[xsB[b][nxsB[b]-1]-1], 1);
+  //Rprintf("nxsB[b]=%d; y_mn2=%d; y_mx2=%d\n", nxsB[b], y_min2, y_max2);
+  if (do_not_set_to_false && y_min!=y_min2) error("y_min different: %d vs %d", y_min2, y_min);
+  if (do_not_set_to_false && y_max!=y_max2) error("y_max different: %d vs %d", y_max2, y_max);
+  nysB[b] = y_max - y_min + 1; ysB[b] = y_starts + y_min; ylB[b] = y_lens + y_min;
   //for (int b=0; b<nBatch; ++b) Rprintf("b=%d:\n  nx=%d; xs[0]=%d; xl[0]=%d; x[xs[0]-1]=%d\n  ny=%d; ys[0]=%d; yl[0]=%d; y[ys[0]-1]=%d\n", b, nxsB[b], xsB[b][0], xlB[b][0], x[xsB[b][0]-1], nysB[b], ysB[b][0], ylB[b][0], y[ysB[b][0]-1]);
   t_batch = omp_get_wtime() - t_batch;
 
@@ -122,7 +197,8 @@ void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
   t_join = omp_get_wtime() - t_join;
   //for (int b=0; b<nBatch; ++b) Rprintf("th[%d]=%d\n", th[b]);
 
-  Rprintf("smjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %dth\n", t_lens, t_batch, t_join, unqN(th, nBatch));
+  if (verbose)
+    Rprintf("smjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %dth in %d batch\n", t_lens, t_batch, t_join, unqN(th, nBatch), nBatch);
   matchn[0] = nx;
 }
 
@@ -153,6 +229,7 @@ SEXP joinOut(int n, int *starts, int *lens, bool x_ord, SEXP out_starts, SEXP ou
 
 SEXP smjoinR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) {
 
+  bool verbose = true;
   if (!isInteger(x) || !isInteger(y))
     error("must be integer");
   int protecti=0, nx = LENGTH(x), ny = LENGTH(y);
@@ -216,7 +293,8 @@ SEXP smjoinR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) {
   SEXP ans = joinOut(matchn, starts, lens, x_ord, out_starts, out_lens, xoop); // usually also sorts inside
   t_ans = omp_get_wtime() - t_ans;
 
-  Rprintf("smjoinR: index %.3fs; sort %.3fs; alloc %.3fs; smjoinC %.3fs; ans %.3fs\n", t_index, t_sort, t_alloc, t_smjoin, t_ans);
+  if (verbose)
+    Rprintf("smjoinR: index %.3fs; sort %.3fs; alloc %.3fs; smjoinC %.3fs; ans %.3fs\n", t_index, t_sort, t_alloc, t_smjoin, t_ans);
   UNPROTECT(protecti);
   return ans;
 }

From 86a4bcceea7a1e1c688258289d671a56b0f53330 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sun, 7 Jun 2020 01:43:10 +0100
Subject: [PATCH 12/34] rename vars

---
 src/smjoin.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/smjoin.c b/src/smjoin.c
index ca80067bdd..d768659ce5 100644
--- a/src/smjoin.c
+++ b/src/smjoin.c
@@ -30,16 +30,16 @@ void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395
 }
 
 // workhorse join that runs in parallel on batches
-void smjoin1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, int *x, int *y, int *starts_y, int *lens_y) {
+void smjoin1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, int *x, int *y, int *starts, int *lens) {
   int i=0, j=0, is=0, js=0, x_i, y_j;
   while (is<nxs && js<nys) { //Rprintf("============= is=%d\n", is);
     i = xs[is]-1, j = ys[js]-1;
     x_i = x[i], y_j = y[j];
     if (x_i == y_j) { //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
       int j1 = j+1; //Rprintf("x_lens[is]=x_lens[%d]=%d\n",is,xl[is]);
-      for (int ii=0; ii<xl[is]; ++ii) { //Rprintf("starts_y[%d]=%d\n", i+ii, j1);
-        starts_y[i+ii] = j1;
-        lens_y[i+ii] = yl[js];
+      for (int ii=0; ii<xl[is]; ++ii) { //Rprintf("starts[%d]=%d\n", i+ii, j1);
+        starts[i+ii] = j1;
+        lens[i+ii] = yl[js];
       }
       is++;
     }
@@ -124,7 +124,7 @@ static int rollbs(int *x, int *ix, int nix, int val, int side) {
 
 void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
              int *y, int ny, int *y_starts, int ny_starts,
-             int *matchn, int *starts_y, int *lens_y) {
+             int *matchn, int *starts, int *lens) {
 
   bool verbose = true;
   bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
@@ -145,7 +145,7 @@ void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
   double t_batch = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs!
   int nth = getDTthreads();
   int nBatch = 0; // for a single threaded or small unq count use single batch
-  if (nth == 1 || nx_starts < 4) {
+  if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024
     nBatch = 1;
   } else {
     nBatch = nth * 2;
@@ -190,7 +190,7 @@ void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
     smjoin1(
       xsB[b], xlB[b], nxsB[b], ysB[b], ylB[b], nysB[b], // batch specific input: shifted pointers and their sizes
       x, y,                                             // common input
-      starts_y, lens_y                                  // common output
+      starts, lens                                      // common output
     );
     th[b] = omp_get_thread_num();
   }

From 36b6762121fae8db9b3a3fb1da480cdccd94f6cb Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Mon, 8 Jun 2020 00:40:47 +0100
Subject: [PATCH 13/34] getting closer

---
 R/bmerge.R                 |  15 +-
 R/wrappers.R               |   6 +-
 inst/tests/smerge.Rraw     | 129 +++++++++++++++++
 inst/tests/smjoin.Rraw     | 109 ---------------
 src/bmerge.c               |   6 +-
 src/data.table.h           |   7 +-
 src/init.c                 |  14 +-
 src/{smjoin.c => smerge.c} | 280 +++++++++++++++++++++++++++----------
 8 files changed, 369 insertions(+), 197 deletions(-)
 create mode 100644 inst/tests/smerge.Rraw
 delete mode 100644 inst/tests/smjoin.Rraw
 rename src/{smjoin.c => smerge.c} (51%)

diff --git a/R/bmerge.R b/R/bmerge.R
index 3d6ab028f3..1cf98bcfa8 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -1,6 +1,19 @@
 
 bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
 {
+  if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer
+      && isTRUE(getOption("datatable.smerge"))                     ## enable option
+      && identical(nomatch, NA_integer_)                           ## for now only outer join
+      && identical(mult, "all")                                    ## for now only mult='all'
+      && identical(ops, 1L)                                        ## equi join
+      && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join
+      ) {
+    if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()}
+    ans = smerge(x=i[[icols]], y=x[[xcols]])
+    if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()}
+    ans$xo = c(ans$xo) ## drop starts and maxgrpn
+    return(ans[c("starts","lens","indices","allLen1","allGrp1","xo")])
+  }
   callersi = i
   i = shallow(i)
   # Just before the call to bmerge() in [.data.table there is a shallow() copy of i to prevent coercions here
@@ -180,9 +193,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
   if (verbose) {last.started.at=proc.time();cat("Starting bmerge ...\n");flush.console()}
   ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp)
   if (verbose) {cat("bmerge done in",timetaken(last.started.at),"\n"); flush.console()}
-  # TO DO: xo could be moved inside Cbmerge
 
-  ans$xo = xo  # for further use by [.data.table
   return(ans)
 }
 
diff --git a/R/wrappers.R b/R/wrappers.R
index ba43d8cba2..06a25924b7 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -13,8 +13,4 @@ coerceFill = function(x) .Call(CcoerceFillR, x)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
-smjoin = function(x, y, x.idx=NULL, y.idx=NULL) {
-  ans = .Call(CsmjoinR, x, y, x.idx, y.idx)
-  ans[["lens"]][is.na(ans[["starts"]])] = 1L
-  ans
-}
+smerge = function(x, y, x.idx=NULL, y.idx=NULL) .Call(CsmergeR, x, y, x.idx, y.idx)
diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw
new file mode 100644
index 0000000000..f0f4530a3f
--- /dev/null
+++ b/inst/tests/smerge.Rraw
@@ -0,0 +1,129 @@
+require(methods)
+
+if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
+  if ((tt<-compiler::enableJIT(-1))>0)
+    cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="")
+} else {
+  require(data.table)
+  test = data.table:::test
+  smerge = data.table:::smerge
+  bmerge = data.table:::bmerge
+  forderv = data.table:::forderv
+}
+
+bm = function(x, y) {
+  ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)
+  ## if undefining SMERGE_ALL
+  #ans[c("starts","lens")]
+  ans
+}
+sm = function(x, y) {
+  ans = smerge(x, y)
+  ans = ans[c("starts","lens","indices","allLen1","allGrp1","xo")] ## drop extra stats
+  ans[["xo"]] = c(ans[["xo"]])                                     ## removes starts and maxgrpn, and possibly others in future
+  ## if undefining SMERGE_ALL
+  #ans[c("starts","lens")]
+  ans
+}
+
+## x y sorted
+x = c(1L,2L,3L,4L)
+y = c(2L,3L,5L)
+test(1.01, sm(x, y), bm(x, y))
+x = c(1L,2L,3L,3L,4L)
+y = c(2L,3L,5L)
+test(1.02, sm(x, y), bm(x, y))
+x = c(1L,2L,3L,4L)
+y = c(2L,3L,3L,5L)
+test(1.03, sm(x, y), bm(x, y))
+x = c(1L,2L,3L,3L,4L)
+y = c(2L,3L,3L,5L)
+test(1.04, sm(x, y), bm(x, y))
+
+## y unsorted
+x = c(1L,2L,3L,4L)
+y = c(2L,5L,3L)
+test(2.01, sm(x, y), bm(x, y))
+x = c(1L,2L,3L,3L,4L)
+y = c(3L,2L,5L)
+test(2.02, sm(x, y), bm(x, y))
+x = c(1L,2L,3L,4L)
+y = c(5L,3L,2L,3L)
+test(2.03, sm(x, y), bm(x, y))
+x = c(1L,2L,3L,3L,4L)
+y = c(5L,3L,3L,2L)
+test(2.04, sm(x, y), bm(x, y))
+
+#...
+
+## xy unsorted
+x = c(4L,1L,3L,2L)
+y = c(2L,5L,3L)
+test(4.01, sm(x, y), bm(x, y))
+
+#...
+
+## xy sorted
+ssa = function(unq_n, size, sort=FALSE) {
+  stopifnot(unq_n <= size)
+  unq_sub = seq_len(unq_n)
+  ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE)))
+  if (unq_n==size) stopifnot(uniqueN(ans)==length(ans))
+  if (sort) sort(ans) else ans
+}
+N = 1e2L
+x = ssa(N, N, sort=TRUE)
+y = ssa(N, N, sort=TRUE)
+test(11.01, sm(x, y), bm(x, y))
+x = ssa(N, N*1.1, sort=TRUE)
+y = ssa(N, N, sort=TRUE)
+test(11.02, sm(x, y), bm(x, y))
+x = ssa(N, N, sort=TRUE)
+y = ssa(N, N*1.1, sort=TRUE)
+test(11.03, sm(x, y), bm(x, y))
+x = ssa(N, N*1.1, sort=TRUE)
+y = ssa(N, N*1.1, sort=TRUE)
+test(11.04, sm(x, y), bm(x, y))
+
+## y unsorted
+x = ssa(N, N, sort=TRUE)
+y = ssa(N, N)
+test(12.01, sm(x, y), bm(x, y))
+x = ssa(N, N*1.1, sort=TRUE)
+y = ssa(N, N)
+test(12.02, sm(x, y), bm(x, y))
+x = ssa(N, N, sort=TRUE)
+y = ssa(N, N*1.1)
+test(12.03, sm(x, y), bm(x, y))
+x = ssa(N, N*1.1, sort=TRUE)
+y = ssa(N, N*1.1)
+test(12.04, sm(x, y), bm(x, y))
+
+## xy unsorted
+x = ssa(N, N)
+y = ssa(N, N)
+test(14.01, sm(x, y), bm(x, y))
+x = ssa(N, N*1.1)
+y = ssa(N, N)
+test(14.02, sm(x, y), bm(x, y))
+x = ssa(N, N)
+y = ssa(N, N*1.1)
+test(14.03, sm(x, y), bm(x, y))
+x = ssa(N, N*1.1)
+y = ssa(N, N*1.1)
+test(14.04, sm(x, y), bm(x, y))
+
+# sparse
+x = sample(2e2L, 1e2L) ## may fail when no matches!
+y = sample(2e2L, 1e2L)
+test(21.01, sm(x, y), bm(x, y))
+x = sample(2e2L, 1e2L, TRUE)
+y = sample(2e2L, 1e2L, TRUE)
+test(21.02, sm(x, y), bm(x, y))
+
+d1 = data.table(x=x, v1=seq_along(x))
+d2 = data.table(y=y, v2=seq_along(y))
+options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #4491
+test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge")
+options(datatable.smerge=TRUE)
+test(101.02, d1[d2, on="x==y"], expected, output="smerge", notOutput="bmerge")
diff --git a/inst/tests/smjoin.Rraw b/inst/tests/smjoin.Rraw
deleted file mode 100644
index 48a479463c..0000000000
--- a/inst/tests/smjoin.Rraw
+++ /dev/null
@@ -1,109 +0,0 @@
-require(methods)
-
-if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
-  if ((tt<-compiler::enableJIT(-1))>0)
-    cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="")
-} else {
-  require(data.table)
-  test = data.table:::test
-  smjoin = data.table:::smjoin
-  bmerge = data.table:::bmerge
-  forderv = data.table:::forderv
-}
-
-d = function(x, y) bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)[1:2]
-
-## x y sorted
-x = c(1L,2L,3L,4L)
-y = c(2L,3L,5L)
-test(1.01, smjoin(x, y), d(x, y))
-x = c(1L,2L,3L,3L,4L)
-y = c(2L,3L,5L)
-test(1.02, smjoin(x, y), d(x, y))
-x = c(1L,2L,3L,4L)
-y = c(2L,3L,3L,5L)
-test(1.03, smjoin(x, y), d(x, y))
-x = c(1L,2L,3L,3L,4L)
-y = c(2L,3L,3L,5L)
-test(1.04, smjoin(x, y), d(x, y))
-
-## y unsorted
-x = c(1L,2L,3L,4L)
-y = c(2L,5L,3L)
-test(2.01, smjoin(x, y), d(x, y))
-x = c(1L,2L,3L,3L,4L)
-y = c(3L,2L,5L)
-test(2.02, smjoin(x, y), d(x, y))
-x = c(1L,2L,3L,4L)
-y = c(5L,3L,2L,3L)
-test(2.03, smjoin(x, y), d(x, y))
-x = c(1L,2L,3L,3L,4L)
-y = c(5L,3L,3L,2L)
-test(2.04, smjoin(x, y), d(x, y))
-
-#...
-
-## xy unsorted
-x = c(4L,1L,3L,2L)
-y = c(2L,5L,3L)
-test(4.01, smjoin(x, y), d(x, y))
-
-#...
-
-## xy sorted
-ssa = function(unq_n, size, sort=FALSE) {
-  stopifnot(unq_n <= size)
-  unq_sub = seq_len(unq_n)
-  ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE)))
-  if (unq_n==size) stopifnot(uniqueN(ans)==length(ans))
-  if (sort) sort(ans) else ans
-}
-N = 1e2L
-x = ssa(N, N, sort=TRUE)
-y = ssa(N, N, sort=TRUE)
-test(11.01, smjoin(x, y), d(x, y))
-x = ssa(N, N*1.1, sort=TRUE)
-y = ssa(N, N, sort=TRUE)
-test(11.02, smjoin(x, y), d(x, y))
-x = ssa(N, N, sort=TRUE)
-y = ssa(N, N*1.1, sort=TRUE)
-test(11.03, smjoin(x, y), d(x, y))
-x = ssa(N, N*1.1, sort=TRUE)
-y = ssa(N, N*1.1, sort=TRUE)
-test(11.04, smjoin(x, y), d(x, y))
-
-## y unsorted
-x = ssa(N, N, sort=TRUE)
-y = ssa(N, N)
-test(12.01, smjoin(x, y), d(x, y))
-x = ssa(N, N*1.1, sort=TRUE)
-y = ssa(N, N)
-test(12.02, smjoin(x, y), d(x, y))
-x = ssa(N, N, sort=TRUE)
-y = ssa(N, N*1.1)
-test(12.03, smjoin(x, y), d(x, y))
-x = ssa(N, N*1.1, sort=TRUE)
-y = ssa(N, N*1.1)
-test(12.04, smjoin(x, y), d(x, y))
-
-## xy unsorted
-x = ssa(N, N)
-y = ssa(N, N)
-test(14.01, smjoin(x, y), d(x, y))
-x = ssa(N, N*1.1)
-y = ssa(N, N)
-test(14.02, smjoin(x, y), d(x, y))
-x = ssa(N, N)
-y = ssa(N, N*1.1)
-test(14.03, smjoin(x, y), d(x, y))
-x = ssa(N, N*1.1)
-y = ssa(N, N*1.1)
-test(14.04, smjoin(x, y), d(x, y))
-
-# sparse
-x = sample(2e2L, 1e2L)
-y = sample(2e2L, 1e2L)
-test(21.01, smjoin(x, y), d(x, y))
-x = sample(2e2L, 1e2L, TRUE)
-y = sample(2e2L, 1e2L, TRUE)
-test(21.02, smjoin(x, y), d(x, y))
diff --git a/src/bmerge.c b/src/bmerge.c
index 5273ae59b9..cd6d2b307d 100644
--- a/src/bmerge.c
+++ b/src/bmerge.c
@@ -175,18 +175,20 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE
     memcpy(INTEGER(retLengthArg), retLength, sizeof(int)*ctr);
     memcpy(INTEGER(retIndexArg), retIndex, sizeof(int)*ctr);
   }
-  SEXP ans = PROTECT(allocVector(VECSXP, 5)); protecti++;
-  SEXP ansnames = PROTECT(allocVector(STRSXP, 5)); protecti++;
+  SEXP ans = PROTECT(allocVector(VECSXP, 6)); protecti++;
+  SEXP ansnames = PROTECT(allocVector(STRSXP, 6)); protecti++;
   SET_VECTOR_ELT(ans, 0, retFirstArg);
   SET_VECTOR_ELT(ans, 1, retLengthArg);
   SET_VECTOR_ELT(ans, 2, retIndexArg);
   SET_VECTOR_ELT(ans, 3, allLen1Arg);
   SET_VECTOR_ELT(ans, 4, allGrp1Arg);
+  SET_VECTOR_ELT(ans, 5, xoArg);
   SET_STRING_ELT(ansnames, 0, char_starts);  // changed from mkChar to char_ to pass the grep in CRAN_Release.cmd
   SET_STRING_ELT(ansnames, 1, char_lens);
   SET_STRING_ELT(ansnames, 2, char_indices);
   SET_STRING_ELT(ansnames, 3, char_allLen1);
   SET_STRING_ELT(ansnames, 4, char_allGrp1);
+  SET_STRING_ELT(ansnames, 5, char_xo);
   setAttrib(ans, R_NamesSymbol, ansnames);
   if (nqmaxgrp > 1 && mult == ALL) {
     Free(retFirst);
diff --git a/src/data.table.h b/src/data.table.h
index 74d101429d..0a5fc02ebd 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -79,6 +79,11 @@ extern SEXP char_lens;
 extern SEXP char_indices;
 extern SEXP char_allLen1;
 extern SEXP char_allGrp1;
+extern SEXP char_xo;
+extern SEXP char_io;
+extern SEXP char_lhsLen1;
+extern SEXP char_xyLen1;
+extern SEXP char_nMatch;
 extern SEXP char_factor;
 extern SEXP char_ordered;
 extern SEXP char_datatable;
@@ -245,4 +250,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
 
 // smjoin.c
-SEXP smjoinR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx);
+SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx);
diff --git a/src/init.c b/src/init.c
index 731f0b765b..a47cfd99c7 100644
--- a/src/init.c
+++ b/src/init.c
@@ -15,6 +15,11 @@ SEXP char_lens;
 SEXP char_indices;
 SEXP char_allLen1;
 SEXP char_allGrp1;
+SEXP char_xo;
+SEXP char_io;
+SEXP char_lhsLen1;
+SEXP char_xyLen1;
+SEXP char_nMatch;
 SEXP char_factor;
 SEXP char_ordered;
 SEXP char_datatable;
@@ -119,7 +124,7 @@ SEXP lock();
 SEXP unlock();
 SEXP islockedR();
 SEXP allNAR();
-SEXP smjoinR();
+SEXP smergeR();
 
 // .Externals
 SEXP fastmean();
@@ -212,7 +217,7 @@ R_CallMethodDef callMethods[] = {
 {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1},
 {"CtestMsgR", (DL_FUNC) &testMsgR, -1},
 {"C_allNAR", (DL_FUNC) &allNAR, -1},
-{"CsmjoinR", (DL_FUNC) &smjoinR, -1},
+{"CsmergeR", (DL_FUNC) &smergeR, -1},
 {NULL, NULL, 0}
 };
 
@@ -319,6 +324,11 @@ void attribute_visible R_init_datatable(DllInfo *info)
   char_indices =   PRINTNAME(install("indices"));
   char_allLen1 =   PRINTNAME(install("allLen1"));
   char_allGrp1 =   PRINTNAME(install("allGrp1"));
+  char_xo =        PRINTNAME(install("xo"));
+  char_io =        PRINTNAME(install("io"));
+  char_lhsLen1 =   PRINTNAME(install("lhsLen1"));
+  char_xyLen1 =    PRINTNAME(install("xyLen1"));
+  char_nMatch =    PRINTNAME(install("nMatch"));
   char_factor =    PRINTNAME(install("factor"));
   char_ordered =   PRINTNAME(install("ordered"));
   char_datatable = PRINTNAME(install("data.table"));
diff --git a/src/smjoin.c b/src/smerge.c
similarity index 51%
rename from src/smjoin.c
rename to src/smerge.c
index d768659ce5..25e2e455b8 100644
--- a/src/smjoin.c
+++ b/src/smerge.c
@@ -2,12 +2,19 @@
 
 /*
  * sort-merge join
- * 
+ *
  * join on a single integer column
  * sort LHS and RHS
  * split LHS into equal batches, RHS into corresponding batches by matching upper-lower bounds of LHS batches
  * parallel sort-merge join
+ *
+ * for a maximum speed collecting of following statistics can be disabled by undefine SMERGE_ALL
+ * y_len1/y_lens1/allLen1:       signals if there where multiple matches in RHS of join, ['s _x_ table
+ * x_len1/x_lens1/lhsLen1:       signals if multiple matches in LHS of join, ['s _i_ table
+ * xy_len1/xy_lens1/xlLen1:      signals if "many to many" matches between LHS and RHS
+ * cnt/nmatch/n_match/n_matchr:  count of matches, taking multiple matches into account, uint64_t
  */
+#define SMERGE_ALL
 
 // helper to count how many threads were used
 int unqN(int *x, int nx) { // x have 0:(nx-1) values
@@ -30,26 +37,105 @@ void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395
 }
 
 // workhorse join that runs in parallel on batches
-void smjoin1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys, int *x, int *y, int *starts, int *lens) {
+void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys,
+             int *x, bool unq_x, int *y, bool unq_y,
+             int *starts, int *lens,
+             uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) {
   int i=0, j=0, is=0, js=0, x_i, y_j;
-  while (is<nxs && js<nys) { //Rprintf("============= is=%d\n", is);
-    i = xs[is]-1, j = ys[js]-1;
-    x_i = x[i], y_j = y[j];
-    if (x_i == y_j) { //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
-      int j1 = j+1; //Rprintf("x_lens[is]=x_lens[%d]=%d\n",is,xl[is]);
-      for (int ii=0; ii<xl[is]; ++ii) { //Rprintf("starts[%d]=%d\n", i+ii, j1);
-        starts[i+ii] = j1;
-        lens[i+ii] = yl[js];
+  uint64_t cnt=0;
+  bool xlen1 = true, ylen1 = true, xylen1 = true;
+  if (unq_x && unq_y) { //Rprintf("smerge1: unq_x && unq_y\n");
+    while (i<nxs && j<nys) {
+      x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        starts[i] = j+1;
+        lens[i] = 1;
+        i++;
+#ifdef SMERGE_ALL
+        cnt++;
+#endif
+      } else if (x_i < y_j) {
+        i++;
+      } else if (x_i > y_j) {
+        j++;
+      }
+    }
+  } else if (unq_x) { //Rprintf("smerge1: unq_x && !unq_y\n");
+    while (i<nxs && js<nys) {
+      j = ys[js]-1;
+      x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        starts[i] = j+1;
+        int yl1 = yl[js];
+        lens[i] = yl1;
+        i++;
+#ifdef SMERGE_ALL
+        if (ylen1 && yl1>1)
+          ylen1 = false;
+        cnt += (uint64_t)yl1;
+#endif
+      } else if (x_i < y_j) {
+        i++;
+      } else if (x_i > y_j) {
+        js++;
+      }
+    }
+  } else if (unq_y) { //Rprintf("smerge1: !unq_x && unq_y\n");
+    while (is<nxs && j<nys) {
+      i = xs[is]-1;
+      x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        int j1 = j+1;
+        int xl1 = xl[is];
+        for (int ii=0; ii<xl1; ++ii) {
+          starts[i+ii] = j1;
+          lens[i+ii] = 1;
+        }
+        is++;
+#ifdef SMERGE_ALL
+        if (xlen1 && xl1>1)
+          xlen1 = false;
+        cnt += (uint64_t)xl1;
+#endif
+      } else if (x_i < y_j) {
+        is++;
+      } else if (x_i > y_j) {
+        j++;
+      }
+    }
+  } else { //Rprintf("smerge1: !unq_x && !unq_y\n");
+    while (is<nxs && js<nys) { //Rprintf("============= is=%d\n", is);
+      i = xs[is]-1, j = ys[js]-1;
+      x_i = x[i], y_j = y[j];
+      if (x_i == y_j) { //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
+        int j1 = j+1, yl1 = yl[js]; //Rprintf("x_lens[is]=x_lens[%d]=%d\n",is,xl[is]);
+        int xl1 = xl[is];
+        for (int ii=0; ii<xl1; ++ii) { //Rprintf("starts[%d]=%d\n", i+ii, j1);
+          starts[i+ii] = j1;
+          lens[i+ii] = yl1;
+        }
+        is++;
+#ifdef SMERGE_ALL
+        if (xlen1 && xl1>1)
+          xlen1 = false;
+        if (ylen1 && yl1>1)
+          ylen1 = false;
+        if (xylen1 && xl1>1 && yl1>1)
+          xylen1 = false;
+        cnt += (uint64_t)xl1 * (uint64_t)yl1;
+#endif
+      } else if (x_i < y_j) {
+        is++;
+      } else if (x_i > y_j) {
+        js++;
       }
-      is++;
     }
-    else if (x_i < y_j) is++;
-    else if (x_i > y_j) js++;
   }
+  xlens1[0] = xlen1; ylens1[0] = ylen1; xylens1[0] = xylen1; nmatch[0] = cnt;
   return;
 }
 
-// those helpers need to be binary search! last matching, and first matching index
+// those two helpers function may be removed once we will confirm rollbs works as expected, they find last matching, and first matching index
 static int max_i_match(int *y, int *y_starts, int ny_starts, int val) {
   int i;
   for (i=0; i<ny_starts; ++i) {
@@ -74,7 +160,7 @@ static int min_i_match(int *y, int *y_starts, int ny_starts, int val) {
  * side -1: lower bound; side 1: upper bound
  */
 static int rollbs(int *x, int *ix, int nix, int val, int side) {
-  bool verbose = false;
+  bool verbose = false; // develd debug only
   if (verbose) Rprintf("rollbs: side=%d; val=%d\n", side, val);
   if (side < 0) { // lower bound early stopping
     if (x[ix[0]-1] == val) {
@@ -122,27 +208,30 @@ static int rollbs(int *x, int *ix, int nix, int val, int side) {
     return upper;
 }
 
-void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
+void smergeC(int *x, int nx, int *x_starts, int nx_starts,
              int *y, int ny, int *y_starts, int ny_starts,
-             int *matchn, int *starts, int *lens) {
+             int *starts, int *lens,
+             uint64_t *n_match, int *x_lens1, int *y_lens1, int *xy_lens1,
+             int verbose) {
 
-  bool verbose = true;
   bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
-  int *x_lens=0, *y_lens=0;
+  int *x_lens = 0, *y_lens = 0;
+  double t = 0;
 
-  double t_lens = omp_get_wtime();
-  if (!unq_x || true) { // else true as we dont yet have branches for unq cases
+  t = omp_get_wtime();
+  if (!unq_x) { // else true as we dont yet have branches for unq cases
     x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395
     grpLens(x_starts, nx_starts, nx, x_lens);
     //for (int i=0; i<nx_starts; ++i) Rprintf("x_lens[%d]=%d\n", i, x_lens[i]);
   }
-  if (!unq_y || true) {
+  if (!unq_y) {
     y_lens = (int *)R_alloc(ny_starts, sizeof(int));
     grpLens(y_starts, ny_starts, ny, y_lens);
   }
-  t_lens = omp_get_wtime() - t_lens;
+  if (verbose>0)
+    Rprintf("smergeC: grpLens took %.3fs\n", omp_get_wtime() - t);
 
-  double t_batch = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs!
+  t = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs! whole section of breaking batches should probably be isolated into own function
   int nth = getDTthreads();
   int nBatch = 0; // for a single threaded or small unq count use single batch
   if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024
@@ -156,7 +245,7 @@ void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
   //Rprintf("nBatch=%d; batchSize=%d; lastBatchSize=%d\n", nBatch, batchSize, lastBatchSize);
   int *nxsB = (int *)R_alloc(nBatch, sizeof(int)), **xsB = (int **)R_alloc(nBatch, sizeof(int*)), **xlB = (int **)R_alloc(nBatch, sizeof(int*));
   int *nysB = (int *)R_alloc(nBatch, sizeof(int)), **ysB = (int **)R_alloc(nBatch, sizeof(int*)), **ylB = (int **)R_alloc(nBatch, sizeof(int*));
-  bool do_not_set_to_false = false; int y_min2, y_max2;
+  bool do_not_set_to_false = true; int y_min2, y_max2;
   //#pragma omp parallel for schedule(dynamic) num_threads(nth)
   for (int b=0; b<nBatch-1; ++b) {
     nxsB[b] = batchSize; xsB[b] = x_starts + b*batchSize; xlB[b] = x_lens + b*batchSize;
@@ -182,24 +271,28 @@ void smjoinC(int *x, int nx, int *x_starts, int nx_starts,
   if (do_not_set_to_false && y_max!=y_max2) error("y_max different: %d vs %d", y_max2, y_max);
   nysB[b] = y_max - y_min + 1; ysB[b] = y_starts + y_min; ylB[b] = y_lens + y_min;
   //for (int b=0; b<nBatch; ++b) Rprintf("b=%d:\n  nx=%d; xs[0]=%d; xl[0]=%d; x[xs[0]-1]=%d\n  ny=%d; ys[0]=%d; yl[0]=%d; y[ys[0]-1]=%d\n", b, nxsB[b], xsB[b][0], xlB[b][0], x[xsB[b][0]-1], nysB[b], ysB[b][0], ylB[b][0], y[ysB[b][0]-1]);
-  t_batch = omp_get_wtime() - t_batch;
+  if (verbose>0)
+    Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t);
 
-  double t_join = omp_get_wtime();
-  #pragma omp parallel for schedule(dynamic) num_threads(nth)
+  t = omp_get_wtime();
+  bool xlens1 = true, ylens1 = true, xylens1 = true;
+  uint64_t nmatch = 0;
+  #pragma omp parallel for schedule(dynamic) reduction(&&:xlens1,ylens1,xylens1) reduction(+:nmatch) num_threads(nth)
   for (int b=0; b<nBatch; ++b) {
-    smjoin1(
+    smerge1(
       xsB[b], xlB[b], nxsB[b], ysB[b], ylB[b], nysB[b], // batch specific input: shifted pointers and their sizes
-      x, y,                                             // common input
-      starts, lens                                      // common output
+      x, unq_x, y, unq_y,                               // common input
+      starts, lens,                                     // common output
+      &nmatch, &xlens1, &ylens1, &xylens1               // common reduction
     );
     th[b] = omp_get_thread_num();
   }
-  t_join = omp_get_wtime() - t_join;
-  //for (int b=0; b<nBatch; ++b) Rprintf("th[%d]=%d\n", th[b]);
+  x_lens1[0] = (int)xlens1; y_lens1[0] = (int)ylens1; xy_lens1[0] = (int)xylens1; n_match[0] = nmatch;
+  //for (int b=0; b<nBatch; ++b) Rprintf("th[%d]=%d\n", b, th[b]); // threads utilization for each batch
+  if (verbose>0)
+    Rprintf("smergeC: %d calls to smerge1 using %d/%d threads took %.3fs\n", nBatch, unqN(th, nBatch), nth, omp_get_wtime() - t);
 
-  if (verbose)
-    Rprintf("smjoinC: lens %.3fs; batch %.3fs; join %.3fs; used %dth in %d batch\n", t_lens, t_batch, t_join, unqN(th, nBatch), nBatch);
-  matchn[0] = nx;
+  return;
 }
 
 void sortInt(int *x, int nx, int *idx, int *ans) {
@@ -209,61 +302,81 @@ void sortInt(int *x, int nx, int *idx, int *ans) {
   return;
 }
 
-SEXP joinOut(int n, int *starts, int *lens, bool x_ord, SEXP out_starts, SEXP out_lens, int *xoop) {
-  SEXP ans = PROTECT(allocVector(VECSXP, 2)), ansnames;
-  setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 2));
+// this produce output consistent to bmerge, note that bmerge i,x is smerge x,y
+SEXP joinOut(int n, int *starts, int *lens, bool x_ord,
+             SEXP out_starts, SEXP out_lens,
+             SEXP x_idx, SEXP y_idx,
+             SEXP n_match, SEXP x_lens1, SEXP y_lens1, SEXP xy_lens1) {
+  SEXP ans = PROTECT(allocVector(VECSXP, 10)), ansnames;
+  setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 10));
   SET_STRING_ELT(ansnames, 0, char_starts);
   SET_STRING_ELT(ansnames, 1, char_lens);
+  SET_STRING_ELT(ansnames, 2, char_indices); SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, 0)); // const for equi join
+  SET_STRING_ELT(ansnames, 3, char_allLen1); SET_VECTOR_ELT(ans, 3, y_lens1);
+  SET_STRING_ELT(ansnames, 4, char_allGrp1); SET_VECTOR_ELT(ans, 4, ScalarLogical(true));    // const for equi join
+  SET_STRING_ELT(ansnames, 5, char_xo);      SET_VECTOR_ELT(ans, 5, y_idx);
+  SET_STRING_ELT(ansnames, 6, char_io);      SET_VECTOR_ELT(ans, 6, x_idx);
+  SET_STRING_ELT(ansnames, 7, char_lhsLen1); SET_VECTOR_ELT(ans, 7, x_lens1);
+  SET_STRING_ELT(ansnames, 8, char_xyLen1);  SET_VECTOR_ELT(ans, 8, xy_lens1);
+  SET_STRING_ELT(ansnames, 9, char_nMatch);  SET_VECTOR_ELT(ans, 9, n_match);
   if (x_ord) {
     SET_VECTOR_ELT(ans, 0, out_starts);
     SET_VECTOR_ELT(ans, 1, out_lens);
   } else {
     SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, n));
     SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, n));
-    sortInt(starts, n, xoop, INTEGER(VECTOR_ELT(ans, 0)));
-    sortInt(lens, n, xoop, INTEGER(VECTOR_ELT(ans, 1)));
+    SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); // verbose=verbose-2L after #4533
+    sortInt(starts, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 0)));
+    sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1)));
+    UNPROTECT(1);
   }
   UNPROTECT(1);
   return ans;
 }
 
-SEXP smjoinR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) {
+SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) {
 
-  bool verbose = true;
+  const int verbose = GetVerbose()*2; // remove *2 after #4491
   if (!isInteger(x) || !isInteger(y))
-    error("must be integer");
-  int protecti=0, nx = LENGTH(x), ny = LENGTH(y);
+    error("'x' and 'y' must be integer");
+  int protecti = 0, nx = LENGTH(x), ny = LENGTH(y);
+  double t = 0;
 
-  double t_index = omp_get_wtime();
-  if (isNull(x_idx)) {
-    x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
+  t = omp_get_wtime();
+  bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx);
+  if (do_x_idx) {
+    x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533
   }
-  if (isNull(y_idx)) {
-    y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
+  if (do_y_idx) {
+    y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533
   }
-  SEXP x_starts = getAttrib(x_idx, sym_starts);
-  SEXP y_starts = getAttrib(y_idx, sym_starts);
+  if (!isInteger(x_idx) || !isInteger(y_idx))
+    error("'x.idx' and 'y.idx' must be integer");
+  SEXP x_starts = getAttrib(x_idx, sym_starts); SEXP y_starts = getAttrib(y_idx, sym_starts);
   if (isNull(x_starts) || isNull(y_starts))
-    error("Indices provided to smjoin must carry 'starts' attribute");
-  t_index = omp_get_wtime() - t_index;
+    error("Indices provided to smerge must carry 'starts' attribute");
+  if (verbose>0)
+    Rprintf("smergeR: index took %.3fs\n", omp_get_wtime() - t);
 
-  double t_sort = omp_get_wtime();
+  t = omp_get_wtime();
   bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx);
-  SEXP xoo = R_NilValue; // order of an order of x
-  int *xp, *yp, *xoop=0;
+  int *xp, *yp;
   if (!x_ord) {
     xp = (int *)R_alloc(nx, sizeof(int));
     sortInt(INTEGER(x), nx, INTEGER(x_idx), xp);
-    xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++;
-    xoop = INTEGER(xoo);
-  } else xp = INTEGER(x);
+  } else {
+    xp = INTEGER(x);
+  }
   if (!y_ord) {
     yp = (int *)R_alloc(ny, sizeof(int));
     sortInt(INTEGER(y), ny, INTEGER(y_idx), yp);
-  } else yp = INTEGER(y);
-  t_sort = omp_get_wtime() - t_sort;
+  } else {
+    yp = INTEGER(y);
+  }
+  if (verbose>0)
+    Rprintf("smergeR: sort took %.3fs\n", omp_get_wtime() - t);
 
-  double t_alloc = omp_get_wtime();
+  t = omp_get_wtime();
   SEXP out_starts = R_NilValue, out_lens = R_NilValue;
   int *starts=0, *lens=0;
   if (x_ord) { // we dont need to reorder results so can save one allocation
@@ -276,25 +389,40 @@ SEXP smjoinR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) {
     lens = (int *)R_alloc(nx, sizeof(int));
   }
   #pragma omp parallel for schedule(static) num_threads(getDTthreads())
-  for (int i=0; i<nx; ++i)
-    starts[i] = lens[i] = NA_INTEGER;
-  t_alloc = omp_get_wtime() - t_alloc;
+  for (int i=0; i<nx; ++i) {
+    starts[i] = NA_INTEGER;
+    lens[i] = 1;
+  }
+  uint64_t n_match = 0;
+  SEXP x_lens1 = PROTECT(allocVector(LGLSXP, 1)); protecti++;
+  SEXP y_lens1 = PROTECT(allocVector(LGLSXP, 1)); protecti++;
+  SEXP xy_lens1 = PROTECT(allocVector(LGLSXP, 1)); protecti++;
+  SEXP n_matchr = PROTECT(allocVector(REALSXP, 1)); protecti++;
+  if (verbose>0)
+    Rprintf("smergeR: alloc of size %d took %.3fs\n", nx, omp_get_wtime() - t);
 
-  double t_smjoin = omp_get_wtime();
-  int matchn = 0; // not yet used
-  smjoinC(
+  t = omp_get_wtime();
+  smergeC(
     xp, nx, INTEGER(x_starts), LENGTH(x_starts),
     yp, ny, INTEGER(y_starts), LENGTH(y_starts),
-    &matchn, starts, lens
+    starts, lens,
+    &n_match, INTEGER(x_lens1), INTEGER(y_lens1), INTEGER(xy_lens1),
+    verbose-1
   );
-  t_smjoin = omp_get_wtime() - t_smjoin;
+  if (n_match > DBL_MAX) {
+    REAL(n_matchr)[0] = NA_REAL;
+    warning("count of matches exceeds DBL_MAX, returning NA in 'nMatch' field");
+  } else {
+    REAL(n_matchr)[0] = (double)n_match;
+  }
+  if (verbose>0)
+    Rprintf("smergeR: smergeC of %d x %d => %"PRIu64" took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t);
 
-  double t_ans = omp_get_wtime();
-  SEXP ans = joinOut(matchn, starts, lens, x_ord, out_starts, out_lens, xoop); // usually also sorts inside
-  t_ans = omp_get_wtime() - t_ans;
+  t = omp_get_wtime();
+  SEXP ans = joinOut(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1);
+  if (verbose>0)
+    Rprintf("smergeR: joinOut (unsort) took %.3fs\n", omp_get_wtime() - t);
 
-  if (verbose)
-    Rprintf("smjoinR: index %.3fs; sort %.3fs; alloc %.3fs; smjoinC %.3fs; ans %.3fs\n", t_index, t_sort, t_alloc, t_smjoin, t_ans);
   UNPROTECT(protecti);
   return ans;
 }

From b1f34634b12a51a8b8f915238f14ea9dc84877d3 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Mon, 8 Jun 2020 00:49:13 +0100
Subject: [PATCH 14/34] switch magic option

---
 src/smerge.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/smerge.c b/src/smerge.c
index 25e2e455b8..3e0213707a 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -245,7 +245,7 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts,
   //Rprintf("nBatch=%d; batchSize=%d; lastBatchSize=%d\n", nBatch, batchSize, lastBatchSize);
   int *nxsB = (int *)R_alloc(nBatch, sizeof(int)), **xsB = (int **)R_alloc(nBatch, sizeof(int*)), **xlB = (int **)R_alloc(nBatch, sizeof(int*));
   int *nysB = (int *)R_alloc(nBatch, sizeof(int)), **ysB = (int **)R_alloc(nBatch, sizeof(int*)), **ylB = (int **)R_alloc(nBatch, sizeof(int*));
-  bool do_not_set_to_false = true; int y_min2, y_max2;
+  bool do_not_set_to_false = false; int y_min2, y_max2;
   //#pragma omp parallel for schedule(dynamic) num_threads(nth)
   for (int b=0; b<nBatch-1; ++b) {
     nxsB[b] = batchSize; xsB[b] = x_starts + b*batchSize; xlB[b] = x_lens + b*batchSize;

From cba274ab8b31e8d855261c2f873e8ee572172cdf Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Mon, 8 Jun 2020 21:33:07 +0100
Subject: [PATCH 15/34] more robust, batching into new function and struct

---
 R/bmerge.R             |  10 +-
 R/wrappers.R           |   2 +-
 inst/tests/smerge.Rraw |  22 ++--
 src/data.table.h       |   2 +-
 src/smerge.c           | 271 +++++++++++++++++++++++------------------
 5 files changed, 178 insertions(+), 129 deletions(-)

diff --git a/R/bmerge.R b/R/bmerge.R
index 1cf98bcfa8..f9ab80cd26 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -8,11 +8,15 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
       && identical(ops, 1L)                                        ## equi join
       && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join
       ) {
+    getIdxGrp = function(x, cols) { ## get index only if retGrp=T
+      if (is.numeric(cols)) cols = names(x)[cols]
+      idx = attr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), exact=TRUE)
+      if (!is.null(attr(idx, "starts", exact=TRUE))) idx
+    }
     if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()}
-    ans = smerge(x=i[[icols]], y=x[[xcols]])
+    ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE)
     if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()}
-    ans$xo = c(ans$xo) ## drop starts and maxgrpn
-    return(ans[c("starts","lens","indices","allLen1","allGrp1","xo")])
+    return(ans)
   }
   callersi = i
   i = shallow(i)
diff --git a/R/wrappers.R b/R/wrappers.R
index 06a25924b7..83e3d73417 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -13,4 +13,4 @@ coerceFill = function(x) .Call(CcoerceFillR, x)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
-smerge = function(x, y, x.idx=NULL, y.idx=NULL) .Call(CsmergeR, x, y, x.idx, y.idx)
+smerge = function(x, y, x.idx=NULL, y.idx=NULL, out.bmerge=FALSE) .Call(CsmergeR, x, y, x.idx, y.idx, out.bmerge)
diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw
index f0f4530a3f..3d8cb27b79 100644
--- a/inst/tests/smerge.Rraw
+++ b/inst/tests/smerge.Rraw
@@ -12,17 +12,15 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
 }
 
 bm = function(x, y) {
+  stopifnot(is.integer(x), is.integer(y))
   ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)
-  ## if undefining SMERGE_ALL
-  #ans[c("starts","lens")]
+  ## if undefining SMERGE_STATS then we have to ignore allLen1 as well
   ans
 }
 sm = function(x, y) {
-  ans = smerge(x, y)
-  ans = ans[c("starts","lens","indices","allLen1","allGrp1","xo")] ## drop extra stats
-  ans[["xo"]] = c(ans[["xo"]])                                     ## removes starts and maxgrpn, and possibly others in future
-  ## if undefining SMERGE_ALL
-  #ans[c("starts","lens")]
+  stopifnot(is.integer(x), is.integer(y))
+  ans = smerge(x, y, out.bmerge=TRUE)
+  ## if undefining SMERGE_STATS then we have to ignore allLen1 as well
   ans
 }
 
@@ -127,3 +125,13 @@ options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #449
 test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge")
 options(datatable.smerge=TRUE)
 test(101.02, d1[d2, on="x==y"], expected, output="smerge", notOutput="bmerge")
+
+setindexv2 = function(x, cols) { ## pretend we are after #4386
+  stopifnot(is.data.table(x), is.character(cols))
+  if (is.null(attr(x, "index", TRUE))) setattr(x, "index", integer())
+  setattr(attr(x, "index", TRUE), paste0("__", cols, collapse="__"), forderv(x, cols, retGrp=TRUE))
+  invisible(x)
+}
+setindexv2(d1, "x")
+setindexv2(d2, "y")
+test(101.03, d1[d2, on="x==y"], expected, output="smerge", notOutput="bmerge")
diff --git a/src/data.table.h b/src/data.table.h
index 0a5fc02ebd..9ceb722cea 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -250,4 +250,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
 
 // smjoin.c
-SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx);
+SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge);
diff --git a/src/smerge.c b/src/smerge.c
index 3e0213707a..6dac785b2c 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -8,50 +8,42 @@
  * split LHS into equal batches, RHS into corresponding batches by matching upper-lower bounds of LHS batches
  * parallel sort-merge join
  *
- * for a maximum speed collecting of following statistics can be disabled by undefine SMERGE_ALL
+ * for a maximum speed collecting of following statistics can be disabled by undefine SMERGE_STATS
  * y_len1/y_lens1/allLen1:       signals if there where multiple matches in RHS of join, ['s _x_ table
  * x_len1/x_lens1/lhsLen1:       signals if multiple matches in LHS of join, ['s _i_ table
  * xy_len1/xy_lens1/xlLen1:      signals if "many to many" matches between LHS and RHS
  * cnt/nmatch/n_match/n_matchr:  count of matches, taking multiple matches into account, uint64_t
  */
-#define SMERGE_ALL
+#define SMERGE_STATS
 
-// helper to count how many threads were used
-int unqN(int *x, int nx) { // x have 0:(nx-1) values
-  int ans = 0;
-  uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t));
-  memset(seen, 0, nx*sizeof(uint8_t));
-  for (int i=0; i<nx; ++i) {
-    if (!seen[x[i]]++) ans++;
-  }
-  return ans;
-}
-
-// count of duplicated entries
-void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395
-  #pragma omp parallel for schedule(static) num_threads(getDTthreads())
-  for (int i=0; i<n_starts-1; ++i)
-    lens[i] = starts[i+1]-starts[i];
-  lens[n_starts-1] = n - starts[n_starts-1]+1;
-  return;
-}
+// shifted pointers so each batch knows where to start and end
+struct smBatch {
+  int *xs; // x's starts: indices on which unique values can be found
+  int *xl; // x's lens:   counts of each unique value
+  int nxs; // x's starts length
+  int *ys; // same for y
+  int *yl;
+  int nys;
+};
 
 // workhorse join that runs in parallel on batches
-void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys,
-             int *x, bool unq_x, int *y, bool unq_y,
-             int *starts, int *lens,
-             uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) {
+void smerge(struct smBatch *batch,
+            int *x, bool unq_x, int *y, bool unq_y,
+            int *starts, int *lens,
+            uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) {
+  int *xs=batch->xs, *xl=batch->xl, nxs=batch->nxs;
+  int *ys=batch->ys, *yl=batch->yl, nys=batch->nys;
   int i=0, j=0, is=0, js=0, x_i, y_j;
   uint64_t cnt=0;
   bool xlen1 = true, ylen1 = true, xylen1 = true;
-  if (unq_x && unq_y) { //Rprintf("smerge1: unq_x && unq_y\n");
+  if (unq_x && unq_y) { //Rprintf("smerge: unq_x && unq_y\n");
     while (i<nxs && j<nys) {
       x_i = x[i], y_j = y[j];
       if (x_i == y_j) {
         starts[i] = j+1;
         lens[i] = 1;
         i++;
-#ifdef SMERGE_ALL
+#ifdef SMERGE_STATS
         cnt++;
 #endif
       } else if (x_i < y_j) {
@@ -60,7 +52,7 @@ void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys,
         j++;
       }
     }
-  } else if (unq_x) { //Rprintf("smerge1: unq_x && !unq_y\n");
+  } else if (unq_x) { //Rprintf("smerge: unq_x && !unq_y\n");
     while (i<nxs && js<nys) {
       j = ys[js]-1;
       x_i = x[i], y_j = y[j];
@@ -69,7 +61,7 @@ void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys,
         int yl1 = yl[js];
         lens[i] = yl1;
         i++;
-#ifdef SMERGE_ALL
+#ifdef SMERGE_STATS
         if (ylen1 && yl1>1)
           ylen1 = false;
         cnt += (uint64_t)yl1;
@@ -80,7 +72,7 @@ void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys,
         js++;
       }
     }
-  } else if (unq_y) { //Rprintf("smerge1: !unq_x && unq_y\n");
+  } else if (unq_y) { //Rprintf("smerge: !unq_x && unq_y\n");
     while (is<nxs && j<nys) {
       i = xs[is]-1;
       x_i = x[i], y_j = y[j];
@@ -92,7 +84,7 @@ void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys,
           lens[i+ii] = 1;
         }
         is++;
-#ifdef SMERGE_ALL
+#ifdef SMERGE_STATS
         if (xlen1 && xl1>1)
           xlen1 = false;
         cnt += (uint64_t)xl1;
@@ -103,7 +95,7 @@ void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys,
         j++;
       }
     }
-  } else { //Rprintf("smerge1: !unq_x && !unq_y\n");
+  } else { //Rprintf("smerge: !unq_x && !unq_y\n");
     while (is<nxs && js<nys) { //Rprintf("============= is=%d\n", is);
       i = xs[is]-1, j = ys[js]-1;
       x_i = x[i], y_j = y[j];
@@ -115,7 +107,7 @@ void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys,
           lens[i+ii] = yl1;
         }
         is++;
-#ifdef SMERGE_ALL
+#ifdef SMERGE_STATS
         if (xlen1 && xl1>1)
           xlen1 = false;
         if (ylen1 && yl1>1)
@@ -135,11 +127,10 @@ void smerge1(int *xs, int *xl, int nxs, int *ys, int *yl, int nys,
   return;
 }
 
-// those two helpers function may be removed once we will confirm rollbs works as expected, they find last matching, and first matching index
+// those two helpers needs to be removed once we will confirm rollbs works as expected, they find "first/last within" index
 static int max_i_match(int *y, int *y_starts, int ny_starts, int val) {
   int i;
-  for (i=0; i<ny_starts; ++i) {
-    //Rprintf("max_i_match: y[y_starts[%d]-1] >= val: %d >= %d\n", i, y[y_starts[i]-1], val);
+  for (i=0; i<ny_starts; ++i) { //Rprintf("max_i_match: y[y_starts[%d]-1] >= val: %d >= %d\n", i, y[y_starts[i]-1], val);
     if (y[y_starts[i]-1]==val) return(i);
     else if (y[y_starts[i]-1]>val) return(i-1);
   }
@@ -147,8 +138,7 @@ static int max_i_match(int *y, int *y_starts, int ny_starts, int val) {
 }
 static int min_i_match(int *y, int *y_starts, int ny_starts, int val) {
   int i;
-  for (i=ny_starts-1; i>=0; --i) {
-    //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val);
+  for (i=ny_starts-1; i>=0; --i) { //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val);
     if (y[y_starts[i]-1]==val) return(i);
     else if (y[y_starts[i]-1]<val) return(i+1);
   }
@@ -160,32 +150,32 @@ static int min_i_match(int *y, int *y_starts, int ny_starts, int val) {
  * side -1: lower bound; side 1: upper bound
  */
 static int rollbs(int *x, int *ix, int nix, int val, int side) {
-  bool verbose = false; // develd debug only
-  if (verbose) Rprintf("rollbs: side=%d; val=%d\n", side, val);
+  int verbose = 0; // devel debug only
+  if (verbose>0) Rprintf("rollbs: side=%d; val=%d\n", side, val);
   if (side < 0) { // lower bound early stopping
     if (x[ix[0]-1] == val) {
-      if (verbose) Rprintf("rollbs: min elements %d match\n", val);
+      if (verbose>0) Rprintf("rollbs: min elements %d match\n", val);
       return 0;
     }
     if (x[ix[nix-1]-1] < val) {
-      if (verbose) Rprintf("rollbs: max element %d is still smaller than %d\n", x[ix[nix-1]-1], val);
+      if (verbose>0) Rprintf("rollbs: max element %d is still smaller than %d\n", x[ix[nix-1]-1], val);
       return -1;
     }
     if (x[ix[0]-1] > val) {
-      if (verbose) Rprintf("rollbs: min element %d is bigger than %d\n", x[ix[0]-1], val);
+      if (verbose>0) Rprintf("rollbs: min element %d is bigger than %d\n", x[ix[0]-1], val);
       return 0;
     }
   } else if (side > 0) { // upper bound early stopping
     if (x[ix[nix-1]-1] == val) {
-      if (verbose) Rprintf("rollbs: max elements %d match\n", val);
+      if (verbose>0) Rprintf("rollbs: max elements %d match\n", val);
       return nix-1;
     }
     if (x[ix[0]-1] > val) {
-      if (verbose) Rprintf("rollbs: min element %d is still bigger than %d\n", x[ix[0]-1], val);
+      if (verbose>0) Rprintf("rollbs: min element %d is still bigger than %d\n", x[ix[0]-1], val);
       return -1;
     }
     if (x[ix[nix-1]-1] < val) {
-      if (verbose) Rprintf("rollbs: max element %d is smaller than %d\n", x[ix[nix-1]-1], val);
+      if (verbose>0) Rprintf("rollbs: max element %d is smaller than %d\n", x[ix[nix-1]-1], val);
       return nix-1;
     }
   }
@@ -193,7 +183,7 @@ static int rollbs(int *x, int *ix, int nix, int val, int side) {
   while (lower<=upper) {
     i = lower + (upper-lower)/2;
     int thisx = x[ix[i]-1];
-    //if (verbose) Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val);
+    //if (verbose>0) Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val);
     if (thisx==val)
       return(i);
     else if (thisx < val)
@@ -201,28 +191,89 @@ static int rollbs(int *x, int *ix, int nix, int val, int side) {
     else if (thisx > val)
       upper = i-1;
   }
-  if (verbose) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d\n", i, x[ix[i]-1], lower, upper, side);
+  if (verbose>0) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d\n", i, x[ix[i]-1], lower, upper, side);
   if (side < 0) // anyone to stress test this logic?
     return lower;
   else
     return upper;
 }
 
+// cuts x_starts into equal batches and binary search corresponding y_starts ranges
+struct smBatch *batching(int nBatch,
+                         int *x, int nx, int *x_starts, int nx_starts, int *x_lens,
+                         int *y, int ny, int *y_starts, int ny_starts, int *y_lens,
+                         int verbose) {
+  if (verbose>0)
+    Rprintf("batching: %d batches of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nBatch);
+  struct smBatch *B = (struct smBatch *)R_alloc(nBatch, sizeof(struct smBatch));
+  size_t batchSize = (nx_starts-1)/nBatch + 1;
+  size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
+  bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV
+  for (int b=0; b<nBatch; ++b) {
+    if (b<nBatch-1) {
+      B[b].xs = x_starts + b*batchSize; B[b].xl = x_lens + b*batchSize; B[b].nxs = batchSize;
+    } else { // last batch
+      B[b].xs = x_starts + nx_starts-lastBatchSize; B[b].xl = x_lens + nx_starts-lastBatchSize; B[b].nxs = lastBatchSize;
+    }
+    int x_i_min = B[b].xs[0];
+    int x_i_max = B[b].xs[B[b].nxs-1];
+    int y_i_min = rollbs(y, y_starts, ny_starts, x[x_i_min-1], -1);
+    int y_i_max = rollbs(y, y_starts, ny_starts, x[x_i_max-1], 1);
+    if (extra_validate) {
+      int y_i_min2 = min_i_match(y, y_starts, ny_starts, x[x_i_min-1]);
+      int y_i_max2 = max_i_match(y, y_starts, ny_starts, x[x_i_max-1]);
+      if (y_i_min!=y_i_min2)
+        error("y lower bound different: %d vs %d", y_i_min2, y_i_min);
+      if (y_i_max!=y_i_max2)
+        error("y upper bound different: %d vs %d", y_i_max2, y_i_max);
+    }
+    B[b].ys = y_starts + y_i_min; B[b].yl = y_lens + y_i_min; B[b].nys = y_i_max - y_i_min + 1;
+  }
+  if (verbose>0) { // print smBatch objects, 1-indexed! x y sorted! for debugging and verbose
+    for (int b=0; b<nBatch; ++b) {
+      int x_i_min = B[b].xs[0], y_i_min = B[b].ys[0];
+      int x_i_max = B[b].xs[B[b].nxs-1], y_i_max = B[b].ys[B[b].nys-1];
+      Rprintf("#### batch[%d]: unq n: x=%d, y=%d\n", b+1, B[b].nxs, B[b].nys);
+      Rprintf("## lower: x[%d]: %d <= %d :y[%d]\n", x_i_min, x[x_i_min-1], y[y_i_min-1], y_i_min);
+      Rprintf("## upper: x[%d]: %d >= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max);
+    }
+  }
+  return B;
+}
+
+// helper to count how many threads were used
+int unqNth(int *x, int nx) { // x have 0:(nx-1) values
+  int ans = 0;
+  uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t));
+  memset(seen, 0, nx*sizeof(uint8_t));
+  for (int i=0; i<nx; ++i) {
+    if (!seen[x[i]]++) ans++;
+  }
+  return ans;
+}
+
+// count of duplicated entries
+void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395
+#pragma omp parallel for schedule(static) num_threads(getDTthreads())
+  for (int i=0; i<n_starts-1; ++i)
+    lens[i] = starts[i+1]-starts[i];
+  lens[n_starts-1] = n - starts[n_starts-1]+1;
+  return;
+}
+
+// pure C smerge: takes already sorted input, compute grpLens, prepare batches, smerge in parallel
 void smergeC(int *x, int nx, int *x_starts, int nx_starts,
              int *y, int ny, int *y_starts, int ny_starts,
              int *starts, int *lens,
              uint64_t *n_match, int *x_lens1, int *y_lens1, int *xy_lens1,
              int verbose) {
 
-  bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
+  double t = omp_get_wtime();
   int *x_lens = 0, *y_lens = 0;
-  double t = 0;
-
-  t = omp_get_wtime();
-  if (!unq_x) { // else true as we dont yet have branches for unq cases
-    x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // #4395
+  bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
+  if (!unq_x) {
+    x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // remove after #4395
     grpLens(x_starts, nx_starts, nx, x_lens);
-    //for (int i=0; i<nx_starts; ++i) Rprintf("x_lens[%d]=%d\n", i, x_lens[i]);
   }
   if (!unq_y) {
     y_lens = (int *)R_alloc(ny_starts, sizeof(int));
@@ -234,43 +285,13 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts,
   t = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs! whole section of breaking batches should probably be isolated into own function
   int nth = getDTthreads();
   int nBatch = 0; // for a single threaded or small unq count use single batch
-  if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024
+  if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024 // #DEV
     nBatch = 1;
   } else {
     nBatch = nth * 2;
   }
   int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used
-  size_t batchSize = (nx_starts-1)/nBatch + 1;
-  size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
-  //Rprintf("nBatch=%d; batchSize=%d; lastBatchSize=%d\n", nBatch, batchSize, lastBatchSize);
-  int *nxsB = (int *)R_alloc(nBatch, sizeof(int)), **xsB = (int **)R_alloc(nBatch, sizeof(int*)), **xlB = (int **)R_alloc(nBatch, sizeof(int*));
-  int *nysB = (int *)R_alloc(nBatch, sizeof(int)), **ysB = (int **)R_alloc(nBatch, sizeof(int*)), **ylB = (int **)R_alloc(nBatch, sizeof(int*));
-  bool do_not_set_to_false = false; int y_min2, y_max2;
-  //#pragma omp parallel for schedule(dynamic) num_threads(nth)
-  for (int b=0; b<nBatch-1; ++b) {
-    nxsB[b] = batchSize; xsB[b] = x_starts + b*batchSize; xlB[b] = x_lens + b*batchSize;
-    if (do_not_set_to_false) y_min2 = min_i_match(y, y_starts, ny_starts, x[xsB[b][0]-1]);
-    if (do_not_set_to_false) y_max2 = max_i_match(y, y_starts, ny_starts, x[xsB[b][nxsB[b]-1]-1]);
-    //Rprintf("nxsB[b]=%d; y_min=%d; y_max=%d\n", nxsB[b], y_min, y_max);
-    int y_min = rollbs(y, y_starts, ny_starts, x[xsB[b][0]-1], -1);
-    int y_max = rollbs(y, y_starts, ny_starts, x[xsB[b][nxsB[b]-1]-1], 1);
-    //Rprintf("nxsB[b]=%d; y_mn2=%d; y_mx2=%d\n", nxsB[b], y_min2, y_max2);
-    if (do_not_set_to_false && y_min!=y_min2) error("y_min different: %d vs %d", y_min2, y_min);
-    if (do_not_set_to_false && y_max!=y_max2) error("y_max different: %d vs %d", y_max2, y_max);
-    nysB[b] = y_max - y_min + 1; ysB[b] = y_starts + y_min; ylB[b] = y_lens + y_min;
-  }
-  int b = nBatch-1;
-  nxsB[b] = lastBatchSize; xsB[b] = x_starts + nx_starts-lastBatchSize; xlB[b] = x_lens + nx_starts-lastBatchSize;
-  if (do_not_set_to_false) y_min2 = min_i_match(y, y_starts, ny_starts, x[xsB[b][0]-1]);
-  if (do_not_set_to_false) y_max2 = max_i_match(y, y_starts, ny_starts, x[xsB[b][nxsB[b]-1]-1]);
-  //Rprintf("nxsB[b]=%d; y_min=%d; y_max=%d\n", nxsB[b], y_min, y_max);
-  int y_min = rollbs(y, y_starts, ny_starts, x[xsB[b][0]-1], -1);
-  int y_max = rollbs(y, y_starts, ny_starts, x[xsB[b][nxsB[b]-1]-1], 1);
-  //Rprintf("nxsB[b]=%d; y_mn2=%d; y_mx2=%d\n", nxsB[b], y_min2, y_max2);
-  if (do_not_set_to_false && y_min!=y_min2) error("y_min different: %d vs %d", y_min2, y_min);
-  if (do_not_set_to_false && y_max!=y_max2) error("y_max different: %d vs %d", y_max2, y_max);
-  nysB[b] = y_max - y_min + 1; ysB[b] = y_starts + y_min; ylB[b] = y_lens + y_min;
-  //for (int b=0; b<nBatch; ++b) Rprintf("b=%d:\n  nx=%d; xs[0]=%d; xl[0]=%d; x[xs[0]-1]=%d\n  ny=%d; ys[0]=%d; yl[0]=%d; y[ys[0]-1]=%d\n", b, nxsB[b], xsB[b][0], xlB[b][0], x[xsB[b][0]-1], nysB[b], ysB[b][0], ylB[b][0], y[ysB[b][0]-1]);
+  struct smBatch *B = batching(nBatch, x, nx, x_starts, nx_starts, x_lens, y, ny, y_starts, ny_starts, y_lens, verbose-1);
   if (verbose>0)
     Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t);
 
@@ -279,18 +300,17 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts,
   uint64_t nmatch = 0;
   #pragma omp parallel for schedule(dynamic) reduction(&&:xlens1,ylens1,xylens1) reduction(+:nmatch) num_threads(nth)
   for (int b=0; b<nBatch; ++b) {
-    smerge1(
-      xsB[b], xlB[b], nxsB[b], ysB[b], ylB[b], nysB[b], // batch specific input: shifted pointers and their sizes
-      x, unq_x, y, unq_y,                               // common input
-      starts, lens,                                     // common output
-      &nmatch, &xlens1, &ylens1, &xylens1               // common reduction
+    smerge(
+      &B[b],                              // batch input
+      x, unq_x, y, unq_y,                 // common input
+      starts, lens,                       // common output
+      &nmatch, &xlens1, &ylens1, &xylens1 // common reduction output
     );
     th[b] = omp_get_thread_num();
   }
   x_lens1[0] = (int)xlens1; y_lens1[0] = (int)ylens1; xy_lens1[0] = (int)xylens1; n_match[0] = nmatch;
-  //for (int b=0; b<nBatch; ++b) Rprintf("th[%d]=%d\n", b, th[b]); // threads utilization for each batch
   if (verbose>0)
-    Rprintf("smergeC: %d calls to smerge1 using %d/%d threads took %.3fs\n", nBatch, unqN(th, nBatch), nth, omp_get_wtime() - t);
+    Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t);
 
   return;
 }
@@ -302,23 +322,17 @@ void sortInt(int *x, int nx, int *idx, int *ans) {
   return;
 }
 
-// this produce output consistent to bmerge, note that bmerge i,x is smerge x,y
-SEXP joinOut(int n, int *starts, int *lens, bool x_ord,
-             SEXP out_starts, SEXP out_lens,
-             SEXP x_idx, SEXP y_idx,
-             SEXP n_match, SEXP x_lens1, SEXP y_lens1, SEXP xy_lens1) {
-  SEXP ans = PROTECT(allocVector(VECSXP, 10)), ansnames;
-  setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, 10));
+// for bmerge=true this produce bmerge consistent output, note that bmerge i,x is smerge x,y
+SEXP outSmergeR(int n, int *starts, int *lens, bool x_ord,
+                SEXP out_starts, SEXP out_lens, // used only when x was sorted, saves one allocation
+                SEXP x_idx, SEXP y_idx,
+                SEXP n_match, SEXP x_lens1, SEXP y_lens1, SEXP xy_lens1,
+                bool bmerge) {
+  int out_len = bmerge ? 6 : 10;
+  SEXP ans = PROTECT(allocVector(VECSXP, out_len)), ansnames;
+  setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, out_len));
   SET_STRING_ELT(ansnames, 0, char_starts);
   SET_STRING_ELT(ansnames, 1, char_lens);
-  SET_STRING_ELT(ansnames, 2, char_indices); SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, 0)); // const for equi join
-  SET_STRING_ELT(ansnames, 3, char_allLen1); SET_VECTOR_ELT(ans, 3, y_lens1);
-  SET_STRING_ELT(ansnames, 4, char_allGrp1); SET_VECTOR_ELT(ans, 4, ScalarLogical(true));    // const for equi join
-  SET_STRING_ELT(ansnames, 5, char_xo);      SET_VECTOR_ELT(ans, 5, y_idx);
-  SET_STRING_ELT(ansnames, 6, char_io);      SET_VECTOR_ELT(ans, 6, x_idx);
-  SET_STRING_ELT(ansnames, 7, char_lhsLen1); SET_VECTOR_ELT(ans, 7, x_lens1);
-  SET_STRING_ELT(ansnames, 8, char_xyLen1);  SET_VECTOR_ELT(ans, 8, xy_lens1);
-  SET_STRING_ELT(ansnames, 9, char_nMatch);  SET_VECTOR_ELT(ans, 9, n_match);
   if (x_ord) {
     SET_VECTOR_ELT(ans, 0, out_starts);
     SET_VECTOR_ELT(ans, 1, out_lens);
@@ -330,19 +344,40 @@ SEXP joinOut(int n, int *starts, int *lens, bool x_ord,
     sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1)));
     UNPROTECT(1);
   }
+  SET_STRING_ELT(ansnames, 2, char_indices); SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, 0)); // const for equi join
+  SET_STRING_ELT(ansnames, 3, char_allLen1); SET_VECTOR_ELT(ans, 3, y_lens1);
+  SET_STRING_ELT(ansnames, 4, char_allGrp1); SET_VECTOR_ELT(ans, 4, ScalarLogical(true));    // const for equi join
+  if (bmerge) {
+    y_idx = shallow_duplicate(y_idx); // possibly improve after #4467
+    setAttrib(y_idx, sym_starts, R_NilValue);
+    setAttrib(y_idx, sym_maxgrpn, R_NilValue);
+    //setAttrib(y_idx, sym_anyna, R_NilValue); // enable after #4386
+    //setAttrib(y_idx, sym_anyinfnan, R_NilValue);
+    //setAttrib(y_idx, sym_anynotascii, R_NilValue);
+    //setAttrib(y_idx, sym_anynotutf8, R_NilValue);
+  }
+  SET_STRING_ELT(ansnames, 5, char_xo); SET_VECTOR_ELT(ans, 5, y_idx);
+  if (!bmerge) {
+    SET_STRING_ELT(ansnames, 6, char_io);      SET_VECTOR_ELT(ans, 6, x_idx);
+    SET_STRING_ELT(ansnames, 7, char_lhsLen1); SET_VECTOR_ELT(ans, 7, x_lens1);
+    SET_STRING_ELT(ansnames, 8, char_xyLen1);  SET_VECTOR_ELT(ans, 8, xy_lens1);
+    SET_STRING_ELT(ansnames, 9, char_nMatch);  SET_VECTOR_ELT(ans, 9, n_match);
+  }
   UNPROTECT(1);
   return ans;
 }
 
-SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) {
+SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
 
-  const int verbose = GetVerbose()*2; // remove *2 after #4491
+  double t_total = omp_get_wtime();
+  const int verbose = GetVerbose()*3; // remove *3 after #4491
   if (!isInteger(x) || !isInteger(y))
     error("'x' and 'y' must be integer");
+  if (!IS_TRUE_OR_FALSE(out_bmerge))
+    error("'out.bmerge' must be TRUE or FALSE");
   int protecti = 0, nx = LENGTH(x), ny = LENGTH(y);
-  double t = 0;
 
-  t = omp_get_wtime();
+  double t = omp_get_wtime();
   bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx);
   if (do_x_idx) {
     x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533
@@ -409,19 +444,21 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx) {
     &n_match, INTEGER(x_lens1), INTEGER(y_lens1), INTEGER(xy_lens1),
     verbose-1
   );
-  if (n_match > DBL_MAX) {
+  if (n_match > (uint64_t)DBL_MAX) {
     REAL(n_matchr)[0] = NA_REAL;
     warning("count of matches exceeds DBL_MAX, returning NA in 'nMatch' field");
   } else {
     REAL(n_matchr)[0] = (double)n_match;
   }
   if (verbose>0)
-    Rprintf("smergeR: smergeC of %d x %d => %"PRIu64" took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t);
+    Rprintf("smergeR: smergeC of %d x %d = %"PRIu64"; took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t);
 
   t = omp_get_wtime();
-  SEXP ans = joinOut(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1);
+  SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, (bool)LOGICAL(out_bmerge)[0]);
+  if (verbose>0)
+    Rprintf("smergeR: outSmerge (unsort) took %.3fs\n", omp_get_wtime() - t);
   if (verbose>0)
-    Rprintf("smergeR: joinOut (unsort) took %.3fs\n", omp_get_wtime() - t);
+    Rprintf("smergeR: all took %.3fs\n", omp_get_wtime() - t_total);
 
   UNPROTECT(protecti);
   return ans;

From 9b0d195162ae4a6440ef329adcf70055bfeaafda Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Mon, 8 Jun 2020 21:42:25 +0100
Subject: [PATCH 16/34] static funs and comments

---
 src/smerge.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/smerge.c b/src/smerge.c
index 6dac785b2c..8b5b2658c3 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -27,11 +27,11 @@ struct smBatch {
 };
 
 // workhorse join that runs in parallel on batches
-void smerge(struct smBatch *batch,
-            int *x, bool unq_x, int *y, bool unq_y,
-            int *starts, int *lens,
-            uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) {
-  int *xs=batch->xs, *xl=batch->xl, nxs=batch->nxs;
+static void smerge(struct smBatch *batch,
+                   int *x, bool unq_x, int *y, bool unq_y,
+                   int *starts, int *lens,
+                   uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) {
+  int *xs=batch->xs, *xl=batch->xl, nxs=batch->nxs; // unpack smBatch struct
   int *ys=batch->ys, *yl=batch->yl, nys=batch->nys;
   int i=0, j=0, is=0, js=0, x_i, y_j;
   uint64_t cnt=0;
@@ -199,16 +199,16 @@ static int rollbs(int *x, int *ix, int nix, int val, int side) {
 }
 
 // cuts x_starts into equal batches and binary search corresponding y_starts ranges
-struct smBatch *batching(int nBatch,
-                         int *x, int nx, int *x_starts, int nx_starts, int *x_lens,
-                         int *y, int ny, int *y_starts, int ny_starts, int *y_lens,
-                         int verbose) {
+static struct smBatch *batching(int nBatch,
+                                int *x, int nx, int *x_starts, int nx_starts, int *x_lens,
+                                int *y, int ny, int *y_starts, int ny_starts, int *y_lens,
+                                int verbose) {
   if (verbose>0)
     Rprintf("batching: %d batches of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nBatch);
   struct smBatch *B = (struct smBatch *)R_alloc(nBatch, sizeof(struct smBatch));
   size_t batchSize = (nx_starts-1)/nBatch + 1;
   size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
-  bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV
+  bool extra_validate = false; // validates against very slow m[in|ax]_i_match // #DEV
   for (int b=0; b<nBatch; ++b) {
     if (b<nBatch-1) {
       B[b].xs = x_starts + b*batchSize; B[b].xl = x_lens + b*batchSize; B[b].nxs = batchSize;
@@ -242,7 +242,7 @@ struct smBatch *batching(int nBatch,
 }
 
 // helper to count how many threads were used
-int unqNth(int *x, int nx) { // x have 0:(nx-1) values
+static int unqNth(int *x, int nx) { // x have 0:(nx-1) values
   int ans = 0;
   uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t));
   memset(seen, 0, nx*sizeof(uint8_t));
@@ -285,7 +285,7 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts,
   t = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs! whole section of breaking batches should probably be isolated into own function
   int nth = getDTthreads();
   int nBatch = 0; // for a single threaded or small unq count use single batch
-  if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024 // #DEV
+  if (nth == 1 || nx_starts < 1024) { // this is 4 only for testing, will be probably 1024 // #DEV
     nBatch = 1;
   } else {
     nBatch = nth * 2;
@@ -322,7 +322,7 @@ void sortInt(int *x, int nx, int *idx, int *ans) {
   return;
 }
 
-// for bmerge=true this produce bmerge consistent output, note that bmerge i,x is smerge x,y
+// wrap results into list, bmerge=true this produce bmerge consistent output, note that bmerge i,x is smerge x,y
 SEXP outSmergeR(int n, int *starts, int *lens, bool x_ord,
                 SEXP out_starts, SEXP out_lens, // used only when x was sorted, saves one allocation
                 SEXP x_idx, SEXP y_idx,
@@ -367,6 +367,7 @@ SEXP outSmergeR(int n, int *starts, int *lens, bool x_ord,
   return ans;
 }
 
+// main interface from R
 SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
 
   double t_total = omp_get_wtime();

From 4c5d1ea4c6a30920750935de93340a4cb40ed736 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Mon, 8 Jun 2020 22:55:31 +0100
Subject: [PATCH 17/34] better verbose

---
 R/bmerge.R             | 12 +++++++++---
 inst/tests/smerge.Rraw |  8 ++++++--
 src/smerge.c           | 26 +++++++++++++++++++-------
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/R/bmerge.R b/R/bmerge.R
index f9ab80cd26..3ec080fb18 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -1,6 +1,7 @@
 
 bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
 {
+  smergeOpt = FALSE
   if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer
       && isTRUE(getOption("datatable.smerge"))                     ## enable option
       && identical(nomatch, NA_integer_)                           ## for now only outer join
@@ -9,14 +10,15 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
       && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join
       ) {
     getIdxGrp = function(x, cols) { ## get index only if retGrp=T
+      if (!isTRUE(getOption("datatable.use.index"))) return()
       if (is.numeric(cols)) cols = names(x)[cols]
       idx = attr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), exact=TRUE)
       if (!is.null(attr(idx, "starts", exact=TRUE))) idx
     }
     if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()}
-    ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE)
+    smans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE)
     if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()}
-    return(ans)
+    smergeOpt = TRUE
   }
   callersi = i
   i = shallow(i)
@@ -147,7 +149,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
     } else {
       xo = NULL
       if (isTRUE(getOption("datatable.use.index"))) {
-        xo = getindex(x, names(x)[xcols])
+        xo = c(getindex(x, names(x)[xcols])) ## c takes care of future #4386
         if (verbose && !is.null(xo)) cat("on= matches existing index, using index\n")
       }
       if (is.null(xo)) {
@@ -198,6 +200,10 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
   ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp)
   if (verbose) {cat("bmerge done in",timetaken(last.started.at),"\n"); flush.console()}
 
+  if (smergeOpt && !identical(ans, smans)) {
+    (if (!interactive()) stop else message)("bmerge opt to smerge produced different results")
+    browser()
+  }
   return(ans)
 }
 
diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw
index 3d8cb27b79..cb7bf40661 100644
--- a/inst/tests/smerge.Rraw
+++ b/inst/tests/smerge.Rraw
@@ -24,6 +24,8 @@ sm = function(x, y) {
   ans
 }
 
+#options(datatable.verbose=TRUE)
+
 ## x y sorted
 x = c(1L,2L,3L,4L)
 y = c(2L,3L,5L)
@@ -124,7 +126,7 @@ d2 = data.table(y=y, v2=seq_along(y))
 options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #4491
 test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge")
 options(datatable.smerge=TRUE)
-test(101.02, d1[d2, on="x==y"], expected, output="smerge", notOutput="bmerge")
+test(101.02, d1[d2, on="x==y"], expected, output="smerge") ## for now extra computation of bmerge is still done so no: #, notOutput="bmerge")
 
 setindexv2 = function(x, cols) { ## pretend we are after #4386
   stopifnot(is.data.table(x), is.character(cols))
@@ -134,4 +136,6 @@ setindexv2 = function(x, cols) { ## pretend we are after #4386
 }
 setindexv2(d1, "x")
 setindexv2(d2, "y")
-test(101.03, d1[d2, on="x==y"], expected, output="smerge", notOutput="bmerge")
+test(101.03, d1[d2, on="x==y"], expected, output="smerge.*already indexed")
+options(datatable.use.index=FALSE)
+test(101.04, d1[d2, on="x==y"], expected, output="smerge", notOutput="already indexed")
diff --git a/src/smerge.c b/src/smerge.c
index 8b5b2658c3..5035fe9486 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -208,7 +208,7 @@ static struct smBatch *batching(int nBatch,
   struct smBatch *B = (struct smBatch *)R_alloc(nBatch, sizeof(struct smBatch));
   size_t batchSize = (nx_starts-1)/nBatch + 1;
   size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
-  bool extra_validate = false; // validates against very slow m[in|ax]_i_match // #DEV
+  bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV
   for (int b=0; b<nBatch; ++b) {
     if (b<nBatch-1) {
       B[b].xs = x_starts + b*batchSize; B[b].xl = x_lens + b*batchSize; B[b].nxs = batchSize;
@@ -252,6 +252,18 @@ static int unqNth(int *x, int nx) { // x have 0:(nx-1) values
   return ans;
 }
 
+// helper for verbose messages to print what was actually computed
+static const char *verboseDone(bool x, bool y, const char *not_xy, const char *not_x, const char *not_y, const char *xy) {
+  if (!x && !y)
+    return(not_xy);
+  else if (!x && y)
+    return(not_x);
+  else if (x && !y)
+    return(not_y);
+  else
+    return(xy);
+}
+
 // count of duplicated entries
 void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395
 #pragma omp parallel for schedule(static) num_threads(getDTthreads())
@@ -280,12 +292,12 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts,
     grpLens(y_starts, ny_starts, ny, y_lens);
   }
   if (verbose>0)
-    Rprintf("smergeC: grpLens took %.3fs\n", omp_get_wtime() - t);
+    Rprintf("smergeC: grpLens %s took %.3fs\n", verboseDone(!unq_x, !unq_y, "(already unique)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t);
 
   t = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs! whole section of breaking batches should probably be isolated into own function
   int nth = getDTthreads();
   int nBatch = 0; // for a single threaded or small unq count use single batch
-  if (nth == 1 || nx_starts < 1024) { // this is 4 only for testing, will be probably 1024 // #DEV
+  if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024 // #DEV
     nBatch = 1;
   } else {
     nBatch = nth * 2;
@@ -310,7 +322,7 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts,
   }
   x_lens1[0] = (int)xlens1; y_lens1[0] = (int)ylens1; xy_lens1[0] = (int)xylens1; n_match[0] = nmatch;
   if (verbose>0)
-    Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t);
+    Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); // all threads may not always be used bc schedule(dynamic)
 
   return;
 }
@@ -392,7 +404,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
   if (isNull(x_starts) || isNull(y_starts))
     error("Indices provided to smerge must carry 'starts' attribute");
   if (verbose>0)
-    Rprintf("smergeR: index took %.3fs\n", omp_get_wtime() - t);
+    Rprintf("smergeR: index %s took %.3fs\n", verboseDone(do_x_idx, do_y_idx, "(already indexed)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t);
 
   t = omp_get_wtime();
   bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx);
@@ -410,7 +422,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
     yp = INTEGER(y);
   }
   if (verbose>0)
-    Rprintf("smergeR: sort took %.3fs\n", omp_get_wtime() - t);
+    Rprintf("smergeR: sort %s took %.3fs\n", verboseDone(!x_ord, !y_ord, "(already sorted)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t);
 
   t = omp_get_wtime();
   SEXP out_starts = R_NilValue, out_lens = R_NilValue;
@@ -457,7 +469,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
   t = omp_get_wtime();
   SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, (bool)LOGICAL(out_bmerge)[0]);
   if (verbose>0)
-    Rprintf("smergeR: outSmerge (unsort) took %.3fs\n", omp_get_wtime() - t);
+    Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(unsort)", omp_get_wtime() - t);
   if (verbose>0)
     Rprintf("smergeR: all took %.3fs\n", omp_get_wtime() - t_total);
 

From 19d1df6e0a378f0abbcacdd75891c9805cddda63 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Tue, 9 Jun 2020 02:39:48 +0100
Subject: [PATCH 18/34] simpler batching, more strict type defs

---
 inst/tests/smerge.Rraw |   1 +
 src/smerge.c           | 153 +++++++++++++++++++++--------------------
 2 files changed, 79 insertions(+), 75 deletions(-)

diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw
index cb7bf40661..3502efa392 100644
--- a/inst/tests/smerge.Rraw
+++ b/inst/tests/smerge.Rraw
@@ -71,6 +71,7 @@ ssa = function(unq_n, size, sort=FALSE) {
   if (unq_n==size) stopifnot(uniqueN(ans)==length(ans))
   if (sort) sort(ans) else ans
 }
+set.seed(108)
 N = 1e2L
 x = ssa(N, N, sort=TRUE)
 y = ssa(N, N, sort=TRUE)
diff --git a/src/smerge.c b/src/smerge.c
index 5035fe9486..8ca13dd616 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -16,29 +16,21 @@
  */
 #define SMERGE_STATS
 
-// shifted pointers so each batch knows where to start and end
-struct smBatch {
-  int *xs; // x's starts: indices on which unique values can be found
-  int *xl; // x's lens:   counts of each unique value
-  int nxs; // x's starts length
-  int *ys; // same for y
-  int *yl;
-  int nys;
-};
-
 // workhorse join that runs in parallel on batches
-static void smerge(struct smBatch *batch,
-                   int *x, bool unq_x, int *y, bool unq_y,
-                   int *starts, int *lens,
+static void smerge(const int bx_off, const int bnx,
+                   const int by_off, const int bny,
+                   const int *restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x,
+                   const int *restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y,
+                   int *restrict starts, int *restrict lens,
                    uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) {
-  int *xs=batch->xs, *xl=batch->xl, nxs=batch->nxs; // unpack smBatch struct
-  int *ys=batch->ys, *yl=batch->yl, nys=batch->nys;
-  int i=0, j=0, is=0, js=0, x_i, y_j;
-  uint64_t cnt=0;
+  uint64_t cnt = 0;
   bool xlen1 = true, ylen1 = true, xylen1 = true;
-  if (unq_x && unq_y) { //Rprintf("smerge: unq_x && unq_y\n");
-    while (i<nxs && j<nys) {
-      x_i = x[i], y_j = y[j];
+  if (unq_x && unq_y) {
+    int i=0, j=0;
+    const int *restrict bx = x + bx_off;
+    const int *restrict by = y + by_off;
+    while (i<bnx && j<bny) {
+      const int x_i = bx[i], y_j = by[j];
       if (x_i == y_j) {
         starts[i] = j+1;
         lens[i] = 1;
@@ -52,13 +44,16 @@ static void smerge(struct smBatch *batch,
         j++;
       }
     }
-  } else if (unq_x) { //Rprintf("smerge: unq_x && !unq_y\n");
-    while (i<nxs && js<nys) {
-      j = ys[js]-1;
-      x_i = x[i], y_j = y[j];
+  } else if (unq_x) {
+    int i=0, js=0;
+    const int *restrict bx = x + bx_off;
+    const int *restrict ys = y_starts + by_off, *restrict yl = y_lens + by_off;
+    while (i<bnx && js<bny) {
+      const int j = ys[js]-1;
+      const int x_i = bx[i], y_j = y[j];
       if (x_i == y_j) {
         starts[i] = j+1;
-        int yl1 = yl[js];
+        const int yl1 = yl[js];
         lens[i] = yl1;
         i++;
 #ifdef SMERGE_STATS
@@ -72,13 +67,16 @@ static void smerge(struct smBatch *batch,
         js++;
       }
     }
-  } else if (unq_y) { //Rprintf("smerge: !unq_x && unq_y\n");
-    while (is<nxs && j<nys) {
-      i = xs[is]-1;
-      x_i = x[i], y_j = y[j];
+  } else if (unq_y) {
+    int is=0, j=0;
+    const int *restrict xs = x_starts + bx_off, *restrict xl = x_lens + bx_off;
+    const int *restrict by = y + by_off;
+    while (is<bnx && j<bny) {
+      const int i = xs[is]-1;
+      const int x_i = x[i], y_j = by[j];
       if (x_i == y_j) {
-        int j1 = j+1;
-        int xl1 = xl[is];
+        const int j1 = j+1;
+        const int xl1 = xl[is];
         for (int ii=0; ii<xl1; ++ii) {
           starts[i+ii] = j1;
           lens[i+ii] = 1;
@@ -95,14 +93,17 @@ static void smerge(struct smBatch *batch,
         j++;
       }
     }
-  } else { //Rprintf("smerge: !unq_x && !unq_y\n");
-    while (is<nxs && js<nys) { //Rprintf("============= is=%d\n", is);
-      i = xs[is]-1, j = ys[js]-1;
-      x_i = x[i], y_j = y[j];
-      if (x_i == y_j) { //Rprintf("x[%d]==y[%d]: %d\n", i, j, x_i);
-        int j1 = j+1, yl1 = yl[js]; //Rprintf("x_lens[is]=x_lens[%d]=%d\n",is,xl[is]);
-        int xl1 = xl[is];
-        for (int ii=0; ii<xl1; ++ii) { //Rprintf("starts[%d]=%d\n", i+ii, j1);
+  } else {
+    int is=0, js=0;
+    const int *restrict xs = x_starts + bx_off, *restrict xl = x_lens + bx_off;
+    const int *restrict ys = y_starts + by_off, *restrict yl = y_lens + by_off;
+    while (is<bnx && js<bny) {
+      const int i = xs[is]-1, j = ys[js]-1;
+      const int x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        const int j1 = j+1, yl1 = yl[js];
+        const int xl1 = xl[is];
+        for (int ii=0; ii<xl1; ++ii) {
           starts[i+ii] = j1;
           lens[i+ii] = yl1;
         }
@@ -128,7 +129,7 @@ static void smerge(struct smBatch *batch,
 }
 
 // those two helpers needs to be removed once we will confirm rollbs works as expected, they find "first/last within" index
-static int max_i_match(int *y, int *y_starts, int ny_starts, int val) {
+static int max_i_match(const int *restrict y, const int *restrict y_starts, const int ny_starts, const int val) {
   int i;
   for (i=0; i<ny_starts; ++i) { //Rprintf("max_i_match: y[y_starts[%d]-1] >= val: %d >= %d\n", i, y[y_starts[i]-1], val);
     if (y[y_starts[i]-1]==val) return(i);
@@ -136,7 +137,7 @@ static int max_i_match(int *y, int *y_starts, int ny_starts, int val) {
   }
   return(i-1);
 }
-static int min_i_match(int *y, int *y_starts, int ny_starts, int val) {
+static int min_i_match(const int *restrict y, const int *restrict y_starts, const int ny_starts, const int val) {
   int i;
   for (i=ny_starts-1; i>=0; --i) { //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val);
     if (y[y_starts[i]-1]==val) return(i);
@@ -149,7 +150,7 @@ static int min_i_match(int *y, int *y_starts, int ny_starts, int val) {
  * used to find 'y' lower and upper bounds for each batch
  * side -1: lower bound; side 1: upper bound
  */
-static int rollbs(int *x, int *ix, int nix, int val, int side) {
+static int rollbs(const int *restrict x, const int *restrict ix, const int nix, const int val, const int side) {
   int verbose = 0; // devel debug only
   if (verbose>0) Rprintf("rollbs: side=%d; val=%d\n", side, val);
   if (side < 0) { // lower bound early stopping
@@ -199,24 +200,21 @@ static int rollbs(int *x, int *ix, int nix, int val, int side) {
 }
 
 // cuts x_starts into equal batches and binary search corresponding y_starts ranges
-static struct smBatch *batching(int nBatch,
-                                int *x, int nx, int *x_starts, int nx_starts, int *x_lens,
-                                int *y, int ny, int *y_starts, int ny_starts, int *y_lens,
-                                int verbose) {
+static void batching(const int nBatch,
+                     const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts,
+                     const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
+                     int *restrict Bx_off, int *restrict Bnx, int *restrict By_off, int *restrict Bny,
+                     const int verbose) {
   if (verbose>0)
     Rprintf("batching: %d batches of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nBatch);
-  struct smBatch *B = (struct smBatch *)R_alloc(nBatch, sizeof(struct smBatch));
   size_t batchSize = (nx_starts-1)/nBatch + 1;
   size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
   bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV
   for (int b=0; b<nBatch; ++b) {
-    if (b<nBatch-1) {
-      B[b].xs = x_starts + b*batchSize; B[b].xl = x_lens + b*batchSize; B[b].nxs = batchSize;
-    } else { // last batch
-      B[b].xs = x_starts + nx_starts-lastBatchSize; B[b].xl = x_lens + nx_starts-lastBatchSize; B[b].nxs = lastBatchSize;
-    }
-    int x_i_min = B[b].xs[0];
-    int x_i_max = B[b].xs[B[b].nxs-1];
+    Bx_off[b] = b<nBatch-1 ? b*batchSize : nx_starts-lastBatchSize;
+    Bnx[b] = b<nBatch-1 ? batchSize : lastBatchSize;
+    int x_i_min = (x_starts + Bx_off[b])[0];
+    int x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1];
     int y_i_min = rollbs(y, y_starts, ny_starts, x[x_i_min-1], -1);
     int y_i_max = rollbs(y, y_starts, ny_starts, x[x_i_max-1], 1);
     if (extra_validate) {
@@ -227,22 +225,23 @@ static struct smBatch *batching(int nBatch,
       if (y_i_max!=y_i_max2)
         error("y upper bound different: %d vs %d", y_i_max2, y_i_max);
     }
-    B[b].ys = y_starts + y_i_min; B[b].yl = y_lens + y_i_min; B[b].nys = y_i_max - y_i_min + 1;
+    By_off[b] = y_i_min;
+    Bny[b] = y_i_max - y_i_min + 1;
   }
-  if (verbose>0) { // print smBatch objects, 1-indexed! x y sorted! for debugging and verbose
+  if (verbose>0) { // print batches, 1-indexed! x y sorted! for debugging and verbose
     for (int b=0; b<nBatch; ++b) {
-      int x_i_min = B[b].xs[0], y_i_min = B[b].ys[0];
-      int x_i_max = B[b].xs[B[b].nxs-1], y_i_max = B[b].ys[B[b].nys-1];
-      Rprintf("#### batch[%d]: unq n: x=%d, y=%d\n", b+1, B[b].nxs, B[b].nys);
+      int x_i_min = (x_starts + Bx_off[b])[0], x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1];
+      int y_i_min = (y_starts + By_off[b])[0], y_i_max = (y_starts + By_off[b])[Bny[b]-1];
+      Rprintf("#### batch[%d]: unq n: x=%d, y=%d\n", b+1, Bnx[b], Bny[b]);
       Rprintf("## lower: x[%d]: %d <= %d :y[%d]\n", x_i_min, x[x_i_min-1], y[y_i_min-1], y_i_min);
       Rprintf("## upper: x[%d]: %d >= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max);
     }
   }
-  return B;
+  return;
 }
 
 // helper to count how many threads were used
-static int unqNth(int *x, int nx) { // x have 0:(nx-1) values
+static int unqNth(const int *x, const int nx) { // x have 0:(nx-1) values
   int ans = 0;
   uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t));
   memset(seen, 0, nx*sizeof(uint8_t));
@@ -253,7 +252,7 @@ static int unqNth(int *x, int nx) { // x have 0:(nx-1) values
 }
 
 // helper for verbose messages to print what was actually computed
-static const char *verboseDone(bool x, bool y, const char *not_xy, const char *not_x, const char *not_y, const char *xy) {
+static const char *verboseDone(const bool x, const bool y, const char *not_xy, const char *not_x, const char *not_y, const char *xy) {
   if (!x && !y)
     return(not_xy);
   else if (!x && y)
@@ -265,8 +264,8 @@ static const char *verboseDone(bool x, bool y, const char *not_xy, const char *n
 }
 
 // count of duplicated entries
-void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395
-#pragma omp parallel for schedule(static) num_threads(getDTthreads())
+void grpLens(const int *restrict starts, const int n_starts, const int n, int *restrict lens) { // #4395
+  #pragma omp parallel for schedule(static) num_threads(getDTthreads())
   for (int i=0; i<n_starts-1; ++i)
     lens[i] = starts[i+1]-starts[i];
   lens[n_starts-1] = n - starts[n_starts-1]+1;
@@ -274,15 +273,15 @@ void grpLens(int *starts, int n_starts, int n, /*output*/int *lens) { // #4395
 }
 
 // pure C smerge: takes already sorted input, compute grpLens, prepare batches, smerge in parallel
-void smergeC(int *x, int nx, int *x_starts, int nx_starts,
-             int *y, int ny, int *y_starts, int ny_starts,
-             int *starts, int *lens,
+void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts,
+             const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
+             int *restrict starts, int *restrict lens,
              uint64_t *n_match, int *x_lens1, int *y_lens1, int *xy_lens1,
-             int verbose) {
+             const int verbose) {
 
   double t = omp_get_wtime();
-  int *x_lens = 0, *y_lens = 0;
-  bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
+  int *restrict x_lens = 0, *restrict y_lens = 0;
+  const bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
   if (!unq_x) {
     x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // remove after #4395
     grpLens(x_starts, nx_starts, nx, x_lens);
@@ -294,7 +293,7 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts,
   if (verbose>0)
     Rprintf("smergeC: grpLens %s took %.3fs\n", verboseDone(!unq_x, !unq_y, "(already unique)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t);
 
-  t = omp_get_wtime(); // this section needs comprehensive testing for correctness of rollbs! whole section of breaking batches should probably be isolated into own function
+  t = omp_get_wtime();
   int nth = getDTthreads();
   int nBatch = 0; // for a single threaded or small unq count use single batch
   if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024 // #DEV
@@ -303,7 +302,9 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts,
     nBatch = nth * 2;
   }
   int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used
-  struct smBatch *B = batching(nBatch, x, nx, x_starts, nx_starts, x_lens, y, ny, y_starts, ny_starts, y_lens, verbose-1);
+  int *restrict Bx_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bnx = (int *)R_alloc(nBatch, sizeof(int));
+  int *restrict By_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bny = (int *)R_alloc(nBatch, sizeof(int));
+  batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, verbose-1);
   if (verbose>0)
     Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t);
 
@@ -313,8 +314,10 @@ void smergeC(int *x, int nx, int *x_starts, int nx_starts,
   #pragma omp parallel for schedule(dynamic) reduction(&&:xlens1,ylens1,xylens1) reduction(+:nmatch) num_threads(nth)
   for (int b=0; b<nBatch; ++b) {
     smerge(
-      &B[b],                              // batch input
-      x, unq_x, y, unq_y,                 // common input
+      Bx_off[b], Bnx[b],                  // batch input x
+      By_off[b], Bny[b],                  // batch input y
+      x, x_starts, x_lens, unq_x,         // common input x
+      y, y_starts, y_lens, unq_y,         // common input y
       starts, lens,                       // common output
       &nmatch, &xlens1, &ylens1, &xylens1 // common reduction output
     );
@@ -426,7 +429,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
 
   t = omp_get_wtime();
   SEXP out_starts = R_NilValue, out_lens = R_NilValue;
-  int *starts=0, *lens=0;
+  int *restrict starts=0, *restrict lens=0;
   if (x_ord) { // we dont need to reorder results so can save one allocation
     out_starts = PROTECT(allocVector(INTSXP, nx)); protecti++;
     out_lens = PROTECT(allocVector(INTSXP, nx)); protecti++;

From 7efaa6ec9e36e7710b320b07c1eeea4d9a8eb6eb Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Tue, 9 Jun 2020 22:39:03 +0100
Subject: [PATCH 19/34] last batch not balanced anymore

---
 inst/tests/smerge.Rraw | 173 ++++++++++++++++++++++++++++++----------
 src/smerge.c           | 177 ++++++++++++++++++++++++-----------------
 2 files changed, 237 insertions(+), 113 deletions(-)

diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw
index 3502efa392..99eb2cfef9 100644
--- a/inst/tests/smerge.Rraw
+++ b/inst/tests/smerge.Rraw
@@ -19,95 +19,186 @@ bm = function(x, y) {
 }
 sm = function(x, y) {
   stopifnot(is.integer(x), is.integer(y))
+  #saveRDS(x, "~/git/data.table/x.RDS"); saveRDS(y, "~/git/data.table/y.RDS")
   ans = smerge(x, y, out.bmerge=TRUE)
   ## if undefining SMERGE_STATS then we have to ignore allLen1 as well
   ans
 }
 
+#setDTthreads(2)
 #options(datatable.verbose=TRUE)
 
+# unique and sort
 ## x y sorted
-x = c(1L,2L,3L,4L)
-y = c(2L,3L,5L)
+x = c(1L,2L,3L,4L) # unq
+y = c(2L,3L,5L)    # unq
 test(1.01, sm(x, y), bm(x, y))
 x = c(1L,2L,3L,3L,4L)
-y = c(2L,3L,5L)
+y = c(2L,3L,5L)    # unq
 test(1.02, sm(x, y), bm(x, y))
 x = c(1L,2L,3L,4L)
-y = c(2L,3L,3L,5L)
+y = c(2L,3L,3L,5L) # unq
 test(1.03, sm(x, y), bm(x, y))
 x = c(1L,2L,3L,3L,4L)
 y = c(2L,3L,3L,5L)
 test(1.04, sm(x, y), bm(x, y))
-
 ## y unsorted
-x = c(1L,2L,3L,4L)
-y = c(2L,5L,3L)
+x = c(1L,2L,3L,4L) # unq
+y = c(2L,5L,3L)    # unq
 test(2.01, sm(x, y), bm(x, y))
 x = c(1L,2L,3L,3L,4L)
-y = c(3L,2L,5L)
+y = c(3L,2L,5L)    # unq
 test(2.02, sm(x, y), bm(x, y))
-x = c(1L,2L,3L,4L)
+x = c(1L,2L,3L,4L) # unq
 y = c(5L,3L,2L,3L)
 test(2.03, sm(x, y), bm(x, y))
 x = c(1L,2L,3L,3L,4L)
 y = c(5L,3L,3L,2L)
 test(2.04, sm(x, y), bm(x, y))
-
-#...
-
+## x unsorted
+x = c(2L,3L,1L,4L) # unq
+y = c(2L,3L,5L)    # unq
+test(3.01, sm(x, y), bm(x, y))
+x = c(1L,3L,2L,4L,3L)
+y = c(2L,3L,5L)    # unq
+test(3.02, sm(x, y), bm(x, y))
+x = c(4L,2L,3L,1L)
+y = c(2L,3L,3L,5L) # unq
+test(3.03, sm(x, y), bm(x, y))
+x = c(1L,2L,4L,3L,3L)
+y = c(2L,3L,3L,5L)
+test(3.04, sm(x, y), bm(x, y))
 ## xy unsorted
-x = c(4L,1L,3L,2L)
+x = c(4L,1L,3L,2L) # unq
 y = c(2L,5L,3L)
 test(4.01, sm(x, y), bm(x, y))
+x = c(1L,3L,2L,4L,3L)
+y = c(5L,3L,2L)    # unq
+test(4.02, sm(x, y), bm(x, y))
+x = c(4L,2L,3L,1L)
+y = c(3L,3L,2L,5L) # unq
+test(4.03, sm(x, y), bm(x, y))
+x = c(1L,2L,4L,3L,3L)
+y = c(5L,2L,3L,3L)
+test(4.04, sm(x, y), bm(x, y))
 
-#...
+# ties
+x = c(1L,2L,3L,4L,5L)
+y = c(2L,4L) # within
+test(5.01, sm(x, y), bm(x, y))
+x = c(1L,2L,3L,4L,5L)
+y = c(-1L,2L,4L) # left tie
+test(5.02, sm(x, y), bm(x, y))
+x = c(1L,2L,3L,4L,5L)
+y = c(2L,4L,7L) # right tie
+test(5.03, sm(x, y), bm(x, y))
+x = c(1L,2L,3L,4L,5L)
+y = c(-1L,2L,4L,6L) # both ties
+test(5.04, sm(x, y), bm(x, y))
 
-## xy sorted
+# nomatch
+x = c(1L,3L,5L)
+y = c(2L,4L) # within nomatch
+test(6.01, sm(x, y), bm(x, y))
+x = c(1L,2L,3L,4L,5L)
+y = c(-1L,6L) # ties nomatch
+test(6.02, sm(x, y), bm(x, y))
+x = c(1L,2L,2L,2L,5L)
+y = c(2L,4L) # x duplicates single match
+test(6.03, sm(x, y), bm(x, y))
+x = c(1L,2L,2L,2L,3L,3L,4L,5L)
+y = c(2L,4L) # x duplicates multi, single match
+test(6.04, sm(x, y), bm(x, y))
+x = c(1L,2L,2L,2L,3L,4L,4L,5L)
+y = c(2L,4L) # x duplicates multi, multi match
+test(6.05, sm(x, y), bm(x, y))
+x = c(1L,2L,2L,2L,5L)
+y = c(-1L,6L) # x duplicates nomatch
+test(6.06, sm(x, y), bm(x, y))
+
+# skew
+N = 2e3L
+x = seq_len(N)
+y = c(head(x), tail(x))
+test(7.01, sm(x, y), bm(x, y))
+y = c(1:6, 750L, 1250L, 1995:2000)
+test(7.02, sm(x, y), bm(x, y))
+
+# custom cases
+x=c(39L, 41L, 41L, 37L, 86L, 93L, 20L, 34L, 38L, 21L, 79L, 84L,
+2L, 80L, 51L, 58L, 66L, 33L, 32L, 22L, 24L, 4L, 67L, 59L, 89L,
+1L, 44L, 62L, 34L, 18L, 93L, 67L, 22L, 42L, 8L, 72L, 45L, 87L,
+41L, 85L, 30L, 61L, 5L, 45L, 48L, 41L, 57L, 63L, 68L, 96L, 72L,
+62L, 14L, 84L, 57L, 43L, 6L, 49L, 33L, 68L, 2L, 18L, 69L, 41L,
+2L, 52L, 69L, 94L, 56L, 72L, 13L, 50L, 86L, 81L, 8L, 28L, 96L,
+28L, 87L, 28L, 1L, 27L, 60L, 61L, 99L, 19L, 39L, 99L, 67L, 70L,
+53L, 86L, 64L, 49L, 99L, 91L, 36L, 7L, 57L, 63L)
+y=c(44L, 50L, 47L, 26L, 44L, 11L, 18L, 60L, 9L, 96L, 25L, 59L,
+53L, 82L, 4L, 41L, 65L, 30L, 29L, 34L, 29L, 23L, 12L, 40L, 76L,
+40L, 30L, 29L, 98L, 2L, 57L, 13L, 44L, 68L, 72L, 82L, 19L, 88L,
+19L, 95L, 22L, 46L, 43L, 36L, 67L, 96L, 34L, 6L, 16L, 20L, 86L,
+65L, 89L, 78L, 36L, 95L, 19L, 67L, 65L, 99L, 59L, 77L, 16L, 50L,
+99L, 98L, 72L, 26L, 35L, 46L, 52L, 55L, 56L, 1L, 91L, 21L, 52L,
+69L, 7L, 87L, 97L, 97L, 71L, 48L, 6L, 35L, 62L, 26L, 44L, 36L,
+50L, 75L, 100L, 63L, 39L, 3L, 94L, 85L, 99L, 61L)
+test(8.01, sm(x, y), bm(x, y))
+
+# scale up
 ssa = function(unq_n, size, sort=FALSE) {
-  stopifnot(unq_n <= size)
+  if (unq_n <= size) return(sample.int(unq_n, size, TRUE))
   unq_sub = seq_len(unq_n)
   ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE)))
-  if (unq_n==size) stopifnot(uniqueN(ans)==length(ans))
   if (sort) sort(ans) else ans
 }
 set.seed(108)
 N = 1e2L
-x = ssa(N, N, sort=TRUE)
-y = ssa(N, N, sort=TRUE)
+## xy sorted
+x = ssa(N, N, sort=TRUE) # unq
+y = ssa(N, N, sort=TRUE) # unq
 test(11.01, sm(x, y), bm(x, y))
 x = ssa(N, N*1.1, sort=TRUE)
-y = ssa(N, N, sort=TRUE)
+y = ssa(N, N, sort=TRUE) # unq
 test(11.02, sm(x, y), bm(x, y))
-x = ssa(N, N, sort=TRUE)
+x = ssa(N, N, sort=TRUE) # unq
 y = ssa(N, N*1.1, sort=TRUE)
 test(11.03, sm(x, y), bm(x, y))
 x = ssa(N, N*1.1, sort=TRUE)
 y = ssa(N, N*1.1, sort=TRUE)
 test(11.04, sm(x, y), bm(x, y))
-
 ## y unsorted
-x = ssa(N, N, sort=TRUE)
-y = ssa(N, N)
+x = ssa(N, N, sort=TRUE) # unq
+y = ssa(N, N) # unq
 test(12.01, sm(x, y), bm(x, y))
 x = ssa(N, N*1.1, sort=TRUE)
-y = ssa(N, N)
+y = ssa(N, N) # unq
 test(12.02, sm(x, y), bm(x, y))
-x = ssa(N, N, sort=TRUE)
+x = ssa(N, N, sort=TRUE) # unq
 y = ssa(N, N*1.1)
 test(12.03, sm(x, y), bm(x, y))
 x = ssa(N, N*1.1, sort=TRUE)
 y = ssa(N, N*1.1)
 test(12.04, sm(x, y), bm(x, y))
-
+## x unsorted
+x = ssa(N, N) # unq
+y = ssa(N, N, sort=TRUE) # unq
+test(13.01, sm(x, y), bm(x, y))
+x = ssa(N, N*1.1)
+y = ssa(N, N, sort=TRUE) # unq
+test(13.02, sm(x, y), bm(x, y))
+x = ssa(N, N) # unq
+y = ssa(N, N*1.1, sort=TRUE)
+test(13.03, sm(x, y), bm(x, y))
+x = ssa(N, N*1.1)
+y = ssa(N, N*1.1, sort=TRUE)
+test(13.04, sm(x, y), bm(x, y))
 ## xy unsorted
-x = ssa(N, N)
-y = ssa(N, N)
+x = ssa(N, N) # unq
+y = ssa(N, N) # unq
 test(14.01, sm(x, y), bm(x, y))
 x = ssa(N, N*1.1)
-y = ssa(N, N)
+y = ssa(N, N) # unq
 test(14.02, sm(x, y), bm(x, y))
-x = ssa(N, N)
+x = ssa(N, N) # unq
 y = ssa(N, N*1.1)
 test(14.03, sm(x, y), bm(x, y))
 x = ssa(N, N*1.1)
@@ -115,28 +206,30 @@ y = ssa(N, N*1.1)
 test(14.04, sm(x, y), bm(x, y))
 
 # sparse
-x = sample(2e2L, 1e2L) ## may fail when no matches!
-y = sample(2e2L, 1e2L)
+x = sample.int(2e2L, 1e2L)
+y = sample.int(2e2L, 1e2L)
 test(21.01, sm(x, y), bm(x, y))
-x = sample(2e2L, 1e2L, TRUE)
-y = sample(2e2L, 1e2L, TRUE)
+x = sample.int(2e2L, 1e2L, TRUE)
+y = sample.int(2e2L, 1e2L, TRUE)
 test(21.02, sm(x, y), bm(x, y))
 
-d1 = data.table(x=x, v1=seq_along(x))
-d2 = data.table(y=y, v2=seq_along(y))
+# [.data.table join
+d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_along(x))
+d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_along(y))
 options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #4491
 test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge")
 options(datatable.smerge=TRUE)
 test(101.02, d1[d2, on="x==y"], expected, output="smerge") ## for now extra computation of bmerge is still done so no: #, notOutput="bmerge")
-
 setindexv2 = function(x, cols) { ## pretend we are after #4386
   stopifnot(is.data.table(x), is.character(cols))
   if (is.null(attr(x, "index", TRUE))) setattr(x, "index", integer())
   setattr(attr(x, "index", TRUE), paste0("__", cols, collapse="__"), forderv(x, cols, retGrp=TRUE))
   invisible(x)
 }
-setindexv2(d1, "x")
-setindexv2(d2, "y")
+options(datatable.verbose=FALSE)
+setindexv2(d1, "x"); setindexv2(d2, "y")
+options(datatable.use.index=TRUE, datatable.verbose=TRUE)
 test(101.03, d1[d2, on="x==y"], expected, output="smerge.*already indexed")
 options(datatable.use.index=FALSE)
 test(101.04, d1[d2, on="x==y"], expected, output="smerge", notOutput="already indexed")
+options(datatable.use.index=TRUE)
diff --git a/src/smerge.c b/src/smerge.c
index 8ca13dd616..5ed2ef20a9 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -26,14 +26,13 @@ static void smerge(const int bx_off, const int bnx,
   uint64_t cnt = 0;
   bool xlen1 = true, ylen1 = true, xylen1 = true;
   if (unq_x && unq_y) {
-    int i=0, j=0;
-    const int *restrict bx = x + bx_off;
-    const int *restrict by = y + by_off;
-    while (i<bnx && j<bny) {
-      const int x_i = bx[i], y_j = by[j];
+    int i = bx_off, j = by_off;
+    const int ni = bx_off+bnx, nj = by_off+bny;
+    while (i<ni && j<nj) {
+      const int x_i = x[i], y_j = y[j];
       if (x_i == y_j) {
         starts[i] = j+1;
-        lens[i] = 1;
+        //lens[i] = 1; // already filled with 1's we will need this line if we change default alloc, same for unq_y branch
         i++;
 #ifdef SMERGE_STATS
         cnt++;
@@ -45,15 +44,14 @@ static void smerge(const int bx_off, const int bnx,
       }
     }
   } else if (unq_x) {
-    int i=0, js=0;
-    const int *restrict bx = x + bx_off;
-    const int *restrict ys = y_starts + by_off, *restrict yl = y_lens + by_off;
-    while (i<bnx && js<bny) {
-      const int j = ys[js]-1;
-      const int x_i = bx[i], y_j = y[j];
+    int i = bx_off, js = by_off;
+    const int ni = bx_off+bnx, njs = by_off+bny;
+    while (i<ni && js<njs) {
+      const int j = y_starts[js]-1;
+      const int x_i = x[i], y_j = y[j];
       if (x_i == y_j) {
         starts[i] = j+1;
-        const int yl1 = yl[js];
+        const int yl1 = y_lens[js];
         lens[i] = yl1;
         i++;
 #ifdef SMERGE_STATS
@@ -68,18 +66,17 @@ static void smerge(const int bx_off, const int bnx,
       }
     }
   } else if (unq_y) {
-    int is=0, j=0;
-    const int *restrict xs = x_starts + bx_off, *restrict xl = x_lens + bx_off;
-    const int *restrict by = y + by_off;
-    while (is<bnx && j<bny) {
-      const int i = xs[is]-1;
-      const int x_i = x[i], y_j = by[j];
+    int is = bx_off, j = by_off;
+    const int nis = bx_off+bnx, nj = by_off+bny;
+    while (is<nis && j<nj) {
+      const int i = x_starts[is]-1;
+      const int x_i = x[i], y_j = y[j];
       if (x_i == y_j) {
         const int j1 = j+1;
-        const int xl1 = xl[is];
+        const int xl1 = x_lens[is];
         for (int ii=0; ii<xl1; ++ii) {
           starts[i+ii] = j1;
-          lens[i+ii] = 1;
+          //lens[i+ii] = 1; // see comment in unq_x && unq_y branch
         }
         is++;
 #ifdef SMERGE_STATS
@@ -94,15 +91,14 @@ static void smerge(const int bx_off, const int bnx,
       }
     }
   } else {
-    int is=0, js=0;
-    const int *restrict xs = x_starts + bx_off, *restrict xl = x_lens + bx_off;
-    const int *restrict ys = y_starts + by_off, *restrict yl = y_lens + by_off;
-    while (is<bnx && js<bny) {
-      const int i = xs[is]-1, j = ys[js]-1;
+    int is = bx_off, js = by_off;
+    const int nis = bx_off+bnx, njs = by_off+bny;
+    while (is<nis && js<njs) {
+      const int i = x_starts[is]-1, j = y_starts[js]-1;
       const int x_i = x[i], y_j = y[j];
       if (x_i == y_j) {
-        const int j1 = j+1, yl1 = yl[js];
-        const int xl1 = xl[is];
+        const int j1 = j+1, yl1 = y_lens[js];
+        const int xl1 = x_lens[is];
         for (int ii=0; ii<xl1; ++ii) {
           starts[i+ii] = j1;
           lens[i+ii] = yl1;
@@ -124,6 +120,7 @@ static void smerge(const int bx_off, const int bnx,
       }
     }
   }
+  //Rprintf("cnt=%"PRIu64"\n", cnt);
   xlens1[0] = xlen1; ylens1[0] = ylen1; xylens1[0] = xylen1; nmatch[0] = cnt;
   return;
 }
@@ -147,36 +144,36 @@ static int min_i_match(const int *restrict y, const int *restrict y_starts, cons
 }
 /*
  * 'rolling nearest' binary search
- * used to find 'y' lower and upper bounds for each batch
+ * used to find 'y' lower and upper, 0-based, bounds for each batch
  * side -1: lower bound; side 1: upper bound
  */
 static int rollbs(const int *restrict x, const int *restrict ix, const int nix, const int val, const int side) {
   int verbose = 0; // devel debug only
-  if (verbose>0) Rprintf("rollbs: side=%d; val=%d\n", side, val);
+  if (verbose>0) Rprintf("rollbs: side=%d=%s; val=%d\n", side, side<0?"min":"max", val);
+  if (x[ix[0]-1] == val) { // common early stopping
+    if (verbose>0) Rprintf("rollbs: min elements %d match: 0\n", val);
+    return 0;
+  }
+  if (x[ix[nix-1]-1] == val) {
+    if (verbose>0) Rprintf("rollbs: max elements %d match: nix-1=%d\n", val, nix-1);
+    return nix-1;
+  }
   if (side < 0) { // lower bound early stopping
-    if (x[ix[0]-1] == val) {
-      if (verbose>0) Rprintf("rollbs: min elements %d match\n", val);
-      return 0;
-    }
     if (x[ix[nix-1]-1] < val) {
-      if (verbose>0) Rprintf("rollbs: max element %d is still smaller than %d\n", x[ix[nix-1]-1], val);
+      if (verbose>0) Rprintf("rollbs: max element %d is still smaller than %d: -1\n", x[ix[nix-1]-1], val);
       return -1;
     }
     if (x[ix[0]-1] > val) {
-      if (verbose>0) Rprintf("rollbs: min element %d is bigger than %d\n", x[ix[0]-1], val);
+      if (verbose>0) Rprintf("rollbs: min element %d is bigger than %d: 0\n", x[ix[0]-1], val);
       return 0;
     }
   } else if (side > 0) { // upper bound early stopping
-    if (x[ix[nix-1]-1] == val) {
-      if (verbose>0) Rprintf("rollbs: max elements %d match\n", val);
-      return nix-1;
-    }
     if (x[ix[0]-1] > val) {
-      if (verbose>0) Rprintf("rollbs: min element %d is still bigger than %d\n", x[ix[0]-1], val);
+      if (verbose>0) Rprintf("rollbs: min element %d is still bigger than %d: -1\n", x[ix[0]-1], val);
       return -1;
     }
     if (x[ix[nix-1]-1] < val) {
-      if (verbose>0) Rprintf("rollbs: max element %d is smaller than %d\n", x[ix[nix-1]-1], val);
+      if (verbose>0) Rprintf("rollbs: max element %d is smaller than %d: nix-1=%d\n", x[ix[nix-1]-1], val, nix-1);
       return nix-1;
     }
   }
@@ -192,7 +189,7 @@ static int rollbs(const int *restrict x, const int *restrict ix, const int nix,
     else if (thisx > val)
       upper = i-1;
   }
-  if (verbose>0) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d\n", i, x[ix[i]-1], lower, upper, side);
+  if (verbose>0) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d: %d\n", i, x[ix[i]-1], lower, upper, side, side<0?lower:upper);
   if (side < 0) // anyone to stress test this logic?
     return lower;
   else
@@ -205,10 +202,17 @@ static void batching(const int nBatch,
                      const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
                      int *restrict Bx_off, int *restrict Bnx, int *restrict By_off, int *restrict Bny,
                      const int verbose) {
+  //if (nBatch > nx_starts || (nx_starts/nBatch==1 && nx_starts%nBatch>0)) error("internal error: batching %d input into %d batches, number of batches should have been reduced be now", nx_starts, nBatch); // # nocov
+  //size_t batchSize = (nx_starts-1)/nBatch + 1; // this is fragile, at least when we hardcode nBatch
+  //size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
+  size_t batchSize = nx_starts / nBatch;
+  size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; // because of the above line last batch can be anything between 1 and 2*batchSize-1
   if (verbose>0)
-    Rprintf("batching: %d batches of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nBatch);
-  size_t batchSize = (nx_starts-1)/nBatch + 1;
-  size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
+    Rprintf("batching: input %d into %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, nBatch, batchSize, lastBatchSize);
+  if (lastBatchSize==0)
+    error("internal error: lastBatchSize must not be zero"); // # nocov
+  if ((nBatch-1) * batchSize + lastBatchSize != nx_starts)
+    error("internal error: batching %d input is attempting to use invalid batches: nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, nBatch, batchSize, lastBatchSize); // # nocov
   bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV
   for (int b=0; b<nBatch; ++b) {
     Bx_off[b] = b<nBatch-1 ? b*batchSize : nx_starts-lastBatchSize;
@@ -217,7 +221,8 @@ static void batching(const int nBatch,
     int x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1];
     int y_i_min = rollbs(y, y_starts, ny_starts, x[x_i_min-1], -1);
     int y_i_max = rollbs(y, y_starts, ny_starts, x[x_i_max-1], 1);
-    if (extra_validate) {
+    const bool y_match = y_i_min >= 0 && y_i_max >= 0;
+    if (extra_validate && y_match) {
       int y_i_min2 = min_i_match(y, y_starts, ny_starts, x[x_i_min-1]);
       int y_i_max2 = max_i_match(y, y_starts, ny_starts, x[x_i_max-1]);
       if (y_i_min!=y_i_min2)
@@ -225,16 +230,18 @@ static void batching(const int nBatch,
       if (y_i_max!=y_i_max2)
         error("y upper bound different: %d vs %d", y_i_max2, y_i_max);
     }
-    By_off[b] = y_i_min;
-    Bny[b] = y_i_max - y_i_min + 1;
+    By_off[b] = y_match ? y_i_min : 0;
+    Bny[b] = y_match ? y_i_max - y_i_min + 1 : 0;
   }
   if (verbose>0) { // print batches, 1-indexed! x y sorted! for debugging and verbose
     for (int b=0; b<nBatch; ++b) {
-      int x_i_min = (x_starts + Bx_off[b])[0], x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1];
-      int y_i_min = (y_starts + By_off[b])[0], y_i_max = (y_starts + By_off[b])[Bny[b]-1];
       Rprintf("#### batch[%d]: unq n: x=%d, y=%d\n", b+1, Bnx[b], Bny[b]);
-      Rprintf("## lower: x[%d]: %d <= %d :y[%d]\n", x_i_min, x[x_i_min-1], y[y_i_min-1], y_i_min);
-      Rprintf("## upper: x[%d]: %d >= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max);
+      if (Bny[b] > 0) {
+        int x_i_min = (x_starts + Bx_off[b])[0], x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1];
+        int y_i_min = (y_starts + By_off[b])[0], y_i_max = (y_starts + By_off[b])[Bny[b]-1];
+        Rprintf("## lower: x[%d]: %d <= %d :y[%d]\n", x_i_min, x[x_i_min-1], y[y_i_min-1], y_i_min);
+        Rprintf("## upper: x[%d]: %d >= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max);
+      }
     }
   }
   return;
@@ -279,7 +286,9 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
              uint64_t *n_match, int *x_lens1, int *y_lens1, int *xy_lens1,
              const int verbose) {
 
-  double t = omp_get_wtime();
+  double t = 0;
+  if (verbose>0)
+    t = omp_get_wtime();
   int *restrict x_lens = 0, *restrict y_lens = 0;
   const bool unq_x = nx_starts==nx, unq_y = ny_starts==ny;
   if (!unq_x) {
@@ -293,36 +302,51 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
   if (verbose>0)
     Rprintf("smergeC: grpLens %s took %.3fs\n", verboseDone(!unq_x, !unq_y, "(already unique)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t);
 
-  t = omp_get_wtime();
-  int nth = getDTthreads();
-  int nBatch = 0; // for a single threaded or small unq count use single batch
-  if (nth == 1 || nx_starts < 4) { // this is 4 only for testing, will be probably 1024 // #DEV
-    nBatch = 1;
+  if (verbose>0)
+    t = omp_get_wtime();
+  int nBatch = 0;
+  const int nth = getDTthreads();
+  if (nth == 1 || nx_starts < -1) { // nx_starts threshold disabled during dev
+    nBatch = 1; // any hardcoding here is likely to raise internal error in batching or segfault
+  } else if (nx_starts < nth * 2) {
+    nBatch = nx_starts; // stress test single row batches, will be usually escaped by branch above
   } else {
     nBatch = nth * 2;
   }
-  int *th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used
   int *restrict Bx_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bnx = (int *)R_alloc(nBatch, sizeof(int));
   int *restrict By_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bny = (int *)R_alloc(nBatch, sizeof(int));
   batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, verbose-1);
+  int *restrict th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used
   if (verbose>0)
     Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t);
 
-  t = omp_get_wtime();
+  if (verbose>0)
+    t = omp_get_wtime();
   bool xlens1 = true, ylens1 = true, xylens1 = true;
   uint64_t nmatch = 0;
+  //for (int z=0; z<nx; ++z) starts[z] = z;
+  //Rprintf("starts before smerge\n");
+  //for (int z=0; z<nx; ++z) Rprintf("starts[%d]=%d\n", z, starts[z]);
   #pragma omp parallel for schedule(dynamic) reduction(&&:xlens1,ylens1,xylens1) reduction(+:nmatch) num_threads(nth)
   for (int b=0; b<nBatch; ++b) {
+    bool bxlens1 = true, bylens1 = true, bxylens1 = true;
+    uint64_t bnmatch = 0;
     smerge(
       Bx_off[b], Bnx[b],                  // batch input x
       By_off[b], Bny[b],                  // batch input y
       x, x_starts, x_lens, unq_x,         // common input x
       y, y_starts, y_lens, unq_y,         // common input y
       starts, lens,                       // common output
-      &nmatch, &xlens1, &ylens1, &xylens1 // common reduction output
+      &bnmatch, &bxlens1, &bylens1, &bxylens1 // common reduction output
     );
+    nmatch += bnmatch;
+    xlens1 = xlens1 && bxlens1;
+    ylens1 = ylens1 && bylens1;
+    xylens1 = xylens1 && bxylens1;
     th[b] = omp_get_thread_num();
   }
+  //for (int z=0; z<nx; ++z) Rprintf("starts[%d]=%d\n", z, starts[z]);
+  //Rprintf("nmatch=%"PRIu64"\n", nmatch);
   x_lens1[0] = (int)xlens1; y_lens1[0] = (int)ylens1; xy_lens1[0] = (int)xylens1; n_match[0] = nmatch;
   if (verbose>0)
     Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); // all threads may not always be used bc schedule(dynamic)
@@ -330,7 +354,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
   return;
 }
 
-void sortInt(int *x, int nx, int *idx, int *ans) {
+void sortInt(const int *restrict x, const int nx, const int *restrict idx, int *restrict ans) {
   #pragma omp parallel for schedule(static) num_threads(getDTthreads())
   for (int i=0; i<nx; ++i)
     ans[i] = x[idx[i]-1];
@@ -338,11 +362,11 @@ void sortInt(int *x, int nx, int *idx, int *ans) {
 }
 
 // wrap results into list, bmerge=true this produce bmerge consistent output, note that bmerge i,x is smerge x,y
-SEXP outSmergeR(int n, int *starts, int *lens, bool x_ord,
+SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict lens, const bool x_ord,
                 SEXP out_starts, SEXP out_lens, // used only when x was sorted, saves one allocation
                 SEXP x_idx, SEXP y_idx,
                 SEXP n_match, SEXP x_lens1, SEXP y_lens1, SEXP xy_lens1,
-                bool bmerge) {
+                const bool bmerge) {
   int out_len = bmerge ? 6 : 10;
   SEXP ans = PROTECT(allocVector(VECSXP, out_len)), ansnames;
   setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, out_len));
@@ -385,16 +409,19 @@ SEXP outSmergeR(int n, int *starts, int *lens, bool x_ord,
 // main interface from R
 SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
 
-  double t_total = omp_get_wtime();
   const int verbose = GetVerbose()*3; // remove *3 after #4491
+  double t_total = 0, t = 0;
+  if (verbose>0)
+    t_total = omp_get_wtime();
   if (!isInteger(x) || !isInteger(y))
     error("'x' and 'y' must be integer");
   if (!IS_TRUE_OR_FALSE(out_bmerge))
     error("'out.bmerge' must be TRUE or FALSE");
   int protecti = 0, nx = LENGTH(x), ny = LENGTH(y);
 
-  double t = omp_get_wtime();
-  bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx);
+  if (verbose>0)
+    t = omp_get_wtime();
+  const bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx);
   if (do_x_idx) {
     x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533
   }
@@ -409,8 +436,9 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
   if (verbose>0)
     Rprintf("smergeR: index %s took %.3fs\n", verboseDone(do_x_idx, do_y_idx, "(already indexed)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t);
 
-  t = omp_get_wtime();
-  bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx);
+  if (verbose>0)
+    t = omp_get_wtime();
+  const bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx);
   int *xp, *yp;
   if (!x_ord) {
     xp = (int *)R_alloc(nx, sizeof(int));
@@ -427,7 +455,8 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
   if (verbose>0)
     Rprintf("smergeR: sort %s took %.3fs\n", verboseDone(!x_ord, !y_ord, "(already sorted)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t);
 
-  t = omp_get_wtime();
+  if (verbose>0)
+    t = omp_get_wtime();
   SEXP out_starts = R_NilValue, out_lens = R_NilValue;
   int *restrict starts=0, *restrict lens=0;
   if (x_ord) { // we dont need to reorder results so can save one allocation
@@ -452,7 +481,8 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
   if (verbose>0)
     Rprintf("smergeR: alloc of size %d took %.3fs\n", nx, omp_get_wtime() - t);
 
-  t = omp_get_wtime();
+  if (verbose>0)
+    t = omp_get_wtime();
   smergeC(
     xp, nx, INTEGER(x_starts), LENGTH(x_starts),
     yp, ny, INTEGER(y_starts), LENGTH(y_starts),
@@ -469,7 +499,8 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
   if (verbose>0)
     Rprintf("smergeR: smergeC of %d x %d = %"PRIu64"; took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t);
 
-  t = omp_get_wtime();
+  if (verbose>0)
+    t = omp_get_wtime();
   SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, (bool)LOGICAL(out_bmerge)[0]);
   if (verbose>0)
     Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(unsort)", omp_get_wtime() - t);

From 6423c0b489a1a2917769644b56a2ab8521e59f12 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Tue, 9 Jun 2020 23:39:53 +0100
Subject: [PATCH 20/34] batching balanced again

---
 inst/tests/smerge.Rraw |  5 +++-
 src/smerge.c           | 52 +++++++++++++++++++++++-------------------
 2 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw
index 99eb2cfef9..78ae166982 100644
--- a/inst/tests/smerge.Rraw
+++ b/inst/tests/smerge.Rraw
@@ -9,12 +9,14 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   smerge = data.table:::smerge
   bmerge = data.table:::bmerge
   forderv = data.table:::forderv
+  vecseq = data.table:::vecseq
 }
 
 bm = function(x, y) {
   stopifnot(is.integer(x), is.integer(y))
   ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)
   ## if undefining SMERGE_STATS then we have to ignore allLen1 as well
+  ans$nMatch = as.numeric(sum(!is.na(vecseq(ans$starts, ans$lens, NULL))))
   ans
 }
 sm = function(x, y) {
@@ -22,6 +24,7 @@ sm = function(x, y) {
   #saveRDS(x, "~/git/data.table/x.RDS"); saveRDS(y, "~/git/data.table/y.RDS")
   ans = smerge(x, y, out.bmerge=TRUE)
   ## if undefining SMERGE_STATS then we have to ignore allLen1 as well
+  ans$nMatch = smerge(x, y, out.bmerge=FALSE)$nMatch
   ans
 }
 
@@ -151,7 +154,7 @@ ssa = function(unq_n, size, sort=FALSE) {
   if (sort) sort(ans) else ans
 }
 set.seed(108)
-N = 1e2L
+N = 1e4L
 ## xy sorted
 x = ssa(N, N, sort=TRUE) # unq
 y = ssa(N, N, sort=TRUE) # unq
diff --git a/src/smerge.c b/src/smerge.c
index 5ed2ef20a9..cdfdef57aa 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -13,8 +13,11 @@
  * x_len1/x_lens1/lhsLen1:       signals if multiple matches in LHS of join, ['s _i_ table
  * xy_len1/xy_lens1/xlLen1:      signals if "many to many" matches between LHS and RHS
  * cnt/nmatch/n_match/n_matchr:  count of matches, taking multiple matches into account, uint64_t
+ *
+ * when hardcoding or changing default number of batches it is advised to undefine SMERGE_BATCHING_BALANCED
  */
 #define SMERGE_STATS
+#define SMERGE_BATCHING_BALANCED
 
 // workhorse join that runs in parallel on batches
 static void smerge(const int bx_off, const int bnx,
@@ -202,18 +205,20 @@ static void batching(const int nBatch,
                      const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
                      int *restrict Bx_off, int *restrict Bnx, int *restrict By_off, int *restrict Bny,
                      const int verbose) {
+#ifdef SMERGE_BATCHING_BALANCED
   //if (nBatch > nx_starts || (nx_starts/nBatch==1 && nx_starts%nBatch>0)) error("internal error: batching %d input into %d batches, number of batches should have been reduced be now", nx_starts, nBatch); // # nocov
-  //size_t batchSize = (nx_starts-1)/nBatch + 1; // this is fragile, at least when we hardcode nBatch
-  //size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
-  size_t batchSize = nx_starts / nBatch;
-  size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize; // because of the above line last batch can be anything between 1 and 2*batchSize-1
+  size_t batchSize = (nx_starts-1)/nBatch + 1; // this is fragile for arbitrary nBatch
+  bool balanced = true; // this is only for verbose message
+#else
+  size_t batchSize = nx_starts / nBatch; // last batch size can be anything between 1 and 2*batchSize-1
+  bool balanced = false;
+#endif
+  size_t lastBatchSize = nx_starts - (nBatch-1)*batchSize;
   if (verbose>0)
-    Rprintf("batching: input %d into %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, nBatch, batchSize, lastBatchSize);
-  if (lastBatchSize==0)
-    error("internal error: lastBatchSize must not be zero"); // # nocov
-  if ((nBatch-1) * batchSize + lastBatchSize != nx_starts)
-    error("internal error: batching %d input is attempting to use invalid batches: nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, nBatch, batchSize, lastBatchSize); // # nocov
-  bool extra_validate = true; // validates against very slow m[in|ax]_i_match // #DEV
+    Rprintf("batching: input %d into %s %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize);
+  if (lastBatchSize==0 || ((nBatch-1) * batchSize + lastBatchSize != nx_starts))
+    error("internal error: batching %d input is attempting to use invalid batches: balanced=%d, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov
+  bool extra_validate = false; // validates against very slow m[in|ax]_i_match // this needs to be removed #DEV
   for (int b=0; b<nBatch; ++b) {
     Bx_off[b] = b<nBatch-1 ? b*batchSize : nx_starts-lastBatchSize;
     Bnx[b] = b<nBatch-1 ? batchSize : lastBatchSize;
@@ -306,8 +311,12 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
     t = omp_get_wtime();
   int nBatch = 0;
   const int nth = getDTthreads();
-  if (nth == 1 || nx_starts < -1) { // nx_starts threshold disabled during dev
-    nBatch = 1; // any hardcoding here is likely to raise internal error in batching or segfault
+  if (nth == 1 || nx_starts < 1024) { // nx_starts threshold disabled during dev
+#ifdef SMERGE_BATCHING_BALANCED
+    nBatch = 1; // when using balanced lastBatchSize is never bigger than batchSize, any hardcoding here is likely to raise internal error in batching or segfault, testing possible with: for (i in 1:10) cc("smerge.Rraw")
+#else
+    nBatch = 1; // so if hardcoding needed, do it here, and undefine SMERGE_BATCHING_BALANCED at the top, for unbalanced batching lastBatchSize is anything between 1 and 2*batchSize-1
+#endif
   } else if (nx_starts < nth * 2) {
     nBatch = nx_starts; // stress test single row batches, will be usually escaped by branch above
   } else {
@@ -329,20 +338,17 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
   //for (int z=0; z<nx; ++z) Rprintf("starts[%d]=%d\n", z, starts[z]);
   #pragma omp parallel for schedule(dynamic) reduction(&&:xlens1,ylens1,xylens1) reduction(+:nmatch) num_threads(nth)
   for (int b=0; b<nBatch; ++b) {
+    uint64_t bnmatch = 0; // local reduction variables
     bool bxlens1 = true, bylens1 = true, bxylens1 = true;
-    uint64_t bnmatch = 0;
     smerge(
-      Bx_off[b], Bnx[b],                  // batch input x
-      By_off[b], Bny[b],                  // batch input y
-      x, x_starts, x_lens, unq_x,         // common input x
-      y, y_starts, y_lens, unq_y,         // common input y
-      starts, lens,                       // common output
-      &bnmatch, &bxlens1, &bylens1, &bxylens1 // common reduction output
+      Bx_off[b], Bnx[b],                      // batch input x
+      By_off[b], Bny[b],                      // batch input y
+      x, x_starts, x_lens, unq_x,             // common input x
+      y, y_starts, y_lens, unq_y,             // common input y
+      starts, lens,                           // common output
+      &bnmatch, &bxlens1, &bylens1, &bxylens1 // reduction output
     );
-    nmatch += bnmatch;
-    xlens1 = xlens1 && bxlens1;
-    ylens1 = ylens1 && bylens1;
-    xylens1 = xylens1 && bxylens1;
+    nmatch += bnmatch; xlens1 = bxlens1 && xlens1; ylens1 = bylens1 && ylens1; xylens1 = bxylens1 && xylens1;
     th[b] = omp_get_thread_num();
   }
   //for (int z=0; z<nx; ++z) Rprintf("starts[%d]=%d\n", z, starts[z]);

From 045e9e81352b55fd9a916ca72c1d5151b4f59cea Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Tue, 9 Jun 2020 23:51:22 +0100
Subject: [PATCH 21/34] remove extra checks in bmerge to smerge opt

---
 R/bmerge.R | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/R/bmerge.R b/R/bmerge.R
index 3ec080fb18..c385eab265 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -1,7 +1,6 @@
 
 bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
 {
-  smergeOpt = FALSE
   if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer
       && isTRUE(getOption("datatable.smerge"))                     ## enable option
       && identical(nomatch, NA_integer_)                           ## for now only outer join
@@ -16,9 +15,9 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
       if (!is.null(attr(idx, "starts", exact=TRUE))) idx
     }
     if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()}
-    smans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE)
+    ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE)
     if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()}
-    smergeOpt = TRUE
+    return(ans)
   }
   callersi = i
   i = shallow(i)
@@ -200,10 +199,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
   ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp)
   if (verbose) {cat("bmerge done in",timetaken(last.started.at),"\n"); flush.console()}
 
-  if (smergeOpt && !identical(ans, smans)) {
-    (if (!interactive()) stop else message)("bmerge opt to smerge produced different results")
-    browser()
-  }
   return(ans)
 }
 

From 739a35a4fdc8254a17b666e5426ac3cb9795bd44 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Wed, 10 Jun 2020 21:34:57 +0100
Subject: [PATCH 22/34] fix test function

---
 inst/tests/smerge.Rraw | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw
index 78ae166982..31dcd6e07b 100644
--- a/inst/tests/smerge.Rraw
+++ b/inst/tests/smerge.Rraw
@@ -148,7 +148,7 @@ test(8.01, sm(x, y), bm(x, y))
 
 # scale up
 ssa = function(unq_n, size, sort=FALSE) {
-  if (unq_n <= size) return(sample.int(unq_n, size, TRUE))
+  if (unq_n > size) return(sample.int(unq_n, size))
   unq_sub = seq_len(unq_n)
   ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE)))
   if (sort) sort(ans) else ans

From 47cfd8a68df469bf9765956fcf71d1c5a3d6cf94 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Wed, 10 Jun 2020 21:35:54 +0100
Subject: [PATCH 23/34] improve verbose msg

---
 src/smerge.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/smerge.c b/src/smerge.c
index cdfdef57aa..6028110961 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -509,7 +509,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
     t = omp_get_wtime();
   SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, (bool)LOGICAL(out_bmerge)[0]);
   if (verbose>0)
-    Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(unsort)", omp_get_wtime() - t);
+    Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(alloc and unsort)", omp_get_wtime() - t);
   if (verbose>0)
     Rprintf("smergeR: all took %.3fs\n", omp_get_wtime() - t_total);
 

From 2806017cba78d6def44fee85d09b14bf3629e3f1 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Thu, 11 Jun 2020 00:38:54 +0100
Subject: [PATCH 24/34] cleanup dev mostly

---
 src/smerge.c | 81 +++++++++++++++++++---------------------------------
 1 file changed, 30 insertions(+), 51 deletions(-)

diff --git a/src/smerge.c b/src/smerge.c
index 6028110961..2049065087 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -123,28 +123,10 @@ static void smerge(const int bx_off, const int bnx,
       }
     }
   }
-  //Rprintf("cnt=%"PRIu64"\n", cnt);
   xlens1[0] = xlen1; ylens1[0] = ylen1; xylens1[0] = xylen1; nmatch[0] = cnt;
   return;
 }
 
-// those two helpers needs to be removed once we will confirm rollbs works as expected, they find "first/last within" index
-static int max_i_match(const int *restrict y, const int *restrict y_starts, const int ny_starts, const int val) {
-  int i;
-  for (i=0; i<ny_starts; ++i) { //Rprintf("max_i_match: y[y_starts[%d]-1] >= val: %d >= %d\n", i, y[y_starts[i]-1], val);
-    if (y[y_starts[i]-1]==val) return(i);
-    else if (y[y_starts[i]-1]>val) return(i-1);
-  }
-  return(i-1);
-}
-static int min_i_match(const int *restrict y, const int *restrict y_starts, const int ny_starts, const int val) {
-  int i;
-  for (i=ny_starts-1; i>=0; --i) { //Rprintf("min_i_match: y[y_starts[%d]-1] <= val: %d <= %d\n", i, y[y_starts[i]-1], val);
-    if (y[y_starts[i]-1]==val) return(i);
-    else if (y[y_starts[i]-1]<val) return(i+1);
-  }
-  return(i+1);
-}
 /*
  * 'rolling nearest' binary search
  * used to find 'y' lower and upper, 0-based, bounds for each batch
@@ -152,31 +134,38 @@ static int min_i_match(const int *restrict y, const int *restrict y_starts, cons
  */
 static int rollbs(const int *restrict x, const int *restrict ix, const int nix, const int val, const int side) {
   int verbose = 0; // devel debug only
-  if (verbose>0) Rprintf("rollbs: side=%d=%s; val=%d\n", side, side<0?"min":"max", val);
+  if (verbose>0)
+    Rprintf("rollbs: side=%d=%s; val=%d\n", side, side<0?"min":"max", val); // # nocov
   if (x[ix[0]-1] == val) { // common early stopping
-    if (verbose>0) Rprintf("rollbs: min elements %d match: 0\n", val);
+    if (verbose>0)
+      Rprintf("rollbs: min elements %d match: 0\n", val); // # nocov
     return 0;
   }
   if (x[ix[nix-1]-1] == val) {
-    if (verbose>0) Rprintf("rollbs: max elements %d match: nix-1=%d\n", val, nix-1);
+    if (verbose>0)
+      Rprintf("rollbs: max elements %d match: nix-1=%d\n", val, nix-1); // # nocov
     return nix-1;
   }
   if (side < 0) { // lower bound early stopping
     if (x[ix[nix-1]-1] < val) {
-      if (verbose>0) Rprintf("rollbs: max element %d is still smaller than %d: -1\n", x[ix[nix-1]-1], val);
+      if (verbose>0)
+        Rprintf("rollbs: max element %d is still smaller than %d: -1\n", x[ix[nix-1]-1], val); // # nocov
       return -1;
     }
     if (x[ix[0]-1] > val) {
-      if (verbose>0) Rprintf("rollbs: min element %d is bigger than %d: 0\n", x[ix[0]-1], val);
+      if (verbose>0)
+        Rprintf("rollbs: min element %d is bigger than %d: 0\n", x[ix[0]-1], val); // # nocov
       return 0;
     }
   } else if (side > 0) { // upper bound early stopping
     if (x[ix[0]-1] > val) {
-      if (verbose>0) Rprintf("rollbs: min element %d is still bigger than %d: -1\n", x[ix[0]-1], val);
+      if (verbose>0)
+        Rprintf("rollbs: min element %d is still bigger than %d: -1\n", x[ix[0]-1], val); // # nocov
       return -1;
     }
     if (x[ix[nix-1]-1] < val) {
-      if (verbose>0) Rprintf("rollbs: max element %d is smaller than %d: nix-1=%d\n", x[ix[nix-1]-1], val, nix-1);
+      if (verbose>0)
+        Rprintf("rollbs: max element %d is smaller than %d: nix-1=%d\n", x[ix[nix-1]-1], val, nix-1); // # nocov
       return nix-1;
     }
   }
@@ -184,7 +173,7 @@ static int rollbs(const int *restrict x, const int *restrict ix, const int nix,
   while (lower<=upper) {
     i = lower + (upper-lower)/2;
     int thisx = x[ix[i]-1];
-    //if (verbose>0) Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val);
+    //Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val); // if (verbose) here would slow down while loop so need to be commented out
     if (thisx==val)
       return(i);
     else if (thisx < val)
@@ -192,7 +181,8 @@ static int rollbs(const int *restrict x, const int *restrict ix, const int nix,
     else if (thisx > val)
       upper = i-1;
   }
-  if (verbose>0) Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d: %d\n", i, x[ix[i]-1], lower, upper, side, side<0?lower:upper);
+  if (verbose>0)
+    Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d: %d\n", i, x[ix[i]-1], lower, upper, side, side<0?lower:upper); // # nocov
   if (side < 0) // anyone to stress test this logic?
     return lower;
   else
@@ -218,7 +208,6 @@ static void batching(const int nBatch,
     Rprintf("batching: input %d into %s %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize);
   if (lastBatchSize==0 || ((nBatch-1) * batchSize + lastBatchSize != nx_starts))
     error("internal error: batching %d input is attempting to use invalid batches: balanced=%d, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov
-  bool extra_validate = false; // validates against very slow m[in|ax]_i_match // this needs to be removed #DEV
   for (int b=0; b<nBatch; ++b) {
     Bx_off[b] = b<nBatch-1 ? b*batchSize : nx_starts-lastBatchSize;
     Bnx[b] = b<nBatch-1 ? batchSize : lastBatchSize;
@@ -227,14 +216,6 @@ static void batching(const int nBatch,
     int y_i_min = rollbs(y, y_starts, ny_starts, x[x_i_min-1], -1);
     int y_i_max = rollbs(y, y_starts, ny_starts, x[x_i_max-1], 1);
     const bool y_match = y_i_min >= 0 && y_i_max >= 0;
-    if (extra_validate && y_match) {
-      int y_i_min2 = min_i_match(y, y_starts, ny_starts, x[x_i_min-1]);
-      int y_i_max2 = max_i_match(y, y_starts, ny_starts, x[x_i_max-1]);
-      if (y_i_min!=y_i_min2)
-        error("y lower bound different: %d vs %d", y_i_min2, y_i_min);
-      if (y_i_max!=y_i_max2)
-        error("y upper bound different: %d vs %d", y_i_max2, y_i_max);
-    }
     By_off[b] = y_match ? y_i_min : 0;
     Bny[b] = y_match ? y_i_max - y_i_min + 1 : 0;
   }
@@ -252,7 +233,7 @@ static void batching(const int nBatch,
   return;
 }
 
-// helper to count how many threads were used
+// helper for verbose messages to count how many threads were used, due to schedule dynamic not all may be used
 static int unqNth(const int *x, const int nx) { // x have 0:(nx-1) values
   int ans = 0;
   uint8_t *seen = (uint8_t *)R_alloc(nx, sizeof(uint8_t));
@@ -266,13 +247,13 @@ static int unqNth(const int *x, const int nx) { // x have 0:(nx-1) values
 // helper for verbose messages to print what was actually computed
 static const char *verboseDone(const bool x, const bool y, const char *not_xy, const char *not_x, const char *not_y, const char *xy) {
   if (!x && !y)
-    return(not_xy);
+    return not_xy;
   else if (!x && y)
-    return(not_x);
+    return not_x;
   else if (x && !y)
-    return(not_y);
+    return not_y;
   else
-    return(xy);
+    return xy;
 }
 
 // count of duplicated entries
@@ -311,7 +292,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
     t = omp_get_wtime();
   int nBatch = 0;
   const int nth = getDTthreads();
-  if (nth == 1 || nx_starts < 1024) { // nx_starts threshold disabled during dev
+  if (nth == 1 || nx_starts < 1024) {
 #ifdef SMERGE_BATCHING_BALANCED
     nBatch = 1; // when using balanced lastBatchSize is never bigger than batchSize, any hardcoding here is likely to raise internal error in batching or segfault, testing possible with: for (i in 1:10) cc("smerge.Rraw")
 #else
@@ -333,9 +314,6 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
     t = omp_get_wtime();
   bool xlens1 = true, ylens1 = true, xylens1 = true;
   uint64_t nmatch = 0;
-  //for (int z=0; z<nx; ++z) starts[z] = z;
-  //Rprintf("starts before smerge\n");
-  //for (int z=0; z<nx; ++z) Rprintf("starts[%d]=%d\n", z, starts[z]);
   #pragma omp parallel for schedule(dynamic) reduction(&&:xlens1,ylens1,xylens1) reduction(+:nmatch) num_threads(nth)
   for (int b=0; b<nBatch; ++b) {
     uint64_t bnmatch = 0; // local reduction variables
@@ -351,8 +329,6 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
     nmatch += bnmatch; xlens1 = bxlens1 && xlens1; ylens1 = bylens1 && ylens1; xylens1 = bxylens1 && xylens1;
     th[b] = omp_get_thread_num();
   }
-  //for (int z=0; z<nx; ++z) Rprintf("starts[%d]=%d\n", z, starts[z]);
-  //Rprintf("nmatch=%"PRIu64"\n", nmatch);
   x_lens1[0] = (int)xlens1; y_lens1[0] = (int)ylens1; xy_lens1[0] = (int)xylens1; n_match[0] = nmatch;
   if (verbose>0)
     Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); // all threads may not always be used bc schedule(dynamic)
@@ -367,7 +343,7 @@ void sortInt(const int *restrict x, const int nx, const int *restrict idx, int *
   return;
 }
 
-// wrap results into list, bmerge=true this produce bmerge consistent output, note that bmerge i,x is smerge x,y
+// wrap results into list, bmerge=true produces bmerge replacement output, note that bmerge i,x is smerge x,y (as of now)
 SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict lens, const bool x_ord,
                 SEXP out_starts, SEXP out_lens, // used only when x was sorted, saves one allocation
                 SEXP x_idx, SEXP y_idx,
@@ -423,6 +399,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
     error("'x' and 'y' must be integer");
   if (!IS_TRUE_OR_FALSE(out_bmerge))
     error("'out.bmerge' must be TRUE or FALSE");
+  const bool ans_bmerge = (bool)LOGICAL(out_bmerge)[0];
   int protecti = 0, nx = LENGTH(x), ny = LENGTH(y);
 
   if (verbose>0)
@@ -474,10 +451,12 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
     starts = (int *)R_alloc(nx, sizeof(int));
     lens = (int *)R_alloc(nx, sizeof(int));
   }
+  // this fills default values, bmerge's defaults are tricky (dictated by how they are consumed): nomatch=0 makes starts=0 not NA, lens=0 is fine there; nomatch=NA makes lens=1 not NA, starts=NA is fine there
+  const int default_lens = 1; //ans_bmerge ? 1 : NA_INTEGER; // for now keep it fixed to bmerge nomatch out: 1
   #pragma omp parallel for schedule(static) num_threads(getDTthreads())
   for (int i=0; i<nx; ++i) {
     starts[i] = NA_INTEGER;
-    lens[i] = 1;
+    lens[i] = default_lens;
   }
   uint64_t n_match = 0;
   SEXP x_lens1 = PROTECT(allocVector(LGLSXP, 1)); protecti++;
@@ -496,7 +475,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
     &n_match, INTEGER(x_lens1), INTEGER(y_lens1), INTEGER(xy_lens1),
     verbose-1
   );
-  if (n_match > (uint64_t)DBL_MAX) {
+  if (n_match > (uint64_t)DBL_MAX) { // 1e9x1e9 cartesian join 1e18 still less than DBL_MAX, should we check against DBL_MAX or something lesser? we cast uinst64_t to double here
     REAL(n_matchr)[0] = NA_REAL;
     warning("count of matches exceeds DBL_MAX, returning NA in 'nMatch' field");
   } else {
@@ -507,7 +486,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
 
   if (verbose>0)
     t = omp_get_wtime();
-  SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, (bool)LOGICAL(out_bmerge)[0]);
+  SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, ans_bmerge);
   if (verbose>0)
     Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(alloc and unsort)", omp_get_wtime() - t);
   if (verbose>0)

From 4e5f56d23b5fb4a49b8aed88f89bdd47e7088caf Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Thu, 11 Jun 2020 02:01:04 +0100
Subject: [PATCH 25/34] comment about thread utilization

---
 src/smerge.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/smerge.c b/src/smerge.c
index 2049065087..e1a64e313d 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -327,7 +327,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
       &bnmatch, &bxlens1, &bylens1, &bxylens1 // reduction output
     );
     nmatch += bnmatch; xlens1 = bxlens1 && xlens1; ylens1 = bylens1 && ylens1; xylens1 = bxylens1 && xylens1;
-    th[b] = omp_get_thread_num();
+    th[b] = omp_get_thread_num(); // only to report thread utilization in verbose message
   }
   x_lens1[0] = (int)xlens1; y_lens1[0] = (int)ylens1; xy_lens1[0] = (int)xylens1; n_match[0] = nmatch;
   if (verbose>0)

From 405f55d7bb62e502380321db522fdffc57b234de Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Fri, 12 Jun 2020 11:47:01 +0100
Subject: [PATCH 26/34] mult support

---
 R/wrappers.R           |   2 +-
 inst/tests/smerge.Rraw |  15 ++--
 src/data.table.h       |   2 +-
 src/smerge.c           | 195 ++++++++++++++++++++++++++++++++---------
 4 files changed, 161 insertions(+), 53 deletions(-)

diff --git a/R/wrappers.R b/R/wrappers.R
index 83e3d73417..636e6e04ee 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -13,4 +13,4 @@ coerceFill = function(x) .Call(CcoerceFillR, x)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
-smerge = function(x, y, x.idx=NULL, y.idx=NULL, out.bmerge=FALSE) .Call(CsmergeR, x, y, x.idx, y.idx, out.bmerge)
+smerge = function(x, y, x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.idx, y.idx, match.arg(mult), out.bmerge)
diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw
index 31dcd6e07b..3a3bf9812a 100644
--- a/inst/tests/smerge.Rraw
+++ b/inst/tests/smerge.Rraw
@@ -12,19 +12,18 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   vecseq = data.table:::vecseq
 }
 
-bm = function(x, y) {
+bm = function(x, y, mult="all") {
   stopifnot(is.integer(x), is.integer(y))
-  ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult="all", ops=1L, verbose=FALSE)
+  ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=1L, verbose=FALSE)
   ## if undefining SMERGE_STATS then we have to ignore allLen1 as well
   ans$nMatch = as.numeric(sum(!is.na(vecseq(ans$starts, ans$lens, NULL))))
   ans
 }
-sm = function(x, y) {
+sm = function(x, y, mult="all") {
   stopifnot(is.integer(x), is.integer(y))
-  #saveRDS(x, "~/git/data.table/x.RDS"); saveRDS(y, "~/git/data.table/y.RDS")
-  ans = smerge(x, y, out.bmerge=TRUE)
+  ans = smerge(x, y, mult=mult, out.bmerge=TRUE)
   ## if undefining SMERGE_STATS then we have to ignore allLen1 as well
-  ans$nMatch = smerge(x, y, out.bmerge=FALSE)$nMatch
+  ans$nMatch = smerge(x, y, mult=mult, out.bmerge=FALSE)$nMatch
   ans
 }
 
@@ -39,8 +38,8 @@ test(1.01, sm(x, y), bm(x, y))
 x = c(1L,2L,3L,3L,4L)
 y = c(2L,3L,5L)    # unq
 test(1.02, sm(x, y), bm(x, y))
-x = c(1L,2L,3L,4L)
-y = c(2L,3L,3L,5L) # unq
+x = c(1L,2L,3L,4L) # unq
+y = c(2L,3L,3L,5L)
 test(1.03, sm(x, y), bm(x, y))
 x = c(1L,2L,3L,3L,4L)
 y = c(2L,3L,3L,5L)
diff --git a/src/data.table.h b/src/data.table.h
index 9ceb722cea..231f8f28b4 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -250,4 +250,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
 
 // smjoin.c
-SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge);
+SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge);
diff --git a/src/smerge.c b/src/smerge.c
index e1a64e313d..41de40d82e 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -19,13 +19,17 @@
 #define SMERGE_STATS
 #define SMERGE_BATCHING_BALANCED
 
+// this only refers to RHS side: y
+enum emult{ALL, FIRST, LAST, ERR};
+
 // workhorse join that runs in parallel on batches
 static void smerge(const int bx_off, const int bnx,
                    const int by_off, const int bny,
                    const int *restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x,
                    const int *restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y,
                    int *restrict starts, int *restrict lens,
-                   uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1) {
+                   uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1,
+                   const enum emult mult) {
   uint64_t cnt = 0;
   bool xlen1 = true, ylen1 = true, xylen1 = true;
   if (unq_x && unq_y) {
@@ -49,23 +53,59 @@ static void smerge(const int bx_off, const int bnx,
   } else if (unq_x) {
     int i = bx_off, js = by_off;
     const int ni = bx_off+bnx, njs = by_off+bny;
-    while (i<ni && js<njs) {
-      const int j = y_starts[js]-1;
-      const int x_i = x[i], y_j = y[j];
-      if (x_i == y_j) {
-        starts[i] = j+1;
-        const int yl1 = y_lens[js];
-        lens[i] = yl1;
-        i++;
+    if (mult == ALL || mult == ERR) { // mult==err is raised based on ylen1 flag outside of parallel region
+      while (i<ni && js<njs) {
+        const int j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          starts[i] = j+1;
+          const int yl1 = y_lens[js];
+          lens[i] = yl1;
+          i++;
 #ifdef SMERGE_STATS
-        if (ylen1 && yl1>1)
-          ylen1 = false;
-        cnt += (uint64_t)yl1;
+          if (ylen1 && yl1>1)
+            ylen1 = false;
+          cnt += (uint64_t)yl1;
 #endif
-      } else if (x_i < y_j) {
-        i++;
-      } else if (x_i > y_j) {
-        js++;
+        } else if (x_i < y_j) {
+          i++;
+        } else if (x_i > y_j) {
+          js++;
+        }
+      }
+    } else if (mult == FIRST) {
+      while (i<ni && js<njs) {
+        const int j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          starts[i] = j+1;
+          lens[i] = 1; // could be pre-allocated with 1 for mult!=all?
+          i++;
+#ifdef SMERGE_STATS
+          cnt++;
+#endif
+        } else if (x_i < y_j) {
+          i++;
+        } else if (x_i > y_j) {
+          js++;
+        }
+      }
+    } else if (mult == LAST) {
+      while (i<ni && js<njs) {
+        const int j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          starts[i] = j+y_lens[js]; // check if unsort will do it correctly
+          lens[i] = 1;
+          i++;
+#ifdef SMERGE_STATS
+          cnt++;
+#endif
+        } else if (x_i < y_j) {
+          i++;
+        } else if (x_i > y_j) {
+          js++;
+        }
       }
     }
   } else if (unq_y) {
@@ -96,30 +136,78 @@ static void smerge(const int bx_off, const int bnx,
   } else {
     int is = bx_off, js = by_off;
     const int nis = bx_off+bnx, njs = by_off+bny;
-    while (is<nis && js<njs) {
-      const int i = x_starts[is]-1, j = y_starts[js]-1;
-      const int x_i = x[i], y_j = y[j];
-      if (x_i == y_j) {
-        const int j1 = j+1, yl1 = y_lens[js];
-        const int xl1 = x_lens[is];
-        for (int ii=0; ii<xl1; ++ii) {
-          starts[i+ii] = j1;
-          lens[i+ii] = yl1;
+    if (mult==ALL || mult==ERR) {
+      while (is<nis && js<njs) {
+        const int i = x_starts[is]-1, j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          const int j1 = j+1, yl1 = y_lens[js];
+          const int xl1 = x_lens[is];
+          for (int ii=0; ii<xl1; ++ii) {
+            starts[i+ii] = j1;
+            lens[i+ii] = yl1;
+          }
+          is++;
+  #ifdef SMERGE_STATS
+          if (xlen1 && xl1>1)
+            xlen1 = false;
+          if (ylen1 && yl1>1)
+            ylen1 = false;
+          if (xylen1 && xl1>1 && yl1>1)
+            xylen1 = false;
+          cnt += (uint64_t)xl1 * (uint64_t)yl1;
+  #endif
+        } else if (x_i < y_j) {
+          is++;
+        } else if (x_i > y_j) {
+          js++;
         }
-        is++;
+      }
+    } else if (mult==FIRST) {
+      while (is<nis && js<njs) {
+        const int i = x_starts[is]-1, j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          const int j1 = j+1;
+          const int xl1 = x_lens[is];
+          for (int ii=0; ii<xl1; ++ii) {
+            starts[i+ii] = j1;
+            lens[i+ii] = 1;
+          }
+          is++;
 #ifdef SMERGE_STATS
-        if (xlen1 && xl1>1)
-          xlen1 = false;
-        if (ylen1 && yl1>1)
-          ylen1 = false;
-        if (xylen1 && xl1>1 && yl1>1)
-          xylen1 = false;
-        cnt += (uint64_t)xl1 * (uint64_t)yl1;
+          if (xlen1 && xl1>1)
+            xlen1 = false;
+          cnt += (uint64_t)xl1;
 #endif
-      } else if (x_i < y_j) {
-        is++;
-      } else if (x_i > y_j) {
-        js++;
+        } else if (x_i < y_j) {
+          is++;
+        } else if (x_i > y_j) {
+          js++;
+        }
+      }
+    } else if (mult==LAST) {
+      while (is<nis && js<njs) {
+        const int i = x_starts[is]-1, j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          const int j1 = j+y_lens[js];
+          const int xl1 = x_lens[is];
+          for (int ii=0; ii<xl1; ++ii) {
+            starts[i+ii] = j1;
+            lens[i+ii] = 1;
+          }
+          is++;
+#ifdef SMERGE_STATS
+          if (xlen1 && xl1>1)
+            xlen1 = false;
+          cnt += (uint64_t)xl1;
+#endif
+        } else if (x_i < y_j) {
+          is++;
+        } else if (x_i > y_j) {
+          js++;
+        }
       }
     }
   }
@@ -270,7 +358,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
              const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
              int *restrict starts, int *restrict lens,
              uint64_t *n_match, int *x_lens1, int *y_lens1, int *xy_lens1,
-             const int verbose) {
+             const enum emult mult, const int verbose) {
 
   double t = 0;
   if (verbose>0)
@@ -281,12 +369,12 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
     x_lens = (int *)R_alloc(nx_starts, sizeof(int)); // remove after #4395
     grpLens(x_starts, nx_starts, nx, x_lens);
   }
-  if (!unq_y) {
+  if (!(unq_y || mult==FIRST)) {
     y_lens = (int *)R_alloc(ny_starts, sizeof(int));
     grpLens(y_starts, ny_starts, ny, y_lens);
   }
   if (verbose>0)
-    Rprintf("smergeC: grpLens %s took %.3fs\n", verboseDone(!unq_x, !unq_y, "(already unique)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t);
+    Rprintf("smergeC: grpLens %s took %.3fs\n", verboseDone(!unq_x, !(unq_y || mult==FIRST), "(x already unq, y unq or mult='first')", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t);
 
   if (verbose>0)
     t = omp_get_wtime();
@@ -324,7 +412,8 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
       x, x_starts, x_lens, unq_x,             // common input x
       y, y_starts, y_lens, unq_y,             // common input y
       starts, lens,                           // common output
-      &bnmatch, &bxlens1, &bylens1, &bxylens1 // reduction output
+      &bnmatch, &bxlens1, &bylens1, &bxylens1,// reduction output
+      mult
     );
     nmatch += bnmatch; xlens1 = bxlens1 && xlens1; ylens1 = bylens1 && ylens1; xylens1 = bxylens1 && xylens1;
     th[b] = omp_get_thread_num(); // only to report thread utilization in verbose message
@@ -332,6 +421,8 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
   x_lens1[0] = (int)xlens1; y_lens1[0] = (int)ylens1; xy_lens1[0] = (int)xylens1; n_match[0] = nmatch;
   if (verbose>0)
     Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); // all threads may not always be used bc schedule(dynamic)
+  if (mult==ERR && !ylens1)
+    error("mult='error' and multiple matches during merge");
 
   return;
 }
@@ -388,8 +479,23 @@ SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict len
   return ans;
 }
 
+const enum emult matchMultArg(SEXP multArg) {
+  enum emult mult;
+  if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all"))
+    mult = ALL;
+  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first"))
+    mult = FIRST;
+  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last"))
+    mult = LAST;
+  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "error"))
+    mult = ERR;
+  else
+    error(_("Internal error: invalid value for 'mult'. please report to data.table issue tracker")); // # nocov
+  return mult;
+}
+
 // main interface from R
-SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
+SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge) {
 
   const int verbose = GetVerbose()*3; // remove *3 after #4491
   double t_total = 0, t = 0;
@@ -397,6 +503,9 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
     t_total = omp_get_wtime();
   if (!isInteger(x) || !isInteger(y))
     error("'x' and 'y' must be integer");
+  if (!isString(multArg))
+    error("'mult' must be a string");
+  const enum emult mult = matchMultArg(multArg);
   if (!IS_TRUE_OR_FALSE(out_bmerge))
     error("'out.bmerge' must be TRUE or FALSE");
   const bool ans_bmerge = (bool)LOGICAL(out_bmerge)[0];
@@ -473,7 +582,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP out_bmerge) {
     yp, ny, INTEGER(y_starts), LENGTH(y_starts),
     starts, lens,
     &n_match, INTEGER(x_lens1), INTEGER(y_lens1), INTEGER(xy_lens1),
-    verbose-1
+    mult, verbose-1
   );
   if (n_match > (uint64_t)DBL_MAX) { // 1e9x1e9 cartesian join 1e18 still less than DBL_MAX, should we check against DBL_MAX or something lesser? we cast uinst64_t to double here
     REAL(n_matchr)[0] = NA_REAL;

From 9ee0a7a30121192a25504986662743318475126f Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Fri, 12 Jun 2020 15:54:26 +0100
Subject: [PATCH 27/34] avoid lens allocation and unsort for mult=first|last

---
 R/data.table.R |  1 +
 src/smerge.c   | 81 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index 75b6b290ed..27e27dec2d 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -513,6 +513,7 @@ replace_dot_alias = function(e) {
         }
         # TODO: when nomatch=NA, len__ need not be allocated / set at all for mult="first"/"last"?
         # TODO: how about when nomatch=0L, can we avoid allocating then as well?
+        # if we take nomatch out from [b|s]merge then it should be easier to avoid allocation
       }
       if (length(xo) && length(irows)) {
         irows = xo[irows]   # TO DO: fsort here?
diff --git a/src/smerge.c b/src/smerge.c
index 41de40d82e..7fa907416e 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -39,7 +39,7 @@ static void smerge(const int bx_off, const int bnx,
       const int x_i = x[i], y_j = y[j];
       if (x_i == y_j) {
         starts[i] = j+1;
-        //lens[i] = 1; // already filled with 1's we will need this line if we change default alloc, same for unq_y branch
+        //lens[i] = 1; // already filled with 1's we will need this line if we change default alloc
         i++;
 #ifdef SMERGE_STATS
         cnt++;
@@ -53,7 +53,7 @@ static void smerge(const int bx_off, const int bnx,
   } else if (unq_x) {
     int i = bx_off, js = by_off;
     const int ni = bx_off+bnx, njs = by_off+bny;
-    if (mult == ALL || mult == ERR) { // mult==err is raised based on ylen1 flag outside of parallel region
+    if (mult == ALL || mult == ERR) { // mult==err is raised based on ylens1 flag outside of parallel region
       while (i<ni && js<njs) {
         const int j = y_starts[js]-1;
         const int x_i = x[i], y_j = y[j];
@@ -79,7 +79,7 @@ static void smerge(const int bx_off, const int bnx,
         const int x_i = x[i], y_j = y[j];
         if (x_i == y_j) {
           starts[i] = j+1;
-          lens[i] = 1; // could be pre-allocated with 1 for mult!=all?
+          //lens[i] = 1;
           i++;
 #ifdef SMERGE_STATS
           cnt++;
@@ -95,8 +95,8 @@ static void smerge(const int bx_off, const int bnx,
         const int j = y_starts[js]-1;
         const int x_i = x[i], y_j = y[j];
         if (x_i == y_j) {
-          starts[i] = j+y_lens[js]; // check if unsort will do it correctly
-          lens[i] = 1;
+          starts[i] = j+y_lens[js];
+          //lens[i] = 1;
           i++;
 #ifdef SMERGE_STATS
           cnt++;
@@ -119,7 +119,7 @@ static void smerge(const int bx_off, const int bnx,
         const int xl1 = x_lens[is];
         for (int ii=0; ii<xl1; ++ii) {
           starts[i+ii] = j1;
-          //lens[i+ii] = 1; // see comment in unq_x && unq_y branch
+          //lens[i+ii] = 1;
         }
         is++;
 #ifdef SMERGE_STATS
@@ -172,7 +172,7 @@ static void smerge(const int bx_off, const int bnx,
           const int xl1 = x_lens[is];
           for (int ii=0; ii<xl1; ++ii) {
             starts[i+ii] = j1;
-            lens[i+ii] = 1;
+            //lens[i+ii] = 1;
           }
           is++;
 #ifdef SMERGE_STATS
@@ -195,7 +195,7 @@ static void smerge(const int bx_off, const int bnx,
           const int xl1 = x_lens[is];
           for (int ii=0; ii<xl1; ++ii) {
             starts[i+ii] = j1;
-            lens[i+ii] = 1;
+            //lens[i+ii] = 1;
           }
           is++;
 #ifdef SMERGE_STATS
@@ -439,22 +439,43 @@ SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict len
                 SEXP out_starts, SEXP out_lens, // used only when x was sorted, saves one allocation
                 SEXP x_idx, SEXP y_idx,
                 SEXP n_match, SEXP x_lens1, SEXP y_lens1, SEXP xy_lens1,
-                const bool bmerge) {
+                const bool multLen1, const bool bmerge) {
   int out_len = bmerge ? 6 : 10;
   SEXP ans = PROTECT(allocVector(VECSXP, out_len)), ansnames;
   setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, out_len));
   SET_STRING_ELT(ansnames, 0, char_starts);
   SET_STRING_ELT(ansnames, 1, char_lens);
-  if (x_ord) {
-    SET_VECTOR_ELT(ans, 0, out_starts);
-    SET_VECTOR_ELT(ans, 1, out_lens);
+  if (bmerge) {
+    if (x_ord) {
+      SET_VECTOR_ELT(ans, 0, out_starts);
+      SET_VECTOR_ELT(ans, 1, out_lens);
+    } else {
+      SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, n));
+      SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, n));
+      SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); // verbose=verbose-2L after #4533
+      sortInt(starts, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 0)));
+      sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1)));
+      UNPROTECT(1);
+    }
   } else {
-    SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, n));
-    SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, n));
-    SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); // verbose=verbose-2L after #4533
-    sortInt(starts, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 0)));
-    sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1)));
-    UNPROTECT(1);
+    const bool skipLens = !multLen1 && LOGICAL(y_lens1)[0];
+    if (skipLens) { // compact lens if ylens1, mult=first|last already has compact lens
+      out_lens = PROTECT(allocVector(INTSXP, 0));
+    }
+    if (x_ord) {
+      SET_VECTOR_ELT(ans, 0, out_starts);
+      SET_VECTOR_ELT(ans, 1, out_lens);
+    } else {
+      SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, n));
+      SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, LOGICAL(y_lens1)[0] ? 0 : n));
+      SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); // verbose=verbose-2L after #4533
+      sortInt(starts, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 0)));
+      if (LENGTH(VECTOR_ELT(ans, 1))) // not need to unsort 1-only vector, it is now compact 0 length
+        sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1)));
+      UNPROTECT(1);
+    }
+    if (skipLens)
+      UNPROTECT(1);
   }
   SET_STRING_ELT(ansnames, 2, char_indices); SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, 0)); // const for equi join
   SET_STRING_ELT(ansnames, 3, char_allLen1); SET_VECTOR_ELT(ans, 3, y_lens1);
@@ -506,6 +527,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer
   if (!isString(multArg))
     error("'mult' must be a string");
   const enum emult mult = matchMultArg(multArg);
+  const bool multLen1 = mult==FIRST || mult==LAST;
   if (!IS_TRUE_OR_FALSE(out_bmerge))
     error("'out.bmerge' must be TRUE or FALSE");
   const bool ans_bmerge = (bool)LOGICAL(out_bmerge)[0];
@@ -551,21 +573,28 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer
     t = omp_get_wtime();
   SEXP out_starts = R_NilValue, out_lens = R_NilValue;
   int *restrict starts=0, *restrict lens=0;
+  const int lens_len = (!multLen1 || ans_bmerge) ? nx : 0; // for mult=first|last we dont need to allocate lens
   if (x_ord) { // we dont need to reorder results so can save one allocation
     out_starts = PROTECT(allocVector(INTSXP, nx)); protecti++;
-    out_lens = PROTECT(allocVector(INTSXP, nx)); protecti++;
+    out_lens = PROTECT(allocVector(INTSXP, lens_len)); protecti++;
     starts = INTEGER(out_starts);
     lens = INTEGER(out_lens);
   } else {
     starts = (int *)R_alloc(nx, sizeof(int));
-    lens = (int *)R_alloc(nx, sizeof(int));
+    lens = (int *)R_alloc(lens_len, sizeof(int));
   }
   // this fills default values, bmerge's defaults are tricky (dictated by how they are consumed): nomatch=0 makes starts=0 not NA, lens=0 is fine there; nomatch=NA makes lens=1 not NA, starts=NA is fine there
-  const int default_lens = 1; //ans_bmerge ? 1 : NA_INTEGER; // for now keep it fixed to bmerge nomatch out: 1
-  #pragma omp parallel for schedule(static) num_threads(getDTthreads())
-  for (int i=0; i<nx; ++i) {
-    starts[i] = NA_INTEGER;
-    lens[i] = default_lens;
+  // AFAIU it make sense to take out nomatch argument from merge
+  if (multLen1 && !ans_bmerge) {
+    #pragma omp parallel for schedule(static) num_threads(getDTthreads())
+    for (int i=0; i<nx; ++i)
+      starts[i] = NA_INTEGER;
+  } else {
+    #pragma omp parallel for schedule(static) num_threads(getDTthreads())
+    for (int i=0; i<nx; ++i) {
+      starts[i] = NA_INTEGER;
+      lens[i] = 1;
+    }
   }
   uint64_t n_match = 0;
   SEXP x_lens1 = PROTECT(allocVector(LGLSXP, 1)); protecti++;
@@ -595,7 +624,7 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer
 
   if (verbose>0)
     t = omp_get_wtime();
-  SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, ans_bmerge);
+  SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, multLen1, ans_bmerge);
   if (verbose>0)
     Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(alloc and unsort)", omp_get_wtime() - t);
   if (verbose>0)

From 58165497c456a77a6f87570f73f0851cc9bf6ae5 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Fri, 12 Jun 2020 17:24:33 +0100
Subject: [PATCH 28/34] move R allocs

---
 src/smerge.c | 61 ++++++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/src/smerge.c b/src/smerge.c
index 7fa907416e..ef908b73f5 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -357,7 +357,7 @@ void grpLens(const int *restrict starts, const int n_starts, const int n, int *r
 void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts,
              const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
              int *restrict starts, int *restrict lens,
-             uint64_t *n_match, int *x_lens1, int *y_lens1, int *xy_lens1,
+             uint64_t *n_match, bool *x_lens1, bool *y_lens1, bool *xy_lens1,
              const enum emult mult, const int verbose) {
 
   double t = 0;
@@ -400,8 +400,8 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
 
   if (verbose>0)
     t = omp_get_wtime();
-  bool xlens1 = true, ylens1 = true, xylens1 = true;
   uint64_t nmatch = 0;
+  bool xlens1 = true, ylens1 = true, xylens1 = true;
   #pragma omp parallel for schedule(dynamic) reduction(&&:xlens1,ylens1,xylens1) reduction(+:nmatch) num_threads(nth)
   for (int b=0; b<nBatch; ++b) {
     uint64_t bnmatch = 0; // local reduction variables
@@ -418,7 +418,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
     nmatch += bnmatch; xlens1 = bxlens1 && xlens1; ylens1 = bylens1 && ylens1; xylens1 = bxylens1 && xylens1;
     th[b] = omp_get_thread_num(); // only to report thread utilization in verbose message
   }
-  x_lens1[0] = (int)xlens1; y_lens1[0] = (int)ylens1; xy_lens1[0] = (int)xylens1; n_match[0] = nmatch;
+  x_lens1[0] = xlens1; y_lens1[0] = ylens1; xy_lens1[0] = xylens1; n_match[0] = nmatch;
   if (verbose>0)
     Rprintf("smergeC: %d calls to smerge using %d/%d threads took %.3fs\n", nBatch, unqNth(th, nBatch), nth, omp_get_wtime() - t); // all threads may not always be used bc schedule(dynamic)
   if (mult==ERR && !ylens1)
@@ -438,39 +438,41 @@ void sortInt(const int *restrict x, const int nx, const int *restrict idx, int *
 SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict lens, const bool x_ord,
                 SEXP out_starts, SEXP out_lens, // used only when x was sorted, saves one allocation
                 SEXP x_idx, SEXP y_idx,
-                SEXP n_match, SEXP x_lens1, SEXP y_lens1, SEXP xy_lens1,
+                uint64_t n_match, bool x_lens1, bool y_lens1, bool xy_lens1,
                 const bool multLen1, const bool bmerge) {
-  int out_len = bmerge ? 6 : 10;
+  const int out_len = bmerge ? 6 : 10;
   SEXP ans = PROTECT(allocVector(VECSXP, out_len)), ansnames;
   setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, out_len));
   SET_STRING_ELT(ansnames, 0, char_starts);
   SET_STRING_ELT(ansnames, 1, char_lens);
-  if (bmerge) {
+  if (bmerge) { // for bmerge we need still allocate, but not unsort, for !bmerge no alloc and no unsort
     if (x_ord) {
       SET_VECTOR_ELT(ans, 0, out_starts);
       SET_VECTOR_ELT(ans, 1, out_lens);
     } else {
       SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, n));
       SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, n));
-      SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); // verbose=verbose-2L after #4533
+      SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(false), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); // verbose=verbose-2L after #4533
       sortInt(starts, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 0)));
       sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1)));
       UNPROTECT(1);
     }
   } else {
-    const bool skipLens = !multLen1 && LOGICAL(y_lens1)[0];
+    const bool skipLens = !multLen1 && y_lens1;
     if (skipLens) { // compact lens if ylens1, mult=first|last already has compact lens
       out_lens = PROTECT(allocVector(INTSXP, 0));
     }
     if (x_ord) {
+      if (y_lens1 && LENGTH(out_lens))
+        error("internal error: lens should be already compact 0 length integer"); // # nocov
       SET_VECTOR_ELT(ans, 0, out_starts);
       SET_VECTOR_ELT(ans, 1, out_lens);
     } else {
       SET_VECTOR_ELT(ans, 0, allocVector(INTSXP, n));
-      SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, LOGICAL(y_lens1)[0] ? 0 : n));
-      SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); // verbose=verbose-2L after #4533
+      SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, y_lens1 ? 0 : n));
+      SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(false), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); // verbose=verbose-2L after #4533
       sortInt(starts, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 0)));
-      if (LENGTH(VECTOR_ELT(ans, 1))) // not need to unsort 1-only vector, it is now compact 0 length
+      if (!y_lens1) // not need to unsort 1-only vector, it is now compact 0 length
         sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1)));
       UNPROTECT(1);
     }
@@ -478,7 +480,7 @@ SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict len
       UNPROTECT(1);
   }
   SET_STRING_ELT(ansnames, 2, char_indices); SET_VECTOR_ELT(ans, 2, allocVector(INTSXP, 0)); // const for equi join
-  SET_STRING_ELT(ansnames, 3, char_allLen1); SET_VECTOR_ELT(ans, 3, y_lens1);
+  SET_STRING_ELT(ansnames, 3, char_allLen1); SET_VECTOR_ELT(ans, 3, ScalarLogical(y_lens1));
   SET_STRING_ELT(ansnames, 4, char_allGrp1); SET_VECTOR_ELT(ans, 4, ScalarLogical(true));    // const for equi join
   if (bmerge) {
     y_idx = shallow_duplicate(y_idx); // possibly improve after #4467
@@ -492,9 +494,17 @@ SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict len
   SET_STRING_ELT(ansnames, 5, char_xo); SET_VECTOR_ELT(ans, 5, y_idx);
   if (!bmerge) {
     SET_STRING_ELT(ansnames, 6, char_io);      SET_VECTOR_ELT(ans, 6, x_idx);
-    SET_STRING_ELT(ansnames, 7, char_lhsLen1); SET_VECTOR_ELT(ans, 7, x_lens1);
-    SET_STRING_ELT(ansnames, 8, char_xyLen1);  SET_VECTOR_ELT(ans, 8, xy_lens1);
-    SET_STRING_ELT(ansnames, 9, char_nMatch);  SET_VECTOR_ELT(ans, 9, n_match);
+    SET_STRING_ELT(ansnames, 7, char_lhsLen1); SET_VECTOR_ELT(ans, 7, ScalarLogical(x_lens1));
+    SET_STRING_ELT(ansnames, 8, char_xyLen1);  SET_VECTOR_ELT(ans, 8, ScalarLogical(xy_lens1));
+    SEXP n_matchr = PROTECT(allocVector(REALSXP, 1));
+    if (n_match > (uint64_t)DBL_MAX) { // 1e9 x 1e9 cartesian join results 1e18 still less than DBL_MAX, should we check against DBL_MAX or something lesser? we cast uinst64_t to double here
+      REAL(n_matchr)[0] = NA_REAL;
+      warning("count of matches exceeds DBL_MAX, returning NA in 'nMatch' field");
+    } else {
+      REAL(n_matchr)[0] = (double)n_match;
+    }
+    SET_STRING_ELT(ansnames, 9, char_nMatch);  SET_VECTOR_ELT(ans, 9, n_matchr);
+    UNPROTECT(1);
   }
   UNPROTECT(1);
   return ans;
@@ -537,10 +547,10 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer
     t = omp_get_wtime();
   const bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx);
   if (do_x_idx) {
-    x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533
+    x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533
   }
   if (do_y_idx) {
-    y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(TRUE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE))); protecti++; // verbose=verbose-2L after #4533
+    y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533
   }
   if (!isInteger(x_idx) || !isInteger(y_idx))
     error("'x.idx' and 'y.idx' must be integer");
@@ -596,35 +606,26 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer
       lens[i] = 1;
     }
   }
-  uint64_t n_match = 0;
-  SEXP x_lens1 = PROTECT(allocVector(LGLSXP, 1)); protecti++;
-  SEXP y_lens1 = PROTECT(allocVector(LGLSXP, 1)); protecti++;
-  SEXP xy_lens1 = PROTECT(allocVector(LGLSXP, 1)); protecti++;
-  SEXP n_matchr = PROTECT(allocVector(REALSXP, 1)); protecti++;
   if (verbose>0)
     Rprintf("smergeR: alloc of size %d took %.3fs\n", nx, omp_get_wtime() - t);
 
   if (verbose>0)
     t = omp_get_wtime();
+  uint64_t n_match = 0;
+  bool x_lens1 = true, y_lens1 = true, xy_lens1 = true;
   smergeC(
     xp, nx, INTEGER(x_starts), LENGTH(x_starts),
     yp, ny, INTEGER(y_starts), LENGTH(y_starts),
     starts, lens,
-    &n_match, INTEGER(x_lens1), INTEGER(y_lens1), INTEGER(xy_lens1),
+    &n_match, &x_lens1, &y_lens1, &xy_lens1,
     mult, verbose-1
   );
-  if (n_match > (uint64_t)DBL_MAX) { // 1e9x1e9 cartesian join 1e18 still less than DBL_MAX, should we check against DBL_MAX or something lesser? we cast uinst64_t to double here
-    REAL(n_matchr)[0] = NA_REAL;
-    warning("count of matches exceeds DBL_MAX, returning NA in 'nMatch' field");
-  } else {
-    REAL(n_matchr)[0] = (double)n_match;
-  }
   if (verbose>0)
     Rprintf("smergeR: smergeC of %d x %d = %"PRIu64"; took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t);
 
   if (verbose>0)
     t = omp_get_wtime();
-  SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_matchr, x_lens1, y_lens1, xy_lens1, multLen1, ans_bmerge);
+  SEXP ans = outSmergeR(nx, starts, lens, x_ord, out_starts, out_lens, x_idx, y_idx, n_match, x_lens1, y_lens1, xy_lens1, multLen1, ans_bmerge);
   if (verbose>0)
     Rprintf("smergeR: outSmerge %s took %.3fs\n", x_ord ? "(was sorted)" : "(alloc and unsort)", omp_get_wtime() - t);
   if (verbose>0)

From d7c2b4bad7f5f29daf3be2baf3ee7fe2d9384008 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Fri, 12 Jun 2020 20:21:57 +0100
Subject: [PATCH 29/34] avoid one more unsort

---
 src/smerge.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/smerge.c b/src/smerge.c
index ef908b73f5..de6ce473e2 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -433,6 +433,12 @@ void sortInt(const int *restrict x, const int nx, const int *restrict idx, int *
     ans[i] = x[idx[i]-1];
   return;
 }
+void copyInt(const int *restrict x, const int nx, int *restrict ans) {
+  #pragma omp parallel for schedule(static) num_threads(getDTthreads())
+  for (int i=0; i<nx; ++i)
+    ans[i] = x[i];
+  return;
+}
 
 // wrap results into list, bmerge=true produces bmerge replacement output, note that bmerge i,x is smerge x,y (as of now)
 SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict lens, const bool x_ord,
@@ -445,7 +451,7 @@ SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict len
   setAttrib(ans, R_NamesSymbol, ansnames=allocVector(STRSXP, out_len));
   SET_STRING_ELT(ansnames, 0, char_starts);
   SET_STRING_ELT(ansnames, 1, char_lens);
-  if (bmerge) { // for bmerge we need still allocate, but not unsort, for !bmerge no alloc and no unsort
+  if (bmerge) { // for bmerge we still need allocate, but not unsort, for !bmerge no alloc no unsort
     if (x_ord) {
       SET_VECTOR_ELT(ans, 0, out_starts);
       SET_VECTOR_ELT(ans, 1, out_lens);
@@ -454,7 +460,10 @@ SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict len
       SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, n));
       SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(false), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); // verbose=verbose-2L after #4533
       sortInt(starts, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 0)));
-      sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1)));
+      if (!y_lens1) // no need to unsort 1-only vector
+        sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1)));
+      else
+        copyInt(lens, n, INTEGER(VECTOR_ELT(ans, 1)));
       UNPROTECT(1);
     }
   } else {
@@ -472,7 +481,7 @@ SEXP outSmergeR(const int n, const int *restrict starts, const int *restrict len
       SET_VECTOR_ELT(ans, 1, allocVector(INTSXP, y_lens1 ? 0 : n));
       SEXP xoo = PROTECT(forder(x_idx, R_NilValue, /*retGrp=*/ScalarLogical(false), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); // verbose=verbose-2L after #4533
       sortInt(starts, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 0)));
-      if (!y_lens1) // not need to unsort 1-only vector, it is now compact 0 length
+      if (!y_lens1) // no need to unsort 1-only vector, it is now compact 0 length anyway
         sortInt(lens, n, INTEGER(xoo), INTEGER(VECTOR_ELT(ans, 1)));
       UNPROTECT(1);
     }

From 49ea9daca232ad1dd3375c65b5aa8ad49f04f32e Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 13 Jun 2020 11:40:55 +0100
Subject: [PATCH 30/34] mult already supported

---
 R/bmerge.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/bmerge.R b/R/bmerge.R
index c385eab265..dd2b43fdff 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -4,7 +4,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
   if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer
       && isTRUE(getOption("datatable.smerge"))                     ## enable option
       && identical(nomatch, NA_integer_)                           ## for now only outer join
-      && identical(mult, "all")                                    ## for now only mult='all'
       && identical(ops, 1L)                                        ## equi join
       && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join
       ) {

From 9f1ea21cf06c895b180e6ba3b67691759442768b Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 13 Jun 2020 11:41:54 +0100
Subject: [PATCH 31/34] mult already supported2

---
 R/bmerge.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/bmerge.R b/R/bmerge.R
index dd2b43fdff..69e9c89e02 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -14,7 +14,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
       if (!is.null(attr(idx, "starts", exact=TRUE))) idx
     }
     if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()}
-    ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), out.bmerge=TRUE)
+    ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE)
     if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()}
     return(ans)
   }

From 103b9c63cdb10c7ab60dd2a3f185caf20b2eb70d Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Sat, 13 Jun 2020 11:49:13 +0100
Subject: [PATCH 32/34] better algo description

---
 src/smerge.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/smerge.c b/src/smerge.c
index de6ce473e2..8918546e88 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -5,7 +5,8 @@
  *
  * join on a single integer column
  * sort LHS and RHS
- * split LHS into equal batches, RHS into corresponding batches by matching upper-lower bounds of LHS batches
+ * split LHS into equal batches based on its unique values
+ * split RHS into batches by matching corresponding upper-lower bounds of LHS batches using binary search
  * parallel sort-merge join
  *
  * for a maximum speed collecting of following statistics can be disabled by undefine SMERGE_STATS

From 2acaf233c9636656e793125aa5a0052b95b979d5 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Tue, 16 Jun 2020 01:54:53 +0100
Subject: [PATCH 33/34] multiple columns support

---
 R/bmerge.R             |  23 +-
 R/wrappers.R           |   3 +-
 inst/tests/smerge.Rraw |  22 +-
 src/data.table.h       |   2 +-
 src/smerge.c           | 498 +++++++++++++++++++++++++----------------
 5 files changed, 335 insertions(+), 213 deletions(-)

diff --git a/R/bmerge.R b/R/bmerge.R
index 69e9c89e02..a2c20f3576 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -1,10 +1,13 @@
+intCols = function(x, cols) all(vapply(cols, function(col, x) is.integer(x[[col]]), NA, x))
 
 bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
 {
-  if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer
-      && isTRUE(getOption("datatable.smerge"))                     ## enable option
-      && identical(nomatch, NA_integer_)                           ## for now only outer join
-      && identical(ops, 1L)                                        ## equi join
+  if (TRUE
+      && isTRUE(getOption("datatable.smerge"))                     ## switch
+      && length(icols)==length(xcols)                              ## avoid invalid input
+      && intCols(i, icols) && intCols(x, xcols)                    ## all columns integer
+      #&& identical(nomatch, NA_integer_)                          ## nomatch=0L is made as post-processing
+      && all(ops==1L)                                              ## equi join
       && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join
       ) {
     getIdxGrp = function(x, cols) { ## get index only if retGrp=T
@@ -14,8 +17,18 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
       if (!is.null(attr(idx, "starts", exact=TRUE))) idx
     }
     if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()}
-    ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE)
+    ans = smerge(x=i, y=x, x.cols=icols, y.cols=xcols, x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE)
+    if (identical(nomatch, 0L)) {
+      nom = is.na(ans$starts)
+      ans$starts[nom] = 0L
+      ans$lens[nom] = 0L
+    }
     if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()}
+    if (verbose && !isTRUE(getOption("datatable.smerge"))) { ## just to satisfy existing bmerge unit tests, when switch above commented out
+      #cat("1\n", file="~/git/smergeOptCount.out", append=TRUE) ## be sure to have this path if you comment out switch of "datatable.smerge" option, 24 times used in main.Rraw
+      cat("existing index\n")
+      cat("ad hoc\n")
+    }
     return(ans)
   }
   callersi = i
diff --git a/R/wrappers.R b/R/wrappers.R
index 636e6e04ee..a8ee7b9989 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -13,4 +13,5 @@ coerceFill = function(x) .Call(CcoerceFillR, x)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
-smerge = function(x, y, x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.idx, y.idx, match.arg(mult), out.bmerge)
+smerge = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.cols, y.cols, x.idx, y.idx, match.arg(mult), out.bmerge)
+
diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw
index 3a3bf9812a..099628a4be 100644
--- a/inst/tests/smerge.Rraw
+++ b/inst/tests/smerge.Rraw
@@ -12,18 +12,22 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   vecseq = data.table:::vecseq
 }
 
-bm = function(x, y, mult="all") {
-  stopifnot(is.integer(x), is.integer(y))
-  ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=1L, verbose=FALSE)
+bm = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), mult="all") {
+  if (is.integer(x)) {x = data.table(x=x); x.cols=1L}
+  if (is.integer(y)) {y = data.table(y=y); y.cols=1L}
+  stopifnot(is.data.table(x), is.data.table(y))
+  ans = bmerge(x, y, x.cols, y.cols, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=rep.int(1L, length(x.cols)), verbose=FALSE)
   ## if undefining SMERGE_STATS then we have to ignore allLen1 as well
   ans$nMatch = as.numeric(sum(!is.na(vecseq(ans$starts, ans$lens, NULL))))
   ans
 }
-sm = function(x, y, mult="all") {
-  stopifnot(is.integer(x), is.integer(y))
-  ans = smerge(x, y, mult=mult, out.bmerge=TRUE)
+sm = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), mult="all") {
+  if (is.integer(x)) {x = data.table(x=x); x.cols=1L}
+  if (is.integer(y)) {y = data.table(y=y); y.cols=1L}
+  stopifnot(is.data.table(x), is.data.table(y))
+  ans = smerge(x, y, x.cols, y.cols, mult=mult, out.bmerge=TRUE)
   ## if undefining SMERGE_STATS then we have to ignore allLen1 as well
-  ans$nMatch = smerge(x, y, mult=mult, out.bmerge=FALSE)$nMatch
+  ans$nMatch = smerge(x, y, x.cols, y.cols, mult=mult, out.bmerge=FALSE)$nMatch
   ans
 }
 
@@ -216,8 +220,8 @@ y = sample.int(2e2L, 1e2L, TRUE)
 test(21.02, sm(x, y), bm(x, y))
 
 # [.data.table join
-d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_along(x))
-d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_along(y))
+d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_len(1e2L))
+d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_len(1e2L))
 options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #4491
 test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge")
 options(datatable.smerge=TRUE)
diff --git a/src/data.table.h b/src/data.table.h
index 231f8f28b4..afef343eb3 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -250,4 +250,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
 
 // smjoin.c
-SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge);
+SEXP smergeR(SEXP x, SEXP y,SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge);
diff --git a/src/smerge.c b/src/smerge.c
index 8918546e88..8dd403d5d7 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -26,29 +26,33 @@ enum emult{ALL, FIRST, LAST, ERR};
 // workhorse join that runs in parallel on batches
 static void smerge(const int bx_off, const int bnx,
                    const int by_off, const int bny,
-                   const int *restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x,
-                   const int *restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y,
+                   const int **restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x,
+                   const int **restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y,
                    int *restrict starts, int *restrict lens,
                    uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1,
-                   const enum emult mult) {
+                   const enum emult mult, const int ncol) {
   uint64_t cnt = 0;
   bool xlen1 = true, ylen1 = true, xylen1 = true;
   if (unq_x && unq_y) {
     int i = bx_off, j = by_off;
     const int ni = bx_off+bnx, nj = by_off+bny;
     while (i<ni && j<nj) {
-      const int x_i = x[i], y_j = y[j];
-      if (x_i == y_j) {
-        starts[i] = j+1;
-        //lens[i] = 1; // already filled with 1's we will need this line if we change default alloc
-        i++;
-#ifdef SMERGE_STATS
-        cnt++;
-#endif
-      } else if (x_i < y_j) {
-        i++;
-      } else if (x_i > y_j) {
-        j++;
+      for (int c=0; c<ncol; ++c) {
+        const int x_i = x[c][i], y_j = y[c][j];
+        if (x_i == y_j) {
+          if (c < ncol-1)
+            continue; // check next columns
+          starts[i] = j+1;
+          //lens[i] = 1; // already filled with 1's we will need this line if we change default alloc
+          i++;
+  #ifdef SMERGE_STATS
+          cnt++;
+  #endif
+        } else if (x_i < y_j) {
+          i++; break;
+        } else if (x_i > y_j) {
+          j++; break;
+        }
       }
     }
   } else if (unq_x) {
@@ -56,56 +60,68 @@ static void smerge(const int bx_off, const int bnx,
     const int ni = bx_off+bnx, njs = by_off+bny;
     if (mult == ALL || mult == ERR) { // mult==err is raised based on ylens1 flag outside of parallel region
       while (i<ni && js<njs) {
-        const int j = y_starts[js]-1;
-        const int x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          starts[i] = j+1;
-          const int yl1 = y_lens[js];
-          lens[i] = yl1;
-          i++;
-#ifdef SMERGE_STATS
-          if (ylen1 && yl1>1)
-            ylen1 = false;
-          cnt += (uint64_t)yl1;
-#endif
-        } else if (x_i < y_j) {
-          i++;
-        } else if (x_i > y_j) {
-          js++;
+        for (int c=0; c<ncol; ++c) {
+          const int j = y_starts[js]-1;
+          const int x_i = x[c][i], y_j = y[c][j];
+          if (x_i == y_j) {
+            if (c < ncol-1)
+              continue;
+            starts[i] = j+1;
+            const int yl1 = y_lens[js];
+            lens[i] = yl1;
+            i++;
+  #ifdef SMERGE_STATS
+            if (ylen1 && yl1>1)
+              ylen1 = false;
+            cnt += (uint64_t)yl1;
+  #endif
+          } else if (x_i < y_j) {
+            i++; break;
+          } else if (x_i > y_j) {
+            js++; break;
+          }
         }
       }
     } else if (mult == FIRST) {
       while (i<ni && js<njs) {
-        const int j = y_starts[js]-1;
-        const int x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          starts[i] = j+1;
-          //lens[i] = 1;
-          i++;
-#ifdef SMERGE_STATS
-          cnt++;
-#endif
-        } else if (x_i < y_j) {
-          i++;
-        } else if (x_i > y_j) {
-          js++;
+        for (int c=0; c<ncol; ++c) {
+          const int j = y_starts[js]-1;
+          const int x_i = x[c][i], y_j = y[c][j];
+          if (x_i == y_j) {
+            if (c < ncol-1)
+              continue;
+            starts[i] = j+1;
+            //lens[i] = 1;
+            i++;
+  #ifdef SMERGE_STATS
+            cnt++;
+  #endif
+          } else if (x_i < y_j) {
+            i++; break;
+          } else if (x_i > y_j) {
+            js++; break;
+          }
         }
       }
     } else if (mult == LAST) {
       while (i<ni && js<njs) {
-        const int j = y_starts[js]-1;
-        const int x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          starts[i] = j+y_lens[js];
-          //lens[i] = 1;
-          i++;
-#ifdef SMERGE_STATS
-          cnt++;
-#endif
-        } else if (x_i < y_j) {
-          i++;
-        } else if (x_i > y_j) {
-          js++;
+        for (int c=0; c<ncol; ++c) {
+          const int j = y_starts[js]-1;
+          const int x_i = x[c][i], y_j = y[c][j];
+          if (x_i == y_j) {
+            if (c < ncol-1)
+              continue;
+            starts[i] = j+y_lens[js];
+            //lens[i] = 1;
+            i++;
+  #ifdef SMERGE_STATS
+            cnt++;
+  #endif
+          } else if (x_i < y_j) {
+            i++; break;
+          } else if (x_i > y_j) {
+            js++; break;
+          }
         }
       }
     }
@@ -113,101 +129,117 @@ static void smerge(const int bx_off, const int bnx,
     int is = bx_off, j = by_off;
     const int nis = bx_off+bnx, nj = by_off+bny;
     while (is<nis && j<nj) {
-      const int i = x_starts[is]-1;
-      const int x_i = x[i], y_j = y[j];
-      if (x_i == y_j) {
-        const int j1 = j+1;
-        const int xl1 = x_lens[is];
-        for (int ii=0; ii<xl1; ++ii) {
-          starts[i+ii] = j1;
-          //lens[i+ii] = 1;
-        }
-        is++;
-#ifdef SMERGE_STATS
-        if (xlen1 && xl1>1)
-          xlen1 = false;
-        cnt += (uint64_t)xl1;
-#endif
-      } else if (x_i < y_j) {
-        is++;
-      } else if (x_i > y_j) {
-        j++;
-      }
-    }
-  } else {
-    int is = bx_off, js = by_off;
-    const int nis = bx_off+bnx, njs = by_off+bny;
-    if (mult==ALL || mult==ERR) {
-      while (is<nis && js<njs) {
-        const int i = x_starts[is]-1, j = y_starts[js]-1;
-        const int x_i = x[i], y_j = y[j];
+      for (int c=0; c<ncol; ++c) {
+        const int i = x_starts[is]-1;
+        const int x_i = x[c][i], y_j = y[c][j];
         if (x_i == y_j) {
-          const int j1 = j+1, yl1 = y_lens[js];
+          if (c < ncol-1)
+            continue;
+          const int j1 = j+1;
           const int xl1 = x_lens[is];
           for (int ii=0; ii<xl1; ++ii) {
             starts[i+ii] = j1;
-            lens[i+ii] = yl1;
+            //lens[i+ii] = 1;
           }
           is++;
   #ifdef SMERGE_STATS
           if (xlen1 && xl1>1)
             xlen1 = false;
-          if (ylen1 && yl1>1)
-            ylen1 = false;
-          if (xylen1 && xl1>1 && yl1>1)
-            xylen1 = false;
-          cnt += (uint64_t)xl1 * (uint64_t)yl1;
+          cnt += (uint64_t)xl1;
   #endif
         } else if (x_i < y_j) {
-          is++;
+          is++; break;
         } else if (x_i > y_j) {
-          js++;
+          j++; break;
+        }
+      }
+    }
+  } else {
+    int is = bx_off, js = by_off;
+    const int nis = bx_off+bnx, njs = by_off+bny;
+    if (mult==ALL || mult==ERR) {
+      while (is<nis && js<njs) {
+        for (int c=0; c<ncol; ++c) {
+          const int i = x_starts[is]-1, j = y_starts[js]-1;
+          const int x_i = x[c][i], y_j = y[c][j];
+          if (x_i == y_j) {
+            if (c < ncol-1)
+              continue;
+            const int j1 = j+1, yl1 = y_lens[js];
+            const int xl1 = x_lens[is];
+            for (int ii=0; ii<xl1; ++ii) {
+              starts[i+ii] = j1;
+              lens[i+ii] = yl1;
+            }
+            is++;
+    #ifdef SMERGE_STATS
+            if (xlen1 && xl1>1)
+              xlen1 = false;
+            if (ylen1 && yl1>1)
+              ylen1 = false;
+            if (xylen1 && xl1>1 && yl1>1)
+              xylen1 = false;
+            cnt += (uint64_t)xl1 * (uint64_t)yl1;
+    #endif
+          } else if (x_i < y_j) {
+            is++; break;
+          } else if (x_i > y_j) {
+            js++; break;
+          }
         }
       }
     } else if (mult==FIRST) {
       while (is<nis && js<njs) {
-        const int i = x_starts[is]-1, j = y_starts[js]-1;
-        const int x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          const int j1 = j+1;
-          const int xl1 = x_lens[is];
-          for (int ii=0; ii<xl1; ++ii) {
-            starts[i+ii] = j1;
-            //lens[i+ii] = 1;
+        for (int c=0; c<ncol; ++c) {
+          const int i = x_starts[is]-1, j = y_starts[js]-1;
+          const int x_i = x[c][i], y_j = y[c][j];
+          if (x_i == y_j) {
+            if (c < ncol-1)
+              continue;
+            const int j1 = j+1;
+            const int xl1 = x_lens[is];
+            for (int ii=0; ii<xl1; ++ii) {
+              starts[i+ii] = j1;
+              //lens[i+ii] = 1;
+            }
+            is++;
+  #ifdef SMERGE_STATS
+            if (xlen1 && xl1>1)
+              xlen1 = false;
+            cnt += (uint64_t)xl1;
+  #endif
+          } else if (x_i < y_j) {
+            is++; break;
+          } else if (x_i > y_j) {
+            js++; break;
           }
-          is++;
-#ifdef SMERGE_STATS
-          if (xlen1 && xl1>1)
-            xlen1 = false;
-          cnt += (uint64_t)xl1;
-#endif
-        } else if (x_i < y_j) {
-          is++;
-        } else if (x_i > y_j) {
-          js++;
         }
       }
     } else if (mult==LAST) {
       while (is<nis && js<njs) {
-        const int i = x_starts[is]-1, j = y_starts[js]-1;
-        const int x_i = x[i], y_j = y[j];
-        if (x_i == y_j) {
-          const int j1 = j+y_lens[js];
-          const int xl1 = x_lens[is];
-          for (int ii=0; ii<xl1; ++ii) {
-            starts[i+ii] = j1;
-            //lens[i+ii] = 1;
+        for (int c=0; c<ncol; ++c) {
+          const int i = x_starts[is]-1, j = y_starts[js]-1;
+          const int x_i = x[c][i], y_j = y[c][j];
+          if (x_i == y_j) {
+            if (c < ncol-1)
+              continue;
+            const int j1 = j+y_lens[js];
+            const int xl1 = x_lens[is];
+            for (int ii=0; ii<xl1; ++ii) {
+              starts[i+ii] = j1;
+              //lens[i+ii] = 1;
+            }
+            is++;
+  #ifdef SMERGE_STATS
+            if (xlen1 && xl1>1)
+              xlen1 = false;
+            cnt += (uint64_t)xl1;
+  #endif
+          } else if (x_i < y_j) {
+            is++; break;
+          } else if (x_i > y_j) {
+            js++; break;
           }
-          is++;
-#ifdef SMERGE_STATS
-          if (xlen1 && xl1>1)
-            xlen1 = false;
-          cnt += (uint64_t)xl1;
-#endif
-        } else if (x_i < y_j) {
-          is++;
-        } else if (x_i > y_j) {
-          js++;
         }
       }
     }
@@ -216,62 +248,103 @@ static void smerge(const int bx_off, const int bnx,
   return;
 }
 
+// helper when printing from batching
+void printRow(const int **restrict x, const int row, const int ncol) {
+  for (int c=0; c<ncol; ++c)
+    Rprintf("%d%s", x[c][row], c<ncol-1?",":"");
+}
+
+// helper to compare specific rows for early stopping rollbs
+int rowCmp(const int **restrict x, const int x_row, const int **restrict y, const int y_row, const int ncol) {
+  for (int c=0; c<ncol; ++c) {
+    int thisx = x[c][x_row];
+    int thisy = y[c][y_row];
+    if (thisx==thisy) {
+      if (c < ncol-1)
+        continue;
+      return 0; // rows equal
+    } else if (thisx > thisy) {
+      return 1;
+    } else if (thisx < thisy) {
+      return -1;
+    }
+  }
+  error("rowCmp got ncol=0?");
+  return 0;
+}
+
 /*
  * 'rolling nearest' binary search
  * used to find 'y' lower and upper, 0-based, bounds for each batch
  * side -1: lower bound; side 1: upper bound
  */
-static int rollbs(const int *restrict x, const int *restrict ix, const int nix, const int val, const int side) {
-  int verbose = 0; // devel debug only
-  if (verbose>0)
-    Rprintf("rollbs: side=%d=%s; val=%d\n", side, side<0?"min":"max", val); // # nocov
-  if (x[ix[0]-1] == val) { // common early stopping
+static int rollbs(const int **restrict x, const int *restrict ix, const int nix, const int **restrict vals, const int vali, const int ncol, const int side) {
+  int verbose = 0; // manual devel debug only
+  if (nix<1) {
     if (verbose>0)
-      Rprintf("rollbs: min elements %d match: 0\n", val); // # nocov
+      Rprintf("rollbs: -1: empty input\n");
+    return -1;
+  }
+  if (rowCmp(x, ix[0]-1, vals, vali, ncol)==0) { // common early stopping
+    if (verbose>0) {
+      Rprintf("rollbs: 0: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") match to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
+    }
     return 0;
   }
-  if (x[ix[nix-1]-1] == val) {
-    if (verbose>0)
-      Rprintf("rollbs: max elements %d match: nix-1=%d\n", val, nix-1); // # nocov
+  if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)==0) {
+    if (verbose>0) {
+      Rprintf("rollbs: nix-1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") match to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
+    }
     return nix-1;
   }
   if (side < 0) { // lower bound early stopping
-    if (x[ix[nix-1]-1] < val) {
-      if (verbose>0)
-        Rprintf("rollbs: max element %d is still smaller than %d: -1\n", x[ix[nix-1]-1], val); // # nocov
+    if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)<0) {
+      if (verbose>0) {
+        Rprintf("rollbs: -1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") is still smaller than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
+      }
       return -1;
     }
-    if (x[ix[0]-1] > val) {
-      if (verbose>0)
-        Rprintf("rollbs: min element %d is bigger than %d: 0\n", x[ix[0]-1], val); // # nocov
+    if (rowCmp(x, ix[0]-1, vals, vali, ncol)>0) {
+      if (verbose>0) {
+        Rprintf("rollbs: 0: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") is bigger than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
+      }
       return 0;
     }
   } else if (side > 0) { // upper bound early stopping
-    if (x[ix[0]-1] > val) {
-      if (verbose>0)
-        Rprintf("rollbs: min element %d is still bigger than %d: -1\n", x[ix[0]-1], val); // # nocov
+    if (rowCmp(x, ix[0]-1, vals, vali, ncol)>0) {
+      if (verbose>0) {
+        Rprintf("rollbs: -1: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") is still bigger than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
+      }
       return -1;
     }
-    if (x[ix[nix-1]-1] < val) {
-      if (verbose>0)
-        Rprintf("rollbs: max element %d is smaller than %d: nix-1=%d\n", x[ix[nix-1]-1], val, nix-1); // # nocov
+    if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)<0) {
+      if (verbose>0) {
+        Rprintf("rollbs: nix-1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") is smaller than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
+      }
       return nix-1;
     }
   }
-  int lower=0, upper=nix, i=0;
-  while (lower<=upper) {
-    i = lower + (upper-lower)/2;
-    int thisx = x[ix[i]-1];
-    //Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val); // if (verbose) here would slow down while loop so need to be commented out
-    if (thisx==val)
-      return(i);
-    else if (thisx < val)
-      lower = i+1;
-    else if (thisx > val)
-      upper = i-1;
+  int lower=0, upper=nix, i=0; // upper=nix is correct, just in case of wanting to "fix" for nix-1
+  for (int c=0; c<ncol; ++c) {
+    const int val = vals[c][vali];
+    while (lower<=upper) {
+      i = lower + (upper-lower)/2;
+      int thisx = x[c][ix[i]-1];
+      //Rprintf("rollbs: x[%d][ix[%d]-1]=%d ?? %d\n", c, i, thisx, val); // if (verbose) here would slow down while loop so need to be commented out
+      if (thisx==val) {
+        if (c<ncol-1)
+          break;
+        //Rprintf("rollbs: %d: this x %d match value %d at last column\n", i, thisx, val);
+        return(i);
+      } else if (thisx < val)
+        lower = i+1;
+      else if (thisx > val)
+        upper = i-1;
+    }
+  }
+  if (verbose>0) {
+    Rprintf("rollbs: %d: last iter %d, element (", side<0?lower:upper, i); printRow(x, ix[i]-1, ncol); Rprintf(") nomatch to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
   }
-  if (verbose>0)
-    Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d: %d\n", i, x[ix[i]-1], lower, upper, side, side<0?lower:upper); // # nocov
   if (side < 0) // anyone to stress test this logic?
     return lower;
   else
@@ -280,10 +353,10 @@ static int rollbs(const int *restrict x, const int *restrict ix, const int nix,
 
 // cuts x_starts into equal batches and binary search corresponding y_starts ranges
 static void batching(const int nBatch,
-                     const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts,
-                     const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
+                     const int **restrict x, const int nx, const int *restrict x_starts, const int nx_starts,
+                     const int **restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
                      int *restrict Bx_off, int *restrict Bnx, int *restrict By_off, int *restrict Bny,
-                     const int verbose) {
+                     const int ncol, const int verbose) {
 #ifdef SMERGE_BATCHING_BALANCED
   //if (nBatch > nx_starts || (nx_starts/nBatch==1 && nx_starts%nBatch>0)) error("internal error: batching %d input into %d batches, number of batches should have been reduced be now", nx_starts, nBatch); // # nocov
   size_t batchSize = (nx_starts-1)/nBatch + 1; // this is fragile for arbitrary nBatch
@@ -296,14 +369,14 @@ static void batching(const int nBatch,
   if (verbose>0)
     Rprintf("batching: input %d into %s %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize);
   if (lastBatchSize==0 || ((nBatch-1) * batchSize + lastBatchSize != nx_starts))
-    error("internal error: batching %d input is attempting to use invalid batches: balanced=%d, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov
+    error("internal error: batching %d input is attempting to use invalid batches: balanced=%s, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov
   for (int b=0; b<nBatch; ++b) {
     Bx_off[b] = b<nBatch-1 ? b*batchSize : nx_starts-lastBatchSize;
     Bnx[b] = b<nBatch-1 ? batchSize : lastBatchSize;
     int x_i_min = (x_starts + Bx_off[b])[0];
     int x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1];
-    int y_i_min = rollbs(y, y_starts, ny_starts, x[x_i_min-1], -1);
-    int y_i_max = rollbs(y, y_starts, ny_starts, x[x_i_max-1], 1);
+    int y_i_min = rollbs(y, y_starts, ny_starts, x, x_i_min-1, ncol, -1);
+    int y_i_max = rollbs(y, y_starts, ny_starts, x, x_i_max-1, ncol, 1);
     const bool y_match = y_i_min >= 0 && y_i_max >= 0;
     By_off[b] = y_match ? y_i_min : 0;
     Bny[b] = y_match ? y_i_max - y_i_min + 1 : 0;
@@ -314,8 +387,8 @@ static void batching(const int nBatch,
       if (Bny[b] > 0) {
         int x_i_min = (x_starts + Bx_off[b])[0], x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1];
         int y_i_min = (y_starts + By_off[b])[0], y_i_max = (y_starts + By_off[b])[Bny[b]-1];
-        Rprintf("## lower: x[%d]: %d <= %d :y[%d]\n", x_i_min, x[x_i_min-1], y[y_i_min-1], y_i_min);
-        Rprintf("## upper: x[%d]: %d >= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max);
+        Rprintf("## lower: x[%d]: (", x_i_min); printRow(x, x_i_min-1, ncol); Rprintf(") <= ("); printRow(y, y_i_min-1, ncol); Rprintf(") :y[%d]\n", y_i_min);
+        Rprintf("## upper: x[%d]: (", x_i_max); printRow(x, x_i_max-1, ncol); Rprintf(") >= ("); printRow(y, y_i_max-1, ncol); Rprintf(") :y[%d]\n", y_i_max);
       }
     }
   }
@@ -355,12 +428,14 @@ void grpLens(const int *restrict starts, const int n_starts, const int n, int *r
 }
 
 // pure C smerge: takes already sorted input, compute grpLens, prepare batches, smerge in parallel
-void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts,
-             const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
+void smergeC(const int **restrict x, const int nx, const int *restrict x_starts, const int nx_starts,
+             const int **restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
              int *restrict starts, int *restrict lens,
              uint64_t *n_match, bool *x_lens1, bool *y_lens1, bool *xy_lens1,
-             const enum emult mult, const int verbose) {
+             const int ncol, const enum emult mult, const int verbose) {
 
+  if (!nx)
+    return;
   double t = 0;
   if (verbose>0)
     t = omp_get_wtime();
@@ -394,7 +469,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
   }
   int *restrict Bx_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bnx = (int *)R_alloc(nBatch, sizeof(int));
   int *restrict By_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bny = (int *)R_alloc(nBatch, sizeof(int));
-  batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, verbose-1);
+  batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, ncol, verbose-1);
   int *restrict th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used
   if (verbose>0)
     Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t);
@@ -414,7 +489,7 @@ void smergeC(const int *restrict x, const int nx, const int *restrict x_starts,
       y, y_starts, y_lens, unq_y,             // common input y
       starts, lens,                           // common output
       &bnmatch, &bxlens1, &bylens1, &bxylens1,// reduction output
-      mult
+      mult, ncol
     );
     nmatch += bnmatch; xlens1 = bxlens1 && xlens1; ylens1 = bylens1 && ylens1; xylens1 = bxylens1 && xylens1;
     th[b] = omp_get_thread_num(); // only to report thread utilization in verbose message
@@ -535,15 +610,38 @@ const enum emult matchMultArg(SEXP multArg) {
   return mult;
 }
 
+SEXP ord1(int n) {
+  SEXP ans = PROTECT(allocVector(INTSXP, n));
+  int *ansp = INTEGER(ans);
+  for (int i=0; i<n; ++i)
+    ansp[i]=1;
+  UNPROTECT(1);
+  return ans;
+}
+
 // main interface from R
-SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge) {
+SEXP smergeR(SEXP x, SEXP y, SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge) {
 
   const int verbose = GetVerbose()*3; // remove *3 after #4491
   double t_total = 0, t = 0;
   if (verbose>0)
     t_total = omp_get_wtime();
-  if (!isInteger(x) || !isInteger(y))
-    error("'x' and 'y' must be integer");
+  if (!isNewList(x) || !isNewList(y))
+    error("'x' and 'y' must be data.table");
+  if (!isInteger(x_cols) || !isInteger(y_cols))
+    error("'x.cols' and 'y.cols' must be integer");
+  if (LENGTH(x_cols) != LENGTH(y_cols))
+    error("'x.cols' and 'y.cols' must be equal length");
+  const int ncol = LENGTH(x_cols), nxcol = length(x), nycol = length(y);
+  if (!nxcol || !nycol)
+    error("'x' and 'y' must be non 0 length");
+  for (int i=0; i<ncol; ++i) {
+    int x_col = INTEGER(x_cols)[i], y_col = INTEGER(y_cols)[i];
+    if (!(x_col > 0 && x_col <= nxcol) || !(y_col > 0 && y_col <= nycol))
+      error("'x.cols' and 'y.cols' must specify existing columns");
+    if (!isInteger(VECTOR_ELT(x, x_col-1)) || !isInteger(VECTOR_ELT(y, y_col-1)))
+      error("columns of 'x' and 'y' to join must be integer");
+  }
   if (!isString(multArg))
     error("'mult' must be a string");
   const enum emult mult = matchMultArg(multArg);
@@ -551,16 +649,16 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer
   if (!IS_TRUE_OR_FALSE(out_bmerge))
     error("'out.bmerge' must be TRUE or FALSE");
   const bool ans_bmerge = (bool)LOGICAL(out_bmerge)[0];
-  int protecti = 0, nx = LENGTH(x), ny = LENGTH(y);
+  int protecti = 0, nx = LENGTH(VECTOR_ELT(x, 0)), ny = LENGTH(VECTOR_ELT(y, 0));
 
   if (verbose>0)
     t = omp_get_wtime();
   const bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx);
   if (do_x_idx) {
-    x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533
+    x_idx = PROTECT(forder(x, x_cols, ScalarLogical(true), ScalarLogical(true), ord1(ncol), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533
   }
   if (do_y_idx) {
-    y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533
+    y_idx = PROTECT(forder(y, y_cols, ScalarLogical(true), ScalarLogical(true), ord1(ncol), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533
   }
   if (!isInteger(x_idx) || !isInteger(y_idx))
     error("'x.idx' and 'y.idx' must be integer");
@@ -573,18 +671,24 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer
   if (verbose>0)
     t = omp_get_wtime();
   const bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx);
-  int *xp, *yp;
-  if (!x_ord) {
-    xp = (int *)R_alloc(nx, sizeof(int));
-    sortInt(INTEGER(x), nx, INTEGER(x_idx), xp);
-  } else {
-    xp = INTEGER(x);
-  }
-  if (!y_ord) {
-    yp = (int *)R_alloc(ny, sizeof(int));
-    sortInt(INTEGER(y), ny, INTEGER(y_idx), yp);
-  } else {
-    yp = INTEGER(y);
+  const int **restrict xp=(const int **)R_alloc(ncol, sizeof(int*)), **restrict yp=(const int **)R_alloc(ncol, sizeof(int*));
+  for (int i=0; i<ncol; ++i) {
+    SEXP this_x = VECTOR_ELT(x, INTEGER(x_cols)[i]-1);
+    if (x_ord) {
+      xp[i] = INTEGER(this_x);
+    } else {
+      int *tmp = (int *)R_alloc(nx, sizeof(int));
+      sortInt(INTEGER(this_x), nx, INTEGER(x_idx), tmp);
+      xp[i] = tmp;
+    }
+    SEXP this_y = VECTOR_ELT(y, INTEGER(y_cols)[i]-1);
+    if (y_ord) {
+      yp[i] = INTEGER(this_y);
+    } else {
+      int *tmp = (int *)R_alloc(ny, sizeof(int));
+      sortInt(INTEGER(this_y), ny, INTEGER(y_idx), tmp);
+      yp[i] = tmp;
+    }
   }
   if (verbose>0)
     Rprintf("smergeR: sort %s took %.3fs\n", verboseDone(!x_ord, !y_ord, "(already sorted)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t);
@@ -628,10 +732,10 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmer
     yp, ny, INTEGER(y_starts), LENGTH(y_starts),
     starts, lens,
     &n_match, &x_lens1, &y_lens1, &xy_lens1,
-    mult, verbose-1
+    ncol, mult, verbose-1
   );
   if (verbose>0)
-    Rprintf("smergeR: smergeC of %d x %d = %"PRIu64"; took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t);
+    Rprintf("smergeR: smergeC %d cols of %d x %d = %"PRIu64"; took %.3fs\n", ncol, nx, ny, n_match, omp_get_wtime() - t);
 
   if (verbose>0)
     t = omp_get_wtime();

From e1643c2ce72e8bfe2acc00ead23ac7f7282d20da Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Mon, 22 Jun 2020 18:21:07 +0100
Subject: [PATCH 34/34] Revert "multiple columns support"

This reverts commit 2acaf233c9636656e793125aa5a0052b95b979d5.
---
 R/bmerge.R             |  23 +-
 R/wrappers.R           |   3 +-
 inst/tests/smerge.Rraw |  22 +-
 src/data.table.h       |   2 +-
 src/smerge.c           | 498 ++++++++++++++++-------------------------
 5 files changed, 213 insertions(+), 335 deletions(-)

diff --git a/R/bmerge.R b/R/bmerge.R
index a2c20f3576..69e9c89e02 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -1,13 +1,10 @@
-intCols = function(x, cols) all(vapply(cols, function(col, x) is.integer(x[[col]]), NA, x))
 
 bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
 {
-  if (TRUE
-      && isTRUE(getOption("datatable.smerge"))                     ## switch
-      && length(icols)==length(xcols)                              ## avoid invalid input
-      && intCols(i, icols) && intCols(x, xcols)                    ## all columns integer
-      #&& identical(nomatch, NA_integer_)                          ## nomatch=0L is made as post-processing
-      && all(ops==1L)                                              ## equi join
+  if (length(icols)==1L && length(xcols)==1L && is.integer(i[[icols]]) && is.integer(x[[xcols]]) ## single column integer
+      && isTRUE(getOption("datatable.smerge"))                     ## enable option
+      && identical(nomatch, NA_integer_)                           ## for now only outer join
+      && identical(ops, 1L)                                        ## equi join
       && identical(roll, 0) && identical(rollends, c(FALSE, TRUE)) ## non-rolling join
       ) {
     getIdxGrp = function(x, cols) { ## get index only if retGrp=T
@@ -17,18 +14,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
       if (!is.null(attr(idx, "starts", exact=TRUE))) idx
     }
     if (verbose) {last.started.at=proc.time();cat("Starting smerge ...\n");flush.console()}
-    ans = smerge(x=i, y=x, x.cols=icols, y.cols=xcols, x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE)
-    if (identical(nomatch, 0L)) {
-      nom = is.na(ans$starts)
-      ans$starts[nom] = 0L
-      ans$lens[nom] = 0L
-    }
+    ans = smerge(x=i[[icols]], y=x[[xcols]], x.idx=getIdxGrp(i, icols), y.idx=getIdxGrp(x, xcols), mult=mult, out.bmerge=TRUE)
     if (verbose) {cat("smerge done in",timetaken(last.started.at),"\n"); flush.console()}
-    if (verbose && !isTRUE(getOption("datatable.smerge"))) { ## just to satisfy existing bmerge unit tests, when switch above commented out
-      #cat("1\n", file="~/git/smergeOptCount.out", append=TRUE) ## be sure to have this path if you comment out switch of "datatable.smerge" option, 24 times used in main.Rraw
-      cat("existing index\n")
-      cat("ad hoc\n")
-    }
     return(ans)
   }
   callersi = i
diff --git a/R/wrappers.R b/R/wrappers.R
index a8ee7b9989..636e6e04ee 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -13,5 +13,4 @@ coerceFill = function(x) .Call(CcoerceFillR, x)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
-smerge = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.cols, y.cols, x.idx, y.idx, match.arg(mult), out.bmerge)
-
+smerge = function(x, y, x.idx=NULL, y.idx=NULL, mult=c("all","first","last","error"), out.bmerge=FALSE) .Call(CsmergeR, x, y, x.idx, y.idx, match.arg(mult), out.bmerge)
diff --git a/inst/tests/smerge.Rraw b/inst/tests/smerge.Rraw
index 099628a4be..3a3bf9812a 100644
--- a/inst/tests/smerge.Rraw
+++ b/inst/tests/smerge.Rraw
@@ -12,22 +12,18 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   vecseq = data.table:::vecseq
 }
 
-bm = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), mult="all") {
-  if (is.integer(x)) {x = data.table(x=x); x.cols=1L}
-  if (is.integer(y)) {y = data.table(y=y); y.cols=1L}
-  stopifnot(is.data.table(x), is.data.table(y))
-  ans = bmerge(x, y, x.cols, y.cols, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=rep.int(1L, length(x.cols)), verbose=FALSE)
+bm = function(x, y, mult="all") {
+  stopifnot(is.integer(x), is.integer(y))
+  ans = bmerge(data.table(x=x), data.table(y=y), 1L, 1L, roll=0, rollends=c(FALSE, TRUE), nomatch=NA_integer_, mult=mult, ops=1L, verbose=FALSE)
   ## if undefining SMERGE_STATS then we have to ignore allLen1 as well
   ans$nMatch = as.numeric(sum(!is.na(vecseq(ans$starts, ans$lens, NULL))))
   ans
 }
-sm = function(x, y, x.cols=seq_along(x), y.cols=seq_along(y), mult="all") {
-  if (is.integer(x)) {x = data.table(x=x); x.cols=1L}
-  if (is.integer(y)) {y = data.table(y=y); y.cols=1L}
-  stopifnot(is.data.table(x), is.data.table(y))
-  ans = smerge(x, y, x.cols, y.cols, mult=mult, out.bmerge=TRUE)
+sm = function(x, y, mult="all") {
+  stopifnot(is.integer(x), is.integer(y))
+  ans = smerge(x, y, mult=mult, out.bmerge=TRUE)
   ## if undefining SMERGE_STATS then we have to ignore allLen1 as well
-  ans$nMatch = smerge(x, y, x.cols, y.cols, mult=mult, out.bmerge=FALSE)$nMatch
+  ans$nMatch = smerge(x, y, mult=mult, out.bmerge=FALSE)$nMatch
   ans
 }
 
@@ -220,8 +216,8 @@ y = sample.int(2e2L, 1e2L, TRUE)
 test(21.02, sm(x, y), bm(x, y))
 
 # [.data.table join
-d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_len(1e2L))
-d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_len(1e2L))
+d1 = data.table(x=sample.int(2e2L, 1e2L, TRUE), v1=seq_along(x))
+d2 = data.table(y=sample.int(2e2L, 1e2L, TRUE), v2=seq_along(y))
 options(datatable.smerge=FALSE, datatable.verbose=TRUE) ## verbose=2L after #4491
 test(101.01, expected <- d1[d2, on="x==y"], output="bmerge", notOutput="smerge")
 options(datatable.smerge=TRUE)
diff --git a/src/data.table.h b/src/data.table.h
index afef343eb3..231f8f28b4 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -250,4 +250,4 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na);
 SEXP fcaseR(SEXP na, SEXP rho, SEXP args);
 
 // smjoin.c
-SEXP smergeR(SEXP x, SEXP y,SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge);
+SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge);
diff --git a/src/smerge.c b/src/smerge.c
index 8dd403d5d7..8918546e88 100644
--- a/src/smerge.c
+++ b/src/smerge.c
@@ -26,33 +26,29 @@ enum emult{ALL, FIRST, LAST, ERR};
 // workhorse join that runs in parallel on batches
 static void smerge(const int bx_off, const int bnx,
                    const int by_off, const int bny,
-                   const int **restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x,
-                   const int **restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y,
+                   const int *restrict x, const int *restrict x_starts, const int *restrict x_lens, const bool unq_x,
+                   const int *restrict y, const int *restrict y_starts, const int *restrict y_lens, const bool unq_y,
                    int *restrict starts, int *restrict lens,
                    uint64_t *nmatch, bool *xlens1, bool *ylens1, bool *xylens1,
-                   const enum emult mult, const int ncol) {
+                   const enum emult mult) {
   uint64_t cnt = 0;
   bool xlen1 = true, ylen1 = true, xylen1 = true;
   if (unq_x && unq_y) {
     int i = bx_off, j = by_off;
     const int ni = bx_off+bnx, nj = by_off+bny;
     while (i<ni && j<nj) {
-      for (int c=0; c<ncol; ++c) {
-        const int x_i = x[c][i], y_j = y[c][j];
-        if (x_i == y_j) {
-          if (c < ncol-1)
-            continue; // check next columns
-          starts[i] = j+1;
-          //lens[i] = 1; // already filled with 1's we will need this line if we change default alloc
-          i++;
-  #ifdef SMERGE_STATS
-          cnt++;
-  #endif
-        } else if (x_i < y_j) {
-          i++; break;
-        } else if (x_i > y_j) {
-          j++; break;
-        }
+      const int x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        starts[i] = j+1;
+        //lens[i] = 1; // already filled with 1's we will need this line if we change default alloc
+        i++;
+#ifdef SMERGE_STATS
+        cnt++;
+#endif
+      } else if (x_i < y_j) {
+        i++;
+      } else if (x_i > y_j) {
+        j++;
       }
     }
   } else if (unq_x) {
@@ -60,68 +56,56 @@ static void smerge(const int bx_off, const int bnx,
     const int ni = bx_off+bnx, njs = by_off+bny;
     if (mult == ALL || mult == ERR) { // mult==err is raised based on ylens1 flag outside of parallel region
       while (i<ni && js<njs) {
-        for (int c=0; c<ncol; ++c) {
-          const int j = y_starts[js]-1;
-          const int x_i = x[c][i], y_j = y[c][j];
-          if (x_i == y_j) {
-            if (c < ncol-1)
-              continue;
-            starts[i] = j+1;
-            const int yl1 = y_lens[js];
-            lens[i] = yl1;
-            i++;
-  #ifdef SMERGE_STATS
-            if (ylen1 && yl1>1)
-              ylen1 = false;
-            cnt += (uint64_t)yl1;
-  #endif
-          } else if (x_i < y_j) {
-            i++; break;
-          } else if (x_i > y_j) {
-            js++; break;
-          }
+        const int j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          starts[i] = j+1;
+          const int yl1 = y_lens[js];
+          lens[i] = yl1;
+          i++;
+#ifdef SMERGE_STATS
+          if (ylen1 && yl1>1)
+            ylen1 = false;
+          cnt += (uint64_t)yl1;
+#endif
+        } else if (x_i < y_j) {
+          i++;
+        } else if (x_i > y_j) {
+          js++;
         }
       }
     } else if (mult == FIRST) {
       while (i<ni && js<njs) {
-        for (int c=0; c<ncol; ++c) {
-          const int j = y_starts[js]-1;
-          const int x_i = x[c][i], y_j = y[c][j];
-          if (x_i == y_j) {
-            if (c < ncol-1)
-              continue;
-            starts[i] = j+1;
-            //lens[i] = 1;
-            i++;
-  #ifdef SMERGE_STATS
-            cnt++;
-  #endif
-          } else if (x_i < y_j) {
-            i++; break;
-          } else if (x_i > y_j) {
-            js++; break;
-          }
+        const int j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          starts[i] = j+1;
+          //lens[i] = 1;
+          i++;
+#ifdef SMERGE_STATS
+          cnt++;
+#endif
+        } else if (x_i < y_j) {
+          i++;
+        } else if (x_i > y_j) {
+          js++;
         }
       }
     } else if (mult == LAST) {
       while (i<ni && js<njs) {
-        for (int c=0; c<ncol; ++c) {
-          const int j = y_starts[js]-1;
-          const int x_i = x[c][i], y_j = y[c][j];
-          if (x_i == y_j) {
-            if (c < ncol-1)
-              continue;
-            starts[i] = j+y_lens[js];
-            //lens[i] = 1;
-            i++;
-  #ifdef SMERGE_STATS
-            cnt++;
-  #endif
-          } else if (x_i < y_j) {
-            i++; break;
-          } else if (x_i > y_j) {
-            js++; break;
-          }
+        const int j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          starts[i] = j+y_lens[js];
+          //lens[i] = 1;
+          i++;
+#ifdef SMERGE_STATS
+          cnt++;
+#endif
+        } else if (x_i < y_j) {
+          i++;
+        } else if (x_i > y_j) {
+          js++;
         }
       }
     }
@@ -129,117 +113,101 @@ static void smerge(const int bx_off, const int bnx,
     int is = bx_off, j = by_off;
     const int nis = bx_off+bnx, nj = by_off+bny;
     while (is<nis && j<nj) {
-      for (int c=0; c<ncol; ++c) {
-        const int i = x_starts[is]-1;
-        const int x_i = x[c][i], y_j = y[c][j];
+      const int i = x_starts[is]-1;
+      const int x_i = x[i], y_j = y[j];
+      if (x_i == y_j) {
+        const int j1 = j+1;
+        const int xl1 = x_lens[is];
+        for (int ii=0; ii<xl1; ++ii) {
+          starts[i+ii] = j1;
+          //lens[i+ii] = 1;
+        }
+        is++;
+#ifdef SMERGE_STATS
+        if (xlen1 && xl1>1)
+          xlen1 = false;
+        cnt += (uint64_t)xl1;
+#endif
+      } else if (x_i < y_j) {
+        is++;
+      } else if (x_i > y_j) {
+        j++;
+      }
+    }
+  } else {
+    int is = bx_off, js = by_off;
+    const int nis = bx_off+bnx, njs = by_off+bny;
+    if (mult==ALL || mult==ERR) {
+      while (is<nis && js<njs) {
+        const int i = x_starts[is]-1, j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
         if (x_i == y_j) {
-          if (c < ncol-1)
-            continue;
-          const int j1 = j+1;
+          const int j1 = j+1, yl1 = y_lens[js];
           const int xl1 = x_lens[is];
           for (int ii=0; ii<xl1; ++ii) {
             starts[i+ii] = j1;
-            //lens[i+ii] = 1;
+            lens[i+ii] = yl1;
           }
           is++;
   #ifdef SMERGE_STATS
           if (xlen1 && xl1>1)
             xlen1 = false;
-          cnt += (uint64_t)xl1;
+          if (ylen1 && yl1>1)
+            ylen1 = false;
+          if (xylen1 && xl1>1 && yl1>1)
+            xylen1 = false;
+          cnt += (uint64_t)xl1 * (uint64_t)yl1;
   #endif
         } else if (x_i < y_j) {
-          is++; break;
+          is++;
         } else if (x_i > y_j) {
-          j++; break;
-        }
-      }
-    }
-  } else {
-    int is = bx_off, js = by_off;
-    const int nis = bx_off+bnx, njs = by_off+bny;
-    if (mult==ALL || mult==ERR) {
-      while (is<nis && js<njs) {
-        for (int c=0; c<ncol; ++c) {
-          const int i = x_starts[is]-1, j = y_starts[js]-1;
-          const int x_i = x[c][i], y_j = y[c][j];
-          if (x_i == y_j) {
-            if (c < ncol-1)
-              continue;
-            const int j1 = j+1, yl1 = y_lens[js];
-            const int xl1 = x_lens[is];
-            for (int ii=0; ii<xl1; ++ii) {
-              starts[i+ii] = j1;
-              lens[i+ii] = yl1;
-            }
-            is++;
-    #ifdef SMERGE_STATS
-            if (xlen1 && xl1>1)
-              xlen1 = false;
-            if (ylen1 && yl1>1)
-              ylen1 = false;
-            if (xylen1 && xl1>1 && yl1>1)
-              xylen1 = false;
-            cnt += (uint64_t)xl1 * (uint64_t)yl1;
-    #endif
-          } else if (x_i < y_j) {
-            is++; break;
-          } else if (x_i > y_j) {
-            js++; break;
-          }
+          js++;
         }
       }
     } else if (mult==FIRST) {
       while (is<nis && js<njs) {
-        for (int c=0; c<ncol; ++c) {
-          const int i = x_starts[is]-1, j = y_starts[js]-1;
-          const int x_i = x[c][i], y_j = y[c][j];
-          if (x_i == y_j) {
-            if (c < ncol-1)
-              continue;
-            const int j1 = j+1;
-            const int xl1 = x_lens[is];
-            for (int ii=0; ii<xl1; ++ii) {
-              starts[i+ii] = j1;
-              //lens[i+ii] = 1;
-            }
-            is++;
-  #ifdef SMERGE_STATS
-            if (xlen1 && xl1>1)
-              xlen1 = false;
-            cnt += (uint64_t)xl1;
-  #endif
-          } else if (x_i < y_j) {
-            is++; break;
-          } else if (x_i > y_j) {
-            js++; break;
+        const int i = x_starts[is]-1, j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          const int j1 = j+1;
+          const int xl1 = x_lens[is];
+          for (int ii=0; ii<xl1; ++ii) {
+            starts[i+ii] = j1;
+            //lens[i+ii] = 1;
           }
+          is++;
+#ifdef SMERGE_STATS
+          if (xlen1 && xl1>1)
+            xlen1 = false;
+          cnt += (uint64_t)xl1;
+#endif
+        } else if (x_i < y_j) {
+          is++;
+        } else if (x_i > y_j) {
+          js++;
         }
       }
     } else if (mult==LAST) {
       while (is<nis && js<njs) {
-        for (int c=0; c<ncol; ++c) {
-          const int i = x_starts[is]-1, j = y_starts[js]-1;
-          const int x_i = x[c][i], y_j = y[c][j];
-          if (x_i == y_j) {
-            if (c < ncol-1)
-              continue;
-            const int j1 = j+y_lens[js];
-            const int xl1 = x_lens[is];
-            for (int ii=0; ii<xl1; ++ii) {
-              starts[i+ii] = j1;
-              //lens[i+ii] = 1;
-            }
-            is++;
-  #ifdef SMERGE_STATS
-            if (xlen1 && xl1>1)
-              xlen1 = false;
-            cnt += (uint64_t)xl1;
-  #endif
-          } else if (x_i < y_j) {
-            is++; break;
-          } else if (x_i > y_j) {
-            js++; break;
+        const int i = x_starts[is]-1, j = y_starts[js]-1;
+        const int x_i = x[i], y_j = y[j];
+        if (x_i == y_j) {
+          const int j1 = j+y_lens[js];
+          const int xl1 = x_lens[is];
+          for (int ii=0; ii<xl1; ++ii) {
+            starts[i+ii] = j1;
+            //lens[i+ii] = 1;
           }
+          is++;
+#ifdef SMERGE_STATS
+          if (xlen1 && xl1>1)
+            xlen1 = false;
+          cnt += (uint64_t)xl1;
+#endif
+        } else if (x_i < y_j) {
+          is++;
+        } else if (x_i > y_j) {
+          js++;
         }
       }
     }
@@ -248,103 +216,62 @@ static void smerge(const int bx_off, const int bnx,
   return;
 }
 
-// helper when printing from batching
-void printRow(const int **restrict x, const int row, const int ncol) {
-  for (int c=0; c<ncol; ++c)
-    Rprintf("%d%s", x[c][row], c<ncol-1?",":"");
-}
-
-// helper to compare specific rows for early stopping rollbs
-int rowCmp(const int **restrict x, const int x_row, const int **restrict y, const int y_row, const int ncol) {
-  for (int c=0; c<ncol; ++c) {
-    int thisx = x[c][x_row];
-    int thisy = y[c][y_row];
-    if (thisx==thisy) {
-      if (c < ncol-1)
-        continue;
-      return 0; // rows equal
-    } else if (thisx > thisy) {
-      return 1;
-    } else if (thisx < thisy) {
-      return -1;
-    }
-  }
-  error("rowCmp got ncol=0?");
-  return 0;
-}
-
 /*
  * 'rolling nearest' binary search
  * used to find 'y' lower and upper, 0-based, bounds for each batch
  * side -1: lower bound; side 1: upper bound
  */
-static int rollbs(const int **restrict x, const int *restrict ix, const int nix, const int **restrict vals, const int vali, const int ncol, const int side) {
-  int verbose = 0; // manual devel debug only
-  if (nix<1) {
+static int rollbs(const int *restrict x, const int *restrict ix, const int nix, const int val, const int side) {
+  int verbose = 0; // devel debug only
+  if (verbose>0)
+    Rprintf("rollbs: side=%d=%s; val=%d\n", side, side<0?"min":"max", val); // # nocov
+  if (x[ix[0]-1] == val) { // common early stopping
     if (verbose>0)
-      Rprintf("rollbs: -1: empty input\n");
-    return -1;
-  }
-  if (rowCmp(x, ix[0]-1, vals, vali, ncol)==0) { // common early stopping
-    if (verbose>0) {
-      Rprintf("rollbs: 0: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") match to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
-    }
+      Rprintf("rollbs: min elements %d match: 0\n", val); // # nocov
     return 0;
   }
-  if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)==0) {
-    if (verbose>0) {
-      Rprintf("rollbs: nix-1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") match to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
-    }
+  if (x[ix[nix-1]-1] == val) {
+    if (verbose>0)
+      Rprintf("rollbs: max elements %d match: nix-1=%d\n", val, nix-1); // # nocov
     return nix-1;
   }
   if (side < 0) { // lower bound early stopping
-    if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)<0) {
-      if (verbose>0) {
-        Rprintf("rollbs: -1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") is still smaller than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
-      }
+    if (x[ix[nix-1]-1] < val) {
+      if (verbose>0)
+        Rprintf("rollbs: max element %d is still smaller than %d: -1\n", x[ix[nix-1]-1], val); // # nocov
       return -1;
     }
-    if (rowCmp(x, ix[0]-1, vals, vali, ncol)>0) {
-      if (verbose>0) {
-        Rprintf("rollbs: 0: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") is bigger than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
-      }
+    if (x[ix[0]-1] > val) {
+      if (verbose>0)
+        Rprintf("rollbs: min element %d is bigger than %d: 0\n", x[ix[0]-1], val); // # nocov
       return 0;
     }
   } else if (side > 0) { // upper bound early stopping
-    if (rowCmp(x, ix[0]-1, vals, vali, ncol)>0) {
-      if (verbose>0) {
-        Rprintf("rollbs: -1: min element ("); printRow(x, ix[0]-1, ncol); Rprintf(") is still bigger than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
-      }
+    if (x[ix[0]-1] > val) {
+      if (verbose>0)
+        Rprintf("rollbs: min element %d is still bigger than %d: -1\n", x[ix[0]-1], val); // # nocov
       return -1;
     }
-    if (rowCmp(x, ix[nix-1]-1, vals, vali, ncol)<0) {
-      if (verbose>0) {
-        Rprintf("rollbs: nix-1: max element ("); printRow(x, ix[nix-1]-1, ncol); Rprintf(") is smaller than value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
-      }
+    if (x[ix[nix-1]-1] < val) {
+      if (verbose>0)
+        Rprintf("rollbs: max element %d is smaller than %d: nix-1=%d\n", x[ix[nix-1]-1], val, nix-1); // # nocov
       return nix-1;
     }
   }
-  int lower=0, upper=nix, i=0; // upper=nix is correct, just in case of wanting to "fix" for nix-1
-  for (int c=0; c<ncol; ++c) {
-    const int val = vals[c][vali];
-    while (lower<=upper) {
-      i = lower + (upper-lower)/2;
-      int thisx = x[c][ix[i]-1];
-      //Rprintf("rollbs: x[%d][ix[%d]-1]=%d ?? %d\n", c, i, thisx, val); // if (verbose) here would slow down while loop so need to be commented out
-      if (thisx==val) {
-        if (c<ncol-1)
-          break;
-        //Rprintf("rollbs: %d: this x %d match value %d at last column\n", i, thisx, val);
-        return(i);
-      } else if (thisx < val)
-        lower = i+1;
-      else if (thisx > val)
-        upper = i-1;
-    }
-  }
-  if (verbose>0) {
-    Rprintf("rollbs: %d: last iter %d, element (", side<0?lower:upper, i); printRow(x, ix[i]-1, ncol); Rprintf(") nomatch to value ("); printRow(vals, vali, ncol); Rprintf(")\n"); // # nocov
+  int lower=0, upper=nix, i=0;
+  while (lower<=upper) {
+    i = lower + (upper-lower)/2;
+    int thisx = x[ix[i]-1];
+    //Rprintf("rollbs: x[ix[%d]-1]=%d ?? %d\n", i, thisx, val); // if (verbose) here would slow down while loop so need to be commented out
+    if (thisx==val)
+      return(i);
+    else if (thisx < val)
+      lower = i+1;
+    else if (thisx > val)
+      upper = i-1;
   }
+  if (verbose>0)
+    Rprintf("rollbs: nomatch: i=%d; this=%d; lower=%d, upper=%d; side=%d: %d\n", i, x[ix[i]-1], lower, upper, side, side<0?lower:upper); // # nocov
   if (side < 0) // anyone to stress test this logic?
     return lower;
   else
@@ -353,10 +280,10 @@ static int rollbs(const int **restrict x, const int *restrict ix, const int nix,
 
 // cuts x_starts into equal batches and binary search corresponding y_starts ranges
 static void batching(const int nBatch,
-                     const int **restrict x, const int nx, const int *restrict x_starts, const int nx_starts,
-                     const int **restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
+                     const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts,
+                     const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
                      int *restrict Bx_off, int *restrict Bnx, int *restrict By_off, int *restrict Bny,
-                     const int ncol, const int verbose) {
+                     const int verbose) {
 #ifdef SMERGE_BATCHING_BALANCED
   //if (nBatch > nx_starts || (nx_starts/nBatch==1 && nx_starts%nBatch>0)) error("internal error: batching %d input into %d batches, number of batches should have been reduced be now", nx_starts, nBatch); // # nocov
   size_t batchSize = (nx_starts-1)/nBatch + 1; // this is fragile for arbitrary nBatch
@@ -369,14 +296,14 @@ static void batching(const int nBatch,
   if (verbose>0)
     Rprintf("batching: input %d into %s %d batches (batchSize=%d, lastBatchSize=%d) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:\n", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize);
   if (lastBatchSize==0 || ((nBatch-1) * batchSize + lastBatchSize != nx_starts))
-    error("internal error: batching %d input is attempting to use invalid batches: balanced=%s, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov
+    error("internal error: batching %d input is attempting to use invalid batches: balanced=%d, nBatch=%d, batchSize=%d, lastBatchSize=%d", nx_starts, balanced?"balanced":"unbalanced", nBatch, batchSize, lastBatchSize); // # nocov
   for (int b=0; b<nBatch; ++b) {
     Bx_off[b] = b<nBatch-1 ? b*batchSize : nx_starts-lastBatchSize;
     Bnx[b] = b<nBatch-1 ? batchSize : lastBatchSize;
     int x_i_min = (x_starts + Bx_off[b])[0];
     int x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1];
-    int y_i_min = rollbs(y, y_starts, ny_starts, x, x_i_min-1, ncol, -1);
-    int y_i_max = rollbs(y, y_starts, ny_starts, x, x_i_max-1, ncol, 1);
+    int y_i_min = rollbs(y, y_starts, ny_starts, x[x_i_min-1], -1);
+    int y_i_max = rollbs(y, y_starts, ny_starts, x[x_i_max-1], 1);
     const bool y_match = y_i_min >= 0 && y_i_max >= 0;
     By_off[b] = y_match ? y_i_min : 0;
     Bny[b] = y_match ? y_i_max - y_i_min + 1 : 0;
@@ -387,8 +314,8 @@ static void batching(const int nBatch,
       if (Bny[b] > 0) {
         int x_i_min = (x_starts + Bx_off[b])[0], x_i_max = (x_starts + Bx_off[b])[Bnx[b]-1];
         int y_i_min = (y_starts + By_off[b])[0], y_i_max = (y_starts + By_off[b])[Bny[b]-1];
-        Rprintf("## lower: x[%d]: (", x_i_min); printRow(x, x_i_min-1, ncol); Rprintf(") <= ("); printRow(y, y_i_min-1, ncol); Rprintf(") :y[%d]\n", y_i_min);
-        Rprintf("## upper: x[%d]: (", x_i_max); printRow(x, x_i_max-1, ncol); Rprintf(") >= ("); printRow(y, y_i_max-1, ncol); Rprintf(") :y[%d]\n", y_i_max);
+        Rprintf("## lower: x[%d]: %d <= %d :y[%d]\n", x_i_min, x[x_i_min-1], y[y_i_min-1], y_i_min);
+        Rprintf("## upper: x[%d]: %d >= %d :y[%d]\n", x_i_max, x[x_i_max-1], y[y_i_max-1], y_i_max);
       }
     }
   }
@@ -428,14 +355,12 @@ void grpLens(const int *restrict starts, const int n_starts, const int n, int *r
 }
 
 // pure C smerge: takes already sorted input, compute grpLens, prepare batches, smerge in parallel
-void smergeC(const int **restrict x, const int nx, const int *restrict x_starts, const int nx_starts,
-             const int **restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
+void smergeC(const int *restrict x, const int nx, const int *restrict x_starts, const int nx_starts,
+             const int *restrict y, const int ny, const int *restrict y_starts, const int ny_starts,
              int *restrict starts, int *restrict lens,
              uint64_t *n_match, bool *x_lens1, bool *y_lens1, bool *xy_lens1,
-             const int ncol, const enum emult mult, const int verbose) {
+             const enum emult mult, const int verbose) {
 
-  if (!nx)
-    return;
   double t = 0;
   if (verbose>0)
     t = omp_get_wtime();
@@ -469,7 +394,7 @@ void smergeC(const int **restrict x, const int nx, const int *restrict x_starts,
   }
   int *restrict Bx_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bnx = (int *)R_alloc(nBatch, sizeof(int));
   int *restrict By_off = (int *)R_alloc(nBatch, sizeof(int)), *restrict Bny = (int *)R_alloc(nBatch, sizeof(int));
-  batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, ncol, verbose-1);
+  batching(nBatch, x, nx, x_starts, nx_starts, y, ny, y_starts, ny_starts, Bx_off, Bnx, By_off, Bny, verbose-1);
   int *restrict th = (int *)R_alloc(nBatch, sizeof(int)); // report threads used
   if (verbose>0)
     Rprintf("smergeC: preparing %d batches took %.3fs\n", nBatch, omp_get_wtime() - t);
@@ -489,7 +414,7 @@ void smergeC(const int **restrict x, const int nx, const int *restrict x_starts,
       y, y_starts, y_lens, unq_y,             // common input y
       starts, lens,                           // common output
       &bnmatch, &bxlens1, &bylens1, &bxylens1,// reduction output
-      mult, ncol
+      mult
     );
     nmatch += bnmatch; xlens1 = bxlens1 && xlens1; ylens1 = bylens1 && ylens1; xylens1 = bxylens1 && xylens1;
     th[b] = omp_get_thread_num(); // only to report thread utilization in verbose message
@@ -610,38 +535,15 @@ const enum emult matchMultArg(SEXP multArg) {
   return mult;
 }
 
-SEXP ord1(int n) {
-  SEXP ans = PROTECT(allocVector(INTSXP, n));
-  int *ansp = INTEGER(ans);
-  for (int i=0; i<n; ++i)
-    ansp[i]=1;
-  UNPROTECT(1);
-  return ans;
-}
-
 // main interface from R
-SEXP smergeR(SEXP x, SEXP y, SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge) {
+SEXP smergeR(SEXP x, SEXP y, SEXP x_idx, SEXP y_idx, SEXP multArg, SEXP out_bmerge) {
 
   const int verbose = GetVerbose()*3; // remove *3 after #4491
   double t_total = 0, t = 0;
   if (verbose>0)
     t_total = omp_get_wtime();
-  if (!isNewList(x) || !isNewList(y))
-    error("'x' and 'y' must be data.table");
-  if (!isInteger(x_cols) || !isInteger(y_cols))
-    error("'x.cols' and 'y.cols' must be integer");
-  if (LENGTH(x_cols) != LENGTH(y_cols))
-    error("'x.cols' and 'y.cols' must be equal length");
-  const int ncol = LENGTH(x_cols), nxcol = length(x), nycol = length(y);
-  if (!nxcol || !nycol)
-    error("'x' and 'y' must be non 0 length");
-  for (int i=0; i<ncol; ++i) {
-    int x_col = INTEGER(x_cols)[i], y_col = INTEGER(y_cols)[i];
-    if (!(x_col > 0 && x_col <= nxcol) || !(y_col > 0 && y_col <= nycol))
-      error("'x.cols' and 'y.cols' must specify existing columns");
-    if (!isInteger(VECTOR_ELT(x, x_col-1)) || !isInteger(VECTOR_ELT(y, y_col-1)))
-      error("columns of 'x' and 'y' to join must be integer");
-  }
+  if (!isInteger(x) || !isInteger(y))
+    error("'x' and 'y' must be integer");
   if (!isString(multArg))
     error("'mult' must be a string");
   const enum emult mult = matchMultArg(multArg);
@@ -649,16 +551,16 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, S
   if (!IS_TRUE_OR_FALSE(out_bmerge))
     error("'out.bmerge' must be TRUE or FALSE");
   const bool ans_bmerge = (bool)LOGICAL(out_bmerge)[0];
-  int protecti = 0, nx = LENGTH(VECTOR_ELT(x, 0)), ny = LENGTH(VECTOR_ELT(y, 0));
+  int protecti = 0, nx = LENGTH(x), ny = LENGTH(y);
 
   if (verbose>0)
     t = omp_get_wtime();
   const bool do_x_idx = isNull(x_idx), do_y_idx = isNull(y_idx);
   if (do_x_idx) {
-    x_idx = PROTECT(forder(x, x_cols, ScalarLogical(true), ScalarLogical(true), ord1(ncol), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533
+    x_idx = PROTECT(forder(x, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533
   }
   if (do_y_idx) {
-    y_idx = PROTECT(forder(y, y_cols, ScalarLogical(true), ScalarLogical(true), ord1(ncol), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533
+    y_idx = PROTECT(forder(y, R_NilValue, ScalarLogical(true), ScalarLogical(true), ScalarInteger(1), ScalarLogical(false))); protecti++; // verbose=verbose-2L after #4533
   }
   if (!isInteger(x_idx) || !isInteger(y_idx))
     error("'x.idx' and 'y.idx' must be integer");
@@ -671,24 +573,18 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, S
   if (verbose>0)
     t = omp_get_wtime();
   const bool x_ord = !LENGTH(x_idx), y_ord = !LENGTH(y_idx);
-  const int **restrict xp=(const int **)R_alloc(ncol, sizeof(int*)), **restrict yp=(const int **)R_alloc(ncol, sizeof(int*));
-  for (int i=0; i<ncol; ++i) {
-    SEXP this_x = VECTOR_ELT(x, INTEGER(x_cols)[i]-1);
-    if (x_ord) {
-      xp[i] = INTEGER(this_x);
-    } else {
-      int *tmp = (int *)R_alloc(nx, sizeof(int));
-      sortInt(INTEGER(this_x), nx, INTEGER(x_idx), tmp);
-      xp[i] = tmp;
-    }
-    SEXP this_y = VECTOR_ELT(y, INTEGER(y_cols)[i]-1);
-    if (y_ord) {
-      yp[i] = INTEGER(this_y);
-    } else {
-      int *tmp = (int *)R_alloc(ny, sizeof(int));
-      sortInt(INTEGER(this_y), ny, INTEGER(y_idx), tmp);
-      yp[i] = tmp;
-    }
+  int *xp, *yp;
+  if (!x_ord) {
+    xp = (int *)R_alloc(nx, sizeof(int));
+    sortInt(INTEGER(x), nx, INTEGER(x_idx), xp);
+  } else {
+    xp = INTEGER(x);
+  }
+  if (!y_ord) {
+    yp = (int *)R_alloc(ny, sizeof(int));
+    sortInt(INTEGER(y), ny, INTEGER(y_idx), yp);
+  } else {
+    yp = INTEGER(y);
   }
   if (verbose>0)
     Rprintf("smergeR: sort %s took %.3fs\n", verboseDone(!x_ord, !y_ord, "(already sorted)", "(y)", "(x)", "(x, y)"), omp_get_wtime() - t);
@@ -732,10 +628,10 @@ SEXP smergeR(SEXP x, SEXP y, SEXP x_cols, SEXP y_cols, SEXP x_idx, SEXP y_idx, S
     yp, ny, INTEGER(y_starts), LENGTH(y_starts),
     starts, lens,
     &n_match, &x_lens1, &y_lens1, &xy_lens1,
-    ncol, mult, verbose-1
+    mult, verbose-1
   );
   if (verbose>0)
-    Rprintf("smergeR: smergeC %d cols of %d x %d = %"PRIu64"; took %.3fs\n", ncol, nx, ny, n_match, omp_get_wtime() - t);
+    Rprintf("smergeR: smergeC of %d x %d = %"PRIu64"; took %.3fs\n", nx, ny, n_match, omp_get_wtime() - t);
 
   if (verbose>0)
     t = omp_get_wtime();