diff --git a/NAMESPACE b/NAMESPACE index 2bc30543f..d20f87450 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -58,6 +58,8 @@ export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) +export(cbindlist) +export(mergelist) export(substitute2) #export(DT) # mtcars |> DT(i,j,by) #4872 #5472 diff --git a/NEWS.md b/NEWS.md index 8b15449e8..d5d433226 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,10 @@ 1. In `DT[, variable := value]`, when value is class `POSIXlt`, we automatically coerce it to class `POSIXct` instead, [#1724](https://github.com/Rdatatable/data.table/issues/1724). Thanks to @linzhp for the report, and Benjamin Schwendinger for the fix. +## NEW FEATURES + +1. (add example here?) New functions `cbindlist` and `mergelist` have been implemented and exported. Works like `cbind`/`merge` but takes `list` of data.tables on input. `merge` happens in `Reduce` fashion. Supports `how` (_left_, _inner_, _full_, _right_, _semi_, _anti_, _cross_) joins and `mult` argument, closes [#599](https://github.com/Rdatatable/data.table/issues/599) and [#2576](https://github.com/Rdatatable/data.table/issues/2576). + ## NOTES 1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix. diff --git a/R/data.table.R b/R/data.table.R index 3b58bb502..02ea2a694 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -199,7 +199,7 @@ replace_dot_alias = function(e) { } return(x) } - if (!mult %chin% c("first","last","all")) stopf("mult argument can only be 'first', 'last' or 'all'") + if (!mult %chin% c("first", "last", "all", "error")) stopf("mult argument can only be 'first', 'last', 'all' or 'error'") missingroll = missing(roll) if (length(roll)!=1L || is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'") if (is.character(roll)) { @@ -495,6 +495,7 @@ replace_dot_alias = function(e) { } i = .shallow(i, retain.key = TRUE) ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose) + if (mult=="error") mult="all" xo = ans$xo ## to make it available for further use. # temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this # 'setorder', as there's another 'setorder' in generating 'irows' below... @@ -516,13 +517,22 @@ replace_dot_alias = function(e) { if (!byjoin || nqbyjoin) { # Really, `anyDuplicated` in base is AWESOME! # allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates - if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()} - irows = if (allLen1) f__ else vecseq(f__,len__, - if (allow.cartesian || - notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x). - !anyDuplicated(f__, incomparables = c(0L, NA_integer_))) { - NULL # #742. If 'i' has no duplicates, ignore - } else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)). + if (verbose) {last.started.at=proc.time();cat("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()} + irows = if (allLen1) f__ else { + join.many = getOption("datatable.join.many") # #914, default TRUE for backward compatibility + anyDups = if (!join.many && length(f__)==1L && len__==nrow(x)) { + NULL # special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"] + } else if (!notjoin && ( # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x). + !allow.cartesian || + !join.many)) + as.logical(anyDuplicated(f__, incomparables = c(0L, NA_integer_))) + limit = if (!is.null(anyDups) && anyDups) { # #742. If 'i' has no duplicates, ignore + if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.") + else if (!allow.cartesian && !notjoin) as.double(nrow(x)+nrow(i)) + else internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov + } + vecseq(f__, len__, limit) + } # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)). if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} # Fix for #1092 and #1074 # TODO: implement better version of "any"/"all"/"which" to avoid diff --git a/R/mergelist.R b/R/mergelist.R new file mode 100644 index 000000000..2a5c00e54 --- /dev/null +++ b/R/mergelist.R @@ -0,0 +1,335 @@ +cbindlist = function(l, copy=TRUE) { + ans = .Call(Ccbindlist, l, copy) + if (anyDuplicated(names(ans))) { ## invalidate key and index + setattr(ans, "sorted", NULL) + setattr(ans, "index", integer()) + } + setDT(ans) + ans +} + +# when 'on' is missing then use keys, used only for inner and full join +onkeys = function(x, y) { + if (is.null(x) && !is.null(y)) y + else if (!is.null(x) && is.null(y)) x + else if (!is.null(x) && !is.null(y)) { + if (length(x)>=length(y)) intersect(y, x) ## align order to shorter|rhs key + else intersect(x, y) + } else NULL # nocov ## internal error is being called later in mergepair +} +someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) { + keep = colnamesInt(x, keep) + drop = colnamesInt(x, drop) + cols = colnamesInt(x, cols) + ans = union(keep, setdiff(cols, drop)) + if (!retain.order) return(ans) + intersect(colnamesInt(x, NULL), ans) +} +hasindex = function(x, by, retGrp=FALSE) { + index = attr(x, "index", TRUE) + if (is.null(index)) return(FALSE) + idx_name = paste0("__",by,collapse="") + idx = attr(index, idx_name, TRUE) + if (is.null(idx)) return(FALSE) + if (!retGrp) return(TRUE) + return(!is.null(attr(idx, "starts", TRUE))) +} + +# fdistinct applies mult='first|last' +# for mult='first' it is unique(x, by=on)[, c(on, cols), with=FALSE] +# it may not copy when copy=FALSE and x is unique by 'on' +fdistinct = function(x, on=key(x), mult=c("first","last"), cols=seq_along(x), copy=TRUE) { + if (!perhaps.data.table(x)) + stopf("'x' must be data.table") + if (!is.character(on) || !length(on) || anyNA(on) || !all(on %chin% names(x))) + stopf("'on' must be character column names of 'x' argument") + mult = match.arg(mult) + if (is.null(cols)) + cols = seq_along(x) + else if (!(is.character(cols) || is.integer(cols)) || !length(cols) || anyNA(cols)) + stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument") + if (!isTRUEorFALSE(copy)) + stopf("'%s' must be TRUE or FALSE", "copy") + ## do not compute sort=F for mult="first" if index (sort=T) already available, sort=T is needed only for mult="last" + ## this short circuit will work after #4386 because it requires retGrp=T + #### sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE) + sort = TRUE ## above line does not work for the moment, test 302.02 + o = forderv(x, by=on, sort=sort, retGrp=TRUE) + if (attr(o, "maxgrpn", TRUE) <= 1L) { + ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE) + if (copy) ans = copy(ans) + return(ans) + } + f = attr(o, "starts", exact=TRUE) + if (mult=="last") { + if (!sort) internal_error("sort must be TRUE when computing mult='last'") # nocov + f = c(f[-1L]-1L, nrow(x)) ## last of each group + } + if (length(o)) f = o[f] + if (sort && length(o <- forderv(f))) f = f[o] ## this rolls back to original order + .Call(CsubsetDT, x, f, someCols(x, cols, keep=on)) +} + +# extra layer over bmerge to provide ready to use row indices (or NULL for 1:nrow) +# NULL to avoid extra copies in downstream code, it turned out that avoiding copies precisely is costly and enormously complicates code, need #4409 and/or handle 1:nrow in subsetDT +dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) { + nomatch = switch(how, "inner"=, "semi"=, "anti"=, "cross"= 0L, "left"=, "right"=, "full"= NA_integer_) + nomatch0 = identical(nomatch, 0L) + if (is.null(mult)) + mult = switch(how, "semi"=, "anti"= "last", "cross"= "all", "inner"=, "left"=, "right"=, "full"= "error") + if (void && mult!="error") + internal_error("void must be used with mult='error'") # nocov + if (how=="cross") { ## short-circuit bmerge results only for cross join + if (length(on) || mult!="all" || !join.many) + stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE") + if (void) + internal_error("cross join must be used with void=FALSE") # nocov + ans = list(allLen1=FALSE, starts=rep.int(1L, nrow(i)), lens=rep.int(nrow(x), nrow(i)), xo=integer()) + } else { + if (!length(on)) + stopf("'on' must be non-zero length character vector") + if (mult=="all" && (how=="semi" || how=="anti")) + stopf("semi and anti joins must be used with mult!='all'") + icols = colnamesInt(i, on, check_dups=TRUE) + xcols = colnamesInt(x, on, check_dups=TRUE) + ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose) + if (void) { ## void=T is only for the case when we want raise error for mult='error', and that would happen in above line + return(invisible(NULL)) + } else if (how=="semi" || how=="anti") { ## semi and anti short-circuit + irows = which(if (how=="semi") ans$lens!=0L else ans$lens==0L) ## we will subset i rather than x, thus assign to irows, not to xrows + if (length(irows)==length(ans$lens)) irows = NULL + return(list(ans=ans, irows=irows)) + } else if (mult=="all" && !ans$allLen1 && !join.many && ## join.many, like allow.cartesian, check + !(length(ans$starts)==1L && ans$lens==nrow(x)) && ## special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"] + anyDuplicated(ans$starts, incomparables=c(0L,NA_integer_)) + ) + stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.") + } + + ## xrows, join-to + xrows = if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL) + if (nomatch0 && ans$allLen1) xrows = xrows[as.logical(ans$lens)] + len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here + + ## irows, join-from + irows = if (!(ans$allLen1 && (!nomatch0 || len.x==length(ans$starts)))) seqexp(ans$lens) + len.i = if (is.null(irows)) nrow(i) else length(irows) + + if (length(ans$xo) && length(xrows)) + xrows = ans$xo[xrows] + len.x = length(xrows) + + if (len.i!=len.x) + internal_error("dtmerge out len.i != len.x") # nocov + + return(list(ans=ans, irows=irows, xrows=xrows)) +} + +# atomic join between two tables +mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=names(rhs), copy=TRUE, join.many=TRUE, verbose=FALSE) { + semianti = how=="semi" || how=="anti" + innerfull = how=="inner" || how=="full" + { + if (how!="cross") { + if (is.null(on)) { + if (how=="left" || semianti) on = key(rhs) + else if (how=="right") on = key(lhs) + else if (innerfull) on = onkeys(key(lhs), key(rhs)) + if (is.null(on)) + stopf("'on' is missing and necessary key is not present") + } + if (any(bad.on <- !on %chin% names(lhs))) + stopf("'on' argument specify columns to join [%s] that are not present in LHS table [%s]", brackify(on[bad.on]), brackify(names(lhs))) + if (any(bad.on <- !on %chin% names(rhs))) + stopf("'on' argument specify columns to join [%s] that are not present in RHS table [%s]", brackify(on[bad.on]), brackify(names(rhs))) + } else if (is.null(on)) { + on = character() ## cross join only + } + } ## on + { + if (how!="right") { + jnfm = lhs; fm.cols = lhs.cols; jnto = rhs; to.cols = rhs.cols + } else { + jnfm = rhs; fm.cols = rhs.cols; jnto = lhs; to.cols = lhs.cols + } + } ## join-to and join-from tables and columns (right outer join swap) + + ## ensure symmetric join for inner|full join, apply mult on both tables, bmerge do only 'x' table + cp.i = FALSE ## copy marker of out.i + if ((innerfull) && !is.null(mult) && (mult=="first" || mult=="last")) { + jnfm = fdistinct(jnfm, on=on, mult=mult, cols=fm.cols, copy=FALSE) ## might not copy when already unique by 'on' + cp.i = nrow(jnfm)!=nrow(lhs) ## nrow(lhs) bc how='inner|full' so jnfm=lhs + } else if (how=="inner" && (is.null(mult) || mult=="error")) { ## we do this branch only to raise error from bmerge, we cannot use forder to just find duplicates because those duplicates might not have matching rows in another table, full join checks mult='error' during two non-void bmerges + dtmerge(x=jnfm, i=jnto, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many, void=TRUE) + } + + ## binary merge + ans = dtmerge(x=jnto, i=jnfm, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many) + + ## make i side + out.i = if (is.null(ans$irows)) + .shallow(jnfm, cols=someCols(jnfm, fm.cols, keep=on, retain.order=semianti), retain.key=TRUE) + else + .Call(CsubsetDT, jnfm, ans$irows, someCols(jnfm, fm.cols, keep=on, retain.order=semianti)) + cp.i = cp.i || !is.null(ans$irows) + + ## make x side + if (semianti) { + out.x = list(); cp.x = TRUE + } else { + out.x = if (is.null(ans$xrows)) ## as of now xrows cannot be NULL #4409 thus nocov below + internal_error("dtmerge()$xrows returned NULL, #4409 been resolved but related code has not been updated?") #.shallow(jnto, cols=someCols(jnto, to.cols, drop=on), retain.key=TRUE) # nocov ## as of now nocov does not make difference r-lib/covr#279 + else + .Call(CsubsetDT, jnto, ans$xrows, someCols(jnto, to.cols, drop=on)) + cp.x = !is.null(ans$xrows) + ## ensure no duplicated column names in merge results + if (any(dup.i<-names(out.i) %chin% names(out.x))) + stopf("merge result has duplicated column names, use 'cols' argument or rename columns in 'l' tables, duplicated column(s): %s", brackify(names(out.i)[dup.i])) + } + + ## stack i and x + if (how!="full") { + if (!cp.i && copy) out.i = copy(out.i) + #if (!cp.x && copy) out.x = copy(out.x) ## as of now cp.x always TRUE, search for #4409 here + out = .Call(Ccbindlist, list(out.i, out.x), FALSE) + if (how=="right") setcolorder(out, neworder=c(on, names(out.x))) ## arrange columns: i.on, x.cols, i.cols + } else { # how=="full" + ## we made left join side above, proceed to right join side, so swap tbls + jnfm = rhs; fm.cols = rhs.cols; jnto = lhs; to.cols = lhs.cols + + cp.r = FALSE + if (!is.null(mult) && (mult=="first" || mult=="last")) { + jnfm = fdistinct(jnfm, on=on, mult=mult, cols=fm.cols, copy=FALSE) + cp.r = nrow(jnfm)!=nrow(rhs) ## nrow(rhs) bc jnfm=rhs + } ## mult=="error" check was made on one side already, below we do on the second side, test 101.43 + + ## binary merge anti join + bns = dtmerge(x=jnto, i=jnfm, on=on, how="anti", mult=if (!is.null(mult) && mult!="all") mult, verbose=verbose, join.many=join.many) + + ## make anti join side + out.r = if (is.null(bns$irows)) + .shallow(jnfm, cols=someCols(jnfm, fm.cols, keep=on), retain.key=TRUE) ## retain.key is used only in the edge case when !nrow(out.i) + else + .Call(CsubsetDT, jnfm, bns$irows, someCols(jnfm, fm.cols, keep=on)) + cp.r = cp.r || !is.null(bns$irows) + + ## short circuit to avoid rbindlist to empty sets and retains keys + if (!nrow(out.r)) { ## possibly also !nrow(out.i) + if (!cp.i && copy) out.i = copy(out.i) + #if (!cp.x && copy) out.x = copy(out.x) ## as of now cp.x always TRUE, search for #4409 here + out = .Call(Ccbindlist, list(out.i, out.x), FALSE) + } else if (!nrow(out.i)) { ## but not !nrow(out.r) + if (!cp.r && copy) out.r = copy(out.r) + if (length(add<-setdiff(names(out.i), names(out.r)))) { ## add missing columns of proper types NA + neworder = copy(names(out.i)) #set(out.r, NULL, add, lapply(unclass(out.i)[add], `[`, 1L)) ## 291.04 overalloc exceed fail during set() + out.i = lapply(unclass(out.i)[add], `[`, seq_len(nrow(out.r))) ## could eventually remove this when cbindlist recycle 0 rows up, note that we need out.r not to be copied + out.r = .Call(Ccbindlist, list(out.r, out.i), FALSE) + setcolorder(out.r, neworder=neworder) + } + out = out.r + } else { ## all might have not been copied yet, rbindlist will copy + out.l = .Call(Ccbindlist, list(out.i, out.x), FALSE) + out = rbindlist(list(out.l, out.r), use.names=TRUE, fill=TRUE) + } + } + setDT(out) +} + +mergelist = function(l, on, cols, how=c("left","inner","full","right","semi","anti","cross"), mult, copy=TRUE, join.many=getOption("datatable.join.many")) { + verbose = getOption("datatable.verbose") + if (verbose) + p = proc.time()[[3L]] + { + if (!is.list(l) || is.data.frame(l)) + stopf("'l' must be a list") + if (!all(vapply_1b(l, is.data.table))) + stopf("Every element of 'l' list must be data.table objects") + if (!all(lengths(l))) + stopf("Tables in 'l' argument must be non-zero columns tables") + if (any(vapply_1i(l, function(x) anyDuplicated(names(x))))) + stopf("Some of the tables in 'l' have duplicated column names") + } ## l + if (!isTRUEorFALSE(copy)) + stopf("'%s' must be TRUE or FALSE", "copy") + n = length(l) + if (n<2L) { + out = if (!n) as.data.table(l) else l[[1L]] + if (copy) out = copy(out) + if (verbose) + catf("mergelist: merging %d table(s), took %.3fs\n", n, proc.time()[[3L]]-p) + return(out) + } + { + if (!is.list(join.many)) + join.many = rep(list(join.many), n-1L) + if (length(join.many)!=n-1L || !all(vapply_1b(join.many, isTRUEorFALSE))) + stopf("'join.many' must be TRUE or FALSE, or a list of such which length must be length(l)-1L") + } ## join.many + { + if (missing(mult)) + mult = NULL + if (!is.list(mult)) + mult = rep(list(mult), n-1L) + if (length(mult)!=n-1L || !all(vapply_1b(mult, function(x) is.null(x) || (is.character(x) && length(x)==1L && !anyNA(x) && x %chin% c("error","all","first","last"))))) + stopf("'mult' must be one of [error, all, first, last] or NULL, or a list of such which length must be length(l)-1L") + } ## mult + { + if (missing(how) || is.null(how)) + how = match.arg(how) + if (!is.list(how)) + how = rep(list(how), n-1L) + if (length(how)!=n-1L || !all(vapply_1b(how, function(x) is.character(x) && length(x)==1L && !anyNA(x) && x %chin% c("left","inner","full","right","semi","anti","cross")))) + stopf("'how' must be one of [left, inner, full, right, semi, anti, cross], or a list of such which length must be length(l)-1L") + } ## how + { + if (missing(cols) || is.null(cols)) { + cols = vector("list", n) + } else { + if (!is.list(cols)) + stopf("'%s' must be a list", "cols") + if (length(cols) != n) + stopf("'cols' must be same length as 'l'") + skip = vapply_1b(cols, is.null) + if (!all(vapply_1b(cols[!skip], function(x) is.character(x) && !anyNA(x) && !anyDuplicated(x)))) + stopf("'cols' must be a list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULLs (all columns)") + if (any(mapply(function(x, icols) !all(icols %chin% names(x)), l[!skip], cols[!skip]))) + stopf("'cols' specify columns not present in corresponding table") + } + } ## cols + { + if (missing(on) || is.null(on)) { + on = vector("list", n-1L) + } else { + if (!is.list(on)) + on = rep(list(on), n-1L) + if (length(on)!=n-1L || !all(vapply_1b(on, function(x) is.character(x) && !anyNA(x) && !anyDuplicated(x)))) ## length checked in dtmerge + stopf("'on' must be non-NA, non-duplicated, character vector, or a list of such which length must be length(l)-1L") + } + } ## on + + l.mem = lapply(l, vapply, address, "") + out = l[[1L]] + out.cols = cols[[1L]] + for (join.i in seq_len(n-1L)) { + rhs.i = join.i + 1L + out = mergepair( + lhs = out, rhs = l[[rhs.i]], + on = on[[join.i]], + how = how[[join.i]], mult = mult[[join.i]], + lhs.cols = out.cols, rhs.cols = cols[[rhs.i]], + copy = FALSE, ## avoid any copies inside, will copy once below + join.many = join.many[[join.i]], + verbose = verbose + ) + out.cols = copy(names(out)) + } + out.mem = vapply_1c(out, address) + if (copy) + .Call(CcopyCols, out, colnamesInt(out, names(out.mem)[out.mem %chin% unique(unlist(l.mem, recursive=FALSE))])) + if (verbose) + catf("mergelist: merging %d tables, took %.3fs\n", n, proc.time()[[3L]]-p) + out +} + +seqexp = function(x) .Call(Cseqexp, x) +perhaps.data.table = function(x) .Call(CperhapsDataTableR, x) diff --git a/R/onLoad.R b/R/onLoad.R index 01b159f77..ef6bb4a02 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -81,6 +81,7 @@ "datatable.print.trunc.cols"="FALSE", # for print.data.table "datatable.show.indices"="FALSE", # for print.data.table "datatable.allow.cartesian"="FALSE", # datatable. + "datatable.join.many"="TRUE", # mergelist, [.data.table #4383 #914 "datatable.dfdispatchwarn"="TRUE", # not a function argument "datatable.warnredundantby"="TRUE", # not a function argument "datatable.alloccol"="1024L", # argument 'n' of alloc.col. Over-allocate 1024 spare column slots diff --git a/inst/tests/mergelist.Rraw b/inst/tests/mergelist.Rraw new file mode 100644 index 000000000..693efa21d --- /dev/null +++ b/inst/tests/mergelist.Rraw @@ -0,0 +1,1031 @@ +require(methods) + +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + require(data.table) + test = data.table:::test + mergepair = data.table:::mergepair + perhaps.data.table = data.table:::perhaps.data.table + hasindex = data.table:::hasindex + fdistinct = data.table:::fdistinct + forderv = data.table:::forderv +} + +addresses = function(x) vapply(x, address, "") +copied = function(ans, l) { + all(!addresses(ans) %chin% unlist(recursive=FALSE, lapply(l, addresses))) +} +notcopied = function(ans, l, how="left", unless=character()) { + if (how %chin% unless) return(copied(ans, l)) ## used during looping tests for easier escape + if (how=="full") return( ## either side, left|right, notcopied is fine + all(addresses(l[[1L]]) %chin% addresses(ans)) || all(addresses(l[[length(l)]]) %chin% addresses(ans)) + ) + all(addresses(if (how=="right") l[[length(l)]] else l[[1L]]) %chin% addresses(ans)) +} + +# internal helpers + +test(1.01, perhaps.data.table(list())) +test(1.02, perhaps.data.table(list(a=1:2))) +test(1.03, perhaps.data.table(list(a=1:2, b=1:2))) +test(1.04, perhaps.data.table(list(1:2, 1:2)), FALSE) + +test(2.01, fdistinct(list(x=c(1L,1:2), b=1:2), on="x", mult="last"), error="must be data.table") +test(2.02, fdistinct(data.table(x=c(1L,1:2)), on="z", mult="last"), error="must be character column names of") +test(2.03, fdistinct(data.table(x=c(1L,1:2)), on="x", mult="last", cols=character()), error="must be non-zero length, non-NA, integer or character columns of") +test(2.04, fdistinct(data.table(x=c(1L,1:2, y=1:3)), on="x", mult="last", copy=NA), error="must be TRUE or FALSE") +d = data.table(x=1:2, y=1:2) +test(2.05, ans<-fdistinct(d, on="x", mult="last"), d) +test(2.06, intersect(addresses(ans), addresses(d)), character()) +test(2.07, ans<-fdistinct(d, on="x", mult="last", copy=FALSE), d) +test(2.08, addresses(ans), addresses(d)) +d = data.table(x=c(2:1,2L), y=1:3) +test(2.09, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2)) +test(2.10, fdistinct(d, on="x", mult="last"), data.table(x=1:2, y=2:3)) +setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x", retGrp=TRUE)) ## retGrp=T index #4386 +test(2.11, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2)) + +test(3.01, hasindex(d, "x")) +test(3.02, hasindex(d, "x", retGrp=TRUE)) +setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x")) ## retGrp=F index #4386 +test(3.03, hasindex(d, "x")) +test(3.04, !hasindex(d, "x", retGrp=TRUE)) +setattr(d, "index", NULL) +test(3.05, !hasindex(d, "x")) +test(3.06, !hasindex(d, "x", retGrp=TRUE)) +setattr(d, "index", integer()) +test(3.07, !hasindex(d, "x")) +test(3.08, !hasindex(d, "x", retGrp=TRUE)) +rm(d) + +# cbindlist + +l = list( + d1 = data.table(x=1:3, v1=1L), + d2 = data.table(y=3:1, v2=2L), + d3 = data.table(z=2:4, v3=3L) +) +ans = cbindlist(l) +expected = data.table(l$d1, l$d2, l$d3) +test(11.01, ans, expected) +test(11.02, intersect(addresses(ans), addresses(expected)), character()) +ans = cbindlist(l, copy=FALSE) +expected = setDT(c(l$d1, l$d2, l$d3)) +test(11.03, ans, expected) +test(11.04, length(intersect(addresses(ans), addresses(expected))), ncol(expected)) +test(11.05, cbindlist(list(data.table(a=1L), data.table(), data.table(d=2L), data.table(f=3L))), data.table(a=1L,d=2L,f=3L)) +rm(expected) +## codecov +test(12.01, cbindlist(data.frame(a=1L), data.frame(b=1L)), error="must be a list") +test(12.02, cbindlist(TRUE, FALSE), error="must be a list") +test(12.03, cbindlist(list(), NA), error="must be TRUE or FALSE") +test(12.04, cbindlist(list(data.table(a=1L), 1L)), error="is not of data.table type") +op = options(datatable.verbose=TRUE) +test(12.05, cbindlist(list(data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2), output="cbindlist.*took") +options(op) +test(12.06, cbindlist(list(data.table(), data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2)) +test(12.07, cbindlist(list(data.table(), data.table(a=1:2), list(b=1:2))), data.table(a=1:2, b=1:2)) +test(12.08, cbindlist(list(data.table(a=integer()), list(b=integer()))), data.table(a=integer(), b=integer())) +## duplicated names +test(12.09, cbindlist(list(data.table(a=1L, b=2L), data.table(b=3L, d=4L))), data.table(a=1L, b=2L, b=3L, d=4L)) +ans = cbindlist(list(setindexv(data.table(a=2:1, b=1:2),"a"), data.table(a=1:2, b=2:1, key="a"), data.table(a=2:1, b=1:2))) +test(12.10, ans, data.table(a=2:1, b=1:2, a=1:2, b=2:1, a=2:1, b=1:2)) +test(12.11, indices(ans), NULL) +## recycling, first ensure cbind recycling that we want to match to +test(12.12, cbind(data.table(x=integer()), data.table(a=1:2)), data.table(x=c(NA_integer_,NA), a=1:2)) +test(12.13, cbind(data.table(x=1L), data.table(a=1:2)), data.table(x=c(1L,1L), a=1:2)) +test(12.14, cbindlist(list(data.table(a=integer()), data.table(b=1:2))), error="recycling.*not yet implemented") +test(12.15, cbindlist(list(data.table(a=1L), data.table(b=1:2))), error="recycling.*not yet implemented") +test(12.16, cbindlist(list(data.table(a=integer()), data.table(b=1:2)), copy=FALSE), error="has to have equal nrow") +test(12.17, cbindlist(list(data.table(a=1L), data.table(b=1:2)), copy=FALSE), error="has to have equal nrow") + +## retain indices +d = data.table(x=1:2, y=2:1, z=2:1, v1=1:2) ## ensure setDT will retain key and indices when it is called on the list, bc Ccbindlist returns list +setkeyv(d, "x"); setindexv(d, list("y", "z")) +a = attributes(d) +attributes(d) = a[!names(a) %in% c("class",".internal.selfref","row.names")] +test(13.01, class(d), "list") +setDT(d) +test(13.02, key(d), "x") +test(13.03, hasindex(d, "y") && hasindex(d, "z")) +l = list( + data.table(id1=1:5, id2=5:1, id3=1:5, v1=1:5), + data.table(id4=5:1, id5=1:5, v2=1:5), + data.table(id6=5:1, id7=1:5, v3=1:5), + data.table(id8=5:1, id9=5:1, v4=1:5) +) +setkeyv(l[[1L]], "id1"); setindexv(l[[1L]], list("id1", "id2", "id3", c("id1","id2","id3"))); setindexv(l[[3L]], list("id6", "id7")); setindexv(l[[4L]], "id9") +ii = lapply(l, indices) +ans = cbindlist(l) +test(13.04, key(ans), "id1") +test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9")) +test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib + +# mergepair + +## test copy-ness argument in mergepair + +### LHS equal to RHS: no copy in all cases +num = 21.000 +l = list( + lhs = data.table(id1=1:2, v1=1:2), + rhs = data.table(id1=1:2, v2=1:2) +) +expected = data.table(id1=1:2, v1=1:2, v2=1:2) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected) ## copy=TRUE: no shared columns + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected) ## copy=FALSE: LHS shared but no RHS + test(num<-num+0.001, notcopied(ans, l, how=how)) + } +} +### RHS includes LHS: no copy in inner, left, right +num = 22.000 +unless = "full" +l = list( + lhs = data.table(id1=1:2, v1=1:2), + rhs = data.table(id1=1:3, v2=1:3) +) +expected = list( + inner = data.table(id1=1:2, v1=1:2, v2=1:2), + left = data.table(id1=1:2, v1=1:2, v2=1:2), + right = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3), + full = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### LHS includes RHS: no copy in left, right, full +num = 23.000 +unless = "inner" +l = list( + lhs = data.table(id1=1:3, v1=1:3), + rhs = data.table(id1=1:2, v2=1:2) +) +expected = list( + inner = data.table(id1=1:2, v1=1:2, v2=1:2), + left = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA)), + right = data.table(id1=1:2, v1=1:2, v2=1:2), + full = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA)) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### LHS single nonmatch RHS on both sides: no copy in left, right +num = 24.000 +unless = c("inner","full") +l = list( + lhs = data.table(id1=3:1, v1=1:3), + rhs = data.table(id1=c(4L,2:1), v2=1:3) +) +expected = list( + inner = data.table(id1=2:1, v1=2:3, v2=2:3), + left = data.table(id1=3:1, v1=1:3, v2=c(NA,2:3)), + right = data.table(id1=c(4L,2:1), v1=c(NA,2:3), v2=1:3), + full = data.table(id1=c(3:1,4L), v1=c(1:3,NA), v2=c(NA,2:3,1L)) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### LHS zero match RHS: no copy in left, right +num = 25.000 +unless = c("inner","full") +l = list( + lhs = data.table(id1=2:1, v1=1:2), + rhs = data.table(id1=3:4, v2=1:2) +) +expected = list( + inner = data.table(id1=integer(), v1=integer(), v2=integer()), + left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)), + right = data.table(id1=3:4, v1=c(NA_integer_,NA), v2=1:2), + full = data.table(id1=c(2:1,3:4), v1=c(1:2,NA,NA), v2=c(NA,NA,1:2)) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### LHS and RHS zero nrow: no copies +num = 26.000 +unless = character() +l = list( + lhs = data.table(id1=integer(), v1=integer()), + rhs = data.table(id1=integer(), v2=integer()) +) +expected = list( + inner = data.table(id1=integer(), v1=integer(), v2=integer()), + left = data.table(id1=integer(), v1=integer(), v2=integer()), + right = data.table(id1=integer(), v1=integer(), v2=integer()), + full = data.table(id1=integer(), v1=integer(), v2=integer()) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### LHS has zero nrow: no copies +num = 27.000 +unless = character() +l = list( + lhs = data.table(id1=integer(), v1=integer()), + rhs = data.table(id1=2:1, v2=1:2) +) +expected = list( + inner = data.table(id1=integer(), v1=integer(), v2=integer()), + left = data.table(id1=integer(), v1=integer(), v2=integer()), + right = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2), + full = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### RHS has zero nrow +num = 28.000 +unless = "inner" +l = list( + lhs = data.table(id1=2:1, v1=1:2), + rhs = data.table(id1=integer(), v2=integer()) +) +expected = list( + inner = data.table(id1=integer(), v1=integer(), v2=integer()), + left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)), + right = data.table(id1=integer(), v1=integer(), v2=integer()), + full = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} + +# mergelist + +## coverage +test(101.01, mergelist(data.table(x=1L), on="x"), error="must be a list") +test(101.02, mergelist(list(data.table(x=1L)), on="x", copy=NA), error="must be TRUE or FALSE") +test(101.03, mergelist(list(data.table(x=1L), data.table(x=1L)), how="cross", on="y"), error="cross join must be used with zero-length on, mult='all', join.many=TRUE") +test(101.04, mergelist(list(data.table(x=1L), list(x=1:2, y=1L)), on="x"), error="must be data.table objects") +l = list(d<-data.table(x=1:2)) +test(101.05, ans<-mergelist(l, on="x", how="left", mult="first"), d) +test(101.06, intersect(addresses(d), addresses(ans)), character()) +test(101.07, ans<-mergelist(l, on="x", how="left", mult="first", copy=FALSE), d) +test(101.08, addresses(d), addresses(ans)) +op = options("datatable.verbose"=TRUE) +test(101.09, mergelist(l, on="x"), d, output="mergelist.*1 table.*took") +options(op) +l = list(data.table(x=1:2), data.table(x=2:3)) +test(101.10, mergelist(l, on=character()), error="non-zero length character vector") +op = options("datatable.verbose"=TRUE) +test(101.11, mergelist(l, on="x"), data.table(x=1:2), output="mergelist.*2 tables.*took") +options(op) +test(101.12, mergelist(l, on="xx"), error="are not present in LHS") +test(101.13, mergelist(l, on="x", join.many=NA), error="must be TRUE or FALSE") +test(101.14, mergelist(list(data.table(a=1L), data.table(a=c(1L,1L))), on="a", mult="all"), data.table(a=c(1L,1L))) ## copyCols(, cols=integer()) +test(101.15, mergelist(list()), data.table()) +test(101.16, mergelist(list(data.table())), error="must be non-zero columns tables") +test(101.17, mergelist(list(data.table(), data.table())), error="must be non-zero columns tables") +test(101.18, mergelist(list(data.table(a=integer()), data.table(a=integer())), on="a"), data.table(a=integer())) +test(101.19, mergelist(list(data.table(a=1L), data.table(a=1L, b=1L, b=1L)), on="a"), error="have duplicated column names") +test(101.20, mergelist(list(data.table(a=1L, b=1L), data.table(a=1L, b=2L)), on="a"), error="merge result has duplicated column names") +test(101.21, mergelist(list(data.table(a=1L, b=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, character())), data.table(a=1L, b=1L)) +test(101.22, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, c("a",NA))), error="must be a list of non-zero length, non-NA, non-duplicated, character vectors") +test(101.23, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, c("a","a"))), error="must be a list of non-zero length, non-NA, non-duplicated, character vectors") +test(101.24, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", join.many=list(TRUE, TRUE)), error="must be TRUE or FALSE, or a list of such which length must be") +test(101.25, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", join.many=list(NA)), error="must be TRUE or FALSE, or a list of such which length must be") +test(101.26, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on=c("a","a")), error="non-NA, non-duplicated, character vector, or a list") +test(101.27, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on=c("a",NA)), error="non-NA, non-duplicated, character vector, or a list") +test(101.28, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L), data.table(a=1L)), on=list("a", c("a",NA))), error="non-NA, non-duplicated, character vector, or a list") +test(101.29, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L), data.table(a=1L)), on=list("a", NULL)), error="non-NA, non-duplicated, character vector, or a list") +test(101.30, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L), data.table(a=1L)), on=list("a", c("a","a"))), error="non-NA, non-duplicated, character vector, or a list") +test(101.31, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on=list("a","a")), error="non-NA, non-duplicated, character vector, or a list") +test(101.32, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, c("a","a"))), error="list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULL") +test(101.33, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, c("a",NA))), error="list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULL") +test(101.34, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL, c("a","a"))), error="list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULL") +test(101.35, mergelist(list(data.table(a=1L, d=1L), data.table(a=1L, b=2L)), on="a", cols=list(NULL)), error="must be same length") +test(101.36, mergelist(list(data.table(x=1L), data.table(y=1L)), how="cross", mult="first"), error="cross join must be used with zero-length on, mult='all', join.many=TRUE") +test(101.37, mergelist(list(data.table(x=1L), data.table(y=1L)), how="cross", join.many=FALSE), error="cross join must be used with zero-length on, mult='all', join.many=TRUE") +test(101.38, mergelist(list(data.table(x=1L), data.table(x=1L)), how="semi", on="x", mult="all"), error="semi and anti joins must be used with mult!='all'") +test(101.39, mergelist(list(data.table(x=1L), data.table(x=1L)), how="anti", on="x", mult="all"), error="semi and anti joins must be used with mult!='all'") +test(101.41, mergelist(list(data.table(id1=c(3:2,2L), v1=1:3), data.table(id1=c(2L,2:1), v2=1:3)), how="full", on="id1", mult="error"), error="multiple matches during merge") +test(101.42, mergelist(list(data.table(id1=3:2, v1=1:2), data.table(id1=c(2L,2:1), v2=1:3)), how="full", on="id1", mult="error"), error="multiple matches during merge") +test(101.43, mergelist(list(data.table(id1=c(3:2,2L), v1=1:3), data.table(id1=2:1, v2=2:3)), how="full", on="id1", mult="error"), error="multiple matches during merge") ## how="full" mult="error" confirm that second binary merge anti join 'bns', is covering both sides error, this test doesnt ensure about that but serves a data that has been used for debugging with extra 'cat("before|after bns\n")' +test(101.44, mergelist(list(data.table(x=1L), data.table(x=1L)), on="x", mult="bad"), error="'mult' must be one of [error, all, first, last] or NULL, or a list") +test(101.45, mergelist(list(data.table(x=1:2), data.table(x=1L, v2=1L)), on="x"), data.table(x=1:2, v2=c(1L,NA))) ## match.arg sets how="left", only when missing or NULL, otherwise we do own check +test(101.46, mergelist(list(data.table(x=1:2), data.table(x=1L, v2=1L)), on="x", how=NULL), data.table(x=1:2, v2=c(1L,NA))) ## still match.arg +test(101.47, mergelist(list(data.table(x=1L), data.table(x=1L)), on="x", how="bad"), error="'how' must be one of [left, inner, full, right, semi, anti, cross], or a list") +test(101.48, mergelist(list(data.table(x=1L), data.table(x=1L)), on="x", how=list("bad")), error="'how' must be one of [left, inner, full, right, semi, anti, cross], or a list") + +## missing on argument +l = list(data.table(x=1:2), data.table(x=2:3)) +test(102.01, mergelist(l, how="inner"), error="necessary key is not present") +test(102.02, mergelist(l, how="left"), error="necessary key is not present") +test(102.03, mergelist(l, how="right"), error="necessary key is not present") +test(102.04, mergelist(l, how="full"), error="necessary key is not present") +l = list(data.table(x=1:2, key="x"), data.table(x=2:3)) +test(102.11, mergelist(l, how="inner"), data.table(x=2L, key="x")) +test(102.12, mergelist(l, how="left"), error="necessary key is not present") +test(102.13, mergelist(l, how="right"), data.table(x=2:3)) +test(102.14, mergelist(l, how="full"), data.table(x=1:3)) +l = list(data.table(x=1:2), data.table(x=2:3, key="x")) +test(102.21, mergelist(l, how="inner"), data.table(x=2L)) +test(102.22, mergelist(l, how="left"), data.table(x=1:2)) +test(102.23, mergelist(l, how="right"), error="necessary key is not present") +test(102.24, mergelist(l, how="full"), data.table(x=1:3)) +l = list(data.table(x=1:2, key="x"), data.table(x=2:3, key="x")) +test(102.31, mergelist(l, how="inner"), data.table(x=2L, key="x")) ## ordered subset +test(102.32, mergelist(l, how="left"), data.table(x=1:2, key="x")) +test(102.33, mergelist(l, how="right"), data.table(x=2:3, key="x")) +test(102.34, mergelist(l, how="full"), data.table(x=1:3)) +l = list(data.table(x=1:2, y=1:2, z=1:2, zz=1:2, key=c("y","x","z","zz")), data.table(a=2:3, b=2:3, x=2:3, y=2:3, key=c("x","y","a"))) +test(102.41, mergelist(l, how="inner"), data.table(x=2L, y=2L, z=2L, zz=2L, a=2L, b=2L, key=c("y","x","z","zz"))) ## key len 2+ to take intersect, and align order, for inner and full +test(102.42, mergelist(l, how="left"), error="specify columns to join.*that are not present in LHS table") +test(102.43, mergelist(l, how="right"), error="specify columns to join.*that are not present in RHS table") +test(102.44, mergelist(l, how="full"), data.table(x=1:3, y=1:3, z=c(1:2,NA), zz=c(1:2,NA), a=c(NA,2:3), b=c(NA,2:3))) +l = list(data.table(a=1:2, x=1:2, key=c("x","a")), data.table(x=2:3, y=2:3, z=2:3, key=c("y","x","z"))) +test(102.51, mergelist(l, how="inner"), data.table(x=2L, a=2L, y=2L, z=2L, key=c("x","a"))) ## align order to shorter +test(102.52, mergelist(l, how="left"), error="specify columns to join.*that are not present in LHS table") +test(102.53, mergelist(l, how="right"), error="specify columns to join.*that are not present in RHS table") +test(102.54, mergelist(l, how="full"), data.table(x=1:3, a=c(1:2,NA), y=c(NA,2:3), z=c(NA,2:3))) +## missing on, cascade join fields +l = list( + fact = data.table(id=1:16, state_id=1:8, population=1, key="id"), + state = data.table(state_id=1:8, division_id=1:4, key="state_id"), + division = data.table(division_id=1:4, region_id=1:2, key="division_id"), + region = data.table(region_id=1:2, key="region_id") +) +ans1 = mergelist(l, how="left") +ans2 = mergelist(rev(l), how="right") +test(102.611, all.equal(ans1, ans2, ignore.col.order=TRUE)) +test(102.612, ans1, data.table(key="id", region_id=rep(1:2, 8), division_id=rep(1:4, 4), state_id=rep(1:8, 2), id=1:16, population=1)) +test(102.613, copied(ans1, l)) +test(102.614, copied(ans2, l)) +ans1 = mergelist(l, how="left", copy=FALSE) +ans2 = mergelist(rev(l), how="right", copy=FALSE) +test(102.621, all.equal(ans1, ans2, ignore.col.order=TRUE)) +test(102.622, ans1, data.table(key="id", region_id=rep(1:2, 8), division_id=rep(1:4, 4), state_id=rep(1:8, 2), id=1:16, population=1)) +test(102.623, notcopied(ans1, l)) +test(102.624, notcopied(ans2, rev(l), how="right")) +test(102.625, !notcopied(ans1, l, how="right")) ## test notcopied helper function rather than mergelist +test(102.626, !notcopied(ans2, rev(l), how="left")) ## test notcopied +l = list( ## duplicates on one level + fact = data.table(id=1:16, state_id=1:8, population=1, key="id"), + state = data.table(state_id=1:8, division_id=1:4, key="state_id"), + division = data.table(division_id=c(1:4,1:2), region_id=1:2, key="division_id"), + region = data.table(region_id=2:1, key="region_id") +) +test(102.631, mergelist(l), error="multiple matches during merge") +test(102.632, nrow(ans1<-mergelist(l, mult="all")), 24L) +test(102.633, mergelist(l, how="right"), error="are not present in RHS table") +test(102.634, mergelist(rev(l), how="right"), error="multiple matches during merge") +test(102.635, nrow(ans2<-mergelist(rev(l), how="right", mult="all")), 24L) +test(102.636, all.equal(ans1, ans2, ignore.col.order=TRUE)) +rm(ans1, ans2) +## on list +test(102.71, mergelist(list(data.table(x=1L, y=2L), data.table(a=1L, y=2L), data.table(a=1L, z=2L)), on=list("y","a")), data.table(a=1L, y=2L, x=1L, z=2L)) +test(102.72, mergelist(list(data.table(x=1L, y=2L), data.table(a=1L, y=2L, b=3L), data.table(a=1L, b=3L, z=2L)), on=list("y",c("a","b"))), data.table(a=1L, b=3L, y=2L, x=1L, z=2L)) +test(102.73, mergelist(list(data.table(x=1L, y=2L), data.table(a=1L, y=2L, b=3L), data.table(a=1L, b=3L, z=2L)), on=list("y",c("a","x"))), error="specify columns to join.*that are not present in RHS table") + +## cols argument +l = list(data.table(id1=1:2, v1=1:2, v2=2:1, key="id1"), data.table(id1=2:3, v3=1:2, v4=2:1, key="id1")) +test(103.01, mergelist(l, how="inner"), data.table(id1=2L, v1=2L, v2=1L, v3=1L, v4=2L, key="id1")) +test(103.02, mergelist(l, how="left"), data.table(id1=1:2, v1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1")) +test(103.03, mergelist(l, how="right"), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v3=1:2, v4=2:1, key="id1")) +test(103.04, mergelist(l, how="full"), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1))) +test(103.11, mergelist(l, how="inner", cols="v2"), error="must be a list") +test(103.12, mergelist(l, how="inner", cols=list("v2")), error="must be same length as") +test(103.13, mergelist(l, how="inner", cols=list("v2",2L)), error="must be a list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULL") +test(103.14, mergelist(l, how="inner", cols=list("v2","v5")), error="specify columns not present in corresponding table") +cols = list(c("v1","v2"), c("v3","v4")) +test(103.21, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v1=2L, v2=1L, v3=1L, v4=2L, key="id1")) +test(103.22, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1")) +test(103.23, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v3=1:2, v4=2:1, key="id1")) +test(103.24, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1))) +cols = list(NULL, c("v3","v4")) +test(103.25, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v1=2L, v2=1L, v3=1L, v4=2L, key="id1")) +test(103.26, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1")) +test(103.27, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v3=1:2, v4=2:1, key="id1")) +test(103.28, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1))) +cols = list(c("v1","v2"), NULL) +test(103.29, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v1=2L, v2=1L, v3=1L, v4=2L, key="id1")) +test(103.30, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1")) +test(103.31, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v3=1:2, v4=2:1, key="id1")) +test(103.32, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1))) +cols = list(NULL, NULL) +test(103.33, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v1=2L, v2=1L, v3=1L, v4=2L, key="id1")) +test(103.34, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1")) +test(103.35, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v3=1:2, v4=2:1, key="id1")) +test(103.36, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1))) +cols = list("v2", NULL) +test(103.41, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v2=1L, v3=1L, v4=2L, key="id1")) +test(103.42, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v2=2:1, v3=c(NA,1L), v4=c(NA,2L), key="id1")) +test(103.43, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v2=c(1L,NA), v3=1:2, v4=2:1, key="id1")) +test(103.44, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v2=c(2:1,NA), v3=c(NA,1:2), v4=c(NA,2:1))) +cols = list(NULL, "v4") +test(103.45, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v1=2L, v2=1L, v4=2L, key="id1")) +test(103.46, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v1=1:2, v2=2:1, v4=c(NA,2L), key="id1")) +test(103.47, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v1=c(2L,NA), v2=c(1L,NA), v4=2:1, key="id1")) +test(103.48, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v1=c(1:2,NA), v2=c(2:1,NA), v4=c(NA,2:1))) +cols = list("v2", "v4") +test(103.49, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v2=1L, v4=2L, key="id1")) +test(103.50, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v2=2:1, v4=c(NA,2L), key="id1")) +test(103.51, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v2=c(1L,NA), v4=2:1, key="id1")) +test(103.52, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v2=c(2:1,NA), v4=c(NA,2:1))) +cols = list(c("id1","v2"), c("id1","v4")) +test(103.61, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v2=1L, v4=2L, key="id1")) +test(103.62, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v2=2:1, v4=c(NA,2L), key="id1")) +test(103.63, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v2=c(1L,NA), v4=2:1, key="id1")) +test(103.64, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v2=c(2:1,NA), v4=c(NA,2:1))) +cols = list("id1", c("id1","v4")) +test(103.65, mergelist(l, how="inner", cols=cols), data.table(id1=2L, v4=2L, key="id1")) +test(103.66, mergelist(l, how="left", cols=cols), data.table(id1=1:2, v4=c(NA,2L), key="id1")) +test(103.67, mergelist(l, how="right", cols=cols), data.table(id1=2:3, v4=2:1, key="id1")) +test(103.68, mergelist(l, how="full", cols=cols), data.table(id1=1:3, v4=c(NA,2:1))) +cols = list("id1", "id1") +test(103.69, mergelist(l, how="inner", cols=cols), data.table(id1=2L, key="id1")) +test(103.70, mergelist(l, how="left", cols=cols), data.table(id1=1:2, key="id1")) +test(103.71, mergelist(l, how="right", cols=cols), data.table(id1=2:3, key="id1")) +test(103.72, mergelist(l, how="full", cols=cols), data.table(id1=1:3)) + +## join.many argument #4383 +d = function(n) as.data.table(list(x=rep(1L, n))) +l = list(fm=d(1), to=d(1)) +test(104.01, mergelist(l, on="x", how="left"), l$to[l$fm, on="x"]) +l = list(fm=d(2), to=d(1)) +test(104.02, mergelist(l, on="x", how="left"), l$to[l$fm, on="x"]) +test(104.03, mergelist(l, on="x", how="left", mult="error"), l$to[l$fm, on="x", mult="error"]) ## mult="error" has no effect +l = list(fm=d(1), to=d(2)) +test(104.04, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x"]) +test(104.05, mergelist(l, on="x", how="left"), error="multiple matches during merge") +test(104.06, l$to[l$fm, on="x", mult="error"], error="multiple matches during merge") +options(datatable.join.many=FALSE) +test(104.07, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x"]) ## covers !join.many && length(f__)==1L && len__==nrow(x) +options(datatable.join.many=TRUE) +l = list(fm=d(2), to=d(2)) +options(datatable.join.many=FALSE) +test(104.08, mergelist(l, on="x", how="left", mult="all"), error="many-to-many join") +test(104.09, l$to[l$fm, on="x"], error="many-to-many join") +options(datatable.join.many=TRUE) +test(104.10, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x"]) ## join in [ does not stop on cartesian product +l = list(fm=d(3), to=d(1)) +test(104.11, mergelist(l, on="x", how="left"), l$to[l$fm, on="x"]) +l = list(fm=d(1), to=d(3)) +test(104.12, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x"]) +test(104.13, mergelist(l, on="x", how="left"), error="multiple matches during merge") +test(104.14, l$to[l$fm, on="x", mult="error"], error="multiple matches during merge") +l = list(fm=d(3), to=d(2)) +options(datatable.join.many=FALSE) +test(104.15, mergelist(l, on="x", how="left", mult="all"), error="many-to-many join") +test(104.16, l$to[l$fm, on="x"], error="many-to-many join") +options(datatable.join.many=TRUE) +test(104.17, l$to[l$fm, on="x"], error="Check for duplicate key values") +test(104.18, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x", allow.cartesian=TRUE]) +l = list(fm=d(2), to=d(3)) +options(datatable.join.many=FALSE) +test(104.19, mergelist(l, on="x", how="left", mult="all"), error="many-to-many join") +test(104.20, l$to[l$fm, on="x"], error="many-to-many join") +options(datatable.join.many=TRUE) +test(104.21, l$to[l$fm, on="x"], error="Check for duplicate key values") +test(104.22, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x", allow.cartesian=TRUE]) +l = list(fm=d(3), to=d(3)) +options(datatable.join.many=FALSE) +test(104.23, mergelist(l, on="x", how="left", mult="all"), error="many-to-many join") +test(104.24, l$to[l$fm, on="x"], error="many-to-many join") +options(datatable.join.many=TRUE) +test(104.25, l$to[l$fm, on="x"], error="Check for duplicate key values") +test(104.26, mergelist(l, on="x", how="left", mult="all"), l$to[l$fm, on="x", allow.cartesian=TRUE]) +## join.many list +test(104.31, mergelist(list(data.table(id1=c(1L,1L), v1=1:2), data.table(id1=c(1L,1L), v2=1:2), data.table(id1=1L, v3=1L)), on="id1", mult="all", join.many=list(TRUE,FALSE)), data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2), v3=c(1L,1L,1L,1L))) +test(104.32, mergelist(list(data.table(id1=c(1L,1L), v1=1:2), data.table(id1=c(1L,1L), v2=1:2), data.table(id1=1L, v3=1L)), on="id1", mult="all", join.many=list(TRUE,TRUE)), data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2), v3=c(1L,1L,1L,1L))) +test(104.33, mergelist(list(data.table(id1=c(1L,1L), v1=1:2), data.table(id1=c(1L,1L), v2=1:2), data.table(id1=1L, v3=1L)), on="id1", mult="all", join.many=list(FALSE,TRUE)), error="Joining resulted in many-to-many join") +test(104.34, mergelist(list(data.table(id1=c(1L,1L), v1=1:2), data.table(id1=c(1L,1L), v2=1:2), data.table(id1=1L, v3=1L)), on="id1", mult="all", join.many=list(TRUE,NA)), error="must be TRUE or FALSE, or a list of such") + +## how list +test(105.01, mergelist(list(data.table(a=1:3, b=1:3), data.table(a=2L, d=1L), data.table(a=c(1:2,4L), f=1:3)), on="a", how=list("left","full")), data.table(a=1:4, b=c(1:3,NA), d=c(NA,1L,NA,NA), f=c(1:2,NA,3L))) +test(105.02, mergelist(list(data.table(a=1:3, b=1:3), data.table(a=2L, d=1L), data.table(a=c(1:2,4L), f=1:3)), on="a", how=list("left","inner")), data.table(a=1:2, b=1:2, d=c(NA,1L), f=1:2)) +Persons = data.table(PersonName=c("Alice","Bob","Charles"), key="PersonName") ## right outer join use case +Pets = data.table(PetName=c("Rover","Lassie","Fifi"), PersonName=c("Alice","Alice","Charles"), key="PetName") +PetAccessories = data.table(AccessoryName=c("Ball","Bone","Mouse"), PetName=c("Rover","Rover","Fifi"), key="AccessoryName") +expected = data.table(PetName=c("Rover","Rover",NA,"Fifi"), PersonName=c("Alice","Alice","Bob","Charles"), AccessoryName=c("Ball","Bone",NA,"Mouse")) +test(105.11, Pets[PetAccessories, on="PetName", nomatch=NULL][Persons, on="PersonName"], expected) ## test [.data.table +setcolorder(expected, "PersonName"); setkeyv(expected, "PersonName") ## ignore.row.order, ignore.col.order, check.attributes=FALSE +test(105.12, mergelist(list(Pets, PetAccessories, Persons), how=list("inner","right"), on=list("PetName","PersonName"), mult="all"), expected) + +## mult list +test(106.01, mergelist(list(data.table(a=1:2, b=1:2), data.table(a=c(1L,1L), d=1:2), data.table(a=c(1L,1L), f=1:2)), on="a", how="left", mult=list("last","first")), data.table(a=1:2, b=1:2, d=c(2L,NA), f=c(1L,NA))) +test(106.02, mergelist(list(data.table(a=1:2, b=1:2), data.table(a=c(1L,1L), d=1:2), data.table(a=c(1L,1L), f=1:2)), on="a", how="left", mult=list("last","error")), error="multiple matches during merge") +test(106.81, mergelist(list(data.table(a=1:2), data.table(b=1:2)), how="cross"), data.table(a=c(1L,1:2,2L), b=c(1:2,1:2))) ### mult default +test(106.82, mergelist(list(data.table(a=1:2), data.table(b=1:2), data.table(a=1:2, b=1:2)), how=list("cross","anti"), on=list(character(), c("a","b"))), data.table(a=1:2, b=2:1)) ## cool, isnt it? + +## semi join +l = list(data.table(x=c(1L,1L,1:2), y=c("a","a","a","b")), data.table(x=c(1L,1L), z=10:11)) +test(107.01, mergelist(l, how="semi", on="x", mult="first"), data.table(x=c(1L,1L,1L), y=c("a","a","a"))) +l = list(data.table(x=c(1L,3L,1:2,2L), y=c("a","c","a","b","b")), data.table(x=3:2, z=10:11)) +test(107.02, mergelist(l, how="semi", on="x", mult="first"), data.table(x=c(3:2,2L), y=c("c","b","b"))) ## rows order of x, not i +test(107.03, mergelist(list(data.table(id1=1:4, id2=4:1, v1=1L), data.table(id2=3:5, v2=2L)), on="id2", how="semi"), data.table(id1=1:2, id2=4:3, v1=1L)) ## columns order of x, not i +l = list(data.table(id=c(3L,1L,2L,1L,1L), g=c("A","A","B","B","A"), v=(1:5)*10), data.table(id=c(1L,1:3), g="A")) +test(107.11, mergelist(l, how="semi", on=c("id","g"), mult="first"), l[[1L]][ sort(unique(l[[1L]][l[[2L]], on=names(l[[2L]]), nomatch=0L, which=TRUE]))]) +test(107.81, mergelist(list(data.table(a=1:2), data.table(b=1:2), data.table(a=1:2, b=1:2)), how=list("cross","semi"), on=list(character(), c("a","b"))), data.table(a=1:2, b=1:2)) ### mult default + +## anti join +l = list(data.table(x=c(1L,1:2,2L), y=c("a","a","b","b")), data.table(x=c(1L,1L), z=10:11)) +test(108.01, mergelist(l, how="anti", on="x", mult="first"), data.table(x=c(2L,2L), y=c("b","b"))) +l = list(data.table(x=c(1L,3L,1:2,2L), y=c("a","c","a","b","b")), data.table(x=c(4L,1L), z=10:11)) +test(108.02, mergelist(l, how="anti", on="x", mult="first"), data.table(x=c(3:2,2L), y=c("c","b","b"))) ## rows order of x, not i +test(108.03, mergelist(list(data.table(id1=1:4, id2=4:1, v1=1L), data.table(id2=3:5, v2=2L)), on="id2", how="anti"), data.table(id1=3:4, id2=2:1, v1=1L)) ## columns order of x, not i + +## cross join +l = list(data.table(v1=1:2, v2=1:4), data.table(v3=1:3, v4=1:6)) +ans1 = mergelist(l, how="cross", mult="all") +l = list(data.table(v1=1:2, v2=1:4, k=1L), data.table(v3=1:3, v4=1:6, k=1L)) +ans2 = mergelist(l, how="inner", mult="all", on="k")[, "k":=NULL][] +ans3 = l[[2L]][l[[1L]], .(v1,v2,v3,v4), on="k", allow.cartesian=TRUE] +test(109.01, ans1, ans2) +test(109.02, ans1, ans3) +expected = data.table(v1=integer(), v2=integer(), v3=integer(), v4=integer()) +test(109.03, mergelist(list(data.table(v1=1:2, v2=1:4), data.table(v3=integer(), v4=integer())), how="cross", mult="all"), expected) +test(109.04, mergelist(list(data.table(v1=integer(), v2=integer()), data.table(v3=1:3, v4=1:6)), how="cross", mult="all"), expected) +test(109.05, mergelist(list(data.table(v1=integer(), v2=integer()), data.table(v3=integer(), v4=integer())), how="cross", mult="all"), expected) + +## retain index +l = list(data.table(id1=1:3, id2=c(2L,1L,2L), v1=1:3), data.table(id1=3:1, v2=1:3)) +setkeyv(l[[1L]], "id1"); setindexv(l[[1L]], "id2") +ans = mergelist(l, on="id1") +test(110.01, ans, data.table(id1=1:3, id2=c(2L,1L,2L), v1=1:3, v2=3:1, key="id1")) +test(110.02, copied(ans, l)) +test(110.03, hasindex(ans, "id2")) +ans = mergelist(l, on="id1", how="left", copy=FALSE) +test(110.04, ans, data.table(id1=1:3, id2=c(2L,1L,2L), v1=1:3, v2=3:1, key="id1")) +test(110.05, notcopied(ans, l)) +test(110.06, hasindex(ans, "id2")) +ans = mergelist(l, on="id1", how="full") +test(110.07, ans, data.table(id1=1:3, id2=c(2L,1L,2L), v1=1:3, v2=3:1, key="id1")) +test(110.08, hasindex(ans, "id2")) +l = list(data.table(id1=1:3, id2=c(2L,1L,2L), v1=1:3, key="id1"), data.table(id1=4:1, v2=1:4)) +test(110.09, !hasindex(mergelist(l, on="id1", how="full"), "id2")) ## no index because size changes +l = list(data.table(id1=integer(), v1=integer()), data.table(id1=1:2, id2=2:1, v2=1:2)) +setkeyv(l[[2L]], "id1"); setindexv(l[[2L]], "id2") +ans = mergelist(l, on="id1", how="full") +test(110.10, ans, data.table(id1=1:2, v1=c(NA_integer_,NA), id2=2:1, v2=1:2, key="id1")) +test(110.11, hasindex(ans, "id2")) +l = list(data.table(id1=3:1, v1=1:3), data.table(id1=1:3, id2=c(2L,1L,2L), v2=1:3)) +setkeyv(l[[2L]], "id1"); setindexv(l[[2L]], "id2") +ans = mergelist(l, on="id1", how="right") +test(110.12, ans, data.table(id1=1:3, v1=3:1, id2=c(2L,1L,2L), v2=1:3, key="id1")) +test(110.13, copied(ans, l)) +test(110.14, hasindex(ans, "id2")) +ans = mergelist(l, on="id1", how="right", copy=FALSE) +test(110.15, ans, data.table(id1=1:3, v1=3:1, id2=c(2L,1L,2L), v2=1:3, key="id1")) +test(110.16, notcopied(ans, l, how="right")) +test(110.17, hasindex(ans, "id2")) + +## 3+ tables mergelist + +### 3 tables +l = list(data.table(id1=3:1, v1=1:3), data.table(id1=2:1, v2=1:2), data.table(id1=3:2, v3=1:2)) +test(111.01, mergelist(l, on="id1", how="left"), data.table(id1=3:1, v1=1:3, v2=c(NA,1:2), v3=c(1:2,NA))) +test(111.02, mergelist(l, on="id1", how="inner"), data.table(id1=2L, v1=2L, v2=1L, v3=2L)) +test(111.03, mergelist(l, on="id1", how="right"), data.table(id1=3:2, v1=c(NA,2L), v2=c(NA,1L), v3=1:2)) +test(111.04, mergelist(l, on="id1", how="full"), data.table(id1=3:1, v1=1:3, v2=c(NA,1:2), v3=c(1:2,NA))) +test(111.05, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(111.06, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) + +## mergelist no duplicates + +### LHS equal to RHS +l = list(lhs = data.table(id1=1:2, v1=1:2), rhs = data.table(id1=1:2, v2=1:2)) +expected = data.table(id1=1:2, v1=1:2, v2=1:2) +test(121.01, mergelist(l, on="id1", how="inner", mult="all"), expected) +test(121.02, mergelist(l, on="id1", how="left", mult="all"), expected) +test(121.03, mergelist(l, on="id1", how="right", mult="all"), expected) +test(121.04, mergelist(l, on="id1", how="full", mult="all"), expected) +test(121.05, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.06, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### Single match +l = list(lhs = data.table(id1=1:2, v1=1:2), rhs = data.table(id1=c(1L,3L), v2=1:2)) +test(121.11, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=1L, v1=1L, v2=1L)) +test(121.12, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=1:2, v1=1:2, v2=c(1L,NA))) +test(121.13, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=c(1L,3L), v1=c(1L,NA), v2=1:2)) +test(121.14, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=1:3, v1=c(1:2,NA), v2=c(1L,NA,2L))) +test(121.15, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.16, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### Two matches +l = list(lhs = data.table(id1=1:3, v1=1:3), rhs = data.table(id1=2:4, v2=1:3)) +test(121.21, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=2:3, v1=2:3, v2=1:2)) +test(121.22, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=1:3, v1=1:3, v2=c(NA,1:2))) +test(121.23, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=2:4, v1=c(2:3,NA), v2=1:3)) +test(121.24, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=1:4, v1=c(1:3,NA), v2=c(NA,1:3))) +test(121.25, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.26, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### Zero match +l = list(lhs = data.table(id1=1:2, v1=1:2), rhs = data.table(id1=4:3, v2=1:2)) +test(121.31, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=integer(), v1=integer(), v2=integer())) +test(121.32, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=1:2, v1=1:2, v2=c(NA_integer_,NA))) +test(121.33, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=4:3, v1=c(NA_integer_,NA), v2=1:2)) +test(121.34, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=c(1:2,4:3), v1=c(1:2,NA,NA), v2=c(NA,NA,1:2))) +test(121.35, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.36, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### LHS within RHS +l = list(lhs = data.table(id1=1:4, v1=1:4), rhs = data.table(id1=3:2, v2=1:2)) +test(121.41, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=2:3, v1=2:3, v2=2:1)) +test(121.42, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=1:4, v1=1:4, v2=c(NA,2:1,NA))) +test(121.43, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=3:2, v1=3:2, v2=1:2)) +test(121.44, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=1:4, v1=1:4, v2=c(NA,2:1,NA))) +test(121.45, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.46, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### RHS within LHS +l = list(lhs = data.table(id1=3:2, v1=1:2), rhs = data.table(id1=1:4, v2=1:4)) +test(121.51, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=3:2, v1=1:2, v2=3:2)) +test(121.52, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=3:2, v1=1:2, v2=3:2)) +test(121.53, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=1:4, v1=c(NA,2:1,NA), v2=1:4)) +test(121.54, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=c(3:1,4L), v1=c(1:2,NA,NA), v2=c(3:1,4L))) +test(121.55, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.56, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### RHS zero rows +l = list(lhs = data.table(id1=3:2, v1=1:2), rhs = data.table(id1=integer(), v2=integer())) +test(121.61, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=integer(), v1=integer(), v2=integer())) +test(121.62, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=3:2, v1=1:2, v2=c(NA_integer_,NA))) +test(121.63, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=integer(), v1=integer(), v2=integer())) +test(121.64, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=3:2, v1=1:2, v2=c(NA_integer_,NA))) +test(121.65, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.66, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### LHS zero rows +l = list(lhs = data.table(id1=integer(), v1=integer()), rhs = data.table(id1=2:1, v2=1:2)) +test(121.71, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=integer(), v1=integer(), v2=integer())) +test(121.72, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=integer(), v1=integer(), v2=integer())) +test(121.73, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2)) +test(121.74, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2)) +test(121.75, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.76, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### LHS and RHS zero rows +l = list(lhs = data.table(id1=integer(), v1=integer()), rhs = data.table(id1=integer(), v2=integer())) +expected = data.table(id1=integer(), v1=integer(), v2=integer()) +test(121.81, mergelist(l, on="id1", how="inner", mult="all"), expected) +test(121.82, mergelist(l, on="id1", how="left", mult="all"), expected) +test(121.83, mergelist(l, on="id1", how="right", mult="all"), expected) +test(121.84, mergelist(l, on="id1", how="full", mult="all"), expected) +test(121.85, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(121.86, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) + +## mergelist duplicate matches, see sqlite.Rraw for tests vs SQLite db + +### duplicates in RHS and LHS matched in both sides +num = 221.00 +l = list(lhs = data.table(id1=c(1:3,3L), v1=1:4), rhs = data.table(id1=c(1L,1L,3:4), v2=1:4)) +expected = list(inner = list( + all = data.table(id1=c(1L,1L,3L,3L), v1=c(1L,1L,3L,4L), v2=c(1:3,3L)), + first = data.table(id1=c(1L,3L), v1=c(1L,3L), v2=c(1L,3L)), + last = data.table(id1=c(1L,3L), v1=c(1L,4L), v2=2:3), + error = NULL +), left = list( + all = data.table(id1=c(1L,1:3,3L), v1=c(1L,1:4), v2=c(1:2,NA,3L,3L)), + first = data.table(id1=c(1:3,3L), v1=1:4, v2=c(1L,NA,3L,3L)), + last = data.table(id1=c(1:3,3L), v1=1:4, v2=c(2L,NA,3L,3L)), + error = NULL +), right = list( + all = data.table(id1=c(1L,1L,3L,3:4), v1=c(1L,1L,3:4,NA), v2=c(1:3,3:4)), + first = data.table(id1=c(1L,1L,3:4), v1=c(1L,1L,3L,NA), v2=1:4), + last = data.table(id1=c(1L,1L,3:4), v1=c(1L,1L,4L,NA), v2=1:4), + error = NULL +), full = list( + all = data.table(id1=c(1L,1:3,3:4), v1=c(1L,1:4,NA), v2=c(1:2,NA,3L,3:4)), + first = data.table(id1=1:4, v1=c(1:3,NA), v2=c(1L,NA,3:4)), + last = data.table(id1=1:4, v1=c(1:2,4L,NA), v2=c(2L,NA,3:4)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +## duplicates in RHS +num = 222.00 +l = list(lhs = data.table(id1=1:2, v1=1:2), rhs = data.table(id1=c(2L,2:3), v2=1:3)) +expected = list(inner = list( + all = data.table(id1=c(2L,2L), v1=c(2L,2L), v2=1:2), + first = data.table(id1=2L, v1=2L, v2=1L), + last = data.table(id1=2L, v1=2L, v2=2L), + error = NULL +), left = list( + all = data.table(id1=c(1:2,2L), v1=c(1:2,2L), v2=c(NA,1:2)), + first = data.table(id1=1:2, v1=1:2, v2=c(NA,1L)), + last = data.table(id1=1:2, v1=1:2, v2=c(NA,2L)), + error = NULL +), right = list( + all = data.table(id1=c(2L,2:3), v1=c(2L,2L,NA), v2=1:3), + first = data.table(id1=c(2L,2:3), v1=c(2L,2L,NA), v2=1:3), + last = data.table(id1=c(2L,2:3), v1=c(2L,2L,NA), v2=1:3), + error = data.table(id1=c(2L,2:3), v1=c(2L,2L,NA), v2=1:3) +), full = list( + all = data.table(id1=c(1:2,2:3), v1=c(1:2,2L,NA), v2=c(NA,1:3)), + first = data.table(id1=c(1:2,3L), v1=c(1:2,NA), v2=c(NA,1L,3L)), + last = data.table(id1=c(1:2,3L), v1=c(1:2,NA), v2=c(NA,2:3)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +### duplicates in LHS +num = 223.00 +l = list(lhs = data.table(id1=c(1:2,2L), v1=1:3), rhs = data.table(id1=2:3, v2=1:2)) +expected = list(inner = list( + all = data.table(id1=c(2L,2L), v1=2:3, v2=c(1L,1L)), + first = data.table(id1=2L, v1=2L, v2=1L), + last = data.table(id1=2L, v1=3L, v2=1L), + error = NULL +), left = list( + all = data.table(id1=c(1:2,2L), v1=1:3, v2=c(NA,1L,1L)), + first = data.table(id1=c(1:2,2L), v1=1:3, v2=c(NA,1L,1L)), + last = data.table(id1=c(1:2,2L), v1=1:3, v2=c(NA,1L,1L)), + error = data.table(id1=c(1:2,2L), v1=1:3, v2=c(NA,1L,1L)) +), right = list( + all = data.table(id1=c(2L,2:3), v1=c(2:3,NA), v2=c(1L,1:2)), + first = data.table(id1=2:3, v1=c(2L,NA), v2=1:2), + last = data.table(id1=2:3, v1=c(3L,NA), v2=1:2), + error = NULL +), full = list( + all = data.table(id1=c(1:2,2:3), v1=c(1:3,NA), v2=c(NA,1L,1:2)), + first = data.table(id1=1:3, v1=c(1:2,NA), v2=c(NA,1:2)), + last = data.table(id1=1:3, v1=c(1L,3L,NA), v2=c(NA,1:2)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +### duplicates in RHS and LHS, some RHS dups does not have matches in LHS +num = 224.00 +l = list(lhs = data.table(id1=c(1:3,3L), v1=1:4), rhs = data.table(id1=c(1L,1L,3:4,4L), v2=1:5)) +expected = list(inner = list( + all = data.table(id1=c(1L,1L,3L,3L), v1=c(1L,1L,3L,4L), v2=c(1:3,3L)), + first = data.table(id1=c(1L,3L), v1=c(1L,3L), v2=c(1L,3L)), + last = data.table(id1=c(1L,3L), v1=c(1L,4L), v2=2:3), + error = NULL +), left = list( + all = data.table(id1=c(1L,1:3,3L), v1=c(1L,1:4), v2=c(1:2,NA,3L,3L)), + first = data.table(id1=c(1:3,3L), v1=1:4, v2=c(1L,NA,3L,3L)), + last = data.table(id1=c(1:3,3L), v1=1:4, v2=c(2L,NA,3L,3L)), + error = NULL +), right = list( + all = data.table(id1=c(1L,1L,3L,3L,4L,4L), v1=c(1L,1L,3L,4L,NA,NA), v2=c(1:3,3:5)), + first = data.table(id1=c(1L,1L,3L,4L,4L), v1=c(1L,1L,3L,NA,NA), v2=1:5), + last = data.table(id1=c(1L,1L,3L,4L,4L), v1=c(1L,1L,4L,NA,NA), v2=1:5), + error = NULL +), full = list( + all = data.table(id1=c(1L,1:3,3:4,4L), v1=c(1L,1:4,NA,NA), v2=c(1:2,NA,3L,3:5)), + first = data.table(id1=1:4, v1=c(1:3,NA), v2=c(1L,NA,3:4)), + last = data.table(id1=1:4, v1=c(1:2,4L,NA), v2=c(2L,NA,3L,5L)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +### duplicates in RHS and LHS, some LHS dups does not have matches in RHS +num = 225.00 +l = list(lhs = data.table(id1=c(1L,1L,3:4,4L), v1=1:5), rhs = data.table(id1=c(1:3,3L), v2=1:4)) +expected = list(inner = list( + all = data.table(id1=c(1L,1L,3L,3L), v1=c(1:3,3L), v2=c(1L,1L,3:4)), + first = data.table(id1=c(1L,3L), v1=c(1L,3L), v2=c(1L,3L)), + last = data.table(id1=c(1L,3L), v1=2:3, v2=c(1L,4L)), + error = NULL +), left = list( + all = data.table(id1=c(1L,1L,3L,3L,4L,4L), v1=c(1:3,3:5), v2=c(1L,1L,3L,4L,NA,NA)), + first = data.table(id1=c(1L,1L,3L,4L,4L), v1=1:5, v2=c(1L,1L,3L,NA,NA)), + last = data.table(id1=c(1L,1L,3L,4L,4L), v1=1:5, v2=c(1L,1L,4L,NA,NA)), + error = NULL +), right = list( + all = data.table(id1=c(1L,1:3,3L), v1=c(1:2,NA,3L,3L), v2=c(1L,1:4)), + first = data.table(id1=c(1:3,3L), v1=c(1L,NA,3L,3L), v2=1:4), + last = data.table(id1=c(1:3,3L), v1=c(2L,NA,3L,3L), v2=1:4), + error = NULL +), full = list( + all = data.table(id1=c(1L,1L,3L,3:4,4L,2L), v1=c(1:3,3:5,NA), v2=c(1L,1L,3:4,NA,NA,2L)), + first = data.table(id1=c(1L,3:4,2L), v1=c(1L,3:4,NA), v2=c(1L,3L,NA,2L)), + last = data.table(id1=c(1L,3:4,2L), v1=c(2:3,5L,NA), v2=c(1L,4L,NA,2L)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +### cartesian match, dups on both sides of match +num = 226.00 +l = list(lhs = data.table(id1=c(1L,1:2), v1=1:3), rhs = data.table(id1=c(1L,1L,3L), v2=1:3)) +expected = list(inner = list( + all = data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2)), + first = data.table(id1=1L, v1=1L, v2=1L), + last = data.table(id1=1L, v1=2L, v2=2L), + error = NULL +), left = list( + all = data.table(id1=c(1L,1L,1L,1L,2L), v1=c(1L,1L,2L,2L,3L), v2=c(1:2,1:2,NA)), + first = data.table(id1=c(1L,1:2), v1=1:3, v2=c(1L,1L,NA)), + last = data.table(id1=c(1L,1:2), v1=1:3, v2=c(2L,2L,NA)), + error = NULL +), right = list( + all = data.table(id1=c(1L,1L,1L,1L,3L), v1=c(1:2,1:2,NA), v2=c(1L,1:2,2:3)), + first = data.table(id1=c(1L,1L,3L), v1=c(1L,1L,NA), v2=1:3), + last = data.table(id1=c(1L,1L,3L), v1=c(2L,2L,NA), v2=1:3), + error = NULL +), full = list( + all = data.table(id1=c(1L,1L,1L,1:3), v1=c(1L,1:2,2:3,NA), v2=c(1:2,1:2,NA,3L)), + first = data.table(id1=1:3, v1=c(1L,3L,NA), v2=c(1L,NA,3L)), + last = data.table(id1=1:3, v1=c(2L,3L,NA), v2=c(2L,NA,3L)), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} +### cross join duplicates +num = 227.00 +l = list(lhs = data.table(id1=c(1L,1L), v1=1:2), rhs = data.table(id1=c(1L,1L), v2=1:2)) +expected = list(inner = list( + all = data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2)), + first = data.table(id1=1L, v1=1L, v2=1L), + last = data.table(id1=1L, v1=2L, v2=2L), + error = NULL +), left = list( + all = data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2)), + first = data.table(id1=c(1L,1L), v1=1:2, v2=c(1L,1L)), + last = data.table(id1=c(1L,1L), v1=1:2, v2=c(2L,2L)), + error = NULL +), right = list( + all = data.table(id1=c(1L,1L,1L,1L), v1=c(1:2,1:2), v2=c(1L,1:2,2L)), + first = data.table(id1=c(1L,1L), v1=c(1L,1L), v2=1:2), + last = data.table(id1=c(1L,1L), v1=c(2L,2L), v2=1:2), + error = NULL +), full = list( + all = data.table(id1=c(1L,1L,1L,1L), v1=c(1L,1:2,2L), v2=c(1:2,1:2)), + first = data.table(id1=1L, v1=1L, v2=1L), + last = data.table(id1=1L, v1=2L, v2=2L), + error = NULL +)) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + if (is.null(expected[[how]][[mult]])) { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), error="multiple matches during merge") + } else { + test(num<-num+0.001, mergelist(l, on="id1", how=how, mult=mult), expected[[how]][[mult]]) + } + } +} + +## NAs in join columns + +### LHS equal to RHS and having NA on +l = list(lhs = data.table(id1=c(1:2,NA), v1=1:3), rhs = data.table(id1=c(1:2,NA), v2=1:3)) +expected = data.table(id1=c(1:2,NA), v1=1:3, v2=1:3) +test(251.01, mergelist(l, on="id1", how="inner", mult="all"), expected) +test(251.02, mergelist(l, on="id1", how="left", mult="all"), expected) +test(251.03, mergelist(l, on="id1", how="right", mult="all"), expected) +test(251.04, mergelist(l, on="id1", how="full", mult="all"), expected) +test(251.05, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(251.06, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) +### Single match and RHS having NA on +l = list(lhs = data.table(id1=1:2, v1=1:2), rhs = data.table(id1=c(1L,NA,3L), v2=1:3)) +test(251.11, mergelist(l, on="id1", how="inner", mult="all"), data.table(id1=1L, v1=1L, v2=1L)) +test(251.12, mergelist(l, on="id1", how="left", mult="all"), data.table(id1=1:2, v1=1:2, v2=c(1L,NA))) +test(251.13, mergelist(l, on="id1", how="right", mult="all"), data.table(id1=c(1L,NA,3L), v1=c(1L,NA,NA), v2=1:3)) +test(251.14, mergelist(l, on="id1", how="full", mult="all"), data.table(id1=c(1:2,NA,3L), v1=c(1:2,NA,NA), v2=c(1L,NA,2:3))) +test(251.15, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(251.16, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) + +## exceeds overalloc for a table + +ac = getOption("datatable.alloccol") +l = list(lhs = as.data.table(c(list(id1=integer()), setNames(replicate(ac+10L,integer()), paste0("v",seq_len(ac+10L))))), rhs=data.table(id1=2:1)) +test(291.01, mergelist(l, on="id1", how="inner", mult="all"), l$lhs[0L]) +test(291.02, mergelist(l, on="id1", how="left", mult="all"), l$lhs[0L]) +test(291.03, mergelist(l, on="id1", how="right", mult="all"), l$lhs[1:2][, "id1" := 2:1][]) +test(291.04, mergelist(l, on="id1", how="full", mult="all"), rbindlist(l, use.names=TRUE, fill=TRUE)) ## test overalloc for how=="full" && !nrow(out.i) && nrow(out.r) && length(add<-setdiff(names(out.i), names(out.r))) that was failing when used set() +test(291.05, copied(mergelist(l, on="id1", how="left", mult="error", copy=TRUE), l)) +test(291.06, notcopied(mergelist(l, on="id1", how="left", mult="error", copy=FALSE), l)) + +## fdistinct, another round + +dt = data.table(x = +c(74L, 103L, 158L, 250L, 56L, 248L, 260L, 182L, 174L, 17L, 57L, + 49L, 189L, 106L, 212L, 137L, 198L, 273L, 105L, 214L, 258L, 59L, + 180L, 35L, 74L, 107L, 4L, 106L, 240L, 94L, 133L, 165L, 136L, + 52L, 228L, 184L, 219L, 30L, 200L, 114L, 226L, 178L, 216L, 153L, + 146L, 218L, 7L, 132L, 202L, 191L, 132L, 237L, 121L, 68L, 20L, + 28L, 87L, 143L, 183L, 112L, 252L, 81L, 127L, 92L, 179L, 71L, + 132L, 211L, 24L, 241L, 94L, 231L, 96L, 92L, 131L, 246L, 238L, + 108L, 214L, 265L, 120L, 196L, 110L, 90L, 209L, 56L, 196L, 34L, + 68L, 40L, 66L, 17L, 177L, 241L, 215L, 220L, 126L, 113L, 223L, + 167L, 181L, 98L, 75L, 273L, 175L, 59L, 36L, 132L, 255L, 165L, + 269L, 202L, 99L, 119L, 41L, 4L, 197L, 29L, 123L, 177L, 273L, + 137L, 134L, 48L, 208L, 125L, 141L, 58L, 63L, 164L, 159L, 22L, + 10L, 177L, 256L, 165L, 155L, 145L, 271L, 140L, 188L, 166L, 66L, + 71L, 201L, 125L, 49L, 206L, 29L, 238L, 170L, 154L, 91L, 125L, + 138L, 50L, 146L, 21L, 77L, 59L, 79L, 247L, 123L, 215L, 243L, + 114L, 18L, 93L, 200L, 93L, 174L, 232L, 236L, 108L, 105L, 247L, + 178L, 204L, 167L, 249L, 81L, 53L, 244L, 139L, 242L, 53L, 209L, + 200L, 260L, 151L, 196L, 107L, 28L, 256L, 78L, 163L, 31L, 232L, + 88L, 216L, 74L, 61L, 143L, 74L, 50L, 143L, 155L, 36L, 71L, 198L, + 265L, 28L, 210L, 261L, 226L, 85L, 179L, 263L, 263L, 94L, 73L, + 46L, 89L, 141L, 255L, 141L, 71L, 13L, 115L, 235L, 96L, 37L, 103L, + 174L, 108L, 190L, 190L, 153L, 119L, 125L, 85L, 160L, 251L, 40L, + 115L, 59L, 118L, 37L, 127L, 260L, 210L, 257L, 130L, 166L, 134L, + 30L, 69L, 138L, 103L, 258L, 145L, 88L, 77L, 217L, 194L, 46L, + 18L, 208L, 171L, 47L, 18L, 30L, 105L, 47L, 83L) +) +ans = unique(dt, by="x") +test(301.01, data.table(x=unique(dt$x)), ans) ## OK +test(301.02, fdistinct(dt, on="x"), ans) ## force sort=TRUE for the moment + +## SQLite reference tests can be launched via + +### Rscript -e inst/tests/sqlite.Rraw.manual diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index eb3e461f7..33f1dc2eb 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -1,4 +1,4 @@ -pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf", "nanotime", "R.utils", "yaml") +pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf", "nanotime", "R.utils", "yaml", "DBI", "RSQLite") # First expression of this file must be as above: .gitlab-ci.yml uses parse(,n=1L) to read one expression from this file and installs pkgs. # So that these dependencies of other.Rraw are maintained in a single place. # TEST_DATA_TABLE_WITH_OTHER_PACKAGES is off by default so this other.Rraw doesn't run on CRAN. It is run by GLCI, locally in dev, and by @@ -761,3 +761,297 @@ if (loaded[["dplyr"]]) { DT = data.table(a = 1, b = 2, c = '1,2,3,4', d = 4) test(30, DT[, c := strsplit(c, ',', fixed = TRUE) %>% lapply(as.integer) %>% as.list]$c, list(1:4)) # nolint: pipe_call_linter. Mimicking MRE as filed. } + +# NB: currently, RSQLite requires DBI, so partially redundant, but future-proof. +if (loaded[["DBI"]] && loaded[["RSQLite"]]) { + # mergelist join tester vs SQLite, based on v1.9.8 non-equi join tester + + # funs ---- + + # produce SQL statement + # ln, rn: lhs names, rhs names, symmult: symmetric mult + mult_all = function(tbl, cols, ...) sprintf( + "(\n SELECT %s FROM %s\n) %s", + paste(setdiff(cols,"row_id"), collapse=", "), tbl, tbl + ) + mult_one = function(tbl, cols, on, mult) sprintf( + "(SELECT %s FROM (\n SELECT *, ROW_NUMBER() OVER (PARTITION BY %s ORDER BY row_id %s) AS rownum FROM %s\n) %s WHERE rownum=1) %s", + paste(setdiff(cols,c("row_id","rownum")), collapse=", "), + paste(on, collapse=", "), + if (mult=="first") "ASC" else "DESC", + tbl, tbl, tbl + ) + sql = function(how, on, mult, ln, rn, symmult=FALSE, notjoin=FALSE) { + stopifnot(length(on)==1L) + # building sql query + if (how=="full") { + return(sprintf( + "%s\nUNION ALL\n%s", + sql("left", on, mult, ln, rn, symmult=mult%in%c("first","last")), + sql("right", on, mult, ln, rn, symmult=mult%in%c("first","last"), notjoin=TRUE) + )) + } + nm = list() + nm[["lhs"]] = ln; nm[["rhs"]] = rn + using = sprintf("USING (%s)", paste(on, collapse=", ")) + lhs = "lhs"; rhs = "rhs" + join = if (how=="inner") { + if (mult=="all") sprintf("%s\nINNER JOIN\n%s\n%s", mult_all(lhs, nm[[lhs]]), mult_all(rhs, nm[[rhs]]), using) + else sprintf("%s\nINNER JOIN\n%s\n%s", mult_one(lhs, nm[[lhs]], on, mult), mult_one(rhs, nm[[rhs]], on, mult), using) + } else if (how=="left") { + if (mult=="all") sprintf("%s\nLEFT JOIN\n%s\n%s", mult_all(lhs, nm[[lhs]]), mult_all(rhs, nm[[rhs]]), using) + else sprintf("%s\nLEFT JOIN\n%s\n%s", (if (symmult) mult_one else mult_all)(lhs, nm[[lhs]], on, mult), mult_one(rhs, nm[[rhs]], on, mult), using) + } else if (how=="right") { ## lhs-rhs swap happens here, mult_one is applied on new rhs + if (mult=="all") sprintf("%s\nLEFT JOIN\n%s\n%s", mult_all(rhs, nm[[rhs]]), mult_all(lhs, nm[[lhs]]), using) + else sprintf("%s\nLEFT JOIN\n%s\n%s", (if (symmult) mult_one else mult_all)(rhs, nm[[rhs]], on, mult), mult_one(lhs, nm[[lhs]], on, mult), using) + } + if (how=="right") {lhs = "rhs"; rhs = "lhs"} ## this name swap is for notjoin and select below + where = if (!notjoin) "" else sprintf("\nWHERE %s IS NULL", paste(rhs, on, sep=".")) + select = sprintf("%s, %s, %s", paste(lhs, on, sep="."), + paste("lhs", setdiff(nm[["lhs"]], c("row_id",on)),sep=".",collapse=", "), + paste("rhs", setdiff(nm[["rhs"]], c("row_id",on)),sep=".",collapse=", ")) + sprintf("SELECT %s FROM\n%s%s", select, join, where) + } + + # .conn SQLite connection, if provided it will use it instead of creating temporary one + # .drop logical TRUE (default) will drop db tables before and after and populate new, when FALSE it expects tables to be populated + join.sql.equal = function(l, on, how="inner", mult="all", allow.cartesian=TRUE, .conn, .drop=TRUE, .debug=interactive(), ans, err=FALSE) { + nm = names(l) + stopifnot(is.null(nm) || identical(nm, c("x","i")) || identical(nm, c("lhs","rhs"))) + names(l) = c("lhs","rhs") + lhs = l[["lhs"]]; rhs = l[["rhs"]] + stopifnot(is.data.table(lhs), is.data.table(rhs), + is.character(how), is.character(mult), length(mult)==1L, + is.character(on), + is.logical(allow.cartesian), is.logical(.drop)) + if (err && mult=="error") { + dt = try(silent=TRUE, mergelist(list(lhs, rhs), on=on, how=how, mult=mult)) + if (!inherits(dt, "try-error")) { + if (.debug) browser() + stop("no error returned from mergelist(mult='error') but err flag set to TRUE in join.sql.equal") + } + err_msg = "mult='error' and multiple matches during merge" + if (!identical(attr(dt, "condition", TRUE)[["message"]], err_msg)) { + if (.debug) browser() + stop("different error returned than expected: ", attr(dt, "condition", TRUE)[["message"]]) + } + return(TRUE) + } + # row_id column required as SQL is not ordered, creating on R side + if (!"row_id" %in% names(lhs)) lhs = copy(lhs)[, "row_id" := seq_len(.N)] + if (!"row_id" %in% names(rhs)) rhs = copy(rhs)[, "row_id" := seq_len(.N)] + # preparing sql environment + conn = if (new.conn <- missing(.conn)) DBI::dbConnect(RSQLite::SQLite()) else .conn + if (.drop) { + try(suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE lhs;")), silent=TRUE) + try(suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE rhs;")), silent=TRUE) + DBI::dbWriteTable(conn, name="lhs", value=lhs) + DBI::dbWriteTable(conn, name="rhs", value=rhs) + } + # building sql query + s = sql(how, on, mult, names(lhs), names(rhs)) + s = paste0(s,";\n") + # run data.table and SQLite + dt = mergelist(list(lhs[,!"row_id"], rhs[,!"row_id"]), on=on, how=how, mult=mult) + sq = try(silent=TRUE, as.data.table(DBI::dbGetQuery(conn, s))) + if (inherits(sq, "try-error")) { + if (.debug) {message("error during sql statement"); browser()} + stop("error during sql statement") + } + if (!is.data.table(dt) || !is.data.table(sq)) { + if (.debug) {message("dt and sq must be data.table already"); browser()} + stop("dt and sq must be data.table already") + } + if (how %in% c("inner","full")) { + dt2 = mergelist(list(rhs[,!"row_id"], lhs[,!"row_id"]), on=on, how=how, mult=mult) + setcolorder(dt2, neworder=names(dt)) + setattr(dt, "index", integer()) + setattr(dt2, "index", integer()) + r = all.equal(dt, dt2, ignore.row.order=TRUE) + ## check it is symetric + if (!isTRUE(r)) { + if (.debug) {message("mergelist is not symmetric for ", how); browser()} + stop("mergelist is not symmetric for ", how) + } + } + setattr(sq, "index", integer()) + setattr(dt, "index", integer()) + # compare results + a = all.equal(dt, sq, ignore.row.order=TRUE) + b = all.equal(dt, sq, ignore.row.order=TRUE, ignore.col.order=TRUE) + if (!missing(ans)) { + r = all.equal(ans, sq, ignore.row.order=TRUE) + if (!isTRUE(r)) { + if (.debug) browser() + stop("sql does not match to reference answer") + } + } + if (.drop) { + suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE lhs;")) + suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE rhs;")) + } + if (new.conn) suppressWarnings(DBI::dbDisconnect(conn)) + if (isTRUE(b) && !isTRUE(a)) { + if (.debug) browser() + stop("only column order mismatch") + } + if (!isTRUE(a)) { + if (.debug) browser() + cat(sep="\n",c( + sprintf("# dtq:\nmergelist(l, on='%s', how='%s', mult='%s')", paste(on, collapse=", "), how, mult), + sprintf("# sql:\n%s", s), + a, "\n")) + } + isTRUE(a) + } + + batch.join.sql.equal = function(cases, on, hows=c("inner","left","right","full"), mults=c("all","first","last"), .debug=FALSE) { + if ("error" %in% mults) stop("mult=error is not supported") + p = proc.time()[[3L]] + conn = DBI::dbConnect(RSQLite::SQLite()) + ans = list() + dup_n = 0L + for (case in cases) { + l = data(case) + stopifnot(c("lhs","rhs") %in% names(l)) + case = as.character(case) + lhs = l$lhs; rhs = l$rhs + ans[[case]] = list() + # reuse tables, to test if affects sqlite efficiency + try(suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE lhs;")), silent = TRUE) + try(suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE rhs;")), silent = TRUE) + # row_id column required as SQL is not ordered, creating on R side + if (!"row_id" %in% names(lhs)) lhs = copy(lhs)[, "row_id" := seq_len(.N)] + if (!"row_id" %in% names(rhs)) rhs = copy(rhs)[, "row_id" := seq_len(.N)] + DBI::dbWriteTable(conn, name="lhs", value=lhs) + DBI::dbWriteTable(conn, name="rhs", value=rhs) + len = prod(length(cases), length(hows), length(mults)) + if (len > (len.warn <- getOption("tests.length.warning", 1e3))) + warning(sprintf("You are about to run %s number of tests. To suppress this warning use 'tests.length.warning' option, set to numeric threshold or Inf.", len.warn)) + for (how in hows) { + ans[[case]][[how]] = list() + for (mult in mults) { + if (!is.null(ans[[case]][[how]][[mult]])) { + dup_n = dup_n+1L + next #warning("Some tests are duplicated, so far ", dup_n) + } + ans[[case]][[how]][[mult]] = join.sql.equal(list(lhs=lhs, rhs=rhs), on=on, how=how, mult=mult, .conn=conn, .drop=FALSE, .debug=.debug) + } + } + suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE lhs;")) + suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE rhs;")) + } + suppressWarnings(DBI::dbDisconnect(conn)) + cat(sprintf("batch.join.sql.equal: %s%s tests completed in %.1fs\n", + len, if (dup_n) sprintf(" (%s duplicated)", dup_n) else "", proc.time()[[3L]] - p)) + ans + } + data = function(case) { + set.seed(108) + if (case == 1L) { # 2 match + lhs = data.table(id = c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id = c(2L,4L,3L,5L), v2=1:4) + } else if (case == 2L) { # 4 match + lhs = data.table(id = c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id = c(7L,5L,3L,1L), v2=1:4) + } else if (case == 3L) { # 1 match + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id =c(1L,2L,4L,6L), v2=1:4) + } else if (case == 4L) { # 0 match + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id =c(0L,2L,4L,6L), v2=1:4) + } else if (case == 5L) { # 0 match dup + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id =c(0L,2L,2L,6L), v2=1:4) + } else if (case == 6L) { # 1 match dup + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id =c(1L,2L,2L,6L), v2=1:4) + } else if (case == 7L) { # 1 match dup match + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id =c(3L,3L,4L,6L), v2=1:4) + } else if (case == 8L) { # 2 match 2 dup match + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id = c(3L,3L,7L,7L), v2=1:4) + } else if (case == 9L) { # 2 dup 2 dup + lhs = data.table(id = c(1L,5L,1L,5L), v1=1:4) + rhs = data.table(id = c(5L,5L,1L,1L), v2=1:4) + } else if (case == 10L) { # 4 dup 4 dup match + lhs = data.table(id = c(1L,1L,1L,1L), v1=1:4) + rhs = data.table(id = c(1L,1L,1L,1L), v2=1:4) + } else if (case == 11L) { # 4 dup 4 dup nomatch + lhs = data.table(id = c(1L,1L,1L,1L), v1=1:4) + rhs = data.table(id = c(2L,2L,2L,2L), v2=1:4) + } else if (case == 12L) { # no match, no overlap + lhs = data.table(id = c(1:4), v1=1:4) + rhs = data.table(id = c(6:9), v2=1:4) + } else if (case == 13L) { # all i matches + lhs = data.table(id = c(1L,5L,3L,7L,9L), v1=1:5) + rhs = data.table(id = c(7L,5L,3L,1L), v2=1:4) + } else if (case == 14L) { # dup match and 1 non-match + ## inner join short circuit test + ## what if some row is excluded but another is duplicated? nrow(i) match + lhs = data.table(id = c(1L,5L,3L,7L,3L), v1=1:5) + rhs = data.table(id = c(7L,5L,3L,2L), v2=1:4) + } else if (case == 15L) { + # does not raise error on mult="error" because dups '13' does not have matching rows! + lhs = data.table(id = as.integer(c(17,14,11,10,5,1,19,7,16,15)), v1=1:10) + rhs = data.table(id = as.integer(c(6,20,13,1,8,13,3,10,17,9)), v2=1:10) + } else if (case == 16L) { + lhs = data.table(id = sample(10L, 10L, TRUE), v1=1:10) + rhs = data.table(id = sample(10L, 10L, TRUE), v2=1:10) + } else if (case == 17L) { + lhs = data.table(id = sample(1e2L, 1e2L, TRUE), v1=1:1e2) + rhs = data.table(id = sample(1e2L, 1e2L, TRUE), v2=1:1e2) + } else if (case == 18L) { + lhs = data.table(id = sample(1e2L, 1e2L, TRUE), v1=1:1e2) + rhs = data.table(id = sample(10L, 20L, TRUE), v2=1:1e2) + } else if (case==19L) { + lhs = as.data.table(list(id=sample(1e3), v1=1:1e3)) + rhs = as.data.table(list(id=sample(1e3), v2=1:1e3)) + } else if (case==20L) { + lhs = as.data.table(list(id=sample(1e3*2L, 1e3), v1=1:1e3)) + rhs = as.data.table(list(id=sample(1e3*2L, 1e3), v2=1:1e3)) + } else if (case==21L) { + lhs = as.data.table(list(id=sample(1e3, 1e3*2L, TRUE), v1=1:1e3)) + rhs = as.data.table(list(id=sample(1e3, 1e3*2L, TRUE), v2=1:1e3)) + } else if (case==22L) { ## LHS equals RHS + lhs = data.table(id=1:2, v1=1:2) + rhs = data.table(id=1:2, v2=1:2) + } else if (case==23L) { ## cross join + lhs = data.table(id=c(1L,1L), v1=1:2) + rhs = data.table(id=c(1L,1L), v2=1:2) + } else if (case==24L) { ## cartesian match, dups on both sides of match + lhs = data.table(id=c(1L,1:2), v1=1:3) + rhs = data.table(id=c(1L,1L,3L), v2=1:3) + } else if (case==25L) { ## duplicates in RHS + lhs = data.table(id=1:2, v1=1:2) + rhs = data.table(id=c(2L,2:3), v2=1:3) + } else if (case==26L) { ## duplicates in RHS and LHS, some RHS dups does not have matches in LHS (merge.data.table+mult fails) + lhs = data.table(id=c(1:3,3L), v1=1:4) + rhs = data.table(id=c(1L,1L,3:4,4L), v2=1:5) + } else if (case==27L) { ## duplicates in RHS and LHS, some LHS dups does not have matches in RHS + lhs = data.table(id=c(1L,1L,3:4,4L), v1=1:5) + rhs = data.table(id=c(1:3,3L), v2=1:4) + } else if (case==28L) { ## duplicates in RHS and LHS + lhs = data.table(id=c(1:3,3L), v1=1:4) + rhs = data.table(id=c(1L,1L,3:4), v2=1:4) + } else if (case==29L) { ## duplicates in RHS + lhs = data.table(id=1:2, v1=1:2) + rhs = data.table(id=c(2L,2:3), v2=1:3) + } else if (case==30L) { ## duplicates in LHS + lhs = data.table(id=c(1:2,2L), v1=1:3) + rhs = data.table(id=2:3, v2=1:2) + } else if (case==31L) { + lhs = data.table(id=integer(), v1=integer()) + rhs = data.table(id=integer(), v2=integer()) + } else stop("case not found") + list(lhs=lhs, rhs=rhs) + } + + # tests ---- + + y = batch.join.sql.equal(cases=1:31, on="id", hows=c("inner","left","right","full"), mults=c("all","first","last"), .debug=interactive()) + y = rapply(y, isTRUE) + if (!all(y)) + stop(sprintf("join tests failed for %s cases:\n%s", sum(!y), paste(" ", names(y)[!y], collapse="\n"))) +} diff --git a/man/cbindlist.Rd b/man/cbindlist.Rd new file mode 100644 index 000000000..50ac9fbf7 --- /dev/null +++ b/man/cbindlist.Rd @@ -0,0 +1,36 @@ +\name{cbindlist} +\alias{cbindlist} +\alias{cbind} +\alias{cbind.data.table} +\title{Column bind multiple data.tables} +\description{ + Column bind multiple \code{data.table}s. +} +\usage{ + cbindlist(l, copy=TRUE) +} +\arguments{ + \item{l}{ \code{list} of \code{data.table}s to merge. } + \item{copy}{ \code{logical}, decides if columns has to be copied into resulting object (default) or just referred. } +} +\details{ + Column bind only stacks input elements. Works like \code{\link{data.table}} function but takes \code{list} type on input. Zero-columns tables in \code{l} are ommited. Tables in \code{l} should have matching row count, recycling of rows is not yet implemented. Indices of the input tables are transferred to the resulting table, as well as a \emph{key} of the first keyed table. +} +\value{ + A new \code{data.table} based on the stacked objects. Eventually when \code{copy} is \code{FALSE}, then resulting object will share columns with \code{l} tables. +} +\note{ + If output object has any duplicate names, then key and indices are removed. +} +\seealso{ + \code{\link{data.table}}, \code{\link{rbindlist}}, \code{\link{mergelist}} +} +\examples{ +l = list( + d1 = data.table(x=1:3, v1=1L), + d2 = data.table(y=3:1, v2=2L), + d3 = data.table(z=2:4, v3=3L) +) +cbindlist(l) +} +\keyword{ data } \ No newline at end of file diff --git a/man/mergelist.Rd b/man/mergelist.Rd new file mode 100644 index 000000000..bfee1aae1 --- /dev/null +++ b/man/mergelist.Rd @@ -0,0 +1,189 @@ +\name{mergelist} +\alias{mergelist} +\title{Merge multiple data.tables} +\description{ + Faster merge of multiple \code{data.table}s. +} +\usage{ + mergelist(l, on, cols, + how = c("left","inner","full","right","semi","anti","cross"), + mult, copy = TRUE, + join.many = getOption("datatable.join.many")) +} +\arguments{ + \item{l}{ \code{list} of \code{data.table}s to merge. } + \item{on}{ \code{character} vector of column names to merge on; when missing, the \emph{key} of \emph{join-to} table is used. } + \item{cols}{ \code{list} of \code{character} column names corresponding to tables in \code{l}, used to subset columns during merges. } + \item{how}{ \code{character} scalar, controls how to merge tables. Allowed values are \code{"left"} (default), \code{"inner"}, \code{"full"}, \code{"right"}, \code{"semi"}, \code{"anti"}, \code{"cross"}. See Details. } + \item{mult}{ \code{character} scalar, controls how to proceed when multiple rows in \emph{join-to} table match to the row in \emph{join-from} table. Allowed values are \code{"error"}, \code{"all"}, \code{"first"}, \code{"last"}. Default depends on \code{how}, described in \emph{details} below. See examples on how to detect duplicated matches. Using \code{"all"} is recommended together with \code{join.many=FALSE}, unless rows explosion or cartesian product are intended. } + \item{copy}{ \code{logical}, defaults to \code{TRUE}, when \code{FALSE}, then resulting object may share columns with tables in \code{l}, depending on matches. } + \item{join.many}{ \code{logical}, defaults to \code{getOption("datatable.join.many")}, which is \code{TRUE} by default; when \code{FALSE} and \code{mult="all"}, then extra check is made to ensure no \emph{many-to-many} matches exist between tables, and if they exist, then exception is raised. Works similarly to \code{allow.cartesian} option in \code{[.data.table} but is more strict. An option \code{"datatable.join.many"} controls that globally for \code{mergelist} and \code{[.data.table}. } +} +\details{ + Function should be considered experimental. Users are encouraged to provide feedback in our issue tracker. + + Merging is performed sequentially, for \code{l} of 3 tables, it will do something like \code{merge(merge(l[[1L]], l[[2L]]), l[[3L]])}. Merging does not support \emph{non-equi joins}, column names to merge on must be common in both tables on each merge. + + Arguments \code{on}, \code{how}, \code{mult}, \code{join.many} could be lists as well, each of length \code{length(l)-1L}, to provide argument to be used for each single tables pair to merge, see examples. + + Terms \emph{join-to} and \emph{join-from} depends on \code{how} argument: + \enumerate{ + \item{ \code{how="left|semi|anti"}: \emph{join-to} is \emph{RHS}, \emph{join-from} is \emph{LHS}. } + \item{ \code{how="inner|full|cross"}: treats \emph{LHS} and \emph{RHS} tables equally, terms applies to both tables. } + \item{ \code{how="right"}: \emph{join-to} is \emph{LHS}, \emph{join-from} is \emph{RHS}. } + } + + Using \code{mult="error"} will raise exception when multiple rows in \emph{join-to} table match to the row in \emph{join-from} table. It should not be used to just detect duplicates, as duplicates might not have matching row, and in such case exception will not be raised. + + Default value for argument \code{mult} depends on \code{how} argument: + \enumerate{ + \item{ \code{how="left|inner|full|right"}: sets \code{mult="error"}. } + \item{ \code{how="semi|anti"}: sets \code{mult="last"}, although works same as \code{mult="first"}. } + \item{ \code{how="cross"}: sets \code{mult="all"}. } + } + + When \code{on} argument is missing, then columns to join on will be decided based on \emph{key} depending on \code{how} argument: + \enumerate{ + \item{ \code{how="left|right|semi|anti"}: key columns of \emph{join-to} table. } + \item{ \code{how="inner|full"}: if only one table has key, then this key is used, if both tables have key, then \code{intersect(key(lhs), key(rhs))}, having its order aligned to shorter key. } + } + + When joining tables that are not directly linked to single table, e.g. snowflake schema, \emph{right} outer join can be used to optimize the sequence of merges, see examples. +} +\value{ + A new \code{data.table} based on the merged objects. +} +\note{ + Using \code{how="inner|full"} together with \code{mult!="all"} is sub-efficient. Unlike during join in \code{[.data.table}, it will apply \code{mult} on both tables. It is to ensure that the join is symmetric so \emph{LHS} and \emph{RHS} tables can be swapped, regardless of \code{mult} argument. It is always possible to apply \code{mult}-like filter manually and join using \code{mult="all"}. + + Using \code{join.many=FALSE} is sub-efficient. Note that it only takes effect when \code{mult="all"}. If input data are verified to not have duplicated matches, then this can safely use the default \code{TRUE}. Otherwise for \code{mult="all"} merges it is recommended to use \code{join.many=FALSE}, unless of course \emph{many-to-many} join, that duplicates rows, is intended. +} +\seealso{ + \code{\link{[.data.table}}, \code{\link{merge.data.table}} +} +\examples{ +l = list( + data.table(id1 = c(1:4,2:5), v1 = 1:8), + data.table(id1 = 2:3, v2 = 1:2), + data.table(id1 = 3:5, v3 = 1:3) +) +mergelist(l, on="id1") + +## using keys +l = list( + data.table(id1 = c(1:4,2:5), v1 = 1:8), + data.table(id1 = 3:5, id2 = 1:3, v2 = 1:3, key="id1"), + data.table(id2 = 1:4, v3 = 4:1, key="id2") +) +mergelist(l) + +## select columns +l = list( + data.table(id1 = c(1:4,2:5), v1 = 1:8, v2 = 8:1), + data.table(id1 = 3:5, v3 = 1:3, v4 = 3:1, v5 = 1L, key="id1") +) +mergelist(l, cols = list(NULL, c("v3","v5"))) + +## different arguments for each merge pair +l = list( + data.table(id1=1:4, id2=4:1), + data.table(id1=c(1:3,1:2), v2=c(1L,1L,1:2,2L)), + data.table(id2=4:5) +) +mergelist(l, + on = list("id1", "id2"), ## first merge on id1, second on id2 + how = list("inner", "anti"), ## first inner join, second anti join + mult = list("last", NULL)) ## use default 'mult' in second join + +## detecting duplicates matches +l = list( + data.table(id1=c(1:4,2:5), v1=1:8), ## dups in LHS are fine + data.table(id1=c(2:3,2L), v2=1:3), ## dups in RHS + data.table(id1=3:5, v3=1:3) +) +#mergelist(l, on="id1") # ERROR: mult='error' and multiple matches during merge +lapply(l[-1L], `[`, j = if (.N>1L) .SD, by = "id1") ## duplicated rows + +## 'star schema' and 'snowflake schema' examples + +### populate fact: US population by state and date + +gt = state.x77[,"Population"] +gt = data.table(state_id=seq_along(state.name), p=gt[state.name]/sum(gt), k=1L) +tt = as.IDate(paste0(as.integer(time(uspop)),"-01-01")) +tt = as.data.table(stats::approx(tt, c(uspop), tt[1L]:tt[length(tt)])) +tt = tt[, .(date=as.IDate(x), date_id=seq_along(x), pop=y, k=1L)] +fact = tt[gt, on="k", allow.cartesian=TRUE, + .(state_id=i.state_id, date_id=x.date_id, population = x.pop * i.p)] +setkeyv(fact, c("state_id","date_id")) + +### populate dimensions: time and geography + +time = data.table(key = "date_id", + date_id = seq_along(tt$date), date = tt$date, + month_id = month(tt$date), month = month.name[month(tt$date)], + year_id = year(tt$date)-1789L, year = as.character(year(tt$date)), + week_id = week(tt$date), week = as.character(week(tt$date)), + weekday_id = wday(tt$date)-1L, weekday = weekdays(tt$date) +)[weekday_id==0L, weekday_id:=7L][] +geog = data.table(key = "state_id", + state_id = seq_along(state.name), state_abb=state.abb, state_name=state.name, + division_id = as.integer(state.division), + division_name = as.character(state.division), + region_id = as.integer(state.region), + region_name = as.character(state.region) +) +rm(gt, tt) + +### denormalize 'star schema' + +l = list(fact, time, geog) +ans = mergelist(l) + +rm(l, ans) + +### turn 'star schema' into 'snowflake schema' + +make.lvl = function(x, cols) { + stopifnot(is.data.table(x)) + lvl = x[, unique(.SD), .SDcols=cols] + setkeyv(lvl, cols[1L]) + setindexv(lvl, as.list(cols)) +} +time = list( + date = make.lvl(time, c("date_id","date","year_id","month_id","week_id", + "weekday_id")), + weekday = make.lvl(time, c("weekday_id","weekday")), + week = make.lvl(time, c("week_id","week")), + month = make.lvl(time, c("month_id","month")), + year = make.lvl(time, c("year_id","year")) +) +geog = list( + state = make.lvl(geog, c("state_id","state_abb","state_name","division_id")), + division = make.lvl(geog, c("division_id","division_name","region_id")), + region = make.lvl(geog, c("region_id","region_name")) +) + +### denormalize 'snowflake schema' + +#### left join all +l = c(list(fact=fact), time, geog) +ans = mergelist(l) + +rm(ans) +#### merge hierarchies alone, reduce sizes in merges of geog dimension +ans = mergelist(list( + fact, + mergelist(time), + mergelist(rev(geog), how="right") +)) + +rm(ans) +#### same but no unnecessary copies +ans = mergelist(list( + fact, + mergelist(time, copy=FALSE), + mergelist(rev(geog), how="right", copy=FALSE) +)) +} +\keyword{ data } diff --git a/src/bmerge.c b/src/bmerge.c index 84519aac9..fde401b67 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -29,7 +29,7 @@ static SEXP nqgrp; static int ncol, *o, *xo, *retFirst, *retLength, *retIndex, *allLen1, *allGrp1, *rollends, ilen, anslen; static int *op, nqmaxgrp; static int ctr, nomatch; // populating matches for non-equi joins -enum {ALL, FIRST, LAST} mult = ALL; +enum {ALL, FIRST, LAST, ERR} mult = ALL; static double roll, rollabs; static Rboolean rollToNearest=FALSE; #define XIND(i) (xo ? xo[(i)]-1 : i) @@ -49,8 +49,10 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r // iArg, xArg, icolsArg and xcolsArg idtVec = SEXPPTR_RO(idt); // set globals so bmerge_r can see them. xdtVec = SEXPPTR_RO(xdt); - if (!isInteger(icolsArg)) internal_error(__func__, "icols is not integer vector"); // # nocov - if (!isInteger(xcolsArg)) internal_error(__func__, "xcols is not integer vector"); // # nocov + if (!isInteger(icolsArg)) + internal_error(__func__, "icols is not integer vector"); // # nocov + if (!isInteger(xcolsArg)) + internal_error(__func__, "xcols is not integer vector"); // # nocov if ((LENGTH(icolsArg)==0 || LENGTH(xcolsArg)==0) && LENGTH(idt)>0) // We let through LENGTH(i) == 0 for tests 2126.* internal_error(__func__, "icols and xcols must be non-empty integer vectors"); if (LENGTH(icolsArg) > LENGTH(xcolsArg)) internal_error(__func__, "length(icols) [%d] > length(xcols) [%d]", LENGTH(icolsArg), LENGTH(xcolsArg)); // # nocov @@ -60,13 +62,18 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r iN = ilen = anslen = LENGTH(idt) ? LENGTH(VECTOR_ELT(idt,0)) : 0; ncol = LENGTH(icolsArg); // there may be more sorted columns in x than involved in the join for(int col=0; colLENGTH(idt) || icols[col]<1) error(_("icols[%d]=%d outside range [1,length(i)=%d]"), col, icols[col], LENGTH(idt)); - if (xcols[col]>LENGTH(xdt) || xcols[col]<1) error(_("xcols[%d]=%d outside range [1,length(x)=%d]"), col, xcols[col], LENGTH(xdt)); + if (icols[col]==NA_INTEGER) + internal_error(__func__, "icols[%d] is NA", col); // # nocov + if (xcols[col]==NA_INTEGER) + internal_error(__func__, "xcols[%d] is NA", col); // # nocov + if (icols[col]>LENGTH(idt) || icols[col]<1) + error(_("icols[%d]=%d outside range [1,length(i)=%d]"), col, icols[col], LENGTH(idt)); + if (xcols[col]>LENGTH(xdt) || xcols[col]<1) + error(_("xcols[%d]=%d outside range [1,length(x)=%d]"), col, xcols[col], LENGTH(xdt)); int it = TYPEOF(VECTOR_ELT(idt, icols[col]-1)); int xt = TYPEOF(VECTOR_ELT(xdt, xcols[col]-1)); - if (iN && it!=xt) error(_("typeof x.%s (%s) != typeof i.%s (%s)"), CHAR(STRING_ELT(getAttrib(xdt,R_NamesSymbol),xcols[col]-1)), type2char(xt), CHAR(STRING_ELT(getAttrib(idt,R_NamesSymbol),icols[col]-1)), type2char(it)); + if (iN && it!=xt) + error(_("typeof x.%s (%s) != typeof i.%s (%s)"), CHAR(STRING_ELT(getAttrib(xdt,R_NamesSymbol),xcols[col]-1)), type2char(xt), CHAR(STRING_ELT(getAttrib(idt,R_NamesSymbol),icols[col]-1)), type2char(it)); if (iN && it!=LGLSXP && it!=INTSXP && it!=REALSXP && it!=STRSXP) error(_("Type '%s' is not supported for joining/merging"), type2char(it)); } @@ -74,11 +81,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r // rollArg, rollendsArg roll = 0.0; rollToNearest = FALSE; if (isString(rollarg)) { - if (strcmp(CHAR(STRING_ELT(rollarg,0)),"nearest") != 0) error(_("roll is character but not 'nearest'")); - if (ncol>0 && TYPEOF(VECTOR_ELT(idt, icols[ncol-1]-1))==STRSXP) error(_("roll='nearest' can't be applied to a character column, yet.")); + if (strcmp(CHAR(STRING_ELT(rollarg,0)),"nearest") != 0) + error(_("roll is character but not 'nearest'")); + if (ncol>0 && TYPEOF(VECTOR_ELT(idt, icols[ncol-1]-1))==STRSXP) + error(_("roll='nearest' can't be applied to a character column, yet.")); roll=1.0; rollToNearest=TRUE; // the 1.0 here is just any non-0.0, so roll!=0.0 can be used later } else { - if (!isReal(rollarg)) internal_error(__func__, "roll is not character or double"); // # nocov + if (!isReal(rollarg)) + internal_error(__func__, "roll is not character or double"); // # nocov roll = REAL(rollarg)[0]; // more common case (rolling forwards or backwards) or no roll when 0.0 } rollabs = fabs(roll); @@ -97,10 +107,16 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r } // mult arg - if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL; - else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first")) mult = FIRST; - else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last")) mult = LAST; - else internal_error(__func__, "invalid value for 'mult'"); // # nocov + if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) + mult = ALL; + else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first")) + mult = FIRST; + else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last")) + mult = LAST; + else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "error")) + mult = ERR; + else + internal_error(__func__, "invalid value for 'mult'"); // # nocov // opArg if (!isInteger(opArg) || length(opArg)!=ncol) @@ -131,7 +147,8 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r retLength = R_Calloc(anslen, int); retIndex = R_Calloc(anslen, int); // initialise retIndex here directly, as next loop is meant for both equi and non-equi joins - for (int j=0; j1) allLen1[0] = FALSE; + if (len>1) { + if (mult==ALL) + allLen1[0] = FALSE; // bmerge()$allLen1 + else if (mult==ERR) + error("mult='error' and multiple matches during merge"); + } if (nqmaxgrp == 1) { - const int rf = (mult!=LAST) ? xlow+2-rollLow : xupp+rollUpp; // extra +1 for 1-based indexing at R level - const int rl = (mult==ALL) ? len : 1; + const int rf = (mult!=LAST) ? xlow+2-rollLow : xupp+rollUpp; // bmerge()$starts thus extra +1 for 1-based indexing at R level + const int rl = (mult==ALL) ? len : 1; // bmerge()$lens for (int j=ilow+1; j1 && mult==ERR already checked, no dup matches, continue as mult=ALL // for this irow, we've matches on more than one group allGrp1[0] = FALSE; retFirst[ctr+ilen] = xlow+2; @@ -426,7 +451,7 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisg } } else { // none of the groups so far have filled in for this index. So use it! - if (mult == ALL) { + if (mult == ALL || mult == ERR) { retFirst[k] = xlow+2; retLength[k] = len; retIndex[k] = k+1; diff --git a/src/data.table.h b/src/data.table.h index e597fb0d4..129a5bf21 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -159,6 +159,7 @@ SEXP int_vec_init(R_len_t n, int val); // vecseq.c SEXP vecseq(SEXP x, SEXP len, SEXP clamp); +SEXP seqexp(SEXP x); // uniqlist.c SEXP uniqlist(SEXP l, SEXP order); @@ -258,6 +259,12 @@ SEXP islockedR(SEXP x); bool need2utf8(SEXP x); SEXP coerceUtf8IfNeeded(SEXP x); SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg); +int NROW(SEXP x); +int NCOL(SEXP x); +bool isDataTable(SEXP x); +bool isDataList(SEXP x); +bool perhapsDataTable(SEXP x); +SEXP perhapsDataTableR(SEXP x); void internal_error(const char *call_name, const char *format, ...); // types.c @@ -278,6 +285,10 @@ SEXP substitute_call_arg_namesR(SEXP expr, SEXP env); //negate.c SEXP notchin(SEXP x, SEXP table); +// mergelist.c +SEXP cbindlist(SEXP x, SEXP copyArg); +SEXP copyCols(SEXP x, SEXP cols); + // functions called from R level .Call/.External and registered in init.c // these now live here to pass -Wstrict-prototypes, #5477 // all arguments must be SEXP since they are called from R level diff --git a/src/init.c b/src/init.c index 83917754a..e23467771 100644 --- a/src/init.c +++ b/src/init.c @@ -71,6 +71,7 @@ R_CallMethodDef callMethods[] = { {"Creorder", (DL_FUNC) &reorder, -1}, {"Crbindlist", (DL_FUNC) &rbindlist, -1}, {"Cvecseq", (DL_FUNC) &vecseq, -1}, +{"Cseqexp", (DL_FUNC) &seqexp, -1}, {"Csetlistelt", (DL_FUNC) &setlistelt, -1}, {"Caddress", (DL_FUNC) &address, -1}, {"CexpandAltRep", (DL_FUNC) &expandAltRep, -1}, @@ -149,6 +150,9 @@ R_CallMethodDef callMethods[] = { {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, {"CconvertDate", (DL_FUNC)&convertDate, -1}, {"Cnotchin", (DL_FUNC)¬chin, -1}, +{"Ccbindlist", (DL_FUNC) &cbindlist, -1}, +{"CperhapsDataTableR", (DL_FUNC) &perhapsDataTableR, -1}, +{"CcopyCols", (DL_FUNC) ©Cols, -1}, {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1}, {NULL, NULL, 0} }; diff --git a/src/mergelist.c b/src/mergelist.c new file mode 100644 index 000000000..6d052cbac --- /dev/null +++ b/src/mergelist.c @@ -0,0 +1,98 @@ +#include "data.table.h" + +// set(x, NULL, cols, copy(unclass(x)[cols])) ## but keeps the index +SEXP copyCols(SEXP x, SEXP cols) { + // used in R/mergelist.R + if (!isDataTable(x)) + error("'x' must be a data.table"); // # nocov + if (!isInteger(cols)) + error("'cols' must be integer"); // # nocov + int nx = length(x), ncols = LENGTH(cols), *colsp = INTEGER(cols); + if (!nx || !ncols) + return R_NilValue; + for (int i=0; i1L) length(unique(vapply(x, length, 0L)))==1L else TRUE +static inline bool equalLens(SEXP x) { + int n = LENGTH(x); + if (n < 2) + return true; + R_xlen_t nr = xlength(VECTOR_ELT(x, 0)); + for (int i=1; ilimit) error(_("Join results in %d rows; more than %d = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice."), reslen, (int)limit); + if (limit<0) + error(_("clamp must be positive")); // # nocov + if (reslen>limit) + error(_("Join results in %d rows; more than %d = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice."), reslen, (int)limit); } SEXP ans = PROTECT(allocVector(INTSXP, reslen)); int *ians = INTEGER(ans); @@ -40,3 +45,37 @@ SEXP vecseq(SEXP x, SEXP len, SEXP clamp) return(ans); } +SEXP seqexp(SEXP x) { + // used in R/mergelist.R + // function(x) unlist(lapply(seq_along(x), function(i) rep(i, x[[i]]))) + // used to expand bmerge()$lens, when $starts does not have NAs (or duplicates?) + // replaces rep.int(indices__, len__) where indices__ was seq_along(x) + // input: 1,1,2, 1,3, 1,0,1,2 + // output: 1,2,3,3,4,5,5,5,6, 8,9,9 + // all1 returns NULL, this is now commented out to reduce overhead becase seqexp is called only when !(ans$allLen1 && (!inner || len.x==length(ans$starts))) if seqexp would be used in another place we could enable that + if (!isInteger(x)) + error("internal error: 'x' must be an integer"); // # nocov + const int *xp = INTEGER(x), nx = LENGTH(x); + int nans = 0; + for (int i=0; i