diff --git a/R/bmerge.R b/R/bmerge.R index 3af291c78..115f7f2cc 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -15,10 +15,10 @@ bmerge <- function(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, v # TO DO: enforce via .internal.shallow attribute and expose shallow() to users # This is why shallow() is very importantly internal only, currently. - origi = shallow(i) # Only needed for factor to factor joins, to recover the original levels - # Otherwise, types of i join columns are alyways promoted to match x's + origi = shallow(i) # Needed for factor to factor/character joins, to recover the original levels + # Otherwise, types of i join columns are anyways promoted to match x's # types (with warning or verbose) - resetifactor = NULL # Keep track of any factor to factor join cols (only time we keep orig) + resetifactor = NULL # Keep track of any factor to factor/character join cols (only time we keep orig) for (a in seq_along(leftcols)) { # This loop is simply to support joining factor columns # Note that if i is keyed, if this coerces, i's key gets dropped and the key may not be retained @@ -38,20 +38,28 @@ bmerge <- function(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, v if (is.factor(x[[rc]])) { if (is.character(i[[lc]])) { if (verbose) cat("Coercing character column i.'",icnam,"' to factor to match type of x.'",xcnam,"'. If possible please change x.'",xcnam,"' to character. Character columns are now preferred in joins.\n",sep="") - set(i,j=lc,value=factor(i[[lc]])) + set(origi, j=lc, value=factor(origi[[lc]])) # note the use of 'origi' here - see #499 and #945 + # TO DO: we need a way to avoid copying 'value' for internal purposes + # that would allow setting: set(i, j=lc, value=origi[[lc]]) without resulting in a copy. + # until then using 'val <- origi[[lc]]' below to avoid another copy. } else { if (!is.factor(i[[lc]])) stop("x.'",xcnam,"' is a factor column being joined to i.'",icnam,"' which is type '",typeof(i[[lc]]),"'. Factor columns must join to factor or character columns.") - resetifactor = c(resetifactor,lc) - # Retain original levels of i's factor columns in factor to factor joins (important when NAs, - # see tests 687 and 688). } - if (roll!=0.0 && a==length(leftcols)) stop("Attempting roll join on factor column x.",names(x)[rc],". Only integer, double or character colums may be roll joined.") # because the chmatch on next line returns NA for missing chars in x (rather than some integer greater than existing). Note roll!=0.0 is ok in this 0 special floating point case e.g. as.double(FALSE)==0.0 is ok, and "nearest"!=0.0 is also true. - newfactor = chmatch(levels(i[[lc]]), levels(x[[rc]]), nomatch=NA_integer_)[i[[lc]]] - levels(newfactor) = levels(x[[rc]]) + # Retain original levels of i's factor columns in factor to factor joins (important when NAs, + # see tests 687 and 688). + # Moved it outside of 'else' to fix #499 and #945. + resetifactor = c(resetifactor,lc) + if (roll!=0.0 && a==length(leftcols)) stop("Attempting roll join on factor column x.",names(x)[rc],". Only integer, double or character colums may be roll joined.") # because the chmatch on next line returns NA 0 for missing chars in x (rather than some integer greater than existing). Note roll!=0.0 is ok in this 0 special floating point case e.g. as.double(FALSE)==0.0 is ok, and "nearest"!=0.0 is also true. + val = origi[[lc]] # note: using 'origi' here because set(..., value = .) always copies '.', we need a way to avoid it in internal cases. + lx = levels(x[[rc]]) + li = levels(val) + newfactor = chmatch(li, lx, nomatch=0L)[val] # fix for #945, a hacky solution for now. + levels(newfactor) = lx class(newfactor) = "factor" - set(i,j=lc,value=newfactor) - # NAs can be produced by this level match, in which case the C code (it knows integer value NA) + set(i, j=lc, value=newfactor) + # COMMENT BELOW IS NOT TRUE ANYMORE... had to change nomatch to 0L to take care of case where 'NA' occurs as a separate value... See #945. + # NAs can be produced by this level match, in which case the C code (it knows integer value NA) # can skip over the lookup. It's therefore important we pass NA rather than 0 to the C code. } if (is.integer(x[[rc]]) && is.double(i[[lc]])) { diff --git a/README.md b/README.md index 4dc9bfeb6..89b8df4e8 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,8 @@ 35. Update by reference using `:=` after loading from disk where the `data.table` exists within a local environment now works as intended. Closes [#479](https://github.com/Rdatatable/data.table/issues/479). Thanks to @ChongWang for the minimal reproducible example. + 36. Issues on merges involving factor columns with `NA` and merging factor with character type with non-identical levels are both fixed. Closes [#499](https://github.com/Rdatatable/data.table/issues/499) and [#945](https://github.com/Rdatatable/data.table/issues/945). Thanks to @AbielReinhart and @stewbasic for the minimal examples. + #### NOTES 1. Clearer explanation of what `duplicated()` does (borrowed from base). Thanks to @matthieugomez for pointing out. Closes [#872](https://github.com/Rdatatable/data.table/issues/872). diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c374d837f..420ff7376 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5942,6 +5942,19 @@ test(1482.1, truelength(ee$DT), 0L) # make sure that the simulated environment i test(1482.2, ee$DT[, z := 3:1], data.table(x=1L, y=1:3, z=3:1), warning="Invalid .internal.selfref detected and") test(1482.3, truelength(ee$DT) >= 100L, TRUE) # truelength restored? +# Fix for #499 and #945 +require(data.table) +x <- data.table(k=as.factor(c(NA,1,2)),v=c(0,1,2), key="k") +y <- data.table(k=as.factor(c(NA,1,3)),v=c(0,1,3), key="k") +test(1483.1, x[y], data.table(k=factor(c(NA,1,3)), v=c(0,1,NA), i.v=c(0,1,3))) +test(1483.2, merge(x,y,all=TRUE), data.table(k=factor(c(NA,1,2,3)), v.x=c(0,1,2,NA), v.y=c(0,1,NA,3), key="k")) + +x <- data.table(country="US") +y <- data.table(country=factor("USA")) +test(1483.3, merge(x,y,by="country",all=T), data.table(country=factor(c("US", "USA")), key="country")) +setkey(y) +test(1483.4, y[x], data.table(country=factor("US"), key="country")) + ##########################