diff --git a/R/bmerge.R b/R/bmerge.R
index 3af291c78..115f7f2cc 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -15,10 +15,10 @@ bmerge <- function(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, v
# TO DO: enforce via .internal.shallow attribute and expose shallow() to users
# This is why shallow() is very importantly internal only, currently.
- origi = shallow(i) # Only needed for factor to factor joins, to recover the original levels
- # Otherwise, types of i join columns are alyways promoted to match x's
+ origi = shallow(i) # Needed for factor to factor/character joins, to recover the original levels
+ # Otherwise, types of i join columns are anyways promoted to match x's
# types (with warning or verbose)
- resetifactor = NULL # Keep track of any factor to factor join cols (only time we keep orig)
+ resetifactor = NULL # Keep track of any factor to factor/character join cols (only time we keep orig)
for (a in seq_along(leftcols)) {
# This loop is simply to support joining factor columns
# Note that if i is keyed, if this coerces, i's key gets dropped and the key may not be retained
@@ -38,20 +38,28 @@ bmerge <- function(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, v
if (is.factor(x[[rc]])) {
if (is.character(i[[lc]])) {
if (verbose) cat("Coercing character column i.'",icnam,"' to factor to match type of x.'",xcnam,"'. If possible please change x.'",xcnam,"' to character. Character columns are now preferred in joins.\n",sep="")
- set(i,j=lc,value=factor(i[[lc]]))
+ set(origi, j=lc, value=factor(origi[[lc]])) # note the use of 'origi' here - see #499 and #945
+ # TO DO: we need a way to avoid copying 'value' for internal purposes
+ # that would allow setting: set(i, j=lc, value=origi[[lc]]) without resulting in a copy.
+ # until then using 'val <- origi[[lc]]' below to avoid another copy.
} else {
if (!is.factor(i[[lc]]))
stop("x.'",xcnam,"' is a factor column being joined to i.'",icnam,"' which is type '",typeof(i[[lc]]),"'. Factor columns must join to factor or character columns.")
- resetifactor = c(resetifactor,lc)
- # Retain original levels of i's factor columns in factor to factor joins (important when NAs,
- # see tests 687 and 688).
}
- if (roll!=0.0 && a==length(leftcols)) stop("Attempting roll join on factor column x.",names(x)[rc],". Only integer, double or character colums may be roll joined.") # because the chmatch on next line returns NA for missing chars in x (rather than some integer greater than existing). Note roll!=0.0 is ok in this 0 special floating point case e.g. as.double(FALSE)==0.0 is ok, and "nearest"!=0.0 is also true.
- newfactor = chmatch(levels(i[[lc]]), levels(x[[rc]]), nomatch=NA_integer_)[i[[lc]]]
- levels(newfactor) = levels(x[[rc]])
+ # Retain original levels of i's factor columns in factor to factor joins (important when NAs,
+ # see tests 687 and 688).
+ # Moved it outside of 'else' to fix #499 and #945.
+ resetifactor = c(resetifactor,lc)
+ if (roll!=0.0 && a==length(leftcols)) stop("Attempting roll join on factor column x.",names(x)[rc],". Only integer, double or character colums may be roll joined.") # because the chmatch on next line returns NA 0 for missing chars in x (rather than some integer greater than existing). Note roll!=0.0 is ok in this 0 special floating point case e.g. as.double(FALSE)==0.0 is ok, and "nearest"!=0.0 is also true.
+ val = origi[[lc]] # note: using 'origi' here because set(..., value = .) always copies '.', we need a way to avoid it in internal cases.
+ lx = levels(x[[rc]])
+ li = levels(val)
+ newfactor = chmatch(li, lx, nomatch=0L)[val] # fix for #945, a hacky solution for now.
+ levels(newfactor) = lx
class(newfactor) = "factor"
- set(i,j=lc,value=newfactor)
- # NAs can be produced by this level match, in which case the C code (it knows integer value NA)
+ set(i, j=lc, value=newfactor)
+ # COMMENT BELOW IS NOT TRUE ANYMORE... had to change nomatch to 0L to take care of case where 'NA' occurs as a separate value... See #945.
+ # NAs can be produced by this level match, in which case the C code (it knows integer value NA)
# can skip over the lookup. It's therefore important we pass NA rather than 0 to the C code.
}
if (is.integer(x[[rc]]) && is.double(i[[lc]])) {
diff --git a/README.md b/README.md
index 4dc9bfeb6..89b8df4e8 100644
--- a/README.md
+++ b/README.md
@@ -127,6 +127,8 @@
35. Update by reference using `:=` after loading from disk where the `data.table` exists within a local environment now works as intended. Closes [#479](https://github.com/Rdatatable/data.table/issues/479). Thanks to @ChongWang for the minimal reproducible example.
+ 36. Issues on merges involving factor columns with `NA` and merging factor with character type with non-identical levels are both fixed. Closes [#499](https://github.com/Rdatatable/data.table/issues/499) and [#945](https://github.com/Rdatatable/data.table/issues/945). Thanks to @AbielReinhart and @stewbasic for the minimal examples.
+
#### NOTES
1. Clearer explanation of what `duplicated()` does (borrowed from base). Thanks to @matthieugomez for pointing out. Closes [#872](https://github.com/Rdatatable/data.table/issues/872).
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index c374d837f..420ff7376 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -5942,6 +5942,19 @@ test(1482.1, truelength(ee$DT), 0L) # make sure that the simulated environment i
test(1482.2, ee$DT[, z := 3:1], data.table(x=1L, y=1:3, z=3:1), warning="Invalid .internal.selfref detected and")
test(1482.3, truelength(ee$DT) >= 100L, TRUE) # truelength restored?
+# Fix for #499 and #945
+require(data.table)
+x <- data.table(k=as.factor(c(NA,1,2)),v=c(0,1,2), key="k")
+y <- data.table(k=as.factor(c(NA,1,3)),v=c(0,1,3), key="k")
+test(1483.1, x[y], data.table(k=factor(c(NA,1,3)), v=c(0,1,NA), i.v=c(0,1,3)))
+test(1483.2, merge(x,y,all=TRUE), data.table(k=factor(c(NA,1,2,3)), v.x=c(0,1,2,NA), v.y=c(0,1,NA,3), key="k"))
+
+x <- data.table(country="US")
+y <- data.table(country=factor("USA"))
+test(1483.3, merge(x,y,by="country",all=T), data.table(country=factor(c("US", "USA")), key="country"))
+setkey(y)
+test(1483.4, y[x], data.table(country=factor("US"), key="country"))
+
##########################