Closes #499, closes #945. Better handling of merges on factors.

Rdatatable · Feb 8, 2015 · 54d5252 · 54d5252
1 parent 4d339ef
commit 54d5252
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 12 deletions.
diff --git a/R/bmerge.R b/R/bmerge.R
@@ -15,10 +15,10 @@ bmerge <- function(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, v
  # TO DO: enforce via .internal.shallow attribute and expose shallow() to users
  # This is why shallow() is very importantly internal only, currently.
 
- origi = shallow(i) # Only needed for factor to factor joins, to recover the original levels
- # Otherwise, types of i join columns are alyways promoted to match x's
+ origi = shallow(i) # Needed for factor to factor/character joins, to recover the original levels
+ # Otherwise, types of i join columns are anyways promoted to match x's
  # types (with warning or verbose)
- resetifactor = NULL # Keep track of any factor to factor join cols (only time we keep orig)
+ resetifactor = NULL  # Keep track of any factor to factor/character join cols (only time we keep orig)
  for (a in seq_along(leftcols)) {
  # This loop is simply to support joining factor columns
  # Note that if i is keyed, if this coerces, i's key gets dropped and the key may not be retained
@@ -38,20 +38,28 @@ bmerge <- function(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, v
  if (is.factor(x[[rc]])) {
  if (is.character(i[[lc]])) {
  if (verbose) cat("Coercing character column i.'",icnam,"' to factor to match type of x.'",xcnam,"'. If possible please change x.'",xcnam,"' to character. Character columns are now preferred in joins.\n",sep="")
- set(i,j=lc,value=factor(i[[lc]]))
+ set(origi, j=lc, value=factor(origi[[lc]])) # note the use of 'origi' here - see #499 and #945
+ # TO DO: we need a way to avoid copying 'value' for internal purposes
+ # that would allow setting: set(i, j=lc, value=origi[[lc]]) without resulting in a copy.
+ # until then using 'val <- origi[[lc]]' below to avoid another copy.
  } else {
  if (!is.factor(i[[lc]]))
  stop("x.'",xcnam,"' is a factor column being joined to i.'",icnam,"' which is type '",typeof(i[[lc]]),"'. Factor columns must join to factor or character columns.")
- resetifactor = c(resetifactor,lc)
- # Retain original levels of i's factor columns in factor to factor joins (important when NAs,
- # see tests 687 and 688).
  }
- if (roll!=0.0 && a==length(leftcols)) stop("Attempting roll join on factor column x.",names(x)[rc],". Only integer, double or character colums may be roll joined.") # because the chmatch on next line returns NA for missing chars in x (rather than some integer greater than existing). Note roll!=0.0 is ok in this 0 special floating point case e.g. as.double(FALSE)==0.0 is ok, and "nearest"!=0.0 is also true.
- newfactor = chmatch(levels(i[[lc]]), levels(x[[rc]]), nomatch=NA_integer_)[i[[lc]]]
- levels(newfactor) = levels(x[[rc]])
+ # Retain original levels of i's factor columns in factor to factor joins (important when NAs,
+ # see tests 687 and 688).
+ # Moved it outside of 'else' to fix #499 and #945. 
+ resetifactor = c(resetifactor,lc)
+ if (roll!=0.0 && a==length(leftcols)) stop("Attempting roll join on factor column x.",names(x)[rc],". Only integer, double or character colums may be roll joined.") # because the chmatch on next line returns <strike>NA</strike> <new>0</new> for missing chars in x (rather than some integer greater than existing). Note roll!=0.0 is ok in this 0 special floating point case e.g. as.double(FALSE)==0.0 is ok, and "nearest"!=0.0 is also true.
+ val = origi[[lc]] # note: using 'origi' here because set(..., value = .) always copies '.', we need a way to avoid it in internal cases.
+ lx = levels(x[[rc]])
+ li = levels(val)
+ newfactor = chmatch(li, lx, nomatch=0L)[val] # fix for #945, a hacky solution for now.
+ levels(newfactor) = lx
  class(newfactor) = "factor"
- set(i,j=lc,value=newfactor)
- # NAs can be produced by this level match, in which case the C code (it knows integer value NA)
+ set(i, j=lc, value=newfactor)
+ # COMMENT BELOW IS NOT TRUE ANYMORE... had to change nomatch to 0L to take care of case where 'NA' occurs as a separate value... See #945.
+ # <OUTDATED> NAs can be produced by this level match, in which case the C code (it knows integer value NA)
  # can skip over the lookup. It's therefore important we pass NA rather than 0 to the C code.
  }
  if (is.integer(x[[rc]]) && is.double(i[[lc]])) {

diff --git a/README.md b/README.md
@@ -127,6 +127,8 @@
 
  35. Update by reference using `:=` after loading from disk where the `data.table` exists within a local environment now works as intended. Closes [#479](https://github.com/Rdatatable/data.table/issues/479). Thanks to @ChongWang for the minimal reproducible example.
 
+ 36. Issues on merges involving factor columns with `NA` and merging factor with character type with non-identical levels are both fixed. Closes [#499](https://github.com/Rdatatable/data.table/issues/499) and [#945](https://github.com/Rdatatable/data.table/issues/945). Thanks to @AbielReinhart and @stewbasic for the minimal examples.
+
 #### NOTES
 
  1. Clearer explanation of what `duplicated()` does (borrowed from base). Thanks to @matthieugomez for pointing out. Closes [#872](https://github.com/Rdatatable/data.table/issues/872).

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -5942,6 +5942,19 @@ test(1482.1, truelength(ee$DT), 0L) # make sure that the simulated environment i
 test(1482.2, ee$DT[, z := 3:1], data.table(x=1L, y=1:3, z=3:1), warning="Invalid .internal.selfref detected and")
 test(1482.3, truelength(ee$DT) >= 100L, TRUE) # truelength restored?
 
+# Fix for #499 and #945
+require(data.table)
+x <- data.table(k=as.factor(c(NA,1,2)),v=c(0,1,2), key="k")
+y <- data.table(k=as.factor(c(NA,1,3)),v=c(0,1,3), key="k")
+test(1483.1, x[y], data.table(k=factor(c(NA,1,3)), v=c(0,1,NA), i.v=c(0,1,3)))
+test(1483.2, merge(x,y,all=TRUE), data.table(k=factor(c(NA,1,2,3)), v.x=c(0,1,2,NA), v.y=c(0,1,NA,3), key="k"))
+
+x <- data.table(country="US")
+y <- data.table(country=factor("USA"))
+test(1483.3, merge(x,y,by="country",all=T), data.table(country=factor(c("US", "USA")), key="country"))
+setkey(y)
+test(1483.4, y[x], data.table(country=factor("US"), key="country"))
+
 ##########################