Handles #1642, #1728, #1463, #485. Full precision grouping/ordering/joining by default, i.e., no rounding.
Rdatatable · Jul 21, 2016 · f982e2e · f982e2e
1 parent fbac186
commit f982e2e
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 26 deletions.
diff --git a/man/setNumericRounding.Rd b/man/setNumericRounding.Rd
@@ -3,32 +3,37 @@
 \alias{getNumericRounding}
 \title{ Change or turn off numeric rounding }
 \description{
-  Change rounding to 0, 1 or 2 bytes when joining, grouping or ordering numeric (i.e. double, POSIXct) columns.
+Change rounding to 0, 1 or 2 bytes when joining, grouping or ordering numeric 
+(i.e. double, POSIXct) columns.
 }
 \usage{
 setNumericRounding(x)
 getNumericRounding()
 }
 \arguments{
-  \item{x}{ integer or numeric vector: 2 (default), 1 or 0 byte rounding }
+  \item{x}{ integer or numeric vector: 0 (default), 1 or 2 byte rounding }
 }
 \details{
-  Computers cannot represent some floating point numbers (such as 0.6) precisely, using base 2. This leads to unexpected behaviour when
-  joining or grouping columns of type 'numeric'; i.e. 'double', see example below.  To deal with this automatically for convenience, 
-  when joining or grouping, data.table rounds such data to apx 11 s.f. which is plenty of digits for many cases. This is achieved by
-  rounding the last 2 bytes off the significand.  Where this is not enough, \code{setNumericRounding} can be used to reduce to 1 byte
-  rounding, or no rounding (0 bytes rounded) for full precision.
-
-  It's bytes rather than bits because it's tied in with the radix sort algorithm for sorting numerics which sorts byte by byte. With the
-  default rounding of 2 bytes, at most 6 passes are needed. With no rounding, at most 8 passes are needed and hence may be slower. The
-  choice of default is not for speed however, but to avoid surprising results such as in the example below.
+Computers cannot represent some floating point numbers (such as 0.6) 
+precisely, using base 2. This leads to unexpected behaviour when joining or 
+grouping columns of type 'numeric'; i.e. 'double', see example below. In 
+cases where this is undesirable, data.table allows rounding such data to 
+approximately 11 significant figures, which is plenty for many cases. This is 
+achieved by rounding the last 2 bytes off the significand. Other possible 
+values are 1 byte rounding, or no rounding (full precision, the default).
+
+It's bytes rather than bits because it's tied in with the radix sort 
+algorithm for sorting numerics which sorts byte by byte. With the default 
+rounding of 0 bytes, at most 8 passes are needed. With rounding of 2 bytes, at 
+most 6 passes are needed (and therefore might be a tad faster).
 
-  For large numbers (integers > 2^31), we recommend using \code{bit64::integer64} rather than setting rounding to \code{0}.
-  
-  If you're using \code{POSIXct} type column with \emph{millisecond} (or lower) resolution, you might want to consider setting \code{setNumericRounding(1)} . This'll become the default for \code{POSIXct} types in the future, instead of the default \code{2}.
-}
+For large numbers (integers > 2^31), we recommend using 
+\code{bit64::integer64}, even though the default is to round off 0 bytes (full 
+precision).
+ }
 \value{
-    \code{setNumericRounding} returns no value; the new value is applied. \code{getNumericRounding} returns the current value: 0, 1 or 2.
+\code{setNumericRounding} returns no value; the new value is applied. 
+\code{getNumericRounding} returns the current value: 0, 1 or 2.
 }
 \seealso{
 \code{\link{datatable-optimize}}\cr
@@ -39,22 +44,20 @@ getNumericRounding()
 \examples{
 DT = data.table(a=seq(0,1,by=0.2),b=1:2, key="a")
 DT
-setNumericRounding(0)   # turn off rounding
+setNumericRounding(0)   # By default, rounding is turned off
 DT[.(0.4)]   # works
-DT[.(0.6)]   # no match, confusing since 0.6 is clearly there in DT
+DT[.(0.6)]   # no match, can be confusing since 0.6 is clearly there in DT
+             # happens due to floating point representation limitations
 
-setNumericRounding(2)   # restore default
-DT[.(0.6)]   # works as expected
+setNumericRounding(2)   # round off last 2 bytes
+DT[.(0.6)]   # works
 
 # using type 'numeric' for integers > 2^31 (typically ids)
 DT = data.table(id = c(1234567890123, 1234567890124, 1234567890125), val=1:3)
 print(DT, digits=15)
-DT[,.N,by=id]   # 1 row
+DT[,.N,by=id]   # 1 row, (last 2 bytes rounded)
 setNumericRounding(0)
-DT[,.N,by=id]   # 3 rows
+DT[,.N,by=id]   # 3 rows, (no rounding, default)
 # better to use bit64::integer64 for such ids
-setNumericRounding(2)
 }
 \keyword{ data }
-
-
diff --git a/src/forder.c b/src/forder.c
@@ -422,7 +422,8 @@ static void iradix_r(int *xsub, int *osub, int n, int radix)
 // + changed to MSD and hooked into forder framework here.
 // + replaced tolerance with rounding s.f.
 
-static int dround = 2;
+// No rounding by default, for now. Handles #1642, #1728, #1463, #485
+static int dround = 0;
 static unsigned long long dmask1;
 static unsigned long long dmask2;