From 78124a2b32d61a3ab1cc5798828195970438262b Mon Sep 17 00:00:00 2001
From: Andrea Canciani <ranma42@gmail.com>
Date: Wed, 4 May 2016 16:08:21 +0200
Subject: [PATCH 1/4] Fix fast path of float parsing on x87

The fast path of the float parser relies on the rounding to happen
exactly and directly to the correct number of bits. On x87, instead,
double rounding would occur as the FPU stack defaults to 80 bits of
precision.

This can be fixed by setting the precision of the FPU stack before
performing the int to float conversion. This can be achieved by
changing the value of the x87 control word. This is a somewhat common
operation that is in fact performed whenever a float needs to be
truncated to an integer, but it is undesirable to add its overhead for
code that does not rely on x87 for computations (i.e. on non-x86
architectures, or x86 architectures which perform FPU computations
using SSE).

Fixes `num::dec2flt::fast_path_correct` (on x87).
---
 src/libcore/lib.rs                   |  2 ++
 src/libcore/num/dec2flt/algorithm.rs | 47 +++++++++++++++++++++++-----
 2 files changed, 42 insertions(+), 7 deletions(-)
diff --git a/src/libcore/lib.rs b/src/libcore/lib.rs
index e1bbdf4a7ae41..a054e41b2084a 100644
--- a/src/libcore/lib.rs
+++ b/src/libcore/lib.rs
@@ -61,7 +61,9 @@
 #![cfg_attr(not(stage0), deny(warnings))]
 
 #![feature(allow_internal_unstable)]
+#![feature(asm)]
 #![feature(associated_type_defaults)]
+#![feature(cfg_target_feature)]
 #![feature(concat_idents)]
 #![feature(const_fn)]
 #![feature(cfg_target_has_atomic)]
diff --git a/src/libcore/num/dec2flt/algorithm.rs b/src/libcore/num/dec2flt/algorithm.rs
index e33c2814bf2f4..d64ad449e9abf 100644
--- a/src/libcore/num/dec2flt/algorithm.rs
+++ b/src/libcore/num/dec2flt/algorithm.rs
@@ -32,19 +32,49 @@ fn power_of_ten(e: i16) -> Fp {
     Fp { f: sig, e: exp }
 }
 
+#[cfg(any(not(target_arch="x86"), target_feature="sse2"))]
+mod fpu_precision {
+    pub fn set_precision<T>() { }
+}
+
+#[cfg(all(target_arch="x86", not(target_feature="sse2")))]
+mod fpu_precision {
+    use mem::size_of;
+    use ops::Drop;
+
+    pub struct FPUControlWord(u16);
+
+    fn set_cw(cw: u16) {
+        unsafe { asm!("fldcw $0" :: "m" (cw)) :: "volatile" }
+    }
+
+    pub fn set_precision<T>() -> FPUControlWord {
+        let cw = 0u16;
+        let cw_precision = match size_of::<T>() {
+            4 => 0x0000, // 32 bits
+            8 => 0x0200, // 64 bits
+            _ => 0x0300, // default, 80 bits
+        };
+        unsafe { asm!("fnstcw $0" : "=*m" (&cw)) ::: "volatile" }
+        set_cw((cw & 0xFCFF) | cw_precision);
+        FPUControlWord(cw)
+    }
+
+    impl Drop for FPUControlWord {
+        fn drop(&mut self) {
+            set_cw(self.0)
+        }
+    }
+}
+
 /// The fast path of Bellerophon using machine-sized integers and floats.
 ///
 /// This is extracted into a separate function so that it can be attempted before constructing
 /// a bignum.
 ///
 /// The fast path crucially depends on arithmetic being correctly rounded, so on x86
-/// without SSE or SSE2 it will be **wrong** (as in, off by one ULP occasionally), because the x87
-/// FPU stack will round to 80 bit first before rounding to 64/32 bit. However, as such hardware
-/// is extremely rare nowadays and in fact all in-tree target triples assume an SSE2-capable
-/// microarchitecture, there is little incentive to deal with that. There's a test that will fail
-/// when SSE or SSE2 is disabled, so people building their own non-SSE copy will get a heads up.
-///
-/// FIXME: It would nevertheless be nice if we had a good way to detect and deal with x87.
+/// without SSE or SSE2 it requires the precision of the x87 FPU stack to be changed
+/// so that it directly rounds to 64/32 bit.
 pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> {
     let num_digits = integral.len() + fractional.len();
     // log_10(f64::max_sig) ~ 15.95. We compare the exact value to max_sig near the end,
@@ -60,6 +90,9 @@ pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Opt
     if f > T::max_sig() {
         return None;
     }
+
+    let _cw = fpu_precision::set_precision::<T>();
+
     // The case e < 0 cannot be folded into the other branch. Negative powers result in
     // a repeating fractional part in binary, which are rounded, which causes real
     // (and occasioally quite significant!) errors in the final result.

From f96864dab6d5ed3ec96d850fef20490b1507bc85 Mon Sep 17 00:00:00 2001
From: Andrea Canciani <ranma42@gmail.com>
Date: Thu, 12 May 2016 19:03:15 +0200
Subject: [PATCH 2/4] Document the x87 control word

Explain the meaning of the fields of the control word and provide more
details about how the relevant one (Precision Control) is updated in
the fast path.
---
 src/libcore/num/dec2flt/algorithm.rs | 67 +++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 5 deletions(-)

diff --git a/src/libcore/num/dec2flt/algorithm.rs b/src/libcore/num/dec2flt/algorithm.rs
index d64ad449e9abf..6c53845202fbb 100644
--- a/src/libcore/num/dec2flt/algorithm.rs
+++ b/src/libcore/num/dec2flt/algorithm.rs
@@ -32,31 +32,87 @@ fn power_of_ten(e: i16) -> Fp {
     Fp { f: sig, e: exp }
 }
 
+// Most architectures floating point operations with explicit bit size, therefore the precision of
+// the computation is determined on a per-operation basis.
 #[cfg(any(not(target_arch="x86"), target_feature="sse2"))]
 mod fpu_precision {
     pub fn set_precision<T>() { }
 }
 
+// On x86, the x87 FPU is used for float operations if the SSE[2] extensions are not available.
+// The x87 FPU operates with 80 bits of precision by default, which means that operations will
+// round to 80 bits causing double rounding to happen when values are eventually represented as
+// 32/64 bit float values. To overcome this, the FPU control word can be set so that the
+// computations are performed in the desired precision.
 #[cfg(all(target_arch="x86", not(target_feature="sse2")))]
 mod fpu_precision {
     use mem::size_of;
     use ops::Drop;
 
+    /// A structure used to preserve the original value of the FPU control word, so that it can be
+    /// restored when the structure is dropped.
+    ///
+    /// The x87 FPU is a 16-bits register whose fields are as follows:
+    ///
+    ///    1111 11
+    ///    5432 10 98 76 5 4 3 2 1 0
+    ///   +----+--+--+--+-+-+-+-+-+-+
+    ///   |    |RC|PC|  |P|U|O|Z|D|I|
+    ///   |    |  |  |  |M|M|M|M|M|M|
+    ///   +----+--+--+--+-+-+-+-+-+-+
+    /// The fields are:
+    ///  - Invalid operation Mask
+    ///  - Denormal operand Mask
+    ///  - Zero divide Mask
+    ///  - Overflow Mask
+    ///  - Underflow Mask
+    ///  - Precision Mask
+    ///  - Precision Control
+    ///  - Rounding Control
+    ///
+    /// The fields with no name are unused (on FPUs more modern than 287).
+    ///
+    /// The 6 LSBs (bits 0-5) are the exception mask bits; each blocks a specific type of floating
+    /// point exceptions from being raised.
+    ///
+    /// The Precision Control field determines the precision of the operations performed by the
+    /// FPU. It can set to:
+    ///  - 0b00, single precision i.e. 32-bits
+    ///  - 0b10, double precision i.e. 64-bits
+    ///  - 0b11, double extended precision i.e. 80-bits (default state)
+    /// The 0b01 value is reserved and should not be used.
+    ///
+    /// The Rounding Control field determines how values which cannot be represented exactly are
+    /// rounded. It can be set to:
+    ///  - 0b00, round to nearest even (default state)
+    ///  - 0b01, round down (toward -inf)
+    ///  - 0b10, round up (toward +inf)
+    ///  - 0b11, round toward 0 (truncate)
     pub struct FPUControlWord(u16);
 
     fn set_cw(cw: u16) {
         unsafe { asm!("fldcw $0" :: "m" (cw)) :: "volatile" }
     }
 
+    /// Set the precision field of the FPU to `T` and return a `FPUControlWord`
     pub fn set_precision<T>() -> FPUControlWord {
         let cw = 0u16;
+
+        // Compute the value for the Precision Control field that is appropriate for `T`.
         let cw_precision = match size_of::<T>() {
             4 => 0x0000, // 32 bits
             8 => 0x0200, // 64 bits
             _ => 0x0300, // default, 80 bits
         };
+
+        // Get the original value of the control word to restore it later, when the
+        // `FPUControlWord` structure is dropped
         unsafe { asm!("fnstcw $0" : "=*m" (&cw)) ::: "volatile" }
+
+        // Set the control word to the desired precision. This is achieved by masking away the old
+        // precision (bits 8 and 9, 0x300) and replacing it with the precision flag computed above.
         set_cw((cw & 0xFCFF) | cw_precision);
+
         FPUControlWord(cw)
     }
 
@@ -71,10 +127,6 @@ mod fpu_precision {
 ///
 /// This is extracted into a separate function so that it can be attempted before constructing
 /// a bignum.
-///
-/// The fast path crucially depends on arithmetic being correctly rounded, so on x86
-/// without SSE or SSE2 it requires the precision of the x87 FPU stack to be changed
-/// so that it directly rounds to 64/32 bit.
 pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> {
     let num_digits = integral.len() + fractional.len();
     // log_10(f64::max_sig) ~ 15.95. We compare the exact value to max_sig near the end,
@@ -91,11 +143,16 @@ pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Opt
         return None;
     }
 
+    // The fast path crucially depends on arithmetic being rounded to the correct number of bits
+    // without any intermediate rounding. On x86 (without SSE or SSE2) this requires the precision
+    // of the x87 FPU stack to be changed so that it directly rounds to 64/32 bit.
+    // The `set_precision` function takes care of setting the precision on architectures which
+    // require setting it by changing the global state (like the control word of the x87 FPU).
     let _cw = fpu_precision::set_precision::<T>();
 
     // The case e < 0 cannot be folded into the other branch. Negative powers result in
     // a repeating fractional part in binary, which are rounded, which causes real
-    // (and occasioally quite significant!) errors in the final result.
+    // (and occasionally quite significant!) errors in the final result.
     if e >= 0 {
         Some(T::from_int(f) * T::short_fast_pow10(e as usize))
     } else {

From 88afeb9cba0f14c76f8455a881a8ef11ea8fb5f1 Mon Sep 17 00:00:00 2001
From: Andrea Canciani <ranma42@gmail.com>
Date: Mon, 16 May 2016 15:37:14 +0200
Subject: [PATCH 3/4] Cleanup documentation

Remove irrelevant information (and instead provide a pointer to the
reference documentation), replace the ASCII-art table with the
corresponding Markdown one, and make minor fixes.
---
 src/libcore/num/dec2flt/algorithm.rs | 41 +++++++---------------------
 1 file changed, 10 insertions(+), 31 deletions(-)

diff --git a/src/libcore/num/dec2flt/algorithm.rs b/src/libcore/num/dec2flt/algorithm.rs
index 6c53845202fbb..35a613ef5fce4 100644
--- a/src/libcore/num/dec2flt/algorithm.rs
+++ b/src/libcore/num/dec2flt/algorithm.rs
@@ -32,14 +32,14 @@ fn power_of_ten(e: i16) -> Fp {
     Fp { f: sig, e: exp }
 }
 
-// Most architectures floating point operations with explicit bit size, therefore the precision of
-// the computation is determined on a per-operation basis.
+// In most architectures, floating point operations have an explicit bit size, therefore the
+// precision of the computation is determined on a per-operation basis.
 #[cfg(any(not(target_arch="x86"), target_feature="sse2"))]
 mod fpu_precision {
     pub fn set_precision<T>() { }
 }
 
-// On x86, the x87 FPU is used for float operations if the SSE[2] extensions are not available.
+// On x86, the x87 FPU is used for float operations if the SSE/SSE2 extensions are not available.
 // The x87 FPU operates with 80 bits of precision by default, which means that operations will
 // round to 80 bits causing double rounding to happen when values are eventually represented as
 // 32/64 bit float values. To overcome this, the FPU control word can be set so that the
@@ -54,40 +54,19 @@ mod fpu_precision {
     ///
     /// The x87 FPU is a 16-bits register whose fields are as follows:
     ///
-    ///    1111 11
-    ///    5432 10 98 76 5 4 3 2 1 0
-    ///   +----+--+--+--+-+-+-+-+-+-+
-    ///   |    |RC|PC|  |P|U|O|Z|D|I|
-    ///   |    |  |  |  |M|M|M|M|M|M|
-    ///   +----+--+--+--+-+-+-+-+-+-+
-    /// The fields are:
-    ///  - Invalid operation Mask
-    ///  - Denormal operand Mask
-    ///  - Zero divide Mask
-    ///  - Overflow Mask
-    ///  - Underflow Mask
-    ///  - Precision Mask
-    ///  - Precision Control
-    ///  - Rounding Control
+    /// | 12-15 | 10-11 | 8-9 | 6-7 |  5 |  4 |  3 |  2 |  1 |  0 |
+    /// |------:|------:|----:|----:|---:|---:|---:|---:|---:|---:|
+    /// |       | RC    | PC  |     | PM | UM | OM | ZM | DM | IM |
     ///
-    /// The fields with no name are unused (on FPUs more modern than 287).
+    /// The documentation for all of the fields is available in the IA-32 Architectures Software
+    /// Developer's Manual (Volume 1).
     ///
-    /// The 6 LSBs (bits 0-5) are the exception mask bits; each blocks a specific type of floating
-    /// point exceptions from being raised.
-    ///
-    /// The Precision Control field determines the precision of the operations performed by the
-    /// FPU. It can set to:
+    /// The only field which is relevant for the following code is PC, Precision Control. This
+    /// field determines the precision of the operations performed by the FPU. It can be set to:
     ///  - 0b00, single precision i.e. 32-bits
     ///  - 0b10, double precision i.e. 64-bits
     ///  - 0b11, double extended precision i.e. 80-bits (default state)
     /// The 0b01 value is reserved and should not be used.
-    ///
-    /// The Rounding Control field determines how values which cannot be represented exactly are
-    /// rounded. It can be set to:
-    ///  - 0b00, round to nearest even (default state)
-    ///  - 0b01, round down (toward -inf)
-    ///  - 0b10, round up (toward +inf)
-    ///  - 0b11, round toward 0 (truncate)
     pub struct FPUControlWord(u16);
 
     fn set_cw(cw: u16) {

From 4ec1f8de418a96dc2c5f3e70e4e344fc307569e3 Mon Sep 17 00:00:00 2001
From: Andrea Canciani <ranma42@gmail.com>
Date: Mon, 16 May 2016 15:41:45 +0200
Subject: [PATCH 4/4] Fix `asm!` blocks

The `volatile` modifier was incorrectly written outside of the `asm!`
blocks.
---
 src/libcore/num/dec2flt/algorithm.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libcore/num/dec2flt/algorithm.rs b/src/libcore/num/dec2flt/algorithm.rs
index 35a613ef5fce4..c7af46a1e4f6b 100644
--- a/src/libcore/num/dec2flt/algorithm.rs
+++ b/src/libcore/num/dec2flt/algorithm.rs
@@ -70,7 +70,7 @@ mod fpu_precision {
     pub struct FPUControlWord(u16);
 
     fn set_cw(cw: u16) {
-        unsafe { asm!("fldcw $0" :: "m" (cw)) :: "volatile" }
+        unsafe { asm!("fldcw $0" :: "m" (cw) :: "volatile") }
     }
 
     /// Set the precision field of the FPU to `T` and return a `FPUControlWord`
@@ -86,7 +86,7 @@ mod fpu_precision {
 
         // Get the original value of the control word to restore it later, when the
         // `FPUControlWord` structure is dropped
-        unsafe { asm!("fnstcw $0" : "=*m" (&cw)) ::: "volatile" }
+        unsafe { asm!("fnstcw $0" : "=*m" (&cw) ::: "volatile") }
 
         // Set the control word to the desired precision. This is achieved by masking away the old
         // precision (bits 8 and 9, 0x300) and replacing it with the precision flag computed above.