From f65dde31151c526725d1d4af27465f05d425d537 Mon Sep 17 00:00:00 2001
From: Lawrence Wu <lawrencejwu@gmail.com>
Date: Sun, 21 Feb 2021 19:19:25 -0600
Subject: [PATCH] Implement floating point conversion with ryu

---
 Data/ByteString/Builder.hs                    |   2 +
 Data/ByteString/Builder/ASCII.hs              |  19 -
 Data/ByteString/Builder/RealFloat.hs          | 128 ++++++
 Data/ByteString/Builder/RealFloat/D2S.hs      | 292 +++++++++++++
 Data/ByteString/Builder/RealFloat/F2S.hs      | 257 +++++++++++
 Data/ByteString/Builder/RealFloat/Internal.hs | 408 ++++++++++++++++++
 .../Builder/RealFloat/TableGenerator.hs       | 185 ++++++++
 bytestring.cabal                              |   7 +-
 .../Data/ByteString/Builder/Prim/TestUtils.hs |  10 +
 .../builder/Data/ByteString/Builder/Tests.hs  | 301 +++++++++++++
 10 files changed, 1589 insertions(+), 20 deletions(-)
 create mode 100644 Data/ByteString/Builder/RealFloat.hs
 create mode 100644 Data/ByteString/Builder/RealFloat/D2S.hs
 create mode 100644 Data/ByteString/Builder/RealFloat/F2S.hs
 create mode 100644 Data/ByteString/Builder/RealFloat/Internal.hs
 create mode 100644 Data/ByteString/Builder/RealFloat/TableGenerator.hs

diff --git a/Data/ByteString/Builder.hs b/Data/ByteString/Builder.hs
index cf86208ce..09253f86f 100644
--- a/Data/ByteString/Builder.hs
+++ b/Data/ByteString/Builder.hs
@@ -252,6 +252,7 @@ module Data.ByteString.Builder
     , stringUtf8
 
     , module Data.ByteString.Builder.ASCII
+    , module Data.ByteString.Builder.RealFloat
 
     ) where
 
@@ -259,6 +260,7 @@ import           Data.ByteString.Builder.Internal
 import qualified Data.ByteString.Builder.Prim  as P
 import qualified Data.ByteString.Lazy.Internal as L
 import           Data.ByteString.Builder.ASCII
+import           Data.ByteString.Builder.RealFloat
 
 import           Data.String (IsString(..))
 import           System.IO (Handle)
diff --git a/Data/ByteString/Builder/ASCII.hs b/Data/ByteString/Builder/ASCII.hs
index 3643bdade..12e674544 100644
--- a/Data/ByteString/Builder/ASCII.hs
+++ b/Data/ByteString/Builder/ASCII.hs
@@ -36,9 +36,6 @@ module Data.ByteString.Builder.ASCII
     , word64Dec
     , wordDec
 
-    , floatDec
-    , doubleDec
-
       -- *** Hexadecimal numbers
 
       -- | Encoding positive integers as hexadecimal numbers using lower-case
@@ -191,22 +188,6 @@ wordDec :: Word -> Builder
 wordDec = P.primBounded P.wordDec
 
 
--- Floating point numbers
--------------------------
-
--- TODO: Use Bryan O'Sullivan's double-conversion package to speed it up.
-
--- | /Currently slow./ Decimal encoding of an IEEE 'Float'.
-{-# INLINE floatDec #-}
-floatDec :: Float -> Builder
-floatDec = string7 . show
-
--- | /Currently slow./ Decimal encoding of an IEEE 'Double'.
-{-# INLINE doubleDec #-}
-doubleDec :: Double -> Builder
-doubleDec = string7 . show
-
-
 ------------------------------------------------------------------------------
 -- Hexadecimal Encoding
 ------------------------------------------------------------------------------
diff --git a/Data/ByteString/Builder/RealFloat.hs b/Data/ByteString/Builder/RealFloat.hs
new file mode 100644
index 000000000..0e8da7bfc
--- /dev/null
+++ b/Data/ByteString/Builder/RealFloat.hs
@@ -0,0 +1,128 @@
+
+module Data.ByteString.Builder.RealFloat
+  ( FFFormat(..)
+  , floatDec
+  , doubleDec
+  , formatFloat
+  , formatDouble
+  ) where
+
+import Data.ByteString.Builder.Internal (Builder)
+import qualified Data.ByteString.Builder.RealFloat.Internal as R
+import qualified Data.ByteString.Builder.RealFloat.F2S as RF
+import qualified Data.ByteString.Builder.RealFloat.D2S as RD
+import qualified Data.ByteString.Builder.Prim as BP
+import GHC.Float (FFFormat(..), roundTo)
+import GHC.Word (Word64(..))
+import GHC.Show (intToDigit)
+
+{-# INLINABLE floatDec #-}
+floatDec :: Float -> Builder
+floatDec = formatFloat FFGeneric Nothing
+
+{-# INLINABLE doubleDec #-}
+doubleDec :: Double -> Builder
+doubleDec = formatDouble FFGeneric Nothing
+
+{-# INLINABLE formatFloat #-}
+-- TODO precision for general and exponent formats
+formatFloat :: FFFormat -> Maybe Int -> Float -> Builder
+formatFloat fmt prec f =
+  case fmt of
+    FFGeneric ->
+      case specialStr f of
+        Just b -> b
+        Nothing ->
+          if e' >= 0 && e' <= 7
+             then sign f `mappend` showFixed (fromIntegral m) e' prec
+             else BP.primBounded (R.toCharsScientific (f < 0) m e) ()
+      where (RF.FloatingDecimal m e) = RF.f2Intermediate f
+            e' = fromIntegral e + R.decimalLength9 m
+    FFExponent -> RF.f2s f
+    FFFixed -> d2Fixed (realToFrac f) prec
+
+{-# INLINABLE formatDouble #-}
+formatDouble :: FFFormat -> Maybe Int -> Double -> Builder
+-- TODO precision for general and exponent formats
+formatDouble fmt prec f =
+  case fmt of
+    FFGeneric ->
+      case specialStr f of
+        Just b -> b
+        Nothing ->
+          if e' >= 0 && e' <= 7
+             then sign f `mappend` showFixed m e' prec
+             else BP.primBounded (R.toCharsScientific (f < 0) m e) ()
+      where (RD.FloatingDecimal m e) = RD.d2Intermediate f
+            e' = fromIntegral e + R.decimalLength17 m
+    FFExponent -> RD.d2s f
+    FFFixed -> d2Fixed f prec
+
+-- show fixed floating point matching show / showFFloat output by dropping
+-- digits after exponentiation precision
+d2Fixed :: Double -> Maybe Int -> Builder
+d2Fixed f prec =
+  case specialStr f of
+    Just b -> b
+    Nothing -> sign f `mappend` showFixed m e' prec
+  where (RD.FloatingDecimal m e) = RD.d2Intermediate f
+        olength = R.decimalLength17 m
+        -- NB: exponent in exponential format is e' - 1
+        e' = fromIntegral e + olength
+
+-- | Char7 encode a 'Char'.
+{-# INLINE char7 #-}
+char7 :: Char -> Builder
+char7 = BP.primFixed BP.char7
+
+-- | Char7 encode a 'String'.
+{-# INLINE string7 #-}
+string7 :: String -> Builder
+string7 = BP.primMapListFixed BP.char7
+
+sign :: RealFloat a => a -> Builder
+sign f = if f < 0 then char7 '-' else mempty
+
+specialStr :: RealFloat a => a -> Maybe Builder
+specialStr f
+  | isNaN f          = Just $ string7 "NaN"
+  | isInfinite f     = Just $ sign f `mappend` string7 "Infinity"
+  | isNegativeZero f = Just $ string7 "-0.0"
+  | f == 0           = Just $ string7 "0.0"
+  | otherwise        = Nothing
+
+digits :: Word64 -> [Int]
+digits w = go [] w
+  where go ds 0 = ds
+        go ds c = let (q, r) = R.dquotRem10Boxed c
+                   in go (fromIntegral r:ds) q
+
+showFixed :: Word64 -> Int -> Maybe Int -> Builder
+showFixed m e prec =
+  case prec of
+    Nothing
+      | e <= 0 -> char7 '0'
+               `mappend` char7 '.'
+               `mappend` string7 (replicate (-e) '0')
+               `mappend` mconcat (digitsToBuilder ds)
+      | otherwise ->
+          let f 0 s     rs = mk0 (reverse s) `mappend` char7 '.' `mappend` mk0 rs
+              f n s     [] = f (n-1) (char7 '0':s) []
+              f n s (r:rs) = f (n-1) (r:s) rs
+           in f e [] (digitsToBuilder ds)
+    Just p
+      | e >= 0 ->
+          let (ei, is') = roundTo 10 (p' + e) ds
+              (ls, rs) = splitAt (e + ei) (digitsToBuilder is')
+           in mk0 ls `mappend` mkDot rs
+      | otherwise ->
+          let (ei, is') = roundTo 10 p' (replicate (-e) 0 ++ ds)
+              (b:bs) = digitsToBuilder (if ei > 0 then is' else 0:is')
+           in b `mappend` mkDot bs
+          where p' = max p 0
+  where
+    mk0 ls = case ls of [] -> char7 '0'; _ -> mconcat ls
+    mkDot rs = if null rs then mempty else char7 '.' `mappend` mconcat rs
+    ds = digits m
+    digitsToBuilder = fmap (char7 . intToDigit)
+
diff --git a/Data/ByteString/Builder/RealFloat/D2S.hs b/Data/ByteString/Builder/RealFloat/D2S.hs
new file mode 100644
index 000000000..e6ea77447
--- /dev/null
+++ b/Data/ByteString/Builder/RealFloat/D2S.hs
@@ -0,0 +1,292 @@
+{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE Strict #-}
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples #-}
+
+module Data.ByteString.Builder.RealFloat.D2S
+    ( FloatingDecimal(..)
+    , d2s
+    , d2Intermediate
+    ) where
+
+import Data.Array.Base
+import Data.Bits ((.|.), (.&.))
+import Data.ByteString.Builder.Internal (Builder)
+import Data.ByteString.Builder.Prim (primBounded)
+import Data.ByteString.Builder.RealFloat.Internal
+import Data.ByteString.Builder.RealFloat.TableGenerator
+import Data.Maybe (fromMaybe)
+import GHC.Exts
+import GHC.Int (Int32(..), Int64(..))
+import GHC.Word (Word32(..), Word64(..))
+
+double_pow5_inv_split :: UArray Word64 Word128
+double_pow5_inv_split = listArray (0, fromIntegral double_max_inv_split)
+    $(gen_table_d double_max_inv_split (finv $ fromIntegral double_pow5_inv_bitcount))
+
+double_pow5_split :: UArray Word64 Word128
+double_pow5_split = listArray (0, fromIntegral double_max_split)
+    $(gen_table_d double_max_split (fnorm $ fromIntegral double_pow5_bitcount))
+
+double_mantissa_bits :: Word64
+double_mantissa_bits = 52
+
+double_exponent_bits :: Word64
+double_exponent_bits = 11
+
+double_bias :: Word64
+double_bias = 1023
+
+data FloatingDecimal = FloatingDecimal
+  { dmantissa :: !Word64
+  , dexponent :: !Int32
+  } deriving (Show, Eq)
+
+toS :: Word64 -> Int64
+toS = fromIntegral
+
+toU :: Int64 -> Word64
+toU = fromIntegral
+
+d2dSmallInt :: Word64 -> Word32 -> Maybe FloatingDecimal
+d2dSmallInt m e =
+  let m2 = (1 .<< double_mantissa_bits) .|. m
+      e2 = fromIntegral e - toS double_bias - toS double_mantissa_bits
+      fraction = m2 .&. mask (toU $ -e2)
+   in case () of
+        _ -- f = m2 * 2^e2 >= 2^53 is an integer.
+          -- Ignore this case for now.
+          | e2 > 0 -> Nothing
+          -- f < 1
+          | e2 < -52 -> Nothing
+          -- Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52:
+          --    1 <= f = m2 / 2^-e2 < 2^53.
+          -- Test if the lower -e2 bits of the significand are 0, i.e.
+          -- whether the fraction is 0.
+          | fraction /= 0 -> Nothing
+          -- f is an integer in the range [1, 2^53).
+          -- Note: mantissa might contain trailing (decimal) 0's.
+          -- Note: since 2^53 < 10^16, there is no need to adjust decimalLength17().
+          | otherwise -> Just $ FloatingDecimal (m2 .>> (toU $ -e2)) 0
+
+unifySmallTrailing :: FloatingDecimal -> FloatingDecimal
+unifySmallTrailing fd@(FloatingDecimal (W64# m) e) =
+  let !(# q, r #) = dquotRem10 m
+   in case r `neWord#` 0## of
+        1# -> fd
+        0# -> unifySmallTrailing $ FloatingDecimal (W64# q) (e + 1)
+
+-- TODO: 128-bit intrinsics
+mulShift64Unboxed :: Word# -> (# Word#, Word# #) -> Int# -> Word#
+mulShift64Unboxed m (# factorHi, factorLo #) shift =
+  let !(# b0Hi, _ #) = m `timesWord2#` factorLo
+      !(# b1Hi, b1Lo #) = m `timesWord2#` factorHi
+      total = b0Hi `plusWord#` b1Lo
+      high = b1Hi `plusWord#` (int2Word# (total `ltWord#` b0Hi))
+      dist = shift -# 64#
+   in (high `uncheckedShiftL#` (64# -# dist)) `or#` (total `uncheckedShiftRL#` dist)
+
+get_double_pow5_inv_split :: Int# -> (# Word#, Word# #)
+get_double_pow5_inv_split i =
+  let !(UArray _ _ _ arr) = double_pow5_inv_split
+   in (# indexWord64Array# arr (i *# 2#), indexWord64Array# arr (i *# 2# +# 1#) #)
+
+get_double_pow5_split :: Int# -> (# Word#, Word# #)
+get_double_pow5_split i =
+  let !(UArray _ _ _ arr) = double_pow5_split
+   in (# indexWord64Array# arr (i *# 2#), indexWord64Array# arr (i *# 2# +# 1#) #)
+
+mulPow5DivPow2 :: Word# -> Int# -> Int# -> Word#
+mulPow5DivPow2 m i j = mulShift64Unboxed m (get_double_pow5_split i) j
+
+mulPow5InvDivPow2 :: Word# -> Word# -> Int# -> Word#
+mulPow5InvDivPow2 m q j = mulShift64Unboxed m (get_double_pow5_inv_split (word2Int# q)) j
+
+
+acceptBounds :: Word64 -> Bool
+acceptBounds (W64# v) = boxToBool (acceptBoundsUnboxed v)
+
+data BoundsState = BoundsState
+    { vu :: Word64
+    , vv :: Word64
+    , vw :: Word64
+    , lastRemovedDigit :: Word64
+    , vuIsTrailingZeros :: Bool
+    , vvIsTrailingZeros :: Bool
+    } deriving Show
+
+trimTrailing' :: BoundsState -> (BoundsState, Int32)
+trimTrailing' d
+  | vw' > vu' =
+    let (vv', vvRem) = dquotRem10Boxed $ vv d
+     in fmap ((+) 1) . trimTrailing' $
+         d { vu = vu'
+           , vv = vv'
+           , vw = vw'
+           , lastRemovedDigit = vvRem
+           , vuIsTrailingZeros = vuIsTrailingZeros d && vuRem == 0
+           , vvIsTrailingZeros = vvIsTrailingZeros d && lastRemovedDigit d == 0
+           }
+  | otherwise = (d, 0)
+  where
+    (vu', vuRem) = dquotRem10Boxed $ vu d
+    vw' = dwrapped dquot10 (vw d)
+
+trimTrailing'' :: BoundsState -> (BoundsState, Int32)
+trimTrailing'' d
+  | vuRem == 0 =
+    let (vv', vvRem) = dquotRem10Boxed $ vv d
+        vw' = dwrapped dquot10 (vw d)
+     in fmap ((+) 1) . trimTrailing'' $
+         d { vu = vu'
+           , vv = vv'
+           , vw = vw'
+           , lastRemovedDigit = vvRem
+           , vvIsTrailingZeros = vvIsTrailingZeros d && lastRemovedDigit d == 0
+           }
+  | otherwise = (d, 0)
+  where
+    (vu', vuRem) = dquotRem10Boxed $ vu d
+
+trimTrailing :: BoundsState -> (BoundsState, Int32)
+trimTrailing d =
+  let (d', r) = trimTrailing' d
+      (d'', r') = if vuIsTrailingZeros d'
+                     then trimTrailing'' d'
+                     else (d', 0)
+      res = if vvIsTrailingZeros d'' && lastRemovedDigit d'' == 5 && vv d'' `rem` 2 == 0
+               -- set `{ lastRemovedDigit = 4 }` to round-even
+               then d''
+               else d''
+   in (res, r + r')
+
+trimNoTrailing'' :: Word# -> Word# -> Word# -> Word# -> Int# -> (# Word#, Word#, Word#, Int# #)
+trimNoTrailing'' u' v' w' lastRemoved count =
+  case vw' `gtWord#` vu' of
+    1# -> let !(# vv', ld #) = dquotRem10 v'
+           in trimNoTrailing' vu' vv' vw' ld (count +# 1#)
+    0# -> (# u', v', lastRemoved , count #)
+  where
+      vu' = dquot10 u'
+      vw' = dquot10 w'
+
+trimNoTrailing' :: Word# -> Word# -> Word# -> Word# -> Int# -> (# Word#, Word#, Word#, Int# #)
+trimNoTrailing' u' v' w' lastRemoved count =
+  -- Loop iterations below (approximately), without div 100 optimization:
+  -- 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
+  -- Loop iterations below (approximately), with div 100 optimization:
+  -- 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
+  let vw' = dquot100 w'
+      vu' = dquot100 u'
+   in case vw' `gtWord#` vu' of
+        1# -> let vv' = dquot100 v'
+                  ld = dquot10 (v' `minusWord#` (vv' `timesWord#` 100##))
+               in trimNoTrailing'' vu' vv' vw' ld (count +# 2#)
+        0# -> trimNoTrailing'' u' v' w' lastRemoved count
+
+trimNoTrailing :: BoundsState -> (BoundsState, Int32)
+trimNoTrailing (BoundsState (W64# u' ) (W64# v') (W64# w') (W64# ld) _ _) =
+  let !(# vu', vv', ld', c' #) = trimNoTrailing' u' v' w' ld 0#
+   in (BoundsState (W64# vu') (W64# vv') 0 (W64# ld') False False, I32# c')
+
+d2dGT :: Int32 -> Word64 -> Word64 -> Word64 -> (BoundsState, Int32)
+d2dGT (I32# e2) (W64# u) (W64# v) (W64# w) =
+  let q = int2Word# (log10pow2Unboxed e2 -# (e2 ># 3#))
+      e10 = word2Int# q
+      k = unbox double_pow5_inv_bitcount +# pow5bitsUnboxed (word2Int# q) -# 1#
+      i = (negateInt# e2) +# word2Int# q +# k
+      u' = mulPow5InvDivPow2 u q i
+      v' = mulPow5InvDivPow2 v q i
+      w' = mulPow5InvDivPow2 w q i
+      !(# vvTrailing, vuTrailing, vw' #) =
+        case () of
+          _ | boxToBool ((q `leWord#` 21##) `andI#` (frem5 v `eqWord#` 0##))
+                -> (# multipleOfPowerOf5_UnboxedB v q, False, w' #)
+            | boxToBool ((q `leWord#` 21##) `andI#` acceptBoundsUnboxed v)
+                -> (# False, multipleOfPowerOf5_UnboxedB u q, w' #)
+            | boxToBool (q `leWord#` 21##)
+                -> (# False, False, w' `minusWord#` int2Word# (multipleOfPowerOf5_Unboxed w q) #)
+            | otherwise
+                -> (# False, False, w' #)
+   in (BoundsState (W64# u') (W64# v') (W64# vw') 0 vuTrailing vvTrailing, (I32# e10))
+
+d2dLT :: Int32 -> Word64 -> Word64 -> Word64 -> (BoundsState, Int32)
+d2dLT (I32# e2) (W64# u) (W64# v) (W64# w) =
+  let nege2 = negateInt# e2
+      q = int2Word# (log10pow5Unboxed nege2 -# (nege2 ># 1#))
+      e10 = word2Int# q +# e2
+      i = nege2 -# word2Int# q
+      k = pow5bitsUnboxed i -# unbox double_pow5_bitcount
+      j = word2Int# q -# k
+      u' = mulPow5DivPow2 u i j
+      v' = mulPow5DivPow2 v i j
+      w' = mulPow5DivPow2 w i j
+      !(# vvTrailing, vuTrailing, vw' #) =
+        case () of
+          _ | boxToBool ((q `leWord#` 1##) `andI#` acceptBoundsUnboxed v)
+                -> (# True, boxToBool ((w `minusWord#` v) `eqWord#` 2##), w' #) -- mmShift == 1
+            | boxToBool (q `leWord#` 1##)
+                -> (# True, False, w' `minusWord#` 1## #)
+            | boxToBool (q `ltWord#` 63##)
+                -> (# boxToBool (multipleOfPowerOf2Unboxed v (q `minusWord#` 1##)), False, w' #)
+            | otherwise
+                -> (# False, False, w' #)
+   in (BoundsState (W64# u') (W64# v') (W64# vw') 0 vuTrailing vvTrailing, (I32# e10))
+
+roundUp :: Bool -> BoundsState -> Bool
+roundUp b s = (vv s == vu s && b) || lastRemovedDigit s >= 5
+
+calculate :: Bool -> BoundsState -> Word64
+calculate b s = vv s + asWord (roundUp b s)
+
+d2d :: Word64 -> Word32 -> FloatingDecimal
+d2d m e =
+  let mf = if e == 0
+              then m
+              else (1 .<< double_mantissa_bits) .|. m
+      ef = if e == 0
+              then toS 1 - toS double_bias - toS double_mantissa_bits
+              else fromIntegral e - toS double_bias - toS double_mantissa_bits
+      e2 = fromIntegral ef - 2 :: Int32
+      -- Step 2. 3-tuple (u, v, w) * 2**e2
+      u = 4 * mf - 1 - asWord (m /= 0 || e <= 1)
+      v = 4 * mf
+      w = 4 * mf + 2
+      -- Step 3. convert to decimal power base
+      (state, e10) =
+        if e2 >= 0
+           then d2dGT e2 u v w
+           else d2dLT e2 u v w
+      -- Step 4: Find the shortest decimal representation in the interval of
+      -- valid representations.
+      (output, removed) =
+        if vvIsTrailingZeros state || vuIsTrailingZeros state
+           then pmap (\s -> calculate (not (acceptBounds v)
+                                    || not (vuIsTrailingZeros s)) s)
+                                      $ trimTrailing state
+           else pmap (calculate True) $ trimNoTrailing state
+      e' = e10 + removed
+   in FloatingDecimal output e'
+
+breakdown :: Double -> (Bool, Word64, Word64)
+breakdown f =
+  let bits = dcoerceToWord f
+      sign = ((bits .>> (double_mantissa_bits + double_exponent_bits)) .&. 1) /= 0
+      mantissa = bits .&. mask double_mantissa_bits
+      expo = (bits .>> double_mantissa_bits) .&. mask double_exponent_bits
+   in (sign, mantissa, expo)
+
+{-# INLINE d2s' #-}
+d2s' :: (Bool -> Word64 -> Int32 -> a) -> (Bool -> Bool -> Bool -> a) -> Double -> a
+d2s' formatter specialFormatter d =
+  let (sign, mantissa, expo) = breakdown d
+   in if (expo == mask double_exponent_bits) || (expo == 0 && mantissa == 0)
+         then specialFormatter sign (expo > 0) (mantissa > 0)
+         else let v = unifySmallTrailing <$> d2dSmallInt mantissa (fromIntegral expo)
+                  FloatingDecimal m e = fromMaybe (d2d mantissa (fromIntegral expo)) v
+               in formatter sign m e
+
+d2s :: Double -> Builder
+d2s d = primBounded (d2s' toCharsScientific special d) ()
+
+d2Intermediate :: Double -> FloatingDecimal
+d2Intermediate = d2s' (const FloatingDecimal) (\_ _ _ -> FloatingDecimal 0 0)
diff --git a/Data/ByteString/Builder/RealFloat/F2S.hs b/Data/ByteString/Builder/RealFloat/F2S.hs
new file mode 100644
index 000000000..023142893
--- /dev/null
+++ b/Data/ByteString/Builder/RealFloat/F2S.hs
@@ -0,0 +1,257 @@
+{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE Strict #-}
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples #-}
+
+module Data.ByteString.Builder.RealFloat.F2S
+    ( FloatingDecimal(..)
+    , f2s
+    , f2Intermediate
+    ) where
+
+import Data.Array.Unboxed
+import Data.Array.Base (UArray(..))
+import Data.Bits ((.|.), (.&.))
+import Data.ByteString.Builder.Internal (Builder)
+import Data.ByteString.Builder.Prim (primBounded)
+import Data.ByteString.Builder.RealFloat.Internal
+import Data.ByteString.Builder.RealFloat.TableGenerator
+import GHC.Exts
+import GHC.Int (Int32(..))
+import GHC.Word (Word32(..), Word64(..))
+
+float_pow5_inv_split :: UArray Word32 Word64
+float_pow5_inv_split = listArray (0, fromIntegral float_max_inv_split)
+    $(gen_table_f float_max_inv_split (finv $ fromIntegral float_pow5_inv_bitcount))
+
+float_pow5_split :: UArray Word32 Word64
+float_pow5_split = listArray (0, fromIntegral float_max_split)
+    $(gen_table_f float_max_split (fnorm $ fromIntegral float_pow5_bitcount))
+
+float_mantissa_bits :: Word32
+float_mantissa_bits = 23
+
+float_exponent_bits :: Word32
+float_exponent_bits = 8
+
+float_bias :: Word32
+float_bias = 127
+
+data FloatingDecimal = FloatingDecimal
+  { fmantissa :: !Word32
+  , fexponent :: !Int32
+  } deriving (Show, Eq)
+
+toS :: Word32 -> Int32
+toS = fromIntegral
+
+mulShift32Unboxed :: Word# -> Word# -> Int# -> Word#
+mulShift32Unboxed m factor shift =
+  let factorLo = narrow32Word# factor
+      factorHi = factor `uncheckedShiftRL#` 32#
+      bits0 = m `timesWord#` factorLo
+      bits1 = m `timesWord#` factorHi
+      total  = (bits0 `uncheckedShiftRL#` 32#) `plusWord#` bits1
+   in narrow32Word# (total `uncheckedShiftRL#` (shift -# 32#))
+
+get_float_pow5_inv_split :: Int# -> Word#
+get_float_pow5_inv_split i =
+  let !(UArray _ _ _ arr) = float_pow5_inv_split
+   in indexWord64Array# arr i
+
+get_float_pow5_split :: Int# -> Word#
+get_float_pow5_split i =
+  let !(UArray _ _ _ arr) = float_pow5_split
+   in indexWord64Array# arr i
+
+mulPow5InvDivPow2 :: Word# -> Word# -> Int# -> Word#
+mulPow5InvDivPow2 m q j = mulShift32Unboxed m (get_float_pow5_inv_split (word2Int# q)) j
+
+mulPow5DivPow2 :: Word# -> Int# -> Int# -> Word#
+mulPow5DivPow2 m i j = mulShift32Unboxed m (get_float_pow5_split i) j
+
+acceptBounds :: Word32 -> Bool
+acceptBounds (W32# v) = boxToBool (acceptBoundsUnboxed v)
+
+data BoundsState = BoundsState
+    { vu :: Word32
+    , vv :: Word32
+    , vw :: Word32
+    , lastRemovedDigit :: Word32
+    , vuIsTrailingZeros :: Bool
+    , vvIsTrailingZeros :: Bool
+    }
+
+trimTrailing' :: BoundsState -> (BoundsState, Int32)
+trimTrailing' d
+  | vw' > vu' =
+    let (vv', vvRem) = fquotRem10Boxed $ vv d
+     in fmap ((+) 1) . trimTrailing' $
+         d { vu = vu'
+           , vv = vv'
+           , vw = vw'
+           , lastRemovedDigit = vvRem
+           , vuIsTrailingZeros = vuIsTrailingZeros d && vuRem == 0
+           , vvIsTrailingZeros = vvIsTrailingZeros d && lastRemovedDigit d == 0
+           }
+  | otherwise = (d, 0)
+  where
+    (vu', vuRem) = fquotRem10Boxed $ vu d
+    vw' = fwrapped fquot10 (vw d)
+
+trimTrailing'' :: BoundsState -> (BoundsState, Int32)
+trimTrailing'' d
+  | vuRem == 0 =
+    let (vv', vvRem) = fquotRem10Boxed $ vv d
+        vw' = fwrapped fquot10 (vw d)
+     in fmap ((+) 1) . trimTrailing'' $
+         d { vu = vu'
+           , vv = vv'
+           , vw = vw'
+           , lastRemovedDigit = vvRem
+           , vvIsTrailingZeros = vvIsTrailingZeros d && lastRemovedDigit d == 0
+           }
+  | otherwise = (d, 0)
+  where
+    (vu', vuRem) = fquotRem10Boxed $ vu d
+
+trimTrailing :: BoundsState -> (BoundsState, Int32)
+trimTrailing d =
+  let (d', r) = trimTrailing' d
+      (d'', r') = if vuIsTrailingZeros d'
+                     then trimTrailing'' d'
+                     else (d', 0)
+      res = if vvIsTrailingZeros d'' && lastRemovedDigit d'' == 5 && vv d'' `rem` 2 == 0
+               -- set `{ lastRemovedDigit = 4 }` to round-even
+               then d''
+               else d''
+   in (res, r + r')
+
+trimNoTrailing' :: Word# -> Word# -> Word# -> Word# -> Int# -> (# Word#, Word#, Word#, Int# #)
+trimNoTrailing' u' v' w' lastRemoved count =
+  case vw' `gtWord#` vu' of
+    1# -> let !(# vv', ld #) = fquotRem10 v'
+           in trimNoTrailing' vu' vv' vw' ld (count +# 1#)
+    0# -> (# u', v', lastRemoved , count #)
+  where
+    vu' = fquot10 u'
+    vw' = fquot10 w'
+
+trimNoTrailing :: BoundsState -> (BoundsState, Int32)
+trimNoTrailing (BoundsState (W32# u') (W32# v') (W32# w') (W32# ld) _ _) =
+  let !(# vu', vv', ld', c' #) = trimNoTrailing' u' v' w' ld 0#
+   in (BoundsState (W32# vu') (W32# vv') 0 (W32# ld') False False, I32# c')
+
+f2dGT :: Int32 -> Word32 -> Word32 -> Word32 -> (BoundsState, Int32)
+f2dGT (I32# e2) (W32# u) (W32# v) (W32# w) =
+  let q = int2Word# (log10pow2Unboxed e2)
+      e10 = word2Int# q
+      k = unbox float_pow5_inv_bitcount +# pow5bitsUnboxed (word2Int# q) -# 1#
+      i = negateInt# e2 +# word2Int# q +# k
+      u' = mulPow5InvDivPow2 u q i
+      v' = mulPow5InvDivPow2 v q i
+      w' = mulPow5InvDivPow2 w q i
+      lastRemoved =
+        case (q `neWord#` 0##) `andI#` ((fquot10 (w' `minusWord#` 1##)) `leWord#` fquot10 u') of
+          -- We need to know one removed digit even if we are not going to loop
+          -- below. We could use q = X - 1 above, except that would require 33
+          -- bits for the result, and we've found that 32-bit arithmetic is
+          -- faster even on 64-bit machines.
+          1# -> let l = unbox float_pow5_inv_bitcount +# pow5bitsUnboxed (word2Int# q -# 1#) -# 1#
+                 in frem10 (mulPow5InvDivPow2 v (q `minusWord#` 1##) (negateInt# e2 +# word2Int# q -# 1# +# l))
+          0# -> 0##
+      !(# vvTrailing, vuTrailing, vw' #) =
+        case () of
+          _ | boxToBool ((q `leWord#` 9##) `andI#` (frem5 v `eqWord#` 0##))
+                -> (# multipleOfPowerOf5_UnboxedB v q, False, w' #)
+            | boxToBool ((q `leWord#` 9##) `andI#` acceptBoundsUnboxed v)
+                -> (# False, multipleOfPowerOf5_UnboxedB u q, w' #)
+            | boxToBool (q `leWord#` 9##)
+                -> (# False, False, w' `minusWord#` int2Word# (multipleOfPowerOf5_Unboxed w q) #)
+            | otherwise
+                -> (# False, False, w' #)
+   in (BoundsState (W32# u') (W32# v') (W32# vw') (W32# lastRemoved) vuTrailing vvTrailing , (I32# e10))
+
+f2dLT :: Int32 -> Word32 -> Word32 -> Word32 -> (BoundsState, Int32)
+f2dLT (I32# e2) (W32# u) (W32# v) (W32# w) =
+  let q = int2Word# (log10pow5Unboxed (negateInt# e2))
+      e10 = word2Int# q +# e2
+      i = (negateInt# e2) -# word2Int# q
+      k = pow5bitsUnboxed i -# unbox float_pow5_bitcount
+      j = word2Int# q -# k
+      u' = mulPow5DivPow2 u i j
+      v' = mulPow5DivPow2 v i j
+      w' = mulPow5DivPow2 w i j
+      lastRemoved =
+        case (q `neWord#` 0##) `andI#` ((fquot10 (u'`minusWord#` 1##)) `leWord#` fquot10 u') of
+          1# -> let j' = word2Int# q -# 1# -# (pow5bitsUnboxed (i +# 1#) -# unbox float_pow5_bitcount)
+                 in frem10 (mulPow5DivPow2 v (i +# 1#) j')
+          0# -> 0##
+      !(# vvTrailing , vuTrailing, vw' #) =
+        case () of
+          _ | boxToBool ((q `leWord#` 1##) `andI#` acceptBoundsUnboxed v)
+                -> (# True, boxToBool ((w `minusWord#` v) `eqWord#` 2##), w' #) -- mmShift == 1
+            | boxToBool (q `leWord#` 1##)
+                -> (# True, False, w' `minusWord#` 1## #)
+            | boxToBool (q `ltWord#` 31##)
+                -> (# boxToBool (multipleOfPowerOf2Unboxed v (q `minusWord#` 1##)), False, w' #)
+            | otherwise
+                -> (# False, False, w' #)
+   in (BoundsState (W32# u') (W32# v') (W32# vw') (W32# lastRemoved) vuTrailing vvTrailing , (I32# e10))
+
+roundUp :: Bool -> BoundsState -> Bool
+roundUp b s = (vv s == vu s && b) || lastRemovedDigit s >= 5
+
+calculate :: Bool -> BoundsState -> Word32
+calculate b s = vv s + asWord (roundUp b s)
+
+f2d :: Word32 -> Word32 -> FloatingDecimal
+f2d m e =
+  let mf = if e == 0
+              then m
+              else (1 .<< float_mantissa_bits) .|. m
+      ef = if e == 0
+              then toS 1 - toS (float_bias + float_mantissa_bits)
+              else toS e - toS (float_bias + float_mantissa_bits)
+      e2 = ef - 2
+      -- Step 2. 3-tuple (u, v, w) * 2**e2
+      u = 4 * mf - 1 - asWord (m /= 0 || e <= 1)
+      v = 4 * mf
+      w = 4 * mf + 2
+      -- Step 3. convert to decimal power base
+      (state, e10) =
+        if e2 >= 0
+           then f2dGT e2 u v w
+           else f2dLT e2 u v w
+      -- Step 4: Find the shortest decimal representation in the interval of
+      -- valid representations.
+      (output, removed) =
+        if vvIsTrailingZeros state || vuIsTrailingZeros state
+           then pmap (\s -> calculate (not (acceptBounds v)
+                                    || not (vuIsTrailingZeros s)) s)
+                                      $ trimTrailing state
+           else pmap (calculate True) $ trimNoTrailing state
+      e' = e10 + removed
+   in FloatingDecimal output e'
+
+breakdown :: Float -> (Bool, Word32, Word32)
+breakdown f =
+  let bits = fcoerceToWord f
+      sign = ((bits .>> (float_mantissa_bits + float_exponent_bits)) .&. 1) /= 0
+      mantissa = bits .&. mask float_mantissa_bits
+      expo = (bits .>> float_mantissa_bits) .&. mask float_exponent_bits
+   in (sign, mantissa, expo)
+
+{-# INLINE f2s' #-}
+f2s' :: (Bool -> Word32 -> Int32 -> a) -> (Bool -> Bool -> Bool -> a) -> Float -> a
+f2s' formatter specialFormatter f =
+  let (sign, mantissa, expo) = breakdown f
+   in if (expo == mask float_exponent_bits) || (expo == 0 && mantissa == 0)
+         then specialFormatter sign (expo > 0) (mantissa > 0)
+         else let FloatingDecimal m e = f2d mantissa expo
+               in formatter sign m e
+
+f2s :: Float -> Builder
+f2s f = primBounded (f2s' toCharsScientific special f) ()
+
+f2Intermediate :: Float -> FloatingDecimal
+f2Intermediate = f2s' (const FloatingDecimal) (\_ _ _ -> FloatingDecimal 0 0)
diff --git a/Data/ByteString/Builder/RealFloat/Internal.hs b/Data/ByteString/Builder/RealFloat/Internal.hs
new file mode 100644
index 000000000..f964bf22e
--- /dev/null
+++ b/Data/ByteString/Builder/RealFloat/Internal.hs
@@ -0,0 +1,408 @@
+{-# LANGUAGE Strict #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples #-}
+
+module Data.ByteString.Builder.RealFloat.Internal
+    ( (.>>)
+    , (.<<)
+    , mask
+    , asWord
+    , pmap
+    , (...)
+    , special
+    , decimalLength9
+    , decimalLength17
+    , pow5bitsUnboxed
+    , log10pow2Unboxed
+    , log10pow5Unboxed
+    , pow5_factor
+    , multipleOfPowerOf5_Unboxed
+    , multipleOfPowerOf5_UnboxedB
+    , multipleOfPowerOf2Unboxed
+    , acceptBoundsUnboxed
+    , toCharsScientific
+    , fcoerceToWord
+    , dcoerceToWord
+    -- hand-rolled division and remainder for f2s and d2s
+    , fquot10
+    , frem10
+    , fquotRem10
+    , fquotRem10Boxed
+    , fquot5
+    , frem5
+    , fquotRem5
+    , fwrapped
+    , dquot10
+    , drem10
+    , dquotRem10
+    , dquotRem10Boxed
+    , dquot5
+    , drem5
+    , dquotRem5
+    , dquot100
+    , dwrapped
+    -- prim-op helpers
+    , boxToBool
+    , box
+    , unbox
+    ) where
+
+import Control.Monad (foldM)
+import Data.Array.Unboxed (UArray, IArray(..), listArray)
+import Data.Array.Base (unsafeAt, STUArray(..), MArray(..), castSTUArray, readArray)
+import Data.Bits (Bits(..), FiniteBits(..))
+import Data.ByteString.Internal (c2w)
+import Data.ByteString.Builder.Prim.Internal (BoundedPrim, boundedPrim)
+import Data.Char (ord)
+import GHC.Int (Int(..), Int32)
+import GHC.Exts
+import GHC.Word (Word8, Word16, Word32(..), Word64(..))
+import GHC.ST (ST(..), runST)
+import Foreign.Ptr (Ptr, plusPtr, castPtr)
+import Foreign.Storable (poke)
+
+{-# INLINABLE (.>>) #-}
+(.>>) :: (Bits a, Integral b) => a -> b -> a
+a .>> s = unsafeShiftR a (fromIntegral s)
+
+{-# INLINABLE (.<<) #-}
+(.<<) :: (Bits a, Integral b) => a -> b -> a
+a .<< s = unsafeShiftL a (fromIntegral s)
+
+{-# INLINABLE mask #-}
+mask :: (Bits a, Integral a) => a -> a
+mask = flip (-) 1 . (.<<) 1
+
+{-# INLINABLE asWord #-}
+asWord :: Integral w => Bool -> w
+asWord = fromIntegral . fromEnum
+
+pmap :: (a -> c) -> (a, b) -> (c, b)
+pmap f (a, b) = (f a, b)
+
+-- Returns the number of decimal digits in v, which must not contain more than 9 digits.
+decimalLength9 :: Word32 -> Int
+decimalLength9 v
+  | v >= 100000000 = 9
+  | v >= 10000000 = 8
+  | v >= 1000000 = 7
+  | v >= 100000 = 6
+  | v >= 10000 = 5
+  | v >= 1000 = 4
+  | v >= 100 = 3
+  | v >= 10 = 2
+  | otherwise = 1
+
+-- Returns the number of decimal digits in v, which must not contain more than 17 digits.
+decimalLength17 :: Word64 -> Int
+decimalLength17 v
+  | v >= 10000000000000000 = 17
+  | v >= 1000000000000000 = 16
+  | v >= 100000000000000 = 15
+  | v >= 10000000000000 = 14
+  | v >= 1000000000000 = 13
+  | v >= 100000000000 = 12
+  | v >= 10000000000 = 11
+  | v >= 1000000000 = 10
+  | v >= 100000000 = 9
+  | v >= 10000000 = 8
+  | v >= 1000000 = 7
+  | v >= 100000 = 6
+  | v >= 10000 = 5
+  | v >= 1000 = 4
+  | v >= 100 = 3
+  | v >= 10 = 2
+  | otherwise = 1
+
+infixr 9 `dot`
+dot :: (a -> b) -> (c -> d -> a) -> c -> d -> b
+dot = (.) . (.)
+
+infixr 9 ...
+(...) :: (a -> b) -> (c -> d -> e -> a) -> c -> d -> e -> b
+(...) = dot . (.)
+
+-- TODO encode proof in types?
+-- From 'In-and-Out Conversions' https://dl.acm.org/citation.cfm?id=362887, we
+-- have that a conversion from a base-b n-digit number to a base-v m-digit
+-- number such that the round-trip conversion is identity requires
+--
+--    v^(m-1) > b^n
+--
+-- Specifically for binary floating point to decimal conversion, we must have
+--
+--    10^(m-1) > 2^n
+-- => log(10^(m-1)) > log(2^n)
+-- => (m-1) * log(10) > n * log(2)
+-- => m-1 > n * log(2) / log(10)
+-- => m-1 >= ceil(n * log(2) / log(10))
+-- => m >= ceil(n * log(2) / log(10)) + 1
+--
+-- And since 32 and 64-bit floats have 23 and 52 bits of mantissa (and then an
+-- implicit leading-bit), we need
+--
+--    ceil(24 * log(2) / log(10)) + 1 => 9
+--    ceil(53 * log(2) / log(10)) + 1 => 17
+--
+-- In addition, the exponent range from floats is [-45,38] and doubles is
+-- [-324,308] (including subnormals) which are 3 and 4 digits respectively
+--
+-- Thus we have,
+--
+--    floats: 1 (sign) + 9 (mantissa) + 1 (.) + 1 (e) + 3 (exponent) = 15
+--    doubles: 1 (sign) + 17 (mantissa) + 1 (.) + 1 (e) + 4 (exponent) = 24
+--
+maxEncodedLength :: Int
+maxEncodedLength = 32
+
+-- TODO TH?
+pokeAll :: String -> Ptr Word8 -> IO (Ptr Word8)
+pokeAll s ptr = foldM pokeOne ptr s
+  where pokeOne p c = poke p (c2w c) >> return (p `plusPtr` 1)
+
+boundString :: String -> BoundedPrim ()
+boundString s = boundedPrim maxEncodedLength $ const (pokeAll s)
+
+--         Sign -> Exp  -> Mantissa
+special :: Bool -> Bool -> Bool -> BoundedPrim ()
+special    _       _       True =  boundString "NaN"
+special    True    False   _    =  boundString "-0.0e0"
+special    False   False   _    =  boundString "0.0e0"
+special    True    True    _    =  boundString "-Infinity"
+special    False   True    _    =  boundString "Infinity"
+
+-- Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
+pow5bitsUnboxed :: Int# -> Int#
+pow5bitsUnboxed e = (e *# 1217359#) `uncheckedIShiftRL#` 19# +# 1#
+
+-- Returns floor(log_10(2^e)); requires 0 <= e <= 1650.
+log10pow2Unboxed :: Int# -> Int#
+log10pow2Unboxed e = (e *# 78913#) `uncheckedIShiftRL#` 18#
+
+-- Returns floor(log_10(5^e)); requires 0 <= e <= 2620.
+log10pow5Unboxed :: Int# -> Int#
+log10pow5Unboxed e = (e *# 732928#) `uncheckedIShiftRL#` 20#
+
+acceptBoundsUnboxed :: Word# -> Int#
+acceptBoundsUnboxed _ = 0#
+-- for round-to-even and correct shortest
+-- acceptBoundsUnboxed v = ((v `uncheckedShiftRL#` 2#) `and#` 1##) `eqWord#` 0##
+
+fcoerceToWord :: Float -> Word32
+fcoerceToWord x = runST (cast x)
+
+dcoerceToWord :: Double -> Word64
+dcoerceToWord x = runST (cast x)
+
+{-# INLINE cast #-}
+cast :: (MArray (STUArray s) a (ST s),
+         MArray (STUArray s) b (ST s)) => a -> ST s b
+cast x = newArray (0 :: Int, 0) x >>= castSTUArray >>= flip readArray 0
+
+fquot10 :: Word# -> Word#
+fquot10 w = (w `timesWord#` 0xCCCCCCCD##) `uncheckedShiftRL#` 35#
+
+frem10 :: Word# -> Word#
+frem10 w =
+  let w' = fquot10 w
+   in w `minusWord#` (w' `timesWord#` 10##)
+
+fquotRem10 :: Word# -> (# Word#, Word# #)
+fquotRem10 w =
+  let w' = fquot10 w
+   in (# w', w `minusWord#` (w' `timesWord#` 10##) #)
+
+fquot5 :: Word# -> Word#
+fquot5 w = (w `timesWord#` 0xCCCCCCCD##) `uncheckedShiftRL#` 34#
+
+frem5 :: Word# -> Word#
+frem5 w =
+  let w' = fquot5 w
+   in w `minusWord#` (w' `timesWord#` 5##)
+
+fquotRem5 :: Word# -> (# Word#, Word# #)
+fquotRem5 w =
+  let w' = fquot5 w
+   in (# w', w `minusWord#` (w' `timesWord#` 5##) #)
+
+fquotRem10Boxed :: Word32 -> (Word32, Word32)
+fquotRem10Boxed (W32# w) = let !(# q, r #) = fquotRem10 w in (W32# q, W32# r)
+
+fwrapped :: (Word# -> Word#) -> Word32 -> Word32
+fwrapped f (W32# w) = W32# (f w)
+
+dquot10 :: Word# -> Word#
+dquot10 w =
+  let !(# rdx, _ #) = w `timesWord2#` 0xCCCCCCCCCCCCCCCD##
+    in rdx `uncheckedShiftRL#` 3#
+
+dquot100 :: Word# -> Word#
+dquot100 w =
+  let !(# rdx, _ #) = (w `uncheckedShiftRL#` 2#) `timesWord2#` 0x28F5C28F5C28F5C3##
+    in rdx `uncheckedShiftRL#` 2#
+
+drem10 :: Word# -> Word#
+drem10 w =
+  let w' = dquot10 w
+   in w `minusWord#` (w' `timesWord#` 10##)
+
+dquotRem10 :: Word# -> (# Word#, Word# #)
+dquotRem10 w =
+  let w' = dquot10 w
+   in (# w', w `minusWord#` (w' `timesWord#` 10##) #)
+
+dquot5 :: Word# -> Word#
+dquot5 w =
+  let !(# rdx, _ #) = w `timesWord2#` 0xCCCCCCCCCCCCCCCD##
+    in rdx `uncheckedShiftRL#` 2#
+
+drem5 :: Word# -> Word#
+drem5 w =
+  let w' = dquot5 w
+   in w `minusWord#` (w' `timesWord#` 5##)
+
+dquotRem5 :: Word# -> (# Word#, Word# #)
+dquotRem5 w =
+  let w' = dquot5 w
+   in (# w', w `minusWord#` (w' `timesWord#` 5##) #)
+
+dquotRem10Boxed :: Word64 -> (Word64, Word64)
+dquotRem10Boxed (W64# w) = let !(# q, r #) = dquotRem10 w in (W64# q, W64# r)
+
+dwrapped :: (Word# -> Word#) -> Word64 -> Word64
+dwrapped f (W64# w) = W64# (f w)
+
+boxToBool :: Int# -> Bool
+boxToBool i = case i of
+                1# -> True
+                0# -> False
+
+box :: Int# -> Int
+box i = I# i
+
+unbox :: Int -> Int#
+unbox (I# i) = i
+
+pow5_factor :: Word# -> Int# -> Int#
+pow5_factor w count =
+  let !(# q, r #) = dquotRem5 w
+   in case r `eqWord#` 0## of
+        0# -> count
+        1# -> pow5_factor q (count +# 1#)
+
+multipleOfPowerOf5_Unboxed :: Word# -> Word# -> Int#
+multipleOfPowerOf5_Unboxed value p = pow5_factor value 0# >=# word2Int# p
+
+multipleOfPowerOf5_UnboxedB :: Word# -> Word# -> Bool
+multipleOfPowerOf5_UnboxedB value p = boxToBool (multipleOfPowerOf5_Unboxed value p)
+
+multipleOfPowerOf2Unboxed :: Word# -> Word# -> Int#
+multipleOfPowerOf2Unboxed value p = (value `and#` ((1## `uncheckedShiftL#` word2Int# p) `minusWord#` 1##)) `eqWord#` 0##
+
+class (IArray UArray a, FiniteBits a, Integral a) => Mantissa a where
+  decimalLength :: a -> Int
+  quotRem100 :: a -> (a, a)
+  quotRem10000 :: a -> (a, a)
+
+instance Mantissa Word32 where
+  decimalLength = decimalLength9
+  quotRem100 (W32# w) =
+    let w' = (w `timesWord#` 0x51EB851F##) `uncheckedShiftRL#` 37#
+      in (W32# w', W32# (w `minusWord#` (w' `timesWord#` 100##)))
+  quotRem10000 (W32# w) =
+    let w' = (w `timesWord#` 0xD1B71759##) `uncheckedShiftRL#` 45#
+      in (W32# w', W32# (w `minusWord#` (w' `timesWord#` 10000##)))
+
+instance Mantissa Word64 where
+    decimalLength = decimalLength17
+    quotRem100 (W64# w) =
+      let w' = dquot100 w
+       in (W64# w', W64# (w `minusWord#` (w' `timesWord#` 100##)))
+    quotRem10000 (W64# w) =
+      let !(# rdx, _ #) = w `timesWord2#` 0x346DC5D63886594B##
+          w' = rdx `uncheckedShiftRL#` 11#
+       in (W64# w', W64# (w `minusWord#` (w' `timesWord#` 10000##)))
+
+type DigitStore = Word16
+
+toAscii :: (Integral a, Integral b) => a -> b
+toAscii = fromIntegral . (+) (fromIntegral $ ord '0')
+
+digit_table :: UArray Int32 DigitStore
+digit_table = listArray (0, 99) [ (toAscii b .<< (8 :: Word16)) .|. toAscii a | a <- [0..9 :: Word16], b <- [0..9 :: Word16] ]
+
+copy :: DigitStore -> Ptr Word8 -> IO ()
+copy d p = poke (castPtr p) d
+
+first :: DigitStore -> Word8
+first = fromIntegral . flip (.>>) (8 :: Word16)
+
+second :: DigitStore -> Word8
+second = fromIntegral
+
+-- for loop recursively...
+{-# SPECIALIZE writeMantissa :: Ptr Word8 -> Int -> Word32 -> IO (Ptr Word8) #-}
+{-# SPECIALIZE writeMantissa :: Ptr Word8 -> Int -> Word64 -> IO (Ptr Word8) #-}
+writeMantissa :: (Mantissa a) => Ptr Word8 -> Int -> a -> IO (Ptr Word8)
+writeMantissa ptr olength = go (ptr `plusPtr` olength)
+  where
+    go p mantissa
+      | mantissa >= 10000 = do
+          let (m', c) = quotRem10000 mantissa
+              (c1, c0) = quotRem100 c
+          copy (digit_table `unsafeAt` fromIntegral c0) (p `plusPtr` (-1))
+          copy (digit_table `unsafeAt` fromIntegral c1) (p `plusPtr` (-3))
+          go (p `plusPtr` (-4)) m'
+      | mantissa >= 100 = do
+          let (m', c) = quotRem100 mantissa
+          copy (digit_table `unsafeAt` fromIntegral c) (p `plusPtr` (-1))
+          finalize m'
+      | otherwise = finalize mantissa
+    finalize mantissa
+      | mantissa >= 10 = do
+          let bs = digit_table `unsafeAt` fromIntegral mantissa
+          poke (ptr `plusPtr` 2) (first bs)
+          poke (ptr `plusPtr` 1) (c2w '.')
+          poke ptr (second bs)
+          return (ptr `plusPtr` (olength + 1))
+      | olength > 1 = do
+          copy ((fromIntegral (c2w '.') .<< (8 :: Word16)) .|. toAscii mantissa) ptr
+          return $ ptr `plusPtr` (olength + 1)
+      | otherwise = do
+          poke (ptr `plusPtr` 2) (c2w '0')
+          poke (ptr `plusPtr` 1) (c2w '.')
+          poke ptr (toAscii mantissa)
+          return (ptr `plusPtr` 3)
+
+writeExponent :: Ptr Word8 -> Int32 -> IO (Ptr Word8)
+writeExponent ptr expo
+  | expo >= 100 = do
+      let (e1, e0) = fquotRem10Boxed (fromIntegral expo)
+      copy (digit_table `unsafeAt` fromIntegral e1) ptr
+      poke (ptr `plusPtr` 2) (toAscii e0 :: Word8)
+      return $ ptr `plusPtr` 3
+  | expo >= 10 = do
+      copy (digit_table `unsafeAt` fromIntegral expo) ptr
+      return $ ptr `plusPtr` 2
+  | otherwise = do
+      poke ptr (toAscii expo)
+      return $ ptr `plusPtr` 1
+
+writeSign :: Ptr Word8 -> Bool -> IO (Ptr Word8)
+writeSign ptr True = do
+  poke ptr (c2w '-')
+  return $ ptr `plusPtr` 1
+writeSign ptr False = return ptr
+
+{-# INLINABLE toCharsScientific #-}
+{-# SPECIALIZE toCharsScientific :: Bool -> Word32 -> Int32 -> BoundedPrim () #-}
+{-# SPECIALIZE toCharsScientific :: Bool -> Word64 -> Int32 -> BoundedPrim () #-}
+toCharsScientific :: (Mantissa a) => Bool -> a -> Int32 -> BoundedPrim ()
+toCharsScientific sign mantissa expo = boundedPrim maxEncodedLength $ \_ p0 -> do
+  let olength = decimalLength mantissa
+      expo' = expo + fromIntegral olength - 1
+  p1 <- writeSign p0 sign
+  p2 <- writeMantissa p1 olength mantissa
+  poke p2 (c2w 'e')
+  p3 <- writeSign (p2 `plusPtr` 1) (expo' < 0)
+  writeExponent p3 (abs expo')
diff --git a/Data/ByteString/Builder/RealFloat/TableGenerator.hs b/Data/ByteString/Builder/RealFloat/TableGenerator.hs
new file mode 100644
index 000000000..7d6458dfe
--- /dev/null
+++ b/Data/ByteString/Builder/RealFloat/TableGenerator.hs
@@ -0,0 +1,185 @@
+{-# LANGUAGE ExplicitForAll #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples #-}
+
+module Data.ByteString.Builder.RealFloat.TableGenerator
+  ( Word128(..)
+  , float_pow5_inv_bitcount
+  , float_pow5_bitcount
+  , double_pow5_bitcount
+  , double_pow5_inv_bitcount
+  , float_max_split
+  , float_max_inv_split
+  , double_max_split
+  , double_max_inv_split
+  , finv
+  , fnorm
+  , gen_table_f
+  , gen_table_d
+  ) where
+
+import Data.Array.Base
+import Data.Bits ((.&.), shiftR, unsafeShiftL, unsafeShiftR)
+import GHC.Exts
+import GHC.ST (ST(..), runST)
+import GHC.Word (Word64(..))
+import Language.Haskell.TH
+
+data Word128 = Word128
+  { word128Hi64 :: !Word64
+  , word128Lo64 :: !Word64
+  }
+
+instance Num Word128 where
+  (+) = plus128
+  (-) = minus128
+  (*) = times128
+  negate = negate128
+  abs = id
+  signum = signum128
+  fromInteger = fromInteger128
+
+{-# INLINABLE plus128 #-}
+plus128 :: Word128 -> Word128 -> Word128
+plus128 (Word128 (W64# a1) (W64# a0)) (Word128 (W64# b1) (W64# b0)) =
+  Word128 (W64# s1) (W64# s0)
+  where
+    !(# c1, s0 #) = plusWord2# a0 b0
+    s1a = plusWord# a1 b1
+    s1 = plusWord# c1 s1a
+
+{-# INLINABLE minus128 #-}
+minus128 :: Word128 -> Word128 -> Word128
+minus128 (Word128 (W64# a1) (W64# a0)) (Word128 (W64# b1) (W64# b0)) =
+  Word128 (W64# d1) (W64# d0)
+  where
+    !(# d0, c1 #) = subWordC# a0 b0
+    a1c = minusWord# a1 (int2Word# c1)
+    d1 = minusWord# a1c b1
+
+times128 :: Word128 -> Word128 -> Word128
+times128 (Word128 (W64# a1) (W64# a0)) (Word128 (W64# b1) (W64# b0)) =
+  Word128 (W64# p1) (W64# p0)
+  where
+    !(# c1, p0 #) = timesWord2# a0 b0
+    p1a = timesWord# a1 b0
+    p1b = timesWord# a0 b1
+    p1c = plusWord# p1a p1b
+    p1 = plusWord# p1c c1
+
+{-# INLINABLE negate128 #-}
+negate128 :: Word128 -> Word128
+negate128 (Word128 (W64# a1) (W64# a0)) =
+  case plusWord2# (not# a0) 1## of
+    (# c, s #) -> Word128 (W64# (plusWord# (not# a1) c)) (W64# s)
+
+{-# INLINABLE signum128 #-}
+signum128 :: Word128 -> Word128
+signum128 (Word128 (W64# 0##) (W64# 0##)) = Word128 0 0
+signum128 _ = Word128 0 1
+
+fromInteger128 :: Integer -> Word128
+fromInteger128 i = Word128 (fromIntegral $ i `shiftR` 64) (fromIntegral i)
+
+instance MArray (STUArray s) Word128 (ST s) where
+  {-# INLINE getBounds #-}
+  getBounds (STUArray l u _ _) = return (l,u)
+
+  {-# INLINE getNumElements #-}
+  getNumElements (STUArray _ _ n _) = return n
+
+  {-# INLINE unsafeNewArray_ #-}
+  unsafeNewArray_ (l,u) = unsafeNewArraySTUArray_ (l,u) (*# 16#)
+
+  {-# INLINE unsafeRead #-}
+  unsafeRead (STUArray _ _ _ marr) (I# i) = ST $ \s1 ->
+    let !(# s2, w1 #) = readWord64Array# marr (i *# 2#) s1
+        !(# _, w2 #) = readWord64Array# marr (i *# 2# +# 1#) s2
+     in (# s2, Word128 (W64# w1) (W64# w2) #)
+
+  {-# INLINE unsafeWrite #-}
+  unsafeWrite (STUArray _ _ _ marr) (I# i) (Word128 (W64# w1) (W64# w2)) = ST $ \s1 ->
+    let s2 = writeWord64Array# marr (i *# 2#) w1 s1
+        s3 = writeWord64Array# marr (i *# 2# +# 1#) w2 s2
+     in (# s3, () #)
+
+instance IArray UArray Word128 where
+  {-# INLINE bounds #-}
+  bounds (UArray l u _ _) = (l,u)
+
+  {-# INLINE numElements #-}
+  numElements (UArray _ _ n _) = n
+
+  {-# INLINE unsafeArray #-}
+  unsafeArray lu ies = runST (unsafeArrayUArray lu ies 0)
+
+  -- NB: don't actually use this anywhere but...
+  {-# INLINE unsafeAt #-}
+  unsafeAt (UArray _ _ _ arr) (I# i) =
+    let w1 = indexWord64Array# arr (i *# 2#)
+        w2 = indexWord64Array# arr (i *# 2# +# 1#)
+     in Word128 (W64# w1) (W64# w2)
+
+
+float_pow5_inv_bitcount :: Int
+float_pow5_inv_bitcount = 59
+
+float_pow5_bitcount :: Int
+float_pow5_bitcount = 61
+
+double_pow5_bitcount :: Int
+double_pow5_bitcount = 125
+
+double_pow5_inv_bitcount :: Int
+double_pow5_inv_bitcount = 125
+
+blen :: Integer -> Integer
+blen 0 = 0
+blen 1 = 1
+blen n = 1 + blen (n `quot` 2)
+
+finv :: Integer -> Integer -> Integer
+finv bitcount i =
+  let p = 5^i
+   in (1 `unsafeShiftL` fromIntegral (blen p - 1 + bitcount)) `div` p + 1
+
+fnorm :: Integer -> Integer -> Integer
+fnorm bitcount i =
+  let p = 5^i
+   in p `unsafeShiftR` fromIntegral (blen p - bitcount)
+
+gen_table_f :: (Integral a) => a -> (a -> Integer) -> Q Exp
+gen_table_f n f = return $ ListE (fmap (LitE . IntegerL . f) [0..n])
+
+gen_table_d :: forall a. (Integral a) => a -> (a -> Integer) -> Q Exp
+gen_table_d n f = return $ ListE (fmap ff [0..n])
+  where
+    ff :: a -> Exp
+    ff c = let r = f c
+               hi = r `unsafeShiftR` 64
+               lo = r .&. ((1 `unsafeShiftL` 64) - 1)
+            in AppE (AppE (ConE 'Word128) (LitE . IntegerL $ hi)) (LitE . IntegerL $ lo)
+
+get_range :: forall ff. (RealFloat ff) => ff -> (Integer, Integer)
+get_range f =
+  let (emin, emax) = floatRange f
+      mantissaDigits = floatDigits f
+      emin' = fromIntegral $ emin - mantissaDigits - 2
+      emax' = fromIntegral $ emax - mantissaDigits - 2
+      log10 :: ff -> ff
+      log10 x = log x / log 10
+   in ((-emin') - floor (fromIntegral (-emin') * log10 5), floor $ emax' * log10 2)
+
+float_max_split :: Integer
+float_max_inv_split :: Integer
+(float_max_split, float_max_inv_split) = get_range (undefined :: Float)
+
+-- we take a slightly different codepath s.t we need one extra entry
+double_max_split :: Integer
+double_max_inv_split :: Integer
+(double_max_split, double_max_inv_split) =
+    let (m, mi) = get_range (undefined :: Double)
+     in (m + 1, mi)
+
diff --git a/bytestring.cabal b/bytestring.cabal
index 1a5748cb8..04d2c91e1 100644
--- a/bytestring.cabal
+++ b/bytestring.cabal
@@ -70,7 +70,7 @@ flag integer-simple
   default: False
 
 library
-  build-depends:     base >= 4.3 && < 5, ghc-prim, deepseq
+  build-depends:     base >= 4.2 && < 5, ghc-prim, deepseq, template-haskell, array
 
   exposed-modules:   Data.ByteString
                      Data.ByteString.Char8
@@ -85,6 +85,7 @@ library
                      Data.ByteString.Builder
                      Data.ByteString.Builder.Extra
                      Data.ByteString.Builder.Prim
+                     Data.ByteString.Builder.RealFloat
 
                      -- perhaps only exposed temporarily
                      Data.ByteString.Builder.Internal
@@ -95,6 +96,10 @@ library
                      Data.ByteString.Builder.Prim.ASCII
                      Data.ByteString.Builder.Prim.Internal.Floating
                      Data.ByteString.Builder.Prim.Internal.Base16
+                     Data.ByteString.Builder.RealFloat.F2S
+                     Data.ByteString.Builder.RealFloat.D2S
+                     Data.ByteString.Builder.RealFloat.Internal
+                     Data.ByteString.Builder.RealFloat.TableGenerator
 
   default-language:  Haskell2010
   other-extensions:  CPP,
diff --git a/tests/builder/Data/ByteString/Builder/Prim/TestUtils.hs b/tests/builder/Data/ByteString/Builder/Prim/TestUtils.hs
index 03858bd87..45b662bc1 100644
--- a/tests/builder/Data/ByteString/Builder/Prim/TestUtils.hs
+++ b/tests/builder/Data/ByteString/Builder/Prim/TestUtils.hs
@@ -58,7 +58,9 @@ module Data.ByteString.Builder.Prim.TestUtils (
   , float_list
   , double_list
   , coerceFloatToWord32
+  , coerceWord32ToFloat
   , coerceDoubleToWord64
+  , coerceWord64ToDouble
 
   ) where
 
@@ -346,11 +348,19 @@ double_list f = f . coerceDoubleToWord64
 coerceFloatToWord32 :: Float -> Word32
 coerceFloatToWord32 x = unsafePerformIO (with x (peek . castPtr))
 
+{-# NOINLINE coerceWord32ToFloat #-}
+coerceWord32ToFloat :: Word32 -> Float
+coerceWord32ToFloat x = unsafePerformIO (with x (peek . castPtr))
+
 -- | Convert a 'Double' to a 'Word64'.
 {-# NOINLINE coerceDoubleToWord64 #-}
 coerceDoubleToWord64 :: Double -> Word64
 coerceDoubleToWord64 x = unsafePerformIO (with x (peek . castPtr))
 
+{-# NOINLINE coerceWord64ToDouble #-}
+coerceWord64ToDouble :: Word64 -> Double
+coerceWord64ToDouble x = unsafePerformIO (with x (peek . castPtr))
+
 -- | Parse a variable length encoding
 parseVar :: (Num a, Bits a) => [Word8] -> (a, [Word8])
 parseVar =
diff --git a/tests/builder/Data/ByteString/Builder/Tests.hs b/tests/builder/Data/ByteString/Builder/Tests.hs
index 38de8a979..0329af615 100644
--- a/tests/builder/Data/ByteString/Builder/Tests.hs
+++ b/tests/builder/Data/ByteString/Builder/Tests.hs
@@ -23,9 +23,11 @@ import           Control.Monad.Trans.Writer (WriterT, execWriterT, tell)
 import           Foreign (minusPtr)
 
 import           Data.Char (chr)
+import           Data.Bits ((.|.), shiftL)
 import qualified Data.DList      as D
 import           Data.Foldable
 import           Data.Word
+import           Numeric (showEFloat)
 
 import qualified Data.ByteString          as S
 import qualified Data.ByteString.Internal as S
@@ -62,6 +64,7 @@ tests =
   testsEncodingToBuilder ++
   testsBinary ++
   testsASCII ++
+  testsFloating ++
   testsChar8 ++
   testsUtf8
 
@@ -601,6 +604,304 @@ testsASCII =
   where
     enlarge (n, e) = n ^ (abs (e `mod` (50 :: Integer)))
 
+testsFloating :: [TestTree]
+testsFloating =
+  [ testMatches "f2sBasic" (formatFloat FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( 0.0    , "0.0e0" )
+        , ( (-0.0) , "-0.0e0" )
+        , ( 1.0    , "1.0e0" )
+        , ( (-1.0) , "-1.0e0" )
+        , ( (0/0)  , "NaN" )
+        , ( (1/0)  , "Infinity" )
+        , ( (-1/0) , "-Infinity" )
+        ]
+  , testMatches "f2sSubnormal" (formatFloat FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( 1.1754944e-38 , "1.1754944e-38" )
+        ]
+  , testMatches "f2sMinAndMax" (formatFloat FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( coerceWord32ToFloat 0x7f7fffff , "3.4028235e38" )
+        , ( coerceWord32ToFloat 0x00000001 , "1.0e-45" )
+        ]
+  , testMatches "f2sBoundaryRound" (formatFloat FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( 3.355445e7   , "3.3554448e7" )
+        , ( 8.999999e9   , "8.999999e9" )
+        , ( 3.4366717e10 , "3.4366718e10" )
+        ]
+  , testMatches "f2sExactValueRound" (formatFloat FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( 3.0540412e5 , "3.0540413e5" )
+        , ( 8.0990312e3 , "8.0990313e3" )
+        ]
+  , testMatches "f2sTrailingZeros" (formatFloat FFExponent Nothing) (flip (showEFloat Nothing) "")
+        -- Pattern for the first test: 00111001100000000000000000000000
+        [ ( 2.4414062e-4 , "2.4414063e-4" )
+        , ( 2.4414062e-3 , "2.4414063e-3" )
+        , ( 4.3945312e-3 , "4.3945313e-3" )
+        , ( 6.3476562e-3 , "6.3476563e-3" )
+        ]
+  , testMatches "f2sRegression" (formatFloat FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( 4.7223665e21   , "4.7223665e21" )
+        , ( 8388608.0      , "8.388608e6" )
+        , ( 1.6777216e7    , "1.6777216e7" )
+        , ( 3.3554436e7    , "3.3554436e7" )
+        , ( 6.7131496e7    , "6.7131496e7" )
+        , ( 1.9310392e-38  , "1.9310392e-38" )
+        , ( (-2.47e-43)    , "-2.47e-43" )
+        , ( 1.993244e-38   , "1.993244e-38" )
+        , ( 4103.9003      , "4.1039004e3" )
+        , ( 5.3399997e9    , "5.3399997e9" )
+        , ( 6.0898e-39     , "6.0898e-39" )
+        , ( 0.0010310042   , "1.0310042e-3" )
+        , ( 2.8823261e17   , "2.882326e17" )
+        , ( 7.0385309e-26  , "7.038531e-26" )
+        , ( 9.2234038e17   , "9.223404e17" )
+        , ( 6.7108872e7    , "6.710887e7" )
+        , ( 1.0e-44        , "1.0e-44" )
+        , ( 2.816025e14    , "2.816025e14" )
+        , ( 9.223372e18    , "9.223372e18" )
+        , ( 1.5846085e29   , "1.5846086e29" )
+        , ( 1.1811161e19   , "1.1811161e19" )
+        , ( 5.368709e18    , "5.368709e18" )
+        , ( 4.6143165e18   , "4.6143166e18" )
+        , ( 0.007812537    , "7.812537e-3" )
+        , ( 1.4e-45        , "1.0e-45" )
+        , ( 1.18697724e20  , "1.18697725e20" )
+        , ( 1.00014165e-36 , "1.00014165e-36" )
+        , ( 200.0          , "2.0e2" )
+        , ( 3.3554432e7    , "3.3554432e7" )
+        , ( 2.0019531      , "2.0019531e0" )
+        , ( 2.001953       , "2.001953e0" )
+        ]
+  , testMatches "f2sLooksLikePowerOf5" (formatFloat FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( coerceWord32ToFloat 0x5D1502F9 , "6.7108864e17" )
+        , ( coerceWord32ToFloat 0x5D9502F9 , "1.3421773e18" )
+        , ( coerceWord32ToFloat 0x5e1502F9 , "2.6843546e18" )
+        ]
+  , testMatches "f2sOutputLength" (formatFloat FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( 1.0            , "1.0e0" )
+        , ( 1.2            , "1.2e0" )
+        , ( 1.23           , "1.23e0" )
+        , ( 1.234          , "1.234e0" )
+        , ( 1.2345         , "1.2345e0" )
+        , ( 1.23456        , "1.23456e0" )
+        , ( 1.234567       , "1.234567e0" )
+        , ( 1.2345678      , "1.2345678e0" )
+        , ( 1.23456735e-36 , "1.23456735e-36" )
+        ]
+  , testMatches "d2sBasic" (formatDouble FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( 0.0    , "0.0e0" )
+        , ( (-0.0) , "-0.0e0" )
+        , ( 1.0    , "1.0e0" )
+        , ( (-1.0) , "-1.0e0" )
+        , ( (0/0)  , "NaN" )
+        , ( (1/0)  , "Infinity" )
+        , ( (-1/0) , "-Infinity" )
+        ]
+  , testMatches "d2sSubnormal" (formatDouble FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( 2.2250738585072014e-308 , "2.2250738585072014e-308" )
+        ]
+  , testMatches "d2sMinAndMax" (formatDouble FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( (coerceWord64ToDouble 0x7fefffffffffffff) , "1.7976931348623157e308" )
+        , ( (coerceWord64ToDouble 0x0000000000000001) , "5.0e-324" )
+        ]
+  , testMatches "d2sTrailingZeros" (formatDouble FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( 2.98023223876953125e-8 , "2.9802322387695313e-8" )
+        ]
+  , testMatches "d2sRegression" (formatDouble FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( (-2.109808898695963e16) , "-2.1098088986959632e16" )
+        , ( 4.940656e-318           , "4.940656e-318" )
+        , ( 1.18575755e-316         , "1.18575755e-316" )
+        , ( 2.989102097996e-312     , "2.989102097996e-312" )
+        , ( 9.0608011534336e15      , "9.0608011534336e15" )
+        , ( 4.708356024711512e18    , "4.708356024711512e18" )
+        , ( 9.409340012568248e18    , "9.409340012568248e18" )
+        , ( 1.2345678               , "1.2345678e0" )
+        , ( 1.9430376160308388e16   , "1.9430376160308388e16" )
+        , ( (-6.9741824662760956e19), "-6.9741824662760956e19" )
+        , ( 4.3816050601147837e18   , "4.3816050601147837e18" )
+        ]
+  , testMatches "d2sLooksLikePowerOf5" (formatDouble FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( (coerceWord64ToDouble 0x4830F0CF064DD592) , "5.764607523034235e39" )
+        , ( (coerceWord64ToDouble 0x4840F0CF064DD592) , "1.152921504606847e40" )
+        , ( (coerceWord64ToDouble 0x4850F0CF064DD592) , "2.305843009213694e40" )
+        ]
+  , testMatches "d2sOutputLength" (formatDouble FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( 1                  , "1.0e0" )
+        , ( 1.2                , "1.2e0" )
+        , ( 1.23               , "1.23e0" )
+        , ( 1.234              , "1.234e0" )
+        , ( 1.2345             , "1.2345e0" )
+        , ( 1.23456            , "1.23456e0" )
+        , ( 1.234567           , "1.234567e0" )
+        , ( 1.2345678          , "1.2345678e0" )
+        , ( 1.23456789         , "1.23456789e0" )
+        , ( 1.234567895        , "1.234567895e0" )
+        , ( 1.2345678901       , "1.2345678901e0" )
+        , ( 1.23456789012      , "1.23456789012e0" )
+        , ( 1.234567890123     , "1.234567890123e0" )
+        , ( 1.2345678901234    , "1.2345678901234e0" )
+        , ( 1.23456789012345   , "1.23456789012345e0" )
+        , ( 1.234567890123456  , "1.234567890123456e0" )
+        , ( 1.2345678901234567 , "1.2345678901234567e0" )
+
+        -- Test 32-bit chunking
+        , ( 4.294967294 , "4.294967294e0" )
+        , ( 4.294967295 , "4.294967295e0" )
+        , ( 4.294967296 , "4.294967296e0" )
+        , ( 4.294967297 , "4.294967297e0" )
+        , ( 4.294967298 , "4.294967298e0" )
+        ]
+  , testMatches "d2sMinMaxShift" (formatDouble FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( (ieeeParts2Double False 4 0) , "1.7800590868057611e-307" )
+        -- 32-bit opt-size=0:  49 <= dist <= 49
+        -- 32-bit opt-size=1:  28 <= dist <= 49
+        -- 64-bit opt-size=0:  50 <= dist <= 50
+        -- 64-bit opt-size=1:  28 <= dist <= 50
+        , ( (ieeeParts2Double False 6 maxMantissa) , "2.8480945388892175e-306" )
+        -- 32-bit opt-size=0:  52 <= dist <= 53
+        -- 32-bit opt-size=1:   2 <= dist <= 53
+        -- 64-bit opt-size=0:  53 <= dist <= 53
+        -- 64-bit opt-size=1:   2 <= dist <= 53
+        , ( (ieeeParts2Double False 41 0) , "2.446494580089078e-296" )
+        -- 32-bit opt-size=0:  52 <= dist <= 52
+        -- 32-bit opt-size=1:   2 <= dist <= 52
+        -- 64-bit opt-size=0:  53 <= dist <= 53
+        -- 64-bit opt-size=1:   2 <= dist <= 53
+        , ( (ieeeParts2Double False 40 maxMantissa) , "4.8929891601781557e-296" )
+        -- 32-bit opt-size=0:  57 <= dist <= 58
+        -- 32-bit opt-size=1:  57 <= dist <= 58
+        -- 64-bit opt-size=0:  58 <= dist <= 58
+        -- 64-bit opt-size=1:  58 <= dist <= 58
+        , ( (ieeeParts2Double False 1077 0) , "1.8014398509481984e16" )
+        -- 32-bit opt-size=0:  57 <= dist <= 57
+        -- 32-bit opt-size=1:  57 <= dist <= 57
+        -- 64-bit opt-size=0:  58 <= dist <= 58
+        -- 64-bit opt-size=1:  58 <= dist <= 58
+        , ( (ieeeParts2Double False 1076 maxMantissa) , "3.6028797018963964e16" )
+        -- 32-bit opt-size=0:  51 <= dist <= 52
+        -- 32-bit opt-size=1:  51 <= dist <= 59
+        -- 64-bit opt-size=0:  52 <= dist <= 52
+        -- 64-bit opt-size=1:  52 <= dist <= 59
+        , ( (ieeeParts2Double False 307 0) , "2.900835519859558e-216" )
+        -- 32-bit opt-size=0:  51 <= dist <= 51
+        -- 32-bit opt-size=1:  51 <= dist <= 59
+        -- 64-bit opt-size=0:  52 <= dist <= 52
+        -- 64-bit opt-size=1:  52 <= dist <= 59
+        , ( (ieeeParts2Double False 306 maxMantissa) , "5.801671039719115e-216" )
+        -- 32-bit opt-size=0:  49 <= dist <= 49
+        -- 32-bit opt-size=1:  44 <= dist <= 49
+        -- 64-bit opt-size=0:  50 <= dist <= 50
+        -- 64-bit opt-size=1:  44 <= dist <= 50
+        , ( (ieeeParts2Double False 934 0x000FA7161A4D6e0C) , "3.196104012172126e-27" )
+        ]
+  , testMatches "d2sSmallIntegers" (formatDouble FFExponent Nothing) (flip (showEFloat Nothing) "")
+        [ ( 9007199254740991.0 , "9.007199254740991e15" )
+        , ( 9007199254740992.0 , "9.007199254740992e15" )
+
+        , ( 1.0e+0                , "1.0e0" )
+        , ( 1.2e+1                , "1.2e1" )
+        , ( 1.23e+2               , "1.23e2" )
+        , ( 1.234e+3              , "1.234e3" )
+        , ( 1.2345e+4             , "1.2345e4" )
+        , ( 1.23456e+5            , "1.23456e5" )
+        , ( 1.234567e+6           , "1.234567e6" )
+        , ( 1.2345678e+7          , "1.2345678e7" )
+        , ( 1.23456789e+8         , "1.23456789e8" )
+        , ( 1.23456789e+9         , "1.23456789e9" )
+        , ( 1.234567895e+9        , "1.234567895e9" )
+        , ( 1.2345678901e+10      , "1.2345678901e10" )
+        , ( 1.23456789012e+11     , "1.23456789012e11" )
+        , ( 1.234567890123e+12    , "1.234567890123e12" )
+        , ( 1.2345678901234e+13   , "1.2345678901234e13" )
+        , ( 1.23456789012345e+14  , "1.23456789012345e14" )
+        , ( 1.234567890123456e+15 , "1.234567890123456e15" )
+
+        -- 10^i
+        , ( 1.0e+0  , "1.0e0" )
+        , ( 1.0e+1  , "1.0e1" )
+        , ( 1.0e+2  , "1.0e2" )
+        , ( 1.0e+3  , "1.0e3" )
+        , ( 1.0e+4  , "1.0e4" )
+        , ( 1.0e+5  , "1.0e5" )
+        , ( 1.0e+6  , "1.0e6" )
+        , ( 1.0e+7  , "1.0e7" )
+        , ( 1.0e+8  , "1.0e8" )
+        , ( 1.0e+9  , "1.0e9" )
+        , ( 1.0e+10 , "1.0e10" )
+        , ( 1.0e+11 , "1.0e11" )
+        , ( 1.0e+12 , "1.0e12" )
+        , ( 1.0e+13 , "1.0e13" )
+        , ( 1.0e+14 , "1.0e14" )
+        , ( 1.0e+15 , "1.0e15" )
+
+        -- 10^15 + 10^i
+        , ( (1.0e+15 + 1.0e+0)  , "1.000000000000001e15" )
+        , ( (1.0e+15 + 1.0e+1)  , "1.00000000000001e15" )
+        , ( (1.0e+15 + 1.0e+2)  , "1.0000000000001e15" )
+        , ( (1.0e+15 + 1.0e+3)  , "1.000000000001e15" )
+        , ( (1.0e+15 + 1.0e+4)  , "1.00000000001e15" )
+        , ( (1.0e+15 + 1.0e+5)  , "1.0000000001e15" )
+        , ( (1.0e+15 + 1.0e+6)  , "1.000000001e15" )
+        , ( (1.0e+15 + 1.0e+7)  , "1.00000001e15" )
+        , ( (1.0e+15 + 1.0e+8)  , "1.0000001e15" )
+        , ( (1.0e+15 + 1.0e+9)  , "1.000001e15" )
+        , ( (1.0e+15 + 1.0e+10) , "1.00001e15" )
+        , ( (1.0e+15 + 1.0e+11) , "1.0001e15" )
+        , ( (1.0e+15 + 1.0e+12) , "1.001e15" )
+        , ( (1.0e+15 + 1.0e+13) , "1.01e15" )
+        , ( (1.0e+15 + 1.0e+14) , "1.1e15" )
+
+        -- Largest power of 2 <= 10^(i+1)
+        , ( 8.0                , "8.0e0" )
+        , ( 64.0               , "6.4e1" )
+        , ( 512.0              , "5.12e2" )
+        , ( 8192.0             , "8.192e3" )
+        , ( 65536.0            , "6.5536e4" )
+        , ( 524288.0           , "5.24288e5" )
+        , ( 8388608.0          , "8.388608e6" )
+        , ( 67108864.0         , "6.7108864e7" )
+        , ( 536870912.0        , "5.36870912e8" )
+        , ( 8589934592.0       , "8.589934592e9" )
+        , ( 68719476736.0      , "6.8719476736e10" )
+        , ( 549755813888.0     , "5.49755813888e11" )
+        , ( 8796093022208.0    , "8.796093022208e12" )
+        , ( 70368744177664.0   , "7.0368744177664e13" )
+        , ( 562949953421312.0  , "5.62949953421312e14" )
+        , ( 9007199254740992.0 , "9.007199254740992e15" )
+
+        -- 1000 * (Largest power of 2 <= 10^(i+1))
+        , ( 8.0e+3             , "8.0e3" )
+        , ( 64.0e+3            , "6.4e4" )
+        , ( 512.0e+3           , "5.12e5" )
+        , ( 8192.0e+3          , "8.192e6" )
+        , ( 65536.0e+3         , "6.5536e7" )
+        , ( 524288.0e+3        , "5.24288e8" )
+        , ( 8388608.0e+3       , "8.388608e9" )
+        , ( 67108864.0e+3      , "6.7108864e10" )
+        , ( 536870912.0e+3     , "5.36870912e11" )
+        , ( 8589934592.0e+3    , "8.589934592e12" )
+        , ( 68719476736.0e+3   , "6.8719476736e13" )
+        , ( 549755813888.0e+3  , "5.49755813888e14" )
+        , ( 8796093022208.0e+3 , "8.796093022208e15" )
+        ]
+  , testMatches "f2sPowersOf10" floatDec show $
+        fmap asShowRef [read ("1.0e" ++ show x) :: Float | x <- [-46..39 :: Int]]
+  , testMatches "d2sPowersOf10" doubleDec show $
+        fmap asShowRef [read ("1.0e" ++ show x) :: Double | x <- [-324..309 :: Int]]
+  ]
+  where
+    testMatches :: (Show a) => TestName -> (a -> Builder) -> (a -> String) -> [(a, String)] -> TestTree
+    testMatches name dec refdec lst = testProperty name $
+      all (\(x, ref) -> L.unpack (toLazyByteString (dec x)) == encodeASCII (refdec x)
+                        && refdec x == ref) lst
+
+    maxMantissa = (1 `shiftL` 53) - 1 :: Word64
+
+    ieeeParts2Double :: Bool -> Int -> Word64 -> Double
+    ieeeParts2Double sign expo mantissa =
+        coerceWord64ToDouble $ (fromIntegral (fromEnum sign) `shiftL` 63) .|. (fromIntegral expo `shiftL` 52) .|. mantissa
+
+    asShowRef x = (x, show x)
+
 testsChar8 :: [TestTree]
 testsChar8 =
   [ testBuilderConstr "charChar8" char8_list char8