From 22b464a3e7d77201c2257c604f452960a06acd2e Mon Sep 17 00:00:00 2001
From: Erik Schnetter <schnetter@gmail.com>
Date: Tue, 6 Jan 2015 14:45:52 -0500
Subject: [PATCH 1/7] Introduce fast (non-IEEE) versions of floating point
 intrinsics

---
 base/inference.jl  |  1 +
 src/init.c         |  3 ++-
 src/intrinsics.cpp | 48 +++++++++++++++++++++++++++++++++++++++-------
 src/julia.h        |  7 +++++++
 4 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/base/inference.jl b/base/inference.jl
index 1dfc6d3c3b0c7..fcb17692191d5 100644
--- a/base/inference.jl
+++ b/base/inference.jl
@@ -50,6 +50,7 @@ immutable JLCompilerOpts
     opt_level::Int8
     depwarn::Int8
     can_inline::Int8
+    fast_math::Int8
 end
 
 compileropts() = unsafe_load(cglobal(:jl_compileropts, JLCompilerOpts))
diff --git a/src/init.c b/src/init.c
index ac6071a8e0a6d..f4bfe96d0b93a 100644
--- a/src/init.c
+++ b/src/init.c
@@ -97,7 +97,8 @@ jl_compileropts_t jl_compileropts = { NULL, // julia_home
                                       JL_COMPILEROPT_COMPILE_DEFAULT,
                                       0,    // opt_level
                                       1,    // depwarn
-                                      1     // can_inline
+                                      1,    // can_inline
+                                      JL_COMPILEROPT_FAST_MATH_DEFAULT
 };
 
 int jl_boot_file_loaded = 0;
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index e2ffb45f68949..6ba00608a84b4 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -6,6 +6,9 @@ namespace JL_I {
         neg_int, add_int, sub_int, mul_int,
         sdiv_int, udiv_int, srem_int, urem_int, smod_int,
         neg_float, add_float, sub_float, mul_float, div_float, rem_float,
+        // fast arithmetic
+        neg_float_fast, add_float_fast, sub_float_fast,
+        mul_float_fast, div_float_fast, rem_float_fast,
         // same-type comparisons
         eq_int,  ne_int,
         slt_int, ult_int,
@@ -718,6 +721,26 @@ static Value *emit_srem(Value *x, Value *den, jl_codectx_t *ctx)
     return ret;
 }
 
+// Scope guard: optionally puts the global `builder` into fast-math mode; the previous flags are restored on destruction
+struct math_builder {
+    FastMathFlags old_fmf; // flags `builder` carried before this guard was constructed
+    math_builder(jl_codectx_t *ctx, bool always_fast = false): // NOTE(review): `ctx` is not read in this struct
+        old_fmf(builder.getFastMathFlags())
+    {
+        if (jl_compileropts.fast_math != JL_COMPILEROPT_FAST_MATH_OFF && // OFF vetoes fast math unconditionally
+            (always_fast || // caller asked for it (the *_float_fast intrinsics pass true)
+             jl_compileropts.fast_math == JL_COMPILEROPT_FAST_MATH_ON)) { // or the option is set to ON
+            FastMathFlags fmf;
+            fmf.setUnsafeAlgebra(); // presumably LLVM's "fast" flag set (non-IEEE rewrites allowed) -- see LLVM LangRef
+            builder.SetFastMathFlags(fmf);
+        }
+    }
+    IRBuilder<>& operator()() const { return builder; } // hands out the global builder for use while the guard is alive
+    ~math_builder() {
+        builder.SetFastMathFlags(old_fmf); // put back whatever flags were active before
+    }
+};
+
 static Value *emit_smod(Value *x, Value *den, jl_codectx_t *ctx)
 {
     Type *t = den->getType();
@@ -926,15 +949,24 @@ static Value *emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
 // that do the correct thing on LLVM <= 3.3 and >= 3.5 respectively.
 // See issue #7868
 #ifdef LLVM35
-    HANDLE(neg_float,1) return builder.CreateFSub(ConstantFP::get(FT(t), -0.0), FP(x));
+    HANDLE(neg_float,1) return math_builder(ctx)().CreateFSub(ConstantFP::get(FT(t), -0.0), FP(x));
+    HANDLE(neg_float_fast,1) return math_builder(ctx, true)().CreateFNeg(FP(x));
 #else
-    HANDLE(neg_float,1) return builder.CreateFMul(ConstantFP::get(FT(t), -1.0), FP(x));
+    HANDLE(neg_float,1)
+        return math_builder(ctx)().CreateFMul(ConstantFP::get(FT(t), -1.0), FP(x));
+    HANDLE(neg_float_fast,1)
+        return math_builder(ctx, true)().CreateFMul(ConstantFP::get(FT(t), -1.0), FP(x));
 #endif
-    HANDLE(add_float,2) return builder.CreateFAdd(FP(x), FP(y));
-    HANDLE(sub_float,2) return builder.CreateFSub(FP(x), FP(y));
-    HANDLE(mul_float,2) return builder.CreateFMul(FP(x), FP(y));
-    HANDLE(div_float,2) return builder.CreateFDiv(FP(x), FP(y));
-    HANDLE(rem_float,2) return builder.CreateFRem(FP(x), FP(y));
+    HANDLE(add_float,2) return math_builder(ctx)().CreateFAdd(FP(x), FP(y));
+    HANDLE(sub_float,2) return math_builder(ctx)().CreateFSub(FP(x), FP(y));
+    HANDLE(mul_float,2) return math_builder(ctx)().CreateFMul(FP(x), FP(y));
+    HANDLE(div_float,2) return math_builder(ctx)().CreateFDiv(FP(x), FP(y));
+    HANDLE(rem_float,2) return math_builder(ctx)().CreateFRem(FP(x), FP(y));
+    HANDLE(add_float_fast,2) return math_builder(ctx, true)().CreateFAdd(FP(x), FP(y));
+    HANDLE(sub_float_fast,2) return math_builder(ctx, true)().CreateFSub(FP(x), FP(y));
+    HANDLE(mul_float_fast,2) return math_builder(ctx, true)().CreateFMul(FP(x), FP(y));
+    HANDLE(div_float_fast,2) return math_builder(ctx, true)().CreateFDiv(FP(x), FP(y));
+    HANDLE(rem_float_fast,2) return math_builder(ctx, true)().CreateFRem(FP(x), FP(y));
 
     HANDLE(checked_sadd,2)
     HANDLE(checked_uadd,2)
@@ -1262,6 +1294,8 @@ extern "C" void jl_init_intrinsic_functions(void)
     ADD_I(smod_int);
     ADD_I(neg_float); ADD_I(add_float); ADD_I(sub_float); ADD_I(mul_float);
     ADD_I(div_float); ADD_I(rem_float);
+    ADD_I(neg_float_fast); ADD_I(add_float_fast); ADD_I(sub_float_fast);
+    ADD_I(mul_float_fast); ADD_I(div_float_fast); ADD_I(rem_float_fast);
     ADD_I(eq_int); ADD_I(ne_int);
     ADD_I(slt_int); ADD_I(ult_int);
     ADD_I(sle_int); ADD_I(ule_int);
diff --git a/src/julia.h b/src/julia.h
index 578cdb3c68b48..4348474d56cc9 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -411,6 +411,7 @@ extern jl_sym_t *abstracttype_sym; extern jl_sym_t *bitstype_sym;
 extern jl_sym_t *compositetype_sym; extern jl_sym_t *type_goto_sym;
 extern jl_sym_t *global_sym;  extern jl_sym_t *tuple_sym;
 extern jl_sym_t *boundscheck_sym; extern jl_sym_t *copyast_sym;
+extern jl_sym_t *fastmath_sym;
 extern jl_sym_t *simdloop_sym; extern jl_sym_t *meta_sym;
 extern jl_sym_t *arrow_sym; extern jl_sym_t *ldots_sym;
 
@@ -1327,6 +1328,8 @@ void show_execution_point(char *filename, int lno);
 
 // compiler options -----------------------------------------------------------
 
+// Note: need to keep this in sync with its initialization in
+// src/init.c, and with JLCompilerOpts in base/inference.jl
 typedef struct {
     const char *julia_home;
     const char *julia_bin;
@@ -1342,6 +1345,7 @@ typedef struct {
     int8_t opt_level;
     int8_t depwarn;
     int8_t can_inline;
+    int8_t fast_math;
 } jl_compileropts_t;
 
 extern DLLEXPORT jl_compileropts_t jl_compileropts;
@@ -1354,6 +1358,9 @@ extern DLLEXPORT jl_compileropts_t jl_compileropts;
 #define JL_COMPILEROPT_CHECK_BOUNDS_DEFAULT 0
 #define JL_COMPILEROPT_CHECK_BOUNDS_ON 1
 #define JL_COMPILEROPT_CHECK_BOUNDS_OFF 2
+#define JL_COMPILEROPT_FAST_MATH_DEFAULT 0
+#define JL_COMPILEROPT_FAST_MATH_ON 1
+#define JL_COMPILEROPT_FAST_MATH_OFF 2
 #define JL_COMPILEROPT_COMPILE_DEFAULT 1
 #define JL_COMPILEROPT_COMPILE_OFF 0
 #define JL_COMPILEROPT_COMPILE_ON  1

From 5417ba41ffb7efb1084b147d974876ddcecadeb6 Mon Sep 17 00:00:00 2001
From: Erik Schnetter <schnetter@gmail.com>
Date: Tue, 6 Jan 2015 14:46:40 -0500
Subject: [PATCH 2/7] Add `@fastmath` macro that converts expressions to use
 fast math

---
 base/base.jl      | 18 ++++++++++++++++++
 base/exports.jl   | 10 ++++++++++
 base/float.jl     | 42 ++++++++++++++++++++++++++++++++++++++++++
 src/alloc.c       |  1 +
 src/interpreter.c |  3 +++
 src/jltypes.c     |  1 +
 6 files changed, 75 insertions(+)

diff --git a/base/base.jl b/base/base.jl
index daf59b0f9433a..92870891ea881 100644
--- a/base/base.jl
+++ b/base/base.jl
@@ -242,6 +242,24 @@ macro inbounds(blk)
     :(@boundscheck false $(esc(blk)))
 end
 
+function make_fastmath(expr)
+    # Expressions are rebuilt with both head and arguments rewritten recursively.
+    isa(expr, Expr) &&
+        return Expr(make_fastmath(expr.head), map(make_fastmath, expr.args)...)
+    # A leaf equal to one of these operator names is swapped for its fast variant.
+    for (op, op_fast) in ((:+, :add_fast), (:-, :sub_fast), (:*, :mul_fast),
+                          (:/, :div_fast), (:rem, :rem_fast),
+                          (:mod, :mod_fast), (:cmp, :cmp_fast))
+        expr == op && return op_fast
+    end
+    # Everything else passes through untouched.
+    expr
+end
+
+macro fastmath(expr)  # rewrites +, -, *, /, rem, mod and cmp inside `expr` to their *_fast counterparts
+    make_fastmath(esc(expr))  # escaped before rewriting, so the substituted names sit inside the escape (presumably resolved in the caller's scope)
+end
+
 macro label(name::Symbol)
     Expr(:symboliclabel, name)
 end
diff --git a/base/exports.jl b/base/exports.jl
index b14c9cfc2d7f7..e319785717794 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -464,6 +464,15 @@ export
     √,
     ∛,
 
+# fast math
+    add_fast,
+    sub_fast,
+    mul_fast,
+    div_fast,
+    rem_fast,
+    mod_fast,
+    cmp_fast,
+
 # specfun
     airy,
     airyai,
@@ -1417,6 +1426,7 @@ export
     @deprecate,
     @boundscheck,
     @inbounds,
+    @fastmath,
     @simd,
     @label,
     @goto,
diff --git a/base/float.jl b/base/float.jl
index 6938ce799abef..2bfa86a713aa1 100644
--- a/base/float.jl
+++ b/base/float.jl
@@ -204,6 +204,35 @@ widen(::Type{Float32}) = Float64
 rem(x::Float32, y::Float32) = box(Float32,rem_float(unbox(Float32,x),unbox(Float32,y)))
 rem(x::Float64, y::Float64) = box(Float64,rem_float(unbox(Float64,x),unbox(Float64,y)))
 
+# fast versions of arithmetic functions that may violate strict IEEE semantics
+# TODO: provide isnan_fast and friends
+for (op_fast, op) in ((:add_fast, :+), (:sub_fast, :-),
+                      (:mul_fast, :*), (:div_fast, :/),
+                      (:rem_fast, :rem), (:mod_fast, :mod),
+                      (:cmp_fast, :cmp))
+    @eval begin
+        # generic fall-back: any argument list is forwarded to the regular function
+        ($op_fast)(xs...) = ($op)(xs...)
+        # mixed Number arguments: promote to a common type, then dispatch again
+        ($op_fast)(x::Number, y::Number, zs::Number...) =
+            ($op_fast)(promote(x,y,zs...)...)
+        # same-type Number arguments without a more specific method use the regular function
+        ($op_fast){T<:Number}(x::T,ys::T...) = ($op)(x,ys...)
+    end
+end
+for T in (Float32, Float64)  # methods that call the *_float_fast intrinsics directly
+    @eval begin
+        sub_fast(x::$T) = box($T,neg_float_fast(unbox($T,x)))  # one-argument form: negation
+        add_fast(x::$T, y::$T) = box($T,add_float_fast(unbox($T,x),unbox($T,y)))
+        sub_fast(x::$T, y::$T) = box($T,sub_float_fast(unbox($T,x),unbox($T,y)))
+        mul_fast(x::$T, y::$T) = box($T,mul_float_fast(unbox($T,x),unbox($T,y)))
+        div_fast(x::$T, y::$T) = box($T,div_float_fast(unbox($T,x),unbox($T,y)))
+        rem_fast(x::$T, y::$T) = box($T,rem_float_fast(unbox($T,x),unbox($T,y)))
+        add_fast(x::$T, y::$T, zs::$T...) = add_fast(add_fast(x, y), zs...)  # 3+ operands: combine the first two, then recurse on the rest
+        mul_fast(x::$T, y::$T, zs::$T...) = mul_fast(mul_fast(x, y), zs...)  # likewise for products
+    end
+end
+
 cld{T<:FloatingPoint}(x::T, y::T) = -fld(-x,y)
 
 function mod{T<:FloatingPoint}(x::T, y::T)
@@ -217,6 +246,16 @@ function mod{T<:FloatingPoint}(x::T, y::T)
     end
 end
 
+function mod_fast{T<:FloatingPoint}(x::T, y::T)
+    # Start from the remainder computed by rem_fast.
+    rest = rem_fast(x,y)
+    # A zero remainder is returned carrying the sign of y.
+    rest == 0 && return copysign(rest,y)
+    # y is added exactly when one of (rest > 0), (y > 0) holds but not
+    # the other; otherwise the remainder is returned as it is.
+    differ = (rest > 0) $ (y > 0)
+    differ ? rest+y : rest
+end
 
 ## floating point comparisons ##
 ==(x::Float32, y::Float32) = eq_float(unbox(Float32,x),unbox(Float32,y))
@@ -248,6 +287,9 @@ function cmp(x::FloatingPoint, y::Real)
     ifelse(x<y, -1, ifelse(x>y, 1, 0))
 end
 
+cmp_fast(x::Float32, y::Float32) = ifelse(x<y, -1, ifelse(x>y, 1, 0))  # -1 if x<y, 1 if x>y, otherwise 0
+cmp_fast(x::Float64, y::Float64) = ifelse(x<y, -1, ifelse(x>y, 1, 0))  # same rule for Float64
+
 for Ti in (Int64,UInt64,Int128,UInt128)
     for Tf in (Float32,Float64)
         @eval begin
diff --git a/src/alloc.c b/src/alloc.c
index 9ea24664d5e9b..72abc7fa378be 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -93,6 +93,7 @@ jl_sym_t *compositetype_sym; jl_sym_t *type_goto_sym;
 jl_sym_t *global_sym; jl_sym_t *tuple_sym;
 jl_sym_t *dot_sym;    jl_sym_t *newvar_sym;
 jl_sym_t *boundscheck_sym; jl_sym_t *copyast_sym;
+jl_sym_t *fastmath_sym;
 jl_sym_t *simdloop_sym; jl_sym_t *meta_sym;
 jl_sym_t *arrow_sym; jl_sym_t *ldots_sym;
 
diff --git a/src/interpreter.c b/src/interpreter.c
index 307535251183f..37146f82ac53d 100644
--- a/src/interpreter.c
+++ b/src/interpreter.c
@@ -449,6 +449,9 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl)
     else if (ex->head == boundscheck_sym) {
         return (jl_value_t*)jl_nothing;
     }
+    else if (ex->head == fastmath_sym) {
+        return (jl_value_t*)jl_nothing;
+    }
     else if (ex->head == simdloop_sym) {
         return (jl_value_t*)jl_nothing;
     }
diff --git a/src/jltypes.c b/src/jltypes.c
index f58a723f7f162..f2825bad2eab8 100644
--- a/src/jltypes.c
+++ b/src/jltypes.c
@@ -3279,6 +3279,7 @@ void jl_init_types(void)
     kw_sym = jl_symbol("kw");
     dot_sym = jl_symbol(".");
     boundscheck_sym = jl_symbol("boundscheck");
+    fastmath_sym = jl_symbol("fastmath");
     newvar_sym = jl_symbol("newvar");
     copyast_sym = jl_symbol("copyast");
     simdloop_sym = jl_symbol("simdloop");

From 7f23d5291fb308d7c6b6ea93da09f11553cf83a5 Mon Sep 17 00:00:00 2001
From: Erik Schnetter <schnetter@gmail.com>
Date: Tue, 6 Jan 2015 14:46:54 -0500
Subject: [PATCH 3/7] Add test cases for fast math

---
 test/numbers.jl | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/test/numbers.jl b/test/numbers.jl
index aab864fd9d2fd..411a155aaddd0 100644
--- a/test/numbers.jl
+++ b/test/numbers.jl
@@ -59,6 +59,36 @@
 @test minmax(NaN, 3.) == (3., 3.)
 @test isequal(minmax(NaN, NaN), (NaN, NaN))
 
+# fast math: x + eps/2 + eps/2 must give x under IEEE rules, but may give x or x + eps under @fastmath
+const one32 = one(Float32)
+const eps32 = eps(Float32)
+const eps32_2 = eps32/2
+# Note: these are global rather than local functions, since local functions are not yet optimized
+fm_ieee_32(x) = x + eps32_2 + eps32_2
+fm_fast_32(x) = @fastmath x + eps32_2 + eps32_2
+@test fm_ieee_32(one32) == one32  # IEEE: each addition of eps/2 rounds back to 1
+@test (fm_fast_32(one32) == one32 ||
+       fm_fast_32(one32) == one32 + eps32 > one32)
+
+const one64 = one(Float64)
+const eps64 = eps(Float64)
+const eps64_2 = eps64/2
+# Note: these are global rather than local functions, since local functions are not yet optimized
+fm_ieee_64(x) = x + eps64_2 + eps64_2
+fm_fast_64(x) = @fastmath x + eps64_2 + eps64_2
+@test fm_ieee_64(one64) == one64  # IEEE: each addition of eps/2 rounds back to 1
+@test (fm_fast_64(one64) == one64 ||
+       fm_fast_64(one64) == one64 + eps64 > one64)
+
+let epsf = 1.0f0/2^15, one_epsf = 1+epsf  # exact value of (1+epsf)^2 - 1 is 65537/2^30
+    @test isapprox((@fastmath one_epsf * one_epsf - 1),
+                   float32(65537/1073741824))
+end
+let eps = 1.0/2^30, one_eps = 1+eps  # exact value of (1+eps)^2 - 1 is 2147483649/2^60
+    @test isapprox((@fastmath one_eps * one_eps - 1),
+                   2147483649/1152921504606846976)
+end
+
 # lexing typemin(Int64)
 @test (-9223372036854775808)^1 == -9223372036854775808
 @test [1 -1 -9223372036854775808] == [1 -1 typemin(Int64)]

From 9d462ed0526524ffed0a23d14ffcc6214ffc8f48 Mon Sep 17 00:00:00 2001
From: Erik Schnetter <schnetter@gmail.com>
Date: Tue, 6 Jan 2015 14:47:19 -0500
Subject: [PATCH 4/7] Add command line option `--math-mode=ieee` to disable
 fast math

---
 ui/repl.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/ui/repl.c b/ui/repl.c
index 2437350ac7aec..d6d303ad5f6ab 100644
--- a/ui/repl.c
+++ b/ui/repl.c
@@ -86,7 +86,8 @@ void parse_opts(int *argcp, char ***argvp)
 	opt_dump_bitcode,
 	opt_compile,
 	opt_depwarn,
-	opt_inline
+	opt_inline,
+        opt_math_mode
     };
     static const char shortopts[] = "+H:hJ:C:O";
     static const struct option longopts[] = {
@@ -105,6 +106,7 @@ void parse_opts(int *argcp, char ***argvp)
         { "compile",       required_argument, 0, opt_compile },
         { "depwarn",       required_argument, 0, opt_depwarn },
         { "inline",        required_argument, 0, opt_inline },
+        { "math-mode",     required_argument, 0, opt_math_mode },
         { 0, 0, 0, 0 }
     };
     int c;
@@ -218,6 +220,16 @@ void parse_opts(int *argcp, char ***argvp)
                 exit(1);
             }
             break;
+        case opt_math_mode:
+            if (!strcmp(optarg, "ieee"))
+                jl_compileropts.fast_math = JL_COMPILEROPT_FAST_MATH_OFF;
+            else if (!strcmp(optarg, "user"))
+                jl_compileropts.fast_math = JL_COMPILEROPT_FAST_MATH_DEFAULT;
+            else {
+                ios_printf(ios_stderr, "julia: invalid argument to --math-mode (%s)\n", optarg);
+                exit(1);
+            }
+            break;
         default:
             ios_printf(ios_stderr, "julia: unhandled option -- %c\n",  c);
             ios_printf(ios_stderr, "This is a bug, please report it.\n");

From 877603920235d85b55e332d49b408244c6254281 Mon Sep 17 00:00:00 2001
From: Erik Schnetter <schnetter@gmail.com>
Date: Tue, 6 Jan 2015 14:47:29 -0500
Subject: [PATCH 5/7] Document fast math

---
 doc/man/julia.1                 | 5 +++++
 doc/manual/getting-started.rst  | 2 ++
 doc/manual/performance-tips.rst | 4 ++++
 3 files changed, 11 insertions(+)

diff --git a/doc/man/julia.1 b/doc/man/julia.1
index 62762e548d34a..d25a8bcbc3b2b 100644
--- a/doc/man/julia.1
+++ b/doc/man/julia.1
@@ -120,6 +120,11 @@ Enable or disable color text
 --check-bounds={yes|no}
 Emit bounds checks always or never (ignoring declarations)
 
+.TP
+--math-mode={ieee|user}
+Always use IEEE semantics for math (ignoring declarations),
+or adhere to declarations in source code
+
 .TP
 --int-literals={32|64}
 Select integer literal size independent of platform
diff --git a/doc/manual/getting-started.rst b/doc/manual/getting-started.rst
index 533b3f5d7187a..656fd553724d7 100644
--- a/doc/manual/getting-started.rst
+++ b/doc/manual/getting-started.rst
@@ -130,6 +130,8 @@ those available for the ``perl`` and ``ruby`` programs::
      --track-allocation={none|user|all}
                               Count bytes allocated by each source line
      --check-bounds={yes|no}  Emit bounds checks always or never (ignoring declarations)
+     --math-mode={ieee|user}  Always use IEEE semantics for math (ignoring declarations),
+                              or adhere to declarations in source code
      -O, --optimize           Run time-intensive code optimizations
      --int-literals={32|64}   Select integer literal size independent of platform
      --dump-bitcode={yes|no}  Dump bitcode for the system image (used with --build)
diff --git a/doc/manual/performance-tips.rst b/doc/manual/performance-tips.rst
index e5b508f6428de..0fcaadb613d93 100644
--- a/doc/manual/performance-tips.rst
+++ b/doc/manual/performance-tips.rst
@@ -546,6 +546,10 @@ properties.
 -  Use :obj:`@inbounds` to eliminate array bounds checking within expressions.
    Be certain before doing this. If the subscripts are ever out of bounds,
    you may suffer crashes or silent corruption.
+-  Use :obj:`@fastmath` to allow floating point optimizations that are
+   correct for real numbers, but lead to differences for IEEE numbers.
+   Be careful when doing this, as this may change numerical results.
+   This corresponds to the ``-ffast-math`` option of clang.
 -  Write :obj:`@simd` in front of ``for`` loops that are amenable to vectorization.
    **This feature is experimental** and could change or disappear in future
    versions of Julia.

From 0c4bab3179480d557d8d53c57a379d325ead4b43 Mon Sep 17 00:00:00 2001
From: Erik Schnetter <schnetter@gmail.com>
Date: Tue, 6 Jan 2015 16:31:34 -0500
Subject: [PATCH 6/7] Add a non-trivial `@fastmath` example to the
 documentation

---
 doc/manual/performance-tips.rst | 91 ++++++++++++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/doc/manual/performance-tips.rst b/doc/manual/performance-tips.rst
index 0fcaadb613d93..7b251d1d6039b 100644
--- a/doc/manual/performance-tips.rst
+++ b/doc/manual/performance-tips.rst
@@ -554,7 +554,7 @@ properties.
    **This feature is experimental** and could change or disappear in future
    versions of Julia.
 
-Here is an example with both forms of markup::
+Here is an example with both :obj:`@inbounds` and :obj:`@simd` markup::
 
     function inner( x, y )
         s = zero(eltype(x))
@@ -625,6 +625,95 @@ properties:
    LLVM auto-vectorization may kick in automatically, leading to no further
    speedup with :obj:`@simd`.
 
+Here is an example with all three kinds of markup. This program first
+calculates the finite difference of a one-dimensional array, and then
+evaluates the L2-norm of the result::
+
+    function init!(u)
+        n = length(u)
+        dx = 1.0 / (n-1)
+        @fastmath @inbounds @simd for i in 1:n
+            u[i] = sin(2pi*dx*i)
+        end
+    end
+     
+    function deriv!(u, du)
+        n = length(u)
+        dx = 1.0 / (n-1)
+        @fastmath @inbounds du[1] = (u[2] - u[1]) / dx
+        @fastmath @inbounds @simd for i in 2:n-1
+            du[i] = (u[i+1] - u[i-1]) / (2*dx)
+        end
+        @fastmath @inbounds du[n] = (u[n] - u[n-1]) / dx
+    end
+     
+    function norm(u)
+        n = length(u)
+        T = eltype(u)
+        s = zero(T)
+        @fastmath @inbounds @simd for i in 1:n
+            s += u[i]^2
+        end
+        @fastmath @inbounds return sqrt(s/n)
+    end
+     
+    function main()
+        n = 2000
+        u = Array(Float64, n)
+        init!(u)
+        du = similar(u)
+        
+        deriv!(u, du)
+        nu = norm(du)
+        
+        @time for i in 1:10^6
+            deriv!(u, du)
+            nu = norm(du)
+        end
+        
+        println(nu)
+    end
+     
+    main()
+
+On a computer with a 2.7 GHz Intel Core i7 processor, this produces::
+
+    $ julia wave.jl
+    elapsed time: 1.207814709 seconds (0 bytes allocated)
+    4.443986180758243
+
+    $ julia --math-mode=ieee wave.jl
+    elapsed time: 4.487083643 seconds (0 bytes allocated)
+    4.443986180758243
+
+Here, the option ``--math-mode=ieee`` disables the :obj:`@fastmath`
+macro, so that we can compare results.
+
+In this case, the speedup due to :obj:`@fastmath` is a factor of about
+3.7. This is unusually large -- in general, the speedup will be
+smaller. (In this particular example, the working set of the benchmark
+is small enough to fit into the L1 cache of the processor, so that
+memory access latency does not play a role, and computing time is
+dominated by CPU usage. In many real world programs this is not the
+case.) Also, in this case this optimization does not change the result
+-- in general, the result will be slightly different. In some cases,
+especially for numerically unstable algorithms, the result can be very
+different.
+
+The annotation :obj:`@fastmath` re-arranges floating point
+expressions, e.g. changing the order of evaluation, or assuming that
+certain special cases (inf, nan) cannot occur. In this case (and on
+this particular computer), the main difference is that the expression
+``1 / (2*dx)`` in the function ``deriv!`` is hoisted out of the loop
+(i.e. calculated outside the loop), as if one had written ``idx = 1 /
+(2*dx)``. In the loop, the expression ``... / (2*dx)`` then becomes
+``... * idx``, which is much faster to evaluate. Of course, both the
+actual optimization that is applied by the compiler as well as the
+resulting speedup depend very much on the hardware. You can examine
+the change in generated code by using Julia's :obj:`code_native`
+function.
+
+
 .. _man-code-warntype:
 
 :obj:`@code_warntype`

From 8e1b7304e36b145ea19616077ebeb6b1b6596d22 Mon Sep 17 00:00:00 2001
From: Erik Schnetter <schnetter@gmail.com>
Date: Mon, 12 Jan 2015 14:59:32 -0500
Subject: [PATCH 7/7] Describe `--math-mode` option in run-time help-text

---
 ui/repl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ui/repl.c b/ui/repl.c
index d6d303ad5f6ab..aa72e7a6ae84e 100644
--- a/ui/repl.c
+++ b/ui/repl.c
@@ -73,6 +73,8 @@ static const char opts[] =
     "                          Count bytes allocated by each source line\n"
     " --check-bounds={yes|no}  Emit bounds checks always or never (ignoring declarations)\n"
     " --inline={yes|no}        Control whether inlining is permitted (even for functions declared as @inline)\n"
+    " --math-mode={ieee|user}  Always use IEEE semantics for math (ignoring declarations),\n"
+    "                          or adhere to declarations in source code\n"
     " -O, --optimize           Run time-intensive code optimizations\n"
     " --int-literals={32|64}   Select integer literal size independent of platform\n"
     " --dump-bitcode={yes|no}  Dump bitcode for the system image (used with --build)\n"