From fae0768f4a79d1fec637a53104c24d4761312abe Mon Sep 17 00:00:00 2001
From: Mohamed Tarek <mohamed82008@gmail.com>
Date: Tue, 6 Nov 2018 20:40:51 +1100
Subject: [PATCH] [WIP] Fix HagerZhang bugs (#136)

* fixes bugs and paper discrepancies in HagerZhang linesearch

* Initial HagerZhang fixes

* fix tests
---
 src/hagerzhang.jl   |  53 +++++++++----------
 src/initialguess.jl | 120 +++++++++++++++++++++++---------------------
 test/initial.jl     |   2 +-
 3 files changed, 92 insertions(+), 83 deletions(-)

diff --git a/src/hagerzhang.jl b/src/hagerzhang.jl
index 48cd58d..9cf87cd 100644
--- a/src/hagerzhang.jl
+++ b/src/hagerzhang.jl
@@ -116,8 +116,10 @@ function (ls::HagerZhang)(ϕ, ϕdϕ,
     if !(isfinite(phi_0) && isfinite(dphi_0))
         throw(ArgumentError("Value and slope at step length = 0 must be finite."))
     end
-    if dphi_0 >= zeroT
+    if dphi_0 >= eps(T) * abs(phi_0)
         throw(ArgumentError("Search direction is not a direction of descent."))
+    elseif dphi_0 >= 0
+        return zeroT, phi_0
     end
 
     # Prevent values of x_new = x+αs that are likely to make
@@ -132,7 +134,8 @@ function (ls::HagerZhang)(ϕ, ϕdϕ,
 
 
     phi_lim = phi_0 + epsilon * abs(phi_0)
-    @assert c > zeroT
+    @assert c >= 0
+    c <= eps(T) && return zeroT, phi_0
     @assert isfinite(c) && c <= alphamax
     phi_c, dphi_c = ϕdϕ(c)
     iterfinite = 1
@@ -145,7 +148,7 @@ function (ls::HagerZhang)(ϕ, ϕdϕ,
     if !(isfinite(phi_c) && isfinite(dphi_c))
         @warn("Failed to achieve finite new evaluation point, using alpha=0")
         mayterminate[] = false # reset in case another initial guess is used next
-        return zeroT, ϕ(zeroT) # phi_0
+        return zeroT, phi_0
     end
     push!(alphas, c)
     push!(values, phi_c)
@@ -191,7 +194,7 @@ function (ls::HagerZhang)(ϕ, ϕdϕ,
             # The value is higher, but the slope is downward, so we must
             # have crested over the peak. Use bisection.
             ib = length(alphas)
-            ia = ib - 1
+            ia = 1
             if c ≉  alphas[ib] || slopes[ib] >= zeroT
                 error("c = ", c)
             end
@@ -199,24 +202,32 @@ function (ls::HagerZhang)(ϕ, ϕdϕ,
             ia, ib = bisect!(ϕdϕ, alphas, values, slopes, ia, ib, phi_lim, display)
             isbracketed = true
         else
-            # We'll still going downhill, expand the interval and try again
+            # We'll still going downhill, expand the interval and try again.
+            # Reaching this branch means that dphi_c < 0 and phi_c <= phi_0 + ϵ_k
+            # So cold = c has a lower objective than phi_0 up to epsilon. 
+            # This makes it a viable step to return if bracketing fails.
+
+            # Bracketing can fail if no cold < c <= alphamax can be found with finite phi_c and dphi_c. 
+            # Going back to the loop with c = cold will only result in infinite cycling.
+            # So returning (cold, phi_cold) and exiting the line search is the best move.
             cold = c
+            phi_cold = phi_c
+            if nextfloat(cold) >= alphamax
+                mayterminate[] = false # reset in case another initial guess is used next
+                return cold, phi_cold
+            end
             c *= rho
             if c > alphamax
-                c = (alphamax + cold)/2
+                c = alphamax
                 if display & BRACKET > 0
-                    println("bracket: exceeding alphamax, bisecting: alphamax = ", alphamax,
-                    ", cold = ", cold, ", new c = ", c)
-                end
-                if c == cold || nextfloat(c) >= alphamax
-                    mayterminate[] = false # reset in case another initial guess is used next
-                    return cold, phi_c
+                    println("bracket: exceeding alphamax, using c = alphamax = ", alphamax,
+                    ", cold = ", cold)
                 end
             end
             phi_c, dphi_c = ϕdϕ(c)
             iterfinite = 1
             while !(isfinite(phi_c) && isfinite(dphi_c)) && c > nextfloat(cold) && iterfinite < iterfinitemax
-                alphamax = c
+                alphamax = c # shrinks alphamax, assumes that steps >= c can never have finite phi_c and dphi_c
                 iterfinite += 1
                 if display & BRACKET > 0
                     println("bracket: non-finite value, bisection")
@@ -225,24 +236,14 @@ function (ls::HagerZhang)(ϕ, ϕdϕ,
                 phi_c, dphi_c = ϕdϕ(c)
             end
             if !(isfinite(phi_c) && isfinite(dphi_c))
-                mayterminate[] = false # reset in case another initial guess is used next
-                return cold, ϕ(cold)
-            elseif dphi_c < zeroT && c == alphamax
-                # We're on the edge of the allowed region, and the
-                # value is still decreasing. This can be due to
-                # roundoff error in barrier penalties, a barrier
-                # coefficient being so small that being eps() away
-                # from it still doesn't turn the slope upward, or
-                # mistakes in the user's function.
-                if iterfinite >= iterfinitemax
+                if display & BRACKET > 0
                     println("Warning: failed to expand interval to bracket with finite values. If this happens frequently, check your function and gradient.")
                     println("c = ", c,
                             ", alphamax = ", alphamax,
                             ", phi_c = ", phi_c,
                             ", dphi_c = ", dphi_c)
                 end
-                mayterminate[] = false # reset in case another initial guess is used next
-                return c, phi_c
+                return cold, phi_cold
             end
             push!(alphas, c)
             push!(values, phi_c)
@@ -394,7 +395,7 @@ function secant2!(ϕdϕ,
         # we updated a, do it for b too
         c = secant(alphas, values, slopes, ia, iA)
     end
-    if a <= c <= b
+    if (iA == ic || iB == ic) && a <= c <= b
         if display & SECANT2 > 0
             println("secant2: second c = ", c)
         end
diff --git a/src/initialguess.jl b/src/initialguess.jl
index b4a1b55..74b334b 100644
--- a/src/initialguess.jl
+++ b/src/initialguess.jl
@@ -164,13 +164,14 @@ If α0 is NaN, then procedure I0 is called at the first iteration,
 otherwise, we select according to procedure I1-2, with starting value α0.
 """
 @with_kw struct InitialHagerZhang{T}
-    ψ0::T         = 0.01
-    ψ1::T         = 0.2
-    ψ2::T         = 2.0
-    ψ3::T         = 0.1
-    αmax::T       = Inf
-    α0::T         = 1.0 # Initial alpha guess. NaN => algorithm calculates
-    verbose::Bool = false
+    ψ0::T          = 0.01
+    ψ1::T          = 0.2
+    ψ2::T          = 2.0
+    ψ3::T          = 0.1
+    αmax::T        = Inf
+    α0::T          = 1.0 # Initial alpha guess. NaN => algorithm calculates
+    quadstep::Bool = true
+    verbose::Bool  = false
 end
 
 function (is::InitialHagerZhang)(ls::Tls, state, phi_0, dphi_0, df) where Tls
@@ -181,6 +182,7 @@ function (is::InitialHagerZhang)(ls::Tls, state, phi_0, dphi_0, df) where Tls
         # pick the initial step size according to HZ #I0
         state.alpha = _hzI0(state.x, NLSolversBase.gradient(df),
                             NLSolversBase.value(df),
+                            is.αmax,
                             convert(eltype(state.x), is.ψ0)) # Hack to deal with type instability between is{T} and state.x
         if Tls <: HagerZhang
             ls.mayterminate[] = false
@@ -194,7 +196,7 @@ function (is::InitialHagerZhang)(ls::Tls, state, phi_0, dphi_0, df) where Tls
         end
         T = eltype(state.alpha)
         state.alpha = _hzI12(state.alpha, df, state.x, state.s, state.x_ls, phi_0, dphi_0,
-                   is.ψ1, is.ψ2, is.ψ3, convert(T, is.αmax), is.verbose, mayterminate)
+                   is.ψ1, is.ψ2, is.ψ3, T(is.αmax), is.verbose, is.quadstep, mayterminate)
     end
     return state.alpha
 end
@@ -212,6 +214,7 @@ function _hzI12(alpha::T,
                 psi3::Real,
                 alphamax::T,
                 verbose::Bool,
+                quadstep::Bool,
                 mayterminate) where {Tx,T}
     ϕ = make_ϕ(df, x_new, x, s)
 
@@ -221,65 +224,55 @@ function _hzI12(alpha::T,
 
     alphatest = psi1 * alpha
     alphatest = min(alphatest, alphamax)
-
+    alphatest == 0 && return alphatest
     phitest = ϕ(alphatest)
+    alphatest, phitest = get_finite(alphatest, phitest, ϕ, psi3, iterfinitemax, phi_0, mayterminate)
+    alphatest == 0 && return alphatest
 
-    iterfinite = 1
-    while !isfinite(phitest)
-        if iterfinite >= iterfinitemax
-            mayterminate[] = true
-            return convert(T, 0)
-            # TODO: Throw error / LineSearchException instead?
-            # error("Failed to achieve finite test value; alphatest = ", alphatest)
-        end
-
-        alphatest = psi3 * alphatest
-        phitest = ϕ(alphatest)
-        iterfinite += 1
-    end
-    a = ((phitest-phi_0)/alphatest - dphi_0)/alphatest  # quadratic fit
-
-    if verbose == true
-        println("quadfit: alphatest = ", alphatest,
-                ", phi_0 = ", phi_0,
-                ", dphi_0 = ", dphi_0,
-                ", phitest = ", phitest,
-                ", quadcoef = ", a)
-    end
-    mayterminate[] = false
-    if isfinite(a) && a > 0 && phitest <= phi_0
-        alpha = -dphi_0 / 2 / a # if convex, choose minimum of quadratic
-        if alpha == 0
-            error("alpha is zero. dphi_0 = ", dphi_0, ", phi_0 = ", phi_0, ", phitest = ", phitest, ", alphatest = ", alphatest, ", a = ", a)
-        end
-        if alpha <= alphamax
-            mayterminate[] = true
-        else
-            alpha = alphamax
-            mayterminate[] = false
-        end
+    mayterminate[] = quadstep_success = false
+    if quadstep
+        a = ((phitest - phi_0) / alphatest - dphi_0) / alphatest  # quadratic fit
         if verbose == true
-            println("alpha guess (quadratic): ", alpha,
-                    ",(mayterminate = ", mayterminate, ")")
+            println("quadfit: alphatest = ", alphatest,
+                    ", phi_0 = ", phi_0,
+                    ", dphi_0 = ", dphi_0,
+                    ", phitest = ", phitest,
+                    ", quadcoef = ", a)
         end
-    else
-        if phitest > phi_0
-            alpha = alphatest
-        else
-            alpha *= psi2 # if not convex, expand the interval
+        if isfinite(a) && a > 0 && phitest <= phi_0
+            alphatest2 = -dphi_0 / 2 / a # if convex, choose minimum of quadratic
+            alphatest2 = min(alphatest2, alphamax)
+            phitest2 = ϕ(alphatest2)
+            if isfinite(phitest2)
+                mayterminate[] = quadstep_success = true
+                alphatest = alphatest2
+                phitest = phitest2
+                if verbose == true
+                    println("alpha guess (quadratic): ", alphatest,
+                            ",(mayterminate = ", mayterminate[], ")")
+                end
+            end
         end
     end
-    alpha = min(alphamax, alpha)
-    if verbose == true
-        println("alpha guess (expand): ", alpha)
+    if (!quadstep || !quadstep_success) && phitest <= phi_0
+        # If no quadstep or it fails, expand the interval.
+        # While the phitest <= phi_0 condition was not in the paper, it gives a significant boost to the speed. The rationale behind it is that since the slope at alpha = 0 is negative, if phitest > phi_0 then a local minimum must be between alpha = 0 and alpha = alphatest, so alpha_test is good enough to return.
+        alphatest = psi2 * alpha 
+        alphatest = min(alphatest, alphamax)
+        phitest = ϕ(alphatest)
+        alphatest, phitest = get_finite(alphatest, phitest, ϕ, psi3, iterfinitemax, phi_0, mayterminate)
+        if verbose == true
+            println("alpha guess (expand): ", alphatest)
+        end
     end
-    return alpha
+    return alphatest
 end
 
 # Generate initial guess for step size (HZ, stage I0)
 function _hzI0(x::AbstractArray{Tx},
                gr::AbstractArray{Tx},
                f_x::T,
+               alphamax::T,
                psi0::T = convert(T, 1)/100) where {Tx,T}
     zeroT = convert(T, 0)
     alpha = convert(T, 1)
@@ -289,8 +282,23 @@ function _hzI0(x::AbstractArray{Tx},
         if x_max != zeroT
             alpha = psi0 * x_max / gr_max
         elseif f_x != zeroT
-            alpha = psi0 * abs(f_x) / norm(gr)
+            alpha = psi0 * abs(f_x) / norm(gr)^2
         end
     end
-    return alpha
+    return min(alpha, alphamax)
+end
+
+function get_finite(alpha::T, phi, ϕ, factor, itermax, phi_0, mayterminate) where {T}
+    iter = 1
+    while !isfinite(phi)
+        if iter >= itermax
+            mayterminate[] = true
+            return T(0), phi_0
+        end
+
+        alpha = factor * alpha
+        phi = ϕ(alpha)
+        iter += 1
+    end
+    return alpha, phi
 end
diff --git a/test/initial.jl b/test/initial.jl
index fa2bcb3..15de7c3 100644
--- a/test/initial.jl
+++ b/test/initial.jl
@@ -90,7 +90,7 @@
     state.f_x_previous = 2*phi_0
     is = InitialQuadratic(snap2one=(0.9,Inf))
     is(ls, state, phi_0, dphi_0, df)
-    @test state.alpha == 0.8200000000000001
+    @test state.alpha == 1.0
     @test ls.mayterminate[] == false
 
     # Test Quadratic snap2one