From fae0768f4a79d1fec637a53104c24d4761312abe Mon Sep 17 00:00:00 2001 From: Mohamed Tarek Date: Tue, 6 Nov 2018 20:40:51 +1100 Subject: [PATCH] [WIP] Fix HagerZhang bugs (#136) * fixes bugs and paper discrepancies in HagerZhang linesearch * Initial HagerZhang fixes * fix tests --- src/hagerzhang.jl | 53 +++++++++---------- src/initialguess.jl | 120 +++++++++++++++++++++++--------------------- test/initial.jl | 2 +- 3 files changed, 92 insertions(+), 83 deletions(-) diff --git a/src/hagerzhang.jl b/src/hagerzhang.jl index 48cd58d..9cf87cd 100644 --- a/src/hagerzhang.jl +++ b/src/hagerzhang.jl @@ -116,8 +116,10 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, if !(isfinite(phi_0) && isfinite(dphi_0)) throw(ArgumentError("Value and slope at step length = 0 must be finite.")) end - if dphi_0 >= zeroT + if dphi_0 >= eps(T) * abs(phi_0) throw(ArgumentError("Search direction is not a direction of descent.")) + elseif dphi_0 >= 0 + return zeroT, phi_0 end # Prevent values of x_new = x+αs that are likely to make @@ -132,7 +134,8 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, phi_lim = phi_0 + epsilon * abs(phi_0) - @assert c > zeroT + @assert c >= 0 + c <= eps(T) && return zeroT, phi_0 @assert isfinite(c) && c <= alphamax phi_c, dphi_c = ϕdϕ(c) iterfinite = 1 @@ -145,7 +148,7 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, if !(isfinite(phi_c) && isfinite(dphi_c)) @warn("Failed to achieve finite new evaluation point, using alpha=0") mayterminate[] = false # reset in case another initial guess is used next - return zeroT, ϕ(zeroT) # phi_0 + return zeroT, phi_0 end push!(alphas, c) push!(values, phi_c) @@ -191,7 +194,7 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, # The value is higher, but the slope is downward, so we must # have crested over the peak. Use bisection. ib = length(alphas) - ia = ib - 1 + ia = 1 if c ≉ alphas[ib] || slopes[ib] >= zeroT error("c = ", c) end @@ -199,24 +202,32 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, ia, ib = bisect!(ϕdϕ, alphas, values, slopes, ia, ib, phi_lim, display) isbracketed = true else - # We'll still going downhill, expand the interval and try again + # We'll still going downhill, expand the interval and try again. + # Reaching this branch means that dphi_c < 0 and phi_c <= phi_0 + ϵ_k + # So cold = c has a lower objective than phi_0 up to epsilon. + # This makes it a viable step to return if bracketing fails. + + # Bracketing can fail if no cold < c <= alphamax can be found with finite phi_c and dphi_c. + # Going back to the loop with c = cold will only result in infinite cycling. + # So returning (cold, phi_cold) and exiting the line search is the best move. cold = c + phi_cold = phi_c + if nextfloat(cold) >= alphamax + mayterminate[] = false # reset in case another initial guess is used next + return cold, phi_cold + end c *= rho if c > alphamax - c = (alphamax + cold)/2 + c = alphamax if display & BRACKET > 0 - println("bracket: exceeding alphamax, bisecting: alphamax = ", alphamax, - ", cold = ", cold, ", new c = ", c) - end - if c == cold || nextfloat(c) >= alphamax - mayterminate[] = false # reset in case another initial guess is used next - return cold, phi_c + println("bracket: exceeding alphamax, using c = alphamax = ", alphamax, + ", cold = ", cold) end end phi_c, dphi_c = ϕdϕ(c) iterfinite = 1 while !(isfinite(phi_c) && isfinite(dphi_c)) && c > nextfloat(cold) && iterfinite < iterfinitemax - alphamax = c + alphamax = c # shrinks alphamax, assumes that steps >= c can never have finite phi_c and dphi_c iterfinite += 1 if display & BRACKET > 0 println("bracket: non-finite value, bisection") @@ -225,24 +236,14 @@ function (ls::HagerZhang)(ϕ, ϕdϕ, phi_c, dphi_c = ϕdϕ(c) end if !(isfinite(phi_c) && isfinite(dphi_c)) - mayterminate[] = false # reset in case another initial guess is used next - return cold, ϕ(cold) - elseif dphi_c < zeroT && c == alphamax - # We're on the edge of the allowed region, and the - # value is still decreasing. This can be due to - # roundoff error in barrier penalties, a barrier - # coefficient being so small that being eps() away - # from it still doesn't turn the slope upward, or - # mistakes in the user's function. - if iterfinite >= iterfinitemax + if display & BRACKET > 0 println("Warning: failed to expand interval to bracket with finite values. If this happens frequently, check your function and gradient.") println("c = ", c, ", alphamax = ", alphamax, ", phi_c = ", phi_c, ", dphi_c = ", dphi_c) end - mayterminate[] = false # reset in case another initial guess is used next - return c, phi_c + return cold, phi_cold end push!(alphas, c) push!(values, phi_c) @@ -394,7 +395,7 @@ function secant2!(ϕdϕ, # we updated a, do it for b too c = secant(alphas, values, slopes, ia, iA) end - if a <= c <= b + if (iA == ic || iB == ic) && a <= c <= b if display & SECANT2 > 0 println("secant2: second c = ", c) end diff --git a/src/initialguess.jl b/src/initialguess.jl index b4a1b55..74b334b 100644 --- a/src/initialguess.jl +++ b/src/initialguess.jl @@ -164,13 +164,14 @@ If α0 is NaN, then procedure I0 is called at the first iteration, otherwise, we select according to procedure I1-2, with starting value α0. """ @with_kw struct InitialHagerZhang{T} - ψ0::T = 0.01 - ψ1::T = 0.2 - ψ2::T = 2.0 - ψ3::T = 0.1 - αmax::T = Inf - α0::T = 1.0 # Initial alpha guess. NaN => algorithm calculates - verbose::Bool = false + ψ0::T = 0.01 + ψ1::T = 0.2 + ψ2::T = 2.0 + ψ3::T = 0.1 + αmax::T = Inf + α0::T = 1.0 # Initial alpha guess. NaN => algorithm calculates + quadstep::Bool = true + verbose::Bool = false end function (is::InitialHagerZhang)(ls::Tls, state, phi_0, dphi_0, df) where Tls @@ -181,6 +182,7 @@ function (is::InitialHagerZhang)(ls::Tls, state, phi_0, dphi_0, df) where Tls # pick the initial step size according to HZ #I0 state.alpha = _hzI0(state.x, NLSolversBase.gradient(df), NLSolversBase.value(df), + is.αmax, convert(eltype(state.x), is.ψ0)) # Hack to deal with type instability between is{T} and state.x if Tls <: HagerZhang ls.mayterminate[] = false @@ -194,7 +196,7 @@ function (is::InitialHagerZhang)(ls::Tls, state, phi_0, dphi_0, df) where Tls end T = eltype(state.alpha) state.alpha = _hzI12(state.alpha, df, state.x, state.s, state.x_ls, phi_0, dphi_0, - is.ψ1, is.ψ2, is.ψ3, convert(T, is.αmax), is.verbose, mayterminate) + is.ψ1, is.ψ2, is.ψ3, T(is.αmax), is.verbose, is.quadstep, mayterminate) end return state.alpha end @@ -212,6 +214,7 @@ function _hzI12(alpha::T, psi3::Real, alphamax::T, verbose::Bool, + quadstep::Bool, mayterminate) where {Tx,T} ϕ = make_ϕ(df, x_new, x, s) @@ -221,65 +224,55 @@ function _hzI12(alpha::T, alphatest = psi1 * alpha alphatest = min(alphatest, alphamax) - + alphatest == 0 && return alphatest phitest = ϕ(alphatest) + alphatest, phitest = get_finite(alphatest, phitest, ϕ, psi3, iterfinitemax, phi_0, mayterminate) + alphatest == 0 && return alphatest - iterfinite = 1 - while !isfinite(phitest) - if iterfinite >= iterfinitemax - mayterminate[] = true - return convert(T, 0) - # TODO: Throw error / LineSearchException instead? - # error("Failed to achieve finite test value; alphatest = ", alphatest) - end - - alphatest = psi3 * alphatest - phitest = ϕ(alphatest) - iterfinite += 1 - end - a = ((phitest-phi_0)/alphatest - dphi_0)/alphatest # quadratic fit - - if verbose == true - println("quadfit: alphatest = ", alphatest, - ", phi_0 = ", phi_0, - ", dphi_0 = ", dphi_0, - ", phitest = ", phitest, - ", quadcoef = ", a) - end - mayterminate[] = false - if isfinite(a) && a > 0 && phitest <= phi_0 - alpha = -dphi_0 / 2 / a # if convex, choose minimum of quadratic - if alpha == 0 - error("alpha is zero. dphi_0 = ", dphi_0, ", phi_0 = ", phi_0, ", phitest = ", phitest, ", alphatest = ", alphatest, ", a = ", a) - end - if alpha <= alphamax - mayterminate[] = true - else - alpha = alphamax - mayterminate[] = false - end + mayterminate[] = quadstep_success = false + if quadstep + a = ((phitest - phi_0) / alphatest - dphi_0) / alphatest # quadratic fit if verbose == true - println("alpha guess (quadratic): ", alpha, - ",(mayterminate = ", mayterminate, ")") + println("quadfit: alphatest = ", alphatest, + ", phi_0 = ", phi_0, + ", dphi_0 = ", dphi_0, + ", phitest = ", phitest, + ", quadcoef = ", a) end - else - if phitest > phi_0 - alpha = alphatest - else - alpha *= psi2 # if not convex, expand the interval + if isfinite(a) && a > 0 && phitest <= phi_0 + alphatest2 = -dphi_0 / 2 / a # if convex, choose minimum of quadratic + alphatest2 = min(alphatest2, alphamax) + phitest2 = ϕ(alphatest2) + if isfinite(phitest2) + mayterminate[] = quadstep_success = true + alphatest = alphatest2 + phitest = phitest2 + if verbose == true + println("alpha guess (quadratic): ", alphatest, + ",(mayterminate = ", mayterminate[], ")") + end + end end end - alpha = min(alphamax, alpha) - if verbose == true - println("alpha guess (expand): ", alpha) + if (!quadstep || !quadstep_success) && phitest <= phi_0 + # If no quadstep or it fails, expand the interval. + # While the phitest <= phi_0 condition was not in the paper, it gives a significant boost to the speed. The rationale behind it is that since the slope at alpha = 0 is negative, if phitest > phi_0 then a local minimum must be between alpha = 0 and alpha = alphatest, so alpha_test is good enough to return. + alphatest = psi2 * alpha + alphatest = min(alphatest, alphamax) + phitest = ϕ(alphatest) + alphatest, phitest = get_finite(alphatest, phitest, ϕ, psi3, iterfinitemax, phi_0, mayterminate) + if verbose == true + println("alpha guess (expand): ", alphatest) + end end - return alpha + return alphatest end # Generate initial guess for step size (HZ, stage I0) function _hzI0(x::AbstractArray{Tx}, gr::AbstractArray{Tx}, f_x::T, + alphamax::T, psi0::T = convert(T, 1)/100) where {Tx,T} zeroT = convert(T, 0) alpha = convert(T, 1) @@ -289,8 +282,23 @@ function _hzI0(x::AbstractArray{Tx}, if x_max != zeroT alpha = psi0 * x_max / gr_max elseif f_x != zeroT - alpha = psi0 * abs(f_x) / norm(gr) + alpha = psi0 * abs(f_x) / norm(gr)^2 end end - return alpha + return min(alpha, alphamax) +end + +function get_finite(alpha::T, phi, ϕ, factor, itermax, phi_0, mayterminate) where {T} + iter = 1 + while !isfinite(phi) + if iter >= itermax + mayterminate[] = true + return T(0), phi_0 + end + + alpha = factor * alpha + phi = ϕ(alpha) + iter += 1 + end + return alpha, phi end diff --git a/test/initial.jl b/test/initial.jl index fa2bcb3..15de7c3 100644 --- a/test/initial.jl +++ b/test/initial.jl @@ -90,7 +90,7 @@ state.f_x_previous = 2*phi_0 is = InitialQuadratic(snap2one=(0.9,Inf)) is(ls, state, phi_0, dphi_0, df) - @test state.alpha == 0.8200000000000001 + @test state.alpha == 1.0 @test ls.mayterminate[] == false # Test Quadratic snap2one