From df16dd95e5ae01aea7bf7c0c624936b7b46e2953 Mon Sep 17 00:00:00 2001
From: Felix Zimmermann
Date: Mon, 13 Jan 2025 16:00:54 +0100
Subject: [PATCH] update

---
 src/mrpro/algorithms/optimizers/adam.py  |  1 -
 src/mrpro/algorithms/optimizers/cg.py    | 40 +++++++++++++-----------
 src/mrpro/algorithms/optimizers/lbfgs.py | 12 +++----
 3 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/src/mrpro/algorithms/optimizers/adam.py b/src/mrpro/algorithms/optimizers/adam.py
index 8e6da66a9..ec6a42f7f 100644
--- a/src/mrpro/algorithms/optimizers/adam.py
+++ b/src/mrpro/algorithms/optimizers/adam.py
@@ -50,7 +50,6 @@ def adam(
     3. Compute biased corrected estimates of the moments :math:`\hat{m}_t` and :math:`\hat{v}_t`.
     4. Update parameters using the adaptive step size.
 
-
     The AdamW [LOS2019]_ variant improves generalization by decoupling weight decay from gradient-based updates.
     This function wraps PyTorch's :class:`torch.optim.Adam` and :class:`torch.optim.AdamW` implementations,
     supporting both standard Adam and decoupled weight decay regularization (AdamW) [LOS2019]_
diff --git a/src/mrpro/algorithms/optimizers/cg.py b/src/mrpro/algorithms/optimizers/cg.py
index 2d458bfa0..367e72c96 100644
--- a/src/mrpro/algorithms/optimizers/cg.py
+++ b/src/mrpro/algorithms/optimizers/cg.py
@@ -25,42 +25,44 @@ def cg(
 ) -> torch.Tensor:
     r"""CG for solving a linear system :math:`Hx=b`.
 
-    Thereby, :math:`H` is a linear self-adjoint operator, :math:`b` is the right-hand-side
-    of the system and :math:`x` is the sought solution.
+    This algorithm solves systems of the form :math:`H x = b`, where :math:`H` is a self-adjoint linear operator
+    and :math:`b` is the right-hand side. The method can solve a batch of :math:`N` systems jointly, in which case
+    :math:`H` is block-diagonal with blocks :math:`H_i` and :math:`b = [b_1, ..., b_N]^T`.
 
-    Note that this implementation allows for simultaneously solving a batch of :math:`N` problems
-    of the form :math:`H_i x_i = b_i` with :math:`i=1,...,N`.
+    The method performs the following steps:
 
-    Thereby, the underlying assumption is that the considered problem is :math:`Hx=b` with
-    :math:`H:= diag(H_1, ..., H_N)` and :math:`b:= [b_1, ..., b_N]^T`.
+    1. Initialize the residual :math:`r_0 = b - Hx_0` (with :math:`x_0` as the initial guess).
+    2. Set the search direction :math:`p_0 = r_0`.
+    3. For each iteration :math:`k = 0, 1, 2, ...`:
+        - Compute :math:`\alpha_k = \frac{r_k^T r_k}{p_k^T H p_k}`.
+        - Update the solution: :math:`x_{k+1} = x_k + \alpha_k p_k`.
+        - Compute the new residual: :math:`r_{k+1} = r_k - \alpha_k H p_k`.
+        - Update the search direction: :math:`\beta_k = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_k}`,
+          then :math:`p_{k+1} = r_{k+1} + \beta_k p_k`.
 
-    Thus, if all :math:`H_i` are self-adjoint, so is :math:`H` and the CG can be applied.
-    Note however, that the accuracy of the obtained solutions might vary among the different
-    problems.
-    Note also that we don't test if the input operator is self-adjoint or not.
+    This implementation assumes that :math:`H` is self-adjoint and does not verify this condition.
 
-    Further, note that if the condition of :math:`H` is very large, a small residual does not necessarily
-    imply that the solution is accurate.
 
     Parameters
     ----------
     operator
-        self-adjoint operator (named H above)
+        self-adjoint operator (named :math:`H` above)
     right_hand_side
-        right-hand-side of the system (named b above)
+        right-hand-side of the system (named :math:`b` above)
     initial_value
-        initial value of the algorithm; if None, it will be set to right_hand_side
+        initial value of the algorithm; if `None`, it will be set to `right_hand_side`
     max_iterations
-        maximal number of iterations
+        maximal number of iterations. Can be used for early stopping.
     tolerance
         tolerance for the residual; if set to zero, the maximal number of iterations
-        is the only stopping criterion used to stop the cg
+        is the only stopping criterion used to stop the cg.
+        If the condition number of :math:`H` is large, a small residual may not imply a highly accurate solution.
     callback
-        function to be called at each iteration
+        function to be called at each iteration. This can be used to monitor the progress of the algorithm.
 
     Returns
     -------
-        an approximate solution of the linear system Hx=b
+        an approximate solution of the linear system :math:`Hx=b`
     """
     if initial_value is not None and (initial_value.shape != right_hand_side.shape):
         raise ValueError(
diff --git a/src/mrpro/algorithms/optimizers/lbfgs.py b/src/mrpro/algorithms/optimizers/lbfgs.py
index 7a74c3b2f..239f4b8b3 100644
--- a/src/mrpro/algorithms/optimizers/lbfgs.py
+++ b/src/mrpro/algorithms/optimizers/lbfgs.py
@@ -34,7 +34,7 @@ def lbfgs(
 
     .. math::
 
-        \theta_{k+1} = \theta_k - \alpha_k H_k \nabla f(\theta_k),
+        \theta_{k+1} = \theta_k - \alpha_k H_k \nabla f(\theta_k),
 
     where :math:`H_k` is a limited-memory approximation of the inverse Hessian, and :math:`\alpha_k` is the step size
     determined via line search (e.g., strong Wolfe conditions).
@@ -48,16 +48,16 @@ def lbfgs(
     5. Store the latest gradient and update information.
 
     This implementation wraps PyTorch's `torch.optim.LBFGS` class.
-    For more information, see [WIKI_LBFGS]_., [NOC1980]_, and [LIU1989]_.
+    For more information, see [WIKI]_, [NOC1980]_, and [LIU1989]_.
 
     References
     ----------
     .. [NOC1980] Nocedal, J. (1980). "Updating quasi-Newton matrices with limited storage."
-       *Mathematics of Computation*, 35(151), 773-782. DOI:10.1090/S0025-5718-1980-0572855-7
-       https://doi.org/10.1090/S0025-5718-1980-0572855-7
+       *Mathematics of Computation*, 35(151), 773-782.
+       `10.1090/S0025-5718-1980-0572855-7 <https://doi.org/10.1090/S0025-5718-1980-0572855-7>`_
     .. [LIU1989] Liu, D. C., & Nocedal, J. (1989). "On the limited memory BFGS method for large scale optimization."
-       *Mathematical Programming*, 45(1-3), 503-528. DOI:10.1007/BF01589116 https://doi.org/10.1007/BF01589116
-    .. [WIKI_LBFGS] Wikipedia: `LBFGS <https://en.wikipedia.org/wiki/Limited-memory_BFGS>`_
+       *Mathematical Programming*, 45(1-3), 503-528. `10.1007/BF01589116 <https://doi.org/10.1007/BF01589116>`_
+    .. [WIKI] Wikipedia: `Limited-memory_BFGS <https://en.wikipedia.org/wiki/Limited-memory_BFGS>`_
 
     Parameters
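
The adam.py hunk above refers to Adam's bias-corrected moment estimates and AdamW's decoupled weight decay, both provided by the PyTorch classes the function wraps. A minimal usage sketch of `torch.optim.AdamW` on a toy quadratic objective follows; the objective, learning rate, and weight decay are illustrative assumptions, not values taken from the patch::

    import torch

    # Toy parameters and target (illustrative assumptions).
    params = torch.zeros(3, requires_grad=True)
    target = torch.tensor([1.0, -2.0, 0.5])

    # AdamW applies weight decay directly to the parameters, decoupled from the gradient update.
    optimizer = torch.optim.AdamW([params], lr=1e-2, weight_decay=1e-4)

    for _ in range(500):
        optimizer.zero_grad()
        loss = ((params - target) ** 2).sum()
        loss.backward()
        optimizer.step()  # adaptive update from bias-corrected first/second moment estimates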
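
The conjugate-gradient steps listed in the new cg.py docstring can be written out directly. The sketch below is a plain PyTorch transcription of those steps for a single real-valued system, assuming a callable `apply_operator` that applies the self-adjoint :math:`H`; it is illustrative only and is not the mrpro implementation (no batching, no callback)::

    import torch

    def cg_sketch(apply_operator, right_hand_side, initial_value=None, max_iterations=128, tolerance=1e-6):
        """Plain CG for H x = b, following the docstring's steps 1-3 (illustrative, real-valued case)."""
        x = right_hand_side.clone() if initial_value is None else initial_value.clone()
        residual = right_hand_side - apply_operator(x)        # r_0 = b - H x_0
        direction = residual.clone()                          # p_0 = r_0
        residual_norm_sq = torch.dot(residual, residual)
        for _ in range(max_iterations):
            if residual_norm_sq.sqrt() < tolerance:
                break
            h_direction = apply_operator(direction)
            alpha = residual_norm_sq / torch.dot(direction, h_direction)  # alpha_k
            x = x + alpha * direction                         # x_{k+1} = x_k + alpha_k p_k
            residual = residual - alpha * h_direction         # r_{k+1} = r_k - alpha_k H p_k
            new_residual_norm_sq = torch.dot(residual, residual)
            beta = new_residual_norm_sq / residual_norm_sq    # beta_k
            direction = residual + beta * direction           # p_{k+1} = r_{k+1} + beta_k p_k
            residual_norm_sq = new_residual_norm_sq
        return x

    # Small symmetric positive-definite example: H = [[4, 1], [1, 3]], b = [1, 2].
    h = torch.tensor([[4.0, 1.0], [1.0, 3.0]])
    b = torch.tensor([1.0, 2.0])
    x = cg_sketch(lambda v: h @ v, b)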
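
The lbfgs.py docstring notes that the function wraps PyTorch's `torch.optim.LBFGS`, which evaluates the objective through a closure and can use a strong-Wolfe line search. A minimal sketch of that underlying usage pattern follows; the Rosenbrock objective, iteration count, and hyperparameters are illustrative assumptions::

    import torch

    params = torch.tensor([-1.0, 2.0], requires_grad=True)
    optimizer = torch.optim.LBFGS([params], lr=1.0, max_iter=20, history_size=10, line_search_fn='strong_wolfe')

    def closure():
        # LBFGS re-evaluates the objective and its gradient several times per step.
        optimizer.zero_grad()
        loss = (1 - params[0]) ** 2 + 100 * (params[1] - params[0] ** 2) ** 2
        loss.backward()
        return loss

    for _ in range(10):
        optimizer.step(closure)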