From df16dd95e5ae01aea7bf7c0c624936b7b46e2953 Mon Sep 17 00:00:00 2001
From: Felix Zimmermann
Date: Mon, 13 Jan 2025 16:00:54 +0100
Subject: [PATCH] update

---
 src/mrpro/algorithms/optimizers/adam.py  |  1 -
 src/mrpro/algorithms/optimizers/cg.py    | 40 +++++++++++++-----------
 src/mrpro/algorithms/optimizers/lbfgs.py | 12 +++----
 3 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/src/mrpro/algorithms/optimizers/adam.py b/src/mrpro/algorithms/optimizers/adam.py
index 8e6da66a9..ec6a42f7f 100644
--- a/src/mrpro/algorithms/optimizers/adam.py
+++ b/src/mrpro/algorithms/optimizers/adam.py
@@ -50,7 +50,6 @@ def adam(
     3. Compute biased corrected estimates of the moments :math:`\hat{m}_t` and :math:`\hat{v}_t`.
     4. Update parameters using the adaptive step size.
 
-
     The AdamW [LOS2019]_ variant improves generalization by decoupling weight decay from gradient-based updates.
     This function wraps PyTorch's :class:`torch.optim.Adam` and :class:`torch.optim.AdamW` implementations,
     supporting both standard Adam and decoupled weight decay regularization (AdamW) [LOS2019]_
diff --git a/src/mrpro/algorithms/optimizers/cg.py b/src/mrpro/algorithms/optimizers/cg.py
index 2d458bfa0..367e72c96 100644
--- a/src/mrpro/algorithms/optimizers/cg.py
+++ b/src/mrpro/algorithms/optimizers/cg.py
@@ -25,42 +25,44 @@ def cg(
 ) -> torch.Tensor:
     r"""CG for solving a linear system :math:`Hx=b`.
 
-    Thereby, :math:`H` is a linear self-adjoint operator, :math:`b` is the right-hand-side
-    of the system and :math:`x` is the sought solution.
+    This algorithm solves systems of the form :math:`H x = b`, where :math:`H` is a self-adjoint linear operator
+    and :math:`b` is the right-hand side. The method can solve a batch of :math:`N` systems jointly, in which case
+    :math:`H` is block-diagonal with blocks :math:`H_i` and :math:`b = [b_1, ..., b_N]^T`.
 
-    Note that this implementation allows for simultaneously solving a batch of :math:`N` problems
-    of the form :math:`H_i x_i = b_i` with :math:`i=1,...,N`.
+    The method performs the following steps:
 
-    Thereby, the underlying assumption is that the considered problem is :math:`Hx=b` with
-    :math:`H:= diag(H_1, ..., H_N)` and :math:`b:= [b_1, ..., b_N]^T`.
+    1. Initialize the residual :math:`r_0 = b - Hx_0` (with :math:`x_0` as the initial guess).
+    2. Set the search direction :math:`p_0 = r_0`.
+    3. For each iteration :math:`k = 0, 1, 2, ...`:
+        - Compute :math:`\alpha_k = \frac{r_k^T r_k}{p_k^T H p_k}`.
+        - Update the solution: :math:`x_{k+1} = x_k + \alpha_k p_k`.
+        - Compute the new residual: :math:`r_{k+1} = r_k - \alpha_k H p_k`.
+        - Update the search direction: :math:`\beta_k = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_k}`,
+          then :math:`p_{k+1} = r_{k+1} + \beta_k p_k`.
 
-    Thus, if all :math:`H_i` are self-adjoint, so is :math:`H` and the CG can be applied.
-    Note however, that the accuracy of the obtained solutions might vary among the different
-    problems.
-    Note also that we don't test if the input operator is self-adjoint or not.
+    This implementation assumes that :math:`H` is self-adjoint and does not verify this condition.
 
-    Further, note that if the condition of :math:`H` is very large, a small residual does not necessarily
-    imply that the solution is accurate.
 
     Parameters
     ----------
     operator
-        self-adjoint operator (named H above)
+        self-adjoint operator (named :math:`H` above)
     right_hand_side
-        right-hand-side of the system (named b above)
+        right-hand-side of the system (named :math:`b` above)
     initial_value
-        initial value of the algorithm; if None, it will be set to right_hand_side
+        initial value of the algorithm; if `None`, it will be set to `right_hand_side`
     max_iterations
-        maximal number of iterations
+        maximal number of iterations. Can be used for early stopping.
     tolerance
         tolerance for the residual; if set to zero, the maximal number of iterations
-        is the only stopping criterion used to stop the cg
+        is the only stopping criterion used to stop the cg.
+        If the condition number of :math:`H` is large, a small residual may not imply a highly accurate solution.
     callback
-        function to be called at each iteration
+        function to be called at each iteration. This can be used to monitor the progress of the algorithm.
 
     Returns
     -------
-        an approximate solution of the linear system Hx=b
+        an approximate solution of the linear system :math:`Hx=b`
     """
     if initial_value is not None and (initial_value.shape != right_hand_side.shape):
         raise ValueError(
diff --git a/src/mrpro/algorithms/optimizers/lbfgs.py b/src/mrpro/algorithms/optimizers/lbfgs.py
index 7a74c3b2f..239f4b8b3 100644
--- a/src/mrpro/algorithms/optimizers/lbfgs.py
+++ b/src/mrpro/algorithms/optimizers/lbfgs.py
@@ -34,7 +34,7 @@ def lbfgs(
 
     .. math::
 
-        \theta_{k+1} = \theta_k - \alpha_k H_k \nabla f(\theta_k),
+        \theta_{k+1} = \theta_k - \alpha_k H_k \nabla f(\theta_k),
 
     where :math:`H_k` is a limited-memory approximation of the inverse Hessian, and :math:`\alpha_k` is the step size
     determined via line search (e.g., strong Wolfe conditions).
@@ -48,16 +48,16 @@ def lbfgs(
     5. Store the latest gradient and update information.
 
     This implementation wraps PyTorch's `torch.optim.LBFGS` class.
-    For more information, see [WIKI_LBFGS]_., [NOC1980]_, and [LIU1989]_.
+    For more information, see [WIKI]_, [NOC1980]_, and [LIU1989]_.
 
     References
     ----------
     .. [NOC1980] Nocedal, J. (1980). "Updating quasi-Newton matrices with limited storage."
-       *Mathematics of Computation*, 35(151), 773-782. DOI:10.1090/S0025-5718-1980-0572855-7
-       https://doi.org/10.1090/S0025-5718-1980-0572855-7
+       *Mathematics of Computation*, 35(151), 773-782.
+       `10.1090/S0025-5718-1980-0572855-7 <https://doi.org/10.1090/S0025-5718-1980-0572855-7>`_
     .. [LIU1989] Liu, D. C., & Nocedal, J. (1989). "On the limited memory BFGS method for large scale optimization."
-       *Mathematical Programming*, 45(1-3), 503-528. DOI:10.1007/BF01589116 https://doi.org/10.1007/BF01589116
-    .. [WIKI_LBFGS] Wikipedia: `LBFGS <https://en.wikipedia.org/wiki/Limited-memory_BFGS>`_
+       *Mathematical Programming*, 45(1-3), 503-528. `10.1007/BF01589116 <https://doi.org/10.1007/BF01589116>`_
+    .. [WIKI] Wikipedia: `Limited-memory_BFGS <https://en.wikipedia.org/wiki/Limited-memory_BFGS>`_
 
     Parameters
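
The adam.py hunk above refers to Adam's bias-corrected moment estimates and AdamW's decoupled weight decay, both provided by the PyTorch classes the function wraps. A minimal usage sketch of `torch.optim.AdamW` on a toy quadratic objective follows; the objective, learning rate, and weight decay are illustrative assumptions, not values taken from the patch::

    import torch

    # Toy parameters and target (illustrative assumptions).
    params = torch.zeros(3, requires_grad=True)
    target = torch.tensor([1.0, -2.0, 0.5])

    # AdamW applies weight decay directly to the parameters, decoupled from the gradient update.
    optimizer = torch.optim.AdamW([params], lr=1e-2, weight_decay=1e-4)

    for _ in range(500):
        optimizer.zero_grad()
        loss = ((params - target) ** 2).sum()
        loss.backward()
        optimizer.step()  # adaptive update from bias-corrected first/second moment estimates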
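
The conjugate-gradient steps listed in the new cg.py docstring can be written out directly. The sketch below is a plain PyTorch transcription of those steps for a single real-valued system, assuming a callable `apply_operator` that applies the self-adjoint :math:`H`; it is illustrative only and is not the mrpro implementation (no batching, no callback)::

    import torch

    def cg_sketch(apply_operator, right_hand_side, initial_value=None, max_iterations=128, tolerance=1e-6):
        """Plain CG for H x = b, following the docstring's steps 1-3 (illustrative, real-valued case)."""
        x = right_hand_side.clone() if initial_value is None else initial_value.clone()
        residual = right_hand_side - apply_operator(x)        # r_0 = b - H x_0
        direction = residual.clone()                          # p_0 = r_0
        residual_norm_sq = torch.dot(residual, residual)
        for _ in range(max_iterations):
            if residual_norm_sq.sqrt() < tolerance:
                break
            h_direction = apply_operator(direction)
            alpha = residual_norm_sq / torch.dot(direction, h_direction)  # alpha_k
            x = x + alpha * direction                         # x_{k+1} = x_k + alpha_k p_k
            residual = residual - alpha * h_direction         # r_{k+1} = r_k - alpha_k H p_k
            new_residual_norm_sq = torch.dot(residual, residual)
            beta = new_residual_norm_sq / residual_norm_sq    # beta_k
            direction = residual + beta * direction           # p_{k+1} = r_{k+1} + beta_k p_k
            residual_norm_sq = new_residual_norm_sq
        return x

    # Small symmetric positive-definite example: H = [[4, 1], [1, 3]], b = [1, 2].
    h = torch.tensor([[4.0, 1.0], [1.0, 3.0]])
    b = torch.tensor([1.0, 2.0])
    x = cg_sketch(lambda v: h @ v, b)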
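
The lbfgs.py docstring notes that the function wraps PyTorch's `torch.optim.LBFGS`, which evaluates the objective through a closure and can use a strong-Wolfe line search. A minimal sketch of that underlying usage pattern follows; the Rosenbrock objective, iteration count, and hyperparameters are illustrative assumptions::

    import torch

    params = torch.tensor([-1.0, 2.0], requires_grad=True)
    optimizer = torch.optim.LBFGS([params], lr=1.0, max_iter=20, history_size=10, line_search_fn='strong_wolfe')

    def closure():
        # LBFGS re-evaluates the objective and its gradient several times per step.
        optimizer.zero_grad()
        loss = (1 - params[0]) ** 2 + 100 * (params[1] - params[0] ** 2) ** 2
        loss.backward()
        return loss

    for _ in range(10):
        optimizer.step(closure)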