diff --git a/CHANGELOG.md b/CHANGELOG.md
index 39ae70677..1d31da7c0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@
   - Updated the `SPDHG` algorithm to include setters for step sizes (#1644)
   - Add FluxNormaliser processor (#1878)
   - SAPBY for the BlockDataContainer now does not require an `out` to be passed (#2008)
+  - Fixed the rendering of the SAG/SAGA documentation (#2011)
 - Dependencies:
   - Added scikit-image to CIL-Demos conda install command as needed for new Callbacks notebook.
 - Changes that break backwards compatibility:
@@ -21,7 +22,7 @@
   - The `run` method in the cil algorithm class will no longer run if a number of iterations is not passed (#1940)
   - Paganin processor now requires the CIL data order (#1920)
   - The gradient descent algorithm now takes `f` instead of `objective_function` to match with ISTA and FISTA (#2006)
-- Testing
+  - Testing
     - Added a new test file `test_algorithm_convergence` that will hold our algorithm tests that run to convergence (#2019)
diff --git a/Wrappers/Python/cil/optimisation/functions/ApproximateGradientSumFunction.py b/Wrappers/Python/cil/optimisation/functions/ApproximateGradientSumFunction.py
index 761641759..504ffc4ad 100644
--- a/Wrappers/Python/cil/optimisation/functions/ApproximateGradientSumFunction.py
+++ b/Wrappers/Python/cil/optimisation/functions/ApproximateGradientSumFunction.py
@@ -227,7 +227,9 @@ def data_passes_indices(self):
 
     @property
     def data_passes(self):
-        """ The property :code:`data_passes` is a list of floats that holds the amount of data that has been processed up until each call of `gradient`. This list is updated each time `gradient` is called by appending the proportion of the data used when calculating the approximate gradient since the class was initialised (a full gradient calculation would be 1 full data pass). Warning: if your functions do not contain an equal `amount` of data, for example your data was not partitioned into equal batches, then you must first use the `set_data_partition_weights" function for this to be accurate. """
+        """ The property :code:`data_passes` is a list of floats that holds the amount of data that has been processed up until each call of `gradient`.
+        This list is updated each time `gradient` is called by appending the proportion of the data used when calculating the approximate gradient since the class was initialised (a full gradient calculation would be 1 full data pass).
+        Note that if your functions do not contain an equal `amount` of data, for example your data was not partitioned into equal batches, then you must first use the `set_data_partition_weights` function for this to be accurate. """
         data_passes = []
         for el in self.data_passes_indices:
             try:
diff --git a/Wrappers/Python/cil/optimisation/functions/SAGFunction.py b/Wrappers/Python/cil/optimisation/functions/SAGFunction.py
index 31e0a0ca3..c731e4502 100644
--- a/Wrappers/Python/cil/optimisation/functions/SAGFunction.py
+++ b/Wrappers/Python/cil/optimisation/functions/SAGFunction.py
@@ -29,10 +29,10 @@ class SAGFunction(ApproximateGradientSumFunction):
     r"""
-    The stochastic average gradient (SAG) function takes a index :math:`i_k` and calculates the approximate gradient of :math:`\sum_{i=1}^{n-1}f_i` at iteration :math:`x_k` as
+    The stochastic average gradient (SAG) function takes an index :math:`i_k` and calculates the approximate gradient of :math:`\sum_{i=0}^{n-1}f_i` at iteration :math:`x_k` as
 
     .. math ::
-        \sum_{i=1}^{n-1} g_i^k \qquad \text{where} \qquad g_i^k= \begin{cases}
+        \sum_{i=0}^{n-1} g_i^k \qquad \text{where} \qquad g_i^k= \begin{cases}
             \nabla f_i(x_k), \text{ if } i=i_k\\
             g_i^{k-1},\text{ otherwise }
             \end{cases}
@@ -46,14 +46,14 @@ class SAGFunction(ApproximateGradientSumFunction):
     -----
     Compared with the literature, we do not divide by :math:`n`, the number of functions, so that we return an approximate gradient of the whole sum function and not an average gradient.
 
-    Reference
-    ----------
-    Schmidt, M., Le Roux, N. and Bach, F., 2017. Minimizing finite sums with the stochastic average gradient. Mathematical Programming, 162, pp.83-112. https://doi.org/10.1007/s10107-016-1030-6.
+    Note
+    ----------
+    Reference: Schmidt, M., Le Roux, N. and Bach, F., 2017. Minimizing finite sums with the stochastic average gradient. Mathematical Programming, 162, pp.83-112. https://doi.org/10.1007/s10107-016-1030-6.
 
-    Parameters:
+    Parameters
     -----------
     functions : `list` of functions
-        A list of functions: :math:`[f_{0}, f_{1}, ..., f_{n-1}]`. Each function is assumed to be smooth with an implemented :func:`~Function.gradient` method. All functions must have the same domain. The number of functions (equivalently the length of the list `n`) must be strictly greater than 1.
+        A list of functions: :math:`f_{0}, f_{1}, ..., f_{n-1}`. Each function is assumed to be smooth with an implemented :func:`~Function.gradient` method. All functions must have the same domain. The number of functions (equivalently the length of the list `n`) must be strictly greater than 1.
     sampler: An instance of a CIL Sampler class ( :meth:`~optimisation.utilities.sampler`) or of another class which has a `next` function implemented to output integers in :math:`{0,...,n-1}`.
         This sampler is called each time `gradient` is called and sets the internal `function_num` passed to the `approximate_gradient` function. Default is `Sampler.random_with_replacement(len(functions))`.
@@ -62,7 +62,7 @@ class SAGFunction(ApproximateGradientSumFunction):
     The user has the option of calling the class method `warm_start_approximate_gradients` after initialising this class. This will compute and store the gradient for each function at an initial point, equivalently setting :math:`g_i^0=\nabla f_i(x_0)` for initial point :math:`x_0`. If this method is not called, the gradients are initialised with zeros.
 
-    Note:
+    Note
     ------
     This function's memory requirements are `n + 3` times the image space, that is with 100 subsets the memory requirement is 103 images, which is huge.
@@ -134,7 +134,7 @@ def _update_approx_gradient(self, out):
         return out
 
     def warm_start_approximate_gradients(self, initial):
-        """A function to warm start SAG or SAGA algorithms by initialising all the gradients at an initial point. Equivalently setting :math:`g_i^0=\nabla f_i(x_0)` for initial point :math:`x_0`.
+        r"""A function to warm start SAG or SAGA algorithms by initialising all the gradients at an initial point. Equivalently setting :math:`g_i^0 = \nabla f_i(x_0)` for initial point :math:`x_0`.
 
         Parameters
         ----------
@@ -142,9 +142,10 @@ def warm_start_approximate_gradients(self, initial):
             The initial point to warmstart the calculation
 
         Note
-        ----
+        ------
         When using SAG or SAGA with a deterministic algorithm, you should warm start the SAG-SAGA Function with the same initial point that you initialise the algorithm
+
         """
         self._list_stored_gradients = [
             fi.gradient(initial) for fi in self.functions]
@@ -167,10 +168,10 @@ def data_passes_indices(self):
 
 class SAGAFunction(SAGFunction):
     r"""
-    SAGA (SAG-Ameliore) is an accelerated version of the stochastic average gradient (SAG) function which takes a index :math:`i_k` and calculates the approximate gradient of :math:`\sum_{i=1}^{n-1}f_i` at iteration :math:`x_k` as
+    SAGA (SAG-Ameliore) is an accelerated version of the stochastic average gradient (SAG) function which takes an index :math:`i_k` and calculates the approximate gradient of :math:`\sum_{i=0}^{n-1}f_i` at iteration :math:`x_k` as
 
     .. math ::
-        n\left(g_{i_k}^{k}-g_{i_k}^{k-1}\right)+\sum_{i=1}^{n-1} g_i^{k-1} \qquad \text{where} \qquad g_i^k= \begin{cases}
+        n\left(g_{i_k}^{k}-g_{i_k}^{k-1}\right)+\sum_{i=0}^{n-1} g_i^{k-1} \qquad \text{where} \qquad g_i^k= \begin{cases}
             \nabla f_i(x_k), \text{ if } i=i_k\\
             g_i^{k-1},\text{ otherwise}
             \end{cases}
@@ -182,17 +183,19 @@ class SAGAFunction(SAGFunction):
     ------
     Compared with the literature, we do not divide by :math:`n`, the number of functions, so that we return an approximate gradient of the whole sum function and not an average gradient.
 
-    Note:
+
+    Note
     ------
     This function's memory requirements are `n + 3` times the image space, that is with 100 subsets the memory requirement is 103 images, which is huge.
 
-    Reference
-    ----------
-    Defazio, A., Bach, F. and Lacoste-Julien, S., 2014. SAGA: A fast incremental gradient method with support for non-strongly convex composite objectives. Advances in neural information processing systems, 27. https://proceedings.neurips.cc/paper_files/paper/2014/file/ede7e2b6d13a41ddf9f4bdef84fdc737-Paper.pdf
+
+    Note
+    ------
+    Reference: Defazio, A., Bach, F. and Lacoste-Julien, S., 2014. SAGA: A fast incremental gradient method with support for non-strongly convex composite objectives. Advances in neural information processing systems, 27. https://proceedings.neurips.cc/paper_files/paper/2014/file/ede7e2b6d13a41ddf9f4bdef84fdc737-Paper.pdf
 
-    Parameters:
+    Parameters
     -----------
     functions : `list` of functions
         A list of functions: :code:`[f_{0}, f_{1}, ..., f_{n-1}]`. Each function is assumed to be smooth function with an implemented :func:`~Function.gradient` method. Each function must have the same domain. The number of functions must be strictly greater than 1.
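
The SAG and SAGA update rules documented in the docstrings above can be checked with a small standalone sketch. The following is a minimal NumPy illustration of the bookkeeping those formulas describe, using an assumed toy least-squares problem (the names `A`, `b`, `grad_fi`, `sag_grad` and `saga_grad` are invented for this example); it is not CIL's implementation, which stores the per-function gradients in `_list_stored_gradients` and operates on CIL DataContainers.

```python
import numpy as np

# Toy problem: f_i(x) = 0.5 * ||A_i x - b_i||^2, so grad f_i(x) = A_i^T (A_i x - b_i).
# A, b and grad_fi are made up for illustration; they are not part of CIL.
rng = np.random.default_rng(0)
n, dim = 5, 3
A = [rng.standard_normal((4, dim)) for _ in range(n)]
b = [rng.standard_normal(4) for _ in range(n)]

def grad_fi(i, x):
    return A[i].T @ (A[i] @ x - b[i])

x_k = np.zeros(dim)

# Stored gradients g_i^{k-1}: zeros by default, or grad f_i(x_0) after a warm start.
g = [np.zeros(dim) for _ in range(n)]
# g = [grad_fi(i, x_k) for i in range(n)]   # warm-start alternative

i_k = int(rng.integers(n))   # index produced by the sampler at this call
g_new = grad_fi(i_k, x_k)    # gradient of f_{i_k} at the current iterate

# SAG: sum of the stored gradients with the i_k-th entry replaced by the new one.
sag_grad = sum(g) - g[i_k] + g_new

# SAGA: n * (new - old) correction plus the sum of the *old* stored gradients.
saga_grad = n * (g_new - g[i_k]) + sum(g)

g[i_k] = g_new               # both variants then overwrite the stored gradient
```

As the docstrings note, neither expression is divided by :math:`n`, so both are approximations of the gradient of the whole sum rather than of the average, and warm starting simply replaces the zero initialisation of the stored gradients with the gradients evaluated at the initial point.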