diff --git a/CHANGELOG.md b/CHANGELOG.md
index fc0be4e350..4ad5b90d50 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-* 24.x.x
+* 24.2.0
   - New Features:
     - Added SVRG and LSVRG stochastic functions (#1625)
     - Added SAG and SAGA stochastic functions (#1624)
diff --git a/NOTICE.txt b/NOTICE.txt
index e5887d3b80..f4e3e24610 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -65,8 +65,10 @@
 Ashley Gillman (2024) -12
 Zeljko Kereta (2024) - 5
 Evgueni Ovtchinnikov (2024) -1
 Georg Schramm (2024) - 13
+Sam Porter (2024) - 5
 Joshua Hellier (2024) - 3
 Nicholas Whyatt (2024) - 1
+Rasmia Kulan (2024) - 1
 
 CIL Advisory Board:
 Llion Evans - 9
diff --git a/README.md b/README.md
index e77b9a9768..6960eb726c 100644
--- a/README.md
+++ b/README.md
@@ -21,13 +21,13 @@ We recommend using either [`miniconda`](https://docs.conda.io/projects/miniconda
 Install a new environment using:
 
 ```sh
-conda create --name cil -c conda-forge -c https://software.repos.intel.com/python/conda -c ccpi cil=24.1.0
+conda create --name cil -c conda-forge -c https://software.repos.intel.com/python/conda -c ccpi cil=24.2.0
 ```
 
 To install CIL and the additional packages and plugins needed to run the [CIL demos](https://github.com/TomographicImaging/CIL-Demos) install the environment with:
 
 ```sh
-conda create --name cil -c conda-forge -c https://software.repos.intel.com/python/conda -c ccpi cil=24.1.0 astra-toolbox=*=cuda* tigre ccpi-regulariser tomophantom ipykernel ipywidgets
+conda create --name cil -c conda-forge -c https://software.repos.intel.com/python/conda -c ccpi cil=24.2.0 astra-toolbox=*=cuda* tigre ccpi-regulariser tomophantom ipykernel ipywidgets
 ```
 
 where:
diff --git a/Wrappers/Python/cil/optimisation/functions/OperatorCompositionFunction.py b/Wrappers/Python/cil/optimisation/functions/OperatorCompositionFunction.py
index 843648f48f..7f8a3b2f1e 100644
--- a/Wrappers/Python/cil/optimisation/functions/OperatorCompositionFunction.py
+++ b/Wrappers/Python/cil/optimisation/functions/OperatorCompositionFunction.py
@@ -23,7 +23,7 @@
 class OperatorCompositionFunction(Function):
 
-    """ Composition of a function with an operator as : :math:`(F \otimes A)(x) = F(Ax)`
+    """ Composition of a function with an operator as: :math:`(F \circ A)(x) = F(Ax)`
 
     :parameter function: :code:`Function` F
     :parameter operator: :code:`Operator` A
@@ -66,9 +66,9 @@ def __call__(self, x):
 
     def gradient(self, x, out=None):
 
-        """ Return the gradient of F(Ax),
+        """ Return the gradient of :math:`F(Ax)`,
 
-        ..math :: (F(Ax))' = A^{T}F'(Ax)
+        :math:`(F(Ax))' = A^{T}F'(Ax)`
 
         """
diff --git a/Wrappers/Python/cil/optimisation/functions/SVRGFunction.py b/Wrappers/Python/cil/optimisation/functions/SVRGFunction.py
index 51d088e092..28bad61e0d 100644
--- a/Wrappers/Python/cil/optimisation/functions/SVRGFunction.py
+++ b/Wrappers/Python/cil/optimisation/functions/SVRGFunction.py
@@ -33,8 +33,8 @@
 class SVRGFunction(ApproximateGradientSumFunction):
 
     r""" The Stochastic Variance Reduced Gradient (SVRG) function calculates the approximate gradient of :math:`\sum_{i=0}^{n-1}f_i`. For this approximation, every `snapshot_update_interval` number of iterations, a full gradient calculation is made at this "snapshot" point. Intermediate gradient calculations update this snapshot by taking an index :math:`i_k` and calculating the gradient of :math:`f_{i_k}` at the current iterate and the snapshot, updating the approximate gradient to be:
-
     .. math ::
+        n*\nabla f_{i_k}(x_k) - n*\nabla f_{i_k}(\tilde{x}) + \nabla \sum_{i=0}^{n-1}f_i(\tilde{x}),
 
     where :math:`\tilde{x}` is the latest "snapshot" point and :math:`x_k` is the value at the current iteration.
@@ -86,7 +86,7 @@ def __init__(self, functions, sampler=None, snapshot_update_interval=None, store
         self.snapshot = None
 
     def gradient(self, x, out=None):
-        """ Selects a random function using the `sampler` and then calls the approximate gradient at :code:`x` or calculates a full gradient depending on the update frequency
+        r""" Selects a random function using the `sampler` and then calls the approximate gradient at :code:`x` or calculates a full gradient depending on the update frequency
 
         Parameters
         ----------
@@ -115,9 +115,10 @@ def gradient(self, x, out=None):
         return self.approximate_gradient(x, self.function_num, out=out)
 
     def approximate_gradient(self, x, function_num, out=None):
-        """ Calculates the stochastic gradient at the point :math:`x` by using the gradient of the selected function, indexed by :math:`i_k`, the `function_number` in {0,...,len(functions)-1}, and the full gradient at the snapshot :math:`\tilde{x}`
-        .. math ::
-            n*\nabla f_{i_k}(x_k) - n*\nabla f_{i_k}(\tilde{x}) + \nabla \sum_{i=0}^{n-1}f_i(\tilde{x})
+        r""" Calculates the stochastic gradient at the point :math:`x` by using the gradient of the selected function, indexed by :math:`i_k`, the `function_number` in {0,...,len(functions)-1}, and the full gradient at the snapshot :math:`\tilde{x}`
+
+        .. math ::
+            n*\nabla f_{i_k}(x_k) - n*\nabla f_{i_k}(\tilde{x}) + \nabla \sum_{i=0}^{n-1}f_i(\tilde{x})
 
         Note
         -----
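> Reviewer note: the SVRG docstrings above describe the gradient update but not end-to-end usage. A minimal sketch of the intended driving code follows; `fs`, `x0` and `step_size` are placeholders, and the `objective_function` keyword is assumed from the 24.x `GD` API rather than confirmed by this diff.

```python
# Sketch: SVRG as the objective of gradient descent.
# Assumes `fs` is a list of n smooth CIL functions (e.g. LeastSquares built on
# n subsets of the data) and `x0` is a DataContainer of matching geometry.
from cil.optimisation.functions import SVRGFunction
from cil.optimisation.utilities import Sampler
from cil.optimisation.algorithms import GD

f = SVRGFunction(fs, sampler=Sampler.random_with_replacement(len(fs)))
algo = GD(initial=x0, objective_function=f, step_size=step_size)
algo.run(100)  # each iteration evaluates one f_i, plus periodic full "snapshot" gradients
```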
diff --git a/Wrappers/Python/cil/optimisation/operators/Operator.py b/Wrappers/Python/cil/optimisation/operators/Operator.py
index 1afec3c156..59dfcc211b 100644
--- a/Wrappers/Python/cil/optimisation/operators/Operator.py
+++ b/Wrappers/Python/cil/optimisation/operators/Operator.py
@@ -609,7 +609,7 @@ class CompositionOperator(Operator):
 
     Parameters
     ----------
-    args: `Operator`s
+    args: `Operator` s
        Operators to be composed. As in mathematical notation, the operators will be applied right to left
 
     """
diff --git a/Wrappers/Python/cil/optimisation/utilities/StepSizeMethods.py b/Wrappers/Python/cil/optimisation/utilities/StepSizeMethods.py
index a680bd24d7..836cb01bf4 100644
--- a/Wrappers/Python/cil/optimisation/utilities/StepSizeMethods.py
+++ b/Wrappers/Python/cil/optimisation/utilities/StepSizeMethods.py
@@ -77,6 +77,13 @@ class ArmijoStepSizeRule(StepSizeRule):
 
     The Armijo rule runs a while loop to find the appropriate step_size by starting from a very large number (`alpha`). The step_size is found by reducing the step size (by a factor `beta`) in an iterative way until a certain criterion is met. To avoid infinite loops, we add a maximum number of times (`max_iterations`) the while loop is run.
 
+    Reference
+    ---------
+    - Algorithm 3.1 in Nocedal, J. and Wright, S.J. eds., 1999. Numerical Optimization. New York, NY: Springer New York. https://www.math.uci.edu/~qnie/Publications/NumericalOptimization.pdf
+
+    - https://projecteuclid.org/download/pdf_1/euclid.pjm/1102995080
+
+
     Parameters
     ----------
     alpha: float, optional, default=1e6
@@ -89,12 +96,6 @@ class ArmijoStepSizeRule(StepSizeRule):
        If `warmstart = True` the initial step size at each Armijo iteration is the calculated step size from the last iteration. If `warmstart = False` at each Armijo iteration, the initial step size is reset to the original, large `alpha`.
 
    In the case of *well-behaved* convex functions, `warmstart = True` is likely to be computationally less expensive. In the case of non-convex functions, or particularly tricky functions, setting `warmstart = False` may be beneficial.
 
-    Reference
-    ------------
-    - Algorithm 3.1 in Nocedal, J. and Wright, S.J. eds., 1999. Numerical optimization. New York, NY: Springer New York. https://www.math.uci.edu/~qnie/Publications/NumericalOptimization.pdf)
-
-    - https://projecteuclid.org/download/pdf_1/euclid.pjm/1102995080
-
    """
 
    def __init__(self, alpha=1e6, beta=0.5, max_iterations=None, warmstart=True):
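> Reviewer note: a hedged sketch of the rule in use, to accompany the relocated reference block. `f` and `x0` are placeholders for a differentiable CIL function and a starting DataContainer; passing a `StepSizeRule` instance via `step_size` follows the documented 24.x algorithm interface, but treat the exact wiring as an assumption.

```python
# Sketch: Armijo backtracking line search as a step-size rule for GD.
from cil.optimisation.utilities import ArmijoStepSizeRule
from cil.optimisation.algorithms import GD

# Start from a large alpha and halve (beta=0.5) until the Armijo criterion holds.
rule = ArmijoStepSizeRule(alpha=1e6, beta=0.5, max_iterations=40, warmstart=True)
algo = GD(initial=x0, objective_function=f, step_size=rule)
algo.run(20)  # with warmstart=True each search resumes from the last accepted step
```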
diff --git a/Wrappers/Python/cil/optimisation/utilities/sampler.py b/Wrappers/Python/cil/optimisation/utilities/sampler.py
index b518ecd1b5..b313e10c8a 100644
--- a/Wrappers/Python/cil/optimisation/utilities/sampler.py
+++ b/Wrappers/Python/cil/optimisation/utilities/sampler.py
@@ -549,14 +549,21 @@ def _herman_meyer_function(num_indices, addition_arr, repeat_length_arr, iterat
 
     @staticmethod
     def herman_meyer(num_indices):
-        """
-        Instantiates a sampler which outputs in a Herman Meyer order.
+        r"""Instantiates a sampler which outputs in a Herman Meyer order.
 
         Parameters
         ----------
         num_indices: int
            The sampler will select from a range of indices 0 to num_indices. For Herman-Meyer sampling this number should not be prime.
-
+
+        Returns
+        -------
+        Sampler
+            An instance of the Sampler class which outputs in a Herman Meyer order.
+
+
+
+
         Reference
         ----------
         With thanks to Imraj Singh and Zeljko Kereta for their help with the initial implementation of the Herman Meyer sampling. Their implementation was used in:
@@ -567,11 +574,6 @@
 
         Herman GT, Meyer LB. Algebraic reconstruction techniques can be made computationally efficient. IEEE Trans Med Imaging. doi: 10.1109/42.241889.
 
-        Returns
-        -------
-        Sampler
-            An instance of the Sampler class which outputs in a Herman Meyer order.
-
         Example
         -------
         >>> sampler=Sampler.herman_meyer(12)
diff --git a/Wrappers/Python/cil/processors/CofR_image_sharpness.py b/Wrappers/Python/cil/processors/CofR_image_sharpness.py
index 2969c071d5..15d2513011 100644
--- a/Wrappers/Python/cil/processors/CofR_image_sharpness.py
+++ b/Wrappers/Python/cil/processors/CofR_image_sharpness.py
@@ -56,20 +56,22 @@ class CofR_image_sharpness(Processor):
 
     Example
     -------
-    from cil.processors import CentreOfRotationCorrector
+    .. code-block :: python
+
+        from cil.processors import CentreOfRotationCorrector
 
-    processor = CentreOfRotationCorrector.image_sharpness('centre', 'tigre')
-    processor.set_input(data)
-    data_centred = processor.get_output()
+        processor = CentreOfRotationCorrector.image_sharpness('centre', 'tigre')
+        processor.set_input(data)
+        data_centred = processor.get_output()
 
 
     Example
     -------
-    from cil.processors import CentreOfRotationCorrector
+    .. code-block :: python
+
+        from cil.processors import CentreOfRotationCorrector
 
-    processor = CentreOfRotationCorrector.image_sharpness(slice_index=120, 'astra')
-    processor.set_input(data)
-    processor.get_output(out=data)
+        processor = CentreOfRotationCorrector.image_sharpness(slice_index=120, backend='astra')
+        processor.set_input(data)
+        processor.get_output(out=data)
 
 
     Note
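> Reviewer note: a quick sanity check of the reordered `herman_meyer` docstring; `next()` is the sampler's documented draw method, and the expected ordering is taken from the existing `herman_meyer(12)` docstring example.

```python
# Sketch: draw one epoch from the Herman Meyer sampler.
# 12 = 2*2*3 is composite, as Herman-Meyer sampling requires.
from cil.optimisation.utilities import Sampler

sampler = Sampler.herman_meyer(12)
epoch = [sampler.next() for _ in range(12)]
print(epoch)  # per the docstring example: [0, 6, 3, 9, 1, 7, 4, 10, 2, 8, 5, 11]
```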
diff --git a/Wrappers/Python/cil/processors/Padder.py b/Wrappers/Python/cil/processors/Padder.py
index 82cd7548e8..ad6a5448b7 100644
--- a/Wrappers/Python/cil/processors/Padder.py
+++ b/Wrappers/Python/cil/processors/Padder.py
@@ -40,12 +40,12 @@ class Padder(DataProcessor):
 
     Notes
     -----
-    `pad_width` behaviour (number of pixels):
+    `pad_width` behaviour (number of pixels):
        - int: Each axis will be padded with a border of this size
        - tuple(int, int): Each axis will be padded with an asymmetric border i.e. (before, after)
        - dict: Specified axes will be padded: e.g. {'horizontal':(8, 23), 'vertical': 10}
 
-    `pad_values` behaviour:
+    `pad_values` behaviour:
        - float: Each border will use this value
        - tuple(float, float): Each value will be used asymmetrically for each axis i.e. (before, after)
        - dict: Specified axes and values: e.g. {'horizontal':(8, 23), 'channel':5}
@@ -106,12 +106,12 @@ def constant(pad_width=None, constant_values=0.0):
 
     Notes
     -----
-    `pad_width` behaviour (number of pixels):
+    `pad_width` behaviour (number of pixels):
        - int: Each axis will be padded with a border of this size
        - tuple(int, int): Each axis will be padded with an asymmetric border i.e. (before, after)
        - dict: Specified axes will be padded: e.g. {'horizontal':(8, 23), 'vertical': 10}
 
-    `constant_values` behaviour (value of pixels):
+    `constant_values` behaviour (value of pixels):
        - float: Each border will be set to this value
        - tuple(float, float): Each border value will be used asymmetrically for each axis i.e. (before, after)
        - dict: Specified axes and values: e.g. {'horizontal':(8, 23), 'channel':5}
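> Reviewer note: a hedged sketch of the `pad_width`/`pad_values` behaviours listed above, using the dict form from the docstring; `data` stands in for an `ImageData` or `AcquisitionData` instance.

```python
# Sketch: asymmetric zero-padding of the horizontal axis (8 before, 23 after).
from cil.processors import Padder

processor = Padder.constant(pad_width={'horizontal': (8, 23)}, constant_values=0.0)
processor.set_input(data)            # `data` is a placeholder DataContainer
data_padded = processor.get_output()
```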
diff --git a/docs/source/optimisation.rst b/docs/source/optimisation.rst
index c7d1fa7eba..72fc5a6885 100644
--- a/docs/source/optimisation.rst
+++ b/docs/source/optimisation.rst
@@ -239,7 +239,7 @@ Note
 ----
 All the approximate gradients written in CIL are of a similar order of magnitude to the full gradient calculation. For example, in the :code:`SGFunction` we approximate the full gradient by :math:`n\nabla f_i` for an index :math:`i` given by the sampler. The multiplication by :math:`n` is a choice to more easily allow comparisons between stochastic and non-stochastic methods and between stochastic methods with varying numbers of subsets.
 
-The multiplication ensures that the (SAGA, SGD, and SVRG and LSVRG) approximate gradients are an unbiased estimator of the full gradient ie :math:`\mathbb{E}\left[\tilde\nabla f(x)\right] =\nabla f(x)``.
+The multiplication ensures that the (SAGA, SGD, SVRG and LSVRG) approximate gradients are an unbiased estimator of the full gradient, i.e. :math:`\mathbb{E}\left[\tilde\nabla f(x)\right] =\nabla f(x)`.
 
 This has an implication when choosing step sizes. For example, a suitable step size for GD with an SGFunction could be :math:`\propto 1/(L_{max}*n)`, where :math:`L_{max}` is the largest Lipschitz constant of the list of functions in the SGFunction and the additional factor of :math:`n` reflects this multiplication by :math:`n` in the approximate gradient.
@@ -411,14 +411,14 @@ This class allows the user to write a function which does the following:
 
 .. math::
 
     F ( x ) = G ( Ax )
 
-where :math:`A` is an operator. For instance the least squares function l2norm_ :code:`Norm2Sq` can
+where :math:`A` is an operator. For instance the least squares function can
 be expressed as
 
 .. math::
 
-    F(x) = || Ax - b ||^2_2
+    F(x) = || Ax - b ||^2_2 \qquad \text{where} \qquad G(y) = || y - b ||^2_2
 
-.. code::python
+.. code-block :: python
 
     F1 = Norm2Sq(A, b)
     # or equivalently
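> Reviewer note: the hunk ends mid-example at the section boundary. For reference, a hedged sketch of the equivalent composition form the surrounding text describes, using the classes touched by this diff; `A` and `b` are the same placeholders as in the docs.

```python
# Sketch: F(x) = ||Ax - b||_2^2 expressed as the composition G(Ax),
# with G(y) = ||y - b||_2^2, as in the rst passage above.
from cil.optimisation.functions import L2NormSquared, OperatorCompositionFunction

F2 = OperatorCompositionFunction(L2NormSquared(b=b), A)
# F2(x) evaluates G(Ax); F2.gradient(x) applies the chain rule A^T G'(Ax)
# documented in OperatorCompositionFunction.gradient.
```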