From 040f214cabaad620eeee0a51108fd0974bb01933 Mon Sep 17 00:00:00 2001 From: Margaret Duff <43645617+MargaretDuff@users.noreply.github.com> Date: Tue, 26 Mar 2024 12:33:46 +0000 Subject: [PATCH] ApproximateGradientSumFunction class and example SGFunction for Stochastic Gradient Descent (#1550) Signed-off-by: Margaret Duff <43645617+MargaretDuff@users.noreply.github.com> Co-authored-by: epapoutsellis Co-authored-by: gfardell Co-authored-by: Edoardo Pasca --- CHANGELOG.md | 1 + NOTICE.txt | 8 +- .../ApproximateGradientSumFunction.py | 239 ++++++++++++++++++ .../cil/optimisation/functions/Function.py | 3 + .../cil/optimisation/functions/SGFunction.py | 82 ++++++ .../cil/optimisation/functions/__init__.py | 3 + .../Python/test/test_approximate_gradient.py | 203 +++++++++++++++ docs/source/optimisation.rst | 127 +++++++++- 8 files changed, 663 insertions(+), 3 deletions(-) create mode 100644 Wrappers/Python/cil/optimisation/functions/ApproximateGradientSumFunction.py create mode 100644 Wrappers/Python/cil/optimisation/functions/SGFunction.py create mode 100644 Wrappers/Python/test/test_approximate_gradient.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 6198f8407c..ac43faaed2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ - Improved import error/warning messages - New adjoint operator - Bug fix for complex matrix adjoint + - Added the ApproximateGradientSumFunction and SGFunction to allow for stochastic gradient algorithms to be created using functions with an approximate gradient and deterministic algorithms * 23.1.0 diff --git a/NOTICE.txt b/NOTICE.txt index f9dc60e4d4..cb4148589a 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -32,6 +32,8 @@ Institutions Key: 9 - Swansea University 10 - University of Warwick 11 - University of Helsinki +12 - Australian e-Health Research, Australia +13 - KU Leuven CIL Developers in date order: Edoardo Pasca (2017 – present) - 1 @@ -49,7 +51,7 @@ Srikanth Nagella (2017-2018) - 1 Daniil Kazantsev (2018) - 3 Ryan Warr (2019) - 3 Tomas Kulhanek (2019) - 1 -Claire Delplancke (2019 - 2022) - 7 +Claire Delplancke (2019 - 2022) - 7 Matthias Ehrhardt (2019 - 2023) - 7 Richard Brown (2020-2021) - 5 Sam Tygier (2022) - 1 @@ -57,6 +59,10 @@ Andrew Sharits (2022) - 8 Kyle Pidgeon (2023) - 1 Letizia Protopapa (2023) - 1 Tommi Heikkilä (2023) - 11 +Ashley Gillman (2024) -12 +Zeljko Kerata (2024) - 5 +Evgueni Ovtchinnikov (2024) -1 +Georg Schramm (2024) - 13 CIL Advisory Board: Llion Evans - 9 diff --git a/Wrappers/Python/cil/optimisation/functions/ApproximateGradientSumFunction.py b/Wrappers/Python/cil/optimisation/functions/ApproximateGradientSumFunction.py new file mode 100644 index 0000000000..e4109c7980 --- /dev/null +++ b/Wrappers/Python/cil/optimisation/functions/ApproximateGradientSumFunction.py @@ -0,0 +1,239 @@ +# Copyright 2024 United Kingdom Research and Innovation +# Copyright 2024 The University of Manchester +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+# Authors:
+# - CIL Developers, listed at: https://github.com/TomographicImaging/CIL/blob/master/NOTICE.txt
+# - Daniel Deidda (National Physical Laboratory, UK)
+# - Claire Delplancke (Electricite de France, Research and Development)
+# - Ashley Gillman (Australian e-Health Res. Ctr., CSIRO, Brisbane, Queensland, Australia)
+# - Zeljko Kerata (Department of Computer Science, University College London, UK)
+# - Evgueni Ovtchinnikov (STFC - UKRI)
+# - Georg Schramm (Department of Imaging and Pathology, Division of Nuclear Medicine, KU Leuven, Leuven, Belgium)
+
+
+from cil.optimisation.functions import SumFunction
+from cil.optimisation.utilities import Sampler
+import numbers
+from abc import ABC, abstractmethod
+import numpy as np
+
+
+class ApproximateGradientSumFunction(SumFunction, ABC):
+    r"""ApproximateGradientSumFunction represents the following sum
+
+    .. math:: \sum_{i=0}^{n-1} f_{i} = (f_{0} + f_{1} + ... + f_{n-1})
+
+    where there are :math:`n` functions. This function class provides two gradient methods:
+
+    - `full_gradient` calculates the gradient of the sum :math:`\sum_{i=0}^{n-1} \nabla f_{i}`
+    - `gradient` calls an `approximate_gradient` function which may be less computationally expensive to calculate than the full gradient
+
+    This class is an abstract class.
+
+    Parameters
+    -----------
+    functions : `list` of functions
+        A list of functions: :math:`[f_{0}, f_{1}, ..., f_{n-1}]`. Each function is assumed to be smooth with an implemented :func:`~Function.gradient` method. All functions must have the same domain. The number of functions (equivalently the length of the list) must be strictly greater than 1.
+    sampler: An instance of a CIL Sampler class (:meth:`~optimisation.utilities.sampler`) or of another class which has a :code:`__next__` function implemented to output integers in :math:`\{0,...,n-1\}`.
+        This sampler is called each time :code:`gradient` is called and sets the internal :code:`function_num` passed to the :code:`approximate_gradient` function. Default is :code:`Sampler.random_with_replacement(len(functions))`.
+
+    Note
+    -----
+    We ensure that the approximate gradient is of a similar order of magnitude to the full gradient calculation. For example, in the :code:`SGFunction` we approximate the full gradient by :math:`n\nabla f_i` for an index :math:`i` given by the sampler.
+    The multiplication by :math:`n` is a choice to more easily allow comparisons between stochastic and non-stochastic methods and between stochastic methods with varying numbers of subsets.
+
+    Note
+    -----
+    Each time :code:`gradient` is called the class keeps track of which functions have been used to calculate the gradient. This may be useful for debugging or plotting after using this function in an iterative algorithm.
+
+    - The property :code:`data_passes_indices` is a list of lists holding the indices of the functions that are processed in each call of `gradient`. This list is updated each time `gradient` is called by appending a list of the indices of the functions used to calculate the gradient.
+    - The property :code:`data_passes` is a list of floats that holds the amount of data that has been processed up until each call of `gradient`. This list is updated each time `gradient` is called by appending the proportion of the data used when calculating the approximate gradient since the class was initialised (a full gradient calculation would be 1 full data pass).
+      Warning: if your functions do not contain an equal amount of data, for example your data was not partitioned into equal batches, then you must first use the `set_data_partition_weights` function for this to be accurate.
+
+    Note
+    ----
+    The :meth:`~ApproximateGradientSumFunction.gradient` returns the approximate gradient depending on an index provided by the :code:`sampler`.
+
+    Example
+    -------
+    This class is an abstract base class, so we give an example using the SGFunction child class.
+
+    Consider minimising the objective:
+
+    .. math:: \sum_{i=0}^{n-1} f_{i}(x) = \sum_{i=0}^{n-1}\|A_{i} x - b_{i}\|^{2}
+
+    >>> list_of_functions = [LeastSquares(Ai, b=bi) for Ai, bi in zip(A_subsets, b_subsets)]
+    >>> sampler = Sampler.sequential(len(list_of_functions))
+    >>> f = SGFunction(list_of_functions, sampler=sampler)
+    >>> f.full_gradient(x)
+    This will return :math:`\sum_{i=0}^{n-1} \nabla f_{i}(x)`
+    >>> f.gradient(x)
+    As per the approximate gradient implementation in the SGFunction this will return :math:`n\nabla f_{0}(x)`. The choice of the `0` index is because we chose a `sequential` sampler and this is the first time we called `gradient`.
+    >>> f.gradient(x)
+    This will return :math:`n\nabla f_{1}(x)` because we chose a `sequential` sampler and this is the second time we called `gradient`.
+
+    """
+
+    def __init__(self, functions, sampler=None):
+
+        if sampler is None:
+            sampler = Sampler.random_with_replacement(len(functions))
+
+        if not isinstance(functions, list):
+            raise TypeError("Input to functions should be a list of functions")
+        if not hasattr(sampler, "next"):
+            raise ValueError('The provided sampler must have a `next` method')
+
+        self.sampler = sampler
+
+        self._partition_weights = [1 / len(functions)] * len(functions)
+
+        self._data_passes_indices = []
+
+        super(ApproximateGradientSumFunction, self).__init__(*functions)
+
+    def __call__(self, x):
+        r"""Returns the value of the sum of functions at :math:`x`.
+
+        .. math:: (f_{0} + f_{1} + ... + f_{n-1})(x) = f_{0}(x) + f_{1}(x) + ... + f_{n-1}(x)
+
+        Parameters
+        ----------
+        x : DataContainer
+
+        Returns
+        --------
+        float
+            the value of the SumFunction at x
+
+        """
+        return super(ApproximateGradientSumFunction, self).__call__(x)
+
+    def full_gradient(self, x, out=None):
+        r"""Returns the value of the full gradient of the sum of functions at :math:`x`.
+
+        .. math:: \nabla_x(f_{0} + f_{1} + ... + f_{n-1})(x) = \nabla_xf_{0}(x) + \nabla_xf_{1}(x) + ... + \nabla_xf_{n-1}(x)
+
+        Parameters
+        ----------
+        x : DataContainer
+        out: return DataContainer, if `None` a new DataContainer is returned, default `None`.
+
+        Returns
+        --------
+        DataContainer
+            The value of the full gradient of the sum function at x. If `out` is provided, the result is written to `out`.
+        """
+
+        return super(ApproximateGradientSumFunction, self).gradient(x, out=out)
+
+    @abstractmethod
+    def approximate_gradient(self, x, function_num, out=None):
+        """ Returns the approximate gradient at a given point :code:`x` given a `function_num` in {0,...,len(functions)-1}.
+
+        Parameters
+        ----------
+        x : DataContainer
+        out: return DataContainer, if `None` a new DataContainer is returned, default `None`.
+        function_num: `int`
+            Between 0 and the number of functions in the list
+        Returns
+        --------
+        DataContainer
+            the value of the approximate gradient of the sum function at :code:`x` given a `function_num` in {0,...,len(functions)-1}
+        """
+        pass
+
+    def gradient(self, x, out=None):
+        """ Uses the `sampler` to select a function index and then calls the approximate gradient at :code:`x`
+
+        Parameters
+        ----------
+        x : DataContainer
+        out: return DataContainer, if `None` a new DataContainer is returned, default `None`.
+
+        Returns
+        --------
+        DataContainer
+            the value of the approximate gradient of the sum function at :code:`x`
+        """
+
+        self.function_num = self.sampler.next()
+
+        self._update_data_passes_indices([self.function_num])
+
+        return self.approximate_gradient(x, self.function_num, out=out)
+
+    def _update_data_passes_indices(self, indices):
+        """ Internal function that updates the list of lists containing the function indices used to calculate the approximate gradient.
+
+        Parameters
+        ----------
+        indices: list
+            List of indices used to calculate the approximate gradient in a given iteration
+
+        """
+        self._data_passes_indices.append(indices)
+
+    def set_data_partition_weights(self, weights):
+        """ Setter for the partition weights used to calculate the data passes
+
+        Parameters
+        ----------
+        weights: list of positive floats that sum to one.
+            The proportion of the data held in each function. Equivalent to the proportions that you partitioned your data into.
+
+        """
+        if len(weights) != len(self.functions):
+            raise ValueError(
+                'The provided weights must be a list the same length as the number of functions')
+
+        if abs(sum(weights) - 1) > 1e-6:
+            raise ValueError('The provided weights must sum to one')
+
+        if any(np.array(weights) < 0):
+            raise ValueError(
+                'The provided weights must be greater than or equal to zero')
+
+        self._partition_weights = weights
+
+    @property
+    def data_passes_indices(self):
+        """ The property :code:`data_passes_indices` is a list of lists holding the indices of the functions that are processed in each call of `gradient`. This list is updated each time `gradient` is called by appending a list of the indices of the functions used to calculate the gradient. """
+        return self._data_passes_indices
+
+    @property
+    def data_passes(self):
+        """ The property :code:`data_passes` is a list of floats that holds the amount of data that has been processed up until each call of `gradient`. This list is updated each time `gradient` is called by appending the proportion of the data used when calculating the approximate gradient since the class was initialised (a full gradient calculation would be 1 full data pass). Warning: if your functions do not contain an equal amount of data, for example your data was not partitioned into equal batches, then you must first use the `set_data_partition_weights` function for this to be accurate.
""" + data_passes = [] + for el in self._data_passes_indices: + try: + data_passes.append(data_passes[-1]) + except IndexError: + data_passes.append(0) + for i in el: + data_passes[-1] += self._partition_weights[i] + return data_passes diff --git a/Wrappers/Python/cil/optimisation/functions/Function.py b/Wrappers/Python/cil/optimisation/functions/Function.py index 2e8136de96..dce45f9617 100644 --- a/Wrappers/Python/cil/optimisation/functions/Function.py +++ b/Wrappers/Python/cil/optimisation/functions/Function.py @@ -377,6 +377,9 @@ def __add__(self, other): else: return super(SumFunction, self).__add__(other) + @property + def num_functions(self): + return len(self.functions) class ScaledFunction(Function): diff --git a/Wrappers/Python/cil/optimisation/functions/SGFunction.py b/Wrappers/Python/cil/optimisation/functions/SGFunction.py new file mode 100644 index 0000000000..ba7dd780b4 --- /dev/null +++ b/Wrappers/Python/cil/optimisation/functions/SGFunction.py @@ -0,0 +1,82 @@ +# Copyright 2024 United Kingdom Research and Innovation +# Copyright 2024 The University of Manchester +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Authors: +# - CIL Developers, listed at: https://github.com/TomographicImaging/CIL/blob/master/NOTICE.txt +# - Daniel Deidda (National Physical Laboratory, UK) +# - Claire Delplancke (Electricite de France, Research and Development) +# - Ashley Gillman (Australian e-Health Res. Ctr., CSIRO, Brisbane, Queensland, Australia) +# - Zeljko Kerata (Department of Computer Science, University College London, UK) +# - Evgueni Ovtchinnikov (STFC - UKRI) +# - Georg Schramm (Department of Imaging and Pathology, Division of Nuclear Medicine, KU Leuven, Leuven, Belgium) + +from .ApproximateGradientSumFunction import ApproximateGradientSumFunction +from .Function import SumFunction + +class SGFunction(ApproximateGradientSumFunction): + + r""" + Stochastic gradient function, a child class of :code:`ApproximateGradientSumFunction`, which defines from a list of functions, :math:`{f_0,...,f_{n-1}}` a `SumFunction`, :math:`f_0+...+f_{n-1}` where each time the `gradient` is called, the :code:`sampler` provides an index, :math:`i \in {0,...,n-1}` + and the :code:`gradient` method returns the approximate gradient :math:`n \nabla_x f_i(x)`. This can be used with the :code:`cil.optimisation.algorithms` algorithm :code:`GD` to give a stochastic gradient descent algorithm. + + Parameters + ----------- + functions : `list` of functions + A list of functions: :code:`[f_{0}, f_{1}, ..., f_{n-1}]`. Each function is assumed to be smooth with an implemented :func:`~Function.gradient` method. Although CIL does not define a domain of a :code:`Function`, all functions are supposed to have the same domain. The number of functions must be strictly greater than 1. + sampler: An instance of a CIL Sampler class ( :meth:`~optimisation.utilities.sampler`) or of another class which has a :code:`__next__` function implemented to output integers in {0,...,n-1}. 
+        This sampler is called each time :code:`gradient` is called and sets the internal :code:`function_num` passed to the :code:`approximate_gradient` function. Default is :code:`Sampler.random_with_replacement(len(functions))`.
+    """
+
+    def __init__(self, functions, sampler=None):
+        super(SGFunction, self).__init__(functions, sampler)
+
+    def approximate_gradient(self, x, function_num, out=None):
+
+        r""" Returns :math:`n` times the gradient of the function at index `function_num`, evaluated at :code:`x`, where :math:`n` is the number of functions in the list.
+
+        Parameters
+        ----------
+        x : DataContainer
+        out: return DataContainer, if `None` a new DataContainer is returned, default `None`.
+        function_num: `int`
+            Between 0 and the number of functions in the list
+        Returns
+        --------
+        DataContainer
+            the value of the approximate gradient of the sum function at :code:`x` given a `function_num` in {0,...,len(functions)-1}
+        """
+        if function_num >= self.num_functions or function_num < 0:
+            raise IndexError(
+                'The sampler has outputted an index outside the range of functions to sample from. Please ensure your sampler samples from {0,1,...,len(functions)-1} only.')
+
+        # compute the gradient of the function indexed by function_num
+        if out is None:
+            out = self.functions[function_num].gradient(x)
+        else:
+            self.functions[function_num].gradient(x, out=out)
+
+        # scale with respect to the number of functions in the sum
+        out *= self.num_functions
+
+        return out
diff --git a/Wrappers/Python/cil/optimisation/functions/__init__.py b/Wrappers/Python/cil/optimisation/functions/__init__.py
index 2edbb10afc..d8ac7cc5da 100644
--- a/Wrappers/Python/cil/optimisation/functions/__init__.py
+++ b/Wrappers/Python/cil/optimisation/functions/__init__.py
@@ -34,3 +34,6 @@
 from .KullbackLeibler import KullbackLeibler
 from .Rosenbrock import Rosenbrock
 from .TotalVariation import TotalVariation
+from .ApproximateGradientSumFunction import ApproximateGradientSumFunction
+from .SGFunction import SGFunction
+
diff --git a/Wrappers/Python/test/test_approximate_gradient.py b/Wrappers/Python/test/test_approximate_gradient.py
new file mode 100644
index 0000000000..774cdd9fa2
--- /dev/null
+++ b/Wrappers/Python/test/test_approximate_gradient.py
@@ -0,0 +1,203 @@
+# Copyright 2024 United Kingdom Research and Innovation
+# Copyright 2024 The University of Manchester
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +# Authors: +# CIL Developers, listed at: https://github.com/TomographicImaging/CIL/blob/master/NOTICE.txt + + +import unittest +from utils import initialise_tests + +import numpy as np + +from cil.framework import VectorData + + +from cil.utilities import dataexample +from cil.optimisation.functions import LeastSquares +from cil.optimisation.functions import ApproximateGradientSumFunction +from cil.optimisation.functions import SGFunction +from cil.optimisation.operators import MatrixOperator +from cil.optimisation.algorithms import GD +from cil.framework import VectorData +from cil.optimisation.utilities import Sampler, SamplerRandom + +from testclass import CCPiTestClass +from utils import has_astra + +initialise_tests() + +if has_astra: + from cil.plugins.astra import ProjectionOperator + +from utils import has_cvxpy + +if has_cvxpy: + import cvxpy + + +class TestApproximateGradientSumFunction(CCPiTestClass): + + def setUp(self): + self.sampler = Sampler.random_with_replacement(5) + self.initial = VectorData(np.zeros(10)) + self.b = VectorData(np.random.normal(0, 1, 10)) + self.functions = [] + for i in range(5): + diagonal = np.zeros(10) + diagonal[2*i:2*(i+1)] = 1 + A = MatrixOperator(np.diag(diagonal)) + self.functions.append(LeastSquares(A, A.direct(self.b))) + if i == 0: + self.objective = LeastSquares(A, A.direct(self.b)) + else: + self.objective += LeastSquares(A, A.direct(self.b)) + + def test_ABC(self): + with self.assertRaises(TypeError): + self.stochastic_objective = ApproximateGradientSumFunction( + self.functions, self.sampler) + + +class TestSGD(CCPiTestClass): + + def setUp(self): + + + + self.sampler = Sampler.random_with_replacement(6) + self.initial = VectorData(np.zeros(30)) + b = VectorData(np.array(range(30))/50) + self.n_subsets = 6 + self.f_subsets = [] + for i in range(6): + diagonal = np.zeros(30) + diagonal[5*i:5*(i+1)] = 1 + Ai = MatrixOperator(np.diag(diagonal)) + self.f_subsets.append(LeastSquares(Ai, Ai.direct(b))) + self.A=MatrixOperator(np.diag(np.ones(30))) + self.f = LeastSquares(self.A, b) + self.f_stochastic = SGFunction(self.f_subsets, self.sampler) + + + + def test_approximate_gradient_not_equal_full(self): + self.assertFalse((self.f_stochastic.full_gradient( + self.initial) == self.f_stochastic.gradient(self.initial).array).all()) + + + def test_sampler(self): + self.assertTrue(isinstance(self.f_stochastic.sampler, SamplerRandom)) + f = SGFunction(self.f_subsets) + self.assertTrue(isinstance(f.sampler, SamplerRandom)) + self.assertEqual(f.sampler._type, 'random_with_replacement') + + + def test_call(self): + self.assertAlmostEqual(self.f_stochastic( + self.initial), self.f(self.initial), 1) + + + def test_full_gradient(self): + self.assertNumpyArrayAlmostEqual(self.f_stochastic.full_gradient( + self.initial).array, self.f.gradient(self.initial).array, 2) + + + def test_value_error_with_only_one_function(self): + with self.assertRaises(ValueError): + SGFunction([self.f], self.sampler) + pass + + + def test_type_error_if_functions_not_a_list(self): + with self.assertRaises(TypeError): + SGFunction(self.f, self.sampler) + + + def test_sampler_without_next(self): + class bad_Sampler(): + def init(self): + pass + bad_sampler = bad_Sampler() + with self.assertRaises(ValueError): + SGFunction([self.f, self.f], bad_sampler) + + + def test_sampler_out_of_range(self): + bad_sampler = Sampler.sequential(10) + f = SGFunction([self.f, self.f], bad_sampler) + with self.assertRaises(IndexError): + f.gradient(self.initial) + f.gradient(self.initial) + 
f.gradient(self.initial) + + def test_partition_weights(self): + f_stochastic=SGFunction(self.f_subsets, Sampler.sequential(self.n_subsets)) + self.assertListEqual(f_stochastic._partition_weights, [1 / self.n_subsets] * self.n_subsets) + with self.assertRaises(ValueError): + f_stochastic.set_data_partition_weights( list(range(self.n_subsets))) + with self.assertRaises(ValueError): + f_stochastic.set_data_partition_weights( [1]) + with self.assertRaises(ValueError): + f_stochastic.set_data_partition_weights( [-1]+[2/(self.n_subsets-1)]*(self.n_subsets-1)) + a=[i/float(sum(range(self.n_subsets))) for i in range(self.n_subsets)] + f_stochastic.set_data_partition_weights( a) + self.assertListEqual(f_stochastic._partition_weights, a ) + f_stochastic.gradient(self.initial) + for i in range(1,20): + f_stochastic.gradient(self.initial) + self.assertEqual(f_stochastic.data_passes[i], f_stochastic.data_passes[i-1]+a[i%self.n_subsets]) + + + + + + @unittest.skipUnless(has_cvxpy, "CVXpy not installed") + def test_SGD_toy_example(self): + sampler = Sampler.random_with_replacement(5) + initial = VectorData(np.zeros(25)) + b = VectorData(np.array(range(25))) + functions = [] + for i in range(5): + diagonal = np.zeros(25) + diagonal[5*i:5*(i+1)] = 1 + A = MatrixOperator(np.diag(diagonal)) + functions.append(0.5*LeastSquares(A, A.direct(b))) + + Aop=MatrixOperator(np.diag(np.ones(25))) + + u_cvxpy = cvxpy.Variable(b.shape[0]) + objective = cvxpy.Minimize( 0.5*cvxpy.sum_squares(Aop.A @ u_cvxpy - Aop.direct(b).array)) + p = cvxpy.Problem(objective) + p.solve(verbose=True, solver=cvxpy.SCS, eps=1e-4) + + + stochastic_objective = SGFunction(functions, sampler) + + alg_stochastic = GD(initial=initial, + objective_function=stochastic_objective, update_objective_interval=1000, + step_size=1/stochastic_objective.L) + alg_stochastic.run(600, verbose=0) + self.assertAlmostEqual(stochastic_objective.data_passes[-1], 600/5) + self.assertListEqual(stochastic_objective.data_passes_indices[-1], [stochastic_objective.function_num]) + + np.testing.assert_allclose(p.value ,stochastic_objective(alg_stochastic.x) , atol=1e-1) + self.assertNumpyArrayAlmostEqual( + alg_stochastic.x.as_array(), u_cvxpy.value, 3) + self.assertNumpyArrayAlmostEqual( + alg_stochastic.x.as_array(), b.as_array(), 3) + + + \ No newline at end of file diff --git a/docs/source/optimisation.rst b/docs/source/optimisation.rst index b51b639ce1..1309e379f0 100644 --- a/docs/source/optimisation.rst +++ b/docs/source/optimisation.rst @@ -32,8 +32,8 @@ The fundamental components are: -Algorithms -========== +Algorithms (Deterministic) +========================== A number of generic algorithm implementations are provided including Gradient Descent (GD), Conjugate Gradient Least Squares (CGLS), @@ -119,13 +119,121 @@ LADMM :members: :inherited-members: run, update_objective_interval, max_iteration + + +Algorithms (Stochastic) +======================== + +Consider optimisation problems that take the form of a separable sum: + +.. math:: \min_{x} f(x)+g(x) = \min_{x} \sum_{i=0}^{n-1} f_{i}(x) + g(x) = \min_{x} (f_{0}(x) + f_{1}(x) + ... + f_{n-1}(x))+g(x) + +where :math:`n` is the number of functions. Where there is a large number of :math:`f_i` or their gradients are expensive to calculate, stochastic optimisation methods could prove more efficient. +There is a growing range of Stochastic optimisation algorithms available with potential benefits of faster convergence in number of iterations or in computational cost. 
+This is an area of continued development for CIL and, depending on the properties of the :math:`f_i` and the regulariser :math:`g`, there is a range of different options for the user.
+
+
+
 SPDHG
 -----
+Stochastic Primal Dual Hybrid Gradient (SPDHG) is a stochastic version of PDHG and deals with optimisation problems of the form:
+
+ .. math::
+
+  \min_{x} f(Kx) + g(x) = \min_{x} \sum_{i=0}^{n-1} f_i(K_i x) + g(x)
+
+where :math:`f_i` and the regulariser :math:`g` need only be proper, convex and lower semi-continuous (i.e. they do not need to be differentiable).
+Each iteration considers just one index of the sum, potentially reducing computational cost. For more examples see our `user notebooks <https://github.com/vais-ral/CIL-Demos/blob/master/Tomography/Simulated/Single%20Channel/PDHG_vs_SPDHG.py>`_.
+
+
 .. autoclass:: cil.optimisation.algorithms.SPDHG
    :members:
    :inherited-members: run, update_objective_interval, max_iteration
+
+
+Approximate gradient methods
+----------------------------------
+
+Alternatively, consider that, in addition to the functions :math:`f_i` and the regulariser :math:`g` being proper, convex and lower semi-continuous, the :math:`f_i` are differentiable. In this case we consider stochastic methods that replace the gradient calculation in a deterministic algorithm with a potentially cheaper-to-calculate approximate gradient.
+For example, when :math:`g(x)=0`, the standard Gradient Descent algorithm utilises iterations of the form
+
+ .. math::
+    x_{k+1}=x_k-\alpha \nabla f(x_k) =x_k-\alpha \sum_{i=0}^{n-1}\nabla f_i(x_k).
+
+Replacing :math:`\nabla f(x_k)=\sum_{i=0}^{n-1}\nabla f_i(x_k)` with :math:`n \nabla f_i(x_k)`, for an index :math:`i` which changes each iteration, leads to the well-known stochastic gradient descent algorithm.
+
+In addition, if :math:`g(x)\neq 0` and has a calculable proximal map (it need not be differentiable), one can consider ISTA iterations:
+
+ .. math::
+    x_{k+1}=\text{prox}_{\alpha g}(x_k-\alpha \nabla f(x_k))=\text{prox}_{\alpha g}(x_k-\alpha \sum_{i=0}^{n-1}\nabla f_i(x_k))
+
+and again replace :math:`\nabla f(x_k)=\sum_{i=0}^{n-1}\nabla f_i(x_k)` with an approximate gradient.
+
+In a similar way, plugging approximate gradient calculations into deterministic algorithms can lead to a range of stochastic algorithms. In the following table, the left-hand column lists the approximate gradient function subclasses (see :ref:`Approximate Gradient base class`), the header row lists CIL's deterministic optimisation algorithms, and the body of the table gives the resulting stochastic algorithm.
+
++----------------+-------+------------+----------------+
+|                | GD    | ISTA       | FISTA          |
++----------------+-------+------------+----------------+
+| SGFunction     | SGD   | Prox-SGD   | Acc-Prox-SGD   |
++----------------+-------+------------+----------------+
+| SAGFunction\*  | SAG   | Prox-SAG   | Acc-Prox-SAG   |
++----------------+-------+------------+----------------+
+| SAGAFunction\* | SAGA  | Prox-SAGA  | Acc-Prox-SAGA  |
++----------------+-------+------------+----------------+
+| SVRGFunction\* | SVRG  | Prox-SVRG  | Acc-Prox-SVRG  |
++----------------+-------+------------+----------------+
+| LSVRGFunction\*| LSVRG | Prox-LSVRG | Acc-Prox-LSVRG |
++----------------+-------+------------+----------------+
+
+\*In development
+
+The stochastic gradient functions can be found listed under functions in the documentation. A minimal sketch of a proximal stochastic gradient step built in this way is given below, followed by a full stochastic gradient descent example.
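+As an illustration of the table above, the following is a minimal sketch of how an :code:`SGFunction` could be plugged into CIL's :code:`ISTA` algorithm to obtain a proximal stochastic gradient method. It assumes that :code:`list_of_functions` and the image geometry :code:`ig` have been built as in the stochastic gradient descent example below, and the choice of a non-negativity constraint as the non-smooth term :math:`g` is purely illustrative.
+
+.. code-block :: python
+
+    from cil.optimisation.algorithms import ISTA
+    from cil.optimisation.functions import IndicatorBox, SGFunction
+    from cil.optimisation.utilities import Sampler
+
+    # smooth part: the separable sum with a stochastic approximate gradient
+    sampler = Sampler.random_with_replacement(len(list_of_functions))
+    f = SGFunction(list_of_functions, sampler=sampler)
+
+    # non-smooth part: non-negativity constraint, handled through its proximal map
+    g = IndicatorBox(lower=0)
+
+    # ISTA with the approximate gradient gives a proximal stochastic gradient iteration
+    alg = ISTA(initial=ig.allocate(0), f=f, g=g, step_size=1/f.L)
+    alg.run(300)
+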
+
+Stochastic Gradient Descent Example
+------------------------------------
+Below is an example of stochastic gradient descent built from the SGFunction and the Gradient Descent (GD) algorithm:
+
+.. code-block :: python
+
+    from cil.optimisation.utilities import Sampler
+    from cil.optimisation.algorithms import GD
+    from cil.optimisation.functions import LeastSquares, SGFunction
+    from cil.utilities import dataexample
+    from cil.plugins.astra.operators import ProjectionOperator
+
+    # get the data
+    data = dataexample.SIMULATED_PARALLEL_BEAM_DATA.get()
+    data.reorder('astra')
+    data = data.get_slice(vertical='centre')
+
+    # create the geometries
+    ag = data.geometry
+    ig = ag.get_ImageGeometry()
+
+    # partition the data and build the projectors
+    n_subsets = 10
+    partitioned_data = data.partition(n_subsets, 'sequential')
+    A_partitioned = ProjectionOperator(ig, partitioned_data.geometry, device="cpu")
+
+    # create the list of functions for the stochastic sum
+    list_of_functions = [LeastSquares(Ai, b=bi) for Ai, bi in zip(A_partitioned, partitioned_data)]
+
+    # define the sampler and the stochastic gradient function
+    sampler = Sampler.staggered(len(list_of_functions))
+    f = SGFunction(list_of_functions, sampler=sampler)
+
+    # set up and run the gradient descent algorithm
+    alg = GD(initial=ig.allocate(0), objective_function=f, step_size=1/f.L)
+    alg.run(300)
+
+
+
 Operators
 =========
 The two most important methods are :code:`direct` and :code:`adjoint`
@@ -366,6 +474,21 @@ Total variation
    :members:
    :inherited-members:
 
+Approximate Gradient base class
+--------------------------------
+
+.. autoclass:: cil.optimisation.functions.ApproximateGradientSumFunction
+   :members:
+   :inherited-members:
+
+
+Stochastic Gradient function
+-----------------------------
+
+.. autoclass:: cil.optimisation.functions.SGFunction
+   :members:
+   :inherited-members:
+
 Utilities
 =========