Merge branch 'maintenance/v0.1.x' into dev

kevin931 · Jul 10, 2023 · 4143105 · 4143105
2 parents 190d475 + aec8da7
commit 4143105
Show file tree

Hide file tree

Showing 14 changed files with 723 additions and 35 deletions.
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright 2022 Cytomulate Developers
+Copyright 2022-2023 Cytomulate Developers
 
 Permission is hereby granted, free of charge, to any person obtaining a
 copy of this software and associated documentation files (the "Software"),

diff --git a/assets/pycytodata_flowchart.png b/assets/pycytodata_flowchart.png
diff --git a/cytomulate/cell_graph_general.py b/cytomulate/cell_graph_general.py
@@ -32,7 +32,8 @@ def generate_trajectories(self,
         cell_types: dict
             A dictionary of CellType objects
         kwargs:
-            Extra parameters needed for non-default path generation algorithms
+            Extra parameters needed for non-default path generation algorithms, which
+            are passed to ``cytomulate.utilities.trajectories``.
         """
         edges = self.graph.edges
         for e in edges:

diff --git a/cytomulate/creation/cytof_data.py b/cytomulate/creation/cytof_data.py
@@ -23,20 +23,28 @@ def __init__(self,
                  n_markers: int = 20,
                  n_trees: int = 2,
                  background_noise_model: Optional[Union[Callable, dict]] = None) -> None:
-        """Initialize the CreationCytofData object
+        """The Creation Mode object for Cytomulate.
+        
+        This class serves as a starting point for the Creation Mode of Cytomulate. The constructor
+        defines the key parameters of the simulation, including the number of batches, cell types,
+        protein markers, and trees. The number of cells is defined later at a sampling step.
 
         Parameters
         ----------
         n_batches: int
-            Number of batches to be simulated
+            Number of batches to be simulated. All the other parameters apply to every batch.
         n_types: int
-            Number of cell types to be simulated
+            Number of cell types to be simulated.
         n_markers: int
-            Number of markers (columns) to be used
+            Number of protein markers (columns) to be simulated.
         n_trees: int
-            Number of trees in the cell graph
+            Number of trees in the cell graph. Each tree encapsulates and represents the relationship
+            between cell types. Note that it is not necessary to add trajectory in complex simulation
+            even though trees are used in general.
         background_noise_model: Callable or dict
-            The function used to generate background noise. It should only take one input: size
+            The function used to generate background noise. It should only take one input: size. In the
+            cases of multiple batches with different noise models, a dictionary with batch number as keys
+            and function as value is used.
         """
         super().__init__(n_batches, background_noise_model)
 
@@ -63,18 +71,26 @@ def initialize_cell_types(self,
                               scale: float = 0.5,
                               n_components: int = 1,
                               variance_mode: float = 0.01) -> None:
-        """Initialize cell type objects
+        """Initialize cell type models.
+        
+        This method initializes the models for each cell type. Namely, a Gaussian Mixture Model
+        is generated for each cell type at this stage according to the parameters specified.
 
         Parameters
         ----------
         L: int
-            Number of levels of expressions
+            Number of levels of expressions. The levels are used to differentiate between cell types
+            whose expressions for the same marker may be different. We recommend at least 2, but
+            not too many.
         scale: float
-            The scale parameter used in generating expression levels
+            The scale parameter used in generating expression levels' mean, which comes from a
+            truncated normal distribution on the positive reals. The ``scale`` is the standard
+            deviation of the distribution. When the scale is large, the levels of expressions
+            are more spread out, and vice versa.
         n_components: int
-            Number of components in a GMM
+            Number of components in a GMM.
         variance_mode: float
-            The mode of the variance of the inverse wishart distribution
+            The mode of the variance of the inverse Wishart distribution.
         """
         # We first generate high expression levels and low expression levels
         # Truncated normals are used to ensure the ordering
@@ -91,6 +107,9 @@ def initialize_cell_types(self,
     def generate_cell_graph(self, **kwargs) -> None:
         """Generate cell differentiation paths
 
+        This method is part of complex simulation's cellular trajectory simulation. It
+        generates differentiation paths, which will be used at the sampling stage.
+
         Parameters
         ----------
         kwargs:

diff --git a/cytomulate/cytof_data_general.py b/cytomulate/cytof_data_general.py
@@ -23,7 +23,11 @@ class GeneralCytofData:
     def __init__(self,
                  n_batches: int = 1,
                  background_noise_model: Optional[Union[Callable, dict]] = None) -> None:
-        """Initialize the GeneralCytofData object
+        """Initialize the GeneralCytofData object.
+        
+        This is the base class for `CreationCytofData` and `EmulationCytofData`, both of
+        which inherit most of the methods. This class provides functionalities including
+        sampling and complex simulations.
 
         Parameters
         ----------
@@ -53,12 +57,18 @@ def __init__(self,
 
     def generate_cell_abundances(self,
                                  is_random: bool = True) -> None:
-        """Generate cell abundances
+        """Generate cell abundances.
+        
+        This method generates the cell abundance for each batch. The probability
+        of each cell type can be either random or fixed with equal probabilities.
+        See `is_random` parameter for details.
 
         Parameters
         ----------
         is_random: bool
-            Whether the cell abundances should be randomly generated
+            Whether the cell abundances should be randomly generated. If
+            `True`, the abundance of each cell type is sampled from a Dirichlet
+            distribution. If `False`, then all cell types have an equal probability.
         """
         if is_random:
             # If randomly generate cell abundances,
@@ -218,13 +228,16 @@ def sample_one_batch(self,
             Number of samples
         cell_abundances: dict or None
             A dictionary whose keys are the cell labels. The corresponding values should be
-            either the actual number of events for each cell type or the probability of each cell type
+            either the actual number of events for each cell type or the probability of each cell type.
+            If this is not provided, the one stored in the object will be used. Defaults to `None`.
         batch: int
-            The index of the batch for which we want to draw samples
+            The index of the batch for which we want to draw samples. Defaults to 0.
         beta_alpha: float or int
-            The alpha parameter of the beta distribution
+            The alpha parameter of the beta distribution, which should be constrained to the positive reals.
+            Defaults to 0.4.
         beta_beta: float or int
-            The beta parameter of the beta distribution
+            The beta parameter of the beta distribution, which should be constrained to the positive reals.
+            Defaults to 1.0.
 
         Returns
         -------
@@ -355,9 +368,11 @@ def sample(self,
             It can be a plain dictionary whose keys are the cell labels. The corresponding values should be
             either the actual number of events for each cell type or the probability of each cell type
         beta_alpha: float, int, or dict
-            The alpha parameters of the beta distribution
+            The alpha parameters of the beta distribution, which should be constrained to the positive reals.
+            Defaults to 0.4.
         beta_beta: float, int, or dict
-            The beta parameters of the beta distribution
+            The beta parameters of the beta distribution, which should be constrained to the positive reals.
+            Defaults to 0.4.
 
         Returns
         -------

diff --git a/cytomulate/emulation/cytof_data.py b/cytomulate/emulation/cytof_data.py
@@ -14,15 +14,20 @@
 from cytomulate.cytof_data_general import GeneralCytofData
 
 # Typing
-from typing import Union, Optional, Callable
+from typing import Union, Optional, Callable, Tuple, List
 
 
 class EmulationCytofData(GeneralCytofData):
     def __init__(self,
                  n_batches: int = 1,
                  background_noise_model: Optional[Union[Callable, dict]] = None,
                  bead_label: Optional[Union[str, int]] = None) -> None:
-        """Initialize the EmulationCytofData object
+        """The Emulation Mode object for Cytomulate.
+        
+        This class serves as a starting point for the Emulation Mode of Cytomulate. The constructor
+        defines the key parameters of the simulation, including the number of batches. Unlike the
+        Creation mode, other parameters such as the number of protein markers are fixed from the
+        dataset rather than user-specified. The number of cells is defined later at a sampling step.
 
         Parameters
         ----------
@@ -46,21 +51,33 @@ def initialize_cell_types(self,
                               labels: np.ndarray,
                               max_components: int = 9,
                               min_components: int = 1,
-                              covariance_types: Union[list, tuple] = ("full", "tied", "diag", "spherical")) -> None:
+                              covariance_types: Union[List[str], Tuple[str]] = ("full", "tied", "diag", "spherical")) -> None:
         """Initialize cell type models by fitting Gaussian mixtures
         
+        This method fits the GMM models for each cell type. Namely, a Gaussian Mixture Model
+        is generated for each cell type at this stage according to the parameters specified.
+        An extensive model selection procedure based on the Bayesian Information Criterion (BIC)
+        is performed when multiple possibilities of components and covariance types are
+        specified. See details in `max_components` and `covariance_types`.
+        
         Parameters
         ----------
         expression_matrix: np.ndarray
             A matrix containing the expression levels of cell events
         labels: np.ndarray
             A vector of cell type labels
         max_components: int
-            Used for Gaussian mixture model selection. The maximal number of components for a Gaussian mixture
+            The maximal number of components for a Gaussian mixture. Used for Gaussian mixture model selection.
+            This must be greater than or equal to `min_components`. If `max_components` equals `min_components`,
+            the exact number will be used for fitting. Otherwise, a model selection procedure will ensue using
+            Bayesian Information Criterion.
         min_components: int
-            Used for Gaussian mixture model selection. The minimal number of components for a Gaussian mxitrue
+            The minimal number of components for a Gaussian mixture. Used for Gaussian mixture model selection.
+            This must be smaller than or equal to `max_components`. See `max_components` for details on model
+            selection.
         covariance_types: list or tuple
-            Used for Gaussian mixture model selection. The candidate types of covariances
+            The candidate types of covariances used for Gaussian mixture model selection. If only one is specified,
+            no model selection will be performed based on the covariance structure.
         """
         self.n_markers = np.shape(expression_matrix)[1]
 
@@ -92,6 +109,9 @@ def generate_cell_graph(self,
                             graph_topology: str = "forest",
                             **kwargs) -> None:
         """Generate a cell graph as well as differentiation paths
+        
+        This method is part of complex simulation's cellular trajectory simulation. It
+        generates differentiation paths, which will be used at the sampling stage.
 
         Parameters
         ----------
@@ -110,12 +130,26 @@ def generate_cell_abundances(self,
                                  is_random: bool = True) -> None:
         """Generate cell abundances
 
+        Generate the cell abundances for all cell types: namely, the amount
+        of cells in each cell type. This method supports either data-based
+        cell abundance or randomly-generated cell abundance. In the latter
+        case, each cell type's probability can be further randomized.
+
         Parameters
         ----------
         use_observed: bool
             Whether the cell abundances should use the observed ones
         is_random: bool
-            Whether the cell abundances should be randomly generated
+            In the case that `use_observed` is `False`, whether the cell abundances'
+            probability should be randomly generated. If `True`, the abundance of each
+            cell type is sampled from a Dirichlet distribution. If `False`, then all cell
+            types have an equal probability.
+            
+        Note
+        -----
+        If you wish to use the default observed cell abundance from the data,
+        it is not necessary to call this method. Otherwise, you should always
+        set ``use_observed`` to ``False``.
         """
         if use_observed:
             for b in range(self.n_batches):

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -33,9 +33,13 @@ For more details, **read our tutorials and documentations linked below!** Or try
    :maxdepth: 1
    :caption: Tutorial
 
+   tutorial/emulation
+   tutorial/creation
    tutorial/cli
    tutorial/complex
    tutorial/visualization
+   tutorial/pycytodata
+   tutorial/benchmark
 
 .. toctree::
    :maxdepth: 1

diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -70,9 +70,9 @@ Now, you can have the option to output your simulation results in a ``PyCytoData
 
 .. note::
 
-    ``PyCytoData`` requires ``Python>=3.7``, which is more strict than ``cytomulate``.
+    ``PyCytoData`` requires ``Python>=3.7``, which is stricter than ``cytomulate``.
     If you are still running an older version, please consider upgrading.
 
-.. image:: ../../../assets/pycytodata.jpg
+.. image:: ../../assets/pycytodata.jpg
    :width: 600
    :alt: PyCytoData Alliance
diff --git a/docs/source/license.rst b/docs/source/license.rst
@@ -8,7 +8,7 @@ Our project is officially licensed under the MIT license:
     
     The MIT License (MIT)
 
-    Copyright 2022 Cytomulate Developers
+    Copyright 2022-2023 Cytomulate Developers
 
     Permission is hereby granted, free of charge, to any person obtaining a
     copy of this software and associated documentation files (the "Software"),

diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
@@ -72,7 +72,7 @@ this is because Cytomulate can accomodate multiple samples as indexed by
 the dictionary keys. Of course, you can proceed to extract the expression
 matrix and then work with it in downstream analyses.
 
-``PyCytoData`` Output
+PyCytoData Output
 ------------------------
 
 For those of you who are familiar with ``PyCytoData`` or want a cleaner interface
@@ -82,7 +82,7 @@ to work with, Cytomulate can output a ``PyCytoData`` object.
 
     ``PyCytoData`` is required to be installed for this to work. Since it is an
     optional dependency, read our `Installation Guide <https://cytomulate.readthedocs.io/en/dev/installation.html>`_
-    for further details. Once installed, ``PyCytoData`` is fully compatible.
+    for further details. Once installed, ``PyCytoData`` is fully compatible with ``Cytomulate``.
 
 
 To do this, simply use the following method instead:
@@ -237,7 +237,7 @@ Now, let's look at our outputs:
                'Erythroblast'], dtype='<U17')}
 
 
-``PyCytoData`` Output
+PyCytoData Output
 ------------------------
 
 If you have fallen in love with ``PyCytoData``, good news: the emulation mode is compatible with

diff --git a/docs/source/tutorial/benchmark.rst b/docs/source/tutorial/benchmark.rst
@@ -0,0 +1,3 @@
+*************************
+Benchmark Analyses
+*************************