diff --git a/LICENSE.txt b/LICENSE.txt index 9ca1618..83ebe94 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright 2022 Cytomulate Developers +Copyright 2022-2023 Cytomulate Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), diff --git a/assets/pycytodata_flowchart.png b/assets/pycytodata_flowchart.png new file mode 100644 index 0000000..d932d07 Binary files /dev/null and b/assets/pycytodata_flowchart.png differ diff --git a/cytomulate/cell_graph_general.py b/cytomulate/cell_graph_general.py index 71e2067..057f751 100644 --- a/cytomulate/cell_graph_general.py +++ b/cytomulate/cell_graph_general.py @@ -32,7 +32,8 @@ def generate_trajectories(self, cell_types: dict A dictionary of CellType objects kwargs: - Extra parameters needed for non-default path generation algorithms + Extra parameters needed for non-default path generation algorithms, which + are passed to ``cytomulate.utilities.trajectories``. """ edges = self.graph.edges for e in edges: diff --git a/cytomulate/creation/cytof_data.py b/cytomulate/creation/cytof_data.py index 5f5c107..b3beb83 100644 --- a/cytomulate/creation/cytof_data.py +++ b/cytomulate/creation/cytof_data.py @@ -23,20 +23,28 @@ def __init__(self, n_markers: int = 20, n_trees: int = 2, background_noise_model: Optional[Union[Callable, dict]] = None) -> None: - """Initialize the CreationCytofData object + """The Creation Mode object for Cytomulate. + + This class serves as a starting point for the Creation Mode of Cytomulate. The constructor + defines the key parameters of the simulation, including the number of batches, cell types, + protein markers, and trees. The number of cells is defined later at a sampling step. Parameters ---------- n_batches: int - Number of batches to be simulated + Number of batches to be simulated. All other the parameters apply to every batch. n_types: int - Number of cell types to be simulated + Number of cell types to be simulated. n_markers: int - Number of markers (columns) to be used + Number of protein markers (columns) to be simulated. n_trees: int - Number of trees in the cell graph + Number of trees in the cell graph. Each tree encapsulates and represents the relationship + between cell types. Note that it is not necessary to add trajectory in complex simulation + even though trees are used in general. background_noise_model: Callable or dict - The function used to generate background noise. It should only take one input: size + The function used to generate background noise. It should only take one input: size. In the + cases of multiple batches with different noise models, a dictionary with batch number as keys + and function as value is used. """ super().__init__(n_batches, background_noise_model) @@ -63,18 +71,26 @@ def initialize_cell_types(self, scale: float = 0.5, n_components: int = 1, variance_mode: float = 0.01) -> None: - """Initialize cell type objects + """Initialize cell type models. + + This method initialzes the models for each cell type. Namely, a Gaussian Mixture Model + is generated for each cell type at this stage according to the parameters specified. Parameters ---------- L: int - Number of levels of expressions + Number of levels of expressions. The levels are used to differentiate between cell types + whose expressions for the same marker may be different. We recommend at least 2, but + not too many. scale: float - The scale parameter used in generating expression levels + The scale parameter used in generating expression levels' mean, which comes from a + truncated normal distribution on the positive reals. The ``scale`` is the standard the + deviation of the distribution. When the scale is large, the levels of expressions + are more spead out, and vice versa. n_components: int - Number of components in a GMM + Number of components in a GMM. variance_mode: float - The mode of the variance of the inverse wishart distribution + The mode of the variance of the inverse wishart distribution. """ # We first generate high expression levels and low expression levels # Truncated normals are used to ensure the ordering @@ -91,6 +107,9 @@ def initialize_cell_types(self, def generate_cell_graph(self, **kwargs) -> None: """Generate cell differentiation paths + This method is part of complex simulation's cellular trajectory simulation. It + generates differentiation paths, which will be used at the sampling stage. + Parameters ---------- kwargs: diff --git a/cytomulate/cytof_data_general.py b/cytomulate/cytof_data_general.py index 0d82224..080c799 100644 --- a/cytomulate/cytof_data_general.py +++ b/cytomulate/cytof_data_general.py @@ -23,7 +23,11 @@ class GeneralCytofData: def __init__(self, n_batches: int = 1, background_noise_model: Optional[Union[Callable, dict]] = None) -> None: - """Initialize the GeneralCytofData object + """Initialize the GeneralCytofData object. + + This is the base class for `CreationCytofData` and `EmulationCytofData`, both of + which inherits most of the methods. This class provides functionalities including + sampling and complex simulations. Parameters ---------- @@ -53,12 +57,18 @@ def __init__(self, def generate_cell_abundances(self, is_random: bool = True) -> None: - """Generate cell abundances + """Generate cell abundances. + + This method generates the cell abundane for each batch. The probability + of each cell type can be either random or fixed with equal probabilities. + See `is_random` parameter for details. Parameters ---------- is_random: bool - Whether the cell abundances should be randomly generated + Whether the cell abundances should be randomly generated. If + `True`, the abundance of each cell type is sampled from a dirichlet + distribution. If `False`, then all cell types an have equal probability. """ if is_random: # If randomly generate cell abundances, @@ -218,13 +228,16 @@ def sample_one_batch(self, Number of samples cell_abundances: dict or None A dictionary whose keys are the cell labels. The corresponding values should be - either the actual number of events for each cell type or the probability of each cell type + either the actual number of events for each cell type or the probability of each cell type. + If this is not provided, the one stored in the object will be used. Defaults to `None`. batch: int - The index of the batch for which we want to draw samples + The index of the batch for which we want to draw samples. Defaults to 0. beta_alpha: float or int - The alpha parameter of the beta distribution + The alpha parameter of the beta distribution, which should be contrained to the positive reals. + Defaults to 0.4. beta_beta: float or int - The beta parameter of the beta distribution + The beta parameter of the beta distribution, which should be contrained to the positive reals. + Defaults to 1.0. Returns ------- @@ -355,9 +368,11 @@ def sample(self, It can be a plain dictionary whose keys are the cell labels. The corresponding values should be either the actual number of events for each cell type or the probability of each cell type beta_alpha: float, int, or dict - The alpha parameters of the beta distribution + The alpha parameters of the beta distribution, which should be contrained to the positive reals. + Defaults to 0.4. beta_beta: float, int, or dict - The beta parameters of the beta distribution + The beta parameters of the beta distribution, which should be contrained to the positive reals. + Defaults to 0.4. Returns ------- diff --git a/cytomulate/emulation/cytof_data.py b/cytomulate/emulation/cytof_data.py index d5237fd..952f54a 100644 --- a/cytomulate/emulation/cytof_data.py +++ b/cytomulate/emulation/cytof_data.py @@ -14,7 +14,7 @@ from cytomulate.cytof_data_general import GeneralCytofData # Typing -from typing import Union, Optional, Callable +from typing import Union, Optional, Callable, Tuple, List class EmulationCytofData(GeneralCytofData): @@ -22,7 +22,12 @@ def __init__(self, n_batches: int = 1, background_noise_model: Optional[Union[Callable, dict]] = None, bead_label: Optional[Union[str, int]] = None) -> None: - """Initialize the EmulationCytofData object + """The Emulation Mode object for Cytomulate. + + This class serves as a starting point for the Emulation Mode of Cytomulate. The constructor + defines the key parameters of the simulation, including the number of batches. Unlike the + Creation mode, other parameters such as the number of protein markers are fixed from the + dataset rather than user-soecified. The number of cells is defined later at a sampling step. Parameters ---------- @@ -46,9 +51,15 @@ def initialize_cell_types(self, labels: np.ndarray, max_components: int = 9, min_components: int = 1, - covariance_types: Union[list, tuple] = ("full", "tied", "diag", "spherical")) -> None: + covariance_types: Union[List[str], Tuple[str]] = ("full", "tied", "diag", "spherical")) -> None: """Initialize cell type models by fitting Gaussian mixtures + This method fits the GMM models for each cell type. Namely, a Gaussian Mixture Model + is generated for each cell type at this stage according to the parameters specified. + An extensive model selection procedure based on the Bayesian Information Criterion (BIC) + is performed when multiple possibilities of components and covariance types are + specified. See details in `max_components` and `covariance_types`. + Parameters ---------- expression_matrix: np.ndarray @@ -56,11 +67,17 @@ def initialize_cell_types(self, labels: np.ndarray A vector of cell type labels max_components: int - Used for Gaussian mixture model selection. The maximal number of components for a Gaussian mixture + The maximal number of components for a Gaussian mixture. Used for Gaussian mixture model selection. + This must be smaller or equal to the `max_components`. If `max_components` equals `min_components`, + the exact number will be used for fitting. Otherwise, a model selection procedure will ensue using + Bayesian Information Criterion. min_components: int - Used for Gaussian mixture model selection. The minimal number of components for a Gaussian mxitrue + The minimal number of components for a Gaussian mxitrue. Used for Gaussian mixture model selection. + This must be smaller or equal to the `max_components`. See `max_components` for details on model + selection. covariance_types: list or tuple - Used for Gaussian mixture model selection. The candidate types of covariances + The candidate types of covariances used for Gaussian mixture model selection. If only one is specified, + no model selection will be performed based on the covariance structure. """ self.n_markers = np.shape(expression_matrix)[1] @@ -92,6 +109,9 @@ def generate_cell_graph(self, graph_topology: str = "forest", **kwargs) -> None: """Generate a cell graph as well as differentiation paths + + This method is part of complex simulation's cellular trajectory simulation. It + generates differentiation paths, which will be used at the sampling stage. Parameters ---------- @@ -110,12 +130,26 @@ def generate_cell_abundances(self, is_random: bool = True) -> None: """Generate cell abundances + Generate the cell abundances for all cell types: namely, the amount + of cells in each cell type. This method supports either data-based + cell abundance or randomly-generated cell abundance. In the latter + case, each cell type's probability can be further randomized. + Parameters ---------- use_observed: bool Whether the cell abundances should use the observed ones is_random: bool - Whether the cell abundances should be randomly generated + In the case that `user_obsersed` is `False`, whether the cell abundances' + probability should be randomly generated. If `True`, the abundance of each + cell type is sampled from a dirichlet distribution. If `False`, then all cell + types an have equal probability. + + Note + ----- + If you wish to use the default observed cell abundance from the data, + it is not necessary to call this method. Otherwise, you should always + set ``used_observed`` to ``False``. """ if use_observed: for b in range(self.n_batches): diff --git a/docs/source/index.rst b/docs/source/index.rst index b20423e..42f4ae0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -33,9 +33,13 @@ For more details, **read our tutorials and documentations linked below!** Or try :maxdepth: 1 :caption: Tutorial + tutorial/emulation + tutorial/creation tutorial/cli tutorial/complex tutorial/visualization + tutorial/pycytodata + tutorial/benchmark .. toctree:: :maxdepth: 1 diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 2ce36be..6ea7abc 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -70,9 +70,9 @@ Now, you can have the option to output your simulation results in a ``PyCytoData .. note:: - ``PyCytoData`` requires ``Python>=3.7``, which is more strict than ``cytomulate``. + ``PyCytoData`` requires ``Python>=3.7``, which is stricter than ``cytomulate``. If you are still running an older version, please consider upgrading. -.. image:: ../../../assets/pycytodata.jpg +.. image:: ../../assets/pycytodata.jpg :width: 600 :alt: PyCytoData Alliance diff --git a/docs/source/license.rst b/docs/source/license.rst index 552ef38..e6e00e4 100644 --- a/docs/source/license.rst +++ b/docs/source/license.rst @@ -8,7 +8,7 @@ Our project is officially licensed under the MIT license: The MIT License (MIT) - Copyright 2022 Cytomulate Developers + Copyright 2022-2023 Cytomulate Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 801044d..7634137 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -72,7 +72,7 @@ this is because Cytomulate can accomodate multiple samples as indexed by the dictionary keys. Of course, you can procceed to extract the expression matrix and then work with it in downstream analyses. -``PyCytoData`` Output +PyCytoData Output ------------------------ For those of you who are familiar with ``PyCytoData`` or want a cleaner interface @@ -82,7 +82,7 @@ to work with, Cytomulate can output a ``PyCytoData`` object. ``PyCytoData`` is required to be installed for this to work. Since it is an optional dependency, read our `Installation Guide `_ - for further details. Once installed, ``PyCytoData`` is fully compatible. + for further details. Once installed, ``PyCytoData`` is fully compatible with ``Cytomulate`` To do this, simply use the following method instead: @@ -237,7 +237,7 @@ Now, let's look at our outputs: 'Erythroblast'], dtype='>> from cytomulate import CreationCytofData + +In this tutorial, we're not showcasing PyCytoData since it's purely downstream from what we're doing! See +our `QuickStart Guide `_ on how to output a +``PyCytoData`` object from Cytomulate and we will point out at the appropriate place as well. + + +------------------------------- + +******************* +Simple Simulation +******************* + +In the simplest setting, a Creation Mode simulation looks like the following: + +.. code:: python + >>> cytof_data = CreationCytofData(n_batches = 1, + ... n_types = 10, + ... n_markers = 20, + ... n_trees = 2) + >>> cytof_data.initialize_cell_types(L = 4, + ... scale = 0.5, + ... n_components = 1, + ... variance_mode = 0.01) + >>> expression_matrices, labels, _, _ = cytof_data.sample(n_samples = 100) + +Again, we risk being a broken record by writing out all the defaults in this simulation! +Given the nature of the creation mode, there are many more options to contend with. Here +is a list of default options, and we explain some of the more obscure ones in detail: + +- 1 batch +- 10 cell types +- 20 protein markers +- 2 differentiation trees: Note that trees merely represent relationships between cell types, which is necessary even without trajectory. +- 100 cells in total + +Then, all the parameters in the ``initialized_cell_types`` methods pertain to the simulation model, which will be explained later in more +complex simulation situations. + +.. note:: + + If you wish to use PyCytoData as output format, replace the ``cytof_data.sample()`` method + with ``cytof_data.sample_to_pycytodata()``. The interface otherwise remains unchanged. + +-------------------------------------- + +********************************* +Fine Tuning of Cytomulate Models +********************************* + +The Creation Mode also uses Gaussian Mixture Models (GMMs), which sounds awfully similar to the Emulation Mode, but the key difference +is that we simulate the marker pattens and statistics necessary for each GMM rather than learning from data. This part of the +tutorial walks you through some of the details in the implementation. + +Cell Types and Expressions +----------------------------- + +With the Creation Mode, we have the assumption that cell types are identified via gating in real life. For example, we use CD4 and CD8 +as markers to identify T cells. For a given cell type, we can thus some channels to be highly expressed whereas other channels to be +not as highly expressed. This framework serves as the foundation of Cytomulate's Creation Mode. Internally, we use a matrix to encode +the expression pattern for each cell type, where as levels of expression for each channel is encoded. You can tune these parameters +to make sure that diverse pattrns can be generated. + +The first parameter you can tune is the levels of expressions for all channels, which is the ``L`` parameter in the ``initialize_cell_types`` +method. Intuitively, we at least need two levels because otherwise all cell types will have the same pattern. By default, we +used 4 levels, where the smallest level represents "not expressed at all" and the highest level represents "most highly expressed". +For simple datasets with few cell types, you a smaller ``L`` may be fine, but in general, we think that the default will work well. +You can tune this parameter by using the following snippet (all other parameters are defaults): + +.. code:: python + + >>> cytof_data = CreationCytofData() + >>> cytof_data.initialize_cell_types(L = 2) + + +Once you have the levels determined, you can decide how different these levels are because the levels are generated using a +truncated normal distribution. If you wish to have more spead-out levels, a large variance is needed and vice verse. In +Cytomulate, we use the ``scale`` parameter because the truncated Normal belongs to the Location-Scale family. By default, +the scale (standard deviation) is 0.5, and you can change it to any positive real you prefer: + +.. code:: python + + >>> cytof_data = CreationCytofData() + >>> cytof_data.initialize_cell_types(scale = 0.9) + + +Generating Expression Patterns +------------------------------- + +Once the above parameters are set, the expression patterns will be generated by calling the ``initialize_cell_types`` method +as shown. The expression pattern is not shown and Cytomulate will have it taken care of. An example of tuning both parameters +is like this: + +.. code:: python + + >>> cytof_data = CreationCytofData() + >>> cytof_data.initialize_cell_types(L = 2, + ... scale = 0.9) + + +Fitting GMMs +---------------- + +Finally, we fit a GMM for each cell type using the expression patterns generated above, which are used as mean parameters. +Here, we recommend the defaults, which just uses 1 component. In real data, we often need more components because the +expressions can have many modes and cell-typing is not always a perfect procedures. However, the Creation Mode makes +things a bit easier. If you wish to use more components, you still can: + + +.. code:: python + + >>> cytof_data = CreationCytofData() + >>> cytof_data.initialize_cell_types(n_components = 2) + +Besides the number if components, there is an additional parameter called ``variance_mode``, which is used to +calculate shape and rate parameters used for the Inverse Wishert Distribution. We do not recommend changing this +parameter. + + diff --git a/docs/source/tutorial/emulation.rst b/docs/source/tutorial/emulation.rst new file mode 100644 index 0000000..bd5f5d1 --- /dev/null +++ b/docs/source/tutorial/emulation.rst @@ -0,0 +1,389 @@ +#################### +Emulation Mode +#################### + +Sometimes when we already have data on hand and we can't help but wanting more datasets, then this is the +time to Cytomulate with the **Emulation Mode**. As introduced in the `Quickstart Guide `_, +the Emulation Mode allows users to input their own datasets and then simulate data based on the given +dataset. This is useful in a number of scenarios: + +1. More datasets are needed to validate a method. +2. The current dataset has only 1 batch, but more than 1 is desired. +3. Biological interpretation with real protein channels and cell types are desired. +4. Analyses call for a basis on real data. + +Of course, there are many more reasons to pick Emulation Mode over Creation Mode. The only prerequisite +is that users have to have their own dataset already. If not, they will have to find a dataset online +or actually perform some experiments in the lab. But for the sake of this tutorial, we will assume +that everyone has a dataset available to be emulated. + +******************* +Setting Things Up +******************* + +Before we start, let's make some necessary imports: + +.. code-block:: python + + >>> from cytomulate import EmulationCytofData + >>> from PyCytoData import DataLoader, FileIO, PyCytoData + +Throughout the rest of the tutorial, we will use these functions directly. + +.. note:: + ``PyCytoData`` is optional for the demonstration here, and it is not a required dependency + for ``Cytomulate`` unless the CLI is used. We include examples with ``PyCytoData`` for + its convenience in Python. + +------------------------------ + +*************** +Load Datasets +*************** + +First things first, we need to load our existing dataset into Python. For this tutorial, we would +like to assume that the data are stored in plain text (*e.g.* txt, csv, etc.) rather than the ``fcs`` +format. We understand that the latter is popular in the field, but we do not yet have a good tutorial +on this. We would defer to the documentation of tools such as `fcspasrser `_ +or `flowCore `_. If you do have +``fcs`` files, you can use these tools to load the files, extract the expression matrices and cell +types, and then save the results as plain text files. + + +In this section, we will showcase a few methods of loading the datasets. For all methods, we will +save the expression matrix as ``dataset`` whereas the cell types as ``cell_types``, both of which +are NumPy arrays. Further, we generally assume that you don't have instrument channels, such as +"Time" and "Bead", in your dataset! If you do, you may need some preprocssing before passing them to +Cytomulate. + +The `PyCytoData` Method +------------------------ + +We highly recommend the usage of PyCytoData to load the data into Python because it is quite easy +to do! If you have the data on hand, you can quickly load the expression matrix and cell types +with the following snippet of code: + +.. code-block:: python + + >>> dataset = FileIO.load_delim("", skiprows = 1, delim = "\t", dtype = float) + >>> cell_types = FileIO.load_delim("", skiprows = 0, delim = "\t", dtype = str) + +You will replace the the paths with the actual paths to your files. Note that in the above example, we assume +that the expression matrix has the first row as protein marker names, which will be skipped. We further assume +that the file is saved as a tab-delimited file. You can customize the process to your own files. + + +If you would like to use the ``PyCytoData`` object, you can use the following example: + +.. code-block:: python + + >>> data = FileIO.load_expression("", col_names = True, delim = "\t", dtype = float) + >>> cell_types = FileIO.load_delim("", skiprows = 0, delim = "\t", dtype = str) + >>> data.cell_types = cell_types + +This has the advantage of storing the channel names in the object via the ``data.channels`` attribute. Now, +the only difference is that you need to use ``data.expression_matrix`` to access your expression +matrix. + +PyCytoData has many other features and details which are all helpful in some way, but we cannot rewrite +the documentation here. You can read go ahead and read the `documentation `_ +if interested. + + +The `numpy` Method +--------------------- + +The PyCytoData's IO methods are actually built on top of the NumPy package. If you wish to use the NumPy directly, +it is also easy to do so: + +.. code-block:: python + + >>> import numpy as np + >>> dataset = np.loadtxt("", skiprows = 1, delim = "\t", dtype = float) + >>> cell_types = np.loadtxt("", skiprows = 0, delim = "\t", dtype = str) + +which has a similar interface to the PyCytoData interface. This is arguably a simpler interface. However, NumPy does +not have the benefits of a PyCytoData object, which you may or may not need. + +The `pandas` Method +-------------------- + +Of course, if you would like to read data using Pandas, you can do so with ease as well. We are going to read the +data and extract the necessary components to be used by Cytomulate. Here, we assume a few things for demonstration +purposes: + +1. Everything is in one file, including a column with cell types. +2. The cell type column is named "Cell_Types". +3. Column names are available. + +.. code-block:: python + + >>> import pandas as pd + >>> df = pd.read_csv("", sep = "\t", header = 0) + >>> cell_types = df["Cell_Types"].to_numpy() + >>> dataset = df.drop("Cell_Types", axis = 1).to_numpy() + +As with all other methods, there are tons of variations based on the data available! You likely need some +daat wrangling, but the principles are simple! + + +---------------------------------- + +******************* +Simple Simulation +******************* + +Once you have your datasets, you can run your simplest simulation: + +.. code-block:: python + + >>> cytof_data = EmulationCytofData(n_batches = 1) + >>> cytof_data.initialize_cell_types(expression_matrix = dataset, + labels = cell_types, + max_components = 9, + min_components = 1, + covariance_types = ("full", "tied", "diag", "spherical")) + >>> expression_matrices, labels, _, _ = cytof_data.sample(n_samples = 100) + + +As opposed to the `Quickstart Guide `_, here +we are more explicit about the parameters that users can supply or change. Note that all above +listed parameters are defaults, except for the ``expression_matrix`` and ``labels`` that need to be +user supplied. In the above function calls, we are doing the following: + +1. Initialize a ``EmulationCytofData`` object with 1 batch. +2. Ask the object to emulate the given ``dataset`` with the given ``cell_types``. The GMM model is allowed to use between 1 and 9 components with one of the four covariance types. +3. We sample 100 cells in total for the dataset. + +We would like to note that ``n_samples`` indicates the sample size, not the number of batches. In this case, we have +the following outputs: + +.. code-block:: python + + >>> expression_matrices + {0: array([[158.04318784, 143.54615373, 14.55784182, ..., 28.23468037, + 2.053792 , 0. ], + [1.00986273, 6.6360988 , 0. , ..., 6.54024249, + 1.7951401 , 19.13473223], + [4.30063734, 5.7733213 , 0. , ..., 23.30509231, + 8.80240499, 21.81913369], + ..., + [191.03235818, 18.41559666, 0. , ..., 333.3101886 , + 0. , 215.16717053], + [310.09391901, 123.0910225 , 0. , ..., 0. , + 0. , 164.39680483], + [382.54861993, 8.39272611, 3.83182708, ..., 48.24234018, + 3.20327527, 184.68550649]])} + >>> labels + {0: array(['Mature CD38lo B', 'Erythroblast', 'Erythroblast', + 'CD11bhi Monocyte', 'Mature CD38lo B', 'NotGated', 'NotGated', + 'CD11b- Monocyte', 'NotGated', 'NotGated', 'NotGated', 'NotGated', + 'NotGated', 'NotGated', 'NotGated', 'NotGated', 'Mature CD4+ T', + 'NotGated', 'NotGated', 'NotGated', 'NotGated', 'Mature CD8+ T', + 'NotGated', 'Mature CD4+ T', 'NotGated', 'Mature CD4+ T', + 'NotGated', 'NotGated', 'NotGated', 'NotGated', 'Mature CD4+ T', + 'NotGated', 'NotGated', 'NotGated', 'NotGated', 'Erythroblast', + 'NotGated', 'NotGated', 'CD11bhi Monocyte', 'NotGated', 'NotGated', + 'CD11bhi Monocyte', 'CD11bmid Monocyte', 'NotGated', + 'Naive CD4+ T', 'Erythroblast', 'Plasmacytoid DC', 'Naive CD8+ T', + 'NotGated', 'Erythroblast', 'NotGated', 'CD11bhi Monocyte', + 'Megakaryocyte', 'NotGated', 'Mature CD4+ T', 'NotGated', + 'Mature CD4+ T', 'Mature CD4+ T', 'NotGated', 'NK', 'NotGated', + 'Naive CD8+ T', 'NotGated', 'NotGated', 'Mature CD8+ T', + 'NotGated', 'NK', 'NotGated', 'Mature CD8+ T', 'NotGated', + 'NotGated', 'NotGated', 'Mature CD4+ T', 'Mature CD38lo B', + 'CD11bhi Monocyte', 'NotGated', 'Mature CD38lo B', 'Naive CD8+ T', + 'Mature CD4+ T', 'NotGated', 'NotGated', 'Erythroblast', + 'NotGated', 'NotGated', 'NotGated', 'NotGated', 'Mature CD4+ T', + 'Mature CD4+ T', 'Erythroblast', 'Mature CD38lo B', 'Erythroblast', + 'NotGated', 'NotGated', 'Naive CD8+ T', 'NotGated', + 'Mature CD8+ T', 'NotGated', 'NotGated', 'Naive CD8+ T', + 'Mature CD8+ T'], dtype='`_. + +------------------------------ + +************************* +GMM and Model Selection +************************* + +Cytomulate supports numerous model configurations for each dataset. Since the overall model is based on the +Gaussian Mixture Model (GMM), there are two major tuning parameters that are needed: + +1. The number of components +2. The covariance matrix structure + +As you can see from the example above, you can specify these manually using the simple simulation or go +with the defaults. If you have a specific configuration in mind (e.g. 3 components with a full covariance +matrix), you can simulate with the following: + + +.. code-block:: python + + >>> cytof_data = EmulationCytofData(n_batches = 1) + >>> cytof_data.initialize_cell_types(expression_matrix = dataset, + labels = cell_types, + max_components = 3, + min_components = 3, + covariance_types = "full") + +which tells Cytomulate to use 3 components exactly with a full covariance matrix. + +For components, you can choose any you like (except that they should be positive integers! We can't owe you +any components even if you wish!). For covariance matrix, you have four choices: "full", "diag", "tied", or +"spherical". Look at ``sklearn``'s +`documentation here `_ +for an explanation of how each of the four options work. In short, if you want everything, the "full" is a +good choice; or if you want independent channels, you need to choose "diag". While we don't recommend the +latter as we saw that correlation between protein channels are present, you can certainly try it if you +see a need. + + +Model Selection +---------------- + +As evident from the example above, there are many choices regarding these two crucial parameters. By default, +Cytomulate performs a model selection procedure using the +`Bayesian Information Criterion (BIC) `_ +for model selection. We perform this for both the number of components and the covariance structure. + +For example, if you wish to slect between 1 and 5 components (inclusive) but with the full covariance +matrix, you can run the following code: + +.. code-block:: python + + >>> cytof_data = EmulationCytofData(n_batches = 1) + >>> cytof_data.initialize_cell_types(expression_matrix = dataset, + labels = cell_types, + max_components = 5, + min_components = 1, + covariance_types = "full") + + +Or if you wish to fix the number of components while selecting the type of covariance matrices, +run this instead: + +.. code-block:: python + + >>> cytof_data = EmulationCytofData(n_batches = 1) + >>> cytof_data.initialize_cell_types(expression_matrix = dataset, + labels = cell_types, + max_components = 5, + min_components = 5, + covariance_types = ("full", "tied", "diag", "spherical")) + +Note that as long as ``max_component == min_components``, the specified number will be used. Also, +you can specify a subset of teh ``covariance_types`` without having to run all four. + +Whether you wish to run model selection or not is up to your dataset and need. By default, Cytomulate +does perform model selection to allow for the best fit for the data. The downside is that this process +does slow down the algorithm because the model needs to be fit multiple times. + +Defaults and Recommendations +------------------------------ + +As mentioned, model selection is performed by Cytomulate. If you would like to select the parameters +yourself, we have a few heuristics that may be helpful: + +1. When in doubt, use the "full" covariance matrix to allow for the most flexible model. +2. the default betwene 1 and 9 components per cell type is often sufficient (hence the default). +3. If setting the components manually, perform an exploratory data analysis on your data first to see the data. Increase the numner of components if Cytomulate doesn't fit well. + +Of course, there is no one-size-fit-all recommendation! Try it and find out! + + +---------------------------------- + + +**************************** +Cell Abundance +**************************** + +First, let us define cell abundance: The proportion of each cell type in a given sample. From a first +glance, this is incredibly vague because, and then, it may occur to people that this is somewhat trivial +if we are not dealing with simulated datasets. However, being a good simulator, Cytomulate allows users +to tweak the cell abundance in their dataset or use a real-data-based decision rule for simulation. But +before diving into details, we will discuss a bit of context for Cytomulate. + +As defined in our paper, Cytomulate fits a GMM for each cell type, meaning that there are as many GMMs +as cell types. The cell types are user specified naturally. It may be tempting to fit one single GMM +to all the data, which is definitely possible, but the caveate is that a very complex model is needed. +So Cytomulate tries to take advantage of cell types or clusters as given by the user. Then, when the GMMs +are estimated, the next step is to sample from them, which is the ``sample()`` method seen earlier. Recall +that the method takes only 1 argument, which is the number of cells in the output rather than the number +of cells in each cell type. Thus, this section deals with how we estimate the makeup of our dataset. + +Default: Data-Based +----------------------- + +By default, we learn from the dataset itself. So if you use the following snippet as shown earlier, + +.. code-block:: python + + >>> cytof_data = EmulationCytofData(n_batches = 1) + >>> cytof_data.initialize_cell_types(expression_matrix = dataset, + labels = cell_types, + max_components = 9, + min_components = 1, + covariance_types = ("full", "tied", "diag", "spherical")) + >>> expression_matrices, labels, _, _ = cytof_data.sample(n_samples = 100) + +Cytomulate will estimate the proportion of cells from real data. This is generally not affected by other +parameters, such as the number of components and the number of batches. We recommend the default if +you wish the simulated dataset to be as similar to the simulated data as possible. + + +Random Cell Abundance +----------------------- + +There are some situations in which you may want to change the cell abundance, such as + +- inducing more variance +- wanting more rare cell types +- needing some difference from the reference dataset. + +In such cases, we can allow cell abundance to be randomly generated. There are in general two settings you +can choose. If you wish each cell type to have **equal probability**, then you should run the following +snippet: + +.. code-block:: python + + >>> cytof_data = EmulationCytofData(n_batches = 1) + >>> cytof_data.initialize_cell_types(expression_matrix = dataset, + labels = cell_types, + max_components = 9, + min_components = 1, + covariance_types = ("full", "tied", "diag", "spherical")) + >>> cytof_data.generate_cell_abundance(use_observed = False, + is_random = False) + +.. note:: + It is uncessary to call ``generate_cell_abundance`` if you wish to use the observed cell abundance + since it's the default behavior. + + +On the other hand, if you wish to add some randomness into the mix, the probability for each cell type +can be generated with the Dirichlet distribution by simply setting ``is_random`` to ``True``: + +.. code-block:: python + + >>> cytof_data = EmulationCytofData(n_batches = 1) + >>> cytof_data.initialize_cell_types(expression_matrix = dataset, + labels = cell_types, + max_components = 9, + min_components = 1, + covariance_types = ("full", "tied", "diag", "spherical")) + >>> cytof_data.generate_cell_abundance(use_observed = False, + is_random = True) + +This last example is especially useful is you wish to have a sample that is slightly different from +real samples. \ No newline at end of file diff --git a/docs/source/tutorial/pycytodata.rst b/docs/source/tutorial/pycytodata.rst new file mode 100644 index 0000000..098fdbb --- /dev/null +++ b/docs/source/tutorial/pycytodata.rst @@ -0,0 +1,77 @@ +############################## +Should I install PyCytoData? +############################## + + +In case you haven't heard, `PyCytoData `_ provides a unified +framework for analyzing CyTOF data in python. The supported workflow includes File IO, preprocessing, and +dimension reduction and visualization via CytofDR. While we believe in the future of PyCytoData, we recognize +that many people prefer a leaner package without the baggage of an ecosystem. Thus, we kindly made ``PyCytoData`` +to only those who need it. In this tutorial, we walk you through whether you need to install ``PyCytoData`` +and help you make the best decisions. + + +------------------------------ + +********************* +Decision at a Glance +********************* + +If you need a TLDR, we create a nifty flowchart for you to follow. At each step, ask yourself the question +and you will arrive at a conclusion. We want to emphasize that currently only the Cytomulate CLI strictly +requires ``PyCytoData`` because we don't want to maintain two branches of the same IO code. For all other +workflows, you can judge the necessity of Cytomulate yourself. + + +.. image:: ../../../assets/pycytodata_flowchart.png + :width: 600 + :alt: A flowchart for installing PyCytoData. + +----------------------------------- + +***************************** +How is PyCytoData optional? +***************************** + +Currently, we need ``PyCytoData`` for two functionalities: + +- The Cytomulate CLI +- Outputting to a PyCytoData object + +For the former, we require ``PyCytoData`` because we rely on its File IO operations so that we don't have +reinvent a great module. For the latter, users have the option to use the ``sample_to_pycytodata`` method +or just the regular ``sample`` method. In the case that ``PyCytoData`` is not installed, user can have +NumPy arrays returned instead. As you may see, we do not rely on ``PyCytoData`` for the core model and +estimation procedures. ``PyCytoData`` is rather just a convenience feature for those who need it. + +In terms of implementation, we check whether ``PyCytoData`` is importable at the beginning. If not, no +``ImportError`` is raised until absolutely necessary. This means that unless users explicitly call +``smaple_to_pycytodata`` without a proper installation, we do not complain! Of course, this implementation +has its downsides: namely, it can surprise those who are not aware of the optional dependency. We will +revisit this in the future. + + +*************************************** +What are added benefits of PyCytoData? +*************************************** + +The main reason to use ``PyCytoData`` is its convenience. Specifically, it can handle batches, metadata, +and working with expression matrices easily. For those who want a wrapper around arrays, we believe that +``PyCytoData`` is a good fit. Of course, let's not forget the File IO capabilities and downstream integration +with CytofDR. + +On the note of downstream analyses, ``PyCytoData`` is being actively developed. So, there will be some neat +features coming in the futrue when we add more tools and integrate more libraries. + +*********************************************** +Should I skip PyCytoData for CytofDR directly? +*********************************************** + +If you are asking this question, then **YES**! The ``CytofDR`` package provides a more complete set of +tools for benchmarking and performing DR in CyTOF. While ``PyCytoData`` can have workarounds to allow +these functionalities, we recommend using ``CytofDR`` directly. However, you are always welcomed to +install ``PyCytoData`` to manage other aspects of the CyTOF workflow. + + + +