PixelgenTechnologies · ptajvar · Oct 22, 2024 · Oct 15, 2024 · Oct 21, 2024 · Oct 21, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+-   Add `is_potential_doublet` and `n_edges_to_split_doublet` columns to adata.obs.
+-   Add `fraction_potential_doublets` and `n_edges_to_split_potential_doublets` to annotate report.json.
+-   Add `--max-edges-to-split` option to `graph` to specify the maximum number of edges that can be removed between two sub-components during multiplet recovery.
 -   Add `abundance_colocalization_plot` function to make scatter plots of selected marker-pairs' abundance.
 -   Add `plot_polarity_diff_volcano` to make statistical comparison plots of selected component groups.
 -   Add `get_differential_polarity` to statistically compare polarity scores of selected component groups.

diff --git a/src/pixelator/annotate/__init__.py b/src/pixelator/annotate/__init__.py
@@ -376,6 +376,8 @@ class AnnotateAnndataStatistics(typing.TypedDict):
     min_size_threshold: Optional[int]
     max_size_threshold: Optional[int]
     doublet_size_threshold: Optional[int]
+    fraction_potential_doublets: Optional[float]
+    n_edges_to_split_potential_doublets: Optional[int]
 
 
 def anndata_metrics(adata: AnnData) -> AnnotateAnndataStatistics:
@@ -412,6 +414,8 @@ def anndata_metrics(adata: AnnData) -> AnnotateAnndataStatistics:
         "min_size_threshold": None,
         "max_size_threshold": None,
         "doublet_size_threshold": None,
+        "fraction_potential_doublets": None,
+        "n_edges_to_split_potential_doublets": None,
     }
 
     # Tau type will only be available if it has been added in the annotate step
@@ -433,4 +437,14 @@ def anndata_metrics(adata: AnnData) -> AnnotateAnndataStatistics:
     if "doublet_size_threshold" in adata.uns:
         metrics["doublet_size_threshold"] = adata.uns["doublet_size_threshold"]
 
+    if "is_potential_doublet" in adata.obs:
+        metrics["fraction_potential_doublets"] = adata.obs[
+            "is_potential_doublet"
+        ].mean()
+
+    if "n_edges_to_split_doublet" in adata.obs:
+        metrics["n_edges_to_split_potential_doublets"] = adata.obs[
+            "n_edges_to_split_doublet"
+        ].sum()
+
     return metrics
diff --git a/src/pixelator/graph/constants.py b/src/pixelator/graph/constants.py
@@ -5,3 +5,6 @@
 
 MIN_PIXELS_TO_REFINE = 100
 LEIDEN_RESOLUTION = 0.01
+RELATIVE_ANNOTATE_RESOLUTION = (
+    0.5  # A lower resolution is used for annotation of potential doublets
+)
diff --git a/src/pixelator/pixeldataset/utils.py b/src/pixelator/pixeldataset/utils.py
@@ -13,8 +13,10 @@
 import numpy as np
 import pandas as pd
 from anndata import AnnData, ImplicitModificationWarning, read_h5ad
+from graspologic.partition import leiden
 
 from pixelator.graph import components_metrics
+from pixelator.graph.constants import LEIDEN_RESOLUTION, RELATIVE_ANNOTATE_RESOLUTION
 from pixelator.statistics import (
     clr_transformation,
     log1p_transformation,
@@ -217,6 +219,79 @@ def write_anndata(adata: AnnData, filename: PathType) -> None:
     adata.write(filename=filename, compression="gzip")
 
 
+def _compute_sub_communities(
+    component_edgelist: pd.DataFrame, n_edges_reconnect: int | None = None
+) -> pd.Series:
+    component_edgelist = (
+        component_edgelist.groupby(["upia", "upib"], observed=True)["count"]
+        .count()
+        .reset_index()
+        .sort_values(["upia", "upib"])
+    )
+    edgelist_tuple = list(
+        map(tuple, np.array(component_edgelist[["upia", "upib", "count"]]))
+    )
+    component_communities_dict = leiden(
+        edgelist_tuple,
+        resolution=RELATIVE_ANNOTATE_RESOLUTION * LEIDEN_RESOLUTION,
+        random_seed=42,
+    )
+    component_communities = pd.Series(component_communities_dict)
+
+    return component_communities
+
+
+def _assess_doublet(component_edgelist: pd.DataFrame) -> tuple[bool, int]:
+    """Check whether a component is a potential doublet and how many edges should be removed to split it.
+
+    A component is a potential doublet if a) it has more than one community and
+    b) the second largest community is at least 20% of the size of the largest
+    community. A lower resolution is to be used for annotation of potential doublets
+    compared to the component recovery in the graph phase. The reduction factor in
+    annotate resolution is set by RELATIVE_ANNOTATE_RESOLUTION (default is 0.5).
+
+    """
+    component_communities = _compute_sub_communities(component_edgelist)
+    component_community_sizes = component_communities.value_counts().sort_values(
+        ascending=False
+    )
+    if len(component_community_sizes) > 1 and component_community_sizes.iloc[1] > (
+        0.2 * component_community_sizes.iloc[0]
+    ):
+        edges_to_remove = (
+            component_edgelist["upia"].map(component_communities)
+            != component_edgelist["upib"].map(component_communities)
+        ).sum()
+        return True, edges_to_remove
+    else:
+        return False, 0
+
+
+def mark_potential_doublets(
+    edgelist: pd.DataFrame,
+) -> tuple[pd.Series, pd.Series]:
+    """Mark whether a component is a potential doublet.
+
+    A component is a potential doublet if a) it has more than one community and
+    b) the second largest community is at least 20% of the size of the largest
+    community.
+
+    :param edgelist: the edge list dataframe containing component labels.
+    :returns: a boolean series indicating whether a component is a potential doublet.
+    :rtype: pd.Series
+    """
+    is_potential_doublet = pd.Series(index=edgelist["component"].unique(), dtype=bool)
+    n_edges_to_split_doublet = pd.Series(
+        index=edgelist["component"].unique(), dtype=int
+    )
+    for component_id, component_edgelist in edgelist.groupby("component"):
+        is_potential_doublet[component_id], n_edges_to_split_doublet[component_id] = (
+            _assess_doublet(component_edgelist)
+        )
+
+    return is_potential_doublet, n_edges_to_split_doublet
+
+
 def edgelist_to_anndata(
     edgelist: pd.DataFrame,
     panel: AntibodyPanel,
@@ -262,7 +337,10 @@ def edgelist_to_anndata(
     # compute components metrics (obs) and re-index
     components_metrics_df = components_metrics(edgelist=edgelist)
     components_metrics_df = components_metrics_df.reindex(index=counts_df.index)
-
+    (
+        components_metrics_df["is_potential_doublet"],
+        components_metrics_df["n_edges_to_split_doublet"],
+    ) = mark_potential_doublets(edgelist=edgelist)
     # compute antibody metrics (var) and re-index
     antibody_metrics_df = antibody_metrics(edgelist=edgelist)
     antibody_metrics_df = antibody_metrics_df.reindex(index=panel.markers, fill_value=0)

diff --git a/src/pixelator/report/models/annotate.py b/src/pixelator/report/models/annotate.py
@@ -25,6 +25,20 @@ class AnnotateSampleReport(SampleReport):
         description="The fraction of pixels (A and B pixels) in the largest component.",
     )
 
+    fraction_potential_doublets: float | None = pydantic.Field(
+        description=(
+            "The fraction of components that appear to consist of multiple "
+            "parts by the community detection algorithm."
+        ),
+    )
+
+    n_edges_to_split_potential_doublets: int | None = pydantic.Field(
+        description=(
+            "The total number of edges that need to be removed to split the "
+            "potential doublets into their sub-communities."
+        ),
+    )
+
     # ------------------------------------------------------------------------------- #
     #   Annotate metrics
     # ------------------------------------------------------------------------------- #

diff --git a/tests/pixeldataset/test_utils.py b/tests/pixeldataset/test_utils.py
@@ -94,6 +94,8 @@ def test_adata_creation(edgelist: pd.DataFrame, panel: AntibodyPanel):
             "molecules",
             "pixels",
             "reads",
+            "is_potential_doublet",
+            "n_edges_to_split_doublet",
         ]
     )
     assert "clr" in adata.obsm

diff --git a/tests/report/assets/reports_only/annotate/pbmcs_unstimulated.report.json b/tests/report/assets/reports_only/annotate/pbmcs_unstimulated.report.json
@@ -2,6 +2,8 @@
     "sample_id": "pbmcs_unstimulated",
     "fraction_molecules_in_largest_component": 0.0006480881399870382,
     "fraction_pixels_in_largest_component": 0.0004888381945576015,
+    "fraction_potential_doublets": 0.0,
+    "n_edges_to_split_potential_doublets": 0,
     "input_cell_count": 3052,
     "input_read_count": 6237,
     "cell_count": 3052,

diff --git a/tests/report/assets/reports_only/annotate/uropod_control.report.json b/tests/report/assets/reports_only/annotate/uropod_control.report.json
@@ -2,6 +2,8 @@
     "sample_id": "uropod_control",
     "fraction_molecules_in_largest_component": 0.0007451564828614009,
     "fraction_pixels_in_largest_component": 0.000501378791677112,
+    "fraction_potential_doublets": 0.0,
+    "n_edges_to_split_potential_doublets": 0,
     "input_cell_count": 3963,
     "input_read_count": 8117,
     "cell_count": 3963,

diff --git a/.../test_annotate_metrics_lookup/pbmcs_unstimulated/pbmcs_unstimulated_annotate_metrics.json b/.../test_annotate_metrics_lookup/pbmcs_unstimulated/pbmcs_unstimulated_annotate_metrics.json
@@ -2,6 +2,8 @@
     "sample_id": "pbmcs_unstimulated",
     "fraction_molecules_in_largest_component": 0.0006480881399870382,
     "fraction_pixels_in_largest_component": 0.0004888381945576015,
+    "fraction_potential_doublets": 0.0,
+    "n_edges_to_split_potential_doublets": 0,
     "input_cell_count": 3052,
     "input_read_count": 6237,
     "cell_count": 3052,

diff --git a/...annotate/test_annotate_metrics_lookup/uropod_control/uropod_control_annotate_metrics.json b/...annotate/test_annotate_metrics_lookup/uropod_control/uropod_control_annotate_metrics.json
@@ -2,6 +2,8 @@
     "sample_id": "uropod_control",
     "fraction_molecules_in_largest_component": 0.0007451564828614009,
     "fraction_pixels_in_largest_component": 0.000501378791677112,
+    "fraction_potential_doublets": 0.0,
+    "n_edges_to_split_potential_doublets": 0,
     "input_cell_count": 3963,
     "input_read_count": 8117,
     "cell_count": 3963,

diff --git a/tests/report/snapshots/test_annotate/test_annotate_summary/annotate_summary.csv b/tests/report/snapshots/test_annotate/test_annotate_summary/annotate_summary.csv
@@ -1,3 +1,3 @@
-sample_id,fraction_molecules_in_largest_component,fraction_pixels_in_largest_component,input_cell_count,input_read_count,cell_count,marker_count,total_marker_count,molecule_count,a_pixel_count,b_pixel_count,read_count,aggregate_count,molecules_in_aggregates_count,reads_in_aggregates_count,min_size_threshold,max_size_threshold,doublet_size_threshold,size_filter_fail_cell_count,size_filter_fail_molecule_count,size_filter_fail_read_count,pixel_count,fraction_aggregate_components,fraction_reads_in_aggregates,fraction_molecules_in_aggregates,molecule_count_per_cell_mean,molecule_count_per_cell_std,molecule_count_per_cell_min,molecule_count_per_cell_q1,molecule_count_per_cell_q2,molecule_count_per_cell_q3,molecule_count_per_cell_max,molecule_count_per_cell_count,molecule_count_per_cell_iqr,read_count_per_cell_mean,read_count_per_cell_std,read_count_per_cell_min,read_count_per_cell_q1,read_count_per_cell_q2,read_count_per_cell_q3,read_count_per_cell_max,read_count_per_cell_count,read_count_per_cell_iqr,a_pixel_count_per_cell_mean,a_pixel_count_per_cell_std,a_pixel_count_per_cell_min,a_pixel_count_per_cell_q1,a_pixel_count_per_cell_q2,a_pixel_count_per_cell_q3,a_pixel_count_per_cell_max,a_pixel_count_per_cell_count,a_pixel_count_per_cell_iqr,b_pixel_count_per_cell_mean,b_pixel_count_per_cell_std,b_pixel_count_per_cell_min,b_pixel_count_per_cell_q1,b_pixel_count_per_cell_q2,b_pixel_count_per_cell_q3,b_pixel_count_per_cell_max,b_pixel_count_per_cell_count,b_pixel_count_per_cell_iqr,marker_count_per_cell_mean,marker_count_per_cell_std,marker_count_per_cell_min,marker_count_per_cell_q1,marker_count_per_cell_q2,marker_count_per_cell_q3,marker_count_per_cell_max,marker_count_per_cell_count,marker_count_per_cell_iqr,a_pixel_b_pixel_ratio_per_cell_mean,a_pixel_b_pixel_ratio_per_cell_std,a_pixel_b_pixel_ratio_per_cell_min,a_pixel_b_pixel_ratio_per_cell_q1,a_pixel_b_pixel_ratio_per_cell_q2,a_pixel_b_pixel_ratio_per_cell_q3,a_pixel_b_pixel_ratio_per_cell_max,a_pixel_b_pixel_ratio_per_cell_count,a_pixel_b_pixel_ratio_per_cell_iqr,molecule_count_per_a_pixel_per_cell_mean,molecule_count_per_a_pixel_per_cell_std,molecule_count_per_a_pixel_per_cell_min,molecule_count_per_a_pixel_per_cell_q1,molecule_count_per_a_pixel_per_cell_q2,molecule_count_per_a_pixel_per_cell_q3,molecule_count_per_a_pixel_per_cell_max,molecule_count_per_a_pixel_per_cell_count,molecule_count_per_a_pixel_per_cell_iqr,b_pixel_count_per_a_pixel_per_cell_mean,b_pixel_count_per_a_pixel_per_cell_std,b_pixel_count_per_a_pixel_per_cell_min,b_pixel_count_per_a_pixel_per_cell_q1,b_pixel_count_per_a_pixel_per_cell_q2,b_pixel_count_per_a_pixel_per_cell_q3,b_pixel_count_per_a_pixel_per_cell_max,b_pixel_count_per_a_pixel_per_cell_count,b_pixel_count_per_a_pixel_per_cell_iqr,a_pixel_count_per_b_pixel_per_cell_mean,a_pixel_count_per_b_pixel_per_cell_std,a_pixel_count_per_b_pixel_per_cell_min,a_pixel_count_per_b_pixel_per_cell_q1,a_pixel_count_per_b_pixel_per_cell_q2,a_pixel_count_per_b_pixel_per_cell_q3,a_pixel_count_per_b_pixel_per_cell_max,a_pixel_count_per_b_pixel_per_cell_count,a_pixel_count_per_b_pixel_per_cell_iqr
-pbmcs_unstimulated,0.0006480881399870382,0.0004888381945576015,3052,6237,3052,68,80,3086,3077,3060,6237,3052,3086,6237,,,,0,0,0,6137,1.0,1.0,1.0,1.011140235910878,0.1049577584303709,1.0,1.0,1.0,1.0,2.0,3052,0.0,2.043577981651376,0.2592989644447359,2.0,2.0,2.0,2.0,5.0,3052,0.0,1.0081913499344692,0.0901346310843966,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0075360419397117,0.08648265728800528,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0068807339449541,0.09380465569259074,0.5,1.0,1.0,1.0,2.0,3052,0.0,1.0029488859764089,0.05422351932424758,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0
-uropod_control,0.0007451564828614009,0.000501378791677112,3963,8117,3963,68,80,4026,3996,3982,8117,3963,4026,8117,,,,0,0,0,7978,1.0,1.0,1.0,1.015897047691143,0.1309898320781511,1.0,1.0,1.0,1.0,3.0,3963,0.0,2.0481958112541006,0.2970944331488577,2.0,2.0,2.0,2.0,7.0,3963,0.0,1.008327024981075,0.09360744530597291,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.0047943477163765,0.06907504575713982,1.0,1.0,1.0,1.0,2.0,3963,0.0,1.0136260408781226,0.1180891191803105,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.005803684077719,0.09839298617811536,0.5,1.0,1.0,1.0,3.0,3963,0.0,1.0074438556649004,0.0884874529853118,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.0046681806712088,0.0677001125393875,1.0,1.0,1.0,1.0,2.0,3963,0.0,1.0046681806712088,0.0677001125393875,1.0,1.0,1.0,1.0,2.0,3963,0.0
+sample_id,fraction_molecules_in_largest_component,fraction_pixels_in_largest_component,fraction_potential_doublets,n_edges_to_split_potential_doublets,input_cell_count,input_read_count,cell_count,marker_count,total_marker_count,molecule_count,a_pixel_count,b_pixel_count,read_count,aggregate_count,molecules_in_aggregates_count,reads_in_aggregates_count,min_size_threshold,max_size_threshold,doublet_size_threshold,size_filter_fail_cell_count,size_filter_fail_molecule_count,size_filter_fail_read_count,pixel_count,fraction_aggregate_components,fraction_reads_in_aggregates,fraction_molecules_in_aggregates,molecule_count_per_cell_mean,molecule_count_per_cell_std,molecule_count_per_cell_min,molecule_count_per_cell_q1,molecule_count_per_cell_q2,molecule_count_per_cell_q3,molecule_count_per_cell_max,molecule_count_per_cell_count,molecule_count_per_cell_iqr,read_count_per_cell_mean,read_count_per_cell_std,read_count_per_cell_min,read_count_per_cell_q1,read_count_per_cell_q2,read_count_per_cell_q3,read_count_per_cell_max,read_count_per_cell_count,read_count_per_cell_iqr,a_pixel_count_per_cell_mean,a_pixel_count_per_cell_std,a_pixel_count_per_cell_min,a_pixel_count_per_cell_q1,a_pixel_count_per_cell_q2,a_pixel_count_per_cell_q3,a_pixel_count_per_cell_max,a_pixel_count_per_cell_count,a_pixel_count_per_cell_iqr,b_pixel_count_per_cell_mean,b_pixel_count_per_cell_std,b_pixel_count_per_cell_min,b_pixel_count_per_cell_q1,b_pixel_count_per_cell_q2,b_pixel_count_per_cell_q3,b_pixel_count_per_cell_max,b_pixel_count_per_cell_count,b_pixel_count_per_cell_iqr,marker_count_per_cell_mean,marker_count_per_cell_std,marker_count_per_cell_min,marker_count_per_cell_q1,marker_count_per_cell_q2,marker_count_per_cell_q3,marker_count_per_cell_max,marker_count_per_cell_count,marker_count_per_cell_iqr,a_pixel_b_pixel_ratio_per_cell_mean,a_pixel_b_pixel_ratio_per_cell_std,a_pixel_b_pixel_ratio_per_cell_min,a_pixel_b_pixel_ratio_per_cell_q1,a_pixel_b_pixel_ratio_per_cell_q2,a_pixel_b_pixel_ratio_per_cell_q3,a_pixel_b_pixel_ratio_per_cell_max,a_pixel_b_pixel_ratio_per_cell_count,a_pixel_b_pixel_ratio_per_cell_iqr,molecule_count_per_a_pixel_per_cell_mean,molecule_count_per_a_pixel_per_cell_std,molecule_count_per_a_pixel_per_cell_min,molecule_count_per_a_pixel_per_cell_q1,molecule_count_per_a_pixel_per_cell_q2,molecule_count_per_a_pixel_per_cell_q3,molecule_count_per_a_pixel_per_cell_max,molecule_count_per_a_pixel_per_cell_count,molecule_count_per_a_pixel_per_cell_iqr,b_pixel_count_per_a_pixel_per_cell_mean,b_pixel_count_per_a_pixel_per_cell_std,b_pixel_count_per_a_pixel_per_cell_min,b_pixel_count_per_a_pixel_per_cell_q1,b_pixel_count_per_a_pixel_per_cell_q2,b_pixel_count_per_a_pixel_per_cell_q3,b_pixel_count_per_a_pixel_per_cell_max,b_pixel_count_per_a_pixel_per_cell_count,b_pixel_count_per_a_pixel_per_cell_iqr,a_pixel_count_per_b_pixel_per_cell_mean,a_pixel_count_per_b_pixel_per_cell_std,a_pixel_count_per_b_pixel_per_cell_min,a_pixel_count_per_b_pixel_per_cell_q1,a_pixel_count_per_b_pixel_per_cell_q2,a_pixel_count_per_b_pixel_per_cell_q3,a_pixel_count_per_b_pixel_per_cell_max,a_pixel_count_per_b_pixel_per_cell_count,a_pixel_count_per_b_pixel_per_cell_iqr
+pbmcs_unstimulated,0.0006480881399870382,0.0004888381945576015,0.0,0,3052,6237,3052,68,80,3086,3077,3060,6237,3052,3086,6237,,,,0,0,0,6137,1.0,1.0,1.0,1.011140235910878,0.1049577584303709,1.0,1.0,1.0,1.0,2.0,3052,0.0,2.043577981651376,0.2592989644447359,2.0,2.0,2.0,2.0,5.0,3052,0.0,1.0081913499344692,0.0901346310843966,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0075360419397117,0.08648265728800528,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0068807339449541,0.09380465569259074,0.5,1.0,1.0,1.0,2.0,3052,0.0,1.0029488859764089,0.05422351932424758,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0
+uropod_control,0.0007451564828614009,0.000501378791677112,0.0,0,3963,8117,3963,68,80,4026,3996,3982,8117,3963,4026,8117,,,,0,0,0,7978,1.0,1.0,1.0,1.015897047691143,0.1309898320781511,1.0,1.0,1.0,1.0,3.0,3963,0.0,2.0481958112541006,0.2970944331488577,2.0,2.0,2.0,2.0,7.0,3963,0.0,1.008327024981075,0.09360744530597291,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.0047943477163765,0.06907504575713982,1.0,1.0,1.0,1.0,2.0,3963,0.0,1.0136260408781226,0.1180891191803105,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.005803684077719,0.09839298617811536,0.5,1.0,1.0,1.0,3.0,3963,0.0,1.0074438556649004,0.0884874529853118,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.0046681806712088,0.0677001125393875,1.0,1.0,1.0,1.0,2.0,3963,0.0,1.0046681806712088,0.0677001125393875,1.0,1.0,1.0,1.0,2.0,3963,0.0