Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding annotation of potential doublets to anndata obs #193

Merged
merged 17 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Add `is_potential_doublet` and `n_edges_to_split_doublet` columns to adata.obs.
- Add `fraction_potential_doublets` and `n_edges_to_split_potential_doublets` to annotate report.json.
- Add `--max-edges-to-split` option to `graph` to specify the maximum number of edges that can be removed between two sub-components during multiplet recovery.
- Add `abundance_colocalization_plot` function to make scatter plots of selected marker-pairs' abundance.
- Add `plot_polarity_diff_volcano` to make statistical comparison plots of selected component groups.
- Add `get_differential_polarity` to statistically compare polarity scores of selected component groups.
Expand Down
14 changes: 14 additions & 0 deletions src/pixelator/annotate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,8 @@ class AnnotateAnndataStatistics(typing.TypedDict):
min_size_threshold: Optional[int]
max_size_threshold: Optional[int]
doublet_size_threshold: Optional[int]
fraction_potential_doublets: Optional[float]
n_edges_to_split_potential_doublets: Optional[int]


def anndata_metrics(adata: AnnData) -> AnnotateAnndataStatistics:
Expand Down Expand Up @@ -412,6 +414,8 @@ def anndata_metrics(adata: AnnData) -> AnnotateAnndataStatistics:
"min_size_threshold": None,
"max_size_threshold": None,
"doublet_size_threshold": None,
"fraction_potential_doublets": None,
"n_edges_to_split_potential_doublets": None,
}

# Tau type will only be available if it has been added in the annotate step
Expand All @@ -433,4 +437,14 @@ def anndata_metrics(adata: AnnData) -> AnnotateAnndataStatistics:
if "doublet_size_threshold" in adata.uns:
metrics["doublet_size_threshold"] = adata.uns["doublet_size_threshold"]

if "is_potential_doublet" in adata.obs:
metrics["fraction_potential_doublets"] = adata.obs[
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there are no is_potential_doublet here, what will it happen when run mean?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There will be a key error, that's why we check for it first.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But when adata.obs["is_potential_doublet"] returns empty and we called mean() - what happens then?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We only run the mean() function if "is_potential_doublet" exists as a column in adata.obs. Otherwise that part is not run and "fraction_potential_doublets" remains the default value which is 0 right now but following the discussion we're going to change it to None.

"is_potential_doublet"
].mean()

if "n_edges_to_split_doublet" in adata.obs:
metrics["n_edges_to_split_potential_doublets"] = adata.obs[
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same with sum

"n_edges_to_split_doublet"
].sum()

return metrics
3 changes: 3 additions & 0 deletions src/pixelator/graph/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@

MIN_PIXELS_TO_REFINE = 100
LEIDEN_RESOLUTION = 0.01
RELATIVE_ANNOTATE_RESOLUTION = (
0.5 # A lower resolution is used for annotation of potential doublets
)
80 changes: 79 additions & 1 deletion src/pixelator/pixeldataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
import numpy as np
import pandas as pd
from anndata import AnnData, ImplicitModificationWarning, read_h5ad
from graspologic.partition import leiden

from pixelator.graph import components_metrics
from pixelator.graph.constants import LEIDEN_RESOLUTION, RELATIVE_ANNOTATE_RESOLUTION
from pixelator.statistics import (
clr_transformation,
log1p_transformation,
Expand Down Expand Up @@ -217,6 +219,79 @@ def write_anndata(adata: AnnData, filename: PathType) -> None:
adata.write(filename=filename, compression="gzip")


def _compute_sub_communities(
ptajvar marked this conversation as resolved.
Show resolved Hide resolved
component_edgelist: pd.DataFrame, n_edges_reconnect: int | None = None
) -> pd.Series:
component_edgelist = (
component_edgelist.groupby(["upia", "upib"], observed=True)["count"]
.count()
.reset_index()
.sort_values(["upia", "upib"])
)
edgelist_tuple = list(
map(tuple, np.array(component_edgelist[["upia", "upib", "count"]]))
)
component_communities_dict = leiden(
edgelist_tuple,
resolution=RELATIVE_ANNOTATE_RESOLUTION * LEIDEN_RESOLUTION,
random_seed=42,
)
component_communities = pd.Series(component_communities_dict)

return component_communities


def _assess_doublet(component_edgelist: pd.DataFrame) -> tuple[bool, int]:
"""Check whether a component is a potential doublet and how many edges should be removed to split it.

A component is a potential doublet if a) it has more than one community and
b) the second largest community is at least 20% of the size of the largest
community. A lower resolution is to be used for annotation of potential doublets
compared to the component recovery in the graph phase. The reduction factor in
annotate resolution is set by RELATIVE_ANNOTATE_RESOLUTION (default is 0.5).

"""
component_communities = _compute_sub_communities(component_edgelist)
component_community_sizes = component_communities.value_counts().sort_values(
ascending=False
)
if len(component_community_sizes) > 1 and component_community_sizes.iloc[1] > (
0.2 * component_community_sizes.iloc[0]
):
edges_to_remove = (
component_edgelist["upia"].map(component_communities)
!= component_edgelist["upib"].map(component_communities)
).sum()
return True, edges_to_remove
else:
return False, 0


def mark_potential_doublets(
edgelist: pd.DataFrame,
) -> tuple[pd.Series, pd.Series]:
"""Mark whether a component is a potential doublet.

A component is a potential doublet if a) it has more than one community and
b) the second largest community is at least 20% of the size of the largest
community.

:param edgelist: the edge list dataframe containing component labels.
:returns: a boolean series indicating whether a component is a potential doublet.
:rtype: pd.Series
"""
is_potential_doublet = pd.Series(index=edgelist["component"].unique(), dtype=bool)
n_edges_to_split_doublet = pd.Series(
index=edgelist["component"].unique(), dtype=int
)
for component_id, component_edgelist in edgelist.groupby("component"):
is_potential_doublet[component_id], n_edges_to_split_doublet[component_id] = (
_assess_doublet(component_edgelist)
)

return is_potential_doublet, n_edges_to_split_doublet


def edgelist_to_anndata(
edgelist: pd.DataFrame,
panel: AntibodyPanel,
Expand Down Expand Up @@ -262,7 +337,10 @@ def edgelist_to_anndata(
# compute components metrics (obs) and re-index
components_metrics_df = components_metrics(edgelist=edgelist)
components_metrics_df = components_metrics_df.reindex(index=counts_df.index)

(
components_metrics_df["is_potential_doublet"],
components_metrics_df["n_edges_to_split_doublet"],
) = mark_potential_doublets(edgelist=edgelist)
# compute antibody metrics (var) and re-index
antibody_metrics_df = antibody_metrics(edgelist=edgelist)
antibody_metrics_df = antibody_metrics_df.reindex(index=panel.markers, fill_value=0)
Expand Down
14 changes: 14 additions & 0 deletions src/pixelator/report/models/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,20 @@ class AnnotateSampleReport(SampleReport):
description="The fraction of pixels (A and B pixels) in the largest component.",
)

fraction_potential_doublets: float | None = pydantic.Field(
description=(
"The fraction of components that appear to consist of multiple "
"parts by the community detection algorithm."
),
)

n_edges_to_split_potential_doublets: int | None = pydantic.Field(
description=(
"The total number of edges that need to be removed to split the "
"potential doublets into their sub-communities."
),
)

# ------------------------------------------------------------------------------- #
# Annotate metrics
# ------------------------------------------------------------------------------- #
Expand Down
2 changes: 2 additions & 0 deletions tests/pixeldataset/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ def test_adata_creation(edgelist: pd.DataFrame, panel: AntibodyPanel):
"molecules",
"pixels",
"reads",
"is_potential_doublet",
"n_edges_to_split_doublet",
]
)
assert "clr" in adata.obsm
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
"sample_id": "pbmcs_unstimulated",
"fraction_molecules_in_largest_component": 0.0006480881399870382,
"fraction_pixels_in_largest_component": 0.0004888381945576015,
"fraction_potential_doublets": 0.0,
"n_edges_to_split_potential_doublets": 0,
"input_cell_count": 3052,
"input_read_count": 6237,
"cell_count": 3052,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
"sample_id": "uropod_control",
"fraction_molecules_in_largest_component": 0.0007451564828614009,
"fraction_pixels_in_largest_component": 0.000501378791677112,
"fraction_potential_doublets": 0.0,
"n_edges_to_split_potential_doublets": 0,
"input_cell_count": 3963,
"input_read_count": 8117,
"cell_count": 3963,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
"sample_id": "pbmcs_unstimulated",
"fraction_molecules_in_largest_component": 0.0006480881399870382,
"fraction_pixels_in_largest_component": 0.0004888381945576015,
"fraction_potential_doublets": 0.0,
"n_edges_to_split_potential_doublets": 0,
"input_cell_count": 3052,
"input_read_count": 6237,
"cell_count": 3052,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
"sample_id": "uropod_control",
"fraction_molecules_in_largest_component": 0.0007451564828614009,
"fraction_pixels_in_largest_component": 0.000501378791677112,
"fraction_potential_doublets": 0.0,
"n_edges_to_split_potential_doublets": 0,
"input_cell_count": 3963,
"input_read_count": 8117,
"cell_count": 3963,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
sample_id,fraction_molecules_in_largest_component,fraction_pixels_in_largest_component,input_cell_count,input_read_count,cell_count,marker_count,total_marker_count,molecule_count,a_pixel_count,b_pixel_count,read_count,aggregate_count,molecules_in_aggregates_count,reads_in_aggregates_count,min_size_threshold,max_size_threshold,doublet_size_threshold,size_filter_fail_cell_count,size_filter_fail_molecule_count,size_filter_fail_read_count,pixel_count,fraction_aggregate_components,fraction_reads_in_aggregates,fraction_molecules_in_aggregates,molecule_count_per_cell_mean,molecule_count_per_cell_std,molecule_count_per_cell_min,molecule_count_per_cell_q1,molecule_count_per_cell_q2,molecule_count_per_cell_q3,molecule_count_per_cell_max,molecule_count_per_cell_count,molecule_count_per_cell_iqr,read_count_per_cell_mean,read_count_per_cell_std,read_count_per_cell_min,read_count_per_cell_q1,read_count_per_cell_q2,read_count_per_cell_q3,read_count_per_cell_max,read_count_per_cell_count,read_count_per_cell_iqr,a_pixel_count_per_cell_mean,a_pixel_count_per_cell_std,a_pixel_count_per_cell_min,a_pixel_count_per_cell_q1,a_pixel_count_per_cell_q2,a_pixel_count_per_cell_q3,a_pixel_count_per_cell_max,a_pixel_count_per_cell_count,a_pixel_count_per_cell_iqr,b_pixel_count_per_cell_mean,b_pixel_count_per_cell_std,b_pixel_count_per_cell_min,b_pixel_count_per_cell_q1,b_pixel_count_per_cell_q2,b_pixel_count_per_cell_q3,b_pixel_count_per_cell_max,b_pixel_count_per_cell_count,b_pixel_count_per_cell_iqr,marker_count_per_cell_mean,marker_count_per_cell_std,marker_count_per_cell_min,marker_count_per_cell_q1,marker_count_per_cell_q2,marker_count_per_cell_q3,marker_count_per_cell_max,marker_count_per_cell_count,marker_count_per_cell_iqr,a_pixel_b_pixel_ratio_per_cell_mean,a_pixel_b_pixel_ratio_per_cell_std,a_pixel_b_pixel_ratio_per_cell_min,a_pixel_b_pixel_ratio_per_cell_q1,a_pixel_b_pixel_ratio_per_cell_q2,a_pixel_b_pixel_ratio_per_cell_q3,a_pixel_b_pixel_ratio_per_cell_max,a_pixel_b_pixel_ratio_per_cell_count,a_pixel_b_pixel_ratio_per_cell_iqr,molecule_count_per_a_pixel_per_cell_mean,molecule_count_per_a_pixel_per_cell_std,molecule_count_per_a_pixel_per_cell_min,molecule_count_per_a_pixel_per_cell_q1,molecule_count_per_a_pixel_per_cell_q2,molecule_count_per_a_pixel_per_cell_q3,molecule_count_per_a_pixel_per_cell_max,molecule_count_per_a_pixel_per_cell_count,molecule_count_per_a_pixel_per_cell_iqr,b_pixel_count_per_a_pixel_per_cell_mean,b_pixel_count_per_a_pixel_per_cell_std,b_pixel_count_per_a_pixel_per_cell_min,b_pixel_count_per_a_pixel_per_cell_q1,b_pixel_count_per_a_pixel_per_cell_q2,b_pixel_count_per_a_pixel_per_cell_q3,b_pixel_count_per_a_pixel_per_cell_max,b_pixel_count_per_a_pixel_per_cell_count,b_pixel_count_per_a_pixel_per_cell_iqr,a_pixel_count_per_b_pixel_per_cell_mean,a_pixel_count_per_b_pixel_per_cell_std,a_pixel_count_per_b_pixel_per_cell_min,a_pixel_count_per_b_pixel_per_cell_q1,a_pixel_count_per_b_pixel_per_cell_q2,a_pixel_count_per_b_pixel_per_cell_q3,a_pixel_count_per_b_pixel_per_cell_max,a_pixel_count_per_b_pixel_per_cell_count,a_pixel_count_per_b_pixel_per_cell_iqr
pbmcs_unstimulated,0.0006480881399870382,0.0004888381945576015,3052,6237,3052,68,80,3086,3077,3060,6237,3052,3086,6237,,,,0,0,0,6137,1.0,1.0,1.0,1.011140235910878,0.1049577584303709,1.0,1.0,1.0,1.0,2.0,3052,0.0,2.043577981651376,0.2592989644447359,2.0,2.0,2.0,2.0,5.0,3052,0.0,1.0081913499344692,0.0901346310843966,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0075360419397117,0.08648265728800528,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0068807339449541,0.09380465569259074,0.5,1.0,1.0,1.0,2.0,3052,0.0,1.0029488859764089,0.05422351932424758,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0
uropod_control,0.0007451564828614009,0.000501378791677112,3963,8117,3963,68,80,4026,3996,3982,8117,3963,4026,8117,,,,0,0,0,7978,1.0,1.0,1.0,1.015897047691143,0.1309898320781511,1.0,1.0,1.0,1.0,3.0,3963,0.0,2.0481958112541006,0.2970944331488577,2.0,2.0,2.0,2.0,7.0,3963,0.0,1.008327024981075,0.09360744530597291,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.0047943477163765,0.06907504575713982,1.0,1.0,1.0,1.0,2.0,3963,0.0,1.0136260408781226,0.1180891191803105,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.005803684077719,0.09839298617811536,0.5,1.0,1.0,1.0,3.0,3963,0.0,1.0074438556649004,0.0884874529853118,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.0046681806712088,0.0677001125393875,1.0,1.0,1.0,1.0,2.0,3963,0.0,1.0046681806712088,0.0677001125393875,1.0,1.0,1.0,1.0,2.0,3963,0.0
sample_id,fraction_molecules_in_largest_component,fraction_pixels_in_largest_component,fraction_potential_doublets,n_edges_to_split_potential_doublets,input_cell_count,input_read_count,cell_count,marker_count,total_marker_count,molecule_count,a_pixel_count,b_pixel_count,read_count,aggregate_count,molecules_in_aggregates_count,reads_in_aggregates_count,min_size_threshold,max_size_threshold,doublet_size_threshold,size_filter_fail_cell_count,size_filter_fail_molecule_count,size_filter_fail_read_count,pixel_count,fraction_aggregate_components,fraction_reads_in_aggregates,fraction_molecules_in_aggregates,molecule_count_per_cell_mean,molecule_count_per_cell_std,molecule_count_per_cell_min,molecule_count_per_cell_q1,molecule_count_per_cell_q2,molecule_count_per_cell_q3,molecule_count_per_cell_max,molecule_count_per_cell_count,molecule_count_per_cell_iqr,read_count_per_cell_mean,read_count_per_cell_std,read_count_per_cell_min,read_count_per_cell_q1,read_count_per_cell_q2,read_count_per_cell_q3,read_count_per_cell_max,read_count_per_cell_count,read_count_per_cell_iqr,a_pixel_count_per_cell_mean,a_pixel_count_per_cell_std,a_pixel_count_per_cell_min,a_pixel_count_per_cell_q1,a_pixel_count_per_cell_q2,a_pixel_count_per_cell_q3,a_pixel_count_per_cell_max,a_pixel_count_per_cell_count,a_pixel_count_per_cell_iqr,b_pixel_count_per_cell_mean,b_pixel_count_per_cell_std,b_pixel_count_per_cell_min,b_pixel_count_per_cell_q1,b_pixel_count_per_cell_q2,b_pixel_count_per_cell_q3,b_pixel_count_per_cell_max,b_pixel_count_per_cell_count,b_pixel_count_per_cell_iqr,marker_count_per_cell_mean,marker_count_per_cell_std,marker_count_per_cell_min,marker_count_per_cell_q1,marker_count_per_cell_q2,marker_count_per_cell_q3,marker_count_per_cell_max,marker_count_per_cell_count,marker_count_per_cell_iqr,a_pixel_b_pixel_ratio_per_cell_mean,a_pixel_b_pixel_ratio_per_cell_std,a_pixel_b_pixel_ratio_per_cell_min,a_pixel_b_pixel_ratio_per_cell_q1,a_pixel_b_pixel_ratio_per_cell_q2,a_pixel_b_pixel_ratio_per_cell_q3,a_pixel_b_pixel_ratio_per_cell_max,a_pixel_b_pixel_ratio_per_cell_count,a_pixel_b_pixel_ratio_per_cell_iqr,molecule_count_per_a_pixel_per_cell_mean,molecule_count_per_a_pixel_per_cell_std,molecule_count_per_a_pixel_per_cell_min,molecule_count_per_a_pixel_per_cell_q1,molecule_count_per_a_pixel_per_cell_q2,molecule_count_per_a_pixel_per_cell_q3,molecule_count_per_a_pixel_per_cell_max,molecule_count_per_a_pixel_per_cell_count,molecule_count_per_a_pixel_per_cell_iqr,b_pixel_count_per_a_pixel_per_cell_mean,b_pixel_count_per_a_pixel_per_cell_std,b_pixel_count_per_a_pixel_per_cell_min,b_pixel_count_per_a_pixel_per_cell_q1,b_pixel_count_per_a_pixel_per_cell_q2,b_pixel_count_per_a_pixel_per_cell_q3,b_pixel_count_per_a_pixel_per_cell_max,b_pixel_count_per_a_pixel_per_cell_count,b_pixel_count_per_a_pixel_per_cell_iqr,a_pixel_count_per_b_pixel_per_cell_mean,a_pixel_count_per_b_pixel_per_cell_std,a_pixel_count_per_b_pixel_per_cell_min,a_pixel_count_per_b_pixel_per_cell_q1,a_pixel_count_per_b_pixel_per_cell_q2,a_pixel_count_per_b_pixel_per_cell_q3,a_pixel_count_per_b_pixel_per_cell_max,a_pixel_count_per_b_pixel_per_cell_count,a_pixel_count_per_b_pixel_per_cell_iqr
pbmcs_unstimulated,0.0006480881399870382,0.0004888381945576015,0.0,0,3052,6237,3052,68,80,3086,3077,3060,6237,3052,3086,6237,,,,0,0,0,6137,1.0,1.0,1.0,1.011140235910878,0.1049577584303709,1.0,1.0,1.0,1.0,2.0,3052,0.0,2.043577981651376,0.2592989644447359,2.0,2.0,2.0,2.0,5.0,3052,0.0,1.0081913499344692,0.0901346310843966,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0075360419397117,0.08648265728800528,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0068807339449541,0.09380465569259074,0.5,1.0,1.0,1.0,2.0,3052,0.0,1.0029488859764089,0.05422351932424758,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0,1.0026212319790302,0.05113082359929531,1.0,1.0,1.0,1.0,2.0,3052,0.0
uropod_control,0.0007451564828614009,0.000501378791677112,0.0,0,3963,8117,3963,68,80,4026,3996,3982,8117,3963,4026,8117,,,,0,0,0,7978,1.0,1.0,1.0,1.015897047691143,0.1309898320781511,1.0,1.0,1.0,1.0,3.0,3963,0.0,2.0481958112541006,0.2970944331488577,2.0,2.0,2.0,2.0,7.0,3963,0.0,1.008327024981075,0.09360744530597291,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.0047943477163765,0.06907504575713982,1.0,1.0,1.0,1.0,2.0,3963,0.0,1.0136260408781226,0.1180891191803105,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.005803684077719,0.09839298617811536,0.5,1.0,1.0,1.0,3.0,3963,0.0,1.0074438556649004,0.0884874529853118,1.0,1.0,1.0,1.0,3.0,3963,0.0,1.0046681806712088,0.0677001125393875,1.0,1.0,1.0,1.0,2.0,3963,0.0,1.0046681806712088,0.0677001125393875,1.0,1.0,1.0,1.0,2.0,3963,0.0
Loading