diff --git a/CHANGELOG.md b/CHANGELOG.md index c6a3c55c9..a7df805a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,20 @@ and this project adheres to [Semantic Versioning][]. [keep a changelog]: https://keepachangelog.com/en/1.0.0/ [semantic versioning]: https://semver.org/spec/v2.0.0.html +## Unreleased + +### Backwards-incompatible changes + +- Use the `umi_count` field instead of `duplicate_count` to store UMI counts. The field `umi_count` has been added to + the AIRR Rearrangement standard in [version 1.4](https://docs.airr-community.org/en/latest/news.html#version-1-4-1-august-27-2022). + Use of `duplicate_count` for UMI counts is now discouraged. Scirpy will use `umi_count` in all `scirpy.io` functions. + It will _not_ change AIRR data that is read through `scirpy.io.read_airr` that still uses the `duplicate_count` column. + Scirpy remains compatible with datasets that still use `duplicate_count`. You can update your dataset using + + ```python + adata.obsm["airr"]["umi_count"] = adata.obsm["airr"]["duplicate_count"] + ``` + ## v0.15.0 ### Fixes diff --git a/pyproject.toml b/pyproject.toml index 77ad727dc..669b8df3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ 'igraph != 0.10.0,!=0.10.1', 'networkx>=2.5', 'squarify', - 'airr>=1.2', + 'airr>=1.4.1', 'tqdm>=4.63', # https://github.com/tqdm/tqdm/issues/1082 'adjustText>=0.7', 'numba>=0.41.0', diff --git a/src/scirpy/io/_io.py b/src/scirpy/io/_io.py index bd3ee03f9..0a2f08044 100644 --- a/src/scirpy/io/_io.py +++ b/src/scirpy/io/_io.py @@ -6,7 +6,7 @@ from collections.abc import Collection, Iterable, Sequence from glob import iglob from pathlib import Path -from typing import Any, Literal, Union +from typing import Any, Union import numpy as np import pandas as pd @@ -125,7 +125,7 @@ def _read_10x_vdj_json( chain["locus"] = chain_type chain["junction"] = contig["cdr3_seq"] chain["junction_aa"] = contig["cdr3"] - chain["duplicate_count"] = contig["umi_count"] + 
chain["umi_count"] = contig["umi_count"] chain["consensus_count"] = contig["read_count"] chain["productive"] = contig["productive"] chain["is_cell"] = contig["is_cell"] @@ -166,7 +166,7 @@ def _read_10x_vdj_csv( locus=chain_series["chain"], junction_aa=chain_series["cdr3"], junction=chain_series["cdr3_nt"], - duplicate_count=chain_series["umis"], + umi_count=chain_series["umis"], consensus_count=chain_series["reads"], productive=_is_true2(chain_series["productive"]), v_call=chain_series["v_gene"], @@ -352,7 +352,7 @@ def _process_chains(chains, chain_type): ) def read_airr( path: Union[str, Sequence[str], Path, Sequence[Path], pd.DataFrame, Sequence[pd.DataFrame]], - use_umi_count_col: Union[bool, Literal["auto"]] = "auto", + use_umi_count_col: None = None, # deprecated, kept for backwards-compatibility infer_locus: bool = True, cell_attributes: Collection[str] = DEFAULT_AIRR_CELL_ATTRIBUTES, include_fields: Any = None, @@ -380,10 +380,9 @@ def read_airr( as a List, e.g. `["path/to/tcr_alpha.tsv", "path/to/tcr_beta.tsv"]`. Alternatively, this can be a pandas data frame. use_umi_count_col - Whether to add UMI counts from the non-strandard (but common) `umi_count` - column. When this column is used, the UMI counts are moved over to the - standard `duplicate_count` column. Default: Use `umi_count` if there is - no `duplicate_count` column present. + Deprecated, has no effect as of v0.16. Since v1.4 of the AIRR standard, `umi_count` + is an official field in the Rearrangement schema and preferred over `duplicate_count`. + `umi_count` now always takes precedence over `duplicate_count`. infer_locus Try to infer the `locus` column from gene names, in case it is not specified. 
cell_attributes @@ -409,16 +408,6 @@ def read_airr( if isinstance(path, (str, Path, pd.DataFrame)): path: list[Union[str, Path, pd.DataFrame]] = [path] # type: ignore - def _decide_use_umi_count_col(chain_dict): - """Logic to decide whether or not to use counts form the `umi_counts` column.""" - if "umi_count" in chain_dict and use_umi_count_col == "auto" and "duplicate_count" not in chain_dict: - logger.warning("Renaming the non-standard `umi_count` column to `duplicate_count`. ") # type: ignore - return True - elif use_umi_count_col is True: - return True - else: - return False - for tmp_path_or_df in path: if isinstance(tmp_path_or_df, pd.DataFrame): iterator = _read_airr_rearrangement_df(tmp_path_or_df) @@ -438,9 +427,6 @@ def _decide_use_umi_count_col(chain_dict): ) airr_cells[cell_id] = tmp_cell - if _decide_use_umi_count_col(chain_dict): - chain_dict["duplicate_count"] = get_rearrangement_schema().to_int(chain_dict.pop("umi_count")) - if infer_locus and "locus" not in chain_dict: logger.warning( "`locus` column not found in input data. The locus is being inferred from the {v,d,j,c}_call columns." @@ -742,7 +728,7 @@ def _get(row, field): "junction_aa": _get(row, "CDR3_Translation"), "productive": row["Productive"], "consensus_count": row["Read_Count"], - "duplicate_count": row["Molecule_Count"], + "umi_count": row["Molecule_Count"], } ) tmp_cell.add_chain(tmp_chain) diff --git a/src/scirpy/pp/_index_chains.py b/src/scirpy/pp/_index_chains.py index 976583f04..0822248f4 100644 --- a/src/scirpy/pp/_index_chains.py +++ b/src/scirpy/pp/_index_chains.py @@ -27,7 +27,9 @@ def index_chains( "require_junction_aa", ), sort_chains_by: Mapping[str, Any] = MappingProxyType( - {"duplicate_count": 0, "consensus_count": 0, "junction": "", "junction_aa": ""} + # Since AIRR version v1.4.1, `duplicate_count` is deprecated in favor of `umi_count`. 
+ # We still keep it as sort key for backwards compatibility + {"umi_count": 0, "duplicate_count": 0, "consensus_count": 0, "junction": "", "junction_aa": ""} ), airr_mod: str = "airr", airr_key: str = "airr", @@ -89,8 +91,12 @@ def index_chains( # only warn if those fields are in the key (i.e. this should give a warning if those are missing with # default settings. If the user specifies their own dictionary, they are on their own) - if "duplicate_count" in sort_chains_by and "consensus_count" in sort_chains_by: - if "duplicate_count" not in params.airr.fields and "consensus_count" not in params.airr.fields: + if "duplicate_count" in sort_chains_by and "consensus_count" in sort_chains_by and "umi_count" in sort_chains_by: + if ( + "duplicate_count" not in params.airr.fields + and "consensus_count" not in params.airr.fields + and "umi_count" not in params.airr.fields + ): logging.warning("No expression information available. Cannot rank chains by expression. ") # type: ignore if "locus" not in params.airr.fields: diff --git a/src/scirpy/tests/data/airr/rearrangement_ig.tsv b/src/scirpy/tests/data/airr/rearrangement_ig.tsv index 9d707c6ff..59ac38e9d 100644 --- a/src/scirpy/tests/data/airr/rearrangement_ig.tsv +++ b/src/scirpy/tests/data/airr/rearrangement_ig.tsv @@ -1,4 +1,4 @@ -cell_id clone_id sequence_id sequence sequence_aa productive rev_comp v_call v_cigar d_call d_cigar j_call j_cigar c_call c_cigar sequence_alignment germline_alignment junction junction_aa junction_length junction_aa_length v_sequence_start v_sequence_end d_sequence_start d_sequence_end j_sequence_start j_sequence_end c_sequence_start c_sequence_end consensus_count duplicate_count is_cell +cell_id clone_id sequence_id sequence sequence_aa productive rev_comp v_call v_cigar d_call d_cigar j_call j_cigar c_call c_cigar sequence_alignment germline_alignment junction junction_aa junction_length junction_aa_length v_sequence_start v_sequence_end d_sequence_start d_sequence_end j_sequence_start 
j_sequence_end c_sequence_start c_sequence_end consensus_count umi_count is_cell AAACCTGAGGAGTCTG-1 AAACCTGAGGAGTCTG-1_contig_1 ACAACCACACCCCTCCTAAGAAGAAGCCCCTAGACCACAGCTCCACACCATGGACTGGACCTGGAGGATCCTCTTCTTGGTGGCAGCAGCAACAGGTGCCCACTCCCAGGTGCAGCTGGTGCAATCTGGGTCTGAGTTGAAGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAAGGCTTCTGGATACACCTTCACTAGCTATGCTATGAATTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAACACCAACACTGGGAACCCAACGTATGCCCAGGGCTTCACAGGACGGTTTGTCTTCTCCTTGGACACCTCTGTCAGCACGGCATATCTGCAGATCAGCAGCCTAAAGGCTGAGGACACTGCCGTGTATTACTGTGCGAGCCTCTGGCAAGATGCCAGTGGATACAGCTATGGTAAATACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG MDWTWRILFLVAAATGAHSQVQLVQSGSELKKPGASVKVSCKASGYTFTSYAMNWVRQAPGQGLEWMGWINTNTGNPTYAQGFTGRFVFSLDTSVSTAYLQISSLKAEDTAVYYCASLWQDASGYSYGKYYYYYGMDVWGQGTTVTVSSGSASAPTLFPLVSCENSPSDTSSV T F IGHV7-4-1 49S353M166S IGHJ6 434S63M71S IGHM 497S71M ACAACCACACCCCTCCTAAGAAGAAGCCCCTAGACCACAGCTCCACACCATGGACTGGACCTGGAGGATCCTCTTCTTGGTGGCAGCAGCAACAGGTGCCCACTCCCAGGTGCAGCTGGTGCAATCTGGGTCTGAGTTGAAGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAAGGCTTCTGGATACACCTTCACTAGCTATGCTATGAATTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAACACCAACACTGGGAACCCAACGTATGCCCAGGGCTTCACAGGACGGTTTGTCTTCTCCTTGGACACCTCTGTCAGCACGGCATATCTGCAGATCAGCAGCCTAAAGGCTGAGGACACTGCCGTGTATTACTGTGCGAGCCTCTGGCAAGATGCCAGTGGATACAGCTATGGTAAATACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG 
ACAACCACACCCCTCCTAAGAAGAAGCCCCTAGACCACAGCTCCACACCATGGACTGGACCTGGAGGATCCTCTTCTTGGTGGCAGCAGCAACAGGTGCCCACTCCCAGGTGCAGCTGGTGCAATCTGGGTCTGAGTTGAAGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAAGGCTTCTGGATACACCTTCACTAGCTATGCTATGAATTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAACACCAACACTGGGAACCCAACGTATGCCCAGGGCTTCACAGGACGGTTTGTCTTCTCCTTGGACACCTCTGTCAGCACGGCATATCTGCAGATCTGCAGCCTAAAGGCTGAGGACACTGCCGTGTATTACTGTGCGAGAGAATTACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG TGTGCGAGCCTCTGGCAAGATGCCAGTGGATACAGCTATGGTAAATACTACTACTACTACGGTATGGACGTCTGG CASLWQDASGYSYGKYYYYYGMDVW 75 25 50 402 435 497 498 568 791 7 T AAACCTGAGGAGTCTG-1 AAACCTGAGGAGTCTG-1_contig_2 AGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACCGGAGAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCAGCTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTAGCTCACTTACGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC METPAQLLFLLLLWLPDTTGEIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSLTWTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDN T F IGKV3-20 44S348M176S IGKJ1 394S38M136S IGKC 432S136M 
AGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACCGGAGAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCAGCTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTAGCTCACTTACGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC CCTGGGTCAGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACCGGAGAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCAGCTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTAGCTCACCTGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC TGTCAGCAGTATGGTAGCTCACTTACGTGGACGTTC CQQYGSSLTWTF 36 12 45 392 395 432 433 568 5260 41 T AAACCTGCAGCGTAAG-1 AAACCTGCAGCGTAAG-1_contig_1 GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCATATGATGGAAGTAATAAATACTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAGGGGAGAGTCGTAGTGGGAGCTACTACCCCCAGGAACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG 
MEFGLSWVFLVALLRGVQCQVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKGRVVVGATTPRNYYYYGMDVWGQGTTVTVSSGSASAPTLFPLVSCENSPSDTSSV T F IGHV3-30 80S351M165S IGHJ6 462S63M71S IGHM 525S71M GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCATATGATGGAAGTAATAAATACTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAGGGGAGAGTCGTAGTGGGAGCTACTACCCCCAGGAACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG CAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCATATGATGGAAGTAATAAATACTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAAATTACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG TGTGCGAAGGGGAGAGTCGTAGTGGGAGCTACTACCCCCAGGAACTACTACTACTACGGTATGGACGTCTGG CAKGRVVVGATTPRNYYYYGMDVW 72 24 81 431 463 525 526 596 1092 14 T diff --git a/src/scirpy/tests/test_io.py b/src/scirpy/tests/test_io.py index 92919fe3c..7d9948340 100644 --- a/src/scirpy/tests/test_io.py +++ b/src/scirpy/tests/test_io.py @@ -245,6 +245,7 @@ def test_write_airr_none_field_issue_454(tmp_path): write_airr(adata, tmp_path / "test.airr.tsv") +@pytest.mark.xfail(reason="Dandelion still uses `duplicate_count` instead of `umi_count`", raises=AssertionError) @pytest.mark.extra @pytest.mark.parametrize( 
"anndata_from_10x_sample", @@ -269,7 +270,7 @@ def test_convert_dandelion(anndata_from_10x_sample): assert len(ir_obj1.chains) == len(ir_obj2.chains) def _key(chain): - v1, v2 = chain.get("duplicate_count", -1), chain.get("junction", "") + v1, v2 = chain.get("umi_count", -1), chain.get("junction", "") v1 = -1 if v1 is None else v1 v2 = "" if v2 is None else v2 return (v1, v2) @@ -295,7 +296,7 @@ def test_read_10x_csv(): [ "junction_aa", "junction", - "duplicate_count", + "umi_count", "consensus_count", "v_call", "d_call", @@ -314,7 +315,7 @@ def test_read_10x_csv(): assert cell1.name == "AAACCTGAGTACGCCC-1" assert cell1["VDJ_1_junction_aa"] == "CASSLGPSTDTQYF" assert cell1["VDJ_1_junction"] == "TGTGCCAGCAGCTTGGGACCTAGCACAGATACGCAGTATTTT" - assert cell1["VDJ_1_duplicate_count"] == 55 + assert cell1["VDJ_1_umi_count"] == 55 assert cell1["VDJ_1_consensus_count"] == 18021 assert cell1["VDJ_1_v_call"] == "TRBV7-2" assert cell1["VDJ_1_d_call"] == "TRBD2" @@ -327,8 +328,8 @@ def test_read_10x_csv(): assert cell2.name == "AAACCTGGTCCGTTAA-1" assert cell2["VJ_1_junction_aa"] == "CALNTGGFKTIF" assert cell2["VJ_2_junction_aa"] == "CAVILDARLMF" - assert cell2["VJ_1_duplicate_count"] == 5 - assert cell2["VJ_2_duplicate_count"] == 5 + assert cell2["VJ_1_umi_count"] == 5 + assert cell2["VJ_2_umi_count"] == 5 assert cell2["VJ_1_locus"] == "TRA" assert cell2["VDJ_1_locus"] == "TRB" assert cell2["VJ_2_locus"] == "TRA" @@ -427,7 +428,7 @@ def test_read_10x(): "junction", "np1_length", "np2_length", - "duplicate_count", + "umi_count", "consensus_count", "v_call", "d_call", @@ -450,7 +451,7 @@ def test_read_10x(): assert cell1["VDJ_1_junction"] == "TGTGCCAGCTCACCACCGAGCCAGGGCCTTTCTACCGGGGAGCTGTTTTTT" assert cell1["VDJ_1_np1_length"] == 4 assert cell1["VDJ_1_np2_length"] == 7 - assert cell1["VDJ_1_duplicate_count"] == 1 + assert cell1["VDJ_1_umi_count"] == 1 assert cell1["VDJ_1_consensus_count"] == 494 assert cell1["VDJ_1_v_call"] == "TRBV18" assert cell1["VDJ_1_d_call"] == "TRBD1" @@ 
-462,8 +463,8 @@ def test_read_10x(): assert cell2.name == "AAACCTGAGTACGCCC-1" assert cell2["VJ_1_junction_aa"] == "CAMRVGGSQGNLIF" assert cell2["VJ_2_junction_aa"] == "CATDAKDSNYQLIW" - assert cell2["VJ_1_duplicate_count"] == 9 - assert cell2["VJ_2_duplicate_count"] == 4 + assert cell2["VJ_1_umi_count"] == 9 + assert cell2["VJ_2_umi_count"] == 4 assert np.all(_is_na(cell2[["VDJ_1_junction_aa", "VDJ_2_junction_aa"]])) assert cell2["VJ_1_np1_length"] == 4 assert _is_na(cell2["VJ_1_np2_length"]) @@ -816,7 +817,7 @@ def test_airr_df(): "cell_id", "c_call", "consensus_count", - "duplicate_count", + "umi_count", ], ) @@ -874,7 +875,7 @@ def test_read_bd_per_cell_chain(): adata, [ "locus", - "duplicate_count", + "umi_count", "consensus_count", "junction", "junction_aa", @@ -892,7 +893,7 @@ def test_read_bd_per_cell_chain(): cell85 = obs.loc["85", :] assert cell1["VJ_1_locus"] == "TRA" - assert cell1["VJ_1_duplicate_count"] == 1 + assert cell1["VJ_1_umi_count"] == 1 assert cell1["VJ_1_consensus_count"] == 72 assert cell1["VJ_1_junction"] == "GCTGCCCCAGAATTTTGTC" assert cell1["VJ_1_junction_aa"] == "AAGQNFV" @@ -918,11 +919,11 @@ def test_read_bd_per_cell_chain(): def test_read_bd_contigs(): adata = read_bd_rhapsody(TESTDATA / "bd/test_unfiltered_contigs.csv") - obs = ir.get.airr(adata, ["locus", "duplicate_count"], "VJ_1") + obs = ir.get.airr(adata, ["locus", "umi_count"], "VJ_1") assert obs.shape[0] == 5 cell10681 = obs.loc["10681"] assert cell10681["VJ_1_locus"] == "IGK" - assert cell10681["VJ_1_duplicate_count"] == 2 + assert cell10681["VJ_1_umi_count"] == 2 diff --git a/src/scirpy/tests/test_preprocessing.py b/src/scirpy/tests/test_preprocessing.py index 32f13938a..3190155bc 100644 --- a/src/scirpy/tests/test_preprocessing.py +++ b/src/scirpy/tests/test_preprocessing.py @@ -35,9 +35,30 @@ {"VJ": [1, 0], "VDJ": [2, None], "multichain": False}, {"VJ": [1, None], "VDJ": [0, 2], "multichain": False}, ], - ) + ), + ( + [ + # fmt: off + [ + {"locus": "TRA", "junction_aa": 
"AAA", "umi_count": 3, "productive": True}, + {"locus": "TRA", "junction_aa": "KKK", "umi_count": 6, "productive": True}, + {"locus": "TRB", "junction_aa": "LLL", "umi_count": 3, "productive": True}, + ], + [ + {"locus": "TRB", "junction_aa": "KKK", "umi_count": 6, "productive": True}, + {"locus": "TRA", "junction_aa": "AAA", "umi_count": 3, "productive": True}, + {"locus": "TRB", "junction_aa": "LLL", "umi_count": 3, "productive": True}, + ], + # fmt: on + ], + [ + # VJ_1, VDJ_1, VJ_2, VDJ_2, multichain + {"VJ": [1, 0], "VDJ": [2, None], "multichain": False}, + {"VJ": [1, None], "VDJ": [0, 2], "multichain": False}, + ], + ), ], - ids=["standard case, multiple rows"], + ids=["using deprecated duplicate_count column", "standard case, multiple rows"], ) def test_index_chains(airr_chains, expected_index): """Test that chain indexing works as expected (default parameters)""" @@ -60,6 +81,16 @@ def test_index_chains(airr_chains, expected_index): }, {"VJ": [3, 0], "VDJ": [None, None], "multichain": False}, ), + ( + ["productive", "require_junction_aa"], + { + "umi_count": 0, + "consensus_count": 0, + "junction": "", + "junction_aa": "", + }, + {"VJ": [3, 0], "VDJ": [None, None], "multichain": False}, + ), ( ["require_junction_aa"], {"junction_aa": ""},