Skip to content

Commit

Permalink
Use 'umi_count' instead of 'duplicate_count'
Browse files Browse the repository at this point in the history
  • Loading branch information
grst committed Jan 26, 2024
1 parent 05fc476 commit 7027f0e
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 43 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,20 @@ and this project adheres to [Semantic Versioning][].
[keep a changelog]: https://keepachangelog.com/en/1.0.0/
[semantic versioning]: https://semver.org/spec/v2.0.0.html

## Unreleased

### Backwards-incompatible changes

- Use the `umi_count` field instead of `duplicate_count` to store UMI counts. The field `umi_count` has been added to
the AIRR Rearrangement standard in [version 1.4](https://docs.airr-community.org/en/latest/news.html#version-1-4-1-august-27-2022).
Use of `duplicate_count` for UMI counts is now discouraged. Scirpy will use `umi_count` in all `scirpy.io` functions.
It will _not_ modify AIRR data read through `scirpy.io.read_airr` if that data still uses the `duplicate_count` column.
Scirpy remains compatible with datasets that still use `duplicate_count`. You can update such a dataset by copying the column:

```python
adata.obsm["airr"]["umi_count"] = adata.obsm["airr"]["duplicate_count"]
```

## v0.15.0

### Fixes
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ dependencies = [
'igraph != 0.10.0,!=0.10.1',
'networkx>=2.5',
'squarify',
'airr>=1.2',
'airr>=1.4.1',
'tqdm>=4.63', # https://github.com/tqdm/tqdm/issues/1082
'adjustText>=0.7',
'numba>=0.41.0',
Expand Down
30 changes: 8 additions & 22 deletions src/scirpy/io/_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from collections.abc import Collection, Iterable, Sequence
from glob import iglob
from pathlib import Path
from typing import Any, Literal, Union
from typing import Any, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -125,7 +125,7 @@ def _read_10x_vdj_json(
chain["locus"] = chain_type
chain["junction"] = contig["cdr3_seq"]
chain["junction_aa"] = contig["cdr3"]
chain["duplicate_count"] = contig["umi_count"]
chain["umi_count"] = contig["umi_count"]
chain["consensus_count"] = contig["read_count"]
chain["productive"] = contig["productive"]
chain["is_cell"] = contig["is_cell"]
Expand Down Expand Up @@ -166,7 +166,7 @@ def _read_10x_vdj_csv(
locus=chain_series["chain"],
junction_aa=chain_series["cdr3"],
junction=chain_series["cdr3_nt"],
duplicate_count=chain_series["umis"],
umi_count=chain_series["umis"],
consensus_count=chain_series["reads"],
productive=_is_true2(chain_series["productive"]),
v_call=chain_series["v_gene"],
Expand Down Expand Up @@ -352,7 +352,7 @@ def _process_chains(chains, chain_type):
)
def read_airr(
path: Union[str, Sequence[str], Path, Sequence[Path], pd.DataFrame, Sequence[pd.DataFrame]],
use_umi_count_col: Union[bool, Literal["auto"]] = "auto",
use_umi_count_col: None = None, # deprecated, kept for backwards-compatibility
infer_locus: bool = True,
cell_attributes: Collection[str] = DEFAULT_AIRR_CELL_ATTRIBUTES,
include_fields: Any = None,
Expand Down Expand Up @@ -380,10 +380,9 @@ def read_airr(
as a List, e.g. `["path/to/tcr_alpha.tsv", "path/to/tcr_beta.tsv"]`.
Alternatively, this can be a pandas data frame.
use_umi_count_col
Whether to add UMI counts from the non-standard (but common) `umi_count`
column. When this column is used, the UMI counts are moved over to the
standard `duplicate_count` column. Default: Use `umi_count` if there is
no `duplicate_count` column present.
Deprecated, has no effect as of v0.16. Since v1.4 of the AIRR standard, `umi_count`
is an official field in the Rearrangement schema and preferred over `duplicate_count`.
`umi_count` now always takes precedence over `duplicate_count`.
infer_locus
Try to infer the `locus` column from gene names, in case it is not specified.
cell_attributes
Expand All @@ -409,16 +408,6 @@ def read_airr(
if isinstance(path, (str, Path, pd.DataFrame)):
path: list[Union[str, Path, pd.DataFrame]] = [path] # type: ignore

def _decide_use_umi_count_col(chain_dict):
    """Tell whether UMI counts should be taken from the `umi_count` column.

    In ``"auto"`` mode the column is used (and a warning about the rename is
    logged) when the chain provides `umi_count` but no `duplicate_count`.
    Otherwise the column is used only if `use_umi_count_col` is exactly `True`.
    """
    auto_rename = (
        "umi_count" in chain_dict and use_umi_count_col == "auto" and "duplicate_count" not in chain_dict
    )
    if auto_rename:
        logger.warning("Renaming the non-standard `umi_count` column to `duplicate_count`. ")  # type: ignore
        return True
    # Outside of the auto-rename case only an explicit `True` enables the column.
    return use_umi_count_col is True

for tmp_path_or_df in path:
if isinstance(tmp_path_or_df, pd.DataFrame):
iterator = _read_airr_rearrangement_df(tmp_path_or_df)
Expand All @@ -438,9 +427,6 @@ def _decide_use_umi_count_col(chain_dict):
)
airr_cells[cell_id] = tmp_cell

if _decide_use_umi_count_col(chain_dict):
chain_dict["duplicate_count"] = get_rearrangement_schema().to_int(chain_dict.pop("umi_count"))

if infer_locus and "locus" not in chain_dict:
logger.warning(
"`locus` column not found in input data. The locus is being inferred from the {v,d,j,c}_call columns."
Expand Down Expand Up @@ -742,7 +728,7 @@ def _get(row, field):
"junction_aa": _get(row, "CDR3_Translation"),
"productive": row["Productive"],
"consensus_count": row["Read_Count"],
"duplicate_count": row["Molecule_Count"],
"umi_count": row["Molecule_Count"],
}
)
tmp_cell.add_chain(tmp_chain)
Expand Down
12 changes: 9 additions & 3 deletions src/scirpy/pp/_index_chains.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ def index_chains(
"require_junction_aa",
),
sort_chains_by: Mapping[str, Any] = MappingProxyType(
{"duplicate_count": 0, "consensus_count": 0, "junction": "", "junction_aa": ""}
# Since AIRR version v1.4.1, `duplicate_count` is deprecated in favor of `umi_count`.
# We still keep it as sort key for backwards compatibility
{"umi_count": 0, "duplicate_count": 0, "consensus_count": 0, "junction": "", "junction_aa": ""}
),
airr_mod: str = "airr",
airr_key: str = "airr",
Expand Down Expand Up @@ -89,8 +91,12 @@ def index_chains(

# only warn if those fields are in the key (i.e. this should give a warning if those are missing with
# default settings. If the user specifies their own dictionary, they are on their own)
if "duplicate_count" in sort_chains_by and "consensus_count" in sort_chains_by:
if "duplicate_count" not in params.airr.fields and "consensus_count" not in params.airr.fields:
# Warn only when all default expression keys are part of the sort key, i.e. the
# default settings are (most likely) in use. Users passing a custom dictionary
# are responsible for their own sort fields.
expression_fields = ("umi_count", "duplicate_count", "consensus_count")
if all(field in sort_chains_by for field in expression_fields):
    # The availability check must look at the fields present in the data.
    # Testing `"umi_count" not in sort_chains_by` here would contradict the
    # outer condition and the warning could never be emitted.
    if not any(field in params.airr.fields for field in expression_fields):
        logging.warning("No expression information available. Cannot rank chains by expression. ")  # type: ignore

if "locus" not in params.airr.fields:
Expand Down
2 changes: 1 addition & 1 deletion src/scirpy/tests/data/airr/rearrangement_ig.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cell_id clone_id sequence_id sequence sequence_aa productive rev_comp v_call v_cigar d_call d_cigar j_call j_cigar c_call c_cigar sequence_alignment germline_alignment junction junction_aa junction_length junction_aa_length v_sequence_start v_sequence_end d_sequence_start d_sequence_end j_sequence_start j_sequence_end c_sequence_start c_sequence_end consensus_count duplicate_count is_cell
cell_id clone_id sequence_id sequence sequence_aa productive rev_comp v_call v_cigar d_call d_cigar j_call j_cigar c_call c_cigar sequence_alignment germline_alignment junction junction_aa junction_length junction_aa_length v_sequence_start v_sequence_end d_sequence_start d_sequence_end j_sequence_start j_sequence_end c_sequence_start c_sequence_end consensus_count umi_count is_cell
AAACCTGAGGAGTCTG-1 AAACCTGAGGAGTCTG-1_contig_1 ACAACCACACCCCTCCTAAGAAGAAGCCCCTAGACCACAGCTCCACACCATGGACTGGACCTGGAGGATCCTCTTCTTGGTGGCAGCAGCAACAGGTGCCCACTCCCAGGTGCAGCTGGTGCAATCTGGGTCTGAGTTGAAGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAAGGCTTCTGGATACACCTTCACTAGCTATGCTATGAATTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAACACCAACACTGGGAACCCAACGTATGCCCAGGGCTTCACAGGACGGTTTGTCTTCTCCTTGGACACCTCTGTCAGCACGGCATATCTGCAGATCAGCAGCCTAAAGGCTGAGGACACTGCCGTGTATTACTGTGCGAGCCTCTGGCAAGATGCCAGTGGATACAGCTATGGTAAATACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG MDWTWRILFLVAAATGAHSQVQLVQSGSELKKPGASVKVSCKASGYTFTSYAMNWVRQAPGQGLEWMGWINTNTGNPTYAQGFTGRFVFSLDTSVSTAYLQISSLKAEDTAVYYCASLWQDASGYSYGKYYYYYGMDVWGQGTTVTVSSGSASAPTLFPLVSCENSPSDTSSV T F IGHV7-4-1 49S353M166S IGHJ6 434S63M71S IGHM 497S71M ACAACCACACCCCTCCTAAGAAGAAGCCCCTAGACCACAGCTCCACACCATGGACTGGACCTGGAGGATCCTCTTCTTGGTGGCAGCAGCAACAGGTGCCCACTCCCAGGTGCAGCTGGTGCAATCTGGGTCTGAGTTGAAGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAAGGCTTCTGGATACACCTTCACTAGCTATGCTATGAATTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAACACCAACACTGGGAACCCAACGTATGCCCAGGGCTTCACAGGACGGTTTGTCTTCTCCTTGGACACCTCTGTCAGCACGGCATATCTGCAGATCAGCAGCCTAAAGGCTGAGGACACTGCCGTGTATTACTGTGCGAGCCTCTGGCAAGATGCCAGTGGATACAGCTATGGTAAATACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG ACAACCACACCCCTCCTAAGAAGAAGCCCCTAGACCACAGCTCCACACCATGGACTGGACCTGGAGGATCCTCTTCTTGGTGGCAGCAGCAACAGGTGCCCACTCCCAGGTGCAGCTGGTGCAATCTGGGTCTGAGTTGAAGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAAGGCTTCTGGATACACCTTCACTAGCTATGCTATGAATTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAACACCAACACTGGGAACCCAACGTATGCCCAGGGCTTCACAGGACGGTTTGTCTTCTCCTTGGACACCTCTGTCAGCACGGCATATCTGCAGATCTGCAGCCTAAAGGCTGAGGACACTGCCGTGTATTACTGTGCGAGAGAATTACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG 
TGTGCGAGCCTCTGGCAAGATGCCAGTGGATACAGCTATGGTAAATACTACTACTACTACGGTATGGACGTCTGG CASLWQDASGYSYGKYYYYYGMDVW 75 25 50 402 435 497 498 568 791 7 T
AAACCTGAGGAGTCTG-1 AAACCTGAGGAGTCTG-1_contig_2 AGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACCGGAGAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCAGCTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTAGCTCACTTACGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC METPAQLLFLLLLWLPDTTGEIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSLTWTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDN T F IGKV3-20 44S348M176S IGKJ1 394S38M136S IGKC 432S136M AGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACCGGAGAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCAGCTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTAGCTCACTTACGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC CCTGGGTCAGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACCGGAGAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCAGCTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTAGCTCACCTGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC 
TGTCAGCAGTATGGTAGCTCACTTACGTGGACGTTC CQQYGSSLTWTF 36 12 45 392 395 432 433 568 5260 41 T
AAACCTGCAGCGTAAG-1 AAACCTGCAGCGTAAG-1_contig_1 GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCATATGATGGAAGTAATAAATACTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAGGGGAGAGTCGTAGTGGGAGCTACTACCCCCAGGAACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG MEFGLSWVFLVALLRGVQCQVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKGRVVVGATTPRNYYYYGMDVWGQGTTVTVSSGSASAPTLFPLVSCENSPSDTSSV T F IGHV3-30 80S351M165S IGHJ6 462S63M71S IGHM 525S71M GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCATATGATGGAAGTAATAAATACTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAGGGGAGAGTCGTAGTGGGAGCTACTACCCCCAGGAACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG 
CAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCATATGATGGAAGTAATAAATACTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAAATTACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG TGTGCGAAGGGGAGAGTCGTAGTGGGAGCTACTACCCCCAGGAACTACTACTACTACGGTATGGACGTCTGG CAKGRVVVGATTPRNYYYYGMDVW 72 24 81 431 463 525 526 596 1092 14 T
Expand Down
29 changes: 15 additions & 14 deletions src/scirpy/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ def test_write_airr_none_field_issue_454(tmp_path):
write_airr(adata, tmp_path / "test.airr.tsv")


@pytest.mark.xfail(reason="Dandelion still uses `duplicate_count` instead of `umi_count`", raises=AssertionError)
@pytest.mark.extra
@pytest.mark.parametrize(
"anndata_from_10x_sample",
Expand All @@ -269,7 +270,7 @@ def test_convert_dandelion(anndata_from_10x_sample):
assert len(ir_obj1.chains) == len(ir_obj2.chains)

def _key(chain):
v1, v2 = chain.get("duplicate_count", -1), chain.get("junction", "")
v1, v2 = chain.get("umi_count", -1), chain.get("junction", "")
v1 = -1 if v1 is None else v1
v2 = "" if v2 is None else v2
return (v1, v2)
Expand All @@ -295,7 +296,7 @@ def test_read_10x_csv():
[
"junction_aa",
"junction",
"duplicate_count",
"umi_count",
"consensus_count",
"v_call",
"d_call",
Expand All @@ -314,7 +315,7 @@ def test_read_10x_csv():
assert cell1.name == "AAACCTGAGTACGCCC-1"
assert cell1["VDJ_1_junction_aa"] == "CASSLGPSTDTQYF"
assert cell1["VDJ_1_junction"] == "TGTGCCAGCAGCTTGGGACCTAGCACAGATACGCAGTATTTT"
assert cell1["VDJ_1_duplicate_count"] == 55
assert cell1["VDJ_1_umi_count"] == 55
assert cell1["VDJ_1_consensus_count"] == 18021
assert cell1["VDJ_1_v_call"] == "TRBV7-2"
assert cell1["VDJ_1_d_call"] == "TRBD2"
Expand All @@ -327,8 +328,8 @@ def test_read_10x_csv():
assert cell2.name == "AAACCTGGTCCGTTAA-1"
assert cell2["VJ_1_junction_aa"] == "CALNTGGFKTIF"
assert cell2["VJ_2_junction_aa"] == "CAVILDARLMF"
assert cell2["VJ_1_duplicate_count"] == 5
assert cell2["VJ_2_duplicate_count"] == 5
assert cell2["VJ_1_umi_count"] == 5
assert cell2["VJ_2_umi_count"] == 5
assert cell2["VJ_1_locus"] == "TRA"
assert cell2["VDJ_1_locus"] == "TRB"
assert cell2["VJ_2_locus"] == "TRA"
Expand Down Expand Up @@ -427,7 +428,7 @@ def test_read_10x():
"junction",
"np1_length",
"np2_length",
"duplicate_count",
"umi_count",
"consensus_count",
"v_call",
"d_call",
Expand All @@ -450,7 +451,7 @@ def test_read_10x():
assert cell1["VDJ_1_junction"] == "TGTGCCAGCTCACCACCGAGCCAGGGCCTTTCTACCGGGGAGCTGTTTTTT"
assert cell1["VDJ_1_np1_length"] == 4
assert cell1["VDJ_1_np2_length"] == 7
assert cell1["VDJ_1_duplicate_count"] == 1
assert cell1["VDJ_1_umi_count"] == 1
assert cell1["VDJ_1_consensus_count"] == 494
assert cell1["VDJ_1_v_call"] == "TRBV18"
assert cell1["VDJ_1_d_call"] == "TRBD1"
Expand All @@ -462,8 +463,8 @@ def test_read_10x():
assert cell2.name == "AAACCTGAGTACGCCC-1"
assert cell2["VJ_1_junction_aa"] == "CAMRVGGSQGNLIF"
assert cell2["VJ_2_junction_aa"] == "CATDAKDSNYQLIW"
assert cell2["VJ_1_duplicate_count"] == 9
assert cell2["VJ_2_duplicate_count"] == 4
assert cell2["VJ_1_umi_count"] == 9
assert cell2["VJ_2_umi_count"] == 4
assert np.all(_is_na(cell2[["VDJ_1_junction_aa", "VDJ_2_junction_aa"]]))
assert cell2["VJ_1_np1_length"] == 4
assert _is_na(cell2["VJ_1_np2_length"])
Expand Down Expand Up @@ -816,7 +817,7 @@ def test_airr_df():
"cell_id",
"c_call",
"consensus_count",
"duplicate_count",
"umi_count",
],
)

Expand Down Expand Up @@ -874,7 +875,7 @@ def test_read_bd_per_cell_chain():
adata,
[
"locus",
"duplicate_count",
"umi_count",
"consensus_count",
"junction",
"junction_aa",
Expand All @@ -892,7 +893,7 @@ def test_read_bd_per_cell_chain():
cell85 = obs.loc["85", :]

assert cell1["VJ_1_locus"] == "TRA"
assert cell1["VJ_1_duplicate_count"] == 1
assert cell1["VJ_1_umi_count"] == 1
assert cell1["VJ_1_consensus_count"] == 72
assert cell1["VJ_1_junction"] == "GCTGCCCCAGAATTTTGTC"
assert cell1["VJ_1_junction_aa"] == "AAGQNFV"
Expand All @@ -918,11 +919,11 @@ def test_read_bd_per_cell_chain():
def test_read_bd_contigs():
adata = read_bd_rhapsody(TESTDATA / "bd/test_unfiltered_contigs.csv")

obs = ir.get.airr(adata, ["locus", "duplicate_count"], "VJ_1")
obs = ir.get.airr(adata, ["locus", "umi_count"], "VJ_1")

assert obs.shape[0] == 5

cell10681 = obs.loc["10681"]

assert cell10681["VJ_1_locus"] == "IGK"
assert cell10681["VJ_1_duplicate_count"] == 2
assert cell10681["VJ_1_umi_count"] == 2
Loading

0 comments on commit 7027f0e

Please sign in to comment.