Skip to content

Commit

Permalink
refact[call]: move candidate snv getter fn to snvs file
Browse files Browse the repository at this point in the history
  • Loading branch information
davidlougheed committed Apr 19, 2023
1 parent 7018e68 commit d38c2f6
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 55 deletions.
55 changes: 2 additions & 53 deletions strkit/call/caller/call_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from .repeats import get_repeat_count, get_ref_repeat_count
from .snvs import (
SNV_OUT_OF_RANGE_CHAR,
get_candidate_snvs,
get_read_snvs,
# get_read_snvs_dbsnp,
calculate_useful_snvs,
Expand Down Expand Up @@ -53,41 +54,6 @@
significant_clip_snv_take_in = 250


def get_candidate_snvs(
snv_vcf_file: pysam.VariantFile,
snv_vcf_contigs: tuple[str, ...],
snv_vcf_file_format: Literal["chr", "num", "acc", ""],
contig: str,
left_most_coord: int,
right_most_coord: int,
) -> dict[int, CandidateSNV]:
candidate_snvs_dict: dict[int, CandidateSNV] = {} # Lookup dictionary for candidate SNVs by position

snv_contig: str = contig
if snv_contig not in snv_vcf_contigs:
if snv_vcf_file_format == "num":
snv_contig = snv_contig.removeprefix("chr")
elif snv_vcf_file_format == "acc":
snv_contig = _human_chrom_to_refseq_accession(snv_contig, snv_vcf_contigs)
# Otherwise, leave as-is

for snv in snv_vcf_file.fetch(snv_contig, left_most_coord, right_most_coord + 1):
snv_ref = snv.ref
snv_alts = snv.alts
# check actually is SNV
if snv_ref is not None and len(snv_ref) == 1 and snv_alts and any(len(a) == 1 for a in snv_alts):
# Convert from 1-based indexing to 0-based indexing!!!
# See https://pysam.readthedocs.io/en/latest/api.html#pysam.VariantRecord.pos
candidate_snvs_dict[snv.pos - 1] = CandidateSNV(id=snv.id, ref=snv.ref, alts=snv_alts)

# candidate_snvs_dict_items: list[tuple[int, CandidateSNV]] = list(candidate_snvs_dict.items())
# This flattened version is useful for passing to the Rust extension
# candidate_snvs_dict_items_flat: list[tuple[int, str, str, list[str]]] = [
# (k, v["id"], v["ref"], list(v["alts"])) for k, v in candidate_snvs_dict_items]

return candidate_snvs_dict


def calculate_seq_with_wildcards(qs: str, quals: Optional[list[int]]) -> str:
if quals is None:
return qs # No quality information, so don't do anything
Expand Down Expand Up @@ -297,7 +263,7 @@ def process_read_snvs_for_locus(
only_known_snvs: bool,
logger_: logging.Logger,
locus_log_str: str,
):
) -> set[int]:
# Loop through a second time if we are using SNVs. We do a second loop rather than just using the first loop
# in order to have collected the edges of the reference sequence we can cache for faster SNV calculation.

Expand Down Expand Up @@ -559,23 +525,6 @@ def call_alleles_with_incorporated_snvs(
return assign_method, (call_data, called_useful_snvs)


def _human_chrom_to_refseq_accession(contig: str, snv_vcf_contigs: tuple[str, ...]) -> Optional[str]:
contig = contig.removeprefix("chr")
if contig == "X":
contig = "23"
if contig == "Y":
contig = "24"
if contig == "M":
contig = "12920"
contig = f"NC_{contig.zfill(6)}"

for vcf_contig in snv_vcf_contigs:
if vcf_contig.startswith(contig):
return vcf_contig

return None


def call_locus(
t_idx: int,
t: tuple,
Expand Down
57 changes: 55 additions & 2 deletions strkit/call/caller/snvs.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import logging
import math
import numpy as np
import pysam
from collections import Counter

from typing import Callable, Optional
from typing import Callable, Literal, Optional

from strkit.logger import logger

Expand All @@ -13,6 +14,8 @@

__all__ = [
"SNV_OUT_OF_RANGE_CHAR",
"SNV_GAP_CHAR",
"get_candidate_snvs",
"get_read_snvs_dbsnp",
"get_read_snvs",
"calculate_useful_snvs",
Expand All @@ -22,7 +25,57 @@
SNV_OUT_OF_RANGE_CHAR = "-"
SNV_GAP_CHAR = "_"

# TODO: annotate with rsID if file provided

def _human_chrom_to_refseq_accession(contig: str, snv_vcf_contigs: tuple[str, ...]) -> Optional[str]:
contig = contig.removeprefix("chr")
if contig == "X":
contig = "23"
if contig == "Y":
contig = "24"
if contig == "M":
contig = "12920"
contig = f"NC_{contig.zfill(6)}"

for vcf_contig in snv_vcf_contigs:
if vcf_contig.startswith(contig):
return vcf_contig

return None


def get_candidate_snvs(
snv_vcf_file: pysam.VariantFile,
snv_vcf_contigs: tuple[str, ...],
snv_vcf_file_format: Literal["chr", "num", "acc", ""],
contig: str,
left_most_coord: int,
right_most_coord: int,
) -> dict[int, CandidateSNV]:
candidate_snvs_dict: dict[int, CandidateSNV] = {} # Lookup dictionary for candidate SNVs by position

snv_contig: str = contig
if snv_contig not in snv_vcf_contigs:
if snv_vcf_file_format == "num":
snv_contig = snv_contig.removeprefix("chr")
elif snv_vcf_file_format == "acc":
snv_contig = _human_chrom_to_refseq_accession(snv_contig, snv_vcf_contigs)
# Otherwise, leave as-is

for snv in snv_vcf_file.fetch(snv_contig, left_most_coord, right_most_coord + 1):
snv_ref = snv.ref
snv_alts = snv.alts
# check actually is SNV
if snv_ref is not None and len(snv_ref) == 1 and snv_alts and any(len(a) == 1 for a in snv_alts):
# Convert from 1-based indexing to 0-based indexing!!!
# See https://pysam.readthedocs.io/en/latest/api.html#pysam.VariantRecord.pos
candidate_snvs_dict[snv.pos - 1] = CandidateSNV(id=snv.id, ref=snv.ref, alts=snv_alts)

# candidate_snvs_dict_items: list[tuple[int, CandidateSNV]] = list(candidate_snvs_dict.items())
# This flattened version is useful for passing to the Rust extension
# candidate_snvs_dict_items_flat: list[tuple[int, str, str, list[str]]] = [
# (k, v["id"], v["ref"], list(v["alts"])) for k, v in candidate_snvs_dict_items]

return candidate_snvs_dict


# We check entropy against a threshold in order to make sure the SNVs we find are somewhat
Expand Down

0 comments on commit d38c2f6

Please sign in to comment.