Skip to content

Commit

Permalink
Merge pull request #60 from leoisl/v0.5.0
Browse files Browse the repository at this point in the history
Properly handling Ns in make_prg
  • Loading branch information
leoisl authored Jul 17, 2023
2 parents 1187a48 + 7454a89 commit 7fbbc8c
Show file tree
Hide file tree
Showing 16 changed files with 4,666 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,4 @@ tests/data/prg_builder/write_prg/sample.bin
tests/data/prg_builder/write_prg/sample.prg.fa
tests/integration_tests/data/output/
tests/integration_tests/data/output_update/
debugging
39 changes: 32 additions & 7 deletions make_prg/utils/io_utils.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,51 @@
import gzip
import os
import tempfile
from io import StringIO
from pathlib import Path
from typing import Dict, Union
from zipfile import ZipFile

from Bio import AlignIO
from Bio.Seq import Seq

from make_prg import MSA
from make_prg.subcommands.output_type import OutputType
from make_prg.utils.seq_utils import get_majority_consensus_from_MSA


def load_alignment_file(msa_file: Union[str, Path], alignment_format: str) -> MSA:
msa_file = str(msa_file)
if msa_file.endswith(".gz"):
handle = gzip.open(msa_file, "rt")
alignment = AlignIO.read(handle, alignment_format)
handle.close()
else:
def load_alignment_file(
msa_file: Union[str, Path, StringIO], alignment_format: str
) -> MSA:
if isinstance(msa_file, StringIO):
alignment = AlignIO.read(msa_file, alignment_format)
else:
msa_file = str(msa_file)
if msa_file.endswith(".gz"):
with gzip.open(msa_file, "rt") as handle:
alignment = AlignIO.read(handle, alignment_format)
else:
with open(msa_file, "r") as handle:
alignment = AlignIO.read(handle, alignment_format)

# upper case seqs
for record in alignment:
record.seq = record.seq.upper()

# Compute the consensus sequence
consensus = get_majority_consensus_from_MSA(alignment)

# Replace 'N' with the consensus sequence in each record
for record in alignment:
record.seq = Seq(
"".join(
[
consensus[i] if nucleotide == "N" else nucleotide
for i, nucleotide in enumerate(str(record.seq))
]
)
)

return alignment


Expand Down
51 changes: 51 additions & 0 deletions make_prg/utils/seq_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import copy
import hashlib
import itertools
import random
from collections import Counter
from typing import Generator, List, Tuple

import numpy as np
Expand Down Expand Up @@ -234,3 +237,51 @@ def get_consensus_from_MSA(alignment: MSA) -> str:
consensus_string_as_list.append(column.pop())
consensus_string = "".join(consensus_string_as_list)
return consensus_string


def convert_to_upper(sequences: Generator) -> Generator:
return (s.upper() for s in sequences)


def generate_random_seed(sequences: List[str]) -> bytes:
return hashlib.sha256("".join(sequences).encode()).digest()


def get_consensus_residue(position: int, sequences: List[str]) -> str:
# Count the residues at this position, ignoring gaps and Ns
pos_counts = Counter(
seq[position]
for seq in sequences
if seq[position] != GAP and seq[position] != "N"
)

# If there are no residues other than gaps and Ns at this position, use a random base
if len(pos_counts) == 0:
return random.choice("ACGT")

# Find the residue(s) with the highest count
max_count = pos_counts.most_common(1)[0][1]
max_residues = [res for res, count in pos_counts.items() if count == max_count]

# Randomly select a residue from the residues with the highest count
return random.choice(max_residues)


def get_majority_consensus_from_MSA(alignment: MSA) -> str:
"""
Produces a consensus string (composed only of ACGT) just based on the major base for each column.
"""
all_seqs = get_alignment_seqs(alignment)
all_seqs = list(convert_to_upper(all_seqs))
random_seed_for_this_alignment = generate_random_seed(all_seqs)
random.seed(random_seed_for_this_alignment)

# Initialize the consensus sequence as an empty string
consensus = ""

# Loop over the positions in the alignment
for i in range(alignment.get_alignment_length()):
# Add the residue to the consensus sequence
consensus += get_consensus_residue(i, all_seqs)

return consensus
1,578 changes: 1,578 additions & 0 deletions tests/integration_tests/data/amira_MSAs/alsB.fasta

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions tests/integration_tests/data/amira_MSAs/glpG.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>SRR1314424_trimmed;1044_22_3
atgttgatgattacctcttttgctaacccccgcgtggcgcaggcgtttgttgattacatggcgacgcagggtgttatcctcacgattcaacaacataaccaaagcgatgtctggctggcggatgagtcccaggccgagcgcgtacgggcggagctggcgcnnnnnnnnnnctggcaggcaggccataccggcagtggcctgcattatcgccgttatcctttctttgccgccttgcgtgaacgcgcaggtccggtaacctgggtgatgatgatcgcctgcgtggtggtgtttattgccatgcaaattctcggcgatcaggaagtgatgttatggctggcctggccattcgatccagcactgaaatttgagttctggcgttacttcacccacgcgttaatgcacttctcgctgatgcatatcctctttaacctgctctggtggtggtatctcggcggtgcggtggaaaaacgcctcggtagcggtaagctaattgtcattacgcttatcagcgccctgttaagcggctatgtgcagcaaaaattcagcgggccgtggtttggcgggctttctggcgtggtgtatgcgctgatgggctacgtctggctacgtggcgaacgcgatccgcaaagtggcatttacctgcaacgtgggttaattatctttgcgctgatctggattgtcgccggatggtttgatttgtttgggatgtcgat---ggcgaacg--gagcacacatcgccgggttagccgtgggtttagcgatggcttttgttgattcgctcaatgcgcgaaaacgaaaataa
2,834 changes: 2,834 additions & 0 deletions tests/integration_tests/data/amira_MSAs/group_18516.fasta

Large diffs are not rendered by default.

Binary file not shown.
Loading

0 comments on commit 7fbbc8c

Please sign in to comment.