Merge pull request #60 from leoisl/v0.5.0

Properly handling Ns in make_prg
iqbal-lab-org · Jul 17, 2023 · 7fbbc8c · 7fbbc8c
2 parents 1187a48 + 7454a89
commit 7fbbc8c
Show file tree

Hide file tree

Showing 16 changed files with 4,666 additions and 17 deletions.
diff --git a/.gitignore b/.gitignore
@@ -122,3 +122,4 @@ tests/data/prg_builder/write_prg/sample.bin
 tests/data/prg_builder/write_prg/sample.prg.fa
 tests/integration_tests/data/output/
 tests/integration_tests/data/output_update/
+debugging
diff --git a/make_prg/utils/io_utils.py b/make_prg/utils/io_utils.py
@@ -1,26 +1,51 @@
 import gzip
 import os
 import tempfile
+from io import StringIO
 from pathlib import Path
 from typing import Dict, Union
 from zipfile import ZipFile
 
 from Bio import AlignIO
+from Bio.Seq import Seq
 
 from make_prg import MSA
 from make_prg.subcommands.output_type import OutputType
+from make_prg.utils.seq_utils import get_majority_consensus_from_MSA
 
 
-def load_alignment_file(msa_file: Union[str, Path], alignment_format: str) -> MSA:
-    msa_file = str(msa_file)
-    if msa_file.endswith(".gz"):
-        handle = gzip.open(msa_file, "rt")
-        alignment = AlignIO.read(handle, alignment_format)
-        handle.close()
-    else:
+def load_alignment_file(
+    msa_file: Union[str, Path, StringIO], alignment_format: str
+) -> MSA:
+    if isinstance(msa_file, StringIO):
         alignment = AlignIO.read(msa_file, alignment_format)
+    else:
+        msa_file = str(msa_file)
+        if msa_file.endswith(".gz"):
+            with gzip.open(msa_file, "rt") as handle:
+                alignment = AlignIO.read(handle, alignment_format)
+        else:
+            with open(msa_file, "r") as handle:
+                alignment = AlignIO.read(handle, alignment_format)
+
+    # upper case seqs
     for record in alignment:
         record.seq = record.seq.upper()
+
+    # Compute the consensus sequence
+    consensus = get_majority_consensus_from_MSA(alignment)
+
+    # Replace 'N' with the consensus sequence in each record
+    for record in alignment:
+        record.seq = Seq(
+            "".join(
+                [
+                    consensus[i] if nucleotide == "N" else nucleotide
+                    for i, nucleotide in enumerate(str(record.seq))
+                ]
+            )
+        )
+
     return alignment
 
 

diff --git a/make_prg/utils/seq_utils.py b/make_prg/utils/seq_utils.py
@@ -1,5 +1,8 @@
 import copy
+import hashlib
 import itertools
+import random
+from collections import Counter
 from typing import Generator, List, Tuple
 
 import numpy as np
@@ -234,3 +237,51 @@ def get_consensus_from_MSA(alignment: MSA) -> str:
             consensus_string_as_list.append(column.pop())
     consensus_string = "".join(consensus_string_as_list)
     return consensus_string
+
+
+def convert_to_upper(sequences: Generator) -> Generator:
+    return (s.upper() for s in sequences)
+
+
+def generate_random_seed(sequences: List[str]) -> bytes:
+    return hashlib.sha256("".join(sequences).encode()).digest()
+
+
+def get_consensus_residue(position: int, sequences: List[str]) -> str:
+    # Count the residues at this position, ignoring gaps and Ns
+    pos_counts = Counter(
+        seq[position]
+        for seq in sequences
+        if seq[position] != GAP and seq[position] != "N"
+    )
+
+    # If there are no residues other than gaps and Ns at this position, use a random base
+    if len(pos_counts) == 0:
+        return random.choice("ACGT")
+
+    # Find the residue(s) with the highest count
+    max_count = pos_counts.most_common(1)[0][1]
+    max_residues = [res for res, count in pos_counts.items() if count == max_count]
+
+    # Randomly select a residue from the residues with the highest count
+    return random.choice(max_residues)
+
+
+def get_majority_consensus_from_MSA(alignment: MSA) -> str:
+    """
+    Produces a consensus string (composed only of ACGT) just based on the major base for each column.
+    """
+    all_seqs = get_alignment_seqs(alignment)
+    all_seqs = list(convert_to_upper(all_seqs))
+    random_seed_for_this_alignment = generate_random_seed(all_seqs)
+    random.seed(random_seed_for_this_alignment)
+
+    # Initialize the consensus sequence as an empty string
+    consensus = ""
+
+    # Loop over the positions in the alignment
+    for i in range(alignment.get_alignment_length()):
+        # Add the residue to the consensus sequence
+        consensus += get_consensus_residue(i, all_seqs)
+
+    return consensus
diff --git a/tests/integration_tests/data/amira_MSAs/alsB.fasta b/tests/integration_tests/data/amira_MSAs/alsB.fasta
diff --git a/tests/integration_tests/data/amira_MSAs/glpG.fasta b/tests/integration_tests/data/amira_MSAs/glpG.fasta
@@ -0,0 +1,2 @@
+>SRR1314424_trimmed;1044_22_3
+atgttgatgattacctcttttgctaacccccgcgtggcgcaggcgtttgttgattacatggcgacgcagggtgttatcctcacgattcaacaacataaccaaagcgatgtctggctggcggatgagtcccaggccgagcgcgtacgggcggagctggcgcnnnnnnnnnnctggcaggcaggccataccggcagtggcctgcattatcgccgttatcctttctttgccgccttgcgtgaacgcgcaggtccggtaacctgggtgatgatgatcgcctgcgtggtggtgtttattgccatgcaaattctcggcgatcaggaagtgatgttatggctggcctggccattcgatccagcactgaaatttgagttctggcgttacttcacccacgcgttaatgcacttctcgctgatgcatatcctctttaacctgctctggtggtggtatctcggcggtgcggtggaaaaacgcctcggtagcggtaagctaattgtcattacgcttatcagcgccctgttaagcggctatgtgcagcaaaaattcagcgggccgtggtttggcgggctttctggcgtggtgtatgcgctgatgggctacgtctggctacgtggcgaacgcgatccgcaaagtggcatttacctgcaacgtgggttaattatctttgcgctgatctggattgtcgccggatggtttgatttgtttgggatgtcgat---ggcgaacg--gagcacacatcgccgggttagccgtgggtttagcgatggcttttgttgattcgctcaatgcgcgaaaacgaaaataa
diff --git a/tests/integration_tests/data/amira_MSAs/group_18516.fasta b/tests/integration_tests/data/amira_MSAs/group_18516.fasta
diff --git a/tests/integration_tests/data/truth_output/amira_MSAs/amira_MSAs.prg.bin.zip b/tests/integration_tests/data/truth_output/amira_MSAs/amira_MSAs.prg.bin.zip