iqbal-lab-org · leoisl · Jul 20, 2023 · Dec 2, 2022 · Jun 7, 2023 · Jul 11, 2023
diff --git a/.gitignore b/.gitignore
@@ -122,3 +122,4 @@ tests/data/prg_builder/write_prg/sample.bin
 tests/data/prg_builder/write_prg/sample.prg.fa
 tests/integration_tests/data/output/
 tests/integration_tests/data/output_update/
+debugging
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,7 +6,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-## [0.4.0] - 2022-22-11
+## [0.5.0] - 2023-07-18
+
+### Fixed
+
+- Ns are now handled properly in the MSA and in the denovo sequences (see [PR #60](https://github.com/iqbal-lab-org/make_prg/pull/60)
+and [PR #61](https://github.com/iqbal-lab-org/make_prg/pull/61) for more details);
+
+### Changed
+- `scikit-learn`, `numpy` and `pytest` dependencies updated;
+- The KMeans algorithm used is now `elkan`;
+
+## [0.4.0] - 2022-11-22
 
 ### Added
 - `make_prg update` command, that updates PRGs without requiring to rebuild MSAs and the PRG itself from scratch;
@@ -103,8 +114,9 @@ operations.
   source project CHANGELOG.
 
 
-[Unreleased]: https://github.com/iqbal-lab-org/make_prg/compare/0.4.0...HEAD
+[Unreleased]: https://github.com/iqbal-lab-org/make_prg/compare/0.5.0...HEAD
 
+[0.5.0]: https://github.com/iqbal-lab-org/make_prg/compare/0.4.0...0.5.0
 [0.4.0]: https://github.com/iqbal-lab-org/make_prg/compare/0.2.0...0.4.0
 [0.2.0]: https://github.com/iqbal-lab-org/make_prg/compare/0.1.1...0.2.0
 [0.1.1]: https://github.com/iqbal-lab-org/make_prg/compare/0.1.0...0.1.1

diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-# To build: docker build . -t make_prg:0.4.0
+# To build: docker build . -t make_prg:0.5.0
 # Tagged as such, it can be used in scripts/build_precompiled_binary/build_precompiled_binary.sh to build the precompiled binary
 FROM python:3.10-slim
 

diff --git a/README.md b/README.md
@@ -37,13 +37,13 @@ In this binary, all libraries are linked statically. Compilation is done using [
 
 #### Download
 ```
-wget https://github.com/iqbal-lab-org/make_prg/releases/download/0.4.0/make_prg_0.4.0
+wget https://github.com/iqbal-lab-org/make_prg/releases/download/0.5.0/make_prg_0.5.0
 ```
 
 #### Run
 ```
-chmod +x make_prg_0.4.0
-./make_prg_0.4.0 -h
+chmod +x make_prg_0.5.0
+./make_prg_0.5.0 -h
 ```
 
 ### pip
@@ -77,7 +77,7 @@ The above will use the latest version. If you want to specify a version then use
 [tag][quay.io] (or commit) like so.
 
 ```sh
-VERSION="0.4.0"
+VERSION="0.5.0"
 URI="docker://quay.io/iqballab/make_prg:${VERSION}"
 ```
 

diff --git a/make_prg/from_msa/cluster_sequences.py b/make_prg/from_msa/cluster_sequences.py
@@ -259,7 +259,9 @@ def kmeans_cluster_seqs(
             break
         if num_clusters == num_sequences:
             break
-        kmeans = KMeans(n_clusters=num_clusters, random_state=2).fit(count_matrix)
+        kmeans = KMeans(n_clusters=num_clusters, random_state=2, algorithm="elkan").fit(
+            count_matrix
+        )
         prev_cluster_assignment = cluster_assignment
         cluster_assignment = list(kmeans.predict(count_matrix))
         num_fitted_clusters = len(set(cluster_assignment))

diff --git a/make_prg/update/denovo_variants.py b/make_prg/update/denovo_variants.py
@@ -21,6 +21,10 @@ class DenovoError(Exception):
     pass
 
 
+class NonACGTError(Exception):
+    pass
+
+
 class TooLongDeletion(Exception):
     pass
 
@@ -82,7 +86,7 @@ def _param_checking(
     def _check_sequence_is_composed_of_ACGT_only(seq: str):
         sequence_is_composed_of_ACGT_only = all([base in "ACGT" for base in seq])
         if not sequence_is_composed_of_ACGT_only:
-            raise DenovoError(f"Found a non-ACGT seq ({seq}) in a denovo variant")
+            raise NonACGTError(f"Found a non-ACGT seq ({seq}) in a denovo variant")
 
     def __eq__(self, other):
         if isinstance(other, self.__class__):
@@ -496,7 +500,7 @@ def _read_variants(
                     filehandler, long_deletion_threshold
                 )
                 variants.append(denovo_variant)
-            except TooLongDeletion as error:
+            except (TooLongDeletion, NonACGTError) as error:
                 logger.warning(f"Ignoring variant: {error}")
         return variants
 

diff --git a/make_prg/utils/io_utils.py b/make_prg/utils/io_utils.py
@@ -1,26 +1,51 @@
 import gzip
 import os
 import tempfile
+from io import StringIO
 from pathlib import Path
 from typing import Dict, Union
 from zipfile import ZipFile
 
 from Bio import AlignIO
+from Bio.Seq import Seq
 
 from make_prg import MSA
 from make_prg.subcommands.output_type import OutputType
+from make_prg.utils.seq_utils import get_majority_consensus_from_MSA
 
 
-def load_alignment_file(msa_file: Union[str, Path], alignment_format: str) -> MSA:
-    msa_file = str(msa_file)
-    if msa_file.endswith(".gz"):
-        handle = gzip.open(msa_file, "rt")
-        alignment = AlignIO.read(handle, alignment_format)
-        handle.close()
-    else:
+def load_alignment_file(
+    msa_file: Union[str, Path, StringIO], alignment_format: str
+) -> MSA:
+    if isinstance(msa_file, StringIO):
         alignment = AlignIO.read(msa_file, alignment_format)
+    else:
+        msa_file = str(msa_file)
+        if msa_file.endswith(".gz"):
+            with gzip.open(msa_file, "rt") as handle:
+                alignment = AlignIO.read(handle, alignment_format)
+        else:
+            with open(msa_file, "r") as handle:
+                alignment = AlignIO.read(handle, alignment_format)
+
+    # upper case seqs
     for record in alignment:
         record.seq = record.seq.upper()
+
+    # Compute the consensus sequence
+    consensus = get_majority_consensus_from_MSA(alignment)
+
+    # Replace each 'N' in every record with the consensus base of the same column
+    for record in alignment:
+        record.seq = Seq(
+            "".join(
+                [
+                    consensus[i] if nucleotide == "N" else nucleotide
+                    for i, nucleotide in enumerate(str(record.seq))
+                ]
+            )
+        )
+
     return alignment
 
 

diff --git a/make_prg/utils/seq_utils.py b/make_prg/utils/seq_utils.py
@@ -1,5 +1,8 @@
 import copy
+import hashlib
 import itertools
+import random
+from collections import Counter
 from typing import Generator, List, Tuple
 
 import numpy as np
@@ -234,3 +237,54 @@ def get_consensus_from_MSA(alignment: MSA) -> str:
             consensus_string_as_list.append(column.pop())
     consensus_string = "".join(consensus_string_as_list)
     return consensus_string
+
+
+def convert_to_upper(sequences: Generator) -> Generator:
+    return (s.upper() for s in sequences)
+
+
+def generate_random_seed(sequences: List[str]) -> bytes:
+    return hashlib.sha256("".join(sequences).encode()).digest()
+
+
+def get_consensus_residue(
+    position: int, sequences: List[str], local_random: random.Random
+) -> str:
+    # Count the residues at this position, ignoring gaps and Ns
+    pos_counts = Counter(
+        seq[position]
+        for seq in sequences
+        if seq[position] != GAP and seq[position] != "N"
+    )
+
+    # If there are no residues other than gaps and Ns at this position, use a random base
+    if len(pos_counts) == 0:
+        return local_random.choice("ACGT")
+
+    # Find the residue(s) with the highest count
+    max_count = pos_counts.most_common(1)[0][1]
+    max_residues = [res for res, count in pos_counts.items() if count == max_count]
+
+    # Randomly select a residue from the residues with the highest count
+    return local_random.choice(max_residues)
+
+
+def get_majority_consensus_from_MSA(alignment: MSA) -> str:
+    """
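+    Produces a consensus string from the most frequent base of each column (gaps and Ns are not counted); ties and columns with only gaps/Ns get a base picked pseudo-randomly, seeded by the sequences so the result is reproducible.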
+    """
+    all_seqs = get_alignment_seqs(alignment)
+    all_seqs = list(convert_to_upper(all_seqs))
+    random_seed_for_this_alignment = generate_random_seed(all_seqs)
+    local_random = random.Random()
+    local_random.seed(random_seed_for_this_alignment)
+
+    # Initialize the consensus sequence as an empty string
+    consensus = ""
+
+    # Loop over the positions in the alignment
+    for i in range(alignment.get_alignment_length()):
+        # Add the residue to the consensus sequence
+        consensus += get_consensus_residue(i, all_seqs, local_random)
+
+    return consensus