
⬆️ 🎨 Allow the use of gtdb taxonomy in Autometa #284

Merged · 50 commits · Dec 16, 2022
9633568
:arrow_up: :art: Add database.py and gtdb.py
Sidduppal Jul 16, 2022
e9d25c5
:art: Set default dbtype as ncbi in autometa-summary
Sidduppal Jul 18, 2022
8eb37da
:art::shell::snake::memo: Add GTDB database handling
Sidduppal Jul 20, 2022
7cc9fbd
:fire: Reset unnecessary file changes
Sidduppal Jul 20, 2022
fc43bf8
:bug: :memo: Fix read the docs build bug
Sidduppal Jul 21, 2022
13fc07c
Upgrade python version to 3.8 in pytest_codecov.yml
Sidduppal Jul 21, 2022
52ccffa
Add gtdb workflow
Sidduppal Jul 21, 2022
4103cf1
:bug: Rename ncbi to taxa_db
Sidduppal Jul 21, 2022
11bdb83
:white_check_mark::snake::art::bug: Fix NCBI kwarg
evanroyrees Jul 21, 2022
62d1a01
:art::bug: Fix bug in download_gtdb_files(...) config.databases method
evanroyrees Jul 21, 2022
47ab03f
:bug::art: Fix bug in autometa-update-databases --update-gtdb. Now on…
evanroyrees Jul 22, 2022
5b7266c
:art: Reformat the files to use taxonkit instead of gtdb-taxdump
Sidduppal Sep 17, 2022
2eb67bc
Apply suggestions from code review
Sidduppal Sep 22, 2022
b1de92e
:art: Apply Evan's suggestions
Sidduppal Sep 23, 2022
55945c3
:art: Incorporate Evan's comments
Sidduppal Sep 23, 2022
a7acf67
Reduce the version of scipy and joblib
Sidduppal Sep 23, 2022
3191c99
:art: Address Evan's comments
Sidduppal Oct 11, 2022
2970e58
Apply suggestions from code review
Sidduppal Oct 11, 2022
510eafe
:art: Address Evan's comments
Sidduppal Oct 11, 2022
7b29936
Apply suggestions from code review
Sidduppal Oct 11, 2022
941403c
:memo: Update docs
Sidduppal Oct 12, 2022
04720eb
Merge branch 'dev' of github.com:KwanLab/Autometa into gtdb_to_autometa
Sidduppal Oct 12, 2022
3f9e9fe
:memo: Update docs
Sidduppal Oct 13, 2022
4bab399
Update workflow
Sidduppal Oct 13, 2022
194ea88
Update workflows
Sidduppal Oct 13, 2022
d514143
:art: Update workflows
Sidduppal Oct 16, 2022
a83dae1
Added gtdb sub-workflow to autometa.sh
Sidduppal Oct 18, 2022
ab7818e
Apply suggestions from code review
Sidduppal Oct 18, 2022
12684da
Update workflow
Sidduppal Oct 18, 2022
d2f15f8
Merge branch 'gtdb_to_autometa' of github.com:KwanLab/Autometa into g…
Sidduppal Oct 18, 2022
51abf55
Added check for taxa_rountine
Sidduppal Oct 18, 2022
9133028
:fire: Delete unnneccesary file
Sidduppal Oct 20, 2022
119bf3d
:fire: Remove workflow files
evanroyrees Oct 21, 2022
e6df890
:fire: Remove old GTDB workflow from autometa-large-data-mode.sh
Sidduppal Oct 23, 2022
c6f0d5d
:art::shell: Change camelCase to snake_case
evanroyrees Oct 23, 2022
64d6064
:art::shell: Add $dbtype variable to reduce redundancy
evanroyrees Oct 23, 2022
ed1cba8
:art::shell: Update large-data-mode variable assignments
evanroyrees Oct 24, 2022
b207159
:arrow_up::green_heart: pin pytest dependencies (scipy, joblib)
evanroyrees Oct 24, 2022
e1b7648
Update workflows
Sidduppal Oct 25, 2022
5fd5f46
Fix typo in cache directory path
Sidduppal Oct 25, 2022
55c8014
:bug::snake: Create cache directory when it does not exist
evanroyrees Oct 25, 2022
535ea97
Fix typo in large_data_mode.py
Sidduppal Oct 25, 2022
4463f1d
Pulling
Sidduppal Oct 25, 2022
d777d0e
Fix typo in workflows/autometa-large-data-mode.sh
Sidduppal Oct 25, 2022
12a7bcd
:art::bug: Fix checking file size on a non-existent file error
evanroyrees Nov 3, 2022
8a0d784
:bug::snake: Check cache prior to making outdir path
evanroyrees Nov 4, 2022
9a328c0
Address comments in the PR
Sidduppal Nov 4, 2022
684aee4
Address comments in the PR
Sidduppal Nov 4, 2022
7f86c81
:art: Rename contigs_ids to orf_prefix
Sidduppal Nov 12, 2022
54c168c
:fire: Remove redundant dbtype var
evanroyrees Dec 16, 2022
2 changes: 1 addition & 1 deletion .github/workflows/pytest_codecov.yml
@@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
python-version: [3.7]
python-version: [3.8]
Collaborator:
Why is the python version for tests being changed here? Is this necessary for gtdb_to_taxdump installation?

Collaborator Author:
It was giving an error with 3.7 (link): from typing import Union, List, Literal fails there because typing.Literal was only added in Python 3.8.
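For context, a minimal sketch of the version constraint (the DbType alias and describe function are illustrative, not from Autometa):

```python
import sys

# typing.Literal was added in Python 3.8; importing it is what broke the 3.7 CI run.
if sys.version_info >= (3, 8):
    from typing import Literal
else:  # on 3.7 the usual workaround is the typing_extensions backport
    from typing_extensions import Literal

DbType = Literal["ncbi", "gtdb"]  # hypothetical alias for illustration

def describe(dbtype: DbType) -> str:
    # Literal narrows the accepted strings for static checkers such as mypy
    return f"using the {dbtype} taxonomy database"
```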

Collaborator:
Is this necessary?

Collaborator Author:
Yes, it's needed; otherwise one of the tests fails.

env:
OS: ${{ matrix.os }}
PYTHON: ${{ matrix.python-version }}
10 changes: 7 additions & 3 deletions autometa/binning/large_data_mode.py
@@ -39,7 +39,7 @@
from autometa.common import kmers

from autometa.common.exceptions import TableFormatError, BinningError
from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.database import TaxonomyDatabase
from autometa.binning.recursive_dbscan import get_clusters
from autometa.binning.utilities import (
write_results,
@@ -312,11 +312,15 @@ def cluster_by_taxon_partitioning(
"""
if reverse_ranks:
# species, genus, family, order, class, phylum, superkingdom
canonical_ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
canonical_ranks = [
rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root"
]
else:
# superkingdom, phylum, class, order, family, genus, species
canonical_ranks = [
rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"
rank
for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS)
if rank != "root"
]
# if stage is cached then we can first look to the cache before we begin subsetting main...
clustered_contigs = set()
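The hunk above only changes which class the rank list is read from; the ordering logic is untouched. A self-contained sketch of that logic, with CANONICAL_RANKS assumed from the diff's comments:

```python
# Assumed from the comments in the diff: ranks run species -> superkingdom, plus "root".
CANONICAL_RANKS = [
    "species", "genus", "family", "order",
    "class", "phylum", "superkingdom", "root",
]

def ordered_ranks(reverse_ranks: bool) -> list:
    if reverse_ranks:
        # species, genus, family, order, class, phylum, superkingdom
        return [rank for rank in CANONICAL_RANKS if rank != "root"]
    # superkingdom, phylum, class, order, family, genus, species
    return [rank for rank in reversed(CANONICAL_RANKS) if rank != "root"]
```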
10 changes: 7 additions & 3 deletions autometa/binning/recursive_dbscan.py
@@ -24,7 +24,7 @@
from autometa.common.markers import load as load_markers

from autometa.common.exceptions import TableFormatError, BinningError
from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.database import TaxonomyDatabase
from autometa.binning.utilities import (
write_results,
read_annotations,
@@ -628,10 +628,14 @@ def taxon_guided_binning(
logger.info(f"Using {method} clustering method")
if reverse_ranks:
# species, genus, family, order, class, phylum, superkingdom
ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
ranks = [rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root"]
else:
# superkingdom, phylum, class, order, family, genus, species
ranks = [rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"]
ranks = [
rank
for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS)
if rank != "root"
]
starting_rank_index = ranks.index(starting_rank)
ranks = ranks[starting_rank_index:]
logger.debug(f"Using ranks: {', '.join(ranks)}")
40 changes: 28 additions & 12 deletions autometa/binning/summary.py
@@ -17,7 +17,9 @@

from Bio import SeqIO

from autometa.taxonomy.database import TaxonomyDatabase
from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.gtdb import GTDB
from autometa.taxonomy import majority_vote
from autometa.common.markers import load as load_markers

@@ -226,16 +228,16 @@ def get_metabin_stats(


def get_metabin_taxonomies(
bin_df: pd.DataFrame, ncbi: NCBI, cluster_col: str = "cluster"
bin_df: pd.DataFrame, taxa_db: TaxonomyDatabase, cluster_col: str = "cluster"
) -> pd.DataFrame:
"""Retrieve taxonomies of all clusters recovered from Autometa binning.

Parameters
----------
bin_df : pd.DataFrame
Autometa binning table. index=contig, cols=['cluster','length','taxid', *canonical_ranks]
ncbi : autometa.taxonomy.ncbi.NCBI instance
Autometa NCBI class instance
taxa_db : autometa.taxonomy.ncbi.TaxonomyDatabase instance
Autometa NCBI or GTDB class instance
cluster_col : str, optional
Clustering column by which to group metabins

@@ -246,7 +248,9 @@ def get_metabin_taxonomies(
Indexed by cluster
"""
logger.info(f"Retrieving metabin taxonomies for {cluster_col}")
canonical_ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
canonical_ranks = [
rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root"
]
is_clustered = bin_df[cluster_col].notnull()
bin_df = bin_df[is_clustered]
outcols = [cluster_col, "length", "taxid", *canonical_ranks]
@@ -277,11 +281,13 @@
taxonomies[cluster][canonical_rank].update({taxid: length})
else:
taxonomies[cluster][canonical_rank][taxid] += length
cluster_taxonomies = majority_vote.rank_taxids(taxonomies, ncbi)
cluster_taxonomies = majority_vote.rank_taxids(taxonomies, taxa_db=taxa_db)
# With our cluster taxonomies, let's place these into a dataframe for easy data accession
cluster_taxa_df = pd.Series(data=cluster_taxonomies, name="taxid").to_frame()
# With the list of taxids, we'll retrieve their complete canonical-rank information
lineage_df = ncbi.get_lineage_dataframe(cluster_taxa_df.taxid.tolist(), fillna=True)
lineage_df = taxa_db.get_lineage_dataframe(
cluster_taxa_df.taxid.tolist(), fillna=True
)
# Now put it all together
cluster_taxa_df = pd.merge(
cluster_taxa_df, lineage_df, how="left", left_on="taxid", right_index=True
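The two steps above — wrapping the voted taxids in a frame, then left-merging against the lineage table — can be sketched with toy data; the column names follow the diff, while the cluster names and lineage values are made up:

```python
import pandas as pd

# Stand-in for the output of majority_vote.rank_taxids: cluster -> voted taxid
cluster_taxonomies = {"bin_1": 562, "bin_2": 1280}
cluster_taxa_df = pd.Series(data=cluster_taxonomies, name="taxid").to_frame()

# Stand-in for taxa_db.get_lineage_dataframe(...): rank columns indexed by taxid
lineage_df = pd.DataFrame(
    {"genus": ["Escherichia", "Staphylococcus"]}, index=[562, 1280]
)

# how="left" keeps every cluster even if a taxid has no lineage row
merged = pd.merge(
    cluster_taxa_df, lineage_df, how="left", left_on="taxid", right_index=True
)
```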
@@ -323,11 +329,18 @@ def main():
required=True,
)
parser.add_argument(
"--ncbi",
help="Path to user NCBI databases directory (Required for retrieving metabin taxonomies)",
"--dbdir",
help="Path to user taxonomy database directory (Required for retrieving metabin taxonomies)",
metavar="dirpath",
required=False,
)
parser.add_argument(
"--dbtype",
help="Taxonomy database type to use (NOTE: must correspond to the same database type used during contig taxon assignment.)",
choices=["ncbi", "gtdb"],
required=False,
default="ncbi",
)
parser.add_argument(
"--binning-column",
help="Binning column to use for grouping metabins",
@@ -377,14 +390,17 @@
logger.info(f"Wrote metabin stats to {args.output_stats}")
# Finally if taxonomy information is available then write out each metabin's taxonomy by modified majority voting method.
if "taxid" in bin_df.columns:
if not args.ncbi:
if not args.dbdir:
logger.warn(
"taxid found in dataframe. --ncbi argument is required to retrieve metabin taxonomies. Skipping..."
"taxid found in dataframe. --dbdir argument is required to retrieve metabin taxonomies. Skipping..."
)
else:
ncbi = NCBI(dirpath=args.ncbi)
if args.dbtype == "ncbi":
taxa_db = NCBI(dbdir=args.dbdir)
elif args.dbtype == "gtdb":
taxa_db = GTDB(dbdir=args.dbdir)
Collaborator (on lines +398 to +401): I'm not sure if this should be a dispatcher routine or to simply keep a list of if/elif.

Suggested change:
-            if args.dbtype == "ncbi":
-                taxa_db = NCBI(dbdir=args.dbdir)
-            elif args.dbtype == "gtdb":
-                taxa_db = GTDB(dbdir=args.dbdir)
+            taxa_dbs = {"ncbi": NCBI, "gtdb": GTDB}  # i.e. --dbtype choices
+            taxa_db = taxa_dbs[args.dbtype](args.dbdir)

Collaborator Author: At least for me the if/else is easier to read and understand, hence I'm leaving it that way for now. Let me know if you still want me to change it.
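Both variants behave identically for valid choices; a minimal sketch of the suggested dictionary dispatch, with stub classes standing in for the real NCBI and GTDB:

```python
# Stub classes standing in for autometa.taxonomy.ncbi.NCBI and autometa.taxonomy.gtdb.GTDB
class NCBI:
    def __init__(self, dbdir: str):
        self.dbdir = dbdir

class GTDB:
    def __init__(self, dbdir: str):
        self.dbdir = dbdir

def get_taxa_db(dbtype: str, dbdir: str):
    # Dispatch on the --dbtype choice; argparse's choices=["ncbi", "gtdb"]
    # already guarantees dbtype is one of these keys.
    taxa_dbs = {"ncbi": NCBI, "gtdb": GTDB}
    return taxa_dbs[dbtype](dbdir)
```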

taxa_df = get_metabin_taxonomies(
bin_df=bin_df, ncbi=ncbi, cluster_col=args.binning_column
bin_df=bin_df, taxa_db=taxa_db, cluster_col=args.binning_column
)
taxa_df.to_csv(args.output_taxonomy, sep="\t", index=True, header=True)

10 changes: 7 additions & 3 deletions autometa/binning/utilities.py
@@ -33,7 +33,7 @@

from typing import Iterable, Tuple

from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.database import TaxonomyDatabase


logger = logging.getLogger(__name__)
@@ -98,7 +98,7 @@ def filter_taxonomy(df: pd.DataFrame, rank: str, name: str) -> pd.DataFrame:
Provided `name` not found in `rank` column.
"""
# First clean the assigned taxa by broadcasting lowercase and replacing any whitespace with underscores
for canonical_rank in NCBI.CANONICAL_RANKS:
for canonical_rank in TaxonomyDatabase.CANONICAL_RANKS:
if canonical_rank not in df.columns:
continue
df[canonical_rank] = df[canonical_rank].map(
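The .map( call is cut off by the collapsed diff view; judging by the comment it lowercases each assigned taxon and replaces whitespace with underscores. A guess at the mapped function, on toy data (the column values are invented):

```python
import pandas as pd

df = pd.DataFrame({"genus": ["Escherichia", "Candidatus Foo", None]})

# Assumed body of the truncated .map(...): lowercase, whitespace -> underscores,
# passing non-string values (e.g. NaN/None) through unchanged.
df["genus"] = df["genus"].map(
    lambda name: name.lower().replace(" ", "_") if isinstance(name, str) else name
)
```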
@@ -395,7 +395,11 @@ def write_results(
outcols.extend(annotation_cols)
# Add in taxonomy columns if taxa are present
# superkingdom, phylum, class, order, family, genus, species
taxa_cols = [rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"]
taxa_cols = [
rank
for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS)
if rank != "root"
]
taxa_cols.append("taxid")
# superkingdom, phylum, class, order, family, genus, species, taxid
for taxa_col in taxa_cols:
50 changes: 45 additions & 5 deletions autometa/config/databases.py
@@ -10,6 +10,7 @@
import logging
import os
import requests
import sys
import subprocess
import tempfile

@@ -32,6 +33,7 @@
from autometa.config.utilities import DEFAULT_CONFIG
from autometa.config.utilities import AUTOMETA_DIR
from autometa.config.utilities import put_config, get_config
from autometa.taxonomy.gtdb import create_gtdb_db


logger = logging.getLogger(__name__)
@@ -401,6 +403,30 @@ def download_ncbi_files(self, options: Iterable) -> None:
if "nr" in options:
self.format_nr()

def download_gtdb_files(self) -> None:
proteins_aa_reps_url = self.config.get("database_urls", "proteins_aa_reps")
gtdb_taxdmp_url = self.config.get("database_urls", "gtdb_taxdmp")

# User path:
proteins_aa_reps_filepath = self.config.get("gtdb", "proteins_aa_reps")
gtdb_taxdmp_filepath = self.config.get("gtdb", "gtdb_taxdmp")

urls = [proteins_aa_reps_url, gtdb_taxdmp_url]
filepaths = [proteins_aa_reps_filepath, gtdb_taxdmp_filepath]

logger.debug(f"starting GTDB databases download")
for url, filepath in zip(urls, filepaths):
cmd = ["wget", url, "-O", filepath]
full_path = os.path.abspath(filepath)
dir_path = os.path.dirname(full_path)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
logger.debug(f"Created missing database directory: {dir_path}")
logger.debug(" ".join(cmd))
subprocess.run(
cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
)
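The loop above shells out to wget only after making sure the target directory exists. The directory handling on its own, as a dependency-free sketch (the function name is mine, not Autometa's):

```python
import os

def ensure_parent_dir(filepath: str) -> str:
    # Mirrors the diff: resolve the absolute path, then create the parent
    # directory if it is missing, so wget has somewhere to write.
    full_path = os.path.abspath(filepath)
    dir_path = os.path.dirname(full_path)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    return dir_path
```

os.makedirs(dir_path, exist_ok=True) would fold the existence check into a single call.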

def press_hmms(self) -> None:
"""hmmpress markers hmm database files.

@@ -714,7 +740,7 @@ def main():
)
parser.add_argument(
"--update-all",
help="Update all out-of-date databases.",
help="Update all out-of-date databases. (NOTE: Does not update GTDB)",
action="store_true",
default=False,
)
@@ -730,6 +756,12 @@
action="store_true",
default=False,
)
parser.add_argument(
"--update-gtdb",
help="Download and format the user-configured GTDB release databases",
action="store_true",
default=False,
)
parser.add_argument(
"--check-dependencies",
help="Check database dependencies are satisfied.",
@@ -771,6 +803,18 @@
section = "markers"
elif args.update_ncbi:
section = "ncbi"
elif args.update_gtdb:
# Download, generate and format GTDB amino acid database
gtdb_combined = create_gtdb_db(
reps_faa=dbs.config.get("gtdb", "proteins_aa_reps"),
dbdir=dbs.config.get("databases", "gtdb"),
)
diamond.makedatabase(
fasta=gtdb_combined,
database=gtdb_combined.replace(".faa", ".dmnd"),
cpus=args.cpus,
)
sys.exit(0)
else:
section = None

@@ -779,15 +823,11 @@
section=section, compare_checksums=compare_checksums
)
logger.info(f"Database dependencies satisfied: {dbs_satisfied}")
import sys

sys.exit(0)

config = dbs.configure(section=section, no_checksum=args.no_checksum)

if not args.out:
import sys

sys.exit(0)
put_config(config, args.out)
logger.info(f"{args.out} written.")
10 changes: 10 additions & 0 deletions autometa/config/default.config
@@ -50,6 +50,7 @@ bedtools = None
[databases]
base = ${common:home_dir}/autometa/databases
ncbi = ${databases:base}/ncbi
gtdb = ${databases:base}/gtdb
markers = ${databases:base}/markers

[database_urls]
@@ -60,6 +61,9 @@
bacteria_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/bacteria.single_copy.cutoffs
archaea_single_copy = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.hmm
archaea_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.cutoffs
proteins_aa_reps = https://${gtdb:host}/releases/${gtdb:release}/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz
gtdb_taxdmp = https://${markers:host}/shenwei356/gtdb-taxdump/releases/latest/download/gtdb-taxdump.tar.gz


[checksums]
taxdump = ftp://${ncbi:host}/pub/taxonomy/taxdump.tar.gz.md5
@@ -79,6 +83,12 @@ merged = ${databases:ncbi}/merged.dmp
accession2taxid = ${databases:ncbi}/prot.accession2taxid.gz
nr = ${databases:ncbi}/nr.gz

[gtdb]
host = data.gtdb.ecogenomic.org
release = latest
proteins_aa_reps = ${databases:gtdb}/gtdb_proteins_aa_reps.tar.gz
gtdb_taxdmp = ${databases:gtdb}/gtdb-taxdump.tar.gz
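The ${section:key} references throughout default.config rely on configparser's ExtendedInterpolation. A self-contained sketch of how the new gtdb paths resolve (values abbreviated from the diff; the base path is a placeholder):

```python
import configparser

cfg = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
cfg.read_string("""
[databases]
base = /home/user/autometa/databases
gtdb = ${databases:base}/gtdb

[gtdb]
host = data.gtdb.ecogenomic.org
release = latest
proteins_aa_reps = ${databases:gtdb}/gtdb_proteins_aa_reps.tar.gz
""")

# Interpolation resolves nested references at read time
resolved = cfg.get("gtdb", "proteins_aa_reps")
```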

[markers]
host = raw.githubusercontent.com
bacteria_single_copy = ${databases:markers}/bacteria.single_copy.hmm