diff --git a/.github/workflows/pytest_codecov.yml b/.github/workflows/pytest_codecov.yml index aeb86b122..bf35616e4 100644 --- a/.github/workflows/pytest_codecov.yml +++ b/.github/workflows/pytest_codecov.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: [3.7] + python-version: [3.8] env: OS: ${{ matrix.os }} PYTHON: ${{ matrix.python-version }} diff --git a/autometa-env.yml b/autometa-env.yml index cb614d817..d1b2b8d1d 100644 --- a/autometa-env.yml +++ b/autometa-env.yml @@ -12,6 +12,7 @@ dependencies: - gdown - hdbscan - hmmer + - joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809 - numba>=0.47 - numpy>=1.13 - pandas>=1.1 @@ -23,8 +24,9 @@ dependencies: - rsync - samtools>=1.11 - scikit-bio - - scipy==1.8 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285 + - scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285 - scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations + - seqkit - tqdm - trimap - tsne diff --git a/autometa/binning/large_data_mode.py b/autometa/binning/large_data_mode.py index c84f87c52..d6dc34b3b 100644 --- a/autometa/binning/large_data_mode.py +++ b/autometa/binning/large_data_mode.py @@ -39,7 +39,7 @@ from autometa.common import kmers from autometa.common.exceptions import TableFormatError, BinningError -from autometa.taxonomy.ncbi import NCBI +from autometa.taxonomy.database import TaxonomyDatabase from autometa.binning.recursive_dbscan import get_clusters from autometa.binning.utilities import ( write_results, @@ -306,17 +306,26 @@ def cluster_by_taxon_partitioning( Raises ------- TableFormatError - No marker information is availble for contigs to be binned. + No marker information is available for contigs to be binned. FileNotFoundError Provided `binning_checkpoints_fpath` does not exist """ + if binning_checkpoints_fpath and not os.path.exists(binning_checkpoints_fpath): + raise FileNotFoundError(binning_checkpoints_fpath) + if reverse_ranks: # species, genus, family, order, class, phylum, superkingdom - canonical_ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"] + canonical_ranks = [ + rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root" + ] else: # superkingdom, phylum, class, order, family, genus, species canonical_ranks = [ - rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root" + rank + for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS) + if rank != "root" ] # if stage is cached then we can first look to the cache before we begin subsetting main... 
clustered_contigs = set() @@ -326,15 +335,17 @@ def cluster_by_taxon_partitioning( starting_rank_name_txt = None # Retrieve appropriate starting canonical rank and rank_name_txt from cached binning checkpoints if cache was provided if cache: - if binning_checkpoints_fpath and not os.path.exists(binning_checkpoints_fpath): - raise FileNotFoundError(binning_checkpoints_fpath) + if not os.path.exists(cache): + os.makedirs(os.path.realpath(cache), exist_ok=True) + logger.debug(f"Created cache dir: {cache}") + if not os.path.isdir(cache): + raise NotADirectoryError(cache) if not binning_checkpoints_fpath: binning_checkpoints_fpath = os.path.join( cache, "binning_checkpoints.tsv.gz" ) - if os.path.exists(binning_checkpoints_fpath) and os.path.getsize( - binning_checkpoints_fpath - ): + if binning_checkpoints_fpath: + if os.path.exists(binning_checkpoints_fpath) and os.path.getsize(binning_checkpoints_fpath): checkpoint_info = get_checkpoint_info(binning_checkpoints_fpath) binning_checkpoints = checkpoint_info["binning_checkpoints"] starting_rank = checkpoint_info["starting_rank"] @@ -392,9 +403,9 @@ def cluster_by_taxon_partitioning( else None ) # Create canonical rank cache outdir if it does not exist - rank_cache_outdir = os.path.join(cache, canonical_rank) + rank_cache_outdir = os.path.join(cache, canonical_rank) if cache else None if embedding_cache_fpath and not os.path.isdir(rank_cache_outdir): - os.makedirs(rank_cache_outdir) + os.makedirs(rank_cache_outdir, exist_ok=True) rank_embedding = get_kmer_embedding( rank_counts, cache_fpath=embedding_cache_fpath, @@ -546,7 +557,7 @@ def cluster_by_taxon_partitioning( num_clusters += clustered.cluster.nunique() clusters.append(clustered) # Cache binning at rank_name_txt stage (rank-name-txt checkpointing) - if cache: + if binning_checkpoints_fpath: binning_checkpoints = checkpoint( checkpoints_df=binning_checkpoints, clustered=clustered, diff --git a/autometa/binning/recursive_dbscan.py b/autometa/binning/recursive_dbscan.py index beb36c077..713e08673 100644 --- a/autometa/binning/recursive_dbscan.py +++ b/autometa/binning/recursive_dbscan.py @@ -24,7 +24,7 @@ from autometa.common.markers import load as load_markers from autometa.common.exceptions import TableFormatError, BinningError -from autometa.taxonomy.ncbi import NCBI +from autometa.taxonomy.database import TaxonomyDatabase from autometa.binning.utilities import ( write_results, read_annotations, @@ -628,10 +628,14 @@ def taxon_guided_binning( logger.info(f"Using {method} clustering method") if reverse_ranks: # species, genus, family, order, class, phylum, superkingdom - ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"] + ranks = [rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root"] else: # superkingdom, phylum, class, order, family, genus, species - ranks = [rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"] + ranks = [ + rank + for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS) + if rank != "root" + ] starting_rank_index = ranks.index(starting_rank) ranks = ranks[starting_rank_index:] logger.debug(f"Using ranks: {', '.join(ranks)}") diff --git a/autometa/binning/summary.py b/autometa/binning/summary.py index 33c25351e..3dff01d46 100644 --- a/autometa/binning/summary.py +++ b/autometa/binning/summary.py @@ -17,7 +17,9 @@ from Bio import SeqIO +from autometa.taxonomy.database import TaxonomyDatabase from autometa.taxonomy.ncbi import NCBI +from autometa.taxonomy.gtdb import GTDB from autometa.taxonomy import majority_vote from 
autometa.common.markers import load as load_markers @@ -226,7 +228,7 @@ def get_metabin_stats( def get_metabin_taxonomies( - bin_df: pd.DataFrame, ncbi: NCBI, cluster_col: str = "cluster" + bin_df: pd.DataFrame, taxa_db: TaxonomyDatabase, cluster_col: str = "cluster" ) -> pd.DataFrame: """Retrieve taxonomies of all clusters recovered from Autometa binning. @@ -234,8 +236,8 @@ ---------- bin_df : pd.DataFrame Autometa binning table. index=contig, cols=['cluster','length','taxid', *canonical_ranks] - ncbi : autometa.taxonomy.ncbi.NCBI instance - Autometa NCBI class instance + taxa_db : autometa.taxonomy.database.TaxonomyDatabase instance + Autometa NCBI or GTDB class instance cluster_col : str, optional Clustering column by which to group metabins @@ -246,7 +248,9 @@ Returns ------- pd.DataFrame Dataframe consisting of cluster taxonomy with taxid and canonical rank. Indexed by cluster """ logger.info(f"Retrieving metabin taxonomies for {cluster_col}") - canonical_ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"] + canonical_ranks = [ + rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root" + ] is_clustered = bin_df[cluster_col].notnull() bin_df = bin_df[is_clustered] outcols = [cluster_col, "length", "taxid", *canonical_ranks] @@ -277,11 +281,13 @@ taxonomies[cluster][canonical_rank].update({taxid: length}) else: taxonomies[cluster][canonical_rank][taxid] += length - cluster_taxonomies = majority_vote.rank_taxids(taxonomies, ncbi) + cluster_taxonomies = majority_vote.rank_taxids(taxonomies, taxa_db=taxa_db) # With our cluster taxonomies, let's place these into a dataframe for easy data accession cluster_taxa_df = pd.Series(data=cluster_taxonomies, name="taxid").to_frame() # With the list of taxids, we'll retrieve their complete canonical-rank information - lineage_df = ncbi.get_lineage_dataframe(cluster_taxa_df.taxid.tolist(), fillna=True) + lineage_df = taxa_db.get_lineage_dataframe( + cluster_taxa_df.taxid.tolist(), fillna=True + ) # Now put it all together cluster_taxa_df = pd.merge( cluster_taxa_df, lineage_df, how="left", left_on="taxid", right_index=True @@ -323,11 +329,18 @@ def main(): required=True, ) parser.add_argument( - "--ncbi", - help="Path to user NCBI databases directory (Required for retrieving metabin taxonomies)", + "--dbdir", + help="Path to user taxonomy database directory (Required for retrieving metabin taxonomies)", metavar="dirpath", required=False, ) + parser.add_argument( + "--dbtype", + help="Taxonomy database type to use (NOTE: must correspond to the same database type used during contig taxon assignment.)", + choices=["ncbi", "gtdb"], + required=False, + default="ncbi", + ) parser.add_argument( "--binning-column", help="Binning column to use for grouping metabins", @@ -377,14 +390,17 @@ def main(): logger.info(f"Wrote metabin stats to {args.output_stats}") # Finally if taxonomy information is available then write out each metabin's taxonomy by modified majority voting method. if "taxid" in bin_df.columns: - if not args.ncbi: + if not args.dbdir: logger.warn( - "taxid found in dataframe. --ncbi argument is required to retrieve metabin taxonomies. Skipping..." + "taxid found in dataframe. --dbdir argument is required to retrieve metabin taxonomies. Skipping..." 
) else: - ncbi = NCBI(dirpath=args.ncbi) + if args.dbtype == "ncbi": + taxa_db = NCBI(dbdir=args.dbdir) + elif args.dbtype == "gtdb": + taxa_db = GTDB(dbdir=args.dbdir) taxa_df = get_metabin_taxonomies( - bin_df=bin_df, ncbi=ncbi, cluster_col=args.binning_column + bin_df=bin_df, taxa_db=taxa_db, cluster_col=args.binning_column ) taxa_df.to_csv(args.output_taxonomy, sep="\t", index=True, header=True) diff --git a/autometa/binning/utilities.py b/autometa/binning/utilities.py index dadf4002d..389eabc5d 100644 --- a/autometa/binning/utilities.py +++ b/autometa/binning/utilities.py @@ -33,7 +33,7 @@ from typing import Iterable, Tuple -from autometa.taxonomy.ncbi import NCBI +from autometa.taxonomy.database import TaxonomyDatabase logger = logging.getLogger(__name__) @@ -98,7 +98,7 @@ def filter_taxonomy(df: pd.DataFrame, rank: str, name: str) -> pd.DataFrame: Provided `name` not found in `rank` column. """ # First clean the assigned taxa by broadcasting lowercase and replacing any whitespace with underscores - for canonical_rank in NCBI.CANONICAL_RANKS: + for canonical_rank in TaxonomyDatabase.CANONICAL_RANKS: if canonical_rank not in df.columns: continue df[canonical_rank] = df[canonical_rank].map( @@ -395,7 +395,11 @@ def write_results( outcols.extend(annotation_cols) # Add in taxonomy columns if taxa are present # superkingdom, phylum, class, order, family, genus, species - taxa_cols = [rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"] + taxa_cols = [ + rank + for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS) + if rank != "root" + ] taxa_cols.append("taxid") # superkingdom, phylum, class, order, family, genus, species, taxid for taxa_col in taxa_cols: diff --git a/autometa/common/utilities.py b/autometa/common/utilities.py index 4d9e45e0d..546866fc3 100644 --- a/autometa/common/utilities.py +++ b/autometa/common/utilities.py @@ -27,6 +27,25 @@ logger = logging.getLogger(__name__) +def is_gz_file(filepath) -> bool: + """ + Check if the given file is gzipped compressed or not. + + Parameters + ---------- + filepath : str + Filepath to check + + Returns + ------- + bool + True if file is gzipped else False + """ + # https://stackoverflow.com/a/47080739 + with open(filepath, "rb") as test_f: + return test_f.read(2) == b"\x1f\x8b" + + def unpickle(fpath: str) -> Any: """Load a serialized `fpath` from :func:`make_pickle`. 
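The is_gz_file helper added to autometa/common/utilities.py above checks the two-byte gzip magic number (\x1f\x8b) so callers can open a file in text mode whether or not it is compressed. A minimal usage sketch (not part of the patch; the file path is a hypothetical placeholder), mirroring the open pattern used by GTDB.search_genome_accessions further below:

    import gzip

    from autometa.common.utilities import is_gz_file

    fpath = "taxid.map"  # hypothetical path; may or may not be gzip-compressed
    # "rt" keeps the gzip branch in text mode so both branches yield str lines
    fh = gzip.open(fpath, "rt") if is_gz_file(fpath) else open(fpath)
    n_lines = sum(1 for _ in fh)
    fh.close()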
diff --git a/autometa/config/databases.py b/autometa/config/databases.py index 83a8aebdb..881afb920 100644 --- a/autometa/config/databases.py +++ b/autometa/config/databases.py @@ -10,6 +10,7 @@ import logging import os import requests +import sys import subprocess import tempfile @@ -32,6 +33,7 @@ from autometa.config.utilities import DEFAULT_CONFIG from autometa.config.utilities import AUTOMETA_DIR from autometa.config.utilities import put_config, get_config +from autometa.taxonomy.gtdb import create_gtdb_db logger = logging.getLogger(__name__) @@ -401,6 +403,30 @@ def download_ncbi_files(self, options: Iterable) -> None: if "nr" in options: self.format_nr() + def download_gtdb_files(self) -> None: + gtdb_taxdump_url = self.config.get("database_urls", "gtdb_taxdmp") + proteins_aa_reps_url = self.config.get("database_urls", "proteins_aa_reps") + + # User path: + gtdb_taxdump_filepath = self.config.get("gtdb", "gtdb_taxdmp") + proteins_aa_reps_filepath = self.config.get("gtdb", "proteins_aa_reps") + + urls = [gtdb_taxdump_url, proteins_aa_reps_url] + filepaths = [gtdb_taxdump_filepath, proteins_aa_reps_filepath] + + logger.debug(f"starting GTDB databases download") + for url, filepath in zip(urls, filepaths): + cmd = ["wget", url, "-O", filepath] + full_path = os.path.abspath(filepath) + dir_path = os.path.dirname(full_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + logger.debug(f"Created missing database directory: {dir_path}") + logger.debug(" ".join(cmd)) + subprocess.run( + cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True + ) + def press_hmms(self) -> None: """hmmpress markers hmm database files. @@ -524,7 +550,7 @@ def download_missing(self, section: str = None) -> None: ---------- section : str, optional Section to check for missing database files (the default is None). - Choices include 'ncbi' and 'markers'. + Choices include 'ncbi', and 'markers'. Returns ------- @@ -673,7 +699,7 @@ def configure(self, section: str = None, no_checksum: bool = False) -> ConfigPar Raises ------- - ValueError Provided `section` does not match 'ncbi' or 'markers'. + ValueError Provided `section` does not match 'ncbi', or 'markers'. ConnectionError A connection issue occurred when connecting to NCBI or GitHub. @@ -714,7 +740,7 @@ def main(): ) parser.add_argument( "--update-all", - help="Update all out-of-date databases.", + help="Update all out-of-date databases. 
(NOTE: Does not update GTDB)", action="store_true", default=False, ) @@ -730,6 +756,12 @@ def main(): action="store_true", default=False, ) + parser.add_argument( + "--update-gtdb", + help="Download and format the user-configured GTDB release databases", + action="store_true", + default=False, + ) parser.add_argument( "--check-dependencies", help="Check database dependencies are satisfied.", @@ -771,6 +803,23 @@ def main(): section = "markers" elif args.update_ncbi: section = "ncbi" + elif args.update_gtdb: + if not os.path.exists( + dbs.config.get("gtdb", "proteins_aa_reps") + ) and not os.path.exists(dbs.config.get("gtdb", "gtdb_taxdmp")): + logger.info(f"GTDB database downloading: ") + dbs.download_gtdb_files() + # Format GTDB amino acid database + gtdb_combined = create_gtdb_db( + reps_faa=dbs.config.get("gtdb", "proteins_aa_reps"), + dbdir=dbs.config.get("databases", "gtdb"), + ) + diamond.makedatabase( + fasta=gtdb_combined, + database=gtdb_combined.replace(".faa", ".dmnd"), + cpus=args.nproc, + ) + sys.exit(0) else: section = None @@ -779,15 +828,11 @@ def main(): section=section, compare_checksums=compare_checksums ) logger.info(f"Database dependencies satisfied: {dbs_satisfied}") - import sys - sys.exit(0) config = dbs.configure(section=section, no_checksum=args.no_checksum) if not args.out: - import sys - sys.exit(0) put_config(config, args.out) logger.info(f"{args.out} written.") diff --git a/autometa/config/default.config b/autometa/config/default.config index a5db14477..e16327b2a 100644 --- a/autometa/config/default.config +++ b/autometa/config/default.config @@ -50,6 +50,7 @@ bedtools = None [databases] base = ${common:home_dir}/autometa/databases ncbi = ${databases:base}/ncbi +gtdb = ${databases:base}/gtdb markers = ${databases:base}/markers [database_urls] @@ -60,6 +61,9 @@ bacteria_single_copy = https://${markers:host}/KwanLab/Autometa/main/autometa/da bacteria_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/bacteria.single_copy.cutoffs archaea_single_copy = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.hmm archaea_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.cutoffs +proteins_aa_reps = https://${gtdb:host}/releases/${gtdb:release}/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz +gtdb_taxdmp = https://github.com/shenwei356/gtdb-taxdump/releases/latest/download/gtdb-taxdump.tar.gz + [checksums] taxdump = ftp://${ncbi:host}/pub/taxonomy/taxdump.tar.gz.md5 @@ -79,6 +83,12 @@ merged = ${databases:ncbi}/merged.dmp accession2taxid = ${databases:ncbi}/prot.accession2taxid.gz nr = ${databases:ncbi}/nr.gz +[gtdb] +host = data.gtdb.ecogenomic.org +release = latest +proteins_aa_reps = ${databases:gtdb}/gtdb_proteins_aa_reps.tar.gz +gtdb_taxdmp = ${databases:gtdb}/gtdb-taxdump.tar.gz + [markers] host = raw.githubusercontent.com bacteria_single_copy = ${databases:markers}/bacteria.single_copy.hmm diff --git a/autometa/config/utilities.py b/autometa/config/utilities.py index 57607a0fc..bd876b1e6 100644 --- a/autometa/config/utilities.py +++ b/autometa/config/utilities.py @@ -216,7 +216,7 @@ def main(): update_group.add_argument( "--section", help="config section to update", - choices=["environ", "databases", "ncbi", "markers"], + choices=["environ", "databases", "ncbi", "markers", "gtdb"], required=False, ) update_group.add_argument( diff --git a/autometa/taxonomy/database.py b/autometa/taxonomy/database.py new file mode 
100644 index 000000000..b3d1a1b56 --- /dev/null +++ b/autometa/taxonomy/database.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python + +import logging +import string + +from abc import ABC, abstractmethod +from typing import Dict, Set, Tuple, List, Union, Iterable +from autometa.common.exceptions import DatabaseOutOfSyncError + +import pandas as pd + + +logger = logging.getLogger(__name__) + + +class TaxonomyDatabase(ABC): + + """ + TaxonomyDatabase Abstract Base Class + + Abstract methods + + 1. parse_nodes(self) + 2. parse_names(self) + 3. parse_merged(self) + 4. parse_delnodes(self) + 5. convert_accessions_to_taxids(self) + + e.g. + + class GTDB(TaxonomyDatabase): + def __init__(self, ...): + self.nodes = self.parse_nodes() + self.names = self.parse_names() + self.merged = self.parse_merged() + self.delnodes = self.parse_delnodes() + ... + def parse_nodes(self): + ... + def parse_names(self): + ... + def parse_merged(self): + ... + def parse_delnodes(self): + ... + def convert_accessions_to_taxids(self, accessions): + ... + + Available methods (after aforementioned implementations): + + 1. convert_taxid_dtype + 2. name + 3. rank + 4. parent + 5. lineage + 6. is_common_ancestor + 7. get_lineage_dataframe + + Available attributes: + + CANONICAL_RANKS + """ + + CANONICAL_RANKS = [ + "species", + "genus", + "family", + "order", + "class", + "phylum", + "superkingdom", + "root", + ] + + @abstractmethod + def parse_nodes(self) -> Dict[int, Dict[str, Union[str, int]]]: + """ + Parse the `nodes.dmp` database and set to self.nodes. + + Returns + ------- + dict + {child_taxid:{'parent':parent_taxid,'rank':rank}, ...} + """ + + @abstractmethod + def parse_names(self) -> Dict[int, str]: + """ + Parses through the names.dmp database and loads taxids with their scientific names + + Returns + ------- + dict + {taxid: name, ...} + + """ + + @abstractmethod + def parse_merged(self) -> Dict[int, int]: + """ + Parses merged.dmp such that merged `taxid`s may be updated with their up-to-date `taxid`s + + Returns + ------- + dict + {old_taxid: new_taxid, ...} + """ + + @abstractmethod + def parse_delnodes(self) -> Set[int]: + """ + Parses delnodes.dmp such that deleted `taxid`s may be updated with their up-to-date `taxid`s + + Returns + ------- + set + {taxid, ...} + """ + + @abstractmethod + def convert_accessions_to_taxids( + self, + accessions: Dict[str, Set[str]], + ) -> Tuple[Dict[str, Set[int]], pd.DataFrame]: + """ + Translates subject sequence ids to taxids + + Parameters + ---------- + accessions : dict + {qseqid: {sseqid, ...}, ...} + + Returns + ------- + Tuple[Dict[str, Set[int]], pd.DataFrame] + {qseqid: {taxid, taxid, ...}, ...}, index=range, cols=[qseqid, sseqid, raw_taxid, ..., cleaned_taxid] + + """ + + def convert_taxid_dtype(self, taxid: int) -> int: + """ + 1. Converts the given `taxid` to an integer and checks whether it is positive. + 2. Checks whether `taxid` is present in both nodes.dmp and names.dmp. + 3a. If (2) is false, will check for corresponding `taxid` in merged.dmp and convert to this then redo (2). + 3b. If (2) is true, will return converted taxid. + 4. 
If (3a) is false will look for `taxid` in delnodes.dmp. If present will convert to root (taxid=1) + + Parameters + ---------- + taxid : int + identifier for a taxon in NCBI taxonomy databases - nodes.dmp, names.dmp or merged.dmp + + Returns + ------- + int + `taxid` if the `taxid` is a positive integer and present in either nodes.dmp or names.dmp or + taxid recovered from merged.dmp + + Raises + ------ + ValueError + Provided `taxid` is not a positive integer + DatabaseOutOfSyncError + NCBI databases nodes.dmp, names.dmp and merged.dmp are out of sync with each other + """ + # Step 1 Converts the given `taxid` to an integer and checks whether it is positive. + # Checking taxid instance format + # This checks if an integer has been added as str, eg. "562" + if isinstance(taxid, str): + invalid_chars = set(string.punctuation + string.ascii_letters) + invalid_chars.discard(".") + if set(taxid).intersection(invalid_chars) or taxid.count(".") > 1: + raise ValueError(f"taxid contains invalid character(s)! Given: {taxid}") + taxid = float(taxid) + # a boolean check is needed as they will evaluate silently to 0 or 1 when cast as ints. FALSE=0, TRUE=1 + # float(taxid).is_integer() checks if it is something like 12.0 vs. 12.9 + # is_integer only takes float as input else raises error, thus isinstance( ,float) is used before it to make sure a float is being passed + if isinstance(taxid, bool) or ( + isinstance(taxid, float) and not taxid.is_integer() + ): + raise ValueError(f"taxid must be an integer! Given: {taxid}") + taxid = int(taxid) + if taxid <= 0: + raise ValueError(f"Taxid must be a positive integer! Given: {taxid}") + # Checking databases + # Step 2: Check whether taxid is present in both nodes.dmp and names.dmp. + if taxid not in self.names and taxid not in self.nodes: + # Step 3a. Check for corresponding taxid in merged.dmp + if taxid not in self.merged: + # Step 4: look for taxid in delnodes.dmp. If present will convert to root (taxid=1) + if taxid in self.delnodes: + # Assign deleted taxid to root... + if self.verbose: + logger.debug( + f"Found {taxid} in delnodes.dmp, converting to root (taxid=1)" + ) + taxid = 1 + else: + err_message = f"Databases out of sync. {taxid} not found in nodes.dmp, names.dmp, merged.dmp or delnodes.dmp" + raise DatabaseOutOfSyncError(err_message) + else: + # Step 3b. convert taxid from merged. + if self.verbose: + logger.debug( + f"Converted {taxid} to {self.merged[taxid]} from merged.dmp" + ) + taxid = self.merged[taxid] + if taxid not in self.names and taxid not in self.nodes: + # NOTE: Do not check delnodes.dmp here, as at this point it appears the databases are indeed out of sync. + # i.e. The taxid should either be in merged.dmp or in delnodes.dmp if not in nodes.dmp or names.dmp + err_message = f"Databases out of sync. Merged taxid ({taxid}) not found in nodes.dmp or names.dmp!" + raise DatabaseOutOfSyncError(err_message) + return taxid + + def name(self, taxid: int, rank: str = None) -> str: + """ + Parses through the names.dmp in search of the given `taxid` and returns its name. + + Parameters + ---------- + taxid : int + `taxid` whose name is being returned + rank : str, optional + If provided, will return `taxid` name at `rank`, by default None + Must be a canonical rank, choices: species, genus, family, order, class, phylum, superkingdom + Eg. 
self.name(562, 'genus') would return 'Escherichia', where 562 is the taxid for Escherichia coli + + Returns + ------- + str + Name of provided `taxid` if `taxid` is found in names.dmp else 'unclassified' + + """ + try: + taxid = self.convert_taxid_dtype(taxid) + except DatabaseOutOfSyncError as err: + logger.warning(err) + taxid = 0 + if not rank: + return self.names.get(taxid, "unclassified") + if rank not in set(TaxonomyDatabase.CANONICAL_RANKS): + logger.warning(f"{rank} not in canonical ranks!") + return "unclassified" + ancestor_taxid = taxid + while ancestor_taxid != 1: + ancestor_rank = self.rank(ancestor_taxid) + if ancestor_rank == rank: + return self.names.get(ancestor_taxid, "unclassified") + ancestor_taxid = self.parent(ancestor_taxid) + # At this point we have not encountered a name for the taxid rank + # so we will place this as unclassified. + return "unclassified" + + def rank(self, taxid: int) -> str: + """ + Return the respective rank of provided `taxid`. + + Parameters + ---------- + taxid : int + `taxid` to retrieve rank from nodes + + Returns + ------- + str + rank name if taxid is found in nodes else "unclassified" + + """ + try: + taxid = self.convert_taxid_dtype(taxid) + except DatabaseOutOfSyncError as err: + logger.warning(err) + taxid = 0 + return self.nodes.get(taxid, {"rank": "unclassified"}).get("rank") + + def parent(self, taxid: int) -> int: + """ + Retrieve the parent taxid of provided `taxid`. + + Parameters + ---------- + taxid : int + child taxid to retrieve parent + + Returns + ------- + int + Parent taxid if found in nodes otherwise 1 + + """ + try: + taxid = self.convert_taxid_dtype(taxid) + except DatabaseOutOfSyncError as err: + logger.warning(err) + taxid = 0 + return self.nodes.get(taxid, {"parent": 1}).get("parent") + + def lineage( + self, taxid: int, canonical: bool = True + ) -> List[Dict[str, Union[str, int]]]: + """ + Returns the lineage of `taxids` encountered when traversing to root + + Parameters + ---------- + taxid : int + `taxid` in nodes.dmp, whose lineage is being returned + canonical : bool, optional + Lineage includes both canonical and non-canonical ranks when False, and only the canonical ranks when True + Canonical ranks include : species, genus , family, order, class, phylum, superkingdom, root + + Returns + ------- + ordered list of dicts + [{'taxid':taxid, 'rank':rank,'name':name}, ...] 
+ """ + lineage = [] + while taxid != 1: + if canonical and self.rank(taxid) not in TaxonomyDatabase.CANONICAL_RANKS: + taxid = self.parent(taxid) + continue + lineage.append( + {"taxid": taxid, "name": self.name(taxid), "rank": self.rank(taxid)} + ) + taxid = self.parent(taxid) + return lineage + + def is_common_ancestor(self, taxid_A: int, taxid_B: int) -> bool: + """ + Determines whether the provided taxids have a non-root common ancestor + + Parameters + ---------- + taxid_A : int + taxid in taxonomy database + taxid_B : int + taxid in taxonomy database + + Returns + ------- + boolean + True if taxids share a common ancestor else False + """ + lineage_a_taxids = {ancestor.get("taxid") for ancestor in self.lineage(taxid_A)} + lineage_b_taxids = {ancestor.get("taxid") for ancestor in self.lineage(taxid_B)} + common_ancestor = lineage_b_taxids.intersection(lineage_a_taxids) + common_ancestor.discard(1) # This discards root + return True if common_ancestor else False + + def get_lineage_dataframe( + self, taxids: Iterable, fillna: bool = True + ) -> pd.DataFrame: + """ + Given an iterable of taxids generate a pandas DataFrame of their canonical + lineages + + Parameters + ---------- + taxids : iterable + `taxids` whose lineage dataframe is being returned + fillna : bool, optional + Whether to fill the empty cells with 'unclassified' or not, default True + + Returns + ------- + pd.DataFrame + index = taxid + columns = [superkingdom,phylum,class,order,family,genus,species] + + Example + ------- + + If you would like to merge the returned DataFrame ('this_df') with another + DataFrame ('your_df'). Let's say where you retrieved your taxids: + + .. code-block:: python + + merged_df = pd.merge( + left=your_df, + right=this_df, + how='left', + left_on=, + right_index=True) + """ + canonical_ranks = [ + rank + for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS) + if rank != "root" + ] + taxids = list(set(taxids)) + ranked_taxids = {} + for rank in canonical_ranks: + for taxid in taxids: + name = self.name(taxid, rank=rank) + if taxid not in ranked_taxids: + ranked_taxids.update({taxid: {rank: name}}) + else: + ranked_taxids[taxid].update({rank: name}) + df = pd.DataFrame(ranked_taxids).transpose() + df.index.name = "taxid" + if fillna: + df.fillna(value="unclassified", inplace=True) + return df diff --git a/autometa/taxonomy/gtdb.py b/autometa/taxonomy/gtdb.py new file mode 100644 index 000000000..d81862da8 --- /dev/null +++ b/autometa/taxonomy/gtdb.py @@ -0,0 +1,365 @@ +""" +# License: GNU Affero General Public License v3 or later +# A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt. + +File containing definition of the GTDB class and containing functions useful for handling GTDB taxonomy databases +""" + +import gzip +import logging +import os +import re +import string +import tarfile +import glob +from pathlib import Path + +from typing import Dict, List, Set, Tuple +from itertools import chain +from tqdm import tqdm +from typing import Dict + +import pandas as pd +import multiprocessing as mp + +from autometa.common.utilities import file_length, is_gz_file +from autometa.common.external import diamond +from autometa.taxonomy.database import TaxonomyDatabase +from autometa.common.exceptions import DatabaseOutOfSyncError + + +logger = logging.getLogger(__name__) + + +def create_gtdb_db(reps_faa: str, dbdir: str) -> str: + """ + Generate a combined faa file to create the GTDB-t database. 
+ + Parameters + ---------- + reps_faa : str + Directory containing faa files of all representative genomes. Can be tarballed. + dbdir : str + Path to output directory. + + Returns + ------- + str + Path to combined faa file. This can be used to make a diamond database. + """ + + if reps_faa.endswith(".tar.gz"): + logger.debug( + f"Extracting tarball containing GTDB ref genome amino acid data sequences to: {dbdir}/protein_faa_reps" + ) + tar = tarfile.open(reps_faa) + tar.extractall(path=dbdir) + tar.close() + logger.debug("Extraction done.") + reps_faa = dbdir + + genome_protein_faa_filepaths = glob.glob( + os.path.join(reps_faa, "**", "*_protein.faa"), recursive=True + ) + + faa_index: Dict[str, str] = {} + for genome_protein_faa_filepath in genome_protein_faa_filepaths: + # Regex to get the genome accession from the file path + genome_acc_search = re.search( + r"GCA_\d+.\d?|GCF_\d+.\d?", genome_protein_faa_filepath + ) + if genome_acc_search: + faa_index[genome_protein_faa_filepath] = genome_acc_search.group() + + # Create dbdir if it doesn't exist + if not os.path.isdir(dbdir): + os.makedirs(dbdir) + + logger.debug(f"Merging {len(faa_index):,} faa files.") + combined_faa = os.path.join(dbdir, "gtdb.faa") + with open(combined_faa, "w") as f_out: + for faa_file, acc in faa_index.items(): + with open(faa_file) as f_in: + for line in f_in: + if line.startswith(">"): + seqheader = line.lstrip(">") + line = f"\n>{acc} {seqheader}" + f_out.write(line) + logger.debug(f"Combined GTDB faa file written to {combined_faa}") + return combined_faa + + +class GTDB(TaxonomyDatabase): + """Taxonomy utilities for GTDB databases.""" + + def __init__(self, dbdir: str, verbose: bool = True): + """ + Instantiates the GTDB class + + """ + self.dbdir = dbdir + self.verbose = verbose + self.disable = not self.verbose + self.dmnd_db = os.path.join(self.dbdir, "gtdb.dmnd") + self.accession2taxid = os.path.join(self.dbdir, "taxid.map") + self.nodes_fpath = os.path.join(self.dbdir, "nodes.dmp") + self.names_fpath = os.path.join(self.dbdir, "names.dmp") + self.merged_fpath = os.path.join(self.dbdir, "merged.dmp") + self.delnodes_fpath = os.path.join(self.dbdir, "delnodes.dmp") + self.verify_databases() + self.names = self.parse_names() + self.nodes = self.parse_nodes() + self.merged = self.parse_merged() + self.delnodes = self.parse_delnodes() + + def verify_databases(self): + """ + Verify if the required databases are present. + + Raises + ------ + FileNotFoundError + One or more of the required databases were not found. + """ + for filepath in [ + self.names_fpath, + self.nodes_fpath, + self.dmnd_db, + self.accession2taxid, + ]: + if not os.path.exists(filepath): + raise FileNotFoundError(filepath) + + def __repr__(self): + """ + Operator overloading to return the string representation of the class object + + Returns + ------- + str + String representation of the class object + """ + return str(self) + + def __str__(self): + """ + Operator overloading to return the directory path of the class object + + Returns + ------- + str + Directory path of the class object + """ + # Perhaps should place summary here of files that do or do not exist? 
+ return self.dbdir + + def search_genome_accessions(self, accessions: set) -> Dict[str, int]: + """ + Search taxid.map file + + Parameters + ---------- + accessions : set + Set of subject sequence ids retrieved from diamond blastp search (sseqids) + + Returns + ------- + Dict[str, int] + Dictionary containing sseqids converted to taxids + """ + sseqids_to_taxids = {} + # "rt" open the database in text mode instead of binary to be handled like a text file + fh = ( + gzip.open(self.accession2taxid, "rt") + if is_gz_file(self.accession2taxid) + else open(self.accession2taxid) + ) + # skip the header line + __ = fh.readline() + n_lines = file_length(self.accession2taxid) + converted_sseqid_count = 0 + logger.debug(f"parsing {self.accession2taxid}...") + for line in fh: + acc_ver, taxid = line.strip().split("\t") + taxid = int(taxid) + if acc_ver in accessions: + sseqids_to_taxids[acc_ver] = taxid + converted_sseqid_count += 1 + fh.close() + logger.debug(f"sseqids converted: {converted_sseqid_count:,}") + return sseqids_to_taxids + + def parse_nodes(self) -> Dict[int, str]: + """ + Parse the `nodes.dmp` database. + Note: This is performed when a new GTDB class instance is constructed + + Returns + ------- + dict + {child_taxid:{'parent':parent_taxid,'rank':rank}, ...} + """ + fh = open(self.nodes_fpath) + __ = fh.readline() # root line + nodes = {1: {"parent": 1, "rank": "root"}} + for line in tqdm(fh, desc="parsing nodes", leave=False): + child, parent, rank = line.split("\t|\t")[:3] + parent, child = [int(node) for node in [parent, child]] + rank = rank.lower() + nodes.update({child: {"parent": parent, "rank": rank}}) + fh.close() + return nodes + + def parse_names(self) -> Dict[int, str]: + """ + Parses through names.dmp database and loads taxids with scientific names + + Returns + ------- + dict + {taxid:name, ...} + """ + names = {} + fh = open(self.names_fpath) + for line in tqdm(fh, desc="parsing names", leave=False): + taxid, name, __, classification = line.strip("\t|\n").split("\t|\t")[:4] + taxid = int(taxid) + name = name.lower() + # Only add scientific name entries + if classification == "scientific name": + names.update({taxid: name}) + fh.close() + return names + + def parse_merged(self) -> Dict[int, int]: + """ + Parse the merged.dmp database + + Returns + ------- + dict + {old_taxid: new_taxid, ...} + """ + if self.verbose: + logger.debug(f"Processing nodes from {self.merged_fpath}") + fh = open(self.merged_fpath) + merged = {} + for line in tqdm(fh, disable=self.disable, desc="parsing merged", leave=False): + old_taxid, new_taxid = [ + int(taxid) for taxid in line.strip("\t|\n").split("\t|\t") + ] + merged.update({old_taxid: new_taxid}) + fh.close() + if self.verbose: + logger.debug("merged loaded") + return merged + + def parse_delnodes(self) -> Set[int]: + """ + Parse the delnodes.dmp database + + Returns + ------- + set + {taxid, ...} + """ + if self.verbose: + logger.debug(f"Processing delnodes from {self.delnodes_fpath}") + fh = open(self.delnodes_fpath) + delnodes = set() + for line in tqdm( + fh, disable=self.disable, desc="parsing delnodes", leave=False + ): + del_taxid = int(line.strip("\t|\n")) + delnodes.add(del_taxid) + fh.close() + if self.verbose: + logger.debug("delnodes loaded") + return delnodes + + def convert_accessions_to_taxids( + self, accessions: Dict[str, Set[str]] + ) -> Tuple[Dict[str, Set[int]], pd.DataFrame]: + """ + Translates subject sequence ids to taxids + + + Parameters + ---------- + accessions : dict + {qseqid: {sseqid, ...}, ...} + + Returns 
+ ------- + Tuple[Dict[str, Set[int]], pd.DataFrame] + {qseqid: {taxid, taxid, ...}, ...}, index=range, cols=[qseqid, sseqid, raw_taxid, ..., cleaned_taxid] + + """ + recovered_accessions = set( + chain.from_iterable( + [qseqid_sseqids for qseqid_sseqids in accessions.values()] + ) + ) + sseqids_to_taxids = self.search_genome_accessions(recovered_accessions) + sseqid_not_found = pd.NA + accession_to_taxid_df = pd.DataFrame( + [ + { + "qseqid": qseqid, + "sseqid": sseqid, + "raw_taxid": sseqids_to_taxids.get(sseqid, sseqid_not_found), + } + for qseqid, qseqid_sseqids in accessions.items() + for sseqid in qseqid_sseqids + ] + ) + + taxids = {} + root_taxid = 1 + for qseqid, qseqid_sseqids in accessions.items(): + qseqid_taxids = { + sseqids_to_taxids.get(sseqid, root_taxid) for sseqid in qseqid_sseqids + } + taxids[qseqid] = qseqid_taxids + return taxids, accession_to_taxid_df + + +def main(): + import argparse + import logging as logger + + logger.basicConfig( + format="[%(asctime)s %(levelname)s] %(name)s: %(message)s", + datefmt="%m/%d/%Y %I:%M:%S %p", + level=logger.DEBUG, + ) + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--reps-faa", + help="Path to directory containing GTDB ref genome amino acid data sequences. Can be tarballed.", + required=True, + ) + parser.add_argument( + "--dbdir", help="Path to output GTDB database directory", required=True + ) + parser.add_argument( + "--cpus", + help="Number of cpus to use for diamond-formatting GTDB database", + default=mp.cpu_count(), + ) + + args = parser.parse_args() + + gtdb_combined = create_gtdb_db(reps_faa=args.reps_faa, dbdir=args.dbdir) + diamond.makedatabase( + fasta=gtdb_combined, + database=gtdb_combined.replace(".faa", ".dmnd"), + cpus=args.cpus, + ) + + +if __name__ == "__main__": + # python -m autometa.taxonomy.gtdb --dbdir + main() diff --git a/autometa/taxonomy/lca.py b/autometa/taxonomy/lca.py index 2614c2972..654c6b3a4 100644 --- a/autometa/taxonomy/lca.py +++ b/autometa/taxonomy/lca.py @@ -16,7 +16,6 @@ import os from typing import Dict, Set, Tuple -from itertools import chain import pandas as pd import numpy as np @@ -26,6 +25,8 @@ from autometa.config.environ import get_versions from autometa.taxonomy.ncbi import NCBI, NCBI_DIR +from autometa.taxonomy.database import TaxonomyDatabase +from autometa.taxonomy.gtdb import GTDB from autometa.common.utilities import make_pickle, unpickle, file_length from autometa.common.external import diamond from autometa.common.external import prodigal @@ -34,7 +35,7 @@ logger = logging.getLogger(__name__) -class LCA(NCBI): +class LCA: """LCA class containing methods to retrieve the Lowest Common Ancestor. LCAs may be computed given taxids, a fasta or BLAST results. 
@@ -74,10 +75,14 @@ class LCA(NCBI): """ - def __init__(self, dbdir: str, verbose: bool = False, cache: str = ""): - super().__init__(dbdir, verbose=verbose) + def __init__( + self, + taxonomy_db: TaxonomyDatabase, + verbose: bool = False, + cache: str = "", + ): + self.taxonomy_db = taxonomy_db self.verbose = verbose - self.dbdir = dbdir self.cache = cache self.disable = False if verbose else True if self.cache == None: @@ -137,7 +142,7 @@ def prepare_tree(self): taxids = {} parents = {} children = {} - for taxid, info in self.nodes.items(): + for taxid, info in self.taxonomy_db.nodes.items(): if taxid == 1: # Skip root continue @@ -337,102 +342,6 @@ def lca(self, node1, node2): # (parent, child) return lca_node[1] - def convert_sseqids_to_taxids( - self, - sseqids: Dict[str, Set[str]], - ) -> Tuple[Dict[str, Set[int]], pd.DataFrame]: - """ - Translates subject sequence ids to taxids from prot.accession2taxid.gz and dead_prot.accession2taxid.gz. - - .. note:: - - If an accession number is no longer available in prot.accesssion2taxid.gz - (either due to being suppressed, deprecated or removed by NCBI), - then root taxid (1) is returned as the taxid for the corresponding sseqid. - - Parameters - ---------- - sseqids : dict - {qseqid: {sseqid, ...}, ...} - - Returns - ------- - Tuple[Dict[str, Set[int]], pd.DataFrame] - {qseqid: {taxid, taxid, ...}, ...}, index=range, cols=[qseqid, sseqid, raw_taxid, merged_taxid, cleaned_taxid] - - Raises - ------- - FileNotFoundError - prot.accession2taxid.gz database is required for sseqid to taxid conversion. - - """ - # We first get all unique sseqids that were retrieved from the blast output - recovered_sseqids = set( - chain.from_iterable([qseqid_sseqids for qseqid_sseqids in sseqids.values()]) - ) - - # Check for sseqid in dead_prot.accession2taxid.gz in case an old db was used. - # Any accessions not found in live prot.accession2taxid db *may* be here. - # This *attempts* to prevent sseqids from being assigned root (root taxid=1) - try: - sseqids_to_taxids = self.search_prot_accessions( - accessions=recovered_sseqids, - db="dead", - sseqids_to_taxids=None, - ) - dead_sseqids_found = set(sseqids_to_taxids.keys()) - except FileNotFoundError as db_fpath: - logger.warn(f"Skipping taxid conversion from {db_fpath}") - sseqids_to_taxids = None - dead_sseqids_found = set() - - # Now build the mapping from sseqid to taxid from the full/live accession2taxid dbs - # Possibly overwriting any merged accessions to live accessions - sseqids_to_taxids = self.search_prot_accessions( - accessions=recovered_sseqids, - db="full", - sseqids_to_taxids=sseqids_to_taxids, - ) - - # Remove sseqids: Ignore any sseqids already found - live_sseqids_found = set(sseqids_to_taxids.keys()) - live_sseqids_found -= dead_sseqids_found - recovered_sseqids -= live_sseqids_found - if recovered_sseqids: - logger.warn( - f"sseqids without corresponding taxid: {len(recovered_sseqids):,}" - ) - root_taxid = 1 - taxids = {} - sseqid_not_found = pd.NA - sseqid_to_taxid_df = pd.DataFrame( - [ - { - "qseqid": qseqid, - "sseqid": sseqid, - "raw_taxid": sseqids_to_taxids.get(sseqid, sseqid_not_found), - } - for qseqid, qseqid_sseqids in sseqids.items() - for sseqid in qseqid_sseqids - ] - ) - # Update taxids if they are old. 
- sseqid_to_taxid_df["merged_taxid"] = sseqid_to_taxid_df.raw_taxid.map( - lambda tid: self.merged.get(tid, tid) - ) - # If we still have missing taxids, we will set the sseqid value to the root taxid - # fill missing taxids with root_taxid - sseqid_to_taxid_df["cleaned_taxid"] = sseqid_to_taxid_df.merged_taxid.fillna( - root_taxid - ) - for qseqid, qseqid_sseqids in sseqids.items(): - # NOTE: we only want to retrieve the set of unique taxids (not a list) for LCA query - qseqid_taxids = { - sseqids_to_taxids.get(sseqid, root_taxid) for sseqid in qseqid_sseqids - } - taxids[qseqid] = qseqid_taxids - return taxids, sseqid_to_taxid_df - def reduce_taxids_to_lcas( self, taxids: Dict[str, Set[int]] ) -> Tuple[Dict[str, int], pd.DataFrame]: @@ -456,7 +365,6 @@ def reduce_taxids_to_lcas( for qseqid, qseqid_taxids in taxids.items(): # This will convert any deprecated taxids to their current taxids before reduction to LCA. # NOTE: we only want to retrieve the set of unique taxids (not a list) for LCA query - qseqid_taxids = {self.merged.get(taxid, taxid) for taxid in qseqid_taxids} lca_taxid = False num_taxids = len(qseqid_taxids) while not lca_taxid: @@ -560,8 +468,10 @@ def blast2lca( raise FileNotFoundError(blast) elif os.path.exists(blast) and os.path.getsize(blast): logger.debug(f"Retrieving taxids from {blast}") - sseqids = diamond.parse(results=blast, verbose=self.verbose) - taxids, sseqid_to_taxid_df = self.convert_sseqids_to_taxids(sseqids) + accessions = diamond.parse(results=blast, verbose=self.verbose) + taxids, sseqid_to_taxid_df = self.taxonomy_db.convert_accessions_to_taxids( + accessions=accessions + ) if sseqid_to_taxid_output: sseqid_to_taxid_df.to_csv( sseqid_to_taxid_output, sep="\t", index=False, header=True @@ -609,7 +519,7 @@ def write_lcas(self, lcas: Dict[str, int], out: str) -> str: fh.write(lines) lines = "" nlines = 0 - lines += f"{qseqid}\t{self.name(taxid)}\t{self.rank(taxid)}\t{taxid}\n" + lines += f"{qseqid}\t{self.taxonomy_db.name(taxid)}\t{self.taxonomy_db.rank(taxid)}\t{taxid}\n" nlines += 1 fh.write(lines) fh.close() @@ -617,7 +527,9 @@ def write_lcas(self, lcas: Dict[str, int], out: str) -> str: return out def parse( - self, lca_fpath: str, orfs_fpath: str = None + self, + lca_fpath: str, + orfs_fpath: str = None, ) -> Dict[str, Dict[str, Dict[int, int]]]: """Retrieve and construct contig dictionary from provided `lca_fpath`. 
@@ -686,9 +598,9 @@ def parse( taxid = int(taxid) contig = contigs_from_orfs.get(orf_id) if taxid != 1: - while rank not in set(NCBI.CANONICAL_RANKS): - taxid = self.parent(taxid) - rank = self.rank(taxid) + while rank not in set(TaxonomyDatabase.CANONICAL_RANKS): + taxid = self.taxonomy_db.parent(taxid) + rank = self.taxonomy_db.rank(taxid) if contig not in lca_hits: lca_hits.update({contig: {rank: {taxid: 1}}}) elif rank not in lca_hits[contig]: @@ -723,10 +635,16 @@ def main(): ) parser.add_argument( "--dbdir", - help="Path to NCBI databases directory.", + help="Path to taxonomy databases directory.", metavar="dirpath", default=NCBI_DIR, ) + parser.add_argument( + "--dbtype", + help="Taxonomy database to use", + choices=["ncbi", "gtdb"], + default="ncbi", + ) parser.add_argument( "--lca-output", help="Path to write LCA results.", @@ -774,7 +692,12 @@ def main(): ) args = parser.parse_args() - lca = LCA(dbdir=args.dbdir, verbose=args.verbose, cache=args.cache) + if args.dbtype == "ncbi": + taxonomy_db = NCBI(args.dbdir) + elif args.dbtype == "gtdb": + taxonomy_db = GTDB(args.dbdir) + + lca = LCA(taxonomy_db=taxonomy_db, verbose=args.verbose, cache=args.cache) # The pkl objects will be recomputed if the respective process cache file does not exist if args.force_cache_overwrite: diff --git a/autometa/taxonomy/majority_vote.py b/autometa/taxonomy/majority_vote.py index e9bed47aa..052c9b89d 100644 --- a/autometa/taxonomy/majority_vote.py +++ b/autometa/taxonomy/majority_vote.py @@ -16,12 +16,14 @@ from autometa.taxonomy.lca import LCA from autometa.taxonomy.ncbi import NCBI, NCBI_DIR +from autometa.taxonomy.gtdb import GTDB +from autometa.taxonomy.database import TaxonomyDatabase logger = logging.getLogger(__name__) def is_consistent_with_other_orfs( - taxid: int, rank: str, rank_counts: Dict[str, Dict], ncbi: NCBI + taxid: int, rank: str, rank_counts: Dict[str, Dict], taxa_db: TaxonomyDatabase ) -> bool: """Determines whether the majority of proteins in a contig, with rank equal to or above the given rank, are common ancestors of the taxid. @@ -48,8 +50,8 @@ def is_consistent_with_other_orfs( return True, otherwise return False. """ - rank_index = NCBI.CANONICAL_RANKS.index(rank) - ranks_to_consider = NCBI.CANONICAL_RANKS[rank_index:] + rank_index = TaxonomyDatabase.CANONICAL_RANKS.index(rank) + ranks_to_consider = TaxonomyDatabase.CANONICAL_RANKS[rank_index:] # Now we total up the consistent and inconsistent ORFs consistent = 0 inconsistent = 0 @@ -57,7 +59,7 @@ def is_consistent_with_other_orfs( if rank_name not in rank_counts: continue for rank_taxid, count in rank_counts[rank_name].items(): - if ncbi.is_common_ancestor(rank_taxid, taxid): + if taxa_db.is_common_ancestor(rank_taxid, taxid): consistent += count else: inconsistent += count @@ -70,7 +72,7 @@ def is_consistent_with_other_orfs( return False -def lowest_majority(rank_counts: Dict[str, Dict], ncbi: NCBI) -> int: +def lowest_majority(rank_counts: Dict[str, Dict], taxa_db: TaxonomyDatabase) -> int: """Determine the lowest majority given `rank_counts` by first attempting to get a taxid that leads in counts with the highest specificity in terms of canonical rank. 
@@ -79,8 +81,8 @@ def lowest_majority(rank_counts: Dict[str, Dict], ncbi: NCBI) -> int: ---------- rank_counts : dict {canonical_rank:{taxid:num_hits, ...}, rank2: {...}, ...} - ncbi : NCBI instance - NCBI object from autometa.taxonomy.ncbi + taxa_db : TaxonomyDatabase instance + NCBI or GTDB subclass object of autometa.taxonomy.database.TaxonomyDatabase Returns ------- @@ -89,11 +91,11 @@ def lowest_majority(rank_counts: Dict[str, Dict], ncbi: NCBI) -> int: """ taxid_totals = {} - for rank in NCBI.CANONICAL_RANKS: + for rank in TaxonomyDatabase.CANONICAL_RANKS: if rank not in rank_counts: continue - rank_index = NCBI.CANONICAL_RANKS.index(rank) - ranks_to_consider = NCBI.CANONICAL_RANKS[rank_index:] + rank_index = TaxonomyDatabase.CANONICAL_RANKS.index(rank) + ranks_to_consider = TaxonomyDatabase.CANONICAL_RANKS[rank_index:] for taxid in rank_counts[rank]: # Make a dictionary to total the number of canonical ranks hit # while traversing the path so that we can add 'unclassified' to @@ -106,22 +108,22 @@ def lowest_majority(rank_counts: Dict[str, Dict], ncbi: NCBI) -> int: current_taxid = taxid current_rank = rank while current_taxid != 1: - if current_rank not in set(NCBI.CANONICAL_RANKS): - current_taxid = ncbi.parent(current_taxid) - current_rank = ncbi.rank(current_taxid) + if current_rank not in set(TaxonomyDatabase.CANONICAL_RANKS): + current_taxid = taxa_db.parent(current_taxid) + current_rank = taxa_db.rank(current_taxid) continue ranks_in_path[current_rank] += 1 if current_rank not in taxid_totals: taxid_totals.update({current_rank: {current_taxid: 1}}) - current_taxid = ncbi.parent(current_taxid) - current_rank = ncbi.rank(current_taxid) + current_taxid = taxa_db.parent(current_taxid) + current_rank = taxa_db.rank(current_taxid) continue if current_taxid in taxid_totals[current_rank]: taxid_totals[current_rank][current_taxid] += 1 else: taxid_totals[current_rank][current_taxid] = 1 - current_taxid = ncbi.parent(current_taxid) - current_rank = ncbi.rank(current_taxid) + current_taxid = taxa_db.parent(current_taxid) + current_rank = taxa_db.rank(current_taxid) # Now go through ranks_in_path. Where total = 0, add 'unclassified' for rank_to_consider in ranks_to_consider: if ranks_in_path[rank_to_consider] == 0: @@ -135,7 +137,7 @@ def lowest_majority(rank_counts: Dict[str, Dict], ncbi: NCBI) -> int: # we need to add 'unclassified' to the relevant canonical taxonomic rank. # However, we must never allow 'unclassified' to win! (That just won't really tell us anything) # Now we need to determine which is the first level to have a majority - for rank in NCBI.CANONICAL_RANKS: + for rank in TaxonomyDatabase.CANONICAL_RANKS: total_votes = 0 taxid_leader = None taxid_leader_votes = 0 @@ -155,7 +157,9 @@ def lowest_majority(rank_counts: Dict[str, Dict], ncbi: NCBI) -> int: def rank_taxids( - ctg_lcas: dict, ncbi: Union[NCBI, LCA], verbose: bool = False + ctg_lcas: Dict[str, Dict[str, Dict[int, int]]], + taxa_db: TaxonomyDatabase, + verbose: bool = False, ) -> Dict[str, int]: """Votes for taxids based on modified majority vote system where if a majority does not exist, the lowest majority is voted. @@ -164,8 +168,8 @@ def rank_taxids( ---------- ctg_lcas : dict {ctg1:{canonical_rank:{taxid:num_hits,...},...}, ctg2:{...},...} - ncbi : ncbi.NCBI or lca.LCA object - instance of NCBI subclass or NCBI containing NCBI methods. 
+ taxa_db : TaxonomyDatabase + instance of NCBI or GTDB subclass of autometa.taxonomy.database.TaxonomyDatabase verbose : bool Description of parameter `verbose` (the default is False). @@ -184,7 +188,7 @@ def rank_taxids( ctg_lcas, disable=disable, total=n_contigs, desc=desc, leave=False ): acceptedTaxid = None - for rank in NCBI.CANONICAL_RANKS: + for rank in TaxonomyDatabase.CANONICAL_RANKS: if acceptedTaxid is not None: break # Order in descending order of votes @@ -196,7 +200,7 @@ def rank_taxids( ) for taxid in ordered_taxids: if is_consistent_with_other_orfs( - taxid, rank, ctg_lcas[contig], ncbi + taxid, rank, ctg_lcas[contig], taxa_db ): acceptedTaxid = taxid break @@ -204,7 +208,7 @@ def rank_taxids( # draw, so we need to find the lowest taxonomic level where there is a # majority if acceptedTaxid is None: - acceptedTaxid = lowest_majority(ctg_lcas[contig], ncbi) + acceptedTaxid = lowest_majority(ctg_lcas[contig], taxa_db) top_taxids[contig] = acceptedTaxid return top_taxids @@ -247,7 +251,7 @@ def write_votes(results: Dict[str, int], out: str) -> str: def majority_vote( lca_fpath: str, out: str, - ncbi_dir: str, + taxa_db: TaxonomyDatabase, verbose: bool = False, orfs: str = None, ) -> str: @@ -259,8 +263,8 @@ def majority_vote( Path to lowest common ancestor assignments table. out : str Path to write assigned taxids. - ncbi_dir : str - Path to NCBI databases directory. + taxa_db : TaxonomyDatabase + An instance of TaxonomyDatabase verbose : bool, optional Increase verbosity of logging stream orfs: str, optional @@ -275,12 +279,14 @@ def majority_vote( """ outdir = os.path.dirname(os.path.realpath(out)) - lca = LCA(dbdir=ncbi_dir, verbose=verbose, cache=outdir) + lca = LCA(taxonomy_db=taxa_db, verbose=verbose, cache=outdir) # retrieve lca taxids for each contig classifications = lca.parse(lca_fpath=lca_fpath, orfs_fpath=orfs) # Vote for majority lca taxid from contig lca taxids # We can pass in lca as the ncbi object here, because it is a subclass of NCBI. 
- voted_taxids = rank_taxids(ctg_lcas=classifications, ncbi=lca, verbose=verbose) + voted_taxids = rank_taxids( + ctg_lcas=classifications, taxa_db=taxa_db, verbose=verbose + ) return write_votes(results=voted_taxids, out=out) @@ -297,7 +303,13 @@ def main(): "--output", help="Path to write voted taxid results table.", required=True ) parser.add_argument( - "--dbdir", help="Path to NCBI databases directory.", default=NCBI_DIR + "--dbdir", help="Path to taxonomy database directory.", default=NCBI_DIR + ) + parser.add_argument( + "--dbtype", + help="Taxonomy database to use", + choices=["ncbi", "gtdb"], + default="ncbi", ) parser.add_argument( "--orfs", @@ -312,10 +324,15 @@ def main(): ) args = parser.parse_args() + if args.dbtype == "ncbi": + taxa_db = NCBI(args.dbdir, verbose=args.verbose) + elif args.dbtype == "gtdb": + taxa_db = GTDB(args.dbdir, verbose=args.verbose) + majority_vote( lca_fpath=args.lca, out=args.output, - ncbi_dir=args.dbdir, + taxa_db=taxa_db, verbose=args.verbose, orfs=args.orfs, ) diff --git a/autometa/taxonomy/ncbi.py b/autometa/taxonomy/ncbi.py index d78d3d3c9..06757179e 100644 --- a/autometa/taxonomy/ncbi.py +++ b/autometa/taxonomy/ncbi.py @@ -12,7 +12,9 @@ import os import string import sys -from typing import Dict, Iterable, List, Set + +from typing import Dict, Set, Tuple +from itertools import chain import pandas as pd @@ -21,6 +23,7 @@ from autometa.common.utilities import file_length from autometa.common.exceptions import DatabaseOutOfSyncError from autometa.config.utilities import DEFAULT_CONFIG +from autometa.taxonomy.database import TaxonomyDatabase logger = logging.getLogger(__name__) @@ -30,43 +33,32 @@ NCBI_DIR = NCBI_DIR if not "None" in NCBI_DIR else NCBI_DIR.replace("None", ".") -class NCBI: +class NCBI(TaxonomyDatabase): """Taxonomy utilities for NCBI databases.""" - CANONICAL_RANKS = [ - "species", - "genus", - "family", - "order", - "class", - "phylum", - "superkingdom", - "root", - ] - - def __init__(self, dirpath, verbose=False): + def __init__(self, dbdir, verbose=False): """ Instantiates the NCBI class Parameters ---------- - dirpath : str + dbdir : str Path to the database directory verbose : bool, optional log progress to terminal, by default False """ - self.dirpath = dirpath + self.dbdir = dbdir self.verbose = verbose self.disable = not self.verbose - self.names_fpath = os.path.join(self.dirpath, "names.dmp") - self.nodes_fpath = os.path.join(self.dirpath, "nodes.dmp") - self.merged_fpath = os.path.join(self.dirpath, "merged.dmp") - self.delnodes_fpath = os.path.join(self.dirpath, "delnodes.dmp") + self.names_fpath = os.path.join(self.dbdir, "names.dmp") + self.nodes_fpath = os.path.join(self.dbdir, "nodes.dmp") + self.merged_fpath = os.path.join(self.dbdir, "merged.dmp") + self.delnodes_fpath = os.path.join(self.dbdir, "delnodes.dmp") # Set prot.accession2taxid filepath - self.accession2taxid_fpath = os.path.join(self.dirpath, "prot.accession2taxid") + self.accession2taxid_fpath = os.path.join(self.dbdir, "prot.accession2taxid") acc2taxid_gz = ".".join([self.accession2taxid_fpath, "gz"]) if not os.path.exists(self.accession2taxid_fpath) and os.path.exists( acc2taxid_gz @@ -74,7 +66,7 @@ def __init__(self, dirpath, verbose=False): self.accession2taxid_fpath = acc2taxid_gz # Set prot.accession2taxid.FULL.gz filepath self.accession2taxidfull_fpath = os.path.join( - self.dirpath, "prot.accession2taxid.FULL" + self.dbdir, "prot.accession2taxid.FULL" ) acc2taxid_gz = ".".join([self.accession2taxidfull_fpath, "gz"]) if not 
os.path.exists(self.accession2taxidfull_fpath) and os.path.exists( @@ -83,7 +75,7 @@ def __init__(self, dirpath, verbose=False): self.accession2taxidfull_fpath = acc2taxid_gz # Set dead_prot.accession2taxid filepath self.dead_accession2taxid_fpath = os.path.join( - self.dirpath, "dead_prot.accession2taxid" + self.dbdir, "dead_prot.accession2taxid" ) acc2taxid_gz = ".".join([self.dead_accession2taxid_fpath, "gz"]) if not os.path.exists(self.dead_accession2taxid_fpath) and os.path.exists( @@ -91,10 +83,10 @@ def __init__(self, dirpath, verbose=False): ): self.dead_accession2taxid_fpath = acc2taxid_gz # Check formatting for nr database - self.nr_fpath = os.path.join(self.dirpath, "nr.gz") + self.nr_fpath = os.path.join(self.dbdir, "nr.gz") nr_bname = os.path.splitext(os.path.basename(self.nr_fpath))[0] nr_dmnd_fname = ".".join([nr_bname, "dmnd"]) - nr_dmnd_fpath = os.path.join(self.dirpath, nr_dmnd_fname) + nr_dmnd_fpath = os.path.join(self.dbdir, nr_dmnd_fname) if os.path.exists(nr_dmnd_fpath): self.nr_fpath = nr_dmnd_fpath if "dmnd" not in os.path.basename(self.nr_fpath): @@ -129,186 +121,7 @@ def __str__(self): Directory path of the class object """ # Perhaps should place summary here of files that do or do not exist? - return self.dirpath - - def name(self, taxid: int, rank: str = None) -> str: - """ - Parses through the names.dmp in search of the given `taxid` and returns its name. If the `taxid` is - deprecated, suppressed, withdrawn from NCBI (basically old) the updated name will be retrieved - - Parameters - ---------- - taxid : int - `taxid` whose name is being returned - rank : str, optional - If provided, will return `taxid` name at `rank`, by default None - Must be a canonical rank, choices: species, genus, family, order, class, phylum, superkingdom - Eg. self.name(562, 'genus') would return 'Escherichia', where 562 is the taxid for Escherichia coli - - Returns - ------- - str - Name of provided `taxid` if `taxid` is found in names.dmp else 'unclassified' - - Raises - ------ - DatabaseOutOfSyncError - NCBI databases nodes.dmp, names.dmp and merged.dmp are out of sync with each other - """ - try: - taxid = self.convert_taxid_dtype(taxid) - except DatabaseOutOfSyncError as err: - logger.warning(err) - taxid = 0 - if not rank: - return self.names.get(taxid, "unclassified") - if rank not in set(NCBI.CANONICAL_RANKS): - logger.warning(f"{rank} not in canonical ranks!") - return "unclassified" - ancestor_taxid = taxid - while ancestor_taxid != 1: - ancestor_rank = self.rank(ancestor_taxid) - if ancestor_rank == rank: - return self.names.get(ancestor_taxid, "unclassified") - ancestor_taxid = self.parent(ancestor_taxid) - # At this point we have not encountered a name for the taxid rank - # so we will place this as unclassified. - return "unclassified" - - def lineage(self, taxid: int, canonical: bool = True) -> List[Dict]: - """ - Returns the lineage of `taxids` encountered when traversing to root - - Parameters - ---------- - taxid : int - `taxid` in nodes.dmp, whose lineage is being returned - canonical : bool, optional - Lineage includes both canonical and non-canonical ranks when False, and only the canonical ranks when True - Canonical ranks include : species, genus , family, order, class, phylum, superkingdom, root - - Returns - ------- - ordered list of dicts - [{'taxid':taxid, 'rank':rank,'name':name}, ...] 
- """ - lineage = [] - while taxid != 1: - if canonical and self.rank(taxid) not in NCBI.CANONICAL_RANKS: - taxid = self.parent(taxid) - continue - lineage.append( - {"taxid": taxid, "name": self.name(taxid), "rank": self.rank(taxid)} - ) - taxid = self.parent(taxid) - return lineage - - def get_lineage_dataframe( - self, taxids: Iterable, fillna: bool = True - ) -> pd.DataFrame: - """ - Given an iterable of taxids generate a pandas DataFrame of their canonical - lineages - - Parameters - ---------- - taxids : iterable - `taxids` whose lineage dataframe is being returned - fillna : bool, optional - Whether to fill the empty cells with 'unclassified' or not, default True - - Returns - ------- - pd.DataFrame - index = taxid - columns = [superkingdom,phylum,class,order,family,genus,species] - - Example - ------- - - If you would like to merge the returned DataFrame ('this_df') with another - DataFrame ('your_df'). Let's say where you retrieved your taxids: - - .. code-block:: python - - merged_df = pd.merge( - left=your_df, - right=this_df, - how='left', - left_on=, - right_index=True) - """ - canonical_ranks = [ - rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root" - ] - taxids = list(set(taxids)) - ranked_taxids = {} - for rank in canonical_ranks: - for taxid in taxids: - name = self.name(taxid, rank=rank) - if taxid not in ranked_taxids: - ranked_taxids.update({taxid: {rank: name}}) - else: - ranked_taxids[taxid].update({rank: name}) - df = pd.DataFrame(ranked_taxids).transpose() - df.index.name = "taxid" - if fillna: - df.fillna(value="unclassified", inplace=True) - return df - - def rank(self, taxid: int) -> str: - """ - Return the respective rank of provided `taxid`. If the `taxid` is deprecated, suppressed, - withdrawn from NCBI (basically old) the updated rank will be retrieved - - Parameters - ---------- - taxid : int - `taxid` to retrieve rank from nodes.dmp - - Returns - ------- - str - rank name if taxid is found in nodes.dmp else "unclassified" - - Raises - ------ - DatabaseOutOfSyncError - NCBI databases nodes.dmp, names.dmp and merged.dmp are out of sync with each other - """ - try: - taxid = self.convert_taxid_dtype(taxid) - except DatabaseOutOfSyncError as err: - logger.warning(err) - taxid = 0 - return self.nodes.get(taxid, {"rank": "unclassified"}).get("rank") - - def parent(self, taxid: int) -> int: - """ - Retrieve the parent taxid of provided `taxid`. 
If the `taxid` is deprecated, suppressed, - withdrawn from NCBI (basically old) the updated parent will be retrieved - - Parameters - ---------- - taxid : int - child taxid to retrieve parent - - Returns - ------- - int - Parent taxid if found in nodes.dmp otherwise 1 - - Raises - ------ - DatabaseOutOfSyncError - NCBI databases nodes.dmp, names.dmp and merged.dmp are out of sync with each other - """ - try: - taxid = self.convert_taxid_dtype(taxid) - except DatabaseOutOfSyncError as err: - logger.warning(err) - taxid = 0 - return self.nodes.get(taxid, {"parent": 1}).get("parent") + return self.dbdir def parse_names(self) -> Dict[int, str]: """ @@ -408,28 +221,6 @@ def parse_delnodes(self) -> Set[int]: logger.debug("delnodes loaded") return delnodes - def is_common_ancestor(self, taxid_A: int, taxid_B: int) -> bool: - """ - Determines whether the provided taxids have a non-root common ancestor - - Parameters - ---------- - taxid_A : int - taxid in NCBI taxonomy databases - nodes.dmp, names.dmp or merged.dmp - taxid_B : int - taxid in NCBI taxonomy databases - nodes.dmp, names.dmp or merged.dmp - - Returns - ------- - boolean - True if taxids share a common ancestor else False - """ - lineage_a_taxids = {ancestor.get("taxid") for ancestor in self.lineage(taxid_A)} - lineage_b_taxids = {ancestor.get("taxid") for ancestor in self.lineage(taxid_B)} - common_ancestor = lineage_b_taxids.intersection(lineage_a_taxids) - common_ancestor.discard(1) # This discards root - return True if common_ancestor else False - def convert_taxid_dtype(self, taxid: int) -> int: """ 1. Converts the given `taxid` to an integer and checks whether it is positive. @@ -601,6 +392,78 @@ def search_prot_accessions( logger.debug(f"sseqids converted from {filename}: {converted_sseqid_count:,}") return sseqids_to_taxids + def convert_accessions_to_taxids( + self, accessions: set + ) -> Tuple[Dict[str, Set[int]], pd.DataFrame]: + # We first get all unique accessions that were retrieved from the blast output + recovered_accessions = set( + chain.from_iterable( + [qseqid_sseqids for qseqid_sseqids in accessions.values()] + ) + ) + # Check for sseqid in dead_prot.accession2taxid.gz in case an old db was used. + # Any accessions not found in live prot.accession2taxid db *may* be here. 
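As an aside on the new ``convert_accessions_to_taxids`` helper added below: despite the ``accessions: set`` annotation, the implementation iterates ``accessions.values()`` and ``accessions.items()``, i.e. it expects a mapping of query ORF IDs to their subject accessions. A minimal sketch, assuming a prepared NCBI database directory and hypothetical accession values:

.. code-block:: python

    from autometa.taxonomy.ncbi import NCBI

    ncbi = NCBI("path/to/ncbi")  # placeholder database directory

    # qseqid -> set of sseqid accessions recovered from a diamond blastp (outfmt 6) search
    accessions = {
        "contig_1_orf_1": {"WP_000000001.1", "WP_000000002.1"},  # hypothetical accessions
        "contig_1_orf_2": {"WP_000000003.1"},
    }

    # Returns (1) qseqid -> set of taxids and (2) a sseqid-to-taxid DataFrame
    taxids, sseqid_to_taxid_df = ncbi.convert_accessions_to_taxids(accessions)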
+ # This *attempts* to prevent accessions from being assigned root (root taxid=1) + try: + sseqids_to_taxids = self.search_prot_accessions( + accessions=recovered_accessions, + db="dead", + sseqids_to_taxids=None, + ) + dead_sseqids_found = set(sseqids_to_taxids.keys()) + except FileNotFoundError as db_fpath: + logger.warn(f"Skipping taxid conversion from {db_fpath}") + sseqids_to_taxids = None + dead_sseqids_found = set() + + # Now build the mapping from sseqid to taxid from the full/live accession2taxid dbs + # Possibly overwriting any merged accessions to live accessions + sseqids_to_taxids = self.search_prot_accessions( + accessions=recovered_accessions, + db="full", + sseqids_to_taxids=sseqids_to_taxids, + ) + + # Remove accessions: Ignore any accessions already found + live_sseqids_found = set(sseqids_to_taxids.keys()) + live_sseqids_found -= dead_sseqids_found + recovered_accessions -= live_sseqids_found + if recovered_accessions: + logger.warn( + f"accessions without corresponding taxid: {len(recovered_accessions):,}" + ) + root_taxid = 1 + taxids = {} + sseqid_not_found = pd.NA + sseqid_to_taxid_df = pd.DataFrame( + [ + { + "qseqid": qseqid, + "sseqid": sseqid, + "raw_taxid": sseqids_to_taxids.get(sseqid, sseqid_not_found), + } + for qseqid, qseqid_sseqids in accessions.items() + for sseqid in qseqid_sseqids + ] + ) + # Update taxids if they are old. + sseqid_to_taxid_df["merged_taxid"] = sseqid_to_taxid_df.raw_taxid.map( + lambda tid: self.merged.get(tid, tid) + ) + # If we still have missing taxids, we will set the sseqid value to the root taxid + # fill missing taxids with root_taxid + sseqid_to_taxid_df["cleaned_taxid"] = sseqid_to_taxid_df.merged_taxid.fillna( + root_taxid + ) + root_taxid = 1 + for qseqid, qseqid_sseqids in accessions.items(): + # NOTE: we only want to retrieve the set of unique taxids (not a list) for LCA query + qseqid_taxids = { + sseqids_to_taxids.get(sseqid, root_taxid) for sseqid in qseqid_sseqids + } + taxids[qseqid] = qseqid_taxids + return taxids, sseqid_to_taxid_df + if __name__ == "__main__": print("file containing Autometa NCBI utilities class") diff --git a/autometa/taxonomy/vote.py b/autometa/taxonomy/vote.py index 9167563d3..aa6211f19 100644 --- a/autometa/taxonomy/vote.py +++ b/autometa/taxonomy/vote.py @@ -15,13 +15,15 @@ import pandas as pd from Bio import SeqIO -from typing import Union, List +from typing import Union, List, Literal from autometa.common.external import prodigal from autometa.taxonomy import majority_vote from autometa.taxonomy.lca import LCA from autometa.taxonomy.ncbi import NCBI, NCBI_DIR +from autometa.taxonomy.gtdb import GTDB +from autometa.taxonomy.database import TaxonomyDatabase from autometa.common.exceptions import TableFormatError logger = logging.getLogger(__name__) @@ -35,7 +37,8 @@ def assign( nucl_orfs: str = None, blast: str = None, lca_fpath: str = None, - ncbi_dir: str = NCBI_DIR, + dbdir: str = NCBI_DIR, + dbtype: Literal["ncbi", "gtdb"] = "ncbi", force: bool = False, verbose: bool = False, parallel: bool = True, @@ -60,8 +63,10 @@ def assign( Path to blastp table, by default None lca_fpath : str, optional Path to output of LCA analysis, by default None - ncbi_dir : str, optional + dbdir : str, optional Path to NCBI databases directory, by default NCBI_DIR + dbtype : str, optional + Type of Taxonomy database to use, by default ncbi force : bool, optional Overwrite existing annotations, by default False verbose : bool, optional @@ -95,6 +100,8 @@ def assign( prot_orfs = prot_orfs if prot_orfs else 
os.path.join(outdir, "orfs.faa") nucl_orfs = nucl_orfs if nucl_orfs else os.path.join(outdir, "orfs.fna") + taxa_db = NCBI(dbdir=dbdir) if dbtype == "ncbi" else GTDB(dbdir=dbdir) + def call_orfs(): prodigal.run( assembly=assembly, @@ -107,16 +114,20 @@ def call_orfs(): def blast2lca(): if "lca" not in locals(): - lca = LCA(dbdir=ncbi_dir, verbose=verbose, cache=outdir) + lca = LCA(taxonomy_db=taxa_db, verbose=verbose, cache=outdir) lca.blast2lca( - orfs=prot_orfs, out=lca_fpath, blast=blast, force=force, cpus=cpus + blast=blast, + out=lca_fpath, + force=force, ) def majority_vote_lca(out=out): if "lca" not in locals(): - lca = LCA(dbdir=ncbi_dir, verbose=verbose, cache=outdir) + lca = LCA(taxonomy_db=taxa_db, verbose=verbose, cache=outdir) ctg_lcas = lca.parse(lca_fpath=lca_fpath, orfs_fpath=prot_orfs) - votes = majority_vote.rank_taxids(ctg_lcas=ctg_lcas, ncbi=lca, verbose=verbose) + votes = majority_vote.rank_taxids( + ctg_lcas=ctg_lcas, taxa_db=taxa_db, verbose=verbose + ) out = majority_vote.write_votes(results=votes, out=out) return pd.read_csv(out, sep="\t", index_col="contig") @@ -146,30 +157,29 @@ def majority_vote_lca(out=out): calculation() -def add_ranks(df: pd.DataFrame, ncbi: Union[NCBI, str]) -> pd.DataFrame: +def add_ranks(df: pd.DataFrame, taxa_db: TaxonomyDatabase) -> pd.DataFrame: """Add canonical ranks to `df` and write to `out` Parameters ---------- df : pd.DataFrame index="contig", column="taxid" - ncbi : str or NCBI - Path to NCBI databases directory, or autometa NCBI instance. + taxa_db : TaxonomyDatabase + NCBI or GTDB TaxonomyDatabase instance. Returns ------- pd.DataFrame index="contig", columns=["taxid", *canonical_ranks] """ - ncbi = ncbi if isinstance(ncbi, NCBI) else NCBI(ncbi) - dff = ncbi.get_lineage_dataframe(df["taxid"].unique().tolist()) + dff = taxa_db.get_lineage_dataframe(df["taxid"].unique().tolist()) return pd.merge(left=df, right=dff, how="left", left_on="taxid", right_index=True) def get( filepath_or_dataframe: Union[str, pd.DataFrame], kingdom: str, - ncbi: Union[NCBI, str] = NCBI_DIR, + taxa_db: TaxonomyDatabase, ) -> pd.DataFrame: """Retrieve specific `kingdom` voted taxa for `assembly` from `filepath` @@ -212,7 +222,7 @@ def get( if df.shape[1] <= 2: # Voting method will write out contig and its voted taxid (2 cols). # So here we add the canonical ranks using voted taxids. 
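For reference, a minimal sketch of the updated ``add_ranks`` call, assuming an instantiated taxonomy database and a small votes table (the taxids and paths are illustrative only):

.. code-block:: python

    import pandas as pd

    from autometa.taxonomy.ncbi import NCBI
    from autometa.taxonomy import vote

    taxa_db = NCBI("path/to/ncbi")

    # Votes table: index="contig", column="taxid"
    votes = pd.DataFrame(
        {"taxid": [562, 2]},
        index=pd.Index(["contig_1", "contig_2"], name="contig"),
    )

    # Canonical ranks are looked up via taxa_db.get_lineage_dataframe() and merged onto the votes
    ranked = vote.add_ranks(df=votes, taxa_db=taxa_db)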
- df = add_ranks(df=df, ncbi=ncbi) + df = add_ranks(df=df, taxa_db=taxa_db) if "superkingdom" not in df.columns: raise TableFormatError(f"superkingdom is not in taxonomy columns {df.columns}") @@ -252,11 +262,9 @@ def write_ranks( Raises ------ - ValueError - `rank` not in canonical ranks + KeyError + `rank` not in taxonomy columns """ - if rank not in NCBI.CANONICAL_RANKS: - raise ValueError(f"rank: {rank} not in {NCBI.CANONICAL_RANKS}") if rank not in taxonomy.columns: raise KeyError(f"{rank} not in taxonomy columns: {taxonomy.columns}") if not os.path.exists(assembly) or not os.path.getsize(assembly): @@ -314,7 +322,7 @@ def main(): ) parser.add_argument( "--output", - help="Directory to output fasta files of split canonical ranks and taxonomy.tsv.", + help="Output directory to write specified canonical ranks fasta files and taxon-binning results table", type=str, metavar="dirpath", required=True, @@ -339,13 +347,24 @@ def main(): ], ) parser.add_argument( - "--ncbi", - help="Path to NCBI databases directory.", + "--dbdir", + help="Path to taxonomy database directory.", metavar="dirpath", default=NCBI_DIR, ) + parser.add_argument( + "--dbtype", + help="Taxonomy database to use", + choices=["ncbi", "gtdb"], + default="ncbi", + ) args = parser.parse_args() + if args.dbtype == "ncbi": + taxa_db = NCBI(args.dbdir) + elif args.dbtype == "gtdb": + taxa_db = GTDB(args.dbdir) + filename = f"{args.prefix}.taxonomy.tsv" if args.prefix else "taxonomy.tsv" out = os.path.join(args.output, filename) taxa_df = pd.read_csv(args.votes, sep="\t", index_col="contig") @@ -354,12 +373,16 @@ def main(): os.makedirs(args.output) if taxa_df.shape[1] <= 2: - taxa_df = add_ranks(taxa_df, ncbi=args.ncbi) + taxa_df = add_ranks(taxa_df, taxa_db=taxa_db) taxa_df.to_csv(out, sep="\t", index=True, header=True) logger.debug( f"Wrote {taxa_df.shape[0]:,} contigs canonical rank names to {out}" ) if args.split_rank_and_write: + if args.split_rank_and_write not in TaxonomyDatabase.CANONICAL_RANKS: + raise ValueError( + f"rank: {args.split_rank_and_write} not in {TaxonomyDatabase.CANONICAL_RANKS}" + ) written_ranks = write_ranks( taxonomy=taxa_df, assembly=args.assembly, diff --git a/docs/source/bash-workflow.rst b/docs/source/bash-workflow.rst index c8e8a55e2..80b8a7d1d 100644 --- a/docs/source/bash-workflow.rst +++ b/docs/source/bash-workflow.rst @@ -133,23 +133,23 @@ Alignments Preparation .. 
code-block:: bash # First index metagenome assembly - bwa index \\ - -b 550000000000 \\ # block size for the bwtsw algorithm (effective with -a bwtsw) [default=10000000] + bwa index \ + -b 550000000000 \ # block size for the bwtsw algorithm (effective with -a bwtsw) [default=10000000] metagenome.fna # Path to input metagenome # Now perform alignments (we are using kart, but you can use another alignment tool if you'd like) - kart \\ - -i metagenome.fna \\ # Path to input metagenome - -t 20 \\ # Number of cpus to use - -f /path/to/forward_reads.fastq.gz \\ # Path to forward paired-end reads - -f2 /path/to/reverse_reads.fastq.gz \\ # Path to reverse paired-end reads + kart \ + -i metagenome.fna \ # Path to input metagenome + -t 20 \ # Number of cpus to use + -f /path/to/forward_reads.fastq.gz \ # Path to forward paired-end reads + -f2 /path/to/reverse_reads.fastq.gz \ # Path to reverse paired-end reads -o alignments.sam # Path to alignments output # Now sort alignments and convert to bam format - samtools sort \\ - -@ 40 \\ # Number of cpus to use - -m 10G \\ # Amount of memory to use - alignments.sam \\ # Input alignments file path + samtools sort \ + -@ 40 \ # Number of cpus to use + -m 10G \ # Amount of memory to use + alignments.sam \ # Input alignments file path -o alignments.bam # Output alignments file path .. _orfs-preparation: @@ -162,11 +162,11 @@ ORFs .. code-block:: bash - prodigal -i metagenome.fna \\ - -f "gbk" \\ - -d "metagenome.orfs.fna" \\ - -o "metagenome.orfs.gbk" \\ - -a "metagenome.orfs.faa" \\ # This generated file is required as input to the bash workflow + prodigal -i metagenome.fna \ + -f "gbk" \ + -d "metagenome.orfs.fna" \ + -o "metagenome.orfs.gbk" \ + -a "metagenome.orfs.faa" \ # This generated file is required as input to the bash workflow -s "metagenome.all_orfs.txt" .. _blastp-preparation: @@ -179,10 +179,10 @@ Diamond blastp Preparation .. code-block:: bash - diamond blastp \\ - --query "metagenome.orfs.faa" \\ # See prodigal output from above - --db /path/to/nr.dmnd \\ # See NCBI section - --threads \\ + diamond blastp \ + --query "metagenome.orfs.faa" \ # See prodigal output from above + --db /path/to/nr.dmnd \ # See NCBI section + --threads \ --out blastp.tsv # This generated file is required as input to the bash workflow .. _ncbi-preparation: @@ -195,9 +195,9 @@ If you are running Autometa for the first time you'll have to download the NCBI .. code-block:: bash # First configure where you want to download the NCBI databases - autometa-config \\ - --section databases \\ - --option ncbi \\ + autometa-config \ + --section databases \ + --option ncbi \ --value # Now download and format the NCBI databases diff --git a/docs/source/databases.rst b/docs/source/databases.rst index 7fa47e914..593a55dc7 100644 --- a/docs/source/databases.rst +++ b/docs/source/databases.rst @@ -22,14 +22,15 @@ scripts for this, you will first need to download Autometa (See :ref:`Installati .. code-block:: bash # First configure where you want to download the NCBI databases - autometa-config \\ - --section databases --option ncbi \\ + autometa-config \ + --section databases --option ncbi \ --value # Now download and format the NCBI databases autometa-update-databases --update-ncbi .. note:: + You can check the default config paths using ``autometa-config --print``. See ``autometa-update-databases -h`` and ``autometa-config -h`` for full list of options. 
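Once the NCBI files are in place, the same directory can also be loaded programmatically through the refactored ``NCBI`` class; a minimal sketch (the directory path is a placeholder):

.. code-block:: python

    from autometa.taxonomy.ncbi import NCBI

    # Expects names.dmp, nodes.dmp, merged.dmp, delnodes.dmp and
    # prot.accession2taxid(.gz) under the configured database directory
    ncbi = NCBI("/path/to/databases/ncbi", verbose=True)
    print(ncbi)  # __str__ now reports the database directory (dbdir)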
@@ -45,3 +46,57 @@ The previous command will download the following NCBI databases: After these files are downloaded, the ``taxdump.tar.gz`` tarball's files are extracted and the non-redundant protein database (``nr.gz``) is formatted as a diamond database (i.e. ``nr.dmnd``). This will significantly speed-up the ``diamond blastp`` searches. + +Genome Taxonomy Database (GTDB) +############################### + +If you would like to incorporate the benefits of using the Genome Taxonomy Database (GTDB), you may either prepare the databases manually or use a few Autometa helper scripts. +If you would like to use Autometa's scripts for this, you will first need to install Autometa (See :ref:`Installation`). + +You can either run the following script or manually download the respective databases. + +.. code-block:: bash + + # First configure where you want to download the GTDB databases + autometa-config \ + --section databases --option gtdb \ + --value + + # To use a specific GTDB release + autometa-config \ + --section gtdb --option release \ + --value latest + # Or --value r207 or --value r202, etc. + + # Download and format the configured GTDB databases release + autometa-update-databases --update-gtdb + + +.. note:: + + You can check the default config paths using ``autometa-config --print``. + + See ``autometa-update-databases -h`` and ``autometa-config -h`` for a full list of options. + +The previous command will download the following GTDB databases and format `gtdb_proteins_aa_reps.tar.gz` to generate the `gtdb.dmnd` database used by Autometa: + +- Amino acid sequences of representative genomes + - `gtdb_proteins_aa_reps.tar.gz `_ +- gtdb-taxdump.tar.gz from `shenwei356/gtdb-taxdump `_ + - `gtdb-taxdump.tar.gz `_ + + +Once extracted, `gtdb-taxdump.tar.gz` contains the taxdump files for each GTDB release. Make sure that the release you use matches the release version of `gtdb_proteins_aa_reps.tar.gz`; using the latest release of both is recommended. + +All the taxonomy files for a given taxonomy database must be in a single directory. Copy the taxdump files of the desired release into the same directory as `gtdb.dmnd`. + +Alternatively, if you have manually downloaded `gtdb_proteins_aa_reps.tar.gz` and `gtdb-taxdump.tar.gz`, you can run the following command to format `gtdb_proteins_aa_reps.tar.gz` into `gtdb.dmnd` and make it ready for Autometa. + +.. code-block:: bash + + python -m autometa.taxonomy.gtdb --reps-faa --dbdir --cpus 20 + +.. note:: + + Again, make sure that the formatted `gtdb_proteins_aa_reps.tar.gz` database (`gtdb.dmnd`) and the GTDB taxdump files are in the same directory.
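Note that this patch also registers an ``autometa-setup-gtdb`` console script (see the ``setup.py`` hunk below), which wraps the same ``autometa.taxonomy.gtdb`` formatting entry point as the ``python -m`` invocation above. Once formatted, the GTDB directory can be passed wherever the new ``--dbdir``/``--dbtype gtdb`` options appear, or loaded programmatically; a minimal sketch assuming the directory contains ``gtdb.dmnd`` and the matching taxdump files:

.. code-block:: python

    from autometa.taxonomy.gtdb import GTDB
    from autometa.taxonomy.database import TaxonomyDatabase

    gtdb = GTDB("/path/to/databases/gtdb", verbose=True)

    # NCBI and GTDB share the canonical ranks defined on the TaxonomyDatabase base class
    print(TaxonomyDatabase.CANONICAL_RANKS)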
\ No newline at end of file diff --git a/modules/local/binning_summary.nf b/modules/local/binning_summary.nf index 18c1e1d27..ad5554e6d 100644 --- a/modules/local/binning_summary.nf +++ b/modules/local/binning_summary.nf @@ -32,7 +32,8 @@ process BINNING_SUMMARY { def software = getSoftwareName(task.process) """ autometa-binning-summary \\ - --ncbi $ncbi \\ + --dbdir $ncbi \\ + --dbtype ncbi \\ --binning-main $binning_main \\ --markers $markers \\ --metagenome $metagenome \\ diff --git a/modules/local/majority_vote.nf b/modules/local/majority_vote.nf index 0d85eb4f2..846773a1b 100644 --- a/modules/local/majority_vote.nf +++ b/modules/local/majority_vote.nf @@ -28,7 +28,11 @@ process MAJORITY_VOTE { script: def software = getSoftwareName(task.process) """ - autometa-taxonomy-majority-vote --lca ${lca} --output votes.tsv --dbdir "${ncbi_tax_dir}" + autometa-taxonomy-majority-vote \\ + --lca ${lca} \\ + --output votes.tsv \\ + --dbdir "${ncbi_tax_dir}" \\ + --dbtype ncbi autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt """ diff --git a/modules/local/prepare_lca.nf b/modules/local/prepare_lca.nf index d8e4481d9..e11015153 100644 --- a/modules/local/prepare_lca.nf +++ b/modules/local/prepare_lca.nf @@ -32,6 +32,7 @@ process PREPARE_LCA { --blast . \\ --lca-output . \\ --dbdir ${blastdb_dir} \\ + --dbtype ncbi \\ --cache cache \\ --only-prepare-cache autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt diff --git a/modules/local/reduce_lca.nf b/modules/local/reduce_lca.nf index 7d9a46930..3424d2255 100644 --- a/modules/local/reduce_lca.nf +++ b/modules/local/reduce_lca.nf @@ -35,6 +35,7 @@ process REDUCE_LCA { autometa-taxonomy-lca \\ --blast ${blast} \\ --dbdir ${blastdb_dir} \\ + --dbtype ncbi \\ --cache ${lca_cache} \\ --lca-error-taxids lca_error_taxids.tsv \\ --sseqid2taxid-output sseqid2taxid.tsv \\ diff --git a/modules/local/split_kingdoms.nf b/modules/local/split_kingdoms.nf index 393d9504c..f93c12fd8 100644 --- a/modules/local/split_kingdoms.nf +++ b/modules/local/split_kingdoms.nf @@ -36,7 +36,8 @@ process SPLIT_KINGDOMS { --output . 
\\ --split-rank-and-write superkingdom \\ --assembly "${assembly}" \\ - --ncbi "${ncbi_tax_dir}" + --dbdir "${ncbi_tax_dir}" \\ + --dbtype ncbi autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt """ diff --git a/setup.py b/setup.py index c35809090..bd79d9947 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ def read(fname): "autometa-taxonomy = autometa.taxonomy.vote:main", "autometa-taxonomy-lca = autometa.taxonomy.lca:main", "autometa-taxonomy-majority-vote = autometa.taxonomy.majority_vote:main", + "autometa-setup-gtdb = autometa.taxonomy.gtdb:main", "autometa-binning = autometa.binning.recursive_dbscan:main", "autometa-binning-summary = autometa.binning.summary:main", "autometa-binning-ldm = autometa.binning.large_data_mode:main", diff --git a/tests/conftest.py b/tests/conftest.py index f49888589..9454188a4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,7 @@ @pytest.fixture(name="ncbi", scope="session", autouse=True) def fixture_ncbi(ncbi_dir): - return NCBI(dirpath=ncbi_dir, verbose=False) + return NCBI(dbdir=ncbi_dir, verbose=False) @pytest.fixture(name="dbdir", scope="session", autouse=True) diff --git a/tests/environment.yml b/tests/environment.yml index 1f1ab3eda..f140f6d70 100644 --- a/tests/environment.yml +++ b/tests/environment.yml @@ -13,6 +13,7 @@ dependencies: - gdown - hdbscan - hmmer + - joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809 - numba>=0.47 - numpy>=1.13 - pandas>=1.1 @@ -29,6 +30,7 @@ dependencies: - rsync - samtools>=1.11 - scikit-bio + - scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285 - scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations - sphinx - sphinx_rtd_theme diff --git a/tests/unit_tests/test_vote.py b/tests/unit_tests/test_vote.py index 61026cf9e..077e5c3ae 100644 --- a/tests/unit_tests/test_vote.py +++ b/tests/unit_tests/test_vote.py @@ -88,7 +88,7 @@ def fixture_votes_fpath(votes, tmp_path_factory): def test_add_ranks(ncbi, votes, tmp_path): out = tmp_path / "taxonomy.ranks_added.tsv" - df = vote.add_ranks(df=votes, ncbi=ncbi) + df = vote.add_ranks(df=votes, taxa_db=ncbi) assert df.shape == (2, 8) assert df.index.name == "contig" canonical_ranks = {rank for rank in ncbi.CANONICAL_RANKS if rank != "root"} @@ -101,10 +101,7 @@ def test_add_ranks(ncbi, votes, tmp_path): def test_vote_assign(blastp, ncbi_dir, prot_orfs, tmp_path): out = tmp_path / "votes.tsv" votes = vote.assign( - out=out, - prot_orfs=prot_orfs, - blast=blastp, - ncbi_dir=ncbi_dir, + out=out, prot_orfs=prot_orfs, blast=blastp, dbdir=ncbi_dir, dbtype="ncbi" ) assert isinstance(votes, pd.DataFrame) assert votes.index.name == "contig" @@ -115,7 +112,7 @@ def test_get(ncbi, votes_fpath): df = vote.get( filepath_or_dataframe=votes_fpath, kingdom="bacteria", - ncbi=ncbi, + taxa_db=ncbi, ) # canonical ranks should have been added to table if they were not already in place. 
assert df.shape == (2, 8) @@ -126,7 +123,7 @@ def test_get_none_recovered(ncbi, votes_fpath): vote.get( filepath_or_dataframe=votes_fpath, kingdom="archaea", - ncbi=ncbi, + taxa_db=ncbi, ) @@ -136,7 +133,7 @@ def test_get_empty_votes(ncbi_dir, tmp_path): vote.get( filepath_or_dataframe=fpath, kingdom="archaea", - ncbi=ncbi_dir, + taxa_db=ncbi_dir, ) @@ -151,7 +148,7 @@ def return_df(*args, **kwargs): vote.get( filepath_or_dataframe=fpath, kingdom="archaea", - ncbi=ncbi, + taxa_db=ncbi, ) @@ -160,7 +157,7 @@ def fixture_ranks_added_votes(votes_fpath, ncbi): return vote.get( filepath_or_dataframe=votes_fpath, kingdom="bacteria", - ncbi=ncbi, + taxa_db=ncbi, ) @@ -173,7 +170,7 @@ def test_write_ranks(monkeypatch, tmp_path, ranks_added_votes, rank, num_expecte assembly = dirpath / "assembly.fna" assembly.write_text("records") if rank == "noncanonical_rank": - with pytest.raises(ValueError): + with pytest.raises(KeyError): vote.write_ranks( taxonomy=ranks_added_votes, assembly=assembly, outdir=dirpath, rank=rank ) @@ -226,13 +223,12 @@ def test_write_ranks_no_taxonomy_columns(tmp_path, votes): ) -@pytest.mark.slow -@pytest.mark.skip @pytest.mark.entrypoint -def test_vote_main(monkeypatch, ncbi_dir, tmp_path): +def test_vote_main(monkeypatch, ncbi_dir, votes_fpath, tmp_path): outdir = tmp_path / "outdir" outdir.mkdir() - taxonomy = outdir / "taxonomy.tsv" + prefix = "test" + taxonomy = outdir / f"{prefix}.taxonomy.tsv" assembly = outdir / "assembly.fna" assembly.write_text("records") lca_fpath = outdir / "lca.tsv" @@ -240,11 +236,13 @@ def test_vote_main(monkeypatch, ncbi_dir, tmp_path): class MockArgs: def __init__(self): - self.votes = taxonomy + self.votes = votes_fpath self.output = outdir self.assembly = assembly self.split_rank_and_write = "superkingdom" - self.ncbi = ncbi_dir + self.dbdir = ncbi_dir + self.dbtype = "ncbi" + self.prefix = prefix class MockParser: def add_argument(self, *args, **kwargs): diff --git a/workflows/autometa-large-data-mode.sh b/workflows/autometa-large-data-mode.sh index b6acdc9f3..a4d65d36a 100644 --- a/workflows/autometa-large-data-mode.sh +++ b/workflows/autometa-large-data-mode.sh @@ -1,8 +1,10 @@ #!/usr/bin/env bash -#SBATCH -p partition +#SBATCH --partition=queue #SBATCH -t 48:00:00 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=16 +#SBATCH --error=autometa.%J.err +#SBATCH --output=autometa.%J.out ## First create environment to run Autometa, (and optionally GTDB-Tk and CheckM) # git clone git@github.com:KwanLab/Autometa @@ -28,30 +30,42 @@ assembly="Path to metagenome assembly fasta file" bam="Path to metagenome read alignments.bam" orfs="Path to orfs used as input to diamond blastp" blast="Path to diamond blastp results (outfmt 6)" -ncbi="Path to NCBI databases directory" # should contain prot.accession2taxid.gz, names.dmp, nodes.dmp and merged.dmp +ncbi="Path to NCBI databases directory" # For more info see: https://autometa.readthedocs.io/en/latest/databases.html#ncbi +gtdb="Path to GTDB databases directory" # (Optional) For more info see: https://autometa.readthedocs.io/en/latest/databases.html#genome-taxonomy-database-gtdb markers_dbdir="Path to autometa markers databases" # should contain marker hmms and cutoffs files (can be downloaded from Autometa repo under `Autometa/autometa/databases/markers`) # Autometa Parameters length_cutoff=3000 # in bp -# kmer counting, normalization transform and embedding method +# Taxon Assignment Parameters +taxa_routine="ncbi" # Choices are "ncbi" or "ncbi_gtdb" +# NOTE: When using the "ncbi_gtdb" option, blastP will be 
performed against the GTDB database +# using the kingdom-specific ORFs retrieved from the NCBI taxon-assignment sub-workflow. +# K-mer Counting, Normalization and Embedding Parameters kmer_size=5 norm_method="am_clr" # choices: am_clr, clr, ilr pca_dimensions=50 # NOTE: must be greater than $embed_dimensions embed_dimensions=2 # NOTE: must be less than $pca_dimensions -embed_method="umap" # choices: bhsne, sksne, umap, densne, trimap -# Binning clustering method -clustering_method="hdbscan" # choices: hdbscan, dbscan +embed_method="umap" # choices: bhsne, sksne, umap, densmap, trimap +# Binning Parameters (clustering methods and MAG quality thresholds) +cluster_method="hdbscan" # choices: hdbscan, dbscan # Binning metrics cutoffs -completeness=20.0 -purity=95.0 -cov_stddev_limit=25.0 -gc_stddev_limit=5.0 +completeness=20.0 # Accept MAGs greater than this value +purity=95.0 # Accept MAGs greater than this value +cov_stddev_limit=25.0 # Accept MAGs less than this value +gc_stddev_limit=5.0 # Accept MAGs less than this value max_partition_size=10000 +# Runtime Parameters cpus=16 seed=42 +if [[ $taxa_routine != "ncbi" ]] && [[ $taxa_routine != "ncbi_gtdb" ]] +then + echo "ERROR: Invalid Taxonomic routine value. Please choose between ncbi or ncbi_gtdb. Current selection: ${taxa_routine}" + exit 1 +fi + # Step 0: Do some path handling with the provided `assembly` filepath -simpleName="TemplateAssemblyName" +simple_name="TemplateAssemblyName" outdir="AutometaOutdir" if [ ! -d $outdir ] then mkdir -p $outdir @@ -60,7 +74,9 @@ fi ######### BEGIN ######### # Step 00: Report autometa version +set -x autometa --version +{ set +x; } 2>/dev/null # Step 1: filter assembly by length and retrieve contig lengths as well as GC content @@ -69,15 +85,17 @@ autometa --version # $length_cutoff --> User input # output: -filtered_assembly="${outdir}/${simpleName}.filtered.fna" -gc_content="${outdir}/${simpleName}.gc_content.tsv" +filtered_assembly="${outdir}/${simple_name}.filtered.fna" +gc_content="${outdir}/${simple_name}.gc_content.tsv" # script: +set -x autometa-length-filter \ --assembly $assembly \ --cutoff $length_cutoff \ --output-fasta $filtered_assembly \ --output-gc-content $gc_content +{ set +x; } 2>/dev/null # Step 2: Determine coverages from assembly read alignments @@ -85,11 +103,13 @@ autometa-length-filter \ # NOTE: $bam is defined at top and the rest of the inputs are generated by autometa # output: -bed="${outdir}/${simpleName}.coverages.bed.tsv" -coverages="${outdir}/${simpleName}.coverages.tsv" +bed="${outdir}/${simple_name}.coverages.bed.tsv" +coverages="${outdir}/${simple_name}.coverages.tsv" # script: +set -x autometa-bedtools-genomecov --ibam $bam --bed $bed --output $coverages +{ set +x; } 2>/dev/null # Step 3: Annotate and filter markers # input: @@ -101,10 +121,11 @@ kingdoms=(bacteria archaea) # NOTE: We iterate through both sets of markers for binning both bacterial and archeal kingdoms for kingdom in ${kingdoms[@]};do # kingdom-specific output: - hmmscan="${outdir}/${simpleName}.${kingdom}.hmmscan.tsv" - markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" + hmmscan="${outdir}/${simple_name}.${kingdom}.hmmscan.tsv" + markers="${outdir}/${simple_name}.${kingdom}.markers.tsv" # script: + set -x autometa-markers \ --orfs $orfs \ --hmmscan $hmmscan \ @@ -114,6 +135,7 @@ for kingdom in ${kingdoms[@]};do --parallel \ --cpus 4 \ --seed $seed + { set +x; } 2>/dev/null done # Step 4.1: Determine ORF lowest common ancestor (LCA) amongst top hits @@ -121,19 +143,25 @@ done # input: # $blast 
--> User Input # $ncbi --> User Input +# $dbtype --> Updated according to $taxa_routine +dbtype="ncbi" +prefix="${simple_name}.${dbtype}" # output: -lca="${outdir}/${simpleName}.orfs.lca.tsv" -sseqid2taxid="${outdir}/${simpleName}.orfs.sseqid2taxid.tsv" -errorTaxids="${outdir}/${simpleName}.orfs.errortaxids.tsv" +lca="${outdir}/${prefix}.orfs.lca.tsv" +sseqid_to_taxid="${outdir}/${prefix}.orfs.sseqid2taxid.tsv" +error_taxids="${outdir}/${prefix}.orfs.errortaxids.tsv" # script: +set -x autometa-taxonomy-lca \ --blast $blast \ --dbdir $ncbi \ + --dbtype $dbtype \ --lca-output $lca \ - --sseqid2taxid-output $sseqid2taxid \ - --lca-error-taxids $errorTaxids + --sseqid2taxid-output $sseqid_to_taxid \ + --lca-error-taxids $error_taxids +{ set +x; } 2>/dev/null # Step 4.2: Perform Modified Majority vote of ORF LCAs for all contigs that returned hits in blast search @@ -142,11 +170,12 @@ autometa-taxonomy-lca \ # $ncbi --> User Input # output: -votes="${outdir}/${simpleName}.taxids.tsv" +votes="${outdir}/${prefix}.taxids.tsv" # script: -autometa-taxonomy-majority-vote --lca $lca --output $votes --dbdir $ncbi - +set -x +autometa-taxonomy-majority-vote --lca $lca --output $votes --dbdir $ncbi --dbtype $dbtype +{ set +x; } 2>/dev/null # Step 4.3: Split assigned taxonomies into kingdoms # input: @@ -157,71 +186,220 @@ autometa-taxonomy-majority-vote --lca $lca --output $votes --dbdir $ncbi # output: # Will write recovered superkingdoms to ${outdir} -# e.g. ${outdir}/${simpleName}.bacteria.fna -# e.g. ${outdir}/${simpleName}.archaea.fna -# e.g. ${outdir}/${simpleName}.taxonomy.tsv +# e.g. ${outdir}/${prefix}.bacteria.fna +# e.g. ${outdir}/${prefix}.archaea.fna +# e.g. ${outdir}/${prefix}.taxonomy.tsv # script: +set -x autometa-taxonomy \ --votes $votes \ --output $outdir \ - --prefix $simpleName \ + --prefix $prefix \ --split-rank-and-write superkingdom \ --assembly $assembly \ - --ncbi $ncbi + --dbdir $ncbi \ + --dbtype $dbtype +{ set +x; } 2>/dev/null + +# Step 5: Taxon-assignment using the GTDB database +# NOTE: only performed if `taxa_routine` is 'ncbi_gtdb' + +# Step 5.1: Extract bacterial ORFs and run GTDB +# input: +# $kingdom_fasta --> Generated by step 4.3 +# $orfs --> User Input + +# output: +# orf_prefixes --> text file containing metagenome contig IDs classified within NCBI bacteria and archaea +# orf_ids --> text file containing contig ORF IDs classified within NCBI bacteria and archaea +# kingdom_orfs --> fasta file containing metagenome ORFs classified within NCBI bacteria or archaea +# gtdb_input_orfs --> metagenome orfs classified within NCBI bacteria *and* archaea + +if [[ "$taxa_routine" == "ncbi_gtdb" ]] +then + echo "Running GTDB taxon assignment step." + + # output + gtdb_input_orfs="${outdir}/${prefix}.orfs.faa" + + for kingdom in ${kingdoms[@]};do + + kingdom_fasta="${outdir}/${prefix}.${kingdom}.fna" + + orf_prefixes="${outdir}/${prefix}.${kingdom}.contigIDs.txt" + orf_ids="${outdir}/${prefix}.${kingdom}.orfIDs.txt" + kingdom_orfs="${outdir}/${prefix}.${kingdom}.orfs.faa" + + if [ ! -f $kingdom_fasta ] + then + echo "${kingdom_fasta} does not exist, skipping..." 
+ continue + fi + + # Retrieve contig IDs from kingdom fasta file + set -x + grep ">" $kingdom_fasta | \ + sed 's/^>//' | \ + sed 's/$/_/' | \ + cut -f1 -d" " > $orf_prefixes + # Retrieve ORF IDs from contig IDs + grep -f $orf_prefixes $orfs | sed 's/^>//' | cut -f1 -d" " > $orf_ids + # Retrieve ORF seqs from ORF IDs + seqkit grep \ + --pattern-file $orf_ids \ + --out-file $kingdom_orfs \ + $orfs + # Concatenate kingdom ORFs to single file for GTDB blastp + cat $kingdom_orfs >> $gtdb_input_orfs + { set +x; } 2>/dev/null + done + dbtype="gtdb" + prefix="${simple_name}.${dbtype}" + + # Step 5.2: Run blastp + # input: + # $gtdb_input_orfs --> Generated from step 5.1 + gtdb_dmnd_db=$(find $gtdb -name "gtdb.dmnd") # generated using autometa-setup-gtdb (Must be performed prior to using this script) + # output + blast="${outdir}/${prefix}.blastp.tsv" + + # script + set -x + diamond blastp \ + --query $gtdb_input_orfs \ + --db $gtdb_dmnd_db \ + --evalue 1e-5 \ + --max-target-seqs 200 \ + --threads $cpus \ + --outfmt 6 \ + --out $blast + { set +x; } 2>/dev/null + + # Step 5.3: Determine LCA + # input: + # $blast --> Generated from step 5.2 + # $gtdb --> User Input + # $dbtype --> Updated according to $taxa_routine + + # output: + lca="${outdir}/${prefix}.orfs.lca.tsv" + sseqid_to_taxid="${outdir}/${prefix}.orfs.sseqid2taxid.tsv" + error_taxids="${outdir}/${prefix}.orfs.errortaxids.tsv" + + # script: + set -x + autometa-taxonomy-lca \ + --blast $blast \ + --dbdir $gtdb \ + --dbtype $dbtype \ + --lca-output $lca \ + --sseqid2taxid-output $sseqid_to_taxid \ + --lca-error-taxids $error_taxids + { set +x; } 2>/dev/null + + # Step 5.4: Perform Modified Majority vote of ORF LCAs for all contigs that returned hits in blast search + # input: + # $lca --> Generated from step 5.3 + # $gtdb --> User Input + + # output: + votes="${outdir}/${prefix}.taxids.tsv" + + # script: + set -x + autometa-taxonomy-majority-vote \ + --lca $lca \ + --output $votes \ + --dbdir $gtdb \ + --dbtype $dbtype + { set +x; } 2>/dev/null + + # Step 5.5: Split assigned taxonomies into kingdoms + # input: + # $votes --> Generated from step 5.4 + # $outdir --> Generated from step 0 + # $prefix --> Generated as input from steps 5.2 to 6 + # $filtered_assembly --> Generated from step 1 + # $gtdb --> User Input + + # output: + # Will write recovered superkingdoms to $outdir + # e.g. ${outdir}/${prefix}.bacteria.fna + # e.g. ${outdir}/${prefix}.archaea.fna + # e.g. ${outdir}/${prefix}.taxonomy.tsv + + # script: + set -x + autometa-taxonomy \ + --votes $votes \ + --output $outdir \ + --prefix $prefix \ + --split-rank-and-write superkingdom \ + --assembly $filtered_assembly \ + --dbdir $gtdb \ + --dbtype $dbtype + { set +x; } 2>/dev/null +fi -# Step 5: Perform k-mer counting on respective kingdoms +# Step 6: Perform k-mer counting on respective kingdoms # input: # $kmer_size --> User input -# $norm_method --> User input -# $embed_method --> User input -# $embed_dimensions --> User input # $cpus --> User input -# $seed --> User input kingdoms=(bacteria archaea) for kingdom in ${kingdoms[@]};do # kingdom-specific input: # NOTE: $fasta --> Generated by step 4.3 - fasta="${outdir}/${simpleName}.${kingdom}.fna" + fasta="${outdir}/${prefix}.${kingdom}.fna" # NOTE: $prefix is updated according to taxa_routine above + # kingdom-specific output: + counts="${outdir}/${prefix}.${kingdom}.${kmer_size}mers.tsv" + if [ ! -f $fasta ] then echo "${fasta} does not exist, skipping..." 
continue fi - - # kingdom-specific output: - counts="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.tsv" - # script: + set -x autometa-kmers \ --fasta $fasta \ --kmers $counts \ --size $kmer_size \ --cpus $cpus + { set +x; } 2>/dev/null done -# Step 6: Perform binning on each set of bacterial and archaeal contigs +# Step 7: Perform binning on each set of bacterial and archaeal contigs # input: -# $cpus --> User input -# $seed --> User input -taxonomy="${outdir}/${simpleName}.taxonomy.tsv" # Generated by step 4.3 +# $coverages --> Generated by step 2 # $gc_content --> Generated by step 1 +taxonomy="${outdir}/${prefix}.taxonomy.tsv" # NOTE: $prefix is updated according to taxa_routine above +# $taxonomy is generated by either steps 4.3 or 5.5 depending on whether taxa_routine is 'ncbi' or 'ncbi_gtdb', respectively +# $cluster_method --> User input +# $completeness --> User input +# $purity --> User input +# $cov_stddev_limit --> User input +# $gc_stddev_limit --> User input +# $norm_method --> User input +# $pca_dimensions --> User input +# $embed_method --> User input +# $max_partition_size --> User input kingdoms=(bacteria archaea) for kingdom in ${kingdoms[@]};do # kingdom-specific input: - kmers="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.tsv" # Generated by step 5 (counts) - markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" # Generated by step 3 + kmers="${outdir}/${prefix}.${kingdom}.${kmer_size}mers.tsv" # Generated by step 6 (counts) + markers="${outdir}/${simple_name}.${kingdom}.markers.tsv" # Generated by step 3 (before taxon-assignment sub-workflows) # kingdom-specific output: - cache="${outdir}/${simpleName}_${kingdom}_cache" - output_binning="${outdir}/${simpleName}.${kingdom}.${clustering_method}.tsv" - output_main="${outdir}/${simpleName}.${kingdom}.${clustering_method}.main.tsv" + cache="${outdir}/${prefix}_${kingdom}_cache" + output_binning="${outdir}/${prefix}.${kingdom}.${cluster_method}.tsv" + output_main="${outdir}/${prefix}.${kingdom}.${cluster_method}.main.tsv" if [ ! 
-f $kmers ] then @@ -235,6 +413,7 @@ for kingdom in ${kingdoms[@]};do fi # script: + set -x autometa-binning-ldm \ --kmers $kmers \ --coverages $coverages \ @@ -243,7 +422,7 @@ for kingdom in ${kingdoms[@]};do --taxonomy $taxonomy \ --output-binning $output_binning \ --output-main $output_main \ - --clustering-method $clustering_method \ + --clustering-method $cluster_method \ --completeness $completeness \ --purity $purity \ --cov-stddev-limit $cov_stddev_limit \ @@ -257,27 +436,29 @@ for kingdom in ${kingdoms[@]};do --cache $cache \ --rank-filter superkingdom \ --rank-name-filter $kingdom - + { set +x; } 2>/dev/null done -# Step 7: Create binning summary files +# Step 8: Create binning summary files # input: # $ncbi -> User input +# $gtdb -> User input # $assembly -> User input +# $dbtype -> # NOTE: $prefix is updated according to taxa_routine above kingdoms=(bacteria archaea) for kingdom in ${kingdoms[@]};do # kingdom-specific input: - binning_main="${outdir}/${simpleName}.${kingdom}.${clustering_method}.main.tsv" # Generated by step 6 - markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" # Generated by step 3 + binning_main="${outdir}/${prefix}.${kingdom}.${cluster_method}.main.tsv" # Generated by step 7 + markers="${outdir}/${simple_name}.${kingdom}.markers.tsv" # Generated by step 3 # kingdom-specific output: - output_stats="${outdir}/${simpleName}_${kingdom}_metabin_stats.tsv" - output_taxonomy="${outdir}/${simpleName}_${kingdom}_metabin_taxonomy.tsv" - output_metabins="${outdir}/${simpleName}_${kingdom}_metabins" + output_stats="${outdir}/${prefix}_${kingdom}_metabin_stats.tsv" + output_taxonomy="${outdir}/${prefix}_${kingdom}_metabin_taxonomy.tsv" + output_metabins="${outdir}/${prefix}_${kingdom}_metabins" if [ ! -f $binning_main ] then @@ -285,33 +466,41 @@ for kingdom in ${kingdoms[@]};do continue fi - # script: + if [[ "$taxa_routine" == "ncbi_gtdb" ]] + then + dbdir=$gtdb + else + dbdir=$ncbi + fi + set -x autometa-binning-summary \ --binning-main $binning_main \ --markers $markers \ --metagenome $assembly \ - --ncbi $ncbi \ + --dbdir $dbdir \ + --dbtype $dbtype \ --output-stats $output_stats \ --output-taxonomy $output_taxonomy \ --output-metabins $output_metabins + { set +x; } 2>/dev/null done -# Step 8: (OPTIONAL) Annotate Autometa bins' taxonomies and completeness/contamination metrics using GTDB-Tk & CheckM +# Step 9: (OPTIONAL) Annotate Autometa bins' taxonomies and completeness/contamination metrics using GTDB-Tk & CheckM # input: -# $output_metabins --> Generated by step 7 +# $output_metabins --> Generated by step 8 # kingdoms=(bacteria archaea) # for kingdom in ${kingdoms[@]};do # # kingdom-specific input -# output_metabins="${outdir}/${simpleName}_${kingdom}_metabins" +# # output_metabins="${outdir}/${simple_name}_${dbtype}_${kingdom}_metabins" # # kingdom-specific outputs -# gtdbtk_outdir="${outdir}/${simpleName}_${kingdom}_gtdbtk_classify_wf" -# checkm_outdir="${outdir}/${simpleName}_${kingdom}_checkm_lineage_wf" -# checkm_stats="${outdir}/${simpleName}_${kingdom}_checkm_stats.tsv" +# gtdbtk_outdir="${outdir}/${simple_name}_${dbtype}_${kingdom}_gtdbtk_classify_wf" +# checkm_outdir="${outdir}/${simple_name}_${dbtype}_${kingdom}_checkm_lineage_wf" +# checkm_stats="${outdir}/${simple_name}_${dbtype}_${kingdom}_checkm_stats.tsv" # if [ ! -d $output_metabins ] # then @@ -320,7 +509,8 @@ done # fi # # NOTE: autometa-binning-summary writes out all bins with the .fna extension -# # Unclustered sequences are written to one file with the .fasta extension. e.g. 
outdir/unclustered.fasta +# # Unclustered sequences are written to one file with the .fasta extension. +# # e.g. ${outdir}/${prefix}_${kingdom}_metabins/unclustered.fasta # # First run `gtdbtk check_install` to ensure the following command will run # gtdbtk classify_wf \ diff --git a/workflows/autometa.sh b/workflows/autometa.sh index ae742b42c..fda8862b1 100644 --- a/workflows/autometa.sh +++ b/workflows/autometa.sh @@ -1,9 +1,10 @@ #!/usr/bin/env bash -#SBATCH -p partition +#SBATCH --partition=queue #SBATCH -t 48:00:00 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=16 - +#SBATCH --error=autometa.%J.err +#SBATCH --output=autometa.%J.out # NOTE: To create the conda environment for autometa you can supply the Makefile command: # make create_environment @@ -18,29 +19,41 @@ assembly="Path to metagenome assembly fasta file" bam="Path to metagenome read alignments.bam" orfs="Path to orfs used as input to diamond blast" -blast="Path to diamond output file (outfmt 6)" -ncbi="Path to NCBI databases directory" +blast="Path to diamond output file (outfmt 6)." # BlastP should be done against the NCBI `nr` database. +ncbi="Path to NCBI databases directory" # For more info see: https://autometa.readthedocs.io/en/latest/databases.html#ncbi +gtdb="Path to GTDB databases directory" # (Optional) For more info see: https://autometa.readthedocs.io/en/latest/databases.html#genome-taxonomy-database-gtdb # Autometa Parameters length_cutoff=3000 # in bp -# kmer counting, normalization transform and embedding method +# Taxon Assignment Parameters +taxa_routine="ncbi" # Choices are "ncbi" or "ncbi_gtdb" +# NOTE: When using the "ncbi_gtdb" option, blastP will be performed against the GTDB database +# using the kingdom-specific ORFs retrieved from the NCBI taxon-assignment sub-workflow. +# K-mer Counting, Normalization and Embedding Parameters kmer_size=5 norm_method="am_clr" # choices: am_clr, clr, ilr pca_dimensions=50 # NOTE: must be greater than $embed_dimensions embed_dimensions=2 # NOTE: must be less than $pca_dimensions -embed_method="bhsne" # choices: bhsne, sksne, umap, densne, trimap -# Binning clustering method +embed_method="bhsne" # choices: bhsne, sksne, umap, densmap, trimap +# Binning Parameters (clustering methods and MAG quality thresholds) cluster_method="hdbscan" # choices: hdbscan, dbscan # Binning metrics cutoffs -completeness=20.0 -purity=95.0 -cov_stddev_limit=25.0 -gc_stddev_limit=5.0 +completeness=20.0 # Accept MAGs greater than this value +purity=95.0 # Accept MAGs greater than this value +cov_stddev_limit=25.0 # Accept MAGs less than this value +gc_stddev_limit=5.0 # Accept MAGs less than this value +# Runtime Parameters cpus=16 seed=42 +if [[ $taxa_routine != "ncbi" ]] && [[ $taxa_routine != "ncbi_gtdb" ]] +then + echo "ERROR: Invalid Taxonomic routine value. Please choose between ncbi or ncbi_gtdb. Current selection: ${taxa_routine}" + exit 1 +fi + # Step 0: Do some Path handling with the provided `assembly` filepath -simpleName="TemplateAssemblyName" +simple_name="TemplateAssemblyName" outdir="AutometaOutdir" if [ ! 
-d $outdir ] then mkdir -p $outdir @@ -48,8 +61,11 @@ fi ######### BEGIN ######### + # Step 00: Report autometa version +set -x autometa --version +{ set +x; } 2>/dev/null # Step 1: filter assembly by length and retrieve contig lengths as well as GC content @@ -58,15 +74,17 @@ autometa --version # $length_cutoff --> User input # output: -filtered_assembly="${outdir}/${simpleName}.filtered.fna" -gc_content="${outdir}/${simpleName}.gc_content.tsv" +filtered_assembly="${outdir}/${simple_name}.filtered.fna" +gc_content="${outdir}/${simple_name}.gc_content.tsv" # script: +set -x autometa-length-filter \ --assembly $assembly \ --cutoff $length_cutoff \ --output-fasta $filtered_assembly \ --output-gc-content $gc_content +{ set +x; } 2>/dev/null # Step 2: Determine coverages from assembly read alignments @@ -74,11 +92,13 @@ autometa-length-filter \ # NOTE: $bam is defined at top and the rest of the inputs are generated by autometa # output: -bed="${outdir}/${simpleName}.coverages.bed.tsv" -coverages="${outdir}/${simpleName}.coverages.tsv" +bed="${outdir}/${simple_name}.coverages.bed.tsv" +coverages="${outdir}/${simple_name}.coverages.tsv" # script: +set -x autometa-bedtools-genomecov --ibam $bam --bed $bed --output $coverages +{ set +x; } 2>/dev/null # Step 3: Annotate and filter markers # input: @@ -90,10 +110,11 @@ kingdoms=(bacteria archaea) # NOTE: We iterate through both sets of markers for binning both bacterial and archeal kingdoms for kingdom in ${kingdoms[@]};do # kingdom-specific output: - hmmscan="${outdir}/${simpleName}.${kingdom}.hmmscan.tsv" - markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" + hmmscan="${outdir}/${simple_name}.${kingdom}.hmmscan.tsv" + markers="${outdir}/${simple_name}.${kingdom}.markers.tsv" # script: + set -x autometa-markers \ --orfs $orfs \ --hmmscan $hmmscan \ @@ -102,6 +123,7 @@ for kingdom in ${kingdoms[@]};do --parallel \ --cpus 4 \ --seed $seed + { set +x; } 2>/dev/null done # Step 4.1: Determine ORF lowest common ancestor (LCA) amongst top hits @@ -109,31 +131,40 @@ done # input: # $blast --> User Input # $ncbi --> User Input +# $dbtype --> Updated according to $taxa_routine +dbtype="ncbi" +prefix="${simple_name}.${dbtype}" # output: -lca="${outdir}/${simpleName}.orfs.lca.tsv" -sseqid2taxid="${outdir}/${simpleName}.orfs.sseqid2taxid.tsv" -errorTaxids="${outdir}/${simpleName}.orfs.errortaxids.tsv" +lca="${outdir}/${prefix}.orfs.lca.tsv" +sseqid_to_taxid="${outdir}/${prefix}.orfs.sseqid2taxid.tsv" +error_taxids="${outdir}/${prefix}.orfs.errortaxids.tsv" # script: +set -x autometa-taxonomy-lca \ --blast $blast \ --dbdir $ncbi \ + --dbtype $dbtype \ --lca-output $lca \ - --sseqid2taxid-output $sseqid2taxid \ - --lca-error-taxids $errorTaxids + --sseqid2taxid-output $sseqid_to_taxid \ + --lca-error-taxids $error_taxids +{ set +x; } 2>/dev/null # Step 4.2: Perform Modified Majority vote of ORF LCAs for all contigs that returned hits in blast search # input: # $lca --> Generated by step 4.1 # $ncbi --> User Input +# $dbtype --> Updated according to $taxa_routine # output: -votes="${outdir}/${simpleName}.taxids.tsv" +votes="${outdir}/${prefix}.taxids.tsv" # script: -autometa-taxonomy-majority-vote --lca $lca --output $votes --dbdir $ncbi +set -x +autometa-taxonomy-majority-vote --lca $lca --output $votes --dbdir $ncbi --dbtype $dbtype +{ set +x; } 2>/dev/null # Step 4.3: Split assigned taxonomies into kingdoms @@ -145,20 +176,162 @@ autometa-taxonomy-majority-vote --lca $lca --output $votes --dbdir $ncbi # output: # Will write recovered superkingdoms to 
${outdir} -# e.g. ${outdir}/${simpleName}.bacteria.fna -# e.g. ${outdir}/${simpleName}.archaea.fna -# e.g. ${outdir}/${simpleName}.taxonomy.tsv +# e.g. ${outdir}/${prefix}.bacteria.fna +# e.g. ${outdir}/${prefix}.archaea.fna +# e.g. ${outdir}/${prefix}.taxonomy.tsv # script: +set -x autometa-taxonomy \ --votes $votes \ --output $outdir \ - --prefix $simpleName \ + --prefix $prefix \ --split-rank-and-write superkingdom \ --assembly $assembly \ - --ncbi $ncbi + --dbdir $ncbi \ + --dbtype $dbtype +{ set +x; } 2>/dev/null -# Step 5: Perform k-mer counting on respective kingdoms +# Step 5: Taxon-assignment using the GTDB database +# NOTE: only performed if `taxa_routine` is 'ncbi_gtdb' + +# Step 5.1: Extract bacterial ORFs and run GTDB +# input: +# $kingdom_fasta --> Generated by step 4.3 +# $orfs --> User Input + +# output: +# orf_prefixes --> text file containing metagenome contig IDs classified within NCBI bacteria and archaea +# orf_ids --> text file containing contig ORF IDs classified within NCBI bacteria and archaea +# kingdom_orfs --> fasta file containing metagenome ORFs classified within NCBI bacteria or archaea +# gtdb_input_orfs --> metagenome orfs classified within NCBI bacteria *and* archaea + +if [[ "$taxa_routine" == "ncbi_gtdb" ]] +then + echo "Running GTDB taxon assignment step." + # output + gtdb_input_orfs="${outdir}/${prefix}.orfs.faa" + + + for kingdom in ${kingdoms[@]};do + + kingdom_fasta="${outdir}/${prefix}.${kingdom}.fna" + + orf_prefixes="${outdir}/${prefix}.${kingdom}.contigIDs.txt" + orf_ids="${outdir}/${prefix}.${kingdom}.orfIDs.txt" + kingdom_orfs="${outdir}/${prefix}.${kingdom}.orfs.faa" + + if [ ! -f $kingdom_fasta ] + then + echo "${kingdom_fasta} does not exist, skipping..." + continue + fi + + # Retrieve contig IDs from kingdom fasta file + set -x + grep ">" $kingdom_fasta | \ + sed 's/^>//' | \ + sed 's/$/_/' | \ + cut -f1 -d" " > $orf_prefixes + # Retrieve ORF IDs from contig IDs + grep -f $orf_prefixes $orfs | sed 's/^>//' | cut -f1 -d" " > $orf_ids + # Retrieve ORF seqs from ORF IDs + seqkit grep \ + --pattern-file $orf_ids \ + --out-file $kingdom_orfs \ + $orfs + # Concatenate kingdom ORFs to single file for GTDB blastp + cat $kingdom_orfs >> $gtdb_input_orfs + { set +x; } 2>/dev/null + done + dbtype="gtdb" + prefix="${simple_name}.${dbtype}" + + # Step 5.2: Run blastp + # input: + # $gtdb_input_orfs --> Generated from step 5.1 + gtdb_dmnd_db=$(find $gtdb -name "gtdb.dmnd") # generated using autometa-setup-gtdb (Must be performed prior to using this script) + # output + blast="${outdir}/${prefix}.blastp.tsv" + + # script + set -x + diamond blastp \ + --query $gtdb_input_orfs \ + --db $gtdb_dmnd_db \ + --evalue 1e-5 \ + --max-target-seqs 200 \ + --threads $cpus \ + --outfmt 6 \ + --out $blast + { set +x; } 2>/dev/null + + #Step 5.3: Determine LCA + # input: + # $blast --> Generated from step 5.2 + # $gtdb --> User Input + + # output: + lca="${outdir}/${prefix}.orfs.lca.tsv" + sseqid_to_taxid="${outdir}/${prefix}.orfs.sseqid2taxid.tsv" + error_taxids="${outdir}/${prefix}.orfs.errortaxids.tsv" + + # script: + set -x + autometa-taxonomy-lca \ + --blast $blast \ + --dbdir $gtdb \ + --dbtype $dbtype \ + --lca-output $lca \ + --sseqid2taxid-output $sseqid_to_taxid \ + --lca-error-taxids $error_taxids + { set +x; } 2>/dev/null + + # Step 5.4: Perform Modified Majority vote of ORF LCAs for all contigs that returned hits in blast search + # input: + # $lca --> Generated from step 5.3 + # $gtdb --> User Input + + # output: + votes="${outdir}/${prefix}.taxids.tsv" 
+
+    # script:
+    set -x
+    autometa-taxonomy-majority-vote \
+        --lca $lca \
+        --output $votes \
+        --dbdir $gtdb \
+        --dbtype gtdb
+    { set +x; } 2>/dev/null
+
+    # Step 5.5: Split assigned taxonomies into kingdoms
+    # input:
+    # $votes --> Generated from step 5.4
+    # $outdir --> Generated from step 0
+    # $prefix --> Updated to "${simple_name}.${dbtype}" in step 5.1
+    # $filtered_assembly --> Generated from step 1
+    # $gtdb --> User Input
+
+    # output:
+    # Will write recovered superkingdoms to $outdir
+    # e.g. ${outdir}/${prefix}.bacteria.fna
+    # e.g. ${outdir}/${prefix}.archaea.fna
+    # e.g. ${outdir}/${prefix}.taxonomy.tsv
+
+    # script:
+    set -x
+    autometa-taxonomy \
+        --votes $votes \
+        --output $outdir \
+        --prefix $prefix \
+        --split-rank-and-write superkingdom \
+        --assembly $filtered_assembly \
+        --dbdir $gtdb \
+        --dbtype gtdb
+    { set +x; } 2>/dev/null
+fi
+
+# Step 6: Perform k-mer counting on respective kingdoms
 
 # input:
 # $kmer_size --> User input
@@ -172,13 +345,10 @@ kingdoms=(bacteria archaea)
 for kingdom in ${kingdoms[@]};do
 
     # kingdom-specific input:
-    # NOTE: $fasta --> Generated by step 4.3
-    fasta="${outdir}/${simpleName}.${kingdom}.fna"
-
-    # kingdom-specific output:
-    counts="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.tsv"
-    normalized="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.tsv"
-    embedded="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv"
+    fasta="${outdir}/${prefix}.${kingdom}.fna" # NOTE: $prefix is updated according to taxa_routine above
+    counts="${outdir}/${prefix}.${kingdom}.${kmer_size}mers.tsv"
+    normalized="${outdir}/${prefix}.${kingdom}.${kmer_size}mers.${norm_method}.tsv"
+    embedded="${outdir}/${prefix}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv"
 
     if [ ! -f $fasta ]
     then
@@ -186,6 +356,7 @@ for kingdom in ${kingdoms[@]};do
         continue
     fi
     # script:
+    set -x
     autometa-kmers \
         --fasta $fasta \
         --kmers $counts \
@@ -198,32 +369,41 @@ for kingdom in ${kingdoms[@]};do
         --embedding-dimensions $embed_dimensions \
         --cpus $cpus \
         --seed $seed
+    { set +x; } 2>/dev/null
 done
 
-# Step 6: Perform binning on each set of bacterial and archaeal contigs
+# Step 7: Perform binning on each set of bacterial and archaeal contigs
 
 # input:
 # $cpus --> User input
 # $seed --> User input
-taxonomy="${outdir}/${simpleName}.taxonomy.tsv" # Generated by step 4.3
 # $gc_content --> Generated by step 1
+taxonomy="${outdir}/${prefix}.taxonomy.tsv" # NOTE: $prefix is updated according to taxa_routine above
+# $taxonomy is generated by either step 4.3 or step 5.5, depending on whether taxa_routine is 'ncbi' or 'ncbi_gtdb', respectively
 
 kingdoms=(bacteria archaea)
 
 for kingdom in ${kingdoms[@]};do
 
     # kingdom-specific input:
-    kmers="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv" # Generated by step 5
-    markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" # Generated by step 3
+    kmers="${outdir}/${prefix}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv" # Generated by step 6
+    markers="${outdir}/${simple_name}.${kingdom}.markers.tsv" # Generated by step 3 (before taxon-assignment sub-workflows)
 
     # kingdom-specific output:
-    output_binning="${outdir}/${simpleName}.${kingdom}.${cluster_method}.tsv"
-    output_main="${outdir}/${simpleName}.${kingdom}.${cluster_method}.main.tsv"
+    output_binning="${outdir}/${prefix}.${kingdom}.${cluster_method}.tsv"
+    output_main="${outdir}/${prefix}.${kingdom}.${cluster_method}.main.tsv"
 
     if [ -f $output_main ] && [ -s $output_main ];then
         echo "$(basename $output_main) already exists. continuing..."
         continue
     fi
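+
+    # NOTE: the ${kingdom}-specific kmer embedding from step 6 only exists when contigs
+    # were classified to that kingdom, so skip binning for kingdoms without one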
+    if [ ! -f $kmers ]
+    then
+        echo "${kingdom} file not found (kmers: ${kmers}), skipping."
+        continue
+    fi
+
     # script:
+    set -x
     autometa-binning \
         --kmers $kmers \
         --coverages $coverages \
@@ -240,26 +420,29 @@ for kingdom in ${kingdoms[@]};do
         --starting-rank superkingdom \
         --rank-filter superkingdom \
         --rank-name-filter $kingdom
+    { set +x; } 2>/dev/null
 done
 
-# Step 7: Create binning summary files
+# Step 8: Create binning summary files
 
 # input:
 # $ncbi -> User input
+# $gtdb -> User input
 # $assembly -> User input
+# $dbtype -> Set according to taxa_routine above
+# NOTE: $prefix is updated according to taxa_routine above
 
 kingdoms=(bacteria archaea)
 
 for kingdom in ${kingdoms[@]};do
 
     # kingdom-specific input:
-    binning_main="${outdir}/${simpleName}.${kingdom}.${cluster_method}.main.tsv" # Generated by step 6
-    markers="${outdir}/${simpleName}.${kingdom}.markers.tsv" # Generated by step 3
+    binning_main="${outdir}/${prefix}.${kingdom}.${cluster_method}.main.tsv" # Generated by step 7
+    markers="${outdir}/${simple_name}.${kingdom}.markers.tsv" # Generated by step 3
 
     # kingdom-specific output:
-    output_stats="${outdir}/${simpleName}_${kingdom}_metabin_stats.tsv"
-    output_taxonomy="${outdir}/${simpleName}_${kingdom}_metabin_taxonomy.tsv"
-    output_metabins="${outdir}/${simpleName}_${kingdom}_metabins"
+    output_stats="${outdir}/${prefix}_${kingdom}_metabin_stats.tsv"
+    output_taxonomy="${outdir}/${prefix}_${kingdom}_metabin_taxonomy.tsv"
+    output_metabins="${outdir}/${prefix}_${kingdom}_metabins"
 
     if [ ! -f $binning_main ]
     then
@@ -267,15 +450,24 @@ for kingdom in ${kingdoms[@]};do
         continue
     fi
 
-    # script:
+    if [[ "$taxa_routine" == "ncbi_gtdb" ]]
+    then
+        dbdir=$gtdb
+    else
+        dbdir=$ncbi
+    fi
+    set -x
     autometa-binning-summary \
         --binning-main $binning_main \
         --markers $markers \
         --metagenome $assembly \
-        --ncbi $ncbi \
+        --dbdir $dbdir \
+        --dbtype $dbtype \
        --output-stats $output_stats \
         --output-taxonomy $output_taxonomy \
         --output-metabins $output_metabins
+    { set +x; } 2>/dev/null
+
 done
 
 ######### END #########
diff --git a/workflows/autometa_flagged.sh b/workflows/autometa_flagged.sh
index b9ffa89aa..41ef05241 100644
--- a/workflows/autometa_flagged.sh
+++ b/workflows/autometa_flagged.sh
@@ -326,6 +326,7 @@ set -x
 autometa-taxonomy-lca \
     --blast $blast \
     --dbdir $ncbi \
+    --dbtype ncbi \
     --lca-output $lca \
     --sseqid2taxid-output $sseqid2taxid \
     --lca-error-taxids $error_taxids
@@ -338,7 +339,8 @@ set -x
 autometa-taxonomy-majority-vote \
     --lca $lca \
     --output $votes \
-    --dbdir $ncbi
+    --dbdir $ncbi \
+    --dbtype ncbi
 
 # Step 4.3: Split assigned taxonomies into kingdoms
 autometa-taxonomy \
@@ -347,7 +349,8 @@ autometa-taxonomy \
     --prefix $simple_name \
     --split-rank-and-write superkingdom \
     --assembly $filtered_assembly \
-    --ncbi $ncbi
+    --dbdir $ncbi \
+    --dbtype ncbi
 { set +x; } 2>/dev/null
 
 # Step 5: Perform k-mer counting on respective kingdoms
@@ -451,7 +454,8 @@ for kingdom in ${kingdoms[@]};do
         --binning-main $binning_main \
         --markers $markers \
         --metagenome $filtered_assembly \
-        --ncbi $ncbi \
+        --dbdir $ncbi \
+        --dbtype ncbi \
         --output-stats $output_stats \
         --output-taxonomy $output_taxonomy \
         --output-metabins $output_metabins