Version 0.1.0

apcamargo · Mar 6, 2020 · 4f8ac61 · 4f8ac61
1 parent af8aa40
commit 4f8ac61
Show file tree

Hide file tree

Showing 7 changed files with 457 additions and 0 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+include LICENSE
diff --git a/README.md b/README.md
@@ -0,0 +1,85 @@
+# taxopy
+
+A Python package for obtaining complete lineages and the lowest common ancestor (LCA) from a set of taxonomic identifiers.
+
+## Installation
+
+There are two ways to install taxopy:
+
+  - Using pip:
+
+```
+pip install taxopy
+```
+
+  - Using conda:
+
+```
+conda install -c conda-forge -c bioconda taxopy
+```
+
+## Usage
+
+```python
+import taxopy
+```
+
+First you need to download taxonomic information from NCBI's servers and put this data into a `TaxDb` object:
+
+
+```python
+taxdb = taxopy.TaxDb()
+# You can also use your own set of taxonomy files:
+taxdb = taxopy.TaxDb(nodes_dmp="taxdb/nodes.dmp", names_dmp="taxdb/names.dmp", keep_files=True)
+```
+
+The `TaxDb` object stores the name, rank and parent-child relationships of each taxonomic identifier:
+
+
+```python
+print(taxdb.taxid2name['2'])
+print(taxdb.taxid2parent['2'])
+print(taxdb.taxid2rank['2'])
+```
+
+    Bacteria
+    131567
+    superkingdom
+
+
+To get information about a given taxon, you can create a `Taxon` object using its taxonomic identifier:
+
+
+```python
+human = taxopy.Taxon('9606', taxdb)
+gorilla = taxopy.Taxon('9593', taxdb)
+lagomorpha = taxopy.Taxon('9975', taxdb)
+```
+
+Each `Taxon` object stores a variety of information, such as the rank, identifier and name of the input taxon, and the identifiers and names of all its parent taxa:
+
+
+```python
+print(lagomorpha.rank)
+print(lagomorpha.name)
+print(lagomorpha.name_lineage)
+```
+
+    order
+    Lagomorpha
+    ['Lagomorpha', 'Glires', 'Euarchontoglires', 'Boreoeutheria', 'Eutheria', 'Theria', 'Mammalia', 'Amniota', 'Tetrapoda', 'Dipnotetrapodomorpha', 'Sarcopterygii', 'Euteleostomi', 'Teleostomi', 'Gnathostomata', 'Vertebrata', 'Craniata', 'Chordata', 'Deuterostomia', 'Bilateria', 'Eumetazoa', 'Metazoa', 'Opisthokonta', 'Eukaryota', 'cellular organisms', 'root']
+
+
+You can get the lowest common ancestor of a list of taxa using the `find_lca` function:
+
+
+```python
+human_gorilla_lca = taxopy.find_lca([human, gorilla], taxdb)
+print(human_gorilla_lca.name)
+```
+
+    Homininae
+
+## Acknowledgements
+
+Some of the code used in taxopy was taken from the [CAT/BAT tool for taxonomic classification of contigs and metagenome-assembled genomes](https://github.com/dutilh/CAT).
diff --git a/setup.py b/setup.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#   This file is part of the taxopy package, available at:
+#   https://github.com/apcamargo/taxopy
+#
+#   Taxopy is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+#   Contact: antoniop.camargo@gmail.com
+
+"""The setup script."""
+
+from setuptools import setup, find_packages
+
+# Read the long description inside a context manager so the file handle is
+# closed deterministically, and decode it explicitly as UTF-8 so the result
+# does not depend on the platform's default encoding.
+with open("README.md", encoding="utf-8") as readme_file:
+    long_description = readme_file.read()
+
+setup(
+    name="taxopy",
+    version="0.1.0",
+    packages=find_packages(),
+    license="GNU General Public License v3.0",
+    description="A Python package for obtaining complete lineages and the lowest common ancestor (LCA) from a set of taxonomic identifiers.",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    python_requires=">=3.5",
+    url="https://apcamargo.github.io/taxopy/",
+    keywords=["bioinformatics", "taxonomy"],
+    author="Antonio Pedro Camargo",
+    author_email="antoniop.camargo@gmail.com",
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Science/Research",
+        "Natural Language :: English",
+        "Topic :: Software Development :: Libraries",
+        "Topic :: Scientific/Engineering :: Bio-Informatics",
+        "License :: OSI Approved :: GNU General Public License (GPL)",
+        "Programming Language :: Python :: 3",
+    ],
+)
diff --git a/taxopy/__init__.py b/taxopy/__init__.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+#
+#   This file is part of the taxopy package, available at:
+#   https://github.com/apcamargo/taxopy
+#
+#   Taxopy is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+#   Contact: antoniop.camargo@gmail.com
+
+from taxopy.core import TaxDb, Taxon
+from taxopy.utilities import find_lca
diff --git a/taxopy/core.py b/taxopy/core.py
@@ -0,0 +1,218 @@
+# -*- coding: utf-8 -*-
+#
+#   This file is part of the taxopy package, available at:
+#   https://github.com/apcamargo/taxopy
+#
+#   Taxopy is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+#   Contact: antoniop.camargo@gmail.com
+
+import os
+import tarfile
+import urllib.request
+from taxopy.exceptions import DownloadError, ExtractionError, TaxidError
+
+
+class TaxDb:
+    """
+    Create an object of the TaxDb class.
+
+    Parameters
+    ----------
+    taxdb_dir : str, optional
+        A directory to download NCBI's taxonomy database files to. If the
+        directory does not exist it will be created. When omitted, the current
+        working directory is used.
+    nodes_dmp : str, optional
+        The path for a pre-downloaded `nodes.dmp` file. If both `nodes.dmp` and
+        `names.dmp` are supplied NCBI's taxonomy database won't be downloaded.
+    names_dmp : str, optional
+        The path for a pre-downloaded `names.dmp` file. If both `names.dmp` and
+        `nodes.dmp` are supplied NCBI's taxonomy database won't be downloaded.
+    keep_files : bool, default False
+        Keep the `nodes.dmp` and `names.dmp` files after the TaxDb object is
+        created. By default the files are deleted (this includes files passed
+        through `nodes_dmp` and `names_dmp`), and the database directory is
+        removed as well if that leaves it empty. The current working directory
+        is never removed.
+
+    Attributes
+    ----------
+    taxid2name : dict
+        A dictionary where the keys are taxonomic identifiers and the values are
+        their corresponding names.
+    taxid2parent: dict
+        A dictionary where the keys are taxonomic identifiers and the values are
+        the taxonomic identifiers of their corresponding parent taxon.
+    taxid2rank: dict
+        A dictionary where the keys are taxonomic identifiers and the values are
+        their corresponding ranks.
+
+    Raises
+    ------
+    DownloadError
+        If the download of the taxonomy database fails.
+    ExtractionError
+        If the decompression of the taxonomy database fails.
+    """
+
+    def __init__(
+        self,
+        *,
+        taxdb_dir: str = None,
+        nodes_dmp: str = None,
+        names_dmp: str = None,
+        keep_files: bool = False
+    ):
+        if not taxdb_dir:
+            self._taxdb_dir = os.getcwd()
+        else:
+            # Create the directory (and any missing parents) when needed.
+            os.makedirs(taxdb_dir, exist_ok=True)
+            self._taxdb_dir = taxdb_dir
+        # The database is downloaded unless *both* files were supplied.
+        if not nodes_dmp or not names_dmp:
+            self._nodes_dmp, self._names_dmp = self._download_taxonomy()
+        else:
+            self._nodes_dmp, self._names_dmp = nodes_dmp, names_dmp
+        self.taxid2parent, self.taxid2rank = self._import_nodes()
+        self.taxid2name = self._import_names()
+        if not keep_files:
+            self._delete_files()
+
+    def _download_taxonomy(self):
+        """Download NCBI's taxdump and extract `nodes.dmp` and `names.dmp`.
+
+        Returns the paths of the extracted `nodes.dmp` and `names.dmp` files.
+        """
+        url = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
+        tmp_taxonomy_file = os.path.join(self._taxdb_dir, "taxdump.tar.gz")
+        try:
+            urllib.request.urlretrieve(url, tmp_taxonomy_file)
+        except Exception as error:
+            # `Exception` (rather than a bare `except`) lets KeyboardInterrupt
+            # and SystemExit propagate; the original cause is kept chained.
+            raise DownloadError(
+                "Download of taxonomy files failed. NCBI's server may be offline."
+            ) from error
+        try:
+            with tarfile.open(tmp_taxonomy_file) as tf:
+                tf.extract("nodes.dmp", path=self._taxdb_dir)
+                tf.extract("names.dmp", path=self._taxdb_dir)
+        except Exception as error:
+            raise ExtractionError(
+                "Something went wrong while extracting the taxonomy files."
+            ) from error
+        finally:
+            # Remove the tarball whether or not the extraction succeeded.
+            os.remove(tmp_taxonomy_file)
+        return (
+            os.path.join(self._taxdb_dir, "nodes.dmp"),
+            os.path.join(self._taxdb_dir, "names.dmp"),
+        )
+
+    def _import_nodes(self):
+        """Parse `nodes.dmp` into taxid -> parent and taxid -> rank dictionaries."""
+        taxid2parent = {}
+        taxid2rank = {}
+        with open(self._nodes_dmp, "r") as f:
+            for line in f:
+                # Splitting on tabs leaves the "|" separators at the odd
+                # indices, so the values of interest sit at indices 0, 2 and 4.
+                fields = line.split("\t")
+                taxid = fields[0]
+                taxid2parent[taxid] = fields[2]
+                taxid2rank[taxid] = fields[4]
+        return taxid2parent, taxid2rank
+
+    def _import_names(self):
+        """Parse `names.dmp` into a taxid -> scientific name dictionary."""
+        taxid2name = {}
+        with open(self._names_dmp, "r") as f:
+            for line in f:
+                fields = line.split("\t")
+                # Only entries whose name class is "scientific name" are kept.
+                if fields[6] == "scientific name":
+                    taxid2name[fields[0]] = fields[2]
+        return taxid2name
+
+    def _delete_files(self):
+        """Delete the taxonomy files and the database directory if it is empty."""
+        os.remove(self._nodes_dmp)
+        os.remove(self._names_dmp)
+        # Compare resolved paths so that spellings such as "." are recognised
+        # as the current working directory, which must never be removed.
+        is_cwd = os.path.realpath(self._taxdb_dir) == os.path.realpath(os.getcwd())
+        if not os.listdir(self._taxdb_dir) and not is_cwd:
+            os.rmdir(self._taxdb_dir)
+
+
+class Taxon:
+    """
+    Represent a single taxon together with its complete lineage.
+
+    Parameters
+    ----------
+    taxid : str
+        An NCBI taxonomic identifier.
+    taxdb : TaxDb
+        A TaxDb object.
+
+    Attributes
+    ----------
+    taxid : str
+        The NCBI taxonomic identifier the object represents (e.g., '9606').
+    name: str
+        The name of the taxon (e.g., 'Homo sapiens').
+    rank: str
+        The rank of the taxon (e.g., 'species').
+    taxid_lineage: list
+        An ordered list containing the taxonomic identifiers of the whole lineage
+        of the taxon, from the most specific to the most general.
+    name_lineage: list
+        An ordered list containing the names of the whole lineage of the taxon,
+        from the most specific to the most general.
+    rank_name_dictionary: dict
+        A dictionary where the keys are named ranks and the values are the names
+        of the taxa that correspond to each of the named ranks in the lineage.
+        Taxa whose rank is 'no rank' are left out.
+
+    Raises
+    ------
+    TaxidError
+        If the input string is not a valid NCBI taxonomic identifier.
+    """
+
+    def __init__(self, taxid: str, taxdb: TaxDb):
+        self.taxid = taxid
+        # An identifier is considered valid only if the database has a name for it.
+        if self.taxid not in taxdb.taxid2name:
+            raise TaxidError("The input string is not a valid NCBI taxonomic identifier.")
+        ranks, names = taxdb.taxid2rank, taxdb.taxid2name
+        self.name = names[self.taxid]
+        self.rank = ranks[self.taxid]
+        self.taxid_lineage = self._find_lineage(taxdb.taxid2parent)
+        self.name_lineage = self._convert_to_names(ranks, names)
+        self.rank_name_dictionary = self._convert_to_rank_name_dictionary(ranks, names)
+
+    def __repr__(self):
+        # Display the lineage from the most general to the most specific taxon.
+        return " -> ".join(self.name_lineage[::-1])
+
+    def _find_lineage(self, taxid2parent):
+        """Walk up the parent links until reaching a taxon that is its own parent."""
+        lineage = [self.taxid]
+        parent = taxid2parent[self.taxid]
+        while parent != lineage[-1]:
+            lineage.append(parent)
+            parent = taxid2parent[parent]
+        return lineage
+
+    def _convert_to_names(self, taxid2rank, taxid2name):
+        """Translate every identifier of the lineage into its name."""
+        return [taxid2name[taxid] for taxid in self.taxid_lineage]
+
+    def _convert_to_rank_name_dictionary(self, taxid2rank, taxid2name):
+        """Map each named rank found in the lineage to the name of its taxon."""
+        return {
+            taxid2rank[taxid]: taxid2name[taxid]
+            for taxid in self.taxid_lineage
+            if taxid2rank[taxid] != "no rank"
+        }