Skip to content

Commit

Permalink
Version 0.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
apcamargo committed Mar 6, 2020
1 parent af8aa40 commit 4f8ac61
Show file tree
Hide file tree
Showing 7 changed files with 457 additions and 0 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include LICENSE
85 changes: 85 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# taxopy

A Python package for obtaining complete lineages and the lowest common ancestor (LCA) from a set of taxonomic identifiers.

## Installation

There are two ways to install taxopy:

- Using pip:

```
pip install taxopy
```

- Using conda:

```
conda install -c conda-forge -c bioconda taxopy
```

## Usage

```python
import taxopy
```

First you need to download taxonomic information from NCBI's servers and put this data into a `TaxDb` object:


```python
taxdb = taxopy.TaxDb()
# You can also use your own set of taxonomy files:
taxdb = taxopy.TaxDb(nodes_dmp="taxdb/nodes.dmp", names_dmp="taxdb/names.dmp", keep_files=True)
```

The `TaxDb` object stores the name, rank and parent-child relationships of each taxonomic identifier:


```python
print(taxdb.taxid2name['2'])
print(taxdb.taxid2parent['2'])
print(taxdb.taxid2rank['2'])
```

Bacteria
131567
superkingdom


To get information about a given taxon, you can create a `Taxon` object using its taxonomic identifier:


```python
human = taxopy.Taxon('9606', taxdb)
gorilla = taxopy.Taxon('9593', taxdb)
lagomorpha = taxopy.Taxon('9975', taxdb)
```

Each `Taxon` object stores a variety of information, such as the rank, identifier and name of the input taxon, and the identifiers and names of all of its parent taxa:


```python
print(lagomorpha.rank)
print(lagomorpha.name)
print(lagomorpha.name_lineage)
```

order
Lagomorpha
['Lagomorpha', 'Glires', 'Euarchontoglires', 'Boreoeutheria', 'Eutheria', 'Theria', 'Mammalia', 'Amniota', 'Tetrapoda', 'Dipnotetrapodomorpha', 'Sarcopterygii', 'Euteleostomi', 'Teleostomi', 'Gnathostomata', 'Vertebrata', 'Craniata', 'Chordata', 'Deuterostomia', 'Bilateria', 'Eumetazoa', 'Metazoa', 'Opisthokonta', 'Eukaryota', 'cellular organisms', 'root']


You can get the lowest common ancestor of a list of taxa using the `find_lca` function:


```python
human_gorilla_lca = taxopy.find_lca([human, gorilla], taxdb)
print(human_gorilla_lca.name)
```

Homininae

## Acknowledgements

Some of the code used in taxopy was taken from the [CAT/BAT tool for taxonomic classification of contigs and metagenome-assembled genomes](https://github.com/dutilh/CAT).
48 changes: 48 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the taxopy package, available at:
# https://github.com/apcamargo/taxopy
#
# Taxopy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Contact: antoniop.camargo@gmail.com

"""The setup script."""

from setuptools import setup, find_packages

# Read the long description through a context manager so the file handle is
# closed deterministically, and decode it explicitly as UTF-8 so the build does
# not depend on the locale's default encoding.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Package metadata and build configuration for taxopy.
setup(
    name="taxopy",
    version="0.1.0",
    packages=find_packages(),
    license="GNU General Public License v3.0",
    description="A Python package for obtaining complete lineages and the lowest common ancestor (LCA) from a set of taxonomic identifiers.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    python_requires=">=3.5",
    url="https://apcamargo.github.io/taxopy/",
    keywords=["bioinformatics", "taxonomy"],
    author="Antonio Pedro Camargo",
    author_email="antoniop.camargo@gmail.com",
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "Natural Language :: English",
        "Topic :: Software Development :: Libraries",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
        "License :: OSI Approved :: GNU General Public License (GPL)",
        "Programming Language :: Python :: 3",
    ],
)
22 changes: 22 additions & 0 deletions taxopy/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
#
# This file is part of the taxopy package, available at:
# https://github.com/apcamargo/taxopy
#
# Taxopy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Contact: antoniop.camargo@gmail.com

from taxopy.core import TaxDb, Taxon
from taxopy.utilities import find_lca
218 changes: 218 additions & 0 deletions taxopy/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
# -*- coding: utf-8 -*-
#
# This file is part of the taxopy package, available at:
# https://github.com/apcamargo/taxopy
#
# Taxopy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Contact: antoniop.camargo@gmail.com

import os
import tarfile
import urllib.request
from taxopy.exceptions import DownloadError, ExtractionError, TaxidError


class TaxDb:
    """
    Load NCBI taxonomy data into in-memory lookup dictionaries.

    Parameters
    ----------
    taxdb_dir : str, optional
        A directory to download NCBI's taxonomy database files to. If the
        directory does not exist it will be created. When omitted, the current
        working directory is used.
    nodes_dmp : str, optional
        The path for a pre-downloaded `nodes.dmp` file. If both `nodes.dmp` and
        `names.dmp` are supplied NCBI's taxonomy database won't be downloaded.
    names_dmp : str, optional
        The path for a pre-downloaded `names.dmp` file. If both `names.dmp` and
        `nodes.dmp` are supplied NCBI's taxonomy database won't be downloaded.
    keep_files : bool, default False
        Keep the `nodes.dmp` and `names.dmp` files after the TaxDb object is
        created. When False (the default) both files are deleted once they
        have been parsed, including files supplied through `nodes_dmp` and
        `names_dmp`, and the database directory is removed if that leaves it
        empty and it is not the current working directory.

    Attributes
    ----------
    taxid2name : dict
        A dictionary where the keys are taxonomic identifiers and the values are
        their corresponding names.
    taxid2parent: dict
        A dictionary where the keys are taxonomic identifiers and the values are
        the taxonomic identifiers of their corresponding parent taxon.
    taxid2rank: dict
        A dictionary where the keys are taxonomic identifiers and the values are
        their corresponding ranks.

    Raises
    ------
    DownloadError
        If the download of the taxonomy database fails.
    ExtractionError
        If the decompression of the taxonomy database fails.
    """

    def __init__(
        self,
        *,
        taxdb_dir: str = None,
        nodes_dmp: str = None,
        names_dmp: str = None,
        keep_files: bool = False
    ):
        if not taxdb_dir:
            self._taxdb_dir = os.getcwd()
        else:
            # `exist_ok=True` avoids the check-then-create race of testing
            # `isdir` first; it still raises if the path exists as a file.
            os.makedirs(taxdb_dir, exist_ok=True)
            self._taxdb_dir = taxdb_dir
        # Only skip the download when BOTH files were supplied by the caller.
        if not nodes_dmp or not names_dmp:
            self._nodes_dmp, self._names_dmp = self._download_taxonomy()
        else:
            self._nodes_dmp, self._names_dmp = nodes_dmp, names_dmp
        self.taxid2parent, self.taxid2rank = self._import_nodes()
        self.taxid2name = self._import_names()
        if not keep_files:
            self._delete_files()

    def _download_taxonomy(self):
        """Download NCBI's taxdump archive and extract the two dump files.

        Returns
        -------
        tuple of str
            The paths of the extracted `nodes.dmp` and `names.dmp` files.

        Raises
        ------
        DownloadError
            If retrieving the archive fails.
        ExtractionError
            If opening the archive or extracting either file fails.
        """
        url = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
        tmp_taxonomy_file = os.path.join(self._taxdb_dir, "taxdump.tar.gz")
        try:
            urllib.request.urlretrieve(url, tmp_taxonomy_file)
        except Exception as err:
            # `Exception` (not a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate; chaining keeps the root cause visible.
            raise DownloadError(
                "Download of taxonomy files failed. NCBI's server may be offline."
            ) from err
        try:
            with tarfile.open(tmp_taxonomy_file) as tf:
                tf.extract("nodes.dmp", path=self._taxdb_dir)
                tf.extract("names.dmp", path=self._taxdb_dir)
        except Exception as err:
            raise ExtractionError(
                "Something went wrong while extracting the taxonomy files."
            ) from err
        finally:
            # Remove the archive whether or not extraction succeeded so a
            # failed run does not leave the tarball behind.
            os.remove(tmp_taxonomy_file)
        return (
            os.path.join(self._taxdb_dir, "nodes.dmp"),
            os.path.join(self._taxdb_dir, "names.dmp"),
        )

    def _import_nodes(self):
        """Parse `nodes.dmp` into taxid -> parent and taxid -> rank dictionaries."""
        taxid2parent = {}
        taxid2rank = {}
        with open(self._nodes_dmp, "r") as f:
            for line in f:
                # After splitting on tabs the values used here sit at the even
                # indexes: 0 = taxid, 2 = parent taxid, 4 = rank.
                line = line.split("\t")
                taxid = line[0]
                parent = line[2]
                rank = line[4]
                taxid2parent[taxid] = parent
                taxid2rank[taxid] = rank
        return taxid2parent, taxid2rank

    def _import_names(self):
        """Parse `names.dmp` into a taxid -> scientific name dictionary."""
        taxid2name = {}
        with open(self._names_dmp, "r") as f:
            for line in f:
                line = line.split("\t")
                # Field 6 is the name class; only scientific names are kept, so
                # synonyms and common names are ignored.
                if line[6] == "scientific name":
                    taxid = line[0]
                    name = line[2]
                    taxid2name[taxid] = name
        return taxid2name

    def _delete_files(self):
        """Delete the dump files and, when safe, the emptied database directory."""
        os.remove(self._nodes_dmp)
        os.remove(self._names_dmp)
        # Resolve both paths before comparing: `taxdb_dir` may be relative
        # (e.g. ".") or reach the working directory through a symlink, and the
        # working directory must never be removed.
        is_cwd = os.path.realpath(self._taxdb_dir) == os.path.realpath(os.getcwd())
        if not is_cwd and not os.listdir(self._taxdb_dir):
            os.rmdir(self._taxdb_dir)


class Taxon:
    """
    Represent a single taxon together with its full lineage.

    Parameters
    ----------
    taxid : str
        A NCBI taxonomic identifier.
    taxdb : TaxDb
        A TaxDb object.

    Attributes
    ----------
    taxid : str
        The NCBI taxonomic identifier the object represents (e.g., '9606').
    name: str
        The name of the taxon (e.g., 'Homo sapiens').
    rank: str
        The rank of the taxon (e.g., 'species').
    taxid_lineage: list
        The taxonomic identifiers of the whole lineage of the taxon, ordered
        from the most specific to the most general.
    name_lineage: list
        The names of the whole lineage of the taxon, ordered from the most
        specific to the most general.
    rank_name_dictionary: dict
        A dictionary mapping each named rank found in the lineage to the name
        of the taxon holding that rank. Entries ranked 'no rank' are left out.

    Raises
    ------
    TaxidError
        If the input string is not a valid NCBI taxonomic identifier.
    """

    def __init__(self, taxid: str, taxdb: TaxDb):
        self.taxid = taxid
        # Validity is decided by membership in the name table.
        if self.taxid not in taxdb.taxid2name:
            raise TaxidError("The input string is not a valid NCBI taxonomic identifier.")
        ranks, names = taxdb.taxid2rank, taxdb.taxid2name
        self.name = names[self.taxid]
        self.rank = ranks[self.taxid]
        self.taxid_lineage = self._find_lineage(taxdb.taxid2parent)
        self.name_lineage = self._convert_to_names(ranks, names)
        self.rank_name_dictionary = self._convert_to_rank_name_dictionary(ranks, names)

    def __repr__(self):
        # Show the lineage from the most general taxon down to this one.
        return " -> ".join(self.name_lineage[::-1])

    def _find_lineage(self, taxid2parent):
        """Walk up the parent links until reaching a taxon that is its own parent."""
        lineage = [self.taxid]
        parent = taxid2parent[lineage[-1]]
        while parent != lineage[-1]:
            lineage.append(parent)
            parent = taxid2parent[parent]
        return lineage

    def _convert_to_names(self, taxid2rank, taxid2name):
        """Translate every identifier of the lineage into its name."""
        return [taxid2name[member] for member in self.taxid_lineage]

    def _convert_to_rank_name_dictionary(self, taxid2rank, taxid2name):
        """Map each named rank of the lineage to the matching taxon name."""
        return {
            taxid2rank[member]: taxid2name[member]
            for member in self.taxid_lineage
            if taxid2rank[member] != "no rank"
        }
Loading

0 comments on commit 4f8ac61

Please sign in to comment.