From e60a35ae78dec188de5c9d8b08092a1a420cdf5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Gudy=C5=9B?= Date: Tue, 22 Oct 2024 23:15:37 +0200 Subject: [PATCH] Installation through PyPi, Set default `--min-kmers` to 20 Co-authored-by: aziele --- 3rd_party/clusty | 2 +- 3rd_party/kmer-db | 2 +- 3rd_party/lz-ani | 2 +- 3rd_party/ref-utils | 2 +- MANIFEST.in | 4 ++ README.md | 20 ++++---- pyproject.toml | 56 +++++++++++++++++++++ vclust.py | 118 +++++++++++++++++++++++++++++++++++--------- 8 files changed, 170 insertions(+), 36 deletions(-) create mode 100644 MANIFEST.in create mode 100644 pyproject.toml diff --git a/3rd_party/clusty b/3rd_party/clusty index d80c26a..7b109d4 160000 --- a/3rd_party/clusty +++ b/3rd_party/clusty @@ -1 +1 @@ -Subproject commit d80c26aec4c09a4715cb43763fa66c5baf8d9968 +Subproject commit 7b109d42a4c603e26dead5b566d43c0506a858d7 diff --git a/3rd_party/kmer-db b/3rd_party/kmer-db index 18719c7..742b494 160000 --- a/3rd_party/kmer-db +++ b/3rd_party/kmer-db @@ -1 +1 @@ -Subproject commit 18719c7329b4f3c9bb0a0ac44d030c517c7a1bbb +Subproject commit 742b4942b71271e8b0a1be63405e86b0d1f795ec diff --git a/3rd_party/lz-ani b/3rd_party/lz-ani index cdcaa0c..e3cc571 160000 --- a/3rd_party/lz-ani +++ b/3rd_party/lz-ani @@ -1 +1 @@ -Subproject commit cdcaa0ccb416d48a0689839cfdf78faaf67bf8a9 +Subproject commit e3cc571d973aedf634afd349c641dbb1328ea493 diff --git a/3rd_party/ref-utils b/3rd_party/ref-utils index cbee86d..21d36c7 160000 --- a/3rd_party/ref-utils +++ b/3rd_party/ref-utils @@ -1 +1 @@ -Subproject commit cbee86d539a83ded811a9abbdb3f8c892b4fdc07 +Subproject commit 21d36c7c5a629e23446400d51cfd317c57ac5dc7 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..887a3f6 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include README.md +include LICENSE +recursive-include bin * +recursive-exclude 3rd_party * diff --git a/README.md b/README.md index ded09b4..0373e77 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,14 @@ # 
Vclust logo Vclust -![version](https://img.shields.io/badge/version-1.2.7-blue.svg) -[![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/vclust/total.svg?style=flag&label=GitHub%20downloads)](https://github.com/refresh-bio/vclust/releases) -[![Bioconda downloads](https://img.shields.io/conda/dn/bioconda/vclust.svg?style=flag&label=Bioconda%20downloads)](https://anaconda.org/bioconda/vclust) +![version](https://img.shields.io/badge/version-1.2.8-blue.svg) +![PyPI - Version](https://img.shields.io/pypi/v/vclust?label=PyPI%20version&color=blue) [![Build and tests](../../workflows/Build%20and%20tests/badge.svg)](../../actions/workflows/main.yml) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) +![PyPI - Downloads](https://img.shields.io/pypi/dm/vclust?label=PyPI%20downloads) +[![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/vclust/total.svg?style=flag&label=GitHub%20downloads)](https://github.com/refresh-bio/vclust/releases) +[![Bioconda downloads](https://img.shields.io/conda/dn/bioconda/vclust.svg?style=flag&label=Bioconda%20downloads)](https://anaconda.org/bioconda/vclust) + ![x86-64](https://img.shields.io/static/v1?label=%E2%80%8B&message=x86-64&color=yellow&logo=PCGamingWiki&logoColor=white) ![ARM](https://img.shields.io/static/v1?label=%E2%80%8B&message=ARM&color=yellow&logo=Raspberry%20Pi&logoColor=white) ![Apple M](https://img.shields.io/static/v1?label=%E2%80%8B&message=Apple%20M&color=yellow&logo=Apple&logoColor=white) @@ -51,18 +54,17 @@ For datasets containing up to 1000 viral genomes, Vclust is available at [http:/ ## Quick start ```bash -# Clone repository and build Vclust -git clone --recurse-submodules https://github.com/refresh-bio/vclust -cd vclust && make -j +# Install Vclust (requires Python >= 3.7) +pip install vclust # Prefilter similar genome sequence pairs before conducting pairwise alignments. 
-./vclust.py prefilter -i example/multifasta.fna -o fltr.txt +vclust prefilter -i example/multifasta.fna -o fltr.txt # Align similar genome sequence pairs and calculate pairwise ANI measures. -./vclust.py align -i example/multifasta.fna -o ani.tsv --filter fltr.txt +vclust align -i example/multifasta.fna -o ani.tsv --filter fltr.txt # Cluster genome sequences based on given ANI measure and minimum threshold. -./vclust.py cluster -i ani.tsv -o clusters.tsv --ids ani.ids.tsv --metric ani --ani 0.95 +vclust cluster -i ani.tsv -o clusters.tsv --ids ani.ids.tsv --metric ani --ani 0.95 ``` ## Documentation diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..3fa8bdb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,56 @@ +[build-system] +requires = ["setuptools>=61.0.0", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +py-modules = ["vclust"] + +[tool.setuptools.packages.find] +where = ["./"] + +[project] +name = "vclust-test" +description = """Fast and accurate tool for calculating \ +Average Nucleotide Identity (ANI) and clustering virus \ +genomes and metagenomic contigs""" +readme = "README.md" +license = { file = "LICENSE" } +authors = [ + { name = "Andrzej Zielezinski", email = "andrzej.zielezinski@amu.edu.pl" }, + { name = "Adam Gudyś", email = "adam.gudys@polsl.pl" }, + { name = "Sebastian Deorowicz", email = "sebastian.deorowicz@polsl.pl" }, +] +requires-python = ">=3.7" +dynamic = ["version"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Natural Language :: English", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Operating System :: POSIX :: Linux", + "Operating System :: MacOS", + "License :: OSI Approved :: GNU Affero General Public License v3", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 
3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +[tool.setuptools.dynamic] +version = { attr = "vclust.__version__" } + +[tool.setuptools.package-data] +"*" = ["bin/*"] + +[project.scripts] +vclust = "vclust:main" + +[project.urls] +Homepage = "https://github.com/refresh-bio/vclust" +Documentation = "https://github.com/refresh-bio/vclust/wiki" +Website = "http://vclust.org" \ No newline at end of file diff --git a/vclust.py b/vclust.py index be353f6..00c7269 100755 --- a/vclust.py +++ b/vclust.py @@ -9,13 +9,14 @@ import multiprocessing import os import pathlib +import platform import shutil import subprocess import sys import typing import uuid -__version__ = '1.2.7' +__version__ = '1.2.8' DEFAULT_THREAD_COUNT = min(multiprocessing.cpu_count(), 64) @@ -59,7 +60,7 @@ def ranged_float_type(value): return f parser = argparse.ArgumentParser( - description=f'%(prog)s v.{__version__}: calculate ANI and cluster ' + description=f'%(prog)s v{__version__}: calculate ANI and cluster ' 'virus (meta)genome sequences', add_help=False, ) @@ -117,7 +118,7 @@ def ranged_float_type(value): '--min-kmers', metavar="", type=int, - default=10, + default=20, help='Filter genome pairs based on minimum number of shared k-mers ' '[%(default)s]' ) @@ -531,7 +532,7 @@ def ranged_float_type(value): '--bin', metavar='', type=pathlib.Path, - dest="BIN_CLUSTY", + dest="bin_clusty", default=f'{BIN_CLUSTY}', help='Path to the Clusty binary [%(default)s]' ) @@ -603,8 +604,8 @@ def get_uuid() -> str: return f'vclust-{str(uuid.uuid4().hex)[:10]}' -def validate_binary(bin_path: pathlib.Path) -> pathlib.Path: - """Validates the existence and executability of a binary file. +def _validate_binary(bin_path: pathlib.Path) -> pathlib.Path: + """Validates the presence and executability of a binary file. 
This function checks if the provided path points to an existing binary file and if it is executable. It also attempts to run the binary to ensure it @@ -618,16 +619,16 @@ def validate_binary(bin_path: pathlib.Path) -> pathlib.Path: pathlib.Path: The resolved path to the binary file. Raises: - SystemExit: If the binary file does not exist, is not executable, or - if running the binary encounters an error. + RuntimeError: If the binary file does not exist, is not executable, + or if running the binary encounters an error. """ bin_path = bin_path.resolve() if not bin_path.exists(): - exit(f'error: Executable not found: {bin_path}') + raise RuntimeError(f'File not found: {bin_path}') if not bin_path.is_file() or not os.access(bin_path, os.X_OK): - exit(f'error: Binary file not executable: {bin_path}') + raise RuntimeError(f'Binary file not executable: {bin_path}') try: subprocess.run( @@ -638,14 +639,21 @@ def validate_binary(bin_path: pathlib.Path) -> pathlib.Path: check=True ) except subprocess.CalledProcessError as e: - exit(f'error: Running {bin_path} failed with message: {e.stderr}') + raise RuntimeError(f'Running {bin_path} failed with message: {e.stderr}') except OSError as e: - exit(f'error: OSError in {bin_path} - {e}') + raise RuntimeError(f'OSError in {bin_path} - {e}') except Exception as e: - exit(f'error: Unexpected error in binary {bin_path} - {e}') + raise RuntimeError(f'Unexpected error in binary {bin_path} - {e}') return bin_path +def validate_binary(bin_path: pathlib.Path) -> pathlib.Path: + try: + return _validate_binary(bin_path) + except RuntimeError as e: + sys.exit(f'error: {e}') + + def validate_args_fasta_input(args, parser) -> argparse.Namespace: """Validates the arguments for FASTA input.""" args.is_multifasta = True @@ -732,13 +740,13 @@ def run( ) except subprocess.CalledProcessError as e: logger.error(f'Process {" ".join(cmd)} failed with message: {e.stderr}') - exit(1) + sys.exit(1) except OSError as e: logger.error(f'OSError: {" 
".join(cmd)} failed with message: {e}') - exit(1) + sys.exit(1) except Exception as e: logger.error(f'Unexpected: {" ".join(cmd)} failed with message: {e}') - exit(1) + sys.exit(1) logger.info(f'Done') return process @@ -1145,11 +1153,75 @@ def cmd_clusty( return cmd -def vclust_info(): - print(f'Vclust {__version__}') - for bin_path in [BIN_KMERDB, BIN_FASTASPLIT, BIN_LZANI, BIN_CLUSTY]: - validate_binary(bin_path) - print(f'{bin_path.name:<20} ok') +def vclust_info() -> None: + """ + Displays the Vclust version, installation paths, and binary dependencies. + Checks for the presence and executable status of required binaries. + + Exits with a non-zero status if any dependencies are missing or + not executable. + + Returns: + None + + Raises: + SystemExit: If any binary dependencies are missing or not executable. + + """ + # ANSI color codes for terminal output. + GREEN = '\033[92m' + RED = '\033[91m' + RESET = '\033[0m' + + binaries = { + 'Kmer-db': BIN_KMERDB, + 'LZ-ANI': BIN_LZANI, + 'Clusty': BIN_CLUSTY, + 'multi-fasta-split': BIN_FASTASPLIT, + } + + output_lines = [ + f'Vclust version {__version__} (Python {platform.python_version()})', + '', + 'Installed at:', + f' {pathlib.Path(__file__).resolve()}', + f' {BIN_DIR.resolve()}', + '', + 'Binary dependencies:', + ] + + errors = [] # List to collect any errors encountered during binary checks. + + # Check each binary's presence and version. + for name, path in binaries.items(): + try: + _validate_binary(path) + version = subprocess.run( + [str(path), '-version' if name == 'Kmer-db' else '--version'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True + ).stderr.strip() + output_lines.append(f' {name:<20} v{version:<10}') + except Exception as e: + output_lines.append(f' {name:<20} [error]') + errors.append((name, e)) + + # Append the status summary based on any encountered errors. 
+ output_lines.append('') + + if errors: + output_lines.append(f'{RED}Status: error{RESET}') + output_lines.extend(f" - {name}: {error}" for name, error in errors) + else: + output_lines.append(f'{GREEN}Status: ok{RESET}') + + # Output the complete information. + print('\n'.join(output_lines)) + + if errors: + sys.exit(1) class CustomHelpFormatter(argparse.HelpFormatter): @@ -1324,7 +1396,7 @@ def main(): # Cluster elif args.command == 'cluster': - args.BIN_CLUSTY = validate_binary(args.BIN_CLUSTY) + args.bin_clusty = validate_binary(args.bin_clusty) args = validate_args_cluster(args, parser) cmd = cmd_clusty( @@ -1344,7 +1416,7 @@ def main(): leiden_resolution=args.leiden_resolution, leiden_beta=args.leiden_beta, leiden_iterations=args.leiden_iterations, - bin_path=args.BIN_CLUSTY, + bin_path=args.bin_clusty, ) p = run(cmd, args.verbose, logger)