From a6bfd1dbfd3679890bf08d358c25e061e0ab281e Mon Sep 17 00:00:00 2001
From: Hamdah Shafqat Abbasi <74803092+hamshkhawar@users.noreply.github.com>
Date: Tue, 6 Aug 2024 07:13:45 -0400
Subject: [PATCH] Updating hdbscan-clustering plugin (#498)

* fix merge conflicts
* fix apply manifest
* fix apply manifest
* remove file
* updated hdbscan-clustering-plugin
* fix bug in tests
* fixed random generation of floats
* fixed docker file and shell script for running docker
* fixed docker files
* renamed plugin and fixed merged conflicts
* fixed docker files
---
 .../hdbscan-clustering-tool/.bumpversion.cfg  |  27 +++
 clustering/hdbscan-clustering-tool/.gitignore |  23 +++
 clustering/hdbscan-clustering-tool/Dockerfile |  21 +++
 clustering/hdbscan-clustering-tool/README.md  |  52 ++++++
 clustering/hdbscan-clustering-tool/VERSION    |   1 +
 .../hdbscan-clustering-tool/build-docker.sh   |   4 +
 .../package-release.sh                        |  16 ++
 .../hdbscan-clustering-tool/plugin.json       | 123 ++++++++++++++
 .../hdbscan-clustering-tool/pyproject.toml    |  32 ++++
 .../hdbscan-clustering-tool/run-docker.sh     |  23 +++
 .../clustering/hdbscan_clustering/__init__.py |   4 +
 .../clustering/hdbscan_clustering/__main__.py | 156 ++++++++++++++++++
 .../hdbscan_clustering/hdbscan_clustering.py  | 150 +++++++++++++++++
 .../hdbscan-clustering-tool/tests/__init__.py |   1 +
 .../hdbscan-clustering-tool/tests/conftest.py |  48 ++++++
 .../hdbscan-clustering-tool/tests/test_cli.py |  74 +++++++++
 .../tests/test_hdbscan_clustering.py          |  49 ++++++
 17 files changed, 804 insertions(+)
 create mode 100644 clustering/hdbscan-clustering-tool/.bumpversion.cfg
 create mode 100644 clustering/hdbscan-clustering-tool/.gitignore
 create mode 100644 clustering/hdbscan-clustering-tool/Dockerfile
 create mode 100644 clustering/hdbscan-clustering-tool/README.md
 create mode 100644 clustering/hdbscan-clustering-tool/VERSION
 create mode 100755 clustering/hdbscan-clustering-tool/build-docker.sh
 create mode 100644 clustering/hdbscan-clustering-tool/package-release.sh
 create mode 100644 clustering/hdbscan-clustering-tool/plugin.json
 create mode 100644 clustering/hdbscan-clustering-tool/pyproject.toml
 create mode 100755 clustering/hdbscan-clustering-tool/run-docker.sh
 create mode 100644 clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__init__.py
 create mode 100644 clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__main__.py
 create mode 100644 clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/__init__.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/conftest.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/test_cli.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py

diff --git a/clustering/hdbscan-clustering-tool/.bumpversion.cfg b/clustering/hdbscan-clustering-tool/.bumpversion.cfg
new file mode 100644
index 000000000..230e6c5f9
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/.bumpversion.cfg
@@ -0,0 +1,27 @@
+[bumpversion]
+current_version = 0.4.8-dev0
+commit = True
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize =
+	{major}.{minor}.{patch}-{release}{dev}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values =
+	dev
+	_
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:src/polus/images/clustering/hdbscan_clustering/__init__.py]
diff --git a/clustering/hdbscan-clustering-tool/.gitignore b/clustering/hdbscan-clustering-tool/.gitignore
new file mode 100644
index 000000000..9ed1c3775
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/.gitignore
@@ -0,0 +1,23 @@
+# Jupyter Notebook
+.ipynb_checkpoints
+poetry.lock
+../../poetry.lock
+# Environments
+.env
+.myenv
+.venv
+env/
+venv/
+# test data directory
+data
+# yaml file
+.pre-commit-config.yaml
+# hidden files
+.DS_Store
+.ds_store
+# flake8
+.flake8
+../../.flake8
+__pycache__
+.mypy_cache
+requirements.txt
diff --git a/clustering/hdbscan-clustering-tool/Dockerfile b/clustering/hdbscan-clustering-tool/Dockerfile
new file mode 100644
index 000000000..fd4b86f93
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/Dockerfile
@@ -0,0 +1,21 @@
+FROM polusai/bfio:2.3.6
+
+# environment variables defined in polusai/bfio
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_LOG="INFO"
+ENV POLUS_IMG_EXT=".ome.tif"
+ENV POLUS_TAB_EXT=".csv"
+
+# Work directory defined in the base container
+WORKDIR ${EXEC_DIR}
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+COPY src ${EXEC_DIR}/src
+
+RUN pip3 install ${EXEC_DIR} --no-cache-dir
+
+
+ENTRYPOINT ["python3", "-m", "polus.images.clustering.hdbscan_clustering"]
+CMD ["--help"]
diff --git a/clustering/hdbscan-clustering-tool/README.md b/clustering/hdbscan-clustering-tool/README.md
new file mode 100644
index 000000000..80c37a501
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/README.md
@@ -0,0 +1,52 @@
+# Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) Clustering (v0.4.8-dev0)
+
+The HDBSCAN Clustering plugin clusters the data using the [HDBSCAN clustering](https://pypi.org/project/hdbscan/) library. The input and output for this plugin are tabular files (CSV by default). Each observation (row) in the input file is assigned to one of the clusters. The output file contains the column `cluster`, which identifies the cluster to which each observation belongs. A user can supply a regular expression with capture groups if they wish to cluster each group independently, or if they wish to average the numerical features across each group and treat them as a single observation.
+
+## Inputs:
+
+### Input directory:
+This plugin supports all [vaex](https://vaex.readthedocs.io/en/latest/guides/io.html)-supported file formats.
+
+### Filename pattern:
+This plugin uses the [filepattern](https://filepattern2.readthedocs.io/en/latest/Home.html) Python library to parse the file names of the tabular files to be processed by this plugin.
+
+### Grouping pattern:
+The input for this parameter is a regular expression with a capture group. This input splits the data into groups based on the matched pattern. A new column `group` is created in the output file that contains the group matched by the given pattern. Unless `averageGroups` is set to `true`, providing a grouping pattern will cluster each group independently.
+
+### Average groups:
+Set this parameter to `true`, together with `groupingPattern`, to average the numerical features and produce a single row per group, which is then clustered.
+The resulting cluster is assigned to all observations belonging to that group.
+
+### Label column:
+This is the name of the column containing the labels to be used with `groupingPattern`.
+
+### Minimum cluster size:
+This parameter defines the smallest number of points that should be considered as a cluster. This is a required parameter. The input should be an integer, and the value should be greater than 1.
+
+### Increment outlier ID:
+This parameter sets the ID of the outlier cluster to `1`; otherwise, it will be `0`. This is useful for visualization purposes if the resulting cluster IDs are turned into image annotations.
+
+## Output:
+The output is a tabular file containing the clustered data.
+
+## Building
+To build the Docker image for this plugin, run
+`./build-docker.sh`.
+
+## Install WIPP Plugin
+If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit.
+For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp).
+
+## Options
+
+This plugin takes seven input arguments and one output argument:
+
+| Name                   | Description                                                                                     | I/O    | Type        |
+| ---------------------- | ----------------------------------------------------------------------------------------------- | ------ | ----------- |
+| `--inpDir`             | Input tabular data files.                                                                       | Input  | genericData |
+| `--filePattern`        | Pattern to parse the file names of input tabular files.                                         | Input  | string      |
+| `--groupingPattern`    | Regular expression to group rows. Clustering will be applied across capture groups by default.  | Input  | string      |
+| `--averageGroups`      | Average data across groups. Requires capture groups.                                            | Input  | boolean     |
+| `--labelCol`           | Name of the column containing labels for the grouping pattern.                                  | Input  | string      |
+| `--minClusterSize`     | Minimum cluster size.                                                                           | Input  | number      |
+| `--incrementOutlierId` | Increments the outlier ID to 1.                                                                 | Input  | boolean     |
+| `--outDir`             | Output collection.                                                                              | Output | genericData |
+| `--preview`            | Generate a JSON file with outputs.                                                              | Output | JSON        |
diff --git a/clustering/hdbscan-clustering-tool/VERSION b/clustering/hdbscan-clustering-tool/VERSION
new file mode 100644
index 000000000..316ad8d55
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/VERSION
@@ -0,0 +1 @@
+0.4.8-dev0
diff --git a/clustering/hdbscan-clustering-tool/build-docker.sh b/clustering/hdbscan-clustering-tool/build-docker.sh
new file mode 100755
index 000000000..2e7dd1861
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+version=$(<VERSION)
+docker build . -t polusai/hdbscan-clustering-tool:${version}
diff --git a/clustering/hdbscan-clustering-tool/pyproject.toml b/clustering/hdbscan-clustering-tool/pyproject.toml
new file mode 100644
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/pyproject.toml
@@ -0,0 +1,32 @@
+[tool.poetry]
+name = "polus-images-clustering-hdbscan-clustering"
+version = "0.4.8-dev0"
+description = "Hdbscan clustering plugin."
+authors = [
+    "Hythem Sidky",
+    "Hamdah Shafqat abbasi"
+  ]
+readme = "README.md"
+packages = [{include = "polus", from = "src"}]
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.12"
+filepattern = "^2.0.4"
+typer = "^0.7.0"
+tqdm = "^4.64.1"
+preadator="0.4.0.dev2"
+vaex = "^4.17.0"
+hdbscan = "^0.8.34rc1"
+
+
+[tool.poetry.group.dev.dependencies]
+pre-commit = "^3.3.3"
+bump2version = "^1.0.1"
+pytest = "^7.3.2"
+pytest-xdist = "^3.3.1"
+pytest-sugar = "^0.9.7"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/clustering/hdbscan-clustering-tool/run-docker.sh b/clustering/hdbscan-clustering-tool/run-docker.sh
new file mode 100755
index 000000000..931115198
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/run-docker.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+version=$(<VERSION)
diff --git a/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__init__.py b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__init__.py
new file mode 100644
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__init__.py
@@ -0,0 +1,4 @@
+"""Hdbscan Clustering Plugin."""
+
+__version__ = "0.4.8-dev0"
diff --git a/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__main__.py b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__main__.py
new file mode 100644
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__main__.py
@@ -0,0 +1,156 @@
+"""Hdbscan Clustering Tool."""
+import json
+import logging
+import os
+from multiprocessing import cpu_count
+from pathlib import Path
+from typing import Any
+
+import filepattern as fp
+import polus.images.clustering.hdbscan_clustering.hdbscan_clustering as hd
+import preadator
+import typer
+from tqdm import tqdm
+
+logging.basicConfig(
+    format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s",
+    datefmt="%d-%b-%y %H:%M:%S",
+)
+logger = logging.getLogger("polus.images.clustering.hdbscan_clustering")
+logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO))
+
+app = typer.Typer()
+
+
+@app.command()
+def main(  # noqa: PLR0913
+    inp_dir: Path = typer.Option(
+        ...,
+        "--inpDir",
+        "-i",
+        help="Path to the input directory containing tabular files.",
+    ),
+    file_pattern: str = typer.Option(
+        ".+",
+        "--filePattern",
+        "-f",
+        help="Pattern to parse tabular file names.",
+    ),
+    grouping_pattern: str = typer.Option(
+        None,
+        "--groupingPattern",
+        "-g",
+        help="Regular expression for grouping rows.",
+    ),
+    average_groups: bool = typer.Option(
+        False,
+        "--averageGroups",
+        "-a",
+        help="Average data across groups.",
+    ),
+    label_col: str = typer.Option(
+        "None",
+        "--labelCol",
+        "-l",
+        help="Name of the column containing labels.",
+    ),
+    min_cluster_size: int = typer.Option(
+        ...,
+        "--minClusterSize",
+        "-m",
+        help="Minimum cluster size.",
+    ),
+    increment_outlier_id: bool = typer.Option(
+        False,
+        "--incrementOutlierId",
+        "-io",
+        help="Increment outlier ID to unity.",
+    ),
+    out_dir: Path = typer.Option(
+        ...,
+        "--outDir",
+        "-o",
+        help="Path to the output directory.",
+    ),
+    preview: bool = typer.Option(
+        False,
+        "--preview",
+        help="Generate a JSON file with outputs.",
+    ),
+) -> None:
+    """Cluster data using HDBSCAN."""
+    logger.info(f"--inpDir = {inp_dir}")
+    logger.info(f"--filePattern = {file_pattern}")
+    # Regular expression for grouping.
+    logger.info(f"--groupingPattern = {grouping_pattern}")
+    # Whether to average data for each group.
+    logger.info(f"--averageGroups = {average_groups}")
+    # Name of column to use for grouping.
+    logger.info(f"--labelCol = {label_col}")
+    # Minimum cluster size for clustering using HDBSCAN.
+    logger.info(f"--minClusterSize = {min_cluster_size}")
+    # Set outlier cluster id as 1.
+    logger.info(f"--incrementOutlierId = {increment_outlier_id}")
+    logger.info(f"--outDir = {out_dir}")
+
+    inp_dir = inp_dir.resolve()
+    out_dir = out_dir.resolve()
+
+    assert inp_dir.exists(), f"{inp_dir} does not exist!! Please check input path again"
+    assert (
+        out_dir.exists()
+    ), f"{out_dir} does not exist!! Please check output path again"
+
+    num_workers = max([cpu_count(), 2])
+
+    files = fp.FilePattern(inp_dir, file_pattern)
+
+    if files is None:
+        msg = f"No tabular files found. Please check {file_pattern} again"
+        raise ValueError(msg)
+
+    if preview:
+        with Path.open(Path(out_dir, "preview.json"), "w") as jfile:
+            out_json: dict[str, Any] = {
+                "filepattern": file_pattern,
+                "outDir": [],
+            }
+            for file in files():
+                out_name = file[1][0].name.replace(
+                    "".join(file[1][0].suffixes),
+                    f"_hdbscan{hd.POLUS_TAB_EXT}",
+                )
+                out_json["outDir"].append(out_name)
+            json.dump(out_json, jfile, indent=2)
+    else:
+        with preadator.ProcessManager(
+            name="Cluster data using HDBSCAN",
+            num_processes=num_workers,
+            threads_per_process=2,
+        ) as pm:
+            for file in tqdm(
+                files(),
+                total=len(files()),
+                desc="Clustering data",
+                mininterval=5,
+                initial=0,
+                unit_scale=True,
+                colour="cyan",
+            ):
+                pm.submit_process(
+                    hd.hdbscan_clustering,
+                    file[1][0],
+                    min_cluster_size,
+                    out_dir,
+                    grouping_pattern,
+                    label_col,
+                    average_groups,
+                    increment_outlier_id,
+                )
+            pm.join_processes()
+
+
+if __name__ == "__main__":
+    app()
diff --git a/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py
new file mode 100644
index 000000000..3940c2861
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py
@@ -0,0 +1,150 @@
+"""Hdbscan Clustering Plugin."""
+import logging
+import os
+import re
+from itertools import chain
+from pathlib import Path
+
+import hdbscan
+import numpy as np
+import vaex
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv")
+CHUNK_SIZE = 10000
+
+
+def hdbscan_model(
+    data: np.ndarray,
+    min_cluster_size: int,
+    increment_outlier_id: bool,
+) -> np.ndarray:
+    """Cluster data using HDBSCAN.
+
+    Args:
+        data: Data that need to be clustered.
+        min_cluster_size: Minimum cluster size.
+        increment_outlier_id: Increment outlier ID to unity.
+
+    Returns:
+        Cluster labels for each row of data.
+    """
+    clusters = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size).fit(data)
+    # HDBSCAN labels noise as -1. Casting to uint16 wraps -1 to 65535, and
+    # adding 1 wraps back to 0, so noise becomes 0 and real clusters start at 1.
+    labels = clusters.labels_.flatten().astype(np.uint16) + 1
+    # Optionally shift all IDs up by one so the outlier cluster gets ID 1.
+    return labels + 1 if increment_outlier_id else labels
+
+
+def hdbscan_clustering(  # noqa: PLR0913
+    file: Path,
+    min_cluster_size: int,
+    out_dir: Path,
+    grouping_pattern: str,
+    label_col: str,
+    average_groups: bool,
+    increment_outlier_id: bool,
+) -> None:
+    """Cluster data using HDBSCAN.
+
+    Args:
+        file: Path of a tabular file.
+        min_cluster_size: Smallest size grouping that should be considered as a cluster.
+        out_dir: Path to output directory.
+        grouping_pattern: Regular expression to capture groups in label_col.
+        label_col: Name of the column containing labels.
+        average_groups: Whether to average data across groups.
+        increment_outlier_id: Increment outlier ID to unity.
+    """
+    if Path(file.name).suffix == ".csv":
+        df = vaex.from_csv(file, convert=True, chunk_size=CHUNK_SIZE)
+    else:
+        df = vaex.open(file)
+    # If user provided a regular expression.
+    if grouping_pattern:
+        if label_col == "None":
+            msg = f"Please define a label column to capture groups: {label_col}"
+            raise ValueError(msg)
+
+        # Build the group value for each row from the matched string.
+        group = np.array(
+            [
+                re.search(grouping_pattern, x).group(0)  # type: ignore
+                for x in df[label_col].tolist()
+                if len(re.search(grouping_pattern, x).group(0)) != 0  # type: ignore
+            ],
+        )
+        if len(group) == 0:
+            msg = f"Could not find group with pattern {grouping_pattern}"
+            raise ValueError(msg)
+
+        # Add the group column to the dataframe.
+        df["group"] = group
+        # Numeric (int or float) feature columns to cluster on.
+        int_columns = [
+            feature
+            for feature in df.get_column_names()
+            if df.data_type(feature) == int or df.data_type(feature) == float
+        ]
+
+        # If we want to average features for each group.
+        if average_groups:
+            df_grouped = df.groupby(
+                "group",
+                agg=[vaex.agg.mean(x) for x in int_columns],
+            )
+            # Cluster data using HDBSCAN clustering.
+            logger.info("Clustering the data")
+            cluster_ids = hdbscan_model(
+                df_grouped.values,
+                min_cluster_size,
+                increment_outlier_id,
+            )
+            df_grouped["cluster"] = cluster_ids
+            df = df.join(
+                df_grouped["group", "cluster"],
+                left_on="group",
+                right_on="group",
+            )
+
+        else:
+            dfs = []
+            for group, df_ss in df.groupby("group"):
+                # Cluster data using HDBSCAN clustering.
+                logger.info(f"Clustering data in group {group}")
+
+                cluster_ids = hdbscan_model(
+                    df_ss.values,
+                    min_cluster_size,
+                    increment_outlier_id,
+                )
+
+                dfs.append(cluster_ids)
+            cluster_ids = np.array(list(chain.from_iterable(dfs)))
+            df["cluster"] = cluster_ids
+
+    # No grouping. Vanilla clustering.
+    else:
+        int_columns = [
+            feature
+            for feature in df.get_column_names()
+            if df.data_type(feature) == int or df.data_type(feature) == float
+        ]
+
+        # Cluster data using HDBSCAN clustering.
+        logger.info("Clustering the data")
+        cluster_ids = hdbscan_model(
+            df[int_columns].values,
+            min_cluster_size,
+            increment_outlier_id,
+        )
+        df["cluster"] = cluster_ids
+
+    outname = Path(out_dir, f"{Path(file.name).stem}_hdbscan{POLUS_TAB_EXT}")
+
+    if POLUS_TAB_EXT == ".arrow":
+        df.export_feather(outname)
+    else:
+        df.export_csv(path=outname, chunk_size=CHUNK_SIZE)
+    logger.info(f"Saving outputs: {outname}")
+
+    logger.info("Finished all processes!")
diff --git a/clustering/hdbscan-clustering-tool/tests/__init__.py b/clustering/hdbscan-clustering-tool/tests/__init__.py
new file mode 100644
index 000000000..2f89ec82b
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/tests/__init__.py
@@ -0,0 +1 @@
+"""Hdbscan Clustering Plugin."""
diff --git a/clustering/hdbscan-clustering-tool/tests/conftest.py b/clustering/hdbscan-clustering-tool/tests/conftest.py
new file mode 100644
index 000000000..a609d5b80
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/tests/conftest.py
@@ -0,0 +1,48 @@
+"""Test fixtures.
+
+Set up all data used in tests.
+"""
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+
+
+@pytest.fixture(
+    params=[(50000, ".csv"), (100000, ".arrow")],
+)
+def get_params(request: pytest.FixtureRequest) -> tuple[int, str]:
+    """Get the parameters of the fixture."""
+    return request.param
+
+
+@pytest.fixture()
+def generate_synthetic_data(get_params: tuple[int, str]) -> tuple[Path, Path, str]:
+    """Generate tabular data."""
+    nrows, file_extension = get_params
+
+    input_directory = Path(tempfile.mkdtemp(prefix="inputs_"))
+    output_directory = Path(tempfile.mkdtemp(prefix="out_"))
+    rng = np.random.default_rng()
+    tabular_data = {
+        "sepal_length": rng.random(nrows).tolist(),
+        "sepal_width": rng.random(nrows).tolist(),
+        "petal_length": rng.random(nrows).tolist(),
+        "petal_width": rng.random(nrows).tolist(),
+        "species": rng.choice(
+            ["Iris-setosa", "Iris-versicolor", "Iris-virginica"],
+            nrows,
+        ).tolist(),
+    }
+
+    df = pd.DataFrame(tabular_data)
+    if file_extension == ".csv":
+        outpath = Path(input_directory, "data.csv")
+        df.to_csv(outpath, index=False)
+    if file_extension == ".arrow":
+        outpath = Path(input_directory, "data.arrow")
+        df.to_feather(outpath)
+
+    return input_directory, output_directory, file_extension
diff --git a/clustering/hdbscan-clustering-tool/tests/test_cli.py b/clustering/hdbscan-clustering-tool/tests/test_cli.py
new file mode 100644
index 000000000..b087215e8
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/tests/test_cli.py
@@ -0,0 +1,74 @@
+"""Test Command line Tool."""
+
+from typer.testing import CliRunner
+from polus.images.clustering.hdbscan_clustering.__main__ import app
+import shutil
+from pathlib import Path
+
+
+def test_cli(generate_synthetic_data: tuple[Path, Path, str]) -> None:
+    """Test the command line."""
+    inp_dir, out_dir, file_extension = generate_synthetic_data
+    pattern = r"\w+$"
+    file_pattern = f".*{file_extension}"
+    label = "species"
+    clustersize = 3
+
+    runner = CliRunner()
+    result = runner.invoke(
+        app,
+        [
+            "--inpDir",
+            inp_dir,
+            "--filePattern",
+            file_pattern,
+            "--groupingPattern",
+            pattern,
+            "--averageGroups",
+            "--labelCol",
+            label,
+            "--minClusterSize",
+            clustersize,
+            "--incrementOutlierId",
+            "--outDir",
+            out_dir,
+        ],
+    )
+
+    assert result.exit_code == 0
+    shutil.rmtree(inp_dir)
+    shutil.rmtree(out_dir)
+
+
+def test_short_cli(generate_synthetic_data: tuple[Path, Path, str]) -> None:
+    """Test short command line."""
+    inp_dir, out_dir, file_extension = generate_synthetic_data
+    pattern = r"\w+$"
+    file_pattern = f".*{file_extension}"
+    label = "species"
+    clustersize = 3
+
+    runner = CliRunner()
+    result = runner.invoke(
+        app,
+        [
+            "-i",
+            inp_dir,
+            "-f",
+            file_pattern,
+            "-g",
+            pattern,
+            "-a",
+            "-l",
+            label,
+            "-m",
+            clustersize,
+            "-io",
+            "-o",
+            out_dir,
+        ],
+    )
+
+    assert result.exit_code == 0
+    shutil.rmtree(inp_dir)
+    shutil.rmtree(out_dir)
diff --git a/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py b/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py
new file mode 100644
index 000000000..83debf273
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py
@@ -0,0 +1,49 @@
+"""Test Hdbscan Clustering Plugin."""
+
+import shutil
+from pathlib import Path
+
+import filepattern as fp
+import numpy as np
+import polus.images.clustering.hdbscan_clustering.hdbscan_clustering as hd
+import vaex
+
+
+def test_hdbscan_clustering(generate_synthetic_data: tuple[Path, Path, str]) -> None:
+    """Test hdbscan clustering of tabular data."""
+    inp_dir, out_dir, file_extension = generate_synthetic_data
+    pattern = r"\w+$"
+    file_pattern = f".*{file_extension}"
+    files = fp.FilePattern(inp_dir, file_pattern)
+    for file in files():
+        hd.hdbscan_clustering(
+            file=file[1][0],
+            min_cluster_size=3,
+            grouping_pattern=pattern,
+            label_col="species",
+            average_groups=True,
+            increment_outlier_id=True,
+            out_dir=out_dir,
+        )
+
+    out_ext = [Path(f.name).suffix for f in out_dir.iterdir()]
+    # Every output file should have a non-empty suffix.
+    assert all(out_ext)
+    for f in out_dir.iterdir():
+        df = vaex.open(f)
+        assert "cluster" in df.column_names
+        # With incrementOutlierId, all cluster IDs are non-zero.
+        assert np.all(df["cluster"].values != 0)
+    shutil.rmtree(inp_dir)
+    shutil.rmtree(out_dir)
+
+
+def test_hdbscan_model(generate_synthetic_data: tuple[Path, Path, str]) -> None:
+    """Test hdbscan model."""
+    inp_dir, _, file_extension = generate_synthetic_data
+    file_pattern = f".*{file_extension}"
+    files = fp.FilePattern(inp_dir, file_pattern)
+    for file in files():
+        df = vaex.open(file[1][0])
+        data = df[df.column_names[:-1]].values
+        min_cluster_size = 3
+        label = hd.hdbscan_model(data, min_cluster_size, True)
+        assert len(label) != 0
+    shutil.rmtree(inp_dir)
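
A note on the outlier-labeling logic in `hdbscan_model` above: the `astype(np.uint16) + 1` step relies on unsigned-integer wraparound, which is easy to misread in review. The following standalone sketch is not part of the patch; it assumes only that the `hdbscan` and `numpy` packages are installed, and the toy blob data is invented for illustration. It reproduces the tool's label convention so the wraparound behavior can be checked in isolation:

```python
"""Sketch of the label convention used by hdbscan_model (illustrative only)."""
import hdbscan
import numpy as np

rng = np.random.default_rng(seed=0)

# Two dense blobs plus a few scattered points that HDBSCAN should flag as noise.
blobs = np.vstack(
    [
        rng.normal(loc=0.0, scale=0.05, size=(50, 2)),
        rng.normal(loc=1.0, scale=0.05, size=(50, 2)),
    ],
)
noise = rng.uniform(low=-5.0, high=5.0, size=(5, 2))
data = np.vstack([blobs, noise])

clusters = hdbscan.HDBSCAN(min_cluster_size=10).fit(data)

# Raw HDBSCAN labels: -1 marks noise, real clusters are numbered from 0.
raw = clusters.labels_

# The tool's convention: casting to uint16 wraps -1 to 65535, and adding 1
# wraps it back to 0, so noise becomes 0 and real clusters start at 1.
labels = raw.flatten().astype(np.uint16) + 1

# With --incrementOutlierId, every ID is shifted up once more:
# noise becomes 1 and real clusters start at 2.
incremented = labels + 1

print("raw labels:        ", np.unique(raw))          # e.g. [-1  0  1]
print("shifted labels:    ", np.unique(labels))       # e.g. [0 1 2]
print("incremented labels:", np.unique(incremented))  # e.g. [1 2 3]
```

One consequence of this design is that cluster IDs live in `uint16`, so the scheme silently assumes fewer than 65535 clusters per file, a comfortable bound for this tool's use case.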