From a6bfd1dbfd3679890bf08d358c25e061e0ab281e Mon Sep 17 00:00:00 2001
From: Hamdah Shafqat Abbasi <74803092+hamshkhawar@users.noreply.github.com>
Date: Tue, 6 Aug 2024 07:13:45 -0400
Subject: [PATCH] Updating hdbscan-clustering plugin (#498)

* fix merge conflicts
* fix apply manifest
* fix apply manifest
* remove file
* updated hdbscan-clustering-plugin
* fix bug in tests
* fixed random generation of floats
* fixed docker file and shell script for running docker
* fixed docker files
* renamed plugin and fixed merged conflicts
* fixed docker files
---
 .../hdbscan-clustering-tool/.bumpversion.cfg  |  27 +++
 clustering/hdbscan-clustering-tool/.gitignore |  23 +++
 clustering/hdbscan-clustering-tool/Dockerfile |  21 +++
 clustering/hdbscan-clustering-tool/README.md  |  52 ++++++
 clustering/hdbscan-clustering-tool/VERSION    |   1 +
 .../hdbscan-clustering-tool/build-docker.sh   |   4 +
 .../package-release.sh                        |  16 ++
 .../hdbscan-clustering-tool/plugin.json       | 123 ++++++++++++++
 .../hdbscan-clustering-tool/pyproject.toml    |  32 ++++
 .../hdbscan-clustering-tool/run-docker.sh     |  23 +++
 .../clustering/hdbscan_clustering/__init__.py |   4 +
 .../clustering/hdbscan_clustering/__main__.py | 156 ++++++++++++++++++
 .../hdbscan_clustering/hdbscan_clustering.py  | 150 +++++++++++++++++
 .../hdbscan-clustering-tool/tests/__init__.py |   1 +
 .../hdbscan-clustering-tool/tests/conftest.py |  48 ++++++
 .../hdbscan-clustering-tool/tests/test_cli.py |  74 +++++++++
 .../tests/test_hdbscan_clustering.py          |  49 ++++++
 17 files changed, 804 insertions(+)
 create mode 100644 clustering/hdbscan-clustering-tool/.bumpversion.cfg
 create mode 100644 clustering/hdbscan-clustering-tool/.gitignore
 create mode 100644 clustering/hdbscan-clustering-tool/Dockerfile
 create mode 100644 clustering/hdbscan-clustering-tool/README.md
 create mode 100644 clustering/hdbscan-clustering-tool/VERSION
 create mode 100755 clustering/hdbscan-clustering-tool/build-docker.sh
 create mode 100644 clustering/hdbscan-clustering-tool/package-release.sh
 create mode 100644 clustering/hdbscan-clustering-tool/plugin.json
 create mode 100644 clustering/hdbscan-clustering-tool/pyproject.toml
 create mode 100755 clustering/hdbscan-clustering-tool/run-docker.sh
 create mode 100644 clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__init__.py
 create mode 100644 clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__main__.py
 create mode 100644 clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/__init__.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/conftest.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/test_cli.py
 create mode 100644 clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py

diff --git a/clustering/hdbscan-clustering-tool/.bumpversion.cfg b/clustering/hdbscan-clustering-tool/.bumpversion.cfg
new file mode 100644
index 000000000..230e6c5f9
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/.bumpversion.cfg
@@ -0,0 +1,27 @@
+[bumpversion]
+current_version = 0.4.8-dev0
+commit = True
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize =
+	{major}.{minor}.{patch}-{release}{dev}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values =
+	dev
+	_
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:src/polus/images/clustering/hdbscan_clustering/__init__.py]
diff --git a/clustering/hdbscan-clustering-tool/.gitignore b/clustering/hdbscan-clustering-tool/.gitignore
new file mode 100644
index 000000000..9ed1c3775
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/.gitignore
@@ -0,0 +1,23 @@
+# Jupyter Notebook
+.ipynb_checkpoints
+poetry.lock
+../../poetry.lock
+# Environments
+.env
+.myenv
+.venv
+env/
+venv/
+# test data directory
+data
+# yaml file
+.pre-commit-config.yaml
+# hidden files
+.DS_Store
+.ds_store
+# flake8
+.flake8
+../../.flake8
+__pycache__
+.mypy_cache
+requirements.txt
diff --git a/clustering/hdbscan-clustering-tool/Dockerfile b/clustering/hdbscan-clustering-tool/Dockerfile
new file mode 100644
index 000000000..fd4b86f93
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/Dockerfile
@@ -0,0 +1,21 @@
+FROM polusai/bfio:2.3.6
+
+# environment variables defined in polusai/bfio
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_LOG="INFO"
+ENV POLUS_IMG_EXT=".ome.tif"
+ENV POLUS_TAB_EXT=".csv"
+
+# Work directory defined in the base container
+WORKDIR ${EXEC_DIR}
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+COPY src ${EXEC_DIR}/src
+
+RUN pip3 install ${EXEC_DIR} --no-cache-dir
+
+
+ENTRYPOINT ["python3", "-m", "polus.images.clustering.hdbscan_clustering"]
+CMD ["--help"]
diff --git a/clustering/hdbscan-clustering-tool/README.md b/clustering/hdbscan-clustering-tool/README.md
new file mode 100644
index 000000000..80c37a501
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/README.md
@@ -0,0 +1,52 @@
+# Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) Clustering (v0.4.8-dev0)
+
+The HDBSCAN Clustering plugin clusters the data using the [HDBSCAN clustering](https://pypi.org/project/hdbscan/) library. The input and output for this plugin are tabular files (CSV by default). Each observation (row) in the input file is assigned to one of the clusters. The output file contains the column `cluster`, which identifies the cluster to which each observation belongs. A user can supply a regular expression with capture groups if they wish to cluster each group independently, or if they wish to average the numerical features across each group and treat them as a single observation.
+
+## Inputs:
+
+### Input directory:
+This plugin supports all [vaex](https://vaex.readthedocs.io/en/latest/guides/io.html)-supported file formats.
+
+### Filename pattern:
+This plugin uses the [filepattern](https://filepattern2.readthedocs.io/en/latest/Home.html) Python library to parse the file names of the tabular files to be processed by this plugin.
+
+### Grouping pattern:
+The input for this parameter is a regular expression with a capture group. This input splits the data into groups based on the matched pattern. A new column `group` is created in the output file that contains the group matched by the given pattern. Unless `averageGroups` is set to `true`, providing a grouping pattern will cluster each group independently.
+
+### Average groups:
+Set this parameter to `true`, together with `groupingPattern`, to average the numerical features and produce a single row per group, which is then clustered.
+The resulting cluster is assigned to all observations belonging to that group.
+
+### Label column:
+This is the name of the column containing the labels to be used with `groupingPattern`.
+
+### Minimum cluster size:
+This parameter defines the smallest number of points that should be considered as a cluster. This is a required parameter. The input should be an integer, and the value should be greater than 1.
+
+### Increment outlier ID:
+This parameter sets the ID of the outlier cluster to `1`; otherwise, it will be `0`. This is useful for visualization purposes if the resulting cluster IDs are turned into image annotations.
+
+## Output:
+The output is a tabular file containing the clustered data.
+
+## Building
+To build the Docker image for this plugin, run
+`./build-docker.sh`.
+
+## Install WIPP Plugin
+If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit.
+For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp).
+
+## Options
+
+This plugin takes seven input arguments and one output argument:
+
+| Name                   | Description                                                                                     | I/O    | Type        |
+| ---------------------- | ----------------------------------------------------------------------------------------------- | ------ | ----------- |
+| `--inpDir`             | Input tabular data files.                                                                       | Input  | genericData |
+| `--filePattern`        | Pattern to parse the file names of input tabular files.                                         | Input  | string      |
+| `--groupingPattern`    | Regular expression to group rows. Clustering will be applied across capture groups by default.  | Input  | string      |
+| `--averageGroups`      | Average data across groups. Requires capture groups.                                            | Input  | boolean     |
+| `--labelCol`           | Name of the column containing labels for the grouping pattern.                                  | Input  | string      |
+| `--minClusterSize`     | Minimum cluster size.                                                                           | Input  | number      |
+| `--incrementOutlierId` | Increments the outlier ID to 1.                                                                 | Input  | boolean     |
+| `--outDir`             | Output collection.                                                                              | Output | genericData |
+| `--preview`            | Generate a JSON file with outputs.                                                              | Output | JSON        |
diff --git a/clustering/hdbscan-clustering-tool/VERSION b/clustering/hdbscan-clustering-tool/VERSION
new file mode 100644
index 000000000..316ad8d55
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/VERSION
@@ -0,0 +1 @@
+0.4.8-dev0
diff --git a/clustering/hdbscan-clustering-tool/build-docker.sh b/clustering/hdbscan-clustering-tool/build-docker.sh
new file mode 100755
index 000000000..2e7dd1861
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+version=$(<VERSION)
+docker build . -t polusai/hdbscan-clustering-tool:${version}
diff --git a/clustering/hdbscan-clustering-tool/pyproject.toml b/clustering/hdbscan-clustering-tool/pyproject.toml
new file mode 100644
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/pyproject.toml
@@ -0,0 +1,32 @@
+[tool.poetry]
+name = "polus-images-clustering-hdbscan-clustering"
+version = "0.4.8-dev0"
+description = "Hdbscan clustering plugin."
+authors = [
+    "Hythem Sidky",
+    "Hamdah Shafqat abbasi"
+  ]
+readme = "README.md"
+packages = [{include = "polus", from = "src"}]
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.12"
+filepattern = "^2.0.4"
+typer = "^0.7.0"
+tqdm = "^4.64.1"
+preadator="0.4.0.dev2"
+vaex = "^4.17.0"
+hdbscan = "^0.8.34rc1"
+
+
+[tool.poetry.group.dev.dependencies]
+pre-commit = "^3.3.3"
+bump2version = "^1.0.1"
+pytest = "^7.3.2"
+pytest-xdist = "^3.3.1"
+pytest-sugar = "^0.9.7"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/clustering/hdbscan-clustering-tool/run-docker.sh b/clustering/hdbscan-clustering-tool/run-docker.sh
new file mode 100755
index 000000000..931115198
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/run-docker.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+version=$(<VERSION)
diff --git a/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__init__.py b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__init__.py
new file mode 100644
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__init__.py
@@ -0,0 +1,4 @@
+"""Hdbscan Clustering Plugin."""
+
+__version__ = "0.4.8-dev0"
diff --git a/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__main__.py b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__main__.py
new file mode 100644
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/__main__.py
@@ -0,0 +1,156 @@
+"""Hdbscan Clustering Tool."""
+import json
+import logging
+import os
+from multiprocessing import cpu_count
+from pathlib import Path
+from typing import Any
+
+import filepattern as fp
+import polus.images.clustering.hdbscan_clustering.hdbscan_clustering as hd
+import preadator
+import typer
+from tqdm import tqdm
+
+logging.basicConfig(
+    format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s",
+    datefmt="%d-%b-%y %H:%M:%S",
+)
+logger = logging.getLogger("polus.images.clustering.hdbscan_clustering")
+logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO))
+
+app = typer.Typer()
+
+
+@app.command()
+def main(  # noqa: PLR0913
+    inp_dir: Path = typer.Option(
+        ...,
+        "--inpDir",
+        "-i",
+        help="Path to the input directory containing tabular files.",
+    ),
+    file_pattern: str = typer.Option(
+        ".+",
+        "--filePattern",
+        "-f",
+        help="Pattern to parse tabular file names.",
+    ),
+    grouping_pattern: str = typer.Option(
+        None,
+        "--groupingPattern",
+        "-g",
+        help="Regular expression for grouping rows.",
+    ),
+    average_groups: bool = typer.Option(
+        False,
+        "--averageGroups",
+        "-a",
+        help="Average data across groups.",
+    ),
+    label_col: str = typer.Option(
+        "None",
+        "--labelCol",
+        "-l",
+        help="Name of the column containing labels.",
+    ),
+    min_cluster_size: int = typer.Option(
+        ...,
+        "--minClusterSize",
+        "-m",
+        help="Minimum cluster size.",
+    ),
+    increment_outlier_id: bool = typer.Option(
+        False,
+        "--incrementOutlierId",
+        "-io",
+        help="Increment outlier ID to unity.",
+    ),
+    out_dir: Path = typer.Option(
+        ...,
+        "--outDir",
+        "-o",
+        help="Path to the output directory.",
+    ),
+    preview: bool = typer.Option(
+        False,
+        "--preview",
+        help="Generate a JSON file with outputs.",
+    ),
+) -> None:
+    """Cluster data using HDBSCAN."""
+    logger.info(f"--inpDir = {inp_dir}")
+    logger.info(f"--filePattern = {file_pattern}")
+    # Regular expression for grouping.
+    logger.info(f"--groupingPattern = {grouping_pattern}")
+    # Whether to average data for each group.
+    logger.info(f"--averageGroups = {average_groups}")
+    # Name of column to use for grouping.
+    logger.info(f"--labelCol = {label_col}")
+    # Minimum cluster size for clustering using HDBSCAN.
+    logger.info(f"--minClusterSize = {min_cluster_size}")
+    # Set outlier cluster id as 1.
+    logger.info(f"--incrementOutlierId = {increment_outlier_id}")
+    logger.info(f"--outDir = {out_dir}")
+
+    inp_dir = inp_dir.resolve()
+    out_dir = out_dir.resolve()
+
+    assert inp_dir.exists(), f"{inp_dir} does not exist!! Please check input path again"
+    assert (
+        out_dir.exists()
+    ), f"{out_dir} does not exist!! Please check output path again"
+
+    num_workers = max([cpu_count(), 2])
+
+    files = fp.FilePattern(inp_dir, file_pattern)
+
+    if files is None:
+        msg = f"No tabular files found. Please check {file_pattern} again"
+        raise ValueError(msg)
+
+    if preview:
+        with Path.open(Path(out_dir, "preview.json"), "w") as jfile:
+            out_json: dict[str, Any] = {
+                "filepattern": file_pattern,
+                "outDir": [],
+            }
+            for file in files():
+                out_name = file[1][0].name.replace(
+                    "".join(file[1][0].suffixes),
+                    f"_hdbscan{hd.POLUS_TAB_EXT}",
+                )
+                out_json["outDir"].append(out_name)
+            json.dump(out_json, jfile, indent=2)
+    else:
+        with preadator.ProcessManager(
+            name="Cluster data using HDBSCAN",
+            num_processes=num_workers,
+            threads_per_process=2,
+        ) as pm:
+            for file in tqdm(
+                files(),
+                total=len(files()),
+                desc="Clustering data",
+                mininterval=5,
+                initial=0,
+                unit_scale=True,
+                colour="cyan",
+            ):
+                pm.submit_process(
+                    hd.hdbscan_clustering,
+                    file[1][0],
+                    min_cluster_size,
+                    out_dir,
+                    grouping_pattern,
+                    label_col,
+                    average_groups,
+                    increment_outlier_id,
+                )
+            pm.join_processes()
+
+
+if __name__ == "__main__":
+    app()
diff --git a/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py
new file mode 100644
index 000000000..3940c2861
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/src/polus/images/clustering/hdbscan_clustering/hdbscan_clustering.py
@@ -0,0 +1,150 @@
+"""Hdbscan Clustering Plugin."""
+import logging
+import os
+import re
+from itertools import chain
+from pathlib import Path
+
+import hdbscan
+import numpy as np
+import vaex
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv")
+CHUNK_SIZE = 10000
+
+
+def hdbscan_model(
+    data: np.ndarray,
+    min_cluster_size: int,
+    increment_outlier_id: bool,
+) -> np.ndarray:
+    """Cluster data using HDBSCAN.
+
+    Args:
+        data: Data that need to be clustered.
+        min_cluster_size: Minimum cluster size.
+        increment_outlier_id: Increment outlier ID to unity.
+
+    Returns:
+        Cluster labels for each row of data.
+    """
+    clusters = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size).fit(data)
+    # HDBSCAN labels noise as -1. Casting to uint16 wraps -1 to 65535, and
+    # adding 1 wraps back to 0, so noise becomes 0 and real clusters start at 1.
+    labels = clusters.labels_.flatten().astype(np.uint16) + 1
+    # Optionally shift all IDs up by one so the outlier cluster gets ID 1.
+    return labels + 1 if increment_outlier_id else labels
+
+
+def hdbscan_clustering(  # noqa: PLR0913
+    file: Path,
+    min_cluster_size: int,
+    out_dir: Path,
+    grouping_pattern: str,
+    label_col: str,
+    average_groups: bool,
+    increment_outlier_id: bool,
+) -> None:
+    """Cluster data using HDBSCAN.
+
+    Args:
+        file: Path of a tabular file.
+        min_cluster_size: Smallest size grouping that should be considered as a cluster.
+        out_dir: Path to output directory.
+        grouping_pattern: Regular expression to capture groups in label_col.
+        label_col: Name of the column containing labels.
+        average_groups: Whether to average data across groups.
+        increment_outlier_id: Increment outlier ID to unity.
+    """
+    if Path(file.name).suffix == ".csv":
+        df = vaex.from_csv(file, convert=True, chunk_size=CHUNK_SIZE)
+    else:
+        df = vaex.open(file)
+    # If user provided a regular expression.
+    if grouping_pattern:
+        if label_col == "None":
+            msg = f"Please define a label column to capture groups: {label_col}"
+            raise ValueError(msg)
+
+        # Build the group value for each row from the matched string.
+        group = np.array(
+            [
+                re.search(grouping_pattern, x).group(0)  # type: ignore
+                for x in df[label_col].tolist()
+                if len(re.search(grouping_pattern, x).group(0)) != 0  # type: ignore
+            ],
+        )
+        if len(group) == 0:
+            msg = f"Could not find group with pattern {grouping_pattern}"
+            raise ValueError(msg)
+
+        # Add the group column to the dataframe.
+        df["group"] = group
+        # Numeric (int or float) feature columns to cluster on.
+        int_columns = [
+            feature
+            for feature in df.get_column_names()
+            if df.data_type(feature) == int or df.data_type(feature) == float
+        ]
+
+        # If we want to average features for each group.
+        if average_groups:
+            df_grouped = df.groupby(
+                "group",
+                agg=[vaex.agg.mean(x) for x in int_columns],
+            )
+            # Cluster data using HDBSCAN clustering.
+            logger.info("Clustering the data")
+            cluster_ids = hdbscan_model(
+                df_grouped.values,
+                min_cluster_size,
+                increment_outlier_id,
+            )
+            df_grouped["cluster"] = cluster_ids
+            df = df.join(
+                df_grouped["group", "cluster"],
+                left_on="group",
+                right_on="group",
+            )
+
+        else:
+            dfs = []
+            for group, df_ss in df.groupby("group"):
+                # Cluster data using HDBSCAN clustering.
+                logger.info(f"Clustering data in group {group}")
+
+                cluster_ids = hdbscan_model(
+                    df_ss.values,
+                    min_cluster_size,
+                    increment_outlier_id,
+                )
+
+                dfs.append(cluster_ids)
+            cluster_ids = np.array(list(chain.from_iterable(dfs)))
+            df["cluster"] = cluster_ids
+
+    # No grouping. Vanilla clustering.
+    else:
+        int_columns = [
+            feature
+            for feature in df.get_column_names()
+            if df.data_type(feature) == int or df.data_type(feature) == float
+        ]
+
+        # Cluster data using HDBSCAN clustering.
+        logger.info("Clustering the data")
+        cluster_ids = hdbscan_model(
+            df[int_columns].values,
+            min_cluster_size,
+            increment_outlier_id,
+        )
+        df["cluster"] = cluster_ids
+
+    outname = Path(out_dir, f"{Path(file.name).stem}_hdbscan{POLUS_TAB_EXT}")
+
+    if POLUS_TAB_EXT == ".arrow":
+        df.export_feather(outname)
+    else:
+        df.export_csv(path=outname, chunk_size=CHUNK_SIZE)
+    logger.info(f"Saving outputs: {outname}")
+
+    logger.info("Finished all processes!")
diff --git a/clustering/hdbscan-clustering-tool/tests/__init__.py b/clustering/hdbscan-clustering-tool/tests/__init__.py
new file mode 100644
index 000000000..2f89ec82b
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/tests/__init__.py
@@ -0,0 +1 @@
+"""Hdbscan Clustering Plugin."""
diff --git a/clustering/hdbscan-clustering-tool/tests/conftest.py b/clustering/hdbscan-clustering-tool/tests/conftest.py
new file mode 100644
index 000000000..a609d5b80
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/tests/conftest.py
@@ -0,0 +1,48 @@
+"""Test fixtures.
+
+Set up all data used in tests.
+"""
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+
+
+@pytest.fixture(
+    params=[(50000, ".csv"), (100000, ".arrow")],
+)
+def get_params(request: pytest.FixtureRequest) -> tuple[int, str]:
+    """Get the parameters of the fixture."""
+    return request.param
+
+
+@pytest.fixture()
+def generate_synthetic_data(get_params: tuple[int, str]) -> tuple[Path, Path, str]:
+    """Generate tabular data."""
+    nrows, file_extension = get_params
+
+    input_directory = Path(tempfile.mkdtemp(prefix="inputs_"))
+    output_directory = Path(tempfile.mkdtemp(prefix="out_"))
+    rng = np.random.default_rng()
+    tabular_data = {
+        "sepal_length": rng.random(nrows).tolist(),
+        "sepal_width": rng.random(nrows).tolist(),
+        "petal_length": rng.random(nrows).tolist(),
+        "petal_width": rng.random(nrows).tolist(),
+        "species": rng.choice(
+            ["Iris-setosa", "Iris-versicolor", "Iris-virginica"],
+            nrows,
+        ).tolist(),
+    }
+
+    df = pd.DataFrame(tabular_data)
+    if file_extension == ".csv":
+        outpath = Path(input_directory, "data.csv")
+        df.to_csv(outpath, index=False)
+    if file_extension == ".arrow":
+        outpath = Path(input_directory, "data.arrow")
+        df.to_feather(outpath)
+
+    return input_directory, output_directory, file_extension
diff --git a/clustering/hdbscan-clustering-tool/tests/test_cli.py b/clustering/hdbscan-clustering-tool/tests/test_cli.py
new file mode 100644
index 000000000..b087215e8
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/tests/test_cli.py
@@ -0,0 +1,74 @@
+"""Test Command line Tool."""
+
+from typer.testing import CliRunner
+from polus.images.clustering.hdbscan_clustering.__main__ import app
+import shutil
+from pathlib import Path
+
+
+def test_cli(generate_synthetic_data: tuple[Path, Path, str]) -> None:
+    """Test the command line."""
+    inp_dir, out_dir, file_extension = generate_synthetic_data
+    pattern = r"\w+$"
+    file_pattern = f".*{file_extension}"
+    label = "species"
+    clustersize = 3
+
+    runner = CliRunner()
+    result = runner.invoke(
+        app,
+        [
+            "--inpDir",
+            inp_dir,
+            "--filePattern",
+            file_pattern,
+            "--groupingPattern",
+            pattern,
+            "--averageGroups",
+            "--labelCol",
+            label,
+            "--minClusterSize",
+            clustersize,
+            "--incrementOutlierId",
+            "--outDir",
+            out_dir,
+        ],
+    )
+
+    assert result.exit_code == 0
+    shutil.rmtree(inp_dir)
+    shutil.rmtree(out_dir)
+
+
+def test_short_cli(generate_synthetic_data: tuple[Path, Path, str]) -> None:
+    """Test short command line."""
+    inp_dir, out_dir, file_extension = generate_synthetic_data
+    pattern = r"\w+$"
+    file_pattern = f".*{file_extension}"
+    label = "species"
+    clustersize = 3
+
+    runner = CliRunner()
+    result = runner.invoke(
+        app,
+        [
+            "-i",
+            inp_dir,
+            "-f",
+            file_pattern,
+            "-g",
+            pattern,
+            "-a",
+            "-l",
+            label,
+            "-m",
+            clustersize,
+            "-io",
+            "-o",
+            out_dir,
+        ],
+    )
+
+    assert result.exit_code == 0
+    shutil.rmtree(inp_dir)
+    shutil.rmtree(out_dir)
diff --git a/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py b/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py
new file mode 100644
index 000000000..83debf273
--- /dev/null
+++ b/clustering/hdbscan-clustering-tool/tests/test_hdbscan_clustering.py
@@ -0,0 +1,49 @@
+"""Test Hdbscan Clustering Plugin."""
+
+import shutil
+from pathlib import Path
+
+import filepattern as fp
+import numpy as np
+import polus.images.clustering.hdbscan_clustering.hdbscan_clustering as hd
+import vaex
+
+
+def test_hdbscan_clustering(generate_synthetic_data: tuple[Path, Path, str]) -> None:
+    """Test hdbscan clustering of tabular data."""
+    inp_dir, out_dir, file_extension = generate_synthetic_data
+    pattern = r"\w+$"
+    file_pattern = f".*{file_extension}"
+    files = fp.FilePattern(inp_dir, file_pattern)
+    for file in files():
+        hd.hdbscan_clustering(
+            file=file[1][0],
+            min_cluster_size=3,
+            grouping_pattern=pattern,
+            label_col="species",
+            average_groups=True,
+            increment_outlier_id=True,
+            out_dir=out_dir,
+        )
+
+    out_ext = [Path(f.name).suffix for f in out_dir.iterdir()]
+    # Every output file should have a non-empty suffix.
+    assert all(out_ext)
+    for f in out_dir.iterdir():
+        df = vaex.open(f)
+        assert "cluster" in df.column_names
+        # With incrementOutlierId, all cluster IDs are non-zero.
+        assert np.all(df["cluster"].values != 0)
+    shutil.rmtree(inp_dir)
+    shutil.rmtree(out_dir)
+
+
+def test_hdbscan_model(generate_synthetic_data: tuple[Path, Path, str]) -> None:
+    """Test hdbscan model."""
+    inp_dir, _, file_extension = generate_synthetic_data
+    file_pattern = f".*{file_extension}"
+    files = fp.FilePattern(inp_dir, file_pattern)
+    for file in files():
+        df = vaex.open(file[1][0])
+        data = df[df.column_names[:-1]].values
+        min_cluster_size = 3
+        label = hd.hdbscan_model(data, min_cluster_size, True)
+        assert len(label) != 0
+    shutil.rmtree(inp_dir)
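
A note on the outlier-labeling logic in `hdbscan_model` above: the `astype(np.uint16) + 1` step relies on unsigned-integer wraparound, which is easy to misread in review. The following standalone sketch is not part of the patch; it assumes only that the `hdbscan` and `numpy` packages are installed, and the toy blob data is invented for illustration. It reproduces the tool's label convention so the wraparound behavior can be checked in isolation:

```python
"""Sketch of the label convention used by hdbscan_model (illustrative only)."""
import hdbscan
import numpy as np

rng = np.random.default_rng(seed=0)

# Two dense blobs plus a few scattered points that HDBSCAN should flag as noise.
blobs = np.vstack(
    [
        rng.normal(loc=0.0, scale=0.05, size=(50, 2)),
        rng.normal(loc=1.0, scale=0.05, size=(50, 2)),
    ],
)
noise = rng.uniform(low=-5.0, high=5.0, size=(5, 2))
data = np.vstack([blobs, noise])

clusters = hdbscan.HDBSCAN(min_cluster_size=10).fit(data)

# Raw HDBSCAN labels: -1 marks noise, real clusters are numbered from 0.
raw = clusters.labels_

# The tool's convention: casting to uint16 wraps -1 to 65535, and adding 1
# wraps it back to 0, so noise becomes 0 and real clusters start at 1.
labels = raw.flatten().astype(np.uint16) + 1

# With --incrementOutlierId, every ID is shifted up once more:
# noise becomes 1 and real clusters start at 2.
incremented = labels + 1

print("raw labels:        ", np.unique(raw))          # e.g. [-1  0  1]
print("shifted labels:    ", np.unique(labels))       # e.g. [0 1 2]
print("incremented labels:", np.unique(incremented))  # e.g. [1 2 3]
```

One consequence of this design is that cluster IDs live in `uint16`, so the scheme silently assumes fewer than 65535 clusters per file, a comfortable bound for this tool's use case.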