From b38831668eac0af1ce0e3e5d96710999d988b08e Mon Sep 17 00:00:00 2001 From: Sujee Maniyam Date: Wed, 2 Oct 2024 00:55:14 -0700 Subject: [PATCH 1/7] Updated RAG example for DPK version 0.2.1 Signed-off-by: Sujee Maniyam --- examples/notebooks/rag/README.md | 2 +- examples/notebooks/rag/my_config.py | 38 + .../rag/rag_1A_dpk_process_python.ipynb | 1775 +++++++++++++++ .../rag/rag_1A_dpk_process_ray.ipynb | 2026 ++++++++--------- .../rag/rag_1B_load_data_into_milvus.ipynb | 304 ++- .../notebooks/rag/rag_1C_vector_search.ipynb | 127 +- .../rag/rag_1D_query_llama_replicate.ipynb | 229 +- .../rag/rag_2A_llamaindex_process.ipynb | 195 +- .../rag/rag_2B_llamaindex_query.ipynb | 48 +- examples/notebooks/rag/requirements.txt | 11 +- .../notebooks/rag/setup-python-dev-env.md | 26 +- 11 files changed, 3182 insertions(+), 1599 deletions(-) create mode 100644 examples/notebooks/rag/my_config.py create mode 100644 examples/notebooks/rag/rag_1A_dpk_process_python.ipynb diff --git a/examples/notebooks/rag/README.md b/examples/notebooks/rag/README.md index 3ef82eba4..f4a3460a1 100644 --- a/examples/notebooks/rag/README.md +++ b/examples/notebooks/rag/README.md @@ -35,7 +35,7 @@ This code uses DPK to Here is the code: -- Python version: TODO +- Python version: [rag_1A_dpk_process_python.ipynb](rag_1A_dpk_process_python.ipynb) - Ray version: [rag_1A_dpk_process_ray.ipynb](rag_1A_dpk_process_ray.ipynb) diff --git a/examples/notebooks/rag/my_config.py b/examples/notebooks/rag/my_config.py new file mode 100644 index 000000000..ba9ea89fd --- /dev/null +++ b/examples/notebooks/rag/my_config.py @@ -0,0 +1,38 @@ +import os + + +## Configuration +class MyConfig: + pass + +MY_CONFIG = MyConfig () + +## Input Data - configure this to the folder we want to process +MY_CONFIG.INPUT_DATA_DIR = "input" +MY_CONFIG.OUTPUT_FOLDER = "output" +MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , "output_final") +### ------------------------------- + +### Milvus config +MY_CONFIG.DB_URI = './rag_1_dpk.db' # For embedded instance +MY_CONFIG.COLLECTION_NAME = 'dpk_papers' + + +## Embedding model +MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2' +MY_CONFIG.EMBEDDING_LENGTH = 384 + +## LLM Model +MY_CONFIG.LLM_MODEL = "meta/meta-llama-3-8b-instruct" + + + +## RAY CONFIGURATION +num_cpus_available = os.cpu_count() +# print (num_cpus_available) +# MY_CONFIG.RAY_NUM_CPUS = num_cpus_available // 2 ## use half the available cores for processing +MY_CONFIG.RAY_NUM_CPUS = 0.8 +# print (MY_CONFIG.RAY_NUM_CPUS) +MY_CONFIG.RAY_MEMORY_GB = 2 # GB +# MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3 +MY_CONFIG.RAY_RUNTIME_WORKERS = 2 \ No newline at end of file diff --git a/examples/notebooks/rag/rag_1A_dpk_process_python.ipynb b/examples/notebooks/rag/rag_1A_dpk_process_python.ipynb new file mode 100644 index 000000000..ae8b0836d --- /dev/null +++ b/examples/notebooks/rag/rag_1A_dpk_process_python.ipynb @@ -0,0 +1,1775 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", + "metadata": {}, + "source": [ + "
\n", + "

Data Processing for RAG with Data Prep Kit (Python)

\n", + " \n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "id": "b15976e3", + "metadata": {}, + "source": [ + "## Before Running the notebook\n", + "\n", + "Please complete [setting up python dev environment](./setup-python-dev-env.md)" + ] + }, + { + "cell_type": "markdown", + "id": "053ecf08-5f62-4b99-9347-8a0955843d21", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "This notebook will process PDF documents as part of RAG pipeline\n", + "\n", + "![](media/rag-overview-2.png)\n", + "\n", + "This notebook will perform steps 1, 2 and 3 in RAG pipeline.\n", + "\n", + "Here are the processing steps:\n", + "\n", + "- **pdf2parquet** : Extract text from PDF and convert them into parquet files\n", + "- **Chunk documents**: Split the PDFs into 'meaningful sections' (paragraphs, sentences ..etc)\n", + "- **Doc_ID generation**: Each chunk is assigned a uniq id, based on content and hash\n", + "- **Exact Dedup**: Chunks with exact same content are filtered out\n", + "- **Text encoder**: Convert chunks into vectors using embedding models" + ] + }, + { + "cell_type": "markdown", + "id": "e8b10be1", + "metadata": {}, + "source": [ + "## Step-1: Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "33345487", + "metadata": {}, + "outputs": [], + "source": [ + "from my_config import MY_CONFIG" + ] + }, + { + "cell_type": "markdown", + "id": "facb3bbc", + "metadata": {}, + "source": [ + "## Step-2: Data\n", + "\n", + "We will use white papers about LLMs. \n", + "\n", + "- [Granite Code Models](https://arxiv.org/abs/2405.04324)\n", + "- [Attention is all you need](https://arxiv.org/abs/1706.03762)\n", + "\n", + "You can of course substite your own data below" + ] + }, + { + "cell_type": "markdown", + "id": "f1fe7c0c", + "metadata": {}, + "source": [ + "### 2.1 - Download data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8739b7a2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Local file 'input/attension.pdf' (2.22 MB) already exists. Skipping download.\n", + "Local file 'input/granite.pdf' (1.27 MB) already exists. 
Skipping download.\n" + ] + } + ], + "source": [ + "import os, sys\n", + "import shutil\n", + "from utils import download_file\n", + "\n", + "## Download the data files\n", + "shutil.os.makedirs(MY_CONFIG.INPUT_DATA_DIR, exist_ok=True)\n", + "\n", + "download_file (url = 'https://arxiv.org/pdf/1706.03762', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'attension.pdf' ))\n", + "\n", + "download_file (url = 'https://arxiv.org/pdf/2405.04324', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'granite.pdf' ))\n" + ] + }, + { + "cell_type": "markdown", + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", + "metadata": {}, + "source": [ + "### 2.2 - Set input/output path variables for the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Cleared output directory\n" + ] + } + ], + "source": [ + "import os, sys\n", + "import shutil\n", + "\n", + "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", + " raise Exception (f\"❌ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", + "\n", + "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", + "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", + "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", + "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_embeddings_out')\n", + "\n", + "## clear output folder\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", + "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", + "\n", + "print (\"✅ Cleared output directory\")" + ] + }, + { + "cell_type": "markdown", + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", + "metadata": {}, + "source": [ + "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", + "\n", + "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", + "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", + "metadata": {}, + "source": [ + "### 3.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "482605b2-d814-456d-9195-49a2ec454ef0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-1: Processing input='input' --> output='output/01_parquet_out'\n" + ] + } + ], + "source": [ + "STAGE = 1 \n", + "\n", + "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", + "output_folder = output_parquet_dir\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", + "metadata": {}, + "source": [ + "### 3.2 - Execute " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:23:40 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", + "00:23:40 INFO - pipeline id 
pipeline_id\n", + "00:23:40 INFO - code location None\n", + "00:23:40 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_parquet_out\n", + "00:23:40 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:23:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "00:23:40 INFO - orchestrator pdf2parquet started at 2024-10-02 00:23:40\n", + "00:23:40 INFO - Number of files is 2, source profile {'max_file_size': 2.112621307373047, 'min_file_size': 1.2146415710449219, 'total_file_size': 3.3272628784179688}\n", + "00:23:40 INFO - Initializing models\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bd58971a33d4410c91e742e735a6e6e3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 10 files: 0%| | 0/10 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filename  contents  num_pages  num_tables  num_doc_elements  document_id  ext  hash  size  date_acquired  pdf_convert_time  source_filename
0granite.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...28173484a32ba4c-8fdb-4eeb-a06b-d28493efe8e3pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf
1attension.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...154193f275d75a-a072-4836-8a55-6a65f0d34577pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdf
\n", + "" + ], + "text/plain": [ + " filename contents \\\n", + "0 granite.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... \n", + "1 attension.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 28 17 348 \n", + "1 15 4 193 \n", + "\n", + " document_id ext \\\n", + "0 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 pdf \n", + "1 f275d75a-a072-4836-8a55-6a65f0d34577 pdf \n", + "\n", + " hash size \\\n", + "0 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "1 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "\n", + " date_acquired pdf_convert_time source_filename \n", + "0 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "1 2024-10-02T00:24:14.713654 18.004455 attension.pdf " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(5)\n", + "\n", + "## To display certain columns\n", + "#parquet_df[['column1', 'column2', 'column3']].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "72274586", + "metadata": {}, + "source": [ + "## Step-4: Doc chunks\n", + "\n", + "Split the documents in chunks, according to their layout segmentation." + ] + }, + { + "cell_type": "markdown", + "id": "96198fa6", + "metadata": {}, + "source": [ + "### 4.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "305f00a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" + ] + } + ], + "source": [ + "STAGE = 2\n", + "\n", + "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_chunk_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "369f2cd1", + "metadata": {}, + "source": [ + "### 4.2 - Execute " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5b7b18d5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:24:50 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", + "00:24:50 INFO - pipeline id pipeline_id\n", + "00:24:50 INFO - code location None\n", + "00:24:50 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", + "00:24:50 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:24:50 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:24:50 INFO - orchestrator doc_chunk started at 2024-10-02 00:24:50\n", + "00:24:50 INFO - Number of files is 2, source profile {'max_file_size': 
0.12735748291015625, 'min_file_size': 0.035338401794433594, 'total_file_size': 0.16269588470458984}\n", + "00:24:50 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "00:24:50 INFO - Completed 2 files (100.0%) in 0.004 min\n", + "00:24:50 INFO - Done processing 2 files, waiting for flush() completion.\n", + "00:24:50 INFO - done flushing in 0.0 sec\n", + "00:24:50 INFO - Completed execution in 0.004 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:2 completed successfully\n", + "CPU times: user 1.07 s, sys: 95.1 ms, total: 1.16 s\n", + "Wall time: 1.19 s\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from doc_chunk_transform_python import DocChunkPythonTransformConfiguration\n", + "\n", + "# Prepare the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # doc_chunk arguments\n", + " # ...\n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(DocChunkPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"❌ Job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "213afdf6", + "metadata": {}, + "source": [ + "### 4.3 - Inspect Generated output\n", + "\n", + "We would see documents are split into many chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d8138d43", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Files processed : 2\n", + "Chunks created : 211\n", + "Input data dimensions (rows x columns)= (2, 12)\n", + "Output data dimensions (rows x columns)= (211, 16)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filename  num_pages  num_tables  num_doc_elements  ext  hash  size  date_acquired  pdf_convert_time  source_filename  source_document_id  contents  doc_jsonpath  page_number  bbox  document_id
87granite.pdf2817348pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf4a32ba4c-8fdb-4eeb-a06b-d28493efe8e36.3 Code Editing and Translation\\nTable 12: Pa...$.main-text[189]16[106.69820404, 190.24554443, 504.00320435, 211...f28d8c9a4fe81f0baf801daf9a95ddaf152a4ac5e8b8ac...
154attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345773.2.2 Multi-Head Attention\\nMulti-head attenti...$.main-text[55]5[107.46644592, 669.41210938, 503.99703979, 690...da79f02a5f19c2f07de7a6f1da9df8db00f01a477582ac...
67granite.pdf2817348pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf4a32ba4c-8fdb-4eeb-a06b-d28493efe8e36.1.5 RepoBench, CrossCodeEval: Repository-Lev...$.main-text[153]12[106.97065735, 224.31654358, 505.74191284, 290...cd5bd4537bde007298a91de7fa2fb4b56516d2f1d31262...
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "87 granite.pdf 28 17 348 pdf \n", + "154 attension.pdf 15 4 193 pdf \n", + "67 granite.pdf 28 17 348 pdf \n", + "\n", + " hash size \\\n", + "87 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "154 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "67 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "87 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "154 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "67 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "\n", + " source_document_id \\\n", + "87 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 \n", + "154 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "67 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 \n", + "\n", + " contents doc_jsonpath \\\n", + "87 6.3 Code Editing and Translation\\nTable 12: Pa... $.main-text[189] \n", + "154 3.2.2 Multi-Head Attention\\nMulti-head attenti... $.main-text[55] \n", + "67 6.1.5 RepoBench, CrossCodeEval: Repository-Lev... $.main-text[153] \n", + "\n", + " page_number bbox \\\n", + "87 16 [106.69820404, 190.24554443, 504.00320435, 211... \n", + "154 5 [107.46644592, 669.41210938, 503.99703979, 690... \n", + "67 12 [106.97065735, 224.31654358, 505.74191284, 290... \n", + "\n", + " document_id \n", + "87 f28d8c9a4fe81f0baf801daf9a95ddaf152a4ac5e8b8ac... \n", + "154 da79f02a5f19c2f07de7a6f1da9df8db00f01a477582ac... \n", + "67 cd5bd4537bde007298a91de7fa2fb4b56516d2f1d31262... " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (f\"Files processed : {input_df.shape[0]:,}\")\n", + "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.sample(min(3, output_df.shape[0]))" + ] + }, + { + "cell_type": "markdown", + "id": "ece021fd", + "metadata": {}, + "source": [ + "## Step-5: DOC ID generation\n", + "\n", + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set hash_column to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set int_id_column to the name of the column, where you want to store it. **This is a pre-requisite for fuzzy dedup** in the pipeline." 
+ ] + }, + { + "cell_type": "markdown", + "id": "e414c12c", + "metadata": {}, + "source": [ + "### 5.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "10251d3d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" + ] + } + ], + "source": [ + "\n", + "STAGE = 3\n", + "\n", + "input_folder = output_chunk_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_docid_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "0f312347", + "metadata": {}, + "source": [ + "### 5.2 - Execute " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a8b76a71", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:24:50 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", + "00:24:50 INFO - pipeline id pipeline_id\n", + "00:24:50 INFO - code location None\n", + "00:24:50 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", + "00:24:50 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:24:50 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:24:50 INFO - orchestrator doc_id started at 2024-10-02 00:24:50\n", + "00:24:50 INFO - Number of files is 2, source profile {'max_file_size': 0.06398963928222656, 'min_file_size': 0.028062820434570312, 'total_file_size': 0.09205245971679688}\n", + "00:24:50 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "00:24:50 INFO - Completed 2 files (100.0%) in 0.0 min\n", + "00:24:50 INFO - Done processing 2 files, waiting for flush() completion.\n", + "00:24:50 INFO - done flushing in 0.0 sec\n", + "00:24:50 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:3 completed successfully\n", + "CPU times: user 13.4 ms, sys: 4.83 ms, total: 18.3 ms\n", + "Wall time: 14.7 ms\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from doc_id_transform_python import DocIDPythonTransformRuntimeConfiguration\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # doc id configuration\n", + " \"doc_id_doc_column\": \"contents\",\n", + " \"doc_id_hash_column\": \"chunk_hash\",\n", + " \"doc_id_int_column\": \"chunk_id\",\n", + "}\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# launch\n", + "\n", + "launcher = PythonTransformLauncher(DocIDPythonTransformRuntimeConfiguration())\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"❌ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "8c23338b", + "metadata": {}, + "source": [ + "### 5.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ec23aa3a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (211, 16)\n", + "Output data dimensions (rows x columns)= (211, 18)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filename  num_pages  num_tables  num_doc_elements  ext  hash  size  date_acquired  pdf_convert_time  source_filename  source_document_id  contents  doc_jsonpath  page_number  bbox  document_id  chunk_hash  chunk_id
192attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345776.2 Model Variations\\nIn Table 3 rows (A), we ...$.main-text[118]9[107.27760315, 318.93438721, 505.24127197, 350...70948f748c6f275b39c70652e29d60dfd53c545e0d6d92...70948f748c6f275b39c70652e29d60dfd53c545e0d6d92...69
71granite.pdf2817348pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf4a32ba4c-8fdb-4eeb-a06b-d28493efe8e36.1.5 RepoBench, CrossCodeEval: Repository-Lev...$.tables[7]13[109.39778137, 486.89639282, 502.1010437, 679....b7497dcda69d88caa6b7c3a462edb925ffa97ce5e42c52...b7497dcda69d88caa6b7c3a462edb925ffa97ce5e42c52...159
196attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345776.3 English Constituency Parsing\\nWe performed...$.main-text[123]9[106.96768951, 69.592453, 504.24859619, 101.62...93e01b0e6bafcfe5fcd113d1a3dfedad27d12f81038ff5...93e01b0e6bafcfe5fcd113d1a3dfedad27d12f81038ff5...73
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "192 attension.pdf 15 4 193 pdf \n", + "71 granite.pdf 28 17 348 pdf \n", + "196 attension.pdf 15 4 193 pdf \n", + "\n", + " hash size \\\n", + "192 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "71 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "196 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "192 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "71 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "196 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "\n", + " source_document_id \\\n", + "192 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "71 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 \n", + "196 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "\n", + " contents doc_jsonpath \\\n", + "192 6.2 Model Variations\\nIn Table 3 rows (A), we ... $.main-text[118] \n", + "71 6.1.5 RepoBench, CrossCodeEval: Repository-Lev... $.tables[7] \n", + "196 6.3 English Constituency Parsing\\nWe performed... $.main-text[123] \n", + "\n", + " page_number bbox \\\n", + "192 9 [107.27760315, 318.93438721, 505.24127197, 350... \n", + "71 13 [109.39778137, 486.89639282, 502.1010437, 679.... \n", + "196 9 [106.96768951, 69.592453, 504.24859619, 101.62... \n", + "\n", + " document_id \\\n", + "192 70948f748c6f275b39c70652e29d60dfd53c545e0d6d92... \n", + "71 b7497dcda69d88caa6b7c3a462edb925ffa97ce5e42c52... \n", + "196 93e01b0e6bafcfe5fcd113d1a3dfedad27d12f81038ff5... \n", + "\n", + " chunk_hash chunk_id \n", + "192 70948f748c6f275b39c70652e29d60dfd53c545e0d6d92... 69 \n", + "71 b7497dcda69d88caa6b7c3a462edb925ffa97ce5e42c52... 159 \n", + "196 93e01b0e6bafcfe5fcd113d1a3dfedad27d12f81038ff5... 73 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.sample(min(3, output_df.shape[0]))" + ] + }, + { + "cell_type": "markdown", + "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53", + "metadata": {}, + "source": [ + "## Step-6: Exact Dedup\n", + "\n", + "Remove documents having identical code to remove bias in the training data. On the content of each document, a SHA256 hash is computed,\n", + "followed by de-duplication of record having identical hashes." 
+ ] + }, + { + "cell_type": "markdown", + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", + "metadata": {}, + "source": [ + "### 6.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4c7a1b94", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" + ] + } + ], + "source": [ + "STAGE = 4\n", + "\n", + "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_exact_dedupe_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", + "metadata": {}, + "source": [ + "### 6.2 - Execute " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:24:50 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", + "00:24:50 INFO - pipeline id pipeline_id\n", + "00:24:50 INFO - code location None\n", + "00:24:50 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", + "00:24:50 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:24:50 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:24:50 INFO - orchestrator ededup started at 2024-10-02 00:24:50\n", + "00:24:50 INFO - Number of files is 2, source profile {'max_file_size': 0.06945991516113281, 'min_file_size': 0.03227043151855469, 'total_file_size': 0.1017303466796875}\n", + "00:24:50 INFO - Starting from the beginning\n", + "00:24:50 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "00:24:50 INFO - Completed 2 files (100.0%) in 0.0 min\n", + "00:24:50 INFO - Done processing 2 files, waiting for flush() completion.\n", + "00:24:50 INFO - done flushing in 0.0 sec\n", + "00:24:50 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:4 completed successfully\n", + "CPU times: user 22.1 ms, sys: 5.79 ms, total: 27.9 ms\n", + "Wall time: 23.5 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "\n", + "# Import ededup transform configuration\n", + "from ededup_transform_python import EdedupPythonTransformRuntimeConfiguration\n", + "\n", + "\n", + "# Prepare the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # ededup parameters\n", + " \"ededup_doc_column\": \"contents\",\n", + " \"ededup_doc_id_column\": \"chunk_hash\",\n", + " \n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(EdedupPythonTransformRuntimeConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"❌ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "eaf1c3c3", + "metadata": {}, + "source": [ + "### 6.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d824ebf6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (211, 18)\n", + "Output data dimensions (rows x columns)= (211, 19)\n", + "Input chunks before exact dedupe : 211\n", + "Output chunks after exact dedupe : 211\n", + "Duplicate chunks removed : 0\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filename  num_pages  num_tables  num_doc_elements  ext  hash  size  date_acquired  pdf_convert_time  source_filename  source_document_id  contents  doc_jsonpath  page_number  bbox  document_id  chunk_hash  chunk_id  removed
194attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345776.3 English Constituency Parsing\\nTo evaluate ...$.main-text[121]9[107.15766144, 167.93530273, 504.10968018, 210...10c85ade191100c9586ffb4e5ded4944bc4fd865d0919f...10c85ade191100c9586ffb4e5ded4944bc4fd865d0919f...71[]
101granite.pdf2817348pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf4a32ba4c-8fdb-4eeb-a06b-d28493efe8e36.5 Math Reasoning\\nTable 15: Performance on 4...$.main-text[219]19[118.49487305, 699.65753174, 492.17700195, 710...c39e0817c8d1edf1d322cef0535b5a63b80d2b2b4d1852...c39e0817c8d1edf1d322cef0535b5a63b80d2b2b4d1852...189[]
206attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345777 Conclusion\\nAcknowledgements We are grateful...$.main-text[135]10[107.4437561, 212.26509094, 504.00241089, 232....855fdc0d15cb042a43d799b9a38d4339ae1e25b2df99c4...855fdc0d15cb042a43d799b9a38d4339ae1e25b2df99c4...83[]
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "194 attension.pdf 15 4 193 pdf \n", + "101 granite.pdf 28 17 348 pdf \n", + "206 attension.pdf 15 4 193 pdf \n", + "\n", + " hash size \\\n", + "194 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "101 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "206 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "194 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "101 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "206 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "\n", + " source_document_id \\\n", + "194 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "101 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 \n", + "206 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "\n", + " contents doc_jsonpath \\\n", + "194 6.3 English Constituency Parsing\\nTo evaluate ... $.main-text[121] \n", + "101 6.5 Math Reasoning\\nTable 15: Performance on 4... $.main-text[219] \n", + "206 7 Conclusion\\nAcknowledgements We are grateful... $.main-text[135] \n", + "\n", + " page_number bbox \\\n", + "194 9 [107.15766144, 167.93530273, 504.10968018, 210... \n", + "101 19 [118.49487305, 699.65753174, 492.17700195, 710... \n", + "206 10 [107.4437561, 212.26509094, 504.00241089, 232.... \n", + "\n", + " document_id \\\n", + "194 10c85ade191100c9586ffb4e5ded4944bc4fd865d0919f... \n", + "101 c39e0817c8d1edf1d322cef0535b5a63b80d2b2b4d1852... \n", + "206 855fdc0d15cb042a43d799b9a38d4339ae1e25b2df99c4... \n", + "\n", + " chunk_hash chunk_id removed \n", + "194 10c85ade191100c9586ffb4e5ded4944bc4fd865d0919f... 71 [] \n", + "101 c39e0817c8d1edf1d322cef0535b5a63b80d2b2b4d1852... 189 [] \n", + "206 855fdc0d15cb042a43d799b9a38d4339ae1e25b2df99c4... 83 [] " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "output_df.sample(min(3, output_df.shape[0]))" + ] + }, + { + "cell_type": "markdown", + "id": "85309751-8556-41c6-ac32-84acc941bc8d", + "metadata": {}, + "source": [ + "## Fuzzy Dedup\n", + "\n", + "**Fuzzy dedupe is currently available in RAY version only**\n", + "\n", + "So we will skip this here" + ] + }, + { + "cell_type": "markdown", + "id": "5370950a-2a3a-4143-8218-f9b4808099ba", + "metadata": {}, + "source": [ + "## Step-7: Text encoding\n", + "\n", + "Encode text for the vector storage." 
+ ] + }, + { + "cell_type": "markdown", + "id": "74fd33b1", + "metadata": {}, + "source": [ + "### 7.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "20a153fa-fd56-401e-86be-4f7617affcc8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-5: Processing input='output/04_exact_dedupe_out' --> output='output/05_embeddings_out'\n" + ] + } + ], + "source": [ + "STAGE = 5\n", + "\n", + "input_folder = output_exact_dedupe_dir\n", + "output_folder = output_embeddings_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "b9112479", + "metadata": {}, + "source": [ + "### 7.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "228df6b2-bc62-494b-9697-03ece98d7853", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:24:50 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", + "00:24:50 INFO - pipeline id pipeline_id\n", + "00:24:50 INFO - code location None\n", + "00:24:50 INFO - data factory data_ is using local data access: input_folder - output/04_exact_dedupe_out output_folder - output/05_embeddings_out\n", + "00:24:50 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:24:50 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:24:50 INFO - orchestrator text_encoder started at 2024-10-02 00:24:50\n", + "00:24:50 INFO - Number of files is 2, source profile {'max_file_size': 0.06981945037841797, 'min_file_size': 0.032629966735839844, 'total_file_size': 0.10244941711425781}\n", + "00:24:52 INFO - Completed 1 files (50.0%) in 0.008 min\n", + "00:24:53 INFO - Completed 2 files (100.0%) in 0.02 min\n", + "00:24:53 INFO - Done processing 2 files, waiting for flush() completion.\n", + "00:24:53 INFO - done flushing in 0.0 sec\n", + "00:24:53 INFO - Completed execution in 0.046 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:5 completed successfully\n", + "CPU times: user 1.78 s, sys: 103 ms, total: 1.88 s\n", + "Wall time: 3.09 s\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from text_encoder_transform_python import TextEncoderPythonTransformConfiguration\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # text_encoder\n", + " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", + "}\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(TextEncoderPythonTransformConfiguration())\n", + "# Launch the ray actor(s) to process the input\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"❌ Job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "b734852c", + "metadata": {}, + "source": [ + "### 7.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7b1c1d09", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (211, 19)\n", + "Output data dimensions (rows x columns)= (211, 20)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filename  num_pages  num_tables  num_doc_elements  ext  hash  size  date_acquired  pdf_convert_time  source_filename  source_document_id  contents  doc_jsonpath  page_number  bbox  document_id  chunk_hash  chunk_id  removed  embeddings
193attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345776.2 Model Variations\\nIn Table 3 rows (B), we ...$.main-text[119]9[107.44257355, 248.49208069, 505.24127197, 312...6b79d74f59d1218fa3cdff6d13b504c8bf80558f3e2522...6b79d74f59d1218fa3cdff6d13b504c8bf80558f3e2522...70[][-0.0049973284, -0.10789071, 0.02143236, -0.02...
210attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d34577Attention Visualizations Input-Input Layer5\\nF...$.main-text[190]15[107.43354034, 157.36341858, 504.06988525, 189...67626adb815bf2b27871df24d538ddc10ae68a3fbbd238...67626adb815bf2b27871df24d538ddc10ae68a3fbbd238...87[][0.01508544, -0.015680796, 0.039181348, 0.0084...
46granite.pdf2817348pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf4a32ba4c-8fdb-4eeb-a06b-d28493efe8e36.1.1 HumanEvalSynthesize: Multilingual Code G...$.main-text[117]9[107.46860504, 613.84277344, 456.97003174, 624...3d5d963f59d4ecb05d1ec2d014747459e01cabe2944bba...3d5d963f59d4ecb05d1ec2d014747459e01cabe2944bba...134[][-0.029933447, 0.031515192, -0.04598905, -0.01...
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "193 attension.pdf 15 4 193 pdf \n", + "210 attension.pdf 15 4 193 pdf \n", + "46 granite.pdf 28 17 348 pdf \n", + "\n", + " hash size \\\n", + "193 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "210 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "46 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "193 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "210 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "46 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "\n", + " source_document_id \\\n", + "193 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "210 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "46 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 \n", + "\n", + " contents doc_jsonpath \\\n", + "193 6.2 Model Variations\\nIn Table 3 rows (B), we ... $.main-text[119] \n", + "210 Attention Visualizations Input-Input Layer5\\nF... $.main-text[190] \n", + "46 6.1.1 HumanEvalSynthesize: Multilingual Code G... $.main-text[117] \n", + "\n", + " page_number bbox \\\n", + "193 9 [107.44257355, 248.49208069, 505.24127197, 312... \n", + "210 15 [107.43354034, 157.36341858, 504.06988525, 189... \n", + "46 9 [107.46860504, 613.84277344, 456.97003174, 624... \n", + "\n", + " document_id \\\n", + "193 6b79d74f59d1218fa3cdff6d13b504c8bf80558f3e2522... \n", + "210 67626adb815bf2b27871df24d538ddc10ae68a3fbbd238... \n", + "46 3d5d963f59d4ecb05d1ec2d014747459e01cabe2944bba... \n", + "\n", + " chunk_hash chunk_id removed \\\n", + "193 6b79d74f59d1218fa3cdff6d13b504c8bf80558f3e2522... 70 [] \n", + "210 67626adb815bf2b27871df24d538ddc10ae68a3fbbd238... 87 [] \n", + "46 3d5d963f59d4ecb05d1ec2d014747459e01cabe2944bba... 134 [] \n", + "\n", + " embeddings \n", + "193 [-0.0049973284, -0.10789071, 0.02143236, -0.02... \n", + "210 [0.01508544, -0.015680796, 0.039181348, 0.0084... \n", + "46 [-0.029933447, 0.031515192, -0.04598905, -0.01... 
" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.sample(min(3, output_df.shape[0]))" + ] + }, + { + "cell_type": "markdown", + "id": "f5e12630-be6b-4188-a925-77117155617b", + "metadata": {}, + "source": [ + "## Step-8: Copy output to final output dir" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Copied output from 'output/05_embeddings_out' --> 'output/output_final'\n" + ] + } + ], + "source": [ + "import shutil\n", + "\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", + "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", + "\n", + "print (f\"✅ Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb b/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb index 0f8440178..8a8942b1f 100644 --- a/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb +++ b/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "source": [ "
\n", - "

Data Processing for RAG with Data Prep Kit

\n", + "

Data Processing for RAG with Data Prep Kit (RAY)

\n", " \n", "
\n" ] @@ -38,8 +38,8 @@ "\n", "- **pdf2parquet** : Extract text from PDF and convert them into parquet files\n", "- **Chunk documents**: Split the PDFs into 'meaningful sections' (paragraphs, sentences ..etc)\n", - "- **Exact Dedup**: Chunks with exact same content are filtered out\n", "- **Doc_ID generation**: Each chunk is assigned a uniq id, based on content and hash\n", + "- **Exact Dedup**: Chunks with exact same content are filtered out\n", "- **Fuzzy Dedup**: Eliminate chunks that are 'very similar' content\n", "- **Doc quality**: Scores the documents based on criteria like number of words, if it contains bad words ..etc\n", "- **Text encoder**: Convert chunks into vectors using embedding models" @@ -60,21 +60,8 @@ "metadata": {}, "outputs": [], "source": [ - "import os \n", - "\n", - "## Configuration\n", - "class MyConfig:\n", - " pass \n", - "\n", - "MY_CONFIG = MyConfig ()\n", - "\n", - "## Input Data - configure this to the folder we want to process\n", - "MY_CONFIG.INPUT_DATA_DIR = \"input\"\n", - "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", - "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", - "\n", - "## Embedding model\n", - "MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'\n", + "import os\n", + "from my_config import MY_CONFIG\n", "\n", "## RAY CONFIGURATION\n", "num_cpus_available = os.cpu_count()\n", @@ -89,29 +76,39 @@ }, { "cell_type": "markdown", - "id": "02cc3f0e", + "id": "40c58856", "metadata": {}, "source": [ - "### Download Data\n", + "## Step-2: Data\n", "\n", - "We will use [Walmart annual report PDFs](https://github.com/sujee/data/tree/main/data-prep-kit/walmart-reports-1) as our input data.\n", + "We will use white papers about LLMs. \n", "\n", - "Feel free to substitute your data" + "- [Granite Code Models](https://arxiv.org/abs/2405.04324)\n", + "- [Attention is all you need](https://arxiv.org/abs/1706.03762)\n", + "\n", + "You can of course substite your own data below" + ] + }, + { + "cell_type": "markdown", + "id": "6bce5939", + "metadata": {}, + "source": [ + "### 2.1 - Download data" ] }, { "cell_type": "code", "execution_count": 2, - "id": "82c1ae58", + "id": "1bfde6eb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Local file 'input/Walmart-10K-Reports-Optimized_2023.pdf' (1.61 MB) already exists. Skipping download.\n", - "Local file 'input/Walmart_2024.pdf' (4.87 MB) already exists. Skipping download.\n", - "Local file 'input/Walmart_2024_copy.pdf' (4.87 MB) already exists. Skipping download.\n" + "Local file 'input/attension.pdf' (2.22 MB) already exists. Skipping download.\n", + "Local file 'input/granite.pdf' (1.27 MB) already exists. 
Skipping download.\n" ] } ], @@ -123,11 +120,9 @@ "## Download the data files\n", "shutil.os.makedirs(MY_CONFIG.INPUT_DATA_DIR, exist_ok=True)\n", "\n", - "download_file (url = 'https://raw.githubusercontent.com/sujee/data/main/data-prep-kit/walmart-reports-1/Walmart-10K-Reports-Optimized_2023.pdf', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'Walmart-10K-Reports-Optimized_2023.pdf' ))\n", - "\n", - "download_file (url = 'https://raw.githubusercontent.com/sujee/data/main/data-prep-kit/walmart-reports-1/Walmart_2024.pdf', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'Walmart_2024.pdf' ))\n", + "download_file (url = 'https://arxiv.org/pdf/1706.03762', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'attension.pdf' ))\n", "\n", - "download_file (url = 'https://raw.githubusercontent.com/sujee/data/main/data-prep-kit/walmart-reports-1/Walmart_2024.pdf', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'Walmart_2024_copy.pdf' )) # create a dupe file" + "download_file (url = 'https://arxiv.org/pdf/2405.04324', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'granite.pdf' ))\n" ] }, { @@ -135,7 +130,7 @@ "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", "metadata": {}, "source": [ - "### Set input/output path variables for the pipeline" + "### 2.2 - Set input/output path variables for the pipeline" ] }, { @@ -159,6 +154,13 @@ "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", " raise Exception (f\"❌ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", "\n", + "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", + "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", + "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", + "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_fuzzy_dedupe_out')\n", + "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '06_embeddings_out')\n", + "\n", "\n", "## clear output folder\n", "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", @@ -167,42 +169,12 @@ "print (\"✅ Cleared output directory\")" ] }, - { - "cell_type": "markdown", - "id": "bd5d976e-cb4c-4469-af39-4b7ea507e9d8", - "metadata": {}, - "source": [ - "### Import Common python modules" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "66178913-42b8-426b-a2e9-9587268fd05b", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "\n", - "# Main repo root\n", - "from utils import rootdir\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from data_processing.utils import ParamsUtils\n", - "\n", - "STAGE = 0" - ] - }, { "cell_type": "markdown", "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", "metadata": {}, "source": [ - "\n", - "\n", - "## Step-2: pdf2parquet - Convert data from PDF to Parquet\n", + "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", "\n", "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", @@ -214,12 +186,12 @@ "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", "metadata": {}, "source": [ - "### Set Input/output Folder" + "### 3.1 - Set 
Input/output Folder" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "482605b2-d814-456d-9195-49a2ec454ef0", "metadata": {}, "outputs": [ @@ -232,11 +204,10 @@ } ], "source": [ - "STAGE += 1\n", - "# STAGE = 1 ## DEBUG\n", + "STAGE = 1 \n", "\n", "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_parquet_out\")\n", + "output_folder = output_parquet_dir\n", "\n", "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" ] @@ -246,12 +217,12 @@ "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", "metadata": {}, "source": [ - "### Execute " + "### 3.2 - Execute " ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", "metadata": {}, "outputs": [ @@ -259,51 +230,31 @@ "name": "stderr", "output_type": "stream", "text": [ - "10:27:06 INFO - Running locally\n", - "10:27:06 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': False}\n", - "10:27:06 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_parquet_out\n", - "10:27:06 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:27:06 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "10:27:06 INFO - pipeline id pipeline_id\n", - "10:27:06 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", - "10:27:06 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "10:27:06 INFO - actor creation delay 0\n", - "10:27:06 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:27:08,526\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:27:13 INFO - orchestrator started at 2024-08-30 10:27:13\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:27:13 INFO - Number of files is 3, source profile {'max_file_size': 4.640201568603516, 'min_file_size': 1.5370569229125977, 'total_file_size': 10.817460060119629}\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:27:13 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.336485291831195, 'object_store': 4.168242644518614}\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:27:13 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 10:27:17 INFO - Initializing models\n", - "Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 30615.36it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m /home/sujee/apps/anaconda3/envs/data-prep-kit-3-py311/lib/python3.11/site-packages/torch/nn/modules/transformer.py:307: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m warnings.warn(f\"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}\")\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:29.093 ( 315.179s) [ 48BC0740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 10:27:17 INFO - Initializing models\n", - "Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 97867.09it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m /home/sujee/apps/anaconda3/envs/data-prep-kit-3-py311/lib/python3.11/site-packages/torch/nn/modules/transformer.py:307: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m warnings.warn(f\"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}\")\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:31.255 ( 317.341s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1011\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:31.764 ( 317.850s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1037\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:31.833 ( 317.919s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1176\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:32.154 ( 318.240s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1321\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:32.543 ( 318.629s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1103\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:32:40 INFO - Completed 1 files in 5.459707876046498 min\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:32:40 INFO - Completed 1 files (33.333333333333336%) in 5.459709358215332 min. 
Waiting for completion\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 2024-08-30 10:32:45.826 ( 331.904s) [ 54E47740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 2024-08-30 10:32:45.826 ( 331.904s) [ 54E47740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-selected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 2024-08-30 10:32:45.826 ( 331.904s) [ 54E47740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 2024-08-30 10:32:45.826 ( 331.904s) [ 54E47740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 2024-08-30 10:32:45.826 ( 331.904s) [ 54E47740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-selected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:29.521 ( 615.607s) [ 48BC0740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:31.651 ( 617.737s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1011\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:32.128 ( 618.214s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1037\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:32.195 ( 618.281s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1176\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:32.511 ( 618.597s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1321\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:32.873 ( 618.959s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1103\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:37:40 INFO - Completed processing 3 files in 10.459254721800486 min\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:37:40 INFO - done flushing in 0.0009496212005615234 sec\n", - "10:37:50 INFO - Completed execution in 10.735311404863994 min, execution result 0\n" + "00:25:24 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", + "00:25:24 INFO - pipeline id pipeline_id\n", + "00:25:24 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "00:25:24 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "00:25:24 INFO - actor creation delay 0\n", + "00:25:24 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:25:24 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_parquet_out\n", + "00:25:24 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:25:24 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "00:25:24 INFO - Running locally\n", + "2024-10-02 00:25:26,362\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:25:29 INFO - orchestrator started at 2024-10-02 00:25:29\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:25:29 INFO - Number of files is 2, source profile {'max_file_size': 2.112621307373047, 'min_file_size': 1.2146415710449219, 'total_file_size': 3.3272628784179688}\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:25:29 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.941529083997011, 'object_store': 2.470764541067183}\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:25:29 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:25:29 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(RayTransformFileProcessor pid=636524)\u001b[0m 00:25:32 INFO - Initializing models\n", + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 129854.61it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=636524)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:28:23 INFO - Completed processing 2 files in 2.9 min\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:28:23 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=636523)\u001b[0m 00:25:32 INFO - Initializing models\n", + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 37650.84it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=636523)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "00:28:33 INFO - Completed execution in 3.158 min, execution result 0\n" ] }, { @@ -311,8 +262,8 @@ "output_type": "stream", "text": [ "✅ Stage:1 completed successfully\n", - "CPU times: user 4.08 s, sys: 1.07 s, total: 5.15 s\n", - "Wall time: 10min 48s\n" + "CPU times: user 3.85 s, sys: 668 ms, total: 4.52 s\n", + "Wall time: 3min 13s\n" ] } ], @@ -323,6 +274,9 @@ "import os\n", "import sys\n", "\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from data_processing.utils import GB, ParamsUtils\n", + "\n", "from pdf2parquet_transform import (\n", " pdf2parquet_contents_type_cli_param,\n", " pdf2parquet_contents_types,\n", @@ -330,9 +284,6 @@ "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n", "from pdf2parquet_transform_ray import Pdf2ParquetRayTransformConfiguration\n", "\n", - "from data_processing.utils import GB, ParamsUtils\n", - "\n", - "\n", "# create parameters\n", "local_conf = {\n", " \"input_folder\": input_folder,\n", @@ -362,7 +313,6 @@ "sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))\n", "# create launcher\n", "launcher = RayTransformLauncher(Pdf2ParquetRayTransformConfiguration())\n", - "# launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())\n", "# launch\n", "return_code = launcher.launch()\n", "\n", @@ -377,14 +327,14 @@ "id": "5ca790e0", "metadata": {}, "source": [ - "### Inspect Generated output\n", + "### 3.3 - Inspect Generated output\n", "\n", "Here we should see one entry per input file processed" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "fe59563d", "metadata": {}, "outputs": [ @@ -392,7 +342,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Output dimensions (rows x columns)= (3, 
12)\n" + "Output dimensions (rows x columns)= (2, 12)\n" ] }, { @@ -433,86 +383,61 @@ " \n", " \n", " 0\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", + " granite.pdf\n", " {\"_name\":\"\",\"type\":\"pdf-document\",\"description...\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 28\n", + " 17\n", + " 348\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", " \n", " \n", " 1\n", - " Walmart_2024_copy.pdf\n", + " attension.pdf\n", " {\"_name\":\"\",\"type\":\"pdf-document\",\"description...\n", - " 100\n", - " 82\n", - " 1163\n", - " 95cc2911-9a0d-49c3-a259-c74e35fca3ea\n", - " pdf\n", - " 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef...\n", - " 1112050\n", - " 2024-08-30T10:37:40.616022\n", - " 299.935132\n", - " Walmart_2024_copy.pdf\n", - " \n", - " \n", - " 2\n", - " Walmart_2024.pdf\n", - " {\"_name\":\"\",\"type\":\"pdf-document\",\"description...\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 15\n", + " 4\n", + " 193\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " Walmart_2024.pdf\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", " \n", " \n", "\n", "" ], "text/plain": [ - " filename \\\n", - "0 Walmart-10K-Reports-Optimized_2023.pdf \n", - "1 Walmart_2024_copy.pdf \n", - "2 Walmart_2024.pdf \n", - "\n", - " contents num_pages num_tables \\\n", - "0 {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 100 81 \n", - "1 {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 100 82 \n", - "2 {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 100 82 \n", + " filename contents \\\n", + "0 granite.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... \n", + "1 attension.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... \n", "\n", - " num_doc_elements document_id ext \\\n", - "0 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", - "1 1163 95cc2911-9a0d-49c3-a259-c74e35fca3ea pdf \n", - "2 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", + " num_pages num_tables num_doc_elements \\\n", + "0 28 17 348 \n", + "1 15 4 193 \n", "\n", - " hash size \\\n", - "0 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "1 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef... 1112050 \n", - "2 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", + " document_id ext \\\n", + "0 81bc331a-69cf-49bd-84b9-afedcab1344a pdf \n", + "1 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 pdf \n", "\n", - " date_acquired pdf_convert_time \\\n", - "0 2024-08-30T10:32:49.798524 321.107279 \n", - "1 2024-08-30T10:37:40.616022 299.935132 \n", - "2 2024-08-30T10:32:40.640835 312.142404 \n", + " hash size \\\n", + "0 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "1 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 
135814 \n", "\n", - " source_filename \n", - "0 Walmart-10K-Reports-Optimized_2023.pdf \n", - "1 Walmart_2024_copy.pdf \n", - "2 Walmart_2024.pdf " + " date_acquired pdf_convert_time source_filename \n", + "0 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "1 2024-10-02T00:26:29.888597 53.822026 attension.pdf " ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -535,9 +460,7 @@ "id": "72274586", "metadata": {}, "source": [ - "\n", - "\n", - "## Step-3: Doc chunks\n", + "## Step-4: Doc chunks\n", "\n", "Split the documents in chunks, according to their layout segmentation." ] @@ -547,12 +470,12 @@ "id": "96198fa6", "metadata": {}, "source": [ - "### Set Input/output Folder" + "### 4.1 - Set Input/output Folder" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "305f00a3", "metadata": {}, "outputs": [ @@ -565,11 +488,10 @@ } ], "source": [ - "STAGE += 1\n", - "# STAGE = 2 ## DEBUG\n", + "STAGE = 2\n", "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_chunk_out\")\n", + "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_chunk_dir\n", "\n", "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", "\n", @@ -581,12 +503,12 @@ "id": "369f2cd1", "metadata": {}, "source": [ - "### Execute " + "### 4.2 - Execute " ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "5b7b18d5", "metadata": {}, "outputs": [ @@ -594,40 +516,25 @@ "name": "stderr", "output_type": "stream", "text": [ - "[nltk_data] Downloading package punkt_tab to\n", - "[nltk_data] /home/sujee/apps/anaconda3/envs/data-prep-\n", - "[nltk_data] kit-3-py311/lib/python3.11/site-\n", - "[nltk_data] packages/llama_index/core/_static/nltk_cache...\n", - "[nltk_data] Package punkt_tab is already up-to-date!\n", - "10:37:53 INFO - Running locally\n", - "10:37:53 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'output_chunk_column_name': 'contents', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", - "10:37:53 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", - "10:37:53 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:37:53 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:37:53 INFO - pipeline id pipeline_id\n", - "10:37:53 INFO - code location None\n", - "10:37:53 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:37:53 INFO - actor creation delay 0\n", - "10:37:53 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:37:55,040\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m [nltk_data] Downloading package punkt_tab to\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m [nltk_data] /home/sujee/apps/anaconda3/envs/data-prep-\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m [nltk_data] kit-3-py311/lib/python3.11/site-\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m [nltk_data] packages/llama_index/core/_static/nltk_cache...\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m [nltk_data] Package punkt_tab is already up-to-date!\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:57 INFO - orchestrator started at 2024-08-30 10:37:57\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:57 INFO - Number of files is 3, source profile {'max_file_size': 0.3565502166748047, 'min_file_size': 0.35198307037353516, 'total_file_size': 1.060612678527832}\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:57 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.289907838217914, 'object_store': 4.144953917711973}\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:57 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:59 INFO - Completed 1 files in 0.03202696243921916 min\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:59 INFO - Completed 1 files (33.333333333333336%) in 0.032028536001841225 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:59 INFO - Completed processing 3 files in 0.03510438601175944 min\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:59 INFO - done flushing in 0.0009315013885498047 sec\n", - "10:38:09 INFO - Completed execution in 0.26731717586517334 min, execution result 0\n", - "\u001b[36m(RayTransformFileProcessor pid=1089568)\u001b[0m [nltk_data] Downloading package punkt_tab to\u001b[32m [repeated 2x across cluster] (Ray deduplicates logs by default. 
Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", - "\u001b[36m(RayTransformFileProcessor pid=1089568)\u001b[0m [nltk_data] kit-3-py311/lib/python3.11/site-\u001b[32m [repeated 4x across cluster]\u001b[0m\n", - "\u001b[36m(RayTransformFileProcessor pid=1089568)\u001b[0m [nltk_data] packages/llama_index/core/_static/nltk_cache...\u001b[32m [repeated 2x across cluster]\u001b[0m\n", - "\u001b[36m(RayTransformFileProcessor pid=1089567)\u001b[0m [nltk_data] Package punkt_tab is already up-to-date!\u001b[32m [repeated 2x across cluster]\u001b[0m\n" + "00:28:36 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", + "00:28:36 INFO - pipeline id pipeline_id\n", + "00:28:36 INFO - code location None\n", + "00:28:36 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", + "00:28:36 INFO - actor creation delay 0\n", + "00:28:36 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:28:36 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", + "00:28:36 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:28:36 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:28:36 INFO - Running locally\n", + "2024-10-02 00:28:38,768\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:41 INFO - orchestrator started at 2024-10-02 00:28:41\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:41 INFO - Number of files is 2, source profile {'max_file_size': 0.12733078002929688, 'min_file_size': 0.035338401794433594, 'total_file_size': 0.16266918182373047}\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:41 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.939725494943559, 'object_store': 2.4698627470061183}\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:41 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:41 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:43 INFO - Completed processing 2 files in 0.033 min\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:43 INFO - done flushing in 0.001 sec\n", + "00:28:53 INFO - Completed execution in 0.281 min, execution result 0\n" ] }, { @@ -635,8 +542,8 @@ "output_type": "stream", "text": [ "✅ Stage:2 completed successfully\n", - "CPU times: user 1.35 s, sys: 997 ms, total: 2.35 s\n", - "Wall time: 18.5 s\n" + "CPU times: user 992 ms, sys: 321 ms, total: 1.31 s\n", + "Wall time: 19.6 s\n" ] } ], @@ -684,14 +591,14 @@ "id": "213afdf6", "metadata": {}, "source": [ - "### Inspect Generated output\n", + "### 4.3 - Inspect Generated output\n", "\n", "We would see documents are split into many chunks" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "d8138d43", "metadata": {}, "outputs": [ @@ -699,10 +606,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Files processed : 3\n", - "Chunks created : 2,042\n", - "Input data dimensions (rows x columns)= (3, 12)\n", - "Output data dimensions (rows x columns)= (2042, 15)\n" + "Files processed : 2\n", + "Chunks created : 211\n", + "Input data dimensions (rows x columns)= (2, 12)\n", + "Output data dimensions (rows x columns)= (211, 16)\n" ] }, { @@ -730,111 +637,120 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", " source_filename\n", + " source_document_id\n", " contents\n", " doc_jsonpath\n", " page_number\n", " bbox\n", + " document_id\n", " \n", " \n", " \n", " \n", - " 1229\n", - " Walmart_2024_copy.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 95cc2911-9a0d-49c3-a259-c74e35fca3ea\n", + " 185\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef...\n", - " 1112050\n", - " 2024-08-30T10:37:40.616022\n", - " 299.935132\n", - " Walmart_2024_copy.pdf\n", - " #26*1.88*) &62.2,7\\n*F=CF HC H<9 .5L IHG 5B8...\n", - " $.main-text[891]\n", - " 76\n", - " [35.41, 538.52, 546.86, 609.18]\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 6.1 Machine Translation\\nOn the WMT 2014 Engli...\n", + " $.main-text[108]\n", + " 8\n", + " [107.27262115, 260.13467407, 505.24533081, 302...\n", + " d6c1d3686219a176bc5ff0ebf4f5c82a53d95d1502d476...\n", " \n", " \n", - " 1767\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 94\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " Walmart_2024.pdf\n", - " 67:?:E:@? 
2?5 #:>:E2E:@?D @7 ?E6C?2= @?EC@=...\n", - " $.main-text[630]\n", - " 55\n", - " [35.55, 222.69, 525.53, 256.91]\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 6.3 Code Editing and Translation\\nFrom Table 1...\n", + " $.main-text[199]\n", + " 17\n", + " [107.33219147, 356.5696106, 505.74539185, 411....\n", + " 1c841522286ea1348acafd3a4cfbbffd327ca5de53c5f9...\n", " \n", " \n", - " 865\n", - " Walmart_2024_copy.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 95cc2911-9a0d-49c3-a259-c74e35fca3ea\n", + " 175\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef...\n", - " 1112050\n", - " 2024-08-30T10:37:40.616022\n", - " 299.935132\n", - " Walmart_2024_copy.pdf\n", - " .6 C6=J 6IE6?D:G6=J @? :?7@C>2E:@? 2?5 7:?2?4:...\n", - " $.main-text[278]\n", - " 25\n", - " [35.23, 641.07, 547.64, 747.74]\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 5.1 Training Data and Batching\\nWe trained on ...\n", + " $.main-text[91]\n", + " 7\n", + " [107.12083435, 343.05245972, 505.65435791, 418...\n", + " 77de84b7743b8360a371146c12c9795a12984ef82354f4...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " filename num_pages num_tables num_doc_elements \\\n", - "1229 Walmart_2024_copy.pdf 100 82 1163 \n", - "1767 Walmart_2024.pdf 100 82 1163 \n", - "865 Walmart_2024_copy.pdf 100 82 1163 \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "185 attension.pdf 15 4 193 pdf \n", + "94 granite.pdf 28 17 348 pdf \n", + "175 attension.pdf 15 4 193 pdf \n", + "\n", + " hash size \\\n", + "185 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "94 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "175 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", "\n", - " document_id ext \\\n", - "1229 95cc2911-9a0d-49c3-a259-c74e35fca3ea pdf \n", - "1767 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", - "865 95cc2911-9a0d-49c3-a259-c74e35fca3ea pdf \n", + " date_acquired pdf_convert_time source_filename \\\n", + "185 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", + "94 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "175 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", "\n", - " hash size \\\n", - "1229 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef... 1112050 \n", - "1767 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", - "865 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef... 1112050 \n", + " source_document_id \\\n", + "185 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", + "94 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "175 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "1229 2024-08-30T10:37:40.616022 299.935132 Walmart_2024_copy.pdf \n", - "1767 2024-08-30T10:32:40.640835 312.142404 Walmart_2024.pdf \n", - "865 2024-08-30T10:37:40.616022 299.935132 Walmart_2024_copy.pdf \n", + " contents doc_jsonpath \\\n", + "185 6.1 Machine Translation\\nOn the WMT 2014 Engli... $.main-text[108] \n", + "94 6.3 Code Editing and Translation\\nFrom Table 1... $.main-text[199] \n", + "175 5.1 Training Data and Batching\\nWe trained on ... 
$.main-text[91] \n", "\n", - " contents doc_jsonpath \\\n", - "1229 #26*1.88*) &62.2,7\\n*F=CF HC H<9 .5L IHG 5B8... $.main-text[891] \n", - "1767 67:?:E:@? 2?5 #:>:E2E:@?D @7 ?E6C?2= @?EC@=... $.main-text[630] \n", - "865 .6 C6=J 6IE6?D:G6=J @? :?7@C>2E:@? 2?5 7:?2?4:... $.main-text[278] \n", + " page_number bbox \\\n", + "185 8 [107.27262115, 260.13467407, 505.24533081, 302... \n", + "94 17 [107.33219147, 356.5696106, 505.74539185, 411.... \n", + "175 7 [107.12083435, 343.05245972, 505.65435791, 418... \n", "\n", - " page_number bbox \n", - "1229 76 [35.41, 538.52, 546.86, 609.18] \n", - "1767 55 [35.55, 222.69, 525.53, 256.91] \n", - "865 25 [35.23, 641.07, 547.64, 747.74] " + " document_id \n", + "185 d6c1d3686219a176bc5ff0ebf4f5c82a53d95d1502d476... \n", + "94 1c841522286ea1348acafd3a4cfbbffd327ca5de53c5f9... \n", + "175 77de84b7743b8360a371146c12c9795a12984ef82354f4... " ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -850,48 +766,50 @@ "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", "\n", - "output_df.sample(3)" + "output_df.sample(min(3, output_df.shape[0]))" ] }, { "cell_type": "markdown", - "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53", + "id": "b8894d88", "metadata": {}, "source": [ - "## Step-4: Exact Dedup\n", + "## Step-5: DOC ID generation\n", "\n", - "Remove documents having identical code to remove bias in the training data. On the content of each document, a SHA256 hash is computed,\n", - "followed by de-duplication of record having identical hashes." + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set hash_column to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set int_id_column to the name of the column, where you want to store it. **This is a pre-requisite for fuzzy dedup** in the pipeline." 
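    "\n",
    "To make this concrete, below is a minimal hand-written sketch of what these two annotations amount to on a single pandas DataFrame. This is an illustration only, not the DPK transform itself; the helper name `annotate_doc_ids` is made up here, while the column names `contents`, `chunk_hash` and `chunk_id` match the configuration used in the execute cell below.\n",
    "\n",
    "```python\n",
    "import hashlib\n",
    "import pandas as pd\n",
    "\n",
    "def annotate_doc_ids(df: pd.DataFrame, doc_column=\"contents\",\n",
    "                     hash_column=\"chunk_hash\", int_column=\"chunk_id\", start_id=0):\n",
    "    # hash-based id: sha256 over the chunk text, as described above\n",
    "    df[hash_column] = df[doc_column].map(\n",
    "        lambda doc: hashlib.sha256(doc.encode(\"utf-8\")).hexdigest())\n",
    "    # integer id: unique within this table; the real transform keeps the\n",
    "    # counter global so ids stay unique across all tables it processes\n",
    "    df[int_column] = range(start_id, start_id + len(df))\n",
    "    return df\n",
    "```"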
] }, { "cell_type": "markdown", - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", + "id": "46e88f76", "metadata": {}, "source": [ - "### Set Input/output Folder" + "### 5.1 - Set Input/output Folder" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "4c7a1b94", + "execution_count": 10, + "id": "7debd243", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_ededupe_out'\n" + "🏃🏼 STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" ] } ], "source": [ - "STAGE += 1\n", - "# STAGE = 3 ## DEBUG\n", "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_ededupe_out\")\n", + "STAGE = 3\n", + "\n", + "input_folder = output_chunk_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_docid_dir\n", "\n", "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", "\n", @@ -900,42 +818,41 @@ }, { "cell_type": "markdown", - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", + "id": "1cadc2f3", "metadata": {}, "source": [ - "### Execute " + "### 5.2 - Execute " ] }, { "cell_type": "code", - "execution_count": 12, - "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", + "execution_count": 11, + "id": "6b0eade3", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "10:38:10 INFO - Running locally\n", - "10:38:10 INFO - exact dedup params are {'doc_column': 'contents', 'hash_cpu': 0.5, 'num_hashes': 2}\n", - "10:38:10 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_ededupe_out\n", - "10:38:10 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:38:10 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:38:10 INFO - pipeline id pipeline_id\n", - "10:38:10 INFO - code location None\n", - "10:38:10 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:38:10 INFO - actor creation delay 0\n", - "10:38:10 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:38:12,554\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:13 INFO - orchestrator started at 2024-08-30 10:38:13\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:13 INFO - Number of files is 3, source profile {'max_file_size': 0.20615005493164062, 'min_file_size': 0.19641399383544922, 'total_file_size': 0.5990447998046875}\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:13 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.27267990168184, 'object_store': 4.136339950375259}\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:13 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:14 INFO - Completed 1 files in 0.011358428001403808 min\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:14 INFO - Completed 1 files (33.333333333333336%) in 0.011360756556193034 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:14 INFO - Completed processing 3 files in 0.01162503957748413 min\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:14 INFO - done flushing in 0.0009477138519287109 sec\n", - "10:38:24 INFO - Completed execution in 0.2259385307629903 min, execution result 0\n" + "00:28:55 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", + "00:28:55 INFO - pipeline id pipeline_id\n", + "00:28:55 INFO - code location None\n", + "00:28:55 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", + "00:28:55 INFO - actor creation delay 0\n", + "00:28:55 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:28:55 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", + "00:28:55 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:28:55 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:28:55 INFO - Running locally\n", + "2024-10-02 00:28:56,881\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:57 INFO - orchestrator started at 2024-10-02 00:28:57\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:57 INFO - Number of files is 2, source profile {'max_file_size': 0.06398677825927734, 'min_file_size': 0.028062820434570312, 'total_file_size': 0.09204959869384766}\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:57 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.8911590576171875, 'object_store': 2.4455795288085938}\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:57 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:57 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:58 INFO - Completed processing 2 files in 0.013 min\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:58 INFO - done flushing in 0.001 sec\n", + "00:29:08 INFO - Completed execution in 0.228 min, execution result 0\n" ] }, { @@ -943,174 +860,7 @@ "output_type": "stream", "text": [ "✅ Stage:3 completed successfully\n", - "CPU times: user 121 ms, sys: 184 ms, total: 305 ms\n", - "Wall time: 14.8 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "# Import ededup transform configuration\n", - "from ededup_transform_ray import EdedupRayTransformConfiguration\n", - "\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # ededup parameters\n", - " \"ededup_hash_cpu\": 0.5,\n", - " \"ededup_num_hashes\": 2,\n", - " \"ededup_doc_column\": \"contents\",\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(EdedupRayTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"❌ Ray job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "eaf1c3c3", - "metadata": {}, - "source": [ - "### Inspect Generated output" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "d824ebf6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (2042, 15)\n", - "Output data dimensions (rows x columns)= (1324, 15)\n", - "Input chunks before exact dedupe : 2,042\n", - "Output chunks after exact dedupe : 1,324\n", - "Duplicate chunks removed : 718\n" - ] - } - ], - "source": [ - "from utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", - "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", - "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "output_df.sample(3)" - ] - }, - { - "cell_type": "markdown", - "id": "f15f4d00-33bb-4d9a-9f34-4d7f3ee0b7bc", - "metadata": {}, - "source": [ - "## Step-5: DOC ID generation\n", - "\n", - "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", - "\n", - " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set hash_column to the name of the column, where you want to store it.\n", - " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set int_id_column to the name of the column, where you want to store it. **This is a pre-requisite for fuzzy dedup** in the pipeline." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "e6f62394-fbde-495c-bbbb-83161b006bed", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-4: Processing input='output/03_ededupe_out' --> output='output/04_doc_id_out'\n" - ] - } - ], - "source": [ - "\n", - "# Input for this stage is the output of exact dedeup component\n", - "# output of this component makes it possible for fdedup component to run on data.\n", - "\n", - "STAGE += 1\n", - "# STAGE = 4 ## DEBUG\n", - "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_doc_id_out\")\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "a6daf36d-686c-4e0a-aabf-ce55f999bb2d", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "10:38:25 INFO - Running locally\n", - "10:38:25 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column'}\n", - "10:38:25 INFO - data factory data_ is using local data access: input_folder - output/03_ededupe_out output_folder - output/04_doc_id_out\n", - "10:38:25 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:38:25 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:38:25 INFO - pipeline id pipeline_id\n", - "10:38:25 INFO - code location None\n", - "10:38:25 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:38:25 INFO - actor creation delay 0\n", - "10:38:25 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:38:27,443\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:28 INFO - orchestrator started at 2024-08-30 10:38:28\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:28 INFO - Number of files is 3, source profile {'max_file_size': 0.20574665069580078, 'min_file_size': 0.003185272216796875, 'total_file_size': 0.4063444137573242}\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:28 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.265644073486328, 'object_store': 4.132822036743164}\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:28 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:29 INFO - Completed 1 files in 0.012215912342071533 min\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:29 INFO - Completed 1 files (33.333333333333336%) in 0.012217283248901367 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:29 INFO - Completed processing 3 files in 0.012248762448628743 min\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:29 INFO - done flushing in 0.0009109973907470703 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=1092988)\u001b[0m 10:38:29 WARNING - table is empty, skipping processing\n", - "10:38:39 INFO - Completed execution in 0.22525110244750976 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:4 completed successfully\n", - "CPU times: user 136 ms, sys: 159 ms, total: 296 ms\n", + "CPU times: user 123 ms, sys: 167 ms, total: 290 ms\n", "Wall time: 15 s\n" ] } @@ -1118,7 +868,7 @@ "source": [ "%%time \n", "\n", - "from doc_id_transform_ray import DocIDRayTransformConfiguration\n", + "from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration\n", "local_conf = {\n", " \"input_folder\": input_folder,\n", " \"output_folder\": output_folder,\n", @@ -1134,14 +884,14 @@ " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", " # doc id configuration\n", " \"doc_id_doc_column\": \"contents\",\n", - " \"doc_id_hash_column\": \"hash_column\",\n", - " \"doc_id_int_column\": \"int_id_column\",\n", + " \"doc_id_hash_column\": \"chunk_hash\",\n", + " \"doc_id_int_column\": \"chunk_id\",\n", "}\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", "\n", "# launch\n", "\n", - "launcher = RayTransformLauncher(DocIDRayTransformConfiguration())\n", + "launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())\n", "\n", "return_code = launcher.launch()\n", "\n", @@ -1153,24 +903,24 @@ }, { "cell_type": "markdown", - "id": "3d492c2b", + "id": "d5c5c6e4", "metadata": {}, "source": [ - "### Inspect Generated output" + "### 5.3 - Inspect Generated output" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "91ade826", + "execution_count": 12, + "id": "45d941b2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Input data dimensions (rows x columns)= (1324, 15)\n", - "Output data dimensions (rows x columns)= (1324, 17)\n" + "Input data dimensions (rows x columns)= (211, 16)\n", + "Output data dimensions (rows x columns)= (211, 18)\n" ] }, { @@ -1198,129 +948,133 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", " source_filename\n", + " source_document_id\n", " contents\n", " doc_jsonpath\n", " page_number\n", " bbox\n", - " hash_column\n", - " int_id_column\n", + " document_id\n", + " chunk_hash\n", + " chunk_id\n", " \n", " \n", " \n", " \n", - " 860\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 31\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " Walmart_2024.pdf\n", - " #682= +2I )68F=2E@CJ @>A=:2?46 )6AFE2E:@?...\n", - " $.main-text[299]\n", - " 27\n", - " [35.24, 725.11, 503.48, 747.51]\n", - " 97e06840b409f4ca176c2d5b145e8f25c9d3d37c6510ac...\n", - " 187\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 3 Model Architecture\\nremove final 8 layers fr...\n", + " $.main-text[69]\n", + " 6\n", + " [107.45430756, 
456.21582031, 504.50476074, 521...\n", + " 72fbd93a7a834627114fd13cdb1a48c354d6bd991a9eb9...\n", + " 72fbd93a7a834627114fd13cdb1a48c354d6bd991a9eb9...\n", + " 119\n", " \n", " \n", - " 2\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 116\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " A A message from our r CEO\\nFor the fiscal y...\n", - " $.main-text[14]\n", - " 3\n", - " [214.16, 607.24, 390.6, 617.44]\n", - " 2f26fa255117cd004e3fc8e4348d39fd265e570edfdbc7...\n", - " 653\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " Acknowledgments\\nThanks and acknowledgement to...\n", + " $.main-text[249]\n", + " 21\n", + " [107.07092285, 59.12960052, 505.24591064, 160....\n", + " b6d51d1a54147d95051f77bf536ca6ab7360102dd5ac84...\n", + " b6d51d1a54147d95051f77bf536ca6ab7360102dd5ac84...\n", + " 204\n", " \n", " \n", - " 607\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 95\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " ITEM 15. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...\n", - " $.main-text[978]\n", - " 85\n", - " [111.85, 578.01, 263.73, 587.0]\n", - " 43503987ec97f0b553f02f3572b2326006b641745b7c2f...\n", - " 1258\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 6.3 Code Editing and Translation\\nCodeLingua (...\n", + " $.main-text[200]\n", + " 17\n", + " [107.03813934, 207.6650238, 505.74505615, 350....\n", + " c52299a48da2f5517c7ed6b964195a46dd0e339af1d0f3...\n", + " c52299a48da2f5517c7ed6b964195a46dd0e339af1d0f3...\n", + " 183\n", " \n", " \n", "\n", "" ], "text/plain": [ - " filename num_pages num_tables \\\n", - "860 Walmart_2024.pdf 100 82 \n", - "2 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "607 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "\n", - " num_doc_elements document_id ext \\\n", - "860 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", - "2 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", - "607 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "31 granite.pdf 28 17 348 pdf \n", + "116 granite.pdf 28 17 348 pdf \n", + "95 granite.pdf 28 17 348 pdf \n", "\n", - " hash size \\\n", - "860 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", - "2 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "607 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", + " hash size \\\n", + "31 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "116 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "95 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 
655054 \n", "\n", - " date_acquired pdf_convert_time \\\n", - "860 2024-08-30T10:32:40.640835 312.142404 \n", - "2 2024-08-30T10:32:49.798524 321.107279 \n", - "607 2024-08-30T10:32:49.798524 321.107279 \n", + " date_acquired pdf_convert_time source_filename \\\n", + "31 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "116 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "95 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", "\n", - " source_filename \\\n", - "860 Walmart_2024.pdf \n", - "2 Walmart-10K-Reports-Optimized_2023.pdf \n", - "607 Walmart-10K-Reports-Optimized_2023.pdf \n", + " source_document_id \\\n", + "31 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "116 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "95 81bc331a-69cf-49bd-84b9-afedcab1344a \n", "\n", " contents doc_jsonpath \\\n", - "860 #682= +2I )68F=2E@CJ @>A=:2?46 )6AFE2E:@?... $.main-text[299] \n", - "2 A A message from our r CEO\\nFor the fiscal y... $.main-text[14] \n", - "607 ITEM 15. EXHIBITS, FINANCIAL STATEMENT SCHEDUL... $.main-text[978] \n", + "31 3 Model Architecture\\nremove final 8 layers fr... $.main-text[69] \n", + "116 Acknowledgments\\nThanks and acknowledgement to... $.main-text[249] \n", + "95 6.3 Code Editing and Translation\\nCodeLingua (... $.main-text[200] \n", "\n", - " page_number bbox \\\n", - "860 27 [35.24, 725.11, 503.48, 747.51] \n", - "2 3 [214.16, 607.24, 390.6, 617.44] \n", - "607 85 [111.85, 578.01, 263.73, 587.0] \n", + " page_number bbox \\\n", + "31 6 [107.45430756, 456.21582031, 504.50476074, 521... \n", + "116 21 [107.07092285, 59.12960052, 505.24591064, 160.... \n", + "95 17 [107.03813934, 207.6650238, 505.74505615, 350.... \n", "\n", - " hash_column int_id_column \n", - "860 97e06840b409f4ca176c2d5b145e8f25c9d3d37c6510ac... 187 \n", - "2 2f26fa255117cd004e3fc8e4348d39fd265e570edfdbc7... 653 \n", - "607 43503987ec97f0b553f02f3572b2326006b641745b7c2f... 1258 " + " document_id \\\n", + "31 72fbd93a7a834627114fd13cdb1a48c354d6bd991a9eb9... \n", + "116 b6d51d1a54147d95051f77bf536ca6ab7360102dd5ac84... \n", + "95 c52299a48da2f5517c7ed6b964195a46dd0e339af1d0f3... \n", + "\n", + " chunk_hash chunk_id \n", + "31 72fbd93a7a834627114fd13cdb1a48c354d6bd991a9eb9... 119 \n", + "116 b6d51d1a54147d95051f77bf536ca6ab7360102dd5ac84... 204 \n", + "95 c52299a48da2f5517c7ed6b964195a46dd0e339af1d0f3... 183 " ] }, - "execution_count": 16, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1333,179 +1087,137 @@ "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", "\n", - "output_df.sample(3)" + "output_df.sample(min(3, output_df.shape[0]))" ] }, { "cell_type": "markdown", - "id": "85309751-8556-41c6-ac32-84acc941bc8d", + "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53", "metadata": {}, "source": [ - "## Step-6: Fuzzy Dedup\n", + "## Step-6: Exact Dedup\n", "\n", - "Post exact deduplication, fuzzy deduplication is applied with\n", - "the goal of removing code files that may have slight variations and thereby unbiasing\n", - "the data further. Small variations are quite commonly seen in code data in the form\n", - "of variations in the values of variables, addittion of logging statements etc. Find near-\n", - "duplicate." + "Remove documents having identical code to remove bias in the training data. On the content of each document, a SHA256 hash is computed,\n", + "followed by de-duplication of record having identical hashes." 
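    "\n",
    "For intuition, here is a minimal hand-written sketch of exact de-duplication by content hash on a single pandas DataFrame. This is an illustration only, not the DPK `ededup` transform (which runs distributed on Ray, as the execution logs below show); the helper name `exact_dedupe` is made up here, and `contents` matches the `doc_column` configured below.\n",
    "\n",
    "```python\n",
    "import hashlib\n",
    "import pandas as pd\n",
    "\n",
    "def exact_dedupe(df: pd.DataFrame, doc_column=\"contents\"):\n",
    "    # sha256 over the chunk text, as described above\n",
    "    hashes = df[doc_column].map(\n",
    "        lambda doc: hashlib.sha256(doc.encode(\"utf-8\")).hexdigest())\n",
    "    # keep the first occurrence of each hash, drop exact duplicates\n",
    "    return df.loc[~hashes.duplicated(keep=\"first\")].reset_index(drop=True)\n",
    "```"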
] }, { "cell_type": "markdown", - "id": "fcf574a3-b287-419c-9c86-07b828b41ca6", + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", "metadata": {}, "source": [ - "### Set Input/output Folder" + "### 6.1 - Set Input/output Folder" ] }, { "cell_type": "code", - "execution_count": 17, - "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", + "execution_count": 13, + "id": "4c7a1b94", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-5: Processing input='output/04_doc_id_out' --> output='output/05_fdedupe_out'\n" + "🏃🏼 STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" ] } ], "source": [ - "## Input to this component is the output of doc_id generator component. \n", + "STAGE = 4\n", + "\n", + "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_exact_dedupe_dir\n", "\n", - "STAGE += 1\n", - "# STAGE = 5 ## DEBUG\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_fdedupe_out\")\n", "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" ] }, { "cell_type": "markdown", - "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3", + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", "metadata": {}, "source": [ - "### Execute " + "### 6.2 - Execute " ] }, { "cell_type": "code", - "execution_count": 18, - "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", + "execution_count": 14, + "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "10:38:40 INFO - Running locally\n", - "10:38:40 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'int_id_column', 'cluster_column': 'hash_column', 'bucket_cpu': 0.5, 'mhash_cpu': 0.5, 'doc_cpu': 0.5, 'num_doc_actors': 2, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 2, 'num_permutations': 64, 'threshold': 0.8, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 1}}\n", - "10:38:40 INFO - data factory data_ is using local data access: input_folder - output/04_doc_id_out output_folder - output/05_fdedupe_out\n", - "10:38:40 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:38:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:38:40 INFO - pipeline id pipeline_id\n", - "10:38:40 INFO - code location None\n", - "10:38:40 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:38:40 INFO - actor creation delay 0\n", - "10:38:40 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:38:42,441\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - orchestrator started at 2024-08-30 10:38:43\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Number of files is 2, source profile {'max_file_size': 0.25233936309814453, 'min_file_size': 0.2446727752685547, 'total_file_size': 0.4970121383666992}\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.28145294263959, 'object_store': 4.140726470388472}\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - starting run from the beginning\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - continuing from the very beginning\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Fuzzy: num buckets 5, bucket length 11\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - created 1 bucket actors\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - created 1 minhash actors\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Table preprocessing uses 2 readers\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - created 2 table processor actors\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Completed 0 files (0.0%) in 6.504853566487631e-06 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:52 INFO - Completed processing 2 files in 0.15140592257181804 min\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:52 INFO - creating minhash snapshots\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:53 INFO - minhash snapshots created\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:53 INFO - creating bucket snapshots\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:54 INFO - bucket snapshots created\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:54 INFO - created 2 document actors\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:54 INFO - created 2 bucket processor actors\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:54 INFO - created bucket processor invoker\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:54 INFO - added invoker to bucket collectors\n", - "\u001b[36m(BucketsHash pid=1094647)\u001b[0m 10:38:54 INFO - processing buckets 0 long, 6569 short\n", - "\u001b[36m(BucketsHash pid=1094647)\u001b[0m 10:38:54 INFO - Done submitting long buckets\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:55 INFO - Done processing buckets in 0.011683110396067302 min\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:55 INFO - creating document snapshots\n", - "\u001b[36m(BucketsHashProcessorInvoker pid=1095253)\u001b[0m 10:38:55 INFO - Waiting bucket processing completion. Submitted requests 66\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:57 INFO - document snapshots created\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:57 INFO - Completed 0 files (0.0%) in 1.0371208190917969e-05 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:39:06 INFO - Completed processing 2 files in 0.1462758183479309 min\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:39:06 INFO - done flushing in 0.001108407974243164 sec\n", - "10:39:16 INFO - Completed execution in 0.5921090364456176 min, execution result 0\n" + "00:29:10 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", + "00:29:10 INFO - pipeline id pipeline_id\n", + "00:29:10 INFO - code location None\n", + "00:29:10 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", + "00:29:10 INFO - actor creation delay 0\n", + "00:29:10 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:29:10 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", + "00:29:10 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:29:10 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:29:10 INFO - Running locally\n", + "2024-10-02 00:29:11,920\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:12 INFO - orchestrator started at 2024-10-02 00:29:12\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:12 INFO - Number of files is 2, source profile {'max_file_size': 0.0694570541381836, 'min_file_size': 0.03227043151855469, 'total_file_size': 0.10172748565673828}\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:12 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.913980866782367, 'object_store': 2.4569904319941998}\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:12 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:12 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:13 INFO - Completed processing 2 files in 0.013 min\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:13 INFO - done flushing in 0.001 sec\n", + "00:29:23 INFO - Completed execution in 0.227 min, execution result 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "✅ Stage:5 completed successfully\n", - "CPU times: user 208 ms, sys: 195 ms, total: 402 ms\n", - "Wall time: 37 s\n" + "✅ Stage:4 completed successfully\n", + "CPU times: user 120 ms, sys: 172 ms, total: 292 ms\n", + "Wall time: 14.9 s\n" ] } ], "source": [ - "%%time \n", - "\n", - "import os\n", - "import sys\n", + "%%time\n", "\n", - "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", + "# Import ededup transform configuration\n", + "from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration\n", "\n", - "# create parameters\n", "\n", + "# Prepare the commandline params\n", "local_conf = {\n", " \"input_folder\": input_folder,\n", " \"output_folder\": output_folder,\n", "}\n", "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", "params = {\n", " # where to run\n", " \"run_locally\": True,\n", " # Data access. Only required parameters are specified\n", " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # Orchestration parameters\n", + " # orchestrator\n", " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # columns used\n", - " \"fdedup_doc_column\": \"contents\",\n", - " \"fdedup_id_column\": \"int_id_column\",\n", - " \"fdedup_cluster_column\": \"hash_column\",\n", - " # infrastructure\n", - " \"fdedup_bucket_cpu\": 0.5,\n", - " \"fdedup_doc_cpu\": 0.5,\n", - " \"fdedup_mhash_cpu\": 0.5,\n", - " \"fdedup_num_doc_actors\": 2,\n", - " \"fdedup_num_bucket_actors\": 1,\n", - " \"fdedup_num_minhash_actors\": 1,\n", - " \"fdedup_num_preprocessors\": 2,\n", - " # fuzzy parameters\n", - " \"fdedup_num_permutations\": 64,\n", - " \"fdedup_threshold\": 0.8,\n", - " \"fdedup_shingles_size\": 5,\n", - " \"fdedup_delimiters\": \" \"\n", + " # ededup parameters\n", + " \"ededup_hash_cpu\": 0.5,\n", + " \"ededup_num_hashes\": 2,\n", + " \"ededup_doc_column\": \"contents\",\n", + " \"ededup_doc_id_column\": \"chunk_hash\",\n", + " \n", "}\n", "\n", - "# Pass commandline params\n", + "# Pass the commandline params\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", "\n", + "# create launcher\n", + "launcher = RayTransformLauncher(EdedupRayTransformRuntimeConfiguration())\n", "# launch\n", - "\n", - "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n", - "\n", "return_code = launcher.launch()\n", "\n", "if return_code == 0:\n", @@ -1516,25 +1228,27 @@ }, { "cell_type": "markdown", - "id": "a6f8cd11", + "id": "eaf1c3c3", "metadata": {}, "source": [ - "### Inspect Generated output" + "### 6.3 - Inspect Generated output" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "e899ad60", + "execution_count": 15, + "id": "d824ebf6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Input data dimensions (rows x columns)= (1324, 15)\n", - "Output data dimensions (rows x columns)= (1302, 17)\n", - "Duplicate chunks removed by fuzzy-dedupe: 22\n" + "Input data dimensions (rows x columns)= 
(211, 18)\n", + "Output data dimensions (rows x columns)= (211, 19)\n", + "Input chunks before exact dedupe : 211\n", + "Output chunks after exact dedupe : 211\n", + "Duplicate chunks removed : 0\n" ] }, { @@ -1562,124 +1276,137 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", " source_filename\n", + " source_document_id\n", " contents\n", " doc_jsonpath\n", " page_number\n", " bbox\n", - " int_id_column\n", - " hash_column\n", + " document_id\n", + " chunk_hash\n", + " chunk_id\n", + " removed\n", " \n", " \n", " \n", " \n", - " 1102\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 188\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " Walmart_2024.pdf\n", - " & % (' ('-+(%%#'! '- + ,-\\n(CB7CBHFC@...\n", - " $.main-text[734]\n", - " 66\n", - " [35.27, 647.11, 551.24, 729.93]\n", - " 447\n", - " -1\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 6.2 Model Variations\\nTo evaluate the importan...\n", + " $.main-text[112]\n", + " 8\n", + " [107.1419754, 91.9256134, 504.05615234, 113.59...\n", + " 6eb55d1014abb7e7a010fd07b994af17a0cad7ca059f8f...\n", + " 6eb55d1014abb7e7a010fd07b994af17a0cad7ca059f8f...\n", + " 65\n", + " []\n", " \n", " \n", - " 470\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 153\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " Share-Based Compensation\\nFair value of restri...\n", - " $.tables[39]\n", - " 68\n", - " [47.21, 61.57, 540.3, 152.39]\n", - " 1123\n", - " -1\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 3.2.2 Multi-Head Attention\\noutput values. 
The...\n", + " $.main-text[54]\n", + " 5\n", + " [107.36427307, 696.97607422, 503.99719238, 717...\n", + " 07f191b8e14ee3784ecc42c94e4096c97388733f1ea59b...\n", + " 07f191b8e14ee3784ecc42c94e4096c97388733f1ea59b...\n", + " 30\n", + " []\n", " \n", " \n", - " 339\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 68\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " Capital Resources\\nWe believe our cash flows ...\n", - " $.main-text[517]\n", - " 48\n", - " [46.39, 510.11, 539.0, 555.63]\n", - " 992\n", - " -1\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 6.1.5 RepoBench, CrossCodeEval: Repository-Lev...\n", + " $.main-text[154]\n", + " 12\n", + " [107.21151733, 141.59487915, 505.73928833, 218...\n", + " 650d9bcdcb744b665a189a4d02f09a4be39dcde46a0ecd...\n", + " 650d9bcdcb744b665a189a4d02f09a4be39dcde46a0ecd...\n", + " 156\n", + " []\n", " \n", " \n", "\n", "" ], "text/plain": [ - " filename num_pages num_tables \\\n", - "1102 Walmart_2024.pdf 100 82 \n", - "470 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "339 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "188 attension.pdf 15 4 193 pdf \n", + "153 attension.pdf 15 4 193 pdf \n", + "68 granite.pdf 28 17 348 pdf \n", + "\n", + " hash size \\\n", + "188 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "153 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "68 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", "\n", - " num_doc_elements document_id ext \\\n", - "1102 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", - "470 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", - "339 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", + " date_acquired pdf_convert_time source_filename \\\n", + "188 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", + "153 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", + "68 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", "\n", - " hash size \\\n", - "1102 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", - "470 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "339 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", + " source_document_id \\\n", + "188 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", + "153 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", + "68 81bc331a-69cf-49bd-84b9-afedcab1344a \n", "\n", - " date_acquired pdf_convert_time \\\n", - "1102 2024-08-30T10:32:40.640835 312.142404 \n", - "470 2024-08-30T10:32:49.798524 321.107279 \n", - "339 2024-08-30T10:32:49.798524 321.107279 \n", + " contents doc_jsonpath \\\n", + "188 6.2 Model Variations\\nTo evaluate the importan... $.main-text[112] \n", + "153 3.2.2 Multi-Head Attention\\noutput values. The... $.main-text[54] \n", + "68 6.1.5 RepoBench, CrossCodeEval: Repository-Lev... $.main-text[154] \n", "\n", - " source_filename \\\n", - "1102 Walmart_2024.pdf \n", - "470 Walmart-10K-Reports-Optimized_2023.pdf \n", - "339 Walmart-10K-Reports-Optimized_2023.pdf \n", + " page_number bbox \\\n", + "188 8 [107.1419754, 91.9256134, 504.05615234, 113.59... 
\n", + "153 5 [107.36427307, 696.97607422, 503.99719238, 717... \n", + "68 12 [107.21151733, 141.59487915, 505.73928833, 218... \n", "\n", - " contents doc_jsonpath \\\n", - "1102 & % (' ('-+(%%#'! '- + ,-\\n(CB7CBHFC@... $.main-text[734] \n", - "470 Share-Based Compensation\\nFair value of restri... $.tables[39] \n", - "339 Capital Resources\\nWe believe our cash flows ... $.main-text[517] \n", + " document_id \\\n", + "188 6eb55d1014abb7e7a010fd07b994af17a0cad7ca059f8f... \n", + "153 07f191b8e14ee3784ecc42c94e4096c97388733f1ea59b... \n", + "68 650d9bcdcb744b665a189a4d02f09a4be39dcde46a0ecd... \n", "\n", - " page_number bbox int_id_column hash_column \n", - "1102 66 [35.27, 647.11, 551.24, 729.93] 447 -1 \n", - "470 68 [47.21, 61.57, 540.3, 152.39] 1123 -1 \n", - "339 48 [46.39, 510.11, 539.0, 555.63] 992 -1 " + " chunk_hash chunk_id removed \n", + "188 6eb55d1014abb7e7a010fd07b994af17a0cad7ca059f8f... 65 [] \n", + "153 07f191b8e14ee3784ecc42c94e4096c97388733f1ea59b... 30 [] \n", + "68 650d9bcdcb744b665a189a4d02f09a4be39dcde46a0ecd... 156 [] " ] }, - "execution_count": 19, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1691,151 +1418,186 @@ "\n", "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (\"Duplicate chunks removed by fuzzy-dedupe: \", (input_df.shape[0] - output_df.shape[0]))\n", + "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", "\n", - "output_df.sample(3)" + "output_df.sample(min(3, output_df.shape[0]))" ] }, { "cell_type": "markdown", - "id": "0646cbb7-3046-44c0-827d-d102d3ff7cb8", + "id": "85309751-8556-41c6-ac32-84acc941bc8d", "metadata": {}, "source": [ - "## Step-7: Document Quality" + "## Step-7: Fuzzy Dedup\n", + "\n", + "Post exact deduplication, fuzzy deduplication is applied with\n", + "the goal of removing code files that may have slight variations and thereby unbiasing\n", + "the data further. Small variations are quite commonly seen in code data in the form\n", + "of variations in the values of variables, addittion of logging statements etc. Find near-\n", + "duplicate." ] }, { "cell_type": "markdown", - "id": "2e985668-848b-4633-b0d8-9fe70ada0c91", + "id": "fcf574a3-b287-419c-9c86-07b828b41ca6", "metadata": {}, "source": [ - "### Set Input/output Folder" + "### 7.1 - Set Input/output Folder" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "9f080011-c9fe-430e-9ecc-f2220d2c8d18", + "execution_count": 16, + "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-6: Processing input='output/05_fdedupe_out' --> output='output/06_doc_quality_out'\n" + "🏃🏼 STAGE-5: Processing input='output/04_exact_dedupe_out' --> output='output/05_fuzzy_dedupe_out'\n" ] } ], "source": [ - "STAGE += 1\n", - "# STAGE = 6 ## DEBUG\n", + "## Input to this component is the output of doc_id generator component. 
\n", + "\n", + "STAGE = 5\n", + "\n", + "input_folder = output_exact_dedupe_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_fuzzy_dedupe_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_doc_quality_out\")\n", "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" ] }, { "cell_type": "markdown", - "id": "c02982c5-f398-4a1a-a9fe-42d7ae748c7c", + "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3", "metadata": {}, "source": [ - "### Execute " + "### 7.2 - Execute " ] }, { "cell_type": "code", - "execution_count": 21, - "id": "29319fb9-b0d8-4f86-9bc5-b92960ad8ae5", + "execution_count": 17, + "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "10:39:17 INFO - Running locally\n", - "10:39:17 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/my-stuff/projects/ai-alliance/data-prep-kit-sujee/transforms/language/doc_quality/python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "10:39:17 INFO - data factory docq_ is using local configuration without input/output path\n", - "10:39:17 INFO - data factory docq_ max_files -1, n_sample -1\n", - "10:39:17 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:39:17 INFO - data factory data_ is using local data access: input_folder - output/05_fdedupe_out output_folder - output/06_doc_quality_out\n", - "10:39:17 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:39:17 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:39:17 INFO - pipeline id pipeline_id\n", - "10:39:17 INFO - code location None\n", - "10:39:17 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:39:17 INFO - actor creation delay 0\n", - "10:39:17 INFO - job details {'job category': 'preprocessing', 'job name': 'docq', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:39:19,513\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:20 INFO - orchestrator started at 2024-08-30 10:39:20\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:20 INFO - Number of files is 2, source profile {'max_file_size': 0.20880889892578125, 'min_file_size': 0.200042724609375, 'total_file_size': 0.40885162353515625}\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:20 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.259551240131259, 'object_store': 4.129775619134307}\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:20 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:20 INFO - Completed 0 files (0.0%) in 6.075700124104818e-06 min. 
Waiting for completion\n", - "\u001b[36m(RayTransformFileProcessor pid=1097239)\u001b[0m 10:39:20 INFO - Load badwords found locally from /home/sujee/my-stuff/projects/ai-alliance/data-prep-kit-sujee/transforms/language/doc_quality/python/ldnoobw/en\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:21 INFO - Completed processing 2 files in 0.02414883772532145 min\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:21 INFO - done flushing in 0.0010554790496826172 sec\n", - "10:39:31 INFO - Completed execution in 0.23775473435719807 min, execution result 0\n", - "\u001b[36m(RayTransformFileProcessor pid=1097238)\u001b[0m 10:39:20 INFO - Load badwords found locally from /home/sujee/my-stuff/projects/ai-alliance/data-prep-kit-sujee/transforms/language/doc_quality/python/ldnoobw/en\n" + "00:29:25 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'chunk_id', 'cluster_column': 'chunk_hash', 'bucket_cpu': 0.3, 'mhash_cpu': 0.3, 'doc_cpu': 0.3, 'num_doc_actors': 1, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 1, 'num_permutations': 64, 'threshold': 0.7, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 1}}\n", + "00:29:25 INFO - pipeline id pipeline_id\n", + "00:29:25 INFO - code location None\n", + "00:29:25 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", + "00:29:25 INFO - actor creation delay 0\n", + "00:29:25 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:29:25 INFO - data factory data_ is using local data access: input_folder - output/04_exact_dedupe_out output_folder - output/05_fuzzy_dedupe_out\n", + "00:29:25 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:29:25 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:29:25 INFO - Running locally\n", + "2024-10-02 00:29:26,903\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - orchestrator started at 2024-10-02 00:29:28\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - Number of files is 2, source profile {'max_file_size': 0.06981658935546875, 'min_file_size': 0.032629966735839844, 'total_file_size': 0.1024465560913086}\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.94085159432143, 'object_store': 2.470425795763731}\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - starting run from the beginning\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - continuing from the very beginning\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - Fuzzy: num buckets 8, bucket length 8\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - created 1 bucket actors\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - created 1 minhash actors\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - Table preprocessing uses 1 readers\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - created 1 table processor actors\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:34 INFO - Completed 1 files in 0.115 min\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:34 INFO - Completed 1 files (50.0%) in 0.115 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:41 INFO - Completed processing 2 files in 0.217 min\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:41 INFO - creating minhash snapshots\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:42 INFO - minhash snapshots created\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:42 INFO - creating bucket snapshots\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - bucket snapshots created\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - created 1 document actors\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - created 1 bucket processor actors\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - created bucket processor invoker\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - added invoker to bucket collectors\n", + "\u001b[36m(BucketsHash pid=645808)\u001b[0m 00:29:43 INFO - processing buckets 0 long, 1686 short\n", + "\u001b[36m(BucketsHash pid=645808)\u001b[0m 00:29:43 INFO - Done submitting long buckets\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - Done processing buckets in 0.011 min\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - creating document snapshots\n", + "\u001b[36m(BucketsHashProcessorInvoker pid=646353)\u001b[0m 00:29:43 INFO - Waiting bucket processing completion. Submitted requests 17\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:44 INFO - document snapshots created\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:44 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:52 INFO - Completed processing 2 files in 0.131 min\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:52 INFO - done flushing in 0.003 sec\n", + "00:30:02 INFO - Completed execution in 0.627 min, execution result 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "✅ Stage:6 completed successfully\n", - "CPU times: user 139 ms, sys: 177 ms, total: 316 ms\n", - "Wall time: 15.6 s\n" + "✅ Stage:5 completed successfully\n", + "CPU times: user 223 ms, sys: 189 ms, total: 412 ms\n", + "Wall time: 39 s\n" ] } ], "source": [ - "%%time\n", + "%%time \n", "\n", "import os\n", "import sys\n", - "from pathlib import Path\n", "\n", - "from doc_quality_transform import (\n", - " text_lang_cli_param,\n", - " doc_content_column_cli_param,\n", - " bad_word_filepath_cli_param,\n", - ")\n", - "from doc_quality_transform_ray import DocQualityRayTransformConfiguration\n", "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", + "\n", + "# create parameters\n", "\n", "local_conf = {\n", " \"input_folder\": input_folder,\n", " \"output_folder\": output_folder,\n", "}\n", - "\n", - "doc_quality_basedir = os.path.join(rootdir, \"transforms\", \"language\", \"doc_quality\", \"python\")\n", "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", "params = {\n", " # where to run\n", " \"run_locally\": True,\n", " # Data access. Only required parameters are specified\n", " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", + " # Orchestration parameters\n", " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " \"runtime_pipeline_id\": \"pipeline_id\",\n", - " \"runtime_job_id\": \"job_id\",\n", - " \"runtime_creation_delay\": 0,\n", - " # doc quality configuration\n", - " text_lang_cli_param: \"en\",\n", - " doc_content_column_cli_param: \"contents\",\n", - " bad_word_filepath_cli_param: os.path.join(doc_quality_basedir, \"ldnoobw\", \"en\"),\n", + " # columns used\n", + " \"fdedup_doc_column\": \"contents\",\n", + " \"fdedup_id_column\": \"chunk_id\",\n", + " \"fdedup_cluster_column\": \"chunk_hash\",\n", + " # infrastructure\n", + " \"fdedup_bucket_cpu\": 0.3,\n", + " \"fdedup_doc_cpu\": 0.3,\n", + " \"fdedup_mhash_cpu\": 0.3,\n", + " \"fdedup_num_doc_actors\": 1,\n", + " \"fdedup_num_bucket_actors\": 1,\n", + " \"fdedup_num_minhash_actors\": 1,\n", + " \"fdedup_num_preprocessors\": 1,\n", + " # fuzzy parameters\n", + " \"fdedup_num_permutations\": 64,\n", + " \"fdedup_threshold\": 0.7, # between 0.0 to 1.0 ; smaller values tend to be more lenient in finding near dupes; close to 1.0 is more strict\n", + " \"fdedup_shingles_size\": 5,\n", + " \"fdedup_delimiters\": \" \"\n", "}\n", "\n", - "\n", - "Path(output_folder).mkdir(parents=True, exist_ok=True)\n", - "\n", + "# Pass commandline params\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(DocQualityRayTransformConfiguration())\n", "# launch\n", + "\n", + "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n", + "\n", "return_code = launcher.launch()\n", "\n", "if return_code == 0:\n", @@ -1846,24 +1608,25 @@ }, { "cell_type": "markdown", - "id": "43b7d855", + "id": "a6f8cd11", 
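For intuition about what the fuzzy-dedupe parameters above mean, here is a toy, self-contained sketch of the near-duplicate test on two hypothetical chunks. It computes exact Jaccard similarity over 5-word shingles (mirroring `fdedup_shingles_size` and the space delimiter), whereas the actual transform approximates this at scale with MinHash signatures (64 permutations) and Ray actors. The example strings and helper names are illustrative only.

```python
def shingles(text: str, size: int = 5) -> set:
    """Set of `size`-word shingles, split on spaces like fdedup_delimiters."""
    words = text.split(" ")
    return {" ".join(words[i:i + size]) for i in range(max(1, len(words) - size + 1))}

def jaccard(a: set, b: set) -> float:
    """Jaccard similarity between two shingle sets."""
    return len(a & b) / len(a | b) if (a | b) else 0.0

chunk_1 = "the model is trained on permissively licensed code data from github"
chunk_2 = "the model is trained on permissively licensed code data from gitlab"

similarity = jaccard(shingles(chunk_1), shingles(chunk_2))
# With fdedup_threshold = 0.7, a pair scoring at or above 0.7 is treated as near-duplicate
print(f"Jaccard similarity: {similarity:.2f} -> near-duplicate: {similarity >= 0.7}")
```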
"metadata": {}, "source": [ - "### Inspect Generated output" + "### 7.3 - Inspect Generated output" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "f631d5c1", + "execution_count": 18, + "id": "e899ad60", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Input data dimensions (rows x columns)= (1324, 15)\n", - "Output data dimensions (rows x columns)= (1302, 28)\n" + "Input data dimensions (rows x columns)= (211, 19)\n", + "Output data dimensions (rows x columns)= (211, 19)\n", + "Duplicate chunks removed by fuzzy-dedupe: 0\n" ] }, { @@ -1891,148 +1654,137 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", - " ...\n", - " docq_mean_word_len\n", - " docq_symbol_to_word_ratio\n", - " docq_sentence_count\n", - " docq_lorem_ipsum_ratio\n", - " docq_curly_bracket_ratio\n", - " docq_contain_bad_word\n", - " docq_bullet_point_ratio\n", - " docq_ellipsis_line_ratio\n", - " docq_alphabet_word_ratio\n", - " docq_contain_common_en_words\n", + " source_filename\n", + " source_document_id\n", + " contents\n", + " doc_jsonpath\n", + " page_number\n", + " bbox\n", + " document_id\n", + " chunk_id\n", + " removed\n", + " chunk_hash\n", " \n", " \n", " \n", " \n", - " 354\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 47\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " ...\n", - " 6.272727\n", - " 0.0\n", - " 2\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 1.000000\n", - " False\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 6.1.1 HumanEvalSynthesize: Multilingual Code G...\n", + " $.main-text[118]\n", + " 9\n", + " [107.09940338, 505.84005737, 505.70474243, 604...\n", + " 22dd65548755f19ec6ccd89020fd1fbc88e339fafbd881...\n", + " 135\n", + " []\n", + " -1\n", " \n", " \n", - " 1125\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 134\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " ...\n", - " 5.121622\n", - " 0.0\n", - " 31\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.648649\n", - " False\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 1 Introduction\\nAttention mechanisms have beco...\n", + " $.main-text[20]\n", + " 2\n", + " [107.17721558, 497.6980896, 505.65536499, 540....\n", + " 362722af4a10ed54ca21fd329149c01397a621e15f8306...\n", + " 11\n", + " []\n", + " -1\n", " \n", " \n", - " 204\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 93\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " ...\n", - " 5.880000\n", - " 
0.0\n", - " 1\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 1.000000\n", - " True\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 6.3 Code Editing and Translation\\nTarget Langu...\n", + " $.tables[13]\n", + " 17\n", + " [161.45388794, 433.6942749, 450.61630249, 552....\n", + " f665c10385f0eb31b2b94e5e61c934651f5789f5ab528c...\n", + " 181\n", + " []\n", + " -1\n", " \n", " \n", "\n", - "

3 rows × 28 columns

\n", "" ], "text/plain": [ - " filename num_pages num_tables \\\n", - "354 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "1125 Walmart_2024.pdf 100 82 \n", - "204 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "47 granite.pdf 28 17 348 pdf \n", + "134 attension.pdf 15 4 193 pdf \n", + "93 granite.pdf 28 17 348 pdf \n", "\n", - " num_doc_elements document_id ext \\\n", - "354 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", - "1125 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", - "204 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", + " hash size \\\n", + "47 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "134 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "93 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", "\n", - " hash size \\\n", - "354 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "1125 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", - "204 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", + " date_acquired pdf_convert_time source_filename \\\n", + "47 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "134 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", + "93 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", "\n", - " date_acquired pdf_convert_time ... docq_mean_word_len \\\n", - "354 2024-08-30T10:32:49.798524 321.107279 ... 6.272727 \n", - "1125 2024-08-30T10:32:40.640835 312.142404 ... 5.121622 \n", - "204 2024-08-30T10:32:49.798524 321.107279 ... 5.880000 \n", + " source_document_id \\\n", + "47 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "134 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", + "93 81bc331a-69cf-49bd-84b9-afedcab1344a \n", "\n", - " docq_symbol_to_word_ratio docq_sentence_count docq_lorem_ipsum_ratio \\\n", - "354 0.0 2 0.0 \n", - "1125 0.0 31 0.0 \n", - "204 0.0 1 0.0 \n", - "\n", - " docq_curly_bracket_ratio docq_contain_bad_word docq_bullet_point_ratio \\\n", - "354 0.0 False 0.0 \n", - "1125 0.0 False 0.0 \n", - "204 0.0 False 0.0 \n", + " contents doc_jsonpath \\\n", + "47 6.1.1 HumanEvalSynthesize: Multilingual Code G... $.main-text[118] \n", + "134 1 Introduction\\nAttention mechanisms have beco... $.main-text[20] \n", + "93 6.3 Code Editing and Translation\\nTarget Langu... $.tables[13] \n", "\n", - " docq_ellipsis_line_ratio docq_alphabet_word_ratio \\\n", - "354 0.0 1.000000 \n", - "1125 0.0 0.648649 \n", - "204 0.0 1.000000 \n", + " page_number bbox \\\n", + "47 9 [107.09940338, 505.84005737, 505.70474243, 604... \n", + "134 2 [107.17721558, 497.6980896, 505.65536499, 540.... \n", + "93 17 [161.45388794, 433.6942749, 450.61630249, 552.... \n", "\n", - " docq_contain_common_en_words \n", - "354 False \n", - "1125 False \n", - "204 True \n", + " document_id chunk_id removed \\\n", + "47 22dd65548755f19ec6ccd89020fd1fbc88e339fafbd881... 135 [] \n", + "134 362722af4a10ed54ca21fd329149c01397a621e15f8306... 11 [] \n", + "93 f665c10385f0eb31b2b94e5e61c934651f5789f5ab528c... 
181 [] \n", "\n", - "[3 rows x 28 columns]" + " chunk_hash \n", + "47 -1 \n", + "134 -1 \n", + "93 -1 " ] }, - "execution_count": 22, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -2044,8 +1796,9 @@ "\n", "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (\"Duplicate chunks removed by fuzzy-dedupe: \", (input_df.shape[0] - output_df.shape[0]))\n", "\n", - "output_df.sample(3)" + "output_df.sample(min(3, output_df.shape[0]))" ] }, { @@ -2058,9 +1811,17 @@ "Encode text for the vector storage." ] }, + { + "cell_type": "markdown", + "id": "8fbbeaff", + "metadata": {}, + "source": [ + "### 8.1 - Set Input/output Folder" + ] + }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "id": "20a153fa-fd56-401e-86be-4f7617affcc8", "metadata": {}, "outputs": [ @@ -2068,25 +1829,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-7: Processing input='output/06_doc_quality_out' --> output='output/07_encoder_out'\n" + "🏃🏼 STAGE-6: Processing input='output/05_fuzzy_dedupe_out' --> output='output/06_embeddings_out'\n" ] } ], "source": [ - "STAGE += 1\n", - "# STAGE = 7 ## DEBUG\n", + "STAGE = 6\n", "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_encoder_out\")\n", + "input_folder = output_fuzzy_dedupe_dir\n", + "output_folder = output_embeddings_dir\n", "\n", "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", "\n", "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" ] }, + { + "cell_type": "markdown", + "id": "1e6a88f8", + "metadata": {}, + "source": [ + "### 8.2 - Execute" + ] + }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 20, "id": "228df6b2-bc62-494b-9697-03ece98d7853", "metadata": {}, "outputs": [ @@ -2094,38 +1862,34 @@ "name": "stderr", "output_type": "stream", "text": [ - "10:39:33 INFO - Running locally\n", - "10:39:33 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", - "10:39:33 INFO - data factory data_ is using local data access: input_folder - output/06_doc_quality_out output_folder - output/07_encoder_out\n", - "10:39:33 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:39:33 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:39:33 INFO - pipeline id pipeline_id\n", - "10:39:33 INFO - code location None\n", - "10:39:33 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:39:33 INFO - actor creation delay 0\n", - "10:39:33 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:39:35,588\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:39:38 INFO - orchestrator started at 2024-08-30 10:39:38\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:39:38 INFO - Number of files is 2, source profile {'max_file_size': 0.2231884002685547, 'min_file_size': 0.2173166275024414, 'total_file_size': 0.4405050277709961}\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:39:38 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.224630738608539, 'object_store': 4.112315367907286}\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:39:38 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:39:38 INFO - Completed 0 files (0.0%) in 6.500879923502604e-06 min. Waiting for completion\n", - "\u001b[36m(RayTransformFileProcessor pid=1098990)\u001b[0m /home/sujee/apps/anaconda3/envs/data-prep-kit-3-py311/lib/python3.11/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", - "\u001b[36m(RayTransformFileProcessor pid=1098990)\u001b[0m warnings.warn(\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:40:26 INFO - Completed processing 2 files in 0.7918713609377543 min\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:40:26 INFO - done flushing in 0.0010461807250976562 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=1098989)\u001b[0m /home/sujee/apps/anaconda3/envs/data-prep-kit-3-py311/lib/python3.11/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", - "\u001b[36m(RayTransformFileProcessor pid=1098989)\u001b[0m warnings.warn(\n", - "10:40:36 INFO - Completed execution in 1.0400522033373514 min, execution result 0\n" + "00:30:04 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", + "00:30:04 INFO - pipeline id pipeline_id\n", + "00:30:04 INFO - code location None\n", + "00:30:04 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", + "00:30:04 INFO - actor creation delay 0\n", + "00:30:04 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:30:04 INFO - data factory data_ is using local data access: input_folder - output/05_fuzzy_dedupe_out output_folder - output/06_embeddings_out\n", + "00:30:04 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:30:04 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:30:04 INFO - Running locally\n", + "2024-10-02 00:30:06,760\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:10 INFO - orchestrator started at 2024-10-02 00:30:10\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:10 INFO - Number of files is 2, source profile {'max_file_size': 0.06542396545410156, 'min_file_size': 0.029404640197753906, 'total_file_size': 0.09482860565185547}\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:10 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.923227692954242, 'object_store': 2.4616138450801373}\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:10 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:10 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:21 INFO - Completed processing 2 files in 0.188 min\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:21 INFO - done flushing in 0.001 sec\n", + "00:30:31 INFO - Completed execution in 0.449 min, execution result 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "✅ Stage:7 completed successfully\n", - "CPU times: user 510 ms, sys: 285 ms, total: 795 ms\n", - "Wall time: 1min 4s\n" + "✅ Stage:6 completed successfully\n", + "CPU times: user 638 ms, sys: 269 ms, total: 907 ms\n", + "Wall time: 29 s\n" ] } ], @@ -2169,12 +1933,12 @@ "id": "b734852c", "metadata": {}, "source": [ - "### Inspect Generated output" + "### 8.3 - Inspect Generated output" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 21, "id": "7b1c1d09", "metadata": {}, "outputs": [ @@ -2182,8 +1946,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Input data dimensions (rows x columns)= (1302, 28)\n", - "Output data dimensions (rows x columns)= (1302, 29)\n" + "Input data dimensions (rows x columns)= (211, 19)\n", + "Output data dimensions (rows x columns)= (211, 20)\n" ] }, { @@ -2211,153 +1975,141 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", - " ...\n", - " docq_symbol_to_word_ratio\n", - " docq_sentence_count\n", - " docq_lorem_ipsum_ratio\n", - " docq_curly_bracket_ratio\n", - " docq_contain_bad_word\n", - " docq_bullet_point_ratio\n", - " docq_ellipsis_line_ratio\n", - " docq_alphabet_word_ratio\n", - " docq_contain_common_en_words\n", + " source_filename\n", + " source_document_id\n", + " contents\n", + " doc_jsonpath\n", + " page_number\n", + " bbox\n", + " document_id\n", + " chunk_id\n", + " removed\n", + " chunk_hash\n", " embeddings\n", " \n", " \n", " \n", " \n", - " 916\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 171\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " ...\n", - " 0.00000\n", - " 3\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.978022\n", - " False\n", - " [-0.048175987, 0.0011802563, -0.046808466, -0....\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 4 Why Self-Attention\\nlength n is smaller than...\n", + " $.main-text[85]\n", + " 7\n", + " 
[107.26034546, 652.83349609, 504.29177856, 717...\n", + " 6f8efa86e0a4f77b0d72d4a3141e5e0611b2921a392b99...\n", + " 48\n", + " []\n", + " -1\n", + " [0.018015103, -0.038851, 0.0016827772, -0.0493...\n", " \n", " \n", - " 286\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 25\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " ...\n", - " 0.00000\n", - " 29\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.919414\n", - " True\n", - " [0.0038028236, -0.13894859, 0.015160485, -0.00...\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 3 Model Architecture\\nBatch size, 3B = 2048. B...\n", + " $.tables[0]\n", + " 5\n", + " [138.25450134, 299.99499512, 471.55078125, 432...\n", + " b8f3a83c697e885ad31913c716644399a4772691e39d0b...\n", + " 113\n", + " []\n", + " -1\n", + " [0.003977602, -0.06122852, -0.089708336, -0.00...\n", " \n", " \n", - " 852\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", - " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " ...\n", - " 0.01087\n", + " 137\n", + " attension.pdf\n", + " 15\n", " 4\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.978261\n", - " False\n", - " [-0.033763092, 0.031698707, -0.04227217, 0.008...\n", + " 193\n", + " pdf\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 2 Background\\nSelf-attention, sometimes called...\n", + " $.main-text[24]\n", + " 2\n", + " [107.29702759, 256.18237305, 505.24960327, 298...\n", + " 9c2abd2ec38b67c74873e0cd670d27b702711d05930f26...\n", + " 14\n", + " []\n", + " -1\n", + " [0.03394238, -0.0117239505, -0.03349689, -0.02...\n", " \n", " \n", "\n", - "

3 rows × 29 columns

\n", "" ], "text/plain": [ - " filename num_pages num_tables \\\n", - "916 Walmart_2024.pdf 100 82 \n", - "286 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "852 Walmart_2024.pdf 100 82 \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "171 attension.pdf 15 4 193 pdf \n", + "25 granite.pdf 28 17 348 pdf \n", + "137 attension.pdf 15 4 193 pdf \n", "\n", - " num_doc_elements document_id ext \\\n", - "916 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", - "286 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", - "852 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", + " hash size \\\n", + "171 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "25 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "137 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", "\n", - " hash size \\\n", - "916 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", - "286 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "852 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", + " date_acquired pdf_convert_time source_filename \\\n", + "171 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", + "25 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "137 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", "\n", - " date_acquired pdf_convert_time ... \\\n", - "916 2024-08-30T10:32:40.640835 312.142404 ... \n", - "286 2024-08-30T10:32:49.798524 321.107279 ... \n", - "852 2024-08-30T10:32:40.640835 312.142404 ... \n", + " source_document_id \\\n", + "171 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", + "25 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "137 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", "\n", - " docq_symbol_to_word_ratio docq_sentence_count docq_lorem_ipsum_ratio \\\n", - "916 0.00000 3 0.0 \n", - "286 0.00000 29 0.0 \n", - "852 0.01087 4 0.0 \n", + " contents doc_jsonpath \\\n", + "171 4 Why Self-Attention\\nlength n is smaller than... $.main-text[85] \n", + "25 3 Model Architecture\\nBatch size, 3B = 2048. B... $.tables[0] \n", + "137 2 Background\\nSelf-attention, sometimes called... $.main-text[24] \n", "\n", - " docq_curly_bracket_ratio docq_contain_bad_word docq_bullet_point_ratio \\\n", - "916 0.0 False 0.0 \n", - "286 0.0 False 0.0 \n", - "852 0.0 False 0.0 \n", + " page_number bbox \\\n", + "171 7 [107.26034546, 652.83349609, 504.29177856, 717... \n", + "25 5 [138.25450134, 299.99499512, 471.55078125, 432... \n", + "137 2 [107.29702759, 256.18237305, 505.24960327, 298... \n", "\n", - " docq_ellipsis_line_ratio docq_alphabet_word_ratio \\\n", - "916 0.0 0.978022 \n", - "286 0.0 0.919414 \n", - "852 0.0 0.978261 \n", + " document_id chunk_id removed \\\n", + "171 6f8efa86e0a4f77b0d72d4a3141e5e0611b2921a392b99... 48 [] \n", + "25 b8f3a83c697e885ad31913c716644399a4772691e39d0b... 113 [] \n", + "137 9c2abd2ec38b67c74873e0cd670d27b702711d05930f26... 14 [] \n", "\n", - " docq_contain_common_en_words \\\n", - "916 False \n", - "286 True \n", - "852 False \n", - "\n", - " embeddings \n", - "916 [-0.048175987, 0.0011802563, -0.046808466, -0.... \n", - "286 [0.0038028236, -0.13894859, 0.015160485, -0.00... \n", - "852 [-0.033763092, 0.031698707, -0.04227217, 0.008... \n", - "\n", - "[3 rows x 29 columns]" + " chunk_hash embeddings \n", + "171 -1 [0.018015103, -0.038851, 0.0016827772, -0.0493... \n", + "25 -1 [0.003977602, -0.06122852, -0.089708336, -0.00... \n", + "137 -1 [0.03394238, -0.0117239505, -0.03349689, -0.02... 
" ] }, - "execution_count": 25, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2370,7 +2122,7 @@ "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", "\n", - "output_df.sample(3)" + "output_df.sample(min(3, output_df.shape[0]))" ] }, { @@ -2383,7 +2135,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 22, "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", "metadata": {}, "outputs": [ @@ -2391,7 +2143,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "✅ Copied output from 'output/07_encoder_out' --> 'output/output_final'\n" + "✅ Copied output from 'output/06_embeddings_out' --> 'output/output_final'\n" ] } ], diff --git a/examples/notebooks/rag/rag_1B_load_data_into_milvus.ipynb b/examples/notebooks/rag/rag_1B_load_data_into_milvus.ipynb index 1c25e06ac..e481cf9ee 100644 --- a/examples/notebooks/rag/rag_1B_load_data_into_milvus.ipynb +++ b/examples/notebooks/rag/rag_1B_load_data_into_milvus.ipynb @@ -17,7 +17,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Configuration" + "## Step-1: Configuration" ] }, { @@ -26,22 +26,14 @@ "metadata": {}, "outputs": [], "source": [ - "class MyConfig:\n", - " pass\n", - "MY_CONFIG = MyConfig()\n", - "\n", - "MY_CONFIG.PROCESSED_DATA_DIR = 'output/output_final'\n", - "\n", - "MY_CONFIG.DB_URI = './rag_1_dpk.db' # For embedded instance\n", - "#MY_CONFIG.DB_URI = 'http://localhost:19530' # For Docker instance\n", - "MY_CONFIG.COLLECTION_NAME = 'dpk_walmart_docs'" + "from my_config import MY_CONFIG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Step-: Load Parquet Data\n", + "## Step-2: Load Parquet Data\n", "\n", "Load all `.parquet` files in the given dir" ] @@ -58,10 +50,10 @@ "Loading data from : output/output_final\n", "Number of parquet files to read : 2\n", "\n", - "Read file: 'output/output_final/Walmart-10K-Reports-Optimized_2023.parquet'. number of rows = 666\n", - "Read file: 'output/output_final/Walmart_2024.parquet'. number of rows = 636\n", + "Read file: 'output/output_final/granite.parquet'. number of rows = 123\n", + "Read file: 'output/output_final/attension.parquet'. 
number of rows = 88\n", "\n", - "Total number of rows = 1302\n" + "Total number of rows = 211\n" ] } ], @@ -69,10 +61,10 @@ "import pandas as pd\n", "import glob\n", "\n", - "print ('Loading data from : ', MY_CONFIG.PROCESSED_DATA_DIR)\n", + "print ('Loading data from : ', MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", "\n", "# Get a list of all Parquet files in the directory\n", - "parquet_files = glob.glob(f'{MY_CONFIG.PROCESSED_DATA_DIR}/*.parquet')\n", + "parquet_files = glob.glob(f'{MY_CONFIG.OUTPUT_FOLDER_FINAL}/*.parquet')\n", "print (\"Number of parquet files to read : \", len(parquet_files))\n", "print ()\n", "\n", @@ -102,41 +94,32 @@ "text": [ "embedding length: 384\n", "\n", - "RangeIndex: 1302 entries, 0 to 1301\n", - "Data columns (total 29 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 filename 1302 non-null object \n", - " 1 num_pages 1302 non-null int64 \n", - " 2 num_tables 1302 non-null int64 \n", - " 3 num_doc_elements 1302 non-null int64 \n", - " 4 document_id 1302 non-null object \n", - " 5 ext 1302 non-null object \n", - " 6 hash 1302 non-null object \n", - " 7 size 1302 non-null int64 \n", - " 8 date_acquired 1302 non-null object \n", - " 9 pdf_convert_time 1302 non-null float64\n", - " 10 source_filename 1302 non-null object \n", - " 11 text 1302 non-null object \n", - " 12 doc_jsonpath 1302 non-null object \n", - " 13 page_number 1302 non-null int64 \n", - " 14 bbox 1302 non-null object \n", - " 15 int_id_column 1302 non-null int64 \n", - " 16 hash_column 1302 non-null int64 \n", - " 17 docq_total_words 1302 non-null int64 \n", - " 18 docq_mean_word_len 1302 non-null float64\n", - " 19 docq_symbol_to_word_ratio 1302 non-null float64\n", - " 20 docq_sentence_count 1302 non-null int64 \n", - " 21 docq_lorem_ipsum_ratio 1302 non-null float64\n", - " 22 docq_curly_bracket_ratio 1302 non-null float64\n", - " 23 docq_contain_bad_word 1302 non-null bool \n", - " 24 docq_bullet_point_ratio 1302 non-null float64\n", - " 25 docq_ellipsis_line_ratio 1302 non-null float64\n", - " 26 docq_alphabet_word_ratio 1302 non-null float64\n", - " 27 docq_contain_common_en_words 1302 non-null bool \n", - " 28 vector 1302 non-null object \n", - "dtypes: bool(2), float64(8), int64(9), object(10)\n", - "memory usage: 277.3+ KB\n", + "RangeIndex: 211 entries, 0 to 210\n", + "Data columns (total 20 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 filename 211 non-null object \n", + " 1 num_pages 211 non-null int64 \n", + " 2 num_tables 211 non-null int64 \n", + " 3 num_doc_elements 211 non-null int64 \n", + " 4 ext 211 non-null object \n", + " 5 hash 211 non-null object \n", + " 6 size 211 non-null int64 \n", + " 7 date_acquired 211 non-null object \n", + " 8 pdf_convert_time 211 non-null float64\n", + " 9 source_filename 211 non-null object \n", + " 10 source_document_id 211 non-null object \n", + " 11 text 211 non-null object \n", + " 12 doc_jsonpath 211 non-null object \n", + " 13 page_number 211 non-null int64 \n", + " 14 bbox 211 non-null object \n", + " 15 document_id 211 non-null object \n", + " 16 chunk_id 211 non-null int64 \n", + " 17 removed 211 non-null object \n", + " 18 chunk_hash 211 non-null int64 \n", + " 19 vector 211 non-null object \n", + "dtypes: float64(1), int64(7), object(12)\n", + "memory usage: 33.1+ KB\n", "None\n" ] }, @@ -165,150 +148,138 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " 
pdf_convert_time\n", - " ...\n", - " docq_symbol_to_word_ratio\n", - " docq_sentence_count\n", - " docq_lorem_ipsum_ratio\n", - " docq_curly_bracket_ratio\n", - " docq_contain_bad_word\n", - " docq_bullet_point_ratio\n", - " docq_ellipsis_line_ratio\n", - " docq_alphabet_word_ratio\n", - " docq_contain_common_en_words\n", + " source_filename\n", + " source_document_id\n", + " text\n", + " doc_jsonpath\n", + " page_number\n", + " bbox\n", + " document_id\n", + " chunk_id\n", + " removed\n", + " chunk_hash\n", " vector\n", " \n", " \n", " \n", " \n", " 0\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " d626ab9b-0f53-446c-b55d-150fbbd93066\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-29T00:43:21.059856\n", - " 332.679391\n", - " ...\n", - " 0.0\n", - " 3\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 1.000000\n", - " True\n", - " [-0.006206639, 0.010256912, 0.023658218, -0.02...\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " Granite Code Models: A Family of Open Foundati...\n", + " $.main-text[3]\n", + " 1\n", + " [142.70646667, 672.96929932, 468.58251953, 711...\n", + " b773445f7cf4cc9a5bf6ec296c74504f93c9c179028ac6...\n", + " 88\n", + " []\n", + " -1\n", + " [-0.015789315, -0.07841933, -0.032271657, 0.00...\n", " \n", " \n", " 1\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " d626ab9b-0f53-446c-b55d-150fbbd93066\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-29T00:43:21.059856\n", - " 332.679391\n", - " ...\n", - " 0.0\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " Granite Code Models: A Family of Open Foundati...\n", + " $.main-text[4]\n", " 1\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.909091\n", - " True\n", - " [-0.0497427, 0.046492133, -0.02381167, 0.02798...\n", + " [107.61845398, 535.62896729, 503.99923706, 647...\n", + " 7353bcc8d99c279335eaf120c793ca6a08f9a4fddcbb5b...\n", + " 89\n", + " []\n", + " -1\n", + " [-0.059480786, -0.056680508, -0.042864937, -0....\n", " \n", " \n", " 2\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " d626ab9b-0f53-446c-b55d-150fbbd93066\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-29T00:43:21.059856\n", - " 332.679391\n", - " ...\n", - " 0.0\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " Granite Code Models: A Family of Open Foundati...\n", + " $.main-text[5]\n", " 1\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.875000\n", - " False\n", - " [-0.03265641, -0.040947884, 0.017305722, 0.022...\n", + " [220.87228394, 484.46414185, 390.87872314, 529...\n", + " 389267895ca214924a0a071df8379c2b15fcf374f232a6...\n", + " 90\n", + " []\n", + " -1\n", + " [-0.07557265, -0.07152908, -0.048923455, 
-0.04...\n", " \n", " \n", "\n", - "

3 rows × 29 columns

\n", "" ], "text/plain": [ - " filename num_pages num_tables \\\n", - "0 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "1 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "2 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "\n", - " num_doc_elements document_id ext \\\n", - "0 1163 d626ab9b-0f53-446c-b55d-150fbbd93066 pdf \n", - "1 1163 d626ab9b-0f53-446c-b55d-150fbbd93066 pdf \n", - "2 1163 d626ab9b-0f53-446c-b55d-150fbbd93066 pdf \n", - "\n", - " hash size \\\n", - "0 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "1 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "2 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 granite.pdf 28 17 348 pdf \n", + "1 granite.pdf 28 17 348 pdf \n", + "2 granite.pdf 28 17 348 pdf \n", "\n", - " date_acquired pdf_convert_time ... \\\n", - "0 2024-08-29T00:43:21.059856 332.679391 ... \n", - "1 2024-08-29T00:43:21.059856 332.679391 ... \n", - "2 2024-08-29T00:43:21.059856 332.679391 ... \n", + " hash size \\\n", + "0 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "1 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "2 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", "\n", - " docq_symbol_to_word_ratio docq_sentence_count docq_lorem_ipsum_ratio \\\n", - "0 0.0 3 0.0 \n", - "1 0.0 1 0.0 \n", - "2 0.0 1 0.0 \n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "1 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "2 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", "\n", - " docq_curly_bracket_ratio docq_contain_bad_word docq_bullet_point_ratio \\\n", - "0 0.0 False 0.0 \n", - "1 0.0 False 0.0 \n", - "2 0.0 False 0.0 \n", + " source_document_id \\\n", + "0 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "1 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "2 81bc331a-69cf-49bd-84b9-afedcab1344a \n", "\n", - " docq_ellipsis_line_ratio docq_alphabet_word_ratio \\\n", - "0 0.0 1.000000 \n", - "1 0.0 0.909091 \n", - "2 0.0 0.875000 \n", + " text doc_jsonpath \\\n", + "0 Granite Code Models: A Family of Open Foundati... $.main-text[3] \n", + "1 Granite Code Models: A Family of Open Foundati... $.main-text[4] \n", + "2 Granite Code Models: A Family of Open Foundati... $.main-text[5] \n", "\n", - " docq_contain_common_en_words \\\n", - "0 True \n", - "1 True \n", - "2 False \n", + " page_number bbox \\\n", + "0 1 [142.70646667, 672.96929932, 468.58251953, 711... \n", + "1 1 [107.61845398, 535.62896729, 503.99923706, 647... \n", + "2 1 [220.87228394, 484.46414185, 390.87872314, 529... \n", "\n", - " vector \n", - "0 [-0.006206639, 0.010256912, 0.023658218, -0.02... \n", - "1 [-0.0497427, 0.046492133, -0.02381167, 0.02798... \n", - "2 [-0.03265641, -0.040947884, 0.017305722, 0.022... \n", + " document_id chunk_id removed \\\n", + "0 b773445f7cf4cc9a5bf6ec296c74504f93c9c179028ac6... 88 [] \n", + "1 7353bcc8d99c279335eaf120c793ca6a08f9a4fddcbb5b... 89 [] \n", + "2 389267895ca214924a0a071df8379c2b15fcf374f232a6... 90 [] \n", "\n", - "[3 rows x 29 columns]" + " chunk_hash vector \n", + "0 -1 [-0.015789315, -0.07841933, -0.032271657, 0.00... \n", + "1 -1 [-0.059480786, -0.056680508, -0.042864937, -0.... \n", + "2 -1 [-0.07557265, -0.07152908, -0.048923455, -0.04... 
" ] }, "execution_count": 3, @@ -339,7 +310,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Connect to Vector Database\n", + "## Step-3: Connect to Vector Database\n", "\n", "Milvus can be embedded and easy to use.\n", "\n", @@ -377,7 +348,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Create A Collection\n", + "# Step-4: Create A Collection\n", "\n" ] }, @@ -390,8 +361,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "✅ Cleared collection : dpk_walmart_docs\n", - "✅ Created collection : dpk_walmart_docs\n" + "✅ Created collection : dpk_papers\n" ] } ], @@ -421,13 +391,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "inserted # rows 1302\n" + "inserted # rows 211\n" ] }, { "data": { "text/plain": [ - "{'row_count': 1302}" + "{'row_count': 211}" ] }, "execution_count": 6, @@ -447,7 +417,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Close DB Connection\n", + "## Step-5: Close DB Connection\n", "\n", "Close the connection so the lock files are relinquished and other notebooks can access the db" ] diff --git a/examples/notebooks/rag/rag_1C_vector_search.ipynb b/examples/notebooks/rag/rag_1C_vector_search.ipynb index a0b0a849a..e49de86e4 100644 --- a/examples/notebooks/rag/rag_1C_vector_search.ipynb +++ b/examples/notebooks/rag/rag_1C_vector_search.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Configuration" + "## Step-1: Configuration" ] }, { @@ -20,23 +20,14 @@ "metadata": {}, "outputs": [], "source": [ - "class MyConfig:\n", - " pass\n", - "MY_CONFIG = MyConfig()\n", - "\n", - "MY_CONFIG.EMBEDDING_MODEL = \"sentence-transformers/all-MiniLM-L6-v2\"\n", - "MY_CONFIG.EMBEDDING_LENGTH = 384\n", - "\n", - "MY_CONFIG.DB_URI = './rag_1_dpk.db' # For embedded instance\n", - "#MY_CONFIG.DB_URI = 'http://localhost:19530' # For Docker instance\n", - "MY_CONFIG.COLLECTION_NAME = 'dpk_walmart_docs'" + "from my_config import MY_CONFIG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Connect to Vector Database\n", + "## Step-2: Connect to Vector Database\n", "\n", "Milvus can be embedded and easy to use.\n", "\n", @@ -72,7 +63,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup Embeddings\n", + "## Step-3: Setup Embeddings\n", "\n", "Two choices here. \n", "\n", @@ -89,9 +80,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/sujee/apps/anaconda3/envs/data-prep-kit-2/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", + "/home/sujee/apps/anaconda3/envs/data-prep-kit-4-021/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", " from tqdm.autonotebook import tqdm, trange\n", - "/home/sujee/apps/anaconda3/envs/data-prep-kit-2/lib/python3.11/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + "/home/sujee/apps/anaconda3/envs/data-prep-kit-4-021/lib/python3.11/site-packages/huggingface_hub/file_download.py:1142: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. 
Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } @@ -145,7 +136,7 @@ "output_type": "stream", "text": [ "sentence transformer : embeddings len = 384\n", - "sentence transformer : embeddings[:5] = [ 0.02468893 0.10352128 0.02752643 -0.08551716 -0.01412826]\n", + "sentence transformer : embeddings[:5] = [ 0.02468893 0.10352131 0.02752644 -0.08551719 -0.01412828]\n", "milvus model wrapper : embeddings len = 384\n", "milvus model wrapper : embeddings[:5] = [ 0.02468893 0.10352128 0.02752643 -0.08551716 -0.01412826]\n" ] @@ -167,7 +158,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Do A Vector Search\n", + "## Step-4: Do A Vector Search\n", "\n", "We will do this to verify data" ] @@ -220,50 +211,50 @@ "text": [ "num results : 5\n", "------ result 1 --------\n", - "search score: 0.5978392958641052\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 99\n", + "search score: 0.5946735143661499\n", + "filename: granite.pdf\n", + "page number: 5\n", "text:\n", - " Stock Performance Chart\n", - "Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. S&P 500 Index, 2023 = 161.45. S&P 500 Index, 2024 = 195.06. S&P 500 Consumer Discretionary, 2019 = . S&P 500 Consumer Discretionary, 2020 = . S&P 500 Consumer Discretionary, 2021 = . S&P 500 Consumer Discretionary, 2022 = . S&P 500 Consumer Discretionary, 2023 = . S&P 500 Consumer Discretionary, 2024 = . Discretionary Distribution & RiliId, 2019 = 100.00. Discretionary Distribution & RiliId, 2020 = 117.54. Discretionary Distribution & RiliId, 2021 = 166.19. Discretionary Distribution & RiliId, 2022 = 180.56. Discretionary Distribution & RiliId, 2023 = 147.66. Discretionary Distribution & RiliId, 2024 = 190.67\n", + " 3 Model Architecture\n", + "Table 1: Model configurations for Granite Code models.\n", "\n", "------ result 2 --------\n", - "search score: 0.5875853896141052\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.5919967889785767\n", + "filename: granite.pdf\n", + "page number: 6\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "Operating cash flow $36B\n", + " 3 Model Architecture\n", + "Figure 2: An overview of depth upscaling (Kim et al., 2024) for efficient training of Granite34B-Code. We utilize the 20B model after 1.6T tokens to start training of 34B model with the same code pretraining data without any changes to the training and inference framework.\n", "\n", "------ result 3 --------\n", - "search score: 0.5865607857704163\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.5557882785797119\n", + "filename: granite.pdf\n", + "page number: 1\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "through, up to and including 2030. 
Additional qualifying information can be found by visiting http://corporate.walmart.com/purpose/esgreport.\n", + " Granite Code Models: A Family of Open Foundation Models for Code Intelligence\n", + "Mayank Mishra ⋆ Matt Stallone ⋆ Gaoyuan Zhang ⋆ Yikang Shen Aditya Prasad Adriana Meza Soria Michele Merler Parameswaran Selvam Saptha Surendran Shivdeep Singh Manish Sethi Xuan-Hong Dang Pengyuan Li Kun-Lung Wu Syed Zawad Andrew Coleman Matthew White Mark Lewis Raju Pavuluri Yan Koyfman Boris Lublinsky Maximilien de Bayser Ibrahim Abdelaziz Kinjal Basu Mayank Agarwal Yi Zhou Chris Johnson Aanchal Goyal Hima Patel Yousaf Shah Petros Zerfos Heiko Ludwig Asim Munawar Maxwell Crouse Pavan Kapanipathi Shweta Salaria Bob Calio Sophia Wen Seetharami Seelam Brian Belgodere Carlos Fonseca Amith Singhee Nirmit Desai David D. Cox Ruchir Puri † Rameswar Panda †\n", "\n", "------ result 4 --------\n", - "search score: 0.5840539932250977\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.539251983165741\n", + "filename: granite.pdf\n", + "page number: 6\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "Revenues\n", + " 3 Model Architecture\n", + "remove final 8 layers from the original model and initial 8 layers from its duplicate to form two models. Finally, we concatenate both models to form Granite-34B-Code model with 88 layers (see Figure 2 for an illustration). After the depth upscaling, we observe that the drop in performance compared to 20B model is pretty small contrary to what is observed by Kim et al.. This performance is recovered pretty quickly after we continue pretraining of the upscaled 34B model. Similar, to 20B, we use a 8192 token context during pretraining.\n", "\n", "------ result 5 --------\n", - "search score: 0.5462992191314697\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.537261962890625\n", + "filename: granite.pdf\n", + "page number: 20\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "1 Our global advertising business is recorded in either net sales or as a reduction to cost of sales, depending on the nature of the advertising arrangement. 2 1B tonnes CO 2 e emissions reduced, avoided, or sequestered reported by suppliers cumulatively since 2017 through Project Gigaton. Calculated in accordance with Walmart's \"Project Gigaton Accounting Methodology.\" 3 This result also includes emissions impacts that may only be realized in 2024\n", + " 6.6 Calling Functions and Tools\n", + "Figure 4 shows the results of different Granite Code models on BFCL benchmark. As can be seen from the figure, overall accuracy improves from 25.65% to 57.12% for Granite-3BCode-Base to Granite-34B-Code-Base, showing the effectiveness of model scaling in function (tool) calling capabilities. We also compare Granite-8B-Code with CodeLlama-7B in Figure 5 and find that Granite-8B-Code-Instruct beats CodeLlama-7B-Instruct by 22%, 14% and 12% on AST Summary, Execution Summary and Overall accuracy respectively. Additionally, Figure 5 shows that instruction tuning consistently improves performance of both base models, with more noticeable improvements in Granite Code models. 
E.g., +17.88% in overall accuracy from Granite-8B-Code-Base to Granite-8B-Code-Instruct, indicating the effectiveness of our well-curated data mixture in finetuning base models.\n", "\n" ] } ], "source": [ - "query = \"What was Walmart's revenue in 2023?\"\n", + "query = \"What was the training data used to train Granite models?\"\n", "\n", "results = do_vector_search (query)\n", "print_search_results(results)" @@ -280,50 +271,50 @@ "text": [ "num results : 5\n", "------ result 1 --------\n", - "search score: 0.5755810141563416\n", - "filename: Walmart_2024_copy.pdf\n", + "search score: 0.6484582424163818\n", + "filename: attension.pdf\n", "page number: 2\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "through, up to and including 2030. Additional qualifying information can be found by visiting http://corporate.walmart.com/purpose/esgreport.\n", + " 1 Introduction\n", + "Attention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [2, 19]. In all but a few cases [27], however, such attention mechanisms are used in conjunction with a recurrent network.\n", "\n", "------ result 2 --------\n", - "search score: 0.502342700958252\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.6340895891189575\n", + "filename: attension.pdf\n", + "page number: 3\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "1B Tonnes\n", + " 3.2 Attention\n", + "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n", "\n", "------ result 3 --------\n", - "search score: 0.5014065504074097\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 99\n", + "search score: 0.5805453062057495\n", + "filename: attension.pdf\n", + "page number: 10\n", "text:\n", - " Stock Performance Chart\n", - "Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. S&P 500 Index, 2023 = 161.45. S&P 500 Index, 2024 = 195.06. S&P 500 Consumer Discretionary, 2019 = . S&P 500 Consumer Discretionary, 2020 = . S&P 500 Consumer Discretionary, 2021 = . S&P 500 Consumer Discretionary, 2022 = . S&P 500 Consumer Discretionary, 2023 = . S&P 500 Consumer Discretionary, 2024 = . Discretionary Distribution & RiliId, 2019 = 100.00. Discretionary Distribution & RiliId, 2020 = 117.54. Discretionary Distribution & RiliId, 2021 = 166.19. Discretionary Distribution & RiliId, 2022 = 180.56. Discretionary Distribution & RiliId, 2023 = 147.66. Discretionary Distribution & RiliId, 2024 = 190.67\n", + " 7 Conclusion\n", + "We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. 
Making generation less sequential is another research goals of ours.\n", "\n", "------ result 4 --------\n", - "search score: 0.49448615312576294\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.5805416703224182\n", + "filename: attension.pdf\n", + "page number: 15\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "+20%\n", + " Attention Visualizations Input-Input Layer5\n", + "Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the sentence. We give two such examples above, from two different heads from the encoder self-attention at layer 5 of 6. The heads clearly learned to perform different tasks.\n", "\n", "------ result 5 --------\n", - "search score: 0.49202316999435425\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.5769087076187134\n", + "filename: attension.pdf\n", + "page number: 13\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "+6%\n", + " Attention Visualizations Input-Input Layer5\n", + "Figure 3: An example of the attention mechanism following long-distance dependencies in the encoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of the verb 'making', completing the phrase 'making...more difficult'. Attentions here shown only for the word 'making'. Different colors represent different heads. Best viewed in color.\n", "\n" ] } ], "source": [ - "query = \"How many distribution facilities does Walmart have?\"\n", + "query = \"What is the attention mechanism?\"\n", "\n", "results = do_vector_search (query)\n", "print_search_results(results)" diff --git a/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb b/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb index 905ad307b..532b7ef4d 100644 --- a/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb +++ b/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb @@ -7,19 +7,19 @@ "# Query Data using LLM\n", "\n", "Here is the overall RAG pipeline. 
In this notebook, we will do steps (5), (6), (7), (8), (9)\n", - "- Importing data is already done in this notebook [rag_1_B_load_data.ipynb](rag_1_B_load_data.ipynb)\n", + "- Importing data is already done in this notebook [rag_1B_load_data_into_milvus.ipynb](rag_1B_load_data_into_milvus.ipynb)\n", "- 👉 Step 5: Calculate embedding for user query\n", "- 👉 Step 6 & 7: Send the query to vector db to retrieve relevant documents\n", "- 👉 Step 8 & 9: Send the query and relevant documents (returned above step) to LLM and get answers to our query\n", "\n", - "![image missing](../media/rag-overview-2.png)" + "![image missing](media/rag-overview-2.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Configuration" + "## Step-1: Configuration" ] }, { @@ -28,42 +28,14 @@ "metadata": {}, "outputs": [], "source": [ - "class MyConfig:\n", - " pass\n", - "MY_CONFIG = MyConfig()\n", - "\n", - "MY_CONFIG.EMBEDDING_MODEL = \"sentence-transformers/all-MiniLM-L6-v2\"\n", - "MY_CONFIG.EMBEDDING_LENGTH = 384\n", - "\n", - "MY_CONFIG.DB_URI = './rag_1_dpk.db' # For embedded instance\n", - "#MY_CONFIG.DB_URI = 'http://localhost:19530' # For Docker instance\n", - "MY_CONFIG.COLLECTION_NAME = 'dpk_walmart_docs'\n", - "\n", - "MY_CONFIG.LLM_MODEL = \"meta/meta-llama-3-8b-instruct\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configuration\n", - "\n", - "Create a .env file with the following properties. You can use [env.txt](../env.txt) as starting point\n", - "\n", - "---\n", - "\n", - "```text\n", - "REPLICATE_API_TOKEN=YOUR_TOKEN_GOES_HERE\n", - "```\n", - "\n", - "---" + "from my_config import MY_CONFIG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Load Configurations\n" + "## Step-2: Load .env file\n" ] }, { @@ -102,7 +74,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Connect to Vector Database\n", + "## Step-3: Connect to Vector Database\n", "\n", "Milvus can be embedded and easy to use.\n", "\n", @@ -138,7 +110,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Step-: Setup Embeddings\n", + "## Step-4: Setup Embeddings\n", "\n", "Use the same embeddings we used to index our documents!" ] @@ -152,9 +124,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/sujee/apps/anaconda3/envs/data-prep-kit-2/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", + "/home/sujee/apps/anaconda3/envs/data-prep-kit-4-021/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", " from tqdm.autonotebook import tqdm, trange\n", - "/home/sujee/apps/anaconda3/envs/data-prep-kit-2/lib/python3.11/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + "/home/sujee/apps/anaconda3/envs/data-prep-kit-4-021/lib/python3.11/site-packages/huggingface_hub/file_download.py:1142: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } @@ -179,7 +151,7 @@ "output_type": "stream", "text": [ "embeddings len = 384\n", - "embeddings[:5] = [ 0.02468893 0.10352128 0.02752643 -0.08551716 -0.01412826]\n" + "embeddings[:5] = [ 0.02468893 0.10352131 0.02752644 -0.08551719 -0.01412828]\n" ] } ], @@ -194,7 +166,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Vector Search and RAG" + "## Step-5: Vector Search and RAG" ] }, { @@ -231,35 +203,31 @@ "name": "stdout", "output_type": "stream", "text": [ - "[ { 'distance': 0.5978392958641052,\n", - " 'text': 'Stock Performance Chart\\n'\n", - " 'Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. '\n", - " 'Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. '\n", - " 'Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. '\n", - " 'S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. '\n", - " 'S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. '\n", - " 'S&P 500 Index, 2023 = 161.45. S&P 500 Index, 2024 = 195.06. '\n", - " 'S&P 500 Consumer Discretionary, 2019 = . S&P 500 Consumer '\n", - " 'Discretionary, 2020 = . S&P 500 Consumer Discretionary, '\n", - " '2021 = . S&P 500 Consumer Discretionary, 2022 = . S&P 500 '\n", - " 'Consumer Discretionary, 2023 = . S&P 500 Consumer '\n", - " 'Discretionary, 2024 = . Discretionary Distribution & '\n", - " 'RiliId, 2019 = 100.00. Discretionary Distribution & '\n", - " 'RiliId, 2020 = 117.54. Discretionary Distribution & '\n", - " 'RiliId, 2021 = 166.19. Discretionary Distribution & '\n", - " 'RiliId, 2022 = 180.56. Discretionary Distribution & '\n", - " 'RiliId, 2023 = 147.66. Discretionary Distribution & '\n", - " 'RiliId, 2024 = 190.67'},\n", - " { 'distance': 0.5875853896141052,\n", - " 'text': '\"At Walmart, we\\'re a people-led, tech-powered omnichannel '\n", - " 'retailer dedicated\\n'\n", - " 'Operating cash flow $36B'},\n", - " { 'distance': 0.5865607857704163,\n", - " 'text': '\"At Walmart, we\\'re a people-led, tech-powered omnichannel '\n", - " 'retailer dedicated\\n'\n", - " 'through, up to and including 2030. Additional qualifying '\n", - " 'information can be found by visiting '\n", - " 'http://corporate.walmart.com/purpose/esgreport.'}]\n" + "[ { 'distance': 0.5946735143661499,\n", + " 'text': '3 Model Architecture\\n'\n", + " 'Table 1: Model configurations for Granite Code models.'},\n", + " { 'distance': 0.5919967889785767,\n", + " 'text': '3 Model Architecture\\n'\n", + " 'Figure 2: An overview of depth upscaling (Kim et al., 2024) '\n", + " 'for efficient training of Granite34B-Code. 
We utilize the 20B '\n", + " 'model after 1.6T tokens to start training of 34B model with '\n", + " 'the same code pretraining data without any changes to the '\n", + " 'training and inference framework.'},\n", + " { 'distance': 0.5557882785797119,\n", + " 'text': 'Granite Code Models: A Family of Open Foundation Models for '\n", + " 'Code Intelligence\\n'\n", + " 'Mayank Mishra ⋆ Matt Stallone ⋆ Gaoyuan Zhang ⋆ Yikang Shen '\n", + " 'Aditya Prasad Adriana Meza Soria Michele Merler Parameswaran '\n", + " 'Selvam Saptha Surendran Shivdeep Singh Manish Sethi Xuan-Hong '\n", + " 'Dang Pengyuan Li Kun-Lung Wu Syed Zawad Andrew Coleman '\n", + " 'Matthew White Mark Lewis Raju Pavuluri Yan Koyfman Boris '\n", + " 'Lublinsky Maximilien de Bayser Ibrahim Abdelaziz Kinjal Basu '\n", + " 'Mayank Agarwal Yi Zhou Chris Johnson Aanchal Goyal Hima Patel '\n", + " 'Yousaf Shah Petros Zerfos Heiko Ludwig Asim Munawar Maxwell '\n", + " 'Crouse Pavan Kapanipathi Shweta Salaria Bob Calio Sophia Wen '\n", + " 'Seetharami Seelam Brian Belgodere Carlos Fonseca Amith '\n", + " 'Singhee Nirmit Desai David D. Cox Ruchir Puri † Rameswar '\n", + " 'Panda †'}]\n" ] } ], @@ -268,7 +236,7 @@ "import json\n", "import pprint\n", "\n", - "question = \"What was Walmart's revenue in 2023?\"\n", + "question = \"What was the training data used to train Granite models?\"\n", "relevant_docs = fetch_relevant_documents(question)\n", "pprint.pprint(relevant_docs, indent=4)" ] @@ -277,7 +245,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Initialize LLM\n", + "## Step-6: Initialize LLM\n", "\n", "### LLM Choices at Replicate\n", "\n", @@ -305,59 +273,6 @@ "os.environ[\"REPLICATE_API_TOKEN\"] = MY_CONFIG.REPLICATE_API_TOKEN" ] }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import replicate\n", - "\n", - "def ask_LLM (question, relevant_docs):\n", - " context = \"\\n\".join(\n", - " [doc['text'] for doc in relevant_docs]\n", - " )\n", - " print ('============ context (this is the context supplied to LLM) ============')\n", - " print (context)\n", - " print ('============ end context ============', flush=True)\n", - "\n", - " system_prompt = \"\"\"\n", - " Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.\n", - " \"\"\"\n", - " user_prompt = f\"\"\"\n", - " Use the following pieces of information enclosed in tags to provide an answer to the question enclosed in tags.\n", - " \n", - " {context}\n", - " \n", - " \n", - " {question}\n", - " \n", - " \"\"\"\n", - "\n", - " print ('============ here is the answer from LLM... STREAMING... 
=====')\n", - " # The meta/meta-llama-3-8b-instruct model can stream output as it's running.\n", - " for event in replicate.stream(\n", - " MY_CONFIG.LLM_MODEL,\n", - " input={\n", - " \"top_k\": 0,\n", - " \"top_p\": 0.95,\n", - " \"prompt\": user_prompt,\n", - " \"max_tokens\": 512,\n", - " \"temperature\": 0.1,\n", - " \"system_prompt\": system_prompt,\n", - " \"length_penalty\": 1,\n", - " \"max_new_tokens\": 512,\n", - " \"stop_sequences\": \"<|end_of_text|>,<|eot_id|>\",\n", - " \"prompt_template\": \"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n\",\n", - " \"presence_penalty\": 0,\n", - " \"log_performance_metrics\": False\n", - " },\n", - " ):\n", - " print(str(event), end=\"\")\n", - " ## ---\n", - " print ('\\n====== end LLM answer ======\\n', flush=True)\n" - ] - }, { "cell_type": "code", "execution_count": 10, @@ -392,14 +307,14 @@ " for event in replicate.stream(\n", " MY_CONFIG.LLM_MODEL,\n", " input={\n", - " \"top_k\": 0,\n", + " \"top_k\": 1,\n", " \"top_p\": 0.95,\n", " \"prompt\": user_prompt,\n", - " \"max_tokens\": 512,\n", + " \"max_tokens\": 1024,\n", " \"temperature\": 0.1,\n", " \"system_prompt\": system_prompt,\n", " \"length_penalty\": 1,\n", - " \"max_new_tokens\": 512,\n", + " # \"max_new_tokens\": 512,\n", " \"stop_sequences\": \"<|end_of_text|>,<|eot_id|>\",\n", " \"prompt_template\": \"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n\",\n", " \"presence_penalty\": 0,\n", @@ -415,7 +330,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Query" + "## Step-7: Query" ] }, { @@ -428,26 +343,26 @@ "output_type": "stream", "text": [ "============ context (this is the context supplied to LLM) ============\n", - "Stock Performance Chart\n", - "Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. S&P 500 Index, 2023 = 161.45. S&P 500 Index, 2024 = 195.06. S&P 500 Consumer Discretionary, 2019 = . S&P 500 Consumer Discretionary, 2020 = . S&P 500 Consumer Discretionary, 2021 = . S&P 500 Consumer Discretionary, 2022 = . S&P 500 Consumer Discretionary, 2023 = . S&P 500 Consumer Discretionary, 2024 = . Discretionary Distribution & RiliId, 2019 = 100.00. Discretionary Distribution & RiliId, 2020 = 117.54. Discretionary Distribution & RiliId, 2021 = 166.19. Discretionary Distribution & RiliId, 2022 = 180.56. Discretionary Distribution & RiliId, 2023 = 147.66. Discretionary Distribution & RiliId, 2024 = 190.67\n", - "\"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "Operating cash flow $36B\n", - "\"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "through, up to and including 2030. Additional qualifying information can be found by visiting http://corporate.walmart.com/purpose/esgreport.\n", + "3 Model Architecture\n", + "Table 1: Model configurations for Granite Code models.\n", + "3 Model Architecture\n", + "Figure 2: An overview of depth upscaling (Kim et al., 2024) for efficient training of Granite34B-Code. 
We utilize the 20B model after 1.6T tokens to start training of 34B model with the same code pretraining data without any changes to the training and inference framework.\n", + "Granite Code Models: A Family of Open Foundation Models for Code Intelligence\n", + "Mayank Mishra ⋆ Matt Stallone ⋆ Gaoyuan Zhang ⋆ Yikang Shen Aditya Prasad Adriana Meza Soria Michele Merler Parameswaran Selvam Saptha Surendran Shivdeep Singh Manish Sethi Xuan-Hong Dang Pengyuan Li Kun-Lung Wu Syed Zawad Andrew Coleman Matthew White Mark Lewis Raju Pavuluri Yan Koyfman Boris Lublinsky Maximilien de Bayser Ibrahim Abdelaziz Kinjal Basu Mayank Agarwal Yi Zhou Chris Johnson Aanchal Goyal Hima Patel Yousaf Shah Petros Zerfos Heiko Ludwig Asim Munawar Maxwell Crouse Pavan Kapanipathi Shweta Salaria Bob Calio Sophia Wen Seetharami Seelam Brian Belgodere Carlos Fonseca Amith Singhee Nirmit Desai David D. Cox Ruchir Puri † Rameswar Panda †\n", "============ end context ============\n", "============ here is the answer from LLM... STREAMING... =====\n", - "The provided context does not mention Walmart's revenue in 2023. However, it does provide the stock performance chart for Walmart Inc. in 2023, which shows that the stock price was $153.58.\n", + "Based on the provided context, the training data used to train Granite models is not explicitly mentioned. However, it is mentioned that the 20B model was used after 1.6T tokens to start training of 34B model with the same code pretraining data without any changes to the training and inference framework. This implies that the same code pretraining data was used for both models, but the exact nature of this data is not specified.\n", "====== end LLM answer ======\n", "\n", - "CPU times: user 254 ms, sys: 17.3 ms, total: 271 ms\n", - "Wall time: 1.14 s\n" + "CPU times: user 75.3 ms, sys: 37.8 ms, total: 113 ms\n", + "Wall time: 1.95 s\n" ] } ], "source": [ "%%time\n", "\n", - "question = \"What was Walmart's revenue in 2023?\"\n", + "question = \"What was the training data used to train Granite models?\"\n", "relevant_docs = fetch_relevant_documents(question)\n", "ask_LLM(question=question, relevant_docs=relevant_docs)" ] @@ -462,26 +377,26 @@ "output_type": "stream", "text": [ "============ context (this is the context supplied to LLM) ============\n", - "\"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "through, up to and including 2030. Additional qualifying information can be found by visiting http://corporate.walmart.com/purpose/esgreport.\n", - "Stock Performance Chart\n", - "Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. S&P 500 Index, 2023 = 161.45. S&P 500 Index, 2024 = 195.06. S&P 500 Consumer Discretionary, 2019 = . S&P 500 Consumer Discretionary, 2020 = . S&P 500 Consumer Discretionary, 2021 = . S&P 500 Consumer Discretionary, 2022 = . S&P 500 Consumer Discretionary, 2023 = . S&P 500 Consumer Discretionary, 2024 = . Discretionary Distribution & RiliId, 2019 = 100.00. Discretionary Distribution & RiliId, 2020 = 117.54. Discretionary Distribution & RiliId, 2021 = 166.19. Discretionary Distribution & RiliId, 2022 = 180.56. Discretionary Distribution & RiliId, 2023 = 147.66. 
Discretionary Distribution & RiliId, 2024 = 190.67\n", - "\"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "+6%\n", + "1 Introduction\n", + "Attention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [2, 19]. In all but a few cases [27], however, such attention mechanisms are used in conjunction with a recurrent network.\n", + "3.2 Attention\n", + "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n", + "7 Conclusion\n", + "We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. Making generation less sequential is another research goals of ours.\n", "============ end context ============\n", "============ here is the answer from LLM... STREAMING... =====\n", - "I apologize, but the provided context does not mention the number of distribution centers Walmart has. The context appears to be discussing Walmart's stock performance and its commitment to being a people-led, tech-powered omnichannel retailer. It does not provide information about the number of distribution centers.\n", + "Based on the provided context, an attention mechanism can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum.\n", "====== end LLM answer ======\n", "\n", - "CPU times: user 214 ms, sys: 4.25 ms, total: 218 ms\n", - "Wall time: 928 ms\n" + "CPU times: user 41.1 ms, sys: 28.7 ms, total: 69.8 ms\n", + "Wall time: 1.58 s\n" ] } ], "source": [ "%%time\n", "\n", - "question = \"How many distribution centers does Walmart have?\"\n", + "question = \"What is attention mechanism?\"\n", "relevant_docs = fetch_relevant_documents(question)\n", "ask_LLM(question=question, relevant_docs=relevant_docs)" ] @@ -496,21 +411,19 @@ "output_type": "stream", "text": [ "============ context (this is the context supplied to LLM) ============\n", - " - \n", - "3/29/2024 10:28:40 AM\n", - "*E@4< '6C7@C>2?46 92CE\n", - " &$' ) *&% & 0 ) ,$,# + - +&+ # ) +,)% \n", - " &7- 59.:&0*287 &2) %36/.2, &4.8&0 *+.(.8 \n", - ":D42= 062CD ?565 !2?F2CJ \n", + "6.1.5 RepoBench, CrossCodeEval: Repository-Level Code Generation\n", + "StarCoderBase-3B, MBPP = 29.4. StarCoderBase-3B, MBPP+ = 37.8. StableCode-3B, MBPP = 34.8. StableCode-3B, MBPP+ = 43.3. StarCoder2-3B, MBPP = 42.4. StarCoder2-3B, MBPP+ = 48.6. CodeGemma-2B, MBPP = 30.4. CodeGemma-2B, MBPP+ = 30.8. Granite-3B-Code-Base, MBPP = 36.0. Granite-3B-Code-Base, MBPP+ = 45.1. StarCoderBase-7B, MBPP = 34.8. StarCoderBase-7B, MBPP+ = 42.1. CodeLlama-7B, MBPP = 39.0. CodeLlama-7B, MBPP+ = 42.3. StarCoder2-7B, MBPP = 45.4. StarCoder2-7B, MBPP+ = 46.7. CodeGemma-7B, MBPP = 53.0. CodeGemma-7B, MBPP+ = 54.9. Granite-8B-Code-Base, MBPP = 42.2. Granite-8B-Code-Base, MBPP+ = 49.6. StarCoderBase-15B, MBPP = 37.4. StarCoderBase-15B, MBPP+ = 46.1. CodeLlama-13B, MBPP = 30.6. CodeLlama-13B, MBPP+ = 30.1. StarCoder2-15B, MBPP = 51.2. StarCoder2-15B, MBPP+ = 56.6. 
Granite-20B-Code-Base, MBPP = 43.8. Granite-20B-Code-Base, MBPP+ = 51.6. CodeLlama-34B, MBPP = 48.6. CodeLlama-34B, MBPP+ = 53.6. Granite-34B-Code-Base, MBPP = 47.2. Granite-34B-Code-Base, MBPP+ = 53.1\n", + "6.1.3 MBPP and MBPP+: Code Generation in Python\n", + "MBPP (Austin et al., 2021) and MBPP+ (Liu et al., 2023a) are two of the most widely studied benchmarks for evaluating code models. While the prompt for each MBPP problem includes a natural language description followed by a few tests, MBPP+ consists of 35 × more tests than the original benchmarks. We use greedy decoding and report the mean pass@1 for all the models. Table 5 summarizes the results of different base models. As we can see, Granite3B-Code-Base significantly outperforms CodeGemma-2B but falls short of StarCoder2-3B on\n", + "6.1.4 DS1000: Data Science Tasks in Python\n", + "The Granite Code models achieve relatively high accuracy across all sizes (e.g., outperforming CodeGemma at 2B-3B scale, StarCoder2 at 7B-8B scale and CodeLlama models with half of the sizes). This shows that our Granite Code models are not only capable of generating good code but also of using libraries more accurately in real data science workflows.\n", "============ end context ============\n", "============ here is the answer from LLM... STREAMING... =====\n", - "I'm happy to help! However, I must point out that the provided context does not contain any information about the moon landing. The text appears to be a jumbled mix of characters and symbols, and does not provide any relevant information about the moon landing or any other historical event.\n", - "\n", - "If you could provide a different context or question, I would be happy to try and assist you to the best of my abilities.\n", + "I apologize, but the provided context does not mention the moon landing. The context appears to be about code generation and evaluation benchmarks, specifically discussing the MBPP and MBPP+ benchmarks, and the performance of different code models. There is no mention of the moon landing. 
If you provide a different context or question, I'll be happy to help.\n", "====== end LLM answer ======\n", "\n", - "CPU times: user 268 ms, sys: 12.4 ms, total: 280 ms\n", - "Wall time: 1.37 s\n" + "CPU times: user 41.5 ms, sys: 21 ms, total: 62.5 ms\n", + "Wall time: 2.13 s\n" ] } ], diff --git a/examples/notebooks/rag/rag_2A_llamaindex_process.ipynb b/examples/notebooks/rag/rag_2A_llamaindex_process.ipynb index 7c1c9d124..b52ed53ea 100644 --- a/examples/notebooks/rag/rag_2A_llamaindex_process.ipynb +++ b/examples/notebooks/rag/rag_2A_llamaindex_process.ipynb @@ -24,19 +24,10 @@ "metadata": {}, "outputs": [], "source": [ - "class MyConfig:\n", - " pass\n", - "\n", - "MY_CONFIG = MyConfig()\n", - "\n", - "MY_CONFIG.EMBEDDING_MODEL = \"sentence-transformers/all-MiniLM-L6-v2\"\n", - "MY_CONFIG.EMBEDDING_LENGTH = 384\n", - "\n", - "MY_CONFIG.INPUT_DATA_DIR = \"input_data/walmart-reports-1\"\n", + "from my_config import MY_CONFIG\n", "\n", "MY_CONFIG.DB_URI = './rag_2_llamaindex.db'\n", - "MY_CONFIG.COLLECTION_NAME = 'llamaindex_walmart_docs'\n", - "MY_CONFIG.LLM_MODEL = \"meta/meta-llama-3-8b-instruct\"\n" + "MY_CONFIG.COLLECTION_NAME = 'llamaindex_papers'" ] }, { @@ -51,24 +42,13 @@ "execution_count": 2, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package punkt_tab to\n", - "[nltk_data] /home/sujee/apps/anaconda3/envs/data-prep-\n", - "[nltk_data] kit-2/lib/python3.11/site-\n", - "[nltk_data] packages/llama_index/core/_static/nltk_cache...\n", - "[nltk_data] Package punkt_tab is already up-to-date!\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "Loaded 300 chunks\n", - "CPU times: user 10.1 s, sys: 2.23 s, total: 12.3 s\n", - "Wall time: 9.88 s\n" + "Loaded 43 chunks\n", + "CPU times: user 3.9 s, sys: 869 ms, total: 4.77 s\n", + "Wall time: 2.76 s\n" ] } ], @@ -113,13 +93,167 @@ "execution_count": 4, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "08aefb8116a540678e28c78accd09648", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "modules.json: 0%| | 0.00/349 [00:00\n", + "✅ Created index: \n", "✅ Saved index to db ./rag_2_llamaindex.db\n", - "CPU times: user 24min 24s, sys: 1min 4s, total: 25min 28s\n", - "Wall time: 2min 20s\n" + "CPU times: user 912 ms, sys: 155 ms, total: 1.07 s\n", + "Wall time: 1.03 s\n" ] } ], diff --git a/examples/notebooks/rag/rag_2B_llamaindex_query.ipynb b/examples/notebooks/rag/rag_2B_llamaindex_query.ipynb index 069a2c797..717d79690 100644 --- a/examples/notebooks/rag/rag_2B_llamaindex_query.ipynb +++ b/examples/notebooks/rag/rag_2B_llamaindex_query.ipynb @@ -24,19 +24,10 @@ "metadata": {}, "outputs": [], "source": [ - "class MyConfig:\n", - " pass\n", - "\n", - "MY_CONFIG = MyConfig()\n", - "\n", - "MY_CONFIG.EMBEDDING_MODEL = \"sentence-transformers/all-MiniLM-L6-v2\"\n", - "MY_CONFIG.EMBEDDING_LENGTH = 384\n", - "\n", - "MY_CONFIG.INPUT_DATA_DIR = \"input_data/walmart-reports-1\"\n", + "from my_config import MY_CONFIG\n", "\n", "MY_CONFIG.DB_URI = './rag_2_llamaindex.db'\n", - "MY_CONFIG.COLLECTION_NAME = 'llamaindex_walmart_docs'\n", - "MY_CONFIG.LLM_MODEL = \"meta/meta-llama-3-8b-instruct\"\n" + "MY_CONFIG.COLLECTION_NAME = 'llamaindex_papers'" ] }, { @@ -66,12 +57,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[nltk_data] Downloading package punkt_tab to\n", - "[nltk_data] /home/sujee/apps/anaconda3/envs/data-prep-\n", - "[nltk_data] kit-2/lib/python3.11/site-\n", - 
"[nltk_data] packages/llama_index/core/_static/nltk_cache...\n", - "[nltk_data] Package punkt_tab is already up-to-date!\n", - "/home/sujee/apps/anaconda3/envs/data-prep-kit-2/lib/python3.11/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + "/home/sujee/apps/anaconda3/envs/data-prep-kit-4-021/lib/python3.11/site-packages/huggingface_hub/file_download.py:1142: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } @@ -114,7 +100,7 @@ " uri = MY_CONFIG.DB_URI ,\n", " dim = MY_CONFIG.EMBEDDING_LENGTH , \n", " collection_name = MY_CONFIG.COLLECTION_NAME,\n", - " overwrite=False\n", + " overwrite=False # so we load the index from db\n", ")\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", "\n", @@ -138,8 +124,8 @@ "output_type": "stream", "text": [ "✅ Loaded index from vector db: ./rag_2_llamaindex.db\n", - "CPU times: user 265 ms, sys: 25.8 ms, total: 291 ms\n", - "Wall time: 289 ms\n" + "CPU times: user 255 ms, sys: 18.9 ms, total: 274 ms\n", + "Wall time: 271 ms\n" ] } ], @@ -196,13 +182,21 @@ "text": [ "\n", "\n", - "According to the provided context information, Walmart's total revenue in 2023 was $611,289 million.\n" + "Based on the provided context information, the training data used to train the Granite models includes:\n", + "\n", + "* 3.5T to 4.5T tokens of code data\n", + "* Natural language datasets related to code\n", + "* High-quality data with two phases of training:\n", + "\t+ Phase 1: 4 trillion tokens of code data comprising 116 languages\n", + "\t+ Phase 2: 500B tokens (80% code and 20% language data) from various domains, including technical, mathematics, and web documents\n", + "\n", + "Note that the data is tokenized via byte pair encoding (BPE) and the same tokenizer as StarCoder is employed.\n" ] } ], "source": [ "query_engine = index.as_query_engine()\n", - "res = query_engine.query(\"What was Walmart's revenue in 2023?\")\n", + "res = query_engine.query(\"What was the training data used to train Granite models?\")\n", "print(res)" ] }, @@ -217,15 +211,17 @@ "text": [ "\n", "\n", - "Based on the provided context information, the answer to the query is:\n", + "Based on the provided context information, it appears that the attention mechanism is a technique used in the encoder self-attention in layer 5 of 6, which allows the model to focus on specific parts of the input when processing it. This is evident from the visualizations provided, which show the attention heads attending to distant dependencies in the input text.\n", + "\n", + "In the first example, the attention heads are shown to attend to a distant dependency of the verb \"making\", completing the phrase \"making...more difficult\". 
In the second example, the attention heads are shown to exhibit behavior related to the structure of the sentence, with different heads performing different tasks.\n", "\n", - "Walmart has a total of 163 distribution facilities.\n" + "From this, it can be inferred that the attention mechanism is a way for the model to selectively focus on certain parts of the input, allowing it to better understand the context and relationships between different elements in the input.\n" ] } ], "source": [ "query_engine = index.as_query_engine()\n", - "res = query_engine.query(\"How many distribution facilities does Walmart have?\")\n", + "res = query_engine.query(\"What is attention mechanism?\")\n", "print(res)" ] }, @@ -240,7 +236,7 @@ "text": [ "\n", "\n", - "I'm happy to help! However, I don't see any information about the moon landing in the provided context. The context appears to be a 10-K report filed by Walmart Inc. with the Securities and Exchange Commission. There is no mention of the moon landing in this report. If you have any other questions or if there's something else I can help you with, feel free to ask!\n" + "I'm happy to help! However, I don't see any information about the moon landing in the provided context. The text appears to be discussing IBM Granite Code Models and their performance on various benchmarks. Therefore, I cannot provide an answer to the query about the moon landing. If you could provide more context or clarify the question, I'd be happy to try and assist you further!\n" ] } ], diff --git a/examples/notebooks/rag/requirements.txt b/examples/notebooks/rag/requirements.txt index 4578b1ea8..1c5c4f00c 100644 --- a/examples/notebooks/rag/requirements.txt +++ b/examples/notebooks/rag/requirements.txt @@ -1,7 +1,9 @@ ## Data prep kit -#data-prep-toolkit-transforms==0.2.1.dev1 -#data-prep-toolkit-transforms-ray==0.2.1.dev1 +data-prep-toolkit-transforms==0.2.1 +data-prep-toolkit-transforms-ray==0.2.1 + +deepsearch-toolkit # Milvus @@ -28,14 +30,9 @@ llama-index ### llama-index embedding models llama-index-embeddings-huggingface -## llama-index-embeddings-mistralai -## llama-index-embeddings-openai== 0.1.7 ### llama-index LLM interfaces llama-index-llms-replicate -## llama-index-llms-mistralai -## llama-index-llms-openai==0.1.12 -# llama-index-llms-llama-cpp ### llama-index Vector dbs llama-index-vector-stores-milvus diff --git a/examples/notebooks/rag/setup-python-dev-env.md b/examples/notebooks/rag/setup-python-dev-env.md index 8c2c93d20..b007c4b4b 100644 --- a/examples/notebooks/rag/setup-python-dev-env.md +++ b/examples/notebooks/rag/setup-python-dev-env.md @@ -18,21 +18,39 @@ We will create an environment for this workshop with all the required libraries ```bash conda create -n data-prep-kit-1 -y python=3.11 +``` + +activate the new conda environment -# activate the new conda environment +```bash conda activate data-prep-kit-1 -# make sure env is swithced to data-prep-kit-1 +``` -## Check python version +Make sure env is swithced to data-prep-kit-1 + +Check python version + +```bash python --version -# should say : 3.11 +``` + +should say : 3.11 + +**Note**: If you are on a linux system install these too + +```bash +conda install gcc_linux-64 + +conda install gxx_linux-64 ``` ### A-2: Install dependencies ```bash cd examples/notebooks/rag +``` +```bash pip install -r requirements.txt ``` From 213e0c7eb8ec25d943ee4917faf051c4963107dc Mon Sep 17 00:00:00 2001 From: ian-cho <42691703+ian-cho@users.noreply.github.com> Date: Thu, 3 Oct 2024 20:24:17 +0900 Subject: [PATCH 2/7] 
Update README.md

updated hap README
---
 transforms/universal/hap/python/README.md | 36 +++++++++++++++--------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/transforms/universal/hap/python/README.md b/transforms/universal/hap/python/README.md
index 23be7084c..347fa86ae 100644
--- a/transforms/universal/hap/python/README.md
+++ b/transforms/universal/hap/python/README.md
@@ -1,14 +1,14 @@
-# HAP Annotation
+# Hate, Abuse, and Profanity (HAP) Annotation
 Please see the set of [transform project conventions](https://github.com/ian-cho/data-prep-kit/blob/dev/transforms/README.md) for details on general project conventions, transform configuration, testing and IDE set up.
 
 ## Prerequisite
-This repo needs NLTK and please refer to `requirements.txt`.
+This repository requires [NLTK](https://www.nltk.org/); please refer to `requirements.txt`.
 
 ## Summary 
 The hap transform maps a non-empty input table to an output table with an added `hap_score` column. Each row in the table represents a document, and the hap transform performs the following three steps to calculate the hap score for each document:
 
 * Sentence spliting: we use NLTK to split the document into sentence pieces.
-* Hap annotation: each sentence is assigned a hap score between 0 and 1, where 1 represents hap and 0 represents non-hap.
+* HAP annotation: each sentence is assigned a hap score between 0 and 1, where 1 represents hap and 0 represents non-hap.
 * Aggregation: the document hap score is determined by selecting the maximum hap score among its sentences.
 
 
@@ -16,25 +16,26 @@ The hap transform maps a non-empty input table to an added
 
 The set of dictionary keys holding [HAPTransformConfiguration](src/hap_transform.py) configuration for values are as follows:
 
-* --model_name_or_path - specifies HAP model which should be compatable with HuggingFace's `AutoModelForSequenceClassification` 
-* --batch_size - modify it based on the infrastructure capacity.
-* --max_length - the maximum length for the tokenizer.
-
-
+* --model_name_or_path - specifies the HAP model, which should be compatible with HuggingFace's `AutoModelForSequenceClassification`. Defaults to IBM's open-source toxicity classifier `ibm-granite/granite-guardian-hap-38m`.
+* --batch_size - modify it based on the infrastructure capacity. Defaults to `128`.
+* --max_length - the maximum length for the tokenizer. Defaults to `512`.
+* --doc_text_column - the column name containing the document text in the input .parquet file. Defaults to `contents`.
+* --annotation_column - the column name containing the hap (toxicity) score in the output .parquet file. Defaults to `hap_score`.
+ 
 
 ## input format
 The input is in .parquet format and contains the following columns:
 
-| doc_id | doc_text |
-|:------|:------|
+| doc_id | contents |
+|:------:|:------:|
 | 1 | GSC is very much a little Swiss Army knife for... |
 | 2 | Here are only a few examples. And no, I'm not ... |
 
 
 ## output format
 The output is in .parquet format and includes an additional column, in addition to those in the input:
 
-| doc_id | doc_text | hap_score |
-|:------|:------|:-------------|
+| doc_id | contents | hap_score |
+|:------:|:------:|:-------------:|
 | 1 | GSC is very much a little Swiss Army knife for... | 0.002463 |
 | 2 | Here are only a few examples. And no, I'm not ... | 0.989713 |
 
@@ -47,6 +48,17 @@ python hap_local_python.py
 You will obtain the output file `test1.parquet` in the output directory. 
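+
+The same options can also be supplied programmatically as a plain dictionary. The snippet below is a minimal sketch that mirrors the configuration used by `src/hap_local.py`; the values shown are the documented defaults, so adjust them to your own environment:
+
+```python
+hap_params = {
+    "model_name_or_path": "ibm-granite/granite-guardian-hap-38m",  # any AutoModelForSequenceClassification-compatible HAP model
+    "annotation_column": "hap_score",    # output column that will hold the toxicity score
+    "doc_text_column": "contents",       # input column that holds the document text
+    "inference_engine": "CPU",           # as used by src/hap_local.py
+    "max_length": 512,
+    "batch_size": 128,
+}
+```
+
+See `src/hap_local.py` and `src/hap_local_python.py` for how this dictionary is used to run the transform.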
+## Throughput +The table below shows the throughput (tokens per second) of the HAP transform module, which primarily includes sentence splitting, HAP annotation, and HAP score aggregation. We herein compare two models: + +* 4-layer lightweight toxicity classifier [ibm-granite/granite-guardian-hap-38m](https://huggingface.co/ibm-granite/granite-guardian-hap-38m) +* 12-layer toxicity classifier [ibm-granite/granite-guardian-hap-125m](https://huggingface.co/ibm-granite/granite-guardian-hap-125m) + +We report the average throughput on CPU over three runs. +| Model used in HAP transform module | throughput (tokens per second) | +|:------:|:------:| +| granite-guardian-hap-38m | 6.16 k | +| granite-guardian-hap-125m | 1.14 k | From 2971c730b0a580a7731ec70f7e2bb8e75a299cb0 Mon Sep 17 00:00:00 2001 From: ian-cho <42691703+ian-cho@users.noreply.github.com> Date: Thu, 3 Oct 2024 21:27:54 +0900 Subject: [PATCH 3/7] Update README.md --- transforms/universal/hap/python/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transforms/universal/hap/python/README.md b/transforms/universal/hap/python/README.md index 347fa86ae..29d54d999 100644 --- a/transforms/universal/hap/python/README.md +++ b/transforms/universal/hap/python/README.md @@ -54,7 +54,8 @@ The table below shows the throughput (tokens per second) of the HAP transform mo * 4-layer lightweight toxicity classifier [ibm-granite/granite-guardian-hap-38m](https://huggingface.co/ibm-granite/granite-guardian-hap-38m) * 12-layer toxicity classifier [ibm-granite/granite-guardian-hap-125m](https://huggingface.co/ibm-granite/granite-guardian-hap-125m) -We report the average throughput on CPU over three runs. +We processed 6,000 documents (12 MB in Parquet file size) using the HAP transform module and reported the average CPU throughput over three trials. 
+ | Model used in HAP transform module | throughput (tokens per second) | |:------:|:------:| | granite-guardian-hap-38m | 6.16 k | From d8b9be8531368657d2fb09973b8df486ed93c620 Mon Sep 17 00:00:00 2001 From: ian-cho <42691703+ian-cho@users.noreply.github.com> Date: Thu, 3 Oct 2024 22:16:51 +0900 Subject: [PATCH 4/7] changed doc_text into contents in related files The column name `doc_text` is changed into `contents` in all relevant scripts and parquet files --- .../universal/hap/python/output/metadata.json | 13 +++++++------ .../universal/hap/python/output/test1.parquet | Bin 79822 -> 79822 bytes .../universal/hap/python/src/hap_local.py | 2 +- .../hap/python/src/hap_local_python.py | 2 +- .../universal/hap/python/src/hap_transform.py | 12 ++++++------ .../python/test-data/expected/metadata.json | 13 +++++++------ .../python/test-data/expected/test1.parquet | Bin 79822 -> 79822 bytes .../hap/python/test-data/input/test1.parquet | Bin 109303 -> 109303 bytes .../universal/hap/python/test/test_hap.py | 2 +- 9 files changed, 23 insertions(+), 21 deletions(-) diff --git a/transforms/universal/hap/python/output/metadata.json b/transforms/universal/hap/python/output/metadata.json index 6627fabb9..062fee162 100644 --- a/transforms/universal/hap/python/output/metadata.json +++ b/transforms/universal/hap/python/output/metadata.json @@ -5,8 +5,8 @@ "job name": "hap", "job type": "pure python", "job id": "job_id", - "start_time": "2024-09-25 00:47:58", - "end_time": "2024-09-25 00:48:06", + "start_time": "2024-10-03 21:38:20", + "end_time": "2024-10-03 21:38:29", "status": "success" }, "code": { @@ -17,7 +17,7 @@ "job_input_params": { "model_name_or_path": "ibm-granite/granite-guardian-hap-38m", "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, @@ -30,11 +30,12 @@ "num_processors": 0 }, "job_output_stats": { - "source_files": 1, - "source_size": 109303, + "source_files": 2, + "source_size": 12124594, + "transform execution exception": 1, "result_files": 1, "result_size": 79822, - "processing_time": 6.543, + "processing_time": 6.932, "source_doc_count": 50, "result_doc_count": 50 }, diff --git a/transforms/universal/hap/python/output/test1.parquet b/transforms/universal/hap/python/output/test1.parquet index 8ac5be443d311740b8b74296fb4a02e15eb50ebc..c9483e34d47dd71af90b1a6694c55fb01ea95453 100644 GIT binary patch delta 171 zcmX^2p5@$omWC~i#}qk|^Ycnl^Gb@hGiotLux?jVXS8F23r^3`WbB3WKuTQLm0>!R zbQF}*GE;L>;`0)7QnlyMCslEU)oisp=K0maogvj6}9 delta 155 zcmX^2p5@$omWC~i#}qkI@{{9BQY%WfGiotLux?jVXS8F23r^3`WbB3WKuTQL;oRx> z3>d{&L3)*@XBsd%OqVxeWZk~jkkOKvJ3Gn9FD=DA%p}Em`bQH+37(*`NJC45sE7;$ a&&=rujTu9?XPGjtVMJ2sI9<`4aV-EPE;6|Q diff --git a/transforms/universal/hap/python/src/hap_local.py b/transforms/universal/hap/python/src/hap_local.py index 89140fd74..220eea19b 100644 --- a/transforms/universal/hap/python/src/hap_local.py +++ b/transforms/universal/hap/python/src/hap_local.py @@ -24,7 +24,7 @@ hap_params = { "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m', "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, diff --git a/transforms/universal/hap/python/src/hap_local_python.py b/transforms/universal/hap/python/src/hap_local_python.py index 9a268803e..8e79dc583 100644 --- a/transforms/universal/hap/python/src/hap_local_python.py +++ 
b/transforms/universal/hap/python/src/hap_local_python.py @@ -37,7 +37,7 @@ hap_params = { "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m', "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, diff --git a/transforms/universal/hap/python/src/hap_transform.py b/transforms/universal/hap/python/src/hap_transform.py index 71bad2acb..e6a48cf86 100644 --- a/transforms/universal/hap/python/src/hap_transform.py +++ b/transforms/universal/hap/python/src/hap_transform.py @@ -27,11 +27,11 @@ class HAPTransform(AbstractTableTransform): def __init__(self, config: dict[str, Any]): super().__init__(config) - self.model_name_or_path = config.get("model_name_or_path") - self.annotation_column = config.get("annotation_column") - self.doc_text_column = config.get("doc_text_column") - self.max_length = config.get("max_length") - self.batch_size = config.get("batch_size") + self.model_name_or_path = config.get("model_name_or_path", "ibm-granite/granite-guardian-hap-38m") + self.annotation_column = config.get("annotation_column", "hap_score") + self.doc_text_column = config.get("doc_text_column", "contents") + self.max_length = config.get("max_length", 512) + self.batch_size = config.get("batch_size", 128) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path) self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path) @@ -70,7 +70,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab :param table: Pyarrow table :return: a table with an additional hap_score column """ - # make sure that the table contains "doc_text" column + # make sure that the table contains "contents" column TransformUtils.validate_columns(table=table, required=[self.doc_text_column]) self.df = table.to_pandas() df_doc_list = [] diff --git a/transforms/universal/hap/python/test-data/expected/metadata.json b/transforms/universal/hap/python/test-data/expected/metadata.json index 1e5f710db..062fee162 100644 --- a/transforms/universal/hap/python/test-data/expected/metadata.json +++ b/transforms/universal/hap/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "hap", "job type": "pure python", "job id": "job_id", - "start_time": "2024-09-26 20:56:49", - "end_time": "2024-09-26 20:56:56", + "start_time": "2024-10-03 21:38:20", + "end_time": "2024-10-03 21:38:29", "status": "success" }, "code": { @@ -17,7 +17,7 @@ "job_input_params": { "model_name_or_path": "ibm-granite/granite-guardian-hap-38m", "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, @@ -30,11 +30,12 @@ "num_processors": 0 }, "job_output_stats": { - "source_files": 1, - "source_size": 109303, + "source_files": 2, + "source_size": 12124594, + "transform execution exception": 1, "result_files": 1, "result_size": 79822, - "processing_time": 6.501, + "processing_time": 6.932, "source_doc_count": 50, "result_doc_count": 50 }, diff --git a/transforms/universal/hap/python/test-data/expected/test1.parquet b/transforms/universal/hap/python/test-data/expected/test1.parquet index 8ac5be443d311740b8b74296fb4a02e15eb50ebc..c9483e34d47dd71af90b1a6694c55fb01ea95453 100644 GIT binary patch delta 171 zcmX^2p5@$omWC~i#}qk|^Ycnl^Gb@hGiotLux?jVXS8F23r^3`WbB3WKuTQLm0>!R zbQF}*GE;L>;`0)7QnlyMCslEU)oisp=K0maogvj6}9 delta 155 
zcmX^2p5@$omWC~i#}qkI@{{9BQY%WfGiotLux?jVXS8F23r^3`WbB3WKuTQL;oRx> z3>d{&L3)*@XBsd%OqVxeWZk~jkkOKvJ3Gn9FD=DA%p}Em`bQH+37(*`NJC45sE7;$ a&&=rujTu9?XPGjtVMJ2sI9<`4aV-EPE;6|Q diff --git a/transforms/universal/hap/python/test-data/input/test1.parquet b/transforms/universal/hap/python/test-data/input/test1.parquet index 8a3468009e1f012d50dc9f0d9bf437926d048867..5e2f5fe9d5547448a8d2ff3ec3b5b5c51e575455 100644 GIT binary patch delta 177 zcmex-(j%98Sea)7vMyY@;(2}bTnBg@hh_ps6w58vs2s~A@T0IY^W A?f?J) delta 177 zcmexROj4XZi#?saaudVD zilf{tOC!S#b3C*1(>-&8$|4Od4Wc4447O{pWRzg!j&irmiZBk!N%aVv?zf6@B>-=i BLB0S0 diff --git a/transforms/universal/hap/python/test/test_hap.py b/transforms/universal/hap/python/test/test_hap.py index 3f2a25e53..82ac5dc06 100644 --- a/transforms/universal/hap/python/test/test_hap.py +++ b/transforms/universal/hap/python/test/test_hap.py @@ -19,7 +19,7 @@ hap_params = { "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m', "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, From 9ad002ad738e6cc68bb83f0fc97623c7cee12c4d Mon Sep 17 00:00:00 2001 From: Shahrokh Daijavad Date: Thu, 3 Oct 2024 11:16:26 -0700 Subject: [PATCH 5/7] Update README.md Added HAP to the table in README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ade3bed68..aeec4ef70 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,7 @@ The matrix below shows the the combination of modules and supported runtimes. Al | [Filter on annotations](transforms/universal/filter/python/README.md) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | [Profiler](transforms/universal/profiler/ray/README.md) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | [Resize](transforms/universal/resize/python/README.md) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| [HAP](transforms/universal/hap/python/README.md) | :white_check_mark: | | | | | [Tokenizer](transforms/universal/tokenization/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | | **Language-only** | | | | | | [Language identification](transforms/language/lang_id/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | From 0a9fe37ac4fe230f4886dd8133bd17bcbfd35922 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Thu, 3 Oct 2024 22:42:24 +0300 Subject: [PATCH 6/7] Change the calculation of the desired ray actors (#654) * Fix the calculation of the desired ray actors. Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky * Fix hap workflow Makefile commands. Signed-off-by: Revital Sur * More change. Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky * Additional fix. Signed-off-by: Revital Sur * additional change. Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky * Cherry pick Boris's 05b97feb7 commit to removed None not supported by kfpV2. Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky * Minor fix in ededup_transform_base.py Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky * Disable pii Makefile. 
Signed-off-by: Revital Sur --------- Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky --- .../ray/src/data_processing_ray/runtime/ray/ray_utils.py | 2 +- .../src/runtime_utils/kfp_utils.py | 2 +- .../language/pii_redactor/{Makefile => Makefile.disable} | 0 transforms/universal/ededup/kfp_ray/ededup_wf.py | 2 +- .../ededup/kfp_ray/src/ededup_compute_execution_params.py | 8 ++++++-- .../universal/ededup/python/src/ededup_transform_base.py | 2 +- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- .../fdedup/kfp_ray/src/fdedup_compute_execution_params.py | 4 ++-- transforms/universal/hap/Makefile | 4 ---- .../kfp_ray/src/profiler_compute_execution_params.py | 6 +++--- 10 files changed, 16 insertions(+), 16 deletions(-) rename transforms/language/pii_redactor/{Makefile => Makefile.disable} (100%) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py index c7362ef5e..5225508fb 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py @@ -111,7 +111,7 @@ def operator() -> ActorHandle: cls_name = clazz.__class__.__name__.replace('ActorClass(', '').replace(')','') actors = [operator() for _ in range(n_actors)] - for i in range(60): + for i in range(120): time.sleep(1) alive = list_actors(filters=[("class_name", "=", cls_name), ("state", "=", "ALIVE")]) if len(actors) == len(alive): diff --git a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py index 73b6a5cd4..7fa76453f 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py +++ b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py @@ -138,7 +138,7 @@ def default_compute_execution_params( cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") # compute number of actors - n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) + n_actors_cpu = int((cluster_cpu - 1) * 0.7 / a_options.get("num_cpus", 0.5)) n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) n_actors = min(n_actors_cpu, n_actors_memory) # Check if we need gpu calculations as well diff --git a/transforms/language/pii_redactor/Makefile b/transforms/language/pii_redactor/Makefile.disable similarity index 100% rename from transforms/language/pii_redactor/Makefile rename to transforms/language/pii_redactor/Makefile.disable diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index 645902d0e..ff4b4db57 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -89,7 +89,7 @@ def ededup( ededup_hash_cpu: float = 0.5, ededup_doc_column: str = "contents", ededup_use_snapshot: bool = False, - ededup_snapshot_directory: str = None, + ededup_snapshot_directory: str = "", # data sampling ededup_n_samples: int = 10, # additional parameters diff --git a/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py index a20a2e030..6f8197877 100644 --- a/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py +++ 
b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py @@ -10,6 +10,7 @@ # limitations under the License. ################################################################################ + def ededup_compute_execution_params( worker_options: dict, # ray worker configuration actor_options: dict, # actor's resource requirements @@ -94,9 +95,9 @@ def ededup_compute_execution_params( ) sys.exit(1) # Define number of workers - n_workers = int((0.85 * cluster_cpu - required_hash_cpu) / actor_cpu) + n_workers = int((0.85 * (cluster_cpu - 1) - required_hash_cpu) / actor_cpu) print(f"Number of workers - {n_workers}") - if n_workers < 2: + if n_workers <= 0: print(f"Cluster is too small - estimated number of workers {n_workers}") sys.exit(1) # Limit amount of workers and processors to prevent S3 saturation @@ -110,6 +111,9 @@ def ededup_compute_execution_params( print(f"Try to increase the size of the cluster or increase size of the cpu per worker") sys.exit(1) print(f"Projected execution time {EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60} min") + # process None able parameters + if ededup_snapshot_directory is None or len(ededup_snapshot_directory) <= 1: + ededup_snapshot_directory = None return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, diff --git a/transforms/universal/ededup/python/src/ededup_transform_base.py b/transforms/universal/ededup/python/src/ededup_transform_base.py index f1321db79..4437148ac 100644 --- a/transforms/universal/ededup/python/src/ededup_transform_base.py +++ b/transforms/universal/ededup/python/src/ededup_transform_base.py @@ -40,7 +40,7 @@ doc_column_name_cli_param = f"{cli_prefix}{doc_column_name_key}" int_column_name_cli_param = f"{cli_prefix}{int_column_name_key}" use_snapshot_cli_param = f"{cli_prefix}{use_snapshot_key}" -snapshot_directory_cli_param = f"--{cli_prefix}{snapshot_directory_key}" +snapshot_directory_cli_param = f"{cli_prefix}{snapshot_directory_key}" class HashFilter: """ diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index bb2cc3194..3156ab6f1 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -82,7 +82,7 @@ def fdedup( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {"num_cpus": 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.7}, runtime_pipeline_id: str = "pipeline_id", runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, # columns used diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index ebcecadb9..726200339 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -140,7 +140,7 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: cluster_cpu = worker_options["replicas"] * worker_options["cpu"] cluster_memory = worker_options["replicas"] * worker_options["memory"] print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu *= 0.85 + cluster_cpu -= 1 cluster_memory *= 0.85 # get actor requirements actor_cpu = actor_options["num_cpus"] @@ -172,7 +172,7 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: n_preprocessors = int( (0.85 * cluster_cpu - b_actors * 
bucket_cpu - m_actors * mhash_cpu - d_actors * doc_cpu) / actor_cpu ) - if n_preprocessors < 0: + if n_preprocessors <= 0: print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") print(f"Required bucket actors {b_actors}, minhash actors {m_actors}, document actors {d_actors}") print("Try to increase the size of the cluster") diff --git a/transforms/universal/hap/Makefile b/transforms/universal/hap/Makefile index 05d3c3111..017eb23b4 100644 --- a/transforms/universal/hap/Makefile +++ b/transforms/universal/hap/Makefile @@ -55,16 +55,12 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build diff --git a/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py b/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py index 666734eda..a5483eec7 100644 --- a/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py +++ b/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py @@ -55,7 +55,7 @@ def profiler_compute_execution_params( cluster_cpu = w_options["replicas"] * w_options["cpu"] cluster_memory = w_options["replicas"] * w_options["memory"] print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu *= 0.85 + cluster_cpu -= 1 cluster_memory *= 0.85 # get actor requirements a_options = actor_options @@ -82,7 +82,7 @@ def profiler_compute_execution_params( n_aggregators = math.ceil(number_of_docs * 32 / GB) print(f"Estimated Required hashes {n_aggregators}") print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - required_aggregator_cpu = n_aggregators * aggregator_cpu + required_aggregator_cpu = math.ceil(n_aggregators * aggregator_cpu) required_hash_mem = n_aggregators * 2 if required_aggregator_cpu > cluster_cpu or required_hash_mem > cluster_memory: print( @@ -93,7 +93,7 @@ def profiler_compute_execution_params( # Define number of workers n_workers = int((0.85 * cluster_cpu - required_aggregator_cpu) / actor_cpu) print(f"Number of workers - {n_workers}") - if n_workers < 2: + if n_workers <= 0: print(f"Cluster is too small - estimated number of workers {n_workers}") sys.exit(1) # Limit amount of workers and processors to prevent S3 saturation From a5f86dad587d39cf9db008c81a876c9fa1440c68 Mon Sep 17 00:00:00 2001 From: David Wood Date: Thu, 3 Oct 2024 19:27:30 -0400 Subject: [PATCH 7/7] Various fixes to workflows, especially kfp (#664) * disable test workflow when none code files change Signed-off-by: David Wood * one more path-ignore in test.yml Signed-off-by: David Wood * one more fix for path-ignore in test.yml Signed-off-by: David Wood * test universal transform separately Signed-off-by: David Wood * rename test universal workflow Signed-off-by: David Wood * add comments to noop src to trigger new universal test workflow Signed-off-by: David Wood * fix paths in test universal workflow Signed-off-by: David Wood * addj back ignore paths in test universal workflow Signed-off-by: David Wood * another noop comment Signed-off-by: David Wood * move ignored paths to paths in univesal test workflow Signed-off-by: David Wood * test-universal workflow name changes Signed-off-by: David Wood * noop comments Signed-off-by: David Wood * noop readme change' 
Signed-off-by: David Wood * change test universal not paths Signed-off-by: David Wood * disable all but new noop and doc_id test workflows Signed-off-by: David Wood * code change in noop Signed-off-by: David Wood * remake test transforms Signed-off-by: David Wood * add individual test transform workflows Signed-off-by: David Wood * noop README change Signed-off-by: David Wood * better ignore of .md on test transform workflows Signed-off-by: David Wood * noop readme change Signed-off-by: David Wood * noop test transform worklow 1 ignore Signed-off-by: David Wood * noop readme Signed-off-by: David Wood * split out the tests into test-kfp/lib/misc and remove test.yml, add readme Signed-off-by: David Wood * test-kfp only on kfp/** Signed-off-by: David Wood * noop code change to trigger build Signed-off-by: David Wood * comments in workflows Signed-off-by: David Wood * updated workflow readme Signed-off-by: David Wood * only run build-library workflow on data-processing-lib changes Signed-off-by: David Wood * try and ignore docs in build-library, test-kfp/lib Signed-off-by: David Wood * workflow title changes for consistency Signed-off-by: David Wood * test change on filter source Signed-off-by: David Wood * change to lib readme Signed-off-by: David Wood * change to lib source Signed-off-by: David Wood * minor job name changes in transform workflows Signed-off-by: David Wood * noop readme Signed-off-by: David Wood * test-lib workflow ignores Signed-off-by: David Wood * top level readme Signed-off-by: David Wood * noop test source Signed-off-by: David Wood * filter source change' Signed-off-by: David Wood * updated all transform tets workflows Signed-off-by: David Wood * fix typo in test template on check_images Signed-off-by: David Wood * noop src change Signed-off-by: David Wood * check for makefile in test transform workflow Signed-off-by: David Wood * automatically determine transforms in transforms directory for which to generate test workflows Signed-off-by: David Wood * worklow readme, transform existence verification, disable build-library, and tools tests Signed-off-by: David Wood * workflow readme details on kfp and misc tests Signed-off-by: David Wood * backing out change to dpk lib code Signed-off-by: David Wood * restore filter code Signed-off-by: David Wood * restore noop code Signed-off-by: David Wood * workflow readme Signed-off-by: David Wood * really restore noop code Signed-off-by: David Wood * check for makefile in transform test-src testing Signed-off-by: David Wood * don't include lib test dependencies in transform test workflows Signed-off-by: David Wood * noop code change Signed-off-by: David Wood * disable noop, don't include lib test-data in transform dependencies Signed-off-by: David Wood * use job.id.if on Makefile to enable transform test job Signed-off-by: David Wood * use job.id.if on Makefile to enable transform test job Signed-off-by: David Wood * restore noop Makefile Signed-off-by: David Wood * exclude kfp_ray from transfor test workflow and change noop code Signed-off-by: David Wood * remove if: from test workflows Signed-off-by: David Wood * backout noop code change Signed-off-by: David Wood * backout noop code change Signed-off-by: David Wood * only build spark image for transform image tests that need it Signed-off-by: David Wood * header_cleanser code change Signed-off-by: David Wood * only build spark image for transform image tests that need it Signed-off-by: David Wood * update workflows to trigger on .make.* changes Signed-off-by: David Wood * blank 
line added to .make.versions to test new workflows Signed-off-by: David Wood * remove change to .make.versions Signed-off-by: David Wood * remove test-universal-html2parquet.yml since it is now moved to language Signed-off-by: David Wood * mv kfp blacklist definition to check-workflows.sh script and have it check for kfp workflows Signed-off-by: David Wood * license_select_wf.py comment to trigger kfp tests here Signed-off-by: David Wood * have transform top level makefile check for kfp_ray directory before recursing into Signed-off-by: David Wood * fix doc_quality Makefile kfp rules Signed-off-by: David Wood * fix tabbing in recent Makefile updates Signed-off-by: David Wood * fix kfp workflows to only build the target transform Signed-off-by: David Wood * switch workflows from ubuntu-22.04 to ubuntu-latest to try and avoid excessive job queuing Signed-off-by: David Wood * remove non-change from license_select_wf.py to trigger new ci/cd Signed-off-by: David Wood * one more fix to kfp transform workflows to not workflow-build in all transforms Signed-off-by: David Wood * fix repo_level_ordering/ray/Makefile to support kind/kfp testing targets Signed-off-by: David Wood * change kfp test workflows to not test if transform's Makefile or kfp_ray dir is not present Signed-off-by: David Wood * set cancel-in-progress=true in workflows Signed-off-by: David Wood --------- Signed-off-by: David Wood --- .github/workflows/Makefile | 15 +- .github/workflows/deploy-docs.yml | 2 +- .github/workflows/deploy-library.yml | 6 +- .github/workflows/deploy-transforms.yml | 4 +- .../workflows/test-code-code2parquet-kfp.yml | 106 ++++++++------ .github/workflows/test-code-code2parquet.yml | 15 +- .../workflows/test-code-code_quality-kfp.yml | 106 ++++++++------ .github/workflows/test-code-code_quality.yml | 15 +- .../test-code-header_cleanser-kfp.yml | 106 ++++++++------ .../workflows/test-code-header_cleanser.yml | 15 +- .../test-code-license_select-kfp.yml | 130 ++++++++++++++++++ .../workflows/test-code-license_select.yml | 15 +- .github/workflows/test-code-malware-kfp.yml | 106 ++++++++------ .github/workflows/test-code-malware.yml | 15 +- .../test-code-proglang_select-kfp.yml | 106 ++++++++------ .../workflows/test-code-proglang_select.yml | 15 +- .../test-code-repo_level_ordering-kfp.yml | 106 ++++++++------ .../test-code-repo_level_ordering.yml | 15 +- .github/workflows/test-kfp-transform.template | 106 ++++++++------ .github/workflows/test-kfp.yml | 13 +- .github/workflows/test-language-doc_chunk.yml | 15 +- .../test-language-doc_quality-kfp.yml | 106 ++++++++------ .../workflows/test-language-doc_quality.yml | 15 +- .../workflows/test-language-html2parquet.yml | 15 +- .../workflows/test-language-lang_id-kfp.yml | 106 ++++++++------ .github/workflows/test-language-lang_id.yml | 15 +- .../workflows/test-language-pdf2parquet.yml | 15 +- .../workflows/test-language-pii_redactor.yml | 15 +- .../test-language-text_encoder-kfp.yml | 106 ++++++++------ .../workflows/test-language-text_encoder.yml | 15 +- .github/workflows/test-lib.yml | 15 +- .github/workflows/test-misc.yml | 4 +- .github/workflows/test-packaging-python.yml | 2 +- .github/workflows/test-packaging-ray.yml | 2 +- .github/workflows/test-transform.template | 15 +- .../workflows/test-universal-doc_id-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-doc_id.yml | 15 +- .../workflows/test-universal-ededup-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-ededup.yml | 15 +- .../workflows/test-universal-fdedup-kfp.yml | 106 
++++++++------ .github/workflows/test-universal-fdedup.yml | 15 +- .../workflows/test-universal-filter-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-filter.yml | 15 +- .github/workflows/test-universal-hap.yml | 15 +- .github/workflows/test-universal-noop-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-noop.yml | 15 +- .../workflows/test-universal-profiler-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-profiler.yml | 15 +- .../workflows/test-universal-resize-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-resize.yml | 15 +- .../test-universal-tokenization-kfp.yml | 106 ++++++++------ .../workflows/test-universal-tokenization.yml | 15 +- .github/workflows/workflow-manual-run.yml | 2 +- scripts/check-workflows.sh | 50 +++++-- transforms/code/code2parquet/Makefile | 19 ++- transforms/code/code_quality/Makefile | 19 ++- transforms/code/header_cleanser/Makefile | 19 ++- transforms/code/license_select/Makefile | 19 ++- transforms/code/malware/Makefile | 19 ++- transforms/code/proglang_select/Makefile | 19 ++- transforms/code/repo_level_ordering/Makefile | 16 ++- .../code/repo_level_ordering/ray/Makefile | 7 + transforms/language/doc_chunk/Makefile | 19 ++- transforms/language/doc_quality/Makefile | 19 ++- transforms/language/html2parquet/Makefile | 15 +- transforms/language/lang_id/Makefile | 19 ++- transforms/language/pdf2parquet/Makefile | 19 ++- .../language/pii_redactor/Makefile.disable | 19 ++- transforms/language/text_encoder/Makefile | 19 ++- transforms/universal/doc_id/Makefile | 19 ++- transforms/universal/ededup/Makefile | 19 ++- transforms/universal/fdedup/Makefile | 19 ++- transforms/universal/filter/Makefile | 19 ++- transforms/universal/hap/Makefile | 12 ++ transforms/universal/noop/Makefile | 19 ++- transforms/universal/profiler/Makefile | 19 ++- transforms/universal/resize/Makefile | 19 ++- transforms/universal/tokenization/Makefile | 19 ++- 78 files changed, 1924 insertions(+), 1019 deletions(-) create mode 100644 .github/workflows/test-code-license_select-kfp.yml diff --git a/.github/workflows/Makefile b/.github/workflows/Makefile index 275fd4688..751b3201f 100644 --- a/.github/workflows/Makefile +++ b/.github/workflows/Makefile @@ -8,7 +8,6 @@ LANG_TRANSFORMS=doc_chunk doc_quality lang_id pdf2parquet pii_redactor text_enco # A list that holds transforms that should not be tested with KFP -KFP_BLACK_LIST="doc_chunk,pdf2parquet,pii_redactor" transform-tests: $(MAKE) TRANSFORM_SUBDIR=universal .transform-tests @@ -29,13 +28,19 @@ transform-tests: done .transform-kfp-tests: - @for i in $$(find ../../transforms/$(TRANSFORM_SUBDIR) -mindepth 1 -maxdepth 1 -type d); do \ + @KFP_BLACK_LIST=$$(cd ../..; bash scripts/check-workflows.sh -show-kfp-black-list); \ + for i in $$(find ../../transforms/$(TRANSFORM_SUBDIR) -mindepth 1 -maxdepth 1 -type d); do \ dir=$$(basename $$i); \ - z=$$(echo ${KFP_BLACK_LIST} | grep -v $$dir); \ - if [ ! -d ../../transforms/$(TRANSFORM_SUBDIR)/$$dir/kfp_ray ] || [ -z "$$z" ]; then \ + yml=test-$(TRANSFORM_SUBDIR)-$$dir-kfp.yml; \ + if [ ! -d ../../transforms/$(TRANSFORM_SUBDIR)/$$dir/kfp_ray ]; then \ + echo No kfp_ray directory for $$dir. Skipping generation of $$yml; \ + continue; \ + fi; \ + z=$$(echo $${KFP_BLACK_LIST} | grep $$dir); \ + if [ ! -z "$$z" ]; then \ + echo $$dir is black listed. 
Skipping generation of $$yml; \ continue; \ fi; \ - yml=test-$(TRANSFORM_SUBDIR)-$$dir-kfp.yml; \ echo Generating $$yml; \ cat test-kfp-transform.template | sed -e "s?@TARGET_TRANSFORM_DIR@?transforms/$${TRANSFORM_SUBDIR}/$$dir?g" > $$yml; \ done diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index a2909c55d..09678e937 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -8,7 +8,7 @@ on: - "releases/**" jobs: deploy: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest env: REPO_URL: "https://github.com/${{ github.repository }}" REPO_BRANCH: "dev" diff --git a/.github/workflows/deploy-library.yml b/.github/workflows/deploy-library.yml index 8ec97ed9e..0c2473175 100644 --- a/.github/workflows/deploy-library.yml +++ b/.github/workflows/deploy-library.yml @@ -14,7 +14,7 @@ permissions: jobs: build-package: name: Build Ray data processing libraries - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -30,7 +30,7 @@ jobs: name: Publish packages to test.pypi.org # disabled if: false - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: build-package steps: @@ -47,7 +47,7 @@ jobs: publish-pypi: name: Publish release to pypi.org - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: build-package # disabled as of now if: false diff --git a/.github/workflows/deploy-transforms.yml b/.github/workflows/deploy-transforms.yml index 7fe5c8b4d..0f002187d 100644 --- a/.github/workflows/deploy-transforms.yml +++ b/.github/workflows/deploy-transforms.yml @@ -9,7 +9,7 @@ on: jobs: build-images: name: Build and check images - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -23,7 +23,7 @@ jobs: name: Publish packages to quay.io # disabled if: false - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: build-images steps: diff --git a/.github/workflows/test-code-code2parquet-kfp.yml b/.github/workflows/test-code-code2parquet-kfp.yml index 41f58e0cb..6de24d4b0 100644 --- a/.github/workflows/test-code-code2parquet-kfp.yml +++ b/.github/workflows/test-code-code2parquet-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/code2parquet/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/code2parquet/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh 
https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/code2parquet workflow-test - echo "Run transforms/code/code2parquet completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/code2parquet workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/code2parquet workflow-test + echo "Run transforms/code/code2parquet completed" + else + echo "Skipping transforms/code/code2parquet kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/code2parquet workflow-test - header_text "Run transforms/code/code2parquet completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 
/tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/code2parquet workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/code2parquet workflow-test + echo "Run transforms/code/code2parquet completed" + else + echo "Skipping transforms/code/code2parquet kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-code2parquet.yml b/.github/workflows/test-code-code2parquet.yml index f8f1654e7..3f83e9856 100644 --- a/.github/workflows/test-code-code2parquet.yml +++ b/.github/workflows/test-code-code2parquet.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/code2parquet/**" - "data-processing-lib/**" - "!transforms/code/code2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/code2parquet/**" - "data-processing-lib/**" - "!transforms/code/code2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-code_quality-kfp.yml b/.github/workflows/test-code-code_quality-kfp.yml index 21fa63296..2e22c04a9 100644 --- a/.github/workflows/test-code-code_quality-kfp.yml +++ b/.github/workflows/test-code-code_quality-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/code_quality/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/code_quality/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/code_quality workflow-test - echo "Run transforms/code/code_quality completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs 
-o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/code_quality workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/code_quality workflow-test + echo "Run transforms/code/code_quality completed" + else + echo "Skipping transforms/code/code_quality kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/code_quality workflow-test - header_text "Run transforms/code/code_quality completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/code_quality workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/code_quality workflow-test + echo "Run transforms/code/code_quality completed" + else + echo "Skipping transforms/code/code_quality kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-code_quality.yml b/.github/workflows/test-code-code_quality.yml index d53c81c61..5a901edbb 100644 --- a/.github/workflows/test-code-code_quality.yml +++ b/.github/workflows/test-code-code_quality.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/code_quality/**" - "data-processing-lib/**" - "!transforms/code/code_quality/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/code_quality/**" - "data-processing-lib/**" - "!transforms/code/code_quality/**/kfp_ray/**" # 
This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-header_cleanser-kfp.yml b/.github/workflows/test-code-header_cleanser-kfp.yml index 25f54b528..6cc4727aa 100644 --- a/.github/workflows/test-code-header_cleanser-kfp.yml +++ b/.github/workflows/test-code-header_cleanser-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/header_cleanser/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/header_cleanser/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/header_cleanser workflow-test - echo "Run transforms/code/header_cleanser completed" + if [ -e 
"@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/header_cleanser workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/header_cleanser workflow-test + echo "Run transforms/code/header_cleanser completed" + else + echo "Skipping transforms/code/header_cleanser kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/header_cleanser workflow-test - header_text "Run transforms/code/header_cleanser completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/header_cleanser workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/header_cleanser workflow-test + echo "Run transforms/code/header_cleanser completed" + else + echo "Skipping 
transforms/code/header_cleanser kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-header_cleanser.yml b/.github/workflows/test-code-header_cleanser.yml index 1834f4983..05f09a8c5 100644 --- a/.github/workflows/test-code-header_cleanser.yml +++ b/.github/workflows/test-code-header_cleanser.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/header_cleanser/**" - "data-processing-lib/**" - "!transforms/code/header_cleanser/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/header_cleanser/**" - "data-processing-lib/**" - "!transforms/code/header_cleanser/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-license_select-kfp.yml b/.github/workflows/test-code-license_select-kfp.yml new file mode 100644 index 000000000..94d662d1d --- /dev/null +++ b/.github/workflows/test-code-license_select-kfp.yml @@ -0,0 +1,130 @@ +# +# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files +# +name: Test KFP - transforms/code/license_select + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - ".make.*" + - "transforms/.make.workflow" + - "transforms/code/license_select/**" + - "!kfp/**" # This is tested in separate workflow + - "!data-processing-lib/**" # This is tested in separate workflow + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - ".make.*" + - "transforms/.make.workflow" + - "transforms/code/license_select/**" + - "!data-processing-lib/**" # This is tested in separate workflow + - "!kfp/**" # This is tested in separate workflow + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + test-kfp-v1: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: 
actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test KFP libs (shared and v1) and run a workflow + timeout-minutes: 120 + run: | + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/license_select workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/license_select workflow-test + echo "Run transforms/code/license_select completed" + else + echo "Skipping transforms/code/license_select kfp test for lack of Makefile and/or kfp_ray" + fi + + test-kfp-v2: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test KFP libs (shared and v2) and run a workflow + timeout-minutes: 120 + run: | + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/license_select workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/license_select workflow-test + echo "Run transforms/code/license_select completed" + else + echo "Skipping 
transforms/code/license_select kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-license_select.yml b/.github/workflows/test-code-license_select.yml index ab24b582b..59592c82f 100644 --- a/.github/workflows/test-code-license_select.yml +++ b/.github/workflows/test-code-license_select.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/license_select/**" - "data-processing-lib/**" - "!transforms/code/license_select/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/license_select/**" - "data-processing-lib/**" - "!transforms/code/license_select/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-malware-kfp.yml b/.github/workflows/test-code-malware-kfp.yml index 9bd937f46..2c9e3186c 100644 --- a/.github/workflows/test-code-malware-kfp.yml +++ b/.github/workflows/test-code-malware-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/malware/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/malware/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind 
- curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/malware workflow-test - echo "Run transforms/code/malware completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/malware workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/malware workflow-test + echo "Run transforms/code/malware completed" + else + echo "Skipping transforms/code/malware kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/malware workflow-test - header_text "Run transforms/code/malware completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 
/tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/malware workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/malware workflow-test + echo "Run transforms/code/malware completed" + else + echo "Skipping transforms/code/malware kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-malware.yml b/.github/workflows/test-code-malware.yml index debc779d1..44196c62c 100644 --- a/.github/workflows/test-code-malware.yml +++ b/.github/workflows/test-code-malware.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/malware/**" - "data-processing-lib/**" - "!transforms/code/malware/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/malware/**" - "data-processing-lib/**" - "!transforms/code/malware/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-proglang_select-kfp.yml b/.github/workflows/test-code-proglang_select-kfp.yml index bbe257964..c23e0f1ff 100644 --- a/.github/workflows/test-code-proglang_select-kfp.yml +++ b/.github/workflows/test-code-proglang_select-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/proglang_select/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/proglang_select/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/proglang_select workflow-test - echo "Run transforms/code/proglang_select completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl 
https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/proglang_select workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/proglang_select workflow-test + echo "Run transforms/code/proglang_select completed" + else + echo "Skipping transforms/code/proglang_select kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/proglang_select workflow-test - header_text "Run transforms/code/proglang_select completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/proglang_select workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/proglang_select workflow-test + echo "Run transforms/code/proglang_select completed" + else + echo "Skipping transforms/code/proglang_select kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-proglang_select.yml b/.github/workflows/test-code-proglang_select.yml index 36bf6a869..4723e5d3a 100644 --- a/.github/workflows/test-code-proglang_select.yml +++ b/.github/workflows/test-code-proglang_select.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/proglang_select/**" - "data-processing-lib/**" - "!transforms/code/proglang_select/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - 
"transforms/code/proglang_select/**" - "data-processing-lib/**" - "!transforms/code/proglang_select/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-repo_level_ordering-kfp.yml b/.github/workflows/test-code-repo_level_ordering-kfp.yml index c26ecda52..57b39f313 100644 --- a/.github/workflows/test-code-repo_level_ordering-kfp.yml +++ b/.github/workflows/test-code-repo_level_ordering-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/repo_level_ordering/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/repo_level_ordering/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source 
$K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/repo_level_ordering workflow-test - echo "Run transforms/code/repo_level_ordering completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/repo_level_ordering workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/repo_level_ordering workflow-test + echo "Run transforms/code/repo_level_ordering completed" + else + echo "Skipping transforms/code/repo_level_ordering kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/repo_level_ordering workflow-test - header_text "Run transforms/code/repo_level_ordering completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/repo_level_ordering 
workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/repo_level_ordering workflow-test + echo "Run transforms/code/repo_level_ordering completed" + else + echo "Skipping transforms/code/repo_level_ordering kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-repo_level_ordering.yml b/.github/workflows/test-code-repo_level_ordering.yml index fe0ee23bb..19ec8daf5 100644 --- a/.github/workflows/test-code-repo_level_ordering.yml +++ b/.github/workflows/test-code-repo_level_ordering.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/repo_level_ordering/**" - "data-processing-lib/**" - "!transforms/code/repo_level_ordering/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/repo_level_ordering/**" - "data-processing-lib/**" - "!transforms/code/repo_level_ordering/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-kfp-transform.template b/.github/workflows/test-kfp-transform.template index 434a57238..1003ba643 100644 --- a/.github/workflows/test-kfp-transform.template +++ b/.github/workflows/test-kfp-transform.template @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "@TARGET_TRANSFORM_DIR@/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "@TARGET_TRANSFORM_DIR@/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - 
export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C @TARGET_TRANSFORM_DIR@ workflow-test - echo "Run @TARGET_TRANSFORM_DIR@ completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C @TARGET_TRANSFORM_DIR@ workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C @TARGET_TRANSFORM_DIR@ workflow-test + echo "Run @TARGET_TRANSFORM_DIR@ completed" + else + echo "Skipping @TARGET_TRANSFORM_DIR@ kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C @TARGET_TRANSFORM_DIR@ workflow-test - header_text "Run @TARGET_TRANSFORM_DIR@ completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export 
PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C @TARGET_TRANSFORM_DIR@ workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C @TARGET_TRANSFORM_DIR@ workflow-test + echo "Run @TARGET_TRANSFORM_DIR@ completed" + else + echo "Skipping @TARGET_TRANSFORM_DIR@ kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-kfp.yml b/.github/workflows/test-kfp.yml index 6719c322e..01deebcfa 100644 --- a/.github/workflows/test-kfp.yml +++ b/.github/workflows/test-kfp.yml @@ -42,6 +42,11 @@ on: - "!**/images/**" - "!**/.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: KFP_BLACK_LIST: "doc_chunk-ray,pdf2parquet-ray,pii_redactor" @@ -51,7 +56,7 @@ jobs: # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -68,7 +73,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -117,7 +122,7 @@ jobs: echo "Run ${transforms[$index]} completed" test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -167,7 +172,7 @@ jobs: header_text "Run ${transforms[$index]} completed" build-kfp-components: needs: [check_if_push_images] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 30 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-doc_chunk.yml b/.github/workflows/test-language-doc_chunk.yml index fa3ea58ca..ec78512e5 100644 --- a/.github/workflows/test-language-doc_chunk.yml +++ b/.github/workflows/test-language-doc_chunk.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/doc_chunk/**" - "data-processing-lib/**" - "!transforms/language/doc_chunk/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/doc_chunk/**" - "data-processing-lib/**" - "!transforms/language/doc_chunk/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-doc_quality-kfp.yml b/.github/workflows/test-language-doc_quality-kfp.yml index e9f678595..1c5237b20 100644 --- a/.github/workflows/test-language-doc_quality-kfp.yml +++ b/.github/workflows/test-language-doc_quality-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/doc_quality/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/doc_quality/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/doc_quality workflow-test - echo "Run transforms/language/doc_quality completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl 
https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/doc_quality workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/doc_quality workflow-test + echo "Run transforms/language/doc_quality completed" + else + echo "Skipping transforms/language/doc_quality kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/doc_quality workflow-test - header_text "Run transforms/language/doc_quality completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/doc_quality workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/doc_quality workflow-test + echo "Run transforms/language/doc_quality completed" + else + echo "Skipping transforms/language/doc_quality kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-language-doc_quality.yml b/.github/workflows/test-language-doc_quality.yml index dde61e1fa..443c22152 100644 --- a/.github/workflows/test-language-doc_quality.yml +++ b/.github/workflows/test-language-doc_quality.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/doc_quality/**" - "data-processing-lib/**" - "!transforms/language/doc_quality/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - 
"transforms/language/doc_quality/**" - "data-processing-lib/**" - "!transforms/language/doc_quality/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-html2parquet.yml b/.github/workflows/test-language-html2parquet.yml index 8caf4efd9..e5ef8e510 100644 --- a/.github/workflows/test-language-html2parquet.yml +++ b/.github/workflows/test-language-html2parquet.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/html2parquet/**" - "data-processing-lib/**" - "!transforms/language/html2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/html2parquet/**" - "data-processing-lib/**" - "!transforms/language/html2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-lang_id-kfp.yml b/.github/workflows/test-language-lang_id-kfp.yml index cf3dec397..c6eb179b8 100644 --- a/.github/workflows/test-language-lang_id-kfp.yml +++ b/.github/workflows/test-language-lang_id-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/lang_id/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/lang_id/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/lang_id workflow-test - echo "Run transforms/language/lang_id completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o 
/tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/lang_id workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/lang_id workflow-test + echo "Run transforms/language/lang_id completed" + else + echo "Skipping transforms/language/lang_id kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/lang_id workflow-test - header_text "Run transforms/language/lang_id completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/lang_id workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/lang_id workflow-test + echo "Run transforms/language/lang_id completed" + else + echo "Skipping transforms/language/lang_id kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-language-lang_id.yml b/.github/workflows/test-language-lang_id.yml index 3b39358c9..7c318a3a1 100644 --- a/.github/workflows/test-language-lang_id.yml +++ b/.github/workflows/test-language-lang_id.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/lang_id/**" - "data-processing-lib/**" - "!transforms/language/lang_id/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/lang_id/**" - "data-processing-lib/**" - "!transforms/language/lang_id/**/kfp_ray/**" # This is/will be tested 
in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-pdf2parquet.yml b/.github/workflows/test-language-pdf2parquet.yml index bb523c57e..fbdd81b8e 100644 --- a/.github/workflows/test-language-pdf2parquet.yml +++ b/.github/workflows/test-language-pdf2parquet.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/pdf2parquet/**" - "data-processing-lib/**" - "!transforms/language/pdf2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/pdf2parquet/**" - "data-processing-lib/**" - "!transforms/language/pdf2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-pii_redactor.yml b/.github/workflows/test-language-pii_redactor.yml index 9656a2f24..5ecc80b08 100644 --- a/.github/workflows/test-language-pii_redactor.yml +++ b/.github/workflows/test-language-pii_redactor.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/pii_redactor/**" - "data-processing-lib/**" - "!transforms/language/pii_redactor/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/pii_redactor/**" - "data-processing-lib/**" - "!transforms/language/pii_redactor/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-text_encoder-kfp.yml b/.github/workflows/test-language-text_encoder-kfp.yml index d90b76820..8e238dfcf 100644 --- a/.github/workflows/test-language-text_encoder-kfp.yml +++ b/.github/workflows/test-language-text_encoder-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/text_encoder/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/text_encoder/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/text_encoder workflow-test - echo "Run transforms/language/text_encoder completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl 
https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/text_encoder workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/text_encoder workflow-test + echo "Run transforms/language/text_encoder completed" + else + echo "Skipping transforms/language/text_encoder kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/text_encoder workflow-test - header_text "Run transforms/language/text_encoder completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/text_encoder workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/text_encoder workflow-test + echo "Run transforms/language/text_encoder completed" + else + echo "Skipping transforms/language/text_encoder kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-language-text_encoder.yml b/.github/workflows/test-language-text_encoder.yml index f7622f8e0..d49c1193d 100644 --- a/.github/workflows/test-language-text_encoder.yml +++ b/.github/workflows/test-language-text_encoder.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/text_encoder/**" - "data-processing-lib/**" - "!transforms/language/text_encoder/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - 
"transforms/.make.transforms" - "transforms/language/text_encoder/**" - "data-processing-lib/**" - "!transforms/language/text_encoder/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-lib.yml b/.github/workflows/test-lib.yml index be00c2076..5a1cff872 100644 --- a/.github/workflows/test-lib.yml +++ b/.github/workflows/test-lib.yml @@ -25,13 +25,18 @@ on: - "!data-processing-lib/**/doc/**" - "!data-processing-lib/**/.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_images: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -48,7 +53,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-python-lib: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -56,7 +61,7 @@ jobs: run: | make -C data-processing-lib/python DOCKER=docker venv test test-ray-lib: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -64,7 +69,7 @@ jobs: run: | make -C data-processing-lib/ray DOCKER=docker venv test test-spark-lib: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -74,7 +79,7 @@ jobs: test-data-processing-lib-images: needs: [check_if_push_images] if: needs.check_if_push_images.outputs.publish_images == 'true' - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} diff --git a/.github/workflows/test-misc.yml b/.github/workflows/test-misc.yml index 2c601bbd5..62c1a187a 100644 --- a/.github/workflows/test-misc.yml +++ b/.github/workflows/test-misc.yml @@ -29,7 +29,7 @@ on: jobs: test-make: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -37,7 +37,7 @@ jobs: run: | make -n clean test build publish set-versions check-transform-test-workflows: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/test-packaging-python.yml b/.github/workflows/test-packaging-python.yml index 4ee491c8e..e88eeeae2 100644 --- a/.github/workflows/test-packaging-python.yml +++ b/.github/workflows/test-packaging-python.yml @@ -27,7 +27,7 @@ on: jobs: test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/test-packaging-ray.yml b/.github/workflows/test-packaging-ray.yml index 4b812540c..9dbce3110 100644 --- a/.github/workflows/test-packaging-ray.yml +++ b/.github/workflows/test-packaging-ray.yml @@ -27,7 +27,7 @@ on: jobs: test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/test-transform.template b/.github/workflows/test-transform.template index e0966717e..f3907d56a 100644 --- a/.github/workflows/test-transform.template +++ b/.github/workflows/test-transform.template @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "@TARGET_TRANSFORM_DIR@/**" - "data-processing-lib/**" - "!@TARGET_TRANSFORM_DIR@/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "@TARGET_TRANSFORM_DIR@/**" - "data-processing-lib/**" - "!@TARGET_TRANSFORM_DIR@/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. 
# The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-doc_id-kfp.yml b/.github/workflows/test-universal-doc_id-kfp.yml index 28c1d8717..8ed1df919 100644 --- a/.github/workflows/test-universal-doc_id-kfp.yml +++ b/.github/workflows/test-universal-doc_id-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/doc_id/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/doc_id/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/doc_id workflow-test - echo "Run transforms/universal/doc_id completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L 
https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/doc_id workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/doc_id workflow-test + echo "Run transforms/universal/doc_id completed" + else + echo "Skipping transforms/universal/doc_id kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/doc_id workflow-test - header_text "Run transforms/universal/doc_id completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/doc_id workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/doc_id workflow-test + echo "Run transforms/universal/doc_id completed" + else + echo "Skipping transforms/universal/doc_id kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 66d0283ca..d314f3b25 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/doc_id/**" - "data-processing-lib/**" - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" 
paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/doc_id/**" - "data-processing-lib/**" - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-ededup-kfp.yml b/.github/workflows/test-universal-ededup-kfp.yml index 5d3481e30..93408a260 100644 --- a/.github/workflows/test-universal-ededup-kfp.yml +++ b/.github/workflows/test-universal-ededup-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/ededup/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/ededup/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source 
$K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/ededup workflow-test - echo "Run transforms/universal/ededup completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/ededup workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/ededup workflow-test + echo "Run transforms/universal/ededup completed" + else + echo "Skipping transforms/universal/ededup kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/ededup workflow-test - header_text "Run transforms/universal/ededup completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/ededup workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C 
transforms/universal/ededup workflow-test + echo "Run transforms/universal/ededup completed" + else + echo "Skipping transforms/universal/ededup kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-ededup.yml b/.github/workflows/test-universal-ededup.yml index 225c27cc3..8b4034570 100644 --- a/.github/workflows/test-universal-ededup.yml +++ b/.github/workflows/test-universal-ededup.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/ededup/**" - "data-processing-lib/**" - "!transforms/universal/ededup/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/ededup/**" - "data-processing-lib/**" - "!transforms/universal/ededup/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-fdedup-kfp.yml b/.github/workflows/test-universal-fdedup-kfp.yml index 9d331ed47..27dfc5493 100644 --- a/.github/workflows/test-universal-fdedup-kfp.yml +++ b/.github/workflows/test-universal-fdedup-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/fdedup/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/fdedup/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export 
PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/fdedup workflow-test - echo "Run transforms/universal/fdedup completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/fdedup workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/fdedup workflow-test + echo "Run transforms/universal/fdedup completed" + else + echo "Skipping transforms/universal/fdedup kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/fdedup workflow-test - header_text "Run transforms/universal/fdedup completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind 
https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/fdedup workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/fdedup workflow-test + echo "Run transforms/universal/fdedup completed" + else + echo "Skipping transforms/universal/fdedup kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-fdedup.yml b/.github/workflows/test-universal-fdedup.yml index 356736fca..5f68d4799 100644 --- a/.github/workflows/test-universal-fdedup.yml +++ b/.github/workflows/test-universal-fdedup.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/fdedup/**" - "data-processing-lib/**" - "!transforms/universal/fdedup/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/fdedup/**" - "data-processing-lib/**" - "!transforms/universal/fdedup/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-filter-kfp.yml b/.github/workflows/test-universal-filter-kfp.yml index 59ebbde3c..bd2f57229 100644 --- a/.github/workflows/test-universal-filter-kfp.yml +++ b/.github/workflows/test-universal-filter-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/filter/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/filter/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/filter workflow-test - echo "Run transforms/universal/filter completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o 
/tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/filter workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/filter workflow-test + echo "Run transforms/universal/filter completed" + else + echo "Skipping transforms/universal/filter kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/filter workflow-test - header_text "Run transforms/universal/filter completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/filter workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/filter workflow-test + echo "Run transforms/universal/filter completed" + else + echo "Skipping transforms/universal/filter kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-filter.yml b/.github/workflows/test-universal-filter.yml index 44858feff..43e936166 100644 --- a/.github/workflows/test-universal-filter.yml +++ b/.github/workflows/test-universal-filter.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/filter/**" - "data-processing-lib/**" - "!transforms/universal/filter/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/filter/**" - "data-processing-lib/**" - "!transforms/universal/filter/**/kfp_ray/**" # This is/will be tested 
in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-hap.yml b/.github/workflows/test-universal-hap.yml index b92e5867c..c845506c1 100644 --- a/.github/workflows/test-universal-hap.yml +++ b/.github/workflows/test-universal-hap.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/hap/**" - "data-processing-lib/**" - "!transforms/universal/hap/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/hap/**" - "data-processing-lib/**" - "!transforms/universal/hap/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-noop-kfp.yml b/.github/workflows/test-universal-noop-kfp.yml index 19c62ab49..01b14e51b 100644 --- a/.github/workflows/test-universal-noop-kfp.yml +++ b/.github/workflows/test-universal-noop-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/noop/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/noop/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/noop workflow-test - echo "Run transforms/universal/noop completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x 
/tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/noop workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/noop workflow-test + echo "Run transforms/universal/noop completed" + else + echo "Skipping transforms/universal/noop kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/noop workflow-test - header_text "Run transforms/universal/noop completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/noop workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/noop workflow-test + echo "Run transforms/universal/noop completed" + else + echo "Skipping transforms/universal/noop kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-noop.yml b/.github/workflows/test-universal-noop.yml index cd72703d1..13e066d58 100644 --- a/.github/workflows/test-universal-noop.yml +++ b/.github/workflows/test-universal-noop.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/noop/**" - "data-processing-lib/**" - "!transforms/universal/noop/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/noop/**" - "data-processing-lib/**" - "!transforms/universal/noop/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - 
"!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-profiler-kfp.yml b/.github/workflows/test-universal-profiler-kfp.yml index 3d377922d..e30f7bafa 100644 --- a/.github/workflows/test-universal-profiler-kfp.yml +++ b/.github/workflows/test-universal-profiler-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/profiler/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/profiler/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/profiler workflow-test - echo "Run transforms/universal/profiler completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export 
REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/profiler workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/profiler workflow-test + echo "Run transforms/universal/profiler completed" + else + echo "Skipping transforms/universal/profiler kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/profiler workflow-test - header_text "Run transforms/universal/profiler completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/profiler workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/profiler workflow-test + echo "Run transforms/universal/profiler completed" + else + echo "Skipping transforms/universal/profiler kfp test for lack of Makefile and/or kfp_ray" + fi diff --git 
a/.github/workflows/test-universal-profiler.yml b/.github/workflows/test-universal-profiler.yml index 50cd8cd26..e018e0ed3 100644 --- a/.github/workflows/test-universal-profiler.yml +++ b/.github/workflows/test-universal-profiler.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/profiler/**" - "data-processing-lib/**" - "!transforms/universal/profiler/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/profiler/**" - "data-processing-lib/**" - "!transforms/universal/profiler/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-resize-kfp.yml b/.github/workflows/test-universal-resize-kfp.yml index fe7377178..630de3c05 100644 --- a/.github/workflows/test-universal-resize-kfp.yml +++ b/.github/workflows/test-universal-resize-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/resize/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/resize/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh 
https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/resize workflow-test - echo "Run transforms/universal/resize completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/resize workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/resize workflow-test + echo "Run transforms/universal/resize completed" + else + echo "Skipping transforms/universal/resize kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/resize workflow-test - header_text "Run transforms/universal/resize completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 
/tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/resize workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/resize workflow-test + echo "Run transforms/universal/resize completed" + else + echo "Skipping transforms/universal/resize kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-resize.yml b/.github/workflows/test-universal-resize.yml index 99e14b1b8..b3399e5ec 100644 --- a/.github/workflows/test-universal-resize.yml +++ b/.github/workflows/test-universal-resize.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/resize/**" - "data-processing-lib/**" - "!transforms/universal/resize/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/resize/**" - "data-processing-lib/**" - "!transforms/universal/resize/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-tokenization-kfp.yml b/.github/workflows/test-universal-tokenization-kfp.yml index f127db59b..ff13a444c 100644 --- a/.github/workflows/test-universal-tokenization-kfp.yml +++ b/.github/workflows/test-universal-tokenization-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/tokenization/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/tokenization/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/tokenization workflow-test - echo "Run transforms/universal/tokenization completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl 
https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/tokenization workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/tokenization workflow-test + echo "Run transforms/universal/tokenization completed" + else + echo "Skipping transforms/universal/tokenization kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/tokenization workflow-test - header_text "Run transforms/universal/tokenization completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/tokenization workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/tokenization workflow-test + echo "Run transforms/universal/tokenization completed" + else + echo "Skipping transforms/universal/tokenization kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-tokenization.yml b/.github/workflows/test-universal-tokenization.yml index e7a620882..ae547c396 100644 --- a/.github/workflows/test-universal-tokenization.yml +++ b/.github/workflows/test-universal-tokenization.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/tokenization/**" - "data-processing-lib/**" - "!transforms/universal/tokenization/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - 
"transforms/.make.transforms" - "transforms/universal/tokenization/**" - "data-processing-lib/**" - "!transforms/universal/tokenization/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/workflow-manual-run.yml b/.github/workflows/workflow-manual-run.yml index 3c0f37d47..f0f7028b6 100644 --- a/.github/workflows/workflow-manual-run.yml +++ b/.github/workflows/workflow-manual-run.yml @@ -22,7 +22,7 @@ jobs: KFPv2: ${{ github.event.inputs.kfp_v2 }} WORKFLOW_PATH: ${{ github.event.inputs.workflow-path }} DEBUG_MODE: ${{ github.event.inputs.debug }} - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 diff --git a/scripts/check-workflows.sh b/scripts/check-workflows.sh index 40f4e2615..afc73a886 100755 --- a/scripts/check-workflows.sh +++ b/scripts/check-workflows.sh @@ -1,20 +1,52 @@ #!/bin/bash -# Check that each transform in transforms// has a corresponding -# .github/workflows/test--.yml file. +usage() { +cat << EOF +Check that each transform in transforms// has a corresponding + .github/workflows/test--.yml file and, + .github/workflows/test---kfp.yml file if + there is a kfp_ray directory for the transform, and + the transform is not in the kfp black list. +Options: + -show-kfp-black-list: prints the space separate list of transform + directories (base names) and exits. + -help: show this message. +EOF +} + if [ ! -d transforms ]; then echo Please run this script from the top of the repository exit 1 fi +KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor" +while [ $# -ne 0 ]; do + case $1 in + -show-kfp-black-list) echo $KFP_BLACK_LIST; exit 0; + ;; + *help) usage; exit 0; + ;; + *) echo Unrecognized option $1. exit 1 + ;; + esac + shift; +done for i in $(find transforms -maxdepth 2 -mindepth 2 -type d | grep -v venv); do transform=$(basename $i) category=$(dirname $i) category=$(basename $category) - workflow=.github/workflows/test-$category-$transform.yml - if [ ! 
-e $workflow ]; then - echo Missing $workflow for transform $category/$transform - echo Fix this by running make in the .github/workflows directory - exit 1 + workflows=.github/workflows/test-$category-$transform.yml + is_blacklisted=$(echo $KFP_BLACK_LIST | grep $transform) + if [ -d $i/kfp_ray -a -z "$is_blacklisted" ]; then + workflows="$workflows .github/workflows/test-$category-$transform-kfp.yml" else - echo Verified existence of $workflow - fi + echo KFP workflow for $transform is not expected. + fi + for workflow in $workflows; do + if [ ! -e $workflow ]; then + echo Missing $workflow for transform $category/$transform + echo Fix this by running make in the .github/workflows directory + exit 1 + else + echo Verified existence of $workflow + fi + done done diff --git a/transforms/code/code2parquet/Makefile b/transforms/code/code2parquet/Makefile index bc4077099..027d29644 100644 --- a/transforms/code/code2parquet/Makefile +++ b/transforms/code/code2parquet/Makefile @@ -55,16 +55,25 @@ set-versions: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build \ No newline at end of file + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/code_quality/Makefile b/transforms/code/code_quality/Makefile index 204ea8856..bca6f7e85 100644 --- a/transforms/code/code_quality/Makefile +++ b/transforms/code/code_quality/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/header_cleanser/Makefile b/transforms/code/header_cleanser/Makefile index 204ea8856..bca6f7e85 100644 --- a/transforms/code/header_cleanser/Makefile +++ b/transforms/code/header_cleanser/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/license_select/Makefile b/transforms/code/license_select/Makefile index b19f5c963..04b1cc451 100644 --- a/transforms/code/license_select/Makefile +++ 
b/transforms/code/license_select/Makefile @@ -47,16 +47,25 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/malware/Makefile b/transforms/code/malware/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/code/malware/Makefile +++ b/transforms/code/malware/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/proglang_select/Makefile b/transforms/code/proglang_select/Makefile index 9c7c898e4..9e222ee79 100644 --- a/transforms/code/proglang_select/Makefile +++ b/transforms/code/proglang_select/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/repo_level_ordering/Makefile b/transforms/code/repo_level_ordering/Makefile index cebfb4848..04b1cc451 100644 --- a/transforms/code/repo_level_ordering/Makefile +++ b/transforms/code/repo_level_ordering/Makefile @@ -45,15 +45,27 @@ load-image:: @# Help: Recursively make $@ in all subdirs $(MAKE) RULE=$@ .recurse -# kfp implementation is not yet added, so below targets don't do anything. 
.PHONY: workflow-venv workflow-venv: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/repo_level_ordering/ray/Makefile b/transforms/code/repo_level_ordering/ray/Makefile index 771ed9240..83f8692de 100644 --- a/transforms/code/repo_level_ordering/ray/Makefile +++ b/transforms/code/repo_level_ordering/ray/Makefile @@ -50,3 +50,10 @@ run-s3-sample: .transforms.run-s3-ray-sample minio-start: .minio-start load-image:: .transforms.load-image + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image + +docker-save-image: .defaults.docker-save-image + diff --git a/transforms/language/doc_chunk/Makefile b/transforms/language/doc_chunk/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/language/doc_chunk/Makefile +++ b/transforms/language/doc_chunk/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/doc_quality/Makefile b/transforms/language/doc_quality/Makefile index 5cded280a..a3f1865be 100644 --- a/transforms/language/doc_quality/Makefile +++ b/transforms/language/doc_quality/Makefile @@ -47,16 +47,25 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build \ No newline at end of file + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/html2parquet/Makefile b/transforms/language/html2parquet/Makefile index 017eb23b4..bca6f7e85 100644 --- a/transforms/language/html2parquet/Makefile +++ b/transforms/language/html2parquet/Makefile @@ -55,12 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/lang_id/Makefile b/transforms/language/lang_id/Makefile index 2967ceb67..af4a86873 100644 --- 
a/transforms/language/lang_id/Makefile +++ b/transforms/language/lang_id/Makefile @@ -60,16 +60,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/pdf2parquet/Makefile b/transforms/language/pdf2parquet/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/language/pdf2parquet/Makefile +++ b/transforms/language/pdf2parquet/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/pii_redactor/Makefile.disable b/transforms/language/pii_redactor/Makefile.disable index f9f635f07..8764d0dc2 100644 --- a/transforms/language/pii_redactor/Makefile.disable +++ b/transforms/language/pii_redactor/Makefile.disable @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/text_encoder/Makefile b/transforms/language/text_encoder/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/language/text_encoder/Makefile +++ b/transforms/language/text_encoder/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile index 7ebb1b8e1..be26d3bf4 100644 --- a/transforms/universal/doc_id/Makefile +++ b/transforms/universal/doc_id/Makefile @@ -55,16 +55,25 @@ 
docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/ededup/Makefile b/transforms/universal/ededup/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/ededup/Makefile +++ b/transforms/universal/ededup/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/fdedup/Makefile b/transforms/universal/fdedup/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/fdedup/Makefile +++ b/transforms/universal/fdedup/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/filter/Makefile b/transforms/universal/filter/Makefile index 9c7c898e4..9e222ee79 100644 --- a/transforms/universal/filter/Makefile +++ b/transforms/universal/filter/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/hap/Makefile b/transforms/universal/hap/Makefile index 017eb23b4..cdb34d72e 100644 --- a/transforms/universal/hap/Makefile +++ b/transforms/universal/hap/Makefile @@ -55,12 +55,24 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: + if [ -e kfp_ray ]; then \ + $(MAKE) -C 
kfp_ray workflow-test; \ + fi .PHONY: workflow-upload workflow-upload: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi diff --git a/transforms/universal/noop/Makefile b/transforms/universal/noop/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/noop/Makefile +++ b/transforms/universal/noop/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/profiler/Makefile b/transforms/universal/profiler/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/profiler/Makefile +++ b/transforms/universal/profiler/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/resize/Makefile b/transforms/universal/resize/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/resize/Makefile +++ b/transforms/universal/resize/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/tokenization/Makefile b/transforms/universal/tokenization/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/tokenization/Makefile +++ b/transforms/universal/tokenization/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray 
workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi +