From b38831668eac0af1ce0e3e5d96710999d988b08e Mon Sep 17 00:00:00 2001 From: Sujee Maniyam Date: Wed, 2 Oct 2024 00:55:14 -0700 Subject: [PATCH 1/7] Updated RAG example for DPK version 0.2.1 Signed-off-by: Sujee Maniyam --- examples/notebooks/rag/README.md | 2 +- examples/notebooks/rag/my_config.py | 38 + .../rag/rag_1A_dpk_process_python.ipynb | 1775 +++++++++++++++ .../rag/rag_1A_dpk_process_ray.ipynb | 2026 ++++++++--------- .../rag/rag_1B_load_data_into_milvus.ipynb | 304 ++- .../notebooks/rag/rag_1C_vector_search.ipynb | 127 +- .../rag/rag_1D_query_llama_replicate.ipynb | 229 +- .../rag/rag_2A_llamaindex_process.ipynb | 195 +- .../rag/rag_2B_llamaindex_query.ipynb | 48 +- examples/notebooks/rag/requirements.txt | 11 +- .../notebooks/rag/setup-python-dev-env.md | 26 +- 11 files changed, 3182 insertions(+), 1599 deletions(-) create mode 100644 examples/notebooks/rag/my_config.py create mode 100644 examples/notebooks/rag/rag_1A_dpk_process_python.ipynb diff --git a/examples/notebooks/rag/README.md b/examples/notebooks/rag/README.md index 3ef82eba4..f4a3460a1 100644 --- a/examples/notebooks/rag/README.md +++ b/examples/notebooks/rag/README.md @@ -35,7 +35,7 @@ This code uses DPK to Here is the code: -- Python version: TODO +- Python version: [rag_1A_dpk_process_python.ipynb](rag_1A_dpk_process_python.ipynb) - Ray version: [rag_1A_dpk_process_ray.ipynb](rag_1A_dpk_process_ray.ipynb) diff --git a/examples/notebooks/rag/my_config.py b/examples/notebooks/rag/my_config.py new file mode 100644 index 000000000..ba9ea89fd --- /dev/null +++ b/examples/notebooks/rag/my_config.py @@ -0,0 +1,38 @@ +import os + + +## Configuration +class MyConfig: + pass + +MY_CONFIG = MyConfig () + +## Input Data - configure this to the folder we want to process +MY_CONFIG.INPUT_DATA_DIR = "input" +MY_CONFIG.OUTPUT_FOLDER = "output" +MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , "output_final") +### ------------------------------- + +### Milvus config +MY_CONFIG.DB_URI = './rag_1_dpk.db' # For embedded instance +MY_CONFIG.COLLECTION_NAME = 'dpk_papers' + + +## Embedding model +MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2' +MY_CONFIG.EMBEDDING_LENGTH = 384 + +## LLM Model +MY_CONFIG.LLM_MODEL = "meta/meta-llama-3-8b-instruct" + + + +## RAY CONFIGURATION +num_cpus_available = os.cpu_count() +# print (num_cpus_available) +# MY_CONFIG.RAY_NUM_CPUS = num_cpus_available // 2 ## use half the available cores for processing +MY_CONFIG.RAY_NUM_CPUS = 0.8 +# print (MY_CONFIG.RAY_NUM_CPUS) +MY_CONFIG.RAY_MEMORY_GB = 2 # GB +# MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3 +MY_CONFIG.RAY_RUNTIME_WORKERS = 2 \ No newline at end of file diff --git a/examples/notebooks/rag/rag_1A_dpk_process_python.ipynb b/examples/notebooks/rag/rag_1A_dpk_process_python.ipynb new file mode 100644 index 000000000..ae8b0836d --- /dev/null +++ b/examples/notebooks/rag/rag_1A_dpk_process_python.ipynb @@ -0,0 +1,1775 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "841e533d-ebb3-406d-9da7-b19e2c5f5866", + "metadata": {}, + "source": [ + "
\n", + "

Data Processing for RAG with Data Prep Kit (Python)

\n", + " \n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "id": "b15976e3", + "metadata": {}, + "source": [ + "## Before Running the notebook\n", + "\n", + "Please complete [setting up python dev environment](./setup-python-dev-env.md)" + ] + }, + { + "cell_type": "markdown", + "id": "053ecf08-5f62-4b99-9347-8a0955843d21", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "This notebook will process PDF documents as part of RAG pipeline\n", + "\n", + "![](media/rag-overview-2.png)\n", + "\n", + "This notebook will perform steps 1, 2 and 3 in RAG pipeline.\n", + "\n", + "Here are the processing steps:\n", + "\n", + "- **pdf2parquet** : Extract text from PDF and convert them into parquet files\n", + "- **Chunk documents**: Split the PDFs into 'meaningful sections' (paragraphs, sentences ..etc)\n", + "- **Doc_ID generation**: Each chunk is assigned a uniq id, based on content and hash\n", + "- **Exact Dedup**: Chunks with exact same content are filtered out\n", + "- **Text encoder**: Convert chunks into vectors using embedding models" + ] + }, + { + "cell_type": "markdown", + "id": "e8b10be1", + "metadata": {}, + "source": [ + "## Step-1: Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "33345487", + "metadata": {}, + "outputs": [], + "source": [ + "from my_config import MY_CONFIG" + ] + }, + { + "cell_type": "markdown", + "id": "facb3bbc", + "metadata": {}, + "source": [ + "## Step-2: Data\n", + "\n", + "We will use white papers about LLMs. \n", + "\n", + "- [Granite Code Models](https://arxiv.org/abs/2405.04324)\n", + "- [Attention is all you need](https://arxiv.org/abs/1706.03762)\n", + "\n", + "You can of course substite your own data below" + ] + }, + { + "cell_type": "markdown", + "id": "f1fe7c0c", + "metadata": {}, + "source": [ + "### 2.1 - Download data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8739b7a2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Local file 'input/attension.pdf' (2.22 MB) already exists. Skipping download.\n", + "Local file 'input/granite.pdf' (1.27 MB) already exists. 
Skipping download.\n" + ] + } + ], + "source": [ + "import os, sys\n", + "import shutil\n", + "from utils import download_file\n", + "\n", + "## Download the data files\n", + "shutil.os.makedirs(MY_CONFIG.INPUT_DATA_DIR, exist_ok=True)\n", + "\n", + "download_file (url = 'https://arxiv.org/pdf/1706.03762', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'attension.pdf' ))\n", + "\n", + "download_file (url = 'https://arxiv.org/pdf/2405.04324', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'granite.pdf' ))\n" + ] + }, + { + "cell_type": "markdown", + "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", + "metadata": {}, + "source": [ + "### 2.2 - Set input/output path variables for the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "60ac8bee-0960-4309-b225-d7a211b14262", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Cleared output directory\n" + ] + } + ], + "source": [ + "import os, sys\n", + "import shutil\n", + "\n", + "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", + " raise Exception (f\"❌ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", + "\n", + "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", + "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", + "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", + "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_embeddings_out')\n", + "\n", + "## clear output folder\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", + "shutil.os.makedirs(MY_CONFIG.OUTPUT_FOLDER, exist_ok=True)\n", + "\n", + "print (\"✅ Cleared output directory\")" + ] + }, + { + "cell_type": "markdown", + "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", + "metadata": {}, + "source": [ + "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", + "\n", + "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", + "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", + "metadata": {}, + "source": [ + "### 3.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "482605b2-d814-456d-9195-49a2ec454ef0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-1: Processing input='input' --> output='output/01_parquet_out'\n" + ] + } + ], + "source": [ + "STAGE = 1 \n", + "\n", + "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", + "output_folder = output_parquet_dir\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", + "metadata": {}, + "source": [ + "### 3.2 - Execute " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:23:40 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", + "00:23:40 INFO - pipeline id 
pipeline_id\n", + "00:23:40 INFO - code location None\n", + "00:23:40 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_parquet_out\n", + "00:23:40 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:23:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "00:23:40 INFO - orchestrator pdf2parquet started at 2024-10-02 00:23:40\n", + "00:23:40 INFO - Number of files is 2, source profile {'max_file_size': 2.112621307373047, 'min_file_size': 1.2146415710449219, 'total_file_size': 3.3272628784179688}\n", + "00:23:40 INFO - Initializing models\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bd58971a33d4410c91e742e735a6e6e3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 10 files: 0%| | 0/10 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filename  contents  num_pages  num_tables  num_doc_elements  document_id  ext  hash  size  date_acquired  pdf_convert_time  source_filename
0granite.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...28173484a32ba4c-8fdb-4eeb-a06b-d28493efe8e3pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf
1attension.pdf{\"_name\":\"\",\"type\":\"pdf-document\",\"description...154193f275d75a-a072-4836-8a55-6a65f0d34577pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdf
\n", + "" + ], + "text/plain": [ + " filename contents \\\n", + "0 granite.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... \n", + "1 attension.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... \n", + "\n", + " num_pages num_tables num_doc_elements \\\n", + "0 28 17 348 \n", + "1 15 4 193 \n", + "\n", + " document_id ext \\\n", + "0 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 pdf \n", + "1 f275d75a-a072-4836-8a55-6a65f0d34577 pdf \n", + "\n", + " hash size \\\n", + "0 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "1 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "\n", + " date_acquired pdf_convert_time source_filename \n", + "0 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "1 2024-10-02T00:24:14.713654 18.004455 attension.pdf " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Output dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.head(5)\n", + "\n", + "## To display certain columns\n", + "#parquet_df[['column1', 'column2', 'column3']].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "72274586", + "metadata": {}, + "source": [ + "## Step-4: Doc chunks\n", + "\n", + "Split the documents in chunks, according to their layout segmentation." + ] + }, + { + "cell_type": "markdown", + "id": "96198fa6", + "metadata": {}, + "source": [ + "### 4.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "305f00a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-2: Processing input='output/01_parquet_out' --> output='output/02_chunk_out'\n" + ] + } + ], + "source": [ + "STAGE = 2\n", + "\n", + "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_chunk_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "369f2cd1", + "metadata": {}, + "source": [ + "### 4.2 - Execute " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5b7b18d5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:24:50 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", + "00:24:50 INFO - pipeline id pipeline_id\n", + "00:24:50 INFO - code location None\n", + "00:24:50 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", + "00:24:50 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:24:50 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:24:50 INFO - orchestrator doc_chunk started at 2024-10-02 00:24:50\n", + "00:24:50 INFO - Number of files is 2, source profile {'max_file_size': 
0.12735748291015625, 'min_file_size': 0.035338401794433594, 'total_file_size': 0.16269588470458984}\n", + "00:24:50 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "00:24:50 INFO - Completed 2 files (100.0%) in 0.004 min\n", + "00:24:50 INFO - Done processing 2 files, waiting for flush() completion.\n", + "00:24:50 INFO - done flushing in 0.0 sec\n", + "00:24:50 INFO - Completed execution in 0.004 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:2 completed successfully\n", + "CPU times: user 1.07 s, sys: 95.1 ms, total: 1.16 s\n", + "Wall time: 1.19 s\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from doc_chunk_transform_python import DocChunkPythonTransformConfiguration\n", + "\n", + "# Prepare the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "params = {\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # doc_chunk arguments\n", + " # ...\n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(DocChunkPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"❌ Job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "213afdf6", + "metadata": {}, + "source": [ + "### 4.3 - Inspect Generated output\n", + "\n", + "We would see documents are split into many chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d8138d43", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Files processed : 2\n", + "Chunks created : 211\n", + "Input data dimensions (rows x columns)= (2, 12)\n", + "Output data dimensions (rows x columns)= (211, 16)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filename  num_pages  num_tables  num_doc_elements  ext  hash  size  date_acquired  pdf_convert_time  source_filename  source_document_id  contents  doc_jsonpath  page_number  bbox  document_id
87granite.pdf2817348pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf4a32ba4c-8fdb-4eeb-a06b-d28493efe8e36.3 Code Editing and Translation\\nTable 12: Pa...$.main-text[189]16[106.69820404, 190.24554443, 504.00320435, 211...f28d8c9a4fe81f0baf801daf9a95ddaf152a4ac5e8b8ac...
154attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345773.2.2 Multi-Head Attention\\nMulti-head attenti...$.main-text[55]5[107.46644592, 669.41210938, 503.99703979, 690...da79f02a5f19c2f07de7a6f1da9df8db00f01a477582ac...
67granite.pdf2817348pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf4a32ba4c-8fdb-4eeb-a06b-d28493efe8e36.1.5 RepoBench, CrossCodeEval: Repository-Lev...$.main-text[153]12[106.97065735, 224.31654358, 505.74191284, 290...cd5bd4537bde007298a91de7fa2fb4b56516d2f1d31262...
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "87 granite.pdf 28 17 348 pdf \n", + "154 attension.pdf 15 4 193 pdf \n", + "67 granite.pdf 28 17 348 pdf \n", + "\n", + " hash size \\\n", + "87 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "154 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "67 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "87 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "154 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "67 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "\n", + " source_document_id \\\n", + "87 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 \n", + "154 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "67 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 \n", + "\n", + " contents doc_jsonpath \\\n", + "87 6.3 Code Editing and Translation\\nTable 12: Pa... $.main-text[189] \n", + "154 3.2.2 Multi-Head Attention\\nMulti-head attenti... $.main-text[55] \n", + "67 6.1.5 RepoBench, CrossCodeEval: Repository-Lev... $.main-text[153] \n", + "\n", + " page_number bbox \\\n", + "87 16 [106.69820404, 190.24554443, 504.00320435, 211... \n", + "154 5 [107.46644592, 669.41210938, 503.99703979, 690... \n", + "67 12 [106.97065735, 224.31654358, 505.74191284, 290... \n", + "\n", + " document_id \n", + "87 f28d8c9a4fe81f0baf801daf9a95ddaf152a4ac5e8b8ac... \n", + "154 da79f02a5f19c2f07de7a6f1da9df8db00f01a477582ac... \n", + "67 cd5bd4537bde007298a91de7fa2fb4b56516d2f1d31262... " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (f\"Files processed : {input_df.shape[0]:,}\")\n", + "print (f\"Chunks created : {output_df.shape[0]:,}\")\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.sample(min(3, output_df.shape[0]))" + ] + }, + { + "cell_type": "markdown", + "id": "ece021fd", + "metadata": {}, + "source": [ + "## Step-5: DOC ID generation\n", + "\n", + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set hash_column to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set int_id_column to the name of the column, where you want to store it. **This is a pre-requisite for fuzzy dedup** in the pipeline." 
+ ] + }, + { + "cell_type": "markdown", + "id": "e414c12c", + "metadata": {}, + "source": [ + "### 5.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "10251d3d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" + ] + } + ], + "source": [ + "\n", + "STAGE = 3\n", + "\n", + "input_folder = output_chunk_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_docid_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "0f312347", + "metadata": {}, + "source": [ + "### 5.2 - Execute " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a8b76a71", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:24:50 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", + "00:24:50 INFO - pipeline id pipeline_id\n", + "00:24:50 INFO - code location None\n", + "00:24:50 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", + "00:24:50 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:24:50 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:24:50 INFO - orchestrator doc_id started at 2024-10-02 00:24:50\n", + "00:24:50 INFO - Number of files is 2, source profile {'max_file_size': 0.06398963928222656, 'min_file_size': 0.028062820434570312, 'total_file_size': 0.09205245971679688}\n", + "00:24:50 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "00:24:50 INFO - Completed 2 files (100.0%) in 0.0 min\n", + "00:24:50 INFO - Done processing 2 files, waiting for flush() completion.\n", + "00:24:50 INFO - done flushing in 0.0 sec\n", + "00:24:50 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:3 completed successfully\n", + "CPU times: user 13.4 ms, sys: 4.83 ms, total: 18.3 ms\n", + "Wall time: 14.7 ms\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from doc_id_transform_python import DocIDPythonTransformRuntimeConfiguration\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # doc id configuration\n", + " \"doc_id_doc_column\": \"contents\",\n", + " \"doc_id_hash_column\": \"chunk_hash\",\n", + " \"doc_id_int_column\": \"chunk_id\",\n", + "}\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# launch\n", + "\n", + "launcher = PythonTransformLauncher(DocIDPythonTransformRuntimeConfiguration())\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"❌ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "8c23338b", + "metadata": {}, + "source": [ + "### 5.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ec23aa3a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (211, 16)\n", + "Output data dimensions (rows x columns)= (211, 18)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filename  num_pages  num_tables  num_doc_elements  ext  hash  size  date_acquired  pdf_convert_time  source_filename  source_document_id  contents  doc_jsonpath  page_number  bbox  document_id  chunk_hash  chunk_id
192attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345776.2 Model Variations\\nIn Table 3 rows (A), we ...$.main-text[118]9[107.27760315, 318.93438721, 505.24127197, 350...70948f748c6f275b39c70652e29d60dfd53c545e0d6d92...70948f748c6f275b39c70652e29d60dfd53c545e0d6d92...69
71granite.pdf2817348pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf4a32ba4c-8fdb-4eeb-a06b-d28493efe8e36.1.5 RepoBench, CrossCodeEval: Repository-Lev...$.tables[7]13[109.39778137, 486.89639282, 502.1010437, 679....b7497dcda69d88caa6b7c3a462edb925ffa97ce5e42c52...b7497dcda69d88caa6b7c3a462edb925ffa97ce5e42c52...159
196attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345776.3 English Constituency Parsing\\nWe performed...$.main-text[123]9[106.96768951, 69.592453, 504.24859619, 101.62...93e01b0e6bafcfe5fcd113d1a3dfedad27d12f81038ff5...93e01b0e6bafcfe5fcd113d1a3dfedad27d12f81038ff5...73
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "192 attension.pdf 15 4 193 pdf \n", + "71 granite.pdf 28 17 348 pdf \n", + "196 attension.pdf 15 4 193 pdf \n", + "\n", + " hash size \\\n", + "192 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "71 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "196 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "192 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "71 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "196 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "\n", + " source_document_id \\\n", + "192 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "71 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 \n", + "196 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "\n", + " contents doc_jsonpath \\\n", + "192 6.2 Model Variations\\nIn Table 3 rows (A), we ... $.main-text[118] \n", + "71 6.1.5 RepoBench, CrossCodeEval: Repository-Lev... $.tables[7] \n", + "196 6.3 English Constituency Parsing\\nWe performed... $.main-text[123] \n", + "\n", + " page_number bbox \\\n", + "192 9 [107.27760315, 318.93438721, 505.24127197, 350... \n", + "71 13 [109.39778137, 486.89639282, 502.1010437, 679.... \n", + "196 9 [106.96768951, 69.592453, 504.24859619, 101.62... \n", + "\n", + " document_id \\\n", + "192 70948f748c6f275b39c70652e29d60dfd53c545e0d6d92... \n", + "71 b7497dcda69d88caa6b7c3a462edb925ffa97ce5e42c52... \n", + "196 93e01b0e6bafcfe5fcd113d1a3dfedad27d12f81038ff5... \n", + "\n", + " chunk_hash chunk_id \n", + "192 70948f748c6f275b39c70652e29d60dfd53c545e0d6d92... 69 \n", + "71 b7497dcda69d88caa6b7c3a462edb925ffa97ce5e42c52... 159 \n", + "196 93e01b0e6bafcfe5fcd113d1a3dfedad27d12f81038ff5... 73 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.sample(min(3, output_df.shape[0]))" + ] + }, + { + "cell_type": "markdown", + "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53", + "metadata": {}, + "source": [ + "## Step-6: Exact Dedup\n", + "\n", + "Remove documents having identical code to remove bias in the training data. On the content of each document, a SHA256 hash is computed,\n", + "followed by de-duplication of record having identical hashes." 
+ ] + }, + { + "cell_type": "markdown", + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", + "metadata": {}, + "source": [ + "### 6.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4c7a1b94", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" + ] + } + ], + "source": [ + "STAGE = 4\n", + "\n", + "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_exact_dedupe_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", + "metadata": {}, + "source": [ + "### 6.2 - Execute " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:24:50 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None}\n", + "00:24:50 INFO - pipeline id pipeline_id\n", + "00:24:50 INFO - code location None\n", + "00:24:50 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", + "00:24:50 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:24:50 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:24:50 INFO - orchestrator ededup started at 2024-10-02 00:24:50\n", + "00:24:50 INFO - Number of files is 2, source profile {'max_file_size': 0.06945991516113281, 'min_file_size': 0.03227043151855469, 'total_file_size': 0.1017303466796875}\n", + "00:24:50 INFO - Starting from the beginning\n", + "00:24:50 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "00:24:50 INFO - Completed 2 files (100.0%) in 0.0 min\n", + "00:24:50 INFO - Done processing 2 files, waiting for flush() completion.\n", + "00:24:50 INFO - done flushing in 0.0 sec\n", + "00:24:50 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:4 completed successfully\n", + "CPU times: user 22.1 ms, sys: 5.79 ms, total: 27.9 ms\n", + "Wall time: 23.5 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "\n", + "# Import ededup transform configuration\n", + "from ededup_transform_python import EdedupPythonTransformRuntimeConfiguration\n", + "\n", + "\n", + "# Prepare the commandline params\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # ededup parameters\n", + " \"ededup_doc_column\": \"contents\",\n", + " \"ededup_doc_id_column\": \"chunk_hash\",\n", + " \n", + "}\n", + "\n", + "# Pass the commandline params\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(EdedupPythonTransformRuntimeConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"❌ Ray job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "eaf1c3c3", + "metadata": {}, + "source": [ + "### 6.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d824ebf6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (211, 18)\n", + "Output data dimensions (rows x columns)= (211, 19)\n", + "Input chunks before exact dedupe : 211\n", + "Output chunks after exact dedupe : 211\n", + "Duplicate chunks removed : 0\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filename  num_pages  num_tables  num_doc_elements  ext  hash  size  date_acquired  pdf_convert_time  source_filename  source_document_id  contents  doc_jsonpath  page_number  bbox  document_id  chunk_hash  chunk_id  removed
194attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345776.3 English Constituency Parsing\\nTo evaluate ...$.main-text[121]9[107.15766144, 167.93530273, 504.10968018, 210...10c85ade191100c9586ffb4e5ded4944bc4fd865d0919f...10c85ade191100c9586ffb4e5ded4944bc4fd865d0919f...71[]
101granite.pdf2817348pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf4a32ba4c-8fdb-4eeb-a06b-d28493efe8e36.5 Math Reasoning\\nTable 15: Performance on 4...$.main-text[219]19[118.49487305, 699.65753174, 492.17700195, 710...c39e0817c8d1edf1d322cef0535b5a63b80d2b2b4d1852...c39e0817c8d1edf1d322cef0535b5a63b80d2b2b4d1852...189[]
206attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345777 Conclusion\\nAcknowledgements We are grateful...$.main-text[135]10[107.4437561, 212.26509094, 504.00241089, 232....855fdc0d15cb042a43d799b9a38d4339ae1e25b2df99c4...855fdc0d15cb042a43d799b9a38d4339ae1e25b2df99c4...83[]
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "194 attension.pdf 15 4 193 pdf \n", + "101 granite.pdf 28 17 348 pdf \n", + "206 attension.pdf 15 4 193 pdf \n", + "\n", + " hash size \\\n", + "194 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "101 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "206 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "194 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "101 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "206 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "\n", + " source_document_id \\\n", + "194 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "101 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 \n", + "206 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "\n", + " contents doc_jsonpath \\\n", + "194 6.3 English Constituency Parsing\\nTo evaluate ... $.main-text[121] \n", + "101 6.5 Math Reasoning\\nTable 15: Performance on 4... $.main-text[219] \n", + "206 7 Conclusion\\nAcknowledgements We are grateful... $.main-text[135] \n", + "\n", + " page_number bbox \\\n", + "194 9 [107.15766144, 167.93530273, 504.10968018, 210... \n", + "101 19 [118.49487305, 699.65753174, 492.17700195, 710... \n", + "206 10 [107.4437561, 212.26509094, 504.00241089, 232.... \n", + "\n", + " document_id \\\n", + "194 10c85ade191100c9586ffb4e5ded4944bc4fd865d0919f... \n", + "101 c39e0817c8d1edf1d322cef0535b5a63b80d2b2b4d1852... \n", + "206 855fdc0d15cb042a43d799b9a38d4339ae1e25b2df99c4... \n", + "\n", + " chunk_hash chunk_id removed \n", + "194 10c85ade191100c9586ffb4e5ded4944bc4fd865d0919f... 71 [] \n", + "101 c39e0817c8d1edf1d322cef0535b5a63b80d2b2b4d1852... 189 [] \n", + "206 855fdc0d15cb042a43d799b9a38d4339ae1e25b2df99c4... 83 [] " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", + "\n", + "output_df.sample(min(3, output_df.shape[0]))" + ] + }, + { + "cell_type": "markdown", + "id": "85309751-8556-41c6-ac32-84acc941bc8d", + "metadata": {}, + "source": [ + "## Fuzzy Dedup\n", + "\n", + "**Fuzzy dedupe is currently available in RAY version only**\n", + "\n", + "So we will skip this here" + ] + }, + { + "cell_type": "markdown", + "id": "5370950a-2a3a-4143-8218-f9b4808099ba", + "metadata": {}, + "source": [ + "## Step-7: Text encoding\n", + "\n", + "Encode text for the vector storage." 
+ ] + }, + { + "cell_type": "markdown", + "id": "74fd33b1", + "metadata": {}, + "source": [ + "### 7.1 - Set Input/output Folder" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "20a153fa-fd56-401e-86be-4f7617affcc8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃🏼 STAGE-5: Processing input='output/04_exact_dedupe_out' --> output='output/05_embeddings_out'\n" + ] + } + ], + "source": [ + "STAGE = 5\n", + "\n", + "input_folder = output_exact_dedupe_dir\n", + "output_folder = output_embeddings_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", + "\n", + "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "b9112479", + "metadata": {}, + "source": [ + "### 7.2 - Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "228df6b2-bc62-494b-9697-03ece98d7853", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "00:24:50 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", + "00:24:50 INFO - pipeline id pipeline_id\n", + "00:24:50 INFO - code location None\n", + "00:24:50 INFO - data factory data_ is using local data access: input_folder - output/04_exact_dedupe_out output_folder - output/05_embeddings_out\n", + "00:24:50 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:24:50 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:24:50 INFO - orchestrator text_encoder started at 2024-10-02 00:24:50\n", + "00:24:50 INFO - Number of files is 2, source profile {'max_file_size': 0.06981945037841797, 'min_file_size': 0.032629966735839844, 'total_file_size': 0.10244941711425781}\n", + "00:24:52 INFO - Completed 1 files (50.0%) in 0.008 min\n", + "00:24:53 INFO - Completed 2 files (100.0%) in 0.02 min\n", + "00:24:53 INFO - Done processing 2 files, waiting for flush() completion.\n", + "00:24:53 INFO - done flushing in 0.0 sec\n", + "00:24:53 INFO - Completed execution in 0.046 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Stage:5 completed successfully\n", + "CPU times: user 1.78 s, sys: 103 ms, total: 1.88 s\n", + "Wall time: 3.09 s\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from text_encoder_transform_python import TextEncoderPythonTransformConfiguration\n", + "\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # text_encoder\n", + " \"text_encoder_model_name\": MY_CONFIG.EMBEDDING_MODEL,\n", + "}\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(TextEncoderPythonTransformConfiguration())\n", + "# Launch the ray actor(s) to process the input\n", + "\n", + "return_code = launcher.launch()\n", + "\n", + "if return_code == 0:\n", + " print (f\"✅ Stage:{STAGE} completed successfully\")\n", + "else:\n", + " raise Exception (\"❌ Job failed\")" + ] + }, + { + "cell_type": "markdown", + "id": "b734852c", + "metadata": {}, + "source": [ + "### 7.3 - Inspect Generated output" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7b1c1d09", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input data dimensions (rows x columns)= (211, 19)\n", + "Output data dimensions (rows x columns)= (211, 20)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filename  num_pages  num_tables  num_doc_elements  ext  hash  size  date_acquired  pdf_convert_time  source_filename  source_document_id  contents  doc_jsonpath  page_number  bbox  document_id  chunk_hash  chunk_id  removed  embeddings
193attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d345776.2 Model Variations\\nIn Table 3 rows (B), we ...$.main-text[119]9[107.44257355, 248.49208069, 505.24127197, 312...6b79d74f59d1218fa3cdff6d13b504c8bf80558f3e2522...6b79d74f59d1218fa3cdff6d13b504c8bf80558f3e2522...70[][-0.0049973284, -0.10789071, 0.02143236, -0.02...
210attension.pdf154193pdf6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...1358142024-10-02T00:24:14.71365418.004455attension.pdff275d75a-a072-4836-8a55-6a65f0d34577Attention Visualizations Input-Input Layer5\\nF...$.main-text[190]15[107.43354034, 157.36341858, 504.06988525, 189...67626adb815bf2b27871df24d538ddc10ae68a3fbbd238...67626adb815bf2b27871df24d538ddc10ae68a3fbbd238...87[][0.01508544, -0.015680796, 0.039181348, 0.0084...
46granite.pdf2817348pdf0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587...6549892024-10-02T00:24:48.95961234.223920granite.pdf4a32ba4c-8fdb-4eeb-a06b-d28493efe8e36.1.1 HumanEvalSynthesize: Multilingual Code G...$.main-text[117]9[107.46860504, 613.84277344, 456.97003174, 624...3d5d963f59d4ecb05d1ec2d014747459e01cabe2944bba...3d5d963f59d4ecb05d1ec2d014747459e01cabe2944bba...134[][-0.029933447, 0.031515192, -0.04598905, -0.01...
\n", + "
" + ], + "text/plain": [ + " filename num_pages num_tables num_doc_elements ext \\\n", + "193 attension.pdf 15 4 193 pdf \n", + "210 attension.pdf 15 4 193 pdf \n", + "46 granite.pdf 28 17 348 pdf \n", + "\n", + " hash size \\\n", + "193 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "210 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "46 0650e590f33356ab8581c7eb0c23f1b928f0cfe1659587... 654989 \n", + "\n", + " date_acquired pdf_convert_time source_filename \\\n", + "193 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "210 2024-10-02T00:24:14.713654 18.004455 attension.pdf \n", + "46 2024-10-02T00:24:48.959612 34.223920 granite.pdf \n", + "\n", + " source_document_id \\\n", + "193 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "210 f275d75a-a072-4836-8a55-6a65f0d34577 \n", + "46 4a32ba4c-8fdb-4eeb-a06b-d28493efe8e3 \n", + "\n", + " contents doc_jsonpath \\\n", + "193 6.2 Model Variations\\nIn Table 3 rows (B), we ... $.main-text[119] \n", + "210 Attention Visualizations Input-Input Layer5\\nF... $.main-text[190] \n", + "46 6.1.1 HumanEvalSynthesize: Multilingual Code G... $.main-text[117] \n", + "\n", + " page_number bbox \\\n", + "193 9 [107.44257355, 248.49208069, 505.24127197, 312... \n", + "210 15 [107.43354034, 157.36341858, 504.06988525, 189... \n", + "46 9 [107.46860504, 613.84277344, 456.97003174, 624... \n", + "\n", + " document_id \\\n", + "193 6b79d74f59d1218fa3cdff6d13b504c8bf80558f3e2522... \n", + "210 67626adb815bf2b27871df24d538ddc10ae68a3fbbd238... \n", + "46 3d5d963f59d4ecb05d1ec2d014747459e01cabe2944bba... \n", + "\n", + " chunk_hash chunk_id removed \\\n", + "193 6b79d74f59d1218fa3cdff6d13b504c8bf80558f3e2522... 70 [] \n", + "210 67626adb815bf2b27871df24d538ddc10ae68a3fbbd238... 87 [] \n", + "46 3d5d963f59d4ecb05d1ec2d014747459e01cabe2944bba... 134 [] \n", + "\n", + " embeddings \n", + "193 [-0.0049973284, -0.10789071, 0.02143236, -0.02... \n", + "210 [0.01508544, -0.015680796, 0.039181348, 0.0084... \n", + "46 [-0.029933447, 0.031515192, -0.04598905, -0.01... 
" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import read_parquet_files_as_df\n", + "\n", + "output_df = read_parquet_files_as_df(output_folder)\n", + "\n", + "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", + "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "\n", + "output_df.sample(min(3, output_df.shape[0]))" + ] + }, + { + "cell_type": "markdown", + "id": "f5e12630-be6b-4188-a925-77117155617b", + "metadata": {}, + "source": [ + "## Step-8: Copy output to final output dir" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Copied output from 'output/05_embeddings_out' --> 'output/output_final'\n" + ] + } + ], + "source": [ + "import shutil\n", + "\n", + "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER_FINAL, ignore_errors=True)\n", + "shutil.copytree(src=output_folder, dst=MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", + "\n", + "print (f\"✅ Copied output from '{output_folder}' --> '{MY_CONFIG.OUTPUT_FOLDER_FINAL}'\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb b/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb index 0f8440178..8a8942b1f 100644 --- a/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb +++ b/examples/notebooks/rag/rag_1A_dpk_process_ray.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "source": [ "
\n", - "

Data Processing for RAG with Data Prep Kit

\n", + "

Data Processing for RAG with Data Prep Kit (RAY)

\n", " \n", "
\n" ] @@ -38,8 +38,8 @@ "\n", "- **pdf2parquet** : Extract text from PDF and convert them into parquet files\n", "- **Chunk documents**: Split the PDFs into 'meaningful sections' (paragraphs, sentences ..etc)\n", - "- **Exact Dedup**: Chunks with exact same content are filtered out\n", "- **Doc_ID generation**: Each chunk is assigned a uniq id, based on content and hash\n", + "- **Exact Dedup**: Chunks with exact same content are filtered out\n", "- **Fuzzy Dedup**: Eliminate chunks that are 'very similar' content\n", "- **Doc quality**: Scores the documents based on criteria like number of words, if it contains bad words ..etc\n", "- **Text encoder**: Convert chunks into vectors using embedding models" @@ -60,21 +60,8 @@ "metadata": {}, "outputs": [], "source": [ - "import os \n", - "\n", - "## Configuration\n", - "class MyConfig:\n", - " pass \n", - "\n", - "MY_CONFIG = MyConfig ()\n", - "\n", - "## Input Data - configure this to the folder we want to process\n", - "MY_CONFIG.INPUT_DATA_DIR = \"input\"\n", - "MY_CONFIG.OUTPUT_FOLDER = \"output\"\n", - "MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , \"output_final\")\n", - "\n", - "## Embedding model\n", - "MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'\n", + "import os\n", + "from my_config import MY_CONFIG\n", "\n", "## RAY CONFIGURATION\n", "num_cpus_available = os.cpu_count()\n", @@ -89,29 +76,39 @@ }, { "cell_type": "markdown", - "id": "02cc3f0e", + "id": "40c58856", "metadata": {}, "source": [ - "### Download Data\n", + "## Step-2: Data\n", "\n", - "We will use [Walmart annual report PDFs](https://github.com/sujee/data/tree/main/data-prep-kit/walmart-reports-1) as our input data.\n", + "We will use white papers about LLMs. \n", "\n", - "Feel free to substitute your data" + "- [Granite Code Models](https://arxiv.org/abs/2405.04324)\n", + "- [Attention is all you need](https://arxiv.org/abs/1706.03762)\n", + "\n", + "You can of course substite your own data below" + ] + }, + { + "cell_type": "markdown", + "id": "6bce5939", + "metadata": {}, + "source": [ + "### 2.1 - Download data" ] }, { "cell_type": "code", "execution_count": 2, - "id": "82c1ae58", + "id": "1bfde6eb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Local file 'input/Walmart-10K-Reports-Optimized_2023.pdf' (1.61 MB) already exists. Skipping download.\n", - "Local file 'input/Walmart_2024.pdf' (4.87 MB) already exists. Skipping download.\n", - "Local file 'input/Walmart_2024_copy.pdf' (4.87 MB) already exists. Skipping download.\n" + "Local file 'input/attension.pdf' (2.22 MB) already exists. Skipping download.\n", + "Local file 'input/granite.pdf' (1.27 MB) already exists. 
Skipping download.\n" ] } ], @@ -123,11 +120,9 @@ "## Download the data files\n", "shutil.os.makedirs(MY_CONFIG.INPUT_DATA_DIR, exist_ok=True)\n", "\n", - "download_file (url = 'https://raw.githubusercontent.com/sujee/data/main/data-prep-kit/walmart-reports-1/Walmart-10K-Reports-Optimized_2023.pdf', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'Walmart-10K-Reports-Optimized_2023.pdf' ))\n", - "\n", - "download_file (url = 'https://raw.githubusercontent.com/sujee/data/main/data-prep-kit/walmart-reports-1/Walmart_2024.pdf', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'Walmart_2024.pdf' ))\n", + "download_file (url = 'https://arxiv.org/pdf/1706.03762', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'attension.pdf' ))\n", "\n", - "download_file (url = 'https://raw.githubusercontent.com/sujee/data/main/data-prep-kit/walmart-reports-1/Walmart_2024.pdf', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'Walmart_2024_copy.pdf' )) # create a dupe file" + "download_file (url = 'https://arxiv.org/pdf/2405.04324', local_file = os.path.join(MY_CONFIG.INPUT_DATA_DIR, 'granite.pdf' ))\n" ] }, { @@ -135,7 +130,7 @@ "id": "72510ae6-48b0-4b88-9e13-a623281c3a63", "metadata": {}, "source": [ - "### Set input/output path variables for the pipeline" + "### 2.2 - Set input/output path variables for the pipeline" ] }, { @@ -159,6 +154,13 @@ "if not os.path.exists(MY_CONFIG.INPUT_DATA_DIR ):\n", " raise Exception (f\"❌ Input folder MY_CONFIG.INPUT_DATA_DIR = '{MY_CONFIG.INPUT_DATA_DIR}' not found\")\n", "\n", + "output_parquet_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '01_parquet_out')\n", + "output_chunk_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '02_chunk_out')\n", + "output_docid_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '03_docid_out')\n", + "output_exact_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '04_exact_dedupe_out')\n", + "output_fuzzy_dedupe_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '05_fuzzy_dedupe_out')\n", + "output_embeddings_dir = os.path.join (MY_CONFIG.OUTPUT_FOLDER, '06_embeddings_out')\n", + "\n", "\n", "## clear output folder\n", "shutil.rmtree(MY_CONFIG.OUTPUT_FOLDER, ignore_errors=True)\n", @@ -167,42 +169,12 @@ "print (\"✅ Cleared output directory\")" ] }, - { - "cell_type": "markdown", - "id": "bd5d976e-cb4c-4469-af39-4b7ea507e9d8", - "metadata": {}, - "source": [ - "### Import Common python modules" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "66178913-42b8-426b-a2e9-9587268fd05b", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "\n", - "# Main repo root\n", - "from utils import rootdir\n", - "\n", - "from data_processing_ray.runtime.ray import RayTransformLauncher\n", - "from data_processing.runtime.pure_python import PythonTransformLauncher\n", - "from data_processing.utils import ParamsUtils\n", - "\n", - "STAGE = 0" - ] - }, { "cell_type": "markdown", "id": "2449e5c7-078c-4ad6-a2f6-21d39d4da3fb", "metadata": {}, "source": [ - "\n", - "\n", - "## Step-2: pdf2parquet - Convert data from PDF to Parquet\n", + "## Step-3: pdf2parquet - Convert data from PDF to Parquet\n", "\n", "This step is reading the input folder containing all PDF files and ingest them in a parquet table using the [Docling package](https://github.com/DS4SD/docling).\n", "The documents are converted into a JSON format which allows to easily chunk it in the later steps.\n", @@ -214,12 +186,12 @@ "id": "c0c574c4-9dc4-4dab-9ad6-b5338207e67a", "metadata": {}, "source": [ - "### Set Input/output Folder" + "### 3.1 - Set 
Input/output Folder" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "482605b2-d814-456d-9195-49a2ec454ef0", "metadata": {}, "outputs": [ @@ -232,11 +204,10 @@ } ], "source": [ - "STAGE += 1\n", - "# STAGE = 1 ## DEBUG\n", + "STAGE = 1 \n", "\n", "input_folder = MY_CONFIG.INPUT_DATA_DIR\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_parquet_out\")\n", + "output_folder = output_parquet_dir\n", "\n", "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" ] @@ -246,12 +217,12 @@ "id": "9bb15f02-ab5c-4525-a536-cfa1fd2ba70b", "metadata": {}, "source": [ - "### Execute " + "### 3.2 - Execute " ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "b0cd8ebd-bf71-42d6-a397-8df0c7b66a26", "metadata": {}, "outputs": [ @@ -259,51 +230,31 @@ "name": "stderr", "output_type": "stream", "text": [ - "10:27:06 INFO - Running locally\n", - "10:27:06 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': False}\n", - "10:27:06 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_parquet_out\n", - "10:27:06 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:27:06 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "10:27:06 INFO - pipeline id pipeline_id\n", - "10:27:06 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", - "10:27:06 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "10:27:06 INFO - actor creation delay 0\n", - "10:27:06 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:27:08,526\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:27:13 INFO - orchestrator started at 2024-08-30 10:27:13\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:27:13 INFO - Number of files is 3, source profile {'max_file_size': 4.640201568603516, 'min_file_size': 1.5370569229125977, 'total_file_size': 10.817460060119629}\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:27:13 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.336485291831195, 'object_store': 4.168242644518614}\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:27:13 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 10:27:17 INFO - Initializing models\n", - "Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 30615.36it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m /home/sujee/apps/anaconda3/envs/data-prep-kit-3-py311/lib/python3.11/site-packages/torch/nn/modules/transformer.py:307: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m warnings.warn(f\"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}\")\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:29.093 ( 315.179s) [ 48BC0740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 10:27:17 INFO - Initializing models\n", - "Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 97867.09it/s]\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m /home/sujee/apps/anaconda3/envs/data-prep-kit-3-py311/lib/python3.11/site-packages/torch/nn/modules/transformer.py:307: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m warnings.warn(f\"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}\")\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:31.255 ( 317.341s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1011\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:31.764 ( 317.850s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1037\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:31.833 ( 317.919s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1176\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:32.154 ( 318.240s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1321\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:32:32.543 ( 318.629s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1103\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:32:40 INFO - Completed 1 files in 5.459707876046498 min\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:32:40 INFO - Completed 1 files (33.333333333333336%) in 5.459709358215332 min. 
Waiting for completion\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 2024-08-30 10:32:45.826 ( 331.904s) [ 54E47740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 2024-08-30 10:32:45.826 ( 331.904s) [ 54E47740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-selected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 2024-08-30 10:32:45.826 ( 331.904s) [ 54E47740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 2024-08-30 10:32:45.826 ( 331.904s) [ 54E47740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074217)\u001b[0m 2024-08-30 10:32:45.826 ( 331.904s) [ 54E47740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-selected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:29.521 ( 615.607s) [ 48BC0740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:31.651 ( 617.737s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1011\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:32.128 ( 618.214s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1037\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:32.195 ( 618.281s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1176\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:32.511 ( 618.597s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1321\n", - "\u001b[36m(RayTransformFileProcessor pid=1074216)\u001b[0m 2024-08-30 10:37:32.873 ( 618.959s) [ 48BC0740] crf_model.cpp:2096 ERR| sequence is too long: 1000 > 1103\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:37:40 INFO - Completed processing 3 files in 10.459254721800486 min\n", - "\u001b[36m(orchestrate pid=1073282)\u001b[0m 10:37:40 INFO - done flushing in 0.0009496212005615234 sec\n", - "10:37:50 INFO - Completed execution in 10.735311404863994 min, execution result 0\n" + "00:25:24 INFO - pdf2parquet parameters are : {'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'double_precision': 8}\n", + "00:25:24 INFO - pipeline id pipeline_id\n", + "00:25:24 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "00:25:24 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "00:25:24 INFO - actor creation delay 0\n", + "00:25:24 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:25:24 INFO - data factory data_ is using local data access: input_folder - input output_folder - output/01_parquet_out\n", + "00:25:24 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:25:24 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "00:25:24 INFO - Running locally\n", + "2024-10-02 00:25:26,362\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:25:29 INFO - orchestrator started at 2024-10-02 00:25:29\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:25:29 INFO - Number of files is 2, source profile {'max_file_size': 2.112621307373047, 'min_file_size': 1.2146415710449219, 'total_file_size': 3.3272628784179688}\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:25:29 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.941529083997011, 'object_store': 2.470764541067183}\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:25:29 INFO - Number of workers - 2 with {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:25:29 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(RayTransformFileProcessor pid=636524)\u001b[0m 00:25:32 INFO - Initializing models\n", + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 129854.61it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=636524)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:28:23 INFO - Completed processing 2 files in 2.9 min\n", + "\u001b[36m(orchestrate pid=635641)\u001b[0m 00:28:23 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=636523)\u001b[0m 00:25:32 INFO - Initializing models\n", + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 37650.84it/s]\n", + "\u001b[36m(RayTransformFileProcessor pid=636523)\u001b[0m Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n", + "00:28:33 INFO - Completed execution in 3.158 min, execution result 0\n" ] }, { @@ -311,8 +262,8 @@ "output_type": "stream", "text": [ "✅ Stage:1 completed successfully\n", - "CPU times: user 4.08 s, sys: 1.07 s, total: 5.15 s\n", - "Wall time: 10min 48s\n" + "CPU times: user 3.85 s, sys: 668 ms, total: 4.52 s\n", + "Wall time: 3min 13s\n" ] } ], @@ -323,6 +274,9 @@ "import os\n", "import sys\n", "\n", + "from data_processing_ray.runtime.ray import RayTransformLauncher\n", + "from data_processing.utils import GB, ParamsUtils\n", + "\n", "from pdf2parquet_transform import (\n", " pdf2parquet_contents_type_cli_param,\n", " pdf2parquet_contents_types,\n", @@ -330,9 +284,6 @@ "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n", "from pdf2parquet_transform_ray import Pdf2ParquetRayTransformConfiguration\n", "\n", - "from data_processing.utils import GB, ParamsUtils\n", - "\n", - "\n", "# create parameters\n", "local_conf = {\n", " \"input_folder\": input_folder,\n", @@ -362,7 +313,6 @@ "sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))\n", "# create launcher\n", "launcher = RayTransformLauncher(Pdf2ParquetRayTransformConfiguration())\n", - "# launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())\n", "# launch\n", "return_code = launcher.launch()\n", "\n", @@ -377,14 +327,14 @@ "id": "5ca790e0", "metadata": {}, "source": [ - "### Inspect Generated output\n", + "### 3.3 - Inspect Generated output\n", "\n", "Here we should see one entry per input file processed" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "fe59563d", "metadata": {}, "outputs": [ @@ -392,7 +342,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Output dimensions (rows x columns)= (3, 
12)\n" + "Output dimensions (rows x columns)= (2, 12)\n" ] }, { @@ -433,86 +383,61 @@ " \n", " \n", " 0\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", + " granite.pdf\n", " {\"_name\":\"\",\"type\":\"pdf-document\",\"description...\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 28\n", + " 17\n", + " 348\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", " \n", " \n", " 1\n", - " Walmart_2024_copy.pdf\n", + " attension.pdf\n", " {\"_name\":\"\",\"type\":\"pdf-document\",\"description...\n", - " 100\n", - " 82\n", - " 1163\n", - " 95cc2911-9a0d-49c3-a259-c74e35fca3ea\n", - " pdf\n", - " 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef...\n", - " 1112050\n", - " 2024-08-30T10:37:40.616022\n", - " 299.935132\n", - " Walmart_2024_copy.pdf\n", - " \n", - " \n", - " 2\n", - " Walmart_2024.pdf\n", - " {\"_name\":\"\",\"type\":\"pdf-document\",\"description...\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 15\n", + " 4\n", + " 193\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " Walmart_2024.pdf\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", " \n", " \n", "\n", "" ], "text/plain": [ - " filename \\\n", - "0 Walmart-10K-Reports-Optimized_2023.pdf \n", - "1 Walmart_2024_copy.pdf \n", - "2 Walmart_2024.pdf \n", - "\n", - " contents num_pages num_tables \\\n", - "0 {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 100 81 \n", - "1 {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 100 82 \n", - "2 {\"_name\":\"\",\"type\":\"pdf-document\",\"description... 100 82 \n", + " filename contents \\\n", + "0 granite.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... \n", + "1 attension.pdf {\"_name\":\"\",\"type\":\"pdf-document\",\"description... \n", "\n", - " num_doc_elements document_id ext \\\n", - "0 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", - "1 1163 95cc2911-9a0d-49c3-a259-c74e35fca3ea pdf \n", - "2 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", + " num_pages num_tables num_doc_elements \\\n", + "0 28 17 348 \n", + "1 15 4 193 \n", "\n", - " hash size \\\n", - "0 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "1 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef... 1112050 \n", - "2 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", + " document_id ext \\\n", + "0 81bc331a-69cf-49bd-84b9-afedcab1344a pdf \n", + "1 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 pdf \n", "\n", - " date_acquired pdf_convert_time \\\n", - "0 2024-08-30T10:32:49.798524 321.107279 \n", - "1 2024-08-30T10:37:40.616022 299.935132 \n", - "2 2024-08-30T10:32:40.640835 312.142404 \n", + " hash size \\\n", + "0 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "1 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 
135814 \n", "\n", - " source_filename \n", - "0 Walmart-10K-Reports-Optimized_2023.pdf \n", - "1 Walmart_2024_copy.pdf \n", - "2 Walmart_2024.pdf " + " date_acquired pdf_convert_time source_filename \n", + "0 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "1 2024-10-02T00:26:29.888597 53.822026 attension.pdf " ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -535,9 +460,7 @@ "id": "72274586", "metadata": {}, "source": [ - "\n", - "\n", - "## Step-3: Doc chunks\n", + "## Step-4: Doc chunks\n", "\n", "Split the documents in chunks, according to their layout segmentation." ] @@ -547,12 +470,12 @@ "id": "96198fa6", "metadata": {}, "source": [ - "### Set Input/output Folder" + "### 4.1 - Set Input/output Folder" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "305f00a3", "metadata": {}, "outputs": [ @@ -565,11 +488,10 @@ } ], "source": [ - "STAGE += 1\n", - "# STAGE = 2 ## DEBUG\n", + "STAGE = 2\n", "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_chunk_out\")\n", + "input_folder = output_parquet_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_chunk_dir\n", "\n", "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", "\n", @@ -581,12 +503,12 @@ "id": "369f2cd1", "metadata": {}, "source": [ - "### Execute " + "### 4.2 - Execute " ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "5b7b18d5", "metadata": {}, "outputs": [ @@ -594,40 +516,25 @@ "name": "stderr", "output_type": "stream", "text": [ - "[nltk_data] Downloading package punkt_tab to\n", - "[nltk_data] /home/sujee/apps/anaconda3/envs/data-prep-\n", - "[nltk_data] kit-3-py311/lib/python3.11/site-\n", - "[nltk_data] packages/llama_index/core/_static/nltk_cache...\n", - "[nltk_data] Package punkt_tab is already up-to-date!\n", - "10:37:53 INFO - Running locally\n", - "10:37:53 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'output_chunk_column_name': 'contents', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", - "10:37:53 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", - "10:37:53 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:37:53 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:37:53 INFO - pipeline id pipeline_id\n", - "10:37:53 INFO - code location None\n", - "10:37:53 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:37:53 INFO - actor creation delay 0\n", - "10:37:53 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:37:55,040\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m [nltk_data] Downloading package punkt_tab to\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m [nltk_data] /home/sujee/apps/anaconda3/envs/data-prep-\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m [nltk_data] kit-3-py311/lib/python3.11/site-\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m [nltk_data] packages/llama_index/core/_static/nltk_cache...\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m [nltk_data] Package punkt_tab is already up-to-date!\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:57 INFO - orchestrator started at 2024-08-30 10:37:57\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:57 INFO - Number of files is 3, source profile {'max_file_size': 0.3565502166748047, 'min_file_size': 0.35198307037353516, 'total_file_size': 1.060612678527832}\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:57 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.289907838217914, 'object_store': 4.144953917711973}\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:57 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:59 INFO - Completed 1 files in 0.03202696243921916 min\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:59 INFO - Completed 1 files (33.333333333333336%) in 0.032028536001841225 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:59 INFO - Completed processing 3 files in 0.03510438601175944 min\n", - "\u001b[36m(orchestrate pid=1088698)\u001b[0m 10:37:59 INFO - done flushing in 0.0009315013885498047 sec\n", - "10:38:09 INFO - Completed execution in 0.26731717586517334 min, execution result 0\n", - "\u001b[36m(RayTransformFileProcessor pid=1089568)\u001b[0m [nltk_data] Downloading package punkt_tab to\u001b[32m [repeated 2x across cluster] (Ray deduplicates logs by default. 
Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)\u001b[0m\n", - "\u001b[36m(RayTransformFileProcessor pid=1089568)\u001b[0m [nltk_data] kit-3-py311/lib/python3.11/site-\u001b[32m [repeated 4x across cluster]\u001b[0m\n", - "\u001b[36m(RayTransformFileProcessor pid=1089568)\u001b[0m [nltk_data] packages/llama_index/core/_static/nltk_cache...\u001b[32m [repeated 2x across cluster]\u001b[0m\n", - "\u001b[36m(RayTransformFileProcessor pid=1089567)\u001b[0m [nltk_data] Package punkt_tab is already up-to-date!\u001b[32m [repeated 2x across cluster]\u001b[0m\n" + "00:28:36 INFO - doc_chunk parameters are : {'chunking_type': , 'content_column_name': 'contents', 'doc_id_column_name': 'document_id', 'dl_min_chunk_len': None, 'output_chunk_column_name': 'contents', 'output_source_doc_id_column_name': 'source_document_id', 'output_jsonpath_column_name': 'doc_jsonpath', 'output_pageno_column_name': 'page_number', 'output_bbox_column_name': 'bbox'}\n", + "00:28:36 INFO - pipeline id pipeline_id\n", + "00:28:36 INFO - code location None\n", + "00:28:36 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", + "00:28:36 INFO - actor creation delay 0\n", + "00:28:36 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_chunk', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:28:36 INFO - data factory data_ is using local data access: input_folder - output/01_parquet_out output_folder - output/02_chunk_out\n", + "00:28:36 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:28:36 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:28:36 INFO - Running locally\n", + "2024-10-02 00:28:38,768\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:41 INFO - orchestrator started at 2024-10-02 00:28:41\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:41 INFO - Number of files is 2, source profile {'max_file_size': 0.12733078002929688, 'min_file_size': 0.035338401794433594, 'total_file_size': 0.16266918182373047}\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:41 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.939725494943559, 'object_store': 2.4698627470061183}\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:41 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:41 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:43 INFO - Completed processing 2 files in 0.033 min\n", + "\u001b[36m(orchestrate pid=640134)\u001b[0m 00:28:43 INFO - done flushing in 0.001 sec\n", + "00:28:53 INFO - Completed execution in 0.281 min, execution result 0\n" ] }, { @@ -635,8 +542,8 @@ "output_type": "stream", "text": [ "✅ Stage:2 completed successfully\n", - "CPU times: user 1.35 s, sys: 997 ms, total: 2.35 s\n", - "Wall time: 18.5 s\n" + "CPU times: user 992 ms, sys: 321 ms, total: 1.31 s\n", + "Wall time: 19.6 s\n" ] } ], @@ -684,14 +591,14 @@ "id": "213afdf6", "metadata": {}, "source": [ - "### Inspect Generated output\n", + "### 4.3 - Inspect Generated output\n", "\n", "We would see documents are split into many chunks" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "d8138d43", "metadata": {}, "outputs": [ @@ -699,10 +606,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Files processed : 3\n", - "Chunks created : 2,042\n", - "Input data dimensions (rows x columns)= (3, 12)\n", - "Output data dimensions (rows x columns)= (2042, 15)\n" + "Files processed : 2\n", + "Chunks created : 211\n", + "Input data dimensions (rows x columns)= (2, 12)\n", + "Output data dimensions (rows x columns)= (211, 16)\n" ] }, { @@ -730,111 +637,120 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", " source_filename\n", + " source_document_id\n", " contents\n", " doc_jsonpath\n", " page_number\n", " bbox\n", + " document_id\n", " \n", " \n", " \n", " \n", - " 1229\n", - " Walmart_2024_copy.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 95cc2911-9a0d-49c3-a259-c74e35fca3ea\n", + " 185\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef...\n", - " 1112050\n", - " 2024-08-30T10:37:40.616022\n", - " 299.935132\n", - " Walmart_2024_copy.pdf\n", - " #26*1.88*) &62.2,7\\n*F=CF HC H<9 .5L IHG 5B8...\n", - " $.main-text[891]\n", - " 76\n", - " [35.41, 538.52, 546.86, 609.18]\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 6.1 Machine Translation\\nOn the WMT 2014 Engli...\n", + " $.main-text[108]\n", + " 8\n", + " [107.27262115, 260.13467407, 505.24533081, 302...\n", + " d6c1d3686219a176bc5ff0ebf4f5c82a53d95d1502d476...\n", " \n", " \n", - " 1767\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 94\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " Walmart_2024.pdf\n", - " 67:?:E:@? 
2?5 #:>:E2E:@?D @7 ?E6C?2= @?EC@=...\n", - " $.main-text[630]\n", - " 55\n", - " [35.55, 222.69, 525.53, 256.91]\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 6.3 Code Editing and Translation\\nFrom Table 1...\n", + " $.main-text[199]\n", + " 17\n", + " [107.33219147, 356.5696106, 505.74539185, 411....\n", + " 1c841522286ea1348acafd3a4cfbbffd327ca5de53c5f9...\n", " \n", " \n", - " 865\n", - " Walmart_2024_copy.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 95cc2911-9a0d-49c3-a259-c74e35fca3ea\n", + " 175\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef...\n", - " 1112050\n", - " 2024-08-30T10:37:40.616022\n", - " 299.935132\n", - " Walmart_2024_copy.pdf\n", - " .6 C6=J 6IE6?D:G6=J @? :?7@C>2E:@? 2?5 7:?2?4:...\n", - " $.main-text[278]\n", - " 25\n", - " [35.23, 641.07, 547.64, 747.74]\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 5.1 Training Data and Batching\\nWe trained on ...\n", + " $.main-text[91]\n", + " 7\n", + " [107.12083435, 343.05245972, 505.65435791, 418...\n", + " 77de84b7743b8360a371146c12c9795a12984ef82354f4...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " filename num_pages num_tables num_doc_elements \\\n", - "1229 Walmart_2024_copy.pdf 100 82 1163 \n", - "1767 Walmart_2024.pdf 100 82 1163 \n", - "865 Walmart_2024_copy.pdf 100 82 1163 \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "185 attension.pdf 15 4 193 pdf \n", + "94 granite.pdf 28 17 348 pdf \n", + "175 attension.pdf 15 4 193 pdf \n", + "\n", + " hash size \\\n", + "185 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "94 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "175 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", "\n", - " document_id ext \\\n", - "1229 95cc2911-9a0d-49c3-a259-c74e35fca3ea pdf \n", - "1767 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", - "865 95cc2911-9a0d-49c3-a259-c74e35fca3ea pdf \n", + " date_acquired pdf_convert_time source_filename \\\n", + "185 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", + "94 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "175 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", "\n", - " hash size \\\n", - "1229 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef... 1112050 \n", - "1767 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", - "865 0be5657667eb7229f1389625f61b6d6dfb608c617ed5ef... 1112050 \n", + " source_document_id \\\n", + "185 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", + "94 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "175 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", "\n", - " date_acquired pdf_convert_time source_filename \\\n", - "1229 2024-08-30T10:37:40.616022 299.935132 Walmart_2024_copy.pdf \n", - "1767 2024-08-30T10:32:40.640835 312.142404 Walmart_2024.pdf \n", - "865 2024-08-30T10:37:40.616022 299.935132 Walmart_2024_copy.pdf \n", + " contents doc_jsonpath \\\n", + "185 6.1 Machine Translation\\nOn the WMT 2014 Engli... $.main-text[108] \n", + "94 6.3 Code Editing and Translation\\nFrom Table 1... $.main-text[199] \n", + "175 5.1 Training Data and Batching\\nWe trained on ... 
$.main-text[91] \n", "\n", - " contents doc_jsonpath \\\n", - "1229 #26*1.88*) &62.2,7\\n*F=CF HC H<9 .5L IHG 5B8... $.main-text[891] \n", - "1767 67:?:E:@? 2?5 #:>:E2E:@?D @7 ?E6C?2= @?EC@=... $.main-text[630] \n", - "865 .6 C6=J 6IE6?D:G6=J @? :?7@C>2E:@? 2?5 7:?2?4:... $.main-text[278] \n", + " page_number bbox \\\n", + "185 8 [107.27262115, 260.13467407, 505.24533081, 302... \n", + "94 17 [107.33219147, 356.5696106, 505.74539185, 411.... \n", + "175 7 [107.12083435, 343.05245972, 505.65435791, 418... \n", "\n", - " page_number bbox \n", - "1229 76 [35.41, 538.52, 546.86, 609.18] \n", - "1767 55 [35.55, 222.69, 525.53, 256.91] \n", - "865 25 [35.23, 641.07, 547.64, 747.74] " + " document_id \n", + "185 d6c1d3686219a176bc5ff0ebf4f5c82a53d95d1502d476... \n", + "94 1c841522286ea1348acafd3a4cfbbffd327ca5de53c5f9... \n", + "175 77de84b7743b8360a371146c12c9795a12984ef82354f4... " ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -850,48 +766,50 @@ "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", "\n", - "output_df.sample(3)" + "output_df.sample(min(3, output_df.shape[0]))" ] }, { "cell_type": "markdown", - "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53", + "id": "b8894d88", "metadata": {}, "source": [ - "## Step-4: Exact Dedup\n", + "## Step-5: DOC ID generation\n", "\n", - "Remove documents having identical code to remove bias in the training data. On the content of each document, a SHA256 hash is computed,\n", - "followed by de-duplication of record having identical hashes." + "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", + "\n", + " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set hash_column to the name of the column, where you want to store it.\n", + " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set int_id_column to the name of the column, where you want to store it. **This is a pre-requisite for fuzzy dedup** in the pipeline." 
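    "\n",
    "To make this concrete, below is a minimal hand-written sketch of what these two annotations amount to on a single pandas DataFrame. This is an illustration only, not the DPK transform itself; the helper name `annotate_doc_ids` is made up here, while the column names `contents`, `chunk_hash` and `chunk_id` match the configuration used in the execute cell below.\n",
    "\n",
    "```python\n",
    "import hashlib\n",
    "import pandas as pd\n",
    "\n",
    "def annotate_doc_ids(df: pd.DataFrame, doc_column=\"contents\",\n",
    "                     hash_column=\"chunk_hash\", int_column=\"chunk_id\", start_id=0):\n",
    "    # hash-based id: sha256 over the chunk text, as described above\n",
    "    df[hash_column] = df[doc_column].map(\n",
    "        lambda doc: hashlib.sha256(doc.encode(\"utf-8\")).hexdigest())\n",
    "    # integer id: unique within this table; the real transform keeps the\n",
    "    # counter global so ids stay unique across all tables it processes\n",
    "    df[int_column] = range(start_id, start_id + len(df))\n",
    "    return df\n",
    "```"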
] }, { "cell_type": "markdown", - "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", + "id": "46e88f76", "metadata": {}, "source": [ - "### Set Input/output Folder" + "### 5.1 - Set Input/output Folder" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "4c7a1b94", + "execution_count": 10, + "id": "7debd243", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_ededupe_out'\n" + "🏃🏼 STAGE-3: Processing input='output/02_chunk_out' --> output='output/03_docid_out'\n" ] } ], "source": [ - "STAGE += 1\n", - "# STAGE = 3 ## DEBUG\n", "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_ededupe_out\")\n", + "STAGE = 3\n", + "\n", + "input_folder = output_chunk_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_docid_dir\n", "\n", "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", "\n", @@ -900,42 +818,41 @@ }, { "cell_type": "markdown", - "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", + "id": "1cadc2f3", "metadata": {}, "source": [ - "### Execute " + "### 5.2 - Execute " ] }, { "cell_type": "code", - "execution_count": 12, - "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", + "execution_count": 11, + "id": "6b0eade3", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "10:38:10 INFO - Running locally\n", - "10:38:10 INFO - exact dedup params are {'doc_column': 'contents', 'hash_cpu': 0.5, 'num_hashes': 2}\n", - "10:38:10 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_ededupe_out\n", - "10:38:10 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:38:10 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:38:10 INFO - pipeline id pipeline_id\n", - "10:38:10 INFO - code location None\n", - "10:38:10 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:38:10 INFO - actor creation delay 0\n", - "10:38:10 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:38:12,554\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:13 INFO - orchestrator started at 2024-08-30 10:38:13\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:13 INFO - Number of files is 3, source profile {'max_file_size': 0.20615005493164062, 'min_file_size': 0.19641399383544922, 'total_file_size': 0.5990447998046875}\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:13 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.27267990168184, 'object_store': 4.136339950375259}\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:13 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:14 INFO - Completed 1 files in 0.011358428001403808 min\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:14 INFO - Completed 1 files (33.333333333333336%) in 0.011360756556193034 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:14 INFO - Completed processing 3 files in 0.01162503957748413 min\n", - "\u001b[36m(orchestrate pid=1090364)\u001b[0m 10:38:14 INFO - done flushing in 0.0009477138519287109 sec\n", - "10:38:24 INFO - Completed execution in 0.2259385307629903 min, execution result 0\n" + "00:28:55 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'chunk_hash', 'int_column': 'chunk_id', 'start_id': 0}\n", + "00:28:55 INFO - pipeline id pipeline_id\n", + "00:28:55 INFO - code location None\n", + "00:28:55 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", + "00:28:55 INFO - actor creation delay 0\n", + "00:28:55 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:28:55 INFO - data factory data_ is using local data access: input_folder - output/02_chunk_out output_folder - output/03_docid_out\n", + "00:28:55 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:28:55 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:28:55 INFO - Running locally\n", + "2024-10-02 00:28:56,881\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:57 INFO - orchestrator started at 2024-10-02 00:28:57\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:57 INFO - Number of files is 2, source profile {'max_file_size': 0.06398677825927734, 'min_file_size': 0.028062820434570312, 'total_file_size': 0.09204959869384766}\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:57 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.8911590576171875, 'object_store': 2.4455795288085938}\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:57 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:57 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:58 INFO - Completed processing 2 files in 0.013 min\n", + "\u001b[36m(orchestrate pid=641742)\u001b[0m 00:28:58 INFO - done flushing in 0.001 sec\n", + "00:29:08 INFO - Completed execution in 0.228 min, execution result 0\n" ] }, { @@ -943,174 +860,7 @@ "output_type": "stream", "text": [ "✅ Stage:3 completed successfully\n", - "CPU times: user 121 ms, sys: 184 ms, total: 305 ms\n", - "Wall time: 14.8 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "# Import ededup transform configuration\n", - "from ededup_transform_ray import EdedupRayTransformConfiguration\n", - "\n", - "\n", - "# Prepare the commandline params\n", - "local_conf = {\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - "}\n", - "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "params = {\n", - " # where to run\n", - " \"run_locally\": True,\n", - " # Data access. 
Only required parameters are specified\n", - " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", - " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", - " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # ededup parameters\n", - " \"ededup_hash_cpu\": 0.5,\n", - " \"ededup_num_hashes\": 2,\n", - " \"ededup_doc_column\": \"contents\",\n", - "}\n", - "\n", - "# Pass the commandline params\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(EdedupRayTransformConfiguration())\n", - "# launch\n", - "return_code = launcher.launch()\n", - "\n", - "if return_code == 0:\n", - " print (f\"✅ Stage:{STAGE} completed successfully\")\n", - "else:\n", - " raise Exception (\"❌ Ray job failed\")" - ] - }, - { - "cell_type": "markdown", - "id": "eaf1c3c3", - "metadata": {}, - "source": [ - "### Inspect Generated output" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "d824ebf6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data dimensions (rows x columns)= (2042, 15)\n", - "Output data dimensions (rows x columns)= (1324, 15)\n", - "Input chunks before exact dedupe : 2,042\n", - "Output chunks after exact dedupe : 1,324\n", - "Duplicate chunks removed : 718\n" - ] - } - ], - "source": [ - "from utils import read_parquet_files_as_df\n", - "\n", - "output_df = read_parquet_files_as_df(output_folder)\n", - "\n", - "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", - "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", - "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", - "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", - "\n", - "output_df.sample(3)" - ] - }, - { - "cell_type": "markdown", - "id": "f15f4d00-33bb-4d9a-9f34-4d7f3ee0b7bc", - "metadata": {}, - "source": [ - "## Step-5: DOC ID generation\n", - "\n", - "This transform annotates documents with document \"ids\". It supports the following transformations of the original data:\n", - "\n", - " - Adding document hash: this enables the addition of a document hash-based id to the data. The hash is calculated with `hashlib.sha256(doc.encode(\"utf-8\")).hexdigest()`. To enable this annotation, set hash_column to the name of the column, where you want to store it.\n", - " - Adding integer document id: this allows the addition of an integer document id to the data that is unique across all rows in all tables provided to the transform() method. To enable this annotation, set int_id_column to the name of the column, where you want to store it. **This is a pre-requisite for fuzzy dedup** in the pipeline." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "e6f62394-fbde-495c-bbbb-83161b006bed", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🏃🏼 STAGE-4: Processing input='output/03_ededupe_out' --> output='output/04_doc_id_out'\n" - ] - } - ], - "source": [ - "\n", - "# Input for this stage is the output of exact dedeup component\n", - "# output of this component makes it possible for fdedup component to run on data.\n", - "\n", - "STAGE += 1\n", - "# STAGE = 4 ## DEBUG\n", - "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_doc_id_out\")\n", - "\n", - "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", - "\n", - "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "a6daf36d-686c-4e0a-aabf-ce55f999bb2d", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "10:38:25 INFO - Running locally\n", - "10:38:25 INFO - Doc id parameters are : {'doc_column': 'contents', 'hash_column': 'hash_column', 'int_column': 'int_id_column'}\n", - "10:38:25 INFO - data factory data_ is using local data access: input_folder - output/03_ededupe_out output_folder - output/04_doc_id_out\n", - "10:38:25 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:38:25 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:38:25 INFO - pipeline id pipeline_id\n", - "10:38:25 INFO - code location None\n", - "10:38:25 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:38:25 INFO - actor creation delay 0\n", - "10:38:25 INFO - job details {'job category': 'preprocessing', 'job name': 'doc_id', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:38:27,443\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:28 INFO - orchestrator started at 2024-08-30 10:38:28\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:28 INFO - Number of files is 3, source profile {'max_file_size': 0.20574665069580078, 'min_file_size': 0.003185272216796875, 'total_file_size': 0.4063444137573242}\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:28 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.265644073486328, 'object_store': 4.132822036743164}\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:28 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:29 INFO - Completed 1 files in 0.012215912342071533 min\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:29 INFO - Completed 1 files (33.333333333333336%) in 0.012217283248901367 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:29 INFO - Completed processing 3 files in 0.012248762448628743 min\n", - "\u001b[36m(orchestrate pid=1092147)\u001b[0m 10:38:29 INFO - done flushing in 0.0009109973907470703 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=1092988)\u001b[0m 10:38:29 WARNING - table is empty, skipping processing\n", - "10:38:39 INFO - Completed execution in 0.22525110244750976 min, execution result 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Stage:4 completed successfully\n", - "CPU times: user 136 ms, sys: 159 ms, total: 296 ms\n", + "CPU times: user 123 ms, sys: 167 ms, total: 290 ms\n", "Wall time: 15 s\n" ] } @@ -1118,7 +868,7 @@ "source": [ "%%time \n", "\n", - "from doc_id_transform_ray import DocIDRayTransformConfiguration\n", + "from doc_id_transform_ray import DocIDRayTransformRuntimeConfiguration\n", "local_conf = {\n", " \"input_folder\": input_folder,\n", " \"output_folder\": output_folder,\n", @@ -1134,14 +884,14 @@ " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", " # doc id configuration\n", " \"doc_id_doc_column\": \"contents\",\n", - " \"doc_id_hash_column\": \"hash_column\",\n", - " \"doc_id_int_column\": \"int_id_column\",\n", + " \"doc_id_hash_column\": \"chunk_hash\",\n", + " \"doc_id_int_column\": \"chunk_id\",\n", "}\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", "\n", "# launch\n", "\n", - "launcher = RayTransformLauncher(DocIDRayTransformConfiguration())\n", + "launcher = RayTransformLauncher(DocIDRayTransformRuntimeConfiguration())\n", "\n", "return_code = launcher.launch()\n", "\n", @@ -1153,24 +903,24 @@ }, { "cell_type": "markdown", - "id": "3d492c2b", + "id": "d5c5c6e4", "metadata": {}, "source": [ - "### Inspect Generated output" + "### 5.3 - Inspect Generated output" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "91ade826", + "execution_count": 12, + "id": "45d941b2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Input data dimensions (rows x columns)= (1324, 15)\n", - "Output data dimensions (rows x columns)= (1324, 17)\n" + "Input data dimensions (rows x columns)= (211, 16)\n", + "Output data dimensions (rows x columns)= (211, 18)\n" ] }, { @@ -1198,129 +948,133 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", " source_filename\n", + " source_document_id\n", " contents\n", " doc_jsonpath\n", " page_number\n", " bbox\n", - " hash_column\n", - " int_id_column\n", + " document_id\n", + " chunk_hash\n", + " chunk_id\n", " \n", " \n", " \n", " \n", - " 860\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 31\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " Walmart_2024.pdf\n", - " #682= +2I )68F=2E@CJ @>A=:2?46 )6AFE2E:@?...\n", - " $.main-text[299]\n", - " 27\n", - " [35.24, 725.11, 503.48, 747.51]\n", - " 97e06840b409f4ca176c2d5b145e8f25c9d3d37c6510ac...\n", - " 187\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 3 Model Architecture\\nremove final 8 layers fr...\n", + " $.main-text[69]\n", + " 6\n", + " [107.45430756, 
456.21582031, 504.50476074, 521...\n", + " 72fbd93a7a834627114fd13cdb1a48c354d6bd991a9eb9...\n", + " 72fbd93a7a834627114fd13cdb1a48c354d6bd991a9eb9...\n", + " 119\n", " \n", " \n", - " 2\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 116\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " A A message from our r CEO\\nFor the fiscal y...\n", - " $.main-text[14]\n", - " 3\n", - " [214.16, 607.24, 390.6, 617.44]\n", - " 2f26fa255117cd004e3fc8e4348d39fd265e570edfdbc7...\n", - " 653\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " Acknowledgments\\nThanks and acknowledgement to...\n", + " $.main-text[249]\n", + " 21\n", + " [107.07092285, 59.12960052, 505.24591064, 160....\n", + " b6d51d1a54147d95051f77bf536ca6ab7360102dd5ac84...\n", + " b6d51d1a54147d95051f77bf536ca6ab7360102dd5ac84...\n", + " 204\n", " \n", " \n", - " 607\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 95\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " ITEM 15. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...\n", - " $.main-text[978]\n", - " 85\n", - " [111.85, 578.01, 263.73, 587.0]\n", - " 43503987ec97f0b553f02f3572b2326006b641745b7c2f...\n", - " 1258\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 6.3 Code Editing and Translation\\nCodeLingua (...\n", + " $.main-text[200]\n", + " 17\n", + " [107.03813934, 207.6650238, 505.74505615, 350....\n", + " c52299a48da2f5517c7ed6b964195a46dd0e339af1d0f3...\n", + " c52299a48da2f5517c7ed6b964195a46dd0e339af1d0f3...\n", + " 183\n", " \n", " \n", "\n", "" ], "text/plain": [ - " filename num_pages num_tables \\\n", - "860 Walmart_2024.pdf 100 82 \n", - "2 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "607 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "\n", - " num_doc_elements document_id ext \\\n", - "860 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", - "2 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", - "607 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "31 granite.pdf 28 17 348 pdf \n", + "116 granite.pdf 28 17 348 pdf \n", + "95 granite.pdf 28 17 348 pdf \n", "\n", - " hash size \\\n", - "860 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", - "2 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "607 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", + " hash size \\\n", + "31 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "116 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "95 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 
655054 \n", "\n", - " date_acquired pdf_convert_time \\\n", - "860 2024-08-30T10:32:40.640835 312.142404 \n", - "2 2024-08-30T10:32:49.798524 321.107279 \n", - "607 2024-08-30T10:32:49.798524 321.107279 \n", + " date_acquired pdf_convert_time source_filename \\\n", + "31 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "116 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "95 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", "\n", - " source_filename \\\n", - "860 Walmart_2024.pdf \n", - "2 Walmart-10K-Reports-Optimized_2023.pdf \n", - "607 Walmart-10K-Reports-Optimized_2023.pdf \n", + " source_document_id \\\n", + "31 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "116 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "95 81bc331a-69cf-49bd-84b9-afedcab1344a \n", "\n", " contents doc_jsonpath \\\n", - "860 #682= +2I )68F=2E@CJ @>A=:2?46 )6AFE2E:@?... $.main-text[299] \n", - "2 A A message from our r CEO\\nFor the fiscal y... $.main-text[14] \n", - "607 ITEM 15. EXHIBITS, FINANCIAL STATEMENT SCHEDUL... $.main-text[978] \n", + "31 3 Model Architecture\\nremove final 8 layers fr... $.main-text[69] \n", + "116 Acknowledgments\\nThanks and acknowledgement to... $.main-text[249] \n", + "95 6.3 Code Editing and Translation\\nCodeLingua (... $.main-text[200] \n", "\n", - " page_number bbox \\\n", - "860 27 [35.24, 725.11, 503.48, 747.51] \n", - "2 3 [214.16, 607.24, 390.6, 617.44] \n", - "607 85 [111.85, 578.01, 263.73, 587.0] \n", + " page_number bbox \\\n", + "31 6 [107.45430756, 456.21582031, 504.50476074, 521... \n", + "116 21 [107.07092285, 59.12960052, 505.24591064, 160.... \n", + "95 17 [107.03813934, 207.6650238, 505.74505615, 350.... \n", "\n", - " hash_column int_id_column \n", - "860 97e06840b409f4ca176c2d5b145e8f25c9d3d37c6510ac... 187 \n", - "2 2f26fa255117cd004e3fc8e4348d39fd265e570edfdbc7... 653 \n", - "607 43503987ec97f0b553f02f3572b2326006b641745b7c2f... 1258 " + " document_id \\\n", + "31 72fbd93a7a834627114fd13cdb1a48c354d6bd991a9eb9... \n", + "116 b6d51d1a54147d95051f77bf536ca6ab7360102dd5ac84... \n", + "95 c52299a48da2f5517c7ed6b964195a46dd0e339af1d0f3... \n", + "\n", + " chunk_hash chunk_id \n", + "31 72fbd93a7a834627114fd13cdb1a48c354d6bd991a9eb9... 119 \n", + "116 b6d51d1a54147d95051f77bf536ca6ab7360102dd5ac84... 204 \n", + "95 c52299a48da2f5517c7ed6b964195a46dd0e339af1d0f3... 183 " ] }, - "execution_count": 16, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1333,179 +1087,137 @@ "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", "\n", - "output_df.sample(3)" + "output_df.sample(min(3, output_df.shape[0]))" ] }, { "cell_type": "markdown", - "id": "85309751-8556-41c6-ac32-84acc941bc8d", + "id": "4692975c-49ff-41ae-810e-0f5bc0bbdc53", "metadata": {}, "source": [ - "## Step-6: Fuzzy Dedup\n", + "## Step-6: Exact Dedup\n", "\n", - "Post exact deduplication, fuzzy deduplication is applied with\n", - "the goal of removing code files that may have slight variations and thereby unbiasing\n", - "the data further. Small variations are quite commonly seen in code data in the form\n", - "of variations in the values of variables, addittion of logging statements etc. Find near-\n", - "duplicate." + "Remove documents having identical code to remove bias in the training data. On the content of each document, a SHA256 hash is computed,\n", + "followed by de-duplication of record having identical hashes." 
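    "\n",
    "For intuition, here is a minimal hand-written sketch of exact de-duplication by content hash on a single pandas DataFrame. This is an illustration only, not the DPK `ededup` transform (which runs distributed on Ray, as the execution logs below show); the helper name `exact_dedupe` is made up here, and `contents` matches the `doc_column` configured below.\n",
    "\n",
    "```python\n",
    "import hashlib\n",
    "import pandas as pd\n",
    "\n",
    "def exact_dedupe(df: pd.DataFrame, doc_column=\"contents\"):\n",
    "    # sha256 over the chunk text, as described above\n",
    "    hashes = df[doc_column].map(\n",
    "        lambda doc: hashlib.sha256(doc.encode(\"utf-8\")).hexdigest())\n",
    "    # keep the first occurrence of each hash, drop exact duplicates\n",
    "    return df.loc[~hashes.duplicated(keep=\"first\")].reset_index(drop=True)\n",
    "```"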
] }, { "cell_type": "markdown", - "id": "fcf574a3-b287-419c-9c86-07b828b41ca6", + "id": "5acfd3a2-a236-4143-bcfc-15804f1da7fe", "metadata": {}, "source": [ - "### Set Input/output Folder" + "### 6.1 - Set Input/output Folder" ] }, { "cell_type": "code", - "execution_count": 17, - "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", + "execution_count": 13, + "id": "4c7a1b94", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-5: Processing input='output/04_doc_id_out' --> output='output/05_fdedupe_out'\n" + "🏃🏼 STAGE-4: Processing input='output/03_docid_out' --> output='output/04_exact_dedupe_out'\n" ] } ], "source": [ - "## Input to this component is the output of doc_id generator component. \n", + "STAGE = 4\n", + "\n", + "input_folder = output_docid_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_exact_dedupe_dir\n", "\n", - "STAGE += 1\n", - "# STAGE = 5 ## DEBUG\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_fdedupe_out\")\n", "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" ] }, { "cell_type": "markdown", - "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3", + "id": "3661cb37-39c7-4b09-a784-925bfa9eaf1e", "metadata": {}, "source": [ - "### Execute " + "### 6.2 - Execute " ] }, { "cell_type": "code", - "execution_count": 18, - "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", + "execution_count": 14, + "id": "a624b2b2-faad-4325-ac7d-53a840f564ef", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "10:38:40 INFO - Running locally\n", - "10:38:40 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'int_id_column', 'cluster_column': 'hash_column', 'bucket_cpu': 0.5, 'mhash_cpu': 0.5, 'doc_cpu': 0.5, 'num_doc_actors': 2, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 2, 'num_permutations': 64, 'threshold': 0.8, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 1}}\n", - "10:38:40 INFO - data factory data_ is using local data access: input_folder - output/04_doc_id_out output_folder - output/05_fdedupe_out\n", - "10:38:40 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:38:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:38:40 INFO - pipeline id pipeline_id\n", - "10:38:40 INFO - code location None\n", - "10:38:40 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:38:40 INFO - actor creation delay 0\n", - "10:38:40 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:38:42,441\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - orchestrator started at 2024-08-30 10:38:43\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Number of files is 2, source profile {'max_file_size': 0.25233936309814453, 'min_file_size': 0.2446727752685547, 'total_file_size': 0.4970121383666992}\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.28145294263959, 'object_store': 4.140726470388472}\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - starting run from the beginning\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - continuing from the very beginning\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Fuzzy: num buckets 5, bucket length 11\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - created 1 bucket actors\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - created 1 minhash actors\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Table preprocessing uses 2 readers\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - created 2 table processor actors\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:43 INFO - Completed 0 files (0.0%) in 6.504853566487631e-06 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:52 INFO - Completed processing 2 files in 0.15140592257181804 min\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:52 INFO - creating minhash snapshots\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:53 INFO - minhash snapshots created\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:53 INFO - creating bucket snapshots\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:54 INFO - bucket snapshots created\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:54 INFO - created 2 document actors\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:54 INFO - created 2 bucket processor actors\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:54 INFO - created bucket processor invoker\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:54 INFO - added invoker to bucket collectors\n", - "\u001b[36m(BucketsHash pid=1094647)\u001b[0m 10:38:54 INFO - processing buckets 0 long, 6569 short\n", - "\u001b[36m(BucketsHash pid=1094647)\u001b[0m 10:38:54 INFO - Done submitting long buckets\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:55 INFO - Done processing buckets in 0.011683110396067302 min\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:55 INFO - creating document snapshots\n", - "\u001b[36m(BucketsHashProcessorInvoker pid=1095253)\u001b[0m 10:38:55 INFO - Waiting bucket processing completion. Submitted requests 66\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:57 INFO - document snapshots created\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:38:57 INFO - Completed 0 files (0.0%) in 1.0371208190917969e-05 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:39:06 INFO - Completed processing 2 files in 0.1462758183479309 min\n", - "\u001b[36m(orchestrate pid=1093805)\u001b[0m 10:39:06 INFO - done flushing in 0.001108407974243164 sec\n", - "10:39:16 INFO - Completed execution in 0.5921090364456176 min, execution result 0\n" + "00:29:10 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'chunk_hash', 'use_snapshot': False, 'snapshot_directory': None, 'hash_cpu': 0.5, 'num_hashes': 2}\n", + "00:29:10 INFO - pipeline id pipeline_id\n", + "00:29:10 INFO - code location None\n", + "00:29:10 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", + "00:29:10 INFO - actor creation delay 0\n", + "00:29:10 INFO - job details {'job category': 'preprocessing', 'job name': 'ededup', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:29:10 INFO - data factory data_ is using local data access: input_folder - output/03_docid_out output_folder - output/04_exact_dedupe_out\n", + "00:29:10 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:29:10 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:29:10 INFO - Running locally\n", + "2024-10-02 00:29:11,920\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:12 INFO - orchestrator started at 2024-10-02 00:29:12\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:12 INFO - Number of files is 2, source profile {'max_file_size': 0.0694570541381836, 'min_file_size': 0.03227043151855469, 'total_file_size': 0.10172748565673828}\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:12 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.913980866782367, 'object_store': 2.4569904319941998}\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:12 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:12 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:13 INFO - Completed processing 2 files in 0.013 min\n", + "\u001b[36m(orchestrate pid=643333)\u001b[0m 00:29:13 INFO - done flushing in 0.001 sec\n", + "00:29:23 INFO - Completed execution in 0.227 min, execution result 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "✅ Stage:5 completed successfully\n", - "CPU times: user 208 ms, sys: 195 ms, total: 402 ms\n", - "Wall time: 37 s\n" + "✅ Stage:4 completed successfully\n", + "CPU times: user 120 ms, sys: 172 ms, total: 292 ms\n", + "Wall time: 14.9 s\n" ] } ], "source": [ - "%%time \n", - "\n", - "import os\n", - "import sys\n", + "%%time\n", "\n", - "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", + "# Import ededup transform configuration\n", + "from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration\n", "\n", - "# create parameters\n", "\n", + "# Prepare the commandline params\n", "local_conf = {\n", " \"input_folder\": input_folder,\n", " \"output_folder\": output_folder,\n", "}\n", "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", - "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", "params = {\n", " # where to run\n", " \"run_locally\": True,\n", " # Data access. Only required parameters are specified\n", " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # Orchestration parameters\n", + " # orchestrator\n", " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " # columns used\n", - " \"fdedup_doc_column\": \"contents\",\n", - " \"fdedup_id_column\": \"int_id_column\",\n", - " \"fdedup_cluster_column\": \"hash_column\",\n", - " # infrastructure\n", - " \"fdedup_bucket_cpu\": 0.5,\n", - " \"fdedup_doc_cpu\": 0.5,\n", - " \"fdedup_mhash_cpu\": 0.5,\n", - " \"fdedup_num_doc_actors\": 2,\n", - " \"fdedup_num_bucket_actors\": 1,\n", - " \"fdedup_num_minhash_actors\": 1,\n", - " \"fdedup_num_preprocessors\": 2,\n", - " # fuzzy parameters\n", - " \"fdedup_num_permutations\": 64,\n", - " \"fdedup_threshold\": 0.8,\n", - " \"fdedup_shingles_size\": 5,\n", - " \"fdedup_delimiters\": \" \"\n", + " # ededup parameters\n", + " \"ededup_hash_cpu\": 0.5,\n", + " \"ededup_num_hashes\": 2,\n", + " \"ededup_doc_column\": \"contents\",\n", + " \"ededup_doc_id_column\": \"chunk_hash\",\n", + " \n", "}\n", "\n", - "# Pass commandline params\n", + "# Pass the commandline params\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", "\n", + "# create launcher\n", + "launcher = RayTransformLauncher(EdedupRayTransformRuntimeConfiguration())\n", "# launch\n", - "\n", - "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n", - "\n", "return_code = launcher.launch()\n", "\n", "if return_code == 0:\n", @@ -1516,25 +1228,27 @@ }, { "cell_type": "markdown", - "id": "a6f8cd11", + "id": "eaf1c3c3", "metadata": {}, "source": [ - "### Inspect Generated output" + "### 6.3 - Inspect Generated output" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "e899ad60", + "execution_count": 15, + "id": "d824ebf6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Input data dimensions (rows x columns)= (1324, 15)\n", - "Output data dimensions (rows x columns)= (1302, 17)\n", - "Duplicate chunks removed by fuzzy-dedupe: 22\n" + "Input data dimensions (rows x columns)= 
(211, 18)\n", + "Output data dimensions (rows x columns)= (211, 19)\n", + "Input chunks before exact dedupe : 211\n", + "Output chunks after exact dedupe : 211\n", + "Duplicate chunks removed : 0\n" ] }, { @@ -1562,124 +1276,137 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", " source_filename\n", + " source_document_id\n", " contents\n", " doc_jsonpath\n", " page_number\n", " bbox\n", - " int_id_column\n", - " hash_column\n", + " document_id\n", + " chunk_hash\n", + " chunk_id\n", + " removed\n", " \n", " \n", " \n", " \n", - " 1102\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 188\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " Walmart_2024.pdf\n", - " & % (' ('-+(%%#'! '- + ,-\\n(CB7CBHFC@...\n", - " $.main-text[734]\n", - " 66\n", - " [35.27, 647.11, 551.24, 729.93]\n", - " 447\n", - " -1\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 6.2 Model Variations\\nTo evaluate the importan...\n", + " $.main-text[112]\n", + " 8\n", + " [107.1419754, 91.9256134, 504.05615234, 113.59...\n", + " 6eb55d1014abb7e7a010fd07b994af17a0cad7ca059f8f...\n", + " 6eb55d1014abb7e7a010fd07b994af17a0cad7ca059f8f...\n", + " 65\n", + " []\n", " \n", " \n", - " 470\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 153\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " Share-Based Compensation\\nFair value of restri...\n", - " $.tables[39]\n", - " 68\n", - " [47.21, 61.57, 540.3, 152.39]\n", - " 1123\n", - " -1\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 3.2.2 Multi-Head Attention\\noutput values. 
The...\n", + " $.main-text[54]\n", + " 5\n", + " [107.36427307, 696.97607422, 503.99719238, 717...\n", + " 07f191b8e14ee3784ecc42c94e4096c97388733f1ea59b...\n", + " 07f191b8e14ee3784ecc42c94e4096c97388733f1ea59b...\n", + " 30\n", + " []\n", " \n", " \n", - " 339\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 68\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " Capital Resources\\nWe believe our cash flows ...\n", - " $.main-text[517]\n", - " 48\n", - " [46.39, 510.11, 539.0, 555.63]\n", - " 992\n", - " -1\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 6.1.5 RepoBench, CrossCodeEval: Repository-Lev...\n", + " $.main-text[154]\n", + " 12\n", + " [107.21151733, 141.59487915, 505.73928833, 218...\n", + " 650d9bcdcb744b665a189a4d02f09a4be39dcde46a0ecd...\n", + " 650d9bcdcb744b665a189a4d02f09a4be39dcde46a0ecd...\n", + " 156\n", + " []\n", " \n", " \n", "\n", "" ], "text/plain": [ - " filename num_pages num_tables \\\n", - "1102 Walmart_2024.pdf 100 82 \n", - "470 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "339 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "188 attension.pdf 15 4 193 pdf \n", + "153 attension.pdf 15 4 193 pdf \n", + "68 granite.pdf 28 17 348 pdf \n", + "\n", + " hash size \\\n", + "188 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "153 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "68 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", "\n", - " num_doc_elements document_id ext \\\n", - "1102 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", - "470 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", - "339 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", + " date_acquired pdf_convert_time source_filename \\\n", + "188 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", + "153 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", + "68 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", "\n", - " hash size \\\n", - "1102 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", - "470 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "339 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", + " source_document_id \\\n", + "188 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", + "153 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", + "68 81bc331a-69cf-49bd-84b9-afedcab1344a \n", "\n", - " date_acquired pdf_convert_time \\\n", - "1102 2024-08-30T10:32:40.640835 312.142404 \n", - "470 2024-08-30T10:32:49.798524 321.107279 \n", - "339 2024-08-30T10:32:49.798524 321.107279 \n", + " contents doc_jsonpath \\\n", + "188 6.2 Model Variations\\nTo evaluate the importan... $.main-text[112] \n", + "153 3.2.2 Multi-Head Attention\\noutput values. The... $.main-text[54] \n", + "68 6.1.5 RepoBench, CrossCodeEval: Repository-Lev... $.main-text[154] \n", "\n", - " source_filename \\\n", - "1102 Walmart_2024.pdf \n", - "470 Walmart-10K-Reports-Optimized_2023.pdf \n", - "339 Walmart-10K-Reports-Optimized_2023.pdf \n", + " page_number bbox \\\n", + "188 8 [107.1419754, 91.9256134, 504.05615234, 113.59... 
\n", + "153 5 [107.36427307, 696.97607422, 503.99719238, 717... \n", + "68 12 [107.21151733, 141.59487915, 505.73928833, 218... \n", "\n", - " contents doc_jsonpath \\\n", - "1102 & % (' ('-+(%%#'! '- + ,-\\n(CB7CBHFC@... $.main-text[734] \n", - "470 Share-Based Compensation\\nFair value of restri... $.tables[39] \n", - "339 Capital Resources\\nWe believe our cash flows ... $.main-text[517] \n", + " document_id \\\n", + "188 6eb55d1014abb7e7a010fd07b994af17a0cad7ca059f8f... \n", + "153 07f191b8e14ee3784ecc42c94e4096c97388733f1ea59b... \n", + "68 650d9bcdcb744b665a189a4d02f09a4be39dcde46a0ecd... \n", "\n", - " page_number bbox int_id_column hash_column \n", - "1102 66 [35.27, 647.11, 551.24, 729.93] 447 -1 \n", - "470 68 [47.21, 61.57, 540.3, 152.39] 1123 -1 \n", - "339 48 [46.39, 510.11, 539.0, 555.63] 992 -1 " + " chunk_hash chunk_id removed \n", + "188 6eb55d1014abb7e7a010fd07b994af17a0cad7ca059f8f... 65 [] \n", + "153 07f191b8e14ee3784ecc42c94e4096c97388733f1ea59b... 30 [] \n", + "68 650d9bcdcb744b665a189a4d02f09a4be39dcde46a0ecd... 156 [] " ] }, - "execution_count": 19, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1691,151 +1418,186 @@ "\n", "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", - "print (\"Duplicate chunks removed by fuzzy-dedupe: \", (input_df.shape[0] - output_df.shape[0]))\n", + "print (f\"Input chunks before exact dedupe : {input_df.shape[0]:,}\")\n", + "print (f\"Output chunks after exact dedupe : {output_df.shape[0]:,}\")\n", + "print (\"Duplicate chunks removed : \", (input_df.shape[0] - output_df.shape[0]))\n", "\n", - "output_df.sample(3)" + "output_df.sample(min(3, output_df.shape[0]))" ] }, { "cell_type": "markdown", - "id": "0646cbb7-3046-44c0-827d-d102d3ff7cb8", + "id": "85309751-8556-41c6-ac32-84acc941bc8d", "metadata": {}, "source": [ - "## Step-7: Document Quality" + "## Step-7: Fuzzy Dedup\n", + "\n", + "Post exact deduplication, fuzzy deduplication is applied with\n", + "the goal of removing code files that may have slight variations and thereby unbiasing\n", + "the data further. Small variations are quite commonly seen in code data in the form\n", + "of variations in the values of variables, addittion of logging statements etc. Find near-\n", + "duplicate." ] }, { "cell_type": "markdown", - "id": "2e985668-848b-4633-b0d8-9fe70ada0c91", + "id": "fcf574a3-b287-419c-9c86-07b828b41ca6", "metadata": {}, "source": [ - "### Set Input/output Folder" + "### 7.1 - Set Input/output Folder" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "9f080011-c9fe-430e-9ecc-f2220d2c8d18", + "execution_count": 16, + "id": "9e431c8c-c7c7-48de-ba5f-2c4649c35399", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-6: Processing input='output/05_fdedupe_out' --> output='output/06_doc_quality_out'\n" + "🏃🏼 STAGE-5: Processing input='output/04_exact_dedupe_out' --> output='output/05_fuzzy_dedupe_out'\n" ] } ], "source": [ - "STAGE += 1\n", - "# STAGE = 6 ## DEBUG\n", + "## Input to this component is the output of doc_id generator component. 
\n", + "\n", + "STAGE = 5\n", + "\n", + "input_folder = output_exact_dedupe_dir # previous output folder is the input folder for the current stage\n", + "output_folder = output_fuzzy_dedupe_dir\n", + "\n", + "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_doc_quality_out\")\n", "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" ] }, { "cell_type": "markdown", - "id": "c02982c5-f398-4a1a-a9fe-42d7ae748c7c", + "id": "f4c82a8f-b513-4fe5-b172-d41b104b54f3", "metadata": {}, "source": [ - "### Execute " + "### 7.2 - Execute " ] }, { "cell_type": "code", - "execution_count": 21, - "id": "29319fb9-b0d8-4f86-9bc5-b92960ad8ae5", + "execution_count": 17, + "id": "3864ff77-e9a8-48f7-973b-c3b3aef1a94f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "10:39:17 INFO - Running locally\n", - "10:39:17 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/home/sujee/my-stuff/projects/ai-alliance/data-prep-kit-sujee/transforms/language/doc_quality/python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "10:39:17 INFO - data factory docq_ is using local configuration without input/output path\n", - "10:39:17 INFO - data factory docq_ max_files -1, n_sample -1\n", - "10:39:17 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:39:17 INFO - data factory data_ is using local data access: input_folder - output/05_fdedupe_out output_folder - output/06_doc_quality_out\n", - "10:39:17 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:39:17 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:39:17 INFO - pipeline id pipeline_id\n", - "10:39:17 INFO - code location None\n", - "10:39:17 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:39:17 INFO - actor creation delay 0\n", - "10:39:17 INFO - job details {'job category': 'preprocessing', 'job name': 'docq', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:39:19,513\tINFO worker.py:1744 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:20 INFO - orchestrator started at 2024-08-30 10:39:20\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:20 INFO - Number of files is 2, source profile {'max_file_size': 0.20880889892578125, 'min_file_size': 0.200042724609375, 'total_file_size': 0.40885162353515625}\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:20 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.259551240131259, 'object_store': 4.129775619134307}\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:20 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:20 INFO - Completed 0 files (0.0%) in 6.075700124104818e-06 min. 
Waiting for completion\n", - "\u001b[36m(RayTransformFileProcessor pid=1097239)\u001b[0m 10:39:20 INFO - Load badwords found locally from /home/sujee/my-stuff/projects/ai-alliance/data-prep-kit-sujee/transforms/language/doc_quality/python/ldnoobw/en\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:21 INFO - Completed processing 2 files in 0.02414883772532145 min\n", - "\u001b[36m(orchestrate pid=1096394)\u001b[0m 10:39:21 INFO - done flushing in 0.0010554790496826172 sec\n", - "10:39:31 INFO - Completed execution in 0.23775473435719807 min, execution result 0\n", - "\u001b[36m(RayTransformFileProcessor pid=1097238)\u001b[0m 10:39:20 INFO - Load badwords found locally from /home/sujee/my-stuff/projects/ai-alliance/data-prep-kit-sujee/transforms/language/doc_quality/python/ldnoobw/en\n" + "00:29:25 INFO - fuzzy dedup params are {'doc_column': 'contents', 'id_column': 'chunk_id', 'cluster_column': 'chunk_hash', 'bucket_cpu': 0.3, 'mhash_cpu': 0.3, 'doc_cpu': 0.3, 'num_doc_actors': 1, 'num_minhash_actors': 1, 'num_bucket_actors': 1, 'num_preprocessors': 1, 'num_permutations': 64, 'threshold': 0.7, 'shingles_size': 5, 'delimiters': ' ', 'snapshot_delay': 1, 'use_bucket_snapshot': False, 'use_doc_snapshot': False, 'random_delay_limit': 10, 'worker_options': {'num_cpus': 1}}\n", + "00:29:25 INFO - pipeline id pipeline_id\n", + "00:29:25 INFO - code location None\n", + "00:29:25 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", + "00:29:25 INFO - actor creation delay 0\n", + "00:29:25 INFO - job details {'job category': 'preprocessing', 'job name': 'fdedup', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:29:25 INFO - data factory data_ is using local data access: input_folder - output/04_exact_dedupe_out output_folder - output/05_fuzzy_dedupe_out\n", + "00:29:25 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:29:25 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:29:25 INFO - Running locally\n", + "2024-10-02 00:29:26,903\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - orchestrator started at 2024-10-02 00:29:28\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - Number of files is 2, source profile {'max_file_size': 0.06981658935546875, 'min_file_size': 0.032629966735839844, 'total_file_size': 0.1024465560913086}\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.94085159432143, 'object_store': 2.470425795763731}\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - starting run from the beginning\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - continuing from the very beginning\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - Fuzzy: num buckets 8, bucket length 8\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - created 1 bucket actors\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - created 1 minhash actors\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - Table preprocessing uses 1 readers\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:28 INFO - created 1 table processor actors\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:34 INFO - Completed 1 files in 0.115 min\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:34 INFO - Completed 1 files (50.0%) in 0.115 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:41 INFO - Completed processing 2 files in 0.217 min\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:41 INFO - creating minhash snapshots\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:42 INFO - minhash snapshots created\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:42 INFO - creating bucket snapshots\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - bucket snapshots created\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - created 1 document actors\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - created 1 bucket processor actors\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - created bucket processor invoker\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - added invoker to bucket collectors\n", + "\u001b[36m(BucketsHash pid=645808)\u001b[0m 00:29:43 INFO - processing buckets 0 long, 1686 short\n", + "\u001b[36m(BucketsHash pid=645808)\u001b[0m 00:29:43 INFO - Done submitting long buckets\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - Done processing buckets in 0.011 min\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:43 INFO - creating document snapshots\n", + "\u001b[36m(BucketsHashProcessorInvoker pid=646353)\u001b[0m 00:29:43 INFO - Waiting bucket processing completion. Submitted requests 17\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:44 INFO - document snapshots created\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:44 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:52 INFO - Completed processing 2 files in 0.131 min\n", + "\u001b[36m(orchestrate pid=644959)\u001b[0m 00:29:52 INFO - done flushing in 0.003 sec\n", + "00:30:02 INFO - Completed execution in 0.627 min, execution result 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "✅ Stage:6 completed successfully\n", - "CPU times: user 139 ms, sys: 177 ms, total: 316 ms\n", - "Wall time: 15.6 s\n" + "✅ Stage:5 completed successfully\n", + "CPU times: user 223 ms, sys: 189 ms, total: 412 ms\n", + "Wall time: 39 s\n" ] } ], "source": [ - "%%time\n", + "%%time \n", "\n", "import os\n", "import sys\n", - "from pathlib import Path\n", "\n", - "from doc_quality_transform import (\n", - " text_lang_cli_param,\n", - " doc_content_column_cli_param,\n", - " bad_word_filepath_cli_param,\n", - ")\n", - "from doc_quality_transform_ray import DocQualityRayTransformConfiguration\n", "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_ray import FdedupRayTransformConfiguration\n", + "\n", + "# create parameters\n", "\n", "local_conf = {\n", " \"input_folder\": input_folder,\n", " \"output_folder\": output_folder,\n", "}\n", - "\n", - "doc_quality_basedir = os.path.join(rootdir, \"transforms\", \"language\", \"doc_quality\", \"python\")\n", "worker_options = {\"num_cpus\" : MY_CONFIG.RAY_NUM_CPUS}\n", + "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", "params = {\n", " # where to run\n", " \"run_locally\": True,\n", " # Data access. Only required parameters are specified\n", " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", - " # orchestrator\n", + " # Orchestration parameters\n", " \"runtime_worker_options\": ParamsUtils.convert_to_ast(worker_options),\n", " \"runtime_num_workers\": MY_CONFIG.RAY_RUNTIME_WORKERS,\n", - " \"runtime_pipeline_id\": \"pipeline_id\",\n", - " \"runtime_job_id\": \"job_id\",\n", - " \"runtime_creation_delay\": 0,\n", - " # doc quality configuration\n", - " text_lang_cli_param: \"en\",\n", - " doc_content_column_cli_param: \"contents\",\n", - " bad_word_filepath_cli_param: os.path.join(doc_quality_basedir, \"ldnoobw\", \"en\"),\n", + " # columns used\n", + " \"fdedup_doc_column\": \"contents\",\n", + " \"fdedup_id_column\": \"chunk_id\",\n", + " \"fdedup_cluster_column\": \"chunk_hash\",\n", + " # infrastructure\n", + " \"fdedup_bucket_cpu\": 0.3,\n", + " \"fdedup_doc_cpu\": 0.3,\n", + " \"fdedup_mhash_cpu\": 0.3,\n", + " \"fdedup_num_doc_actors\": 1,\n", + " \"fdedup_num_bucket_actors\": 1,\n", + " \"fdedup_num_minhash_actors\": 1,\n", + " \"fdedup_num_preprocessors\": 1,\n", + " # fuzzy parameters\n", + " \"fdedup_num_permutations\": 64,\n", + " \"fdedup_threshold\": 0.7, # between 0.0 to 1.0 ; smaller values tend to be more lenient in finding near dupes; close to 1.0 is more strict\n", + " \"fdedup_shingles_size\": 5,\n", + " \"fdedup_delimiters\": \" \"\n", "}\n", "\n", - "\n", - "Path(output_folder).mkdir(parents=True, exist_ok=True)\n", - "\n", + "# Pass commandline params\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", "\n", - "# create launcher\n", - "launcher = RayTransformLauncher(DocQualityRayTransformConfiguration())\n", "# launch\n", + "\n", + "launcher = RayTransformLauncher(FdedupRayTransformConfiguration())\n", + "\n", "return_code = launcher.launch()\n", "\n", "if return_code == 0:\n", @@ -1846,24 +1608,25 @@ }, { "cell_type": "markdown", - "id": "43b7d855", + "id": "a6f8cd11", 
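For intuition about what the fuzzy-dedupe parameters above mean, here is a toy, self-contained sketch of the near-duplicate test on two hypothetical chunks. It computes exact Jaccard similarity over 5-word shingles (mirroring `fdedup_shingles_size` and the space delimiter), whereas the actual transform approximates this at scale with MinHash signatures (64 permutations) and Ray actors. The example strings and helper names are illustrative only.

```python
def shingles(text: str, size: int = 5) -> set:
    """Set of `size`-word shingles, split on spaces like fdedup_delimiters."""
    words = text.split(" ")
    return {" ".join(words[i:i + size]) for i in range(max(1, len(words) - size + 1))}

def jaccard(a: set, b: set) -> float:
    """Jaccard similarity between two shingle sets."""
    return len(a & b) / len(a | b) if (a | b) else 0.0

chunk_1 = "the model is trained on permissively licensed code data from github"
chunk_2 = "the model is trained on permissively licensed code data from gitlab"

similarity = jaccard(shingles(chunk_1), shingles(chunk_2))
# With fdedup_threshold = 0.7, a pair scoring at or above 0.7 is treated as near-duplicate
print(f"Jaccard similarity: {similarity:.2f} -> near-duplicate: {similarity >= 0.7}")
```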
"metadata": {}, "source": [ - "### Inspect Generated output" + "### 7.3 - Inspect Generated output" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "f631d5c1", + "execution_count": 18, + "id": "e899ad60", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Input data dimensions (rows x columns)= (1324, 15)\n", - "Output data dimensions (rows x columns)= (1302, 28)\n" + "Input data dimensions (rows x columns)= (211, 19)\n", + "Output data dimensions (rows x columns)= (211, 19)\n", + "Duplicate chunks removed by fuzzy-dedupe: 0\n" ] }, { @@ -1891,148 +1654,137 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", - " ...\n", - " docq_mean_word_len\n", - " docq_symbol_to_word_ratio\n", - " docq_sentence_count\n", - " docq_lorem_ipsum_ratio\n", - " docq_curly_bracket_ratio\n", - " docq_contain_bad_word\n", - " docq_bullet_point_ratio\n", - " docq_ellipsis_line_ratio\n", - " docq_alphabet_word_ratio\n", - " docq_contain_common_en_words\n", + " source_filename\n", + " source_document_id\n", + " contents\n", + " doc_jsonpath\n", + " page_number\n", + " bbox\n", + " document_id\n", + " chunk_id\n", + " removed\n", + " chunk_hash\n", " \n", " \n", " \n", " \n", - " 354\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 47\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " ...\n", - " 6.272727\n", - " 0.0\n", - " 2\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 1.000000\n", - " False\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 6.1.1 HumanEvalSynthesize: Multilingual Code G...\n", + " $.main-text[118]\n", + " 9\n", + " [107.09940338, 505.84005737, 505.70474243, 604...\n", + " 22dd65548755f19ec6ccd89020fd1fbc88e339fafbd881...\n", + " 135\n", + " []\n", + " -1\n", " \n", " \n", - " 1125\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 134\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " ...\n", - " 5.121622\n", - " 0.0\n", - " 31\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.648649\n", - " False\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 1 Introduction\\nAttention mechanisms have beco...\n", + " $.main-text[20]\n", + " 2\n", + " [107.17721558, 497.6980896, 505.65536499, 540....\n", + " 362722af4a10ed54ca21fd329149c01397a621e15f8306...\n", + " 11\n", + " []\n", + " -1\n", " \n", " \n", - " 204\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 93\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " ...\n", - " 5.880000\n", - " 
0.0\n", - " 1\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 1.000000\n", - " True\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 6.3 Code Editing and Translation\\nTarget Langu...\n", + " $.tables[13]\n", + " 17\n", + " [161.45388794, 433.6942749, 450.61630249, 552....\n", + " f665c10385f0eb31b2b94e5e61c934651f5789f5ab528c...\n", + " 181\n", + " []\n", + " -1\n", " \n", " \n", "\n", - "

3 rows × 28 columns

\n", "" ], "text/plain": [ - " filename num_pages num_tables \\\n", - "354 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "1125 Walmart_2024.pdf 100 82 \n", - "204 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "47 granite.pdf 28 17 348 pdf \n", + "134 attension.pdf 15 4 193 pdf \n", + "93 granite.pdf 28 17 348 pdf \n", "\n", - " num_doc_elements document_id ext \\\n", - "354 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", - "1125 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", - "204 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", + " hash size \\\n", + "47 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "134 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "93 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", "\n", - " hash size \\\n", - "354 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "1125 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", - "204 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", + " date_acquired pdf_convert_time source_filename \\\n", + "47 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "134 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", + "93 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", "\n", - " date_acquired pdf_convert_time ... docq_mean_word_len \\\n", - "354 2024-08-30T10:32:49.798524 321.107279 ... 6.272727 \n", - "1125 2024-08-30T10:32:40.640835 312.142404 ... 5.121622 \n", - "204 2024-08-30T10:32:49.798524 321.107279 ... 5.880000 \n", + " source_document_id \\\n", + "47 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "134 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", + "93 81bc331a-69cf-49bd-84b9-afedcab1344a \n", "\n", - " docq_symbol_to_word_ratio docq_sentence_count docq_lorem_ipsum_ratio \\\n", - "354 0.0 2 0.0 \n", - "1125 0.0 31 0.0 \n", - "204 0.0 1 0.0 \n", - "\n", - " docq_curly_bracket_ratio docq_contain_bad_word docq_bullet_point_ratio \\\n", - "354 0.0 False 0.0 \n", - "1125 0.0 False 0.0 \n", - "204 0.0 False 0.0 \n", + " contents doc_jsonpath \\\n", + "47 6.1.1 HumanEvalSynthesize: Multilingual Code G... $.main-text[118] \n", + "134 1 Introduction\\nAttention mechanisms have beco... $.main-text[20] \n", + "93 6.3 Code Editing and Translation\\nTarget Langu... $.tables[13] \n", "\n", - " docq_ellipsis_line_ratio docq_alphabet_word_ratio \\\n", - "354 0.0 1.000000 \n", - "1125 0.0 0.648649 \n", - "204 0.0 1.000000 \n", + " page_number bbox \\\n", + "47 9 [107.09940338, 505.84005737, 505.70474243, 604... \n", + "134 2 [107.17721558, 497.6980896, 505.65536499, 540.... \n", + "93 17 [161.45388794, 433.6942749, 450.61630249, 552.... \n", "\n", - " docq_contain_common_en_words \n", - "354 False \n", - "1125 False \n", - "204 True \n", + " document_id chunk_id removed \\\n", + "47 22dd65548755f19ec6ccd89020fd1fbc88e339fafbd881... 135 [] \n", + "134 362722af4a10ed54ca21fd329149c01397a621e15f8306... 11 [] \n", + "93 f665c10385f0eb31b2b94e5e61c934651f5789f5ab528c... 
181 [] \n", "\n", - "[3 rows x 28 columns]" + " chunk_hash \n", + "47 -1 \n", + "134 -1 \n", + "93 -1 " ] }, - "execution_count": 22, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -2044,8 +1796,9 @@ "\n", "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", + "print (\"Duplicate chunks removed by fuzzy-dedupe: \", (input_df.shape[0] - output_df.shape[0]))\n", "\n", - "output_df.sample(3)" + "output_df.sample(min(3, output_df.shape[0]))" ] }, { @@ -2058,9 +1811,17 @@ "Encode text for the vector storage." ] }, + { + "cell_type": "markdown", + "id": "8fbbeaff", + "metadata": {}, + "source": [ + "### 8.1 - Set Input/output Folder" + ] + }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "id": "20a153fa-fd56-401e-86be-4f7617affcc8", "metadata": {}, "outputs": [ @@ -2068,25 +1829,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "🏃🏼 STAGE-7: Processing input='output/06_doc_quality_out' --> output='output/07_encoder_out'\n" + "🏃🏼 STAGE-6: Processing input='output/05_fuzzy_dedupe_out' --> output='output/06_embeddings_out'\n" ] } ], "source": [ - "STAGE += 1\n", - "# STAGE = 7 ## DEBUG\n", + "STAGE = 6\n", "\n", - "input_folder = output_folder # previous output folder is the input folder for the current stage\n", - "output_folder = os.path.join(MY_CONFIG.OUTPUT_FOLDER, f\"{STAGE:02}_encoder_out\")\n", + "input_folder = output_fuzzy_dedupe_dir\n", + "output_folder = output_embeddings_dir\n", "\n", "input_df = read_parquet_files_as_df(input_folder) ## for debug purposes\n", "\n", "print (f\"🏃🏼 STAGE-{STAGE}: Processing input='{input_folder}' --> output='{output_folder}'\")" ] }, + { + "cell_type": "markdown", + "id": "1e6a88f8", + "metadata": {}, + "source": [ + "### 8.2 - Execute" + ] + }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 20, "id": "228df6b2-bc62-494b-9697-03ece98d7853", "metadata": {}, "outputs": [ @@ -2094,38 +1862,34 @@ "name": "stderr", "output_type": "stream", "text": [ - "10:39:33 INFO - Running locally\n", - "10:39:33 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", - "10:39:33 INFO - data factory data_ is using local data access: input_folder - output/06_doc_quality_out output_folder - output/07_encoder_out\n", - "10:39:33 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:39:33 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:39:33 INFO - pipeline id pipeline_id\n", - "10:39:33 INFO - code location None\n", - "10:39:33 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", - "10:39:33 INFO - actor creation delay 0\n", - "10:39:33 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n", - "2024-08-30 10:39:35,588\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:39:38 INFO - orchestrator started at 2024-08-30 10:39:38\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:39:38 INFO - Number of files is 2, source profile {'max_file_size': 0.2231884002685547, 'min_file_size': 0.2173166275024414, 'total_file_size': 0.4405050277709961}\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:39:38 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 8.224630738608539, 'object_store': 4.112315367907286}\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:39:38 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:39:38 INFO - Completed 0 files (0.0%) in 6.500879923502604e-06 min. Waiting for completion\n", - "\u001b[36m(RayTransformFileProcessor pid=1098990)\u001b[0m /home/sujee/apps/anaconda3/envs/data-prep-kit-3-py311/lib/python3.11/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", - "\u001b[36m(RayTransformFileProcessor pid=1098990)\u001b[0m warnings.warn(\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:40:26 INFO - Completed processing 2 files in 0.7918713609377543 min\n", - "\u001b[36m(orchestrate pid=1098089)\u001b[0m 10:40:26 INFO - done flushing in 0.0010461807250976562 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=1098989)\u001b[0m /home/sujee/apps/anaconda3/envs/data-prep-kit-3-py311/lib/python3.11/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", - "\u001b[36m(RayTransformFileProcessor pid=1098989)\u001b[0m warnings.warn(\n", - "10:40:36 INFO - Completed execution in 1.0400522033373514 min, execution result 0\n" + "00:30:04 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2'}\n", + "00:30:04 INFO - pipeline id pipeline_id\n", + "00:30:04 INFO - code location None\n", + "00:30:04 INFO - number of workers 2 worker options {'num_cpus': 1, 'max_restarts': -1}\n", + "00:30:04 INFO - actor creation delay 0\n", + "00:30:04 INFO - job details {'job category': 'preprocessing', 'job name': 'text_encoder', 'job type': 'ray', 'job id': 'job_id'}\n", + "00:30:04 INFO - data factory data_ is using local data access: input_folder - output/05_fuzzy_dedupe_out output_folder - output/06_embeddings_out\n", + "00:30:04 INFO - data factory data_ max_files -1, n_sample -1\n", + "00:30:04 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "00:30:04 INFO - Running locally\n", + "2024-10-02 00:30:06,760\tINFO worker.py:1744 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:10 INFO - orchestrator started at 2024-10-02 00:30:10\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:10 INFO - Number of files is 2, source profile {'max_file_size': 0.06542396545410156, 'min_file_size': 0.029404640197753906, 'total_file_size': 0.09482860565185547}\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:10 INFO - Cluster resources: {'cpus': 16, 'gpus': 1, 'memory': 4.923227692954242, 'object_store': 2.4616138450801373}\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:10 INFO - Number of workers - 2 with {'num_cpus': 1, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:10 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:21 INFO - Completed processing 2 files in 0.188 min\n", + "\u001b[36m(orchestrate pid=647243)\u001b[0m 00:30:21 INFO - done flushing in 0.001 sec\n", + "00:30:31 INFO - Completed execution in 0.449 min, execution result 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "✅ Stage:7 completed successfully\n", - "CPU times: user 510 ms, sys: 285 ms, total: 795 ms\n", - "Wall time: 1min 4s\n" + "✅ Stage:6 completed successfully\n", + "CPU times: user 638 ms, sys: 269 ms, total: 907 ms\n", + "Wall time: 29 s\n" ] } ], @@ -2169,12 +1933,12 @@ "id": "b734852c", "metadata": {}, "source": [ - "### Inspect Generated output" + "### 8.3 - Inspect Generated output" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 21, "id": "7b1c1d09", "metadata": {}, "outputs": [ @@ -2182,8 +1946,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Input data dimensions (rows x columns)= (1302, 28)\n", - "Output data dimensions (rows x columns)= (1302, 29)\n" + "Input data dimensions (rows x columns)= (211, 19)\n", + "Output data dimensions (rows x columns)= (211, 20)\n" ] }, { @@ -2211,153 +1975,141 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " pdf_convert_time\n", - " ...\n", - " docq_symbol_to_word_ratio\n", - " docq_sentence_count\n", - " docq_lorem_ipsum_ratio\n", - " docq_curly_bracket_ratio\n", - " docq_contain_bad_word\n", - " docq_bullet_point_ratio\n", - " docq_ellipsis_line_ratio\n", - " docq_alphabet_word_ratio\n", - " docq_contain_common_en_words\n", + " source_filename\n", + " source_document_id\n", + " contents\n", + " doc_jsonpath\n", + " page_number\n", + " bbox\n", + " document_id\n", + " chunk_id\n", + " removed\n", + " chunk_hash\n", " embeddings\n", " \n", " \n", " \n", " \n", - " 916\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", + " 171\n", + " attension.pdf\n", + " 15\n", + " 4\n", + " 193\n", " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " ...\n", - " 0.00000\n", - " 3\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.978022\n", - " False\n", - " [-0.048175987, 0.0011802563, -0.046808466, -0....\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 4 Why Self-Attention\\nlength n is smaller than...\n", + " $.main-text[85]\n", + " 7\n", + " 
[107.26034546, 652.83349609, 504.29177856, 717...\n", + " 6f8efa86e0a4f77b0d72d4a3141e5e0611b2921a392b99...\n", + " 48\n", + " []\n", + " -1\n", + " [0.018015103, -0.038851, 0.0016827772, -0.0493...\n", " \n", " \n", - " 286\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " a8118ae6-e6b5-4595-86ed-bf519ec23551\n", + " 25\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-30T10:32:49.798524\n", - " 321.107279\n", - " ...\n", - " 0.00000\n", - " 29\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.919414\n", - " True\n", - " [0.0038028236, -0.13894859, 0.015160485, -0.00...\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " 3 Model Architecture\\nBatch size, 3B = 2048. B...\n", + " $.tables[0]\n", + " 5\n", + " [138.25450134, 299.99499512, 471.55078125, 432...\n", + " b8f3a83c697e885ad31913c716644399a4772691e39d0b...\n", + " 113\n", + " []\n", + " -1\n", + " [0.003977602, -0.06122852, -0.089708336, -0.00...\n", " \n", " \n", - " 852\n", - " Walmart_2024.pdf\n", - " 100\n", - " 82\n", - " 1163\n", - " 00df8499-2863-4ca4-96dc-0c2a2014c3dc\n", - " pdf\n", - " dd3b262828146a536bdc0f04e7c9dfbd7406d043714989...\n", - " 1112045\n", - " 2024-08-30T10:32:40.640835\n", - " 312.142404\n", - " ...\n", - " 0.01087\n", + " 137\n", + " attension.pdf\n", + " 15\n", " 4\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.978261\n", - " False\n", - " [-0.033763092, 0.031698707, -0.04227217, 0.008...\n", + " 193\n", + " pdf\n", + " 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23...\n", + " 135814\n", + " 2024-10-02T00:26:29.888597\n", + " 53.822026\n", + " attension.pdf\n", + " 7afd3fbc-3a9f-4728-8fd8-4a9a13980244\n", + " 2 Background\\nSelf-attention, sometimes called...\n", + " $.main-text[24]\n", + " 2\n", + " [107.29702759, 256.18237305, 505.24960327, 298...\n", + " 9c2abd2ec38b67c74873e0cd670d27b702711d05930f26...\n", + " 14\n", + " []\n", + " -1\n", + " [0.03394238, -0.0117239505, -0.03349689, -0.02...\n", " \n", " \n", "\n", - "

3 rows × 29 columns

\n", "" ], "text/plain": [ - " filename num_pages num_tables \\\n", - "916 Walmart_2024.pdf 100 82 \n", - "286 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "852 Walmart_2024.pdf 100 82 \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "171 attension.pdf 15 4 193 pdf \n", + "25 granite.pdf 28 17 348 pdf \n", + "137 attension.pdf 15 4 193 pdf \n", "\n", - " num_doc_elements document_id ext \\\n", - "916 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", - "286 1163 a8118ae6-e6b5-4595-86ed-bf519ec23551 pdf \n", - "852 1163 00df8499-2863-4ca4-96dc-0c2a2014c3dc pdf \n", + " hash size \\\n", + "171 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", + "25 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "137 6fe23d4f932c725077dfc8334f3f4da4e3aaf908d2aa23... 135814 \n", "\n", - " hash size \\\n", - "916 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", - "286 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "852 dd3b262828146a536bdc0f04e7c9dfbd7406d043714989... 1112045 \n", + " date_acquired pdf_convert_time source_filename \\\n", + "171 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", + "25 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "137 2024-10-02T00:26:29.888597 53.822026 attension.pdf \n", "\n", - " date_acquired pdf_convert_time ... \\\n", - "916 2024-08-30T10:32:40.640835 312.142404 ... \n", - "286 2024-08-30T10:32:49.798524 321.107279 ... \n", - "852 2024-08-30T10:32:40.640835 312.142404 ... \n", + " source_document_id \\\n", + "171 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", + "25 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "137 7afd3fbc-3a9f-4728-8fd8-4a9a13980244 \n", "\n", - " docq_symbol_to_word_ratio docq_sentence_count docq_lorem_ipsum_ratio \\\n", - "916 0.00000 3 0.0 \n", - "286 0.00000 29 0.0 \n", - "852 0.01087 4 0.0 \n", + " contents doc_jsonpath \\\n", + "171 4 Why Self-Attention\\nlength n is smaller than... $.main-text[85] \n", + "25 3 Model Architecture\\nBatch size, 3B = 2048. B... $.tables[0] \n", + "137 2 Background\\nSelf-attention, sometimes called... $.main-text[24] \n", "\n", - " docq_curly_bracket_ratio docq_contain_bad_word docq_bullet_point_ratio \\\n", - "916 0.0 False 0.0 \n", - "286 0.0 False 0.0 \n", - "852 0.0 False 0.0 \n", + " page_number bbox \\\n", + "171 7 [107.26034546, 652.83349609, 504.29177856, 717... \n", + "25 5 [138.25450134, 299.99499512, 471.55078125, 432... \n", + "137 2 [107.29702759, 256.18237305, 505.24960327, 298... \n", "\n", - " docq_ellipsis_line_ratio docq_alphabet_word_ratio \\\n", - "916 0.0 0.978022 \n", - "286 0.0 0.919414 \n", - "852 0.0 0.978261 \n", + " document_id chunk_id removed \\\n", + "171 6f8efa86e0a4f77b0d72d4a3141e5e0611b2921a392b99... 48 [] \n", + "25 b8f3a83c697e885ad31913c716644399a4772691e39d0b... 113 [] \n", + "137 9c2abd2ec38b67c74873e0cd670d27b702711d05930f26... 14 [] \n", "\n", - " docq_contain_common_en_words \\\n", - "916 False \n", - "286 True \n", - "852 False \n", - "\n", - " embeddings \n", - "916 [-0.048175987, 0.0011802563, -0.046808466, -0.... \n", - "286 [0.0038028236, -0.13894859, 0.015160485, -0.00... \n", - "852 [-0.033763092, 0.031698707, -0.04227217, 0.008... \n", - "\n", - "[3 rows x 29 columns]" + " chunk_hash embeddings \n", + "171 -1 [0.018015103, -0.038851, 0.0016827772, -0.0493... \n", + "25 -1 [0.003977602, -0.06122852, -0.089708336, -0.00... \n", + "137 -1 [0.03394238, -0.0117239505, -0.03349689, -0.02... 
" ] }, - "execution_count": 25, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2370,7 +2122,7 @@ "print (\"Input data dimensions (rows x columns)= \", input_df.shape)\n", "print (\"Output data dimensions (rows x columns)= \", output_df.shape)\n", "\n", - "output_df.sample(3)" + "output_df.sample(min(3, output_df.shape[0]))" ] }, { @@ -2383,7 +2135,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 22, "id": "16dee3b8-31dc-4168-8adb-f2a0a0b5e207", "metadata": {}, "outputs": [ @@ -2391,7 +2143,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "✅ Copied output from 'output/07_encoder_out' --> 'output/output_final'\n" + "✅ Copied output from 'output/06_embeddings_out' --> 'output/output_final'\n" ] } ], diff --git a/examples/notebooks/rag/rag_1B_load_data_into_milvus.ipynb b/examples/notebooks/rag/rag_1B_load_data_into_milvus.ipynb index 1c25e06ac..e481cf9ee 100644 --- a/examples/notebooks/rag/rag_1B_load_data_into_milvus.ipynb +++ b/examples/notebooks/rag/rag_1B_load_data_into_milvus.ipynb @@ -17,7 +17,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Configuration" + "## Step-1: Configuration" ] }, { @@ -26,22 +26,14 @@ "metadata": {}, "outputs": [], "source": [ - "class MyConfig:\n", - " pass\n", - "MY_CONFIG = MyConfig()\n", - "\n", - "MY_CONFIG.PROCESSED_DATA_DIR = 'output/output_final'\n", - "\n", - "MY_CONFIG.DB_URI = './rag_1_dpk.db' # For embedded instance\n", - "#MY_CONFIG.DB_URI = 'http://localhost:19530' # For Docker instance\n", - "MY_CONFIG.COLLECTION_NAME = 'dpk_walmart_docs'" + "from my_config import MY_CONFIG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Step-: Load Parquet Data\n", + "## Step-2: Load Parquet Data\n", "\n", "Load all `.parquet` files in the given dir" ] @@ -58,10 +50,10 @@ "Loading data from : output/output_final\n", "Number of parquet files to read : 2\n", "\n", - "Read file: 'output/output_final/Walmart-10K-Reports-Optimized_2023.parquet'. number of rows = 666\n", - "Read file: 'output/output_final/Walmart_2024.parquet'. number of rows = 636\n", + "Read file: 'output/output_final/granite.parquet'. number of rows = 123\n", + "Read file: 'output/output_final/attension.parquet'. 
number of rows = 88\n", "\n", - "Total number of rows = 1302\n" + "Total number of rows = 211\n" ] } ], @@ -69,10 +61,10 @@ "import pandas as pd\n", "import glob\n", "\n", - "print ('Loading data from : ', MY_CONFIG.PROCESSED_DATA_DIR)\n", + "print ('Loading data from : ', MY_CONFIG.OUTPUT_FOLDER_FINAL)\n", "\n", "# Get a list of all Parquet files in the directory\n", - "parquet_files = glob.glob(f'{MY_CONFIG.PROCESSED_DATA_DIR}/*.parquet')\n", + "parquet_files = glob.glob(f'{MY_CONFIG.OUTPUT_FOLDER_FINAL}/*.parquet')\n", "print (\"Number of parquet files to read : \", len(parquet_files))\n", "print ()\n", "\n", @@ -102,41 +94,32 @@ "text": [ "embedding length: 384\n", "\n", - "RangeIndex: 1302 entries, 0 to 1301\n", - "Data columns (total 29 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 filename 1302 non-null object \n", - " 1 num_pages 1302 non-null int64 \n", - " 2 num_tables 1302 non-null int64 \n", - " 3 num_doc_elements 1302 non-null int64 \n", - " 4 document_id 1302 non-null object \n", - " 5 ext 1302 non-null object \n", - " 6 hash 1302 non-null object \n", - " 7 size 1302 non-null int64 \n", - " 8 date_acquired 1302 non-null object \n", - " 9 pdf_convert_time 1302 non-null float64\n", - " 10 source_filename 1302 non-null object \n", - " 11 text 1302 non-null object \n", - " 12 doc_jsonpath 1302 non-null object \n", - " 13 page_number 1302 non-null int64 \n", - " 14 bbox 1302 non-null object \n", - " 15 int_id_column 1302 non-null int64 \n", - " 16 hash_column 1302 non-null int64 \n", - " 17 docq_total_words 1302 non-null int64 \n", - " 18 docq_mean_word_len 1302 non-null float64\n", - " 19 docq_symbol_to_word_ratio 1302 non-null float64\n", - " 20 docq_sentence_count 1302 non-null int64 \n", - " 21 docq_lorem_ipsum_ratio 1302 non-null float64\n", - " 22 docq_curly_bracket_ratio 1302 non-null float64\n", - " 23 docq_contain_bad_word 1302 non-null bool \n", - " 24 docq_bullet_point_ratio 1302 non-null float64\n", - " 25 docq_ellipsis_line_ratio 1302 non-null float64\n", - " 26 docq_alphabet_word_ratio 1302 non-null float64\n", - " 27 docq_contain_common_en_words 1302 non-null bool \n", - " 28 vector 1302 non-null object \n", - "dtypes: bool(2), float64(8), int64(9), object(10)\n", - "memory usage: 277.3+ KB\n", + "RangeIndex: 211 entries, 0 to 210\n", + "Data columns (total 20 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 filename 211 non-null object \n", + " 1 num_pages 211 non-null int64 \n", + " 2 num_tables 211 non-null int64 \n", + " 3 num_doc_elements 211 non-null int64 \n", + " 4 ext 211 non-null object \n", + " 5 hash 211 non-null object \n", + " 6 size 211 non-null int64 \n", + " 7 date_acquired 211 non-null object \n", + " 8 pdf_convert_time 211 non-null float64\n", + " 9 source_filename 211 non-null object \n", + " 10 source_document_id 211 non-null object \n", + " 11 text 211 non-null object \n", + " 12 doc_jsonpath 211 non-null object \n", + " 13 page_number 211 non-null int64 \n", + " 14 bbox 211 non-null object \n", + " 15 document_id 211 non-null object \n", + " 16 chunk_id 211 non-null int64 \n", + " 17 removed 211 non-null object \n", + " 18 chunk_hash 211 non-null int64 \n", + " 19 vector 211 non-null object \n", + "dtypes: float64(1), int64(7), object(12)\n", + "memory usage: 33.1+ KB\n", "None\n" ] }, @@ -165,150 +148,138 @@ " num_pages\n", " num_tables\n", " num_doc_elements\n", - " document_id\n", " ext\n", " hash\n", " size\n", " date_acquired\n", " 
pdf_convert_time\n", - " ...\n", - " docq_symbol_to_word_ratio\n", - " docq_sentence_count\n", - " docq_lorem_ipsum_ratio\n", - " docq_curly_bracket_ratio\n", - " docq_contain_bad_word\n", - " docq_bullet_point_ratio\n", - " docq_ellipsis_line_ratio\n", - " docq_alphabet_word_ratio\n", - " docq_contain_common_en_words\n", + " source_filename\n", + " source_document_id\n", + " text\n", + " doc_jsonpath\n", + " page_number\n", + " bbox\n", + " document_id\n", + " chunk_id\n", + " removed\n", + " chunk_hash\n", " vector\n", " \n", " \n", " \n", " \n", " 0\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " d626ab9b-0f53-446c-b55d-150fbbd93066\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-29T00:43:21.059856\n", - " 332.679391\n", - " ...\n", - " 0.0\n", - " 3\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 1.000000\n", - " True\n", - " [-0.006206639, 0.010256912, 0.023658218, -0.02...\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " Granite Code Models: A Family of Open Foundati...\n", + " $.main-text[3]\n", + " 1\n", + " [142.70646667, 672.96929932, 468.58251953, 711...\n", + " b773445f7cf4cc9a5bf6ec296c74504f93c9c179028ac6...\n", + " 88\n", + " []\n", + " -1\n", + " [-0.015789315, -0.07841933, -0.032271657, 0.00...\n", " \n", " \n", " 1\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " d626ab9b-0f53-446c-b55d-150fbbd93066\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-29T00:43:21.059856\n", - " 332.679391\n", - " ...\n", - " 0.0\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " Granite Code Models: A Family of Open Foundati...\n", + " $.main-text[4]\n", " 1\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.909091\n", - " True\n", - " [-0.0497427, 0.046492133, -0.02381167, 0.02798...\n", + " [107.61845398, 535.62896729, 503.99923706, 647...\n", + " 7353bcc8d99c279335eaf120c793ca6a08f9a4fddcbb5b...\n", + " 89\n", + " []\n", + " -1\n", + " [-0.059480786, -0.056680508, -0.042864937, -0....\n", " \n", " \n", " 2\n", - " Walmart-10K-Reports-Optimized_2023.pdf\n", - " 100\n", - " 81\n", - " 1163\n", - " d626ab9b-0f53-446c-b55d-150fbbd93066\n", + " granite.pdf\n", + " 28\n", + " 17\n", + " 348\n", " pdf\n", - " ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8...\n", - " 1159974\n", - " 2024-08-29T00:43:21.059856\n", - " 332.679391\n", - " ...\n", - " 0.0\n", + " 79c53d694df467391e94f279af2fa6a9a7e45c3922546e...\n", + " 655054\n", + " 2024-10-02T00:28:23.836369\n", + " 167.768806\n", + " granite.pdf\n", + " 81bc331a-69cf-49bd-84b9-afedcab1344a\n", + " Granite Code Models: A Family of Open Foundati...\n", + " $.main-text[5]\n", " 1\n", - " 0.0\n", - " 0.0\n", - " False\n", - " 0.0\n", - " 0.0\n", - " 0.875000\n", - " False\n", - " [-0.03265641, -0.040947884, 0.017305722, 0.022...\n", + " [220.87228394, 484.46414185, 390.87872314, 529...\n", + " 389267895ca214924a0a071df8379c2b15fcf374f232a6...\n", + " 90\n", + " []\n", + " -1\n", + " [-0.07557265, -0.07152908, -0.048923455, 
-0.04...\n", " \n", " \n", "\n", - "

3 rows × 29 columns

\n", "" ], "text/plain": [ - " filename num_pages num_tables \\\n", - "0 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "1 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "2 Walmart-10K-Reports-Optimized_2023.pdf 100 81 \n", - "\n", - " num_doc_elements document_id ext \\\n", - "0 1163 d626ab9b-0f53-446c-b55d-150fbbd93066 pdf \n", - "1 1163 d626ab9b-0f53-446c-b55d-150fbbd93066 pdf \n", - "2 1163 d626ab9b-0f53-446c-b55d-150fbbd93066 pdf \n", - "\n", - " hash size \\\n", - "0 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "1 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", - "2 ea5544f26fe0831ec9befbf7aaf68b1b256df6c3ae18b8... 1159974 \n", + " filename num_pages num_tables num_doc_elements ext \\\n", + "0 granite.pdf 28 17 348 pdf \n", + "1 granite.pdf 28 17 348 pdf \n", + "2 granite.pdf 28 17 348 pdf \n", "\n", - " date_acquired pdf_convert_time ... \\\n", - "0 2024-08-29T00:43:21.059856 332.679391 ... \n", - "1 2024-08-29T00:43:21.059856 332.679391 ... \n", - "2 2024-08-29T00:43:21.059856 332.679391 ... \n", + " hash size \\\n", + "0 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "1 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", + "2 79c53d694df467391e94f279af2fa6a9a7e45c3922546e... 655054 \n", "\n", - " docq_symbol_to_word_ratio docq_sentence_count docq_lorem_ipsum_ratio \\\n", - "0 0.0 3 0.0 \n", - "1 0.0 1 0.0 \n", - "2 0.0 1 0.0 \n", + " date_acquired pdf_convert_time source_filename \\\n", + "0 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "1 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", + "2 2024-10-02T00:28:23.836369 167.768806 granite.pdf \n", "\n", - " docq_curly_bracket_ratio docq_contain_bad_word docq_bullet_point_ratio \\\n", - "0 0.0 False 0.0 \n", - "1 0.0 False 0.0 \n", - "2 0.0 False 0.0 \n", + " source_document_id \\\n", + "0 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "1 81bc331a-69cf-49bd-84b9-afedcab1344a \n", + "2 81bc331a-69cf-49bd-84b9-afedcab1344a \n", "\n", - " docq_ellipsis_line_ratio docq_alphabet_word_ratio \\\n", - "0 0.0 1.000000 \n", - "1 0.0 0.909091 \n", - "2 0.0 0.875000 \n", + " text doc_jsonpath \\\n", + "0 Granite Code Models: A Family of Open Foundati... $.main-text[3] \n", + "1 Granite Code Models: A Family of Open Foundati... $.main-text[4] \n", + "2 Granite Code Models: A Family of Open Foundati... $.main-text[5] \n", "\n", - " docq_contain_common_en_words \\\n", - "0 True \n", - "1 True \n", - "2 False \n", + " page_number bbox \\\n", + "0 1 [142.70646667, 672.96929932, 468.58251953, 711... \n", + "1 1 [107.61845398, 535.62896729, 503.99923706, 647... \n", + "2 1 [220.87228394, 484.46414185, 390.87872314, 529... \n", "\n", - " vector \n", - "0 [-0.006206639, 0.010256912, 0.023658218, -0.02... \n", - "1 [-0.0497427, 0.046492133, -0.02381167, 0.02798... \n", - "2 [-0.03265641, -0.040947884, 0.017305722, 0.022... \n", + " document_id chunk_id removed \\\n", + "0 b773445f7cf4cc9a5bf6ec296c74504f93c9c179028ac6... 88 [] \n", + "1 7353bcc8d99c279335eaf120c793ca6a08f9a4fddcbb5b... 89 [] \n", + "2 389267895ca214924a0a071df8379c2b15fcf374f232a6... 90 [] \n", "\n", - "[3 rows x 29 columns]" + " chunk_hash vector \n", + "0 -1 [-0.015789315, -0.07841933, -0.032271657, 0.00... \n", + "1 -1 [-0.059480786, -0.056680508, -0.042864937, -0.... \n", + "2 -1 [-0.07557265, -0.07152908, -0.048923455, -0.04... 
" ] }, "execution_count": 3, @@ -339,7 +310,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Connect to Vector Database\n", + "## Step-3: Connect to Vector Database\n", "\n", "Milvus can be embedded and easy to use.\n", "\n", @@ -377,7 +348,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Create A Collection\n", + "# Step-4: Create A Collection\n", "\n" ] }, @@ -390,8 +361,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "✅ Cleared collection : dpk_walmart_docs\n", - "✅ Created collection : dpk_walmart_docs\n" + "✅ Created collection : dpk_papers\n" ] } ], @@ -421,13 +391,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "inserted # rows 1302\n" + "inserted # rows 211\n" ] }, { "data": { "text/plain": [ - "{'row_count': 1302}" + "{'row_count': 211}" ] }, "execution_count": 6, @@ -447,7 +417,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Close DB Connection\n", + "## Step-5: Close DB Connection\n", "\n", "Close the connection so the lock files are relinquished and other notebooks can access the db" ] diff --git a/examples/notebooks/rag/rag_1C_vector_search.ipynb b/examples/notebooks/rag/rag_1C_vector_search.ipynb index a0b0a849a..e49de86e4 100644 --- a/examples/notebooks/rag/rag_1C_vector_search.ipynb +++ b/examples/notebooks/rag/rag_1C_vector_search.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Configuration" + "## Step-1: Configuration" ] }, { @@ -20,23 +20,14 @@ "metadata": {}, "outputs": [], "source": [ - "class MyConfig:\n", - " pass\n", - "MY_CONFIG = MyConfig()\n", - "\n", - "MY_CONFIG.EMBEDDING_MODEL = \"sentence-transformers/all-MiniLM-L6-v2\"\n", - "MY_CONFIG.EMBEDDING_LENGTH = 384\n", - "\n", - "MY_CONFIG.DB_URI = './rag_1_dpk.db' # For embedded instance\n", - "#MY_CONFIG.DB_URI = 'http://localhost:19530' # For Docker instance\n", - "MY_CONFIG.COLLECTION_NAME = 'dpk_walmart_docs'" + "from my_config import MY_CONFIG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Connect to Vector Database\n", + "## Step-2: Connect to Vector Database\n", "\n", "Milvus can be embedded and easy to use.\n", "\n", @@ -72,7 +63,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup Embeddings\n", + "## Step-3: Setup Embeddings\n", "\n", "Two choices here. \n", "\n", @@ -89,9 +80,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/sujee/apps/anaconda3/envs/data-prep-kit-2/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", + "/home/sujee/apps/anaconda3/envs/data-prep-kit-4-021/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", " from tqdm.autonotebook import tqdm, trange\n", - "/home/sujee/apps/anaconda3/envs/data-prep-kit-2/lib/python3.11/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + "/home/sujee/apps/anaconda3/envs/data-prep-kit-4-021/lib/python3.11/site-packages/huggingface_hub/file_download.py:1142: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. 
Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } @@ -145,7 +136,7 @@ "output_type": "stream", "text": [ "sentence transformer : embeddings len = 384\n", - "sentence transformer : embeddings[:5] = [ 0.02468893 0.10352128 0.02752643 -0.08551716 -0.01412826]\n", + "sentence transformer : embeddings[:5] = [ 0.02468893 0.10352131 0.02752644 -0.08551719 -0.01412828]\n", "milvus model wrapper : embeddings len = 384\n", "milvus model wrapper : embeddings[:5] = [ 0.02468893 0.10352128 0.02752643 -0.08551716 -0.01412826]\n" ] @@ -167,7 +158,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Do A Vector Search\n", + "## Step-4: Do A Vector Search\n", "\n", "We will do this to verify data" ] @@ -220,50 +211,50 @@ "text": [ "num results : 5\n", "------ result 1 --------\n", - "search score: 0.5978392958641052\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 99\n", + "search score: 0.5946735143661499\n", + "filename: granite.pdf\n", + "page number: 5\n", "text:\n", - " Stock Performance Chart\n", - "Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. S&P 500 Index, 2023 = 161.45. S&P 500 Index, 2024 = 195.06. S&P 500 Consumer Discretionary, 2019 = . S&P 500 Consumer Discretionary, 2020 = . S&P 500 Consumer Discretionary, 2021 = . S&P 500 Consumer Discretionary, 2022 = . S&P 500 Consumer Discretionary, 2023 = . S&P 500 Consumer Discretionary, 2024 = . Discretionary Distribution & RiliId, 2019 = 100.00. Discretionary Distribution & RiliId, 2020 = 117.54. Discretionary Distribution & RiliId, 2021 = 166.19. Discretionary Distribution & RiliId, 2022 = 180.56. Discretionary Distribution & RiliId, 2023 = 147.66. Discretionary Distribution & RiliId, 2024 = 190.67\n", + " 3 Model Architecture\n", + "Table 1: Model configurations for Granite Code models.\n", "\n", "------ result 2 --------\n", - "search score: 0.5875853896141052\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.5919967889785767\n", + "filename: granite.pdf\n", + "page number: 6\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "Operating cash flow $36B\n", + " 3 Model Architecture\n", + "Figure 2: An overview of depth upscaling (Kim et al., 2024) for efficient training of Granite34B-Code. We utilize the 20B model after 1.6T tokens to start training of 34B model with the same code pretraining data without any changes to the training and inference framework.\n", "\n", "------ result 3 --------\n", - "search score: 0.5865607857704163\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.5557882785797119\n", + "filename: granite.pdf\n", + "page number: 1\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "through, up to and including 2030. 
Additional qualifying information can be found by visiting http://corporate.walmart.com/purpose/esgreport.\n", + " Granite Code Models: A Family of Open Foundation Models for Code Intelligence\n", + "Mayank Mishra ⋆ Matt Stallone ⋆ Gaoyuan Zhang ⋆ Yikang Shen Aditya Prasad Adriana Meza Soria Michele Merler Parameswaran Selvam Saptha Surendran Shivdeep Singh Manish Sethi Xuan-Hong Dang Pengyuan Li Kun-Lung Wu Syed Zawad Andrew Coleman Matthew White Mark Lewis Raju Pavuluri Yan Koyfman Boris Lublinsky Maximilien de Bayser Ibrahim Abdelaziz Kinjal Basu Mayank Agarwal Yi Zhou Chris Johnson Aanchal Goyal Hima Patel Yousaf Shah Petros Zerfos Heiko Ludwig Asim Munawar Maxwell Crouse Pavan Kapanipathi Shweta Salaria Bob Calio Sophia Wen Seetharami Seelam Brian Belgodere Carlos Fonseca Amith Singhee Nirmit Desai David D. Cox Ruchir Puri † Rameswar Panda †\n", "\n", "------ result 4 --------\n", - "search score: 0.5840539932250977\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.539251983165741\n", + "filename: granite.pdf\n", + "page number: 6\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "Revenues\n", + " 3 Model Architecture\n", + "remove final 8 layers from the original model and initial 8 layers from its duplicate to form two models. Finally, we concatenate both models to form Granite-34B-Code model with 88 layers (see Figure 2 for an illustration). After the depth upscaling, we observe that the drop in performance compared to 20B model is pretty small contrary to what is observed by Kim et al.. This performance is recovered pretty quickly after we continue pretraining of the upscaled 34B model. Similar, to 20B, we use a 8192 token context during pretraining.\n", "\n", "------ result 5 --------\n", - "search score: 0.5462992191314697\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.537261962890625\n", + "filename: granite.pdf\n", + "page number: 20\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "1 Our global advertising business is recorded in either net sales or as a reduction to cost of sales, depending on the nature of the advertising arrangement. 2 1B tonnes CO 2 e emissions reduced, avoided, or sequestered reported by suppliers cumulatively since 2017 through Project Gigaton. Calculated in accordance with Walmart's \"Project Gigaton Accounting Methodology.\" 3 This result also includes emissions impacts that may only be realized in 2024\n", + " 6.6 Calling Functions and Tools\n", + "Figure 4 shows the results of different Granite Code models on BFCL benchmark. As can be seen from the figure, overall accuracy improves from 25.65% to 57.12% for Granite-3BCode-Base to Granite-34B-Code-Base, showing the effectiveness of model scaling in function (tool) calling capabilities. We also compare Granite-8B-Code with CodeLlama-7B in Figure 5 and find that Granite-8B-Code-Instruct beats CodeLlama-7B-Instruct by 22%, 14% and 12% on AST Summary, Execution Summary and Overall accuracy respectively. Additionally, Figure 5 shows that instruction tuning consistently improves performance of both base models, with more noticeable improvements in Granite Code models. 
E.g., +17.88% in overall accuracy from Granite-8B-Code-Base to Granite-8B-Code-Instruct, indicating the effectiveness of our well-curated data mixture in finetuning base models.\n", "\n" ] } ], "source": [ - "query = \"What was Walmart's revenue in 2023?\"\n", + "query = \"What was the training data used to train Granite models?\"\n", "\n", "results = do_vector_search (query)\n", "print_search_results(results)" @@ -280,50 +271,50 @@ "text": [ "num results : 5\n", "------ result 1 --------\n", - "search score: 0.5755810141563416\n", - "filename: Walmart_2024_copy.pdf\n", + "search score: 0.6484582424163818\n", + "filename: attension.pdf\n", "page number: 2\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "through, up to and including 2030. Additional qualifying information can be found by visiting http://corporate.walmart.com/purpose/esgreport.\n", + " 1 Introduction\n", + "Attention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [2, 19]. In all but a few cases [27], however, such attention mechanisms are used in conjunction with a recurrent network.\n", "\n", "------ result 2 --------\n", - "search score: 0.502342700958252\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.6340895891189575\n", + "filename: attension.pdf\n", + "page number: 3\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "1B Tonnes\n", + " 3.2 Attention\n", + "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n", "\n", "------ result 3 --------\n", - "search score: 0.5014065504074097\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 99\n", + "search score: 0.5805453062057495\n", + "filename: attension.pdf\n", + "page number: 10\n", "text:\n", - " Stock Performance Chart\n", - "Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. S&P 500 Index, 2023 = 161.45. S&P 500 Index, 2024 = 195.06. S&P 500 Consumer Discretionary, 2019 = . S&P 500 Consumer Discretionary, 2020 = . S&P 500 Consumer Discretionary, 2021 = . S&P 500 Consumer Discretionary, 2022 = . S&P 500 Consumer Discretionary, 2023 = . S&P 500 Consumer Discretionary, 2024 = . Discretionary Distribution & RiliId, 2019 = 100.00. Discretionary Distribution & RiliId, 2020 = 117.54. Discretionary Distribution & RiliId, 2021 = 166.19. Discretionary Distribution & RiliId, 2022 = 180.56. Discretionary Distribution & RiliId, 2023 = 147.66. Discretionary Distribution & RiliId, 2024 = 190.67\n", + " 7 Conclusion\n", + "We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. 
Making generation less sequential is another research goals of ours.\n", "\n", "------ result 4 --------\n", - "search score: 0.49448615312576294\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.5805416703224182\n", + "filename: attension.pdf\n", + "page number: 15\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "+20%\n", + " Attention Visualizations Input-Input Layer5\n", + "Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the sentence. We give two such examples above, from two different heads from the encoder self-attention at layer 5 of 6. The heads clearly learned to perform different tasks.\n", "\n", "------ result 5 --------\n", - "search score: 0.49202316999435425\n", - "filename: Walmart_2024_copy.pdf\n", - "page number: 2\n", + "search score: 0.5769087076187134\n", + "filename: attension.pdf\n", + "page number: 13\n", "text:\n", - " \"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "+6%\n", + " Attention Visualizations Input-Input Layer5\n", + "Figure 3: An example of the attention mechanism following long-distance dependencies in the encoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of the verb 'making', completing the phrase 'making...more difficult'. Attentions here shown only for the word 'making'. Different colors represent different heads. Best viewed in color.\n", "\n" ] } ], "source": [ - "query = \"How many distribution facilities does Walmart have?\"\n", + "query = \"What is the attention mechanism?\"\n", "\n", "results = do_vector_search (query)\n", "print_search_results(results)" diff --git a/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb b/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb index 905ad307b..532b7ef4d 100644 --- a/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb +++ b/examples/notebooks/rag/rag_1D_query_llama_replicate.ipynb @@ -7,19 +7,19 @@ "# Query Data using LLM\n", "\n", "Here is the overall RAG pipeline. 
In this notebook, we will do steps (5), (6), (7), (8), (9)\n", - "- Importing data is already done in this notebook [rag_1_B_load_data.ipynb](rag_1_B_load_data.ipynb)\n", + "- Importing data is already done in this notebook [rag_1B_load_data_into_milvus.ipynb](rag_1B_load_data_into_milvus.ipynb)\n", "- 👉 Step 5: Calculate embedding for user query\n", "- 👉 Step 6 & 7: Send the query to vector db to retrieve relevant documents\n", "- 👉 Step 8 & 9: Send the query and relevant documents (returned above step) to LLM and get answers to our query\n", "\n", - "![image missing](../media/rag-overview-2.png)" + "![image missing](media/rag-overview-2.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Configuration" + "## Step-1: Configuration" ] }, { @@ -28,42 +28,14 @@ "metadata": {}, "outputs": [], "source": [ - "class MyConfig:\n", - " pass\n", - "MY_CONFIG = MyConfig()\n", - "\n", - "MY_CONFIG.EMBEDDING_MODEL = \"sentence-transformers/all-MiniLM-L6-v2\"\n", - "MY_CONFIG.EMBEDDING_LENGTH = 384\n", - "\n", - "MY_CONFIG.DB_URI = './rag_1_dpk.db' # For embedded instance\n", - "#MY_CONFIG.DB_URI = 'http://localhost:19530' # For Docker instance\n", - "MY_CONFIG.COLLECTION_NAME = 'dpk_walmart_docs'\n", - "\n", - "MY_CONFIG.LLM_MODEL = \"meta/meta-llama-3-8b-instruct\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configuration\n", - "\n", - "Create a .env file with the following properties. You can use [env.txt](../env.txt) as starting point\n", - "\n", - "---\n", - "\n", - "```text\n", - "REPLICATE_API_TOKEN=YOUR_TOKEN_GOES_HERE\n", - "```\n", - "\n", - "---" + "from my_config import MY_CONFIG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Load Configurations\n" + "## Step-2: Load .env file\n" ] }, { @@ -102,7 +74,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Connect to Vector Database\n", + "## Step-3: Connect to Vector Database\n", "\n", "Milvus can be embedded and easy to use.\n", "\n", @@ -138,7 +110,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Step-: Setup Embeddings\n", + "## Step-4: Setup Embeddings\n", "\n", "Use the same embeddings we used to index our documents!" ] @@ -152,9 +124,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/sujee/apps/anaconda3/envs/data-prep-kit-2/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", + "/home/sujee/apps/anaconda3/envs/data-prep-kit-4-021/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", " from tqdm.autonotebook import tqdm, trange\n", - "/home/sujee/apps/anaconda3/envs/data-prep-kit-2/lib/python3.11/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + "/home/sujee/apps/anaconda3/envs/data-prep-kit-4-021/lib/python3.11/site-packages/huggingface_hub/file_download.py:1142: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } @@ -179,7 +151,7 @@ "output_type": "stream", "text": [ "embeddings len = 384\n", - "embeddings[:5] = [ 0.02468893 0.10352128 0.02752643 -0.08551716 -0.01412826]\n" + "embeddings[:5] = [ 0.02468893 0.10352131 0.02752644 -0.08551719 -0.01412828]\n" ] } ], @@ -194,7 +166,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Vector Search and RAG" + "## Step-5: Vector Search and RAG" ] }, { @@ -231,35 +203,31 @@ "name": "stdout", "output_type": "stream", "text": [ - "[ { 'distance': 0.5978392958641052,\n", - " 'text': 'Stock Performance Chart\\n'\n", - " 'Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. '\n", - " 'Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. '\n", - " 'Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. '\n", - " 'S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. '\n", - " 'S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. '\n", - " 'S&P 500 Index, 2023 = 161.45. S&P 500 Index, 2024 = 195.06. '\n", - " 'S&P 500 Consumer Discretionary, 2019 = . S&P 500 Consumer '\n", - " 'Discretionary, 2020 = . S&P 500 Consumer Discretionary, '\n", - " '2021 = . S&P 500 Consumer Discretionary, 2022 = . S&P 500 '\n", - " 'Consumer Discretionary, 2023 = . S&P 500 Consumer '\n", - " 'Discretionary, 2024 = . Discretionary Distribution & '\n", - " 'RiliId, 2019 = 100.00. Discretionary Distribution & '\n", - " 'RiliId, 2020 = 117.54. Discretionary Distribution & '\n", - " 'RiliId, 2021 = 166.19. Discretionary Distribution & '\n", - " 'RiliId, 2022 = 180.56. Discretionary Distribution & '\n", - " 'RiliId, 2023 = 147.66. Discretionary Distribution & '\n", - " 'RiliId, 2024 = 190.67'},\n", - " { 'distance': 0.5875853896141052,\n", - " 'text': '\"At Walmart, we\\'re a people-led, tech-powered omnichannel '\n", - " 'retailer dedicated\\n'\n", - " 'Operating cash flow $36B'},\n", - " { 'distance': 0.5865607857704163,\n", - " 'text': '\"At Walmart, we\\'re a people-led, tech-powered omnichannel '\n", - " 'retailer dedicated\\n'\n", - " 'through, up to and including 2030. Additional qualifying '\n", - " 'information can be found by visiting '\n", - " 'http://corporate.walmart.com/purpose/esgreport.'}]\n" + "[ { 'distance': 0.5946735143661499,\n", + " 'text': '3 Model Architecture\\n'\n", + " 'Table 1: Model configurations for Granite Code models.'},\n", + " { 'distance': 0.5919967889785767,\n", + " 'text': '3 Model Architecture\\n'\n", + " 'Figure 2: An overview of depth upscaling (Kim et al., 2024) '\n", + " 'for efficient training of Granite34B-Code. 
We utilize the 20B '\n", + " 'model after 1.6T tokens to start training of 34B model with '\n", + " 'the same code pretraining data without any changes to the '\n", + " 'training and inference framework.'},\n", + " { 'distance': 0.5557882785797119,\n", + " 'text': 'Granite Code Models: A Family of Open Foundation Models for '\n", + " 'Code Intelligence\\n'\n", + " 'Mayank Mishra ⋆ Matt Stallone ⋆ Gaoyuan Zhang ⋆ Yikang Shen '\n", + " 'Aditya Prasad Adriana Meza Soria Michele Merler Parameswaran '\n", + " 'Selvam Saptha Surendran Shivdeep Singh Manish Sethi Xuan-Hong '\n", + " 'Dang Pengyuan Li Kun-Lung Wu Syed Zawad Andrew Coleman '\n", + " 'Matthew White Mark Lewis Raju Pavuluri Yan Koyfman Boris '\n", + " 'Lublinsky Maximilien de Bayser Ibrahim Abdelaziz Kinjal Basu '\n", + " 'Mayank Agarwal Yi Zhou Chris Johnson Aanchal Goyal Hima Patel '\n", + " 'Yousaf Shah Petros Zerfos Heiko Ludwig Asim Munawar Maxwell '\n", + " 'Crouse Pavan Kapanipathi Shweta Salaria Bob Calio Sophia Wen '\n", + " 'Seetharami Seelam Brian Belgodere Carlos Fonseca Amith '\n", + " 'Singhee Nirmit Desai David D. Cox Ruchir Puri † Rameswar '\n", + " 'Panda †'}]\n" ] } ], @@ -268,7 +236,7 @@ "import json\n", "import pprint\n", "\n", - "question = \"What was Walmart's revenue in 2023?\"\n", + "question = \"What was the training data used to train Granite models?\"\n", "relevant_docs = fetch_relevant_documents(question)\n", "pprint.pprint(relevant_docs, indent=4)" ] @@ -277,7 +245,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Initialize LLM\n", + "## Step-6: Initialize LLM\n", "\n", "### LLM Choices at Replicate\n", "\n", @@ -305,59 +273,6 @@ "os.environ[\"REPLICATE_API_TOKEN\"] = MY_CONFIG.REPLICATE_API_TOKEN" ] }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import replicate\n", - "\n", - "def ask_LLM (question, relevant_docs):\n", - " context = \"\\n\".join(\n", - " [doc['text'] for doc in relevant_docs]\n", - " )\n", - " print ('============ context (this is the context supplied to LLM) ============')\n", - " print (context)\n", - " print ('============ end context ============', flush=True)\n", - "\n", - " system_prompt = \"\"\"\n", - " Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.\n", - " \"\"\"\n", - " user_prompt = f\"\"\"\n", - " Use the following pieces of information enclosed in tags to provide an answer to the question enclosed in tags.\n", - " \n", - " {context}\n", - " \n", - " \n", - " {question}\n", - " \n", - " \"\"\"\n", - "\n", - " print ('============ here is the answer from LLM... STREAMING... 
=====')\n", - " # The meta/meta-llama-3-8b-instruct model can stream output as it's running.\n", - " for event in replicate.stream(\n", - " MY_CONFIG.LLM_MODEL,\n", - " input={\n", - " \"top_k\": 0,\n", - " \"top_p\": 0.95,\n", - " \"prompt\": user_prompt,\n", - " \"max_tokens\": 512,\n", - " \"temperature\": 0.1,\n", - " \"system_prompt\": system_prompt,\n", - " \"length_penalty\": 1,\n", - " \"max_new_tokens\": 512,\n", - " \"stop_sequences\": \"<|end_of_text|>,<|eot_id|>\",\n", - " \"prompt_template\": \"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n\",\n", - " \"presence_penalty\": 0,\n", - " \"log_performance_metrics\": False\n", - " },\n", - " ):\n", - " print(str(event), end=\"\")\n", - " ## ---\n", - " print ('\\n====== end LLM answer ======\\n', flush=True)\n" - ] - }, { "cell_type": "code", "execution_count": 10, @@ -392,14 +307,14 @@ " for event in replicate.stream(\n", " MY_CONFIG.LLM_MODEL,\n", " input={\n", - " \"top_k\": 0,\n", + " \"top_k\": 1,\n", " \"top_p\": 0.95,\n", " \"prompt\": user_prompt,\n", - " \"max_tokens\": 512,\n", + " \"max_tokens\": 1024,\n", " \"temperature\": 0.1,\n", " \"system_prompt\": system_prompt,\n", " \"length_penalty\": 1,\n", - " \"max_new_tokens\": 512,\n", + " # \"max_new_tokens\": 512,\n", " \"stop_sequences\": \"<|end_of_text|>,<|eot_id|>\",\n", " \"prompt_template\": \"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n\",\n", " \"presence_penalty\": 0,\n", @@ -415,7 +330,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Query" + "## Step-7: Query" ] }, { @@ -428,26 +343,26 @@ "output_type": "stream", "text": [ "============ context (this is the context supplied to LLM) ============\n", - "Stock Performance Chart\n", - "Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. S&P 500 Index, 2023 = 161.45. S&P 500 Index, 2024 = 195.06. S&P 500 Consumer Discretionary, 2019 = . S&P 500 Consumer Discretionary, 2020 = . S&P 500 Consumer Discretionary, 2021 = . S&P 500 Consumer Discretionary, 2022 = . S&P 500 Consumer Discretionary, 2023 = . S&P 500 Consumer Discretionary, 2024 = . Discretionary Distribution & RiliId, 2019 = 100.00. Discretionary Distribution & RiliId, 2020 = 117.54. Discretionary Distribution & RiliId, 2021 = 166.19. Discretionary Distribution & RiliId, 2022 = 180.56. Discretionary Distribution & RiliId, 2023 = 147.66. Discretionary Distribution & RiliId, 2024 = 190.67\n", - "\"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "Operating cash flow $36B\n", - "\"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "through, up to and including 2030. Additional qualifying information can be found by visiting http://corporate.walmart.com/purpose/esgreport.\n", + "3 Model Architecture\n", + "Table 1: Model configurations for Granite Code models.\n", + "3 Model Architecture\n", + "Figure 2: An overview of depth upscaling (Kim et al., 2024) for efficient training of Granite34B-Code. 
We utilize the 20B model after 1.6T tokens to start training of 34B model with the same code pretraining data without any changes to the training and inference framework.\n", + "Granite Code Models: A Family of Open Foundation Models for Code Intelligence\n", + "Mayank Mishra ⋆ Matt Stallone ⋆ Gaoyuan Zhang ⋆ Yikang Shen Aditya Prasad Adriana Meza Soria Michele Merler Parameswaran Selvam Saptha Surendran Shivdeep Singh Manish Sethi Xuan-Hong Dang Pengyuan Li Kun-Lung Wu Syed Zawad Andrew Coleman Matthew White Mark Lewis Raju Pavuluri Yan Koyfman Boris Lublinsky Maximilien de Bayser Ibrahim Abdelaziz Kinjal Basu Mayank Agarwal Yi Zhou Chris Johnson Aanchal Goyal Hima Patel Yousaf Shah Petros Zerfos Heiko Ludwig Asim Munawar Maxwell Crouse Pavan Kapanipathi Shweta Salaria Bob Calio Sophia Wen Seetharami Seelam Brian Belgodere Carlos Fonseca Amith Singhee Nirmit Desai David D. Cox Ruchir Puri † Rameswar Panda †\n", "============ end context ============\n", "============ here is the answer from LLM... STREAMING... =====\n", - "The provided context does not mention Walmart's revenue in 2023. However, it does provide the stock performance chart for Walmart Inc. in 2023, which shows that the stock price was $153.58.\n", + "Based on the provided context, the training data used to train Granite models is not explicitly mentioned. However, it is mentioned that the 20B model was used after 1.6T tokens to start training of 34B model with the same code pretraining data without any changes to the training and inference framework. This implies that the same code pretraining data was used for both models, but the exact nature of this data is not specified.\n", "====== end LLM answer ======\n", "\n", - "CPU times: user 254 ms, sys: 17.3 ms, total: 271 ms\n", - "Wall time: 1.14 s\n" + "CPU times: user 75.3 ms, sys: 37.8 ms, total: 113 ms\n", + "Wall time: 1.95 s\n" ] } ], "source": [ "%%time\n", "\n", - "question = \"What was Walmart's revenue in 2023?\"\n", + "question = \"What was the training data used to train Granite models?\"\n", "relevant_docs = fetch_relevant_documents(question)\n", "ask_LLM(question=question, relevant_docs=relevant_docs)" ] @@ -462,26 +377,26 @@ "output_type": "stream", "text": [ "============ context (this is the context supplied to LLM) ============\n", - "\"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "through, up to and including 2030. Additional qualifying information can be found by visiting http://corporate.walmart.com/purpose/esgreport.\n", - "Stock Performance Chart\n", - "Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. S&P 500 Index, 2023 = 161.45. S&P 500 Index, 2024 = 195.06. S&P 500 Consumer Discretionary, 2019 = . S&P 500 Consumer Discretionary, 2020 = . S&P 500 Consumer Discretionary, 2021 = . S&P 500 Consumer Discretionary, 2022 = . S&P 500 Consumer Discretionary, 2023 = . S&P 500 Consumer Discretionary, 2024 = . Discretionary Distribution & RiliId, 2019 = 100.00. Discretionary Distribution & RiliId, 2020 = 117.54. Discretionary Distribution & RiliId, 2021 = 166.19. Discretionary Distribution & RiliId, 2022 = 180.56. Discretionary Distribution & RiliId, 2023 = 147.66. 
Discretionary Distribution & RiliId, 2024 = 190.67\n", - "\"At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated\n", - "+6%\n", + "1 Introduction\n", + "Attention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [2, 19]. In all but a few cases [27], however, such attention mechanisms are used in conjunction with a recurrent network.\n", + "3.2 Attention\n", + "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n", + "7 Conclusion\n", + "We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. Making generation less sequential is another research goals of ours.\n", "============ end context ============\n", "============ here is the answer from LLM... STREAMING... =====\n", - "I apologize, but the provided context does not mention the number of distribution centers Walmart has. The context appears to be discussing Walmart's stock performance and its commitment to being a people-led, tech-powered omnichannel retailer. It does not provide information about the number of distribution centers.\n", + "Based on the provided context, an attention mechanism can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum.\n", "====== end LLM answer ======\n", "\n", - "CPU times: user 214 ms, sys: 4.25 ms, total: 218 ms\n", - "Wall time: 928 ms\n" + "CPU times: user 41.1 ms, sys: 28.7 ms, total: 69.8 ms\n", + "Wall time: 1.58 s\n" ] } ], "source": [ "%%time\n", "\n", - "question = \"How many distribution centers does Walmart have?\"\n", + "question = \"What is attention mechanism?\"\n", "relevant_docs = fetch_relevant_documents(question)\n", "ask_LLM(question=question, relevant_docs=relevant_docs)" ] @@ -496,21 +411,19 @@ "output_type": "stream", "text": [ "============ context (this is the context supplied to LLM) ============\n", - " - \n", - "3/29/2024 10:28:40 AM\n", - "*E@4< '6C7@C>2?46 92CE\n", - " &$' ) *&% & 0 ) ,$,# + - +&+ # ) +,)% \n", - " &7- 59.:&0*287 &2) %36/.2, &4.8&0 *+.(.8 \n", - ":D42= 062CD ?565 !2?F2CJ \n", + "6.1.5 RepoBench, CrossCodeEval: Repository-Level Code Generation\n", + "StarCoderBase-3B, MBPP = 29.4. StarCoderBase-3B, MBPP+ = 37.8. StableCode-3B, MBPP = 34.8. StableCode-3B, MBPP+ = 43.3. StarCoder2-3B, MBPP = 42.4. StarCoder2-3B, MBPP+ = 48.6. CodeGemma-2B, MBPP = 30.4. CodeGemma-2B, MBPP+ = 30.8. Granite-3B-Code-Base, MBPP = 36.0. Granite-3B-Code-Base, MBPP+ = 45.1. StarCoderBase-7B, MBPP = 34.8. StarCoderBase-7B, MBPP+ = 42.1. CodeLlama-7B, MBPP = 39.0. CodeLlama-7B, MBPP+ = 42.3. StarCoder2-7B, MBPP = 45.4. StarCoder2-7B, MBPP+ = 46.7. CodeGemma-7B, MBPP = 53.0. CodeGemma-7B, MBPP+ = 54.9. Granite-8B-Code-Base, MBPP = 42.2. Granite-8B-Code-Base, MBPP+ = 49.6. StarCoderBase-15B, MBPP = 37.4. StarCoderBase-15B, MBPP+ = 46.1. CodeLlama-13B, MBPP = 30.6. CodeLlama-13B, MBPP+ = 30.1. StarCoder2-15B, MBPP = 51.2. StarCoder2-15B, MBPP+ = 56.6. 
Granite-20B-Code-Base, MBPP = 43.8. Granite-20B-Code-Base, MBPP+ = 51.6. CodeLlama-34B, MBPP = 48.6. CodeLlama-34B, MBPP+ = 53.6. Granite-34B-Code-Base, MBPP = 47.2. Granite-34B-Code-Base, MBPP+ = 53.1\n", + "6.1.3 MBPP and MBPP+: Code Generation in Python\n", + "MBPP (Austin et al., 2021) and MBPP+ (Liu et al., 2023a) are two of the most widely studied benchmarks for evaluating code models. While the prompt for each MBPP problem includes a natural language description followed by a few tests, MBPP+ consists of 35 × more tests than the original benchmarks. We use greedy decoding and report the mean pass@1 for all the models. Table 5 summarizes the results of different base models. As we can see, Granite3B-Code-Base significantly outperforms CodeGemma-2B but falls short of StarCoder2-3B on\n", + "6.1.4 DS1000: Data Science Tasks in Python\n", + "The Granite Code models achieve relatively high accuracy across all sizes (e.g., outperforming CodeGemma at 2B-3B scale, StarCoder2 at 7B-8B scale and CodeLlama models with half of the sizes). This shows that our Granite Code models are not only capable of generating good code but also of using libraries more accurately in real data science workflows.\n", "============ end context ============\n", "============ here is the answer from LLM... STREAMING... =====\n", - "I'm happy to help! However, I must point out that the provided context does not contain any information about the moon landing. The text appears to be a jumbled mix of characters and symbols, and does not provide any relevant information about the moon landing or any other historical event.\n", - "\n", - "If you could provide a different context or question, I would be happy to try and assist you to the best of my abilities.\n", + "I apologize, but the provided context does not mention the moon landing. The context appears to be about code generation and evaluation benchmarks, specifically discussing the MBPP and MBPP+ benchmarks, and the performance of different code models. There is no mention of the moon landing. 
If you provide a different context or question, I'll be happy to help.\n", "====== end LLM answer ======\n", "\n", - "CPU times: user 268 ms, sys: 12.4 ms, total: 280 ms\n", - "Wall time: 1.37 s\n" + "CPU times: user 41.5 ms, sys: 21 ms, total: 62.5 ms\n", + "Wall time: 2.13 s\n" ] } ], diff --git a/examples/notebooks/rag/rag_2A_llamaindex_process.ipynb b/examples/notebooks/rag/rag_2A_llamaindex_process.ipynb index 7c1c9d124..b52ed53ea 100644 --- a/examples/notebooks/rag/rag_2A_llamaindex_process.ipynb +++ b/examples/notebooks/rag/rag_2A_llamaindex_process.ipynb @@ -24,19 +24,10 @@ "metadata": {}, "outputs": [], "source": [ - "class MyConfig:\n", - " pass\n", - "\n", - "MY_CONFIG = MyConfig()\n", - "\n", - "MY_CONFIG.EMBEDDING_MODEL = \"sentence-transformers/all-MiniLM-L6-v2\"\n", - "MY_CONFIG.EMBEDDING_LENGTH = 384\n", - "\n", - "MY_CONFIG.INPUT_DATA_DIR = \"input_data/walmart-reports-1\"\n", + "from my_config import MY_CONFIG\n", "\n", "MY_CONFIG.DB_URI = './rag_2_llamaindex.db'\n", - "MY_CONFIG.COLLECTION_NAME = 'llamaindex_walmart_docs'\n", - "MY_CONFIG.LLM_MODEL = \"meta/meta-llama-3-8b-instruct\"\n" + "MY_CONFIG.COLLECTION_NAME = 'llamaindex_papers'" ] }, { @@ -51,24 +42,13 @@ "execution_count": 2, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package punkt_tab to\n", - "[nltk_data] /home/sujee/apps/anaconda3/envs/data-prep-\n", - "[nltk_data] kit-2/lib/python3.11/site-\n", - "[nltk_data] packages/llama_index/core/_static/nltk_cache...\n", - "[nltk_data] Package punkt_tab is already up-to-date!\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "Loaded 300 chunks\n", - "CPU times: user 10.1 s, sys: 2.23 s, total: 12.3 s\n", - "Wall time: 9.88 s\n" + "Loaded 43 chunks\n", + "CPU times: user 3.9 s, sys: 869 ms, total: 4.77 s\n", + "Wall time: 2.76 s\n" ] } ], @@ -113,13 +93,167 @@ "execution_count": 4, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "08aefb8116a540678e28c78accd09648", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "modules.json: 0%| | 0.00/349 [00:00\n", + "✅ Created index: \n", "✅ Saved index to db ./rag_2_llamaindex.db\n", - "CPU times: user 24min 24s, sys: 1min 4s, total: 25min 28s\n", - "Wall time: 2min 20s\n" + "CPU times: user 912 ms, sys: 155 ms, total: 1.07 s\n", + "Wall time: 1.03 s\n" ] } ], diff --git a/examples/notebooks/rag/rag_2B_llamaindex_query.ipynb b/examples/notebooks/rag/rag_2B_llamaindex_query.ipynb index 069a2c797..717d79690 100644 --- a/examples/notebooks/rag/rag_2B_llamaindex_query.ipynb +++ b/examples/notebooks/rag/rag_2B_llamaindex_query.ipynb @@ -24,19 +24,10 @@ "metadata": {}, "outputs": [], "source": [ - "class MyConfig:\n", - " pass\n", - "\n", - "MY_CONFIG = MyConfig()\n", - "\n", - "MY_CONFIG.EMBEDDING_MODEL = \"sentence-transformers/all-MiniLM-L6-v2\"\n", - "MY_CONFIG.EMBEDDING_LENGTH = 384\n", - "\n", - "MY_CONFIG.INPUT_DATA_DIR = \"input_data/walmart-reports-1\"\n", + "from my_config import MY_CONFIG\n", "\n", "MY_CONFIG.DB_URI = './rag_2_llamaindex.db'\n", - "MY_CONFIG.COLLECTION_NAME = 'llamaindex_walmart_docs'\n", - "MY_CONFIG.LLM_MODEL = \"meta/meta-llama-3-8b-instruct\"\n" + "MY_CONFIG.COLLECTION_NAME = 'llamaindex_papers'" ] }, { @@ -66,12 +57,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[nltk_data] Downloading package punkt_tab to\n", - "[nltk_data] /home/sujee/apps/anaconda3/envs/data-prep-\n", - "[nltk_data] kit-2/lib/python3.11/site-\n", - 
"[nltk_data] packages/llama_index/core/_static/nltk_cache...\n", - "[nltk_data] Package punkt_tab is already up-to-date!\n", - "/home/sujee/apps/anaconda3/envs/data-prep-kit-2/lib/python3.11/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + "/home/sujee/apps/anaconda3/envs/data-prep-kit-4-021/lib/python3.11/site-packages/huggingface_hub/file_download.py:1142: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } @@ -114,7 +100,7 @@ " uri = MY_CONFIG.DB_URI ,\n", " dim = MY_CONFIG.EMBEDDING_LENGTH , \n", " collection_name = MY_CONFIG.COLLECTION_NAME,\n", - " overwrite=False\n", + " overwrite=False # so we load the index from db\n", ")\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", "\n", @@ -138,8 +124,8 @@ "output_type": "stream", "text": [ "✅ Loaded index from vector db: ./rag_2_llamaindex.db\n", - "CPU times: user 265 ms, sys: 25.8 ms, total: 291 ms\n", - "Wall time: 289 ms\n" + "CPU times: user 255 ms, sys: 18.9 ms, total: 274 ms\n", + "Wall time: 271 ms\n" ] } ], @@ -196,13 +182,21 @@ "text": [ "\n", "\n", - "According to the provided context information, Walmart's total revenue in 2023 was $611,289 million.\n" + "Based on the provided context information, the training data used to train the Granite models includes:\n", + "\n", + "* 3.5T to 4.5T tokens of code data\n", + "* Natural language datasets related to code\n", + "* High-quality data with two phases of training:\n", + "\t+ Phase 1: 4 trillion tokens of code data comprising 116 languages\n", + "\t+ Phase 2: 500B tokens (80% code and 20% language data) from various domains, including technical, mathematics, and web documents\n", + "\n", + "Note that the data is tokenized via byte pair encoding (BPE) and the same tokenizer as StarCoder is employed.\n" ] } ], "source": [ "query_engine = index.as_query_engine()\n", - "res = query_engine.query(\"What was Walmart's revenue in 2023?\")\n", + "res = query_engine.query(\"What was the training data used to train Granite models?\")\n", "print(res)" ] }, @@ -217,15 +211,17 @@ "text": [ "\n", "\n", - "Based on the provided context information, the answer to the query is:\n", + "Based on the provided context information, it appears that the attention mechanism is a technique used in the encoder self-attention in layer 5 of 6, which allows the model to focus on specific parts of the input when processing it. This is evident from the visualizations provided, which show the attention heads attending to distant dependencies in the input text.\n", + "\n", + "In the first example, the attention heads are shown to attend to a distant dependency of the verb \"making\", completing the phrase \"making...more difficult\". 
In the second example, the attention heads are shown to exhibit behavior related to the structure of the sentence, with different heads performing different tasks.\n", "\n", - "Walmart has a total of 163 distribution facilities.\n" + "From this, it can be inferred that the attention mechanism is a way for the model to selectively focus on certain parts of the input, allowing it to better understand the context and relationships between different elements in the input.\n" ] } ], "source": [ "query_engine = index.as_query_engine()\n", - "res = query_engine.query(\"How many distribution facilities does Walmart have?\")\n", + "res = query_engine.query(\"What is attention mechanism?\")\n", "print(res)" ] }, @@ -240,7 +236,7 @@ "text": [ "\n", "\n", - "I'm happy to help! However, I don't see any information about the moon landing in the provided context. The context appears to be a 10-K report filed by Walmart Inc. with the Securities and Exchange Commission. There is no mention of the moon landing in this report. If you have any other questions or if there's something else I can help you with, feel free to ask!\n" + "I'm happy to help! However, I don't see any information about the moon landing in the provided context. The text appears to be discussing IBM Granite Code Models and their performance on various benchmarks. Therefore, I cannot provide an answer to the query about the moon landing. If you could provide more context or clarify the question, I'd be happy to try and assist you further!\n" ] } ], diff --git a/examples/notebooks/rag/requirements.txt b/examples/notebooks/rag/requirements.txt index 4578b1ea8..1c5c4f00c 100644 --- a/examples/notebooks/rag/requirements.txt +++ b/examples/notebooks/rag/requirements.txt @@ -1,7 +1,9 @@ ## Data prep kit -#data-prep-toolkit-transforms==0.2.1.dev1 -#data-prep-toolkit-transforms-ray==0.2.1.dev1 +data-prep-toolkit-transforms==0.2.1 +data-prep-toolkit-transforms-ray==0.2.1 + +deepsearch-toolkit # Milvus @@ -28,14 +30,9 @@ llama-index ### llama-index embedding models llama-index-embeddings-huggingface -## llama-index-embeddings-mistralai -## llama-index-embeddings-openai== 0.1.7 ### llama-index LLM interfaces llama-index-llms-replicate -## llama-index-llms-mistralai -## llama-index-llms-openai==0.1.12 -# llama-index-llms-llama-cpp ### llama-index Vector dbs llama-index-vector-stores-milvus diff --git a/examples/notebooks/rag/setup-python-dev-env.md b/examples/notebooks/rag/setup-python-dev-env.md index 8c2c93d20..b007c4b4b 100644 --- a/examples/notebooks/rag/setup-python-dev-env.md +++ b/examples/notebooks/rag/setup-python-dev-env.md @@ -18,21 +18,39 @@ We will create an environment for this workshop with all the required libraries ```bash conda create -n data-prep-kit-1 -y python=3.11 +``` + +activate the new conda environment -# activate the new conda environment +```bash conda activate data-prep-kit-1 -# make sure env is swithced to data-prep-kit-1 +``` -## Check python version +Make sure env is swithced to data-prep-kit-1 + +Check python version + +```bash python --version -# should say : 3.11 +``` + +should say : 3.11 + +**Note**: If you are on a linux system install these too + +```bash +conda install gcc_linux-64 + +conda install gxx_linux-64 ``` ### A-2: Install dependencies ```bash cd examples/notebooks/rag +``` +```bash pip install -r requirements.txt ``` From 213e0c7eb8ec25d943ee4917faf051c4963107dc Mon Sep 17 00:00:00 2001 From: ian-cho <42691703+ian-cho@users.noreply.github.com> Date: Thu, 3 Oct 2024 20:24:17 +0900 Subject: [PATCH 2/7] 
Update README.md

updated hap README
---
 transforms/universal/hap/python/README.md | 36 +++++++++++++++--------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/transforms/universal/hap/python/README.md b/transforms/universal/hap/python/README.md
index 23be7084c..347fa86ae 100644
--- a/transforms/universal/hap/python/README.md
+++ b/transforms/universal/hap/python/README.md
@@ -1,14 +1,14 @@
-# HAP Annotation
+# Hate, Abuse, and Profanity (HAP) Annotation
 Please see the set of [transform project conventions](https://github.com/ian-cho/data-prep-kit/blob/dev/transforms/README.md) for details on general project conventions, transform configuration, testing and IDE set up.
 
 ## Prerequisite
-This repo needs NLTK and please refer to `requirements.txt`.
+This repository requires [NLTK](https://www.nltk.org/); please refer to `requirements.txt`.
 
 ## Summary 
 The hap transform maps a non-empty input table to an output table with an added `hap_score` column. Each row in the table represents a document, and the hap transform performs the following three steps to calculate the hap score for each document:
 
 * Sentence spliting: we use NLTK to split the document into sentence pieces.
-* Hap annotation: each sentence is assigned a hap score between 0 and 1, where 1 represents hap and 0 represents non-hap.
+* HAP annotation: each sentence is assigned a hap score between 0 and 1, where 1 represents hap and 0 represents non-hap.
 * Aggregation: the document hap score is determined by selecting the maximum hap score among its sentences.
 
 
@@ -16,25 +16,26 @@ The hap transform maps a non-empty input table to an added
 
 The set of dictionary keys holding [HAPTransformConfiguration](src/hap_transform.py) configuration for values are as follows:
 
-* --model_name_or_path - specifies HAP model which should be compatable with HuggingFace's `AutoModelForSequenceClassification` 
-* --batch_size - modify it based on the infrastructure capacity.
-* --max_length - the maximum length for the tokenizer.
-
-
+* --model_name_or_path - specifies the HAP model, which should be compatible with HuggingFace's `AutoModelForSequenceClassification`. Defaults to IBM's open-source toxicity classifier `ibm-granite/granite-guardian-hap-38m`.
+* --batch_size - modify it based on the infrastructure capacity. Defaults to `128`.
+* --max_length - the maximum length for the tokenizer. Defaults to `512`.
+* --doc_text_column - the column name containing the document text in the input .parquet file. Defaults to `contents`.
+* --annotation_column - the column name containing the hap (toxicity) score in the output .parquet file. Defaults to `hap_score`.
+ 
 
 ## input format
 The input is in .parquet format and contains the following columns:
 
-| doc_id | doc_text |
-|:------|:------|
+| doc_id | contents |
+|:------:|:------:|
 | 1 | GSC is very much a little Swiss Army knife for... |
 | 2 | Here are only a few examples. And no, I'm not ... |
 
 
 ## output format
 The output is in .parquet format and includes an additional column, in addition to those in the input:
 
-| doc_id | doc_text | hap_score |
-|:------|:------|:-------------|
+| doc_id | contents | hap_score |
+|:------:|:------:|:-------------:|
 | 1 | GSC is very much a little Swiss Army knife for... | 0.002463 |
 | 2 | Here are only a few examples. And no, I'm not ... | 0.989713 |
 
@@ -47,6 +48,17 @@ python hap_local_python.py
 You will obtain the output file `test1.parquet` in the output directory. 
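+
+The same options can also be supplied programmatically as a plain dictionary. The snippet below is a minimal sketch that mirrors the configuration used by `src/hap_local.py`; the values shown are the documented defaults, so adjust them to your own environment:
+
+```python
+hap_params = {
+    "model_name_or_path": "ibm-granite/granite-guardian-hap-38m",  # any AutoModelForSequenceClassification-compatible HAP model
+    "annotation_column": "hap_score",    # output column that will hold the toxicity score
+    "doc_text_column": "contents",       # input column that holds the document text
+    "inference_engine": "CPU",           # as used by src/hap_local.py
+    "max_length": 512,
+    "batch_size": 128,
+}
+```
+
+See `src/hap_local.py` and `src/hap_local_python.py` for how this dictionary is used to run the transform.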
+## Throughput +The table below shows the throughput (tokens per second) of the HAP transform module, which primarily includes sentence splitting, HAP annotation, and HAP score aggregation. We herein compare two models: + +* 4-layer lightweight toxicity classifier [ibm-granite/granite-guardian-hap-38m](https://huggingface.co/ibm-granite/granite-guardian-hap-38m) +* 12-layer toxicity classifier [ibm-granite/granite-guardian-hap-125m](https://huggingface.co/ibm-granite/granite-guardian-hap-125m) + +We report the average throughput on CPU over three runs. +| Model used in HAP transform module | throughput (tokens per second) | +|:------:|:------:| +| granite-guardian-hap-38m | 6.16 k | +| granite-guardian-hap-125m | 1.14 k | From 2971c730b0a580a7731ec70f7e2bb8e75a299cb0 Mon Sep 17 00:00:00 2001 From: ian-cho <42691703+ian-cho@users.noreply.github.com> Date: Thu, 3 Oct 2024 21:27:54 +0900 Subject: [PATCH 3/7] Update README.md --- transforms/universal/hap/python/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transforms/universal/hap/python/README.md b/transforms/universal/hap/python/README.md index 347fa86ae..29d54d999 100644 --- a/transforms/universal/hap/python/README.md +++ b/transforms/universal/hap/python/README.md @@ -54,7 +54,8 @@ The table below shows the throughput (tokens per second) of the HAP transform mo * 4-layer lightweight toxicity classifier [ibm-granite/granite-guardian-hap-38m](https://huggingface.co/ibm-granite/granite-guardian-hap-38m) * 12-layer toxicity classifier [ibm-granite/granite-guardian-hap-125m](https://huggingface.co/ibm-granite/granite-guardian-hap-125m) -We report the average throughput on CPU over three runs. +We processed 6,000 documents (12 MB in Parquet file size) using the HAP transform module and reported the average CPU throughput over three trials. 
+ | Model used in HAP transform module | throughput (tokens per second) | |:------:|:------:| | granite-guardian-hap-38m | 6.16 k | From d8b9be8531368657d2fb09973b8df486ed93c620 Mon Sep 17 00:00:00 2001 From: ian-cho <42691703+ian-cho@users.noreply.github.com> Date: Thu, 3 Oct 2024 22:16:51 +0900 Subject: [PATCH 4/7] changed doc_text into contents in related files The column name `doc_text` is changed into `contents` in all relevant scripts and parquet files --- .../universal/hap/python/output/metadata.json | 13 +++++++------ .../universal/hap/python/output/test1.parquet | Bin 79822 -> 79822 bytes .../universal/hap/python/src/hap_local.py | 2 +- .../hap/python/src/hap_local_python.py | 2 +- .../universal/hap/python/src/hap_transform.py | 12 ++++++------ .../python/test-data/expected/metadata.json | 13 +++++++------ .../python/test-data/expected/test1.parquet | Bin 79822 -> 79822 bytes .../hap/python/test-data/input/test1.parquet | Bin 109303 -> 109303 bytes .../universal/hap/python/test/test_hap.py | 2 +- 9 files changed, 23 insertions(+), 21 deletions(-) diff --git a/transforms/universal/hap/python/output/metadata.json b/transforms/universal/hap/python/output/metadata.json index 6627fabb9..062fee162 100644 --- a/transforms/universal/hap/python/output/metadata.json +++ b/transforms/universal/hap/python/output/metadata.json @@ -5,8 +5,8 @@ "job name": "hap", "job type": "pure python", "job id": "job_id", - "start_time": "2024-09-25 00:47:58", - "end_time": "2024-09-25 00:48:06", + "start_time": "2024-10-03 21:38:20", + "end_time": "2024-10-03 21:38:29", "status": "success" }, "code": { @@ -17,7 +17,7 @@ "job_input_params": { "model_name_or_path": "ibm-granite/granite-guardian-hap-38m", "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, @@ -30,11 +30,12 @@ "num_processors": 0 }, "job_output_stats": { - "source_files": 1, - "source_size": 109303, + "source_files": 2, + "source_size": 12124594, + "transform execution exception": 1, "result_files": 1, "result_size": 79822, - "processing_time": 6.543, + "processing_time": 6.932, "source_doc_count": 50, "result_doc_count": 50 }, diff --git a/transforms/universal/hap/python/output/test1.parquet b/transforms/universal/hap/python/output/test1.parquet index 8ac5be443d311740b8b74296fb4a02e15eb50ebc..c9483e34d47dd71af90b1a6694c55fb01ea95453 100644 GIT binary patch delta 171 zcmX^2p5@$omWC~i#}qk|^Ycnl^Gb@hGiotLux?jVXS8F23r^3`WbB3WKuTQLm0>!R zbQF}*GE;L>;`0)7QnlyMCslEU)oisp=K0maogvj6}9 delta 155 zcmX^2p5@$omWC~i#}qkI@{{9BQY%WfGiotLux?jVXS8F23r^3`WbB3WKuTQL;oRx> z3>d{&L3)*@XBsd%OqVxeWZk~jkkOKvJ3Gn9FD=DA%p}Em`bQH+37(*`NJC45sE7;$ a&&=rujTu9?XPGjtVMJ2sI9<`4aV-EPE;6|Q diff --git a/transforms/universal/hap/python/src/hap_local.py b/transforms/universal/hap/python/src/hap_local.py index 89140fd74..220eea19b 100644 --- a/transforms/universal/hap/python/src/hap_local.py +++ b/transforms/universal/hap/python/src/hap_local.py @@ -24,7 +24,7 @@ hap_params = { "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m', "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, diff --git a/transforms/universal/hap/python/src/hap_local_python.py b/transforms/universal/hap/python/src/hap_local_python.py index 9a268803e..8e79dc583 100644 --- a/transforms/universal/hap/python/src/hap_local_python.py +++ 
b/transforms/universal/hap/python/src/hap_local_python.py @@ -37,7 +37,7 @@ hap_params = { "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m', "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, diff --git a/transforms/universal/hap/python/src/hap_transform.py b/transforms/universal/hap/python/src/hap_transform.py index 71bad2acb..e6a48cf86 100644 --- a/transforms/universal/hap/python/src/hap_transform.py +++ b/transforms/universal/hap/python/src/hap_transform.py @@ -27,11 +27,11 @@ class HAPTransform(AbstractTableTransform): def __init__(self, config: dict[str, Any]): super().__init__(config) - self.model_name_or_path = config.get("model_name_or_path") - self.annotation_column = config.get("annotation_column") - self.doc_text_column = config.get("doc_text_column") - self.max_length = config.get("max_length") - self.batch_size = config.get("batch_size") + self.model_name_or_path = config.get("model_name_or_path", "ibm-granite/granite-guardian-hap-38m") + self.annotation_column = config.get("annotation_column", "hap_score") + self.doc_text_column = config.get("doc_text_column", "contents") + self.max_length = config.get("max_length", 512) + self.batch_size = config.get("batch_size", 128) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path) self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path) @@ -70,7 +70,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab :param table: Pyarrow table :return: a table with an additional hap_score column """ - # make sure that the table contains "doc_text" column + # make sure that the table contains "contents" column TransformUtils.validate_columns(table=table, required=[self.doc_text_column]) self.df = table.to_pandas() df_doc_list = [] diff --git a/transforms/universal/hap/python/test-data/expected/metadata.json b/transforms/universal/hap/python/test-data/expected/metadata.json index 1e5f710db..062fee162 100644 --- a/transforms/universal/hap/python/test-data/expected/metadata.json +++ b/transforms/universal/hap/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "hap", "job type": "pure python", "job id": "job_id", - "start_time": "2024-09-26 20:56:49", - "end_time": "2024-09-26 20:56:56", + "start_time": "2024-10-03 21:38:20", + "end_time": "2024-10-03 21:38:29", "status": "success" }, "code": { @@ -17,7 +17,7 @@ "job_input_params": { "model_name_or_path": "ibm-granite/granite-guardian-hap-38m", "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, @@ -30,11 +30,12 @@ "num_processors": 0 }, "job_output_stats": { - "source_files": 1, - "source_size": 109303, + "source_files": 2, + "source_size": 12124594, + "transform execution exception": 1, "result_files": 1, "result_size": 79822, - "processing_time": 6.501, + "processing_time": 6.932, "source_doc_count": 50, "result_doc_count": 50 }, diff --git a/transforms/universal/hap/python/test-data/expected/test1.parquet b/transforms/universal/hap/python/test-data/expected/test1.parquet index 8ac5be443d311740b8b74296fb4a02e15eb50ebc..c9483e34d47dd71af90b1a6694c55fb01ea95453 100644 GIT binary patch delta 171 zcmX^2p5@$omWC~i#}qk|^Ycnl^Gb@hGiotLux?jVXS8F23r^3`WbB3WKuTQLm0>!R zbQF}*GE;L>;`0)7QnlyMCslEU)oisp=K0maogvj6}9 delta 155 
zcmX^2p5@$omWC~i#}qkI@{{9BQY%WfGiotLux?jVXS8F23r^3`WbB3WKuTQL;oRx> z3>d{&L3)*@XBsd%OqVxeWZk~jkkOKvJ3Gn9FD=DA%p}Em`bQH+37(*`NJC45sE7;$ a&&=rujTu9?XPGjtVMJ2sI9<`4aV-EPE;6|Q diff --git a/transforms/universal/hap/python/test-data/input/test1.parquet b/transforms/universal/hap/python/test-data/input/test1.parquet index 8a3468009e1f012d50dc9f0d9bf437926d048867..5e2f5fe9d5547448a8d2ff3ec3b5b5c51e575455 100644 GIT binary patch delta 177 zcmex-(j%98Sea)7vMyY@;(2}bTnBg@hh_ps6w58vs2s~A@T0IY^W A?f?J) delta 177 zcmexROj4XZi#?saaudVD zilf{tOC!S#b3C*1(>-&8$|4Od4Wc4447O{pWRzg!j&irmiZBk!N%aVv?zf6@B>-=i BLB0S0 diff --git a/transforms/universal/hap/python/test/test_hap.py b/transforms/universal/hap/python/test/test_hap.py index 3f2a25e53..82ac5dc06 100644 --- a/transforms/universal/hap/python/test/test_hap.py +++ b/transforms/universal/hap/python/test/test_hap.py @@ -19,7 +19,7 @@ hap_params = { "model_name_or_path": 'ibm-granite/granite-guardian-hap-38m', "annotation_column": "hap_score", - "doc_text_column": "doc_text", + "doc_text_column": "contents", "inference_engine": "CPU", "max_length": 512, "batch_size": 128, From 9ad002ad738e6cc68bb83f0fc97623c7cee12c4d Mon Sep 17 00:00:00 2001 From: Shahrokh Daijavad Date: Thu, 3 Oct 2024 11:16:26 -0700 Subject: [PATCH 5/7] Update README.md Added HAP to the table in README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ade3bed68..aeec4ef70 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,7 @@ The matrix below shows the the combination of modules and supported runtimes. Al | [Filter on annotations](transforms/universal/filter/python/README.md) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | [Profiler](transforms/universal/profiler/ray/README.md) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | [Resize](transforms/universal/resize/python/README.md) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| [HAP](transforms/universal/hap/python/README.md) | :white_check_mark: | | | | | [Tokenizer](transforms/universal/tokenization/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | | **Language-only** | | | | | | [Language identification](transforms/language/lang_id/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | From 0a9fe37ac4fe230f4886dd8133bd17bcbfd35922 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Thu, 3 Oct 2024 22:42:24 +0300 Subject: [PATCH 6/7] Change the calculation of the desired ray actors (#654) * Fix the calculation of the desired ray actors. Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky * Fix hap workflow Makefile commands. Signed-off-by: Revital Sur * More change. Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky * Additional fix. Signed-off-by: Revital Sur * additional change. Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky * Cherry pick Boris's 05b97feb7 commit to removed None not supported by kfpV2. Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky * Minor fix in ededup_transform_base.py Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky * Disable pii Makefile. 
Signed-off-by: Revital Sur --------- Signed-off-by: Revital Sur Co-authored-by: Boris Lublinsky --- .../ray/src/data_processing_ray/runtime/ray/ray_utils.py | 2 +- .../src/runtime_utils/kfp_utils.py | 2 +- .../language/pii_redactor/{Makefile => Makefile.disable} | 0 transforms/universal/ededup/kfp_ray/ededup_wf.py | 2 +- .../ededup/kfp_ray/src/ededup_compute_execution_params.py | 8 ++++++-- .../universal/ededup/python/src/ededup_transform_base.py | 2 +- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- .../fdedup/kfp_ray/src/fdedup_compute_execution_params.py | 4 ++-- transforms/universal/hap/Makefile | 4 ---- .../kfp_ray/src/profiler_compute_execution_params.py | 6 +++--- 10 files changed, 16 insertions(+), 16 deletions(-) rename transforms/language/pii_redactor/{Makefile => Makefile.disable} (100%) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py index c7362ef5e..5225508fb 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py @@ -111,7 +111,7 @@ def operator() -> ActorHandle: cls_name = clazz.__class__.__name__.replace('ActorClass(', '').replace(')','') actors = [operator() for _ in range(n_actors)] - for i in range(60): + for i in range(120): time.sleep(1) alive = list_actors(filters=[("class_name", "=", cls_name), ("state", "=", "ALIVE")]) if len(actors) == len(alive): diff --git a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py index 73b6a5cd4..7fa76453f 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py +++ b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py @@ -138,7 +138,7 @@ def default_compute_execution_params( cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") # compute number of actors - n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) + n_actors_cpu = int((cluster_cpu - 1) * 0.7 / a_options.get("num_cpus", 0.5)) n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) n_actors = min(n_actors_cpu, n_actors_memory) # Check if we need gpu calculations as well diff --git a/transforms/language/pii_redactor/Makefile b/transforms/language/pii_redactor/Makefile.disable similarity index 100% rename from transforms/language/pii_redactor/Makefile rename to transforms/language/pii_redactor/Makefile.disable diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index 645902d0e..ff4b4db57 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -89,7 +89,7 @@ def ededup( ededup_hash_cpu: float = 0.5, ededup_doc_column: str = "contents", ededup_use_snapshot: bool = False, - ededup_snapshot_directory: str = None, + ededup_snapshot_directory: str = "", # data sampling ededup_n_samples: int = 10, # additional parameters diff --git a/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py index a20a2e030..6f8197877 100644 --- a/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py +++ 
b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py @@ -10,6 +10,7 @@ # limitations under the License. ################################################################################ + def ededup_compute_execution_params( worker_options: dict, # ray worker configuration actor_options: dict, # actor's resource requirements @@ -94,9 +95,9 @@ def ededup_compute_execution_params( ) sys.exit(1) # Define number of workers - n_workers = int((0.85 * cluster_cpu - required_hash_cpu) / actor_cpu) + n_workers = int((0.85 * (cluster_cpu - 1) - required_hash_cpu) / actor_cpu) print(f"Number of workers - {n_workers}") - if n_workers < 2: + if n_workers <= 0: print(f"Cluster is too small - estimated number of workers {n_workers}") sys.exit(1) # Limit amount of workers and processors to prevent S3 saturation @@ -110,6 +111,9 @@ def ededup_compute_execution_params( print(f"Try to increase the size of the cluster or increase size of the cpu per worker") sys.exit(1) print(f"Projected execution time {EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60} min") + # process None able parameters + if ededup_snapshot_directory is None or len(ededup_snapshot_directory) <= 1: + ededup_snapshot_directory = None return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, diff --git a/transforms/universal/ededup/python/src/ededup_transform_base.py b/transforms/universal/ededup/python/src/ededup_transform_base.py index f1321db79..4437148ac 100644 --- a/transforms/universal/ededup/python/src/ededup_transform_base.py +++ b/transforms/universal/ededup/python/src/ededup_transform_base.py @@ -40,7 +40,7 @@ doc_column_name_cli_param = f"{cli_prefix}{doc_column_name_key}" int_column_name_cli_param = f"{cli_prefix}{int_column_name_key}" use_snapshot_cli_param = f"{cli_prefix}{use_snapshot_key}" -snapshot_directory_cli_param = f"--{cli_prefix}{snapshot_directory_key}" +snapshot_directory_cli_param = f"{cli_prefix}{snapshot_directory_key}" class HashFilter: """ diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index bb2cc3194..3156ab6f1 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -82,7 +82,7 @@ def fdedup( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {"num_cpus": 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.7}, runtime_pipeline_id: str = "pipeline_id", runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, # columns used diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index ebcecadb9..726200339 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -140,7 +140,7 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: cluster_cpu = worker_options["replicas"] * worker_options["cpu"] cluster_memory = worker_options["replicas"] * worker_options["memory"] print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu *= 0.85 + cluster_cpu -= 1 cluster_memory *= 0.85 # get actor requirements actor_cpu = actor_options["num_cpus"] @@ -172,7 +172,7 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: n_preprocessors = int( (0.85 * cluster_cpu - b_actors * 
bucket_cpu - m_actors * mhash_cpu - d_actors * doc_cpu) / actor_cpu ) - if n_preprocessors < 0: + if n_preprocessors <= 0: print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") print(f"Required bucket actors {b_actors}, minhash actors {m_actors}, document actors {d_actors}") print("Try to increase the size of the cluster") diff --git a/transforms/universal/hap/Makefile b/transforms/universal/hap/Makefile index 05d3c3111..017eb23b4 100644 --- a/transforms/universal/hap/Makefile +++ b/transforms/universal/hap/Makefile @@ -55,16 +55,12 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build diff --git a/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py b/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py index 666734eda..a5483eec7 100644 --- a/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py +++ b/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py @@ -55,7 +55,7 @@ def profiler_compute_execution_params( cluster_cpu = w_options["replicas"] * w_options["cpu"] cluster_memory = w_options["replicas"] * w_options["memory"] print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu *= 0.85 + cluster_cpu -= 1 cluster_memory *= 0.85 # get actor requirements a_options = actor_options @@ -82,7 +82,7 @@ def profiler_compute_execution_params( n_aggregators = math.ceil(number_of_docs * 32 / GB) print(f"Estimated Required hashes {n_aggregators}") print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - required_aggregator_cpu = n_aggregators * aggregator_cpu + required_aggregator_cpu = math.ceil(n_aggregators * aggregator_cpu) required_hash_mem = n_aggregators * 2 if required_aggregator_cpu > cluster_cpu or required_hash_mem > cluster_memory: print( @@ -93,7 +93,7 @@ def profiler_compute_execution_params( # Define number of workers n_workers = int((0.85 * cluster_cpu - required_aggregator_cpu) / actor_cpu) print(f"Number of workers - {n_workers}") - if n_workers < 2: + if n_workers <= 0: print(f"Cluster is too small - estimated number of workers {n_workers}") sys.exit(1) # Limit amount of workers and processors to prevent S3 saturation From a5f86dad587d39cf9db008c81a876c9fa1440c68 Mon Sep 17 00:00:00 2001 From: David Wood Date: Thu, 3 Oct 2024 19:27:30 -0400 Subject: [PATCH 7/7] Various fixes to workflows, especially kfp (#664) * disable test workflow when none code files change Signed-off-by: David Wood * one more path-ignore in test.yml Signed-off-by: David Wood * one more fix for path-ignore in test.yml Signed-off-by: David Wood * test universal transform separately Signed-off-by: David Wood * rename test universal workflow Signed-off-by: David Wood * add comments to noop src to trigger new universal test workflow Signed-off-by: David Wood * fix paths in test universal workflow Signed-off-by: David Wood * addj back ignore paths in test universal workflow Signed-off-by: David Wood * another noop comment Signed-off-by: David Wood * move ignored paths to paths in univesal test workflow Signed-off-by: David Wood * test-universal workflow name changes Signed-off-by: David Wood * noop comments Signed-off-by: David Wood * noop readme change' 
Signed-off-by: David Wood * change test universal not paths Signed-off-by: David Wood * disable all but new noop and doc_id test workflows Signed-off-by: David Wood * code change in noop Signed-off-by: David Wood * remake test transforms Signed-off-by: David Wood * add individual test transform workflows Signed-off-by: David Wood * noop README change Signed-off-by: David Wood * better ignore of .md on test transform workflows Signed-off-by: David Wood * noop readme change Signed-off-by: David Wood * noop test transform worklow 1 ignore Signed-off-by: David Wood * noop readme Signed-off-by: David Wood * split out the tests into test-kfp/lib/misc and remove test.yml, add readme Signed-off-by: David Wood * test-kfp only on kfp/** Signed-off-by: David Wood * noop code change to trigger build Signed-off-by: David Wood * comments in workflows Signed-off-by: David Wood * updated workflow readme Signed-off-by: David Wood * only run build-library workflow on data-processing-lib changes Signed-off-by: David Wood * try and ignore docs in build-library, test-kfp/lib Signed-off-by: David Wood * workflow title changes for consistency Signed-off-by: David Wood * test change on filter source Signed-off-by: David Wood * change to lib readme Signed-off-by: David Wood * change to lib source Signed-off-by: David Wood * minor job name changes in transform workflows Signed-off-by: David Wood * noop readme Signed-off-by: David Wood * test-lib workflow ignores Signed-off-by: David Wood * top level readme Signed-off-by: David Wood * noop test source Signed-off-by: David Wood * filter source change' Signed-off-by: David Wood * updated all transform tets workflows Signed-off-by: David Wood * fix typo in test template on check_images Signed-off-by: David Wood * noop src change Signed-off-by: David Wood * check for makefile in test transform workflow Signed-off-by: David Wood * automatically determine transforms in transforms directory for which to generate test workflows Signed-off-by: David Wood * worklow readme, transform existence verification, disable build-library, and tools tests Signed-off-by: David Wood * workflow readme details on kfp and misc tests Signed-off-by: David Wood * backing out change to dpk lib code Signed-off-by: David Wood * restore filter code Signed-off-by: David Wood * restore noop code Signed-off-by: David Wood * workflow readme Signed-off-by: David Wood * really restore noop code Signed-off-by: David Wood * check for makefile in transform test-src testing Signed-off-by: David Wood * don't include lib test dependencies in transform test workflows Signed-off-by: David Wood * noop code change Signed-off-by: David Wood * disable noop, don't include lib test-data in transform dependencies Signed-off-by: David Wood * use job.id.if on Makefile to enable transform test job Signed-off-by: David Wood * use job.id.if on Makefile to enable transform test job Signed-off-by: David Wood * restore noop Makefile Signed-off-by: David Wood * exclude kfp_ray from transfor test workflow and change noop code Signed-off-by: David Wood * remove if: from test workflows Signed-off-by: David Wood * backout noop code change Signed-off-by: David Wood * backout noop code change Signed-off-by: David Wood * only build spark image for transform image tests that need it Signed-off-by: David Wood * header_cleanser code change Signed-off-by: David Wood * only build spark image for transform image tests that need it Signed-off-by: David Wood * update workflows to trigger on .make.* changes Signed-off-by: David Wood * blank 
line added to .make.versions to test new workflows Signed-off-by: David Wood * remove change to .make.versions Signed-off-by: David Wood * remove test-universal-html2parquet.yml since it is now moved to language Signed-off-by: David Wood * mv kfp blacklist definition to check-workflows.sh script and have it check for kfp workflows Signed-off-by: David Wood * license_select_wf.py comment to trigger kfp tests here Signed-off-by: David Wood * have transform top level makefile check for kfp_ray directory before recursing into Signed-off-by: David Wood * fix doc_quality Makefile kfp rules Signed-off-by: David Wood * fix tabbing in recent Makefile updates Signed-off-by: David Wood * fix kfp workflows to only build the target transform Signed-off-by: David Wood * switch workflows from ubuntu-22.04 to ubuntu-latest to try and avoid excessive job queuing Signed-off-by: David Wood * remove non-change from license_select_wf.py to trigger new ci/cd Signed-off-by: David Wood * one more fix to kfp transform workflows to not workflow-build in all transforms Signed-off-by: David Wood * fix repo_level_ordering/ray/Makefile to support kind/kfp testing targets Signed-off-by: David Wood * change kfp test workflows to not test if transform's Makefile or kfp_ray dir is not present Signed-off-by: David Wood * set cancel-in-progress=true in workflows Signed-off-by: David Wood --------- Signed-off-by: David Wood --- .github/workflows/Makefile | 15 +- .github/workflows/deploy-docs.yml | 2 +- .github/workflows/deploy-library.yml | 6 +- .github/workflows/deploy-transforms.yml | 4 +- .../workflows/test-code-code2parquet-kfp.yml | 106 ++++++++------ .github/workflows/test-code-code2parquet.yml | 15 +- .../workflows/test-code-code_quality-kfp.yml | 106 ++++++++------ .github/workflows/test-code-code_quality.yml | 15 +- .../test-code-header_cleanser-kfp.yml | 106 ++++++++------ .../workflows/test-code-header_cleanser.yml | 15 +- .../test-code-license_select-kfp.yml | 130 ++++++++++++++++++ .../workflows/test-code-license_select.yml | 15 +- .github/workflows/test-code-malware-kfp.yml | 106 ++++++++------ .github/workflows/test-code-malware.yml | 15 +- .../test-code-proglang_select-kfp.yml | 106 ++++++++------ .../workflows/test-code-proglang_select.yml | 15 +- .../test-code-repo_level_ordering-kfp.yml | 106 ++++++++------ .../test-code-repo_level_ordering.yml | 15 +- .github/workflows/test-kfp-transform.template | 106 ++++++++------ .github/workflows/test-kfp.yml | 13 +- .github/workflows/test-language-doc_chunk.yml | 15 +- .../test-language-doc_quality-kfp.yml | 106 ++++++++------ .../workflows/test-language-doc_quality.yml | 15 +- .../workflows/test-language-html2parquet.yml | 15 +- .../workflows/test-language-lang_id-kfp.yml | 106 ++++++++------ .github/workflows/test-language-lang_id.yml | 15 +- .../workflows/test-language-pdf2parquet.yml | 15 +- .../workflows/test-language-pii_redactor.yml | 15 +- .../test-language-text_encoder-kfp.yml | 106 ++++++++------ .../workflows/test-language-text_encoder.yml | 15 +- .github/workflows/test-lib.yml | 15 +- .github/workflows/test-misc.yml | 4 +- .github/workflows/test-packaging-python.yml | 2 +- .github/workflows/test-packaging-ray.yml | 2 +- .github/workflows/test-transform.template | 15 +- .../workflows/test-universal-doc_id-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-doc_id.yml | 15 +- .../workflows/test-universal-ededup-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-ededup.yml | 15 +- .../workflows/test-universal-fdedup-kfp.yml | 106 
++++++++------ .github/workflows/test-universal-fdedup.yml | 15 +- .../workflows/test-universal-filter-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-filter.yml | 15 +- .github/workflows/test-universal-hap.yml | 15 +- .github/workflows/test-universal-noop-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-noop.yml | 15 +- .../workflows/test-universal-profiler-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-profiler.yml | 15 +- .../workflows/test-universal-resize-kfp.yml | 106 ++++++++------ .github/workflows/test-universal-resize.yml | 15 +- .../test-universal-tokenization-kfp.yml | 106 ++++++++------ .../workflows/test-universal-tokenization.yml | 15 +- .github/workflows/workflow-manual-run.yml | 2 +- scripts/check-workflows.sh | 50 +++++-- transforms/code/code2parquet/Makefile | 19 ++- transforms/code/code_quality/Makefile | 19 ++- transforms/code/header_cleanser/Makefile | 19 ++- transforms/code/license_select/Makefile | 19 ++- transforms/code/malware/Makefile | 19 ++- transforms/code/proglang_select/Makefile | 19 ++- transforms/code/repo_level_ordering/Makefile | 16 ++- .../code/repo_level_ordering/ray/Makefile | 7 + transforms/language/doc_chunk/Makefile | 19 ++- transforms/language/doc_quality/Makefile | 19 ++- transforms/language/html2parquet/Makefile | 15 +- transforms/language/lang_id/Makefile | 19 ++- transforms/language/pdf2parquet/Makefile | 19 ++- .../language/pii_redactor/Makefile.disable | 19 ++- transforms/language/text_encoder/Makefile | 19 ++- transforms/universal/doc_id/Makefile | 19 ++- transforms/universal/ededup/Makefile | 19 ++- transforms/universal/fdedup/Makefile | 19 ++- transforms/universal/filter/Makefile | 19 ++- transforms/universal/hap/Makefile | 12 ++ transforms/universal/noop/Makefile | 19 ++- transforms/universal/profiler/Makefile | 19 ++- transforms/universal/resize/Makefile | 19 ++- transforms/universal/tokenization/Makefile | 19 ++- 78 files changed, 1924 insertions(+), 1019 deletions(-) create mode 100644 .github/workflows/test-code-license_select-kfp.yml diff --git a/.github/workflows/Makefile b/.github/workflows/Makefile index 275fd4688..751b3201f 100644 --- a/.github/workflows/Makefile +++ b/.github/workflows/Makefile @@ -8,7 +8,6 @@ LANG_TRANSFORMS=doc_chunk doc_quality lang_id pdf2parquet pii_redactor text_enco # A list that holds transforms that should not be tested with KFP -KFP_BLACK_LIST="doc_chunk,pdf2parquet,pii_redactor" transform-tests: $(MAKE) TRANSFORM_SUBDIR=universal .transform-tests @@ -29,13 +28,19 @@ transform-tests: done .transform-kfp-tests: - @for i in $$(find ../../transforms/$(TRANSFORM_SUBDIR) -mindepth 1 -maxdepth 1 -type d); do \ + @KFP_BLACK_LIST=$$(cd ../..; bash scripts/check-workflows.sh -show-kfp-black-list); \ + for i in $$(find ../../transforms/$(TRANSFORM_SUBDIR) -mindepth 1 -maxdepth 1 -type d); do \ dir=$$(basename $$i); \ - z=$$(echo ${KFP_BLACK_LIST} | grep -v $$dir); \ - if [ ! -d ../../transforms/$(TRANSFORM_SUBDIR)/$$dir/kfp_ray ] || [ -z "$$z" ]; then \ + yml=test-$(TRANSFORM_SUBDIR)-$$dir-kfp.yml; \ + if [ ! -d ../../transforms/$(TRANSFORM_SUBDIR)/$$dir/kfp_ray ]; then \ + echo No kfp_ray directory for $$dir. Skipping generation of $$yml; \ + continue; \ + fi; \ + z=$$(echo $${KFP_BLACK_LIST} | grep $$dir); \ + if [ ! -z "$$z" ]; then \ + echo $$dir is black listed. 
Skipping generation of $$yml; \ continue; \ fi; \ - yml=test-$(TRANSFORM_SUBDIR)-$$dir-kfp.yml; \ echo Generating $$yml; \ cat test-kfp-transform.template | sed -e "s?@TARGET_TRANSFORM_DIR@?transforms/$${TRANSFORM_SUBDIR}/$$dir?g" > $$yml; \ done diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index a2909c55d..09678e937 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -8,7 +8,7 @@ on: - "releases/**" jobs: deploy: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest env: REPO_URL: "https://github.com/${{ github.repository }}" REPO_BRANCH: "dev" diff --git a/.github/workflows/deploy-library.yml b/.github/workflows/deploy-library.yml index 8ec97ed9e..0c2473175 100644 --- a/.github/workflows/deploy-library.yml +++ b/.github/workflows/deploy-library.yml @@ -14,7 +14,7 @@ permissions: jobs: build-package: name: Build Ray data processing libraries - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -30,7 +30,7 @@ jobs: name: Publish packages to test.pypi.org # disabled if: false - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: build-package steps: @@ -47,7 +47,7 @@ jobs: publish-pypi: name: Publish release to pypi.org - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: build-package # disabled as of now if: false diff --git a/.github/workflows/deploy-transforms.yml b/.github/workflows/deploy-transforms.yml index 7fe5c8b4d..0f002187d 100644 --- a/.github/workflows/deploy-transforms.yml +++ b/.github/workflows/deploy-transforms.yml @@ -9,7 +9,7 @@ on: jobs: build-images: name: Build and check images - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -23,7 +23,7 @@ jobs: name: Publish packages to quay.io # disabled if: false - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: build-images steps: diff --git a/.github/workflows/test-code-code2parquet-kfp.yml b/.github/workflows/test-code-code2parquet-kfp.yml index 41f58e0cb..6de24d4b0 100644 --- a/.github/workflows/test-code-code2parquet-kfp.yml +++ b/.github/workflows/test-code-code2parquet-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/code2parquet/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/code2parquet/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh 
https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/code2parquet workflow-test - echo "Run transforms/code/code2parquet completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/code2parquet workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/code2parquet workflow-test + echo "Run transforms/code/code2parquet completed" + else + echo "Skipping transforms/code/code2parquet kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/code2parquet workflow-test - header_text "Run transforms/code/code2parquet completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 
/tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/code2parquet workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/code2parquet workflow-test + echo "Run transforms/code/code2parquet completed" + else + echo "Skipping transforms/code/code2parquet kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-code2parquet.yml b/.github/workflows/test-code-code2parquet.yml index f8f1654e7..3f83e9856 100644 --- a/.github/workflows/test-code-code2parquet.yml +++ b/.github/workflows/test-code-code2parquet.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/code2parquet/**" - "data-processing-lib/**" - "!transforms/code/code2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/code2parquet/**" - "data-processing-lib/**" - "!transforms/code/code2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-code_quality-kfp.yml b/.github/workflows/test-code-code_quality-kfp.yml index 21fa63296..2e22c04a9 100644 --- a/.github/workflows/test-code-code_quality-kfp.yml +++ b/.github/workflows/test-code-code_quality-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/code_quality/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/code_quality/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/code_quality workflow-test - echo "Run transforms/code/code_quality completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs 
-o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/code_quality workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/code_quality workflow-test + echo "Run transforms/code/code_quality completed" + else + echo "Skipping transforms/code/code_quality kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/code_quality workflow-test - header_text "Run transforms/code/code_quality completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/code_quality workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/code_quality workflow-test + echo "Run transforms/code/code_quality completed" + else + echo "Skipping transforms/code/code_quality kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-code_quality.yml b/.github/workflows/test-code-code_quality.yml index d53c81c61..5a901edbb 100644 --- a/.github/workflows/test-code-code_quality.yml +++ b/.github/workflows/test-code-code_quality.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/code_quality/**" - "data-processing-lib/**" - "!transforms/code/code_quality/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/code_quality/**" - "data-processing-lib/**" - "!transforms/code/code_quality/**/kfp_ray/**" # 
This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-header_cleanser-kfp.yml b/.github/workflows/test-code-header_cleanser-kfp.yml index 25f54b528..6cc4727aa 100644 --- a/.github/workflows/test-code-header_cleanser-kfp.yml +++ b/.github/workflows/test-code-header_cleanser-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/header_cleanser/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/header_cleanser/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/header_cleanser workflow-test - echo "Run transforms/code/header_cleanser completed" + if [ -e 
"@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/header_cleanser workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/header_cleanser workflow-test + echo "Run transforms/code/header_cleanser completed" + else + echo "Skipping transforms/code/header_cleanser kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/header_cleanser workflow-test - header_text "Run transforms/code/header_cleanser completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/header_cleanser workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/header_cleanser workflow-test + echo "Run transforms/code/header_cleanser completed" + else + echo "Skipping 
transforms/code/header_cleanser kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-header_cleanser.yml b/.github/workflows/test-code-header_cleanser.yml index 1834f4983..05f09a8c5 100644 --- a/.github/workflows/test-code-header_cleanser.yml +++ b/.github/workflows/test-code-header_cleanser.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/header_cleanser/**" - "data-processing-lib/**" - "!transforms/code/header_cleanser/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/header_cleanser/**" - "data-processing-lib/**" - "!transforms/code/header_cleanser/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-license_select-kfp.yml b/.github/workflows/test-code-license_select-kfp.yml new file mode 100644 index 000000000..94d662d1d --- /dev/null +++ b/.github/workflows/test-code-license_select-kfp.yml @@ -0,0 +1,130 @@ +# +# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files +# +name: Test KFP - transforms/code/license_select + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - ".make.*" + - "transforms/.make.workflow" + - "transforms/code/license_select/**" + - "!kfp/**" # This is tested in separate workflow + - "!data-processing-lib/**" # This is tested in separate workflow + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - ".make.*" + - "transforms/.make.workflow" + - "transforms/code/license_select/**" + - "!data-processing-lib/**" # This is tested in separate workflow + - "!kfp/**" # This is tested in separate workflow + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + test-kfp-v1: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: 
actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test KFP libs (shared and v1) and run a workflow + timeout-minutes: 120 + run: | + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/license_select workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/license_select workflow-test + echo "Run transforms/code/license_select completed" + else + echo "Skipping transforms/code/license_select kfp test for lack of Makefile and/or kfp_ray" + fi + + test-kfp-v2: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test KFP libs (shared and v2) and run a workflow + timeout-minutes: 120 + run: | + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/license_select workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/license_select workflow-test + echo "Run transforms/code/license_select completed" + else + echo "Skipping 
transforms/code/license_select kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-license_select.yml b/.github/workflows/test-code-license_select.yml index ab24b582b..59592c82f 100644 --- a/.github/workflows/test-code-license_select.yml +++ b/.github/workflows/test-code-license_select.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/license_select/**" - "data-processing-lib/**" - "!transforms/code/license_select/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/license_select/**" - "data-processing-lib/**" - "!transforms/code/license_select/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-malware-kfp.yml b/.github/workflows/test-code-malware-kfp.yml index 9bd937f46..2c9e3186c 100644 --- a/.github/workflows/test-code-malware-kfp.yml +++ b/.github/workflows/test-code-malware-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/malware/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/malware/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind 
- curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/malware workflow-test - echo "Run transforms/code/malware completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/malware workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/malware workflow-test + echo "Run transforms/code/malware completed" + else + echo "Skipping transforms/code/malware kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/malware workflow-test - header_text "Run transforms/code/malware completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 
/tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/malware workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/malware workflow-test + echo "Run transforms/code/malware completed" + else + echo "Skipping transforms/code/malware kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-malware.yml b/.github/workflows/test-code-malware.yml index debc779d1..44196c62c 100644 --- a/.github/workflows/test-code-malware.yml +++ b/.github/workflows/test-code-malware.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/malware/**" - "data-processing-lib/**" - "!transforms/code/malware/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/malware/**" - "data-processing-lib/**" - "!transforms/code/malware/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-proglang_select-kfp.yml b/.github/workflows/test-code-proglang_select-kfp.yml index bbe257964..c23e0f1ff 100644 --- a/.github/workflows/test-code-proglang_select-kfp.yml +++ b/.github/workflows/test-code-proglang_select-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/proglang_select/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/proglang_select/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/proglang_select workflow-test - echo "Run transforms/code/proglang_select completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl 
https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/proglang_select workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/proglang_select workflow-test + echo "Run transforms/code/proglang_select completed" + else + echo "Skipping transforms/code/proglang_select kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/proglang_select workflow-test - header_text "Run transforms/code/proglang_select completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/proglang_select workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/proglang_select workflow-test + echo "Run transforms/code/proglang_select completed" + else + echo "Skipping transforms/code/proglang_select kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-proglang_select.yml b/.github/workflows/test-code-proglang_select.yml index 36bf6a869..4723e5d3a 100644 --- a/.github/workflows/test-code-proglang_select.yml +++ b/.github/workflows/test-code-proglang_select.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/proglang_select/**" - "data-processing-lib/**" - "!transforms/code/proglang_select/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - 
"transforms/code/proglang_select/**" - "data-processing-lib/**" - "!transforms/code/proglang_select/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-code-repo_level_ordering-kfp.yml b/.github/workflows/test-code-repo_level_ordering-kfp.yml index c26ecda52..57b39f313 100644 --- a/.github/workflows/test-code-repo_level_ordering-kfp.yml +++ b/.github/workflows/test-code-repo_level_ordering-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/repo_level_ordering/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/code/repo_level_ordering/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source 
$K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/repo_level_ordering workflow-test - echo "Run transforms/code/repo_level_ordering completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/repo_level_ordering workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/repo_level_ordering workflow-test + echo "Run transforms/code/repo_level_ordering completed" + else + echo "Skipping transforms/code/repo_level_ordering kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/code/repo_level_ordering workflow-test - header_text "Run transforms/code/repo_level_ordering completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/code/repo_level_ordering 
workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/code/repo_level_ordering workflow-test + echo "Run transforms/code/repo_level_ordering completed" + else + echo "Skipping transforms/code/repo_level_ordering kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-code-repo_level_ordering.yml b/.github/workflows/test-code-repo_level_ordering.yml index fe0ee23bb..19ec8daf5 100644 --- a/.github/workflows/test-code-repo_level_ordering.yml +++ b/.github/workflows/test-code-repo_level_ordering.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/repo_level_ordering/**" - "data-processing-lib/**" - "!transforms/code/repo_level_ordering/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/code/repo_level_ordering/**" - "data-processing-lib/**" - "!transforms/code/repo_level_ordering/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-kfp-transform.template b/.github/workflows/test-kfp-transform.template index 434a57238..1003ba643 100644 --- a/.github/workflows/test-kfp-transform.template +++ b/.github/workflows/test-kfp-transform.template @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "@TARGET_TRANSFORM_DIR@/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "@TARGET_TRANSFORM_DIR@/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - 
export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C @TARGET_TRANSFORM_DIR@ workflow-test - echo "Run @TARGET_TRANSFORM_DIR@ completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C @TARGET_TRANSFORM_DIR@ workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C @TARGET_TRANSFORM_DIR@ workflow-test + echo "Run @TARGET_TRANSFORM_DIR@ completed" + else + echo "Skipping @TARGET_TRANSFORM_DIR@ kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C @TARGET_TRANSFORM_DIR@ workflow-test - header_text "Run @TARGET_TRANSFORM_DIR@ completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export 
PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C @TARGET_TRANSFORM_DIR@ workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C @TARGET_TRANSFORM_DIR@ workflow-test + echo "Run @TARGET_TRANSFORM_DIR@ completed" + else + echo "Skipping @TARGET_TRANSFORM_DIR@ kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-kfp.yml b/.github/workflows/test-kfp.yml index 6719c322e..01deebcfa 100644 --- a/.github/workflows/test-kfp.yml +++ b/.github/workflows/test-kfp.yml @@ -42,6 +42,11 @@ on: - "!**/images/**" - "!**/.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: KFP_BLACK_LIST: "doc_chunk-ray,pdf2parquet-ray,pii_redactor" @@ -51,7 +56,7 @@ jobs: # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -68,7 +73,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -117,7 +122,7 @@ jobs: echo "Run ${transforms[$index]} completed" test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -167,7 +172,7 @@ jobs: header_text "Run ${transforms[$index]} completed" build-kfp-components: needs: [check_if_push_images] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 30 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-doc_chunk.yml b/.github/workflows/test-language-doc_chunk.yml index fa3ea58ca..ec78512e5 100644 --- a/.github/workflows/test-language-doc_chunk.yml +++ b/.github/workflows/test-language-doc_chunk.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/doc_chunk/**" - "data-processing-lib/**" - "!transforms/language/doc_chunk/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/doc_chunk/**" - "data-processing-lib/**" - "!transforms/language/doc_chunk/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-doc_quality-kfp.yml b/.github/workflows/test-language-doc_quality-kfp.yml index e9f678595..1c5237b20 100644 --- a/.github/workflows/test-language-doc_quality-kfp.yml +++ b/.github/workflows/test-language-doc_quality-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/doc_quality/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/doc_quality/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/doc_quality workflow-test - echo "Run transforms/language/doc_quality completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl 
https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/doc_quality workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/doc_quality workflow-test + echo "Run transforms/language/doc_quality completed" + else + echo "Skipping transforms/language/doc_quality kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/doc_quality workflow-test - header_text "Run transforms/language/doc_quality completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/doc_quality workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/doc_quality workflow-test + echo "Run transforms/language/doc_quality completed" + else + echo "Skipping transforms/language/doc_quality kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-language-doc_quality.yml b/.github/workflows/test-language-doc_quality.yml index dde61e1fa..443c22152 100644 --- a/.github/workflows/test-language-doc_quality.yml +++ b/.github/workflows/test-language-doc_quality.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/doc_quality/**" - "data-processing-lib/**" - "!transforms/language/doc_quality/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - 
"transforms/language/doc_quality/**" - "data-processing-lib/**" - "!transforms/language/doc_quality/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-html2parquet.yml b/.github/workflows/test-language-html2parquet.yml index 8caf4efd9..e5ef8e510 100644 --- a/.github/workflows/test-language-html2parquet.yml +++ b/.github/workflows/test-language-html2parquet.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/html2parquet/**" - "data-processing-lib/**" - "!transforms/language/html2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/html2parquet/**" - "data-processing-lib/**" - "!transforms/language/html2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-lang_id-kfp.yml b/.github/workflows/test-language-lang_id-kfp.yml index cf3dec397..c6eb179b8 100644 --- a/.github/workflows/test-language-lang_id-kfp.yml +++ b/.github/workflows/test-language-lang_id-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/lang_id/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/lang_id/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/lang_id workflow-test - echo "Run transforms/language/lang_id completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o 
/tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/lang_id workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/lang_id workflow-test + echo "Run transforms/language/lang_id completed" + else + echo "Skipping transforms/language/lang_id kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/lang_id workflow-test - header_text "Run transforms/language/lang_id completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/lang_id workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/lang_id workflow-test + echo "Run transforms/language/lang_id completed" + else + echo "Skipping transforms/language/lang_id kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-language-lang_id.yml b/.github/workflows/test-language-lang_id.yml index 3b39358c9..7c318a3a1 100644 --- a/.github/workflows/test-language-lang_id.yml +++ b/.github/workflows/test-language-lang_id.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/lang_id/**" - "data-processing-lib/**" - "!transforms/language/lang_id/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/lang_id/**" - "data-processing-lib/**" - "!transforms/language/lang_id/**/kfp_ray/**" # This is/will be tested 
in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-pdf2parquet.yml b/.github/workflows/test-language-pdf2parquet.yml index bb523c57e..fbdd81b8e 100644 --- a/.github/workflows/test-language-pdf2parquet.yml +++ b/.github/workflows/test-language-pdf2parquet.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/pdf2parquet/**" - "data-processing-lib/**" - "!transforms/language/pdf2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/pdf2parquet/**" - "data-processing-lib/**" - "!transforms/language/pdf2parquet/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-pii_redactor.yml b/.github/workflows/test-language-pii_redactor.yml index 9656a2f24..5ecc80b08 100644 --- a/.github/workflows/test-language-pii_redactor.yml +++ b/.github/workflows/test-language-pii_redactor.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/pii_redactor/**" - "data-processing-lib/**" - "!transforms/language/pii_redactor/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/pii_redactor/**" - "data-processing-lib/**" - "!transforms/language/pii_redactor/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-language-text_encoder-kfp.yml b/.github/workflows/test-language-text_encoder-kfp.yml index d90b76820..8e238dfcf 100644 --- a/.github/workflows/test-language-text_encoder-kfp.yml +++ b/.github/workflows/test-language-text_encoder-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/text_encoder/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/language/text_encoder/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/text_encoder workflow-test - echo "Run transforms/language/text_encoder completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl 
https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/text_encoder workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/text_encoder workflow-test + echo "Run transforms/language/text_encoder completed" + else + echo "Skipping transforms/language/text_encoder kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/language/text_encoder workflow-test - header_text "Run transforms/language/text_encoder completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/language/text_encoder workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/language/text_encoder workflow-test + echo "Run transforms/language/text_encoder completed" + else + echo "Skipping transforms/language/text_encoder kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-language-text_encoder.yml b/.github/workflows/test-language-text_encoder.yml index f7622f8e0..d49c1193d 100644 --- a/.github/workflows/test-language-text_encoder.yml +++ b/.github/workflows/test-language-text_encoder.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/language/text_encoder/**" - "data-processing-lib/**" - "!transforms/language/text_encoder/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - 
"transforms/.make.transforms" - "transforms/language/text_encoder/**" - "data-processing-lib/**" - "!transforms/language/text_encoder/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-lib.yml b/.github/workflows/test-lib.yml index be00c2076..5a1cff872 100644 --- a/.github/workflows/test-lib.yml +++ b/.github/workflows/test-lib.yml @@ -25,13 +25,18 @@ on: - "!data-processing-lib/**/doc/**" - "!data-processing-lib/**/.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_images: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -48,7 +53,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-python-lib: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -56,7 +61,7 @@ jobs: run: | make -C data-processing-lib/python DOCKER=docker venv test test-ray-lib: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -64,7 +69,7 @@ jobs: run: | make -C data-processing-lib/ray DOCKER=docker venv test test-spark-lib: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -74,7 +79,7 @@ jobs: test-data-processing-lib-images: needs: [check_if_push_images] if: needs.check_if_push_images.outputs.publish_images == 'true' - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} diff --git a/.github/workflows/test-misc.yml b/.github/workflows/test-misc.yml index 2c601bbd5..62c1a187a 100644 --- a/.github/workflows/test-misc.yml +++ b/.github/workflows/test-misc.yml @@ -29,7 +29,7 @@ on: jobs: test-make: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -37,7 +37,7 @@ jobs: run: | make -n clean test build publish set-versions check-transform-test-workflows: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/test-packaging-python.yml b/.github/workflows/test-packaging-python.yml index 4ee491c8e..e88eeeae2 100644 --- a/.github/workflows/test-packaging-python.yml +++ b/.github/workflows/test-packaging-python.yml @@ -27,7 +27,7 @@ on: jobs: test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/test-packaging-ray.yml b/.github/workflows/test-packaging-ray.yml index 4b812540c..9dbce3110 100644 --- a/.github/workflows/test-packaging-ray.yml +++ b/.github/workflows/test-packaging-ray.yml @@ -27,7 +27,7 @@ on: jobs: test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/test-transform.template b/.github/workflows/test-transform.template index e0966717e..f3907d56a 100644 --- a/.github/workflows/test-transform.template +++ b/.github/workflows/test-transform.template @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "@TARGET_TRANSFORM_DIR@/**" - "data-processing-lib/**" - "!@TARGET_TRANSFORM_DIR@/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "@TARGET_TRANSFORM_DIR@/**" - "data-processing-lib/**" - "!@TARGET_TRANSFORM_DIR@/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. 
# The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-doc_id-kfp.yml b/.github/workflows/test-universal-doc_id-kfp.yml index 28c1d8717..8ed1df919 100644 --- a/.github/workflows/test-universal-doc_id-kfp.yml +++ b/.github/workflows/test-universal-doc_id-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/doc_id/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/doc_id/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/doc_id workflow-test - echo "Run transforms/universal/doc_id completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L 
https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/doc_id workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/doc_id workflow-test + echo "Run transforms/universal/doc_id completed" + else + echo "Skipping transforms/universal/doc_id kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/doc_id workflow-test - header_text "Run transforms/universal/doc_id completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/doc_id workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/doc_id workflow-test + echo "Run transforms/universal/doc_id completed" + else + echo "Skipping transforms/universal/doc_id kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-doc_id.yml b/.github/workflows/test-universal-doc_id.yml index 66d0283ca..d314f3b25 100644 --- a/.github/workflows/test-universal-doc_id.yml +++ b/.github/workflows/test-universal-doc_id.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/doc_id/**" - "data-processing-lib/**" - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" 
paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/doc_id/**" - "data-processing-lib/**" - "!transforms/universal/doc_id/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-ededup-kfp.yml b/.github/workflows/test-universal-ededup-kfp.yml index 5d3481e30..93408a260 100644 --- a/.github/workflows/test-universal-ededup-kfp.yml +++ b/.github/workflows/test-universal-ededup-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/ededup/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/ededup/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source 
$K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/ededup workflow-test - echo "Run transforms/universal/ededup completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/ededup workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/ededup workflow-test + echo "Run transforms/universal/ededup completed" + else + echo "Skipping transforms/universal/ededup kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/ededup workflow-test - header_text "Run transforms/universal/ededup completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/ededup workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C 
transforms/universal/ededup workflow-test + echo "Run transforms/universal/ededup completed" + else + echo "Skipping transforms/universal/ededup kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-ededup.yml b/.github/workflows/test-universal-ededup.yml index 225c27cc3..8b4034570 100644 --- a/.github/workflows/test-universal-ededup.yml +++ b/.github/workflows/test-universal-ededup.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/ededup/**" - "data-processing-lib/**" - "!transforms/universal/ededup/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/ededup/**" - "data-processing-lib/**" - "!transforms/universal/ededup/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-fdedup-kfp.yml b/.github/workflows/test-universal-fdedup-kfp.yml index 9d331ed47..27dfc5493 100644 --- a/.github/workflows/test-universal-fdedup-kfp.yml +++ b/.github/workflows/test-universal-fdedup-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/fdedup/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/fdedup/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export 
PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/fdedup workflow-test - echo "Run transforms/universal/fdedup completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/fdedup workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/fdedup workflow-test + echo "Run transforms/universal/fdedup completed" + else + echo "Skipping transforms/universal/fdedup kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/fdedup workflow-test - header_text "Run transforms/universal/fdedup completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind 
https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/fdedup workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/fdedup workflow-test + echo "Run transforms/universal/fdedup completed" + else + echo "Skipping transforms/universal/fdedup kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-fdedup.yml b/.github/workflows/test-universal-fdedup.yml index 356736fca..5f68d4799 100644 --- a/.github/workflows/test-universal-fdedup.yml +++ b/.github/workflows/test-universal-fdedup.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/fdedup/**" - "data-processing-lib/**" - "!transforms/universal/fdedup/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/fdedup/**" - "data-processing-lib/**" - "!transforms/universal/fdedup/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-filter-kfp.yml b/.github/workflows/test-universal-filter-kfp.yml index 59ebbde3c..bd2f57229 100644 --- a/.github/workflows/test-universal-filter-kfp.yml +++ b/.github/workflows/test-universal-filter-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/filter/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/filter/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/filter workflow-test - echo "Run transforms/universal/filter completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o 
/tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/filter workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/filter workflow-test + echo "Run transforms/universal/filter completed" + else + echo "Skipping transforms/universal/filter kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/filter workflow-test - header_text "Run transforms/universal/filter completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/filter workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/filter workflow-test + echo "Run transforms/universal/filter completed" + else + echo "Skipping transforms/universal/filter kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-filter.yml b/.github/workflows/test-universal-filter.yml index 44858feff..43e936166 100644 --- a/.github/workflows/test-universal-filter.yml +++ b/.github/workflows/test-universal-filter.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/filter/**" - "data-processing-lib/**" - "!transforms/universal/filter/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/filter/**" - "data-processing-lib/**" - "!transforms/universal/filter/**/kfp_ray/**" # This is/will be tested 
in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-hap.yml b/.github/workflows/test-universal-hap.yml index b92e5867c..c845506c1 100644 --- a/.github/workflows/test-universal-hap.yml +++ b/.github/workflows/test-universal-hap.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/hap/**" - "data-processing-lib/**" - "!transforms/universal/hap/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/hap/**" - "data-processing-lib/**" - "!transforms/universal/hap/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-noop-kfp.yml b/.github/workflows/test-universal-noop-kfp.yml index 19c62ab49..01b14e51b 100644 --- a/.github/workflows/test-universal-noop-kfp.yml +++ b/.github/workflows/test-universal-noop-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/noop/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/noop/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/noop workflow-test - echo "Run transforms/universal/noop completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x 
/tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/noop workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/noop workflow-test + echo "Run transforms/universal/noop completed" + else + echo "Skipping transforms/universal/noop kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/noop workflow-test - header_text "Run transforms/universal/noop completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/noop workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/noop workflow-test + echo "Run transforms/universal/noop completed" + else + echo "Skipping transforms/universal/noop kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-noop.yml b/.github/workflows/test-universal-noop.yml index cd72703d1..13e066d58 100644 --- a/.github/workflows/test-universal-noop.yml +++ b/.github/workflows/test-universal-noop.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/noop/**" - "data-processing-lib/**" - "!transforms/universal/noop/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/noop/**" - "data-processing-lib/**" - "!transforms/universal/noop/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - 
"!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-profiler-kfp.yml b/.github/workflows/test-universal-profiler-kfp.yml index 3d377922d..e30f7bafa 100644 --- a/.github/workflows/test-universal-profiler-kfp.yml +++ b/.github/workflows/test-universal-profiler-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/profiler/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/profiler/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/profiler workflow-test - echo "Run transforms/universal/profiler completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export 
REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/profiler workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/profiler workflow-test + echo "Run transforms/universal/profiler completed" + else + echo "Skipping transforms/universal/profiler kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/profiler workflow-test - header_text "Run transforms/universal/profiler completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/profiler workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/profiler workflow-test + echo "Run transforms/universal/profiler completed" + else + echo "Skipping transforms/universal/profiler kfp test for lack of Makefile and/or kfp_ray" + fi diff --git 
a/.github/workflows/test-universal-profiler.yml b/.github/workflows/test-universal-profiler.yml index 50cd8cd26..e018e0ed3 100644 --- a/.github/workflows/test-universal-profiler.yml +++ b/.github/workflows/test-universal-profiler.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/profiler/**" - "data-processing-lib/**" - "!transforms/universal/profiler/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/profiler/**" - "data-processing-lib/**" - "!transforms/universal/profiler/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-resize-kfp.yml b/.github/workflows/test-universal-resize-kfp.yml index fe7377178..630de3c05 100644 --- a/.github/workflows/test-universal-resize-kfp.yml +++ b/.github/workflows/test-universal-resize-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/resize/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/resize/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh 
https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/resize workflow-test - echo "Run transforms/universal/resize completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/resize workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/resize workflow-test + echo "Run transforms/universal/resize completed" + else + echo "Skipping transforms/universal/resize kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/resize workflow-test - header_text "Run transforms/universal/resize completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 
/tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/resize workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/resize workflow-test + echo "Run transforms/universal/resize completed" + else + echo "Skipping transforms/universal/resize kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-resize.yml b/.github/workflows/test-universal-resize.yml index 99e14b1b8..b3399e5ec 100644 --- a/.github/workflows/test-universal-resize.yml +++ b/.github/workflows/test-universal-resize.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/resize/**" - "data-processing-lib/**" - "!transforms/universal/resize/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/resize/**" - "data-processing-lib/**" - "!transforms/universal/resize/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/test-universal-tokenization-kfp.yml b/.github/workflows/test-universal-tokenization-kfp.yml index f127db59b..ff13a444c 100644 --- a/.github/workflows/test-universal-tokenization-kfp.yml +++ b/.github/workflows/test-universal-tokenization-kfp.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/tokenization/**" - "!kfp/**" # This is tested in separate workflow - "!data-processing-lib/**" # This is tested in separate workflow @@ -24,6 +26,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - "transforms/.make.workflow" - "transforms/universal/tokenization/**" - "!data-processing-lib/**" # This is tested in separate workflow - "!kfp/**" # This is tested in separate workflow @@ -32,10 +36,14 @@ on: - "!**/images/**" - "!**.gitignore" +# taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: test-kfp-v1: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -51,30 +59,34 @@ jobs: - name: Test KFP libs (shared and v1) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/tokenization workflow-test - echo "Run transforms/universal/tokenization completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl 
https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/tokenization workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/tokenization workflow-test + echo "Run transforms/universal/tokenization completed" + else + echo "Skipping transforms/universal/tokenization kfp test for lack of Makefile and/or kfp_ray" + fi test-kfp-v2: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -90,25 +102,29 @@ jobs: - name: Test KFP libs (shared and v2) and run a workflow timeout-minutes: 120 run: | - export REPOROOT=$PWD - export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup - source $K8S_SETUP_SCRIPTS/requirements.env - export PATH=$PATH:/tmp/ - curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 - chmod 777 /tmp/kind - curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 - chmod 700 /tmp/get_helm.sh - HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo - chmod 777 /tmp/helm - curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl - chmod 777 /tmp/kubectl - curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc - chmod +x /tmp/mc - export DEPLOY_KUBEFLOW=1 - export KFPv2=1 - make -C $K8S_SETUP_SCRIPTS setup - make -C kfp/kfp_support_lib test - make -C transforms workflow-build - source $K8S_SETUP_SCRIPTS/common.sh - make -C transforms/universal/tokenization workflow-test - header_text "Run transforms/universal/tokenization completed" + if [ -e "@TARGET_TRANSFORM_DIR/Makefile" -a -d "@TARGET_TRANSFORM_DIR/kfp_ray" ]; then + export REPOROOT=$PWD + export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup + source $K8S_SETUP_SCRIPTS/requirements.env + export PATH=$PATH:/tmp/ + curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64 + chmod 777 /tmp/kind + curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 /tmp/get_helm.sh + HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo + chmod 777 /tmp/helm + curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl + chmod 777 /tmp/kubectl + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc + chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 + export KFPv2=1 + make -C $K8S_SETUP_SCRIPTS setup + make -C kfp/kfp_support_lib test + make -C transforms/universal/tokenization workflow-build + source $K8S_SETUP_SCRIPTS/common.sh + make -C transforms/universal/tokenization workflow-test + echo "Run transforms/universal/tokenization completed" + else + echo "Skipping transforms/universal/tokenization kfp test for lack of Makefile and/or kfp_ray" + fi diff --git a/.github/workflows/test-universal-tokenization.yml b/.github/workflows/test-universal-tokenization.yml index e7a620882..ae547c396 100644 --- a/.github/workflows/test-universal-tokenization.yml +++ b/.github/workflows/test-universal-tokenization.yml @@ -12,6 +12,8 @@ on: tags: - "*" paths: + - ".make.*" + - "transforms/.make.transforms" - "transforms/universal/tokenization/**" - "data-processing-lib/**" - "!transforms/universal/tokenization/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -26,6 +28,8 @@ on: - "dev" - "releases/**" paths: + - ".make.*" + - 
"transforms/.make.transforms" - "transforms/universal/tokenization/**" - "data-processing-lib/**" - "!transforms/universal/tokenization/**/kfp_ray/**" # This is/will be tested in separate workflow @@ -36,13 +40,18 @@ on: - "!**/images/**" - "!**.gitignore" +# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check_if_push_image: # check whether the Docker images should be pushed to the remote repository # The images are pushed if it is a merge to dev branch or a new tag is created. # The latter being part of the release process. # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest outputs: publish_images: ${{ steps.version.outputs.publish_images }} steps: @@ -59,7 +68,7 @@ jobs: fi echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" test-src: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 @@ -81,7 +90,7 @@ jobs: fi test-image: needs: [check_if_push_image] - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest timeout-minutes: 120 env: DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} diff --git a/.github/workflows/workflow-manual-run.yml b/.github/workflows/workflow-manual-run.yml index 3c0f37d47..f0f7028b6 100644 --- a/.github/workflows/workflow-manual-run.yml +++ b/.github/workflows/workflow-manual-run.yml @@ -22,7 +22,7 @@ jobs: KFPv2: ${{ github.event.inputs.kfp_v2 }} WORKFLOW_PATH: ${{ github.event.inputs.workflow-path }} DEBUG_MODE: ${{ github.event.inputs.debug }} - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 diff --git a/scripts/check-workflows.sh b/scripts/check-workflows.sh index 40f4e2615..afc73a886 100755 --- a/scripts/check-workflows.sh +++ b/scripts/check-workflows.sh @@ -1,20 +1,52 @@ #!/bin/bash -# Check that each transform in transforms// has a corresponding -# .github/workflows/test--.yml file. +usage() { +cat << EOF +Check that each transform in transforms// has a corresponding + .github/workflows/test--.yml file and, + .github/workflows/test---kfp.yml file if + there is a kfp_ray directory for the transform, and + the transform is not in the kfp black list. +Options: + -show-kfp-black-list: prints the space separate list of transform + directories (base names) and exits. + -help: show this message. +EOF +} + if [ ! -d transforms ]; then echo Please run this script from the top of the repository exit 1 fi +KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor" +while [ $# -ne 0 ]; do + case $1 in + -show-kfp-black-list) echo $KFP_BLACK_LIST; exit 0; + ;; + *help) usage; exit 0; + ;; + *) echo Unrecognized option $1. exit 1 + ;; + esac + shift; +done for i in $(find transforms -maxdepth 2 -mindepth 2 -type d | grep -v venv); do transform=$(basename $i) category=$(dirname $i) category=$(basename $category) - workflow=.github/workflows/test-$category-$transform.yml - if [ ! 
-e $workflow ]; then - echo Missing $workflow for transform $category/$transform - echo Fix this by running make in the .github/workflows directory - exit 1 + workflows=.github/workflows/test-$category-$transform.yml + is_blacklisted=$(echo $KFP_BLACK_LIST | grep $transform) + if [ -d $i/kfp_ray -a -z "$is_blacklisted" ]; then + workflows="$workflows .github/workflows/test-$category-$transform-kfp.yml" else - echo Verified existence of $workflow - fi + echo KFP workflow for $transform is not expected. + fi + for workflow in $workflows; do + if [ ! -e $workflow ]; then + echo Missing $workflow for transform $category/$transform + echo Fix this by running make in the .github/workflows directory + exit 1 + else + echo Verified existence of $workflow + fi + done done diff --git a/transforms/code/code2parquet/Makefile b/transforms/code/code2parquet/Makefile index bc4077099..027d29644 100644 --- a/transforms/code/code2parquet/Makefile +++ b/transforms/code/code2parquet/Makefile @@ -55,16 +55,25 @@ set-versions: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build \ No newline at end of file + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/code_quality/Makefile b/transforms/code/code_quality/Makefile index 204ea8856..bca6f7e85 100644 --- a/transforms/code/code_quality/Makefile +++ b/transforms/code/code_quality/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/header_cleanser/Makefile b/transforms/code/header_cleanser/Makefile index 204ea8856..bca6f7e85 100644 --- a/transforms/code/header_cleanser/Makefile +++ b/transforms/code/header_cleanser/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/license_select/Makefile b/transforms/code/license_select/Makefile index b19f5c963..04b1cc451 100644 --- a/transforms/code/license_select/Makefile +++ 
b/transforms/code/license_select/Makefile @@ -47,16 +47,25 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/malware/Makefile b/transforms/code/malware/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/code/malware/Makefile +++ b/transforms/code/malware/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/proglang_select/Makefile b/transforms/code/proglang_select/Makefile index 9c7c898e4..9e222ee79 100644 --- a/transforms/code/proglang_select/Makefile +++ b/transforms/code/proglang_select/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/repo_level_ordering/Makefile b/transforms/code/repo_level_ordering/Makefile index cebfb4848..04b1cc451 100644 --- a/transforms/code/repo_level_ordering/Makefile +++ b/transforms/code/repo_level_ordering/Makefile @@ -45,15 +45,27 @@ load-image:: @# Help: Recursively make $@ in all subdirs $(MAKE) RULE=$@ .recurse -# kfp implementation is not yet added, so below targets don't do anything. 
.PHONY: workflow-venv workflow-venv: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/code/repo_level_ordering/ray/Makefile b/transforms/code/repo_level_ordering/ray/Makefile index 771ed9240..83f8692de 100644 --- a/transforms/code/repo_level_ordering/ray/Makefile +++ b/transforms/code/repo_level_ordering/ray/Makefile @@ -50,3 +50,10 @@ run-s3-sample: .transforms.run-s3-ray-sample minio-start: .minio-start load-image:: .transforms.load-image + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image + +docker-save-image: .defaults.docker-save-image + diff --git a/transforms/language/doc_chunk/Makefile b/transforms/language/doc_chunk/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/language/doc_chunk/Makefile +++ b/transforms/language/doc_chunk/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/doc_quality/Makefile b/transforms/language/doc_quality/Makefile index 5cded280a..a3f1865be 100644 --- a/transforms/language/doc_quality/Makefile +++ b/transforms/language/doc_quality/Makefile @@ -47,16 +47,25 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build \ No newline at end of file + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/html2parquet/Makefile b/transforms/language/html2parquet/Makefile index 017eb23b4..bca6f7e85 100644 --- a/transforms/language/html2parquet/Makefile +++ b/transforms/language/html2parquet/Makefile @@ -55,12 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/lang_id/Makefile b/transforms/language/lang_id/Makefile index 2967ceb67..af4a86873 100644 --- 
a/transforms/language/lang_id/Makefile +++ b/transforms/language/lang_id/Makefile @@ -60,16 +60,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/pdf2parquet/Makefile b/transforms/language/pdf2parquet/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/language/pdf2parquet/Makefile +++ b/transforms/language/pdf2parquet/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/pii_redactor/Makefile.disable b/transforms/language/pii_redactor/Makefile.disable index f9f635f07..8764d0dc2 100644 --- a/transforms/language/pii_redactor/Makefile.disable +++ b/transforms/language/pii_redactor/Makefile.disable @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/language/text_encoder/Makefile b/transforms/language/text_encoder/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/language/text_encoder/Makefile +++ b/transforms/language/text_encoder/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile index 7ebb1b8e1..be26d3bf4 100644 --- a/transforms/universal/doc_id/Makefile +++ b/transforms/universal/doc_id/Makefile @@ -55,16 +55,25 @@ 
docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/ededup/Makefile b/transforms/universal/ededup/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/ededup/Makefile +++ b/transforms/universal/ededup/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/fdedup/Makefile b/transforms/universal/fdedup/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/fdedup/Makefile +++ b/transforms/universal/fdedup/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/filter/Makefile b/transforms/universal/filter/Makefile index 9c7c898e4..9e222ee79 100644 --- a/transforms/universal/filter/Makefile +++ b/transforms/universal/filter/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/hap/Makefile b/transforms/universal/hap/Makefile index 017eb23b4..cdb34d72e 100644 --- a/transforms/universal/hap/Makefile +++ b/transforms/universal/hap/Makefile @@ -55,12 +55,24 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: + if [ -e kfp_ray ]; then \ + $(MAKE) -C 
kfp_ray workflow-test; \ + fi .PHONY: workflow-upload workflow-upload: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi diff --git a/transforms/universal/noop/Makefile b/transforms/universal/noop/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/noop/Makefile +++ b/transforms/universal/noop/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/profiler/Makefile b/transforms/universal/profiler/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/profiler/Makefile +++ b/transforms/universal/profiler/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/resize/Makefile b/transforms/universal/resize/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/resize/Makefile +++ b/transforms/universal/resize/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi + diff --git a/transforms/universal/tokenization/Makefile b/transforms/universal/tokenization/Makefile index 05d3c3111..bca6f7e85 100644 --- a/transforms/universal/tokenization/Makefile +++ b/transforms/universal/tokenization/Makefile @@ -55,16 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray workflow-venv + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-venv; \ + fi .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray workflow-test - + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-test; \ + fi + .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray workflow-upload + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-upload; \ + fi .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray 
workflow-build + if [ -e kfp_ray ]; then \ + $(MAKE) -C kfp_ray workflow-build; \ + fi +