From bedc1791f1976dc9b3016d6d5fc6fbe3470fac55 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 6 Oct 2024 13:56:33 -0400 Subject: [PATCH 01/15] added single package for data-prep-kit with extra[dev,ray,spak] Signed-off-by: Maroun Touma --- data-processing-lib/Makefile | 5 +++ data-processing-lib/pyproject.toml | 44 ++++++++++++++++++++++ data-processing-lib/requirements-dev.txt | 9 +++++ data-processing-lib/requirements-ray.txt | 3 ++ data-processing-lib/requirements-spark.txt | 2 + data-processing-lib/requirements.txt | 6 +++ 6 files changed, 69 insertions(+) create mode 100644 data-processing-lib/pyproject.toml create mode 100644 data-processing-lib/requirements-dev.txt create mode 100644 data-processing-lib/requirements-ray.txt create mode 100644 data-processing-lib/requirements-spark.txt create mode 100644 data-processing-lib/requirements.txt diff --git a/data-processing-lib/Makefile b/data-processing-lib/Makefile index a70a05ff8..d0d1305ac 100644 --- a/data-processing-lib/Makefile +++ b/data-processing-lib/Makefile @@ -55,3 +55,8 @@ set-versions: @# Help: Recursively $@ in all subdirs @$(MAKE) RULE=$@ .recurse + +build-pkg-dist:: + $(MAKE) .defaults.build-dist BUILD_WHEEL_ARG=-w + +publish-dist :: .check-env .defaults.publish-dist diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml new file mode 100644 index 000000000..9dbbc7f27 --- /dev/null +++ b/data-processing-lib/pyproject.toml @@ -0,0 +1,44 @@ +[project] +name = "data_prep_toolkit" +version = "0.2.2.dev0" +keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] +requires-python = ">=3.10" +description = "Data Preparation Toolkit Library for Ray and Python" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "David Wood", email = "dawood@us.ibm.com" }, + { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, +] + +dynamic = 
["dependencies", "optional-dependencies"] + +[project_urls] +Repository = "https://github.com/IBM/data-prep-kit" +Issues = "https://github.com/IBM/data-prep-kit/issues" +Documentation = "https://ibm.github.io/data-prep-kit/" +"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic.dependencies] +file = ["requirements.txt"] + +[tool.setuptools.dynamic.optional-dependencies] +dev = { file = ["requirements-dev.txt"]} +ray = { file = ["requirements-ray.txt"]} +spark = { file = ["requirements-spark.txt"]} + +[tool.setuptools.packages.find] +where = ["python/src", "ray/src", "spark/src"] + + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/data-processing-lib/requirements-dev.txt b/data-processing-lib/requirements-dev.txt new file mode 100644 index 000000000..326d62c8e --- /dev/null +++ b/data-processing-lib/requirements-dev.txt @@ -0,0 +1,9 @@ +twine +pytest>=7.3.2 +pytest-dotenv>=0.5.2 +pytest-env>=1.0.0 +pre-commit>=3.3.2 +pytest-cov>=4.1.0 +pytest-mock>=3.10.0 +moto==5.0.5 +markupsafe==2.0.1 diff --git a/data-processing-lib/requirements-ray.txt b/data-processing-lib/requirements-ray.txt new file mode 100644 index 000000000..aafa3caeb --- /dev/null +++ b/data-processing-lib/requirements-ray.txt @@ -0,0 +1,3 @@ +ray[default]==2.24.0 +fastapi>=0.110.2 +pillow>=10.3.0 diff --git a/data-processing-lib/requirements-spark.txt b/data-processing-lib/requirements-spark.txt new file mode 100644 index 000000000..f38f033da --- /dev/null +++ b/data-processing-lib/requirements-spark.txt @@ -0,0 +1,2 @@ +pyspark>=3.5.2 +psutil>=6.0.0 
diff --git a/data-processing-lib/requirements.txt b/data-processing-lib/requirements.txt new file mode 100644 index 000000000..7b363f2b5 --- /dev/null +++ b/data-processing-lib/requirements.txt @@ -0,0 +1,6 @@ + numpy < 1.29.0 + pyarrow==16.1.0 + boto3==1.34.69 + argparse + mmh3 + psutil From c6514d52916e7ea902e47d14941b73028d68cfb0 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 6 Oct 2024 14:03:24 -0400 Subject: [PATCH 02/15] Crate a single package with [all,ray,code_quality,etc.] Signed-off-by: Maroun Touma --- .make.defaults | 2 +- transforms/Makefile | 29 ++++++ .../python/README.md => README-list.md} | 5 + .../code/code2parquet/python/pyproject.toml | 13 ++- .../code/code2parquet/python/requirements.txt | 3 + .../code/code_quality/python/pyproject.toml | 14 ++- .../code/code_quality/python/requirements.txt | 3 + .../header_cleanser/python/pyproject.toml | 9 +- .../header_cleanser/python/requirements.txt | 3 + .../code/license_select/python/pyproject.toml | 8 +- .../license_select/python/requirements.txt | 1 + .../proglang_select/python/pyproject.toml | 8 +- .../proglang_select/python/requirements.txt | 1 + .../language/doc_chunk/python/pyproject.toml | 9 +- .../doc_chunk/python/requirements.txt | 3 + .../doc_quality/python/pyproject.toml | 8 +- .../doc_quality/python/requirements.txt | 2 + .../language/lang_id/python/pyproject.toml | 15 +-- .../language/lang_id/python/requirements.txt | 5 + .../pii_redactor/python/pyproject.toml | 11 +-- .../pii_redactor/python/requirements.txt | 5 + .../text_encoder/python/pyproject.toml | 8 +- .../text_encoder/python/requirements.txt | 2 + transforms/packaging/.gitignore | 5 - transforms/packaging/.make.packaging | 83 ---------------- transforms/packaging/Makefile | 60 ------------ transforms/packaging/README.md | 55 ----------- transforms/packaging/python/Makefile | 89 ----------------- transforms/packaging/python/pyproject.toml | 39 -------- .../packaging/python/requirements.all.txt | 51 ---------- 
.../packaging/python/requirements.lang1.txt | 32 ------- transforms/packaging/ray/Makefile | 66 ------------- transforms/packaging/ray/README.md | 41 -------- transforms/packaging/ray/pyproject.toml | 40 -------- transforms/packaging/ray/requirements.txt | 21 ---- transforms/pyproject.toml | 96 +++++++++++++++++++ transforms/requirements-ray.txt | 9 ++ transforms/requirements.txt | 1 + .../universal/doc_id/python/pyproject.toml | 7 +- .../universal/doc_id/python/requirements.txt | 1 + .../universal/ededup/python/pyproject.toml | 10 +- .../universal/ededup/python/requirements.txt | 3 + .../universal/fdedup/ray/pyproject.toml | 4 +- .../universal/filter/python/pyproject.toml | 13 ++- .../universal/filter/python/requirements.txt | 3 + .../universal/profiler/python/pyproject.toml | 9 +- .../profiler/python/requirements.txt | 5 + .../universal/resize/python/pyproject.toml | 7 +- .../universal/resize/python/requirements.txt | 1 + .../tokenization/python/pyproject.toml | 10 +- .../tokenization/python/requirements.txt | 2 + 51 files changed, 274 insertions(+), 656 deletions(-) rename transforms/{packaging/python/README.md => README-list.md} (94%) create mode 100644 transforms/code/code2parquet/python/requirements.txt create mode 100644 transforms/code/code_quality/python/requirements.txt create mode 100644 transforms/code/header_cleanser/python/requirements.txt create mode 100644 transforms/code/license_select/python/requirements.txt create mode 100644 transforms/code/proglang_select/python/requirements.txt create mode 100644 transforms/language/doc_chunk/python/requirements.txt create mode 100644 transforms/language/doc_quality/python/requirements.txt create mode 100644 transforms/language/lang_id/python/requirements.txt create mode 100644 transforms/language/pii_redactor/python/requirements.txt create mode 100644 transforms/language/text_encoder/python/requirements.txt delete mode 100644 transforms/packaging/.gitignore delete mode 100644 
transforms/packaging/.make.packaging delete mode 100644 transforms/packaging/Makefile delete mode 100644 transforms/packaging/README.md delete mode 100644 transforms/packaging/python/Makefile delete mode 100644 transforms/packaging/python/pyproject.toml delete mode 100644 transforms/packaging/python/requirements.all.txt delete mode 100644 transforms/packaging/python/requirements.lang1.txt delete mode 100644 transforms/packaging/ray/Makefile delete mode 100644 transforms/packaging/ray/README.md delete mode 100644 transforms/packaging/ray/pyproject.toml delete mode 100644 transforms/packaging/ray/requirements.txt create mode 100644 transforms/pyproject.toml create mode 100644 transforms/requirements-ray.txt create mode 100644 transforms/requirements.txt create mode 100644 transforms/universal/doc_id/python/requirements.txt create mode 100644 transforms/universal/ededup/python/requirements.txt create mode 100644 transforms/universal/filter/python/requirements.txt create mode 100644 transforms/universal/profiler/python/requirements.txt create mode 100644 transforms/universal/resize/python/requirements.txt create mode 100644 transforms/universal/tokenization/python/requirements.txt diff --git a/.make.defaults b/.make.defaults index f9f58500f..4e07d84b3 100644 --- a/.make.defaults +++ b/.make.defaults @@ -627,7 +627,7 @@ MINIO_ADMIN_PWD= localminiosecretkey rm -rf dist || true rm -rf src/*egg-info || true ${PIP} install --upgrade build - ${PYTHON} -m build + ${PYTHON} -m build $(BUILD_WHEEL_ARG) # Publish the distribution in the dist directory, usually created with .defaults.build-dist target .PHONY: .defaults.publish-dist diff --git a/transforms/Makefile b/transforms/Makefile index 2f8fa27b1..5ff1f5111 100644 --- a/transforms/Makefile +++ b/transforms/Makefile @@ -79,3 +79,32 @@ workflow-upload:: set-versions:: @# Help: Recursively make $@ in all subdirs @$(MAKE) RULE=$@ .recurse + + +build-pkg-dist:: + ## Most transforms today don't have a package name.... 
Need to fix that + ## In the meantime, we will copy everything to a single folder + -rm -fr src + mkdir src + # Copy all the src folders recursively (not clear if they have subfolders) + for x in $(shell find . | grep '[ray| python]/src$$') ; do \ + echo $$x ; \ + if [ -d "$$x" ]; then \ + cp -r $$x/* src ; \ + fi \ + done + # Only needs to build the whl + $(MAKE) BUILD_WHEEL_ARG=-w .defaults.build-dist + +test-pkg-dist:: + -rm -fr venv + python -m venv venv + source venv/bin/activate && $(PYTHON) -m pip install '$(REPOROOT)/data-processing-lib/dist/data_prep_toolkit-$(DPK_VERSION)-py3-none-any.whl[dev,ray]' + source venv/bin/activate && $(PYTHON) -m pip install 'dist/data_prep_toolkit_transforms-$(DPK_TRANSFORMS_VERSION)-py3-none-any.whl[all]' + for T in $(shell find . | grep '[ray| python]/test$$') ; do \ + echo "running unit test on: $$T" ; \ + source venv/bin/activate && $(PYTEST) $$T; \ + done; + @# Help: Setup environment and run unit tests for all transforms + + diff --git a/transforms/packaging/python/README.md b/transforms/README-list.md similarity index 94% rename from transforms/packaging/python/README.md rename to transforms/README-list.md index 20eb0dff0..99885ad34 100644 --- a/transforms/packaging/python/README.md +++ b/transforms/README-list.md @@ -5,9 +5,14 @@ The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard pyton library available on pypi and can be installed using pip install: `python -m pip install data-prep-toolkit-transforms` +or +`python -m pip install data-prep-toolkit-transforms[ray]` + installing the python transforms will also install `data-prep-toolkit` +installing the ray transforms will also install `data-prep-toolkit[ray]` + ## List of Transforms in current package Note: This list includes the transforms that were part of the release starting with data-prep-toolkit-transforms:0.2.1. This list may not always reflect up to date information. 
Users are encourage to raise an issue in git when they discover missing components or packages that are listed below but not in the current release they get from pypi. diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index 34a668bf0..16c63f7d3 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -9,11 +9,14 @@ authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "parameterized", - "pandas", -] +dynamic = ["dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]}] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt new file mode 100644 index 000000000..758ab56fe --- /dev/null +++ b/transforms/code/code2parquet/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev0 +parameterized +pandas diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 58e2affa7..22313fc2a 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -8,11 +8,15 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "bs4==0.0.2", - "transformers==4.38.2", -] +dynamic = ["dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + 
+[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt new file mode 100644 index 000000000..106e56f74 --- /dev/null +++ b/transforms/code/code_quality/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev0 +bs4==0.0.2 +transformers==4.38.2 diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index c4326b4a0..2e24466f0 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -8,15 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "scancode-toolkit==32.1.0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt new file mode 100644 index 000000000..bed2168c1 --- /dev/null +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev0 +scancode-toolkit==32.1.0 ; platform_system != 'Darwin' + diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index 1058b0440..0d7857d12 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ b/transforms/code/license_select/python/pyproject.toml @@ -9,14 +9,16 @@ authors = [ { name = "Shivdeep Singh", email = 
"shivdeep.singh@ibm.com" }, { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.1.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt new file mode 100644 index 000000000..e14cde7ab --- /dev/null +++ b/transforms/code/license_select/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev0 \ No newline at end of file diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index 25aa5fdcf..9745a48c3 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -8,14 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt new file mode 100644 index 000000000..e14cde7ab --- /dev/null +++ b/transforms/code/proglang_select/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev0 \ No newline at end of file diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml index 7705779b0..1a3bd333f 100644 
--- a/transforms/language/doc_chunk/python/pyproject.toml +++ b/transforms/language/doc_chunk/python/pyproject.toml @@ -10,16 +10,15 @@ authors = [ { name = "Panos Vagenas", email = "pva@zurich.ibm.com" }, { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "docling-core==1.3.0", - "llama-index-core>=0.11.0,<0.12.0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt new file mode 100644 index 000000000..8e8c1bebb --- /dev/null +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev0 +docling-core==1.3.0 +llama-index-core>=0.11.0,<0.12.0 diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index 8ebec8fe3..12c712ae9 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -8,14 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt new file mode 100644 index 000000000..f2f9d6200 --- /dev/null +++ 
b/transforms/language/doc_quality/python/requirements.txt @@ -0,0 +1,2 @@ + +data-prep-toolkit==0.2.2.dev0 diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index 54c874a36..1b9107019 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -8,13 +8,14 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "fasttext==0.9.2", - "langcodes==3.3.0", - "huggingface-hub >= 0.21.4, <1.0.0", - "numpy==1.26.4", -] +dynamic = ["dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt new file mode 100644 index 000000000..111465be0 --- /dev/null +++ b/transforms/language/lang_id/python/requirements.txt @@ -0,0 +1,5 @@ +data-prep-toolkit==0.2.2.dev0 +fasttext==0.9.2 +langcodes==3.3.0 +huggingface-hub >= 0.21.4, <1.0.0 +numpy==1.26.4 diff --git a/transforms/language/pii_redactor/python/pyproject.toml b/transforms/language/pii_redactor/python/pyproject.toml index 55d4e8970..7045b6ec0 100644 --- a/transforms/language/pii_redactor/python/pyproject.toml +++ b/transforms/language/pii_redactor/python/pyproject.toml @@ -8,18 +8,15 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Sowmya.L.R", email = "lrsowmya@gmail.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "presidio-analyzer>=2.2.355", - "presidio-anonymizer>=2.2.355", - "flair>=0.14.0", - "pandas>=2.2.2", -] +dynamic = ["dependencies"] 
[build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt new file mode 100644 index 000000000..99e423ce1 --- /dev/null +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -0,0 +1,5 @@ +data-prep-toolkit==0.2.2.dev0 +presidio-analyzer>=2.2.355 +presidio-anonymizer>=2.2.355 +flair>=0.14.0 +pandas>=2.2.2 diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index e9f84fefd..0dd0ac44c 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -10,15 +10,15 @@ authors = [ { name = "Panos Vagenas", email = "pva@zurich.ibm.com" }, { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "sentence-transformers==3.0.1", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt new file mode 100644 index 000000000..be8c0a880 --- /dev/null +++ b/transforms/language/text_encoder/python/requirements.txt @@ -0,0 +1,2 @@ +data-prep-toolkit==0.2.2.dev0 +sentence-transformers==3.0.1 diff --git a/transforms/packaging/.gitignore b/transforms/packaging/.gitignore deleted file mode 100644 index 863607847..000000000 --- a/transforms/packaging/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -**/src -**/dist 
-**/*.egg-info -**/build - diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging deleted file mode 100644 index 5268889d0..000000000 --- a/transforms/packaging/.make.packaging +++ /dev/null @@ -1,83 +0,0 @@ -ifndef T_SET -T_SET=all -endif - - -venv: - $(MAKE) .defaults.create-venv - -test:: test-src - -clean:: .transforms.clean - -rm -fr src - -image:: .transforms.python-image - -run-ut:: - source venv/bin/activate; \ - if [ -e requirements.test.txt ]; then \ - $(PYTHON) -m pip install -r requirements.test.txt ; \ - fi; \ - for T in $(TRANSFORMS_NAMES); do \ - echo running unit test on: $$T ; \ - $(PYTEST) $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/test; \ - done; - @# Help: Setup environment and run unit tests for all transforms - - -setup: .transforms.setup venv - $(MAKE) src - source venv/bin/activate; \ - $(PYTHON) -m pip install . - @# Help: Do any default transform setup before running make src and setting up a test environment - - -requirements: - if [ -e requirements.$(T_SET).txt ]; then \ - cp requirements.$(T_SET).txt requirements.txt ; \ - fi - -pkg-name: - if [ $(TRANSFORM_PKG) ]; then \ - cat pyproject.toml | sed -e \ - 's/^name[ ]*=.*/name = "'${TRANSFORM_PKG}'"/' \ - > tt.toml; \ - mv tt.toml pyproject.toml; \ - fi - -is-patch: - if [ $(IS_PATCH) ]; then \ - cat pyproject.toml | sed -e \ - 's/^version[ ]*=[ ]*"\(.*\).dev.*/version = "\1"/' \ - > tt.toml; \ - mv tt.toml pyproject.toml; \ - fi - -##################################################### -# to build a patched release, use make IS_PATCH=1 src -##################################################### -src: - mkdir src - make requirements - make pkg-name - make is-patch - for T in $(shell echo $(TRANSFORMS_NAMES)); do \ - echo copy src from $$T ; \ - cp -R $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/src/* src ; \ - rm -fr *.egg-info ; \ - rm -fr dist ; \ - rm -fr build ; \ - done; - @# Help: Setup src folder and remove old distribution. 
to setup for a patched release use: make IS_PATCH=1 $@ - - -build:: build-dist - -publish:: publish-dist - -build-dist:: src .defaults.build-dist - @# Help: build the distribution for publishing to pypi. to build a patch release (no .devN) use: make IS_PATCH=1 $@ - -publish-dist:: .defaults.publish-dist - - diff --git a/transforms/packaging/Makefile b/transforms/packaging/Makefile deleted file mode 100644 index aa75d525e..000000000 --- a/transforms/packaging/Makefile +++ /dev/null @@ -1,60 +0,0 @@ -REPOROOT=../../ -# Use make help, to see the available rules -include ../../.make.defaults - -setup:: - -clean:: - # Clean up workflows common virtual environment. - rm -rf venv || true - rm -rf *.back || true - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse - -src:: - @# Help: Recursively setup $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -setup:: - -build:: - -build-dist:: - @# Help: Recursively build distributions in all subdirs - $(MAKE) RULE=$@ .recurse - -publish-dist:: - @# Help: Recursively publish distributions in all subdirs - $(MAKE) RULE=$@ .recurse - -venv:: - -image:: - -publish:: - -test-image:: - -test:: - -test-src:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: - -workflow-venv:: - -workflow-test:: - -workflow-build:: - -workflow-upload:: - -set-versions:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse diff --git a/transforms/packaging/README.md b/transforms/packaging/README.md deleted file mode 100644 index e0d23ad52..000000000 --- a/transforms/packaging/README.md +++ /dev/null @@ -1,55 +0,0 @@ -# Transforms Pacakges for both Python and Ray - -Most available Transforms can be published to pypi as a single package. A detailed list of available Python transforms is available at this [link](python/README.md). 
Similarly the following [link](ray/README.md) provide a derailed list and installation instructions for Ray transforms - - - -## Clone folder and update version number -```` -git clone https://github.com/IBM/data-prep-kit.git package-release -cd package-release -```` -in `.make.versions`, Set the values for DPK_MAJOR_VERSION, DPK_MINOR_VERSION and DPK_MICRO_VERSION to specify the DPK library to use and as appropriate, set the value for `DPK_TRANSFORMS_VERSION` that will be used to tag the latest version released to pypi - -`make set-versions` - -## Creating src folder - -Given that the transforms do not currently have their own name spaces, the first step is to copy all the transforms to the same src folder prior to running unit tests of the individual transforms and/or building the distribution: - - -```` -cd transforms/packaging -make clean -make src -```` - -## Build and Test - -This procedure will run all the UT for each individual transforms using a single package configuration: - -```` -cd transforms/packaging -make clean -make src -make test-src -```` - -## Build and Deploy - -This procedure will buid two wheels: one for the python transforms and one for the ray transforms. - -```` -cd transforms/packaging -make clean -make src -make build-dist -```` - -To publish the wheels to pypi.org, run: - -`make publish-dist` - - - - diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile deleted file mode 100644 index 6a0a355de..000000000 --- a/transforms/packaging/python/Makefile +++ /dev/null @@ -1,89 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../.. -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -include $(REPOROOT)/transforms/.make.transforms -include ../.make.packaging - -PACKAGING_RUN_TIME=python - -ifeq ($(T_SET), all) -# Cannot combine language/html2parquet with pdf2parquet due to: -#The conflict is caused by: -# docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 -# trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" -TRANSFORMS_NAMES = code/code_quality \ - code/code2parquet \ - code/header_cleanser \ - code/proglang_select \ - language/doc_chunk \ - language/doc_quality \ - language/lang_id \ - language/pdf2parquet \ - language/pii_redactor \ - language/text_encoder \ - universal/tokenization \ - universal/ededup \ - /universal/doc_id \ - universal/filter \ - universal/resize -TRANSFORM_PKG = "data_prep_toolkit_transforms" -endif - -ifeq ($(T_SET), lang1) -TRANSFORMS_NAMES = language/doc_quality \ - language/lang_id \ - language/text_encoder \ - language/html2parquet \ - universal/tokenization \ - universal/ededup \ - /universal/doc_id \ - universal/filter \ - universal/resize -TRANSFORM_PKG = "data_prep_toolkit_transforms_lang1" -endif - -# distribution versions is the same as image version. -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions - -test-src:: - $(MAKE) src - $(MAKE) .transforms.python-venv - $(MAKE) run-ut - @# Help: Do any default transform setup before running make src and setting up a test environment - -test-with-pypi: - $(MAKE) src - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install . 
- $(MAKE) run-ut - @# Help: Load dependencies from pypi and run all unit tests: final step in verification BEFORE deploying to pypi) - - -test-wheel: - -rm -fr venv - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install dist/*.whl - $(MAKE) run-ut - @# Help: Load wheel from local folder and run all unit tests - - - -test-latest-patch: - $(MAKE) clean - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install $(TRANSFORM_PKG) - $(MAKE) run-ut - @# Help: Load wheel from pypi and run all unit tests: final step in verification AFTER deploying to pypi) - - - diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml deleted file mode 100644 index 8d760515a..000000000 --- a/transforms/packaging/python/pyproject.toml +++ /dev/null @@ -1,39 +0,0 @@ -[project] -name = "data_prep_toolkit_transforms" -version = "0.2.2.dev0" -requires-python = ">=3.10,<3.13" -keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -description = "Data Preparation Toolkit Transforms" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Maroun Touma", email = "touma@us.ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[options] -package_dir = ["src"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] - - - - - - diff --git a/transforms/packaging/python/requirements.all.txt 
b/transforms/packaging/python/requirements.all.txt deleted file mode 100644 index c1246fba9..000000000 --- a/transforms/packaging/python/requirements.all.txt +++ /dev/null @@ -1,51 +0,0 @@ -data-prep-toolkit>=0.2.1 -# code quality -bs4==0.0.2 -transformers==4.38.2 -#pdf2parquet -docling-core==1.3.0 -docling-ibm-models==1.1.7 -deepsearch-glm==0.21.0 -docling==1.11.0, -filetype >=1.2.0, <2.0.0 -#Doc chunking -docling-core==1.3.0, -llama-index-core>=0.11.0,<0.12.0, -#filter -duckdb>=0.10.1 -#langid -fasttext==0.9.2 -langcodes==3.3.0 -huggingface-hub >= 0.21.4, <1.0.0 -numpy==1.26.4 -#fdedup -mmh3>=4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -scipy>=1.12.0, <2.0.0 -# ededup -mmh3>=4.1.0 -xxhash==3.4.1 -#code2parquet -pandas -parameterized -#header cleanser -scancode-toolkit==32.1.0 ; platform_system != 'Darwin' -#text_encoder -sentence-transformers==3.0.1 -# PII-redactor -presidio-analyzer>=2.2.355 -presidio-anonymizer>=2.2.355 -flair>=0.14.0 -pandas>=2.2.2 -#html2parquet -#INFO: pip is looking at multiple versions of trafilatura to determine which version is compatible with other requirements. This could take a while. 
-#The conflict is caused by: -# docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 -# trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" -#trafilatura==1.12.0 -#tokenization -transformers==4.38.2 - - - diff --git a/transforms/packaging/python/requirements.lang1.txt b/transforms/packaging/python/requirements.lang1.txt deleted file mode 100644 index 1c7289f64..000000000 --- a/transforms/packaging/python/requirements.lang1.txt +++ /dev/null @@ -1,32 +0,0 @@ -data-prep-toolkit>=0.2.1 -#filter -duckdb>=0.10.1 -#langid -fasttext==0.9.2 -langcodes==3.3.0 -huggingface-hub >= 0.21.4, <1.0.0 -numpy==1.26.4 -#fdedup -mmh3>=4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -scipy==1.12.0 -# ededup -mmh3>=4.1.0, -xxhash==3.4.1 -#text_encoder -sentence-transformers>=3.0.1 -#html2parquet -trafilatura==1.12.0 -#tokenization -transformers==4.38.2 - -#ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. -#data-prep-toolkit-transforms 0.2.2.dev0 requires duckdb==0.10.1, but you have duckdb 1.1.0 which is incompatible. -#data-prep-toolkit-transforms 0.2.2.dev0 requires sentence-transformers==3.0.1, but you have sentence-transformers 3.1.1 which is incompatible. - - - - - - diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile deleted file mode 100644 index 0a1d6d911..000000000 --- a/transforms/packaging/ray/Makefile +++ /dev/null @@ -1,66 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../.. -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -include $(REPOROOT)/transforms/.make.transforms -include ../.make.packaging - -PACKAGING_RUN_TIME=ray - -# Excluded from build -# ./code/malware/ray - -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions - - -## Ray Transforms: `find . -name src | grep ray/src` -TRANSFORMS_NAMES = code/proglang_select \ - code/header_cleanser \ - code/code_quality \ - code/repo_level_ordering \ - code/code2parquet \ - language/doc_chunk \ - language/doc_quality \ - language/lang_id \ - language/text_encoder \ - language/pii_redactor \ - language/pdf2parquet \ - universal/fdedup \ - universal/tokenization \ - universal/ededup \ - universal/profiler \ - universal/doc_id \ - universal/filter \ - universal/resize - -# doc chunk has conflict dependencies with pdf2parquet that need to be resolved -# doc_chunk depends on docling>=1.8.2,<2.0.0 -# pdf2parquet depends on docling==1.7.0 - - -test-src:: - $(MAKE) src - $(MAKE) -C ../python src - make .transforms.ray-venv - $(MAKE) run-ut - @# Help: Do any default transform setup before running make src and setting up a test environment - -test-with-python-pypi: - $(MAKE) clean - $(MAKE) .defaults.create-venv - source venv/bin/activate && cd ../ray && $(MAKE) src && $(PYTHON) -m pip install . 
- $(MAKE) test-src - -test-with-pypi: - $(MAKE) clean - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install data_prep_toolkit_transforms_ray==$(DPK_TRANSFORMS_VERSION) - $(MAKE) test-src - diff --git a/transforms/packaging/ray/README.md b/transforms/packaging/ray/README.md deleted file mode 100644 index b7d4cf2eb..000000000 --- a/transforms/packaging/ray/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# DPK Ray Transforms - -## installation - -The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard pyton library available on pypi and can be installed using pip install: - -`python -m pip install data-prep-toolkit-transforms-ray` - -installing the Ray transforms will also install `data_prep_toolkit_transforms` and `data-prep-toolkit-ray` - -## List of Ray Transforms availabe in current package - -Note: This list includes the transforms that are part of the current release for 0.2.1.dev3 and will be maintained on best effort but may may not be always up to date. 
users are encourage to raise an issue in git when they discover missing components - -* code - * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) - * [proglang_select](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/proglang_select/ray/README.md) - * [header_cleanser (Not available on MacOS)](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) - * [code_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code_quality/ray/README.md) - * [repo_level_ordering](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/repo_level_ordering/ray/README.md) -* language - * [doc_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_quality/ray/README.md) - * [doc_chunk](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_chunk/ray/README.md) - * [lang_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/lang_id/ray/README.md) - * [text_encoder](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/text_encoder/ray/README.md) - * [pdf2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pdf2parquet/ray/README.md) - * [pii_redactor](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pii_redactor/ray/README.md) -* universal - * [fdedup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/fdedup/ray/README.md) - * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/tokenization/ray/README.md) - * [ededup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/ededup/ray/README.md) - * [profiler](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/profiler/ray/README.md) - * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_id/ray/README.md) - * [filter](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/filter/ray/README.md) - * 
[resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/resize/ray/README.md) - - - - - - diff --git a/transforms/packaging/ray/pyproject.toml b/transforms/packaging/ray/pyproject.toml deleted file mode 100644 index 2f02d4c51..000000000 --- a/transforms/packaging/ray/pyproject.toml +++ /dev/null @@ -1,40 +0,0 @@ -[project] -name = "data_prep_toolkit_transforms_ray" -version = "0.2.2.dev0" -requires-python = ">=3.10,<3.13" -keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -description = "Data Preparation Toolkit Transforms using Ray" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Maroun Touma", email = "touma@us.ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - - -[options] -package_dir = ["src"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] - - - - - - diff --git a/transforms/packaging/ray/requirements.txt b/transforms/packaging/ray/requirements.txt deleted file mode 100644 index 632bbe670..000000000 --- a/transforms/packaging/ray/requirements.txt +++ /dev/null @@ -1,21 +0,0 @@ -data-prep-toolkit-ray>=0.2.2.dev0 -data-prep-toolkit-transforms>=0.2.2.dev0 -scancode-toolkit==32.1.0 ; platform_system != 'Darwin' -parameterized -tqdm==4.66.3 -mmh3==4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -#The conflict is caused by: -# ray fdedup depends on scipy==1.12.0 -# docling 1.7.0 depends on scipy<2.0.0 and >=1.14.1 -scipy>=1.12.0 
-networkx==3.3 -colorlog==6.8.2 -func-timeout==4.3.5 -pandas==2.2.2 -emerge-viz==2.0.0 - - - - diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml new file mode 100644 index 000000000..30d9f39e9 --- /dev/null +++ b/transforms/pyproject.toml @@ -0,0 +1,96 @@ +[project] +name = "data_prep_toolkit_transforms" +version = "0.2.2.dev0" +requires-python = ">=3.10" +keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] +description = "Data Preparation Toolkit Transforms using Ray" +license = {text = "Apache-2.0"} +readme = {file = "README-list.md", content-type = "text/markdown"} +authors = [ + { name = "Maroun Touma", email = "touma@us.ibm.com" }, +] +dynamic = ["dependencies","optional-dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + + +[tool.setuptools.dynamic.dependencies] +file = ["requirements.txt"] + +[tool.setuptools.dynamic.optional-dependencies] +dev = { file = ["requirements-dev.txt"]} +ray = { file = ["requirements-ray.txt"]} +all = { file = [ +"code/proglang_select/python/requirements.txt", +"code/header_cleanser/python/requirements.txt", +"code/license_select/python/requirements.txt", +"code/code_quality/python/requirements.txt", +"code/code2parquet/python/requirements.txt", + +"language/doc_quality/python/requirements.txt", +"language/doc_chunk/python/requirements.txt", +##### Cannot have html2parquet until we solve +## docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 +## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" +## "language/html2parquet/python/requirements.txt", +"language/pii_redactor/python/requirements.txt", +"language/lang_id/python/requirements.txt", +"language/text_encoder/python/requirements.txt", +"language/pdf2parquet/python/requirements.txt", + +"universal/hap/python/requirements.txt", 
+"universal/tokenization/python/requirements.txt", +"universal/ededup/python/requirements.txt", +"universal/profiler/python/requirements.txt", +"universal/doc_id/python/requirements.txt", +"universal/filter/python/requirements.txt", +"universal/resize/python/requirements.txt" +]} + +# pyproject.toml must be in a parent and cannot be in sibling +# i.e. Cannot access '../code/proglang_select/python/.. + +proglang_select = { file = ["code/proglang_select/python/requirements.txt"]} +header_cleanser = {file = ["code/header_cleanser/python/requirements.txt"]} +license_select = { file = ["code/license_select/python/requirements.txt"]} +code_quality = { file = ["code/code_quality/python/requirements.txt"]} +code2parquet = {file = ["code/code2parquet/python/requirements.txt"]} + +doc_quality = { file = ["language/doc_quality/python/requirements.txt"]} +doc_chunk = { file = ["language/doc_chunk/python/requirements.txt"]} +html2parquet = { file = ["language/html2parquet/python/requirements.txt"]} +pii_redactor = { file = ["language/pii_redactor/python/requirements.txt"]} +lang_id = { file = ["language/lang_id/python/requirements.txt"]} +text_encoder = { file = ["language/text_encoder/python/requirements.txt"]} +pdf2parquet = { file = ["language/pdf2parquet/python/requirements.txt"]} + +hap = { file = ["universal/hap/python/requirements.txt"]} +tokenization = { file = ["universal/tokenization/python/requirements.txt"]} +ededup = { file = ["universal/ededup/python/requirements.txt"]} +profiler = { file = ["universal/profiler/python/requirements.txt"]} +doc_id = { file = ["universal/doc_id/python/requirements.txt"]} +filter = { file = ["universal/filter/python/requirements.txt"]} +resize = { file = ["universal/resize/python/requirements.txt"]} + +# Does not seem to work for our custom layout +# copy all files to a single src and let automatic discovery find them + +[options] +package_dir = ["src","test"] + +[options.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +# 
Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + + + + + + + diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt new file mode 100644 index 000000000..4eadbf121 --- /dev/null +++ b/transforms/requirements-ray.txt @@ -0,0 +1,9 @@ +data-prep-toolkit[ray]>=0.2.2.dev0 +networkx==3.3 +colorlog==6.8.2 +func-timeout==4.3.5 +emerge-viz==2.0.0 + + + + diff --git a/transforms/requirements.txt b/transforms/requirements.txt new file mode 100644 index 000000000..d30f01bd3 --- /dev/null +++ b/transforms/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit>=0.2.2.dev0 \ No newline at end of file diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index 46d3f79f8..b9d45b803 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -9,14 +9,15 @@ authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0" -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt new file mode 100644 index 000000000..e14cde7ab --- /dev/null +++ b/transforms/universal/doc_id/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev0 \ No newline at end of file diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 59d0d72ee..fecad1683 100644 --- 
a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -9,16 +9,16 @@ authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "mmh3==4.1.0", - "xxhash==3.4.1", -] + +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt new file mode 100644 index 000000000..d01c93d95 --- /dev/null +++ b/transforms/universal/ededup/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev0 +mmh3>=4.1.0 +xxhash==3.4.1 diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 3f2c8ba51..d6d36f9c0 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -11,10 +11,10 @@ authors = [ ] dependencies = [ "data-prep-toolkit-ray==0.2.2.dev0", - "mmh3==4.1.0", + "mmh3>=4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", - "scipy==1.12.0" + "scipy>=1.12.0, <2.0.0" ] [build-system] diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index b9d781573..0cc80e7ae 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -8,10 +8,15 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "duckdb==0.10.1", -] + +dynamic = ["dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", 
"setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt new file mode 100644 index 000000000..9d1711c3b --- /dev/null +++ b/transforms/universal/filter/python/requirements.txt @@ -0,0 +1,3 @@ + +data-prep-toolkit==0.2.2.dev0 +duckdb>=0.10.1 diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 4bc90209f..290e89a15 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -8,16 +8,15 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "mmh3==4.1.0", - "xxhash==3.4.1", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt new file mode 100644 index 000000000..d164794c7 --- /dev/null +++ b/transforms/universal/profiler/python/requirements.txt @@ -0,0 +1,5 @@ + +data-prep-toolkit==0.2.2.dev0 +mmh3==4.1.0 +xxhash==3.4.1 + diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 2396e5b23..6dd64f3bf 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -9,14 +9,15 @@ authors = [ { name = "David Wood", email = 
"dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt new file mode 100644 index 000000000..e14cde7ab --- /dev/null +++ b/transforms/universal/resize/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev0 \ No newline at end of file diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index f69787b3d..b45336701 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -9,11 +9,6 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "transformers==4.38.2", -] - [project_urls] Repository = "https://github.com/IBM/data-prep-kit" @@ -21,10 +16,15 @@ Issues = "https://github.com/IBM/data-prep-kit/issues" Documentation = "https://ibm.github.io/data-prep-kit/" "Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/tokenization" +dynamic = ["dependencies"] + [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt new file mode 100644 index 
000000000..269257538 --- /dev/null +++ b/transforms/universal/tokenization/python/requirements.txt @@ -0,0 +1,2 @@ +data-prep-toolkit==0.2.2.dev0 +transformers==4.38.2 From 552b5fc11740bc715df2d734082c9eaab75cb52b Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 6 Oct 2024 15:10:48 -0400 Subject: [PATCH 03/15] fix typos --- transforms/code/code2parquet/python/pyproject.toml | 6 +----- transforms/code/code_quality/python/pyproject.toml | 4 ---- transforms/code/license_select/python/pyproject.toml | 2 +- transforms/code/license_select/ray/pyproject.toml | 6 +++--- transforms/language/lang_id/python/pyproject.toml | 3 --- 5 files changed, 5 insertions(+), 16 deletions(-) diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index 16c63f7d3..b08504bef 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -16,11 +16,7 @@ requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" [tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]}] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 22313fc2a..46f59bc6c 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -18,10 +18,6 @@ build-backend = "setuptools.build_meta" dependencies = {file = ["requirements.txt"]} -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/license_select/python/pyproject.toml 
b/transforms/code/license_select/python/pyproject.toml index 0d7857d12..1404bb205 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ b/transforms/code/license_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.1.dev0" +version = "0.2.2.dev0" requires-python = ">=3.10" description = "License Select Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index 89b4b9ea5..3295f2427 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.1.dev0" +version = "0.2.2.dev0" requires-python = ">=3.10" description = "License Select Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-license-select-transform-python==0.2.2.dev0", + "data-prep-toolkit-ray==0.2.2.dev0", ] [build-system] diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index 1b9107019..ba256765f 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -17,9 +17,6 @@ build-backend = "setuptools.build_meta" [tool.setuptools.dynamic] dependencies = {file = ["requirements.txt"]} -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" [project.optional-dependencies] dev = [ From 05b9c85a859af639b649401968afff33de5b8dc5 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 6 Oct 2024 15:24:57 -0400 Subject: [PATCH 04/15] fix typo Signed-off-by: Maroun Touma --- 
transforms/universal/filter/python/pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index 0cc80e7ae..f2dadffa6 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -18,10 +18,6 @@ build-backend = "setuptools.build_meta" [tool.setuptools.dynamic] dependencies = {file = ["requirements.txt"]} -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - [project.optional-dependencies] dev = [ "twine", From 37454afcb1f7ba2430fbbeaa70758cafb7690ebe Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 6 Oct 2024 15:35:34 -0400 Subject: [PATCH 05/15] Added requirements.txt to dockerfile Signed-off-by: Maroun Touma --- transforms/code/code2parquet/python/Dockerfile | 1 + transforms/code/code_quality/python/Dockerfile | 1 + transforms/code/header_cleanser/python/Dockerfile | 1 + transforms/code/license_select/python/Dockerfile | 1 + transforms/code/proglang_select/python/Dockerfile | 1 + transforms/language/doc_chunk/python/Dockerfile | 1 + transforms/language/doc_quality/python/Dockerfile | 1 + transforms/language/lang_id/python/Dockerfile | 1 + transforms/language/pii_redactor/python/Dockerfile | 1 + transforms/language/text_encoder/python/Dockerfile | 3 ++- transforms/universal/doc_id/python/Dockerfile | 2 +- transforms/universal/ededup/python/Dockerfile | 1 + transforms/universal/filter/python/Dockerfile | 1 + transforms/universal/profiler/python/Dockerfile | 1 + transforms/universal/resize/python/Dockerfile | 1 + transforms/universal/tokenization/python/Dockerfile | 1 + 16 files changed, 17 insertions(+), 2 deletions(-) diff --git a/transforms/code/code2parquet/python/Dockerfile b/transforms/code/code2parquet/python/Dockerfile index b36b6a6c4..f94301a9c 100644 --- 
a/transforms/code/code2parquet/python/Dockerfile +++ b/transforms/code/code2parquet/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy the main() entry point to the image diff --git a/transforms/code/code_quality/python/Dockerfile b/transforms/code/code_quality/python/Dockerfile index 76cf1de30..b25a57ca1 100644 --- a/transforms/code/code_quality/python/Dockerfile +++ b/transforms/code/code_quality/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . #COPY requirements.txt requirements.txt diff --git a/transforms/code/header_cleanser/python/Dockerfile b/transforms/code/header_cleanser/python/Dockerfile index c2e215904..84831bcd2 100644 --- a/transforms/code/header_cleanser/python/Dockerfile +++ b/transforms/code/header_cleanser/python/Dockerfile @@ -27,6 +27,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy source data diff --git a/transforms/code/license_select/python/Dockerfile b/transforms/code/license_select/python/Dockerfile index 6831306c3..2fa9f9426 100644 --- a/transforms/code/license_select/python/Dockerfile +++ b/transforms/code/license_select/python/Dockerfile @@ -18,6 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . 
COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy source data diff --git a/transforms/code/proglang_select/python/Dockerfile b/transforms/code/proglang_select/python/Dockerfile index a94d9d960..3186862f0 100644 --- a/transforms/code/proglang_select/python/Dockerfile +++ b/transforms/code/proglang_select/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy the main() entry point to the image diff --git a/transforms/language/doc_chunk/python/Dockerfile b/transforms/language/doc_chunk/python/Dockerfile index 8efb3845b..d399a77ed 100644 --- a/transforms/language/doc_chunk/python/Dockerfile +++ b/transforms/language/doc_chunk/python/Dockerfile @@ -21,6 +21,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e . # copy transform main() entry point to the image diff --git a/transforms/language/doc_quality/python/Dockerfile b/transforms/language/doc_quality/python/Dockerfile index 78b769dd7..10dca4999 100644 --- a/transforms/language/doc_quality/python/Dockerfile +++ b/transforms/language/doc_quality/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy transform main() entry point to the image diff --git a/transforms/language/lang_id/python/Dockerfile b/transforms/language/lang_id/python/Dockerfile index 131748480..f1bcc1bdd 100644 --- a/transforms/language/lang_id/python/Dockerfile +++ b/transforms/language/lang_id/python/Dockerfile @@ -25,6 +25,7 @@ USER dpk COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # clean up apt diff --git a/transforms/language/pii_redactor/python/Dockerfile b/transforms/language/pii_redactor/python/Dockerfile index 64b92e1b6..437bf8220 100644 --- a/transforms/language/pii_redactor/python/Dockerfile +++ b/transforms/language/pii_redactor/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy transform main() entry point to the image diff --git a/transforms/language/text_encoder/python/Dockerfile b/transforms/language/text_encoder/python/Dockerfile index 676968fee..86023a440 100644 --- a/transforms/language/text_encoder/python/Dockerfile +++ b/transforms/language/text_encoder/python/Dockerfile @@ -19,7 +19,8 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . # END OF STEPS destined for a data-prep-kit base image -COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e . 
# copy transform main() entry point to the image diff --git a/transforms/universal/doc_id/python/Dockerfile b/transforms/universal/doc_id/python/Dockerfile index 16a9c0e66..6f478cb33 100644 --- a/transforms/universal/doc_id/python/Dockerfile +++ b/transforms/universal/doc_id/python/Dockerfile @@ -18,7 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md - +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy source data diff --git a/transforms/universal/ededup/python/Dockerfile b/transforms/universal/ededup/python/Dockerfile index d3d47e7a4..df9f3ce64 100644 --- a/transforms/universal/ededup/python/Dockerfile +++ b/transforms/universal/ededup/python/Dockerfile @@ -18,6 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . diff --git a/transforms/universal/filter/python/Dockerfile b/transforms/universal/filter/python/Dockerfile index 6f60d2813..5df52a36e 100644 --- a/transforms/universal/filter/python/Dockerfile +++ b/transforms/universal/filter/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy the main() entry point to the image diff --git a/transforms/universal/profiler/python/Dockerfile b/transforms/universal/profiler/python/Dockerfile index a744fc9cd..9aa921f5e 100644 --- a/transforms/universal/profiler/python/Dockerfile +++ b/transforms/universal/profiler/python/Dockerfile @@ -18,6 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . diff --git a/transforms/universal/resize/python/Dockerfile b/transforms/universal/resize/python/Dockerfile index 303e67840..9caa3565c 100644 --- a/transforms/universal/resize/python/Dockerfile +++ b/transforms/universal/resize/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:users src/ src/ COPY --chown=dpk:users pyproject.toml pyproject.toml COPY --chown=dpk:users README.md Readme.md +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy the main() entry point to the image diff --git a/transforms/universal/tokenization/python/Dockerfile b/transforms/universal/tokenization/python/Dockerfile index a1fd159c7..a9a96e52d 100644 --- a/transforms/universal/tokenization/python/Dockerfile +++ b/transforms/universal/tokenization/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
#COPY requirements.txt requirements.txt From 46ba7953f60940d218694c4655af7243e076fd54 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 6 Oct 2024 16:02:13 -0400 Subject: [PATCH 06/15] added requirements.txt to context --- .make.defaults | 1 + 1 file changed, 1 insertion(+) diff --git a/.make.defaults b/.make.defaults index 4e07d84b3..a81c3fb39 100644 --- a/.make.defaults +++ b/.make.defaults @@ -234,6 +234,7 @@ __check_defined = \ mkdir ${LIB_NAME} cp -p -R ${LIB_PATH}/src ${LIB_NAME} cp -p -R ${LIB_PATH}/pyproject.toml ${LIB_NAME} + cp -p -R ${LIB_PATH}/requirements.txt ${LIB_NAME} cp -p -R ${LIB_PATH}/README.md ${LIB_NAME} # Build and image using the local Dockerfile and make the data-processing-lib/python From d52701b5963de3b9a0094686b5c1859a85088f4f Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 6 Oct 2024 16:55:53 -0400 Subject: [PATCH 07/15] pip install python requirements from txt file Signed-off-by: Maroun Touma --- .make.defaults | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.make.defaults b/.make.defaults index a81c3fb39..e1287ffc4 100644 --- a/.make.defaults +++ b/.make.defaults @@ -302,7 +302,10 @@ endif if [ !
-z "$(EXTRA_INDEX_URL)" ]; then \ extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \ fi; \ - pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_DIR); + if [ -e $(PYTHON_PROJECT_DIR)/requirements.txt ]; then \ + pip install -r $(PYTHON_PROJECT_DIR)/requirements.txt; \ + fi; \ + pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_DIR) @echo Done installing source from $(PYTHON_PROJECT_DIR) into venv # Install local requirements last as it generally includes our lib source From 8e77eb7614775ad678b64563e6eb7ce3cb1dc98e Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 6 Oct 2024 17:06:09 -0400 Subject: [PATCH 08/15] copy requirements.txt if present Signed-off-by: Maroun Touma --- .make.defaults | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.make.defaults b/.make.defaults index e1287ffc4..bf46962ea 100644 --- a/.make.defaults +++ b/.make.defaults @@ -234,7 +234,7 @@ __check_defined = \ mkdir ${LIB_NAME} cp -p -R ${LIB_PATH}/src ${LIB_NAME} cp -p -R ${LIB_PATH}/pyproject.toml ${LIB_NAME} - cp -p -R ${LIB_PATH}/requirements.txt ${LIB_NAME} + -cp -p -R ${LIB_PATH}/requirements.txt ${LIB_NAME} cp -p -R ${LIB_PATH}/README.md ${LIB_NAME} # Build and image using the local Dockerfile and make the data-processing-lib/python From 4e62fb026d09b04e473ba8885c49836553f00966 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 6 Oct 2024 17:56:34 -0400 Subject: [PATCH 09/15] try to install requirements file directly. Signed-off-by: Maroun Touma --- transforms/universal/tokenization/python/Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/transforms/universal/tokenization/python/Dockerfile b/transforms/universal/tokenization/python/Dockerfile index a9a96e52d..e1eea7e40 100644 --- a/transforms/universal/tokenization/python/Dockerfile +++ b/transforms/universal/tokenization/python/Dockerfile @@ -20,11 +20,9 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -e . -#COPY requirements.txt requirements.txt -#RUN pip install --no-cache-dir -r requirements.txt - # copy the main() entry point to the image COPY ./src/tokenization_transform_python.py . From 0ada456b00aea99c2408e4e9519a42807306c607 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 6 Oct 2024 18:14:30 -0400 Subject: [PATCH 10/15] Apply same fix as for python image. For some reason, in this image we need to install the requirements.txt separately Signed-off-by: Maroun Touma --- transforms/universal/tokenization/ray/Dockerfile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/transforms/universal/tokenization/ray/Dockerfile b/transforms/universal/tokenization/ray/Dockerfile index 0199e23b8..8b7e78c27 100644 --- a/transforms/universal/tokenization/ray/Dockerfile +++ b/transforms/universal/tokenization/ray/Dockerfile @@ -13,11 +13,9 @@ COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . -COPY --chown=ray:users python-transform/ python-transform -RUN cd python-transform && pip install --no-cache-dir -e . +COPY --chown=ray:users python-transform/ python-transform +RUN cd python-transform && pip install --no-cache-dir -r requirements.txt && pip install --no-cache-dir -e .
-#COPY requirements.txt requirements.txt -#RUN pip install --no-cache-dir -r requirements.txt COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml From 81fdd0f52c390fa15c8fb304e88e47860522ee67 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Sun, 6 Oct 2024 19:09:33 -0400 Subject: [PATCH 11/15] prevent kfp-workflow from installing local requirements found in /transforms folder Signed-off-by: Maroun Touma --- .make.defaults | 5 +++++ transforms/.make.workflows | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.make.defaults b/.make.defaults index bf46962ea..3b5016237 100644 --- a/.make.defaults +++ b/.make.defaults @@ -348,6 +348,11 @@ endif .defaults.ray-lib-src-venv:: .defaults.create-venv .defaults.install-ray-lib-src-venv .defaults.install-local-requirements-venv @# Help: Create the venv and install Ray library source, local dependencies and adjacent python source if present. +# Variant of .defaults.ray-lib-src-venv that skips the local requirements install (used for the KFP workflow venv) +.PHONY: .defaults.kfp-venv +.defaults.kfp-venv:: .defaults.create-venv .defaults.install-ray-lib-src-venv + @# Help: Create the venv and install Ray library source and adjacent python source if present, without local requirements.
+ # Install all source from the repo for a ray runtime transform into an existing venv # And if there is an adjacent python dir (as for transforms), then also install that source .PHONY: .defaults.install-ray-lib-src-venv diff --git a/transforms/.make.workflows b/transforms/.make.workflows index adbf721e6..d9b9217b1 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -52,7 +52,7 @@ endif ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/kfp_ray_components/requirements.txt ${DPK_RAY_LIB_DIR} ${KFP_LIB_SRC_FILES} ${KFP_LIB_CONFIG_FILE} ${KFP_SHARED_LIB_SRC_FILES} rm -rf ${REPOROOT}/transforms/venv - $(MAKE) -C ${REPOROOT}/transforms .defaults.ray-lib-src-venv + $(MAKE) -C ${REPOROOT}/transforms .defaults.kfp-venv . ${WORKFLOW_VENV_ACTIVATE}; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/shared_workflow_support; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB); \ From df9988687b7f31d7d23362fcb1416d3adfff5c8b Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 7 Oct 2024 11:15:50 -0400 Subject: [PATCH 12/15] change author name for pyproject.toml Signed-off-by: Maroun Touma --- data-processing-lib/pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index 9dbbc7f27..d9f23f2fb 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -7,8 +7,7 @@ description = "Data Preparation Toolkit Library for Ray and Python" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, + { name = "Maroun Touma", email = "touma@us.ibm.com" }, ] dynamic = ["dependencies", "optional-dependencies"] From 5b4572bb482d26a1e95449b540ba678a4c80bd3e Mon Sep 17 00:00:00 2001 From: Shahrokh Daijavad Date: Mon, 7 Oct 2024 09:57:30
-0700 Subject: [PATCH 13/15] Update README.md Consistency of installing new pip installs --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aeec4ef70..bd717aa82 100644 --- a/README.md +++ b/README.md @@ -75,10 +75,11 @@ conda install gcc_linux-64 conda install gxx_linux-64 ``` -Next, install the data prep toolkit library. This library installs both the python and ray versions of the transforms. +Next, install the data prep toolkit library. This library installs both the python and ray versions of the transforms. For better management of dependencies, it is recommended to install the same tagged version of both the library and the transform. ```bash -pip3 install data-prep-toolkit-transforms-ray +pip3 install data-prep-toolkit[ray]==0.2.2 +pip3 install data-prep-toolkit-transforms[ray]==0.2.2 pip3 install jupyterlab ipykernel ipywidgets ## install custom kernel From e641e49d3bc27c12b57ae005f568b456d397adf9 Mon Sep 17 00:00:00 2001 From: Shahrokh Daijavad Date: Mon, 7 Oct 2024 09:59:02 -0700 Subject: [PATCH 14/15] Update quick-start.md New pip install option for Ray --- doc/quick-start/quick-start.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/quick-start/quick-start.md b/doc/quick-start/quick-start.md index b7167df77..47d5a1f1b 100644 --- a/doc/quick-start/quick-start.md +++ b/doc/quick-start/quick-start.md @@ -59,7 +59,7 @@ or **Deploy the latest releases of the data prep toolkit library, all python transforms and all ray transforms** ```shell -pip3 install data-prep-toolkit-transforms-ray +pip3 install data-prep-toolkit-transforms[ray] ``` ## Running transforms From 0e7b710e5e24fa4c3fdb2a46b80e269c58904173 Mon Sep 17 00:00:00 2001 From: Shahrokh Daijavad Date: Mon, 7 Oct 2024 10:03:42 -0700 Subject: [PATCH 15/15] Update README.md Use "all" option for the new pip install --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 
bd717aa82..31d238833 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ Next, install the data prep toolkit library. This library installs both the pyth ```bash pip3 install data-prep-toolkit[ray]==0.2.2 -pip3 install data-prep-toolkit-transforms[ray]==0.2.2 +pip3 install data-prep-toolkit-transforms[ray,all]==0.2.2 pip3 install jupyterlab ipykernel ipywidgets ## install custom kernel