diff --git a/.make.defaults b/.make.defaults index f9f58500f..3b5016237 100644 --- a/.make.defaults +++ b/.make.defaults @@ -234,6 +234,7 @@ __check_defined = \ mkdir ${LIB_NAME} cp -p -R ${LIB_PATH}/src ${LIB_NAME} cp -p -R ${LIB_PATH}/pyproject.toml ${LIB_NAME} + -cp -p -R ${LIB_PATH}/requirements.txt ${LIB_NAME} cp -p -R ${LIB_PATH}/README.md ${LIB_NAME} # Build and image using the local Dockerfile and make the data-processing-lib/python @@ -301,7 +302,10 @@ endif if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \ extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \ fi; \ - pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_DIR); + if [ -e $(PYTHON_PROJECT_DIR)/requirements.txt ]; then \ + pip install -r $(PYTHON_PROJECT_DIR)/requirements.txt; \ + fi; \ + pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_DIR) @echo Done installing source from $(PYTHON_PROJECT_DIR) into venv # Install local requirements last as it generally includes our lib source @@ -344,6 +348,11 @@ endif .defaults.ray-lib-src-venv:: .defaults.create-venv .defaults.install-ray-lib-src-venv .defaults.install-local-requirements-venv @# Help: Create the venv and install Ray library source, local dependencies and adjacent python source if present. +# Like .defaults.ray-lib-src-venv, but does not install local requirements (used for KFP workflow venvs) +.PHONY: .defaults.kfp-venv +.defaults.kfp-venv:: .defaults.create-venv .defaults.install-ray-lib-src-venv + @# Help: Create the venv and install Ray library source and adjacent python source if present. + # Install all source from the repo for a ray runtime transform into an existing venv # And if there is an adjacent python dir (as for transforms), then also install that source .PHONY: .defaults.install-ray-lib-src-venv @@ -627,7 +636,7 @@ MINIO_ADMIN_PWD= localminiosecretkey rm -rf dist || true rm -rf src/*egg-info || true ${PIP} install --upgrade build - ${PYTHON} -m build + ${PYTHON} -m build $(BUILD_WHEEL_ARG) # Publish the distribution in the dist directory, usually created with .defaults.build-dist target .PHONY: .defaults.publish-dist diff --git a/README.md b/README.md index aeec4ef70..31d238833 100644 --- a/README.md +++ b/README.md @@ -75,10 +75,11 @@ conda install gcc_linux-64 conda install gxx_linux-64 ``` -Next, install the data prep toolkit library. This library installs both the python and ray versions of the transforms. +Next, install the data prep toolkit library. This library installs both the python and ray versions of the transforms. For better dependency management, it is recommended to install the same tagged version of both the library and the transforms.
```bash -pip3 install data-prep-toolkit-transforms-ray +pip3 install data-prep-toolkit[ray]==0.2.2 +pip3 install data-prep-toolkit-transforms[ray,all]==0.2.2 pip3 install jupyterlab ipykernel ipywidgets ## install custom kernel diff --git a/data-processing-lib/Makefile b/data-processing-lib/Makefile index a70a05ff8..d0d1305ac 100644 --- a/data-processing-lib/Makefile +++ b/data-processing-lib/Makefile @@ -55,3 +55,8 @@ set-versions: @# Help: Recursively $@ in all subdirs @$(MAKE) RULE=$@ .recurse + +build-pkg-dist:: + $(MAKE) .defaults.build-dist BUILD_WHEEL_ARG=-w + +publish-dist :: .check-env .defaults.publish-dist diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml new file mode 100644 index 000000000..d9f23f2fb --- /dev/null +++ b/data-processing-lib/pyproject.toml @@ -0,0 +1,43 @@ +[project] +name = "data_prep_toolkit" +version = "0.2.2.dev0" +keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] +requires-python = ">=3.10" +description = "Data Preparation Toolkit Library for Ray and Python" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Maroun Touma", email = "touma@us.ibm.com" }, +] + +dynamic = ["dependencies", "optional-dependencies"] + +[project_urls] +Repository = "https://github.com/IBM/data-prep-kit" +Issues = "https://github.com/IBM/data-prep-kit/issues" +Documentation = "https://ibm.github.io/data-prep-kit/" +"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic.dependencies] +file = ["requirements.txt"] + +[tool.setuptools.dynamic.optional-dependencies] +dev = { file = ["requirements-dev.txt"]} +ray = { file = ["requirements-ray.txt"]} +spark = { file = ["requirements-spark.txt"]} + +[tool.setuptools.packages.find] +where = ["python/src", "ray/src", "spark/src"] + + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/data-processing-lib/requirements-dev.txt b/data-processing-lib/requirements-dev.txt new file mode 100644 index 000000000..326d62c8e --- /dev/null +++ b/data-processing-lib/requirements-dev.txt @@ -0,0 +1,9 @@ +twine +pytest>=7.3.2 +pytest-dotenv>=0.5.2 +pytest-env>=1.0.0 +pre-commit>=3.3.2 +pytest-cov>=4.1.0 +pytest-mock>=3.10.0 +moto==5.0.5 +markupsafe==2.0.1 diff --git a/data-processing-lib/requirements-ray.txt b/data-processing-lib/requirements-ray.txt new file mode 100644 index 000000000..aafa3caeb --- /dev/null +++ b/data-processing-lib/requirements-ray.txt @@ -0,0 +1,3 @@ +ray[default]==2.24.0 +fastapi>=0.110.2 +pillow>=10.3.0 diff --git a/data-processing-lib/requirements-spark.txt b/data-processing-lib/requirements-spark.txt new file mode 100644 index 000000000..f38f033da --- /dev/null +++ b/data-processing-lib/requirements-spark.txt @@ -0,0 +1,2 @@ +pyspark>=3.5.2 +psutil>=6.0.0 diff --git a/data-processing-lib/requirements.txt b/data-processing-lib/requirements.txt new file mode 100644 index 000000000..7b363f2b5 --- /dev/null +++ b/data-processing-lib/requirements.txt @@ -0,0 +1,6 @@ + numpy < 1.29.0 + pyarrow==16.1.0 + boto3==1.34.69 + 
argparse + mmh3 + psutil diff --git a/doc/quick-start/quick-start.md b/doc/quick-start/quick-start.md index b7167df77..47d5a1f1b 100644 --- a/doc/quick-start/quick-start.md +++ b/doc/quick-start/quick-start.md @@ -59,7 +59,7 @@ or **Deploy the latest releases of the data prep toolkit library, all python transforms and all ray transforms** ```shell -pip3 install data-prep-toolkit-transforms-ray +pip3 install data-prep-toolkit-transforms[ray] ``` ## Running transforms diff --git a/transforms/.make.workflows b/transforms/.make.workflows index 9b7259b60..8cc35a926 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -52,7 +52,7 @@ endif ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/kfp_ray_components/requirements.txt ${DPK_RAY_LIB_DIR} ${KFP_LIB_SRC_FILES} ${KFP_LIB_CONFIG_FILE} ${KFP_SHARED_LIB_SRC_FILES} rm -rf ${REPOROOT}/transforms/venv - $(MAKE) -C ${REPOROOT}/transforms .defaults.ray-lib-src-venv + $(MAKE) -C ${REPOROOT}/transforms .defaults.kfp-venv . ${WORKFLOW_VENV_ACTIVATE}; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/shared_workflow_support; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB); \ diff --git a/transforms/Makefile b/transforms/Makefile index 2f8fa27b1..5ff1f5111 100644 --- a/transforms/Makefile +++ b/transforms/Makefile @@ -79,3 +79,32 @@ workflow-upload:: set-versions:: @# Help: Recursively make $@ in all subdirs @$(MAKE) RULE=$@ .recurse + + +build-pkg-dist:: + ## Most transforms today don't have a package name.... Need to fix that + ## In the meantime, we will copy everything to a single folder + -rm -fr src + mkdir src + # Copy all the src folders recursively (not clear if they have subfolders) + for x in $(shell find . | grep '[ray| python]/src$$') ; do \ + echo $$x ; \ + if [ -d "$$x" ]; then \ + cp -r $$x/* src ; \ + fi \ + done + # Only needs to build the whl + $(MAKE) BUILD_WHEEL_ARG=-w .defaults.build-dist + +test-pkg-dist:: + -rm -fr venv + python -m venv venv + source venv/bin/activate && $(PYTHON) -m pip install '$(REPOROOT)/data-processing-lib/dist/data_prep_toolkit-$(DPK_VERSION)-py3-none-any.whl[dev,ray]' + source venv/bin/activate && $(PYTHON) -m pip install 'dist/data_prep_toolkit_transforms-$(DPK_TRANSFORMS_VERSION)-py3-none-any.whl[all]' + for T in $(shell find . | grep '[ray| python]/test$$') ; do \ + echo "running unit test on: $$T" ; \ + source venv/bin/activate && $(PYTEST) $$T; \ + done; + @# Help: Setup environment and run unit tests for all transforms + + diff --git a/transforms/packaging/python/README.md b/transforms/README-list.md similarity index 94% rename from transforms/packaging/python/README.md rename to transforms/README-list.md index 20eb0dff0..99885ad34 100644 --- a/transforms/packaging/python/README.md +++ b/transforms/README-list.md @@ -5,9 +5,14 @@ The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard pyton library available on pypi and can be installed using pip install: `python -m pip install data-prep-toolkit-transforms` +or +`python -m pip install data-prep-toolkit-transforms[ray]` + installing the python transforms will also install `data-prep-toolkit` +installing the ray transforms will also install `data-prep-toolkit[ray]` + ## List of Transforms in current package Note: This list includes the transforms that were part of the release starting with data-prep-toolkit-transforms:0.2.1. This list may not always reflect up to date information. 
Users are encourage to raise an issue in git when they discover missing components or packages that are listed below but not in the current release they get from pypi. diff --git a/transforms/code/code2parquet/python/Dockerfile b/transforms/code/code2parquet/python/Dockerfile index b36b6a6c4..f94301a9c 100644 --- a/transforms/code/code2parquet/python/Dockerfile +++ b/transforms/code/code2parquet/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy the main() entry point to the image diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index 34a668bf0..b08504bef 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -9,16 +9,15 @@ authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "parameterized", - "pandas", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt new file mode 100644 index 000000000..758ab56fe --- /dev/null +++ b/transforms/code/code2parquet/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev0 +parameterized +pandas diff --git a/transforms/code/code_quality/python/Dockerfile b/transforms/code/code_quality/python/Dockerfile index 76cf1de30..b25a57ca1 100644 --- a/transforms/code/code_quality/python/Dockerfile +++ b/transforms/code/code_quality/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
#COPY requirements.txt requirements.txt diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 58e2affa7..46f59bc6c 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -8,16 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "bs4==0.0.2", - "transformers==4.38.2", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt new file mode 100644 index 000000000..106e56f74 --- /dev/null +++ b/transforms/code/code_quality/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev0 +bs4==0.0.2 +transformers==4.38.2 diff --git a/transforms/code/header_cleanser/python/Dockerfile b/transforms/code/header_cleanser/python/Dockerfile index c2e215904..84831bcd2 100644 --- a/transforms/code/header_cleanser/python/Dockerfile +++ b/transforms/code/header_cleanser/python/Dockerfile @@ -27,6 +27,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy source data diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index c4326b4a0..2e24466f0 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -8,15 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "scancode-toolkit==32.1.0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt new file mode 100644 index 000000000..bed2168c1 --- /dev/null +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev0 +scancode-toolkit==32.1.0 ; platform_system != 'Darwin' + diff --git a/transforms/code/license_select/python/Dockerfile b/transforms/code/license_select/python/Dockerfile index 6831306c3..2fa9f9426 100644 --- a/transforms/code/license_select/python/Dockerfile +++ b/transforms/code/license_select/python/Dockerfile @@ -18,6 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy source data diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index 1058b0440..1404bb205 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ b/transforms/code/license_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.1.dev0" +version = "0.2.2.dev0" requires-python = ">=3.10" description = "License Select Python Transform" license = {text = "Apache-2.0"} @@ -9,14 +9,16 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.1.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt new file mode 100644 index 000000000..e14cde7ab --- /dev/null +++ b/transforms/code/license_select/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev0 \ No newline at end of file diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index 89b4b9ea5..3295f2427 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.1.dev0" +version = "0.2.2.dev0" requires-python = ">=3.10" description = "License Select Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-license-select-transform-python==0.2.2.dev0", + "data-prep-toolkit-ray==0.2.2.dev0", ] [build-system] diff --git a/transforms/code/proglang_select/python/Dockerfile b/transforms/code/proglang_select/python/Dockerfile index a94d9d960..3186862f0 100644 --- a/transforms/code/proglang_select/python/Dockerfile +++ b/transforms/code/proglang_select/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy the main() entry point to the image diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index 25aa5fdcf..9745a48c3 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -8,14 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt new file mode 100644 index 000000000..e14cde7ab --- /dev/null +++ b/transforms/code/proglang_select/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev0 \ No newline at end of file diff --git a/transforms/language/doc_chunk/python/Dockerfile b/transforms/language/doc_chunk/python/Dockerfile index 8efb3845b..d399a77ed 100644 --- a/transforms/language/doc_chunk/python/Dockerfile +++ b/transforms/language/doc_chunk/python/Dockerfile @@ -21,6 +21,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e . # copy transform main() entry point to the image diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml index 7705779b0..1a3bd333f 100644 --- a/transforms/language/doc_chunk/python/pyproject.toml +++ b/transforms/language/doc_chunk/python/pyproject.toml @@ -10,16 +10,15 @@ authors = [ { name = "Panos Vagenas", email = "pva@zurich.ibm.com" }, { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "docling-core==1.3.0", - "llama-index-core>=0.11.0,<0.12.0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt new file mode 100644 index 000000000..8e8c1bebb --- /dev/null +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev0 +docling-core==1.3.0 +llama-index-core>=0.11.0,<0.12.0 diff --git a/transforms/language/doc_quality/python/Dockerfile b/transforms/language/doc_quality/python/Dockerfile index 78b769dd7..10dca4999 100644 --- a/transforms/language/doc_quality/python/Dockerfile +++ b/transforms/language/doc_quality/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy transform main() entry point to the image diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index 8ebec8fe3..12c712ae9 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -8,14 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt new file mode 100644 index 000000000..f2f9d6200 --- /dev/null +++ b/transforms/language/doc_quality/python/requirements.txt @@ -0,0 +1,2 @@ + +data-prep-toolkit==0.2.2.dev0 diff --git a/transforms/language/lang_id/python/Dockerfile b/transforms/language/lang_id/python/Dockerfile index 131748480..f1bcc1bdd 100644 --- a/transforms/language/lang_id/python/Dockerfile +++ b/transforms/language/lang_id/python/Dockerfile @@ -25,6 +25,7 @@ USER dpk COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # clean up apt diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index 54c874a36..ba256765f 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -8,18 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "fasttext==0.9.2", - "langcodes==3.3.0", - "huggingface-hub >= 0.21.4, <1.0.0", - "numpy==1.26.4", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt new file mode 100644 index 000000000..111465be0 --- /dev/null +++ b/transforms/language/lang_id/python/requirements.txt @@ -0,0 +1,5 @@ +data-prep-toolkit==0.2.2.dev0 +fasttext==0.9.2 +langcodes==3.3.0 +huggingface-hub >= 0.21.4, <1.0.0 +numpy==1.26.4 diff --git a/transforms/language/pii_redactor/python/Dockerfile b/transforms/language/pii_redactor/python/Dockerfile index 64b92e1b6..437bf8220 100644 --- a/transforms/language/pii_redactor/python/Dockerfile +++ b/transforms/language/pii_redactor/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy transform main() entry point to the image diff --git a/transforms/language/pii_redactor/python/pyproject.toml b/transforms/language/pii_redactor/python/pyproject.toml index 55d4e8970..7045b6ec0 100644 --- a/transforms/language/pii_redactor/python/pyproject.toml +++ b/transforms/language/pii_redactor/python/pyproject.toml @@ -8,18 +8,15 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Sowmya.L.R", email = "lrsowmya@gmail.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "presidio-analyzer>=2.2.355", - "presidio-anonymizer>=2.2.355", - "flair>=0.14.0", - "pandas>=2.2.2", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt new file mode 100644 index 000000000..99e423ce1 --- /dev/null +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -0,0 +1,5 @@ +data-prep-toolkit==0.2.2.dev0 +presidio-analyzer>=2.2.355 +presidio-anonymizer>=2.2.355 +flair>=0.14.0 +pandas>=2.2.2 diff --git a/transforms/language/text_encoder/python/Dockerfile b/transforms/language/text_encoder/python/Dockerfile index 676968fee..86023a440 100644 --- a/transforms/language/text_encoder/python/Dockerfile +++ b/transforms/language/text_encoder/python/Dockerfile @@ -19,7 +19,8 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . # END OF STEPS destined for a data-prep-kit base image -COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e . 
# copy transform main() entry point to the image diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index e9f84fefd..0dd0ac44c 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -10,15 +10,15 @@ authors = [ { name = "Panos Vagenas", email = "pva@zurich.ibm.com" }, { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "sentence-transformers==3.0.1", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt new file mode 100644 index 000000000..be8c0a880 --- /dev/null +++ b/transforms/language/text_encoder/python/requirements.txt @@ -0,0 +1,2 @@ +data-prep-toolkit==0.2.2.dev0 +sentence-transformers==3.0.1 diff --git a/transforms/packaging/.gitignore b/transforms/packaging/.gitignore deleted file mode 100644 index 863607847..000000000 --- a/transforms/packaging/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -**/src -**/dist -**/*.egg-info -**/build - diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging deleted file mode 100644 index 5268889d0..000000000 --- a/transforms/packaging/.make.packaging +++ /dev/null @@ -1,83 +0,0 @@ -ifndef T_SET -T_SET=all -endif - - -venv: - $(MAKE) .defaults.create-venv - -test:: test-src - -clean:: .transforms.clean - -rm -fr src - -image:: .transforms.python-image - -run-ut:: - source venv/bin/activate; \ - if [ -e requirements.test.txt ]; then \ - $(PYTHON) -m pip install -r requirements.test.txt ; \ - fi; \ - for T in $(TRANSFORMS_NAMES); do \ - echo running unit test on: $$T ; \ - $(PYTEST) $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/test; \ - done; - @# Help: Setup environment and run unit tests for all transforms - - -setup: .transforms.setup venv - $(MAKE) src - source venv/bin/activate; \ - $(PYTHON) -m pip install . - @# Help: Do any default transform setup before running make src and setting up a test environment - - -requirements: - if [ -e requirements.$(T_SET).txt ]; then \ - cp requirements.$(T_SET).txt requirements.txt ; \ - fi - -pkg-name: - if [ $(TRANSFORM_PKG) ]; then \ - cat pyproject.toml | sed -e \ - 's/^name[ ]*=.*/name = "'${TRANSFORM_PKG}'"/' \ - > tt.toml; \ - mv tt.toml pyproject.toml; \ - fi - -is-patch: - if [ $(IS_PATCH) ]; then \ - cat pyproject.toml | sed -e \ - 's/^version[ ]*=[ ]*"\(.*\).dev.*/version = "\1"/' \ - > tt.toml; \ - mv tt.toml pyproject.toml; \ - fi - -##################################################### -# to build a patched release, use make IS_PATCH=1 src -##################################################### -src: - mkdir src - make requirements - make pkg-name - make is-patch - for T in $(shell echo $(TRANSFORMS_NAMES)); do \ - echo copy src from $$T ; \ - cp -R $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/src/* src ; \ - rm -fr *.egg-info ; \ - rm -fr dist ; \ - rm -fr build ; \ - done; - @# Help: Setup src folder and remove old distribution. 
to setup for a patched release use: make IS_PATCH=1 $@ - - -build:: build-dist - -publish:: publish-dist - -build-dist:: src .defaults.build-dist - @# Help: build the distribution for publishing to pypi. to build a patch release (no .devN) use: make IS_PATCH=1 $@ - -publish-dist:: .defaults.publish-dist - - diff --git a/transforms/packaging/Makefile b/transforms/packaging/Makefile deleted file mode 100644 index aa75d525e..000000000 --- a/transforms/packaging/Makefile +++ /dev/null @@ -1,60 +0,0 @@ -REPOROOT=../../ -# Use make help, to see the available rules -include ../../.make.defaults - -setup:: - -clean:: - # Clean up workflows common virtual environment. - rm -rf venv || true - rm -rf *.back || true - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse - -src:: - @# Help: Recursively setup $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -setup:: - -build:: - -build-dist:: - @# Help: Recursively build distributions in all subdirs - $(MAKE) RULE=$@ .recurse - -publish-dist:: - @# Help: Recursively publish distributions in all subdirs - $(MAKE) RULE=$@ .recurse - -venv:: - -image:: - -publish:: - -test-image:: - -test:: - -test-src:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: - -workflow-venv:: - -workflow-test:: - -workflow-build:: - -workflow-upload:: - -set-versions:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse diff --git a/transforms/packaging/README.md b/transforms/packaging/README.md deleted file mode 100644 index e0d23ad52..000000000 --- a/transforms/packaging/README.md +++ /dev/null @@ -1,55 +0,0 @@ -# Transforms Pacakges for both Python and Ray - -Most available Transforms can be published to pypi as a single package. A detailed list of available Python transforms is available at this [link](python/README.md). Similarly the following [link](ray/README.md) provide a derailed list and installation instructions for Ray transforms - - - -## Clone folder and update version number -```` -git clone https://github.com/IBM/data-prep-kit.git package-release -cd package-release -```` -in `.make.versions`, Set the values for DPK_MAJOR_VERSION, DPK_MINOR_VERSION and DPK_MICRO_VERSION to specify the DPK library to use and as appropriate, set the value for `DPK_TRANSFORMS_VERSION` that will be used to tag the latest version released to pypi - -`make set-versions` - -## Creating src folder - -Given that the transforms do not currently have their own name spaces, the first step is to copy all the transforms to the same src folder prior to running unit tests of the individual transforms and/or building the distribution: - - -```` -cd transforms/packaging -make clean -make src -```` - -## Build and Test - -This procedure will run all the UT for each individual transforms using a single package configuration: - -```` -cd transforms/packaging -make clean -make src -make test-src -```` - -## Build and Deploy - -This procedure will buid two wheels: one for the python transforms and one for the ray transforms. 
- -```` -cd transforms/packaging -make clean -make src -make build-dist -```` - -To publish the wheels to pypi.org, run: - -`make publish-dist` - - - - diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile deleted file mode 100644 index 6a0a355de..000000000 --- a/transforms/packaging/python/Makefile +++ /dev/null @@ -1,89 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../.. -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions - -include $(REPOROOT)/transforms/.make.transforms -include ../.make.packaging - -PACKAGING_RUN_TIME=python - -ifeq ($(T_SET), all) -# Cannot combine language/html2parquet with pdf2parquet due to: -#The conflict is caused by: -# docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 -# trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" -TRANSFORMS_NAMES = code/code_quality \ - code/code2parquet \ - code/header_cleanser \ - code/proglang_select \ - language/doc_chunk \ - language/doc_quality \ - language/lang_id \ - language/pdf2parquet \ - language/pii_redactor \ - language/text_encoder \ - universal/tokenization \ - universal/ededup \ - /universal/doc_id \ - universal/filter \ - universal/resize -TRANSFORM_PKG = "data_prep_toolkit_transforms" -endif - -ifeq ($(T_SET), lang1) -TRANSFORMS_NAMES = language/doc_quality \ - language/lang_id \ - language/text_encoder \ - language/html2parquet \ - universal/tokenization \ - universal/ededup \ - /universal/doc_id \ - universal/filter \ - universal/resize -TRANSFORM_PKG = "data_prep_toolkit_transforms_lang1" -endif - -# distribution versions is the same as image version. -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions - -test-src:: - $(MAKE) src - $(MAKE) .transforms.python-venv - $(MAKE) run-ut - @# Help: Do any default transform setup before running make src and setting up a test environment - -test-with-pypi: - $(MAKE) src - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install . 
- $(MAKE) run-ut - @# Help: Load dependencies from pypi and run all unit tests: final step in verification BEFORE deploying to pypi) - - -test-wheel: - -rm -fr venv - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install dist/*.whl - $(MAKE) run-ut - @# Help: Load wheel from local folder and run all unit tests - - - -test-latest-patch: - $(MAKE) clean - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install $(TRANSFORM_PKG) - $(MAKE) run-ut - @# Help: Load wheel from pypi and run all unit tests: final step in verification AFTER deploying to pypi) - - - diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml deleted file mode 100644 index 8d760515a..000000000 --- a/transforms/packaging/python/pyproject.toml +++ /dev/null @@ -1,39 +0,0 @@ -[project] -name = "data_prep_toolkit_transforms" -version = "0.2.2.dev0" -requires-python = ">=3.10,<3.13" -keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -description = "Data Preparation Toolkit Transforms" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Maroun Touma", email = "touma@us.ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[options] -package_dir = ["src"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] - - - - - - diff --git a/transforms/packaging/python/requirements.all.txt b/transforms/packaging/python/requirements.all.txt deleted file mode 100644 index c1246fba9..000000000 --- a/transforms/packaging/python/requirements.all.txt +++ /dev/null @@ -1,51 +0,0 @@ -data-prep-toolkit>=0.2.1 -# code quality -bs4==0.0.2 -transformers==4.38.2 -#pdf2parquet -docling-core==1.3.0 -docling-ibm-models==1.1.7 -deepsearch-glm==0.21.0 -docling==1.11.0, -filetype >=1.2.0, <2.0.0 -#Doc chunking -docling-core==1.3.0, -llama-index-core>=0.11.0,<0.12.0, -#filter -duckdb>=0.10.1 -#langid -fasttext==0.9.2 -langcodes==3.3.0 -huggingface-hub >= 0.21.4, <1.0.0 -numpy==1.26.4 -#fdedup -mmh3>=4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -scipy>=1.12.0, <2.0.0 -# ededup -mmh3>=4.1.0 -xxhash==3.4.1 -#code2parquet -pandas -parameterized -#header cleanser -scancode-toolkit==32.1.0 ; platform_system != 'Darwin' -#text_encoder -sentence-transformers==3.0.1 -# PII-redactor -presidio-analyzer>=2.2.355 -presidio-anonymizer>=2.2.355 -flair>=0.14.0 -pandas>=2.2.2 -#html2parquet -#INFO: pip is looking at multiple versions of trafilatura to determine which version is compatible with other requirements. This could take a while. 
-#The conflict is caused by: -# docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 -# trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" -#trafilatura==1.12.0 -#tokenization -transformers==4.38.2 - - - diff --git a/transforms/packaging/python/requirements.lang1.txt b/transforms/packaging/python/requirements.lang1.txt deleted file mode 100644 index 1c7289f64..000000000 --- a/transforms/packaging/python/requirements.lang1.txt +++ /dev/null @@ -1,32 +0,0 @@ -data-prep-toolkit>=0.2.1 -#filter -duckdb>=0.10.1 -#langid -fasttext==0.9.2 -langcodes==3.3.0 -huggingface-hub >= 0.21.4, <1.0.0 -numpy==1.26.4 -#fdedup -mmh3>=4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -scipy==1.12.0 -# ededup -mmh3>=4.1.0, -xxhash==3.4.1 -#text_encoder -sentence-transformers>=3.0.1 -#html2parquet -trafilatura==1.12.0 -#tokenization -transformers==4.38.2 - -#ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. -#data-prep-toolkit-transforms 0.2.2.dev0 requires duckdb==0.10.1, but you have duckdb 1.1.0 which is incompatible. -#data-prep-toolkit-transforms 0.2.2.dev0 requires sentence-transformers==3.0.1, but you have sentence-transformers 3.1.1 which is incompatible. - - - - - - diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile deleted file mode 100644 index 0a1d6d911..000000000 --- a/transforms/packaging/ray/Makefile +++ /dev/null @@ -1,66 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../.. -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions - -include $(REPOROOT)/transforms/.make.transforms -include ../.make.packaging - -PACKAGING_RUN_TIME=ray - -# Excluded from build -# ./code/malware/ray - -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions - - -## Ray Transforms: `find . -name src | grep ray/src` -TRANSFORMS_NAMES = code/proglang_select \ - code/header_cleanser \ - code/code_quality \ - code/repo_level_ordering \ - code/code2parquet \ - language/doc_chunk \ - language/doc_quality \ - language/lang_id \ - language/text_encoder \ - language/pii_redactor \ - language/pdf2parquet \ - universal/fdedup \ - universal/tokenization \ - universal/ededup \ - universal/profiler \ - universal/doc_id \ - universal/filter \ - universal/resize - -# doc chunk has conflict dependencies with pdf2parquet that need to be resolved -# doc_chunk depends on docling>=1.8.2,<2.0.0 -# pdf2parquet depends on docling==1.7.0 - - -test-src:: - $(MAKE) src - $(MAKE) -C ../python src - make .transforms.ray-venv - $(MAKE) run-ut - @# Help: Do any default transform setup before running make src and setting up a test environment - -test-with-python-pypi: - $(MAKE) clean - $(MAKE) .defaults.create-venv - source venv/bin/activate && cd ../ray && $(MAKE) src && $(PYTHON) -m pip install . 
- $(MAKE) test-src - -test-with-pypi: - $(MAKE) clean - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install data_prep_toolkit_transforms_ray==$(DPK_TRANSFORMS_VERSION) - $(MAKE) test-src - diff --git a/transforms/packaging/ray/README.md b/transforms/packaging/ray/README.md deleted file mode 100644 index b7d4cf2eb..000000000 --- a/transforms/packaging/ray/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# DPK Ray Transforms - -## installation - -The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard pyton library available on pypi and can be installed using pip install: - -`python -m pip install data-prep-toolkit-transforms-ray` - -installing the Ray transforms will also install `data_prep_toolkit_transforms` and `data-prep-toolkit-ray` - -## List of Ray Transforms availabe in current package - -Note: This list includes the transforms that are part of the current release for 0.2.1.dev3 and will be maintained on best effort but may may not be always up to date. users are encourage to raise an issue in git when they discover missing components - -* code - * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) - * [proglang_select](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/proglang_select/ray/README.md) - * [header_cleanser (Not available on MacOS)](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) - * [code_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code_quality/ray/README.md) - * [repo_level_ordering](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/repo_level_ordering/ray/README.md) -* language - * [doc_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_quality/ray/README.md) - * [doc_chunk](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_chunk/ray/README.md) - * [lang_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/lang_id/ray/README.md) - * [text_encoder](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/text_encoder/ray/README.md) - * [pdf2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pdf2parquet/ray/README.md) - * [pii_redactor](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pii_redactor/ray/README.md) -* universal - * [fdedup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/fdedup/ray/README.md) - * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/tokenization/ray/README.md) - * [ededup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/ededup/ray/README.md) - * [profiler](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/profiler/ray/README.md) - * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_id/ray/README.md) - * [filter](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/filter/ray/README.md) - * [resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/resize/ray/README.md) - - - - - - diff --git a/transforms/packaging/ray/pyproject.toml b/transforms/packaging/ray/pyproject.toml deleted file mode 100644 index 2f02d4c51..000000000 --- a/transforms/packaging/ray/pyproject.toml +++ /dev/null @@ -1,40 +0,0 @@ -[project] -name = "data_prep_toolkit_transforms_ray" -version = "0.2.2.dev0" -requires-python = ">=3.10,<3.13" -keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", 
"fine-tuning", "llmapps" ] -description = "Data Preparation Toolkit Transforms using Ray" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Maroun Touma", email = "touma@us.ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - - -[options] -package_dir = ["src"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] - - - - - - diff --git a/transforms/packaging/ray/requirements.txt b/transforms/packaging/ray/requirements.txt deleted file mode 100644 index 632bbe670..000000000 --- a/transforms/packaging/ray/requirements.txt +++ /dev/null @@ -1,21 +0,0 @@ -data-prep-toolkit-ray>=0.2.2.dev0 -data-prep-toolkit-transforms>=0.2.2.dev0 -scancode-toolkit==32.1.0 ; platform_system != 'Darwin' -parameterized -tqdm==4.66.3 -mmh3==4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -#The conflict is caused by: -# ray fdedup depends on scipy==1.12.0 -# docling 1.7.0 depends on scipy<2.0.0 and >=1.14.1 -scipy>=1.12.0 -networkx==3.3 -colorlog==6.8.2 -func-timeout==4.3.5 -pandas==2.2.2 -emerge-viz==2.0.0 - - - - diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml new file mode 100644 index 000000000..30d9f39e9 --- /dev/null +++ b/transforms/pyproject.toml @@ -0,0 +1,96 @@ +[project] +name = "data_prep_toolkit_transforms" +version = "0.2.2.dev0" +requires-python = ">=3.10" +keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] +description = "Data Preparation Toolkit Transforms using Ray" +license = {text = "Apache-2.0"} +readme = {file = "README-list.md", content-type = "text/markdown"} +authors = [ + { name = "Maroun Touma", email = "touma@us.ibm.com" }, +] +dynamic = ["dependencies","optional-dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + + +[tool.setuptools.dynamic.dependencies] +file = ["requirements.txt"] + +[tool.setuptools.dynamic.optional-dependencies] +dev = { file = ["requirements-dev.txt"]} +ray = { file = ["requirements-ray.txt"]} +all = { file = [ +"code/proglang_select/python/requirements.txt", +"code/header_cleanser/python/requirements.txt", +"code/license_select/python/requirements.txt", +"code/code_quality/python/requirements.txt", +"code/code2parquet/python/requirements.txt", + +"language/doc_quality/python/requirements.txt", +"language/doc_chunk/python/requirements.txt", +##### Cannot have html2parquet until we solve +## docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 +## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" +## "language/html2parquet/python/requirements.txt", +"language/pii_redactor/python/requirements.txt", +"language/lang_id/python/requirements.txt", +"language/text_encoder/python/requirements.txt", +"language/pdf2parquet/python/requirements.txt", + +"universal/hap/python/requirements.txt", +"universal/tokenization/python/requirements.txt", +"universal/ededup/python/requirements.txt", 
+"universal/profiler/python/requirements.txt", +"universal/doc_id/python/requirements.txt", +"universal/filter/python/requirements.txt", +"universal/resize/python/requirements.txt" +]} + +# pyproject.toml must be in a parent and cannot be in sibling +# i.e. Cannot access '../code/proglang_select/python/.. + +proglang_select = { file = ["code/proglang_select/python/requirements.txt"]} +header_cleanser = {file = ["code/header_cleanser/python/requirements.txt"]} +license_select = { file = ["code/license_select/python/requirements.txt"]} +code_quality = { file = ["code/code_quality/python/requirements.txt"]} +code2parquet = {file = ["code/code2parquet/python/requirements.txt"]} + +doc_quality = { file = ["language/doc_quality/python/requirements.txt"]} +doc_chunk = { file = ["language/doc_chunk/python/requirements.txt"]} +html2parquet = { file = ["language/html2parquet/python/requirements.txt"]} +pii_redactor = { file = ["language/pii_redactor/python/requirements.txt"]} +lang_id = { file = ["language/lang_id/python/requirements.txt"]} +text_encoder = { file = ["language/text_encoder/python/requirements.txt"]} +pdf2parquet = { file = ["language/pdf2parquet/python/requirements.txt"]} + +hap = { file = ["universal/hap/python/requirements.txt"]} +tokenization = { file = ["universal/tokenization/python/requirements.txt"]} +ededup = { file = ["universal/ededup/python/requirements.txt"]} +profiler = { file = ["universal/profiler/python/requirements.txt"]} +doc_id = { file = ["universal/doc_id/python/requirements.txt"]} +filter = { file = ["universal/filter/python/requirements.txt"]} +resize = { file = ["universal/resize/python/requirements.txt"]} + +# Does not seem to work for our custom layout +# copy all files to a single src and let automatic discovery find them + +[options] +package_dir = ["src","test"] + +[options.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + + + + + + + diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt new file mode 100644 index 000000000..4eadbf121 --- /dev/null +++ b/transforms/requirements-ray.txt @@ -0,0 +1,9 @@ +data-prep-toolkit[ray]>=0.2.2.dev0 +networkx==3.3 +colorlog==6.8.2 +func-timeout==4.3.5 +emerge-viz==2.0.0 + + + + diff --git a/transforms/requirements.txt b/transforms/requirements.txt new file mode 100644 index 000000000..d30f01bd3 --- /dev/null +++ b/transforms/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit>=0.2.2.dev0 \ No newline at end of file diff --git a/transforms/universal/doc_id/python/Dockerfile b/transforms/universal/doc_id/python/Dockerfile index 16a9c0e66..6f478cb33 100644 --- a/transforms/universal/doc_id/python/Dockerfile +++ b/transforms/universal/doc_id/python/Dockerfile @@ -18,7 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md - +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy source data diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index 46d3f79f8..b9d45b803 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -9,14 +9,15 @@ authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0" -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt new file mode 100644 index 000000000..e14cde7ab --- /dev/null +++ b/transforms/universal/doc_id/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev0 \ No newline at end of file diff --git a/transforms/universal/ededup/python/Dockerfile b/transforms/universal/ededup/python/Dockerfile index d3d47e7a4..df9f3ce64 100644 --- a/transforms/universal/ededup/python/Dockerfile +++ b/transforms/universal/ededup/python/Dockerfile @@ -18,6 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 59d0d72ee..fecad1683 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -9,16 +9,16 @@ authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "mmh3==4.1.0", - "xxhash==3.4.1", -] + +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt new file mode 100644 index 000000000..d01c93d95 --- /dev/null +++ b/transforms/universal/ededup/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev0 +mmh3>=4.1.0 +xxhash==3.4.1 diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 3f2c8ba51..d6d36f9c0 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -11,10 +11,10 @@ authors = [ ] dependencies = [ "data-prep-toolkit-ray==0.2.2.dev0", - "mmh3==4.1.0", + "mmh3>=4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", - "scipy==1.12.0" + "scipy>=1.12.0, <2.0.0" ] [build-system] diff --git a/transforms/universal/filter/python/Dockerfile b/transforms/universal/filter/python/Dockerfile index 6f60d2813..5df52a36e 100644 --- a/transforms/universal/filter/python/Dockerfile +++ b/transforms/universal/filter/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . 
COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy the main() entry point to the image diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index b9d781573..f2dadffa6 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -8,15 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "duckdb==0.10.1", -] + +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt new file mode 100644 index 000000000..9d1711c3b --- /dev/null +++ b/transforms/universal/filter/python/requirements.txt @@ -0,0 +1,3 @@ + +data-prep-toolkit==0.2.2.dev0 +duckdb>=0.10.1 diff --git a/transforms/universal/profiler/python/Dockerfile b/transforms/universal/profiler/python/Dockerfile index a744fc9cd..9aa921f5e 100644 --- a/transforms/universal/profiler/python/Dockerfile +++ b/transforms/universal/profiler/python/Dockerfile @@ -18,6 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 4bc90209f..290e89a15 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -8,16 +8,15 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "mmh3==4.1.0", - "xxhash==3.4.1", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt new file mode 100644 index 000000000..d164794c7 --- /dev/null +++ b/transforms/universal/profiler/python/requirements.txt @@ -0,0 +1,5 @@ + +data-prep-toolkit==0.2.2.dev0 +mmh3==4.1.0 +xxhash==3.4.1 + diff --git a/transforms/universal/resize/python/Dockerfile b/transforms/universal/resize/python/Dockerfile index 303e67840..9caa3565c 100644 --- a/transforms/universal/resize/python/Dockerfile +++ b/transforms/universal/resize/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:users src/ src/ COPY --chown=dpk:users pyproject.toml pyproject.toml COPY --chown=dpk:users README.md Readme.md +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy the main() entry point to the image diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 2396e5b23..6dd64f3bf 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -9,14 +9,15 @@ authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt new file mode 100644 index 000000000..e14cde7ab --- /dev/null +++ b/transforms/universal/resize/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev0 \ No newline at end of file diff --git a/transforms/universal/tokenization/python/Dockerfile b/transforms/universal/tokenization/python/Dockerfile index a1fd159c7..e1eea7e40 100644 --- a/transforms/universal/tokenization/python/Dockerfile +++ b/transforms/universal/tokenization/python/Dockerfile @@ -19,11 +19,10 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -e . -#COPY requirements.txt requirements.txt -#RUN pip install --no-cache-dir -r requirements.txt - # copy the main() entry point to the image COPY ./src/tokenization_transform_python.py . 
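Note: the tokenization image above now installs the pinned requirements explicitly before the editable install, mirroring the repo-wide move of dependencies into per-transform requirements.txt files. A minimal sketch of the same two-step install run locally (the venv path is illustrative):

```bash
# Reproduce the container's install order in a throwaway venv (illustrative path).
cd transforms/universal/tokenization/python
python -m venv venv && . venv/bin/activate
pip install --no-cache-dir -r requirements.txt   # pinned dependencies first, as the Dockerfile now does
pip install --no-cache-dir -e .                  # editable install; pyproject.toml reads the same requirements.txt via [tool.setuptools.dynamic]
```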
diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index f69787b3d..b45336701 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -9,11 +9,6 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "transformers==4.38.2", -] - [project_urls] Repository = "https://github.com/IBM/data-prep-kit" @@ -21,10 +16,15 @@ Issues = "https://github.com/IBM/data-prep-kit/issues" Documentation = "https://ibm.github.io/data-prep-kit/" "Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/tokenization" +dynamic = ["dependencies"] + [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt new file mode 100644 index 000000000..269257538 --- /dev/null +++ b/transforms/universal/tokenization/python/requirements.txt @@ -0,0 +1,2 @@ +data-prep-toolkit==0.2.2.dev0 +transformers==4.38.2 diff --git a/transforms/universal/tokenization/ray/Dockerfile b/transforms/universal/tokenization/ray/Dockerfile index 0199e23b8..8b7e78c27 100644 --- a/transforms/universal/tokenization/ray/Dockerfile +++ b/transforms/universal/tokenization/ray/Dockerfile @@ -13,11 +13,9 @@ COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . -COPY --chown=ray:users python-transform/ python-transform -RUN cd python-transform && pip install --no-cache-dir -e . +COPY --chown=ray:users python-transform/ python-transform +RUN cd python-transform && pip install --no-cache-dir -r requirements.txt && pip install --no-cache-dir -e . -#COPY requirements.txt requirements.txt -#RUN pip install --no-cache-dir -r requirements.txt COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml
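Taken together, these changes retire the separate packaging folders and the `-ray` distributions in favor of extras on the consolidated packages. A sketch of the resulting install surface, assuming the package and extra names defined in the pyproject.toml files above and the 0.2.2 tag used in the updated README:

```bash
# Core library plus the Ray runtime extra.
pip install "data-prep-toolkit[ray]==0.2.2"
# All python transforms plus the Ray-specific requirements.
pip install "data-prep-toolkit-transforms[ray,all]==0.2.2"
# Or pull in a single transform's dependencies via its named extra, e.g. tokenization.
pip install "data-prep-toolkit-transforms[tokenization]"
```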