Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Packaging modules as a single package with extras (data-prep-kit[ray,spark] and data-prep-kit-transforms[all, ray, etc.]) #674

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions .make.defaults
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ __check_defined = \
mkdir ${LIB_NAME}
cp -p -R ${LIB_PATH}/src ${LIB_NAME}
cp -p -R ${LIB_PATH}/pyproject.toml ${LIB_NAME}
-cp -p -R ${LIB_PATH}/requirements.txt ${LIB_NAME}
cp -p -R ${LIB_PATH}/README.md ${LIB_NAME}

# Build and image using the local Dockerfile and make the data-processing-lib/python
Expand Down Expand Up @@ -301,7 +302,10 @@ endif
if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \
extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \
fi; \
pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_DIR);
if [ -e $(PYTHON_PROJECT_DIR)/requirements.txt ]; then \
pip install -r $(PYTHON_PROJECT_DIR)/requirements.txt; \
fi; \
pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_DIR)
@echo Done installing source from $(PYTHON_PROJECT_DIR) into venv

# Install local requirements last as it generally includes our lib source
Expand Down Expand Up @@ -344,6 +348,11 @@ endif
.defaults.ray-lib-src-venv:: .defaults.create-venv .defaults.install-ray-lib-src-venv .defaults.install-local-requirements-venv
@# Help: Create the venv and install Ray library source, local dependencies and adjacent python source if present.

# Create the venv for the KFP workflow tooling. Unlike .defaults.ray-lib-src-venv,
# this target does NOT depend on .defaults.install-local-requirements-venv, so
# local requirements are not installed here.
.PHONY: .defaults.kfp-venv
.defaults.kfp-venv:: .defaults.create-venv .defaults.install-ray-lib-src-venv
	@# Help: Create the venv and install Ray library source and adjacent python source if present.

# Install all source from the repo for a ray runtime transform into an existing venv
# And if there is an adjacent python dir (as for transforms), then also install that source
.PHONY: .defaults.install-ray-lib-src-venv
Expand Down Expand Up @@ -627,7 +636,7 @@ MINIO_ADMIN_PWD= localminiosecretkey
rm -rf dist || true
rm -rf src/*egg-info || true
${PIP} install --upgrade build
${PYTHON} -m build
${PYTHON} -m build $(BUILD_WHEEL_ARG)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we rename BUILD_WHEEL_ARG to PYTHON_BUILD_EXTRA_ARGS to match DOCKER_BUILD_EXTRA_ARGS?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and a default (empty?) should be defined above, similar to DOCKER_BUILD_EXTRA_ARGS

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @daw3rd . No need for Default. The default is none anyway.


# Publish the distribution in the dist directory, usually created with .defaults.build-dist target
.PHONY: .defaults.publish-dist
Expand Down
5 changes: 5 additions & 0 deletions data-processing-lib/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,8 @@ set-versions:
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse


# Build the distribution for the combined library package. BUILD_WHEEL_ARG is
# forwarded to `python -m build` by .defaults.build-dist; -w limits the build
# to the wheel.
build-pkg-dist::
	@# Help: Build the wheel distribution of the library package
	$(MAKE) .defaults.build-dist BUILD_WHEEL_ARG=-w

# Publish the distribution. This rule has no recipe of its own: the work is
# delegated to its prerequisites, .check-env and then .defaults.publish-dist.
publish-dist :: .check-env .defaults.publish-dist
44 changes: 44 additions & 0 deletions data-processing-lib/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
[project]
name = "data_prep_toolkit"
version = "0.2.2.dev0"
description = "Data Preparation Toolkit Library for Ray and Python"
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.10"
license = { text = "Apache-2.0" }
keywords = [
    "data",
    "data preprocessing",
    "data preparation",
    "llm",
    "generative",
    "ai",
    "fine-tuning",
    "llmapps",
]
authors = [
    { name = "David Wood", email = "dawood@us.ibm.com" },
    { name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
]
# Requirements are not listed inline: both the core dependencies and the
# extras are declared dynamic and resolved by the build backend.
dynamic = ["dependencies", "optional-dependencies"]

# Project links shown on the package index. The table has to be the `urls`
# sub-table of `project`; a top-level `project_urls` table is not part of the
# project metadata.
[project.urls]
Repository = "https://github.com/IBM/data-prep-kit"
Issues = "https://github.com/IBM/data-prep-kit/issues"
Documentation = "https://ibm.github.io/data-prep-kit/"
"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop"

# Build backend and the packages that must be present to run it.
[build-system]
build-backend = "setuptools.build_meta"
requires = [
    "setuptools>=68.0.0",
    "wheel",
    "setuptools_scm[toml]>=7.1.0",
]

# The core requirement list and each extra are loaded from requirements files
# that sit next to this pyproject.toml.
[tool.setuptools.dynamic]
dependencies = { file = ["requirements.txt"] }

[tool.setuptools.dynamic.optional-dependencies]
dev = { file = ["requirements-dev.txt"] }
ray = { file = ["requirements-ray.txt"] }
spark = { file = ["requirements-spark.txt"] }

# Package discovery looks in the src tree of each runtime.
[tool.setuptools.packages.find]
where = ["python/src", "ray/src", "spark/src"]


[tool.pytest.ini_options]
# Coverage options are disabled for now: the tests have to be run separately
# (see makefile), which keeps the coverage of any single run low.
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
markers = [
    "unit: unit tests",
    "integration: integration tests",
]

[tool.coverage.run]
# NOTE(review): package sources are discovered under python/src, ray/src and
# spark/src; confirm this pattern matches from the directory coverage runs in.
include = ["src/*"]
9 changes: 9 additions & 0 deletions data-processing-lib/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
twine
pytest>=7.3.2
pytest-dotenv>=0.5.2
pytest-env>=1.0.0
pre-commit>=3.3.2
pytest-cov>=4.1.0
pytest-mock>=3.10.0
moto==5.0.5
markupsafe==2.0.1
3 changes: 3 additions & 0 deletions data-processing-lib/requirements-ray.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ray[default]==2.24.0
fastapi>=0.110.2
pillow>=10.3.0
2 changes: 2 additions & 0 deletions data-processing-lib/requirements-spark.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pyspark>=3.5.2
psutil>=6.0.0
6 changes: 6 additions & 0 deletions data-processing-lib/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
numpy < 1.29.0
pyarrow==16.1.0
boto3==1.34.69
# argparse is intentionally not listed: it ships with the standard library of
# every supported Python (>=3.10), and the PyPI package is an obsolete backport.
mmh3
psutil
2 changes: 1 addition & 1 deletion transforms/.make.workflows
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ endif

${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/kfp_ray_components/requirements.txt ${DPK_RAY_LIB_DIR} ${KFP_LIB_SRC_FILES} ${KFP_LIB_CONFIG_FILE} ${KFP_SHARED_LIB_SRC_FILES}
rm -rf ${REPOROOT}/transforms/venv
$(MAKE) -C ${REPOROOT}/transforms .defaults.ray-lib-src-venv
$(MAKE) -C ${REPOROOT}/transforms .defaults.kfp-venv
. ${WORKFLOW_VENV_ACTIVATE}; \
pip install -e $(REPOROOT)/kfp/kfp_support_lib/shared_workflow_support; \
pip install -e $(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB); \
Expand Down
29 changes: 29 additions & 0 deletions transforms/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,32 @@ workflow-upload::
set-versions::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse


# Most transforms today don't have a package name.... Need to fix that.
# In the meantime, the python and ray sources of every transform are copied
# into a single local src/ folder and packaged together.
#
# The directory filter uses grep -E with an alternation. A bracket expression
# such as '[ray| python]' is a character class and would match any directory
# whose name merely ends in one of those characters.
build-pkg-dist::
	@# Help: Collect all python and ray transform sources into src/ and build the wheel
	-rm -fr src
	mkdir src
# Copy all the src folders recursively (not clear if they have subfolders)
	for x in $(shell find . | grep -E '/(ray|python)/src$$') ; do \
		echo $$x ; \
		if [ -d "$$x" ]; then \
			cp -r $$x/* src ; \
		fi ; \
	done
# Only needs to build the whl
	$(MAKE) BUILD_WHEEL_ARG=-w .defaults.build-dist

# Install the previously built library and transforms wheels into a fresh venv
# and run pytest in every python/test and ray/test directory.
#  - `.` is used instead of `source` so the recipe also works when make runs
#    recipes with a POSIX sh that has no `source` builtin.
#  - every test directory is run, and the target fails at the end if any of
#    them failed (a plain loop would only report the status of the last one).
test-pkg-dist::
	@# Help: Setup environment and run unit tests for all transforms
	-rm -fr venv
	python -m venv venv
	. venv/bin/activate && $(PYTHON) -m pip install '$(REPOROOT)/data-processing-lib/dist/data_prep_toolkit-$(DPK_VERSION)-py3-none-any.whl[dev,ray]'
	. venv/bin/activate && $(PYTHON) -m pip install 'dist/data_prep_toolkit_transforms-$(DPK_TRANSFORMS_VERSION)-py3-none-any.whl[all]'
	status=0; \
	for T in $(shell find . | grep -E '/(ray|python)/test$$') ; do \
		echo "running unit test on: $$T" ; \
		( . venv/bin/activate && $(PYTEST) $$T ) || status=1 ; \
	done; \
	exit $$status


Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,14 @@
The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard Python library available on PyPI and can be installed using pip install:

`python -m pip install data-prep-toolkit-transforms`
or
`python -m pip install data-prep-toolkit-transforms[ray]`


Installing the python transforms will also install `data-prep-toolkit`.

Installing the ray transforms will also install `data-prep-toolkit[ray]`.

## List of Transforms in current package

Note: This list includes the transforms that were part of the release starting with data-prep-toolkit-transforms:0.2.1. This list may not always reflect up-to-date information. Users are encouraged to raise an issue in git when they discover missing components or packages that are listed below but not in the current release they get from PyPI.
Expand Down
1 change: 1 addition & 0 deletions transforms/code/code2parquet/python/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
Expand Down
9 changes: 4 additions & 5 deletions transforms/code/code2parquet/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,15 @@ authors = [
{ name = "David Wood", email = "dawood@us.ibm.com" },
{ name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.2.dev0",
"parameterized",
"pandas",
]
dynamic = ["dependencies"]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}

[project.optional-dependencies]
dev = [
"twine",
Expand Down
3 changes: 3 additions & 0 deletions transforms/code/code2parquet/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
data-prep-toolkit==0.2.2.dev0
parameterized
pandas
1 change: 1 addition & 0 deletions transforms/code/code_quality/python/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -e .

#COPY requirements.txt requirements.txt
Expand Down
10 changes: 5 additions & 5 deletions transforms/code/code_quality/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.2.dev0",
"bs4==0.0.2",
"transformers==4.38.2",
]
dynamic = ["dependencies"]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}


[project.optional-dependencies]
dev = [
"twine",
Expand Down
3 changes: 3 additions & 0 deletions transforms/code/code_quality/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
data-prep-toolkit==0.2.2.dev0
bs4==0.0.2
transformers==4.38.2
1 change: 1 addition & 0 deletions transforms/code/header_cleanser/python/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -e .

# copy source data
Expand Down
9 changes: 5 additions & 4 deletions transforms/code/header_cleanser/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" },
]
dependencies = [
"data-prep-toolkit==0.2.2.dev0",
"scancode-toolkit==32.1.0",
]
dynamic = ["dependencies"]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}


[project.optional-dependencies]
dev = [
"twine",
Expand Down
3 changes: 3 additions & 0 deletions transforms/code/header_cleanser/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
data-prep-toolkit==0.2.2.dev0
scancode-toolkit==32.1.0 ; platform_system != 'Darwin'

1 change: 1 addition & 0 deletions transforms/code/license_select/python/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -e .

# copy source data
Expand Down
10 changes: 6 additions & 4 deletions transforms/code/license_select/python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_license_select_transform_python"
version = "0.2.1.dev0"
version = "0.2.2.dev0"
requires-python = ">=3.10"
description = "License Select Python Transform"
license = {text = "Apache-2.0"}
Expand All @@ -9,14 +9,16 @@ authors = [
{ name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" },
{ name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.1.dev0",
]
dynamic = ["dependencies"]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}


[project.optional-dependencies]
dev = [
"twine",
Expand Down
1 change: 1 addition & 0 deletions transforms/code/license_select/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data-prep-toolkit==0.2.2.dev0
6 changes: 3 additions & 3 deletions transforms/code/license_select/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dpk_license_select_transform_ray"
version = "0.2.1.dev0"
version = "0.2.2.dev0"
requires-python = ">=3.10"
description = "License Select Transform"
license = {text = "Apache-2.0"}
Expand All @@ -10,8 +10,8 @@ authors = [
{ name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" },
]
dependencies = [
"dpk-license-select-transform-python==0.2.1.dev0",
"data-prep-toolkit-ray==0.2.1.dev0",
"dpk-license-select-transform-python==0.2.2.dev0",
"data-prep-toolkit-ray==0.2.2.dev0",
]

[build-system]
Expand Down
1 change: 1 addition & 0 deletions transforms/code/proglang_select/python/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
Expand Down
8 changes: 5 additions & 3 deletions transforms/code/proglang_select/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.2.dev0",
]
dynamic = ["dependencies"]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}


[project.optional-dependencies]
dev = [
"twine",
Expand Down
1 change: 1 addition & 0 deletions transforms/code/proglang_select/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data-prep-toolkit==0.2.2.dev0
1 change: 1 addition & 0 deletions transforms/language/doc_chunk/python/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e .

# copy transform main() entry point to the image
Expand Down
9 changes: 4 additions & 5 deletions transforms/language/doc_chunk/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,15 @@ authors = [
{ name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
{ name = "Christoph Auer", email = "cau@zurich.ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.2.dev0",
"docling-core==1.3.0",
"llama-index-core>=0.11.0,<0.12.0",
]
dynamic = ["dependencies"]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}

[project.optional-dependencies]
dev = [
"twine",
Expand Down
3 changes: 3 additions & 0 deletions transforms/language/doc_chunk/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
data-prep-toolkit==0.2.2.dev0
docling-core==1.3.0
llama-index-core>=0.11.0,<0.12.0
Loading