diff --git a/transforms/universal/ededup/python/src/ededup_pipeline_local_python_incremental.py b/transforms/universal/ededup/python/src/ededup_pipeline_local_python_incremental.py
new file mode 100644
index 000000000..170c248db
--- /dev/null
+++ b/transforms/universal/ededup/python/src/ededup_pipeline_local_python_incremental.py
@@ -0,0 +1,54 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+import sys
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+from ededup_pipeline_transform_python import EdedupPypelinePythonTransformConfiguration
+from ededup_transform_base import (
+    doc_column_name_cli_param,
+    int_column_name_cli_param,
+)
+
+
+# Local driver for the ededup pipeline: builds a simulated command line and, when
+# executed as a script, runs the pure-Python launcher with it.
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output"))
+local_conf = {
+    "input_folder": input_folder,
+    "output_folder": output_folder,
+}
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+params = {
+    # Data access. Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # orchestrator
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # ededup parameters
+    doc_column_name_cli_param: "contents",
+    int_column_name_cli_param: "document_id",
+}
+
+if __name__ == "__main__":
+    # Guarded so that importing this module neither rewrites sys.argv nor starts a run.
+    # create launcher
+    launcher = PythonTransformLauncher(EdedupPypelinePythonTransformConfiguration())
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # launch
+    launcher.launch()
diff --git a/transforms/universal/ededup/python/src/ededup_pipeline_transform_python.py b/transforms/universal/ededup/python/src/ededup_pipeline_transform_python.py
new file mode 100644
index 000000000..a10d3f06e
--- /dev/null
+++ b/transforms/universal/ededup/python/src/ededup_pipeline_transform_python.py
@@ -0,0 +1,43 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.runtime.pure_python.runtime_configuration import (
+    PythonTransformRuntimeConfiguration,
+)
+from data_processing.transform import PipelineTransformConfiguration
+from data_processing.utils import get_logger
+from ededup_transform_python import EdedupPythonTransformRuntimeConfiguration
+
+logger = get_logger(__name__)
+
+
+class EdedupPypelinePythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+    """
+    Pure-Python runtime configuration, as required by the PythonTransformLauncher,
+    for a pipeline transform whose only stage is the ededup Python transform.
+    The superclass is the base python-only runtime configuration.
+    """
+
+    def __init__(self):
+        """
+        Wrap a single ededup runtime configuration in a PipelineTransformConfiguration.
+        """
+        pipeline = PipelineTransformConfiguration({"transforms": [EdedupPythonTransformRuntimeConfiguration()]})
+        super().__init__(transform_config=pipeline)
+
+
+if __name__ == "__main__":
+    # Build the launcher around the pipeline configuration and run it
+    launcher = PythonTransformLauncher(EdedupPypelinePythonTransformConfiguration())
+    logger.info("Launching ededup pipeline transform")
+    launcher.launch()
diff --git a/transforms/universal/ededup/python/test/test_ededup_pipeline_python.py b/transforms/universal/ededup/python/test/test_ededup_pipeline_python.py
new file mode 100644
index 000000000..81f09c4e7
--- /dev/null
+++ b/transforms/universal/ededup/python/test/test_ededup_pipeline_python.py
@@ -0,0 +1,35 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from ededup_pipeline_transform_python import EdedupPypelinePythonTransformConfiguration
+from ededup_transform_base import doc_column_name_cli_param, int_column_name_cli_param
+
+
+class TestEdedupPypilinePythonTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        # The following based on 3 identical input files of about 39kbytes, and 200 rows
+        # Returns one fixture: (launcher, transform config, input dir, expected-output dir)
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        launcher = PythonTransformLauncher(EdedupPypelinePythonTransformConfiguration())
+        config = {doc_column_name_cli_param: "contents", int_column_name_cli_param: "document_id"}
+        return [(launcher, config, basedir + "/input", basedir + "/expected")]
diff --git a/transforms/universal/noop/python/src/noop_pipeline_local_python.py b/transforms/universal/noop/python/src/noop_pipeline_local_python.py
new file mode 100644
index 000000000..c3d2b648d
--- /dev/null
+++ b/transforms/universal/noop/python/src/noop_pipeline_local_python.py
@@ -0,0 +1,45 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+import sys
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+from noop_pipeline_transform_python import NOOPPypelinePythonTransformConfiguration
+
+
+# Simulated command line for a local run of the NOOP pipeline.
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+_here = os.path.dirname(__file__)
+input_folder = os.path.abspath(os.path.join(_here, "..", "test-data", "input"))
+output_folder = os.path.abspath(os.path.join(_here, "..", "output"))
+local_conf = {"input_folder": input_folder, "output_folder": output_folder}
+params = {
+    # Data access. Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # execution info
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # noop params
+    "noop_sleep_sec": 1,
+}
+
+if __name__ == "__main__":
+    # Replace the real command line with the simulated one
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # Hand the pipeline runtime configuration to the pure-Python launcher
+    runtime_config = NOOPPypelinePythonTransformConfiguration()
+    launcher = PythonTransformLauncher(runtime_config=runtime_config)
+    # Run the launcher
+    launcher.launch()
diff --git a/transforms/universal/noop/python/src/noop_pipeline_transform_python.py b/transforms/universal/noop/python/src/noop_pipeline_transform_python.py
new file mode 100644
index 000000000..381f13149
--- /dev/null
+++ b/transforms/universal/noop/python/src/noop_pipeline_transform_python.py
@@ -0,0 +1,43 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.runtime.pure_python.runtime_configuration import (
+    PythonTransformRuntimeConfiguration,
+)
+from data_processing.transform import PipelineTransformConfiguration
+from data_processing.utils import get_logger
+from noop_transform_python import NOOPPythonTransformConfiguration
+
+logger = get_logger(__name__)
+
+
+class NOOPPypelinePythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+    """
+    Pure-Python runtime configuration, as required by the PythonTransformLauncher,
+    for a pipeline transform whose only stage is the NOOP Python transform.
+    The superclass is the base python-only runtime configuration.
+    """
+
+    def __init__(self):
+        """
+        Wrap a single NOOP transform configuration in a PipelineTransformConfiguration.
+        """
+        pipeline = PipelineTransformConfiguration({"transforms": [NOOPPythonTransformConfiguration()]})
+        super().__init__(transform_config=pipeline)
+
+
+if __name__ == "__main__":
+    # Build the launcher around the pipeline configuration and run it
+    launcher = PythonTransformLauncher(NOOPPypelinePythonTransformConfiguration())
+    logger.info("Launching noop pipeline transform")
+    launcher.launch()
diff --git a/transforms/universal/noop/python/test/test_noop_pipeline_python.py b/transforms/universal/noop/python/test/test_noop_pipeline_python.py
new file mode 100644
index 000000000..d0fec66a8
--- /dev/null
+++ b/transforms/universal/noop/python/test/test_noop_pipeline_python.py
@@ -0,0 +1,47 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from noop_transform import sleep_cli_param
+from noop_pipeline_transform_python import NOOPPypelinePythonTransformConfiguration
+
+
+class TestPythonNOOPTransform(AbstractTransformLauncherTest):
+    """
+    Supplies the test data consumed by the tests defined in the super-class.
+    pytest only collects this class because its name starts with the word Test.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        """
+        Build the single fixture used by the inherited launcher tests.
+        """
+        # Data directories are addressed relative to this test file.
+        test_dir = os.path.abspath(os.path.dirname(__file__))
+        input_dir = os.path.join(test_dir, "../test-data/input")
+        expected_dir = os.path.join(test_dir, "../test-data/expected")
+
+        # The sleep parameter is set to 0 for the test run.
+        transform_config = {sleep_cli_param: 0}
+        launcher = PythonTransformLauncher(NOOPPypelinePythonTransformConfiguration())
+
+        # Layout: launcher, transform config, input dir, expected dir, and an
+        # optional list of column names to ignore in comparing test-generated
+        # with expected.
+        ignored_columns = []
+        fixture = (launcher, transform_config, input_dir, expected_dir, ignored_columns)
+        return [fixture]