From a900891f81cd42476e6f499d33d5945e9d471f96 Mon Sep 17 00:00:00 2001
From: kmonte
Date: Mon, 26 Feb 2024 16:08:43 -0800
Subject: [PATCH] Cache subpipeline executions for partial run

PiperOrigin-RevId: 610557733
---
 .../experimental/core/pipeline_ops.py      | 50 +++++++++++++
 .../experimental/core/pipeline_ops_test.py | 75 +++++++++++++++++++
 .../portable/partial_run_utils.py          |  7 +-
 3 files changed, 129 insertions(+), 3 deletions(-)

diff --git a/tfx/orchestration/experimental/core/pipeline_ops.py b/tfx/orchestration/experimental/core/pipeline_ops.py
index ed2188deb4..b9a6a115b8 100644
--- a/tfx/orchestration/experimental/core/pipeline_ops.py
+++ b/tfx/orchestration/experimental/core/pipeline_ops.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Pipeline-level operations."""
 
+import collections
 import contextlib
 import copy
 import dataclasses
@@ -184,6 +185,55 @@ def initiate_pipeline_start(
       raise status_lib.StatusNotOkError(
           code=status_lib.Code.INVALID_ARGUMENT, message=str(e)
       )
+    else:
+      # Find all subpipelines in the parent pipeline that we are caching.
+      to_process = collections.deque([])
+      for node in pipeline.nodes:
+        # Only add to the processing queue if it's a subpipeline that we are
+        # going to cache. For subpipelines, the begin node's (nodes[0])
+        # execution options represent the subpipeline's execution options.
+        if node.WhichOneof(
+            'node'
+        ) == 'sub_pipeline' and partial_run_utils.should_attempt_to_reuse_artifact(
+            node.sub_pipeline.nodes[0].pipeline_node.execution_options
+        ):
+          to_process.append(node.sub_pipeline)
+      cached_subpipelines = []
+      while to_process:
+        subpipeline = to_process.popleft()
+        cached_subpipelines.append(subpipeline)
+        to_process.extend(
+            node.sub_pipeline
+            for node in subpipeline.nodes
+            if node.WhichOneof('node') == 'sub_pipeline'
+        )
+      logging.info(
+          'Found subpipelines: %s',
+          [s.pipeline_info.id for s in cached_subpipelines],
+      )
+      # Add a new pipeline run for every subpipeline we are going to cache in
+      # the partial run.
+      for subpipeline in cached_subpipelines:
+        reused_subpipeline_view = _load_reused_pipeline_view(
+            mlmd_handle, subpipeline, partial_run_option.snapshot_settings
+        )
+        # TODO: b/323912217 - Support putting multiple subpipeline executions
+        # into MLMD to handle the ForEach case.
+        with pstate.PipelineState.new(
+            mlmd_handle,
+            subpipeline,
+            pipeline_run_metadata,
+            reused_subpipeline_view,
+        ) as subpipeline_state:
+          # TODO: b/320535460 - The new pipeline run should not be stopped if
+          # there are still nodes to run in it.
+          logging.info('Subpipeline execution cached for partial run.')
+          subpipeline_state.initiate_stop(
+              status_lib.Status(
+                  code=status_lib.Code.OK,
+                  message='Subpipeline execution cached for partial run.',
+              )
+          )
   if pipeline.runtime_spec.HasField('snapshot_settings'):
     try:
       base_run_id = (
diff --git a/tfx/orchestration/experimental/core/pipeline_ops_test.py b/tfx/orchestration/experimental/core/pipeline_ops_test.py
index 137a12f3b2..9fbba2080c 100644
--- a/tfx/orchestration/experimental/core/pipeline_ops_test.py
+++ b/tfx/orchestration/experimental/core/pipeline_ops_test.py
@@ -773,6 +773,81 @@ def test_initiate_pipeline_start_with_partial_run(self, mock_snapshot):
       mock_snapshot.assert_called_once()
       self.assertEqual(expected_pipeline, pipeline_state.pipeline)
 
+  @parameterized.named_parameters(
+      dict(
+          testcase_name='cache_subpipeline',
+          run_subpipeline=False,
+      ),
+      dict(
+          testcase_name='run_subpipeline',
+          run_subpipeline=True,
+      ),
+  )
+  @mock.patch.object(partial_run_utils, 'snapshot')
+  def test_initiate_pipeline_start_with_partial_run_and_subpipeline(
+      self, mock_snapshot, run_subpipeline
+  ):
+    with self._mlmd_connection as m:
+      pipeline = test_sync_pipeline.create_pipeline_with_subpipeline()
+      runtime_parameter_utils.substitute_runtime_parameter(
+          pipeline,
+          {
+              constants.PIPELINE_ROOT_PARAMETER_NAME: '/my/pipeline/root',
+              constants.PIPELINE_RUN_ID_PARAMETER_NAME: 'run-0123',
+          },
+      )
+
+      expected_pipeline = copy.deepcopy(pipeline)
+      example_gen = expected_pipeline.nodes[0].pipeline_node
+      subpipeline = expected_pipeline.nodes[1].sub_pipeline
+      subpipeline_begin = subpipeline.nodes[0].pipeline_node
+      transform = expected_pipeline.nodes[2].pipeline_node
+      partial_run_utils.set_latest_pipeline_run_strategy(
+          expected_pipeline.runtime_spec.snapshot_settings
+      )
+
+      skip = pipeline_pb2.NodeExecutionOptions.Skip(
+          reuse_artifacts_mode=pipeline_pb2.NodeExecutionOptions.Skip.REQUIRED
+      )
+      run = pipeline_pb2.NodeExecutionOptions.Run(
+          perform_snapshot=True, depends_on_snapshot=True
+      )
+      example_gen.execution_options.skip.CopyFrom(skip)
+
+      if run_subpipeline:
+        subpipeline_begin.execution_options.run.CopyFrom(run)
+        transform.execution_options.run.depends_on_snapshot = True
+      else:
+        subpipeline_begin.execution_options.skip.CopyFrom(skip)
+        transform.execution_options.run.CopyFrom(run)
+
+      partial_run_option = pipeline_pb2.PartialRun(
+          from_nodes=['sub-pipeline'] if run_subpipeline else ['my_transform'],
+          snapshot_settings=partial_run_utils.latest_pipeline_snapshot_settings(),
+      )
+      with pipeline_ops.initiate_pipeline_start(
+          m, pipeline, partial_run_option=partial_run_option
+      ) as pipeline_state:
+        mock_snapshot.assert_called_once()
+        self.assertProtoEquals(expected_pipeline, pipeline_state.pipeline)
+
+      if run_subpipeline:
+        # If the subpipeline should be run, then we should not have
+        # pre-loaded a run for it.
+        with self.assertRaises(status_lib.StatusNotOkError):
+          pstate.PipelineState.load_run(
+              m, 'sub-pipeline', 'sub-pipeline_run-0123'
+          )
+      else:
+        # Skipped subpipelines should have a run injected so their nodes are
+        # properly marked as cached.
+        with pstate.PipelineState.load_run(
+            m, 'sub-pipeline', 'sub-pipeline_run-0123'
+        ) as subpipeline_state:
+          self.assertEqual(
+              subpipeline_state.stop_initiated_reason().code, status_lib.Code.OK
+          )
+
   @mock.patch.object(partial_run_utils, 'snapshot')
   def test_partial_run_with_previously_failed_nodes(self, mock_snapshot):
     with self._mlmd_connection as m:
diff --git a/tfx/orchestration/portable/partial_run_utils.py b/tfx/orchestration/portable/partial_run_utils.py
index 1d50f3a50c..2c7b33d088 100644
--- a/tfx/orchestration/portable/partial_run_utils.py
+++ b/tfx/orchestration/portable/partial_run_utils.py
@@ -153,7 +153,7 @@ def snapshot(mlmd_handle: metadata.Metadata,
   """
   # Avoid unnecessary snapshotting step if no node needs to reuse any artifacts.
   if not any(
-      _should_attempt_to_reuse_artifact(node.pipeline_node.execution_options)
+      should_attempt_to_reuse_artifact(node.pipeline_node.execution_options)
       for node in pipeline.nodes):
     return
 
@@ -452,8 +452,9 @@ def _get_validated_new_run_id(pipeline: pipeline_pb2.Pipeline,
   return str(inferred_new_run_id or new_run_id)
 
 
-def _should_attempt_to_reuse_artifact(
+def should_attempt_to_reuse_artifact(
     execution_options: pipeline_pb2.NodeExecutionOptions):
+  """Returns whether artifacts should be reused for these execution options."""
   return execution_options.HasField('skip') and (
       execution_options.skip.reuse_artifacts or
       execution_options.skip.reuse_artifacts_mode == _REUSE_ARTIFACT_OPTIONAL or
@@ -512,7 +513,7 @@ def _reuse_pipeline_run_artifacts(
   reuse_nodes = [
       node
       for node in node_proto_view.get_view_for_all_in(marked_pipeline)
-      if _should_attempt_to_reuse_artifact(node.execution_options)
+      if should_attempt_to_reuse_artifact(node.execution_options)
   ]
   logging.info(
       'Reusing nodes: %s', [n.node_info.id for n in reuse_nodes]
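
Note for reviewers (not part of the patch): the core of this change is the breadth-first traversal in initiate_pipeline_start that decides which subpipelines get a cached run. The sketch below isolates that traversal as a minimal, self-contained Python program. SubPipelineNode, Pipeline, should_cache, and collect_cached_subpipelines are hypothetical stand-ins invented for this sketch; they model the pipeline_pb2 protos and the should_attempt_to_reuse_artifact() check, and are not TFX APIs.

    import collections
    from dataclasses import dataclass, field
    from typing import List

    # Hypothetical stand-ins for the pipeline_pb2 protos (illustration only).
    @dataclass
    class SubPipelineNode:
      sub_pipeline: 'Pipeline'
      # Models should_attempt_to_reuse_artifact(...) evaluated on the
      # subpipeline's begin node (nodes[0]) in the real patch.
      should_cache: bool = False

    @dataclass
    class Pipeline:
      id: str
      nodes: List[SubPipelineNode] = field(default_factory=list)

    def collect_cached_subpipelines(pipeline: Pipeline) -> List[Pipeline]:
      """Breadth-first collection of subpipelines to cache.

      Top-level subpipelines enter the queue only when marked for artifact
      reuse; everything nested inside a cached subpipeline is collected
      unconditionally, as in the traversal added to initiate_pipeline_start.
      """
      to_process = collections.deque(
          node.sub_pipeline for node in pipeline.nodes if node.should_cache
      )
      cached_subpipelines = []
      while to_process:
        subpipeline = to_process.popleft()
        cached_subpipelines.append(subpipeline)
        # All nodes in this sketch wrap subpipelines, so no oneof check needed.
        to_process.extend(node.sub_pipeline for node in subpipeline.nodes)
      return cached_subpipelines

    # Usage: a cacheable subpipeline that itself nests another subpipeline.
    inner = Pipeline(id='inner')
    outer = Pipeline(id='outer', nodes=[SubPipelineNode(sub_pipeline=inner)])
    parent = Pipeline(
        id='parent',
        nodes=[SubPipelineNode(sub_pipeline=outer, should_cache=True)],
    )
    assert [p.id for p in collect_cached_subpipelines(parent)] == ['outer', 'inner']

Each collected subpipeline then gets its own pipeline run registered and immediately stopped with an OK status, which is what the PipelineState.new(...) / initiate_stop(...) block in the diff does. Note the asymmetry the patch preserves: the reuse check gates only top-level subpipelines, while anything nested inside a cached subpipeline inherits its parent's caching decision.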