From b6f7cfd9a727b39f1ce56f0c7a605f4962b51de5 Mon Sep 17 00:00:00 2001
From: Shao Wang <77665902+Electronic-Waste@users.noreply.github.com>
Date: Wed, 7 Aug 2024 01:50:39 +0800
Subject: [PATCH] [SDK] test: Add e2e test for tune function. (#2399)

* fix(sdk): fix error field metrics_collector in tune function.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): Add e2e tests for tune function.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): add missing field parameters.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* refactor(test/sdk): add run-e2e-tune-api.py.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): delete tune testing code in run-e2e-experiment.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): add blank lines.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): add verbose and temporarily delete e2e-experiment test.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): add namespace_labels.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): add time.sleep(5).

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): add error output.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): build random image for tune.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): delete extra debug log.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* refactor(test/sdk): create separate workflow for tune.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): change api to API.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): change the permission of scripts.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): delete exit code & comment image pulling.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): delete image pulling phase.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): refactor workflow file to use template.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): mark experiments and trial-images as not required.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): pass tune-api param to setup-minikube.sh.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): fix err in template-e2e-test.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): add debug logs.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

* test(sdk): reorder params and delete logs.

Signed-off-by: Electronic-Waste <2690692950@qq.com>

---------

Signed-off-by: Electronic-Waste <2690692950@qq.com>
---
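Note: a minimal sketch of the tune() call that the metrics_collector fix
affects (illustrative values, mirroring the test case in this patch; assumes
a running Katib control plane and the SDK from this branch):

    from kubeflow.katib import KatibClient, search

    def objective(parameters):
        # The default StdOut metrics collector parses "result=<value>" lines.
        result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2
        print(f"result={result}")

    KatibClient().tune(
        name="tune-example",
        namespace="default",
        objective=objective,
        parameters={
            "a": search.int(min=10, max=20),
            "b": search.double(min=0.1, max=0.2),
        },
        objective_metric_name="result",
        max_trial_count=4,
        # Before this fix, tune() assigned the collector to the non-existent
        # attribute `metrics_collector`; the Experiment model expects
        # `metrics_collector_spec` (serialized as `metricsCollectorSpec`).
        metrics_collector_config={"kind": "StdOut"},
    )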
 .github/workflows/e2e-test-tune-api.yaml      |  34 +++++
 .../workflows/template-e2e-test/action.yaml   |  19 ++-
 .../kubeflow/katib/api/katib_client.py        |   2 +-
 .../v1beta1/scripts/gh-actions/build-load.sh  |  13 +-
 .../scripts/gh-actions/run-e2e-experiment.py  | 139 +----------------
 .../scripts/gh-actions/run-e2e-tune-api.py    |  97 ++++++++++++
 .../scripts/gh-actions/run-e2e-tune-api.sh    |  38 +++++
 .../scripts/gh-actions/setup-minikube.sh      |   7 +-
 test/e2e/v1beta1/scripts/gh-actions/verify.py | 141 ++++++++++++++++++
 9 files changed, 341 insertions(+), 149 deletions(-)
 create mode 100644 .github/workflows/e2e-test-tune-api.yaml
 create mode 100644 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
 create mode 100755 test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.sh
 create mode 100644 test/e2e/v1beta1/scripts/gh-actions/verify.py

diff --git a/.github/workflows/e2e-test-tune-api.yaml b/.github/workflows/e2e-test-tune-api.yaml
new file mode 100644
index 00000000000..e1f37a3701b
--- /dev/null
+++ b/.github/workflows/e2e-test-tune-api.yaml
@@ -0,0 +1,34 @@
+name: E2E Test with tune API
+
+on:
+  pull_request:
+    paths-ignore:
+      - "pkg/ui/v1beta1/frontend/**"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  e2e:
+    runs-on: ubuntu-22.04
+    timeout-minutes: 120
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Test Env
+        uses: ./.github/workflows/template-setup-e2e-test
+        with:
+          kubernetes-version: ${{ matrix.kubernetes-version }}
+
+      - name: Run e2e test with tune API
+        uses: ./.github/workflows/template-e2e-test
+        with:
+          tune-api: true
+
+    strategy:
+      fail-fast: false
+      matrix:
+        # Detail: https://hub.docker.com/r/kindest/node
+        kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml
index ef1ca26064d..7c9598df04b 100644
--- a/.github/workflows/template-e2e-test/action.yaml
+++ b/.github/workflows/template-e2e-test/action.yaml
@@ -4,15 +4,17 @@ description: Run e2e test using the minikube cluster
 
 inputs:
   experiments:
-    required: true
+    required: false
     description: comma delimited experiment name
+    default: ""
   training-operator:
     required: false
     description: whether to deploy training-operator or not
     default: false
   trial-images:
-    required: true
+    required: false
     description: comma delimited trial image name
+    default: ""
   katib-ui:
     required: true
     description: whether to deploy katib-ui or not
@@ -21,13 +23,17 @@ inputs:
     required: false
     description: mysql or postgres
     default: mysql
+  tune-api:
+    required: true
+    description: whether to execute tune-api test or not
+    default: false
 
 runs:
   using: composite
   steps:
     - name: Setup Minikube Cluster
       shell: bash
-      run: ./test/e2e/v1beta1/scripts/gh-actions/setup-minikube.sh ${{ inputs.katib-ui }} ${{ inputs.trial-images }} ${{ inputs.experiments }}
+      run: ./test/e2e/v1beta1/scripts/gh-actions/setup-minikube.sh ${{ inputs.katib-ui }} ${{ inputs.tune-api }} ${{ inputs.trial-images }} ${{ inputs.experiments }}
 
     - name: Setup Katib
       shell: bash
@@ -35,4 +41,9 @@ runs:
 
     - name: Run E2E Experiment
       shell: bash
-      run: ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }}
+      run: |
+        if "${{ inputs.tune-api }}"; then
+          ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.sh
+        else
+          ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }}
+        fi
diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
index b18a81cad81..ceb3be2ce77 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -386,7 +386,7 @@ def tune(
 
         # Add metrics collector to the Katib Experiment.
         # Up to now, We only support parameter `kind`, of which default value is `StdOut`, to specify the kind of metrics collector.
-        experiment.spec.metrics_collector = models.V1beta1MetricsCollectorSpec(
+        experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec(
             collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"])
         )
 
diff --git a/test/e2e/v1beta1/scripts/gh-actions/build-load.sh b/test/e2e/v1beta1/scripts/gh-actions/build-load.sh
index 2ce492da79a..cb0ea03cd5a 100755
--- a/test/e2e/v1beta1/scripts/gh-actions/build-load.sh
+++ b/test/e2e/v1beta1/scripts/gh-actions/build-load.sh
@@ -25,9 +25,10 @@ pushd .
 cd "$(dirname "$0")/../../../../.."
 trap popd EXIT
 
-TRIAL_IMAGES=${1:-""}
-EXPERIMENTS=${2:-""}
-DEPLOY_KATIB_UI=${3:-false}
+DEPLOY_KATIB_UI=${1:-false}
+TUNE_API=${2:-false}
+TRIAL_IMAGES=${3:-""}
+EXPERIMENTS=${4:-""}
 
 REGISTRY="docker.io/kubeflowkatib"
 TAG="e2e-test"
@@ -162,6 +163,12 @@ for name in "${TRIAL_IMAGE_ARRAY[@]}"; do
   run "$name" "examples/$VERSION/trial-images/$name/Dockerfile"
 done
 
+# Testing image for tune function
+if "$TUNE_API"; then
+  echo -e "\nPulling and building testing image for tune function..."
+  _build_containers "suggestion-hyperopt" "$CMD_PREFIX/suggestion/hyperopt/$VERSION/Dockerfile"
+fi
+
 echo -e "\nCleanup Build Cache...\n"
 docker buildx prune -f
 
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.py
index 26ef2e9f6e2..efbe0539e73 100644
--- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.py
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.py
@@ -1,6 +1,5 @@
 import argparse
 import logging
-import time
 
 from kubeflow.katib import ApiClient
 from kubeflow.katib import KatibClient
@@ -8,6 +7,7 @@
 from kubeflow.katib.constants import constants
 from kubeflow.katib.utils.utils import FakeResponse
 from kubernetes import client
+from verify import verify_experiment_results
 import yaml
 
 # Experiment timeout is 40 min.
@@ -17,143 +17,6 @@
 logging.basicConfig(level=logging.INFO)
 
 
-def verify_experiment_results(
-    katib_client: KatibClient,
-    experiment: models.V1beta1Experiment,
-    exp_name: str,
-    exp_namespace: str,
-):
-
-    # Get the best objective metric.
-    best_objective_metric = None
-    for metric in experiment.status.current_optimal_trial.observation.metrics:
-        if metric.name == experiment.spec.objective.objective_metric_name:
-            best_objective_metric = metric
-            break
-
-    if best_objective_metric is None:
-        raise Exception(
-            "Unable to get the best metrics for objective: {}. Current Optimal Trial: {}".format(
-                experiment.spec.objective.objective_metric_name,
-                experiment.status.current_optimal_trial,
-            )
-        )
-
-    # Get Experiment Succeeded reason.
-    for c in experiment.status.conditions:
-        if (
-            c.type == constants.EXPERIMENT_CONDITION_SUCCEEDED
-            and c.status == constants.CONDITION_STATUS_TRUE
-        ):
-            succeeded_reason = c.reason
-            break
-
-    trials_completed = experiment.status.trials_succeeded or 0
-    trials_completed += experiment.status.trials_early_stopped or 0
-    max_trial_count = experiment.spec.max_trial_count
-
-    # If Experiment is Succeeded because of Max Trial Reached, all Trials must be completed.
-    if (
-        succeeded_reason == "ExperimentMaxTrialsReached"
-        and trials_completed != max_trial_count
-    ):
-        raise Exception(
-            "All Trials must be Completed. Max Trial count: {}, Experiment status: {}".format(
-                max_trial_count, experiment.status
-            )
-        )
-
-    # If Experiment is Succeeded because of Goal reached, the metrics must be correct.
-    if succeeded_reason == "ExperimentGoalReached" and (
-        (
-            experiment.spec.objective.type == "minimize"
-            and float(best_objective_metric.min) > float(experiment.spec.objective.goal)
-        )
-        or (
-            experiment.spec.objective.type == "maximize"
-            and float(best_objective_metric.max) < float(experiment.spec.objective.goal)
-        )
-    ):
-        raise Exception(
-            "Experiment goal is reached, but metrics are incorrect. "
-            f"Experiment objective: {experiment.spec.objective}. "
-            f"Experiment best objective metric: {best_objective_metric}"
-        )
-
-    # Verify Suggestion's resources. Suggestion name = Experiment name.
-    suggestion = katib_client.get_suggestion(exp_name, exp_namespace)
-
-    # For the Never or FromVolume resume policies Suggestion must be Succeeded.
-    # For the LongRunning resume policy Suggestion must be always Running.
-    for c in suggestion.status.conditions:
-        if (
-            c.type == constants.EXPERIMENT_CONDITION_SUCCEEDED
-            and c.status == constants.CONDITION_STATUS_TRUE
-            and experiment.spec.resume_policy == "LongRunning"
-        ):
-            raise Exception(
-                f"Suggestion is Succeeded while Resume Policy is {experiment.spec.resume_policy}."
-                f"Suggestion conditions: {suggestion.status.conditions}"
-            )
-        elif (
-            c.type == constants.EXPERIMENT_CONDITION_RUNNING
-            and c.status == constants.CONDITION_STATUS_TRUE
-            and experiment.spec.resume_policy != "LongRunning"
-        ):
-            raise Exception(
-                f"Suggestion is Running while Resume Policy is {experiment.spec.resume_policy}."
-                f"Suggestion conditions: {suggestion.status.conditions}"
-            )
-
-    # For Never and FromVolume resume policies verify Suggestion's resources.
-    if (
-        experiment.spec.resume_policy == "Never"
-        or experiment.spec.resume_policy == "FromVolume"
-    ):
-        resource_name = exp_name + "-" + experiment.spec.algorithm.algorithm_name
-
-        # Suggestion's Service and Deployment should be deleted.
-        for i in range(10):
-            try:
-                client.AppsV1Api().read_namespaced_deployment(
-                    resource_name, exp_namespace
-                )
-            except client.ApiException as e:
-                if e.status == 404:
-                    break
-                else:
-                    raise e
-            # Deployment deletion might take some time.
-            time.sleep(1)
-        if i == 10:
-            raise Exception(
-                "Suggestion Deployment is still alive for Resume Policy: {}".format(
-                    experiment.spec.resume_policy
-                )
-            )
-
-        try:
-            client.CoreV1Api().read_namespaced_service(resource_name, exp_namespace)
-        except client.ApiException as e:
-            if e.status != 404:
-                raise e
-        else:
-            raise Exception(
-                "Suggestion Service is still alive for Resume Policy: {}".format(
-                    experiment.spec.resume_policy
-                )
-            )
-
-        # For FromVolume resume policy PVC should not be deleted.
-        if experiment.spec.resume_policy == "FromVolume":
-            try:
-                client.CoreV1Api().read_namespaced_persistent_volume_claim(
-                    resource_name, exp_namespace
-                )
-            except client.ApiException:
-                raise Exception("PVC is deleted for FromVolume Resume Policy")
-
-
 def run_e2e_experiment(
     katib_client: KatibClient,
     experiment: models.V1beta1Experiment,
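Note: the tune test added below relies on the default StdOut metrics
collector, so the objective function must print its metric as a
"<name>=<value>" line matching objective_metric_name. A minimal sketch of
the expected Trial stdout (one sampled point assumed; values illustrative):

    # Hypothetical Trial run for sampled parameters a=15, b=0.15.
    parameters = {"a": 15, "b": 0.15}
    result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2
    print(f"result={result}")  # prints "result=59.9775", collected as metric "result"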
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
new file mode 100644
index 00000000000..1ca3596af95
--- /dev/null
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -0,0 +1,97 @@
+import argparse
+import logging
+
+from kubeflow.katib import KatibClient
+from kubeflow.katib import search
+from kubernetes import client
+from verify import verify_experiment_results
+
+# Experiment timeout is 40 min.
+EXPERIMENT_TIMEOUT = 60 * 40
+
+# The default logging config.
+logging.basicConfig(level=logging.INFO)
+
+
+def run_e2e_experiment_create_by_tune(
+    katib_client: KatibClient,
+    exp_name: str,
+    exp_namespace: str,
+):
+    # Create Katib Experiment and wait until it is finished.
+    logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name))
+
+    # Use the test case from the getting-started tutorial.
+    # https://www.kubeflow.org/docs/components/katib/getting-started/#getting-started-with-katib-python-sdk
+    # [1] Create an objective function.
+    def objective(parameters):
+        import time
+        time.sleep(5)
+        result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2
+        print(f"result={result}")
+
+    # [2] Create hyperparameter search space.
+    parameters = {
+        "a": search.int(min=10, max=20),
+        "b": search.double(min=0.1, max=0.2)
+    }
+
+    # [3] Create Katib Experiment with 4 Trials and 2 CPUs per Trial,
+    # and wait until the Experiment reaches the Succeeded condition.
+    katib_client.tune(
+        name=exp_name,
+        namespace=exp_namespace,
+        objective=objective,
+        parameters=parameters,
+        objective_metric_name="result",
+        max_trial_count=4,
+        resources_per_trial={"cpu": "2"},
+    )
+    experiment = katib_client.wait_for_experiment_condition(
+        exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT
+    )
+
+    # Verify the Experiment results.
+    verify_experiment_results(katib_client, experiment, exp_name, exp_namespace)
+
+    # Print the Experiment and Suggestion.
+    logging.debug(katib_client.get_experiment(exp_name, exp_namespace))
+    logging.debug(katib_client.get_suggestion(exp_name, exp_namespace))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--namespace", type=str, required=True, help="Namespace for the Katib E2E test",
+    )
+    parser.add_argument(
+        "--verbose", action="store_true", help="Verbose output for the Katib E2E test",
+    )
+    args = parser.parse_args()
+
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    katib_client = KatibClient()
+
+    namespace_labels = client.CoreV1Api().read_namespace(args.namespace).metadata.labels
+    if 'katib.kubeflow.org/metrics-collector-injection' not in namespace_labels:
+        namespace_labels['katib.kubeflow.org/metrics-collector-injection'] = 'enabled'
+        client.CoreV1Api().patch_namespace(args.namespace, {'metadata': {'labels': namespace_labels}})
+
+    # Test with run_e2e_experiment_create_by_tune.
+    exp_name = "tune-example"
+    exp_namespace = args.namespace
+    try:
+        run_e2e_experiment_create_by_tune(katib_client, exp_name, exp_namespace)
+        logging.info("---------------------------------------------------------------")
+        logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}")
+    except Exception as e:
+        logging.info("---------------------------------------------------------------")
+        logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}")
+        raise e
+    finally:
+        # Delete the Experiment.
+        logging.info("---------------------------------------------------------------")
+        logging.info("---------------------------------------------------------------")
+        katib_client.delete_experiment(exp_name, exp_namespace)
diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.sh b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.sh
new file mode 100755
index 00000000000..1520d301439
--- /dev/null
+++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# Copyright 2024 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This shell script is used to run the e2e test for the Katib tune API.
+# It runs run-e2e-tune-api.py against the default namespace.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+cd "$(dirname "$0")"
+
+echo "Katib deployments"
+kubectl -n kubeflow get deploy
+echo "Katib services"
+kubectl -n kubeflow get svc
+echo "Katib pods"
+kubectl -n kubeflow get pod
+echo "Katib persistent volume claims"
+kubectl get pvc -n kubeflow
+echo "Available CRDs"
+kubectl get crd
+
+python run-e2e-tune-api.py --namespace default \
+--verbose || (kubectl get pods -n kubeflow && exit 1)
diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-minikube.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-minikube.sh
index a24131bbb7d..b890a40d41b 100755
--- a/test/e2e/v1beta1/scripts/gh-actions/setup-minikube.sh
+++ b/test/e2e/v1beta1/scripts/gh-actions/setup-minikube.sh
@@ -22,8 +22,9 @@ set -o nounset
 cd "$(dirname "$0")"
 
 DEPLOY_KATIB_UI=${1:-false}
-TRIAL_IMAGES=${2:-""}
-EXPERIMENTS=${3:-""}
+TUNE_API=${2:-false}
+TRIAL_IMAGES=${3:-""}
+EXPERIMENTS=${4:-""}
 
 echo "Start to setup Minikube Kubernetes Cluster"
 kubectl version
@@ -31,4 +32,4 @@ kubectl cluster-info
 kubectl get nodes
 
 echo "Build and Load container images"
-./build-load.sh "$TRIAL_IMAGES" "$EXPERIMENTS" "$DEPLOY_KATIB_UI"
+./build-load.sh "$DEPLOY_KATIB_UI" "$TUNE_API" "$TRIAL_IMAGES" "$EXPERIMENTS"
" + f"Experiment objective: {experiment.spec.objective}. " + f"Experiment best objective metric: {best_objective_metric}" + ) + + # Verify Suggestion's resources. Suggestion name = Experiment name. + suggestion = katib_client.get_suggestion(exp_name, exp_namespace) + + # For the Never or FromVolume resume policies Suggestion must be Succeeded. + # For the LongRunning resume policy Suggestion must be always Running. + for c in suggestion.status.conditions: + if ( + c.type == constants.EXPERIMENT_CONDITION_SUCCEEDED + and c.status == constants.CONDITION_STATUS_TRUE + and experiment.spec.resume_policy == "LongRunning" + ): + raise Exception( + f"Suggestion is Succeeded while Resume Policy is {experiment.spec.resume_policy}." + f"Suggestion conditions: {suggestion.status.conditions}" + ) + elif ( + c.type == constants.EXPERIMENT_CONDITION_RUNNING + and c.status == constants.CONDITION_STATUS_TRUE + and experiment.spec.resume_policy != "LongRunning" + ): + raise Exception( + f"Suggestion is Running while Resume Policy is {experiment.spec.resume_policy}." + f"Suggestion conditions: {suggestion.status.conditions}" + ) + + # For Never and FromVolume resume policies verify Suggestion's resources. + if ( + experiment.spec.resume_policy == "Never" + or experiment.spec.resume_policy == "FromVolume" + ): + resource_name = exp_name + "-" + experiment.spec.algorithm.algorithm_name + + # Suggestion's Service and Deployment should be deleted. + for i in range(10): + try: + client.AppsV1Api().read_namespaced_deployment( + resource_name, exp_namespace + ) + except client.ApiException as e: + if e.status == 404: + break + else: + raise e + if i == 10: + raise Exception( + "Suggestion Deployment is still alive for Resume Policy: {}".format( + experiment.spec.resume_policy + ) + ) + + try: + client.CoreV1Api().read_namespaced_service(resource_name, exp_namespace) + except client.ApiException as e: + if e.status != 404: + raise e + else: + raise Exception( + "Suggestion Service is still alive for Resume Policy: {}".format( + experiment.spec.resume_policy + ) + ) + + # For FromVolume resume policy PVC should not be deleted. + if experiment.spec.resume_policy == "FromVolume": + try: + client.CoreV1Api().read_namespaced_persistent_volume_claim( + resource_name, exp_namespace + ) + except client.ApiException: + raise Exception("PVC is deleted for FromVolume Resume Policy")