From 76da783585f3d440ee2c0b6b24de6713ba3d3f84 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 22 Nov 2023 08:52:16 +0530 Subject: [PATCH] [k8s] Fix GPU count detection from autoscaler YAML (#2636) * fix gpu count detection in yaml * comments --- sky/skylet/providers/kubernetes/config.py | 14 ++++++++++---- tests/test_smoke.py | 1 - 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sky/skylet/providers/kubernetes/config.py b/sky/skylet/providers/kubernetes/config.py index 2ab466f03ed..cfc08523713 100644 --- a/sky/skylet/providers/kubernetes/config.py +++ b/sky/skylet/providers/kubernetes/config.py @@ -128,11 +128,17 @@ def get_resource(container_resources: Dict[str, Any], # float('inf') means there's no limit set res_count = request if limit == float('inf') else limit # Convert to int since Ray autoscaler expects int. - # Cap the minimum resource to 1 because if resource count is set to 0, - # (e.g., when request=0.5), ray will not be able to schedule any tasks. # We also round up the resource count to the nearest integer to provide the # user at least the amount of resource they requested. - return max(1, math.ceil(res_count)) + rounded_count = math.ceil(res_count) + if resource_name == 'cpu': + # For CPU, we set minimum count to 1 because if CPU count is set to 0, + # (e.g. when the user sets --cpu 0.5), ray will not be able to schedule + # any tasks. + return max(1, rounded_count) + else: + # For GPU and memory, return the rounded count. + return rounded_count def _get_resource(container_resources: Dict[str, Any], resource_name: str, @@ -144,7 +150,7 @@ def _get_resource(container_resources: Dict[str, Any], resource_name: str, Args: container_resources: Container's resource field. - resource_name: One of 'cpu', 'gpu' or memory. + resource_name: One of 'cpu', 'gpu' or 'memory'. field_name: One of 'requests' or 'limits'. Returns: diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 62490e44fd7..17708ccf7ef 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2964,7 +2964,6 @@ def test_skyserve_cancel(): # ------- Testing user ray cluster -------- -@pytest.mark.no_kubernetes # Kubernetes does not support sky status -r yet. def test_user_ray_cluster(generic_cloud: str): name = _get_cluster_name() test = Test(