From a9294744d4aa401f57ccbf44b7af9aaf74a0826b Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Mon, 16 Sep 2024 08:59:11 +0000 Subject: [PATCH 01/63] initial version of TPU support on GKE --- sky/clouds/kubernetes.py | 10 ++++ sky/provision/kubernetes/instance.py | 11 ++++ sky/provision/kubernetes/utils.py | 85 +++++++++++++++++++++++----- sky/resources.py | 20 +++++-- sky/task.py | 69 +++++++++++++++++++++- sky/templates/kubernetes-ray.yml.j2 | 16 +++++- 6 files changed, 190 insertions(+), 21 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 47f8a435ebb..dbf65dd9e4a 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -261,11 +261,18 @@ def make_deploy_resources_variables( k8s_acc_label_key = None k8s_acc_label_value = None + tpu_is_requested = False # If GPUs are requested, set node label to match the GPU type. if acc_count > 0 and acc_type is not None: k8s_acc_label_key, k8s_acc_label_value = \ kubernetes_utils.get_gpu_label_key_value(acc_type) + if k8s_acc_label_key == 'cloud.google.com/gke-tpu-accelerator': + tpu_is_requested = True + + if tpu_is_requested: + k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( + kubernetes_utils.get_tpu_topology_key_value()) port_mode = network_utils.get_port_mode(None) @@ -330,6 +337,9 @@ def make_deploy_resources_variables( 'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE, 'k8s_spot_label_key': spot_label_key, 'k8s_spot_label_value': spot_label_value, + 'tpu_is_requested': tpu_is_requested, + 'k8s_tpu_topology_label_key': k8s_tpu_topology_label_key, + 'k8s_tpu_topology_label_value': k8s_tpu_topology_label_value, 'image_id': image_id, } diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 83f9c34592e..a197b778b36 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -577,6 +577,17 @@ def _create_pods(region: str, cluster_name_on_cloud: str, } } + # Doyoung: Add comments on what kind of taint is added to TPU nodes by GCP(google.com/tpu=present:NoSchedule) + # and explain what those are for. And explain why we need toleration at this point to ignore that taint. + if 'cloud.google.com/gke-tpu-accelerator' in config.node_config['spec']['nodeSelector']: + tpu_toleration = { + 'key': 'google.com/tpu', + 'operator': 'Equal', + 'value': 'present', + 'effect': 'NoSchedule' + } + pod_spec['spec']['tolerations'] = [tpu_toleration] + pod = kubernetes.core_api(context).create_namespaced_pod( namespace, pod_spec) created_pods[pod.metadata.name] = pod diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 6aa6400dfa1..0c2c9044b10 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -183,11 +183,21 @@ class GKELabelFormatter(GPULabelFormatter): label, which is used to identify the GPU type. 
""" - LABEL_KEY = 'cloud.google.com/gke-accelerator' + GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator' + TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator' + TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology' @classmethod def get_label_key(cls) -> str: - return cls.LABEL_KEY + return cls.GPU_LABEL_KEY + + @classmethod + def is_label_key(cls, label: str) -> Tuple[str, str]: + return label in [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] + + @classmethod + def get_tpu_topology_label_key(cls) -> str: + return cls.TPU_TOPOLOGY_LABEL_KEY @classmethod def get_label_value(cls, accelerator: str) -> str: @@ -205,6 +215,10 @@ def get_accelerator_from_label_value(cls, value: str) -> str: # to distinguish between a3-high and a3-mega instances return 'H100' return acc + elif value.startswith('tpu-'): + # Doyoung: This may need some updates depending on the namings + # required from up/down-stream. + return value else: raise ValueError( f'Invalid accelerator name in GKE cluster: {value}') @@ -304,21 +318,27 @@ def detect_gpu_label_formatter( # Get all labels across all nodes node_labels: Dict[str, List[Tuple[str, str]]] = {} nodes = get_kubernetes_nodes() + is_tpu = False for node in nodes: node_labels[node.metadata.name] = [] for label, value in node.metadata.labels.items(): + if 'cloud.google.com/gke-tpu-accelerator' == label: + is_tpu = True node_labels[node.metadata.name].append((label, value)) label_formatter = None - # Check if the node labels contain any of the GPU label prefixes - for lf in LABEL_FORMATTER_REGISTRY: - label_key = lf.get_label_key() - for _, label_list in node_labels.items(): - for label, _ in label_list: - if label.startswith(label_key): - label_formatter = lf() - return label_formatter, node_labels + if is_tpu: + label_formatter = GKELabelFormatter() + else: + # Check if the node labels contain any of the GPU label prefixes + for lf in LABEL_FORMATTER_REGISTRY: + label_key = lf.get_label_key() + for _, label_list in node_labels.items(): + for label, _ in label_list: + if label.startswith(label_key): + label_formatter = lf() + return label_formatter, node_labels return label_formatter, node_labels @@ -338,7 +358,7 @@ def detect_gpu_resource() -> Tuple[bool, Set[str]]: nodes = get_kubernetes_nodes() for node in nodes: cluster_resources.update(node.status.allocatable.keys()) - has_gpu = 'nvidia.com/gpu' in cluster_resources + has_gpu = 'nvidia.com/gpu' in cluster_resources or 'google.com/tpu' in cluster_resources return has_gpu, cluster_resources @@ -523,13 +543,25 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # correctly setup and will behave as expected. for node_name, label_list in node_labels.items(): for label, value in label_list: - if label == label_formatter.get_label_key(): + if label_formatter.is_label_key(label): is_valid, reason = label_formatter.validate_label_value( value) if not is_valid: raise exceptions.ResourcesUnavailableError( f'Node {node_name!r} in Kubernetes cluster has ' f'invalid GPU label: {label}={value}. {reason}') + ##### + # for node_name, label_list in node_labels.items(): + # for label, value in label_list: + # if label == label_formatter.get_label_key(): + # is_valid, reason = label_formatter.validate_label_value( + # value) + # if not is_valid: + # raise exceptions.ResourcesUnavailableError( + # f'Node {node_name!r} in Kubernetes cluster has ' + # f'invalid GPU label: {label}={value}. 
{reason}') + ##### + if check_mode: # If check mode is enabled and we reached so far, we can # conclude that the cluster is setup correctly and return. @@ -543,10 +575,20 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # during scheduling. for node_name, label_list in node_labels.items(): for label, value in label_list: - if (label == k8s_acc_label_key and + if (label_formatter.is_label_key(label) and label_formatter.get_accelerator_from_label_value( value) == acc_type): return label, value + + #### + # for node_name, label_list in node_labels.items(): + # for label, value in label_list: + # if (label == k8s_acc_label_key and + # label_formatter.get_accelerator_from_label_value( + # value) == acc_type): + # return label, value + #### + # If no node is found with the requested acc_type, raise error with ux_utils.print_exception_no_traceback(): suffix = '' @@ -580,6 +622,23 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: f'to set up GPUs.{suffix}') +def get_tpu_topology_key_value(): + label_formatter, node_labels = detect_gpu_label_formatter() + for node_name, label_list in node_labels.items(): + for label, value in label_list: + if label == label_formatter.get_tpu_topology_label_key(): + is_valid, reason = label_formatter.validate_label_value(value) + if not is_valid: + raise exceptions.ResourcesUnavailableError( + f'Node {node_name!r} in Kubernetes cluster has ' + f'invalid GPU label: {label}={value}. {reason}') + + for node_name, label_list in node_labels.items(): + for label, value in label_list: + if label == label_formatter.get_tpu_topology_label_key(): + return label, value + + def get_head_ssh_port(cluster_name: str, namespace: str, context: Optional[str]) -> int: svc_name = f'{cluster_name}-head-ssh' diff --git a/sky/resources.py b/sky/resources.py index 2f19cd1aa01..51c9fe1b2b5 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -564,13 +564,25 @@ def _set_accelerators( acc, _ = list(accelerators.items())[0] if 'tpu' in acc.lower(): - if self.cloud is None: - self._cloud = clouds.GCP() + # Doyoung: Confirm if below two lines can be removed. Perhaps, + # raise an error when cloud is not specified by the user + # if self.cloud is None: + # self._cloud = clouds.GCP() assert self.cloud.is_same_cloud( - clouds.GCP()), 'Cloud must be GCP.' + clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes()), 'Cloud must be GCP or Kubernetes.' if accelerator_args is None: accelerator_args = {} - use_tpu_vm = accelerator_args.get('tpu_vm', True) + + # Doyoung: May need to understand the usage of tpu_vm and make + # proper adjustments to the following snippets. 
+        if self.cloud.is_same_cloud(clouds.GCP()):
+            use_tpu_vm = accelerator_args.get('tpu_vm', True)
+        else:
+            use_tpu_vm = False
+
+        ####
+        # use_tpu_vm = accelerator_args.get('tpu_vm', True)
+        ####
         if self.instance_type is not None and use_tpu_vm:
             if self.instance_type != 'TPU-VM':
                 with ux_utils.print_exception_no_traceback():
diff --git a/sky/task.py b/sky/task.py
index cebc616dc6d..9a561d005da 100644
--- a/sky/task.py
+++ b/sky/task.py
@@ -927,6 +927,10 @@ def _get_preferred_store(
 
         if self.best_resources is not None:
             storage_cloud = self.best_resources.cloud
+            # if storage_cloud == store type read from task
+            #     storage_region = self.best_resources.region
+            # else:
+            #     storage_region = None
             storage_region = self.best_resources.region
         else:
             resources = list(self.resources)[0]
@@ -955,11 +959,72 @@ def sync_storage_mounts(self) -> None:
         file_mounts of the form
         ``{ /remote/path: {s3,gs,..}:// }``.
         """
+        # for storage in self.storage_mounts.values():
+        #     if len(storage.stores) == 0:
+        #         store_type, store_region = self._get_preferred_store()
+        #         self.storage_plans[storage] = store_type
+        #         storage.add_store(store_type, store_region)
+        #     else:
+        #         # We will download the first store that is added to remote.
+        #         self.storage_plans[storage] = list(storage.stores.keys())[0]
+
+        # Things to consider
+        # 1. Created storage must respect the region from _get_preferred_store()
+        # 2. If other store type was specified from the task yaml, it should create
+        # at that cloud type.
+
+        # Perhaps, pass Storage object to _get_preferred_store() and that gets
+        # the highest priority from _get_preferred_store. And if this passed
+        # store type aligns with best_resources.cloud, then return it with the
+        # best_resources.region. If not, just return
+
+        # write a method that can read the store information from task yaml given
+        # the storage object name. Then, using this method, we can check the store type
+        # meant to be created from this task, and if this store type is identical to
+        # what _get_preferred_store type would create, we use the region value from it
+        # as well. If they are not the same, region value is set to None.
+
+        # So given the storage name, we can find the store type attempted to be created
+        # from this task. Get that store type, and if it's identical to best_resources.cloud,
+        # use best_resources.region as well.
+
+        # --> This approach requires to read the task yaml to get the init_store type, which
+        # means, we need a way to obtain the yaml path to read again, but currently, the path
+        # is not passed all the way down.
+
+        # So, why do we need the init_store from task yaml?
+        # This method currently only checks if there's already a storage created with
+        # len(storage.stores) == 0, but now this is not sufficient as it is possible to
+        # create another store with identical name (actually, need to confirm what to do with this behavior)
+
+        # Originally,
+        # if len(storage.stores) == 0:
+        # else:
+        # was good enough since create storage from task yaml was already done
+        # before reaching this point. So reaching this point with len(storage.stores) == 0
+        # was
+
+        # Approach 2:
+        # Get store_type and store_region from _get_preferred_store method.
+        # Check if this store_type is in storage.stores,
+        # if it does not exist, we run add_store(store_type, store_region)
+        # if it exists, we can
+
+        # There are two main points making this difficult to implement without knowing the store type
+        # attempted to be created from current task yaml.
+        # 1. 
When store type given from task yaml is different from best_resources.cloud, the storage
+        # is created under wrong cloud provider.
+        # 2.
+
         for storage in self.storage_mounts.values():
-            if len(storage.stores) == 0:
-                store_type, store_region = self._get_preferred_store()
+            store_type, store_region = self._get_preferred_store(storage)
+            if store_type not in storage.stores:
                 self.storage_plans[storage] = store_type
                 storage.add_store(store_type, store_region)
+            # if len(storage.stores) == 0:
+            #     store_type, store_region = self._get_preferred_store()
+            #     self.storage_plans[storage] = store_type
+            #     storage.add_store(store_type, store_region)
             else:
                 # We will download the first store that is added to remote.
                 self.storage_plans[storage] = list(storage.stores.keys())[0]
diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2
index 1b09409ad0e..287dda90aa3 100644
--- a/sky/templates/kubernetes-ray.yml.j2
+++ b/sky/templates/kubernetes-ray.yml.j2
@@ -283,12 +283,15 @@ available_node_types:
          restartPolicy: Never
 
-          # Add node selector if GPUs are requested:
+          # Add node selector if GPU/TPUs are requested:
          {% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %}
          nodeSelector:
            {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
            {{k8s_acc_label_key}}: {{k8s_acc_label_value}}
            {% endif %}
+            {% if k8s_tpu_topology_label_key is not none and k8s_tpu_topology_label_value is not none %}
+            {{k8s_tpu_topology_label_key}}: {{k8s_tpu_topology_label_value}}
+            {% endif %}
            {% if k8s_spot_label_key is not none %}
            {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
            {% endif %}
@@ -398,14 +401,23 @@ available_node_types:
              requests:
                cpu: {{cpus}}
                memory: {{memory}}G
+                {% if tpu_is_requested %}
+                google.com/tpu: {{accelerator_count}}
+                {% else %}
                nvidia.com/gpu: {{accelerator_count}}
+                {% endif %}
                {% if k8s_fuse_device_required %}
                # Kubernetes resource exposed by the fuse device manager
                # https://gitlab.com/arm-research/smarter/smarter-device-manager
                smarter-devices/fuse: "1"
                {% endif %}
              limits:
-                nvidia.com/gpu: {{accelerator_count}} # Limits need to be defined for GPU requests
+                # Limits need to be defined for GPU/TPU requests
+                {% if tpu_is_requested %}
+                google.com/tpu: {{accelerator_count}}
+                {% else %}
+                nvidia.com/gpu: {{accelerator_count}}
+                {% endif %}
                {% if k8s_fuse_device_required %}
                smarter-devices/fuse: "1"
                {% endif %}

From 80e1877174a8cb8f47eee39eb587baa92aa0bdea Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 16 Sep 2024 09:15:40 +0000
Subject: [PATCH 02/63] revert unnecessary change

---
 sky/task.py | 65 ++---------------------------------------------------
 1 file changed, 2 insertions(+), 63 deletions(-)

diff --git a/sky/task.py b/sky/task.py
index 9a561d005da..ae2877e1a8f 100644
--- a/sky/task.py
+++ b/sky/task.py
@@ -959,72 +959,11 @@ def sync_storage_mounts(self) -> None:
         file_mounts of the form
         ``{ /remote/path: {s3,gs,..}:// }``.
         """
-        # for storage in self.storage_mounts.values():
-        #     if len(storage.stores) == 0:
-        #         store_type, store_region = self._get_preferred_store()
-        #         self.storage_plans[storage] = store_type
-        #         storage.add_store(store_type, store_region)
-        #     else:
-        #         # We will download the first store that is added to remote.
-        #         self.storage_plans[storage] = list(storage.stores.keys())[0]
-
-        # Things to consider
-        # 1. Created storage must respect the region from _get_preferred_store()
-        # 2. 
If other store type was specified from the task yaml, it should create
-        # at that cloud type.
-
-        # Perhaps, pass Storage object to _get_preferred_store() and that gets
-        # the highest priority from _get_preferred_store. And if this passed
-        # store type aligns with best_resources.cloud, then return it with the
-        # best_resources.region. If not, just return
-
-        # write a method that can read the store information from task yaml given
-        # the storage object name. Then, using this method, we can check the store type
-        # meant to be created from this task, and if this store type is identical to
-        # what _get_preferred_store type would create, we use the region value from it
-        # as well. If they are not the same, region value is set to None.
-
-        # So given the storage name, we can find the store type attempted to be created
-        # from this task. Get that store type, and if it's identical to best_resources.cloud,
-        # use best_resources.region as well.
-
-        # --> This approach requires to read the task yaml to get the init_store type, which
-        # means, we need a way to obtain the yaml path to read again, but currently, the path
-        # is not passed all the way down.
-
-        # So, why do we need the init_store from task yaml?
-        # This method currently only checks if there's already a storage created with
-        # len(storage.stores) == 0, but now this is not sufficient as it is possible to
-        # create another store with identical name (actually, need to confirm what to do with this behavior)
-
-        # Originally,
-        # if len(storage.stores) == 0:
-        # else:
-        # was good enough since create storage from task yaml was already done
-        # before reaching this point. So reaching this point with len(storage.stores) == 0
-        # was
-
-        # Approach 2:
-        # Get store_type and store_region from _get_preferred_store method.
-        # Check if this store_type is in storage.stores,
-        # if it does not exist, we run add_store(store_type, store_region)
-        # if it exists, we can
-
-        # There are two main points making this difficult to implement without knowing the store type
-        # attempted to be created from current task yaml.
-        # 1. When store type given from task yaml is different from best_resources.cloud, the storage
-        # is created under wrong cloud provider.
-        # 2.
-
         for storage in self.storage_mounts.values():
-            store_type, store_region = self._get_preferred_store(storage)
-            if store_type not in storage.stores:
+            if len(storage.stores) == 0:
+                store_type, store_region = self._get_preferred_store()
                 self.storage_plans[storage] = store_type
                 storage.add_store(store_type, store_region)
-            # if len(storage.stores) == 0:
-            #     store_type, store_region = self._get_preferred_store()
-            #     self.storage_plans[storage] = store_type
-            #     storage.add_store(store_type, store_region)
             else:
                 # We will download the first store that is added to remote. 
self.storage_plans[storage] = list(storage.stores.keys())[0] From 70a07abcd1ae2190e74a9897a1647b96c6865105 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Mon, 16 Sep 2024 09:16:19 +0000 Subject: [PATCH 03/63] revert --- sky/task.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sky/task.py b/sky/task.py index ae2877e1a8f..cebc616dc6d 100644 --- a/sky/task.py +++ b/sky/task.py @@ -927,10 +927,6 @@ def _get_preferred_store( if self.best_resources is not None: storage_cloud = self.best_resources.cloud - # if storage_cloud == store type read from task - # storage_region = self.best_resources.region - # else: - # storage_region = None storage_region = self.best_resources.region else: resources = list(self.resources)[0] From 0cba9a5347e48d6003e7d17bd8464af657a8b51b Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 06:19:08 +0000 Subject: [PATCH 04/63] use TPU_LABEL_KEY constant --- sky/provision/kubernetes/instance.py | 2 +- sky/provision/kubernetes/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index a197b778b36..735f9366365 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -579,7 +579,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str, # Doyoung: Add comments on what kind of taint is added to TPU nodes by GCP(google.com/tpu=present:NoSchedule) # and explain what those are for. And explain why we need toleration at this point to ignore that taint. - if 'cloud.google.com/gke-tpu-accelerator' in config.node_config['spec']['nodeSelector']: + if kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY in config.node_config['spec']['nodeSelector']: tpu_toleration = { 'key': 'google.com/tpu', 'operator': 'Equal', diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 0c2c9044b10..81f24ffebaa 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -322,7 +322,7 @@ def detect_gpu_label_formatter( for node in nodes: node_labels[node.metadata.name] = [] for label, value in node.metadata.labels.items(): - if 'cloud.google.com/gke-tpu-accelerator' == label: + if GKELabelFormatter.TPU_LABEL_KEY == label: is_tpu = True node_labels[node.metadata.name].append((label, value)) From 17bcbd8f73e8582e1a7221a7671d50c19e14b28e Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 06:22:12 +0000 Subject: [PATCH 05/63] nit --- sky/clouds/kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index dbf65dd9e4a..233c2388704 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -267,7 +267,7 @@ def make_deploy_resources_variables( if acc_count > 0 and acc_type is not None: k8s_acc_label_key, k8s_acc_label_value = \ kubernetes_utils.get_gpu_label_key_value(acc_type) - if k8s_acc_label_key == 'cloud.google.com/gke-tpu-accelerator': + if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: tpu_is_requested = True if tpu_is_requested: From 9233bf5d7363c3fbd02ab17ebf35826e2810b0a3 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 06:56:43 +0000 Subject: [PATCH 06/63] nit --- sky/clouds/kubernetes.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 233c2388704..d1fa151869c 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -261,18 +261,18 @@ 
def make_deploy_resources_variables( k8s_acc_label_key = None k8s_acc_label_value = None + k8s_tpu_topology_label_key = None + k8s_tpu_topology_label_value = None tpu_is_requested = False - # If GPUs are requested, set node label to match the GPU type. + # If GPU/TPUs are requested, set node label to match the GPU/TPU type. if acc_count > 0 and acc_type is not None: k8s_acc_label_key, k8s_acc_label_value = \ kubernetes_utils.get_gpu_label_key_value(acc_type) if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: tpu_is_requested = True - - if tpu_is_requested: - k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_tpu_topology_key_value()) + k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( + kubernetes_utils.get_tpu_topology_key_value()) port_mode = network_utils.get_port_mode(None) From 12e62c05ca1bae2f4b6240b382d7fbd02e37e503 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 06:59:39 +0000 Subject: [PATCH 07/63] update detect_gpu_label_formatter() to use match_label_key() --- sky/provision/kubernetes/utils.py | 46 +++++++++++++++++++------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 81f24ffebaa..d9bca557831 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -79,12 +79,17 @@ class GPULabelFormatter: def get_label_key(cls) -> str: """Returns the label key for GPU type used by the Kubernetes cluster""" raise NotImplementedError - + @classmethod def get_label_value(cls, accelerator: str) -> str: """Given a GPU type, returns the label value to be used""" raise NotImplementedError + @classmethod + def match_label_key(cls, label_key: str) -> bool: + """Checks if the given label key matches the formatter's label keys""" + raise NotImplementedError + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: """Given a label value, returns the GPU type""" @@ -141,6 +146,10 @@ def get_label_value(cls, accelerator: str) -> str: # See sky.utils.kubernetes.gpu_labeler. 
return accelerator.lower() + @classmethod + def match_label_key(cls, label_key: str) -> bool: + return label_key == cls.LABEL_KEY + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: return value.upper() @@ -171,6 +180,10 @@ def get_label_key(cls) -> str: def get_label_value(cls, accelerator: str) -> str: return accelerator.upper() + @classmethod + def match_label_key(cls, label_key: str) -> bool: + return label_key == cls.LABEL_KEY + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: return value @@ -192,7 +205,7 @@ def get_label_key(cls) -> str: return cls.GPU_LABEL_KEY @classmethod - def is_label_key(cls, label: str) -> Tuple[str, str]: + def match_label_key(cls, label: str) -> bool: return label in [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] @classmethod @@ -252,6 +265,10 @@ def get_label_value(cls, accelerator: str) -> str: As a result, we do not support get_label_value for GFDLabelFormatter.""" raise NotImplementedError + @classmethod + def match_label_key(cls, label_key: str) -> bool: + return label_key == cls.LABEL_KEY + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: """Searches against a canonical list of NVIDIA GPUs and pattern @@ -318,27 +335,20 @@ def detect_gpu_label_formatter( # Get all labels across all nodes node_labels: Dict[str, List[Tuple[str, str]]] = {} nodes = get_kubernetes_nodes() - is_tpu = False for node in nodes: node_labels[node.metadata.name] = [] for label, value in node.metadata.labels.items(): - if GKELabelFormatter.TPU_LABEL_KEY == label: - is_tpu = True node_labels[node.metadata.name].append((label, value)) label_formatter = None - if is_tpu: - label_formatter = GKELabelFormatter() - else: - # Check if the node labels contain any of the GPU label prefixes - for lf in LABEL_FORMATTER_REGISTRY: - label_key = lf.get_label_key() - for _, label_list in node_labels.items(): - for label, _ in label_list: - if label.startswith(label_key): - label_formatter = lf() - return label_formatter, node_labels + # Check if the node labels contain any of the GPU label prefixes + for lf in LABEL_FORMATTER_REGISTRY: + for _, label_list in node_labels.items(): + for label, _ in label_list: + if lf.match_label_key(label): + label_formatter = lf() + return label_formatter, node_labels return label_formatter, node_labels @@ -543,7 +553,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # correctly setup and will behave as expected. for node_name, label_list in node_labels.items(): for label, value in label_list: - if label_formatter.is_label_key(label): + if label_formatter.match_label_key(label): is_valid, reason = label_formatter.validate_label_value( value) if not is_valid: @@ -575,7 +585,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # during scheduling. 
for node_name, label_list in node_labels.items(): for label, value in label_list: - if (label_formatter.is_label_key(label) and + if (label_formatter.match_label_key(label) and label_formatter.get_accelerator_from_label_value( value) == acc_type): return label, value From c795fe7212cc1935bb06adbdda701145c4aa5d77 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 07:12:39 +0000 Subject: [PATCH 08/63] tidy get_gpu_label_key_value --- sky/provision/kubernetes/utils.py | 41 +++++++++---------------------- 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index d9bca557831..41870cf59f9 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -229,8 +229,6 @@ def get_accelerator_from_label_value(cls, value: str) -> str: return 'H100' return acc elif value.startswith('tpu-'): - # Doyoung: This may need some updates depending on the namings - # required from up/down-stream. return value else: raise ValueError( @@ -354,21 +352,24 @@ def detect_gpu_label_formatter( def detect_gpu_resource() -> Tuple[bool, Set[str]]: - """Checks if the Kubernetes cluster has nvidia.com/gpu resource. + """Checks if the Kubernetes cluster has accelerator resource. - If nvidia.com/gpu resource is missing, that typically means that the - Kubernetes cluster does not have GPUs or the nvidia GPU operator and/or - device drivers are not installed. + Two types of accelerator resources are available which are each checked + with nvidia.com/gpu and google.com/tpu. If nvidia.com/gpu resource is + missing, that typically means that the Kubernetes cluster does not have + GPUs or the nvidia GPU operator and/or device drivers are not installed. Returns: - bool: True if the cluster has nvidia.com/gpu resource, False otherwise. + bool: True if the cluster has nvidia.com/gpu or google.com/tpu + resource, False otherwise. """ # Get the set of resources across all nodes cluster_resources: Set[str] = set() nodes = get_kubernetes_nodes() for node in nodes: cluster_resources.update(node.status.allocatable.keys()) - has_gpu = 'nvidia.com/gpu' in cluster_resources or 'google.com/tpu' in cluster_resources + has_gpu = ('nvidia.com/gpu' in cluster_resources or + 'google.com/tpu' in cluster_resources) return has_gpu, cluster_resources @@ -560,23 +561,12 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: raise exceptions.ResourcesUnavailableError( f'Node {node_name!r} in Kubernetes cluster has ' f'invalid GPU label: {label}={value}. {reason}') - ##### - # for node_name, label_list in node_labels.items(): - # for label, value in label_list: - # if label == label_formatter.get_label_key(): - # is_valid, reason = label_formatter.validate_label_value( - # value) - # if not is_valid: - # raise exceptions.ResourcesUnavailableError( - # f'Node {node_name!r} in Kubernetes cluster has ' - # f'invalid GPU label: {label}={value}. {reason}') - ##### if check_mode: # If check mode is enabled and we reached so far, we can # conclude that the cluster is setup correctly and return. return '', '' - k8s_acc_label_key = label_formatter.get_label_key() + # Search in node_labels to see if any node has the requested # GPU type. 
# Note - this only checks if the label is available on a @@ -589,15 +579,6 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: label_formatter.get_accelerator_from_label_value( value) == acc_type): return label, value - - #### - # for node_name, label_list in node_labels.items(): - # for label, value in label_list: - # if (label == k8s_acc_label_key and - # label_formatter.get_accelerator_from_label_value( - # value) == acc_type): - # return label, value - #### # If no node is found with the requested acc_type, raise error with ux_utils.print_exception_no_traceback(): @@ -607,7 +588,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: for node_name, label_list in node_labels.items(): all_labels.extend(label_list) gpus_available = set( - v for k, v in all_labels if k == k8s_acc_label_key) + v for k, v in all_labels if label_formatter.match_label_key(k)) suffix = f' Available GPUs on the cluster: {gpus_available}' raise exceptions.ResourcesUnavailableError( 'Could not find any node in the Kubernetes cluster ' From 1c895f0b6df59f5e6e8c6a3b1f6414565e7f14af Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 07:16:19 +0000 Subject: [PATCH 09/63] nit --- sky/clouds/kubernetes.py | 6 +++--- sky/templates/kubernetes-ray.yml.j2 | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index d1fa151869c..2d3d4874088 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -263,14 +263,14 @@ def make_deploy_resources_variables( k8s_acc_label_value = None k8s_tpu_topology_label_key = None k8s_tpu_topology_label_value = None - tpu_is_requested = False + tpu_requested = False # If GPU/TPUs are requested, set node label to match the GPU/TPU type. 
if acc_count > 0 and acc_type is not None: k8s_acc_label_key, k8s_acc_label_value = \ kubernetes_utils.get_gpu_label_key_value(acc_type) if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: - tpu_is_requested = True + tpu_requested = True k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( kubernetes_utils.get_tpu_topology_key_value()) @@ -337,7 +337,7 @@ def make_deploy_resources_variables( 'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE, 'k8s_spot_label_key': spot_label_key, 'k8s_spot_label_value': spot_label_value, - 'tpu_is_requested': tpu_is_requested, + 'tpu_requested': tpu_requested, 'k8s_tpu_topology_label_key': k8s_tpu_topology_label_key, 'k8s_tpu_topology_label_value': k8s_tpu_topology_label_value, 'image_id': image_id, diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 287dda90aa3..d219bc6670e 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -401,7 +401,7 @@ available_node_types: requests: cpu: {{cpus}} memory: {{memory}}G - {% if tpu_is_requested %} + {% if tpu_requested %} google.com/tpu: {{accelerator_count}} {% else %} nvidia.com/gpu: {{accelerator_count}} @@ -413,7 +413,7 @@ available_node_types: {% endif %} limits: # Limits need to be defined for GPU/TPU requests - {% if tpu_is_requested %} + {% if tpu_requested %} google.com/tpu: {{accelerator_count}} {% else %} nvidia.com/gpu: {{accelerator_count}} From a8f5b6bf6a77429c494aae44d0b3db3fa6da4024 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 07:19:25 +0000 Subject: [PATCH 10/63] update method name --- sky/clouds/kubernetes.py | 2 +- sky/provision/kubernetes/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 2d3d4874088..8ea5b89afb3 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -272,7 +272,7 @@ def make_deploy_resources_variables( if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: tpu_requested = True k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_tpu_topology_key_value()) + kubernetes_utils.get_tpu_topology_label_key_value()) port_mode = network_utils.get_port_mode(None) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 41870cf59f9..45e3bf3b80b 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -613,7 +613,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: f'to set up GPUs.{suffix}') -def get_tpu_topology_key_value(): +def get_tpu_topology_label_key_value(): label_formatter, node_labels = detect_gpu_label_formatter() for node_name, label_list in node_labels.items(): for label, value in label_list: From bdb34690e9464bdfd9f4b7253d0c4283cfca945e Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 07:24:49 +0000 Subject: [PATCH 11/63] update get_gke_accelerator_name to support TPU --- sky/provision/kubernetes/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 45e3bf3b80b..e6fc3a20325 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -123,6 +123,8 @@ def get_gke_accelerator_name(accelerator: str) -> str: # A100-80GB, L4, H100-80GB and H100-MEGA-80GB # have a different name pattern. 
return 'nvidia-{}'.format(accelerator.lower()) + elif accelerator.startswith('tpu-'): + return accelerator else: return 'nvidia-tesla-{}'.format(accelerator.lower()) From 1d2d24399544bbf4db8604e217dcd6a56b027c19 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 08:04:21 +0000 Subject: [PATCH 12/63] add support for get_label_keys method due to TPU label key --- sky/provision/kubernetes/instance.py | 3 +- sky/provision/kubernetes/utils.py | 45 +++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 735f9366365..9dd0e2eb62d 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -180,8 +180,9 @@ def _lack_resource_msg(resource: str, '`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.' # pylint: disable=line-too-long f' Full error: {event_message}') gpu_lf_keys = [ - lf.get_label_key() + key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY + for key in lf.get_label_keys() ] if pod.spec.node_selector: for label_key in pod.spec.node_selector.keys(): diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index e6fc3a20325..38aaa3ce82b 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -76,10 +76,15 @@ class GPULabelFormatter: """ @classmethod - def get_label_key(cls) -> str: + def get_label_key(cls, accelerator: str = None) -> str: """Returns the label key for GPU type used by the Kubernetes cluster""" raise NotImplementedError - + + @classmethod + def get_label_keys(cls) -> List[str]: + """Returns a list of label keys for GPU used by Kubernetes cluster.""" + pass + @classmethod def get_label_value(cls, accelerator: str) -> str: """Given a GPU type, returns the label value to be used""" @@ -139,8 +144,13 @@ class SkyPilotLabelFormatter(GPULabelFormatter): LABEL_KEY = 'skypilot.co/accelerator' @classmethod - def get_label_key(cls) -> str: + def get_label_key(cls, accelerator: str = None) -> str: + del accelerator # Unused return cls.LABEL_KEY + + @classmethod + def get_label_keys(cls) -> str: + return [cls.LABEL_KEY] @classmethod def get_label_value(cls, accelerator: str) -> str: @@ -175,9 +185,14 @@ class CoreWeaveLabelFormatter(GPULabelFormatter): LABEL_KEY = 'gpu.nvidia.com/class' @classmethod - def get_label_key(cls) -> str: + def get_label_key(cls, accelerator: str = None) -> str: + del accelerator # Unused return cls.LABEL_KEY + @classmethod + def get_label_keys(cls) -> str: + return [cls.LABEL_KEY] + @classmethod def get_label_value(cls, accelerator: str) -> str: return accelerator.upper() @@ -203,9 +218,15 @@ class GKELabelFormatter(GPULabelFormatter): TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology' @classmethod - def get_label_key(cls) -> str: + def get_label_key(cls, accelerator: str = None) -> str: + if accelerator.startswith('tpu-'): + return cls.TPU_LABEL_KEY return cls.GPU_LABEL_KEY + @classmethod + def get_label_keys(cls) -> str: + return [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] + @classmethod def match_label_key(cls, label: str) -> bool: return label in [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] @@ -255,9 +276,14 @@ class GFDLabelFormatter(GPULabelFormatter): LABEL_KEY = 'nvidia.com/gpu.product' @classmethod - def get_label_key(cls) -> str: + def get_label_key(cls, accelerator: str = None) -> str: + del accelerator # Unused return cls.LABEL_KEY + @classmethod + def get_label_keys(cls) -> str: + return 
[cls.LABEL_KEY] + @classmethod def get_label_value(cls, accelerator: str) -> str: """An accelerator can map to many Nvidia GFD labels @@ -528,7 +554,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type) assert formatter is not None, ('Unsupported autoscaler type:' f' {autoscaler_type}') - return formatter.get_label_key(), formatter.get_label_value(acc_type) + return formatter.get_label_key(acc_type), formatter.get_label_value(acc_type) has_gpus, cluster_resources = detect_gpu_resource() if has_gpus: @@ -540,7 +566,10 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # detected, raise error with ux_utils.print_exception_no_traceback(): supported_formats = ', '.join( - [f.get_label_key() for f in LABEL_FORMATTER_REGISTRY]) + key + for f in LABEL_FORMATTER_REGISTRY + for key in f.get_label_keys() + ) suffix = '' if env_options.Options.SHOW_DEBUG_INFO.get(): suffix = f' Found node labels: {node_labels}' From 92f4f382d8a51f7fe5c87c9f133293271eb320f0 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 08:09:06 +0000 Subject: [PATCH 13/63] syntax --- sky/provision/kubernetes/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 38aaa3ce82b..afe4b82c115 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -566,9 +566,9 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # detected, raise error with ux_utils.print_exception_no_traceback(): supported_formats = ', '.join( - key + [key for f in LABEL_FORMATTER_REGISTRY - for key in f.get_label_keys() + for key in f.get_label_keys()] ) suffix = '' if env_options.Options.SHOW_DEBUG_INFO.get(): @@ -592,12 +592,10 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: raise exceptions.ResourcesUnavailableError( f'Node {node_name!r} in Kubernetes cluster has ' f'invalid GPU label: {label}={value}. {reason}') - if check_mode: # If check mode is enabled and we reached so far, we can # conclude that the cluster is setup correctly and return. return '', '' - # Search in node_labels to see if any node has the requested # GPU type. 
# Note - this only checks if the label is available on a From 2662ec80e5c9b513f5d2700dcd97decb2de0ab18 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 09:07:17 +0000 Subject: [PATCH 14/63] update get_tpu_topology_label_key_value --- sky/clouds/kubernetes.py | 2 +- sky/provision/kubernetes/utils.py | 73 ++++++++++++++++++++----------- 2 files changed, 49 insertions(+), 26 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 8ea5b89afb3..97447f25f88 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -272,7 +272,7 @@ def make_deploy_resources_variables( if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: tpu_requested = True k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_tpu_topology_label_key_value()) + kubernetes_utils.get_tpu_topology_label_key_value(acc_type)) port_mode = network_utils.get_port_mode(None) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index afe4b82c115..3000eb8e829 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -621,9 +621,9 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: suffix = f' Available GPUs on the cluster: {gpus_available}' raise exceptions.ResourcesUnavailableError( 'Could not find any node in the Kubernetes cluster ' - f'with {acc_type} GPU. Please ensure at least ' - f'one node in the cluster has {acc_type} GPU and node ' - 'labels are setup correctly. ' + f'with {acc_type}. Please ensure at least one node in the ' + f'cluster has {acc_type} and node labels are setup ' + 'correctly. ' f'Please refer to the documentation for more. {suffix}') else: # If GPU resources are not detected, raise error @@ -633,31 +633,54 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: suffix = (' Available resources on the cluster: ' f'{cluster_resources}') raise exceptions.ResourcesUnavailableError( - 'Could not detect GPU resources (`nvidia.com/gpu`) in ' - 'Kubernetes cluster. If this cluster contains GPUs, please ' - 'ensure GPU drivers are installed on the node. Check if the ' - 'GPUs are setup correctly by running `kubectl describe nodes` ' - 'and looking for the nvidia.com/gpu resource. ' - 'Please refer to the documentation on how ' - f'to set up GPUs.{suffix}') + 'Could not detect GPU/TPU resources (`nvidia.com/gpu` or ' + '`google.com/tpu`) in Kubernetes cluster. If this cluster ' + 'contains GPUs, please ensure GPU drivers are installed on ' + 'the node. Check if the GPUs are setup correctly by running ' + '`kubectl describe nodes` and looking for the nvidia.com/gpu ' + 'or google.com/tpu resource. Please refer to the documentation' + f'on how to set up GPUs.{suffix}') -def get_tpu_topology_label_key_value(): +def get_tpu_topology_label_key_value(accelerator: str) -> Tuple[str, str]: + """Returns the TPU topology label key and value for given accelerator type. + + Args: + accelerator: The TPU accelerator type required by the task. + + Returns: + A tuple of the TPU topology label key and value. + + Raises: + ResourcesUnavailableError: Can be raised from the following conditions: + - The cluster does not have TPU labels set up correctly. + - The cluster doesn't have any nodes with the specified TPU + accelerator type. + - The TPU topology label is missing for the specified accelerator. 
+ """ label_formatter, node_labels = detect_gpu_label_formatter() - for node_name, label_list in node_labels.items(): - for label, value in label_list: - if label == label_formatter.get_tpu_topology_label_key(): - is_valid, reason = label_formatter.validate_label_value(value) - if not is_valid: - raise exceptions.ResourcesUnavailableError( - f'Node {node_name!r} in Kubernetes cluster has ' - f'invalid GPU label: {label}={value}. {reason}') - - for node_name, label_list in node_labels.items(): - for label, value in label_list: - if label == label_formatter.get_tpu_topology_label_key(): - return label, value - + assert isinstance(label_formatter, GKELabelFormatter) + + tpu_label_key = label_formatter.TPU_LABEL_KEY + tpu_topology_label_key = label_formatter.TPU_TOPOLOGY_LABEL_KEY + + for labels in node_labels.values(): + labels_dict = dict(labels) + if labels_dict.get(tpu_label_key) == accelerator: + topology_value = labels_dict.get(tpu_topology_label_key) + return tpu_topology_label_key, topology_value + + # If TPU labels are not detected, raise error + with ux_utils.print_exception_no_traceback(): + suffix = '' + if env_options.Options.SHOW_DEBUG_INFO.get(): + suffix = (' Available node labels on the cluster: ' + f'{node_labels}') + raise exceptions.ResourcesUnavailableError( + f'Unable to find TPU topology for accelerator {accelerator!r}. ' + f'No node found with label `{tpu_label_key}={accelerator}` ' + f'or missing {tpu_topology_label_key!r} label.{suffix}') + def get_head_ssh_port(cluster_name: str, namespace: str, context: Optional[str]) -> int: From 58f8ad66c02c4e7af3498abfcf3f6ed94eb97186 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Fri, 20 Sep 2024 19:45:15 +0000 Subject: [PATCH 15/63] nit --- sky/resources.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/resources.py b/sky/resources.py index 51c9fe1b2b5..4b47e8e4247 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -568,14 +568,14 @@ def _set_accelerators( # raise an error when cloud is not specified by the user # if self.cloud is None: # self._cloud = clouds.GCP() - assert self.cloud.is_same_cloud( - clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes()), 'Cloud must be GCP or Kubernetes.' + # assert self.cloud.is_same_cloud( + # clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes()), 'Cloud must be GCP or Kubernetes.' if accelerator_args is None: accelerator_args = {} # Doyoung: May need to understand the usage of tpu_vm and make # proper adjustments to the following snippets. 
- if self.cloud.is_same_cloud(clouds.GCP()): + if self.cloud is not None and self.cloud.is_same_cloud(clouds.GCP()): use_tpu_vm = accelerator_args.get('tpu_vm', True) else: use_tpu_vm = False From 1cf82b647b703731d01f3957128f760fa46f34b0 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Fri, 20 Sep 2024 19:45:40 +0000 Subject: [PATCH 16/63] refactor error surfacing methods to have it work with TPU support --- sky/provision/kubernetes/instance.py | 131 +++++++++++++++++---------- 1 file changed, 82 insertions(+), 49 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 9dd0e2eb62d..7969923272e 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -1,7 +1,7 @@ """Kubernetes instance provisioning.""" import copy import time -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import uuid from sky import exceptions @@ -79,6 +79,74 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]: return {'component': f'{cluster_name}-head'} +def _formatted_resource_requirements(pod_or_spec: Union['Pod', dict]): + # Returns a formatted string of resource requirements for a pod. + resource_requirements = {} + + if isinstance(pod_or_spec, dict): + containers = pod_or_spec.get('spec', {}).get('containers', []) + else: + containers = pod_or_spec.spec.containers + + for container in containers: + if isinstance(container, dict): + resources = container.get('resources', {}) + requests = resources.get('requests', {}) + else: + resources = container.resources + requests = resources.requests or {} + + for resource, value in requests.items(): + if resource not in resource_requirements: + resource_requirements[resource] = 0 + if resource == 'memory': + int_value = kubernetes_utils.parse_memory_resource(value) + else: + int_value = kubernetes_utils.parse_cpu_or_gpu_resource( + value) + resource_requirements[resource] += int_value + return ', '.join(f'{resource}={value}' + for resource, value in resource_requirements.items()) + + +def _formatted_node_selector(pod_or_spec: Union['Pod', dict]) -> Optional[str]: + # Returns a formatted string of node selectors for a pod. + node_selectors = [] + + if isinstance(pod_or_spec, dict): + selectors = pod_or_spec.get('spec', {}).get('nodeSelector', {}) + else: + selectors = pod_or_spec.spec.node_selector + + if not selectors: + return None + + for label_key, label_value in selectors.items(): + node_selectors.append(f'{label_key}={label_value}') + return ', '.join(node_selectors) + + +def _lack_resource_msg(resource: str, + pod_or_spec: Union['Pod', dict], + extra_msg: Optional[str] = None, + details: Optional[str] = None) -> str: + resource_requirements = _formatted_resource_requirements(pod_or_spec) + node_selectors = _formatted_node_selector(pod_or_spec) + node_selector_str = f' and labels ({node_selectors})' if ( + node_selectors) else '' + msg = ( + f'Insufficient {resource} capacity on the cluster. ' + f'Required resources ({resource_requirements}){node_selector_str} ' + 'were not found in a single node. Other SkyPilot tasks or pods may ' + 'be using resources. Check resource usage by running ' + '`kubectl describe nodes`.') + if extra_msg: + msg += f' {extra_msg}' + if details: + msg += f'\nFull error: {details}' + return msg + + def _raise_pod_scheduling_errors(namespace, context, new_nodes): """Raise pod scheduling failure reason. 
@@ -86,52 +154,6 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes): are recorded as events. This function retrieves those events and raises descriptive errors for better debugging and user feedback. """ - - def _formatted_resource_requirements(pod): - # Returns a formatted string of resource requirements for a pod. - resource_requirements = {} - for container in pod.spec.containers: - for resource, value in container.resources.requests.items(): - if resource not in resource_requirements: - resource_requirements[resource] = 0 - if resource == 'memory': - int_value = kubernetes_utils.parse_memory_resource(value) - else: - int_value = kubernetes_utils.parse_cpu_or_gpu_resource( - value) - resource_requirements[resource] += int_value - return ', '.join(f'{resource}={value}' - for resource, value in resource_requirements.items()) - - def _formatted_node_selector(pod) -> Optional[str]: - # Returns a formatted string of node selectors for a pod. - node_selectors = [] - if pod.spec.node_selector is None: - return None - for label_key, label_value in pod.spec.node_selector.items(): - node_selectors.append(f'{label_key}={label_value}') - return ', '.join(node_selectors) - - def _lack_resource_msg(resource: str, - pod, - extra_msg: Optional[str] = None, - details: Optional[str] = None) -> str: - resource_requirements = _formatted_resource_requirements(pod) - node_selectors = _formatted_node_selector(pod) - node_selector_str = f' and labels ({node_selectors})' if ( - node_selectors) else '' - msg = ( - f'Insufficient {resource} capacity on the cluster. ' - f'Required resources ({resource_requirements}){node_selector_str} ' - 'were not found in a single node. Other SkyPilot tasks or pods may ' - 'be using resources. Check resource usage by running ' - '`kubectl describe nodes`.') - if extra_msg: - msg += f' {extra_msg}' - if details: - msg += f'\nFull error: {details}' - return msg - for new_node in new_nodes: pod = kubernetes.core_api(context).read_namespaced_pod( new_node.metadata.name, namespace) @@ -589,8 +611,19 @@ def _create_pods(region: str, cluster_name_on_cloud: str, } pod_spec['spec']['tolerations'] = [tpu_toleration] - pod = kubernetes.core_api(context).create_namespaced_pod( - namespace, pod_spec) + try: + pod = kubernetes.core_api(context).create_namespaced_pod( + namespace, pod_spec) + except kubernetes.api_exception() as e: + error_msg = str(e) + if 'Invalid resource requests for google.com/tpu.' 
in error_msg:
+                extra_msg = ('Verify if the cluster has a TPU slice with a '
+                             'topology matching the number of TPU(s) '
+                             'requested.')
+                raise config_lib.KubernetesError(
+                    _lack_resource_msg('TPU', pod_spec, details=error_msg, extra_msg=extra_msg)
+                )
+            raise
         created_pods[pod.metadata.name] = pod
         if head_pod_name is None:
             head_pod_name = pod.metadata.name

From 7b551c94c66b44ae3e2779b398d6db68901b9429 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 21 Sep 2024 03:22:42 +0000
Subject: [PATCH 17/63] update toleration comment

---
 sky/provision/kubernetes/instance.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index 7969923272e..b1d24c5765a 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -600,9 +600,13 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             }
         }
 
-        # Doyoung: Add comments on what kind of taint is added to TPU nodes by GCP(google.com/tpu=present:NoSchedule)
-        # and explain what those are for. And explain why we need toleration at this point to ignore that taint.
-        if kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY in config.node_config['spec']['nodeSelector']:
+        # TPU slice nodes are given a taint, google.com/tpu=present:NoSchedule.
+        # This is to prevent non-TPU workloads from being scheduled on TPU
+        # slice nodes. We need this toleration to allow the pod to be scheduled
+        # on TPU nodes.
+        # Reference: https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work # pylint: disable=line-too-long
+        tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY
+        if tpu_label in config.node_config['spec']['nodeSelector']:
             tpu_toleration = {
                 'key': 'google.com/tpu',
                 'operator': 'Equal',
@@ -617,12 +621,14 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
         try:
             pod = kubernetes.core_api(context).create_namespaced_pod(
                 namespace, pod_spec)
         except kubernetes.api_exception() as e:
             error_msg = str(e)
             if 'Invalid resource requests for google.com/tpu.' 
in error_msg: - extra_msg = ('Verify if the cluster has a TPU slice with a ' - 'topology matching the number of TPU(s) ' + extra_msg = ('Verify if the cluster has a TPU slice node with ' + 'a topology matching the number of TPU(s) ' 'requested.') raise config_lib.KubernetesError( - _lack_resource_msg('TPU', pod_spec, details=error_msg, extra_msg=extra_msg) - ) + _lack_resource_msg('TPU', + pod_spec, + details=error_msg, + extra_msg=extra_msg)) raise created_pods[pod.metadata.name] = pod if head_pod_name is None: From 81a05ee6eef5eba2b98b1f4cf8c7b9d8a4355e6d Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sat, 21 Sep 2024 04:38:07 +0000 Subject: [PATCH 18/63] support listing available TPUs and show-gpus for TPUs --- sky/cli.py | 4 +- .../service_catalog/kubernetes_catalog.py | 111 ++++++++++-------- sky/provision/kubernetes/utils.py | 73 +++++++----- 3 files changed, 104 insertions(+), 84 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index eb0267f7ced..a4199dae68b 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3086,8 +3086,8 @@ def _get_kubernetes_node_info_table(): for node_name, node_info in node_info_dict.items(): node_table.add_row([ node_name, node_info.gpu_type, - node_info.total['nvidia.com/gpu'], - node_info.free['nvidia.com/gpu'] + node_info.total['accelerator_count'], + node_info.free['accelerators_available'] ]) return node_table diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 9365d693cbd..1534c4c39a5 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -84,7 +84,7 @@ def list_accelerators_realtime( return {}, {}, {} accelerators_qtys: Set[Tuple[str, int]] = set() - key = label_formatter.get_label_key() + keys = label_formatter.get_label_keys() nodes = kubernetes_utils.get_kubernetes_nodes() # Get the pods to get the real-time GPU usage pods = kubernetes_utils.get_kubernetes_pods() @@ -95,56 +95,65 @@ def list_accelerators_realtime( min_quantity_filter = quantity_filter if quantity_filter else 1 for node in nodes: - if key in node.metadata.labels: - allocated_qty = 0 - accelerator_name = label_formatter.get_accelerator_from_label_value( - node.metadata.labels.get(key)) - - # Check if name_filter regex matches the accelerator_name - regex_flags = 0 if case_sensitive else re.IGNORECASE - if name_filter and not re.match( - name_filter, accelerator_name, flags=regex_flags): - continue - - accelerator_count = int( - node.status.allocatable.get('nvidia.com/gpu', 0)) - - # Generate the GPU quantities for the accelerators - if accelerator_name and accelerator_count > 0: - for count in range(1, accelerator_count + 1): - accelerators_qtys.add((accelerator_name, count)) - - for pod in pods: - # Get all the pods running on the node - if (pod.spec.node_name == node.metadata.name and - pod.status.phase in ['Running', 'Pending']): - # Iterate over all the containers in the pod and sum the - # GPU requests - for container in pod.spec.containers: - if container.resources.requests: - allocated_qty += int( - container.resources.requests.get( - 'nvidia.com/gpu', 0)) - - accelerators_available = accelerator_count - allocated_qty - - if accelerator_count >= min_quantity_filter: - quantized_count = (min_quantity_filter * - (accelerator_count // min_quantity_filter)) - if accelerator_name not in total_accelerators_capacity: - total_accelerators_capacity[ - accelerator_name] = quantized_count - else: - total_accelerators_capacity[ - accelerator_name] += 
quantized_count - - if accelerator_name not in total_accelerators_available: - total_accelerators_available[accelerator_name] = 0 - if accelerators_available >= min_quantity_filter: - quantized_availability = min_quantity_filter * ( - accelerators_available // min_quantity_filter) - total_accelerators_available[ - accelerator_name] += quantized_availability + for key in keys: + if key in node.metadata.labels: + allocated_qty = 0 + accelerator_name = label_formatter.get_accelerator_from_label_value( + node.metadata.labels.get(key)) + + # Check if name_filter regex matches the accelerator_name + regex_flags = 0 if case_sensitive else re.IGNORECASE + if name_filter and not re.match( + name_filter, accelerator_name, flags=regex_flags): + continue + + accelerator_count = 0 + if 'nvidia.com/gpu' in node.status.allocatable: + accelerator_count = int( + node.status.allocatable['nvidia.com/gpu']) + elif 'google.com/tpu' in node.status.allocatable: + accelerator_count = int( + node.status.allocatable['google.com/tpu']) + + # Generate the GPU quantities for the accelerators + if accelerator_name and accelerator_count > 0: + for count in range(1, accelerator_count + 1): + accelerators_qtys.add((accelerator_name, count)) + + for pod in pods: + # Get all the pods running on the node + if (pod.spec.node_name == node.metadata.name and + pod.status.phase in ['Running', 'Pending']): + # Iterate over all the containers in the pod and sum the + # GPU requests + for container in pod.spec.containers: + if container.resources.requests: + allocated_qty += int( + container.resources.requests.get( + 'nvidia.com/gpu', 0)) + allocated_qty += int( + container.resources.requests.get( + 'google.com/tpu', 0)) + + accelerators_available = accelerator_count - allocated_qty + + if accelerator_count >= min_quantity_filter: + quantized_count = (min_quantity_filter * + (accelerator_count // min_quantity_filter)) + if accelerator_name not in total_accelerators_capacity: + total_accelerators_capacity[ + accelerator_name] = quantized_count + else: + total_accelerators_capacity[ + accelerator_name] += quantized_count + + if accelerator_name not in total_accelerators_available: + total_accelerators_available[accelerator_name] = 0 + if accelerators_available >= min_quantity_filter: + quantized_availability = min_quantity_filter * ( + accelerators_available // min_quantity_filter) + total_accelerators_available[ + accelerator_name] += quantized_availability result = [] diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 3000eb8e829..18642cdf667 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -1812,40 +1812,51 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: if not label_formatter: label_key = None else: - label_key = label_formatter.get_label_key() + label_keys = label_formatter.get_label_keys() node_info_dict: Dict[str, KubernetesNodeInfo] = {} - for node in nodes: - allocated_qty = 0 - if label_formatter is not None and label_key in node.metadata.labels: - accelerator_name = label_formatter.get_accelerator_from_label_value( - node.metadata.labels.get(label_key)) - else: - accelerator_name = None - - accelerator_count = int(node.status.allocatable.get( - 'nvidia.com/gpu', 0)) - - for pod in pods: - # Get all the pods running on the node - if (pod.spec.node_name == node.metadata.name and - pod.status.phase in ['Running', 'Pending']): - # Iterate over all the containers in the pod and sum the - # GPU requests - for container in 
pod.spec.containers: - if container.resources.requests: - allocated_qty += int( - container.resources.requests.get( - 'nvidia.com/gpu', 0)) - - accelerators_available = accelerator_count - allocated_qty - - node_info_dict[node.metadata.name] = KubernetesNodeInfo( - name=node.metadata.name, - gpu_type=accelerator_name, - total={'nvidia.com/gpu': int(accelerator_count)}, - free={'nvidia.com/gpu': int(accelerators_available)}) + for label_key in label_keys: + for node in nodes: + allocated_qty = 0 + if label_formatter is not None and label_key in node.metadata.labels: + accelerator_name = label_formatter.get_accelerator_from_label_value( + node.metadata.labels.get(label_key)) + else: + accelerator_name = None + + accelerator_count = 0 + if 'nvidia.com/gpu' in node.status.allocatable: + accelerator_count = int( + node.status.allocatable['nvidia.com/gpu']) + elif 'google.com/tpu' in node.status.allocatable: + accelerator_count = int( + node.status.allocatable['google.com/tpu']) + + for pod in pods: + # Get all the pods running on the node + if (pod.spec.node_name == node.metadata.name and + pod.status.phase in ['Running', 'Pending']): + # Iterate over all the containers in the pod and sum the + # GPU requests + for container in pod.spec.containers: + if container.resources.requests: + if 'nvidia.com/gpu' in container.resources.requests: + allocated_qty += int( + container.resources.requests.get( + 'nvidia.com/gpu', 0)) + elif 'google.com/tpu' in container.resources.requests: + allocated_qty += int( + container.resources.requests.get( + 'google.com/tpu', 0)) + + accelerators_available = accelerator_count - allocated_qty + + node_info_dict[node.metadata.name] = KubernetesNodeInfo( + name=node.metadata.name, + gpu_type=accelerator_name, + total={'accelerator_count': int(accelerator_count)}, + free={'accelerators_available': int(accelerators_available)}) return node_info_dict From e8764f1c9493905aa618668a90080cb759e0dc48 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sat, 21 Sep 2024 22:01:13 +0000 Subject: [PATCH 19/63] nit --- sky/provision/kubernetes/instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index b1d24c5765a..e6da6db4c45 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -606,7 +606,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str, # on TPU nodes. 
# Reference: https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work # pylint: disable=line-too-long tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY - if tpu_label in config.node_config['spec']['nodeSelector']: + if tpu_label in config.node_config.get('spec', {}).get('nodeSelector', {}): tpu_toleration = { 'key': 'google.com/tpu', 'operator': 'Equal', From 3497aee95a6d2dbecd559f6b06ca7cccaa4b3093 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sat, 21 Sep 2024 22:14:29 +0000 Subject: [PATCH 20/63] update help message --- sky/provision/kubernetes/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 6aa6400dfa1..3d0c2f57f8e 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -38,10 +38,11 @@ 'T': 2**40, 'P': 2**50, } -NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure ' - 'nvidia.com/gpu resource is available on the nodes and ' - 'the node labels for identifying GPUs ' - '(e.g., skypilot.co/accelerator) are setup correctly. ') +NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs or TPUs, make sure ' + 'nvidia.com/gpu or google.com/tpu resource is available' + ' on the nodes and the node labels for identifying ' + 'GPUs/TPUs (e.g., skypilot.co/accelerator) are setup ' + 'correctly. ') KUBERNETES_AUTOSCALER_NOTE = ( 'Note: Kubernetes cluster autoscaling is enabled. ' From 724806a5814b9afba048b396fabba5141863a303 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 22 Sep 2024 00:08:43 +0000 Subject: [PATCH 21/63] Update /tmp/tpu_logs dir's write permission --- sky/templates/kubernetes-ray.yml.j2 | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index d219bc6670e..be288cfc918 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -441,6 +441,16 @@ setup_commands: sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`; + {% if tpu_requested %} + # The /tmp/tpu_logs directory is where TPU-related logs, specifically logs + # from the TPU runtime (such as the TPU driver), are written. These logs + # capture important runtime information about the TPU execution, including + # any warnings, errors, or general activity of the TPU driver. + # By default, the /tmp/tpu_logs directory is created with 755 permissions, + # and user of the provisioned pod is not necessarily a root. Hence, we need + # to update the write permission so the logs can be properly written. 
+ - sudo chmod 777 /tmp/tpu_logs; + {% endif %} # Format: `REMOTE_PATH : LOCAL_PATH` file_mounts: { From e8d73feffc046e9bfae853d6b6d003462ec6b8d7 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 22 Sep 2024 00:10:21 +0000 Subject: [PATCH 22/63] nit --- sky/templates/kubernetes-ray.yml.j2 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index be288cfc918..186dbe09d3c 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -442,13 +442,13 @@ setup_commands: mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`; {% if tpu_requested %} - # The /tmp/tpu_logs directory is where TPU-related logs, specifically logs - # from the TPU runtime (such as the TPU driver), are written. These logs - # capture important runtime information about the TPU execution, including - # any warnings, errors, or general activity of the TPU driver. + # The /tmp/tpu_logs directory is where TPU-related logs, such as logs from + # the TPU runtime (such as the TPU driver), are written. These logs capture + # important runtime information about the TPU execution, including any + # warnings, errors, or general activity of the TPU driver. # By default, the /tmp/tpu_logs directory is created with 755 permissions, - # and user of the provisioned pod is not necessarily a root. Hence, we need - # to update the write permission so the logs can be properly written. + # and the user of the provisioned pod is not necessarily a root. Hence, we + # need to update the write permission so the logs can be properly written. - sudo chmod 777 /tmp/tpu_logs; {% endif %} From 7ac503673287c1b9f1759fe9756df07db6464676 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 22 Sep 2024 00:16:38 +0000 Subject: [PATCH 23/63] nit --- sky/templates/kubernetes-ray.yml.j2 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 186dbe09d3c..79c0cb2a77e 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -443,12 +443,12 @@ setup_commands: [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`; {% if tpu_requested %} # The /tmp/tpu_logs directory is where TPU-related logs, such as logs from - # the TPU runtime (such as the TPU driver), are written. These logs capture - # important runtime information about the TPU execution, including any - # warnings, errors, or general activity of the TPU driver. - # By default, the /tmp/tpu_logs directory is created with 755 permissions, - # and the user of the provisioned pod is not necessarily a root. Hence, we - # need to update the write permission so the logs can be properly written. + # the TPU runtime, are written. These capture runtime information about the + # TPU execution, including any warnings, errors, or general activity of + # the TPU driver. 
By default, the /tmp/tpu_logs directory is created with
+      # 755 permissions, and the user of the provisioned pod is not necessarily
+      # a root. Hence, we need to update the write permission so the logs can be
+      # properly written.
       - sudo chmod 777 /tmp/tpu_logs;
     {% endif %}

From 4470dbeccb9e68ba336973592d2c76f110a650b6 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sun, 22 Sep 2024 00:18:45 +0000
Subject: [PATCH 24/63] comment update on TPU resource shortage error handling

---
 sky/provision/kubernetes/instance.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index e6da6db4c45..73b3147f98d 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -620,6 +620,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 namespace, pod_spec)
         except kubernetes.api_exception() as e:
             error_msg = str(e)
+            # Unlike other errors from a shortage of CPU/GPU/memory resources,
+            # the TPU shortage error is raised when pod creation is attempted.
             if 'Invalid resource requests for google.com/tpu.' in error_msg:

From 0860e4596e38f475ab4e1df61bf528566ae1d43d Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sun, 22 Sep 2024 00:45:33 +0000
Subject: [PATCH 25/63] Update to use global constant instead of hard coded
 string of nvidia.com/gpu and google.com/tpu

---
 .../service_catalog/kubernetes_catalog.py | 12 +++---
 sky/provision/kubernetes/instance.py      |  4 +-
 sky/provision/kubernetes/utils.py         | 39 +++++++++++--------
 sky/utils/kubernetes/gpu_labeler.py       |  4 +-
 4 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py
index 1534c4c39a5..c0ac2e16472 100644
--- a/sky/clouds/service_catalog/kubernetes_catalog.py
+++ b/sky/clouds/service_catalog/kubernetes_catalog.py
@@ -108,12 +108,12 @@ def list_accelerators_realtime(
                 continue

             accelerator_count = 0
-            if 'nvidia.com/gpu' in node.status.allocatable:
+            if kubernetes_utils.GPU_RESOURCE_KEY in node.status.allocatable:
                 accelerator_count = int(
-                    node.status.allocatable['nvidia.com/gpu'])
-            elif 'google.com/tpu' in node.status.allocatable:
+                    node.status.allocatable[kubernetes_utils.GPU_RESOURCE_KEY])
+            elif kubernetes_utils.TPU_RESOURCE_KEY in node.status.allocatable:
                 accelerator_count = int(
-                    node.status.allocatable['google.com/tpu'])
+                    node.status.allocatable[kubernetes_utils.TPU_RESOURCE_KEY])

             # Generate the GPU quantities for the accelerators
             if accelerator_name and accelerator_count > 0:
@@ -130,10 +130,10 @@ def list_accelerators_realtime(
                         if container.resources.requests:
                             allocated_qty += int(
                                 container.resources.requests.get(
-                                    'nvidia.com/gpu', 0))
+                                    kubernetes_utils.GPU_RESOURCE_KEY, 0))
                             allocated_qty += int(
                                 container.resources.requests.get(
-                                    'google.com/tpu', 0))
+                                    kubernetes_utils.TPU_RESOURCE_KEY, 0))

             accelerators_available = accelerator_count - allocated_qty

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index 73b3147f98d..5c7d65f0d71 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -555,7 +555,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html')  # pylint: disable=line-too-long

     needs_gpus = (pod_spec['spec']['containers'][0].get('resources',
{}).get( - 'limits', {}).get('nvidia.com/gpu', 0) > 0) + 'limits', {}).get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0) if nvidia_runtime_exists and needs_gpus: pod_spec['spec']['runtimeClassName'] = 'nvidia' @@ -608,7 +608,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str, tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY if tpu_label in config.node_config.get('spec', {}).get('nodeSelector', {}): tpu_toleration = { - 'key': 'google.com/tpu', + 'key': kubernetes_utils.TPU_RESOURCE_KEY, 'operator': 'Equal', 'value': 'present', 'effect': 'NoSchedule' diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 18642cdf667..d5bc586cde5 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -38,8 +38,15 @@ 'T': 2**40, 'P': 2**50, } + +# The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on +# nodes. These keys are typically used in the node's status.allocatable +# or status.capacity fields to indicate the available resources on the node. +GPU_RESOURCE_KEY = 'nvidia.com/gpu' +TPU_RESOURCE_KEY = 'google.com/tpu' + NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure ' - 'nvidia.com/gpu resource is available on the nodes and ' + f'{GPU_RESOURCE_KEY} resource is available on the nodes and ' 'the node labels for identifying GPUs ' '(e.g., skypilot.co/accelerator) are setup correctly. ') @@ -388,7 +395,7 @@ def detect_gpu_resource() -> Tuple[bool, Set[str]]: GPUs or the nvidia GPU operator and/or device drivers are not installed. Returns: - bool: True if the cluster has nvidia.com/gpu or google.com/tpu + bool: True if the cluster has GPU_RESOURCE_KEY or TPU_RESOURCE_KEY resource, False otherwise. """ # Get the set of resources across all nodes @@ -396,8 +403,8 @@ def detect_gpu_resource() -> Tuple[bool, Set[str]]: nodes = get_kubernetes_nodes() for node in nodes: cluster_resources.update(node.status.allocatable.keys()) - has_gpu = ('nvidia.com/gpu' in cluster_resources or - 'google.com/tpu' in cluster_resources) + has_gpu = (GPU_RESOURCE_KEY in cluster_resources or + TPU_RESOURCE_KEY in cluster_resources) return has_gpu, cluster_resources @@ -633,12 +640,12 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: suffix = (' Available resources on the cluster: ' f'{cluster_resources}') raise exceptions.ResourcesUnavailableError( - 'Could not detect GPU/TPU resources (`nvidia.com/gpu` or ' - '`google.com/tpu`) in Kubernetes cluster. If this cluster ' + f'Could not detect GPU/TPU resources (`{GPU_RESOURCE_KEY}` or ' + f'`{TPU_RESOURCE_KEY}`) in Kubernetes cluster. If this cluster ' 'contains GPUs, please ensure GPU drivers are installed on ' 'the node. Check if the GPUs are setup correctly by running ' - '`kubectl describe nodes` and looking for the nvidia.com/gpu ' - 'or google.com/tpu resource. Please refer to the documentation' + f'`kubectl describe nodes` and looking for the {GPU_RESOURCE_KEY!r} ' + f'or {TPU_RESOURCE_KEY!r} resource. 
Please refer to the documentation' f'on how to set up GPUs.{suffix}') @@ -1826,12 +1833,12 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: accelerator_name = None accelerator_count = 0 - if 'nvidia.com/gpu' in node.status.allocatable: + if GPU_RESOURCE_KEY in node.status.allocatable: accelerator_count = int( - node.status.allocatable['nvidia.com/gpu']) - elif 'google.com/tpu' in node.status.allocatable: + node.status.allocatable[GPU_RESOURCE_KEY]) + elif TPU_RESOURCE_KEY in node.status.allocatable: accelerator_count = int( - node.status.allocatable['google.com/tpu']) + node.status.allocatable[TPU_RESOURCE_KEY]) for pod in pods: # Get all the pods running on the node @@ -1841,14 +1848,14 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: # GPU requests for container in pod.spec.containers: if container.resources.requests: - if 'nvidia.com/gpu' in container.resources.requests: + if GPU_RESOURCE_KEY in container.resources.requests: allocated_qty += int( container.resources.requests.get( - 'nvidia.com/gpu', 0)) - elif 'google.com/tpu' in container.resources.requests: + GPU_RESOURCE_KEY, 0)) + elif TPU_RESOURCE_KEY in container.resources.requests: allocated_qty += int( container.resources.requests.get( - 'google.com/tpu', 0)) + TPU_RESOURCE_KEY, 0)) accelerators_available = accelerator_count - allocated_qty diff --git a/sky/utils/kubernetes/gpu_labeler.py b/sky/utils/kubernetes/gpu_labeler.py index b00bd4f21ae..14fbbdedca5 100644 --- a/sky/utils/kubernetes/gpu_labeler.py +++ b/sky/utils/kubernetes/gpu_labeler.py @@ -101,7 +101,7 @@ def label(): # Get the list of nodes with GPUs gpu_nodes = [] for node in nodes: - if 'nvidia.com/gpu' in node.status.capacity: + if kubernetes_utils.GPU_RESOURCE_KEY in node.status.capacity: gpu_nodes.append(node) print(f'Found {len(gpu_nodes)} GPU nodes in the cluster') @@ -142,7 +142,7 @@ def label(): if len(gpu_nodes) == 0: print('No GPU nodes found in the cluster. If you have GPU nodes, ' 'please ensure that they have the label ' - '`nvidia.com/gpu: `') + f'`{kubernetes_utils.GPU_RESOURCE_KEY}: `') else: print('GPU labeling started - this may take 10 min or more to complete.' 
'\nTo check the status of GPU labeling jobs, run ' From 35f3c80106d7eeaef6feda47075b1b90a10887a3 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Mon, 23 Sep 2024 02:57:03 +0000 Subject: [PATCH 26/63] add smoke test and make exec work on TPU pods --- sky/backends/cloud_vm_ray_backend.py | 3 +-- sky/clouds/utils/gcp_utils.py | 9 +++++++ sky/resources.py | 36 ++++++++++++++-------------- tests/test_smoke.py | 16 +++++++++++++ 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 191a09438aa..5c6e5129020 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2484,8 +2484,7 @@ def head_ssh_port(self): @property def num_ips_per_node(self) -> int: """Returns number of IPs per node in the cluster, handling TPU Pod.""" - is_tpu_vm_pod = gcp_utils.is_tpu_vm_pod(self.launched_resources) - if is_tpu_vm_pod: + if gcp_utils.is_tpu_vm_pod(self.launched_resources): num_ips = gcp_utils.get_num_tpu_devices(self.launched_resources) else: num_ips = 1 diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py index 68e6192d351..f1ee93c7979 100644 --- a/sky/clouds/utils/gcp_utils.py +++ b/sky/clouds/utils/gcp_utils.py @@ -31,10 +31,19 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool: acc, _ = list(resources.accelerators.items())[0] return acc.startswith('tpu') +def is_tpu_pod_slice(resources: Optional['resources_lib.Resources']) -> bool: + if not is_tpu(resources): + return False + assert resources is not None + acc, _ = list(resources.accelerators.items())[0] + # Reference on Accelerator names for TPU Pod slices: https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#workload_preparation # pylint: disable=line-too-long + return acc.endswith('-podslice') or acc.endswith('-device') def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool: if not is_tpu(resources): return False + elif is_tpu_pod_slice(resources): + return False assert resources is not None if resources.accelerator_args is None: return True diff --git a/sky/resources.py b/sky/resources.py index 4b47e8e4247..137a8e731f4 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -564,26 +564,26 @@ def _set_accelerators( acc, _ = list(accelerators.items())[0] if 'tpu' in acc.lower(): - # Doyoung: Confirm if below two lines can be removed. Perhaps, - # raise an error when cloud is not specified by the user - # if self.cloud is None: - # self._cloud = clouds.GCP() - # assert self.cloud.is_same_cloud( - # clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes()), 'Cloud must be GCP or Kubernetes.' + if self.cloud is None: + if acc.endswith('-podslice') or acc.endswith('-device'): + self._cloud = clouds.Kubernetes() + else: + self._cloud = clouds.GCP() + assert (self.cloud.is_same_cloud( + clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes())), 'Cloud must be GCP or Kubernetes.' + if accelerator_args is None: accelerator_args = {} - - # Doyoung: May need to understand the usage of tpu_vm and make - # proper adjustments to the following snippets. - if self.cloud is not None and self.cloud.is_same_cloud(clouds.GCP()): - use_tpu_vm = accelerator_args.get('tpu_vm', True) - else: - use_tpu_vm = False - - #### - # use_tpu_vm = accelerator_args.get('tpu_vm', True) - #### - if self.instance_type is not None and use_tpu_vm: + + # Supported TPU Podslice versions on GKE are v4 <= and those + # versions default the architecture to be TPU-VM. 
+ # Reference: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu_architectures + use_tpu_vm = True + if self.cloud.is_same_cloud(clouds.GCP()): + use_tpu_vm = accelerator_args.get('tpu_vm', True) + + if self.cloud.is_same_cloud( + clouds.GCP()) and self.instance_type is not None and use_tpu_vm: if self.instance_type != 'TPU-VM': with ux_utils.print_exception_no_traceback(): raise ValueError( diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 3b2bba72e8a..6df14a27dad 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1890,6 +1890,22 @@ def test_tpu_vm_pod(): run_one_test(test) +# ---------- TPU Pod Slice on GKE. ---------- +@pytest.mark.kubernetes +def test_tpu_pod_slice_gke(): + name = _get_cluster_name() + test = Test( + 'tpu_pod_slice_gke', + [ + f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', + f'sky logs {name} 1', # Ensure the job finished. + f'sky logs {name} 1 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + timeout=30 * 60, # can take 30 mins + ) + run_one_test(test) + # ---------- Simple apps. ---------- @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_multi_hostname(generic_cloud: str): From 2b56a9eab6153b59e0413188324a02bd64c6e3af Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 24 Sep 2024 02:34:15 +0000 Subject: [PATCH 27/63] update smoke test to check if TPU is reachable. --- tests/test_smoke.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 6df14a27dad..84224ef8bdb 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1900,6 +1900,8 @@ def test_tpu_pod_slice_gke(): f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', f'sky logs {name} 1', # Ensure the job finished. f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu | exit 1;"', # Ensure TPU is reachable. + f'sky logs {name} 2 --status' ], f'sky down -y {name}', timeout=30 * 60, # can take 30 mins From 305705cbd575bd3ddbaf8ab1d804a35e12ee2e21 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 24 Sep 2024 02:38:24 +0000 Subject: [PATCH 28/63] add comment --- sky/resources.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/resources.py b/sky/resources.py index 137a8e731f4..74de19e1150 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -565,6 +565,7 @@ def _set_accelerators( acc, _ = list(accelerators.items())[0] if 'tpu' in acc.lower(): if self.cloud is None: + # Reference on names of TPU Pod slices available on GKE: https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#workload_preparation # pylint: disable=line-too-long if acc.endswith('-podslice') or acc.endswith('-device'): self._cloud = clouds.Kubernetes() else: From c2b5bfcc1a5135cb7e3de50832f5d390107bd594 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 24 Sep 2024 02:49:25 +0000 Subject: [PATCH 29/63] nit --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 84224ef8bdb..5f13b665120 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1900,7 +1900,7 @@ def test_tpu_pod_slice_gke(): f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', f'sky logs {name} 1', # Ensure the job finished. 
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
-            f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu | exit 1;"', # Ensure TPU is reachable.
+            f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"', # Ensure TPU is reachable.
             f'sky logs {name} 2 --status'
         ],
         f'sky down -y {name}',

From 2ba5537d0a0742fbce1c571ff24080f189ce1661 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Tue, 24 Sep 2024 05:55:02 +0000
Subject: [PATCH 30/63] Comment on number of requested TPU chips for multi-
 and single-host TPU slices.

---
 sky/templates/kubernetes-ray.yml.j2 | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2
index 79c0cb2a77e..fb5576f6f0a 100644
--- a/sky/templates/kubernetes-ray.yml.j2
+++ b/sky/templates/kubernetes-ray.yml.j2
@@ -402,6 +402,11 @@ available_node_types:
                 cpu: {{cpus}}
                 memory: {{memory}}G
                 {% if tpu_requested %}
+                # The number of google.com/tpu chips requested must equal the
+                # total number of TPU chips available on the TPU slice node,
+                # whether the node belongs to a multi-host or a single-host
+                # TPU slice. Example reference:
+                # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
                 google.com/tpu: {{accelerator_count}}
                 {% else %}
                 nvidia.com/gpu: {{accelerator_count}}

From 92cd77d3f535927825c33ff93db32acdd7383991 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Tue, 24 Sep 2024 06:05:49 +0000
Subject: [PATCH 31/63] update method to check GKE supported TPU name

---
 sky/clouds/utils/gcp_utils.py     |  4 +--
 sky/provision/kubernetes/utils.py |  9 ++++++
 sky/resources.py                  | 48 ++++++++++++++---------------
 3 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py
index f1ee93c7979..7ec5c8f3b2b 100644
--- a/sky/clouds/utils/gcp_utils.py
+++ b/sky/clouds/utils/gcp_utils.py
@@ -36,8 +36,7 @@ def is_tpu_pod_slice(resources: Optional['resources_lib.Resources']) -> bool:
         return False
     assert resources is not None
     acc, _ = list(resources.accelerators.items())[0]
-    # Reference on Accelerator names for TPU Pod slices: https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#workload_preparation # pylint: disable=line-too-long
-    return acc.endswith('-podslice') or acc.endswith('-device')
+    return acc in kubernetes_utils.GKE_TPU_ACCELERATOR_TO_GENERATION

 def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool:
     if not is_tpu(resources):

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index d5bc586cde5..90fc3d07735 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -71,6 +71,15 @@
 PORT_FORWARD_PROXY_CMD_PATH = ('~/.sky/kubernetes-port-forward-proxy-command-'
                                f'v{PORT_FORWARD_PROXY_CMD_VERSION}.sh')

+# Mapping used to get generation for TPU accelerator name.
+# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run +GKE_TPU_ACCELERATOR_TO_GENERATION = { + "tpu-v4-podslice": "v4", + "tpu-v5-lite-device": "v5e", + "tpu-v5-lite-podslice": "v5e", + "tpu-v5p-slice": "v5p", +} + logger = sky_logging.init_logger(__name__) diff --git a/sky/resources.py b/sky/resources.py index 74de19e1150..fbff8422c8d 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -14,6 +14,7 @@ from sky import skypilot_config from sky.clouds import service_catalog from sky.provision import docker_utils +from sky.provision.kubernetes import utils as kubernetes_utils from sky.skylet import constants from sky.utils import accelerator_registry from sky.utils import common_utils @@ -565,8 +566,7 @@ def _set_accelerators( acc, _ = list(accelerators.items())[0] if 'tpu' in acc.lower(): if self.cloud is None: - # Reference on names of TPU Pod slices available on GKE: https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#workload_preparation # pylint: disable=line-too-long - if acc.endswith('-podslice') or acc.endswith('-device'): + if acc in kubernetes_utils.GKE_TPU_ACCELERATOR_TO_GENERATION: self._cloud = clouds.Kubernetes() else: self._cloud = clouds.GCP() @@ -576,13 +576,30 @@ def _set_accelerators( if accelerator_args is None: accelerator_args = {} - # Supported TPU Podslice versions on GKE are v4 <= and those - # versions default the architecture to be TPU-VM. - # Reference: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu_architectures - use_tpu_vm = True + use_tpu_vm = accelerator_args.get('tpu_vm', True) if self.cloud.is_same_cloud(clouds.GCP()): - use_tpu_vm = accelerator_args.get('tpu_vm', True) - + if 'runtime_version' not in accelerator_args: + + def _get_default_runtime_version() -> str: + if not use_tpu_vm: + return '2.12.0' + # TPU V5 requires a newer runtime version. + if acc.startswith('tpu-v5'): + return 'v2-alpha-tpuv5' + return 'tpu-vm-base' + + accelerator_args['runtime_version'] = ( + _get_default_runtime_version()) + logger.info( + 'Missing runtime_version in accelerator_args, using' + f' default ({accelerator_args["runtime_version"]})') + + if self.instance_type is not None and use_tpu_vm: + if self.instance_type != 'TPU-VM': + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Cannot specify instance type' + f' (got "{self.instance_type}") for TPU VM.') if self.cloud.is_same_cloud( clouds.GCP()) and self.instance_type is not None and use_tpu_vm: if self.instance_type != 'TPU-VM': @@ -590,21 +607,6 @@ def _set_accelerators( raise ValueError( 'Cannot specify instance type' f' (got "{self.instance_type}") for TPU VM.') - if 'runtime_version' not in accelerator_args: - - def _get_default_runtime_version() -> str: - if not use_tpu_vm: - return '2.12.0' - # TPU V5 requires a newer runtime version. 
- if acc.startswith('tpu-v5'): - return 'v2-alpha-tpuv5' - return 'tpu-vm-base' - - accelerator_args['runtime_version'] = ( - _get_default_runtime_version()) - logger.info( - 'Missing runtime_version in accelerator_args, using' - f' default ({accelerator_args["runtime_version"]})') self._accelerators = accelerators self._accelerator_args = accelerator_args From d085a5b918d4caa343f40891a302dfd4d6da69b1 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 24 Sep 2024 06:08:30 +0000 Subject: [PATCH 32/63] nit --- sky/backends/cloud_vm_ray_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 5c6e5129020..191a09438aa 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2484,7 +2484,8 @@ def head_ssh_port(self): @property def num_ips_per_node(self) -> int: """Returns number of IPs per node in the cluster, handling TPU Pod.""" - if gcp_utils.is_tpu_vm_pod(self.launched_resources): + is_tpu_vm_pod = gcp_utils.is_tpu_vm_pod(self.launched_resources) + if is_tpu_vm_pod: num_ips = gcp_utils.get_num_tpu_devices(self.launched_resources) else: num_ips = 1 From 786067985a33a3f7908e5cc2f363d05e45afe5c9 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 25 Sep 2024 05:34:58 +0000 Subject: [PATCH 33/63] move is_tpu_pod_slice to kubernetes_utils --- sky/clouds/utils/gcp_utils.py | 9 ++------- sky/provision/kubernetes/utils.py | 3 +++ sky/resources.py | 9 +-------- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py index 7ec5c8f3b2b..549eb422015 100644 --- a/sky/clouds/utils/gcp_utils.py +++ b/sky/clouds/utils/gcp_utils.py @@ -32,17 +32,12 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool: acc, _ = list(resources.accelerators.items())[0] return acc.startswith('tpu') -def is_tpu_pod_slice(resources: Optional['resources_lib.Resources']) -> bool: - if not is_tpu(resources): - return False - assert resources is not None - acc, _ = list(resources.accelerators.items())[0] - return acc in kubernetes_utils.GKE_TPU_ACCELERATOR_TO_GENERATION def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool: + acc, _ = list(resources.accelerators.items())[0] if not is_tpu(resources): return False - elif is_tpu_pod_slice(resources): + elif kubernetes_utils.is_tpu_pod_slice(acc): return False assert resources is not None if resources.accelerator_args is None: diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 90fc3d07735..ee8ca7b503d 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -1885,3 +1885,6 @@ def get_namespace_from_config(provider_config: Dict[str, Any]) -> str: def get_context_from_config(provider_config: Dict[str, Any]) -> str: return provider_config.get('context', get_current_kube_config_context_name()) + +def is_tpu_pod_slice(accelerator: str) -> bool: + return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION diff --git a/sky/resources.py b/sky/resources.py index fbff8422c8d..0ea5e4b1aaa 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -566,7 +566,7 @@ def _set_accelerators( acc, _ = list(accelerators.items())[0] if 'tpu' in acc.lower(): if self.cloud is None: - if acc in kubernetes_utils.GKE_TPU_ACCELERATOR_TO_GENERATION: + if kubernetes_utils.is_tpu_pod_slice(acc): self._cloud = clouds.Kubernetes() else: self._cloud = clouds.GCP() @@ -600,13 +600,6 @@ def 
_get_default_runtime_version() -> str: raise ValueError( 'Cannot specify instance type' f' (got "{self.instance_type}") for TPU VM.') - if self.cloud.is_same_cloud( - clouds.GCP()) and self.instance_type is not None and use_tpu_vm: - if self.instance_type != 'TPU-VM': - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Cannot specify instance type' - f' (got "{self.instance_type}") for TPU VM.') self._accelerators = accelerators self._accelerator_args = accelerator_args From 96924a71c6e5eeb551ce06e54debd883e121a3f7 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 25 Sep 2024 05:38:49 +0000 Subject: [PATCH 34/63] update get_accelerator_from_label_value to use is_tpu_pod_slice method --- sky/provision/kubernetes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index ee8ca7b503d..afc984e835a 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -267,7 +267,7 @@ def get_accelerator_from_label_value(cls, value: str) -> str: # to distinguish between a3-high and a3-mega instances return 'H100' return acc - elif value.startswith('tpu-'): + elif is_tpu_pod_slice(value): return value else: raise ValueError( From 1bbac2126d97e37ea24edc5899f879fcbc186d62 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 25 Sep 2024 05:38:57 +0000 Subject: [PATCH 35/63] nit --- sky/provision/kubernetes/instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 5c7d65f0d71..d81296c2b10 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -79,7 +79,7 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]: return {'component': f'{cluster_name}-head'} -def _formatted_resource_requirements(pod_or_spec: Union['Pod', dict]): +def _formatted_resource_requirements(pod_or_spec: Union['Pod', dict]) -> str: # Returns a formatted string of resource requirements for a pod. 
resource_requirements = {} From 4f7ea0354d097ce3f85c10d84b2b4fd5a96b4c8d Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 25 Sep 2024 06:09:19 +0000 Subject: [PATCH 36/63] format --- sky/clouds/kubernetes.py | 3 +- .../service_catalog/kubernetes_catalog.py | 24 ++--- sky/clouds/utils/gcp_utils.py | 2 +- sky/provision/kubernetes/instance.py | 40 ++++---- sky/provision/kubernetes/utils.py | 97 ++++++++++--------- sky/resources.py | 10 +- tests/test_smoke.py | 3 +- 7 files changed, 94 insertions(+), 85 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 97447f25f88..68fbd27b1c1 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -269,7 +269,8 @@ def make_deploy_resources_variables( if acc_count > 0 and acc_type is not None: k8s_acc_label_key, k8s_acc_label_value = \ kubernetes_utils.get_gpu_label_key_value(acc_type) - if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: + if (k8s_acc_label_key == + kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY): tpu_requested = True k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( kubernetes_utils.get_tpu_topology_label_key_value(acc_type)) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index c0ac2e16472..d0d606e5026 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -79,12 +79,12 @@ def list_accelerators_realtime( if not has_gpu: return {}, {}, {} - label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter() - if not label_formatter: + lf, _ = kubernetes_utils.detect_gpu_label_formatter() + if not lf: return {}, {}, {} accelerators_qtys: Set[Tuple[str, int]] = set() - keys = label_formatter.get_label_keys() + keys = lf.get_label_keys() nodes = kubernetes_utils.get_kubernetes_nodes() # Get the pods to get the real-time GPU usage pods = kubernetes_utils.get_kubernetes_pods() @@ -98,7 +98,7 @@ def list_accelerators_realtime( for key in keys: if key in node.metadata.labels: allocated_qty = 0 - accelerator_name = label_formatter.get_accelerator_from_label_value( + accelerator_name = lf.get_accelerator_from_label_value( node.metadata.labels.get(key)) # Check if name_filter regex matches the accelerator_name @@ -109,11 +109,12 @@ def list_accelerators_realtime( accelerator_count = 0 if kubernetes_utils.GPU_RESOURCE_KEY in node.status.allocatable: - accelerator_count = int( - node.status.allocatable[kubernetes_utils.GPU_RESOURCE_KEY]) - elif kubernetes_utils.TPU_RESOURCE_KEY in node.status.allocatable: - accelerator_count = int( - node.status.allocatable[kubernetes_utils.TPU_RESOURCE_KEY]) + accelerator_count = int(node.status.allocatable[ + kubernetes_utils.GPU_RESOURCE_KEY]) + elif (kubernetes_utils.TPU_RESOURCE_KEY + in node.status.allocatable): + accelerator_count = int(node.status.allocatable[ + kubernetes_utils.TPU_RESOURCE_KEY]) # Generate the GPU quantities for the accelerators if accelerator_name and accelerator_count > 0: @@ -138,8 +139,9 @@ def list_accelerators_realtime( accelerators_available = accelerator_count - allocated_qty if accelerator_count >= min_quantity_filter: - quantized_count = (min_quantity_filter * - (accelerator_count // min_quantity_filter)) + quantized_count = ( + min_quantity_filter * + (accelerator_count // min_quantity_filter)) if accelerator_name not in total_accelerators_capacity: total_accelerators_capacity[ accelerator_name] = quantized_count diff --git a/sky/clouds/utils/gcp_utils.py 
b/sky/clouds/utils/gcp_utils.py index 549eb422015..b44c84b122d 100644 --- a/sky/clouds/utils/gcp_utils.py +++ b/sky/clouds/utils/gcp_utils.py @@ -34,12 +34,12 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool: def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool: + assert resources is not None acc, _ = list(resources.accelerators.items())[0] if not is_tpu(resources): return False elif kubernetes_utils.is_tpu_pod_slice(acc): return False - assert resources is not None if resources.accelerator_args is None: return True return resources.accelerator_args.get('tpu_vm', True) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index d81296c2b10..c66bd3f5a20 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -79,10 +79,10 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]: return {'component': f'{cluster_name}-head'} -def _formatted_resource_requirements(pod_or_spec: Union['Pod', dict]) -> str: +def _formatted_resource_requirements(pod_or_spec: Union[Any, dict]) -> str: # Returns a formatted string of resource requirements for a pod. resource_requirements = {} - + if isinstance(pod_or_spec, dict): containers = pod_or_spec.get('spec', {}).get('containers', []) else: @@ -95,21 +95,20 @@ def _formatted_resource_requirements(pod_or_spec: Union['Pod', dict]) -> str: else: resources = container.resources requests = resources.requests or {} - + for resource, value in requests.items(): if resource not in resource_requirements: resource_requirements[resource] = 0 if resource == 'memory': int_value = kubernetes_utils.parse_memory_resource(value) else: - int_value = kubernetes_utils.parse_cpu_or_gpu_resource( - value) - resource_requirements[resource] += int_value + int_value = kubernetes_utils.parse_cpu_or_gpu_resource(value) + resource_requirements[resource] += int(int_value) return ', '.join(f'{resource}={value}' - for resource, value in resource_requirements.items()) + for resource, value in resource_requirements.items()) -def _formatted_node_selector(pod_or_spec: Union['Pod', dict]) -> Optional[str]: +def _formatted_node_selector(pod_or_spec: Union[Any, dict]) -> Optional[str]: # Returns a formatted string of node selectors for a pod. node_selectors = [] @@ -117,7 +116,7 @@ def _formatted_node_selector(pod_or_spec: Union['Pod', dict]) -> Optional[str]: selectors = pod_or_spec.get('spec', {}).get('nodeSelector', {}) else: selectors = pod_or_spec.spec.node_selector - + if not selectors: return None @@ -127,19 +126,18 @@ def _formatted_node_selector(pod_or_spec: Union['Pod', dict]) -> Optional[str]: def _lack_resource_msg(resource: str, - pod_or_spec: Union['Pod', dict], - extra_msg: Optional[str] = None, - details: Optional[str] = None) -> str: + pod_or_spec: Union[Any, dict], + extra_msg: Optional[str] = None, + details: Optional[str] = None) -> str: resource_requirements = _formatted_resource_requirements(pod_or_spec) node_selectors = _formatted_node_selector(pod_or_spec) node_selector_str = f' and labels ({node_selectors})' if ( node_selectors) else '' - msg = ( - f'Insufficient {resource} capacity on the cluster. ' - f'Required resources ({resource_requirements}){node_selector_str} ' - 'were not found in a single node. Other SkyPilot tasks or pods may ' - 'be using resources. Check resource usage by running ' - '`kubectl describe nodes`.') + msg = (f'Insufficient {resource} capacity on the cluster. 
' + f'Required resources ({resource_requirements}){node_selector_str} ' + 'were not found in a single node. Other SkyPilot tasks or pods may ' + 'be using resources. Check resource usage by running ' + '`kubectl describe nodes`.') if extra_msg: msg += f' {extra_msg}' if details: @@ -202,8 +200,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes): '`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.' # pylint: disable=line-too-long f' Full error: {event_message}') gpu_lf_keys = [ - key - for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY + key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY for key in lf.get_label_keys() ] if pod.spec.node_selector: @@ -606,7 +603,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str, # on TPU nodes. # Reference: https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work # pylint: disable=line-too-long tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY - if tpu_label in config.node_config.get('spec', {}).get('nodeSelector', {}): + if tpu_label in config.node_config.get('spec', + {}).get('nodeSelector', {}): tpu_toleration = { 'key': kubernetes_utils.TPU_RESOURCE_KEY, 'operator': 'Equal', diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index afc984e835a..b362ce32f36 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -40,15 +40,16 @@ } # The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on -# nodes. These keys are typically used in the node's status.allocatable +# nodes. These keys are typically used in the node's status.allocatable # or status.capacity fields to indicate the available resources on the node. GPU_RESOURCE_KEY = 'nvidia.com/gpu' TPU_RESOURCE_KEY = 'google.com/tpu' -NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure ' - f'{GPU_RESOURCE_KEY} resource is available on the nodes and ' - 'the node labels for identifying GPUs ' - '(e.g., skypilot.co/accelerator) are setup correctly. ') +NO_GPU_HELP_MESSAGE = ( + 'If your cluster contains GPUs, make sure ' + f'{GPU_RESOURCE_KEY} resource is available on the nodes and ' + 'the node labels for identifying GPUs ' + '(e.g., skypilot.co/accelerator) are setup correctly. ') KUBERNETES_AUTOSCALER_NOTE = ( 'Note: Kubernetes cluster autoscaling is enabled. ' @@ -74,10 +75,10 @@ # Mapping used to get generation for TPU accelerator name. 
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run GKE_TPU_ACCELERATOR_TO_GENERATION = { - "tpu-v4-podslice": "v4", - "tpu-v5-lite-device": "v5e", - "tpu-v5-lite-podslice": "v5e", - "tpu-v5p-slice": "v5p", + 'tpu-v4-podslice': 'v4', + 'tpu-v5-lite-device': 'v5e', + 'tpu-v5-lite-podslice': 'v5e', + 'tpu-v5p-slice': 'v5p', } logger = sky_logging.init_logger(__name__) @@ -92,14 +93,14 @@ class GPULabelFormatter: """ @classmethod - def get_label_key(cls, accelerator: str = None) -> str: + def get_label_key(cls, accelerator: str = '') -> str: """Returns the label key for GPU type used by the Kubernetes cluster""" raise NotImplementedError @classmethod def get_label_keys(cls) -> List[str]: """Returns a list of label keys for GPU used by Kubernetes cluster.""" - pass + raise NotImplementedError @classmethod def get_label_value(cls, accelerator: str) -> str: @@ -160,12 +161,12 @@ class SkyPilotLabelFormatter(GPULabelFormatter): LABEL_KEY = 'skypilot.co/accelerator' @classmethod - def get_label_key(cls, accelerator: str = None) -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: str = '') -> str: + del accelerator # Unused return cls.LABEL_KEY - + @classmethod - def get_label_keys(cls) -> str: + def get_label_keys(cls) -> List[str]: return [cls.LABEL_KEY] @classmethod @@ -201,12 +202,12 @@ class CoreWeaveLabelFormatter(GPULabelFormatter): LABEL_KEY = 'gpu.nvidia.com/class' @classmethod - def get_label_key(cls, accelerator: str = None) -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: str = '') -> str: + del accelerator # Unused return cls.LABEL_KEY @classmethod - def get_label_keys(cls) -> str: + def get_label_keys(cls) -> List[str]: return [cls.LABEL_KEY] @classmethod @@ -234,18 +235,18 @@ class GKELabelFormatter(GPULabelFormatter): TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology' @classmethod - def get_label_key(cls, accelerator: str = None) -> str: + def get_label_key(cls, accelerator: str = '') -> str: if accelerator.startswith('tpu-'): return cls.TPU_LABEL_KEY return cls.GPU_LABEL_KEY @classmethod - def get_label_keys(cls) -> str: + def get_label_keys(cls) -> List[str]: return [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] @classmethod - def match_label_key(cls, label: str) -> bool: - return label in [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] + def match_label_key(cls, label_key: str) -> bool: + return label_key in cls.get_label_keys() @classmethod def get_tpu_topology_label_key(cls) -> str: @@ -292,12 +293,12 @@ class GFDLabelFormatter(GPULabelFormatter): LABEL_KEY = 'nvidia.com/gpu.product' @classmethod - def get_label_key(cls, accelerator: str = None) -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: str = '') -> str: + del accelerator # Unused return cls.LABEL_KEY @classmethod - def get_label_keys(cls) -> str: + def get_label_keys(cls) -> List[str]: return [cls.LABEL_KEY] @classmethod @@ -570,7 +571,8 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type) assert formatter is not None, ('Unsupported autoscaler type:' f' {autoscaler_type}') - return formatter.get_label_key(acc_type), formatter.get_label_value(acc_type) + return formatter.get_label_key(acc_type), formatter.get_label_value( + acc_type) has_gpus, cluster_resources = detect_gpu_resource() if has_gpus: @@ -581,11 +583,10 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # If none of the GPU labels from 
LABEL_FORMATTER_REGISTRY are # detected, raise error with ux_utils.print_exception_no_traceback(): - supported_formats = ', '.join( - [key - for f in LABEL_FORMATTER_REGISTRY - for key in f.get_label_keys()] - ) + supported_formats = ', '.join([ + key for f in LABEL_FORMATTER_REGISTRY + for key in f.get_label_keys() + ]) suffix = '' if env_options.Options.SHOW_DEBUG_INFO.get(): suffix = f' Found node labels: {node_labels}' @@ -632,8 +633,8 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: all_labels = [] for node_name, label_list in node_labels.items(): all_labels.extend(label_list) - gpus_available = set( - v for k, v in all_labels if label_formatter.match_label_key(k)) + gpus_available = set(v for k, v in all_labels + if label_formatter.match_label_key(k)) suffix = f' Available GPUs on the cluster: {gpus_available}' raise exceptions.ResourcesUnavailableError( 'Could not find any node in the Kubernetes cluster ' @@ -653,12 +654,14 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: f'`{TPU_RESOURCE_KEY}`) in Kubernetes cluster. If this cluster ' 'contains GPUs, please ensure GPU drivers are installed on ' 'the node. Check if the GPUs are setup correctly by running ' - f'`kubectl describe nodes` and looking for the {GPU_RESOURCE_KEY!r} ' - f'or {TPU_RESOURCE_KEY!r} resource. Please refer to the documentation' - f'on how to set up GPUs.{suffix}') + '`kubectl describe nodes` and looking for the ' + f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. ' + 'Please refer to the documentation on how to set up GPUs.' + f'{suffix}') -def get_tpu_topology_label_key_value(accelerator: str) -> Tuple[str, str]: +def get_tpu_topology_label_key_value( + accelerator: str) -> Tuple[str, Optional[str]]: """Returns the TPU topology label key and value for given accelerator type. Args: @@ -685,13 +688,13 @@ def get_tpu_topology_label_key_value(accelerator: str) -> Tuple[str, str]: if labels_dict.get(tpu_label_key) == accelerator: topology_value = labels_dict.get(tpu_topology_label_key) return tpu_topology_label_key, topology_value - + # If TPU labels are not detected, raise error with ux_utils.print_exception_no_traceback(): suffix = '' if env_options.Options.SHOW_DEBUG_INFO.get(): suffix = (' Available node labels on the cluster: ' - f'{node_labels}') + f'{node_labels}') raise exceptions.ResourcesUnavailableError( f'Unable to find TPU topology for accelerator {accelerator!r}. 
' f'No node found with label `{tpu_label_key}={accelerator}` ' @@ -1824,19 +1827,19 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: # Get the pods to get the real-time resource usage pods = get_kubernetes_pods() - label_formatter, _ = detect_gpu_label_formatter() - if not label_formatter: + lf, _ = detect_gpu_label_formatter() + if not lf: label_key = None else: - label_keys = label_formatter.get_label_keys() + label_keys = lf.get_label_keys() node_info_dict: Dict[str, KubernetesNodeInfo] = {} for label_key in label_keys: for node in nodes: allocated_qty = 0 - if label_formatter is not None and label_key in node.metadata.labels: - accelerator_name = label_formatter.get_accelerator_from_label_value( + if lf is not None and label_key in node.metadata.labels: + accelerator_name = lf.get_accelerator_from_label_value( node.metadata.labels.get(label_key)) else: accelerator_name = None @@ -1861,10 +1864,11 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: allocated_qty += int( container.resources.requests.get( GPU_RESOURCE_KEY, 0)) - elif TPU_RESOURCE_KEY in container.resources.requests: + elif (TPU_RESOURCE_KEY in + container.resources.requests): allocated_qty += int( container.resources.requests.get( - TPU_RESOURCE_KEY, 0)) + TPU_RESOURCE_KEY, 0)) accelerators_available = accelerator_count - allocated_qty @@ -1886,5 +1890,6 @@ def get_context_from_config(provider_config: Dict[str, Any]) -> str: return provider_config.get('context', get_current_kube_config_context_name()) + def is_tpu_pod_slice(accelerator: str) -> bool: return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION diff --git a/sky/resources.py b/sky/resources.py index 0ea5e4b1aaa..928be3d7ca0 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -570,8 +570,9 @@ def _set_accelerators( self._cloud = clouds.Kubernetes() else: self._cloud = clouds.GCP() - assert (self.cloud.is_same_cloud( - clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes())), 'Cloud must be GCP or Kubernetes.' + assert (self.cloud.is_same_cloud(clouds.GCP()) or + self.cloud.is_same_cloud(clouds.Kubernetes()) + ), 'Cloud must be GCP or Kubernetes.' if accelerator_args is None: accelerator_args = {} @@ -593,13 +594,14 @@ def _get_default_runtime_version() -> str: logger.info( 'Missing runtime_version in accelerator_args, using' f' default ({accelerator_args["runtime_version"]})') - + if self.instance_type is not None and use_tpu_vm: if self.instance_type != 'TPU-VM': with ux_utils.print_exception_no_traceback(): raise ValueError( 'Cannot specify instance type' - f' (got "{self.instance_type}") for TPU VM.') + f' (got "{self.instance_type}") for TPU VM.' + ) self._accelerators = accelerators self._accelerator_args = accelerator_args diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 5f13b665120..30872a69fff 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1900,7 +1900,7 @@ def test_tpu_pod_slice_gke(): f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', f'sky logs {name} 1', # Ensure the job finished. f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"', # Ensure TPU is reachable. + f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"', # Ensure TPU is reachable. 
f'sky logs {name} 2 --status' ], f'sky down -y {name}', @@ -1908,6 +1908,7 @@ def test_tpu_pod_slice_gke(): ) run_one_test(test) + # ---------- Simple apps. ---------- @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_multi_hostname(generic_cloud: str): From 16b6c2909704daea9d6e0ef113ca131bca8ae6da Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 25 Sep 2024 08:12:10 +0000 Subject: [PATCH 37/63] nit --- sky/clouds/utils/gcp_utils.py | 6 +++--- sky/provision/kubernetes/utils.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py index b44c84b122d..01ca1c89a71 100644 --- a/sky/clouds/utils/gcp_utils.py +++ b/sky/clouds/utils/gcp_utils.py @@ -34,11 +34,11 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool: def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool: - assert resources is not None - acc, _ = list(resources.accelerators.items())[0] if not is_tpu(resources): return False - elif kubernetes_utils.is_tpu_pod_slice(acc): + assert resources is not None + acc, _ = list(resources.accelerators.items())[0] + if kubernetes_utils.is_tpu_pod_slice(acc): return False if resources.accelerator_args is None: return True diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index b362ce32f36..60be85ce9e8 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -1864,8 +1864,8 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: allocated_qty += int( container.resources.requests.get( GPU_RESOURCE_KEY, 0)) - elif (TPU_RESOURCE_KEY in - container.resources.requests): + elif (TPU_RESOURCE_KEY + in container.resources.requests): allocated_qty += int( container.resources.requests.get( TPU_RESOURCE_KEY, 0)) From e3908435d96849291d183f513cbc022ab5917352 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Fri, 18 Oct 2024 03:27:00 +0000 Subject: [PATCH 38/63] check acc count support --- sky/clouds/kubernetes.py | 2 +- sky/provision/kubernetes/utils.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 68fbd27b1c1..ca50d133de3 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -273,7 +273,7 @@ def make_deploy_resources_variables( kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY): tpu_requested = True k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_tpu_topology_label_key_value(acc_type)) + kubernetes_utils.get_tpu_topology_label_key_value(acc_type, acc_count)) port_mode = network_utils.get_port_mode(None) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 745394f8144..c187ce8394c 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -247,6 +247,7 @@ class GKELabelFormatter(GPULabelFormatter): GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator' TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator' + ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count' TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology' @classmethod @@ -676,7 +677,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: def get_tpu_topology_label_key_value( - accelerator: str) -> Tuple[str, Optional[str]]: + accelerator: str, accelerator_count: int) -> Tuple[str, Optional[str]]: """Returns the TPU topology label key and value for given accelerator type. 
Args:
@@ -701,6 +702,8 @@ def get_tpu_topology_label_key_value(
     for labels in node_labels.values():
         labels_dict = dict(labels)
         if labels_dict.get(tpu_label_key) == accelerator:
+            tpu_chip_count = labels_dict.get(GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY)
+            #reduce topology and compare number with acc count
             topology_value = labels_dict.get(tpu_topology_label_key)
             return tpu_topology_label_key, topology_value
 
From 884f0a21beb7c8f240b7da40fb1ee5ce2ad50e44 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 19 Oct 2024 04:47:20 +0000
Subject: [PATCH 39/63] preemptive TPU check

---
 sky/provision/kubernetes/instance.py | 21 ++++++++-
 sky/provision/kubernetes/utils.py    | 65 +++++++++++++++++++++++---
 2 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index 4d9a0ea35f2..53200d0bfc6 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -173,7 +173,21 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                         # TODO(romilb): We may have additional node
                         # affinity selectors in the future - in that
                         # case we will need to update this logic.
-                        if (('Insufficient nvidia.com/gpu'
+                        # TODO(Doyoung): Update the error message raised
+                        # with the multi-host TPU support.
+                        if 'Insufficient google.com/tpu' in event_message:
+                            extra_msg = (
+                                f'Verify if '
+                                f'{pod.spec.node_selector[label_key]}'
+                                ' is available in the cluster. Note '
+                                'that multi-host TPU podslices are '
+                                'currently not supported.')
+                            raise config_lib.KubernetesError(
+                                _lack_resource_msg('TPU',
+                                                   pod,
+                                                   extra_msg,
+                                                   details=event_message))
+                        elif (('Insufficient nvidia.com/gpu'
                               in event_message) or
                              ('didn\'t match Pod\'s node affinity/selector'
                               in event_message)):
@@ -585,10 +599,13 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             error_msg = str(e)
             # Unlike other errors from a lack of CPU/GPU/memory resources, the
             # TPU shortage error is raised when the pod is attempted to be created.
+            # TODO(Doyoung): Update the error message raised
+            # with the multi-host TPU support.
             if 'Invalid resource requests for google.com/tpu.' in error_msg:
                 extra_msg = ('Verify if the cluster has a TPU slice node with '
                              'a topology matching the number of TPU(s) '
-                             'requested.')
+                             'requested. Note that multi-host TPU podslices '
+                             'are currently not supported.')
                 raise config_lib.KubernetesError(
                     _lack_resource_msg('TPU',
                                        pod_spec,
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index c187ce8394c..4acf6853b8d 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -1,5 +1,6 @@
 """Kubernetes utilities for SkyPilot."""
 import dataclasses
+import functools
 import json
 import math
 import os
@@ -508,13 +509,54 @@ def check_cpu_mem_fits(candidate_instance_type: 'KubernetesInstanceType',
             'Maximum resources found on a single node: '
             f'{max_cpu} CPUs, {common_utils.format_float(max_mem)}G Memory')
 
+    def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
+                       node_list: List[Any]) -> Tuple[bool, Optional[str]]:
+        # check if the requested TPU type is in the cluster
+        # if exists, check if the requested TPU topology is available in the cluster.
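(For intuition, a standalone sketch of the check the two comments above
describe; this is illustrative only and not part of the hunk. `node_tpus` is
a hypothetical list of (tpu_type, chip_count) pairs already read from the GKE
node labels used elsewhere in this patch:

    node_tpus = [('tpu-v5-lite-podslice', 4)]  # from node labels
    requested_type, requested_count = 'tpu-v5-lite-podslice', 4
    fits = any(tpu_type == requested_type and chips == requested_count
               for tpu_type, chips in node_tpus)
)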
+ node_list = gpu_nodes + acc_type = candidate_instance_type.accelerator_type + acc_count = candidate_instance_type.accelerator_count + tpu_list_in_cluster = [] + for node in node_list: + if acc_type == node.metadata.labels[GKELabelFormatter.TPU_LABEL_KEY]: + topology_value = node.metadata.labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY] + # node_tpu_chip_count represents the number of TPU chips + # available in this node. If the node is part of a node pool + # forming a multi-host TPU podslice, it only reflects the + # number of TPU chips in this individual node, not the entire + # multi-host TPU podslice. + node_tpu_chip_count = node.metadata.labels[GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY] + chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")] + # topology_chip_count represents the total number of TPU chips + # in the entire podslice, whether it is a single-host or + # multi-host TPU podslice. + topology_chip_count = functools.reduce(lambda x, y: x * y, chip_dimensions) + # TODO(Doyoung): Update the naming scheme with multi-host TPU + # support. + tpu_type = f'{acc_type}:{node_tpu_chip_count}' + tpu_list_in_cluster.append(tpu_type) + # For multi-host TPU podslices, topology_chip_count and + # node_tpu_chip_count will differ, as topology_chip_count + # reflects the total across all hosts, while + # node_tpu_chip_count reflects only the chips in a single node. + # TODO(Doyoung): Remove the condition, + # node_tpu_chip_count == topology_chip_count, when adding + # multi-host TPU support. + if node_tpu_chip_count == topology_chip_count and topology_chip_count == acc_count: + return True, None + tpu_list_in_cluster_str = ','.join(tpu_list_in_cluster) + return False, ('Requested TPU type was not found in the cluster. TPU ' + 'types found in the cluster: ' + f'{tpu_list_in_cluster_str}.') + nodes = get_kubernetes_nodes() k8s_instance_type = KubernetesInstanceType.\ from_instance_type(instance) acc_type = k8s_instance_type.accelerator_type if acc_type is not None: - # If GPUs are requested, check if GPU type is available, and if so, - # check if CPU and memory requirements on the specific node are met. + # If GPU/TPUs are requested, check if GPU/TPU type is available, and + # if so, check if CPU and memory requirements on the specific node are + # met. try: gpu_label_key, gpu_label_val = get_gpu_label_key_value(acc_type) except exceptions.ResourcesUnavailableError as e: @@ -526,6 +568,13 @@ def check_cpu_mem_fits(candidate_instance_type: 'KubernetesInstanceType', node.metadata.labels[gpu_label_key] == gpu_label_val ] assert len(gpu_nodes) > 0, 'GPU nodes not found' + if is_tpu_pod_slice(acc_type): + # If requested accelerator is a TPU type, check if the cluster + # has sufficient TPU resource to meet the requirement. + fits, reason = check_tpu_fits(k8s_instance_type, gpu_nodes) + if reason is not None: + return fits, reason + candidate_nodes = gpu_nodes not_fit_reason_prefix = ( f'GPU nodes with {acc_type} do not have ' @@ -537,7 +586,7 @@ def check_cpu_mem_fits(candidate_instance_type: 'KubernetesInstanceType', f'CPU (> {k8s_instance_type.cpus} CPUs) ' 'and/or memory ' f'(> {k8s_instance_type.memory} G). ') - # Check if CPU and memory requirements are met on at least one + # Check if CPU and memory requirements are met on at least one # candidate node. 
fits, reason = check_cpu_mem_fits(k8s_instance_type,
                                          candidate_nodes)
        if not fits:
@@ -702,10 +751,11 @@ def get_tpu_topology_label_key_value(
     for labels in node_labels.values():
         labels_dict = dict(labels)
         if labels_dict.get(tpu_label_key) == accelerator:
-            tpu_chip_count = labels_dict.get(GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY)
-            #reduce topology and compare number with acc count
             topology_value = labels_dict.get(tpu_topology_label_key)
-            return tpu_topology_label_key, topology_value
+            chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")]
+            num_chips = functools.reduce(lambda x, y: x * y, chip_dimensions)
+            if num_chips == accelerator_count:
+                return tpu_topology_label_key, topology_value
 
     # If TPU labels are not detected, raise error
     with ux_utils.print_exception_no_traceback():
         suffix = ''
         if env_options.Options.SHOW_DEBUG_INFO.get():
             suffix = (' Available node labels on the cluster: '
                       f'{node_labels}')
         raise exceptions.ResourcesUnavailableError(
             f'Unable to find TPU topology for accelerator {accelerator!r}. '
             f'No node found with label `{tpu_label_key}={accelerator}` '
-            f'or missing {tpu_topology_label_key!r} label.{suffix}')
+            f'or missing {tpu_topology_label_key!r} label.{suffix}. Note '
+            'that multi-host TPU podslices are currently not supported.')
 
 def get_head_ssh_port(cluster_name: str, namespace: str,

From 11142e5338c4cfa6053f6e398aa7f2bdb289585c Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 19 Oct 2024 04:47:20 +0000
Subject: [PATCH 40/63] update check_tpu_fits

---
 sky/clouds/kubernetes.py          | 2 +-
 sky/provision/kubernetes/utils.py | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py
index 6d2c7769bf3..e7bc02b651a 100644
--- a/sky/clouds/kubernetes.py
+++ b/sky/clouds/kubernetes.py
@@ -377,7 +377,7 @@ def make_deploy_resources_variables(
                     kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY):
                 tpu_requested = True
                 k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = (
-                    kubernetes_utils.get_tpu_topology_label_key_value(acc_type, acc_count))
+                    kubernetes_utils.get_tpu_topology_label_key_value(context, acc_type, acc_count))
 
         port_mode = network_utils.get_port_mode(None)
 
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 6e8438c1ba0..d41e04d7e8b 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -532,7 +532,6 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
                        node_list: List[Any]) -> Tuple[bool, Optional[str]]:
         # check if the requested TPU type is in the cluster
         # if exists, check if the requested TPU topology is available in the cluster.
-        node_list = gpu_nodes
         acc_type = candidate_instance_type.accelerator_type
         acc_count = candidate_instance_type.accelerator_count
         tpu_list_in_cluster = []
         for node in node_list:
             if acc_type == node.metadata.labels[GKELabelFormatter.TPU_LABEL_KEY]:
                 topology_value = node.metadata.labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY]
                 # node_tpu_chip_count represents the number of TPU chips
                 # available in this node. If the node is part of a node pool
                 # forming a multi-host TPU podslice, it only reflects the
                 # number of TPU chips in this individual node, not the entire
                 # multi-host TPU podslice.
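(Aside: a GKE topology label reduces to a total chip count by multiplying its
dimensions. A standalone sketch mirroring the functools.reduce calls in the
hunks above and below, where '2x4' is a hypothetical value of the
cloud.google.com/gke-tpu-topology label:

    import functools
    topology = '2x4'
    num_chips = functools.reduce(lambda x, y: x * y,
                                 [int(dim) for dim in topology.split('x')])
    assert num_chips == 8
)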
-                node_tpu_chip_count = node.metadata.labels[GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY]
+                node_tpu_chip_count = int(node.metadata.labels[GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY])
                 chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")]
                 # topology_chip_count represents the total number of TPU chips
@@ -747,7 +747,7 @@ def get_gpu_label_key_value(context: Optional[str],
 
 
 def get_tpu_topology_label_key_value(
-    accelerator: str, accelerator_count: int) -> Tuple[str, Optional[str]]:
+    context: Optional[str], accelerator: str, accelerator_count: int) -> Tuple[str, Optional[str]]:
     """Returns the TPU topology label key and value for given accelerator type.
 
     Args:
@@ -764,7 +763,7 @@ def get_tpu_topology_label_key_value(
     """
-    label_formatter, node_labels = detect_gpu_label_formatter()
+    label_formatter, node_labels = detect_gpu_label_formatter(context)
     assert isinstance(label_formatter, GKELabelFormatter)
 
     tpu_label_key = label_formatter.TPU_LABEL_KEY

From de55663e961cca33cfe3ea2d2f1659827007de06 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 19 Oct 2024 04:51:11 +0000
Subject: [PATCH 41/63] error msg update

---
 sky/provision/kubernetes/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index d41e04d7e8b..0f8f56dd767 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -563,9 +563,12 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
             if node_tpu_chip_count == topology_chip_count and topology_chip_count == acc_count:
                 return True, None
         tpu_list_in_cluster_str = ','.join(tpu_list_in_cluster)
+        # TODO(Doyoung): Update the error message raised with the multi-host
+        # TPU support.
         return False, ('Requested TPU type was not found in the cluster. TPU '
                        'types found in the cluster: '
-                       f'{tpu_list_in_cluster_str}.')
+                       f'{tpu_list_in_cluster_str}. Note that multi-host TPU '
+                       'podslices are currently not supported.')

From a500555d72fd6d65c139a39db9da8a447c025123 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 19 Oct 2024 23:09:13 +0000
Subject: [PATCH 42/63] merge get_tpu_topology_label_key_value into get_gpu_label_key_value

---
 sky/clouds/kubernetes.py                      |   6 +-
 .../service_catalog/kubernetes_catalog.py     |   2 +-
 sky/provision/kubernetes/utils.py             | 123 ++++++++----------
 tests/common.py                               |   2 +-
 4 files changed, 55 insertions(+), 78 deletions(-)

diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py
index e7bc02b651a..c8565dd6145 100644
--- a/sky/clouds/kubernetes.py
+++ b/sky/clouds/kubernetes.py
@@ -371,13 +371,11 @@ def make_deploy_resources_variables(
 
         # If GPU/TPUs are requested, set node label to match the GPU/TPU type.
if acc_count > 0 and acc_type is not None: - k8s_acc_label_key, k8s_acc_label_value = \ - kubernetes_utils.get_gpu_label_key_value(context, acc_type) + k8s_acc_label_key, k8s_acc_label_value, k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( + kubernetes_utils.get_accelerator_label_key_value(context, acc_type, acc_count)) if (k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY): tpu_requested = True - k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_tpu_topology_label_key_value(context, acc_type, acc_count)) port_mode = network_utils.get_port_mode(None) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 309c574fa2a..1b6d5b2b414 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -84,7 +84,7 @@ def list_accelerators_realtime( ) or not kubernetes_utils.check_credentials(context)[0]: return {}, {}, {} - has_gpu = kubernetes_utils.detect_gpu_resource(context) + has_gpu = kubernetes_utils.detect_accelerator_resource(context) if not has_gpu: return {}, {}, {} diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 0f8f56dd767..6287a03d7c1 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -420,8 +420,8 @@ def detect_gpu_label_formatter( @functools.lru_cache(maxsize=10) -def detect_gpu_resource(context: Optional[str]) -> Tuple[bool, Set[str]]: - """Checks if the Kubernetes cluster has nvidia.com/gpu resource. +def detect_accelerator_resource(context: Optional[str]) -> Tuple[bool, Set[str]]: + """Checks if the Kubernetes cluster has GPU/TPU resource. Two types of accelerator resources are available which are each checked with nvidia.com/gpu and google.com/tpu. If nvidia.com/gpu resource is @@ -574,13 +574,14 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', k8s_instance_type = KubernetesInstanceType.\ from_instance_type(instance) acc_type = k8s_instance_type.accelerator_type + acc_count = k8s_instance_type.accelerator_count if acc_type is not None: # If GPU/TPUs are requested, check if GPU/TPU type is available, and # if so, check if CPU and memory requirements on the specific node are # met. try: - gpu_label_key, gpu_label_val = get_gpu_label_key_value( - context, acc_type) + gpu_label_key, gpu_label_val, _, _ = get_accelerator_label_key_value( + context, acc_type, acc_count) except exceptions.ResourcesUnavailableError as e: # If GPU not found, return empty list and error message. return False, str(e) @@ -619,25 +620,31 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', return fits, reason -def get_gpu_label_key_value(context: Optional[str], - acc_type: str, - check_mode=False) -> Tuple[str, str]: - """Returns the label key and value for the given GPU type. +def get_accelerator_label_key_value(context: Optional[str], + acc_type: str, + acc_count: int, + check_mode=False + ) -> Tuple[str, str, str, str]: + """Returns the label key and value for the given GPU/TPU type. Args: - acc_type: The GPU type required by the task. - check_mode: If True, only checks if the cluster has GPU resources and - labels are setup on the cluster. acc_type is ignore does not return - the label key and value. Useful for checking if GPUs are configured - correctly on the cluster without explicitly requesting a acc_type. + acc_type: The GPU/TPU type required by the task. 
+        acc_count: Number of GPU/TPUs required by the task.
+        check_mode: If True, only checks if the cluster has GPU/TPU resources
+            and labels are setup on the cluster. acc_type is ignored and the
+            label key and value are not returned. Useful for checking if GPUs
+            are configured correctly on the cluster without explicitly
+            requesting an acc_type.
 
     Returns:
-        A tuple of the label key and value. Returns empty strings if check_mode
-        is True.
+        A tuple of the accelerator label key, value, topology label key, and
+        topology value. The topology label key and value are populated only if
+        the requested accelerator type is TPU. Returns empty strings if
+        check_mode is True.
 
     Raises:
         ResourcesUnavailableError: Can be raised from the following conditions:
-            - The cluster does not have GPU resources (nvidia.com/gpu)
-            - The cluster does not have GPU labels setup correctly
-            - The cluster doesn't have any nodes with acc_type GPU
+            - The cluster does not have GPU/TPU resources (nvidia.com/gpu, google.com/tpu)
+            - The cluster does not have GPU/TPU labels setup correctly
+            - The cluster doesn't have any nodes with acc_type GPU/TPU
     """
     # Check if the cluster has GPU resources
     # TODO(romilb): This assumes the accelerator is a nvidia GPU. We
@@ -656,14 +663,14 @@ def get_gpu_label_key_value(context: Optional[str],
             # If check mode is enabled and autoscaler is set, we can return
             # early since we assume the cluster autoscaler will handle GPU
             # node provisioning.
-            return '', ''
+            return '', '', '', ''
         formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type)
         assert formatter is not None, ('Unsupported autoscaler type:'
                                        f' {autoscaler_type}')
         return formatter.get_label_key(acc_type), formatter.get_label_value(
-            acc_type)
+            acc_type), '', ''
 
-    has_gpus, cluster_resources = detect_gpu_resource(context)
+    has_gpus, cluster_resources = detect_accelerator_resource(context)
     if has_gpus:
         # Check if the cluster has GPU labels setup correctly
         label_formatter, node_labels = \
@@ -701,7 +708,7 @@ def get_gpu_label_key_value(context: Optional[str],
         if check_mode:
             # If check mode is enabled and we reached so far, we can
             # conclude that the cluster is setup correctly and return.
-            return '', ''
+            return '', '', '', ''
 
         # Search in node_labels to see if any node has the requested
        # GPU type.
# Note - this only checks if the label is available on a
        # node. It does not (and should not) check if the GPU
        # quantity is available since that is dynamic and can change
        # during scheduling.
        for node_name, label_list in node_labels.items():
            for label, value in label_list:
                if (label_formatter.match_label_key(label) and
                        label_formatter.get_accelerator_from_label_value(
                            value) == acc_type):
-                    return label, value
+                    if is_tpu_pod_slice(acc_type):
+                        assert isinstance(label_formatter, GKELabelFormatter)
+                        topology_label_key = label_formatter.TPU_TOPOLOGY_LABEL_KEY
+                        labels_dict = dict(node_labels[node_name])
+                        if labels_dict.get(label_formatter.TPU_LABEL_KEY) == acc_type:
+                            topology_value = labels_dict.get(topology_label_key)
+                            chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")]
+                            num_chips = functools.reduce(lambda x, y: x * y, chip_dimensions)
+                            if num_chips == acc_count:
+                                return label, value, topology_label_key, topology_value
+                        else:
+                            continue
+                    else:
+                        return label, value, '', ''
 
        # If no node is found with the requested acc_type, raise error
        with ux_utils.print_exception_no_traceback():
            if env_options.Options.SHOW_DEBUG_INFO.get():
                all_labels = []
                for node_name, label_list in node_labels.items():
                    all_labels.extend(label_list)
-                gpus_available = set(v for k, v in all_labels
+                acc_available = set(v for k, v in all_labels
                                     if label_formatter.match_label_key(k))
-                suffix = f' Available GPUs on the cluster: {gpus_available}'
+                suffix = f' Available GPU/TPUs on the cluster: {acc_available}'
+                # TODO(Doyoung): Update the error message raised with the multi-host
+                # TPU support.
                raise exceptions.ResourcesUnavailableError(
                    'Could not find any node in the Kubernetes cluster '
                    f'with {acc_type}. Please ensure at least one node in the '
                    f'cluster has {acc_type} and node labels are setup '
                    'correctly. '
-                    f'Please refer to the documentation for more. {suffix}')
+                    f'Please refer to the documentation for more. {suffix}. '
+                    'Note that multi-host TPU podslices are currently not '
+                    'supported.')
    else:
        # If GPU resources are not detected, raise error
        with ux_utils.print_exception_no_traceback():
            suffix = ''
            if env_options.Options.SHOW_DEBUG_INFO.get():
                suffix = (' Available resources on the cluster: '
                          f'{cluster_resources}')
            raise exceptions.ResourcesUnavailableError(
-                f'Could not detect GPU/TPU resources (`{GPU_RESOURCE_KEY}` or '
-                f'`{TPU_RESOURCE_KEY}`) in Kubernetes cluster. If this cluster '
-                'contains GPUs, please ensure GPU drivers are installed on '
+                f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
+                f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
+                ' contains GPUs, please ensure GPU drivers are installed on '
                'the node. Check if the GPUs are setup correctly by running '
                '`kubectl describe nodes` and looking for the '
                f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
                'Please refer to the documentation on how to set up GPUs.'
                f'{suffix}')
 
 
-def get_tpu_topology_label_key_value(
-    context: Optional[str], accelerator: str, accelerator_count: int) -> Tuple[str, Optional[str]]:
-    """Returns the TPU topology label key and value for given accelerator type.
-
-    Args:
-        accelerator: The TPU accelerator type required by the task.
-
-    Returns:
-        A tuple of the TPU topology label key and value.
-
-    Raises:
-        ResourcesUnavailableError: Can be raised from the following conditions:
-            - The cluster does not have TPU labels set up correctly.
-            - The cluster doesn't have any nodes with the specified TPU
-              accelerator type.
-            - The TPU topology label is missing for the specified accelerator.
-    """
-    label_formatter, node_labels = detect_gpu_label_formatter(context)
-    assert isinstance(label_formatter, GKELabelFormatter)
-
-    tpu_label_key = label_formatter.TPU_LABEL_KEY
-    tpu_topology_label_key = label_formatter.TPU_TOPOLOGY_LABEL_KEY
-
-    for labels in node_labels.values():
-        labels_dict = dict(labels)
-        if labels_dict.get(tpu_label_key) == accelerator:
-            topology_value = labels_dict.get(tpu_topology_label_key)
-            chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")]
-            num_chips = functools.reduce(lambda x, y: x * y, chip_dimensions)
-            if num_chips == accelerator_count:
-                return tpu_topology_label_key, topology_value
-
-    # If TPU labels are not detected, raise error
-    with ux_utils.print_exception_no_traceback():
-        suffix = ''
-        if env_options.Options.SHOW_DEBUG_INFO.get():
-            suffix = (' Available node labels on the cluster: '
-                      f'{node_labels}')
-        raise exceptions.ResourcesUnavailableError(
-            f'Unable to find TPU topology for accelerator {accelerator!r}. '
-            f'No node found with label `{tpu_label_key}={accelerator}` '
-            f'or missing {tpu_topology_label_key!r} label.{suffix}. Note '
-            'that multi-host TPU podslices are currently not supported.')
-
-
 def get_head_ssh_port(cluster_name: str, namespace: str,
                       context: Optional[str]) -> int:
     svc_name = f'{cluster_name}-head-ssh'
@@ -886,7 +865,7 @@ def check_credentials(context: Optional[str],
     # provider if their cluster GPUs are not setup correctly.
     gpu_msg = ''
     try:
-        _, _ = get_gpu_label_key_value(context, acc_type='', check_mode=True)
+        _, _, _, _ = get_accelerator_label_key_value(context, acc_type='', acc_count=0, check_mode=True)
     except exceptions.ResourcesUnavailableError as e:
         # If GPUs are not available, we return cluster as enabled (since it can
         # be a CPU-only cluster) but we also return the exception message which
diff --git a/tests/common.py b/tests/common.py
index c6f08588d99..d50ae7facdf 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -64,7 +64,7 @@ def _get_az_mappings(_):
     monkeypatch.setattr(
         'sky.provision.kubernetes.utils.detect_gpu_label_formatter',
         lambda *_args, **_kwargs: [kubernetes_utils.SkyPilotLabelFormatter, {}])
-    monkeypatch.setattr('sky.provision.kubernetes.utils.detect_gpu_resource',
+    monkeypatch.setattr('sky.provision.kubernetes.utils.detect_accelerator_resource',
                         lambda *_args, **_kwargs: [True, []])
     monkeypatch.setattr('sky.provision.kubernetes.utils.check_instance_fits',
                         lambda *_args, **_kwargs: [True, ''])
     monkeypatch.setattr('sky.provision.kubernetes.utils.get_spot_label',

From bce87318e908dcbf4d6f47424c386c2c09e210b1 Mon Sep 17 00:00:00 2001
From: landscapepainter <34902420+landscapepainter@users.noreply.github.com>
Date: Sat, 19 Oct 2024 16:46:51 -0700
Subject: [PATCH 43/63] Update sky/provision/kubernetes/utils.py

Co-authored-by: Tian Xia
---
 sky/provision/kubernetes/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 6287a03d7c1..98726a2d75b 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -57,9 +57,9 @@
 
 NO_GPU_HELP_MESSAGE = (
     'If your cluster contains GPUs or TPUs, make sure '
-    f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available'
-    ' on the nodes and the node labels for identifying '
-    'GPUs/TPUs (e.g., skypilot.co/accelerator) are setup correctly. ')
+    f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available '
+    'on the nodes and the node labels for identifying GPUs/TPUs '
+    '(e.g., skypilot.co/accelerator) are setup correctly.
')
 
 KUBERNETES_AUTOSCALER_NOTE = (
     'Note: Kubernetes cluster autoscaling is enabled. '

From 0e8366c6ca6fc46e89498eab952234ead8778c6f Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sun, 20 Oct 2024 00:33:25 +0000
Subject: [PATCH 44/63] nit fixes

---
 sky/cli.py                        |  3 ++-
 sky/provision/kubernetes/utils.py | 18 +++++++++---------
 sky/resources.py                  |  5 ++---
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/sky/cli.py b/sky/cli.py
index d23df3c9c58..d25d847bb19 100644
--- a/sky/cli.py
+++ b/sky/cli.py
@@ -3106,7 +3106,8 @@ def _get_kubernetes_realtime_gpu_table(
                     'in Kubernetes cluster. ')
         debug_msg = ('To show available accelerators on kubernetes,'
                      ' run: sky show-gpus --cloud kubernetes ')
-        full_err_msg = (err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE +
+        full_err_msg = (err_msg +
+                        kubernetes_utils.NO_ACCELERATOR_HELP_MESSAGE +
                         debug_msg)
         raise ValueError(full_err_msg)
     for gpu, _ in sorted(counts.items()):
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 98726a2d75b..275b8a2b9cc 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -55,7 +55,7 @@
 GPU_RESOURCE_KEY = 'nvidia.com/gpu'
 TPU_RESOURCE_KEY = 'google.com/tpu'
 
-NO_GPU_HELP_MESSAGE = (
+NO_ACCELERATOR_HELP_MESSAGE = (
     'If your cluster contains GPUs or TPUs, make sure '
     f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available '
     'on the nodes and the node labels for identifying GPUs/TPUs '
@@ -153,10 +153,11 @@ def validate_label_value(cls, value: str) -> Tuple[bool, str]:
 
 
 def get_gke_accelerator_name(accelerator: str) -> str:
-    """Returns the accelerator name for GKE clusters
+    """Returns the accelerator name for GKE clusters.
 
     Uses the format - nvidia-tesla-<accelerator>.
-    A100-80GB, H100-80GB and L4 are an exception. They use nvidia-<accelerator>.
+    A100-80GB, H100-80GB, L4 are an exception. They use nvidia-<accelerator>.
+    TPU types are an exception as well, keeping the given name.
     """
     if accelerator == 'H100':
         # H100 is named as H100-80GB in GKE.
@@ -751,10 +752,9 @@ def get_accelerator_label_key_value(context: Optional[str],
         'Could not find any node in the Kubernetes cluster '
         f'with {acc_type}. Please ensure at least one node in the '
         f'cluster has {acc_type} and node labels are setup '
-        'correctly. '
-        f'Please refer to the documentation for more. {suffix}. '
-        'Note that multi-host TPU podslices are currently not '
-        'supported.')
+        'correctly. Please refer to the documentation for more. '
+        f'{suffix}. Note that multi-host TPU podslices are '
+        'currently not supported.')
 else:
     # If GPU resources are not detected, raise error
     with ux_utils.print_exception_no_traceback():
@@ -1942,7 +1942,7 @@ def __init__(self, obj):
 class KubernetesNodeInfo:
     """Dataclass to store Kubernetes node information."""
     name: str
-    gpu_type: Optional[str]
+    accelerator_type: Optional[str]
    # Resources available on the node.
E.g., {'nvidia.com/gpu': '2'} total: Dict[str, int] free: Dict[str, int] @@ -2011,7 +2011,7 @@ def get_kubernetes_node_info( node_info_dict[node.metadata.name] = KubernetesNodeInfo( name=node.metadata.name, - gpu_type=accelerator_name, + accelerator_type=accelerator_name, total={'accelerator_count': int(accelerator_count)}, free={'accelerators_available': int(accelerators_available)}) diff --git a/sky/resources.py b/sky/resources.py index 7fcaea1db82..764858afc10 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -599,9 +599,8 @@ def _get_default_runtime_version() -> str: if self.instance_type != 'TPU-VM': with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Cannot specify instance type' - f' (got "{self.instance_type}") for TPU VM.' - ) + 'Cannot specify instance type (got ' + f'{self.instance_type!r}) for TPU VM.') self._accelerators = accelerators self._accelerator_args = accelerator_args From f67ad0fb7a6bda6d541f15f4c3c8c53b678223e5 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 20 Oct 2024 01:03:02 +0000 Subject: [PATCH 45/63] format --- sky/cli.py | 2 +- sky/clouds/kubernetes.py | 14 +++-- sky/provision/kubernetes/instance.py | 6 +- sky/provision/kubernetes/utils.py | 93 ++++++++++++++++++---------- sky/templates/kubernetes-ray.yml.j2 | 4 +- tests/common.py | 5 +- 6 files changed, 78 insertions(+), 46 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index d25d847bb19..05bcd732070 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3124,7 +3124,7 @@ def _get_kubernetes_node_info_table(context: Optional[str]): node_info_dict = kubernetes_utils.get_kubernetes_node_info(context) for node_name, node_info in node_info_dict.items(): node_table.add_row([ - node_name, node_info.gpu_type, + node_name, node_info.accelerator_type, node_info.total['accelerator_count'], node_info.free['accelerators_available'] ]) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index c8565dd6145..106a282438e 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -365,14 +365,16 @@ def make_deploy_resources_variables( k8s_acc_label_key = None k8s_acc_label_value = None - k8s_tpu_topology_label_key = None - k8s_tpu_topology_label_value = None + k8s_topology_label_key = None + k8s_topology_label_value = None tpu_requested = False # If GPU/TPUs are requested, set node label to match the GPU/TPU type. 
if acc_count > 0 and acc_type is not None: - k8s_acc_label_key, k8s_acc_label_value, k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_accelerator_label_key_value(context, acc_type, acc_count)) + (k8s_acc_label_key, k8s_acc_label_value, k8s_topology_label_key, + k8s_topology_label_value) = ( + kubernetes_utils.get_accelerator_label_key_value( + context, acc_type, acc_count)) if (k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY): tpu_requested = True @@ -438,8 +440,8 @@ def make_deploy_resources_variables( 'k8s_spot_label_key': spot_label_key, 'k8s_spot_label_value': spot_label_value, 'tpu_requested': tpu_requested, - 'k8s_tpu_topology_label_key': k8s_tpu_topology_label_key, - 'k8s_tpu_topology_label_value': k8s_tpu_topology_label_value, + 'k8s_topology_label_key': k8s_topology_label_key, + 'k8s_topology_label_value': k8s_topology_label_value, 'image_id': image_id, } diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 9b298e0d784..44ed1b91ec6 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -188,9 +188,9 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes): extra_msg, details=event_message)) elif (('Insufficient nvidia.com/gpu' - in event_message) or - ('didn\'t match Pod\'s node affinity/selector' - in event_message)): + in event_message) or + ('didn\'t match Pod\'s node affinity/selector' + in event_message)): extra_msg = ( f'Verify if ' f'{pod.spec.node_selector[label_key]}' diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 275b8a2b9cc..b309a617271 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -421,7 +421,8 @@ def detect_gpu_label_formatter( @functools.lru_cache(maxsize=10) -def detect_accelerator_resource(context: Optional[str]) -> Tuple[bool, Set[str]]: +def detect_accelerator_resource( + context: Optional[str]) -> Tuple[bool, Set[str]]: """Checks if the Kubernetes cluster has GPU/TPU resource. Two types of accelerator resources are available which are each checked @@ -531,25 +532,37 @@ def check_cpu_mem_fits(candidate_instance_type: 'KubernetesInstanceType', def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', node_list: List[Any]) -> Tuple[bool, Optional[str]]: - # check if the requested TPU type is in the cluster - # if exists, check if the requested TPU topology is available in the cluster. + """Checks if the instance fits on the cluster based on requested TPU. + + It checks if the TPU type and count on each node match the required + number of TPU chips for the instance. In the case of multi-host TPU + podslice, the function ensures that the number of TPU chips on a single + node (node_tpu_chip_count) and the total TPU chips across the entire + podslice (topology_chip_count) are correctly handled. + """ acc_type = candidate_instance_type.accelerator_type acc_count = candidate_instance_type.accelerator_count tpu_list_in_cluster = [] for node in node_list: - if acc_type == node.metadata.labels[GKELabelFormatter.TPU_LABEL_KEY]: - topology_value = node.metadata.labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY] + if acc_type == node.metadata.labels[ + GKELabelFormatter.TPU_LABEL_KEY]: + topology_value = node.metadata.labels[ + GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY] # node_tpu_chip_count represents the number of TPU chips # available in this node. 
If the node is part of a node pool # forming a multi-host TPU podslice, it only reflects the # number of TPU chips in this individual node, not the entire # multi-host TPU podslice. - node_tpu_chip_count = int(node.metadata.labels[GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY]) - chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")] + node_tpu_chip_count = int(node.metadata.labels[ + GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY]) + chip_dimensions = [ + int(chip_count) for chip_count in topology_value.split('x') + ] # topology_chip_count represents the total number of TPU chips # in the entire podslice, whether it is a single-host or # multi-host TPU podslice. - topology_chip_count = functools.reduce(lambda x, y: x * y, chip_dimensions) + topology_chip_count = functools.reduce(lambda x, y: x * y, + chip_dimensions) # TODO(Doyoung): Update the naming scheme with multi-host TPU # support. tpu_type = f'{acc_type}:{node_tpu_chip_count}' @@ -561,7 +574,8 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', # TODO(Doyoung): Remove the condition, # node_tpu_chip_count == topology_chip_count, when adding # multi-host TPU support. - if node_tpu_chip_count == topology_chip_count and topology_chip_count == acc_count: + if (node_tpu_chip_count == topology_chip_count and + topology_chip_count == acc_count): return True, None tpu_list_in_cluster_str = ','.join(tpu_list_in_cluster) # TODO(Doyoung): Update the error message raised with the multi-host @@ -581,8 +595,8 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', # if so, check if CPU and memory requirements on the specific node are # met. try: - gpu_label_key, gpu_label_val, _, _ = get_accelerator_label_key_value( - context, acc_type, acc_count) + gpu_label_key, gpu_label_val, _, _ = ( + get_accelerator_label_key_value(context, acc_type, acc_count)) except exceptions.ResourcesUnavailableError as e: # If GPU not found, return empty list and error message. return False, str(e) @@ -621,11 +635,11 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', return fits, reason -def get_accelerator_label_key_value(context: Optional[str], - acc_type: str, - acc_count: int, - check_mode=False - ) -> Tuple[str, str, str, str]: +def get_accelerator_label_key_value( + context: Optional[str], + acc_type: str, + acc_count: Optional[int], + check_mode=False) -> Tuple[str, str, str, str]: """Returns the label key and value for the given GPU/TPU type. Args: @@ -643,7 +657,8 @@ def get_accelerator_label_key_value(context: Optional[str], check_mode is True. 
Raises: ResourcesUnavailableError: Can be raised from the following conditions: - - The cluster does not have GPU/TPU resources (nvidia.com/gpu, google.com/tpu) + - The cluster does not have GPU/TPU resources + (nvidia.com/gpu, google.com/tpu) - The cluster does not have GPU/TPU labels setup correctly - The cluster doesn't have any nodes with acc_type GPU/TPU """ @@ -722,15 +737,25 @@ def get_accelerator_label_key_value(context: Optional[str], label_formatter.get_accelerator_from_label_value( value) == acc_type): if is_tpu_pod_slice(acc_type): - assert isinstance(label_formatter, GKELabelFormatter) - topology_label_key = label_formatter.TPU_TOPOLOGY_LABEL_KEY + assert isinstance(label_formatter, + GKELabelFormatter) + topology_label_key = ( + label_formatter.TPU_TOPOLOGY_LABEL_KEY) labels_dict = dict(node_labels[node_name]) - if labels_dict.get(label_formatter.TPU_LABEL_KEY) == acc_type: - topology_value = labels_dict.get(topology_label_key) - chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")] - num_chips = functools.reduce(lambda x, y: x * y, chip_dimensions) + if labels_dict.get( + label_formatter.TPU_LABEL_KEY) == acc_type: + topology_value = labels_dict.get( + topology_label_key) + assert topology_value is not None + chip_dimensions = [ + int(chip_count) + for chip_count in topology_value.split('x') + ] + num_chips = functools.reduce( + lambda x, y: x * y, chip_dimensions) if num_chips == acc_count: - return label, value, topology_label_key, topology_value + return (label, value, topology_label_key, + topology_value) else: continue else: @@ -744,10 +769,11 @@ def get_accelerator_label_key_value(context: Optional[str], for node_name, label_list in node_labels.items(): all_labels.extend(label_list) acc_available = set(v for k, v in all_labels - if label_formatter.match_label_key(k)) - suffix = f' Available GPU/TPUs on the cluster: {acc_available}' - # TODO(Doyoung): Update the error message raised with the multi-host - # TPU support. + if label_formatter.match_label_key(k)) + suffix = (' Available GPU/TPUs on the cluster: ' + f'{acc_available}') + # TODO(Doyoung): Update the error message raised with the + # multi-host TPU support. raise exceptions.ResourcesUnavailableError( 'Could not find any node in the Kubernetes cluster ' f'with {acc_type}. Please ensure at least one node in the ' @@ -763,9 +789,9 @@ def get_accelerator_label_key_value(context: Optional[str], suffix = (' Available resources on the cluster: ' f'{cluster_resources}') raise exceptions.ResourcesUnavailableError( - f'Could not detect GPU/TPU resources (`{GPU_RESOURCE_KEY}` or ' - f'`{TPU_RESOURCE_KEY}`) in Kubernetes cluster. If this cluster ' - 'contains GPUs, please ensure GPU drivers are installed on ' + f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or ' + f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster' + ' contains GPUs, please ensure GPU drivers are installed on ' 'the node. Check if the GPUs are setup correctly by running ' '`kubectl describe nodes` and looking for the ' f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. ' @@ -865,7 +891,10 @@ def check_credentials(context: Optional[str], # provider if their cluster GPUs are not setup correctly. 
gpu_msg = '' try: - _, _, _, _ = get_accelerator_label_key_value(context, acc_type='', acc_count=0, check_mode=True) + _, _, _, _ = get_accelerator_label_key_value(context, + acc_type='', + acc_count=0, + check_mode=True) except exceptions.ResourcesUnavailableError as e: # If GPUs are not available, we return cluster as enabled (since it can # be a CPU-only cluster) but we also return the exception message which diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index e3f3a8bcee0..9b6d190c7ee 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -289,8 +289,8 @@ available_node_types: {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %} {{k8s_acc_label_key}}: {{k8s_acc_label_value}} {% endif %} - {% if k8s_tpu_topology_label_key is not none and k8s_tpu_topology_label_value is not none %} - {{k8s_tpu_topology_label_key}}: {{k8s_tpu_topology_label_value}} + {% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %} + {{k8s_topology_label_key}}: {{k8s_topology_label_value}} {% endif %} {% if k8s_spot_label_key is not none %} {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}} diff --git a/tests/common.py b/tests/common.py index d50ae7facdf..ad1f92f6455 100644 --- a/tests/common.py +++ b/tests/common.py @@ -64,8 +64,9 @@ def _get_az_mappings(_): monkeypatch.setattr( 'sky.provision.kubernetes.utils.detect_gpu_label_formatter', lambda *_args, **_kwargs: [kubernetes_utils.SkyPilotLabelFormatter, {}]) - monkeypatch.setattr('sky.provision.kubernetes.utils.detect_accelerator_resource', - lambda *_args, **_kwargs: [True, []]) + monkeypatch.setattr( + 'sky.provision.kubernetes.utils.detect_accelerator_resource', + lambda *_args, **_kwargs: [True, []]) monkeypatch.setattr('sky.provision.kubernetes.utils.check_instance_fits', lambda *_args, **_kwargs: [True, '']) monkeypatch.setattr('sky.provision.kubernetes.utils.get_spot_label', From 05c37aaf9f48ddf1c43da02040faeb2582d3ca58 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 20 Oct 2024 01:13:18 +0000 Subject: [PATCH 46/63] nit --- sky/provision/kubernetes/utils.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index b309a617271..fe299b67310 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -113,7 +113,7 @@ class GPULabelFormatter: """ @classmethod - def get_label_key(cls, accelerator: str = '') -> str: + def get_label_key(cls, accelerator: Optional[str] = None) -> str: """Returns the label key for GPU type used by the Kubernetes cluster""" raise NotImplementedError @@ -182,8 +182,7 @@ class SkyPilotLabelFormatter(GPULabelFormatter): LABEL_KEY = 'skypilot.co/accelerator' @classmethod - def get_label_key(cls, accelerator: str = '') -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: Optional[str] = None) -> str: return cls.LABEL_KEY @classmethod @@ -223,8 +222,7 @@ class CoreWeaveLabelFormatter(GPULabelFormatter): LABEL_KEY = 'gpu.nvidia.com/class' @classmethod - def get_label_key(cls, accelerator: str = '') -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: Optional[str] = None) -> str: return cls.LABEL_KEY @classmethod @@ -257,8 +255,8 @@ class GKELabelFormatter(GPULabelFormatter): TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology' @classmethod - def get_label_key(cls, accelerator: str = '') -> str: - if 
accelerator.startswith('tpu-'): + def get_label_key(cls, accelerator: Optional[str] = None) -> str: + if isinstance(accelerator, str) and accelerator.startswith('tpu-'): return cls.TPU_LABEL_KEY return cls.GPU_LABEL_KEY @@ -315,8 +313,7 @@ class GFDLabelFormatter(GPULabelFormatter): LABEL_KEY = 'nvidia.com/gpu.product' @classmethod - def get_label_key(cls, accelerator: str = '') -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: Optional[str] = None) -> str: return cls.LABEL_KEY @classmethod From 06d3879a4bfab5b90d6b4a7c8d147a39f99829b7 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 20 Oct 2024 01:58:24 +0000 Subject: [PATCH 47/63] Implement method for reading acc counts from node/pod object --- .../service_catalog/kubernetes_catalog.py | 25 ++++------- sky/provision/kubernetes/utils.py | 42 ++++++++++++------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 1b6d5b2b414..bb69fe5dab7 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -116,16 +116,10 @@ def list_accelerators_realtime( name_filter, accelerator_name, flags=regex_flags): continue - accelerator_count = 0 - if kubernetes_utils.GPU_RESOURCE_KEY in node.status.allocatable: - accelerator_count = int(node.status.allocatable[ - kubernetes_utils.GPU_RESOURCE_KEY]) - elif (kubernetes_utils.TPU_RESOURCE_KEY - in node.status.allocatable): - accelerator_count = int(node.status.allocatable[ - kubernetes_utils.TPU_RESOURCE_KEY]) - # Generate the GPU quantities for the accelerators + accelerator_count = ( + kubernetes_utils.get_node_accelerator_count( + node.status.allocatable)) if accelerator_name and accelerator_count > 0: for count in range(1, accelerator_count + 1): accelerators_qtys.add((accelerator_name, count)) @@ -134,16 +128,13 @@ def list_accelerators_realtime( # Get all the pods running on the node if (pod.spec.node_name == node.metadata.name and pod.status.phase in ['Running', 'Pending']): - # Iterate over all the containers in the pod and sum the - # GPU requests + # Iterate over all the containers in the pod and sum + # the GPU requests for container in pod.spec.containers: if container.resources.requests: - allocated_qty += int( - container.resources.requests.get( - kubernetes_utils.GPU_RESOURCE_KEY, 0)) - allocated_qty += int( - container.resources.requests.get( - kubernetes_utils.TPU_RESOURCE_KEY, 0)) + allocated_qty += ( + kubernetes_utils.get_node_accelerator_count( + container.resources.requests)) accelerators_available = accelerator_count - allocated_qty diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index fe299b67310..ee7dd8d0b89 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -2007,13 +2007,8 @@ def get_kubernetes_node_info( else: accelerator_name = None - accelerator_count = 0 - if GPU_RESOURCE_KEY in node.status.allocatable: - accelerator_count = int( - node.status.allocatable[GPU_RESOURCE_KEY]) - elif TPU_RESOURCE_KEY in node.status.allocatable: - accelerator_count = int( - node.status.allocatable[TPU_RESOURCE_KEY]) + accelerator_count = get_node_accelerator_count( + node.status.allocatable) for pod in pods: # Get all the pods running on the node @@ -2023,15 +2018,8 @@ def get_kubernetes_node_info( # GPU requests for container in pod.spec.containers: if container.resources.requests: - if GPU_RESOURCE_KEY in 
container.resources.requests:
-                            allocated_qty += int(
-                                container.resources.requests.get(
-                                    GPU_RESOURCE_KEY, 0))
-                        elif (TPU_RESOURCE_KEY
-                              in container.resources.requests):
-                            allocated_qty += int(
-                                container.resources.requests.get(
-                                    TPU_RESOURCE_KEY, 0))
+                        allocated_qty += get_node_accelerator_count(
+                            container.resources.requests)
 
         accelerators_available = accelerator_count - allocated_qty
 
@@ -2333,3 +2321,25 @@ def process_skypilot_pods(
     num_pods = len(cluster.pods)
     cluster.resources_str = f'{num_pods}x {cluster.resources}'
     return list(clusters.values()), jobs_controllers, serve_controllers
+
+
+def get_node_accelerator_count(attribute_dict: dict) -> int:
+    """Retrieves the count of accelerators from a node's resource dictionary.
+
+    This method checks the node's allocatable resources or the accelerators
+    already deployed on the node, using pod objects that describe resource
+    requests.
+
+    Args:
+        attribute_dict: Containing resource information from a node, such as
+            allocatable or requested resources.
+
+    Returns:
+        Number of accelerators allocated or available from the node. If no
+        resource is found, it returns 0.
+    """
+    if GPU_RESOURCE_KEY in attribute_dict:
+        return int(attribute_dict[GPU_RESOURCE_KEY])
+    elif TPU_RESOURCE_KEY in attribute_dict:
+        return int(attribute_dict[TPU_RESOURCE_KEY])
+    return 0

From 9a2046c52e2665fc9d9e3262e17bdfc9c0afff98 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sun, 20 Oct 2024 02:16:14 +0000
Subject: [PATCH 48/63] assertion update for is_tpu_vm

---
 sky/clouds/utils/gcp_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py
index 01ca1c89a71..f44788d2e6b 100644
--- a/sky/clouds/utils/gcp_utils.py
+++ b/sky/clouds/utils/gcp_utils.py
@@ -36,7 +36,7 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool:
 def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool:
     if not is_tpu(resources):
         return False
-    assert resources is not None
+    assert (resources is not None and len(resources.accelerators) == 1)
     acc, _ = list(resources.accelerators.items())[0]
     if kubernetes_utils.is_tpu_pod_slice(acc):
         return False

From 62b235f0553c5d388e6aee1fcfc2dc1a9d508f9c Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 21 Oct 2024 01:01:35 +0000
Subject: [PATCH 49/63] Exclude multi-host TPUs from being displayed in show-gpus

---
 .../service_catalog/kubernetes_catalog.py |  6 ++
 sky/provision/kubernetes/utils.py             | 97 +++++++++++--------
 2 files changed, 65 insertions(+), 38 deletions(-)

diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py
index bb69fe5dab7..46078ddc590 100644
--- a/sky/clouds/service_catalog/kubernetes_catalog.py
+++ b/sky/clouds/service_catalog/kubernetes_catalog.py
@@ -110,6 +110,12 @@ def list_accelerators_realtime(
                 accelerator_name = lf.get_accelerator_from_label_value(
                     node.metadata.labels.get(key))
 
+                # Exclude multi-host TPUs from being processed.
+                # TODO(Doyoung): Remove the logic when adding support for
+                # multi-host TPUs.
+ if kubernetes_utils.is_multi_host_tpu(node.metadata.labels): + continue + # Check if name_filter regex matches the accelerator_name regex_flags = 0 if case_sensitive else re.IGNORECASE if name_filter and not re.match( diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index ee7dd8d0b89..e9e7df135c9 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -543,36 +543,15 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', for node in node_list: if acc_type == node.metadata.labels[ GKELabelFormatter.TPU_LABEL_KEY]: - topology_value = node.metadata.labels[ - GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY] - # node_tpu_chip_count represents the number of TPU chips - # available in this node. If the node is part of a node pool - # forming a multi-host TPU podslice, it only reflects the - # number of TPU chips in this individual node, not the entire - # multi-host TPU podslice. + # TODO(Doyoung): Update the logic when adding support for + # multi-host TPUs. + if is_multi_host_tpu(node.metadata.labels): + continue node_tpu_chip_count = int(node.metadata.labels[ GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY]) - chip_dimensions = [ - int(chip_count) for chip_count in topology_value.split('x') - ] - # topology_chip_count represents the total number of TPU chips - # in the entire podslice, whether it is a single-host or - # multi-host TPU podslice. - topology_chip_count = functools.reduce(lambda x, y: x * y, - chip_dimensions) - # TODO(Doyoung): Update the naming scheme with multi-host TPU - # support. tpu_type = f'{acc_type}:{node_tpu_chip_count}' tpu_list_in_cluster.append(tpu_type) - # For multi-host TPU podslices, topology_chip_count and - # node_tpu_chip_count will differ, as topology_chip_count - # reflects the total across all hosts, while - # node_tpu_chip_count reflects only the chips in a single node. - # TODO(Doyoung): Remove the condition, - # node_tpu_chip_count == topology_chip_count, when adding - # multi-host TPU support. - if (node_tpu_chip_count == topology_chip_count and - topology_chip_count == acc_count): + if node_tpu_chip_count == acc_count: return True, None tpu_list_in_cluster_str = ','.join(tpu_list_in_cluster) # TODO(Doyoung): Update the error message raised with the multi-host @@ -729,6 +708,11 @@ def get_accelerator_label_key_value( # quantity is available since that is dynamic and can change # during scheduling. for node_name, label_list in node_labels.items(): + node_metadata_labels = dict(label_list) + # TODO(Doyoung): Update the logic when adding support for + # multi-host TPUs. 
+ if is_multi_host_tpu(node_metadata_labels): + continue for label, value in label_list: if (label_formatter.match_label_key(label) and label_formatter.get_accelerator_from_label_value( @@ -736,21 +720,16 @@ def get_accelerator_label_key_value( if is_tpu_pod_slice(acc_type): assert isinstance(label_formatter, GKELabelFormatter) - topology_label_key = ( - label_formatter.TPU_TOPOLOGY_LABEL_KEY) - labels_dict = dict(node_labels[node_name]) - if labels_dict.get( + if node_metadata_labels.get( label_formatter.TPU_LABEL_KEY) == acc_type: - topology_value = labels_dict.get( + topology_label_key = ( + label_formatter.TPU_TOPOLOGY_LABEL_KEY) + topology_value = node_metadata_labels.get( topology_label_key) assert topology_value is not None - chip_dimensions = [ - int(chip_count) - for chip_count in topology_value.split('x') - ] - num_chips = functools.reduce( - lambda x, y: x * y, chip_dimensions) - if num_chips == acc_count: + tpu_topology_chip_count = reduce_tpu_topology( + topology_value) + if tpu_topology_chip_count == acc_count: return (label, value, topology_label_key, topology_value) else: @@ -2023,6 +2002,12 @@ def get_kubernetes_node_info( accelerators_available = accelerator_count - allocated_qty + # Exclude multi-host TPUs from being processed. + # TODO(Doyoung): Remove the logic when adding support for + # multi-host TPUs. + if is_multi_host_tpu(node.metadata.labels): + continue + node_info_dict[node.metadata.name] = KubernetesNodeInfo( name=node.metadata.name, accelerator_type=accelerator_name, @@ -2343,3 +2328,39 @@ def get_node_accelerator_count(attribute_dict: dict) -> int: elif TPU_RESOURCE_KEY in attribute_dict: return int(attribute_dict[TPU_RESOURCE_KEY]) return 0 + + +def reduce_tpu_topology(topology: str): + """Computes the number of TPU chips from its topology string.""" + chip_dimensions = [int(chip_count) for chip_count in topology.split('x')] + # tpu_topology_chip_count represents the total number of TPU chips in the + # entire podslice, whether it is a single-host or multi-host TPU podslice. + tpu_topology_chip_count = functools.reduce( + lambda x, y: x * y, chip_dimensions) + return tpu_topology_chip_count + + +def is_multi_host_tpu(node_metadata_labels: dict): + """Determines whether the given node is a multi-host TPU configuration.""" + if GKELabelFormatter.TPU_LABEL_KEY in node_metadata_labels: + assert GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY in node_metadata_labels + topology_value = ( + node_metadata_labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY]) + accelerator_count_label_key = ( + GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY) + assert accelerator_count_label_key in node_metadata_labels + # node_tpu_chip_count represents the number of TPU chips + # available in this node. If the node is part of a node pool + # forming a multi-host TPU podslice, it only reflects the + # number of TPU chips in this individual node, not the entire + # multi-host TPU podslice. + node_tpu_chip_count = int( + node_metadata_labels[accelerator_count_label_key]) + topology_chip_count = reduce_tpu_topology(topology_value) + # For multi-host TPU podslices, topology_chip_count and + # node_tpu_chip_count will differ, as topology_chip_count + # reflects the total across all hosts, while + # node_tpu_chip_count reflects only the chips in a single node. 
+        if node_tpu_chip_count != topology_chip_count:
+            return True
+    return False

From 4db1e637f6efccdf870442a17b0963431ab759e1 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 21 Oct 2024 01:01:56 +0000
Subject: [PATCH 50/63] Notify users that multi-host TPUs are not supported from 'sky show-gpus'

---
 sky/cli.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sky/cli.py b/sky/cli.py
index 05bcd732070..124eb893241 100644
--- a/sky/cli.py
+++ b/sky/cli.py
@@ -3180,8 +3180,11 @@ def _output():
                 yield from k8s_realtime_table.get_string()
                 k8s_node_table = _get_kubernetes_node_info_table(context)
                 yield '\n\n'
+                # TODO(Doyoung): Update the message with the multi-host TPU
+                # support.
                 yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                       f'Kubernetes per node GPU availability'
+                       f'Kubernetes per node accelerator availability '
+                       '(Note: Multi-host TPUs are not supported.)'
                        f'{colorama.Style.RESET_ALL}\n')
                 yield from k8s_node_table.get_string()
             if kubernetes_autoscaling:

From 5923f104c34a93e6262698aa96f6c51941f40ad5 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 21 Oct 2024 01:04:29 +0000
Subject: [PATCH 51/63] format

---
 sky/provision/kubernetes/utils.py | 116 +++++++++++++++---------------
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index e9e7df135c9..a321bb39857 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -2198,6 +2198,64 @@ def is_tpu_pod_slice(accelerator: str) -> bool:
     return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
 
 
+def get_node_accelerator_count(attribute_dict: dict) -> int:
+    """Retrieves the count of accelerators from a node's resource dictionary.
+
+    This method checks the node's allocatable resources or the accelerators
+    already deployed on the node, using pod objects that describe resource
+    requests.
+
+    Args:
+        attribute_dict: Containing resource information from a node, such as
+            allocatable or requested resources.
+
+    Returns:
+        Number of accelerators allocated or available from the node. If no
+        resource is found, it returns 0.
+    """
+    if GPU_RESOURCE_KEY in attribute_dict:
+        return int(attribute_dict[GPU_RESOURCE_KEY])
+    elif TPU_RESOURCE_KEY in attribute_dict:
+        return int(attribute_dict[TPU_RESOURCE_KEY])
+    return 0
+
+
+def reduce_tpu_topology(topology: str):
+    """Computes the number of TPU chips from its topology string."""
+    chip_dimensions = [int(chip_count) for chip_count in topology.split('x')]
+    # tpu_topology_chip_count represents the total number of TPU chips in the
+    # entire podslice, whether it is a single-host or multi-host TPU podslice.
+    tpu_topology_chip_count = functools.reduce(lambda x, y: x * y,
+                                               chip_dimensions)
+    return tpu_topology_chip_count
+
+
+def is_multi_host_tpu(node_metadata_labels: dict):
+    """Determines whether the given node is a multi-host TPU configuration."""
+    if GKELabelFormatter.TPU_LABEL_KEY in node_metadata_labels:
+        assert GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY in node_metadata_labels
+        topology_value = (
+            node_metadata_labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY])
+        accelerator_count_label_key = (
+            GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY)
+        assert accelerator_count_label_key in node_metadata_labels
+        # node_tpu_chip_count represents the number of TPU chips
+        # available in this node. If the node is part of a node pool
+        # forming a multi-host TPU podslice, it only reflects the
+        # number of TPU chips in this individual node, not the entire
+        # multi-host TPU podslice.
+        node_tpu_chip_count = int(
+            node_metadata_labels[accelerator_count_label_key])
+        topology_chip_count = reduce_tpu_topology(topology_value)
+        # For multi-host TPU podslices, topology_chip_count and
+        # node_tpu_chip_count will differ, as topology_chip_count
+        # reflects the total across all hosts, while
+        # node_tpu_chip_count reflects only the chips in a single node.
+        if node_tpu_chip_count != topology_chip_count:
+            return True
+        return False
+
+
 @dataclasses.dataclass
 class KubernetesSkyPilotClusterInfo:
     cluster_name_on_cloud: str
@@ -2306,61 +2364,3 @@ def process_skypilot_pods(
     num_pods = len(cluster.pods)
     cluster.resources_str = f'{num_pods}x {cluster.resources}'
     return list(clusters.values()), jobs_controllers, serve_controllers
-
-
-def get_node_accelerator_count(attribute_dict: dict) -> int:
-    """Retrieves the count of accelerators from a node's resource dictionary.
-
-    This method checks the node's allocatable resources or the accelerators
-    already deployed on the node, using pod objects that describe resource
-    requests.
-
-    Args:
-        attribute_dict): Containing resource information from a node, such as
-            allocatable or requested resources.
-
-    Returns:
-        Number of accelerators allocated or available from the node. If no
-        resource is found, it returns 0.
-    """
-    if GPU_RESOURCE_KEY in attribute_dict:
-        return int(attribute_dict[GPU_RESOURCE_KEY])
-    elif TPU_RESOURCE_KEY in attribute_dict:
-        return int(attribute_dict[TPU_RESOURCE_KEY])
-    return 0
-
-
-def reduce_tpu_topology(topology: str):
-    """Computes the number of TPU chips from its topology string."""
-    chip_dimensions = [int(chip_count) for chip_count in topology.split('x')]
-    # tpu_topology_chip_count represents the total number of TPU chips in the
-    # entire podslice, whether it is a single-host or multi-host TPU podslice.
-    tpu_topology_chip_count = functools.reduce(
-        lambda x, y: x * y, chip_dimensions)
-    return tpu_topology_chip_count
-
-
-def is_multi_host_tpu(node_metadata_labels: dict):
-    """Determines whether the given node is a multi-host TPU configuration."""
-    if GKELabelFormatter.TPU_LABEL_KEY in node_metadata_labels:
-        assert GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY in node_metadata_labels
-        topology_value = (
-            node_metadata_labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY])
-        accelerator_count_label_key = (
-            GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY)
-        assert accelerator_count_label_key in node_metadata_labels
-        # node_tpu_chip_count represents the number of TPU chips
-        # available in this node. If the node is part of a node pool
-        # forming a multi-host TPU podslice, it only reflects the
-        # number of TPU chips in this individual node, not the entire
-        # multi-host TPU podslice.
-        node_tpu_chip_count = int(
-            node_metadata_labels[accelerator_count_label_key])
-        topology_chip_count = reduce_tpu_topology(topology_value)
-        # For multi-host TPU podslices, topology_chip_count and
-        # node_tpu_chip_count will differ, as topology_chip_count
-        # reflects the total across all hosts, while
-        # node_tpu_chip_count reflects only the chips in a single node.
-        if node_tpu_chip_count != topology_chip_count:
-            return True
-    return False

From fa2e6708bb5f4882becead44a7de6c0a22a6e15a Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 21 Oct 2024 01:05:25 +0000
Subject: [PATCH 52/63] nit

---
 sky/provision/kubernetes/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index a321bb39857..720ee5295d3 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -2195,6 +2195,7 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
 
 
 def is_tpu_pod_slice(accelerator: str) -> bool:
+    """Determines if the given accelerator is a TPU supported on GKE."""
     return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
 

From c1ee117ebef292fca0e775415955e41437497eb5 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 21 Oct 2024 01:41:52 +0000
Subject: [PATCH 53/63] display warning message from show-gpus conditionally

---
 sky/cli.py                        |  9 +++++++--
 sky/clouds/utils/gcp_utils.py     |  2 +-
 sky/provision/kubernetes/utils.py | 18 ++++++++++++++----
 sky/resources.py                  |  2 +-
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/sky/cli.py b/sky/cli.py
index 124eb893241..5db241562b8 100644
--- a/sky/cli.py
+++ b/sky/cli.py
@@ -3182,9 +3182,14 @@ def _output():
                 yield '\n\n'
                 # TODO(Doyoung): Update the message with the multi-host TPU
                 # support.
+                k8s_per_node_acc_message = (
+                    'Kubernetes per node accelerator availability ')
+                if kubernetes_utils.multi_host_tpu_exists_in_cluster(
+                        context):
+                    k8s_per_node_acc_message += (
+                        '(Note: Multi-host TPUs are not supported.)')
                 yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                       f'Kubernetes per node accelerator availability '
-                       '(Note: Multi-host TPUs are not supported.)'
+                       f'{k8s_per_node_acc_message}'
                        f'{colorama.Style.RESET_ALL}\n')
                 yield from k8s_node_table.get_string()
             if kubernetes_autoscaling:
diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py
index f44788d2e6b..0fa7db5f344 100644
--- a/sky/clouds/utils/gcp_utils.py
+++ b/sky/clouds/utils/gcp_utils.py
@@ -38,7 +38,7 @@ def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool:
         return False
     assert (resources is not None and len(resources.accelerators) == 1)
     acc, _ = list(resources.accelerators.items())[0]
-    if kubernetes_utils.is_tpu_pod_slice(acc):
+    if kubernetes_utils.is_tpu_on_gke(acc):
         return False
     if resources.accelerator_args is None:
         return True
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 720ee5295d3..8c177dc5e60 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -288,7 +288,7 @@ def get_accelerator_from_label_value(cls, value: str) -> str:
                 # to distinguish between a3-high and a3-mega instances
                 return 'H100'
             return acc
-        elif is_tpu_pod_slice(value):
+        elif is_tpu_on_gke(value):
             return value
         else:
             raise ValueError(
@@ -582,7 +582,7 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
             node.metadata.labels[gpu_label_key] == gpu_label_val
         ]
         assert len(gpu_nodes) > 0, 'GPU nodes not found'
-        if is_tpu_pod_slice(acc_type):
+        if is_tpu_on_gke(acc_type):
             # If requested accelerator is a TPU type, check if the cluster
             # has sufficient TPU resource to meet the requirement.
            fits, reason = check_tpu_fits(k8s_instance_type, gpu_nodes)
@@ -717,7 +717,7 @@ def get_accelerator_label_key_value(
                 if (label_formatter.match_label_key(label) and
                         label_formatter.get_accelerator_from_label_value(
                             value) == acc_type):
-                    if is_tpu_pod_slice(acc_type):
+                    if is_tpu_on_gke(acc_type):
                         assert isinstance(label_formatter,
                                           GKELabelFormatter)
                         if node_metadata_labels.get(
@@ -2194,7 +2194,7 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
     return pods
 
 
-def is_tpu_pod_slice(accelerator: str) -> bool:
+def is_tpu_on_gke(accelerator: str) -> bool:
     """Determines if the given accelerator is a TPU supported on GKE."""
     return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
 
@@ -2257,6 +2257,16 @@ def is_multi_host_tpu(node_metadata_labels: dict):
     return False
 
 
+def multi_host_tpu_exists_in_cluster(context: Optional[str] = None):
+    """Checks if there exists a multi-host TPU within the cluster."""
+    multi_host_tpu_in_cluster = False
+    nodes = get_kubernetes_nodes(context)
+    for node in nodes:
+        if is_multi_host_tpu(node.metadata.labels):
+            multi_host_tpu_in_cluster = True
+    return multi_host_tpu_in_cluster
+
+
 @dataclasses.dataclass
 class KubernetesSkyPilotClusterInfo:
     cluster_name_on_cloud: str
diff --git a/sky/resources.py b/sky/resources.py
index 764858afc10..af303abaeeb 100644
--- a/sky/resources.py
+++ b/sky/resources.py
@@ -566,7 +566,7 @@ def _set_accelerators(
             acc, _ = list(accelerators.items())[0]
             if 'tpu' in acc.lower():
                 if self.cloud is None:
-                    if kubernetes_utils.is_tpu_pod_slice(acc):
+                    if kubernetes_utils.is_tpu_on_gke(acc):
                         self._cloud = clouds.Kubernetes()
                     else:
                         self._cloud = clouds.GCP()

From cbce4d5115b3afbed37b2a911ae51d28c2774db3 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Wed, 23 Oct 2024 02:27:28 +0000
Subject: [PATCH 54/63] update sky show-gpus

---
 sky/clouds/service_catalog/kubernetes_catalog.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py
index 46078ddc590..2bed325dfb6 100644
--- a/sky/clouds/service_catalog/kubernetes_catalog.py
+++ b/sky/clouds/service_catalog/kubernetes_catalog.py
@@ -127,8 +127,12 @@ def list_accelerators_realtime(
                 kubernetes_utils.get_node_accelerator_count(
                     node.status.allocatable))
             if accelerator_name and accelerator_count > 0:
-                for count in range(1, accelerator_count + 1):
-                    accelerators_qtys.add((accelerator_name, count))
+                if kubernetes_utils.is_tpu_on_gke(accelerator_name):
+                    accelerators_qtys.add(
+                        (accelerator_name, accelerator_count))
+                else:
+                    for count in range(1, accelerator_count + 1):
+                        accelerators_qtys.add((accelerator_name, count))
 
         for pod in pods:
             # Get all the pods running on the node

From 241efc046f6a0d400462cfe3d47daa37f032de2b Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Fri, 25 Oct 2024 04:47:00 +0000
Subject: [PATCH 55/63] update get_accelerator_label_key_value

---
 sky/provision/kubernetes/utils.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 8c177dc5e60..44ca8919315 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -615,7 +615,8 @@ def get_accelerator_label_key_value(
     context: Optional[str],
     acc_type: str,
    acc_count: Optional[int],
-    check_mode=False) -> Tuple[str, str, str, str]:
+    check_mode=False
+    ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
     """Returns the label key and value for the given GPU/TPU type.
 
     Args:
@@ -629,8 +630,8 @@ def get_accelerator_label_key_value(
     Returns:
         A tuple of the accelerator label key, value, topology label key, and
         topology value. The topology label key and value are populated only if
-        the requested accelerator type is TPU. Returns empty strings if
-        check_mode is True.
+        the requested accelerator type is TPU. Returns None if check_mode is
+        True.
     Raises:
         ResourcesUnavailableError: Can be raised from the following conditions:
             - The cluster does not have GPU/TPU resources
@@ -655,12 +656,12 @@ def get_accelerator_label_key_value(
             # If check mode is enabled and autoscaler is set, we can return
             # early since we assume the cluster autoscaler will handle GPU
             # node provisioning.
-            return '', '', '', ''
+            return None, None, None, None
         formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type)
         assert formatter is not None, ('Unsupported autoscaler type:'
                                        f' {autoscaler_type}')
         return formatter.get_label_key(acc_type), formatter.get_label_value(
-            acc_type), '', ''
+            acc_type), None, None
 
     has_gpus, cluster_resources = detect_accelerator_resource(context)
     if has_gpus:
@@ -700,7 +701,7 @@ def get_accelerator_label_key_value(
             if check_mode:
                 # If check mode is enabled and we reached so far, we can
                 # conclude that the cluster is setup correctly and return.
-                return '', '', '', ''
+                return None, None, None, None
             # Search in node_labels to see if any node has the requested
             # GPU type.
             # Note - this only checks if the label is available on a
@@ -735,7 +736,7 @@ def get_accelerator_label_key_value(
                         else:
                             continue
                     else:
-                        return label, value, '', ''
+                        return label, value, None, None
 
             # If no node is found with the requested acc_type, raise error
             with ux_utils.print_exception_no_traceback():

From 2fbb4eb7298ac922c302acda193c51bfc9a55a25 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Fri, 25 Oct 2024 07:55:55 +0000
Subject: [PATCH 56/63] format

---
 sky/provision/kubernetes/utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 44ca8919315..c38a8907463 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -612,11 +612,11 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
 
 
 def get_accelerator_label_key_value(
-    context: Optional[str],
-    acc_type: str,
-    acc_count: Optional[int],
-    check_mode=False
-    ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
+        context: Optional[str],
+        acc_type: str,
+        acc_count: Optional[int],
+        check_mode=False
+) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
     """Returns the label key and value for the given GPU/TPU type.
 
     Args:

From 9e8d53d00a0418cf8d3fa293c4316458c28bedc3 Mon Sep 17 00:00:00 2001
From: landscapepainter
Date: Sat, 26 Oct 2024 20:28:32 +0000
Subject: [PATCH 57/63] format

---
 sky/provision/kubernetes/instance.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index f633c577cea..a18f5a92933 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -530,9 +530,9 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
                          'are currently not unsupported.')
         raise config_lib.KubernetesError(
             _lack_resource_msg('TPU',
-                pod_spec,
-                details=error_message,
-                extra_msg=extra_message))
+                               pod_spec,
+                               details=error_message,
+                               extra_msg=extra_message))
 
     else:
         # Re-raise the exception if it's a different error

From 0a0eac28d9fd1222c5c1ea4fdc726889ceb5e361 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Fri, 1 Nov 2024 02:11:48 +0000
Subject: [PATCH 58/63] format

---
 sky/provision/kubernetes/instance.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index b033f454655..d6cbeed0c4e 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -525,13 +525,14 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
     # TPU support.
     elif 'Invalid resource requests for google.com/tpu.' in error_message:
         extra_message = ('Verify if the cluster has a TPU slice node with '
-                        'a topology matching the number of TPU(s) '
-                        'requested. Note that multi-host TPU podslices '
-                        'are currently not unsupported.')
-        raise config_lib.KubernetesError(_lack_resource_msg('TPU',
-                                                            pod_spec,
-                                                            details=error_msg,
-                                                            extra_msg=extra_message))
+                         'a topology matching the number of TPU(s) '
+                         'requested. Note that multi-host TPU podslices '
+                         'are currently not supported.')
+        raise config_lib.KubernetesError(
+            _lack_resource_msg('TPU',
+                               pod_spec,
+                               details=error_message,
+                               extra_msg=extra_message))
     else:
         # Re-raise the exception if it's a different error
         raise e

From 9dbaa72a10efae88db99779301d65a9a5a5eae7f Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Fri, 1 Nov 2024 02:28:29 +0000
Subject: [PATCH 59/63] update comment

---
 sky/clouds/service_catalog/kubernetes_catalog.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py
index 4e27fb1b674..711d126963b 100644
--- a/sky/clouds/service_catalog/kubernetes_catalog.py
+++ b/sky/clouds/service_catalog/kubernetes_catalog.py
@@ -128,6 +128,9 @@ def list_accelerators_realtime(
                     node.status.allocatable))
             if accelerator_name and accelerator_count > 0:
+                # TPUs are counted in a different way compared to GPUs.
+                # GPUs on a multi-GPU node can be requested in smaller
+                # counts, but TPUs are considered an atomic unit.
                if kubernetes_utils.is_tpu_on_gke(accelerator_name):
                    accelerators_qtys.add(
                        (accelerator_name, accelerator_count))
                else:
                    for count in range(1, accelerator_count + 1):
                        accelerators_qtys.add((accelerator_name, count))

From f5e1d373f7cf2cc91ccdb4c8526c21e3eee114ef Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Fri, 1 Nov 2024 04:02:37 +0000
Subject: [PATCH 60/63] resolve review comments

---
 sky/clouds/kubernetes.py             |  5 +++++
 sky/provision/kubernetes/instance.py |  8 ++++++--
 sky/provision/kubernetes/utils.py    | 29 ++++++++++++++--------------
 sky/resources.py                     |  5 +++--
 sky/templates/kubernetes-ray.yml.j2  | 12 ++++--------
 5 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py
index 6322b8ddcd3..6633a5b8c0d 100644
--- a/sky/clouds/kubernetes.py
+++ b/sky/clouds/kubernetes.py
@@ -364,6 +364,7 @@ def make_deploy_resources_variables(
         k8s_acc_label_value = None
         k8s_topology_label_key = None
         k8s_topology_label_value = None
+        k8s_resource_key = None
         tpu_requested = False
 
         # If GPU/TPUs are requested, set node label to match the GPU/TPU type.
@@ -375,6 +376,9 @@ def make_deploy_resources_variables(
                 if (k8s_acc_label_key ==
                         kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY):
                     tpu_requested = True
+                    k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY
+                else:
+                    k8s_resource_key = kubernetes_utils.GPU_RESOURCE_KEY
 
         port_mode = network_utils.get_port_mode(None)
 
@@ -439,6 +443,7 @@ def make_deploy_resources_variables(
             'tpu_requested': tpu_requested,
             'k8s_topology_label_key': k8s_topology_label_key,
             'k8s_topology_label_value': k8s_topology_label_value,
+            'k8s_resource_key': k8s_resource_key,
             'image_id': image_id,
         }
 
diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index d6c4169f553..936a4f389f2 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -613,8 +613,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 'override runtimeClassName in ~/.sky/config.yaml. '
                 'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html')  # pylint: disable=line-too-long
 
-    needs_gpus = (pod_spec['spec']['containers'][0].get('resources', {}).get(
-        'limits', {}).get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0)
+    needs_gpus = False
+    limits = pod_spec['spec']['containers'][0].get('resources',
+                                                   {}).get('limits')
+    if limits is not None:
+        needs_gpus = limits.get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0
+
     if nvidia_runtime_exists and needs_gpus:
         pod_spec['spec']['runtimeClassName'] = 'nvidia'
 
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index c38a8907463..09675aad395 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -86,7 +86,9 @@
 # https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run
 GKE_TPU_ACCELERATOR_TO_GENERATION = {
     'tpu-v4-podslice': 'v4',
+    # Only single-host v5e TPU configurations are allowed.
     'tpu-v5-lite-device': 'v5e',
+    # Multi-host compatible v5e TPU configurations are allowed.
     'tpu-v5-lite-podslice': 'v5e',
     'tpu-v5p-slice': 'v5p',
 }
 
@@ -256,7 +258,7 @@ class GKELabelFormatter(GPULabelFormatter):
 
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
-        if isinstance(accelerator, str) and accelerator.startswith('tpu-'):
+        if accelerator is not None and accelerator.startswith('tpu-'):
             return cls.TPU_LABEL_KEY
         return cls.GPU_LABEL_KEY
 
@@ -436,10 +438,10 @@ def detect_accelerator_resource(
     nodes = get_kubernetes_nodes(context)
     for node in nodes:
         cluster_resources.update(node.status.allocatable.keys())
-    has_gpu = (GPU_RESOURCE_KEY in cluster_resources or
-               TPU_RESOURCE_KEY in cluster_resources)
+    has_accelerator = (GPU_RESOURCE_KEY in cluster_resources or
+                       TPU_RESOURCE_KEY in cluster_resources)
 
-    return has_gpu, cluster_resources
+    return has_accelerator, cluster_resources
 
 
 @functools.lru_cache(maxsize=10)
@@ -868,10 +870,10 @@ def check_credentials(context: Optional[str],
     # provider if their cluster GPUs are not setup correctly.
     gpu_msg = ''
     try:
-        _, _, _, _ = get_accelerator_label_key_value(context,
-                                                     acc_type='',
-                                                     acc_count=0,
-                                                     check_mode=True)
+        get_accelerator_label_key_value(context,
+                                        acc_type='',
+                                        acc_count=0,
+                                        check_mode=True)
     except exceptions.ResourcesUnavailableError as e:
         # If GPUs are not available, we return cluster as enabled (since it can
         # be a CPU-only cluster) but we also return the exception message which
@@ -2222,7 +2224,7 @@ def get_node_accelerator_count(attribute_dict: dict) -> int:
     return 0
 
 
-def reduce_tpu_topology(topology: str):
+def reduce_tpu_topology(topology: str) -> int:
     """Computes the number of TPU chips from its topology string."""
     chip_dimensions = [int(chip_count) for chip_count in topology.split('x')]
     # tpu_topology_chip_count represents the total number of TPU chips in the
@@ -2232,7 +2234,7 @@ def reduce_tpu_topology(topology: str):
     return tpu_topology_chip_count
 
 
-def is_multi_host_tpu(node_metadata_labels: dict):
+def is_multi_host_tpu(node_metadata_labels: dict) -> bool:
     """Determines whether the given node is a multi-host TPU configuration."""
     if GKELabelFormatter.TPU_LABEL_KEY in node_metadata_labels:
         assert GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY in node_metadata_labels
@@ -2258,14 +2260,13 @@ def is_multi_host_tpu(node_metadata_labels: dict):
     return False
 
 
-def multi_host_tpu_exists_in_cluster(context: Optional[str] = None):
+def multi_host_tpu_exists_in_cluster(context: Optional[str] = None) -> bool:
     """Checks if there exists a multi-host TPU within the cluster."""
-    multi_host_tpu_in_cluster = False
     nodes = get_kubernetes_nodes(context)
     for node in nodes:
         if is_multi_host_tpu(node.metadata.labels):
-            multi_host_tpu_in_cluster = True
-    return multi_host_tpu_in_cluster
+            return True
+    return False
 
 
 @dataclasses.dataclass
diff --git a/sky/resources.py b/sky/resources.py
index 8c9a95b04fb..0528113f2fb 100644
--- a/sky/resources.py
+++ b/sky/resources.py
@@ -588,8 +588,9 @@ def _set_accelerators(
                     else:
                         self._cloud = clouds.GCP()
                 assert (self.cloud.is_same_cloud(clouds.GCP()) or
-                        self.cloud.is_same_cloud(clouds.Kubernetes())
-                       ), 'Cloud must be GCP or Kubernetes.'
+                        self.cloud.is_same_cloud(clouds.Kubernetes())), (
+                            'Cloud must be GCP or Kubernetes for TPU '
+                            'accelerators.')
 
             if accelerator_args is None:
                 accelerator_args = {}
 
             use_tpu_vm = accelerator_args.get('tpu_vm', True)
diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2
index 9b6d190c7ee..79a27527f9a 100644
--- a/sky/templates/kubernetes-ray.yml.j2
+++ b/sky/templates/kubernetes-ray.yml.j2
@@ -401,15 +401,13 @@ available_node_types:
               requests:
                 cpu: {{cpus}}
                 memory: {{memory}}G
-              {% if tpu_requested %}
+              {% if k8s_resource_key is not none %}
              # The number of requested google.com/tpu chips must equal the
              # total number of TPU chips available on the TPU slice node,
              # whether it is a node from a multi-host or a single-host TPU
              # slice. Example reference:
              # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
-              google.com/tpu: {{accelerator_count}}
-              {% else %}
-              nvidia.com/gpu: {{accelerator_count}}
+              {{k8s_resource_key}}: {{accelerator_count}}
               {% endif %}
               {% if k8s_fuse_device_required %}
               # Kubernetes resource exposed by the fuse device manager
@@ -418,10 +416,8 @@ available_node_types:
               {% endif %}
               limits:
                 # Limits need to be defined for GPU/TPU requests
-              {% if tpu_requested %}
-              google.com/tpu: {{accelerator_count}}
-              {% else %}
-              nvidia.com/gpu: {{accelerator_count}}
+              {% if k8s_resource_key is not none %}
+              {{k8s_resource_key}}: {{accelerator_count}}
               {% endif %}
               {% if k8s_fuse_device_required %}
               smarter-devices/fuse: "1"

From 688c0b4588ad6133d60269493f87008f3b466934 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 2 Nov 2024 22:37:21 +0000
Subject: [PATCH 61/63] update tpuvm_mnist.yaml

---
 examples/tpu/tpuvm_mnist.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/tpu/tpuvm_mnist.yaml b/examples/tpu/tpuvm_mnist.yaml
index d1fd434fad6..41b14283fac 100644
--- a/examples/tpu/tpuvm_mnist.yaml
+++ b/examples/tpu/tpuvm_mnist.yaml
@@ -5,7 +5,7 @@ resources:
 
 # The setup command. Will be run under the working directory.
 setup: |
-  git clone https://github.com/google/flax.git --branch v0.8.2
+  git clone https://github.com/google/flax.git --branch v0.10.1
 
   conda activate flax
   if [ $? -eq 0 ]; then
@@ -15,7 +15,7 @@ setup: |
     conda activate flax
     # Make sure to install TPU related packages in a conda env to avoid package conflicts.
     pip install \
-      -f https://storage.googleapis.com/jax-releases/libtpu_releases.html "jax[tpu]==0.4.25" \
+      -f https://storage.googleapis.com/jax-releases/libtpu_releases.html "jax[tpu]==0.4.35" \
       clu \
       tensorflow tensorflow-datasets
     pip install -e flax

From 2dec7f9a2c9187f43088764e7331475b53fb7c41 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sun, 3 Nov 2024 01:00:56 +0000
Subject: [PATCH 62/63] resolve comments

---
 sky/provision/kubernetes/instance.py | 2 ++
 sky/provision/kubernetes/utils.py    | 5 +++++
 sky/resources.py                     | 3 ++-
 sky/templates/kubernetes-ray.yml.j2  | 3 +++
 4 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index 936a4f389f2..66d22099a85 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -619,6 +619,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     if limits is not None:
         needs_gpus = limits.get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0
 
+    # TPU pods provisioned on GKE use the default containerd runtime.
+    # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview  # pylint: disable=line-too-long
     if nvidia_runtime_exists and needs_gpus:
         pod_spec['spec']['runtimeClassName'] = 'nvidia'
 
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 09675aad395..5982e5acefa 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -732,6 +732,9 @@ def get_accelerator_label_key_value(
                             assert topology_value is not None
                             tpu_topology_chip_count = reduce_tpu_topology(
                                 topology_value)
+                            # For single-host TPUs, no two different
+                            # topologies map to the same number of TPU
+                            # chips.
                             if tpu_topology_chip_count == acc_count:
                                 return (label, value, topology_label_key,
                                         topology_value)
@@ -2217,6 +2220,8 @@ def get_node_accelerator_count(attribute_dict: dict) -> int:
         Number of accelerators allocated or available from the node. If no
         resource is found, it returns 0.
     """
+    assert not (GPU_RESOURCE_KEY in attribute_dict and
+                TPU_RESOURCE_KEY in attribute_dict)
     if GPU_RESOURCE_KEY in attribute_dict:
         return int(attribute_dict[GPU_RESOURCE_KEY])
     elif TPU_RESOURCE_KEY in attribute_dict:
diff --git a/sky/resources.py b/sky/resources.py
index 0528113f2fb..deb05a6eade 100644
--- a/sky/resources.py
+++ b/sky/resources.py
@@ -596,7 +596,8 @@ def _set_accelerators(
                 accelerator_args = {}
 
             use_tpu_vm = accelerator_args.get('tpu_vm', True)
-            if self.cloud.is_same_cloud(clouds.GCP()):
+            if (self.cloud.is_same_cloud(clouds.GCP()) and
+                    not kubernetes_utils.is_tpu_on_gke(acc)):
                 if 'runtime_version' not in accelerator_args:
 
                     def _get_default_runtime_version() -> str:
diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2
index 79a27527f9a..0b52238bf6c 100644
--- a/sky/templates/kubernetes-ray.yml.j2
+++ b/sky/templates/kubernetes-ray.yml.j2
@@ -450,6 +450,9 @@ setup_commands:
   # 755 permissions, and the user of the provisioned pod is not necessarily
   # a root. Hence, we need to update the write permission so the logs can be
   # properly written.
+  # TODO(Doyoung): Investigate why the TPU workload fails to run without
+  # execution permission, such as granting 766 to the log file. Check if it's
+  # a must and see if there's a workaround to grant minimum permission.
   - sudo chmod 777 /tmp/tpu_logs;
 {% endif %}

From dc23e886d9b44d8d73feb80988afef9a460141b9 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 4 Nov 2024 01:34:51 +0000
Subject: [PATCH 63/63] update display message for show-gpus

---
 sky/cli.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sky/cli.py b/sky/cli.py
index 47975dd1ef9..a86aa5e5b11 100644
--- a/sky/cli.py
+++ b/sky/cli.py
@@ -3206,7 +3206,9 @@ def _output():
                 if kubernetes_utils.multi_host_tpu_exists_in_cluster(
                         context):
                     k8s_per_node_acc_message += (
-                        '(Note: Multi-host TPUs are not supported.)')
+                        '(Note: Multi-host TPUs are detected and excluded '
+                        'from the display as multi-host TPUs are not '
+                        'supported.)')
                 yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                        f'{k8s_per_node_acc_message}'
                        f'{colorama.Style.RESET_ALL}\n')
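For readers following the series, the multi-host detection logic that these patches converge on can be summarized outside the diffs. The sketch below is a minimal, self-contained rendition of reduce_tpu_topology and is_multi_host_tpu as they stand after PATCH 60. The two cloud.google.com/gke-tpu-* label keys appear verbatim in the diffs above; the accelerator-count key is an assumption (GKE's cloud.google.com/gke-accelerator-count node label), since the series refers to it only as GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY.

    import functools
    from typing import Dict

    TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator'
    TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology'
    # Assumed label value; the patches above reference this key only as
    # GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY.
    ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count'


    def reduce_tpu_topology(topology: str) -> int:
        """Computes the total TPU chip count from a topology string, e.g. '2x2x4'."""
        return functools.reduce(lambda x, y: x * y,
                                (int(dim) for dim in topology.split('x')))


    def is_multi_host_tpu(node_metadata_labels: Dict[str, str]) -> bool:
        """A node belongs to a multi-host podslice when the chips it exposes
        are fewer than the chips implied by the slice's full topology."""
        if TPU_LABEL_KEY not in node_metadata_labels:
            return False
        node_chips = int(node_metadata_labels[ACCELERATOR_COUNT_LABEL_KEY])
        slice_chips = reduce_tpu_topology(
            node_metadata_labels[TPU_TOPOLOGY_LABEL_KEY])
        return node_chips != slice_chips


    # A v4 node exposing 4 chips of a 2x2x4 (16-chip) podslice is multi-host
    # and is skipped by the checks in the patches above; a single-host v5e
    # node exposing all 2x4 = 8 chips is not.
    assert is_multi_host_tpu({
        TPU_LABEL_KEY: 'tpu-v4-podslice',
        TPU_TOPOLOGY_LABEL_KEY: '2x2x4',
        ACCELERATOR_COUNT_LABEL_KEY: '4',
    })
    assert not is_multi_host_tpu({
        TPU_LABEL_KEY: 'tpu-v5-lite-podslice',
        TPU_TOPOLOGY_LABEL_KEY: '2x4',
        ACCELERATOR_COUNT_LABEL_KEY: '8',
    })

This chip-count comparison is why the series needs no extra GKE API call: everything required to classify a node is already present in its metadata labels.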