From a9294744d4aa401f57ccbf44b7af9aaf74a0826b Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Mon, 16 Sep 2024 08:59:11 +0000 Subject: [PATCH 01/63] initial version of TPU support on GKE --- sky/clouds/kubernetes.py | 10 ++++ sky/provision/kubernetes/instance.py | 11 ++++ sky/provision/kubernetes/utils.py | 85 +++++++++++++++++++++++----- sky/resources.py | 20 +++++-- sky/task.py | 69 +++++++++++++++++++++- sky/templates/kubernetes-ray.yml.j2 | 16 +++++- 6 files changed, 190 insertions(+), 21 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 47f8a435ebb..dbf65dd9e4a 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -261,11 +261,18 @@ def make_deploy_resources_variables( k8s_acc_label_key = None k8s_acc_label_value = None + tpu_is_requested = False # If GPUs are requested, set node label to match the GPU type. if acc_count > 0 and acc_type is not None: k8s_acc_label_key, k8s_acc_label_value = \ kubernetes_utils.get_gpu_label_key_value(acc_type) + if k8s_acc_label_key == 'cloud.google.com/gke-tpu-accelerator': + tpu_is_requested = True + + if tpu_is_requested: + k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( + kubernetes_utils.get_tpu_topology_key_value()) port_mode = network_utils.get_port_mode(None) @@ -330,6 +337,9 @@ def make_deploy_resources_variables( 'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE, 'k8s_spot_label_key': spot_label_key, 'k8s_spot_label_value': spot_label_value, + 'tpu_is_requested': tpu_is_requested, + 'k8s_tpu_topology_label_key': k8s_tpu_topology_label_key, + 'k8s_tpu_topology_label_value': k8s_tpu_topology_label_value, 'image_id': image_id, } diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 83f9c34592e..a197b778b36 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -577,6 +577,17 @@ def _create_pods(region: str, cluster_name_on_cloud: str, } } + # Doyoung: Add comments on what kind of taint is added to TPU nodes by GCP(google.com/tpu=present:NoSchedule) + # and explain what those are for. And explain why we need toleration at this point to ignore that taint. + if 'cloud.google.com/gke-tpu-accelerator' in config.node_config['spec']['nodeSelector']: + tpu_toleration = { + 'key': 'google.com/tpu', + 'operator': 'Equal', + 'value': 'present', + 'effect': 'NoSchedule' + } + pod_spec['spec']['tolerations'] = [tpu_toleration] + pod = kubernetes.core_api(context).create_namespaced_pod( namespace, pod_spec) created_pods[pod.metadata.name] = pod diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 6aa6400dfa1..0c2c9044b10 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -183,11 +183,21 @@ class GKELabelFormatter(GPULabelFormatter): label, which is used to identify the GPU type. 
""" - LABEL_KEY = 'cloud.google.com/gke-accelerator' + GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator' + TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator' + TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology' @classmethod def get_label_key(cls) -> str: - return cls.LABEL_KEY + return cls.GPU_LABEL_KEY + + @classmethod + def is_label_key(cls, label: str) -> Tuple[str, str]: + return label in [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] + + @classmethod + def get_tpu_topology_label_key(cls) -> str: + return cls.TPU_TOPOLOGY_LABEL_KEY @classmethod def get_label_value(cls, accelerator: str) -> str: @@ -205,6 +215,10 @@ def get_accelerator_from_label_value(cls, value: str) -> str: # to distinguish between a3-high and a3-mega instances return 'H100' return acc + elif value.startswith('tpu-'): + # Doyoung: This may need some updates depending on the namings + # required from up/down-stream. + return value else: raise ValueError( f'Invalid accelerator name in GKE cluster: {value}') @@ -304,21 +318,27 @@ def detect_gpu_label_formatter( # Get all labels across all nodes node_labels: Dict[str, List[Tuple[str, str]]] = {} nodes = get_kubernetes_nodes() + is_tpu = False for node in nodes: node_labels[node.metadata.name] = [] for label, value in node.metadata.labels.items(): + if 'cloud.google.com/gke-tpu-accelerator' == label: + is_tpu = True node_labels[node.metadata.name].append((label, value)) label_formatter = None - # Check if the node labels contain any of the GPU label prefixes - for lf in LABEL_FORMATTER_REGISTRY: - label_key = lf.get_label_key() - for _, label_list in node_labels.items(): - for label, _ in label_list: - if label.startswith(label_key): - label_formatter = lf() - return label_formatter, node_labels + if is_tpu: + label_formatter = GKELabelFormatter() + else: + # Check if the node labels contain any of the GPU label prefixes + for lf in LABEL_FORMATTER_REGISTRY: + label_key = lf.get_label_key() + for _, label_list in node_labels.items(): + for label, _ in label_list: + if label.startswith(label_key): + label_formatter = lf() + return label_formatter, node_labels return label_formatter, node_labels @@ -338,7 +358,7 @@ def detect_gpu_resource() -> Tuple[bool, Set[str]]: nodes = get_kubernetes_nodes() for node in nodes: cluster_resources.update(node.status.allocatable.keys()) - has_gpu = 'nvidia.com/gpu' in cluster_resources + has_gpu = 'nvidia.com/gpu' in cluster_resources or 'google.com/tpu' in cluster_resources return has_gpu, cluster_resources @@ -523,13 +543,25 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # correctly setup and will behave as expected. for node_name, label_list in node_labels.items(): for label, value in label_list: - if label == label_formatter.get_label_key(): + if label_formatter.is_label_key(label): is_valid, reason = label_formatter.validate_label_value( value) if not is_valid: raise exceptions.ResourcesUnavailableError( f'Node {node_name!r} in Kubernetes cluster has ' f'invalid GPU label: {label}={value}. {reason}') + ##### + # for node_name, label_list in node_labels.items(): + # for label, value in label_list: + # if label == label_formatter.get_label_key(): + # is_valid, reason = label_formatter.validate_label_value( + # value) + # if not is_valid: + # raise exceptions.ResourcesUnavailableError( + # f'Node {node_name!r} in Kubernetes cluster has ' + # f'invalid GPU label: {label}={value}. 
{reason}') + ##### + if check_mode: # If check mode is enabled and we reached so far, we can # conclude that the cluster is setup correctly and return. @@ -543,10 +575,20 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # during scheduling. for node_name, label_list in node_labels.items(): for label, value in label_list: - if (label == k8s_acc_label_key and + if (label_formatter.is_label_key(label) and label_formatter.get_accelerator_from_label_value( value) == acc_type): return label, value + + #### + # for node_name, label_list in node_labels.items(): + # for label, value in label_list: + # if (label == k8s_acc_label_key and + # label_formatter.get_accelerator_from_label_value( + # value) == acc_type): + # return label, value + #### + # If no node is found with the requested acc_type, raise error with ux_utils.print_exception_no_traceback(): suffix = '' @@ -580,6 +622,23 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: f'to set up GPUs.{suffix}') +def get_tpu_topology_key_value(): + label_formatter, node_labels = detect_gpu_label_formatter() + for node_name, label_list in node_labels.items(): + for label, value in label_list: + if label == label_formatter.get_tpu_topology_label_key(): + is_valid, reason = label_formatter.validate_label_value(value) + if not is_valid: + raise exceptions.ResourcesUnavailableError( + f'Node {node_name!r} in Kubernetes cluster has ' + f'invalid GPU label: {label}={value}. {reason}') + + for node_name, label_list in node_labels.items(): + for label, value in label_list: + if label == label_formatter.get_tpu_topology_label_key(): + return label, value + + def get_head_ssh_port(cluster_name: str, namespace: str, context: Optional[str]) -> int: svc_name = f'{cluster_name}-head-ssh' diff --git a/sky/resources.py b/sky/resources.py index 2f19cd1aa01..51c9fe1b2b5 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -564,13 +564,25 @@ def _set_accelerators( acc, _ = list(accelerators.items())[0] if 'tpu' in acc.lower(): - if self.cloud is None: - self._cloud = clouds.GCP() + # Doyoung: Confirm if below two lines can be removed. Perhaps, + # raise an error when cloud is not specified by the user + # if self.cloud is None: + # self._cloud = clouds.GCP() assert self.cloud.is_same_cloud( - clouds.GCP()), 'Cloud must be GCP.' + clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes()), 'Cloud must be GCP or Kubernetes.' if accelerator_args is None: accelerator_args = {} - use_tpu_vm = accelerator_args.get('tpu_vm', True) + + # Doyoung: May need to understand the usage of tpu_vm and make + # proper adjustments to the following snippets. 
+        if self.cloud.is_same_cloud(clouds.GCP()):
+            use_tpu_vm = accelerator_args.get('tpu_vm', True)
+        else:
+            use_tpu_vm = False
+
+        ####
+        # use_tpu_vm = accelerator_args.get('tpu_vm', True)
+        ####
         if self.instance_type is not None and use_tpu_vm:
             if self.instance_type != 'TPU-VM':
                 with ux_utils.print_exception_no_traceback():
diff --git a/sky/task.py b/sky/task.py
index cebc616dc6d..9a561d005da 100644
--- a/sky/task.py
+++ b/sky/task.py
@@ -927,6 +927,10 @@ def _get_preferred_store(
 
         if self.best_resources is not None:
             storage_cloud = self.best_resources.cloud
+            # if storage_cloud == store type read from task
+            #     storage_region = self.best_resources.region
+            # else:
+            #     storage_region = None
             storage_region = self.best_resources.region
         else:
             resources = list(self.resources)[0]
@@ -955,11 +959,72 @@ def sync_storage_mounts(self) -> None:
         file_mounts of the form
         ``{ /remote/path: {s3,gs,..}:// }``.
         """
+        # for storage in self.storage_mounts.values():
+        #     if len(storage.stores) == 0:
+        #         store_type, store_region = self._get_preferred_store()
+        #         self.storage_plans[storage] = store_type
+        #         storage.add_store(store_type, store_region)
+        #     else:
+        #         # We will download the first store that is added to remote.
+        #         self.storage_plans[storage] = list(storage.stores.keys())[0]
+
+        # Things to consider
+        # 1. Created storage must respect the region from _get_preferred_store()
+        # 2. If other store type was specified from the task yaml, it should create
+        # at that cloud type.
+
+        # Perhaps, pass Storage object to _get_preferred_store() and that gets
+        # the highest priority from _get_preferred_store. And if this passed
+        # store type aligns with best_resources.cloud, then return it with the
+        # best_resources.region. If not, just return
+
+        # write a method that can read the store information from task yaml given
+        # the storage object name. Then, using this method, we can check the store type
+        # meant to be created from this task, and if this store type is identical to
+        # what _get_preferred_store type would create, we use the region value from it
+        # as well. If they are not the same, region value is set to None.
+
+        # So given the storage name, we can find the store type attempted to be created
+        # from this task. Get that store type, and if it's identical to best_resources.cloud,
+        # use best_resources.region as well.
+
+        # --> This approach requires to read the task yaml to get the init_store type, which
+        # means, we need a way to obtain the yaml path to read again, but currently, the path
+        # is not passed all the way down.
+
+        # So, why do we need the init_store from task yaml?
+        # This method currently only checks if there's already a storage created with
+        # len(storage.stores) == 0, but now this is not sufficient as it is possible to
+        # create another store with identical name (actually, need to confirm what to do with this behavior)
+
+        # Originally,
+        # if len(storage.stores) == 0:
+        # else:
+        # was good enough since create storage from task yaml was already done
+        # before reaching this point. So reaching this point with len(storage.stores) == 0
+        # was
+
+        # Approach 2:
+        # Get store_type and store_region from _get_preferred_store method.
+        # Check if this store_type is in storage.stores,
+        # if it does not exist, we run add_store(store_type, store_region)
+        # if it exists, we can
+
+        # There are two main points making this difficult to implement without knowing the store type
+        # attempted to be created from current task yaml.
+        # 1. 
When store type given from task yaml is different from best_resources.cloud, the storage
+        # is created under wrong cloud provider.
+        # 2.
+
         for storage in self.storage_mounts.values():
-            if len(storage.stores) == 0:
-                store_type, store_region = self._get_preferred_store()
+            store_type, store_region = self._get_preferred_store(storage)
+            if store_type not in storage.stores:
                 self.storage_plans[storage] = store_type
                 storage.add_store(store_type, store_region)
+            # if len(storage.stores) == 0:
+            #     store_type, store_region = self._get_preferred_store()
+            #     self.storage_plans[storage] = store_type
+            #     storage.add_store(store_type, store_region)
             else:
                 # We will download the first store that is added to remote.
                 self.storage_plans[storage] = list(storage.stores.keys())[0]
diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2
index 1b09409ad0e..287dda90aa3 100644
--- a/sky/templates/kubernetes-ray.yml.j2
+++ b/sky/templates/kubernetes-ray.yml.j2
@@ -283,12 +283,15 @@ available_node_types:
          restartPolicy: Never
 
-          # Add node selector if GPUs are requested:
+          # Add node selector if GPU/TPUs are requested:
          {% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %}
          nodeSelector:
            {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
            {{k8s_acc_label_key}}: {{k8s_acc_label_value}}
            {% endif %}
+            {% if k8s_tpu_topology_label_key is not none and k8s_tpu_topology_label_value is not none %}
+            {{k8s_tpu_topology_label_key}}: {{k8s_tpu_topology_label_value}}
+            {% endif %}
            {% if k8s_spot_label_key is not none %}
            {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
            {% endif %}
@@ -398,14 +401,23 @@ available_node_types:
              requests:
                cpu: {{cpus}}
                memory: {{memory}}G
+                {% if tpu_is_requested %}
+                google.com/tpu: {{accelerator_count}}
+                {% else %}
                nvidia.com/gpu: {{accelerator_count}}
+                {% endif %}
                {% if k8s_fuse_device_required %}
                # Kubernetes resource exposed by the fuse device manager
                # https://gitlab.com/arm-research/smarter/smarter-device-manager
                smarter-devices/fuse: "1"
                {% endif %}
              limits:
-                nvidia.com/gpu: {{accelerator_count}} # Limits need to be defined for GPU requests
+                # Limits need to be defined for GPU/TPU requests
+                {% if tpu_is_requested %}
+                google.com/tpu: {{accelerator_count}}
+                {% else %}
+                nvidia.com/gpu: {{accelerator_count}}
+                {% endif %}
                {% if k8s_fuse_device_required %}
                smarter-devices/fuse: "1"
                {% endif %}

From 80e1877174a8cb8f47eee39eb587baa92aa0bdea Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 16 Sep 2024 09:15:40 +0000
Subject: [PATCH 02/63] revert unnecessary change

---
 sky/task.py | 65 ++---------------------------------------------------
 1 file changed, 2 insertions(+), 63 deletions(-)

diff --git a/sky/task.py b/sky/task.py
index 9a561d005da..ae2877e1a8f 100644
--- a/sky/task.py
+++ b/sky/task.py
@@ -959,72 +959,11 @@ def sync_storage_mounts(self) -> None:
         file_mounts of the form
         ``{ /remote/path: {s3,gs,..}:// }``.
         """
-        # for storage in self.storage_mounts.values():
-        #     if len(storage.stores) == 0:
-        #         store_type, store_region = self._get_preferred_store()
-        #         self.storage_plans[storage] = store_type
-        #         storage.add_store(store_type, store_region)
-        #     else:
-        #         # We will download the first store that is added to remote.
-        #         self.storage_plans[storage] = list(storage.stores.keys())[0]
-
-        # Things to consider
-        # 1. Created storage must respect the region from _get_preferred_store()
-        # 2. 
If other store type was specified from the task yaml, it should create
-        # at that cloud type.
-
-        # Perhaps, pass Storage object to _get_preferred_store() and that gets
-        # the highest priority from _get_preferred_store. And if this passed
-        # store type aligns with best_resources.cloud, then return it with the
-        # best_resources.region. If not, just return
-
-        # write a method that can read the store information from task yaml given
-        # the storage object name. Then, using this method, we can check the store type
-        # meant to be created from this task, and if this store type is identical to
-        # what _get_preferred_store type would create, we use the region value from it
-        # as well. If they are not the same, region value is set to None.
-
-        # So given the storage name, we can find the store type attempted to be created
-        # from this task. Get that store type, and if it's identical to best_resources.cloud,
-        # use best_resources.region as well.
-
-        # --> This approach requires to read the task yaml to get the init_store type, which
-        # means, we need a way to obtain the yaml path to read again, but currently, the path
-        # is not passed all the way down.
-
-        # So, why do we need the init_store from task yaml?
-        # This method currently only checks if there's already a storage created with
-        # len(storage.stores) == 0, but now this is not sufficient as it is possible to
-        # create another store with identical name (actually, need to confirm what to do with this behavior)
-
-        # Originally,
-        # if len(storage.stores) == 0:
-        # else:
-        # was good enough since create storage from task yaml was already done
-        # before reaching this point. So reaching this point with len(storage.stores) == 0
-        # was
-
-        # Approach 2:
-        # Get store_type and store_region from _get_preferred_store method.
-        # Check if this store_type is in storage.stores,
-        # if it does not exist, we run add_store(store_type, store_region)
-        # if it exists, we can
-
-        # There are two main points making this difficult to implement without knowing the store type
-        # attempted to be created from current task yaml.
-        # 1. When store type given from task yaml is different from best_resources.cloud, the storage
-        # is created under wrong cloud provider.
-        # 2.
-
         for storage in self.storage_mounts.values():
-            store_type, store_region = self._get_preferred_store(storage)
-            if store_type not in storage.stores:
+            if len(storage.stores) == 0:
+                store_type, store_region = self._get_preferred_store()
                 self.storage_plans[storage] = store_type
                 storage.add_store(store_type, store_region)
-            # if len(storage.stores) == 0:
-            #     store_type, store_region = self._get_preferred_store()
-            #     self.storage_plans[storage] = store_type
-            #     storage.add_store(store_type, store_region)
             else:
                 # We will download the first store that is added to remote. 
self.storage_plans[storage] = list(storage.stores.keys())[0] From 70a07abcd1ae2190e74a9897a1647b96c6865105 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Mon, 16 Sep 2024 09:16:19 +0000 Subject: [PATCH 03/63] revert --- sky/task.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sky/task.py b/sky/task.py index ae2877e1a8f..cebc616dc6d 100644 --- a/sky/task.py +++ b/sky/task.py @@ -927,10 +927,6 @@ def _get_preferred_store( if self.best_resources is not None: storage_cloud = self.best_resources.cloud - # if storage_cloud == store type read from task - # storage_region = self.best_resources.region - # else: - # storage_region = None storage_region = self.best_resources.region else: resources = list(self.resources)[0] From 0cba9a5347e48d6003e7d17bd8464af657a8b51b Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 06:19:08 +0000 Subject: [PATCH 04/63] use TPU_LABEL_KEY constant --- sky/provision/kubernetes/instance.py | 2 +- sky/provision/kubernetes/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index a197b778b36..735f9366365 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -579,7 +579,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str, # Doyoung: Add comments on what kind of taint is added to TPU nodes by GCP(google.com/tpu=present:NoSchedule) # and explain what those are for. And explain why we need toleration at this point to ignore that taint. - if 'cloud.google.com/gke-tpu-accelerator' in config.node_config['spec']['nodeSelector']: + if kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY in config.node_config['spec']['nodeSelector']: tpu_toleration = { 'key': 'google.com/tpu', 'operator': 'Equal', diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 0c2c9044b10..81f24ffebaa 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -322,7 +322,7 @@ def detect_gpu_label_formatter( for node in nodes: node_labels[node.metadata.name] = [] for label, value in node.metadata.labels.items(): - if 'cloud.google.com/gke-tpu-accelerator' == label: + if GKELabelFormatter.TPU_LABEL_KEY == label: is_tpu = True node_labels[node.metadata.name].append((label, value)) From 17bcbd8f73e8582e1a7221a7671d50c19e14b28e Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 06:22:12 +0000 Subject: [PATCH 05/63] nit --- sky/clouds/kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index dbf65dd9e4a..233c2388704 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -267,7 +267,7 @@ def make_deploy_resources_variables( if acc_count > 0 and acc_type is not None: k8s_acc_label_key, k8s_acc_label_value = \ kubernetes_utils.get_gpu_label_key_value(acc_type) - if k8s_acc_label_key == 'cloud.google.com/gke-tpu-accelerator': + if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: tpu_is_requested = True if tpu_is_requested: From 9233bf5d7363c3fbd02ab17ebf35826e2810b0a3 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 06:56:43 +0000 Subject: [PATCH 06/63] nit --- sky/clouds/kubernetes.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 233c2388704..d1fa151869c 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -261,18 +261,18 @@ 
def make_deploy_resources_variables( k8s_acc_label_key = None k8s_acc_label_value = None + k8s_tpu_topology_label_key = None + k8s_tpu_topology_label_value = None tpu_is_requested = False - # If GPUs are requested, set node label to match the GPU type. + # If GPU/TPUs are requested, set node label to match the GPU/TPU type. if acc_count > 0 and acc_type is not None: k8s_acc_label_key, k8s_acc_label_value = \ kubernetes_utils.get_gpu_label_key_value(acc_type) if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: tpu_is_requested = True - - if tpu_is_requested: - k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_tpu_topology_key_value()) + k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( + kubernetes_utils.get_tpu_topology_key_value()) port_mode = network_utils.get_port_mode(None) From 12e62c05ca1bae2f4b6240b382d7fbd02e37e503 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 06:59:39 +0000 Subject: [PATCH 07/63] update detect_gpu_label_formatter() to use match_label_key() --- sky/provision/kubernetes/utils.py | 46 +++++++++++++++++++------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 81f24ffebaa..d9bca557831 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -79,12 +79,17 @@ class GPULabelFormatter: def get_label_key(cls) -> str: """Returns the label key for GPU type used by the Kubernetes cluster""" raise NotImplementedError - + @classmethod def get_label_value(cls, accelerator: str) -> str: """Given a GPU type, returns the label value to be used""" raise NotImplementedError + @classmethod + def match_label_key(cls, label_key: str) -> bool: + """Checks if the given label key matches the formatter's label keys""" + raise NotImplementedError + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: """Given a label value, returns the GPU type""" @@ -141,6 +146,10 @@ def get_label_value(cls, accelerator: str) -> str: # See sky.utils.kubernetes.gpu_labeler. 
return accelerator.lower() + @classmethod + def match_label_key(cls, label_key: str) -> bool: + return label_key == cls.LABEL_KEY + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: return value.upper() @@ -171,6 +180,10 @@ def get_label_key(cls) -> str: def get_label_value(cls, accelerator: str) -> str: return accelerator.upper() + @classmethod + def match_label_key(cls, label_key: str) -> bool: + return label_key == cls.LABEL_KEY + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: return value @@ -192,7 +205,7 @@ def get_label_key(cls) -> str: return cls.GPU_LABEL_KEY @classmethod - def is_label_key(cls, label: str) -> Tuple[str, str]: + def match_label_key(cls, label: str) -> bool: return label in [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] @classmethod @@ -252,6 +265,10 @@ def get_label_value(cls, accelerator: str) -> str: As a result, we do not support get_label_value for GFDLabelFormatter.""" raise NotImplementedError + @classmethod + def match_label_key(cls, label_key: str) -> bool: + return label_key == cls.LABEL_KEY + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: """Searches against a canonical list of NVIDIA GPUs and pattern @@ -318,27 +335,20 @@ def detect_gpu_label_formatter( # Get all labels across all nodes node_labels: Dict[str, List[Tuple[str, str]]] = {} nodes = get_kubernetes_nodes() - is_tpu = False for node in nodes: node_labels[node.metadata.name] = [] for label, value in node.metadata.labels.items(): - if GKELabelFormatter.TPU_LABEL_KEY == label: - is_tpu = True node_labels[node.metadata.name].append((label, value)) label_formatter = None - if is_tpu: - label_formatter = GKELabelFormatter() - else: - # Check if the node labels contain any of the GPU label prefixes - for lf in LABEL_FORMATTER_REGISTRY: - label_key = lf.get_label_key() - for _, label_list in node_labels.items(): - for label, _ in label_list: - if label.startswith(label_key): - label_formatter = lf() - return label_formatter, node_labels + # Check if the node labels contain any of the GPU label prefixes + for lf in LABEL_FORMATTER_REGISTRY: + for _, label_list in node_labels.items(): + for label, _ in label_list: + if lf.match_label_key(label): + label_formatter = lf() + return label_formatter, node_labels return label_formatter, node_labels @@ -543,7 +553,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # correctly setup and will behave as expected. for node_name, label_list in node_labels.items(): for label, value in label_list: - if label_formatter.is_label_key(label): + if label_formatter.match_label_key(label): is_valid, reason = label_formatter.validate_label_value( value) if not is_valid: @@ -575,7 +585,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # during scheduling. 
for node_name, label_list in node_labels.items(): for label, value in label_list: - if (label_formatter.is_label_key(label) and + if (label_formatter.match_label_key(label) and label_formatter.get_accelerator_from_label_value( value) == acc_type): return label, value From c795fe7212cc1935bb06adbdda701145c4aa5d77 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 07:12:39 +0000 Subject: [PATCH 08/63] tidy get_gpu_label_key_value --- sky/provision/kubernetes/utils.py | 41 +++++++++---------------------- 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index d9bca557831..41870cf59f9 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -229,8 +229,6 @@ def get_accelerator_from_label_value(cls, value: str) -> str: return 'H100' return acc elif value.startswith('tpu-'): - # Doyoung: This may need some updates depending on the namings - # required from up/down-stream. return value else: raise ValueError( @@ -354,21 +352,24 @@ def detect_gpu_label_formatter( def detect_gpu_resource() -> Tuple[bool, Set[str]]: - """Checks if the Kubernetes cluster has nvidia.com/gpu resource. + """Checks if the Kubernetes cluster has accelerator resource. - If nvidia.com/gpu resource is missing, that typically means that the - Kubernetes cluster does not have GPUs or the nvidia GPU operator and/or - device drivers are not installed. + Two types of accelerator resources are available which are each checked + with nvidia.com/gpu and google.com/tpu. If nvidia.com/gpu resource is + missing, that typically means that the Kubernetes cluster does not have + GPUs or the nvidia GPU operator and/or device drivers are not installed. Returns: - bool: True if the cluster has nvidia.com/gpu resource, False otherwise. + bool: True if the cluster has nvidia.com/gpu or google.com/tpu + resource, False otherwise. """ # Get the set of resources across all nodes cluster_resources: Set[str] = set() nodes = get_kubernetes_nodes() for node in nodes: cluster_resources.update(node.status.allocatable.keys()) - has_gpu = 'nvidia.com/gpu' in cluster_resources or 'google.com/tpu' in cluster_resources + has_gpu = ('nvidia.com/gpu' in cluster_resources or + 'google.com/tpu' in cluster_resources) return has_gpu, cluster_resources @@ -560,23 +561,12 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: raise exceptions.ResourcesUnavailableError( f'Node {node_name!r} in Kubernetes cluster has ' f'invalid GPU label: {label}={value}. {reason}') - ##### - # for node_name, label_list in node_labels.items(): - # for label, value in label_list: - # if label == label_formatter.get_label_key(): - # is_valid, reason = label_formatter.validate_label_value( - # value) - # if not is_valid: - # raise exceptions.ResourcesUnavailableError( - # f'Node {node_name!r} in Kubernetes cluster has ' - # f'invalid GPU label: {label}={value}. {reason}') - ##### if check_mode: # If check mode is enabled and we reached so far, we can # conclude that the cluster is setup correctly and return. return '', '' - k8s_acc_label_key = label_formatter.get_label_key() + # Search in node_labels to see if any node has the requested # GPU type. 
# Note - this only checks if the label is available on a @@ -589,15 +579,6 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: label_formatter.get_accelerator_from_label_value( value) == acc_type): return label, value - - #### - # for node_name, label_list in node_labels.items(): - # for label, value in label_list: - # if (label == k8s_acc_label_key and - # label_formatter.get_accelerator_from_label_value( - # value) == acc_type): - # return label, value - #### # If no node is found with the requested acc_type, raise error with ux_utils.print_exception_no_traceback(): @@ -607,7 +588,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: for node_name, label_list in node_labels.items(): all_labels.extend(label_list) gpus_available = set( - v for k, v in all_labels if k == k8s_acc_label_key) + v for k, v in all_labels if label_formatter.match_label_key(k)) suffix = f' Available GPUs on the cluster: {gpus_available}' raise exceptions.ResourcesUnavailableError( 'Could not find any node in the Kubernetes cluster ' From 1c895f0b6df59f5e6e8c6a3b1f6414565e7f14af Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 07:16:19 +0000 Subject: [PATCH 09/63] nit --- sky/clouds/kubernetes.py | 6 +++--- sky/templates/kubernetes-ray.yml.j2 | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index d1fa151869c..2d3d4874088 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -263,14 +263,14 @@ def make_deploy_resources_variables( k8s_acc_label_value = None k8s_tpu_topology_label_key = None k8s_tpu_topology_label_value = None - tpu_is_requested = False + tpu_requested = False # If GPU/TPUs are requested, set node label to match the GPU/TPU type. 
if acc_count > 0 and acc_type is not None: k8s_acc_label_key, k8s_acc_label_value = \ kubernetes_utils.get_gpu_label_key_value(acc_type) if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: - tpu_is_requested = True + tpu_requested = True k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( kubernetes_utils.get_tpu_topology_key_value()) @@ -337,7 +337,7 @@ def make_deploy_resources_variables( 'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE, 'k8s_spot_label_key': spot_label_key, 'k8s_spot_label_value': spot_label_value, - 'tpu_is_requested': tpu_is_requested, + 'tpu_requested': tpu_requested, 'k8s_tpu_topology_label_key': k8s_tpu_topology_label_key, 'k8s_tpu_topology_label_value': k8s_tpu_topology_label_value, 'image_id': image_id, diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 287dda90aa3..d219bc6670e 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -401,7 +401,7 @@ available_node_types: requests: cpu: {{cpus}} memory: {{memory}}G - {% if tpu_is_requested %} + {% if tpu_requested %} google.com/tpu: {{accelerator_count}} {% else %} nvidia.com/gpu: {{accelerator_count}} @@ -413,7 +413,7 @@ available_node_types: {% endif %} limits: # Limits need to be defined for GPU/TPU requests - {% if tpu_is_requested %} + {% if tpu_requested %} google.com/tpu: {{accelerator_count}} {% else %} nvidia.com/gpu: {{accelerator_count}} From a8f5b6bf6a77429c494aae44d0b3db3fa6da4024 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 07:19:25 +0000 Subject: [PATCH 10/63] update method name --- sky/clouds/kubernetes.py | 2 +- sky/provision/kubernetes/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 2d3d4874088..8ea5b89afb3 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -272,7 +272,7 @@ def make_deploy_resources_variables( if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: tpu_requested = True k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_tpu_topology_key_value()) + kubernetes_utils.get_tpu_topology_label_key_value()) port_mode = network_utils.get_port_mode(None) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 41870cf59f9..45e3bf3b80b 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -613,7 +613,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: f'to set up GPUs.{suffix}') -def get_tpu_topology_key_value(): +def get_tpu_topology_label_key_value(): label_formatter, node_labels = detect_gpu_label_formatter() for node_name, label_list in node_labels.items(): for label, value in label_list: From bdb34690e9464bdfd9f4b7253d0c4283cfca945e Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 07:24:49 +0000 Subject: [PATCH 11/63] update get_gke_accelerator_name to support TPU --- sky/provision/kubernetes/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 45e3bf3b80b..e6fc3a20325 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -123,6 +123,8 @@ def get_gke_accelerator_name(accelerator: str) -> str: # A100-80GB, L4, H100-80GB and H100-MEGA-80GB # have a different name pattern. 
return 'nvidia-{}'.format(accelerator.lower()) + elif accelerator.startswith('tpu-'): + return accelerator else: return 'nvidia-tesla-{}'.format(accelerator.lower()) From 1d2d24399544bbf4db8604e217dcd6a56b027c19 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 08:04:21 +0000 Subject: [PATCH 12/63] add support for get_label_keys method due to TPU label key --- sky/provision/kubernetes/instance.py | 3 +- sky/provision/kubernetes/utils.py | 45 +++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 735f9366365..9dd0e2eb62d 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -180,8 +180,9 @@ def _lack_resource_msg(resource: str, '`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.' # pylint: disable=line-too-long f' Full error: {event_message}') gpu_lf_keys = [ - lf.get_label_key() + key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY + for key in lf.get_label_keys() ] if pod.spec.node_selector: for label_key in pod.spec.node_selector.keys(): diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index e6fc3a20325..38aaa3ce82b 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -76,10 +76,15 @@ class GPULabelFormatter: """ @classmethod - def get_label_key(cls) -> str: + def get_label_key(cls, accelerator: str = None) -> str: """Returns the label key for GPU type used by the Kubernetes cluster""" raise NotImplementedError - + + @classmethod + def get_label_keys(cls) -> List[str]: + """Returns a list of label keys for GPU used by Kubernetes cluster.""" + pass + @classmethod def get_label_value(cls, accelerator: str) -> str: """Given a GPU type, returns the label value to be used""" @@ -139,8 +144,13 @@ class SkyPilotLabelFormatter(GPULabelFormatter): LABEL_KEY = 'skypilot.co/accelerator' @classmethod - def get_label_key(cls) -> str: + def get_label_key(cls, accelerator: str = None) -> str: + del accelerator # Unused return cls.LABEL_KEY + + @classmethod + def get_label_keys(cls) -> str: + return [cls.LABEL_KEY] @classmethod def get_label_value(cls, accelerator: str) -> str: @@ -175,9 +185,14 @@ class CoreWeaveLabelFormatter(GPULabelFormatter): LABEL_KEY = 'gpu.nvidia.com/class' @classmethod - def get_label_key(cls) -> str: + def get_label_key(cls, accelerator: str = None) -> str: + del accelerator # Unused return cls.LABEL_KEY + @classmethod + def get_label_keys(cls) -> str: + return [cls.LABEL_KEY] + @classmethod def get_label_value(cls, accelerator: str) -> str: return accelerator.upper() @@ -203,9 +218,15 @@ class GKELabelFormatter(GPULabelFormatter): TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology' @classmethod - def get_label_key(cls) -> str: + def get_label_key(cls, accelerator: str = None) -> str: + if accelerator.startswith('tpu-'): + return cls.TPU_LABEL_KEY return cls.GPU_LABEL_KEY + @classmethod + def get_label_keys(cls) -> str: + return [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] + @classmethod def match_label_key(cls, label: str) -> bool: return label in [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] @@ -255,9 +276,14 @@ class GFDLabelFormatter(GPULabelFormatter): LABEL_KEY = 'nvidia.com/gpu.product' @classmethod - def get_label_key(cls) -> str: + def get_label_key(cls, accelerator: str = None) -> str: + del accelerator # Unused return cls.LABEL_KEY + @classmethod + def get_label_keys(cls) -> str: + return 
[cls.LABEL_KEY] + @classmethod def get_label_value(cls, accelerator: str) -> str: """An accelerator can map to many Nvidia GFD labels @@ -528,7 +554,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type) assert formatter is not None, ('Unsupported autoscaler type:' f' {autoscaler_type}') - return formatter.get_label_key(), formatter.get_label_value(acc_type) + return formatter.get_label_key(acc_type), formatter.get_label_value(acc_type) has_gpus, cluster_resources = detect_gpu_resource() if has_gpus: @@ -540,7 +566,10 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # detected, raise error with ux_utils.print_exception_no_traceback(): supported_formats = ', '.join( - [f.get_label_key() for f in LABEL_FORMATTER_REGISTRY]) + key + for f in LABEL_FORMATTER_REGISTRY + for key in f.get_label_keys() + ) suffix = '' if env_options.Options.SHOW_DEBUG_INFO.get(): suffix = f' Found node labels: {node_labels}' From 92f4f382d8a51f7fe5c87c9f133293271eb320f0 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 08:09:06 +0000 Subject: [PATCH 13/63] syntax --- sky/provision/kubernetes/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 38aaa3ce82b..afe4b82c115 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -566,9 +566,9 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # detected, raise error with ux_utils.print_exception_no_traceback(): supported_formats = ', '.join( - key + [key for f in LABEL_FORMATTER_REGISTRY - for key in f.get_label_keys() + for key in f.get_label_keys()] ) suffix = '' if env_options.Options.SHOW_DEBUG_INFO.get(): @@ -592,12 +592,10 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: raise exceptions.ResourcesUnavailableError( f'Node {node_name!r} in Kubernetes cluster has ' f'invalid GPU label: {label}={value}. {reason}') - if check_mode: # If check mode is enabled and we reached so far, we can # conclude that the cluster is setup correctly and return. return '', '' - # Search in node_labels to see if any node has the requested # GPU type. 
# Note - this only checks if the label is available on a From 2662ec80e5c9b513f5d2700dcd97decb2de0ab18 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 17 Sep 2024 09:07:17 +0000 Subject: [PATCH 14/63] update get_tpu_topology_label_key_value --- sky/clouds/kubernetes.py | 2 +- sky/provision/kubernetes/utils.py | 73 ++++++++++++++++++++----------- 2 files changed, 49 insertions(+), 26 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 8ea5b89afb3..97447f25f88 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -272,7 +272,7 @@ def make_deploy_resources_variables( if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: tpu_requested = True k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_tpu_topology_label_key_value()) + kubernetes_utils.get_tpu_topology_label_key_value(acc_type)) port_mode = network_utils.get_port_mode(None) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index afe4b82c115..3000eb8e829 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -621,9 +621,9 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: suffix = f' Available GPUs on the cluster: {gpus_available}' raise exceptions.ResourcesUnavailableError( 'Could not find any node in the Kubernetes cluster ' - f'with {acc_type} GPU. Please ensure at least ' - f'one node in the cluster has {acc_type} GPU and node ' - 'labels are setup correctly. ' + f'with {acc_type}. Please ensure at least one node in the ' + f'cluster has {acc_type} and node labels are setup ' + 'correctly. ' f'Please refer to the documentation for more. {suffix}') else: # If GPU resources are not detected, raise error @@ -633,31 +633,54 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: suffix = (' Available resources on the cluster: ' f'{cluster_resources}') raise exceptions.ResourcesUnavailableError( - 'Could not detect GPU resources (`nvidia.com/gpu`) in ' - 'Kubernetes cluster. If this cluster contains GPUs, please ' - 'ensure GPU drivers are installed on the node. Check if the ' - 'GPUs are setup correctly by running `kubectl describe nodes` ' - 'and looking for the nvidia.com/gpu resource. ' - 'Please refer to the documentation on how ' - f'to set up GPUs.{suffix}') + 'Could not detect GPU/TPU resources (`nvidia.com/gpu` or ' + '`google.com/tpu`) in Kubernetes cluster. If this cluster ' + 'contains GPUs, please ensure GPU drivers are installed on ' + 'the node. Check if the GPUs are setup correctly by running ' + '`kubectl describe nodes` and looking for the nvidia.com/gpu ' + 'or google.com/tpu resource. Please refer to the documentation' + f'on how to set up GPUs.{suffix}') -def get_tpu_topology_label_key_value(): +def get_tpu_topology_label_key_value(accelerator: str) -> Tuple[str, str]: + """Returns the TPU topology label key and value for given accelerator type. + + Args: + accelerator: The TPU accelerator type required by the task. + + Returns: + A tuple of the TPU topology label key and value. + + Raises: + ResourcesUnavailableError: Can be raised from the following conditions: + - The cluster does not have TPU labels set up correctly. + - The cluster doesn't have any nodes with the specified TPU + accelerator type. + - The TPU topology label is missing for the specified accelerator. 
+ """ label_formatter, node_labels = detect_gpu_label_formatter() - for node_name, label_list in node_labels.items(): - for label, value in label_list: - if label == label_formatter.get_tpu_topology_label_key(): - is_valid, reason = label_formatter.validate_label_value(value) - if not is_valid: - raise exceptions.ResourcesUnavailableError( - f'Node {node_name!r} in Kubernetes cluster has ' - f'invalid GPU label: {label}={value}. {reason}') - - for node_name, label_list in node_labels.items(): - for label, value in label_list: - if label == label_formatter.get_tpu_topology_label_key(): - return label, value - + assert isinstance(label_formatter, GKELabelFormatter) + + tpu_label_key = label_formatter.TPU_LABEL_KEY + tpu_topology_label_key = label_formatter.TPU_TOPOLOGY_LABEL_KEY + + for labels in node_labels.values(): + labels_dict = dict(labels) + if labels_dict.get(tpu_label_key) == accelerator: + topology_value = labels_dict.get(tpu_topology_label_key) + return tpu_topology_label_key, topology_value + + # If TPU labels are not detected, raise error + with ux_utils.print_exception_no_traceback(): + suffix = '' + if env_options.Options.SHOW_DEBUG_INFO.get(): + suffix = (' Available node labels on the cluster: ' + f'{node_labels}') + raise exceptions.ResourcesUnavailableError( + f'Unable to find TPU topology for accelerator {accelerator!r}. ' + f'No node found with label `{tpu_label_key}={accelerator}` ' + f'or missing {tpu_topology_label_key!r} label.{suffix}') + def get_head_ssh_port(cluster_name: str, namespace: str, context: Optional[str]) -> int: From 58f8ad66c02c4e7af3498abfcf3f6ed94eb97186 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Fri, 20 Sep 2024 19:45:15 +0000 Subject: [PATCH 15/63] nit --- sky/resources.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/resources.py b/sky/resources.py index 51c9fe1b2b5..4b47e8e4247 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -568,14 +568,14 @@ def _set_accelerators( # raise an error when cloud is not specified by the user # if self.cloud is None: # self._cloud = clouds.GCP() - assert self.cloud.is_same_cloud( - clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes()), 'Cloud must be GCP or Kubernetes.' + # assert self.cloud.is_same_cloud( + # clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes()), 'Cloud must be GCP or Kubernetes.' if accelerator_args is None: accelerator_args = {} # Doyoung: May need to understand the usage of tpu_vm and make # proper adjustments to the following snippets. 
- if self.cloud.is_same_cloud(clouds.GCP()): + if self.cloud is not None and self.cloud.is_same_cloud(clouds.GCP()): use_tpu_vm = accelerator_args.get('tpu_vm', True) else: use_tpu_vm = False From 1cf82b647b703731d01f3957128f760fa46f34b0 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Fri, 20 Sep 2024 19:45:40 +0000 Subject: [PATCH 16/63] refactor error surfacing methods to have it work with TPU support --- sky/provision/kubernetes/instance.py | 131 +++++++++++++++++---------- 1 file changed, 82 insertions(+), 49 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 9dd0e2eb62d..7969923272e 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -1,7 +1,7 @@ """Kubernetes instance provisioning.""" import copy import time -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import uuid from sky import exceptions @@ -79,6 +79,74 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]: return {'component': f'{cluster_name}-head'} +def _formatted_resource_requirements(pod_or_spec: Union['Pod', dict]): + # Returns a formatted string of resource requirements for a pod. + resource_requirements = {} + + if isinstance(pod_or_spec, dict): + containers = pod_or_spec.get('spec', {}).get('containers', []) + else: + containers = pod_or_spec.spec.containers + + for container in containers: + if isinstance(container, dict): + resources = container.get('resources', {}) + requests = resources.get('requests', {}) + else: + resources = container.resources + requests = resources.requests or {} + + for resource, value in requests.items(): + if resource not in resource_requirements: + resource_requirements[resource] = 0 + if resource == 'memory': + int_value = kubernetes_utils.parse_memory_resource(value) + else: + int_value = kubernetes_utils.parse_cpu_or_gpu_resource( + value) + resource_requirements[resource] += int_value + return ', '.join(f'{resource}={value}' + for resource, value in resource_requirements.items()) + + +def _formatted_node_selector(pod_or_spec: Union['Pod', dict]) -> Optional[str]: + # Returns a formatted string of node selectors for a pod. + node_selectors = [] + + if isinstance(pod_or_spec, dict): + selectors = pod_or_spec.get('spec', {}).get('nodeSelector', {}) + else: + selectors = pod_or_spec.spec.node_selector + + if not selectors: + return None + + for label_key, label_value in selectors.items(): + node_selectors.append(f'{label_key}={label_value}') + return ', '.join(node_selectors) + + +def _lack_resource_msg(resource: str, + pod_or_spec: Union['Pod', dict], + extra_msg: Optional[str] = None, + details: Optional[str] = None) -> str: + resource_requirements = _formatted_resource_requirements(pod_or_spec) + node_selectors = _formatted_node_selector(pod_or_spec) + node_selector_str = f' and labels ({node_selectors})' if ( + node_selectors) else '' + msg = ( + f'Insufficient {resource} capacity on the cluster. ' + f'Required resources ({resource_requirements}){node_selector_str} ' + 'were not found in a single node. Other SkyPilot tasks or pods may ' + 'be using resources. Check resource usage by running ' + '`kubectl describe nodes`.') + if extra_msg: + msg += f' {extra_msg}' + if details: + msg += f'\nFull error: {details}' + return msg + + def _raise_pod_scheduling_errors(namespace, context, new_nodes): """Raise pod scheduling failure reason. 
@@ -86,52 +154,6 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes): are recorded as events. This function retrieves those events and raises descriptive errors for better debugging and user feedback. """ - - def _formatted_resource_requirements(pod): - # Returns a formatted string of resource requirements for a pod. - resource_requirements = {} - for container in pod.spec.containers: - for resource, value in container.resources.requests.items(): - if resource not in resource_requirements: - resource_requirements[resource] = 0 - if resource == 'memory': - int_value = kubernetes_utils.parse_memory_resource(value) - else: - int_value = kubernetes_utils.parse_cpu_or_gpu_resource( - value) - resource_requirements[resource] += int_value - return ', '.join(f'{resource}={value}' - for resource, value in resource_requirements.items()) - - def _formatted_node_selector(pod) -> Optional[str]: - # Returns a formatted string of node selectors for a pod. - node_selectors = [] - if pod.spec.node_selector is None: - return None - for label_key, label_value in pod.spec.node_selector.items(): - node_selectors.append(f'{label_key}={label_value}') - return ', '.join(node_selectors) - - def _lack_resource_msg(resource: str, - pod, - extra_msg: Optional[str] = None, - details: Optional[str] = None) -> str: - resource_requirements = _formatted_resource_requirements(pod) - node_selectors = _formatted_node_selector(pod) - node_selector_str = f' and labels ({node_selectors})' if ( - node_selectors) else '' - msg = ( - f'Insufficient {resource} capacity on the cluster. ' - f'Required resources ({resource_requirements}){node_selector_str} ' - 'were not found in a single node. Other SkyPilot tasks or pods may ' - 'be using resources. Check resource usage by running ' - '`kubectl describe nodes`.') - if extra_msg: - msg += f' {extra_msg}' - if details: - msg += f'\nFull error: {details}' - return msg - for new_node in new_nodes: pod = kubernetes.core_api(context).read_namespaced_pod( new_node.metadata.name, namespace) @@ -589,8 +611,19 @@ def _create_pods(region: str, cluster_name_on_cloud: str, } pod_spec['spec']['tolerations'] = [tpu_toleration] - pod = kubernetes.core_api(context).create_namespaced_pod( - namespace, pod_spec) + try: + pod = kubernetes.core_api(context).create_namespaced_pod( + namespace, pod_spec) + except kubernetes.api_exception() as e: + error_msg = str(e) + if 'Invalid resource requests for google.com/tpu.' 
in error_msg:
+                extra_msg = ('Verify if the cluster has a TPU slice with a '
+                             'topology matching the number of TPU(s) '
+                             'requested.')
+                raise config_lib.KubernetesError(
+                    _lack_resource_msg('TPU', pod_spec, details=error_msg, extra_msg=extra_msg)
+                )
+            raise
         created_pods[pod.metadata.name] = pod
         if head_pod_name is None:
             head_pod_name = pod.metadata.name

From 7b551c94c66b44ae3e2779b398d6db68901b9429 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 21 Sep 2024 03:22:42 +0000
Subject: [PATCH 17/63] update toleration comment

---
 sky/provision/kubernetes/instance.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index 7969923272e..b1d24c5765a 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -600,9 +600,13 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             }
         }
 
-        # Doyoung: Add comments on what kind of taint is added to TPU nodes by GCP(google.com/tpu=present:NoSchedule)
-        # and explain what those are for. And explain why we need toleration at this point to ignore that taint.
-        if kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY in config.node_config['spec']['nodeSelector']:
+        # TPU slice nodes are given a taint, google.com/tpu=present:NoSchedule.
+        # This is to prevent non-TPU workloads from being scheduled on TPU
+        # slice nodes. We need this toleration to allow the pod to be scheduled
+        # on TPU nodes.
+        # Reference: https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work # pylint: disable=line-too-long
+        tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY
+        if tpu_label in config.node_config['spec']['nodeSelector']:
             tpu_toleration = {
                 'key': 'google.com/tpu',
                 'operator': 'Equal',
@@ -617,12 +621,14 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
         try:
             pod = kubernetes.core_api(context).create_namespaced_pod(
                 namespace, pod_spec)
         except kubernetes.api_exception() as e:
             error_msg = str(e)
             if 'Invalid resource requests for google.com/tpu.' 
in error_msg: - extra_msg = ('Verify if the cluster has a TPU slice with a ' - 'topology matching the number of TPU(s) ' + extra_msg = ('Verify if the cluster has a TPU slice node with ' + 'a topology matching the number of TPU(s) ' 'requested.') raise config_lib.KubernetesError( - _lack_resource_msg('TPU', pod_spec, details=error_msg, extra_msg=extra_msg) - ) + _lack_resource_msg('TPU', + pod_spec, + details=error_msg, + extra_msg=extra_msg)) raise created_pods[pod.metadata.name] = pod if head_pod_name is None: From 81a05ee6eef5eba2b98b1f4cf8c7b9d8a4355e6d Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sat, 21 Sep 2024 04:38:07 +0000 Subject: [PATCH 18/63] support listing available TPUs and show-gpus for TPUs --- sky/cli.py | 4 +- .../service_catalog/kubernetes_catalog.py | 111 ++++++++++-------- sky/provision/kubernetes/utils.py | 73 +++++++----- 3 files changed, 104 insertions(+), 84 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index eb0267f7ced..a4199dae68b 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3086,8 +3086,8 @@ def _get_kubernetes_node_info_table(): for node_name, node_info in node_info_dict.items(): node_table.add_row([ node_name, node_info.gpu_type, - node_info.total['nvidia.com/gpu'], - node_info.free['nvidia.com/gpu'] + node_info.total['accelerator_count'], + node_info.free['accelerators_available'] ]) return node_table diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 9365d693cbd..1534c4c39a5 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -84,7 +84,7 @@ def list_accelerators_realtime( return {}, {}, {} accelerators_qtys: Set[Tuple[str, int]] = set() - key = label_formatter.get_label_key() + keys = label_formatter.get_label_keys() nodes = kubernetes_utils.get_kubernetes_nodes() # Get the pods to get the real-time GPU usage pods = kubernetes_utils.get_kubernetes_pods() @@ -95,56 +95,65 @@ def list_accelerators_realtime( min_quantity_filter = quantity_filter if quantity_filter else 1 for node in nodes: - if key in node.metadata.labels: - allocated_qty = 0 - accelerator_name = label_formatter.get_accelerator_from_label_value( - node.metadata.labels.get(key)) - - # Check if name_filter regex matches the accelerator_name - regex_flags = 0 if case_sensitive else re.IGNORECASE - if name_filter and not re.match( - name_filter, accelerator_name, flags=regex_flags): - continue - - accelerator_count = int( - node.status.allocatable.get('nvidia.com/gpu', 0)) - - # Generate the GPU quantities for the accelerators - if accelerator_name and accelerator_count > 0: - for count in range(1, accelerator_count + 1): - accelerators_qtys.add((accelerator_name, count)) - - for pod in pods: - # Get all the pods running on the node - if (pod.spec.node_name == node.metadata.name and - pod.status.phase in ['Running', 'Pending']): - # Iterate over all the containers in the pod and sum the - # GPU requests - for container in pod.spec.containers: - if container.resources.requests: - allocated_qty += int( - container.resources.requests.get( - 'nvidia.com/gpu', 0)) - - accelerators_available = accelerator_count - allocated_qty - - if accelerator_count >= min_quantity_filter: - quantized_count = (min_quantity_filter * - (accelerator_count // min_quantity_filter)) - if accelerator_name not in total_accelerators_capacity: - total_accelerators_capacity[ - accelerator_name] = quantized_count - else: - total_accelerators_capacity[ - accelerator_name] += 
quantized_count - - if accelerator_name not in total_accelerators_available: - total_accelerators_available[accelerator_name] = 0 - if accelerators_available >= min_quantity_filter: - quantized_availability = min_quantity_filter * ( - accelerators_available // min_quantity_filter) - total_accelerators_available[ - accelerator_name] += quantized_availability + for key in keys: + if key in node.metadata.labels: + allocated_qty = 0 + accelerator_name = label_formatter.get_accelerator_from_label_value( + node.metadata.labels.get(key)) + + # Check if name_filter regex matches the accelerator_name + regex_flags = 0 if case_sensitive else re.IGNORECASE + if name_filter and not re.match( + name_filter, accelerator_name, flags=regex_flags): + continue + + accelerator_count = 0 + if 'nvidia.com/gpu' in node.status.allocatable: + accelerator_count = int( + node.status.allocatable['nvidia.com/gpu']) + elif 'google.com/tpu' in node.status.allocatable: + accelerator_count = int( + node.status.allocatable['google.com/tpu']) + + # Generate the GPU quantities for the accelerators + if accelerator_name and accelerator_count > 0: + for count in range(1, accelerator_count + 1): + accelerators_qtys.add((accelerator_name, count)) + + for pod in pods: + # Get all the pods running on the node + if (pod.spec.node_name == node.metadata.name and + pod.status.phase in ['Running', 'Pending']): + # Iterate over all the containers in the pod and sum the + # GPU requests + for container in pod.spec.containers: + if container.resources.requests: + allocated_qty += int( + container.resources.requests.get( + 'nvidia.com/gpu', 0)) + allocated_qty += int( + container.resources.requests.get( + 'google.com/tpu', 0)) + + accelerators_available = accelerator_count - allocated_qty + + if accelerator_count >= min_quantity_filter: + quantized_count = (min_quantity_filter * + (accelerator_count // min_quantity_filter)) + if accelerator_name not in total_accelerators_capacity: + total_accelerators_capacity[ + accelerator_name] = quantized_count + else: + total_accelerators_capacity[ + accelerator_name] += quantized_count + + if accelerator_name not in total_accelerators_available: + total_accelerators_available[accelerator_name] = 0 + if accelerators_available >= min_quantity_filter: + quantized_availability = min_quantity_filter * ( + accelerators_available // min_quantity_filter) + total_accelerators_available[ + accelerator_name] += quantized_availability result = [] diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 3000eb8e829..18642cdf667 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -1812,40 +1812,51 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: if not label_formatter: label_key = None else: - label_key = label_formatter.get_label_key() + label_keys = label_formatter.get_label_keys() node_info_dict: Dict[str, KubernetesNodeInfo] = {} - for node in nodes: - allocated_qty = 0 - if label_formatter is not None and label_key in node.metadata.labels: - accelerator_name = label_formatter.get_accelerator_from_label_value( - node.metadata.labels.get(label_key)) - else: - accelerator_name = None - - accelerator_count = int(node.status.allocatable.get( - 'nvidia.com/gpu', 0)) - - for pod in pods: - # Get all the pods running on the node - if (pod.spec.node_name == node.metadata.name and - pod.status.phase in ['Running', 'Pending']): - # Iterate over all the containers in the pod and sum the - # GPU requests - for container in 
pod.spec.containers: - if container.resources.requests: - allocated_qty += int( - container.resources.requests.get( - 'nvidia.com/gpu', 0)) - - accelerators_available = accelerator_count - allocated_qty - - node_info_dict[node.metadata.name] = KubernetesNodeInfo( - name=node.metadata.name, - gpu_type=accelerator_name, - total={'nvidia.com/gpu': int(accelerator_count)}, - free={'nvidia.com/gpu': int(accelerators_available)}) + for label_key in label_keys: + for node in nodes: + allocated_qty = 0 + if label_formatter is not None and label_key in node.metadata.labels: + accelerator_name = label_formatter.get_accelerator_from_label_value( + node.metadata.labels.get(label_key)) + else: + accelerator_name = None + + accelerator_count = 0 + if 'nvidia.com/gpu' in node.status.allocatable: + accelerator_count = int( + node.status.allocatable['nvidia.com/gpu']) + elif 'google.com/tpu' in node.status.allocatable: + accelerator_count = int( + node.status.allocatable['google.com/tpu']) + + for pod in pods: + # Get all the pods running on the node + if (pod.spec.node_name == node.metadata.name and + pod.status.phase in ['Running', 'Pending']): + # Iterate over all the containers in the pod and sum the + # GPU requests + for container in pod.spec.containers: + if container.resources.requests: + if 'nvidia.com/gpu' in container.resources.requests: + allocated_qty += int( + container.resources.requests.get( + 'nvidia.com/gpu', 0)) + elif 'google.com/tpu' in container.resources.requests: + allocated_qty += int( + container.resources.requests.get( + 'google.com/tpu', 0)) + + accelerators_available = accelerator_count - allocated_qty + + node_info_dict[node.metadata.name] = KubernetesNodeInfo( + name=node.metadata.name, + gpu_type=accelerator_name, + total={'accelerator_count': int(accelerator_count)}, + free={'accelerators_available': int(accelerators_available)}) return node_info_dict From e8764f1c9493905aa618668a90080cb759e0dc48 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sat, 21 Sep 2024 22:01:13 +0000 Subject: [PATCH 19/63] nit --- sky/provision/kubernetes/instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index b1d24c5765a..e6da6db4c45 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -606,7 +606,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str, # on TPU nodes. 
# Reference: https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work # pylint: disable=line-too-long tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY - if tpu_label in config.node_config['spec']['nodeSelector']: + if tpu_label in config.node_config.get('spec', {}).get('nodeSelector', {}): tpu_toleration = { 'key': 'google.com/tpu', 'operator': 'Equal', From 3497aee95a6d2dbecd559f6b06ca7cccaa4b3093 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sat, 21 Sep 2024 22:14:29 +0000 Subject: [PATCH 20/63] update help message --- sky/provision/kubernetes/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 6aa6400dfa1..3d0c2f57f8e 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -38,10 +38,11 @@ 'T': 2**40, 'P': 2**50, } -NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure ' - 'nvidia.com/gpu resource is available on the nodes and ' - 'the node labels for identifying GPUs ' - '(e.g., skypilot.co/accelerator) are setup correctly. ') +NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs or TPUs, make sure ' + 'nvidia.com/gpu or google.com/tpu resource is available' + ' on the nodes and the node labels for identifying ' + 'GPUs/TPUs (e.g., skypilot.co/accelerator) are setup ' + 'correctly. ') KUBERNETES_AUTOSCALER_NOTE = ( 'Note: Kubernetes cluster autoscaling is enabled. ' From 724806a5814b9afba048b396fabba5141863a303 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 22 Sep 2024 00:08:43 +0000 Subject: [PATCH 21/63] Update /tmp/tpu_logs dir's write permission --- sky/templates/kubernetes-ray.yml.j2 | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index d219bc6670e..be288cfc918 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -441,6 +441,16 @@ setup_commands: sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`; + {% if tpu_requested %} + # The /tmp/tpu_logs directory is where TPU-related logs, specifically logs + # from the TPU runtime (such as the TPU driver), are written. These logs + # capture important runtime information about the TPU execution, including + # any warnings, errors, or general activity of the TPU driver. + # By default, the /tmp/tpu_logs directory is created with 755 permissions, + # and user of the provisioned pod is not necessarily a root. Hence, we need + # to update the write permission so the logs can be properly written. 
+ - sudo chmod 777 /tmp/tpu_logs; + {% endif %} # Format: `REMOTE_PATH : LOCAL_PATH` file_mounts: { From e8d73feffc046e9bfae853d6b6d003462ec6b8d7 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 22 Sep 2024 00:10:21 +0000 Subject: [PATCH 22/63] nit --- sky/templates/kubernetes-ray.yml.j2 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index be288cfc918..186dbe09d3c 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -442,13 +442,13 @@ setup_commands: mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`; {% if tpu_requested %} - # The /tmp/tpu_logs directory is where TPU-related logs, specifically logs - # from the TPU runtime (such as the TPU driver), are written. These logs - # capture important runtime information about the TPU execution, including - # any warnings, errors, or general activity of the TPU driver. + # The /tmp/tpu_logs directory is where TPU-related logs, such as logs from + # the TPU runtime (such as the TPU driver), are written. These logs capture + # important runtime information about the TPU execution, including any + # warnings, errors, or general activity of the TPU driver. # By default, the /tmp/tpu_logs directory is created with 755 permissions, - # and user of the provisioned pod is not necessarily a root. Hence, we need - # to update the write permission so the logs can be properly written. + # and the user of the provisioned pod is not necessarily a root. Hence, we + # need to update the write permission so the logs can be properly written. - sudo chmod 777 /tmp/tpu_logs; {% endif %} From 7ac503673287c1b9f1759fe9756df07db6464676 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 22 Sep 2024 00:16:38 +0000 Subject: [PATCH 23/63] nit --- sky/templates/kubernetes-ray.yml.j2 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 186dbe09d3c..79c0cb2a77e 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -443,12 +443,12 @@ setup_commands: [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`; {% if tpu_requested %} # The /tmp/tpu_logs directory is where TPU-related logs, such as logs from - # the TPU runtime (such as the TPU driver), are written. These logs capture - # important runtime information about the TPU execution, including any - # warnings, errors, or general activity of the TPU driver. - # By default, the /tmp/tpu_logs directory is created with 755 permissions, - # and the user of the provisioned pod is not necessarily a root. Hence, we - # need to update the write permission so the logs can be properly written. + # the TPU runtime, are written. These capture runtime information about the + # TPU execution, including any warnings, errors, or general activity of + # the TPU driver. 
By default, the /tmp/tpu_logs directory is created with
+      # 755 permissions, and the user of the provisioned pod is not necessarily
+      # a root. Hence, we need to update the write permission so the logs can be
+      # properly written.
       - sudo chmod 777 /tmp/tpu_logs;
     {% endif %}

From 4470dbeccb9e68ba336973592d2c76f110a650b6 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sun, 22 Sep 2024 00:18:45 +0000
Subject: [PATCH 24/63] comment update on TPU resource shortage error handling

---
 sky/provision/kubernetes/instance.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index e6da6db4c45..73b3147f98d 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -620,6 +620,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 namespace, pod_spec)
         except kubernetes.api_exception() as e:
             error_msg = str(e)
+            # Unlike other errors from a shortage of CPU/GPU/memory resources,
+            # the TPU shortage error is raised when pod creation is attempted.
             if 'Invalid resource requests for google.com/tpu.' in error_msg:

From 0860e4596e38f475ab4e1df61bf528566ae1d43d Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sun, 22 Sep 2024 00:45:33 +0000
Subject: [PATCH 25/63] Update to use global constant instead of hard coded
 string of nvidia.com/gpu and google.com/tpu

---
 .../service_catalog/kubernetes_catalog.py | 12 +++---
 sky/provision/kubernetes/instance.py      |  4 +-
 sky/provision/kubernetes/utils.py         | 39 +++++++++++--------
 sky/utils/kubernetes/gpu_labeler.py       |  4 +-
 4 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py
index 1534c4c39a5..c0ac2e16472 100644
--- a/sky/clouds/service_catalog/kubernetes_catalog.py
+++ b/sky/clouds/service_catalog/kubernetes_catalog.py
@@ -108,12 +108,12 @@ def list_accelerators_realtime(
                 continue

             accelerator_count = 0
-            if 'nvidia.com/gpu' in node.status.allocatable:
+            if kubernetes_utils.GPU_RESOURCE_KEY in node.status.allocatable:
                 accelerator_count = int(
-                    node.status.allocatable['nvidia.com/gpu'])
-            elif 'google.com/tpu' in node.status.allocatable:
+                    node.status.allocatable[kubernetes_utils.GPU_RESOURCE_KEY])
+            elif kubernetes_utils.TPU_RESOURCE_KEY in node.status.allocatable:
                 accelerator_count = int(
-                    node.status.allocatable['google.com/tpu'])
+                    node.status.allocatable[kubernetes_utils.TPU_RESOURCE_KEY])

             # Generate the GPU quantities for the accelerators
             if accelerator_name and accelerator_count > 0:
@@ -130,10 +130,10 @@ def list_accelerators_realtime(
                         if container.resources.requests:
                             allocated_qty += int(
                                 container.resources.requests.get(
-                                    'nvidia.com/gpu', 0))
+                                    kubernetes_utils.GPU_RESOURCE_KEY, 0))
                             allocated_qty += int(
                                 container.resources.requests.get(
-                                    'google.com/tpu', 0))
+                                    kubernetes_utils.TPU_RESOURCE_KEY, 0))

             accelerators_available = accelerator_count - allocated_qty

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index 73b3147f98d..5c7d65f0d71 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -555,7 +555,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html')  # pylint: disable=line-too-long

     needs_gpus = (pod_spec['spec']['containers'][0].get('resources',
{}).get( - 'limits', {}).get('nvidia.com/gpu', 0) > 0) + 'limits', {}).get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0) if nvidia_runtime_exists and needs_gpus: pod_spec['spec']['runtimeClassName'] = 'nvidia' @@ -608,7 +608,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str, tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY if tpu_label in config.node_config.get('spec', {}).get('nodeSelector', {}): tpu_toleration = { - 'key': 'google.com/tpu', + 'key': kubernetes_utils.TPU_RESOURCE_KEY, 'operator': 'Equal', 'value': 'present', 'effect': 'NoSchedule' diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 18642cdf667..d5bc586cde5 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -38,8 +38,15 @@ 'T': 2**40, 'P': 2**50, } + +# The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on +# nodes. These keys are typically used in the node's status.allocatable +# or status.capacity fields to indicate the available resources on the node. +GPU_RESOURCE_KEY = 'nvidia.com/gpu' +TPU_RESOURCE_KEY = 'google.com/tpu' + NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure ' - 'nvidia.com/gpu resource is available on the nodes and ' + f'{GPU_RESOURCE_KEY} resource is available on the nodes and ' 'the node labels for identifying GPUs ' '(e.g., skypilot.co/accelerator) are setup correctly. ') @@ -388,7 +395,7 @@ def detect_gpu_resource() -> Tuple[bool, Set[str]]: GPUs or the nvidia GPU operator and/or device drivers are not installed. Returns: - bool: True if the cluster has nvidia.com/gpu or google.com/tpu + bool: True if the cluster has GPU_RESOURCE_KEY or TPU_RESOURCE_KEY resource, False otherwise. """ # Get the set of resources across all nodes @@ -396,8 +403,8 @@ def detect_gpu_resource() -> Tuple[bool, Set[str]]: nodes = get_kubernetes_nodes() for node in nodes: cluster_resources.update(node.status.allocatable.keys()) - has_gpu = ('nvidia.com/gpu' in cluster_resources or - 'google.com/tpu' in cluster_resources) + has_gpu = (GPU_RESOURCE_KEY in cluster_resources or + TPU_RESOURCE_KEY in cluster_resources) return has_gpu, cluster_resources @@ -633,12 +640,12 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: suffix = (' Available resources on the cluster: ' f'{cluster_resources}') raise exceptions.ResourcesUnavailableError( - 'Could not detect GPU/TPU resources (`nvidia.com/gpu` or ' - '`google.com/tpu`) in Kubernetes cluster. If this cluster ' + f'Could not detect GPU/TPU resources (`{GPU_RESOURCE_KEY}` or ' + f'`{TPU_RESOURCE_KEY}`) in Kubernetes cluster. If this cluster ' 'contains GPUs, please ensure GPU drivers are installed on ' 'the node. Check if the GPUs are setup correctly by running ' - '`kubectl describe nodes` and looking for the nvidia.com/gpu ' - 'or google.com/tpu resource. Please refer to the documentation' + f'`kubectl describe nodes` and looking for the {GPU_RESOURCE_KEY!r} ' + f'or {TPU_RESOURCE_KEY!r} resource. 
Please refer to the documentation' f'on how to set up GPUs.{suffix}') @@ -1826,12 +1833,12 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: accelerator_name = None accelerator_count = 0 - if 'nvidia.com/gpu' in node.status.allocatable: + if GPU_RESOURCE_KEY in node.status.allocatable: accelerator_count = int( - node.status.allocatable['nvidia.com/gpu']) - elif 'google.com/tpu' in node.status.allocatable: + node.status.allocatable[GPU_RESOURCE_KEY]) + elif TPU_RESOURCE_KEY in node.status.allocatable: accelerator_count = int( - node.status.allocatable['google.com/tpu']) + node.status.allocatable[TPU_RESOURCE_KEY]) for pod in pods: # Get all the pods running on the node @@ -1841,14 +1848,14 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: # GPU requests for container in pod.spec.containers: if container.resources.requests: - if 'nvidia.com/gpu' in container.resources.requests: + if GPU_RESOURCE_KEY in container.resources.requests: allocated_qty += int( container.resources.requests.get( - 'nvidia.com/gpu', 0)) - elif 'google.com/tpu' in container.resources.requests: + GPU_RESOURCE_KEY, 0)) + elif TPU_RESOURCE_KEY in container.resources.requests: allocated_qty += int( container.resources.requests.get( - 'google.com/tpu', 0)) + TPU_RESOURCE_KEY, 0)) accelerators_available = accelerator_count - allocated_qty diff --git a/sky/utils/kubernetes/gpu_labeler.py b/sky/utils/kubernetes/gpu_labeler.py index b00bd4f21ae..14fbbdedca5 100644 --- a/sky/utils/kubernetes/gpu_labeler.py +++ b/sky/utils/kubernetes/gpu_labeler.py @@ -101,7 +101,7 @@ def label(): # Get the list of nodes with GPUs gpu_nodes = [] for node in nodes: - if 'nvidia.com/gpu' in node.status.capacity: + if kubernetes_utils.GPU_RESOURCE_KEY in node.status.capacity: gpu_nodes.append(node) print(f'Found {len(gpu_nodes)} GPU nodes in the cluster') @@ -142,7 +142,7 @@ def label(): if len(gpu_nodes) == 0: print('No GPU nodes found in the cluster. If you have GPU nodes, ' 'please ensure that they have the label ' - '`nvidia.com/gpu: `') + f'`{kubernetes_utils.GPU_RESOURCE_KEY}: `') else: print('GPU labeling started - this may take 10 min or more to complete.' 
'\nTo check the status of GPU labeling jobs, run ' From 35f3c80106d7eeaef6feda47075b1b90a10887a3 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Mon, 23 Sep 2024 02:57:03 +0000 Subject: [PATCH 26/63] add smoke test and make exec work on TPU pods --- sky/backends/cloud_vm_ray_backend.py | 3 +-- sky/clouds/utils/gcp_utils.py | 9 +++++++ sky/resources.py | 36 ++++++++++++++-------------- tests/test_smoke.py | 16 +++++++++++++ 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 191a09438aa..5c6e5129020 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2484,8 +2484,7 @@ def head_ssh_port(self): @property def num_ips_per_node(self) -> int: """Returns number of IPs per node in the cluster, handling TPU Pod.""" - is_tpu_vm_pod = gcp_utils.is_tpu_vm_pod(self.launched_resources) - if is_tpu_vm_pod: + if gcp_utils.is_tpu_vm_pod(self.launched_resources): num_ips = gcp_utils.get_num_tpu_devices(self.launched_resources) else: num_ips = 1 diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py index 68e6192d351..f1ee93c7979 100644 --- a/sky/clouds/utils/gcp_utils.py +++ b/sky/clouds/utils/gcp_utils.py @@ -31,10 +31,19 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool: acc, _ = list(resources.accelerators.items())[0] return acc.startswith('tpu') +def is_tpu_pod_slice(resources: Optional['resources_lib.Resources']) -> bool: + if not is_tpu(resources): + return False + assert resources is not None + acc, _ = list(resources.accelerators.items())[0] + # Reference on Accelerator names for TPU Pod slices: https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#workload_preparation # pylint: disable=line-too-long + return acc.endswith('-podslice') or acc.endswith('-device') def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool: if not is_tpu(resources): return False + elif is_tpu_pod_slice(resources): + return False assert resources is not None if resources.accelerator_args is None: return True diff --git a/sky/resources.py b/sky/resources.py index 4b47e8e4247..137a8e731f4 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -564,26 +564,26 @@ def _set_accelerators( acc, _ = list(accelerators.items())[0] if 'tpu' in acc.lower(): - # Doyoung: Confirm if below two lines can be removed. Perhaps, - # raise an error when cloud is not specified by the user - # if self.cloud is None: - # self._cloud = clouds.GCP() - # assert self.cloud.is_same_cloud( - # clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes()), 'Cloud must be GCP or Kubernetes.' + if self.cloud is None: + if acc.endswith('-podslice') or acc.endswith('-device'): + self._cloud = clouds.Kubernetes() + else: + self._cloud = clouds.GCP() + assert (self.cloud.is_same_cloud( + clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes())), 'Cloud must be GCP or Kubernetes.' + if accelerator_args is None: accelerator_args = {} - - # Doyoung: May need to understand the usage of tpu_vm and make - # proper adjustments to the following snippets. - if self.cloud is not None and self.cloud.is_same_cloud(clouds.GCP()): - use_tpu_vm = accelerator_args.get('tpu_vm', True) - else: - use_tpu_vm = False - - #### - # use_tpu_vm = accelerator_args.get('tpu_vm', True) - #### - if self.instance_type is not None and use_tpu_vm: + + # Supported TPU Podslice versions on GKE are v4 <= and those + # versions default the architecture to be TPU-VM. 
+ # Reference: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu_architectures + use_tpu_vm = True + if self.cloud.is_same_cloud(clouds.GCP()): + use_tpu_vm = accelerator_args.get('tpu_vm', True) + + if self.cloud.is_same_cloud( + clouds.GCP()) and self.instance_type is not None and use_tpu_vm: if self.instance_type != 'TPU-VM': with ux_utils.print_exception_no_traceback(): raise ValueError( diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 3b2bba72e8a..6df14a27dad 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1890,6 +1890,22 @@ def test_tpu_vm_pod(): run_one_test(test) +# ---------- TPU Pod Slice on GKE. ---------- +@pytest.mark.kubernetes +def test_tpu_pod_slice_gke(): + name = _get_cluster_name() + test = Test( + 'tpu_pod_slice_gke', + [ + f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', + f'sky logs {name} 1', # Ensure the job finished. + f'sky logs {name} 1 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + timeout=30 * 60, # can take 30 mins + ) + run_one_test(test) + # ---------- Simple apps. ---------- @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_multi_hostname(generic_cloud: str): From 2b56a9eab6153b59e0413188324a02bd64c6e3af Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 24 Sep 2024 02:34:15 +0000 Subject: [PATCH 27/63] update smoke test to check if TPU is reachable. --- tests/test_smoke.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 6df14a27dad..84224ef8bdb 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1900,6 +1900,8 @@ def test_tpu_pod_slice_gke(): f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', f'sky logs {name} 1', # Ensure the job finished. f'sky logs {name} 1 --status', # Ensure the job succeeded. + f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu | exit 1;"', # Ensure TPU is reachable. + f'sky logs {name} 2 --status' ], f'sky down -y {name}', timeout=30 * 60, # can take 30 mins From 305705cbd575bd3ddbaf8ab1d804a35e12ee2e21 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 24 Sep 2024 02:38:24 +0000 Subject: [PATCH 28/63] add comment --- sky/resources.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/resources.py b/sky/resources.py index 137a8e731f4..74de19e1150 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -565,6 +565,7 @@ def _set_accelerators( acc, _ = list(accelerators.items())[0] if 'tpu' in acc.lower(): if self.cloud is None: + # Reference on names of TPU Pod slices available on GKE: https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#workload_preparation # pylint: disable=line-too-long if acc.endswith('-podslice') or acc.endswith('-device'): self._cloud = clouds.Kubernetes() else: From c2b5bfcc1a5135cb7e3de50832f5d390107bd594 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 24 Sep 2024 02:49:25 +0000 Subject: [PATCH 29/63] nit --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 84224ef8bdb..5f13b665120 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1900,7 +1900,7 @@ def test_tpu_pod_slice_gke(): f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', f'sky logs {name} 1', # Ensure the job finished. 
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
-            f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu | exit 1;"', # Ensure TPU is reachable.
+            f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"', # Ensure TPU is reachable.
             f'sky logs {name} 2 --status'
         ],
         f'sky down -y {name}',

From 2ba5537d0a0742fbce1c571ff24080f189ce1661 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Tue, 24 Sep 2024 05:55:02 +0000
Subject: [PATCH 30/63] Comment on number of requested TPU chips for multi-
 and single-host TPU slices.

---
 sky/templates/kubernetes-ray.yml.j2 | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2
index 79c0cb2a77e..fb5576f6f0a 100644
--- a/sky/templates/kubernetes-ray.yml.j2
+++ b/sky/templates/kubernetes-ray.yml.j2
@@ -402,6 +402,11 @@ available_node_types:
                 cpu: {{cpus}}
                 memory: {{memory}}G
                 {% if tpu_requested %}
+                # The number of google.com/tpu chips requested must equal the
+                # total number of TPU chips available on the TPU slice node,
+                # whether the node belongs to a multi-host or a single-host
+                # TPU slice. Example reference:
+                # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
                 google.com/tpu: {{accelerator_count}}
                 {% else %}
                 nvidia.com/gpu: {{accelerator_count}}

From 92cd77d3f535927825c33ff93db32acdd7383991 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Tue, 24 Sep 2024 06:05:49 +0000
Subject: [PATCH 31/63] update method to check GKE supported TPU name

---
 sky/clouds/utils/gcp_utils.py     |  4 +--
 sky/provision/kubernetes/utils.py |  9 ++++++
 sky/resources.py                  | 48 ++++++++++++++---------------
 3 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py
index f1ee93c7979..7ec5c8f3b2b 100644
--- a/sky/clouds/utils/gcp_utils.py
+++ b/sky/clouds/utils/gcp_utils.py
@@ -36,8 +36,7 @@ def is_tpu_pod_slice(resources: Optional['resources_lib.Resources']) -> bool:
         return False
     assert resources is not None
     acc, _ = list(resources.accelerators.items())[0]
-    # Reference on Accelerator names for TPU Pod slices: https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#workload_preparation # pylint: disable=line-too-long
-    return acc.endswith('-podslice') or acc.endswith('-device')
+    return acc in kubernetes_utils.GKE_TPU_ACCELERATOR_TO_GENERATION

 def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool:
     if not is_tpu(resources):

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index d5bc586cde5..90fc3d07735 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -71,6 +71,15 @@
 PORT_FORWARD_PROXY_CMD_PATH = ('~/.sky/kubernetes-port-forward-proxy-command-'
                                f'v{PORT_FORWARD_PROXY_CMD_VERSION}.sh')

+# Mapping used to get generation for TPU accelerator name.
+# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run +GKE_TPU_ACCELERATOR_TO_GENERATION = { + "tpu-v4-podslice": "v4", + "tpu-v5-lite-device": "v5e", + "tpu-v5-lite-podslice": "v5e", + "tpu-v5p-slice": "v5p", +} + logger = sky_logging.init_logger(__name__) diff --git a/sky/resources.py b/sky/resources.py index 74de19e1150..fbff8422c8d 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -14,6 +14,7 @@ from sky import skypilot_config from sky.clouds import service_catalog from sky.provision import docker_utils +from sky.provision.kubernetes import utils as kubernetes_utils from sky.skylet import constants from sky.utils import accelerator_registry from sky.utils import common_utils @@ -565,8 +566,7 @@ def _set_accelerators( acc, _ = list(accelerators.items())[0] if 'tpu' in acc.lower(): if self.cloud is None: - # Reference on names of TPU Pod slices available on GKE: https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#workload_preparation # pylint: disable=line-too-long - if acc.endswith('-podslice') or acc.endswith('-device'): + if acc in kubernetes_utils.GKE_TPU_ACCELERATOR_TO_GENERATION: self._cloud = clouds.Kubernetes() else: self._cloud = clouds.GCP() @@ -576,13 +576,30 @@ def _set_accelerators( if accelerator_args is None: accelerator_args = {} - # Supported TPU Podslice versions on GKE are v4 <= and those - # versions default the architecture to be TPU-VM. - # Reference: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu_architectures - use_tpu_vm = True + use_tpu_vm = accelerator_args.get('tpu_vm', True) if self.cloud.is_same_cloud(clouds.GCP()): - use_tpu_vm = accelerator_args.get('tpu_vm', True) - + if 'runtime_version' not in accelerator_args: + + def _get_default_runtime_version() -> str: + if not use_tpu_vm: + return '2.12.0' + # TPU V5 requires a newer runtime version. + if acc.startswith('tpu-v5'): + return 'v2-alpha-tpuv5' + return 'tpu-vm-base' + + accelerator_args['runtime_version'] = ( + _get_default_runtime_version()) + logger.info( + 'Missing runtime_version in accelerator_args, using' + f' default ({accelerator_args["runtime_version"]})') + + if self.instance_type is not None and use_tpu_vm: + if self.instance_type != 'TPU-VM': + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Cannot specify instance type' + f' (got "{self.instance_type}") for TPU VM.') if self.cloud.is_same_cloud( clouds.GCP()) and self.instance_type is not None and use_tpu_vm: if self.instance_type != 'TPU-VM': @@ -590,21 +607,6 @@ def _set_accelerators( raise ValueError( 'Cannot specify instance type' f' (got "{self.instance_type}") for TPU VM.') - if 'runtime_version' not in accelerator_args: - - def _get_default_runtime_version() -> str: - if not use_tpu_vm: - return '2.12.0' - # TPU V5 requires a newer runtime version. 
- if acc.startswith('tpu-v5'): - return 'v2-alpha-tpuv5' - return 'tpu-vm-base' - - accelerator_args['runtime_version'] = ( - _get_default_runtime_version()) - logger.info( - 'Missing runtime_version in accelerator_args, using' - f' default ({accelerator_args["runtime_version"]})') self._accelerators = accelerators self._accelerator_args = accelerator_args From d085a5b918d4caa343f40891a302dfd4d6da69b1 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 24 Sep 2024 06:08:30 +0000 Subject: [PATCH 32/63] nit --- sky/backends/cloud_vm_ray_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 5c6e5129020..191a09438aa 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2484,7 +2484,8 @@ def head_ssh_port(self): @property def num_ips_per_node(self) -> int: """Returns number of IPs per node in the cluster, handling TPU Pod.""" - if gcp_utils.is_tpu_vm_pod(self.launched_resources): + is_tpu_vm_pod = gcp_utils.is_tpu_vm_pod(self.launched_resources) + if is_tpu_vm_pod: num_ips = gcp_utils.get_num_tpu_devices(self.launched_resources) else: num_ips = 1 From 786067985a33a3f7908e5cc2f363d05e45afe5c9 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 25 Sep 2024 05:34:58 +0000 Subject: [PATCH 33/63] move is_tpu_pod_slice to kubernetes_utils --- sky/clouds/utils/gcp_utils.py | 9 ++------- sky/provision/kubernetes/utils.py | 3 +++ sky/resources.py | 9 +-------- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py index 7ec5c8f3b2b..549eb422015 100644 --- a/sky/clouds/utils/gcp_utils.py +++ b/sky/clouds/utils/gcp_utils.py @@ -32,17 +32,12 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool: acc, _ = list(resources.accelerators.items())[0] return acc.startswith('tpu') -def is_tpu_pod_slice(resources: Optional['resources_lib.Resources']) -> bool: - if not is_tpu(resources): - return False - assert resources is not None - acc, _ = list(resources.accelerators.items())[0] - return acc in kubernetes_utils.GKE_TPU_ACCELERATOR_TO_GENERATION def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool: + acc, _ = list(resources.accelerators.items())[0] if not is_tpu(resources): return False - elif is_tpu_pod_slice(resources): + elif kubernetes_utils.is_tpu_pod_slice(acc): return False assert resources is not None if resources.accelerator_args is None: diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 90fc3d07735..ee8ca7b503d 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -1885,3 +1885,6 @@ def get_namespace_from_config(provider_config: Dict[str, Any]) -> str: def get_context_from_config(provider_config: Dict[str, Any]) -> str: return provider_config.get('context', get_current_kube_config_context_name()) + +def is_tpu_pod_slice(accelerator: str) -> bool: + return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION diff --git a/sky/resources.py b/sky/resources.py index fbff8422c8d..0ea5e4b1aaa 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -566,7 +566,7 @@ def _set_accelerators( acc, _ = list(accelerators.items())[0] if 'tpu' in acc.lower(): if self.cloud is None: - if acc in kubernetes_utils.GKE_TPU_ACCELERATOR_TO_GENERATION: + if kubernetes_utils.is_tpu_pod_slice(acc): self._cloud = clouds.Kubernetes() else: self._cloud = clouds.GCP() @@ -600,13 +600,6 @@ def 
_get_default_runtime_version() -> str: raise ValueError( 'Cannot specify instance type' f' (got "{self.instance_type}") for TPU VM.') - if self.cloud.is_same_cloud( - clouds.GCP()) and self.instance_type is not None and use_tpu_vm: - if self.instance_type != 'TPU-VM': - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Cannot specify instance type' - f' (got "{self.instance_type}") for TPU VM.') self._accelerators = accelerators self._accelerator_args = accelerator_args From 96924a71c6e5eeb551ce06e54debd883e121a3f7 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 25 Sep 2024 05:38:49 +0000 Subject: [PATCH 34/63] update get_accelerator_from_label_value to use is_tpu_pod_slice method --- sky/provision/kubernetes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index ee8ca7b503d..afc984e835a 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -267,7 +267,7 @@ def get_accelerator_from_label_value(cls, value: str) -> str: # to distinguish between a3-high and a3-mega instances return 'H100' return acc - elif value.startswith('tpu-'): + elif is_tpu_pod_slice(value): return value else: raise ValueError( From 1bbac2126d97e37ea24edc5899f879fcbc186d62 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 25 Sep 2024 05:38:57 +0000 Subject: [PATCH 35/63] nit --- sky/provision/kubernetes/instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 5c7d65f0d71..d81296c2b10 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -79,7 +79,7 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]: return {'component': f'{cluster_name}-head'} -def _formatted_resource_requirements(pod_or_spec: Union['Pod', dict]): +def _formatted_resource_requirements(pod_or_spec: Union['Pod', dict]) -> str: # Returns a formatted string of resource requirements for a pod. 
resource_requirements = {} From 4f7ea0354d097ce3f85c10d84b2b4fd5a96b4c8d Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 25 Sep 2024 06:09:19 +0000 Subject: [PATCH 36/63] format --- sky/clouds/kubernetes.py | 3 +- .../service_catalog/kubernetes_catalog.py | 24 ++--- sky/clouds/utils/gcp_utils.py | 2 +- sky/provision/kubernetes/instance.py | 40 ++++---- sky/provision/kubernetes/utils.py | 97 ++++++++++--------- sky/resources.py | 10 +- tests/test_smoke.py | 3 +- 7 files changed, 94 insertions(+), 85 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 97447f25f88..68fbd27b1c1 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -269,7 +269,8 @@ def make_deploy_resources_variables( if acc_count > 0 and acc_type is not None: k8s_acc_label_key, k8s_acc_label_value = \ kubernetes_utils.get_gpu_label_key_value(acc_type) - if k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY: + if (k8s_acc_label_key == + kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY): tpu_requested = True k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( kubernetes_utils.get_tpu_topology_label_key_value(acc_type)) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index c0ac2e16472..d0d606e5026 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -79,12 +79,12 @@ def list_accelerators_realtime( if not has_gpu: return {}, {}, {} - label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter() - if not label_formatter: + lf, _ = kubernetes_utils.detect_gpu_label_formatter() + if not lf: return {}, {}, {} accelerators_qtys: Set[Tuple[str, int]] = set() - keys = label_formatter.get_label_keys() + keys = lf.get_label_keys() nodes = kubernetes_utils.get_kubernetes_nodes() # Get the pods to get the real-time GPU usage pods = kubernetes_utils.get_kubernetes_pods() @@ -98,7 +98,7 @@ def list_accelerators_realtime( for key in keys: if key in node.metadata.labels: allocated_qty = 0 - accelerator_name = label_formatter.get_accelerator_from_label_value( + accelerator_name = lf.get_accelerator_from_label_value( node.metadata.labels.get(key)) # Check if name_filter regex matches the accelerator_name @@ -109,11 +109,12 @@ def list_accelerators_realtime( accelerator_count = 0 if kubernetes_utils.GPU_RESOURCE_KEY in node.status.allocatable: - accelerator_count = int( - node.status.allocatable[kubernetes_utils.GPU_RESOURCE_KEY]) - elif kubernetes_utils.TPU_RESOURCE_KEY in node.status.allocatable: - accelerator_count = int( - node.status.allocatable[kubernetes_utils.TPU_RESOURCE_KEY]) + accelerator_count = int(node.status.allocatable[ + kubernetes_utils.GPU_RESOURCE_KEY]) + elif (kubernetes_utils.TPU_RESOURCE_KEY + in node.status.allocatable): + accelerator_count = int(node.status.allocatable[ + kubernetes_utils.TPU_RESOURCE_KEY]) # Generate the GPU quantities for the accelerators if accelerator_name and accelerator_count > 0: @@ -138,8 +139,9 @@ def list_accelerators_realtime( accelerators_available = accelerator_count - allocated_qty if accelerator_count >= min_quantity_filter: - quantized_count = (min_quantity_filter * - (accelerator_count // min_quantity_filter)) + quantized_count = ( + min_quantity_filter * + (accelerator_count // min_quantity_filter)) if accelerator_name not in total_accelerators_capacity: total_accelerators_capacity[ accelerator_name] = quantized_count diff --git a/sky/clouds/utils/gcp_utils.py 
b/sky/clouds/utils/gcp_utils.py index 549eb422015..b44c84b122d 100644 --- a/sky/clouds/utils/gcp_utils.py +++ b/sky/clouds/utils/gcp_utils.py @@ -34,12 +34,12 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool: def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool: + assert resources is not None acc, _ = list(resources.accelerators.items())[0] if not is_tpu(resources): return False elif kubernetes_utils.is_tpu_pod_slice(acc): return False - assert resources is not None if resources.accelerator_args is None: return True return resources.accelerator_args.get('tpu_vm', True) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index d81296c2b10..c66bd3f5a20 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -79,10 +79,10 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]: return {'component': f'{cluster_name}-head'} -def _formatted_resource_requirements(pod_or_spec: Union['Pod', dict]) -> str: +def _formatted_resource_requirements(pod_or_spec: Union[Any, dict]) -> str: # Returns a formatted string of resource requirements for a pod. resource_requirements = {} - + if isinstance(pod_or_spec, dict): containers = pod_or_spec.get('spec', {}).get('containers', []) else: @@ -95,21 +95,20 @@ def _formatted_resource_requirements(pod_or_spec: Union['Pod', dict]) -> str: else: resources = container.resources requests = resources.requests or {} - + for resource, value in requests.items(): if resource not in resource_requirements: resource_requirements[resource] = 0 if resource == 'memory': int_value = kubernetes_utils.parse_memory_resource(value) else: - int_value = kubernetes_utils.parse_cpu_or_gpu_resource( - value) - resource_requirements[resource] += int_value + int_value = kubernetes_utils.parse_cpu_or_gpu_resource(value) + resource_requirements[resource] += int(int_value) return ', '.join(f'{resource}={value}' - for resource, value in resource_requirements.items()) + for resource, value in resource_requirements.items()) -def _formatted_node_selector(pod_or_spec: Union['Pod', dict]) -> Optional[str]: +def _formatted_node_selector(pod_or_spec: Union[Any, dict]) -> Optional[str]: # Returns a formatted string of node selectors for a pod. node_selectors = [] @@ -117,7 +116,7 @@ def _formatted_node_selector(pod_or_spec: Union['Pod', dict]) -> Optional[str]: selectors = pod_or_spec.get('spec', {}).get('nodeSelector', {}) else: selectors = pod_or_spec.spec.node_selector - + if not selectors: return None @@ -127,19 +126,18 @@ def _formatted_node_selector(pod_or_spec: Union['Pod', dict]) -> Optional[str]: def _lack_resource_msg(resource: str, - pod_or_spec: Union['Pod', dict], - extra_msg: Optional[str] = None, - details: Optional[str] = None) -> str: + pod_or_spec: Union[Any, dict], + extra_msg: Optional[str] = None, + details: Optional[str] = None) -> str: resource_requirements = _formatted_resource_requirements(pod_or_spec) node_selectors = _formatted_node_selector(pod_or_spec) node_selector_str = f' and labels ({node_selectors})' if ( node_selectors) else '' - msg = ( - f'Insufficient {resource} capacity on the cluster. ' - f'Required resources ({resource_requirements}){node_selector_str} ' - 'were not found in a single node. Other SkyPilot tasks or pods may ' - 'be using resources. Check resource usage by running ' - '`kubectl describe nodes`.') + msg = (f'Insufficient {resource} capacity on the cluster. 
' + f'Required resources ({resource_requirements}){node_selector_str} ' + 'were not found in a single node. Other SkyPilot tasks or pods may ' + 'be using resources. Check resource usage by running ' + '`kubectl describe nodes`.') if extra_msg: msg += f' {extra_msg}' if details: @@ -202,8 +200,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes): '`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.' # pylint: disable=line-too-long f' Full error: {event_message}') gpu_lf_keys = [ - key - for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY + key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY for key in lf.get_label_keys() ] if pod.spec.node_selector: @@ -606,7 +603,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str, # on TPU nodes. # Reference: https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work # pylint: disable=line-too-long tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY - if tpu_label in config.node_config.get('spec', {}).get('nodeSelector', {}): + if tpu_label in config.node_config.get('spec', + {}).get('nodeSelector', {}): tpu_toleration = { 'key': kubernetes_utils.TPU_RESOURCE_KEY, 'operator': 'Equal', diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index afc984e835a..b362ce32f36 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -40,15 +40,16 @@ } # The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on -# nodes. These keys are typically used in the node's status.allocatable +# nodes. These keys are typically used in the node's status.allocatable # or status.capacity fields to indicate the available resources on the node. GPU_RESOURCE_KEY = 'nvidia.com/gpu' TPU_RESOURCE_KEY = 'google.com/tpu' -NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure ' - f'{GPU_RESOURCE_KEY} resource is available on the nodes and ' - 'the node labels for identifying GPUs ' - '(e.g., skypilot.co/accelerator) are setup correctly. ') +NO_GPU_HELP_MESSAGE = ( + 'If your cluster contains GPUs, make sure ' + f'{GPU_RESOURCE_KEY} resource is available on the nodes and ' + 'the node labels for identifying GPUs ' + '(e.g., skypilot.co/accelerator) are setup correctly. ') KUBERNETES_AUTOSCALER_NOTE = ( 'Note: Kubernetes cluster autoscaling is enabled. ' @@ -74,10 +75,10 @@ # Mapping used to get generation for TPU accelerator name. 
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run GKE_TPU_ACCELERATOR_TO_GENERATION = { - "tpu-v4-podslice": "v4", - "tpu-v5-lite-device": "v5e", - "tpu-v5-lite-podslice": "v5e", - "tpu-v5p-slice": "v5p", + 'tpu-v4-podslice': 'v4', + 'tpu-v5-lite-device': 'v5e', + 'tpu-v5-lite-podslice': 'v5e', + 'tpu-v5p-slice': 'v5p', } logger = sky_logging.init_logger(__name__) @@ -92,14 +93,14 @@ class GPULabelFormatter: """ @classmethod - def get_label_key(cls, accelerator: str = None) -> str: + def get_label_key(cls, accelerator: str = '') -> str: """Returns the label key for GPU type used by the Kubernetes cluster""" raise NotImplementedError @classmethod def get_label_keys(cls) -> List[str]: """Returns a list of label keys for GPU used by Kubernetes cluster.""" - pass + raise NotImplementedError @classmethod def get_label_value(cls, accelerator: str) -> str: @@ -160,12 +161,12 @@ class SkyPilotLabelFormatter(GPULabelFormatter): LABEL_KEY = 'skypilot.co/accelerator' @classmethod - def get_label_key(cls, accelerator: str = None) -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: str = '') -> str: + del accelerator # Unused return cls.LABEL_KEY - + @classmethod - def get_label_keys(cls) -> str: + def get_label_keys(cls) -> List[str]: return [cls.LABEL_KEY] @classmethod @@ -201,12 +202,12 @@ class CoreWeaveLabelFormatter(GPULabelFormatter): LABEL_KEY = 'gpu.nvidia.com/class' @classmethod - def get_label_key(cls, accelerator: str = None) -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: str = '') -> str: + del accelerator # Unused return cls.LABEL_KEY @classmethod - def get_label_keys(cls) -> str: + def get_label_keys(cls) -> List[str]: return [cls.LABEL_KEY] @classmethod @@ -234,18 +235,18 @@ class GKELabelFormatter(GPULabelFormatter): TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology' @classmethod - def get_label_key(cls, accelerator: str = None) -> str: + def get_label_key(cls, accelerator: str = '') -> str: if accelerator.startswith('tpu-'): return cls.TPU_LABEL_KEY return cls.GPU_LABEL_KEY @classmethod - def get_label_keys(cls) -> str: + def get_label_keys(cls) -> List[str]: return [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] @classmethod - def match_label_key(cls, label: str) -> bool: - return label in [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY] + def match_label_key(cls, label_key: str) -> bool: + return label_key in cls.get_label_keys() @classmethod def get_tpu_topology_label_key(cls) -> str: @@ -292,12 +293,12 @@ class GFDLabelFormatter(GPULabelFormatter): LABEL_KEY = 'nvidia.com/gpu.product' @classmethod - def get_label_key(cls, accelerator: str = None) -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: str = '') -> str: + del accelerator # Unused return cls.LABEL_KEY @classmethod - def get_label_keys(cls) -> str: + def get_label_keys(cls) -> List[str]: return [cls.LABEL_KEY] @classmethod @@ -570,7 +571,8 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type) assert formatter is not None, ('Unsupported autoscaler type:' f' {autoscaler_type}') - return formatter.get_label_key(acc_type), formatter.get_label_value(acc_type) + return formatter.get_label_key(acc_type), formatter.get_label_value( + acc_type) has_gpus, cluster_resources = detect_gpu_resource() if has_gpus: @@ -581,11 +583,10 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # If none of the GPU labels from 
LABEL_FORMATTER_REGISTRY are # detected, raise error with ux_utils.print_exception_no_traceback(): - supported_formats = ', '.join( - [key - for f in LABEL_FORMATTER_REGISTRY - for key in f.get_label_keys()] - ) + supported_formats = ', '.join([ + key for f in LABEL_FORMATTER_REGISTRY + for key in f.get_label_keys() + ]) suffix = '' if env_options.Options.SHOW_DEBUG_INFO.get(): suffix = f' Found node labels: {node_labels}' @@ -632,8 +633,8 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: all_labels = [] for node_name, label_list in node_labels.items(): all_labels.extend(label_list) - gpus_available = set( - v for k, v in all_labels if label_formatter.match_label_key(k)) + gpus_available = set(v for k, v in all_labels + if label_formatter.match_label_key(k)) suffix = f' Available GPUs on the cluster: {gpus_available}' raise exceptions.ResourcesUnavailableError( 'Could not find any node in the Kubernetes cluster ' @@ -653,12 +654,14 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: f'`{TPU_RESOURCE_KEY}`) in Kubernetes cluster. If this cluster ' 'contains GPUs, please ensure GPU drivers are installed on ' 'the node. Check if the GPUs are setup correctly by running ' - f'`kubectl describe nodes` and looking for the {GPU_RESOURCE_KEY!r} ' - f'or {TPU_RESOURCE_KEY!r} resource. Please refer to the documentation' - f'on how to set up GPUs.{suffix}') + '`kubectl describe nodes` and looking for the ' + f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. ' + 'Please refer to the documentation on how to set up GPUs.' + f'{suffix}') -def get_tpu_topology_label_key_value(accelerator: str) -> Tuple[str, str]: +def get_tpu_topology_label_key_value( + accelerator: str) -> Tuple[str, Optional[str]]: """Returns the TPU topology label key and value for given accelerator type. Args: @@ -685,13 +688,13 @@ def get_tpu_topology_label_key_value(accelerator: str) -> Tuple[str, str]: if labels_dict.get(tpu_label_key) == accelerator: topology_value = labels_dict.get(tpu_topology_label_key) return tpu_topology_label_key, topology_value - + # If TPU labels are not detected, raise error with ux_utils.print_exception_no_traceback(): suffix = '' if env_options.Options.SHOW_DEBUG_INFO.get(): suffix = (' Available node labels on the cluster: ' - f'{node_labels}') + f'{node_labels}') raise exceptions.ResourcesUnavailableError( f'Unable to find TPU topology for accelerator {accelerator!r}. 
' f'No node found with label `{tpu_label_key}={accelerator}` ' @@ -1824,19 +1827,19 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: # Get the pods to get the real-time resource usage pods = get_kubernetes_pods() - label_formatter, _ = detect_gpu_label_formatter() - if not label_formatter: + lf, _ = detect_gpu_label_formatter() + if not lf: label_key = None else: - label_keys = label_formatter.get_label_keys() + label_keys = lf.get_label_keys() node_info_dict: Dict[str, KubernetesNodeInfo] = {} for label_key in label_keys: for node in nodes: allocated_qty = 0 - if label_formatter is not None and label_key in node.metadata.labels: - accelerator_name = label_formatter.get_accelerator_from_label_value( + if lf is not None and label_key in node.metadata.labels: + accelerator_name = lf.get_accelerator_from_label_value( node.metadata.labels.get(label_key)) else: accelerator_name = None @@ -1861,10 +1864,11 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: allocated_qty += int( container.resources.requests.get( GPU_RESOURCE_KEY, 0)) - elif TPU_RESOURCE_KEY in container.resources.requests: + elif (TPU_RESOURCE_KEY in + container.resources.requests): allocated_qty += int( container.resources.requests.get( - TPU_RESOURCE_KEY, 0)) + TPU_RESOURCE_KEY, 0)) accelerators_available = accelerator_count - allocated_qty @@ -1886,5 +1890,6 @@ def get_context_from_config(provider_config: Dict[str, Any]) -> str: return provider_config.get('context', get_current_kube_config_context_name()) + def is_tpu_pod_slice(accelerator: str) -> bool: return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION diff --git a/sky/resources.py b/sky/resources.py index 0ea5e4b1aaa..928be3d7ca0 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -570,8 +570,9 @@ def _set_accelerators( self._cloud = clouds.Kubernetes() else: self._cloud = clouds.GCP() - assert (self.cloud.is_same_cloud( - clouds.GCP()) or self.cloud.is_same_cloud(clouds.Kubernetes())), 'Cloud must be GCP or Kubernetes.' + assert (self.cloud.is_same_cloud(clouds.GCP()) or + self.cloud.is_same_cloud(clouds.Kubernetes()) + ), 'Cloud must be GCP or Kubernetes.' if accelerator_args is None: accelerator_args = {} @@ -593,13 +594,14 @@ def _get_default_runtime_version() -> str: logger.info( 'Missing runtime_version in accelerator_args, using' f' default ({accelerator_args["runtime_version"]})') - + if self.instance_type is not None and use_tpu_vm: if self.instance_type != 'TPU-VM': with ux_utils.print_exception_no_traceback(): raise ValueError( 'Cannot specify instance type' - f' (got "{self.instance_type}") for TPU VM.') + f' (got "{self.instance_type}") for TPU VM.' + ) self._accelerators = accelerators self._accelerator_args = accelerator_args diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 5f13b665120..30872a69fff 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1900,7 +1900,7 @@ def test_tpu_pod_slice_gke(): f'sky launch -y -c {name} examples/tpu/tpuvm_mnist.yaml --cloud kubernetes --gpus tpu-v5-lite-podslice', f'sky logs {name} 1', # Ensure the job finished. f'sky logs {name} 1 --status', # Ensure the job succeeded. - f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"', # Ensure TPU is reachable. + f'sky exec {name} "conda activate flax; python -c \'import jax; print(jax.devices()[0].platform);\' | grep tpu || exit 1;"', # Ensure TPU is reachable. 
f'sky logs {name} 2 --status' ], f'sky down -y {name}', @@ -1908,6 +1908,7 @@ def test_tpu_pod_slice_gke(): ) run_one_test(test) + # ---------- Simple apps. ---------- @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_multi_hostname(generic_cloud: str): From 16b6c2909704daea9d6e0ef113ca131bca8ae6da Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 25 Sep 2024 08:12:10 +0000 Subject: [PATCH 37/63] nit --- sky/clouds/utils/gcp_utils.py | 6 +++--- sky/provision/kubernetes/utils.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py index b44c84b122d..01ca1c89a71 100644 --- a/sky/clouds/utils/gcp_utils.py +++ b/sky/clouds/utils/gcp_utils.py @@ -34,11 +34,11 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool: def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool: - assert resources is not None - acc, _ = list(resources.accelerators.items())[0] if not is_tpu(resources): return False - elif kubernetes_utils.is_tpu_pod_slice(acc): + assert resources is not None + acc, _ = list(resources.accelerators.items())[0] + if kubernetes_utils.is_tpu_pod_slice(acc): return False if resources.accelerator_args is None: return True diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index b362ce32f36..60be85ce9e8 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -1864,8 +1864,8 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]: allocated_qty += int( container.resources.requests.get( GPU_RESOURCE_KEY, 0)) - elif (TPU_RESOURCE_KEY in - container.resources.requests): + elif (TPU_RESOURCE_KEY + in container.resources.requests): allocated_qty += int( container.resources.requests.get( TPU_RESOURCE_KEY, 0)) From e3908435d96849291d183f513cbc022ab5917352 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Fri, 18 Oct 2024 03:27:00 +0000 Subject: [PATCH 38/63] check acc count support --- sky/clouds/kubernetes.py | 2 +- sky/provision/kubernetes/utils.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 68fbd27b1c1..ca50d133de3 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -273,7 +273,7 @@ def make_deploy_resources_variables( kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY): tpu_requested = True k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_tpu_topology_label_key_value(acc_type)) + kubernetes_utils.get_tpu_topology_label_key_value(acc_type, acc_count)) port_mode = network_utils.get_port_mode(None) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 745394f8144..c187ce8394c 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -247,6 +247,7 @@ class GKELabelFormatter(GPULabelFormatter): GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator' TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator' + ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count' TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology' @classmethod @@ -676,7 +677,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: def get_tpu_topology_label_key_value( - accelerator: str) -> Tuple[str, Optional[str]]: + accelerator: str, accelerator_count: int) -> Tuple[str, Optional[str]]: """Returns the TPU topology label key and value for given accelerator type. 
Args:
@@ -701,6 +702,8 @@ def get_tpu_topology_label_key_value(
     for labels in node_labels.values():
         labels_dict = dict(labels)
         if labels_dict.get(tpu_label_key) == accelerator:
+            tpu_chip_count = labels_dict.get(GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY)
+            #reduce topology and compare number with acc count
             topology_value = labels_dict.get(tpu_topology_label_key)
             return tpu_topology_label_key, topology_value
 
From 884f0a21beb7c8f240b7da40fb1ee5ce2ad50e44 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 19 Oct 2024 04:47:20 +0000
Subject: [PATCH 39/63] preemptive TPU check

---
 sky/provision/kubernetes/instance.py | 21 ++++++++-
 sky/provision/kubernetes/utils.py    | 65 +++++++++++++++++++++++---
 2 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index 4d9a0ea35f2..53200d0bfc6 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -173,7 +173,21 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                         # TODO(romilb): We may have additional node
                         # affinity selectors in the future - in that
                         # case we will need to update this logic.
-                        if (('Insufficient nvidia.com/gpu'
+                        # TODO(Doyoung): Update the error message raised
+                        # with the multi-host TPU support.
+                        if 'Insufficient google.com/tpu' in event_message:
+                            extra_msg = (
+                                f'Verify if '
+                                f'{pod.spec.node_selector[label_key]}'
+                                ' is available in the cluster. Note '
+                                'that multi-host TPU podslices are '
+                                'currently not supported.')
+                            raise config_lib.KubernetesError(
+                                _lack_resource_msg('TPU',
+                                                   pod,
+                                                   extra_msg,
+                                                   details=event_message))
+                        elif (('Insufficient nvidia.com/gpu'
                               in event_message) or
                              ('didn\'t match Pod\'s node affinity/selector'
                               in event_message)):
@@ -585,10 +599,13 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             error_msg = str(e)
             # Unlike other errors from a lack of CPU/GPU/memory resources, the
             # TPU shortage error is raised when the pod is attempted to be created.
+            # TODO(Doyoung): Update the error message raised
+            # with the multi-host TPU support.
             if 'Invalid resource requests for google.com/tpu.' in error_msg:
                 extra_msg = ('Verify if the cluster has a TPU slice node with '
                              'a topology matching the number of TPU(s) '
-                             'requested.')
+                             'requested. Note that multi-host TPU podslices '
+                             'are currently not supported.')
                 raise config_lib.KubernetesError(
                     _lack_resource_msg('TPU',
                                        pod_spec,
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index c187ce8394c..4acf6853b8d 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -1,5 +1,6 @@
 """Kubernetes utilities for SkyPilot."""
 import dataclasses
+import functools
 import json
 import math
 import os
@@ -508,13 +509,54 @@ def check_cpu_mem_fits(candidate_instance_type: 'KubernetesInstanceType',
             'Maximum resources found on a single node: '
             f'{max_cpu} CPUs, {common_utils.format_float(max_mem)}G Memory')
 
+    def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
+                       node_list: List[Any]) -> Tuple[bool, Optional[str]]:
+        # check if the requested TPU type is in the cluster
+        # if exists, check if the requested TPU topology is available in the cluster.
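(For intuition, a standalone sketch of the check the two comments above
describe; this is illustrative only and not part of the hunk. `node_tpus` is
a hypothetical list of (tpu_type, chip_count) pairs already read from the GKE
node labels used elsewhere in this patch:

    node_tpus = [('tpu-v5-lite-podslice', 4)]  # from node labels
    requested_type, requested_count = 'tpu-v5-lite-podslice', 4
    fits = any(tpu_type == requested_type and chips == requested_count
               for tpu_type, chips in node_tpus)
)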
+ node_list = gpu_nodes + acc_type = candidate_instance_type.accelerator_type + acc_count = candidate_instance_type.accelerator_count + tpu_list_in_cluster = [] + for node in node_list: + if acc_type == node.metadata.labels[GKELabelFormatter.TPU_LABEL_KEY]: + topology_value = node.metadata.labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY] + # node_tpu_chip_count represents the number of TPU chips + # available in this node. If the node is part of a node pool + # forming a multi-host TPU podslice, it only reflects the + # number of TPU chips in this individual node, not the entire + # multi-host TPU podslice. + node_tpu_chip_count = node.metadata.labels[GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY] + chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")] + # topology_chip_count represents the total number of TPU chips + # in the entire podslice, whether it is a single-host or + # multi-host TPU podslice. + topology_chip_count = functools.reduce(lambda x, y: x * y, chip_dimensions) + # TODO(Doyoung): Update the naming scheme with multi-host TPU + # support. + tpu_type = f'{acc_type}:{node_tpu_chip_count}' + tpu_list_in_cluster.append(tpu_type) + # For multi-host TPU podslices, topology_chip_count and + # node_tpu_chip_count will differ, as topology_chip_count + # reflects the total across all hosts, while + # node_tpu_chip_count reflects only the chips in a single node. + # TODO(Doyoung): Remove the condition, + # node_tpu_chip_count == topology_chip_count, when adding + # multi-host TPU support. + if node_tpu_chip_count == topology_chip_count and topology_chip_count == acc_count: + return True, None + tpu_list_in_cluster_str = ','.join(tpu_list_in_cluster) + return False, ('Requested TPU type was not found in the cluster. TPU ' + 'types found in the cluster: ' + f'{tpu_list_in_cluster_str}.') + nodes = get_kubernetes_nodes() k8s_instance_type = KubernetesInstanceType.\ from_instance_type(instance) acc_type = k8s_instance_type.accelerator_type if acc_type is not None: - # If GPUs are requested, check if GPU type is available, and if so, - # check if CPU and memory requirements on the specific node are met. + # If GPU/TPUs are requested, check if GPU/TPU type is available, and + # if so, check if CPU and memory requirements on the specific node are + # met. try: gpu_label_key, gpu_label_val = get_gpu_label_key_value(acc_type) except exceptions.ResourcesUnavailableError as e: @@ -526,6 +568,13 @@ def check_cpu_mem_fits(candidate_instance_type: 'KubernetesInstanceType', node.metadata.labels[gpu_label_key] == gpu_label_val ] assert len(gpu_nodes) > 0, 'GPU nodes not found' + if is_tpu_pod_slice(acc_type): + # If requested accelerator is a TPU type, check if the cluster + # has sufficient TPU resource to meet the requirement. + fits, reason = check_tpu_fits(k8s_instance_type, gpu_nodes) + if reason is not None: + return fits, reason + candidate_nodes = gpu_nodes not_fit_reason_prefix = ( f'GPU nodes with {acc_type} do not have ' @@ -537,7 +586,7 @@ def check_cpu_mem_fits(candidate_instance_type: 'KubernetesInstanceType', f'CPU (> {k8s_instance_type.cpus} CPUs) ' 'and/or memory ' f'(> {k8s_instance_type.memory} G). ') - # Check if CPU and memory requirements are met on at least one + # Check if CPU and memory requirements are met on at least one # candidate node. 
fits, reason = check_cpu_mem_fits(k8s_instance_type,
                                          candidate_nodes)
        if not fits:
@@ -702,10 +751,11 @@ def get_tpu_topology_label_key_value(
     for labels in node_labels.values():
         labels_dict = dict(labels)
         if labels_dict.get(tpu_label_key) == accelerator:
-            tpu_chip_count = labels_dict.get(GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY)
-            #reduce topology and compare number with acc count
             topology_value = labels_dict.get(tpu_topology_label_key)
-            return tpu_topology_label_key, topology_value
+            chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")]
+            num_chips = functools.reduce(lambda x, y: x * y, chip_dimensions)
+            if num_chips == accelerator_count:
+                return tpu_topology_label_key, topology_value
 
     # If TPU labels are not detected, raise error
     with ux_utils.print_exception_no_traceback():
         suffix = ''
         if env_options.Options.SHOW_DEBUG_INFO.get():
             suffix = (' Available node labels on the cluster: '
                       f'{node_labels}')
         raise exceptions.ResourcesUnavailableError(
             f'Unable to find TPU topology for accelerator {accelerator!r}. '
             f'No node found with label `{tpu_label_key}={accelerator}` '
-            f'or missing {tpu_topology_label_key!r} label.{suffix}')
+            f'or missing {tpu_topology_label_key!r} label.{suffix}. Note '
+            'that multi-host TPU podslices are currently not supported.')
 
 def get_head_ssh_port(cluster_name: str, namespace: str,

From 11142e5338c4cfa6053f6e398aa7f2bdb289585c Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 19 Oct 2024 04:47:20 +0000
Subject: [PATCH 40/63] update check_tpu_fits

---
 sky/clouds/kubernetes.py          | 2 +-
 sky/provision/kubernetes/utils.py | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py
index 6d2c7769bf3..e7bc02b651a 100644
--- a/sky/clouds/kubernetes.py
+++ b/sky/clouds/kubernetes.py
@@ -377,7 +377,7 @@ def make_deploy_resources_variables(
                     kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY):
                 tpu_requested = True
                 k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = (
-                    kubernetes_utils.get_tpu_topology_label_key_value(acc_type, acc_count))
+                    kubernetes_utils.get_tpu_topology_label_key_value(context, acc_type, acc_count))
 
         port_mode = network_utils.get_port_mode(None)
 
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 6e8438c1ba0..d41e04d7e8b 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -532,7 +532,6 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
                        node_list: List[Any]) -> Tuple[bool, Optional[str]]:
         # check if the requested TPU type is in the cluster
         # if exists, check if the requested TPU topology is available in the cluster.
-        node_list = gpu_nodes
         acc_type = candidate_instance_type.accelerator_type
         acc_count = candidate_instance_type.accelerator_count
         tpu_list_in_cluster = []
         for node in node_list:
             if acc_type == node.metadata.labels[GKELabelFormatter.TPU_LABEL_KEY]:
                 topology_value = node.metadata.labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY]
                 # node_tpu_chip_count represents the number of TPU chips
                 # available in this node. If the node is part of a node pool
                 # forming a multi-host TPU podslice, it only reflects the
                 # number of TPU chips in this individual node, not the entire
                 # multi-host TPU podslice.
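(Aside: a GKE topology label reduces to a total chip count by multiplying its
dimensions. A standalone sketch mirroring the functools.reduce calls in the
hunks above and below, where '2x4' is a hypothetical value of the
cloud.google.com/gke-tpu-topology label:

    import functools
    topology = '2x4'
    num_chips = functools.reduce(lambda x, y: x * y,
                                 [int(dim) for dim in topology.split('x')])
    assert num_chips == 8
)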
-                node_tpu_chip_count = node.metadata.labels[GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY]
+                node_tpu_chip_count = int(node.metadata.labels[GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY])
                 chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")]
                 # topology_chip_count represents the total number of TPU chips
@@ -747,7 +747,7 @@ def get_gpu_label_key_value(context: Optional[str],
 
 
 def get_tpu_topology_label_key_value(
-    accelerator: str, accelerator_count: int) -> Tuple[str, Optional[str]]:
+    context: Optional[str], accelerator: str, accelerator_count: int) -> Tuple[str, Optional[str]]:
     """Returns the TPU topology label key and value for given accelerator type.
 
     Args:
@@ -764,7 +763,7 @@ def get_tpu_topology_label_key_value(
     """
-    label_formatter, node_labels = detect_gpu_label_formatter()
+    label_formatter, node_labels = detect_gpu_label_formatter(context)
     assert isinstance(label_formatter, GKELabelFormatter)
 
     tpu_label_key = label_formatter.TPU_LABEL_KEY

From de55663e961cca33cfe3ea2d2f1659827007de06 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 19 Oct 2024 04:51:11 +0000
Subject: [PATCH 41/63] error msg update

---
 sky/provision/kubernetes/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index d41e04d7e8b..0f8f56dd767 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -563,9 +563,12 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
             if node_tpu_chip_count == topology_chip_count and topology_chip_count == acc_count:
                 return True, None
         tpu_list_in_cluster_str = ','.join(tpu_list_in_cluster)
+        # TODO(Doyoung): Update the error message raised with the multi-host
+        # TPU support.
         return False, ('Requested TPU type was not found in the cluster. TPU '
                        'types found in the cluster: '
-                       f'{tpu_list_in_cluster_str}.')
+                       f'{tpu_list_in_cluster_str}. Note that multi-host TPU '
+                       'podslices are currently not supported.')

From a500555d72fd6d65c139a39db9da8a447c025123 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 19 Oct 2024 23:09:13 +0000
Subject: [PATCH 42/63] merge get_tpu_topology_label_key_value into get_gpu_label_key_value

---
 sky/clouds/kubernetes.py                      |   6 +-
 .../service_catalog/kubernetes_catalog.py     |   2 +-
 sky/provision/kubernetes/utils.py             | 123 ++++++++----------
 tests/common.py                               |   2 +-
 4 files changed, 55 insertions(+), 78 deletions(-)

diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py
index e7bc02b651a..c8565dd6145 100644
--- a/sky/clouds/kubernetes.py
+++ b/sky/clouds/kubernetes.py
@@ -371,13 +371,11 @@ def make_deploy_resources_variables(
 
         # If GPU/TPUs are requested, set node label to match the GPU/TPU type.
if acc_count > 0 and acc_type is not None: - k8s_acc_label_key, k8s_acc_label_value = \ - kubernetes_utils.get_gpu_label_key_value(context, acc_type) + k8s_acc_label_key, k8s_acc_label_value, k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( + kubernetes_utils.get_accelerator_label_key_value(context, acc_type, acc_count)) if (k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY): tpu_requested = True - k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_tpu_topology_label_key_value(context, acc_type, acc_count)) port_mode = network_utils.get_port_mode(None) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 309c574fa2a..1b6d5b2b414 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -84,7 +84,7 @@ def list_accelerators_realtime( ) or not kubernetes_utils.check_credentials(context)[0]: return {}, {}, {} - has_gpu = kubernetes_utils.detect_gpu_resource(context) + has_gpu = kubernetes_utils.detect_accelerator_resource(context) if not has_gpu: return {}, {}, {} diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 0f8f56dd767..6287a03d7c1 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -420,8 +420,8 @@ def detect_gpu_label_formatter( @functools.lru_cache(maxsize=10) -def detect_gpu_resource(context: Optional[str]) -> Tuple[bool, Set[str]]: - """Checks if the Kubernetes cluster has nvidia.com/gpu resource. +def detect_accelerator_resource(context: Optional[str]) -> Tuple[bool, Set[str]]: + """Checks if the Kubernetes cluster has GPU/TPU resource. Two types of accelerator resources are available which are each checked with nvidia.com/gpu and google.com/tpu. If nvidia.com/gpu resource is @@ -574,13 +574,14 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', k8s_instance_type = KubernetesInstanceType.\ from_instance_type(instance) acc_type = k8s_instance_type.accelerator_type + acc_count = k8s_instance_type.accelerator_count if acc_type is not None: # If GPU/TPUs are requested, check if GPU/TPU type is available, and # if so, check if CPU and memory requirements on the specific node are # met. try: - gpu_label_key, gpu_label_val = get_gpu_label_key_value( - context, acc_type) + gpu_label_key, gpu_label_val, _, _ = get_accelerator_label_key_value( + context, acc_type, acc_count) except exceptions.ResourcesUnavailableError as e: # If GPU not found, return empty list and error message. return False, str(e) @@ -619,25 +620,31 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', return fits, reason -def get_gpu_label_key_value(context: Optional[str], - acc_type: str, - check_mode=False) -> Tuple[str, str]: - """Returns the label key and value for the given GPU type. +def get_accelerator_label_key_value(context: Optional[str], + acc_type: str, + acc_count: int, + check_mode=False + ) -> Tuple[str, str, str, str]: + """Returns the label key and value for the given GPU/TPU type. Args: - acc_type: The GPU type required by the task. - check_mode: If True, only checks if the cluster has GPU resources and - labels are setup on the cluster. acc_type is ignore does not return - the label key and value. Useful for checking if GPUs are configured - correctly on the cluster without explicitly requesting a acc_type. + acc_type: The GPU/TPU type required by the task. 
+        acc_count: Number of GPU/TPUs required by the task.
+        check_mode: If True, only checks if the cluster has GPU/TPU resources
+            and labels are setup on the cluster. acc_type is ignored and the
+            label key and value are not returned. Useful for checking if GPUs
+            are configured correctly on the cluster without explicitly
+            requesting an acc_type.
 
     Returns:
-        A tuple of the label key and value. Returns empty strings if check_mode
-        is True.
+        A tuple of the accelerator label key, value, topology label key, and
+        topology value. The topology label key and value are populated only if
+        the requested accelerator type is TPU. Returns empty strings if
+        check_mode is True.
 
     Raises:
         ResourcesUnavailableError: Can be raised from the following conditions:
-            - The cluster does not have GPU resources (nvidia.com/gpu)
-            - The cluster does not have GPU labels setup correctly
-            - The cluster doesn't have any nodes with acc_type GPU
+            - The cluster does not have GPU/TPU resources (nvidia.com/gpu, google.com/tpu)
+            - The cluster does not have GPU/TPU labels setup correctly
+            - The cluster doesn't have any nodes with acc_type GPU/TPU
     """
     # Check if the cluster has GPU resources
     # TODO(romilb): This assumes the accelerator is a nvidia GPU. We
@@ -656,14 +663,14 @@ def get_gpu_label_key_value(context: Optional[str],
             # If check mode is enabled and autoscaler is set, we can return
             # early since we assume the cluster autoscaler will handle GPU
             # node provisioning.
-            return '', ''
+            return '', '', '', ''
         formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type)
         assert formatter is not None, ('Unsupported autoscaler type:'
                                        f' {autoscaler_type}')
         return formatter.get_label_key(acc_type), formatter.get_label_value(
-            acc_type)
+            acc_type), '', ''
 
-    has_gpus, cluster_resources = detect_gpu_resource(context)
+    has_gpus, cluster_resources = detect_accelerator_resource(context)
     if has_gpus:
         # Check if the cluster has GPU labels setup correctly
         label_formatter, node_labels = \
@@ -701,7 +708,7 @@ def get_gpu_label_key_value(context: Optional[str],
         if check_mode:
             # If check mode is enabled and we reached so far, we can
             # conclude that the cluster is setup correctly and return.
-            return '', ''
+            return '', '', '', ''
 
         # Search in node_labels to see if any node has the requested
        # GPU type.
# Note - this only checks if the label is available on a
        # node. It does not (and should not) check if the GPU
        # quantity is available since that is dynamic and can change
        # during scheduling.
        for node_name, label_list in node_labels.items():
            for label, value in label_list:
                if (label_formatter.match_label_key(label) and
                        label_formatter.get_accelerator_from_label_value(
                            value) == acc_type):
-                    return label, value
+                    if is_tpu_pod_slice(acc_type):
+                        assert isinstance(label_formatter, GKELabelFormatter)
+                        topology_label_key = label_formatter.TPU_TOPOLOGY_LABEL_KEY
+                        labels_dict = dict(node_labels[node_name])
+                        if labels_dict.get(label_formatter.TPU_LABEL_KEY) == acc_type:
+                            topology_value = labels_dict.get(topology_label_key)
+                            chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")]
+                            num_chips = functools.reduce(lambda x, y: x * y, chip_dimensions)
+                            if num_chips == acc_count:
+                                return label, value, topology_label_key, topology_value
+                        else:
+                            continue
+                    else:
+                        return label, value, '', ''
 
        # If no node is found with the requested acc_type, raise error
        with ux_utils.print_exception_no_traceback():
            if env_options.Options.SHOW_DEBUG_INFO.get():
                all_labels = []
                for node_name, label_list in node_labels.items():
                    all_labels.extend(label_list)
-                gpus_available = set(v for k, v in all_labels
+                acc_available = set(v for k, v in all_labels
                                     if label_formatter.match_label_key(k))
-                suffix = f' Available GPUs on the cluster: {gpus_available}'
+                suffix = f' Available GPU/TPUs on the cluster: {acc_available}'
+                # TODO(Doyoung): Update the error message raised with the multi-host
+                # TPU support.
                raise exceptions.ResourcesUnavailableError(
                    'Could not find any node in the Kubernetes cluster '
                    f'with {acc_type}. Please ensure at least one node in the '
                    f'cluster has {acc_type} and node labels are setup '
                    'correctly. '
-                    f'Please refer to the documentation for more. {suffix}')
+                    f'Please refer to the documentation for more. {suffix}. '
+                    'Note that multi-host TPU podslices are currently not '
+                    'supported.')
    else:
        # If GPU resources are not detected, raise error
        with ux_utils.print_exception_no_traceback():
            suffix = ''
            if env_options.Options.SHOW_DEBUG_INFO.get():
                suffix = (' Available resources on the cluster: '
                          f'{cluster_resources}')
            raise exceptions.ResourcesUnavailableError(
-                f'Could not detect GPU/TPU resources (`{GPU_RESOURCE_KEY}` or '
-                f'`{TPU_RESOURCE_KEY}`) in Kubernetes cluster. If this cluster '
-                'contains GPUs, please ensure GPU drivers are installed on '
+                f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
+                f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
+                ' contains GPUs, please ensure GPU drivers are installed on '
                'the node. Check if the GPUs are setup correctly by running '
                '`kubectl describe nodes` and looking for the '
                f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
                'Please refer to the documentation on how to set up GPUs.'
                f'{suffix}')
 
 
-def get_tpu_topology_label_key_value(
-    context: Optional[str], accelerator: str, accelerator_count: int) -> Tuple[str, Optional[str]]:
-    """Returns the TPU topology label key and value for given accelerator type.
-
-    Args:
-        accelerator: The TPU accelerator type required by the task.
-
-    Returns:
-        A tuple of the TPU topology label key and value.
-
-    Raises:
-        ResourcesUnavailableError: Can be raised from the following conditions:
-            - The cluster does not have TPU labels set up correctly.
-            - The cluster doesn't have any nodes with the specified TPU
-              accelerator type.
-            - The TPU topology label is missing for the specified accelerator.
-    """
-    label_formatter, node_labels = detect_gpu_label_formatter(context)
-    assert isinstance(label_formatter, GKELabelFormatter)
-
-    tpu_label_key = label_formatter.TPU_LABEL_KEY
-    tpu_topology_label_key = label_formatter.TPU_TOPOLOGY_LABEL_KEY
-
-    for labels in node_labels.values():
-        labels_dict = dict(labels)
-        if labels_dict.get(tpu_label_key) == accelerator:
-            topology_value = labels_dict.get(tpu_topology_label_key)
-            chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")]
-            num_chips = functools.reduce(lambda x, y: x * y, chip_dimensions)
-            if num_chips == accelerator_count:
-                return tpu_topology_label_key, topology_value
-
-    # If TPU labels are not detected, raise error
-    with ux_utils.print_exception_no_traceback():
-        suffix = ''
-        if env_options.Options.SHOW_DEBUG_INFO.get():
-            suffix = (' Available node labels on the cluster: '
-                      f'{node_labels}')
-        raise exceptions.ResourcesUnavailableError(
-            f'Unable to find TPU topology for accelerator {accelerator!r}. '
-            f'No node found with label `{tpu_label_key}={accelerator}` '
-            f'or missing {tpu_topology_label_key!r} label.{suffix}. Note '
-            'that multi-host TPU podslices are currently not supported.')
-
-
 def get_head_ssh_port(cluster_name: str, namespace: str,
                       context: Optional[str]) -> int:
     svc_name = f'{cluster_name}-head-ssh'
@@ -886,7 +865,7 @@ def check_credentials(context: Optional[str],
     # provider if their cluster GPUs are not setup correctly.
     gpu_msg = ''
     try:
-        _, _ = get_gpu_label_key_value(context, acc_type='', check_mode=True)
+        _, _, _, _ = get_accelerator_label_key_value(context, acc_type='', acc_count=0, check_mode=True)
     except exceptions.ResourcesUnavailableError as e:
         # If GPUs are not available, we return cluster as enabled (since it can
         # be a CPU-only cluster) but we also return the exception message which
diff --git a/tests/common.py b/tests/common.py
index c6f08588d99..d50ae7facdf 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -64,7 +64,7 @@ def _get_az_mappings(_):
     monkeypatch.setattr(
         'sky.provision.kubernetes.utils.detect_gpu_label_formatter',
         lambda *_args, **_kwargs: [kubernetes_utils.SkyPilotLabelFormatter, {}])
-    monkeypatch.setattr('sky.provision.kubernetes.utils.detect_gpu_resource',
+    monkeypatch.setattr('sky.provision.kubernetes.utils.detect_accelerator_resource',
                         lambda *_args, **_kwargs: [True, []])
     monkeypatch.setattr('sky.provision.kubernetes.utils.check_instance_fits',
                         lambda *_args, **_kwargs: [True, ''])
     monkeypatch.setattr('sky.provision.kubernetes.utils.get_spot_label',

From bce87318e908dcbf4d6f47424c386c2c09e210b1 Mon Sep 17 00:00:00 2001
From: landscapepainter <34902420+landscapepainter@users.noreply.github.com>
Date: Sat, 19 Oct 2024 16:46:51 -0700
Subject: [PATCH 43/63] Update sky/provision/kubernetes/utils.py

Co-authored-by: Tian Xia
---
 sky/provision/kubernetes/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 6287a03d7c1..98726a2d75b 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -57,9 +57,9 @@
 
 NO_GPU_HELP_MESSAGE = (
     'If your cluster contains GPUs or TPUs, make sure '
-    f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available'
-    ' on the nodes and the node labels for identifying '
-    'GPUs/TPUs (e.g., skypilot.co/accelerator) are setup correctly. ')
+    f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available '
+    'on the nodes and the node labels for identifying GPUs/TPUs '
+    '(e.g., skypilot.co/accelerator) are setup correctly.
')
 
 KUBERNETES_AUTOSCALER_NOTE = (
     'Note: Kubernetes cluster autoscaling is enabled. '

From 0e8366c6ca6fc46e89498eab952234ead8778c6f Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sun, 20 Oct 2024 00:33:25 +0000
Subject: [PATCH 44/63] nit fixes

---
 sky/cli.py                        |  3 ++-
 sky/provision/kubernetes/utils.py | 18 +++++++++---------
 sky/resources.py                  |  5 ++---
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/sky/cli.py b/sky/cli.py
index d23df3c9c58..d25d847bb19 100644
--- a/sky/cli.py
+++ b/sky/cli.py
@@ -3106,7 +3106,8 @@ def _get_kubernetes_realtime_gpu_table(
                     'in Kubernetes cluster. ')
         debug_msg = ('To show available accelerators on kubernetes,'
                      ' run: sky show-gpus --cloud kubernetes ')
-        full_err_msg = (err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE +
+        full_err_msg = (err_msg +
+                        kubernetes_utils.NO_ACCELERATOR_HELP_MESSAGE +
                         debug_msg)
         raise ValueError(full_err_msg)
     for gpu, _ in sorted(counts.items()):
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 98726a2d75b..275b8a2b9cc 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -55,7 +55,7 @@
 GPU_RESOURCE_KEY = 'nvidia.com/gpu'
 TPU_RESOURCE_KEY = 'google.com/tpu'
 
-NO_GPU_HELP_MESSAGE = (
+NO_ACCELERATOR_HELP_MESSAGE = (
     'If your cluster contains GPUs or TPUs, make sure '
     f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available '
     'on the nodes and the node labels for identifying GPUs/TPUs '
@@ -153,10 +153,11 @@ def validate_label_value(cls, value: str) -> Tuple[bool, str]:
 
 
 def get_gke_accelerator_name(accelerator: str) -> str:
-    """Returns the accelerator name for GKE clusters
+    """Returns the accelerator name for GKE clusters.
 
     Uses the format - nvidia-tesla-<accelerator>.
-    A100-80GB, H100-80GB and L4 are an exception. They use nvidia-<accelerator>.
+    A100-80GB, H100-80GB, L4 are an exception. They use nvidia-<accelerator>.
+    TPU types are an exception as well, keeping the given name.
     """
     if accelerator == 'H100':
         # H100 is named as H100-80GB in GKE.
@@ -751,10 +752,9 @@ def get_accelerator_label_key_value(context: Optional[str],
         'Could not find any node in the Kubernetes cluster '
         f'with {acc_type}. Please ensure at least one node in the '
         f'cluster has {acc_type} and node labels are setup '
-        'correctly. '
-        f'Please refer to the documentation for more. {suffix}. '
-        'Note that multi-host TPU podslices are currently not '
-        'supported.')
+        'correctly. Please refer to the documentation for more. '
+        f'{suffix}. Note that multi-host TPU podslices are '
+        'currently not supported.')
 else:
     # If GPU resources are not detected, raise error
     with ux_utils.print_exception_no_traceback():
@@ -1942,7 +1942,7 @@ def __init__(self, obj):
 class KubernetesNodeInfo:
     """Dataclass to store Kubernetes node information."""
     name: str
-    gpu_type: Optional[str]
+    accelerator_type: Optional[str]
    # Resources available on the node.
E.g., {'nvidia.com/gpu': '2'} total: Dict[str, int] free: Dict[str, int] @@ -2011,7 +2011,7 @@ def get_kubernetes_node_info( node_info_dict[node.metadata.name] = KubernetesNodeInfo( name=node.metadata.name, - gpu_type=accelerator_name, + accelerator_type=accelerator_name, total={'accelerator_count': int(accelerator_count)}, free={'accelerators_available': int(accelerators_available)}) diff --git a/sky/resources.py b/sky/resources.py index 7fcaea1db82..764858afc10 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -599,9 +599,8 @@ def _get_default_runtime_version() -> str: if self.instance_type != 'TPU-VM': with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Cannot specify instance type' - f' (got "{self.instance_type}") for TPU VM.' - ) + 'Cannot specify instance type (got ' + f'{self.instance_type!r}) for TPU VM.') self._accelerators = accelerators self._accelerator_args = accelerator_args From f67ad0fb7a6bda6d541f15f4c3c8c53b678223e5 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 20 Oct 2024 01:03:02 +0000 Subject: [PATCH 45/63] format --- sky/cli.py | 2 +- sky/clouds/kubernetes.py | 14 +++-- sky/provision/kubernetes/instance.py | 6 +- sky/provision/kubernetes/utils.py | 93 ++++++++++++++++++---------- sky/templates/kubernetes-ray.yml.j2 | 4 +- tests/common.py | 5 +- 6 files changed, 78 insertions(+), 46 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index d25d847bb19..05bcd732070 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3124,7 +3124,7 @@ def _get_kubernetes_node_info_table(context: Optional[str]): node_info_dict = kubernetes_utils.get_kubernetes_node_info(context) for node_name, node_info in node_info_dict.items(): node_table.add_row([ - node_name, node_info.gpu_type, + node_name, node_info.accelerator_type, node_info.total['accelerator_count'], node_info.free['accelerators_available'] ]) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index c8565dd6145..106a282438e 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -365,14 +365,16 @@ def make_deploy_resources_variables( k8s_acc_label_key = None k8s_acc_label_value = None - k8s_tpu_topology_label_key = None - k8s_tpu_topology_label_value = None + k8s_topology_label_key = None + k8s_topology_label_value = None tpu_requested = False # If GPU/TPUs are requested, set node label to match the GPU/TPU type. 
if acc_count > 0 and acc_type is not None: - k8s_acc_label_key, k8s_acc_label_value, k8s_tpu_topology_label_key, k8s_tpu_topology_label_value = ( - kubernetes_utils.get_accelerator_label_key_value(context, acc_type, acc_count)) + (k8s_acc_label_key, k8s_acc_label_value, k8s_topology_label_key, + k8s_topology_label_value) = ( + kubernetes_utils.get_accelerator_label_key_value( + context, acc_type, acc_count)) if (k8s_acc_label_key == kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY): tpu_requested = True @@ -438,8 +440,8 @@ def make_deploy_resources_variables( 'k8s_spot_label_key': spot_label_key, 'k8s_spot_label_value': spot_label_value, 'tpu_requested': tpu_requested, - 'k8s_tpu_topology_label_key': k8s_tpu_topology_label_key, - 'k8s_tpu_topology_label_value': k8s_tpu_topology_label_value, + 'k8s_topology_label_key': k8s_topology_label_key, + 'k8s_topology_label_value': k8s_topology_label_value, 'image_id': image_id, } diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 9b298e0d784..44ed1b91ec6 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -188,9 +188,9 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes): extra_msg, details=event_message)) elif (('Insufficient nvidia.com/gpu' - in event_message) or - ('didn\'t match Pod\'s node affinity/selector' - in event_message)): + in event_message) or + ('didn\'t match Pod\'s node affinity/selector' + in event_message)): extra_msg = ( f'Verify if ' f'{pod.spec.node_selector[label_key]}' diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 275b8a2b9cc..b309a617271 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -421,7 +421,8 @@ def detect_gpu_label_formatter( @functools.lru_cache(maxsize=10) -def detect_accelerator_resource(context: Optional[str]) -> Tuple[bool, Set[str]]: +def detect_accelerator_resource( + context: Optional[str]) -> Tuple[bool, Set[str]]: """Checks if the Kubernetes cluster has GPU/TPU resource. Two types of accelerator resources are available which are each checked @@ -531,25 +532,37 @@ def check_cpu_mem_fits(candidate_instance_type: 'KubernetesInstanceType', def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', node_list: List[Any]) -> Tuple[bool, Optional[str]]: - # check if the requested TPU type is in the cluster - # if exists, check if the requested TPU topology is available in the cluster. + """Checks if the instance fits on the cluster based on requested TPU. + + It checks if the TPU type and count on each node match the required + number of TPU chips for the instance. In the case of multi-host TPU + podslice, the function ensures that the number of TPU chips on a single + node (node_tpu_chip_count) and the total TPU chips across the entire + podslice (topology_chip_count) are correctly handled. + """ acc_type = candidate_instance_type.accelerator_type acc_count = candidate_instance_type.accelerator_count tpu_list_in_cluster = [] for node in node_list: - if acc_type == node.metadata.labels[GKELabelFormatter.TPU_LABEL_KEY]: - topology_value = node.metadata.labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY] + if acc_type == node.metadata.labels[ + GKELabelFormatter.TPU_LABEL_KEY]: + topology_value = node.metadata.labels[ + GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY] # node_tpu_chip_count represents the number of TPU chips # available in this node. 
If the node is part of a node pool # forming a multi-host TPU podslice, it only reflects the # number of TPU chips in this individual node, not the entire # multi-host TPU podslice. - node_tpu_chip_count = int(node.metadata.labels[GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY]) - chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")] + node_tpu_chip_count = int(node.metadata.labels[ + GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY]) + chip_dimensions = [ + int(chip_count) for chip_count in topology_value.split('x') + ] # topology_chip_count represents the total number of TPU chips # in the entire podslice, whether it is a single-host or # multi-host TPU podslice. - topology_chip_count = functools.reduce(lambda x, y: x * y, chip_dimensions) + topology_chip_count = functools.reduce(lambda x, y: x * y, + chip_dimensions) # TODO(Doyoung): Update the naming scheme with multi-host TPU # support. tpu_type = f'{acc_type}:{node_tpu_chip_count}' @@ -561,7 +574,8 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', # TODO(Doyoung): Remove the condition, # node_tpu_chip_count == topology_chip_count, when adding # multi-host TPU support. - if node_tpu_chip_count == topology_chip_count and topology_chip_count == acc_count: + if (node_tpu_chip_count == topology_chip_count and + topology_chip_count == acc_count): return True, None tpu_list_in_cluster_str = ','.join(tpu_list_in_cluster) # TODO(Doyoung): Update the error message raised with the multi-host @@ -581,8 +595,8 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', # if so, check if CPU and memory requirements on the specific node are # met. try: - gpu_label_key, gpu_label_val, _, _ = get_accelerator_label_key_value( - context, acc_type, acc_count) + gpu_label_key, gpu_label_val, _, _ = ( + get_accelerator_label_key_value(context, acc_type, acc_count)) except exceptions.ResourcesUnavailableError as e: # If GPU not found, return empty list and error message. return False, str(e) @@ -621,11 +635,11 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', return fits, reason -def get_accelerator_label_key_value(context: Optional[str], - acc_type: str, - acc_count: int, - check_mode=False - ) -> Tuple[str, str, str, str]: +def get_accelerator_label_key_value( + context: Optional[str], + acc_type: str, + acc_count: Optional[int], + check_mode=False) -> Tuple[str, str, str, str]: """Returns the label key and value for the given GPU/TPU type. Args: @@ -643,7 +657,8 @@ def get_accelerator_label_key_value(context: Optional[str], check_mode is True. 
Raises: ResourcesUnavailableError: Can be raised from the following conditions: - - The cluster does not have GPU/TPU resources (nvidia.com/gpu, google.com/tpu) + - The cluster does not have GPU/TPU resources + (nvidia.com/gpu, google.com/tpu) - The cluster does not have GPU/TPU labels setup correctly - The cluster doesn't have any nodes with acc_type GPU/TPU """ @@ -722,15 +737,25 @@ def get_accelerator_label_key_value(context: Optional[str], label_formatter.get_accelerator_from_label_value( value) == acc_type): if is_tpu_pod_slice(acc_type): - assert isinstance(label_formatter, GKELabelFormatter) - topology_label_key = label_formatter.TPU_TOPOLOGY_LABEL_KEY + assert isinstance(label_formatter, + GKELabelFormatter) + topology_label_key = ( + label_formatter.TPU_TOPOLOGY_LABEL_KEY) labels_dict = dict(node_labels[node_name]) - if labels_dict.get(label_formatter.TPU_LABEL_KEY) == acc_type: - topology_value = labels_dict.get(topology_label_key) - chip_dimensions = [int(chip_count) for chip_count in topology_value.split("x")] - num_chips = functools.reduce(lambda x, y: x * y, chip_dimensions) + if labels_dict.get( + label_formatter.TPU_LABEL_KEY) == acc_type: + topology_value = labels_dict.get( + topology_label_key) + assert topology_value is not None + chip_dimensions = [ + int(chip_count) + for chip_count in topology_value.split('x') + ] + num_chips = functools.reduce( + lambda x, y: x * y, chip_dimensions) if num_chips == acc_count: - return label, value, topology_label_key, topology_value + return (label, value, topology_label_key, + topology_value) else: continue else: @@ -744,10 +769,11 @@ def get_accelerator_label_key_value(context: Optional[str], for node_name, label_list in node_labels.items(): all_labels.extend(label_list) acc_available = set(v for k, v in all_labels - if label_formatter.match_label_key(k)) - suffix = f' Available GPU/TPUs on the cluster: {acc_available}' - # TODO(Doyoung): Update the error message raised with the multi-host - # TPU support. + if label_formatter.match_label_key(k)) + suffix = (' Available GPU/TPUs on the cluster: ' + f'{acc_available}') + # TODO(Doyoung): Update the error message raised with the + # multi-host TPU support. raise exceptions.ResourcesUnavailableError( 'Could not find any node in the Kubernetes cluster ' f'with {acc_type}. Please ensure at least one node in the ' @@ -763,9 +789,9 @@ def get_accelerator_label_key_value(context: Optional[str], suffix = (' Available resources on the cluster: ' f'{cluster_resources}') raise exceptions.ResourcesUnavailableError( - f'Could not detect GPU/TPU resources (`{GPU_RESOURCE_KEY}` or ' - f'`{TPU_RESOURCE_KEY}`) in Kubernetes cluster. If this cluster ' - 'contains GPUs, please ensure GPU drivers are installed on ' + f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or ' + f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster' + ' contains GPUs, please ensure GPU drivers are installed on ' 'the node. Check if the GPUs are setup correctly by running ' '`kubectl describe nodes` and looking for the ' f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. ' @@ -865,7 +891,10 @@ def check_credentials(context: Optional[str], # provider if their cluster GPUs are not setup correctly. 
gpu_msg = '' try: - _, _, _, _ = get_accelerator_label_key_value(context, acc_type='', acc_count=0, check_mode=True) + _, _, _, _ = get_accelerator_label_key_value(context, + acc_type='', + acc_count=0, + check_mode=True) except exceptions.ResourcesUnavailableError as e: # If GPUs are not available, we return cluster as enabled (since it can # be a CPU-only cluster) but we also return the exception message which diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index e3f3a8bcee0..9b6d190c7ee 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -289,8 +289,8 @@ available_node_types: {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %} {{k8s_acc_label_key}}: {{k8s_acc_label_value}} {% endif %} - {% if k8s_tpu_topology_label_key is not none and k8s_tpu_topology_label_value is not none %} - {{k8s_tpu_topology_label_key}}: {{k8s_tpu_topology_label_value}} + {% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %} + {{k8s_topology_label_key}}: {{k8s_topology_label_value}} {% endif %} {% if k8s_spot_label_key is not none %} {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}} diff --git a/tests/common.py b/tests/common.py index d50ae7facdf..ad1f92f6455 100644 --- a/tests/common.py +++ b/tests/common.py @@ -64,8 +64,9 @@ def _get_az_mappings(_): monkeypatch.setattr( 'sky.provision.kubernetes.utils.detect_gpu_label_formatter', lambda *_args, **_kwargs: [kubernetes_utils.SkyPilotLabelFormatter, {}]) - monkeypatch.setattr('sky.provision.kubernetes.utils.detect_accelerator_resource', - lambda *_args, **_kwargs: [True, []]) + monkeypatch.setattr( + 'sky.provision.kubernetes.utils.detect_accelerator_resource', + lambda *_args, **_kwargs: [True, []]) monkeypatch.setattr('sky.provision.kubernetes.utils.check_instance_fits', lambda *_args, **_kwargs: [True, '']) monkeypatch.setattr('sky.provision.kubernetes.utils.get_spot_label', From 05c37aaf9f48ddf1c43da02040faeb2582d3ca58 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 20 Oct 2024 01:13:18 +0000 Subject: [PATCH 46/63] nit --- sky/provision/kubernetes/utils.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index b309a617271..fe299b67310 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -113,7 +113,7 @@ class GPULabelFormatter: """ @classmethod - def get_label_key(cls, accelerator: str = '') -> str: + def get_label_key(cls, accelerator: Optional[str] = None) -> str: """Returns the label key for GPU type used by the Kubernetes cluster""" raise NotImplementedError @@ -182,8 +182,7 @@ class SkyPilotLabelFormatter(GPULabelFormatter): LABEL_KEY = 'skypilot.co/accelerator' @classmethod - def get_label_key(cls, accelerator: str = '') -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: Optional[str] = None) -> str: return cls.LABEL_KEY @classmethod @@ -223,8 +222,7 @@ class CoreWeaveLabelFormatter(GPULabelFormatter): LABEL_KEY = 'gpu.nvidia.com/class' @classmethod - def get_label_key(cls, accelerator: str = '') -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: Optional[str] = None) -> str: return cls.LABEL_KEY @classmethod @@ -257,8 +255,8 @@ class GKELabelFormatter(GPULabelFormatter): TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology' @classmethod - def get_label_key(cls, accelerator: str = '') -> str: - if 
accelerator.startswith('tpu-'): + def get_label_key(cls, accelerator: Optional[str] = None) -> str: + if isinstance(accelerator, str) and accelerator.startswith('tpu-'): return cls.TPU_LABEL_KEY return cls.GPU_LABEL_KEY @@ -315,8 +313,7 @@ class GFDLabelFormatter(GPULabelFormatter): LABEL_KEY = 'nvidia.com/gpu.product' @classmethod - def get_label_key(cls, accelerator: str = '') -> str: - del accelerator # Unused + def get_label_key(cls, accelerator: Optional[str] = None) -> str: return cls.LABEL_KEY @classmethod From 06d3879a4bfab5b90d6b4a7c8d147a39f99829b7 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 20 Oct 2024 01:58:24 +0000 Subject: [PATCH 47/63] Implement method for reading acc counts from node/pod object --- .../service_catalog/kubernetes_catalog.py | 25 ++++------- sky/provision/kubernetes/utils.py | 42 ++++++++++++------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 1b6d5b2b414..bb69fe5dab7 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -116,16 +116,10 @@ def list_accelerators_realtime( name_filter, accelerator_name, flags=regex_flags): continue - accelerator_count = 0 - if kubernetes_utils.GPU_RESOURCE_KEY in node.status.allocatable: - accelerator_count = int(node.status.allocatable[ - kubernetes_utils.GPU_RESOURCE_KEY]) - elif (kubernetes_utils.TPU_RESOURCE_KEY - in node.status.allocatable): - accelerator_count = int(node.status.allocatable[ - kubernetes_utils.TPU_RESOURCE_KEY]) - # Generate the GPU quantities for the accelerators + accelerator_count = ( + kubernetes_utils.get_node_accelerator_count( + node.status.allocatable)) if accelerator_name and accelerator_count > 0: for count in range(1, accelerator_count + 1): accelerators_qtys.add((accelerator_name, count)) @@ -134,16 +128,13 @@ def list_accelerators_realtime( # Get all the pods running on the node if (pod.spec.node_name == node.metadata.name and pod.status.phase in ['Running', 'Pending']): - # Iterate over all the containers in the pod and sum the - # GPU requests + # Iterate over all the containers in the pod and sum + # the GPU requests for container in pod.spec.containers: if container.resources.requests: - allocated_qty += int( - container.resources.requests.get( - kubernetes_utils.GPU_RESOURCE_KEY, 0)) - allocated_qty += int( - container.resources.requests.get( - kubernetes_utils.TPU_RESOURCE_KEY, 0)) + allocated_qty += ( + kubernetes_utils.get_node_accelerator_count( + container.resources.requests)) accelerators_available = accelerator_count - allocated_qty diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index fe299b67310..ee7dd8d0b89 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -2007,13 +2007,8 @@ def get_kubernetes_node_info( else: accelerator_name = None - accelerator_count = 0 - if GPU_RESOURCE_KEY in node.status.allocatable: - accelerator_count = int( - node.status.allocatable[GPU_RESOURCE_KEY]) - elif TPU_RESOURCE_KEY in node.status.allocatable: - accelerator_count = int( - node.status.allocatable[TPU_RESOURCE_KEY]) + accelerator_count = get_node_accelerator_count( + node.status.allocatable) for pod in pods: # Get all the pods running on the node @@ -2023,15 +2018,8 @@ def get_kubernetes_node_info( # GPU requests for container in pod.spec.containers: if container.resources.requests: - if GPU_RESOURCE_KEY in 
container.resources.requests:
-                            allocated_qty += int(
-                                container.resources.requests.get(
-                                    GPU_RESOURCE_KEY, 0))
-                        elif (TPU_RESOURCE_KEY
-                              in container.resources.requests):
-                            allocated_qty += int(
-                                container.resources.requests.get(
-                                    TPU_RESOURCE_KEY, 0))
+                        allocated_qty += get_node_accelerator_count(
+                            container.resources.requests)
 
         accelerators_available = accelerator_count - allocated_qty
 
@@ -2333,3 +2321,25 @@ def process_skypilot_pods(
     num_pods = len(cluster.pods)
     cluster.resources_str = f'{num_pods}x {cluster.resources}'
     return list(clusters.values()), jobs_controllers, serve_controllers
+
+
+def get_node_accelerator_count(attribute_dict: dict) -> int:
+    """Retrieves the count of accelerators from a node's resource dictionary.
+
+    This method checks the node's allocatable resources or the accelerators
+    already deployed on the node, using pod objects that describe resource
+    requests.
+
+    Args:
+        attribute_dict: Containing resource information from a node, such as
+            allocatable or requested resources.
+
+    Returns:
+        Number of accelerators allocated or available from the node. If no
+        resource is found, it returns 0.
+    """
+    if GPU_RESOURCE_KEY in attribute_dict:
+        return int(attribute_dict[GPU_RESOURCE_KEY])
+    elif TPU_RESOURCE_KEY in attribute_dict:
+        return int(attribute_dict[TPU_RESOURCE_KEY])
+    return 0

From 9a2046c52e2665fc9d9e3262e17bdfc9c0afff98 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sun, 20 Oct 2024 02:16:14 +0000
Subject: [PATCH 48/63] assertion update for is_tpu_vm

---
 sky/clouds/utils/gcp_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py
index 01ca1c89a71..f44788d2e6b 100644
--- a/sky/clouds/utils/gcp_utils.py
+++ b/sky/clouds/utils/gcp_utils.py
@@ -36,7 +36,7 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool:
 def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool:
     if not is_tpu(resources):
         return False
-    assert resources is not None
+    assert (resources is not None and len(resources.accelerators) == 1)
     acc, _ = list(resources.accelerators.items())[0]
     if kubernetes_utils.is_tpu_pod_slice(acc):
         return False

From 62b235f0553c5d388e6aee1fcfc2dc1a9d508f9c Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 21 Oct 2024 01:01:35 +0000
Subject: [PATCH 49/63] Exclude multi-host TPUs from being displayed in show-gpus

---
 .../service_catalog/kubernetes_catalog.py |  6 ++
 sky/provision/kubernetes/utils.py             | 97 +++++++++++--------
 2 files changed, 65 insertions(+), 38 deletions(-)

diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py
index bb69fe5dab7..46078ddc590 100644
--- a/sky/clouds/service_catalog/kubernetes_catalog.py
+++ b/sky/clouds/service_catalog/kubernetes_catalog.py
@@ -110,6 +110,12 @@ def list_accelerators_realtime(
                 accelerator_name = lf.get_accelerator_from_label_value(
                     node.metadata.labels.get(key))
 
+                # Exclude multi-host TPUs from being processed.
+                # TODO(Doyoung): Remove the logic when adding support for
+                # multi-host TPUs.
+ if kubernetes_utils.is_multi_host_tpu(node.metadata.labels): + continue + # Check if name_filter regex matches the accelerator_name regex_flags = 0 if case_sensitive else re.IGNORECASE if name_filter and not re.match( diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index ee7dd8d0b89..e9e7df135c9 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -543,36 +543,15 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType', for node in node_list: if acc_type == node.metadata.labels[ GKELabelFormatter.TPU_LABEL_KEY]: - topology_value = node.metadata.labels[ - GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY] - # node_tpu_chip_count represents the number of TPU chips - # available in this node. If the node is part of a node pool - # forming a multi-host TPU podslice, it only reflects the - # number of TPU chips in this individual node, not the entire - # multi-host TPU podslice. + # TODO(Doyoung): Update the logic when adding support for + # multi-host TPUs. + if is_multi_host_tpu(node.metadata.labels): + continue node_tpu_chip_count = int(node.metadata.labels[ GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY]) - chip_dimensions = [ - int(chip_count) for chip_count in topology_value.split('x') - ] - # topology_chip_count represents the total number of TPU chips - # in the entire podslice, whether it is a single-host or - # multi-host TPU podslice. - topology_chip_count = functools.reduce(lambda x, y: x * y, - chip_dimensions) - # TODO(Doyoung): Update the naming scheme with multi-host TPU - # support. tpu_type = f'{acc_type}:{node_tpu_chip_count}' tpu_list_in_cluster.append(tpu_type) - # For multi-host TPU podslices, topology_chip_count and - # node_tpu_chip_count will differ, as topology_chip_count - # reflects the total across all hosts, while - # node_tpu_chip_count reflects only the chips in a single node. - # TODO(Doyoung): Remove the condition, - # node_tpu_chip_count == topology_chip_count, when adding - # multi-host TPU support. - if (node_tpu_chip_count == topology_chip_count and - topology_chip_count == acc_count): + if node_tpu_chip_count == acc_count: return True, None tpu_list_in_cluster_str = ','.join(tpu_list_in_cluster) # TODO(Doyoung): Update the error message raised with the multi-host @@ -729,6 +708,11 @@ def get_accelerator_label_key_value( # quantity is available since that is dynamic and can change # during scheduling. for node_name, label_list in node_labels.items(): + node_metadata_labels = dict(label_list) + # TODO(Doyoung): Update the logic when adding support for + # multi-host TPUs. 
+ if is_multi_host_tpu(node_metadata_labels): + continue for label, value in label_list: if (label_formatter.match_label_key(label) and label_formatter.get_accelerator_from_label_value( @@ -736,21 +720,16 @@ def get_accelerator_label_key_value( if is_tpu_pod_slice(acc_type): assert isinstance(label_formatter, GKELabelFormatter) - topology_label_key = ( - label_formatter.TPU_TOPOLOGY_LABEL_KEY) - labels_dict = dict(node_labels[node_name]) - if labels_dict.get( + if node_metadata_labels.get( label_formatter.TPU_LABEL_KEY) == acc_type: - topology_value = labels_dict.get( + topology_label_key = ( + label_formatter.TPU_TOPOLOGY_LABEL_KEY) + topology_value = node_metadata_labels.get( topology_label_key) assert topology_value is not None - chip_dimensions = [ - int(chip_count) - for chip_count in topology_value.split('x') - ] - num_chips = functools.reduce( - lambda x, y: x * y, chip_dimensions) - if num_chips == acc_count: + tpu_topology_chip_count = reduce_tpu_topology( + topology_value) + if tpu_topology_chip_count == acc_count: return (label, value, topology_label_key, topology_value) else: @@ -2023,6 +2002,12 @@ def get_kubernetes_node_info( accelerators_available = accelerator_count - allocated_qty + # Exclude multi-host TPUs from being processed. + # TODO(Doyoung): Remove the logic when adding support for + # multi-host TPUs. + if is_multi_host_tpu(node.metadata.labels): + continue + node_info_dict[node.metadata.name] = KubernetesNodeInfo( name=node.metadata.name, accelerator_type=accelerator_name, @@ -2343,3 +2328,39 @@ def get_node_accelerator_count(attribute_dict: dict) -> int: elif TPU_RESOURCE_KEY in attribute_dict: return int(attribute_dict[TPU_RESOURCE_KEY]) return 0 + + +def reduce_tpu_topology(topology: str): + """Computes the number of TPU chips from its topology string.""" + chip_dimensions = [int(chip_count) for chip_count in topology.split('x')] + # tpu_topology_chip_count represents the total number of TPU chips in the + # entire podslice, whether it is a single-host or multi-host TPU podslice. + tpu_topology_chip_count = functools.reduce( + lambda x, y: x * y, chip_dimensions) + return tpu_topology_chip_count + + +def is_multi_host_tpu(node_metadata_labels: dict): + """Determines whether the given node is a multi-host TPU configuration.""" + if GKELabelFormatter.TPU_LABEL_KEY in node_metadata_labels: + assert GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY in node_metadata_labels + topology_value = ( + node_metadata_labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY]) + accelerator_count_label_key = ( + GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY) + assert accelerator_count_label_key in node_metadata_labels + # node_tpu_chip_count represents the number of TPU chips + # available in this node. If the node is part of a node pool + # forming a multi-host TPU podslice, it only reflects the + # number of TPU chips in this individual node, not the entire + # multi-host TPU podslice. + node_tpu_chip_count = int( + node_metadata_labels[accelerator_count_label_key]) + topology_chip_count = reduce_tpu_topology(topology_value) + # For multi-host TPU podslices, topology_chip_count and + # node_tpu_chip_count will differ, as topology_chip_count + # reflects the total across all hosts, while + # node_tpu_chip_count reflects only the chips in a single node. 
+        if node_tpu_chip_count != topology_chip_count:
+            return True
+    return False

From 4db1e637f6efccdf870442a17b0963431ab759e1 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 21 Oct 2024 01:01:56 +0000
Subject: [PATCH 50/63] Notify users that multi-host TPUs are not supported from 'sky show-gpus'

---
 sky/cli.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sky/cli.py b/sky/cli.py
index 05bcd732070..124eb893241 100644
--- a/sky/cli.py
+++ b/sky/cli.py
@@ -3180,8 +3180,11 @@ def _output():
                 yield from k8s_realtime_table.get_string()
                 k8s_node_table = _get_kubernetes_node_info_table(context)
                 yield '\n\n'
+                # TODO(Doyoung): Update the message with the multi-host TPU
+                # support.
                 yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                       f'Kubernetes per node GPU availability'
+                       f'Kubernetes per node accelerator availability '
+                       '(Note: Multi-host TPUs are not supported.)'
                        f'{colorama.Style.RESET_ALL}\n')
                 yield from k8s_node_table.get_string()
             if kubernetes_autoscaling:

From 5923f104c34a93e6262698aa96f6c51941f40ad5 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 21 Oct 2024 01:04:29 +0000
Subject: [PATCH 51/63] format

---
 sky/provision/kubernetes/utils.py | 116 +++++++++++++++---------------
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index e9e7df135c9..a321bb39857 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -2198,6 +2198,64 @@ def is_tpu_pod_slice(accelerator: str) -> bool:
     return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
 
 
+def get_node_accelerator_count(attribute_dict: dict) -> int:
+    """Retrieves the count of accelerators from a node's resource dictionary.
+
+    This method checks the node's allocatable resources or the accelerators
+    already deployed on the node, using pod objects that describe resource
+    requests.
+
+    Args:
+        attribute_dict: Containing resource information from a node, such as
+            allocatable or requested resources.
+
+    Returns:
+        Number of accelerators allocated or available from the node. If no
+        resource is found, it returns 0.
+    """
+    if GPU_RESOURCE_KEY in attribute_dict:
+        return int(attribute_dict[GPU_RESOURCE_KEY])
+    elif TPU_RESOURCE_KEY in attribute_dict:
+        return int(attribute_dict[TPU_RESOURCE_KEY])
+    return 0
+
+
+def reduce_tpu_topology(topology: str):
+    """Computes the number of TPU chips from its topology string."""
+    chip_dimensions = [int(chip_count) for chip_count in topology.split('x')]
+    # tpu_topology_chip_count represents the total number of TPU chips in the
+    # entire podslice, whether it is a single-host or multi-host TPU podslice.
+    tpu_topology_chip_count = functools.reduce(lambda x, y: x * y,
+                                               chip_dimensions)
+    return tpu_topology_chip_count
+
+
+def is_multi_host_tpu(node_metadata_labels: dict):
+    """Determines whether the given node is a multi-host TPU configuration."""
+    if GKELabelFormatter.TPU_LABEL_KEY in node_metadata_labels:
+        assert GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY in node_metadata_labels
+        topology_value = (
+            node_metadata_labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY])
+        accelerator_count_label_key = (
+            GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY)
+        assert accelerator_count_label_key in node_metadata_labels
+        # node_tpu_chip_count represents the number of TPU chips
+        # available in this node. If the node is part of a node pool
+        # forming a multi-host TPU podslice, it only reflects the
+        # number of TPU chips in this individual node, not the entire
+        # multi-host TPU podslice.
+        node_tpu_chip_count = int(
+            node_metadata_labels[accelerator_count_label_key])
+        topology_chip_count = reduce_tpu_topology(topology_value)
+        # For multi-host TPU podslices, topology_chip_count and
+        # node_tpu_chip_count will differ, as topology_chip_count
+        # reflects the total across all hosts, while
+        # node_tpu_chip_count reflects only the chips in a single node.
+        if node_tpu_chip_count != topology_chip_count:
+            return True
+        return False
+
+
 @dataclasses.dataclass
 class KubernetesSkyPilotClusterInfo:
     cluster_name_on_cloud: str
@@ -2306,61 +2364,3 @@ def process_skypilot_pods(
     num_pods = len(cluster.pods)
     cluster.resources_str = f'{num_pods}x {cluster.resources}'
     return list(clusters.values()), jobs_controllers, serve_controllers
-
-
-def get_node_accelerator_count(attribute_dict: dict) -> int:
-    """Retrieves the count of accelerators from a node's resource dictionary.
-
-    This method checks the node's allocatable resources or the accelerators
-    already deployed on the node, using pod objects that describe resource
-    requests.
-
-    Args:
-        attribute_dict): Containing resource information from a node, such as
-            allocatable or requested resources.
-
-    Returns:
-        Number of accelerators allocated or available from the node. If no
-        resource is found, it returns 0.
-    """
-    if GPU_RESOURCE_KEY in attribute_dict:
-        return int(attribute_dict[GPU_RESOURCE_KEY])
-    elif TPU_RESOURCE_KEY in attribute_dict:
-        return int(attribute_dict[TPU_RESOURCE_KEY])
-    return 0
-
-
-def reduce_tpu_topology(topology: str):
-    """Computes the number of TPU chips from its topology string."""
-    chip_dimensions = [int(chip_count) for chip_count in topology.split('x')]
-    # tpu_topology_chip_count represents the total number of TPU chips in the
-    # entire podslice, whether it is a single-host or multi-host TPU podslice.
-    tpu_topology_chip_count = functools.reduce(
-        lambda x, y: x * y, chip_dimensions)
-    return tpu_topology_chip_count
-
-
-def is_multi_host_tpu(node_metadata_labels: dict):
-    """Determines whether the given node is a multi-host TPU configuration."""
-    if GKELabelFormatter.TPU_LABEL_KEY in node_metadata_labels:
-        assert GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY in node_metadata_labels
-        topology_value = (
-            node_metadata_labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY])
-        accelerator_count_label_key = (
-            GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY)
-        assert accelerator_count_label_key in node_metadata_labels
-        # node_tpu_chip_count represents the number of TPU chips
-        # available in this node. If the node is part of a node pool
-        # forming a multi-host TPU podslice, it only reflects the
-        # number of TPU chips in this individual node, not the entire
-        # multi-host TPU podslice.
-        node_tpu_chip_count = int(
-            node_metadata_labels[accelerator_count_label_key])
-        topology_chip_count = reduce_tpu_topology(topology_value)
-        # For multi-host TPU podslices, topology_chip_count and
-        # node_tpu_chip_count will differ, as topology_chip_count
-        # reflects the total across all hosts, while
-        # node_tpu_chip_count reflects only the chips in a single node.
-        if node_tpu_chip_count != topology_chip_count:
-            return True
-    return False

From fa2e6708bb5f4882becead44a7de6c0a22a6e15a Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 21 Oct 2024 01:05:25 +0000
Subject: [PATCH 52/63] nit

---
 sky/provision/kubernetes/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index a321bb39857..720ee5295d3 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -2195,6 +2195,7 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
 
 
 def is_tpu_pod_slice(accelerator: str) -> bool:
+    """Determines if the given accelerator is a TPU supported on GKE."""
     return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
 

From c1ee117ebef292fca0e775415955e41437497eb5 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 21 Oct 2024 01:41:52 +0000
Subject: [PATCH 53/63] display warning message from show-gpus conditionally

---
 sky/cli.py                        |  9 +++++++--
 sky/clouds/utils/gcp_utils.py     |  2 +-
 sky/provision/kubernetes/utils.py | 18 ++++++++++++++----
 sky/resources.py                  |  2 +-
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/sky/cli.py b/sky/cli.py
index 124eb893241..5db241562b8 100644
--- a/sky/cli.py
+++ b/sky/cli.py
@@ -3182,9 +3182,14 @@ def _output():
                 yield '\n\n'
                 # TODO(Doyoung): Update the message with the multi-host TPU
                 # support.
+                k8s_per_node_acc_message = (
+                    'Kubernetes per node accelerator availability ')
+                if kubernetes_utils.multi_host_tpu_exists_in_cluster(
+                        context):
+                    k8s_per_node_acc_message += (
+                        '(Note: Multi-host TPUs are not supported.)')
                 yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                       f'Kubernetes per node accelerator availability '
-                       '(Note: Multi-host TPUs are not supported.)'
+                       f'{k8s_per_node_acc_message}'
                        f'{colorama.Style.RESET_ALL}\n')
                 yield from k8s_node_table.get_string()
             if kubernetes_autoscaling:
diff --git a/sky/clouds/utils/gcp_utils.py b/sky/clouds/utils/gcp_utils.py
index f44788d2e6b..0fa7db5f344 100644
--- a/sky/clouds/utils/gcp_utils.py
+++ b/sky/clouds/utils/gcp_utils.py
@@ -38,7 +38,7 @@ def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool:
         return False
     assert (resources is not None and len(resources.accelerators) == 1)
     acc, _ = list(resources.accelerators.items())[0]
-    if kubernetes_utils.is_tpu_pod_slice(acc):
+    if kubernetes_utils.is_tpu_on_gke(acc):
         return False
     if resources.accelerator_args is None:
         return True
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 720ee5295d3..8c177dc5e60 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -288,7 +288,7 @@ def get_accelerator_from_label_value(cls, value: str) -> str:
                 # to distinguish between a3-high and a3-mega instances
                 return 'H100'
             return acc
-        elif is_tpu_pod_slice(value):
+        elif is_tpu_on_gke(value):
             return value
         else:
             raise ValueError(
@@ -582,7 +582,7 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
             node.metadata.labels[gpu_label_key] == gpu_label_val
         ]
         assert len(gpu_nodes) > 0, 'GPU nodes not found'
-        if is_tpu_pod_slice(acc_type):
+        if is_tpu_on_gke(acc_type):
             # If requested accelerator is a TPU type, check if the cluster
             # has sufficient TPU resource to meet the requirement.
            fits, reason = check_tpu_fits(k8s_instance_type, gpu_nodes)
@@ -717,7 +717,7 @@ def get_accelerator_label_key_value(
                 if (label_formatter.match_label_key(label) and
                         label_formatter.get_accelerator_from_label_value(
                             value) == acc_type):
-                    if is_tpu_pod_slice(acc_type):
+                    if is_tpu_on_gke(acc_type):
                         assert isinstance(label_formatter,
                                           GKELabelFormatter)
                         if node_metadata_labels.get(
@@ -2194,7 +2194,7 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
     return pods
 
 
-def is_tpu_pod_slice(accelerator: str) -> bool:
+def is_tpu_on_gke(accelerator: str) -> bool:
     """Determines if the given accelerator is a TPU supported on GKE."""
     return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
 
@@ -2257,6 +2257,16 @@ def is_multi_host_tpu(node_metadata_labels: dict):
     return False
 
 
+def multi_host_tpu_exists_in_cluster(context: Optional[str] = None):
+    """Checks if there exists a multi-host TPU within the cluster."""
+    multi_host_tpu_in_cluster = False
+    nodes = get_kubernetes_nodes(context)
+    for node in nodes:
+        if is_multi_host_tpu(node.metadata.labels):
+            multi_host_tpu_in_cluster = True
+    return multi_host_tpu_in_cluster
+
+
 @dataclasses.dataclass
 class KubernetesSkyPilotClusterInfo:
     cluster_name_on_cloud: str
diff --git a/sky/resources.py b/sky/resources.py
index 764858afc10..af303abaeeb 100644
--- a/sky/resources.py
+++ b/sky/resources.py
@@ -566,7 +566,7 @@ def _set_accelerators(
             acc, _ = list(accelerators.items())[0]
             if 'tpu' in acc.lower():
                 if self.cloud is None:
-                    if kubernetes_utils.is_tpu_pod_slice(acc):
+                    if kubernetes_utils.is_tpu_on_gke(acc):
                         self._cloud = clouds.Kubernetes()
                     else:
                         self._cloud = clouds.GCP()

From cbce4d5115b3afbed37b2a911ae51d28c2774db3 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Wed, 23 Oct 2024 02:27:28 +0000
Subject: [PATCH 54/63] update sky show-gpus

---
 sky/clouds/service_catalog/kubernetes_catalog.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py
index 46078ddc590..2bed325dfb6 100644
--- a/sky/clouds/service_catalog/kubernetes_catalog.py
+++ b/sky/clouds/service_catalog/kubernetes_catalog.py
@@ -127,8 +127,12 @@ def list_accelerators_realtime(
                 kubernetes_utils.get_node_accelerator_count(
                     node.status.allocatable))
             if accelerator_name and accelerator_count > 0:
-                for count in range(1, accelerator_count + 1):
-                    accelerators_qtys.add((accelerator_name, count))
+                if kubernetes_utils.is_tpu_on_gke(accelerator_name):
+                    accelerators_qtys.add(
+                        (accelerator_name, accelerator_count))
+                else:
+                    for count in range(1, accelerator_count + 1):
+                        accelerators_qtys.add((accelerator_name, count))
 
         for pod in pods:
             # Get all the pods running on the node

From 241efc046f6a0d400462cfe3d47daa37f032de2b Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Fri, 25 Oct 2024 04:47:00 +0000
Subject: [PATCH 55/63] update get_accelerator_label_key_value

---
 sky/provision/kubernetes/utils.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 8c177dc5e60..44ca8919315 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -615,7 +615,8 @@ def get_accelerator_label_key_value(
     context: Optional[str],
     acc_type: str,
    acc_count: Optional[int],
-    check_mode=False) -> Tuple[str, str, str, str]:
+    check_mode=False
+    ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
     """Returns the label key and value for the given GPU/TPU type.
 
     Args:
@@ -629,8 +630,8 @@ def get_accelerator_label_key_value(
     Returns:
         A tuple of the accelerator label key, value, topology label key, and
         topology value. The topology label key and value are populated only if
-        the requested accelerator type is TPU. Returns empty strings if
-        check_mode is True.
+        the requested accelerator type is TPU. Returns None if check_mode is
+        True.
     Raises:
         ResourcesUnavailableError: Can be raised from the following conditions:
             - The cluster does not have GPU/TPU resources
@@ -655,12 +656,12 @@ def get_accelerator_label_key_value(
             # If check mode is enabled and autoscaler is set, we can return
             # early since we assume the cluster autoscaler will handle GPU
             # node provisioning.
-            return '', '', '', ''
+            return None, None, None, None
         formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type)
         assert formatter is not None, ('Unsupported autoscaler type:'
                                        f' {autoscaler_type}')
         return formatter.get_label_key(acc_type), formatter.get_label_value(
-            acc_type), '', ''
+            acc_type), None, None
 
     has_gpus, cluster_resources = detect_accelerator_resource(context)
     if has_gpus:
@@ -700,7 +701,7 @@ def get_accelerator_label_key_value(
             if check_mode:
                 # If check mode is enabled and we reached so far, we can
                 # conclude that the cluster is setup correctly and return.
-                return '', '', '', ''
+                return None, None, None, None
             # Search in node_labels to see if any node has the requested
             # GPU type.
             # Note - this only checks if the label is available on a
@@ -735,7 +736,7 @@ def get_accelerator_label_key_value(
                         else:
                             continue
                     else:
-                        return label, value, '', ''
+                        return label, value, None, None
 
             # If no node is found with the requested acc_type, raise error
             with ux_utils.print_exception_no_traceback():

From 2fbb4eb7298ac922c302acda193c51bfc9a55a25 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Fri, 25 Oct 2024 07:55:55 +0000
Subject: [PATCH 56/63] format

---
 sky/provision/kubernetes/utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 44ca8919315..c38a8907463 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -612,11 +612,11 @@ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
 
 
 def get_accelerator_label_key_value(
-    context: Optional[str],
-    acc_type: str,
-    acc_count: Optional[int],
-    check_mode=False
-    ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
+        context: Optional[str],
+        acc_type: str,
+        acc_count: Optional[int],
+        check_mode=False
+) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
     """Returns the label key and value for the given GPU/TPU type.
 
     Args:

From 9e8d53d00a0418cf8d3fa293c4316458c28bedc3 Mon Sep 17 00:00:00 2001
From: landscapepainter
Date: Sat, 26 Oct 2024 20:28:32 +0000
Subject: [PATCH 57/63] format

---
 sky/provision/kubernetes/instance.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index f633c577cea..a18f5a92933 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -530,9 +530,9 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
                          'are currently not unsupported.')
         raise config_lib.KubernetesError(
             _lack_resource_msg('TPU',
-                pod_spec,
-                details=error_message,
-                extra_msg=extra_message))
+                               pod_spec,
+                               details=error_message,
+                               extra_msg=extra_message))
 
     else:
         # Re-raise the exception if it's a different error

From 0a0eac28d9fd1222c5c1ea4fdc726889ceb5e361 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Fri, 1 Nov 2024 02:11:48 +0000
Subject: [PATCH 58/63] format

---
 sky/provision/kubernetes/instance.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index b033f454655..d6cbeed0c4e 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -525,13 +525,14 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
     # TPU support.
     elif 'Invalid resource requests for google.com/tpu.' in error_message:
         extra_message = ('Verify if the cluster has a TPU slice node with '
-                        'a topology matching the number of TPU(s) '
-                        'requested. Note that multi-host TPU podslices '
-                        'are currently not unsupported.')
-        raise config_lib.KubernetesError(_lack_resource_msg('TPU',
-                                                            pod_spec,
-                                                            details=error_msg,
-                                                            extra_msg=extra_message))
+                         'a topology matching the number of TPU(s) '
+                         'requested. Note that multi-host TPU podslices '
+                         'are currently not supported.')
+        raise config_lib.KubernetesError(
+            _lack_resource_msg('TPU',
+                               pod_spec,
+                               details=error_message,
+                               extra_msg=extra_message))
     else:
         # Re-raise the exception if it's a different error
         raise e

From 9dbaa72a10efae88db99779301d65a9a5a5eae7f Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Fri, 1 Nov 2024 02:28:29 +0000
Subject: [PATCH 59/63] update comment

---
 sky/clouds/service_catalog/kubernetes_catalog.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py
index 4e27fb1b674..711d126963b 100644
--- a/sky/clouds/service_catalog/kubernetes_catalog.py
+++ b/sky/clouds/service_catalog/kubernetes_catalog.py
@@ -128,6 +128,9 @@ def list_accelerators_realtime(
                     node.status.allocatable))
             if accelerator_name and accelerator_count > 0:
+                # TPUs are counted in a different way compared to GPUs.
+                # GPUs on a multi-GPU node can be requested in smaller
+                # counts, but TPUs are considered an atomic unit.
                if kubernetes_utils.is_tpu_on_gke(accelerator_name):
                    accelerators_qtys.add(
                        (accelerator_name, accelerator_count))
                else:
                    for count in range(1, accelerator_count + 1):
                        accelerators_qtys.add((accelerator_name, count))

From f5e1d373f7cf2cc91ccdb4c8526c21e3eee114ef Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Fri, 1 Nov 2024 04:02:37 +0000
Subject: [PATCH 60/63] resolve review comments

---
 sky/clouds/kubernetes.py             |  5 +++++
 sky/provision/kubernetes/instance.py |  8 ++++++--
 sky/provision/kubernetes/utils.py    | 29 ++++++++++++++--------------
 sky/resources.py                     |  5 +++--
 sky/templates/kubernetes-ray.yml.j2  | 12 ++++--------
 5 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py
index 6322b8ddcd3..6633a5b8c0d 100644
--- a/sky/clouds/kubernetes.py
+++ b/sky/clouds/kubernetes.py
@@ -364,6 +364,7 @@ def make_deploy_resources_variables(
         k8s_acc_label_value = None
         k8s_topology_label_key = None
         k8s_topology_label_value = None
+        k8s_resource_key = None
         tpu_requested = False
 
         # If GPU/TPUs are requested, set node label to match the GPU/TPU type.
@@ -375,6 +376,9 @@ def make_deploy_resources_variables(
                 if (k8s_acc_label_key ==
                         kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY):
                     tpu_requested = True
+                    k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY
+                else:
+                    k8s_resource_key = kubernetes_utils.GPU_RESOURCE_KEY
 
         port_mode = network_utils.get_port_mode(None)
 
@@ -439,6 +443,7 @@ def make_deploy_resources_variables(
             'tpu_requested': tpu_requested,
             'k8s_topology_label_key': k8s_topology_label_key,
             'k8s_topology_label_value': k8s_topology_label_value,
+            'k8s_resource_key': k8s_resource_key,
             'image_id': image_id,
         }
 
diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index d6c4169f553..936a4f389f2 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -613,8 +613,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 'override runtimeClassName in ~/.sky/config.yaml. '
                 'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html')  # pylint: disable=line-too-long
 
-    needs_gpus = (pod_spec['spec']['containers'][0].get('resources', {}).get(
-        'limits', {}).get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0)
+    needs_gpus = False
+    limits = pod_spec['spec']['containers'][0].get('resources',
+                                                   {}).get('limits')
+    if limits is not None:
+        needs_gpus = limits.get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0
+
     if nvidia_runtime_exists and needs_gpus:
         pod_spec['spec']['runtimeClassName'] = 'nvidia'
 
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index c38a8907463..09675aad395 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -86,7 +86,9 @@
 # https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run
 GKE_TPU_ACCELERATOR_TO_GENERATION = {
     'tpu-v4-podslice': 'v4',
+    # Only single-host v5e TPU configurations are allowed.
     'tpu-v5-lite-device': 'v5e',
+    # Multi-host compatible v5e TPU configurations are allowed.
     'tpu-v5-lite-podslice': 'v5e',
     'tpu-v5p-slice': 'v5p',
 }
 
@@ -256,7 +258,7 @@ class GKELabelFormatter(GPULabelFormatter):
 
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
-        if isinstance(accelerator, str) and accelerator.startswith('tpu-'):
+        if accelerator is not None and accelerator.startswith('tpu-'):
             return cls.TPU_LABEL_KEY
         return cls.GPU_LABEL_KEY
 
@@ -436,10 +438,10 @@ def detect_accelerator_resource(
     nodes = get_kubernetes_nodes(context)
     for node in nodes:
         cluster_resources.update(node.status.allocatable.keys())
-    has_gpu = (GPU_RESOURCE_KEY in cluster_resources or
-               TPU_RESOURCE_KEY in cluster_resources)
+    has_accelerator = (GPU_RESOURCE_KEY in cluster_resources or
+                       TPU_RESOURCE_KEY in cluster_resources)
 
-    return has_gpu, cluster_resources
+    return has_accelerator, cluster_resources
 
 
 @functools.lru_cache(maxsize=10)
@@ -868,10 +870,10 @@ def check_credentials(context: Optional[str],
     # provider if their cluster GPUs are not setup correctly.
     gpu_msg = ''
     try:
-        _, _, _, _ = get_accelerator_label_key_value(context,
-                                                     acc_type='',
-                                                     acc_count=0,
-                                                     check_mode=True)
+        get_accelerator_label_key_value(context,
+                                        acc_type='',
+                                        acc_count=0,
+                                        check_mode=True)
     except exceptions.ResourcesUnavailableError as e:
         # If GPUs are not available, we return cluster as enabled (since it can
         # be a CPU-only cluster) but we also return the exception message which
@@ -2222,7 +2224,7 @@ def get_node_accelerator_count(attribute_dict: dict) -> int:
     return 0
 
 
-def reduce_tpu_topology(topology: str):
+def reduce_tpu_topology(topology: str) -> int:
     """Computes the number of TPU chips from its topology string."""
     chip_dimensions = [int(chip_count) for chip_count in topology.split('x')]
     # tpu_topology_chip_count represents the total number of TPU chips in the
@@ -2232,7 +2234,7 @@ def reduce_tpu_topology(topology: str):
     return tpu_topology_chip_count
 
 
-def is_multi_host_tpu(node_metadata_labels: dict):
+def is_multi_host_tpu(node_metadata_labels: dict) -> bool:
     """Determines whether the given node is a multi-host TPU configuration."""
     if GKELabelFormatter.TPU_LABEL_KEY in node_metadata_labels:
         assert GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY in node_metadata_labels
@@ -2258,14 +2260,13 @@ def is_multi_host_tpu(node_metadata_labels: dict):
     return False
 
 
-def multi_host_tpu_exists_in_cluster(context: Optional[str] = None):
+def multi_host_tpu_exists_in_cluster(context: Optional[str] = None) -> bool:
     """Checks if there exists a multi-host TPU within the cluster."""
-    multi_host_tpu_in_cluster = False
     nodes = get_kubernetes_nodes(context)
     for node in nodes:
         if is_multi_host_tpu(node.metadata.labels):
-            multi_host_tpu_in_cluster = True
-    return multi_host_tpu_in_cluster
+            return True
+    return False
 
 
 @dataclasses.dataclass
diff --git a/sky/resources.py b/sky/resources.py
index 8c9a95b04fb..0528113f2fb 100644
--- a/sky/resources.py
+++ b/sky/resources.py
@@ -588,8 +588,9 @@ def _set_accelerators(
                     else:
                         self._cloud = clouds.GCP()
                 assert (self.cloud.is_same_cloud(clouds.GCP()) or
-                        self.cloud.is_same_cloud(clouds.Kubernetes())
-                       ), 'Cloud must be GCP or Kubernetes.'
+                        self.cloud.is_same_cloud(clouds.Kubernetes())), (
+                            'Cloud must be GCP or Kubernetes for TPU '
+                            'accelerators.')
 
             if accelerator_args is None:
                 accelerator_args = {}
 
             use_tpu_vm = accelerator_args.get('tpu_vm', True)
diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2
index 9b6d190c7ee..79a27527f9a 100644
--- a/sky/templates/kubernetes-ray.yml.j2
+++ b/sky/templates/kubernetes-ray.yml.j2
@@ -401,15 +401,13 @@ available_node_types:
               requests:
                 cpu: {{cpus}}
                 memory: {{memory}}G
-              {% if tpu_requested %}
+              {% if k8s_resource_key is not none %}
              # The number of requested google.com/tpu chips must equal the
              # total number of TPU chips available on the TPU slice node,
              # whether it is a node from a multi-host or a single-host TPU
              # slice. Example reference:
              # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
-              google.com/tpu: {{accelerator_count}}
-              {% else %}
-              nvidia.com/gpu: {{accelerator_count}}
+              {{k8s_resource_key}}: {{accelerator_count}}
               {% endif %}
               {% if k8s_fuse_device_required %}
               # Kubernetes resource exposed by the fuse device manager
@@ -418,10 +416,8 @@ available_node_types:
               {% endif %}
               limits:
                 # Limits need to be defined for GPU/TPU requests
-              {% if tpu_requested %}
-              google.com/tpu: {{accelerator_count}}
-              {% else %}
-              nvidia.com/gpu: {{accelerator_count}}
+              {% if k8s_resource_key is not none %}
+              {{k8s_resource_key}}: {{accelerator_count}}
               {% endif %}
               {% if k8s_fuse_device_required %}
               smarter-devices/fuse: "1"

From 688c0b4588ad6133d60269493f87008f3b466934 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sat, 2 Nov 2024 22:37:21 +0000
Subject: [PATCH 61/63] update tpuvm_mnist.yaml

---
 examples/tpu/tpuvm_mnist.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/tpu/tpuvm_mnist.yaml b/examples/tpu/tpuvm_mnist.yaml
index d1fd434fad6..41b14283fac 100644
--- a/examples/tpu/tpuvm_mnist.yaml
+++ b/examples/tpu/tpuvm_mnist.yaml
@@ -5,7 +5,7 @@ resources:
 
 # The setup command. Will be run under the working directory.
 setup: |
-  git clone https://github.com/google/flax.git --branch v0.8.2
+  git clone https://github.com/google/flax.git --branch v0.10.1
 
   conda activate flax
   if [ $? -eq 0 ]; then
@@ -15,7 +15,7 @@ setup: |
     conda activate flax
     # Make sure to install TPU related packages in a conda env to avoid package conflicts.
     pip install \
-      -f https://storage.googleapis.com/jax-releases/libtpu_releases.html "jax[tpu]==0.4.25" \
+      -f https://storage.googleapis.com/jax-releases/libtpu_releases.html "jax[tpu]==0.4.35" \
       clu \
       tensorflow tensorflow-datasets
     pip install -e flax

From 2dec7f9a2c9187f43088764e7331475b53fb7c41 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Sun, 3 Nov 2024 01:00:56 +0000
Subject: [PATCH 62/63] resolve comments

---
 sky/provision/kubernetes/instance.py | 2 ++
 sky/provision/kubernetes/utils.py    | 5 +++++
 sky/resources.py                     | 3 ++-
 sky/templates/kubernetes-ray.yml.j2  | 3 +++
 4 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index 936a4f389f2..66d22099a85 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -619,6 +619,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     if limits is not None:
         needs_gpus = limits.get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0
 
+    # TPU pods provisioned on GKE use the default containerd runtime.
+    # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview  # pylint: disable=line-too-long
     if nvidia_runtime_exists and needs_gpus:
         pod_spec['spec']['runtimeClassName'] = 'nvidia'
 
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 09675aad395..5982e5acefa 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -732,6 +732,9 @@ def get_accelerator_label_key_value(
                             assert topology_value is not None
                             tpu_topology_chip_count = reduce_tpu_topology(
                                 topology_value)
+                            # For single-host TPUs, no two different
+                            # topologies map to the same number of TPU
+                            # chips.
                             if tpu_topology_chip_count == acc_count:
                                 return (label, value, topology_label_key,
                                         topology_value)
@@ -2217,6 +2220,8 @@ def get_node_accelerator_count(attribute_dict: dict) -> int:
         Number of accelerators allocated or available from the node. If no
         resource is found, it returns 0.
     """
+    assert not (GPU_RESOURCE_KEY in attribute_dict and
+                TPU_RESOURCE_KEY in attribute_dict)
     if GPU_RESOURCE_KEY in attribute_dict:
         return int(attribute_dict[GPU_RESOURCE_KEY])
     elif TPU_RESOURCE_KEY in attribute_dict:
diff --git a/sky/resources.py b/sky/resources.py
index 0528113f2fb..deb05a6eade 100644
--- a/sky/resources.py
+++ b/sky/resources.py
@@ -596,7 +596,8 @@ def _set_accelerators(
                 accelerator_args = {}
 
             use_tpu_vm = accelerator_args.get('tpu_vm', True)
-            if self.cloud.is_same_cloud(clouds.GCP()):
+            if (self.cloud.is_same_cloud(clouds.GCP()) and
+                    not kubernetes_utils.is_tpu_on_gke(acc)):
                 if 'runtime_version' not in accelerator_args:
 
                     def _get_default_runtime_version() -> str:
diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2
index 79a27527f9a..0b52238bf6c 100644
--- a/sky/templates/kubernetes-ray.yml.j2
+++ b/sky/templates/kubernetes-ray.yml.j2
@@ -450,6 +450,9 @@ setup_commands:
   # 755 permissions, and the user of the provisioned pod is not necessarily
   # a root. Hence, we need to update the write permission so the logs can be
   # properly written.
+  # TODO(Doyoung): Investigate why the TPU workload fails to run without
+  # execution permission, such as granting 766 to the log file. Check if it's
+  # a must and see if there's a workaround to grant minimum permission.
   - sudo chmod 777 /tmp/tpu_logs;
 {% endif %}

From dc23e886d9b44d8d73feb80988afef9a460141b9 Mon Sep 17 00:00:00 2001
From: Doyoung Kim
Date: Mon, 4 Nov 2024 01:34:51 +0000
Subject: [PATCH 63/63] update display message for show-gpus

---
 sky/cli.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sky/cli.py b/sky/cli.py
index 47975dd1ef9..a86aa5e5b11 100644
--- a/sky/cli.py
+++ b/sky/cli.py
@@ -3206,7 +3206,9 @@ def _output():
                 if kubernetes_utils.multi_host_tpu_exists_in_cluster(
                         context):
                     k8s_per_node_acc_message += (
-                        '(Note: Multi-host TPUs are not supported.)')
+                        '(Note: Multi-host TPUs are detected and excluded '
+                        'from the display as multi-host TPUs are not '
+                        'supported.)')
                 yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                        f'{k8s_per_node_acc_message}'
                        f'{colorama.Style.RESET_ALL}\n')
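For readers following the series, the multi-host detection logic that these patches converge on can be summarized outside the diffs. The sketch below is a minimal, self-contained rendition of reduce_tpu_topology and is_multi_host_tpu as they stand after PATCH 60. The two cloud.google.com/gke-tpu-* label keys appear verbatim in the diffs above; the accelerator-count key is an assumption (GKE's cloud.google.com/gke-accelerator-count node label), since the series refers to it only as GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY.

    import functools
    from typing import Dict

    TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator'
    TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology'
    # Assumed label value; the patches above reference this key only as
    # GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY.
    ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count'


    def reduce_tpu_topology(topology: str) -> int:
        """Computes the total TPU chip count from a topology string, e.g. '2x2x4'."""
        return functools.reduce(lambda x, y: x * y,
                                (int(dim) for dim in topology.split('x')))


    def is_multi_host_tpu(node_metadata_labels: Dict[str, str]) -> bool:
        """A node belongs to a multi-host podslice when the chips it exposes
        are fewer than the chips implied by the slice's full topology."""
        if TPU_LABEL_KEY not in node_metadata_labels:
            return False
        node_chips = int(node_metadata_labels[ACCELERATOR_COUNT_LABEL_KEY])
        slice_chips = reduce_tpu_topology(
            node_metadata_labels[TPU_TOPOLOGY_LABEL_KEY])
        return node_chips != slice_chips


    # A v4 node exposing 4 chips of a 2x2x4 (16-chip) podslice is multi-host
    # and is skipped by the checks in the patches above; a single-host v5e
    # node exposing all 2x4 = 8 chips is not.
    assert is_multi_host_tpu({
        TPU_LABEL_KEY: 'tpu-v4-podslice',
        TPU_TOPOLOGY_LABEL_KEY: '2x2x4',
        ACCELERATOR_COUNT_LABEL_KEY: '4',
    })
    assert not is_multi_host_tpu({
        TPU_LABEL_KEY: 'tpu-v5-lite-podslice',
        TPU_TOPOLOGY_LABEL_KEY: '2x4',
        ACCELERATOR_COUNT_LABEL_KEY: '8',
    })

This chip-count comparison is why the series needs no extra GKE API call: everything required to classify a node is already present in its metadata labels.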