diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 1fa28105186..c06b974ec97 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -33,6 +33,7 @@ document.addEventListener('DOMContentLoaded', () => { { selector: '.toctree-l1 > a', text: 'Llama 3.2 (Meta)' }, { selector: '.toctree-l1 > a', text: 'Admin Policy Enforcement' }, { selector: '.toctree-l1 > a', text: 'Using Existing Machines' }, + { selector: '.toctree-l1 > a', text: 'Across Multiple Kubernetes Clusters' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index d83bf7821c3..e0a81a836d1 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -151,6 +151,7 @@ Read the research: ../reservations/reservations Using Existing Machines <../reservations/existing-machines> ../reference/kubernetes/index + ../reference/kubernetes/multi-kubernetes .. toctree:: :hidden: diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst new file mode 100644 index 00000000000..eb2e6ea9463 --- /dev/null +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -0,0 +1,131 @@ +.. _multi-kubernetes: + +Multiple Kubernetes Clusters +============================= + + +SkyPilot allows you to manage dev pods, jobs and services across multiple Kubernetes clusters in a single pane of glass. + +You may have multiple Kubernetes clusters for different: + +* **Use cases:** e.g., a production cluster and a development/testing cluster. +* **Regions or clouds:** e.g., US and EU regions; or AWS and Lambda clouds. +* **Accelerators:** e.g., NVIDIA H100 cluster and a Google TPU cluster. +* **Configurations:** e.g., a small cluster for a single node and a large cluster for multiple nodes. 
+* **Kubernetes versions:** e.g., to upgrade a cluster from Kubernetes 1.20 to 1.21, you may create a new Kubernetes cluster to avoid downtime or unexpected errors. + + +.. image:: /images/kubernetes/multi-kubernetes.png + + +Set Up Credentials for Multiple Kubernetes Clusters +--------------------------------------------------- + +To work with multiple Kubernetes clusters, you need to ensure you have the necessary credentials for each cluster. +Check that your local ``~/.kube/config`` file has the credentials for each cluster. For setting up clusters and their credentials, +see :ref:`kubernetes-setup-deploy`. + +For example, a ``~/.kube/config`` file may look like this: + +.. code-block:: yaml + + apiVersion: v1 + clusters: + - cluster: + certificate-authority-data: + ... + server: https://xx.xx.xx.xx:45819 + name: my-h100-cluster + - cluster: + certificate-authority-data: + ... + server: https://yy.yy.yy.yy:45819 + name: my-tpu-cluster + contexts: + - context: + cluster: my-h100-cluster + user: my-h100-cluster + name: my-h100-cluster + - context: + cluster: my-tpu-cluster + namespace: my-namespace + user: my-tpu-cluster + name: my-tpu-cluster + current-context: my-h100-cluster + ... + + +In this example, we have two Kubernetes clusters: ``my-h100-cluster`` and ``my-tpu-cluster``, and each Kubernetes cluster has a context for it. + +Point to a Kubernetes Cluster and Launch +----------------------------------------- + +SkyPilot borrows the ``region`` concept from clouds to denote a Kubernetes cluster. You can point to a Kubernetes cluster +by passing the context name of that cluster to ``--region``. + +.. 
code-block:: console + + # Check the GPUs available in a Kubernetes cluster + $ sky show-gpus --cloud kubernetes --region my-h100-cluster + + Kubernetes GPUs (Context: my-h100-cluster) + GPU QTY_PER_NODE TOTAL_GPUS TOTAL_FREE_GPUS + H100 1, 2, 3, 4, 5, 6, 7, 8 8 8 + + Kubernetes per node GPU availability + NODE_NAME GPU_NAME TOTAL_GPUS FREE_GPUS + my-h100-cluster-hbzn H100 8 8 + my-h100-cluster-w5x7 None 0 0 + +When launching a SkyPilot cluster or task, you can also pass the context name to ``--region`` to choose which Kubernetes cluster to launch it in. + +.. code-block:: console + + $ sky launch --cloud kubernetes --region my-tpu-cluster echo 'Hello World' + + +.. note:: + + When you don't specify a region, SkyPilot will use the current context. + + +Failover across Multiple Kubernetes Clusters +-------------------------------------------- + +SkyPilot enables you to fail over across multiple Kubernetes clusters. It is useful when you want to launch a task in any of the clusters with available GPUs. + +Unlike with cloud providers, SkyPilot does not fail over across different regions (contexts) by default, because multiple +Kubernetes clusters may serve different purposes. + +To enable failover, specify ``kubernetes.allowed_contexts`` in the SkyPilot config, ``~/.sky/config.yaml`` (see the config YAML spec: :ref:`config-yaml`). + +.. code-block:: yaml + + kubernetes: + allowed_contexts: + - my-h100-cluster-gke + - my-h100-cluster-eks + +With this global config, SkyPilot will fail over through the Kubernetes clusters in ``allowed_contexts`` in the same +order as they are specified. + + +.. 
code-block:: console + + $ sky launch --cloud kubernetes echo 'Hello World' + + Considered resources (1 node): + ------------------------------------------------------------------------------------------------------------ + CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN + ------------------------------------------------------------------------------------------------------------ + Kubernetes 2CPU--8GB--1H100 2 8 H100:1 my-h100-cluster-gke 0.00 ✔ + Kubernetes 2CPU--8GB--1H100 2 8 H100:1 my-h100-cluster-eks 0.00 + ------------------------------------------------------------------------------------------------------------ + + + +Dynamically Update Kubernetes Clusters to Use +---------------------------------------------- + +To see how to dynamically update Kubernetes clusters to use, refer to :ref:`dynamic-kubernetes-contexts-update-policy`. + diff --git a/sky/optimizer.py b/sky/optimizer.py index 4326329579d..e42dda8d8ed 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -821,13 +821,14 @@ def format_number(x): return row def _get_resource_group_hash(resources: 'resources_lib.Resources'): - return json.dumps( - { - 'cloud': f'{resources.cloud}', - 'accelerators': f'{resources.accelerators}', - 'use_spot': resources.use_spot - }, - sort_keys=True) + resource_key_dict = { + 'cloud': f'{resources.cloud}', + 'accelerators': f'{resources.accelerators}', + 'use_spot': resources.use_spot + } + if isinstance(resources.cloud, clouds.Kubernetes): + resource_key_dict['region'] = resources.region + return json.dumps(resource_key_dict, sort_keys=True) # Print the list of resouces that the optimizer considered. resource_fields = [