From 8ca950c7075c7676ded721554a50ec281be00d52 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:21:09 +0000 Subject: [PATCH 1/7] Show multiple kubernetes in the optimizer table --- sky/optimizer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sky/optimizer.py b/sky/optimizer.py index 4326329579d..559039def37 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -821,12 +821,15 @@ def format_number(x): return row def _get_resource_group_hash(resources: 'resources_lib.Resources'): - return json.dumps( - { + resource_key_dict = { 'cloud': f'{resources.cloud}', 'accelerators': f'{resources.accelerators}', 'use_spot': resources.use_spot - }, + } + if isinstance(resources.cloud, clouds.Kubernetes): + resource_key_dict['region'] = resources.region + return json.dumps( + resource_key_dict, sort_keys=True) # Print the list of resouces that the optimizer considered. From de8a688478b2a55536cfce4c2e573e1aa01f5f61 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:21:19 +0000 Subject: [PATCH 2/7] Add docs for multiple kubernetes --- .../reference/kubernetes/multi-kubernetes.rst | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 docs/source/reference/kubernetes/multi-kubernetes.rst diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst new file mode 100644 index 00000000000..e42a7d0d377 --- /dev/null +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -0,0 +1,114 @@ +.. _multi-kubernetes: + +Across Multiple Kubernetes Clusters +=================================== + + +SkyPilot allows you to manage dev pods, jobs and services across multiple Kubernetes clusters in a single pane of glass. + +You may have multiple Kubernetes clusters for a variety of reasons: + +* Clusters for different purposes: e.g.,a production cluster and a development/testing cluster. 
+* Clusters in different regions or clouds: e.g., US and EU regions; or AWS and Lambda clouds. +* Clusters for different accelerators: e.g., NVIDIA H100 cluster and a Google TPU cluster. +* Clusters with different configurations: e.g., a small cluster for a single node and a large cluster for multiple nodes. + + +.. image:: /images/kubernetes/multi-kubernetes.png + + +Set Up Credentials for Multiple Kubernetes Clusters +--------------------------------------------------- + +To work with multiple Kubernetes clusters, you need to ensure you have the necessary credentials for each cluster. To get +it work with SkyPilot, you don't have to do any additional setup than having those credentials in your local ``~/.kube/config`` file. + +For example, a ``~/.kube/config`` file may look like this: + +.. code-block:: yaml + + apiVersion: v1 + clusters: + - cluster: + certificate-authority-data: + ... + server: https://xx.xx.xx.xx:45819 + name: my-h100-cluster + - cluster: + certificate-authority-data: + ... + server: https://yy.yy.yy.yy:45819 + name: my-tpu-cluster + contexts: + - context: + cluster: my-h100-cluster + user: my-h100-cluster + name: my-h100-cluster + - context: + cluster: my-tpu-cluster + namespace: my-namespace + user: my-tpu-cluster + name: my-tpu-cluster + current-context: my-h100-cluster + ... + + +In this example, we have two Kubernetes clusters: ``my-h100-cluster`` and ``my-tpu-cluster``, and each Kubernetes cluster has a context for it. + +Point to a Kubernetes Cluster and Launch +----------------------------------------- + +SkyPilot borrows the ``region`` concept from clouds to denote a Kubernetes cluster. You can point to a Kubernetes cluster +by specifying the ``--region`` with the context name for that cluster. + +.. 
code-block:: console + + # Check the GPUs available in a Kubernetes cluster + $ sky show-gpus --cloud kubernetes --region my-h100-cluster + + Kubernetes GPUs (Context: my-h100-cluster) + GPU QTY_PER_NODE TOTAL_GPUS TOTAL_FREE_GPUS + H100 1, 2, 3, 4, 5, 6, 7, 8 8 8 + + Kubernetes per node GPU availability + NODE_NAME GPU_NAME TOTAL_GPUS FREE_GPUS + gke-test-zhwu-default-pool-20159504-hbzn H100 8 8 + gke-test-zhwu-default-pool-20159504-w5x7 None 0 0 + +When launching a SkyPilot cluster or task, you can also specify the context name with ``--region`` to launch the cluster or task in. + +.. code-block:: console + + $ sky launch --cloud kubernetes --region my-tpu-cluster echo 'Hello World' + + +.. note:: + + When you don't specify a region, SkyPilot will use the current context. + + +Failover across Multiple Kubernetes Clusters +-------------------------------------------- + +SkyPilot enables you to failover across multiple Kubernetes clusters. It is useful when you have multiple Kubernetes clusters +across different clouds and regions, and you want to launch a task in any of the clusters with available GPUs. + +Different from cloud providers, SkyPilot does not failover through different regions (contexts) by default, because multiple +Kubernetes clusters can be for different purposes. To enable the failover, you can specify the ``kubernetes.allowed_contexts`` +in SkyPilot config, ``~/.sky/config.yaml`` (See config YAML spec: :ref:`config-yaml`). + +.. code-block:: yaml + + kubernetes: + allowed_contexts: + - my-h100-cluster-gke + - my-h100-cluster-eks + +With this global config, SkyPilot will failover through the Kubernetes clusters in the ``allowed_contexts`` in the same +order as they are specified. + + +.. 
code-block:: console + + $ sky launch --cloud kubernetes echo 'Hello World' + From 55c26c3c8eb43c2d65228f9b9ea0a75c67164210 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:43:01 +0000 Subject: [PATCH 3/7] Add dynamic update --- .../reference/kubernetes/multi-kubernetes.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index e42a7d0d377..6798a1c1969 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -12,6 +12,7 @@ You may have multiple Kubernetes clusters for a variety of reasons: * Clusters in different regions or clouds: e.g., US and EU regions; or AWS and Lambda clouds. * Clusters for different accelerators: e.g., NVIDIA H100 cluster and a Google TPU cluster. * Clusters with different configurations: e.g., a small cluster for a single node and a large cluster for multiple nodes. +* Clusters for different Kubernetes versions: e.g., to upgrade a cluster from Kubernetes 1.20 to 1.21, you may create a new Kubernetes cluster to avoid downtime or unexpected errors. .. image:: /images/kubernetes/multi-kubernetes.png @@ -112,3 +113,18 @@ order as they are specified. 
$ sky launch --cloud kubernetes echo 'Hello World' + Considered resources (1 node): + ------------------------------------------------------------------------------------------------------------ + CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN + ------------------------------------------------------------------------------------------------------------ + Kubernetes 2CPU--8GB--1H100 2 8 H100:1 my-h100-cluster-gke 0.00 ✔ + Kubernetes 2CPU--8GB--1H100 2 8 H100:1 my-h100-cluster-eks 0.00 + ------------------------------------------------------------------------------------------------------------ + + + +Dynamically Update Kubernetes Clusters to Use +---------------------------------------------- + +To see how to dynamically update Kubernetes clusters to use, refer to :ref:`dynamic-kubernetes-contexts-update-policy`. + From 92b4ded1bdc2d4fb411b4488d9ade1868c72caa7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:44:10 +0000 Subject: [PATCH 4/7] format --- sky/optimizer.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sky/optimizer.py b/sky/optimizer.py index 559039def37..e42dda8d8ed 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -822,15 +822,13 @@ def format_number(x): def _get_resource_group_hash(resources: 'resources_lib.Resources'): resource_key_dict = { - 'cloud': f'{resources.cloud}', - 'accelerators': f'{resources.accelerators}', - 'use_spot': resources.use_spot - } + 'cloud': f'{resources.cloud}', + 'accelerators': f'{resources.accelerators}', + 'use_spot': resources.use_spot + } if isinstance(resources.cloud, clouds.Kubernetes): resource_key_dict['region'] = resources.region - return json.dumps( - resource_key_dict, - sort_keys=True) + return json.dumps(resource_key_dict, sort_keys=True) # Print the list of resouces that the optimizer considered. 
resource_fields = [ From 6a487572bf23dc1ebdb10e02b4e9192dbce6f60c Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:46:06 +0000 Subject: [PATCH 5/7] Add new button --- docs/source/_static/custom.js | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index 1fa28105186..c06b974ec97 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -33,6 +33,7 @@ document.addEventListener('DOMContentLoaded', () => { { selector: '.toctree-l1 > a', text: 'Llama 3.2 (Meta)' }, { selector: '.toctree-l1 > a', text: 'Admin Policy Enforcement' }, { selector: '.toctree-l1 > a', text: 'Using Existing Machines' }, + { selector: '.toctree-l1 > a', text: 'Across Multiple Kubernetes Clusters' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { From 867a239ca14448e868be881bfb4242c511b82e75 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:46:29 +0000 Subject: [PATCH 6/7] Add to index --- docs/source/docs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index d83bf7821c3..e0a81a836d1 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -151,6 +151,7 @@ Read the research: ../reservations/reservations Using Existing Machines <../reservations/existing-machines> ../reference/kubernetes/index + ../reference/kubernetes/multi-kubernetes .. 
toctree:: :hidden: From fe4c8c402de6801a80112b988e46b03128274015 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Sep 2024 23:59:19 +0000 Subject: [PATCH 7/7] fix --- .../reference/kubernetes/multi-kubernetes.rst | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/docs/source/reference/kubernetes/multi-kubernetes.rst b/docs/source/reference/kubernetes/multi-kubernetes.rst index 6798a1c1969..eb2e6ea9463 100644 --- a/docs/source/reference/kubernetes/multi-kubernetes.rst +++ b/docs/source/reference/kubernetes/multi-kubernetes.rst @@ -1,18 +1,18 @@ .. _multi-kubernetes: -Across Multiple Kubernetes Clusters -=================================== +Multiple Kubernetes Clusters +============================= SkyPilot allows you to manage dev pods, jobs and services across multiple Kubernetes clusters in a single pane of glass. -You may have multiple Kubernetes clusters for a variety of reasons: +You may have multiple Kubernetes clusters for different: -* Clusters for different purposes: e.g.,a production cluster and a development/testing cluster. -* Clusters in different regions or clouds: e.g., US and EU regions; or AWS and Lambda clouds. -* Clusters for different accelerators: e.g., NVIDIA H100 cluster and a Google TPU cluster. -* Clusters with different configurations: e.g., a small cluster for a single node and a large cluster for multiple nodes. -* Clusters for different Kubernetes versions: e.g., to upgrade a cluster from Kubernetes 1.20 to 1.21, you may create a new Kubernetes cluster to avoid downtime or unexpected errors. +* **Use cases:** e.g., a production cluster and a development/testing cluster. +* **Regions or clouds:** e.g., US and EU regions; or AWS and Lambda clouds. +* **Accelerators:** e.g., NVIDIA H100 cluster and a Google TPU cluster. +* **Configurations:** e.g., a small cluster for a single node and a large cluster for multiple nodes. 
+* **Kubernetes versions:** e.g., to upgrade a cluster from Kubernetes 1.20 to 1.21, you may create a new Kubernetes cluster to avoid downtime or unexpected errors. .. image:: /images/kubernetes/multi-kubernetes.png @@ -21,8 +21,9 @@ You may have multiple Kubernetes clusters for a variety of reasons: Set Up Credentials for Multiple Kubernetes Clusters --------------------------------------------------- -To work with multiple Kubernetes clusters, you need to ensure you have the necessary credentials for each cluster. To get -it work with SkyPilot, you don't have to do any additional setup than having those credentials in your local ``~/.kube/config`` file. +To work with multiple Kubernetes clusters, you need to ensure you have the necessary credentials for each cluster. +Check that your local ``~/.kube/config`` file has the credentials for each cluster. For setting up clusters and their credentials, +see :ref:`kubernetes-setup-deploy`. For example, a ``~/.kube/config`` file may look like this: @@ -73,8 +74,8 @@ by specifying the ``--region`` with the context name for that cluster. Kubernetes per node GPU availability NODE_NAME GPU_NAME TOTAL_GPUS FREE_GPUS - gke-test-zhwu-default-pool-20159504-hbzn H100 8 8 - gke-test-zhwu-default-pool-20159504-w5x7 None 0 0 + my-h100-cluster-hbzn H100 8 8 + my-h100-cluster-w5x7 None 0 0 When launching a SkyPilot cluster or task, you can also specify the context name with ``--region`` to launch the cluster or task in. @@ -91,12 +92,12 @@ When launching a SkyPilot cluster or task, you can also specify the context name Failover across Multiple Kubernetes Clusters -------------------------------------------- -SkyPilot enables you to failover across multiple Kubernetes clusters. It is useful when you have multiple Kubernetes clusters -across different clouds and regions, and you want to launch a task in any of the clusters with available GPUs. +SkyPilot enables you to failover across multiple Kubernetes clusters. 
It is useful when you want to launch a task in any of the clusters with available GPUs. Different from cloud providers, SkyPilot does not failover through different regions (contexts) by default, because multiple -Kubernetes clusters can be for different purposes. To enable the failover, you can specify the ``kubernetes.allowed_contexts`` -in SkyPilot config, ``~/.sky/config.yaml`` (See config YAML spec: :ref:`config-yaml`). +Kubernetes clusters can be for different purposes. + +To enable the failover, you can specify the ``kubernetes.allowed_contexts`` in SkyPilot config, ``~/.sky/config.yaml`` (See config YAML spec: :ref:`config-yaml`). .. code-block:: yaml