From 320e0cdf9bc64f7ce835876bd193731ba108141f Mon Sep 17 00:00:00 2001 From: Yika Luo Date: Thu, 31 Oct 2024 16:12:21 -0700 Subject: [PATCH] Move k8s script --- Dockerfile_k8s | 4 ---- Dockerfile_k8s_gpu | 4 ---- sky/clouds/service_catalog/images/README.md | 12 ++++++++++-- .../service_catalog/images/skypilot-k8s-image.sh | 13 +++++-------- sky/clouds/service_catalog/kubernetes_catalog.py | 14 +++++++++++++- 5 files changed, 28 insertions(+), 19 deletions(-) rename tests/kubernetes/build_image.sh => sky/clouds/service_catalog/images/skypilot-k8s-image.sh (84%) diff --git a/Dockerfile_k8s b/Dockerfile_k8s index ad6ae3d7d46..932df6d3197 100644 --- a/Dockerfile_k8s +++ b/Dockerfile_k8s @@ -43,9 +43,5 @@ RUN conda init && export PIP_DISABLE_PIP_VERSION_CHECK=1 && \ sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && \ echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc -# Copy SkyPilot code base. This is required for the ssh jump pod to find the -# lifecycle management scripts -COPY --chown=sky . /skypilot/sky/ - # Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately ENV PYTHONUNBUFFERED=1 diff --git a/Dockerfile_k8s_gpu b/Dockerfile_k8s_gpu index f66bfda4bdb..8bc401bc454 100644 --- a/Dockerfile_k8s_gpu +++ b/Dockerfile_k8s_gpu @@ -50,9 +50,5 @@ RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x8 sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && \ echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc -# Copy SkyPilot code base. This is required for the ssh jump pod to find the -# lifecycle management scripts -COPY --chown=sky . 
/skypilot/sky/ - # Set PYTHONUNBUFFERED=1 to have Python print to stdout/stderr immediately ENV PYTHONUNBUFFERED=1 diff --git a/sky/clouds/service_catalog/images/README.md b/sky/clouds/service_catalog/images/README.md index 3784a940925..8f9d8e85f16 100644 --- a/sky/clouds/service_catalog/images/README.md +++ b/sky/clouds/service_catalog/images/README.md @@ -31,7 +31,7 @@ packer build ${IMAGE}.pkr.hcl 2. Make the image public ```bash # Make image public -export IMAGE_NAME=skypilot-gcp-cpu-ubuntu-20241029144600 # Update this +export IMAGE_NAME=skypilot-gcp-gpu-ubuntu-241030 # Update this export IMAGE_ID=projects/sky-dev-465/global/images/${IMAGE_NAME} gcloud compute images add-iam-policy-binding ${IMAGE_NAME} --member='allAuthenticatedUsers' --role='roles/compute.imageUser' ``` @@ -46,7 +46,7 @@ packer build ${IMAGE}.pkr.hcl 2. Copy images to all regions ```bash export TYPE=gpu # Update this -export IMAGE_ID=ami-05e9f5efd844f1a4f # Update this +export IMAGE_ID=ami-0989556a89639b1bb # Update this python aws_utils/image_gen.py --image-id ${IMAGE_ID} --processor ${TYPE} ``` 3. Add fallback images if any region failed \ @@ -65,6 +65,14 @@ packer build --var vm_generation=1 --var client_secret=${SECRET} skypilot-azure- packer build --var vm_generation=2 --var client_secret=${SECRET} --var use_grid_driver=true skypilot-azure-gpu-ubuntu.pkr.hcl ``` +### Kubernetes +1. Build the image +```bash +export REGION=europe # Update this: us, europe, asia +./skypilot-k8s-image.sh -p -l -r ${REGION} +./skypilot-k8s-image.sh -p -l -g -r ${REGION} +``` + ## Test Images 1. Minimal GPU test: `sky launch --image ${IMAGE_ID} --gpus=L4:1 --cloud ${CLOUD}` then run `nvidia-smi` in the launched instance. 2. 
Update the image ID in `sky/clouds/gcp.py` and run the test: diff --git a/tests/kubernetes/build_image.sh b/sky/clouds/service_catalog/images/skypilot-k8s-image.sh similarity index 84% rename from tests/kubernetes/build_image.sh rename to sky/clouds/service_catalog/images/skypilot-k8s-image.sh index c6d9b2a0099..075aa7ae4bc 100755 --- a/tests/kubernetes/build_image.sh +++ b/sky/clouds/service_catalog/images/skypilot-k8s-image.sh @@ -1,15 +1,12 @@ #!/bin/bash # Builds the Dockerfile_k8s image as the SkyPilot image. -# Optionally, if -p is specified, pushes the image to the registry. # Uses buildx to build the image for both amd64 and arm64. -# If -p flag is specified, pushes the image to the registry. -# If -g flag is specified, builds the GPU image in Dockerfile_k8s_gpu. GPU image is built only for amd64. -# If -l flag is specified, uses the latest tag instead of the date tag. Date tag is of the form YYYYMMDD. -# Usage: ./build_image.sh [-p] [-g] +# Usage: ./skypilot-k8s-image.sh [-p] [-g] [-l] [-r region] # -p: Push the image to the registry -# -g: Build the GPU image -# -l: Use latest tag -region=us # default region +# -g: Build the GPU image in Dockerfile_k8s_gpu. GPU image is built only for amd64 +# -l: Use latest tag instead of the date tag. 
Date tag is of the form YYYYMMDD +# -r: Specify the region to be us, europe or asia +region=us push=false gpu=false latest=false diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 2d0cdbf7cf6..7ff8f49c621 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -8,12 +8,15 @@ from typing import Dict, List, Optional, Set, Tuple from sky import check as sky_check +from sky import sky_logging from sky.adaptors import common as adaptors_common from sky.clouds import Kubernetes from sky.clouds.service_catalog import CloudFilter from sky.clouds.service_catalog import common from sky.provision.kubernetes import utils as kubernetes_utils +logger = sky_logging.init_logger(__name__) + if typing.TYPE_CHECKING: import pandas as pd else: @@ -31,7 +34,16 @@ def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]: """Returns the image id from the tag.""" - return common.get_image_id_from_tag_impl(_image_df, tag, region) + global _image_df + image_id = common.get_image_id_from_tag_impl(_image_df, tag, region) + if image_id is None: + # Refresh the image catalog and try again, if the image tag is not + # found. + logger.debug('Refreshing the image catalog and trying again.') + _image_df = common.read_catalog('kubernetes/images.csv', + pull_frequency_hours=0) + image_id = common.get_image_id_from_tag_impl(_image_df, tag, region) + return image_id def is_image_tag_valid(tag: str, region: Optional[str]) -> bool: