Merge branch 'skypilot-org:master' into master

asaiacai · Oct 3, 2024 · 4373180 · 4373180
2 parents eb30bb2 + b1f22c4
commit 4373180
Show file tree

Hide file tree

Showing 7 changed files with 323 additions and 30 deletions.
diff --git a/examples/aws-neuron/README.md b/examples/aws-neuron/README.md
@@ -0,0 +1,117 @@
+# AWS Inferentia
+
+SkyPilot supports AWS Inferentia accelerators. The Neuron SDK is a runtime and compiler for running deep learning models on AWS Inferentia chips. Here is an example of how to use the Neuron SDK to launch a Llama 3 8B model on an Inferentia chip:
+
+```bash
+$ sky launch -c aws-inf inferentia.yaml --env HF_TOKEN=hf_xxx
+```
+
+To send an example request to the model, you can use the following command:
+
+```bash
+$ ENDPOINT=$(sky status aws-inf --endpoint 9000)
+$ curl http://$ENDPOINT/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+      "messages": [
+        {
+          "role": "system",
+          "content": "You are a helpful assistant."
+        },
+        {
+          "role": "user",
+          "content": "Who are you?"
+        }
+      ],
+      "stop_token_ids": [128009, 128001]
+    }'
+{"id":"chat-0631550312c143d88ca6d477d0df6c2c","object":"chat.completion","created":1727751137,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"I'm a helpful assistant! I","tool_calls":[]},"logprobs":null,"finish_reason":"length","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":32,"completion_tokens":7},"prompt_logprobs":null}
+```
+
+## Using multiple accelerator choices
+
+You can also specify multiple resources in a task YAML to allow SkyPilot to find the cheapest available resources for you. Specifically, you can specify both Neuron accelerators and Nvidia GPUs in the same YAML file. Here is an example (see [multi-accelerator.yaml](./multi-accelerator.yaml)):
+
+<details>
+
+<summary>Example YAML for multiple accelerators.</summary>
+
+```yaml
+resources:
+  accelerators: {A100:1, Inferentia:6}
+  disk_size: 512
+  ports: 9000
+
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  HF_TOKEN: # fill
+
+setup: |
+  if command -v nvidia-smi; then
+    pip install vllm==0.4.2
+    pip install flash-attn==2.5.9.post1
+  else
+    # Install transformers-neuronx and its dependencies
+    sudo apt-get install -y python3.10-venv g++
+    python3.10 -m venv aws_neuron_venv_pytorch
+    source aws_neuron_venv_pytorch/bin/activate
+    pip install ipykernel
+    python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
+    pip install jupyter notebook
+    pip install environment_kernels
+    python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+    python -m pip install wget
+    python -m pip install awscli
+    python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
+
+    # Install latest version of triton.
+    # Reference: https://github.com/vllm-project/vllm/issues/6987
+    pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly
+
+    # Install vLLM from source. Avoid using dir name 'vllm' due to import conflict.
+    # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930
+    git clone https://github.com/vllm-project/vllm.git vllm_repo
+    cd vllm_repo
+    pip install -U -r requirements-neuron.txt
+    VLLM_TARGET_DEVICE="neuron" pip install -e .
+
+    python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
+
+    sudo apt update
+    sudo apt install -y numactl
+  fi
+
+run: |
+  if command -v nvidia-smi; then
+    TENSOR_PARALLEL_SIZE=$SKYPILOT_NUM_GPUS_PER_NODE
+    PREFIX=""
+    DEVICE="cuda"
+  else
+    source aws_neuron_venv_pytorch/bin/activate
+    # Calculate the tensor parallel size. vLLM requires the tensor parallel size
+    # to be a factor of the number of attention heads, which is 32 for the model.
+    # Here we calculate the largest power of 2 that is less than or equal to the
+    # number of accelerators per node ($SKYPILOT_NUM_GPUS_PER_NODE).
+    TENSOR_PARALLEL_SIZE=1
+    while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do
+      TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2))
+    done
+    NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))"
+    OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE
+    MASTER_PORT=12355
+    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib"
+    PREFIX="numactl --cpunodebind=0 --membind=0"
+    DEVICE="neuron"
+  fi
+  $PREFIX python3 -m vllm.entrypoints.openai.api_server \
+    --device $DEVICE \
+    --model $MODEL_NAME \
+    --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+    --max-num-seqs 16 \
+    --max-model-len 32 \
+    --block-size 32 \
+    --port 9000
+```
+
+</details>
diff --git a/examples/aws-neuron/inferentia.yaml b/examples/aws-neuron/inferentia.yaml
@@ -0,0 +1,62 @@
+resources:
+  accelerators: Inferentia:6
+  disk_size: 512
+  ports: 9000
+
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  HF_TOKEN: # fill
+
+setup: |
+  # Install transformers-neuronx and its dependencies
+  sudo apt-get install -y python3.10-venv g++
+  python3.10 -m venv aws_neuron_venv_pytorch
+  source aws_neuron_venv_pytorch/bin/activate
+  pip install ipykernel
+  python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
+  pip install jupyter notebook
+  pip install environment_kernels
+  python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+  python -m pip install wget
+  python -m pip install awscli
+  python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
+
+  # Install latest version of triton.
+  # Reference: https://github.com/vllm-project/vllm/issues/6987
+  pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly
+
+  # Install vLLM from source. Avoid using dir name 'vllm' due to import conflict.
+  # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930
+  git clone https://github.com/vllm-project/vllm.git vllm_repo
+  cd vllm_repo
+  pip install -U -r requirements-neuron.txt
+  VLLM_TARGET_DEVICE="neuron" pip install -e .
+
+  python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
+
+  sudo apt update
+  sudo apt install -y numactl
+
+run: |
+  source aws_neuron_venv_pytorch/bin/activate
+  # Calculate the tensor parallel size. vLLM requires the tensor parallel size
+  # to be a factor of the number of attention heads, which is 32 for the model.
+  # Here we calculate the largest power of 2 that is less than or equal to the
+  # number of accelerators per node ($SKYPILOT_NUM_GPUS_PER_NODE).
+  TENSOR_PARALLEL_SIZE=1
+  while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do
+    TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2))
+  done
+  NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))"
+  OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE
+  MASTER_PORT=12355
+  LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib"
+  numactl --cpunodebind=0 --membind=0 \
+    python3 -m vllm.entrypoints.openai.api_server \
+      --device neuron \
+      --model $MODEL_NAME \
+      --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+      --max-num-seqs 16 \
+      --max-model-len 32 \
+      --block-size 32 \
+      --port 9000
diff --git a/examples/aws-neuron/multi-accelerator.yaml b/examples/aws-neuron/multi-accelerator.yaml
@@ -0,0 +1,74 @@
+resources:
+  accelerators: {A100:1, Inferentia:6}
+  disk_size: 512
+  ports: 9000
+
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  HF_TOKEN: # fill
+
+setup: |
+  if command -v nvidia-smi; then
+    pip install vllm==0.4.2
+    pip install flash-attn==2.5.9.post1
+  else
+    # Install transformers-neuronx and its dependencies
+    sudo apt-get install -y python3.10-venv g++
+    python3.10 -m venv aws_neuron_venv_pytorch
+    source aws_neuron_venv_pytorch/bin/activate
+    pip install ipykernel
+    python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
+    pip install jupyter notebook
+    pip install environment_kernels
+    python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+    python -m pip install wget
+    python -m pip install awscli
+    python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
+
+    # Install latest version of triton.
+    # Reference: https://github.com/vllm-project/vllm/issues/6987
+    pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly
+
+    # Install vLLM from source. Avoid using dir name 'vllm' due to import conflict.
+    # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930
+    git clone https://github.com/vllm-project/vllm.git vllm_repo
+    cd vllm_repo
+    pip install -U -r requirements-neuron.txt
+    VLLM_TARGET_DEVICE="neuron" pip install -e .
+
+    python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
+
+    sudo apt update
+    sudo apt install -y numactl
+  fi
+
+run: |
+  if command -v nvidia-smi; then
+    TENSOR_PARALLEL_SIZE=$SKYPILOT_NUM_GPUS_PER_NODE
+    PREFIX=""
+    DEVICE="cuda"
+  else
+    source aws_neuron_venv_pytorch/bin/activate
+    # Calculate the tensor parallel size. vLLM requires the tensor parallel size
+    # to be a factor of the number of attention heads, which is 32 for the model.
+    # Here we calculate the largest power of 2 that is less than or equal to the
+    # number of accelerators per node ($SKYPILOT_NUM_GPUS_PER_NODE).
+    TENSOR_PARALLEL_SIZE=1
+    while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do
+      TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2))
+    done
+    NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))"
+    OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE
+    MASTER_PORT=12355
+    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib"
+    PREFIX="numactl --cpunodebind=0 --membind=0"
+    DEVICE="neuron"
+  fi
+  $PREFIX python3 -m vllm.entrypoints.openai.api_server \
+    --device $DEVICE \
+    --model $MODEL_NAME \
+    --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+    --max-num-seqs 16 \
+    --max-model-len 32 \
+    --block-size 32 \
+    --port 9000
diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py
@@ -225,6 +225,9 @@ def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
             if acc_name == 'K80':
                 image_id = service_catalog.get_image_id_from_tag(
                     'skypilot:k80-ubuntu-2004', region_name, clouds='aws')
+            if acc_name in ['Trainium', 'Inferentia']:
+                image_id = service_catalog.get_image_id_from_tag(
+                    'skypilot:neuron-ubuntu-2204', region_name, clouds='aws')
         if image_id is not None:
             return image_id
         # Raise ResourcesUnavailableError to make sure the failover in

diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
@@ -379,26 +379,33 @@ def get_all_regions_instance_types_df(regions: Set[str]) -> 'pd.DataFrame':
 #
 # Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 18.04) 20211208
 #   Nvidia driver: 470.57.02, CUDA Version: 11.4
-_GPU_UBUNTU_DATE_PYTORCH = [
-    ('gpu', '20.04', '20231103', '2.1.0'),
-    ('gpu', '18.04', '20221114', '1.10.0'),
-    ('k80', '20.04', '20211208', '1.10.0'),
-    ('k80', '18.04', '20211208', '1.10.0'),
+#
+# Neuron (Inferentia / Trainium):
+# https://aws.amazon.com/releasenotes/aws-deep-learning-ami-base-neuron-ubuntu-20-04/  # pylint: disable=line-too-long
+# Deep Learning Base Neuron AMI (Ubuntu 22.04) 20240923
+# TODO(tian): find out the driver version.
+#   Neuron driver:
+_GPU_DESC_UBUNTU_DATE = [
+    ('gpu', 'AMI GPU PyTorch 2.1.0', '20.04', '20231103'),
+    ('gpu', 'AMI GPU PyTorch 1.10.0', '18.04', '20221114'),
+    ('k80', 'AMI GPU PyTorch 1.10.0', '20.04', '20211208'),
+    ('k80', 'AMI GPU PyTorch 1.10.0', '18.04', '20211208'),
+    ('neuron', 'Base Neuron AMI', '22.04', '20240923'),
 ]
 
 
-def _fetch_image_id(region: str, ubuntu_version: str, creation_date: str,
-                    pytorch_version: str) -> Optional[str]:
+def _fetch_image_id(region: str, description: str, ubuntu_version: str,
+                    creation_date: str) -> Optional[str]:
     try:
         image = subprocess.check_output(f"""\
             aws ec2 describe-images --region {region} --owners amazon \\
-                --filters 'Name=name,Values="Deep Learning AMI GPU PyTorch {pytorch_version} (Ubuntu {ubuntu_version}) {creation_date}"' \\
+                --filters 'Name=name,Values="Deep Learning {description} (Ubuntu {ubuntu_version}) {creation_date}"' \\
                     'Name=state,Values=available' --query 'Images[:1].ImageId' --output text
             """,
                                         shell=True)
     except subprocess.CalledProcessError as e:
-        print(f'Failed {region}, {ubuntu_version}, {creation_date}. '
-              'Trying next date.')
+        print(f'Failed {region}, {description}, {ubuntu_version}, '
+              f'{creation_date}. Trying next date.')
         print(f'{type(e)}: {e}')
         image_id = None
     else:
@@ -407,21 +414,21 @@ def _fetch_image_id(region: str, ubuntu_version: str, creation_date: str,
     return image_id
 
 
-def _get_image_row(
-        region: str, gpu: str, ubuntu_version: str, date: str,
-        pytorch_version) -> Tuple[str, str, str, str, Optional[str], str]:
-    print(f'Getting image for {region}, {ubuntu_version}, {gpu}')
-    image_id = _fetch_image_id(region, ubuntu_version, date, pytorch_version)
+def _get_image_row(region: str, gpu: str, description: str, ubuntu_version: str,
+                   date: str) -> Tuple[str, str, str, str, Optional[str], str]:
+    print(f'Getting image for {region}, {description}, {ubuntu_version}, {gpu}')
+    image_id = _fetch_image_id(region, description, ubuntu_version, date)
     if image_id is None:
         # not found
-        print(f'Failed to find image for {region}, {ubuntu_version}, {gpu}')
+        print(f'Failed to find image for {region}, {description}, '
+              f'{ubuntu_version}, {gpu}')
     tag = f'skypilot:{gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
     return tag, region, 'ubuntu', ubuntu_version, image_id, date
 
 
 def get_all_regions_images_df(regions: Set[str]) -> 'pd.DataFrame':
     image_metas = [
-        (r, *i) for r, i in itertools.product(regions, _GPU_UBUNTU_DATE_PYTORCH)
+        (r, *i) for r, i in itertools.product(regions, _GPU_DESC_UBUNTU_DATE)
     ]
     with mp_pool.Pool() as pool:
         results = pool.starmap(_get_image_row, image_metas)

diff --git a/sky/resources.py b/sky/resources.py
@@ -966,20 +966,22 @@ def _try_validate_labels(self) -> None:
         """
         if not self._labels:
             return
-
-        if self.cloud is None:
-            # Because each cloud has its own label format, we cannot validate
-            # the labels without knowing the cloud.
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    'Cloud must be specified when labels are provided.')
-
-        # Check if the label key value pairs are valid.
+        if self.cloud is not None:
+            validated_clouds = [self.cloud]
+        else:
+            # If no specific cloud is set, validate the labels against all
+            # enabled clouds. A label is rejected if it is invalid for any one
+            # of them.
+            validated_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
         invalid_table = log_utils.create_table(['Label', 'Reason'])
         for key, value in self._labels.items():
-            valid, err_msg = self.cloud.is_label_valid(key, value)
-            if not valid:
-                invalid_table.add_row([f'{key}: {value}', err_msg])
+            for cloud in validated_clouds:
+                valid, err_msg = cloud.is_label_valid(key, value)
+                if not valid:
+                    invalid_table.add_row([
+                        f'{key}: {value}',
+                        f'Label rejected due to {cloud}: {err_msg}'
+                    ])
+                    break
         if len(invalid_table.rows) > 0:
             with ux_utils.print_exception_no_traceback():
                 raise ValueError(