From e437e96bca4f6f500f228840eac430b0f223393b Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Mon, 30 Sep 2024 14:12:05 -0700 Subject: [PATCH 1/3] [Examples] AWS Neuron Accelerator Example. (#4020) * [Examples] AWS Neuron Accelerator Example. * add example * auto calculate tp size & use ubuntu 2204 * add mix acc example * fix * rename --- examples/aws-neuron/inferentia.yaml | 62 ++++++++++++++++ examples/aws-neuron/mix-accelerator.yaml | 74 +++++++++++++++++++ sky/clouds/aws.py | 3 + .../data_fetchers/fetch_aws.py | 41 +++++----- 4 files changed, 163 insertions(+), 17 deletions(-) create mode 100644 examples/aws-neuron/inferentia.yaml create mode 100644 examples/aws-neuron/mix-accelerator.yaml diff --git a/examples/aws-neuron/inferentia.yaml b/examples/aws-neuron/inferentia.yaml new file mode 100644 index 00000000000..0d0773b3d09 --- /dev/null +++ b/examples/aws-neuron/inferentia.yaml @@ -0,0 +1,62 @@ +resources: + accelerators: Inferentia:6 + disk_size: 512 + ports: 9000 + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # fill + +setup: | + # Install transformers-neuronx and its dependencies + sudo apt-get install -y python3.10-venv g++ + python3.10 -m venv aws_neuron_venv_pytorch + source aws_neuron_venv_pytorch/bin/activate + pip install ipykernel + python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" + pip install jupyter notebook + pip install environment_kernels + python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + python -m pip install wget + python -m pip install awscli + python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx + + # Install latest version of triton. + # Reference: https://github.com/vllm-project/vllm/issues/6987 + pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly + + # Install vLLM from source. Avoid using dir name 'vllm' due to import conflict. + # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930 + git clone https://github.com/vllm-project/vllm.git vllm_repo + cd vllm_repo + pip install -U -r requirements-neuron.txt + VLLM_TARGET_DEVICE="neuron" pip install -e . + + python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" + + sudo apt update + sudo apt install -y numactl + +run: | + source aws_neuron_venv_pytorch/bin/activate + # Calculate the tensor parallel size. vLLM requires the tensor parallel size + # to be a factor of the number of attention heads, which is 32 for the model. + # Here we calculate the largest power of 2 that is less than or equal to the + # number of GPUs per node. 
+ TENSOR_PARALLEL_SIZE=1 + while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do + TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2)) + done + NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))" + OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE + MASTER_PORT=12355 + LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib" + numactl --cpunodebind=0 --membind=0 \ + python3 -m vllm.entrypoints.openai.api_server \ + --device neuron \ + --model $MODEL_NAME \ + --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ + --max-num-seqs 16 \ + --max-model-len 32 \ + --block-size 32 \ + --port 9000 diff --git a/examples/aws-neuron/mix-accelerator.yaml b/examples/aws-neuron/mix-accelerator.yaml new file mode 100644 index 00000000000..fc452a06804 --- /dev/null +++ b/examples/aws-neuron/mix-accelerator.yaml @@ -0,0 +1,74 @@ +resources: + accelerators: {A100:1, Inferentia:6} + disk_size: 512 + ports: 9000 + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # fill + +setup: | + if command -v nvidia-smi; then + pip install vllm==0.4.2 + pip install flash-attn==2.5.9.post1 + else + # Install transformers-neuronx and its dependencies + sudo apt-get install -y python3.10-venv g++ + python3.10 -m venv aws_neuron_venv_pytorch + source aws_neuron_venv_pytorch/bin/activate + pip install ipykernel + python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" + pip install jupyter notebook + pip install environment_kernels + python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + python -m pip install wget + python -m pip install awscli + python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx + + # Install latest version of triton. + # Reference: https://github.com/vllm-project/vllm/issues/6987 + pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly + + # Install vLLM from source. Avoid using dir name 'vllm' due to import conflict. + # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930 + git clone https://github.com/vllm-project/vllm.git vllm_repo + cd vllm_repo + pip install -U -r requirements-neuron.txt + VLLM_TARGET_DEVICE="neuron" pip install -e . + + python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" + + sudo apt update + sudo apt install -y numactl + fi + +run: | + if command -v nvidia-smi; then + TENSOR_PARALLEL_SIZE=$SKYPILOT_NUM_GPUS_PER_NODE + PREFIX="" + DEVICE="cuda" + else + source aws_neuron_venv_pytorch/bin/activate + # Calculate the tensor parallel size. vLLM requires the tensor parallel size + # to be a factor of the number of attention heads, which is 32 for the model. + # Here we calculate the largest power of 2 that is less than or equal to the + # number of GPUs per node. 
+ TENSOR_PARALLEL_SIZE=1 + while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do + TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2)) + done + NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))" + OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE + MASTER_PORT=12355 + LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib" + PREFIX="numactl --cpunodebind=0 --membind=0" + DEVICE="neuron" + fi + $PREFIX python3 -m vllm.entrypoints.openai.api_server \ + --device $DEVICE \ + --model $MODEL_NAME \ + --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ + --max-num-seqs 16 \ + --max-model-len 32 \ + --block-size 32 \ + --port 9000 diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 4ca57d75420..be1ecce0350 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -225,6 +225,9 @@ def _get_default_ami(cls, region_name: str, instance_type: str) -> str: if acc_name == 'K80': image_id = service_catalog.get_image_id_from_tag( 'skypilot:k80-ubuntu-2004', region_name, clouds='aws') + if acc_name in ['Trainium', 'Inferentia']: + image_id = service_catalog.get_image_id_from_tag( + 'skypilot:neuron-ubuntu-2204', region_name, clouds='aws') if image_id is not None: return image_id # Raise ResourcesUnavailableError to make sure the failover in diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index 1e1d6e98c03..e0e5ffa21a1 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -379,26 +379,33 @@ def get_all_regions_instance_types_df(regions: Set[str]) -> 'pd.DataFrame': # # Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 18.04) 20211208 # Nvidia driver: 470.57.02, CUDA Version: 11.4 -_GPU_UBUNTU_DATE_PYTORCH = [ - ('gpu', '20.04', '20231103', '2.1.0'), - ('gpu', '18.04', '20221114', '1.10.0'), - ('k80', '20.04', '20211208', '1.10.0'), - ('k80', '18.04', '20211208', '1.10.0'), +# +# Neuron (Inferentia / Trainium): +# https://aws.amazon.com/releasenotes/aws-deep-learning-ami-base-neuron-ubuntu-20-04/ # pylint: disable=line-too-long +# Deep Learning Base Neuron AMI (Ubuntu 20.04) 20240923 +# TODO(tian): find out the driver version. +# Neuron driver: +_GPU_DESC_UBUNTU_DATE = [ + ('gpu', 'AMI GPU PyTorch 2.1.0', '20.04', '20231103'), + ('gpu', 'AMI GPU PyTorch 1.10.0', '18.04', '20221114'), + ('k80', 'AMI GPU PyTorch 1.10.0', '20.04', '20211208'), + ('k80', 'AMI GPU PyTorch 1.10.0', '18.04', '20211208'), + ('neuron', 'Base Neuron AMI', '22.04', '20240923'), ] -def _fetch_image_id(region: str, ubuntu_version: str, creation_date: str, - pytorch_version: str) -> Optional[str]: +def _fetch_image_id(region: str, description: str, ubuntu_version: str, + creation_date: str) -> Optional[str]: try: image = subprocess.check_output(f"""\ aws ec2 describe-images --region {region} --owners amazon \\ - --filters 'Name=name,Values="Deep Learning AMI GPU PyTorch {pytorch_version} (Ubuntu {ubuntu_version}) {creation_date}"' \\ + --filters 'Name=name,Values="Deep Learning {description} (Ubuntu {ubuntu_version}) {creation_date}"' \\ 'Name=state,Values=available' --query 'Images[:1].ImageId' --output text """, shell=True) except subprocess.CalledProcessError as e: - print(f'Failed {region}, {ubuntu_version}, {creation_date}. ' - 'Trying next date.') + print(f'Failed {region}, {description}, {ubuntu_version}, ' + f'{creation_date}. 
Trying next date.') print(f'{type(e)}: {e}') image_id = None else: @@ -407,21 +414,21 @@ def _fetch_image_id(region: str, ubuntu_version: str, creation_date: str, return image_id -def _get_image_row( - region: str, gpu: str, ubuntu_version: str, date: str, - pytorch_version) -> Tuple[str, str, str, str, Optional[str], str]: - print(f'Getting image for {region}, {ubuntu_version}, {gpu}') - image_id = _fetch_image_id(region, ubuntu_version, date, pytorch_version) +def _get_image_row(region: str, gpu: str, description: str, ubuntu_version: str, + date: str) -> Tuple[str, str, str, str, Optional[str], str]: + print(f'Getting image for {region}, {description}, {ubuntu_version}, {gpu}') + image_id = _fetch_image_id(region, description, ubuntu_version, date) if image_id is None: # not found - print(f'Failed to find image for {region}, {ubuntu_version}, {gpu}') + print(f'Failed to find image for {region}, {description}, ' + f'{ubuntu_version}, {gpu}') tag = f'skypilot:{gpu}-ubuntu-{ubuntu_version.replace(".", "")}' return tag, region, 'ubuntu', ubuntu_version, image_id, date def get_all_regions_images_df(regions: Set[str]) -> 'pd.DataFrame': image_metas = [ - (r, *i) for r, i in itertools.product(regions, _GPU_UBUNTU_DATE_PYTORCH) + (r, *i) for r, i in itertools.product(regions, _GPU_DESC_UBUNTU_DATE) ] with mp_pool.Pool() as pool: results = pool.starmap(_get_image_row, image_metas) From 62222ee53cacb6a8965626c89e90f9fb2b6a3940 Mon Sep 17 00:00:00 2001 From: yika-luo Date: Mon, 30 Sep 2024 17:11:48 -0700 Subject: [PATCH 2/3] [UX] Remove requirement to specify cloud in Resources to use labels (#4022) Co-authored-by: Yika Luo --- sky/resources.py | 26 ++++++++++++++------------ tests/unit_tests/test_resources.py | 30 +++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/sky/resources.py b/sky/resources.py index 2f19cd1aa01..e9a522cef48 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -966,20 +966,22 @@ def _try_validate_labels(self) -> None: """ if not self._labels: return - - if self.cloud is None: - # Because each cloud has its own label format, we cannot validate - # the labels without knowing the cloud. - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Cloud must be specified when labels are provided.') - - # Check if the label key value pairs are valid. + if self.cloud is not None: + validated_clouds = [self.cloud] + else: + # If no specific cloud is set, validate label against ALL clouds. 
+ # The label will be dropped if invalid for any one of the cloud + validated_clouds = sky_check.get_cached_enabled_clouds_or_refresh() invalid_table = log_utils.create_table(['Label', 'Reason']) for key, value in self._labels.items(): - valid, err_msg = self.cloud.is_label_valid(key, value) - if not valid: - invalid_table.add_row([f'{key}: {value}', err_msg]) + for cloud in validated_clouds: + valid, err_msg = cloud.is_label_valid(key, value) + if not valid: + invalid_table.add_row([ + f'{key}: {value}', + f'Label rejected due to {cloud}: {err_msg}' + ]) + break if len(invalid_table.rows) > 0: with ux_utils.print_exception_no_traceback(): raise ValueError( diff --git a/tests/unit_tests/test_resources.py b/tests/unit_tests/test_resources.py index 01b83132a1b..5006fc454aa 100644 --- a/tests/unit_tests/test_resources.py +++ b/tests/unit_tests/test_resources.py @@ -6,6 +6,7 @@ import pytest from sky import clouds +from sky import global_user_state from sky import skypilot_config from sky.resources import Resources from sky.utils import resources_utils @@ -34,7 +35,8 @@ def test_get_reservations_available_resources(): def _run_label_test(allowed_labels: Dict[str, str], - invalid_labels: Dict[str, str], cloud: clouds.Cloud): + invalid_labels: Dict[str, str], + cloud: clouds.Cloud = None): """Run a test for labels with the given allowed and invalid labels.""" r_allowed = Resources(cloud=cloud, labels=allowed_labels) # Should pass assert r_allowed.labels == allowed_labels, ('Allowed labels ' @@ -92,6 +94,32 @@ def test_kubernetes_labels_resources(): _run_label_test(allowed_labels, invalid_labels, cloud) +def test_no_cloud_labels_resources(): + global_user_state.set_enabled_clouds(['aws', 'gcp']) + allowed_labels = { + **GLOBAL_VALID_LABELS, + } + invalid_labels = { + **GLOBAL_INVALID_LABELS, + 'aws:cannotstartwithaws': 'value', + 'domain/key': 'value', # Invalid for GCP + } + _run_label_test(allowed_labels, invalid_labels) + + +def test_no_cloud_labels_resources_single_enabled_cloud(): + global_user_state.set_enabled_clouds(['aws']) + allowed_labels = { + **GLOBAL_VALID_LABELS, + 'domain/key': 'value', # Valid for AWS + } + invalid_labels = { + **GLOBAL_INVALID_LABELS, + 'aws:cannotstartwithaws': 'value', + } + _run_label_test(allowed_labels, invalid_labels) + + @mock.patch('sky.clouds.service_catalog.instance_type_exists', return_value=True) @mock.patch('sky.clouds.service_catalog.get_accelerators_from_instance_type', From b1f22c4d5fe0a3cc25d1df0a8a05d4230a28b702 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Mon, 30 Sep 2024 23:37:47 -0700 Subject: [PATCH 3/3] [Docs] Add readme for inferentia example (#4024) * [Docs] Add readme for inferentia example * fold yaml * Update examples/aws-neuron/README.md Co-authored-by: Zhanghao Wu * Update examples/aws-neuron/README.md Co-authored-by: Zhanghao Wu * Update examples/aws-neuron/README.md Co-authored-by: Zhanghao Wu --------- Co-authored-by: Zhanghao Wu --- examples/aws-neuron/README.md | 117 ++++++++++++++++++ ...ccelerator.yaml => multi-accelerator.yaml} | 0 2 files changed, 117 insertions(+) create mode 100644 examples/aws-neuron/README.md rename examples/aws-neuron/{mix-accelerator.yaml => multi-accelerator.yaml} (100%) diff --git a/examples/aws-neuron/README.md b/examples/aws-neuron/README.md new file mode 100644 index 00000000000..38ffba7b885 --- /dev/null +++ b/examples/aws-neuron/README.md @@ -0,0 +1,117 @@ +# AWS Inferentia + +SkyPilot supports AWS Inferentia accelerators. 
The Neuron SDK is a runtime and compiler for running deep learning models on AWS Inferentia chips. Here is an example of how to use the Neuron SDK to launch a Llama 3 8B model on Inferentia chips:
+
+```bash
+$ sky launch -c aws-inf inferentia.yaml --env HF_TOKEN=hf_xxx
+```
+
+To send an example request to the model, you can use the following command:
+
+```bash
+$ ENDPOINT=$(sky status aws-inf --endpoint 9000)
+$ curl http://$ENDPOINT/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+      "messages": [
+        {
+          "role": "system",
+          "content": "You are a helpful assistant."
+        },
+        {
+          "role": "user",
+          "content": "Who are you?"
+        }
+      ],
+      "stop_token_ids": [128009, 128001]
+    }'
+{"id":"chat-0631550312c143d88ca6d477d0df6c2c","object":"chat.completion","created":1727751137,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"I'm a helpful assistant! I","tool_calls":[]},"logprobs":null,"finish_reason":"length","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":32,"completion_tokens":7},"prompt_logprobs":null}
+```
+
+## Using multiple accelerator choices
+
+You can also specify multiple resources in a task YAML to let SkyPilot find the cheapest available resources for you. Specifically, you can specify both Neuron accelerators and NVIDIA GPUs in the same YAML file. Here is an example (see [multi-accelerator.yaml](./multi-accelerator.yaml)):
+
+<details>
+<summary>Example YAML for multiple accelerators.</summary>
+
+```yaml
+resources:
+  accelerators: {A100:1, Inferentia:6}
+  disk_size: 512
+  ports: 9000
+
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  HF_TOKEN: # fill
+
+setup: |
+  if command -v nvidia-smi; then
+    pip install vllm==0.4.2
+    pip install flash-attn==2.5.9.post1
+  else
+    # Install transformers-neuronx and its dependencies
+    sudo apt-get install -y python3.10-venv g++
+    python3.10 -m venv aws_neuron_venv_pytorch
+    source aws_neuron_venv_pytorch/bin/activate
+    pip install ipykernel
+    python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
+    pip install jupyter notebook
+    pip install environment_kernels
+    python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+    python -m pip install wget
+    python -m pip install awscli
+    python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
+
+    # Install latest version of triton.
+    # Reference: https://github.com/vllm-project/vllm/issues/6987
+    pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly
+
+    # Install vLLM from source. Avoid using dir name 'vllm' due to import conflict.
+    # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930
+    git clone https://github.com/vllm-project/vllm.git vllm_repo
+    cd vllm_repo
+    pip install -U -r requirements-neuron.txt
+    VLLM_TARGET_DEVICE="neuron" pip install -e .
+
+    python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
+
+    sudo apt update
+    sudo apt install -y numactl
+  fi
+
+run: |
+  if command -v nvidia-smi; then
+    TENSOR_PARALLEL_SIZE=$SKYPILOT_NUM_GPUS_PER_NODE
+    PREFIX=""
+    DEVICE="cuda"
+  else
+    source aws_neuron_venv_pytorch/bin/activate
+    # Calculate the tensor parallel size. vLLM requires the tensor parallel size
+    # to be a factor of the number of attention heads, which is 32 for the model.
+    # Here we calculate the largest power of 2 that is less than or equal to the
+    # number of GPUs per node.
+    TENSOR_PARALLEL_SIZE=1
+    while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do
+      TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2))
+    done
+    NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))"
+    OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE
+    MASTER_PORT=12355
+    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib"
+    PREFIX="numactl --cpunodebind=0 --membind=0"
+    DEVICE="neuron"
+  fi
+  $PREFIX python3 -m vllm.entrypoints.openai.api_server \
+    --device $DEVICE \
+    --model $MODEL_NAME \
+    --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+    --max-num-seqs 16 \
+    --max-model-len 32 \
+    --block-size 32 \
+    --port 9000
+```
+
+</details>
diff --git a/examples/aws-neuron/mix-accelerator.yaml b/examples/aws-neuron/multi-accelerator.yaml similarity index 100% rename from examples/aws-neuron/mix-accelerator.yaml rename to examples/aws-neuron/multi-accelerator.yaml
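The README above only shows launching `inferentia.yaml`. As a minimal sketch of the multi-accelerator path added by this patch (the cluster name `aws-multi-acc` is illustrative and not part of the patch), the renamed `multi-accelerator.yaml` can presumably be launched and queried the same way:

```bash
# Let SkyPilot pick the cheapest available option among {A100:1, Inferentia:6}
# and bring up the vLLM OpenAI-compatible server defined in the task YAML.
$ sky launch -c aws-multi-acc examples/aws-neuron/multi-accelerator.yaml --env HF_TOKEN=hf_xxx

# Fetch the endpoint exposed on port 9000 (same pattern as the README) and
# list the served model to confirm the server is up.
$ ENDPOINT=$(sky status aws-multi-acc --endpoint 9000)
$ curl http://$ENDPOINT/v1/models
```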