Skip to content

Commit

Permalink
add manual test of vLLM on GPU and TPU
Browse files Browse the repository at this point in the history
  • Loading branch information
samos123 committed Oct 19, 2024
1 parent 8593701 commit 6db0ad5
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 2 deletions.
3 changes: 3 additions & 0 deletions charts/kubeai/values-gke.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,23 @@ resourceProfiles:
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
cloud.google.com/gke-tpu-topology: "1x1"
cloud.google.com/gke-spot: "true"
google-tpu-v5e-2x2:
imageName: google-tpu
limits:
google.com/tpu: 1
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
cloud.google.com/gke-tpu-topology: "2x2"
cloud.google.com/gke-spot: "true"
google-tpu-v5e-2x4:
imageName: google-tpu
limits:
google.com/tpu: 1
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
cloud.google.com/gke-tpu-topology: "2x4"
cloud.google.com/gke-spot: "true"

cacheProfiles:
standard-filestore:
Expand Down
3 changes: 1 addition & 2 deletions charts/kubeai/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ modelServers:

modelLoaders:
huggingface:
# TODO: Update image to the one built with GH Actions.
image: "us-central1-docker.pkg.dev/substratus-dev/default/huggingface-model-downloader:v0.0.1"
image: "substratusai/huggingface-model-loader:main"

modelServerPods:
# Security Context for the model pods
Expand Down
46 changes: 46 additions & 0 deletions test/e2e/gke-vllm-gpu-tpu/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash

# End-to-end test: install the latest KubeAI chart on a GKE Autopilot
# cluster and verify scale-from-zero inference on both a GPU (L4, fp8)
# model and a TPU (v5e) model served by vLLM.
#
# Requirements: helm, kubectl configured for the target cluster, curl,
# and HF_TOKEN exported in the environment.

set -ex

# Fail fast with a clear message if the Hugging Face token is missing;
# otherwise helm would install an empty secret and model downloads would
# only fail much later, deep inside the test.
: "${HF_TOKEN:?HF_TOKEN must be set to a valid Hugging Face token}"

# Install KubeAI with GKE resource profiles, the HF token secret, and the
# main-branch images (controller + model loader) so the test exercises the
# latest builds.
helm install kubeai ./charts/kubeai \
    -f ./charts/kubeai/values-gke.yaml \
    -f - <<EOF
secrets:
  huggingface:
    token: "${HF_TOKEN}"
modelLoaders:
  huggingface:
    image: "substratusai/huggingface-model-loader:main"
image:
  tag: "main"
  pullPolicy: "Always"
EOF

sleep 5

# Wait for the KubeAI controller/proxy pod to be Ready before proceeding.
kubectl wait --for=condition=ready pod \
    -l app.kubernetes.io/name=kubeai --timeout=300s

# Enable one GPU-backed and one TPU-backed model from the catalog, both
# using the premium Filestore cache profile.
helm install kubeai-models ./charts/models -f - <<EOF
catalog:
  llama-3.1-8b-instruct-fp8-l4:
    enabled: true
    cacheProfile: premium-filestore
  llama-3.1-8b-instruct-tpu:
    enabled: true
    cacheProfile: premium-filestore
EOF

kubectl port-forward svc/kubeai 8000:80 &
PORT_FORWARD_PID=$!
# Ensure the backgrounded port-forward does not outlive the script.
trap 'kill "${PORT_FORWARD_PID}" 2>/dev/null || true' EXIT

# Give the port-forward tunnel a moment to establish before sending traffic.
sleep 3

# Test scale-from-zero on GPU. The first request blocks while the model pod
# is scheduled, the image is pulled, and weights are loaded, so allow a long
# timeout. --fail makes a non-2xx response fail the script (set -e).
curl -v --fail --max-time 1200 http://localhost:8000/openai/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "llama-3.1-8b-instruct-fp8-l4", "prompt": "Who was the first president of the United States?", "max_tokens": 40}'

# Test scale-from-zero on TPU.
curl -v --fail --max-time 1200 http://localhost:8000/openai/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "llama-3.1-8b-instruct-tpu", "prompt": "Who was the first president of the United States?", "max_tokens": 40}'

0 comments on commit 6db0ad5

Please sign in to comment.