Skip to content

Commit

Permalink
add manual test of vLLM on GPU and TPU
Browse files Browse the repository at this point in the history
  • Loading branch information
samos123 committed Oct 19, 2024
1 parent 8593701 commit 6db0ad5
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 2 deletions.
3 changes: 3 additions & 0 deletions charts/kubeai/values-gke.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,23 @@ resourceProfiles:
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
cloud.google.com/gke-tpu-topology: "1x1"
cloud.google.com/gke-spot: "true"
google-tpu-v5e-2x2:
imageName: google-tpu
limits:
google.com/tpu: 1
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
cloud.google.com/gke-tpu-topology: "2x2"
cloud.google.com/gke-spot: "true"
google-tpu-v5e-2x4:
imageName: google-tpu
limits:
google.com/tpu: 1
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
cloud.google.com/gke-tpu-topology: "2x4"
cloud.google.com/gke-spot: "true"

cacheProfiles:
standard-filestore:
Expand Down
3 changes: 1 addition & 2 deletions charts/kubeai/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ modelServers:

modelLoaders:
huggingface:
# TODO: Update image to the one built with GH Actions.
image: "us-central1-docker.pkg.dev/substratus-dev/default/huggingface-model-downloader:v0.0.1"
image: "substratusai/huggingface-model-loader:main"

modelServerPods:
# Security Context for the model pods
Expand Down
46 changes: 46 additions & 0 deletions test/e2e/gke-vllm-gpu-tpu/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash

# End-to-end test: install the latest KubeAI chart on a GKE Autopilot
# cluster and verify scale-from-zero inference on both a GPU (L4, fp8)
# model and a TPU (v5e) model served by vLLM.
#
# Requirements: helm, kubectl configured for the target cluster, curl,
# and HF_TOKEN exported in the environment.

set -ex

# Fail fast with a clear message if the Hugging Face token is missing;
# otherwise helm would install an empty secret and model downloads would
# only fail much later, deep inside the test.
: "${HF_TOKEN:?HF_TOKEN must be set to a valid Hugging Face token}"

# Install KubeAI with GKE resource profiles, the HF token secret, and the
# main-branch images (controller + model loader) so the test exercises the
# latest builds.
helm install kubeai ./charts/kubeai \
    -f ./charts/kubeai/values-gke.yaml \
    -f - <<EOF
secrets:
  huggingface:
    token: "${HF_TOKEN}"
modelLoaders:
  huggingface:
    image: "substratusai/huggingface-model-loader:main"
image:
  tag: "main"
  pullPolicy: "Always"
EOF

sleep 5

# Wait for the KubeAI controller/proxy pod to be Ready before proceeding.
kubectl wait --for=condition=ready pod \
    -l app.kubernetes.io/name=kubeai --timeout=300s

# Enable one GPU-backed and one TPU-backed model from the catalog, both
# using the premium Filestore cache profile.
helm install kubeai-models ./charts/models -f - <<EOF
catalog:
  llama-3.1-8b-instruct-fp8-l4:
    enabled: true
    cacheProfile: premium-filestore
  llama-3.1-8b-instruct-tpu:
    enabled: true
    cacheProfile: premium-filestore
EOF

kubectl port-forward svc/kubeai 8000:80 &
PORT_FORWARD_PID=$!
# Ensure the backgrounded port-forward does not outlive the script.
trap 'kill "${PORT_FORWARD_PID}" 2>/dev/null || true' EXIT

# Give the port-forward tunnel a moment to establish before sending traffic.
sleep 3

# Test scale-from-zero on GPU. The first request blocks while the model pod
# is scheduled, the image is pulled, and weights are loaded, so allow a long
# timeout. --fail makes a non-2xx response fail the script (set -e).
curl -v --fail --max-time 1200 http://localhost:8000/openai/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "llama-3.1-8b-instruct-fp8-l4", "prompt": "Who was the first president of the United States?", "max_tokens": 40}'

# Test scale-from-zero on TPU.
curl -v --fail --max-time 1200 http://localhost:8000/openai/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "llama-3.1-8b-instruct-tpu", "prompt": "Who was the first president of the United States?", "max_tokens": 40}'

0 comments on commit 6db0ad5

Please sign in to comment.