Llama 3.2 11B Instruct vision on 1 x L4 GPU (#258)
samos123 authored Oct 1, 2024
1 parent ffbe290 commit 8e94504
Showing 4 changed files with 77 additions and 2 deletions.
4 changes: 2 additions & 2 deletions charts/kubeai/values.yaml
@@ -17,9 +17,9 @@ modelServers:
     # The key is the image name (referenced from resourceProfiles) and the value is the image.
     # The "default" image should always be specified.
     # "default" is used when no imageName is specified or if a specific image is not found.
-    default: "vllm/vllm-openai:v0.6.1.post2"
+    default: "vllm/vllm-openai:v0.6.2"
     cpu: "substratusai/vllm:v0.6.1-cpu"
-    nvidia-gpu: "vllm/vllm-openai:v0.6.1.post2"
+    nvidia-gpu: "vllm/vllm-openai:v0.6.2"
     google-tpu: "substratusai/vllm:v0.6.1-tpu"
   OLlama:
     images:
20 changes: 20 additions & 0 deletions charts/models/values.yaml
@@ -3,6 +3,26 @@ all:
enabled: false

catalog:
  llama-3.2-11b-vision-instruct-l4:
    enabled: false
    features: [TextGeneration]
    url: hf://neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic
    engine: VLLM
    env:
      VLLM_WORKER_MULTIPROC_METHOD: spawn
    args:
      - --max-model-len=8192
      - --max-num-batched-tokens=8192
      - --gpu-memory-utilization=0.99
      - --enforce-eager
      - --disable-log-requests
      - --max-num-seqs=16
      # Setting --kv-cache-dtype=fp8 is broken in vLLM 0.6.2, so it is left unset.
    resourceProfile: nvidia-gpu-l4:1
    minReplicas: 1
    maxReplicas: 1
    targetRequests: 32
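The catalog entry above ships with `enabled: false`. A minimal sketch of turning it on with a Helm values override (the file name is illustrative, and the exact release and chart names depend on how the charts were installed):

```yaml
# model-overrides.yaml (hypothetical file name)
catalog:
  llama-3.2-11b-vision-instruct-l4:
    enabled: true
```

This would be applied with something like `helm upgrade --install kubeai-models ./charts/models -f model-overrides.yaml`, assuming the chart is installed from this repository.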
# Mistral #
e5-mistral-7b-instruct-cpu:
enabled: false
32 changes: 32 additions & 0 deletions docs/benchmarks/llama-3.2-11b-vision.md
@@ -0,0 +1,32 @@
# Llama 3.2 11B Vision Instruct vLLM Benchmarks


Results on a single L4 GPU running vLLM 0.6.2:
```
python3 benchmark_serving.py --backend openai \
--base-url http://localhost:8000/openai \
--dataset-name=sharegpt --dataset-path=ShareGPT_V3_unfiltered_cleaned_split.json \
--model meta-llama-3.2-11b-vision-instruct \
--seed 12345 --tokenizer neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic
============ Serving Benchmark Result ============
Successful requests: 1000
Benchmark duration (s): 681.93
Total input tokens: 230969
Total generated tokens: 194523
Request throughput (req/s): 1.47
Output token throughput (tok/s): 285.25
Total Token throughput (tok/s): 623.95
---------------Time to First Token----------------
Mean TTFT (ms): 319146.12
Median TTFT (ms): 322707.98
P99 TTFT (ms): 642512.79
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 54.84
Median TPOT (ms): 53.66
P99 TPOT (ms): 83.75
---------------Inter-token Latency----------------
Mean ITL (ms): 54.09
Median ITL (ms): 47.44
P99 ITL (ms): 216.77
==================================================
```
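The headline throughput figures can be re-derived from the raw counts and the benchmark duration reported above, which is a quick sanity check on the table (all values copied from the results):

```python
# Re-derive the reported throughput figures from the raw benchmark numbers.
duration_s = 681.93      # Benchmark duration (s)
requests = 1000          # Successful requests
input_tokens = 230969    # Total input tokens
output_tokens = 194523   # Total generated tokens

request_throughput = requests / duration_s                    # reported: 1.47 req/s
output_tok_per_s = output_tokens / duration_s                 # reported: 285.25 tok/s
total_tok_per_s = (input_tokens + output_tokens) / duration_s # reported: 623.95 tok/s

print(f"{request_throughput:.2f} req/s")
print(f"{output_tok_per_s:.2f} tok/s output")
print(f"{total_tok_per_s:.2f} tok/s total")
```

Note that the very high TTFT numbers (mean around 319 s) are likely a queueing effect: the benchmark submits all 1000 requests against a single replica capped at `--max-num-seqs=16`, so most of the time-to-first-token is spent waiting in the queue rather than in prefill.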
23 changes: 23 additions & 0 deletions manifests/models/llama-3.2-11b-vision-instruct-l4.yaml
@@ -0,0 +1,23 @@
# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: llama-3.2-11b-vision-instruct-l4
spec:
  features: [TextGeneration]
  owner:
  url: hf://neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic
  engine: VLLM
  args:
    - --max-model-len=8192
    - --max-num-batched-tokens=8192
    - --gpu-memory-utilization=0.99
    - --enforce-eager
    - --disable-log-requests
    - --max-num-seqs=16
  env:
    VLLM_WORKER_MULTIPROC_METHOD: spawn
  minReplicas: 1
  maxReplicas: 1
  targetRequests: 32
  resourceProfile: nvidia-gpu-l4:1
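Once the Model resource is applied, the model is served behind KubeAI's OpenAI-compatible endpoint (the benchmark above targets it via `--base-url http://localhost:8000/openai`). A sketch of the chat-completions request body for the vision model; the image URL is a placeholder, and the endpoint path and port depend on how the KubeAI service is exposed:

```python
import json

# Build an OpenAI-style multimodal chat completion request for the
# deployed Model resource. The "model" field matches the Model name
# from the manifest above; the image URL is purely illustrative.
payload = {
    "model": "llama-3.2-11b-vision-instruct-l4",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/sample.png"},
                },
            ],
        }
    ],
    "max_tokens": 256,
}
body = json.dumps(payload)
print(body)
```

The resulting JSON could then be POSTed to something like `http://localhost:8000/openai/v1/chat/completions`, assuming the same port-forwarded setup the benchmark used.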
