From 8e9450499d9ebc222903a6bf9e903546c7ddd31f Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Mon, 30 Sep 2024 19:59:37 -0700
Subject: [PATCH] Llama 3.2 11B Vision Instruct on 1 x L4 GPU (#258)

---
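To try the model once this lands, either apply the standalone manifest added
by this PR or enable the new catalog entry in the models chart. A minimal
sketch; the Helm release name below is an assumption:

```bash
# Option 1: apply the raw Model manifest added in this PR.
kubectl apply -f manifests/models/llama-3.2-11b-vision-instruct-l4.yaml

# Option 2: enable the catalog entry through the models chart
# (the release name "kubeai-models" is an assumption).
helm upgrade --install kubeai-models ./charts/models \
  --set catalog.llama-3.2-11b-vision-instruct-l4.enabled=true
```
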
 charts/kubeai/values.yaml                 |  4 +--
 charts/models/values.yaml                 | 20 ++++++++++++
 docs/benchmarks/llama-3.2-11b-vision.md   | 32 +++++++++++++++++++
 .../llama-3.2-11b-vision-instruct-l4.yaml | 23 +++++++++++++
 4 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 docs/benchmarks/llama-3.2-11b-vision.md
 create mode 100644 manifests/models/llama-3.2-11b-vision-instruct-l4.yaml

diff --git a/charts/kubeai/values.yaml b/charts/kubeai/values.yaml
index b275036b..62001ac6 100644
--- a/charts/kubeai/values.yaml
+++ b/charts/kubeai/values.yaml
@@ -17,9 +17,9 @@ modelServers:
       # The key is the image name (referenced from resourceProfiles) and the value is the image.
       # The "default" image should always be specified.
       # "default" is used when no imageName is specified or if a specific image is not found.
-      default: "vllm/vllm-openai:v0.6.1.post2"
+      default: "vllm/vllm-openai:v0.6.2"
       cpu: "substratusai/vllm:v0.6.1-cpu"
-      nvidia-gpu: "vllm/vllm-openai:v0.6.1.post2"
+      nvidia-gpu: "vllm/vllm-openai:v0.6.2"
       google-tpu: "substratusai/vllm:v0.6.1-tpu"
   OLlama:
     images:
diff --git a/charts/models/values.yaml b/charts/models/values.yaml
index 367ebe91..97df836e 100644
--- a/charts/models/values.yaml
+++ b/charts/models/values.yaml
@@ -3,6 +3,26 @@ all:
   enabled: false
 
 catalog:
+  llama-3.2-11b-vision-instruct-l4:
+    enabled: false
+    features: [TextGeneration]
+    url: hf://neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic
+    engine: VLLM
+    env:
+      VLLM_WORKER_MULTIPROC_METHOD: spawn
+    args:
+      - --max-model-len=8192
+      - --max-num-batched-tokens=8192
+      - --gpu-memory-utilization=0.99
+      - --enforce-eager
+      - --disable-log-requests
+      - --max-num-seqs=16
+      # Setting --kv-cache-dtype=fp8 is broken in vLLM 0.6.2.
+      # - --kv-cache-dtype=fp8
+    resourceProfile: nvidia-gpu-l4:1
+    minReplicas: 1
+    maxReplicas: 1
+    targetRequests: 32
   # Mistral #
   e5-mistral-7b-instruct-cpu:
     enabled: false
diff --git a/docs/benchmarks/llama-3.2-11b-vision.md b/docs/benchmarks/llama-3.2-11b-vision.md
new file mode 100644
index 00000000..d2a83d59
--- /dev/null
+++ b/docs/benchmarks/llama-3.2-11b-vision.md
@@ -0,0 +1,32 @@
+# Llama 3.2 11B Vision Instruct vLLM Benchmarks
+
+
+Results on a single L4 GPU with vLLM 0.6.2:
+```
+python3 benchmark_serving.py --backend openai \
+    --base-url http://localhost:8000/openai \
+    --dataset-name=sharegpt --dataset-path=ShareGPT_V3_unfiltered_cleaned_split.json \
+    --model meta-llama-3.2-11b-vision-instruct \
+    --seed 12345 --tokenizer neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic
+============ Serving Benchmark Result ============
+Successful requests:                     1000
+Benchmark duration (s):                  681.93
+Total input tokens:                      230969
+Total generated tokens:                  194523
+Request throughput (req/s):              1.47
+Output token throughput (tok/s):         285.25
+Total Token throughput (tok/s):          623.95
+---------------Time to First Token----------------
+Mean TTFT (ms):                          319146.12
+Median TTFT (ms):                        322707.98
+P99 TTFT (ms):                           642512.79
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms):                          54.84
+Median TPOT (ms):                        53.66
+P99 TPOT (ms):                           83.75
+---------------Inter-token Latency----------------
+Mean ITL (ms):                           54.09
+Median ITL (ms):                         47.44
+P99 ITL (ms):                            216.77
+==================================================
+```
\ No newline at end of file
diff --git a/manifests/models/llama-3.2-11b-vision-instruct-l4.yaml b/manifests/models/llama-3.2-11b-vision-instruct-l4.yaml
new file mode 100644
index 00000000..339c6986
--- /dev/null
+++ b/manifests/models/llama-3.2-11b-vision-instruct-l4.yaml
@@ -0,0 +1,23 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: llama-3.2-11b-vision-instruct-l4
+spec:
+  features: [TextGeneration]
+  owner:
+  url: hf://neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic
+  engine: VLLM
+  args:
+    - --max-model-len=8192
+    - --max-num-batched-tokens=8192
+    - --gpu-memory-utilization=0.99
+    - --enforce-eager
+    - --disable-log-requests
+    - --max-num-seqs=16
+  env:
+    VLLM_WORKER_MULTIPROC_METHOD: spawn
+  minReplicas: 1
+  maxReplicas: 1
+  targetRequests: 32
+  resourceProfile: nvidia-gpu-l4:1
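
Once the Model is applied, requests go through KubeAI's OpenAI-compatible
endpoint, the same `/openai` base URL used in the benchmark above. A minimal
sketch of a vision request against this model; the port-forward command and
the image URL are placeholder assumptions:

```bash
# Assumes the KubeAI service is reachable on localhost:8000, e.g. via:
#   kubectl port-forward svc/kubeai 8000:80
curl http://localhost:8000/openai/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-3.2-11b-vision-instruct-l4",
    "messages": [{
      "role": "user",
      "content": [
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}
      ]
    }],
    "max_tokens": 128
  }'
```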