
Commit

update to the vertex model garden images and respective environment variables (#1474)

kenthua authored Oct 3, 2024
1 parent 70751f8 commit c9e5dc1
Showing 6 changed files with 12 additions and 24 deletions.
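
All six manifests follow the same pattern: the upstream vllm/vllm-openai image is swapped for the Vertex AI Model Garden vLLM serving image, the VLLM_ATTENTION_BACKEND=FLASHINFER environment variable is removed, and the base-model manifests also get more ephemeral storage. The sketch below is illustrative only (not part of the commit): it shows how the container block of vllm-2-27b.yaml reads after the change, assembled from the hunks that follow. The container args and the Secret name/key are not visible in the diff, so they are omitted or marked as assumptions.

```yaml
# Illustrative sketch, not part of this commit: the vllm-2-27b.yaml container
# block after this change. Fields not visible in the diff (args, ports) are
# omitted; the secretKeyRef name/key below are assumptions.
containers:
- name: inference-server
  image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
  resources:
    requests:
      cpu: "10"
      memory: "80Gi"
      ephemeral-storage: "120Gi"   # raised from 80Gi in this commit
      nvidia.com/gpu: "4"
    limits:
      cpu: "10"
      memory: "80Gi"
      ephemeral-storage: "120Gi"   # raised from 80Gi in this commit
      nvidia.com/gpu: "4"
  command: ["python3", "-m", "vllm.entrypoints.api_server"]
  env:
  - name: MODEL_ID
    value: google/gemma-2-27b
  # VLLM_ATTENTION_BACKEND=FLASHINFER is removed by this commit
  - name: HUGGING_FACE_HUB_TOKEN
    valueFrom:
      secretKeyRef:
        name: hf-secret       # assumed name; the diff truncates before these fields
        key: hf_api_token     # assumed key
```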
ai-ml/llm-serving-gemma/vllm/vllm-2-27b-it.yaml (1 addition, 3 deletions)
@@ -32,7 +32,7 @@ spec:
     spec:
       containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "10"
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-27b-it
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
ai-ml/llm-serving-gemma/vllm/vllm-2-27b.yaml (3 additions, 5 deletions)
@@ -32,17 +32,17 @@ spec:
     spec:
       containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "10"
             memory: "80Gi"
-            ephemeral-storage: "80Gi"
+            ephemeral-storage: "120Gi"
             nvidia.com/gpu: "4"
           limits:
             cpu: "10"
             memory: "80Gi"
-            ephemeral-storage: "80Gi"
+            ephemeral-storage: "120Gi"
             nvidia.com/gpu: "4"
         command: ["python3", "-m", "vllm.entrypoints.api_server"]
         args:
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-27b
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
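
Each manifest keeps reading the Hugging Face token from a Kubernetes Secret via secretKeyRef, and this commit does not touch that wiring; the referenced Secret name and key are cut off in the diff. For reference, a minimal companion Secret could look like the sketch below, where hf-secret and hf_api_token are assumptions rather than values taken from this commit.

```yaml
# Hypothetical companion Secret, not part of this commit. The manifests above
# reference a Secret via secretKeyRef; its actual name and key are truncated
# in the diff, so hf-secret / hf_api_token here are assumptions.
apiVersion: v1
kind: Secret
metadata:
  name: hf-secret
type: Opaque
stringData:
  hf_api_token: "<your Hugging Face access token>"
```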
ai-ml/llm-serving-gemma/vllm/vllm-2-2b-it.yaml (1 addition, 3 deletions)
@@ -32,7 +32,7 @@ spec:
     spec:
       containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "2"
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-2b-it
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
ai-ml/llm-serving-gemma/vllm/vllm-2-2b.yaml (3 additions, 5 deletions)
@@ -32,17 +32,17 @@ spec:
     spec:
       containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "2"
             memory: "10Gi"
-            ephemeral-storage: "10Gi"
+            ephemeral-storage: "12Gi"
             nvidia.com/gpu: "1"
           limits:
             cpu: "2"
             memory: "10Gi"
-            ephemeral-storage: "10Gi"
+            ephemeral-storage: "12Gi"
             nvidia.com/gpu: "1"
         command: ["python3", "-m", "vllm.entrypoints.api_server"]
         args:
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-2b
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
ai-ml/llm-serving-gemma/vllm/vllm-2-9b-it.yaml (1 addition, 3 deletions)
@@ -32,7 +32,7 @@ spec:
     spec:
       containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "4"
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-9b-it
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
ai-ml/llm-serving-gemma/vllm/vllm-2-9b.yaml (3 additions, 5 deletions)
@@ -32,17 +32,17 @@ spec:
     spec:
       containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "4"
             memory: "30Gi"
-            ephemeral-storage: "30Gi"
+            ephemeral-storage: "45Gi"
             nvidia.com/gpu: "2"
           limits:
             cpu: "4"
             memory: "30Gi"
-            ephemeral-storage: "30Gi"
+            ephemeral-storage: "45Gi"
             nvidia.com/gpu: "2"
         command: ["python3", "-m", "vllm.entrypoints.api_server"]
         args:
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-9b
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
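
None of these files changes how the inference server is exposed. For context, Deployments like these are typically paired with a cluster-internal Service along the lines of the sketch below; the gemma-server selector label and port 8000 (vLLM's default serving port) are assumptions, not taken from this commit.

```yaml
# Hypothetical Service sketch, not part of this commit. The selector label and
# the 8000 target port are assumptions based on the usual GKE Gemma samples
# and vLLM's default serving port.
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  type: ClusterIP
  selector:
    app: gemma-server
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
```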
