diff --git a/ai-ml/llm-serving-gemma/vllm/vllm-2-27b-it.yaml b/ai-ml/llm-serving-gemma/vllm/vllm-2-27b-it.yaml
index 8e5bf334e..f0cad649c 100644
--- a/ai-ml/llm-serving-gemma/vllm/vllm-2-27b-it.yaml
+++ b/ai-ml/llm-serving-gemma/vllm/vllm-2-27b-it.yaml
@@ -32,7 +32,7 @@ spec:
     spec:
       containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "10"
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-27b-it
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
diff --git a/ai-ml/llm-serving-gemma/vllm/vllm-2-27b.yaml b/ai-ml/llm-serving-gemma/vllm/vllm-2-27b.yaml
index 7670a7f7b..db68f6c5f 100644
--- a/ai-ml/llm-serving-gemma/vllm/vllm-2-27b.yaml
+++ b/ai-ml/llm-serving-gemma/vllm/vllm-2-27b.yaml
@@ -32,17 +32,17 @@ spec:
     spec:
       containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "10"
             memory: "80Gi"
-            ephemeral-storage: "80Gi"
+            ephemeral-storage: "120Gi"
             nvidia.com/gpu: "4"
           limits:
             cpu: "10"
             memory: "80Gi"
-            ephemeral-storage: "80Gi"
+            ephemeral-storage: "120Gi"
             nvidia.com/gpu: "4"
         command: ["python3", "-m", "vllm.entrypoints.api_server"]
         args:
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-27b
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
diff --git a/ai-ml/llm-serving-gemma/vllm/vllm-2-2b-it.yaml b/ai-ml/llm-serving-gemma/vllm/vllm-2-2b-it.yaml
index e5e0f9c29..7ee272517 100644
--- a/ai-ml/llm-serving-gemma/vllm/vllm-2-2b-it.yaml
+++ b/ai-ml/llm-serving-gemma/vllm/vllm-2-2b-it.yaml
@@ -32,7 +32,7 @@ spec:
     spec:
       containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "2"
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-2b-it
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
diff --git a/ai-ml/llm-serving-gemma/vllm/vllm-2-2b.yaml b/ai-ml/llm-serving-gemma/vllm/vllm-2-2b.yaml
index ebd6de059..d9a9ef842 100644
--- a/ai-ml/llm-serving-gemma/vllm/vllm-2-2b.yaml
+++ b/ai-ml/llm-serving-gemma/vllm/vllm-2-2b.yaml
@@ -32,17 +32,17 @@ spec:
     spec:
      containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "2"
             memory: "10Gi"
-            ephemeral-storage: "10Gi"
+            ephemeral-storage: "12Gi"
             nvidia.com/gpu: "1"
           limits:
             cpu: "2"
             memory: "10Gi"
-            ephemeral-storage: "10Gi"
+            ephemeral-storage: "12Gi"
             nvidia.com/gpu: "1"
         command: ["python3", "-m", "vllm.entrypoints.api_server"]
         args:
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-2b
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
diff --git a/ai-ml/llm-serving-gemma/vllm/vllm-2-9b-it.yaml b/ai-ml/llm-serving-gemma/vllm/vllm-2-9b-it.yaml
index 37eca601a..93dc24c50 100644
--- a/ai-ml/llm-serving-gemma/vllm/vllm-2-9b-it.yaml
+++ b/ai-ml/llm-serving-gemma/vllm/vllm-2-9b-it.yaml
@@ -32,7 +32,7 @@ spec:
     spec:
       containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "4"
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-9b-it
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
diff --git a/ai-ml/llm-serving-gemma/vllm/vllm-2-9b.yaml b/ai-ml/llm-serving-gemma/vllm/vllm-2-9b.yaml
index fe77962ff..9d89121af 100644
--- a/ai-ml/llm-serving-gemma/vllm/vllm-2-9b.yaml
+++ b/ai-ml/llm-serving-gemma/vllm/vllm-2-9b.yaml
@@ -32,17 +32,17 @@ spec:
     spec:
       containers:
       - name: inference-server
-        image: vllm/vllm-openai:v0.5.5
+        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240930_0945_RC00
         resources:
           requests:
             cpu: "4"
             memory: "30Gi"
-            ephemeral-storage: "30Gi"
+            ephemeral-storage: "45Gi"
             nvidia.com/gpu: "2"
           limits:
             cpu: "4"
             memory: "30Gi"
-            ephemeral-storage: "30Gi"
+            ephemeral-storage: "45Gi"
             nvidia.com/gpu: "2"
         command: ["python3", "-m", "vllm.entrypoints.api_server"]
         args:
@@ -51,8 +51,6 @@ spec:
         env:
         - name: MODEL_ID
           value: google/gemma-2-9b
-        - name: VLLM_ATTENTION_BACKEND
-          value: FLASHINFER
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef: