Start implementing server image names

nstogner committed Aug 31, 2024
1 parent 8765ad9 commit 03e0aa5
Showing 6 changed files with 63 additions and 37 deletions.
4 changes: 4 additions & 0 deletions api/v1/model_types.go
@@ -38,6 +38,10 @@ type ModelSpec struct {
    // ResourceProfile maps to specific pre-configured resources.
    ResourceProfile string `json:"resourceProfile,omitempty"`

+   // Image to be used for the server process.
+   // Will be set from the ResourceProfile if provided.
+   Image string `json:"image,omitempty"`
+
    // Resources to be allocated to the server process.
    // Will be set from the ResourceProfile if provided.
    Resources *corev1.ResourceRequirements `json:"resources,omitempty"`
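Note on the new field: the controller only fills spec.image when it is empty (see the applyResourceProfile change below), so setting it explicitly pins the server image regardless of the resource profile. A minimal sketch of such a Model manifest, assuming the kubeai.org/v1 API group from this repo's api/v1 package (the model name is hypothetical):

apiVersion: kubeai.org/v1  # assumed from the api/v1 package
kind: Model
metadata:
  name: opt-125m-pinned-image  # hypothetical
spec:
  features: ["TextGeneration"]
  owner: facebook
  url: hf://facebook/opt-125m
  engine: VLLM
  resourceProfile: CPU:1
  # Explicitly set image: applyResourceProfile leaves a non-empty
  # spec.image untouched, so profile-based lookup is skipped.
  image: substratusai/vllm-openai-cpu:v0.5.5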
16 changes: 8 additions & 8 deletions charts/kubeai/charts/models/values.yaml
@@ -9,7 +9,7 @@ catalog:
    features: ["TextEmbedding"]
    owner: intfloat
    url: "hf://intfloat/e5-mistral-7b-instruct"
-   server: VLLM
+   engine: VLLM
    resourceProfile: CPU:1
    args:
      - --gpu-memory-utilization=0.9
@@ -19,15 +19,15 @@ catalog:
    features: ["TextGeneration"]
    owner: google
    url: "ollama://gemma2:2b"
-   server: OLlama
+   engine: OLlama
    resourceProfile: CPU:2
  # Llama #
  llama-3.1-8b-instruct-cpu:
    enabled: false
    features: ["TextGeneration"]
    owner: "meta-llama"
    url: "hf://meta-llama/Meta-Llama-3.1-8B-Instruct"
-   server: VLLM
+   engine: VLLM
    resourceProfile: CPU:6
    env:
      VLLM_CPU_KVCACHE_SPACE: "4"
@@ -39,7 +39,7 @@ catalog:
    features: ["TextGeneration"]
    owner: "neuralmagic"
    url: "hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
-   server: VLLM
+   engine: VLLM
    resourceProfile: NVIDIA_GPU_L4:1
    args:
      - --max-model-len=16384
@@ -51,29 +51,29 @@ catalog:
    features: ["TextEmbedding"]
    owner: nomic
    url: "ollama://nomic-embed-text"
-   server: OLlama
+   engine: OLlama
    resourceProfile: CPU:1
  # Opt #
  opt-125m-cpu:
    enabled: false
    features: ["TextGeneration"]
    owner: facebook
    url: "hf://facebook/opt-125m"
-   server: VLLM
+   engine: VLLM
    resourceProfile: CPU:1
  opt-125m-l4:
    enabled: false
    features: ["TextGeneration"]
    owner: facebook
    url: "hf://facebook/opt-125m"
-   server: VLLM
+   engine: VLLM
    resourceProfile: NVIDIA_GPU_L4:1
  # Qwen #
  qwen2-500m-cpu:
    enabled: false
    features: ["TextGeneration"]
    owner: alibaba
    url: "ollama://qwen2:0.5b"
-   server: OLlama
+   engine: OLlama
    resourceProfile: CPU:1

16 changes: 10 additions & 6 deletions charts/kubeai/values.yaml
@@ -14,21 +14,24 @@ secrets:
modelServers:
  VLLM:
    images:
-     # The key is the resource profile name (with "*" matching), and the value is the image to use.
-     "*": "vllm/vllm-openai:v0.5.5"
-     "CPU*": "substratusai/vllm-openai-cpu:v0.5.5"
-     "NVIDIA_GPU*": "vllm/vllm-openai:v0.5.5"
-     "GOOGLE_TPU*": "substratusai/vllm-openai-tpu:v0.5.5"
+     # The key is the image name (referenced from resourceProfiles) and the value is the image.
+     # "default" is used when no imageName is specified or if a specific image is not found.
+     default: "vllm/vllm-openai:v0.5.5"
+     cpu: "substratusai/vllm-openai-cpu:v0.5.5"
+     nvidia-gpu: "vllm/vllm-openai:v0.5.5"
+     google-tpu: "substratusai/vllm-openai-tpu:v0.5.5"
  OLlama:
    images:
-     "*": "ollama/ollama:latest"
+     default: "ollama/ollama:latest"

resourceProfiles:
  CPU:
+   imageName: "cpu"
    requests:
      cpu: 1
      memory: "2Gi"
  NVIDIA_GPU_L4:
+   imageName: "nvidia-gpu"
    limits:
      nvidia.com/gpu: "1"
    requests:
@@ -40,6 +43,7 @@ resourceProfiles:
  # NVIDIA_GPU_A100:
  #   ...
  GOOGLE_TPU_V5E:
+   imageName: "google-tpu"
    requests:
      google.com/tpu: 4
    limits:
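Tracing a lookup through these values: a VLLM model on the NVIDIA_GPU_L4 profile picks up imageName "nvidia-gpu", and modelServers.VLLM.images["nvidia-gpu"] yields vllm/vllm-openai:v0.5.5; a profile with no imageName, or a name missing from the map, falls back to the "default" entry. A sketch, with the same assumptions as the Model manifest above:

apiVersion: kubeai.org/v1  # assumed
kind: Model
metadata:
  name: llama-l4-example  # hypothetical
spec:
  features: ["TextGeneration"]
  url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8
  engine: VLLM
  resourceProfile: NVIDIA_GPU_L4:1
  # No image set: the controller copies the profile's resources and
  # resolves imageName "nvidia-gpu" to vllm/vllm-openai:v0.5.5.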
2 changes: 1 addition & 1 deletion docs/model-management.md
@@ -13,7 +13,7 @@ spec:
  features: ["TextGeneration"]
  owner: neuralmagic
  url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8
- server: VLLM
+ engine: VLLM
  args:
    - --max-model-len=16384
    - --max-num-batched-token=16384
14 changes: 7 additions & 7 deletions internal/config/system.go
@@ -53,6 +53,7 @@ func (d *Duration) UnmarshalJSON(b []byte) error {
}

type ResourceProfile struct {
+   ImageName    string              `json:"imageName"`
    Requests     corev1.ResourceList `json:"requests,omitempty"`
    Limits       corev1.ResourceList `json:"limits,omitempty"`
    NodeSelector map[string]string   `json:"nodeSelector,omitempty"`
@@ -65,11 +66,10 @@ type MessageStream struct {
}

type ModelServers struct {
-   Ollama struct {
-       Image string `json:"image"`
-   } `json:"ollama"`
-   VLLM struct {
-       CPUImage string `json:"cpuImage"`
-       GPUImage string `json:"gpuImage"`
-   }
+   OLlama ModelServer `json:"OLlama"`
+   VLLM   ModelServer `json:"VLLM"`
}
+
+type ModelServer struct {
+   Images map[string]string `json:"images"`
+}
48 changes: 33 additions & 15 deletions internal/modelcontroller/model_controller.go
@@ -220,13 +220,6 @@ func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, index int32) *corev
    }
    args = append(args, m.Spec.Args...)

-   var image string
-   if usesGPUResources(*m.Spec.Resources) {
-       image = r.ModelServers.VLLM.GPUImage
-   } else {
-       image = r.ModelServers.VLLM.CPUImage
-   }
-
    env := []corev1.EnvVar{
        {
            // TODO: Conditionally set this token based on whether
@@ -266,7 +259,7 @@ func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, index int32) *corev
    Containers: []corev1.Container{
        {
            Name:  "server",
-           Image: image,
+           Image: m.Spec.Image,
            Args:  args,
            Env:   env,
            Resources: *m.Spec.Resources,
@@ -399,7 +392,7 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, index int32) *cor
    Containers: []corev1.Container{
        {
            Name:  "server",
-           Image: r.ModelServers.Ollama.Image,
+           Image: m.Spec.Image,
            Args:  m.Spec.Args,
            Env:   env,
            Resources: *m.Spec.Resources,
@@ -508,12 +501,6 @@ func (r *ModelReconciler) annotationsForModel(m *kubeaiv1.Model) map[string]stri
    return ann
}

-func usesGPUResources(res corev1.ResourceRequirements) bool {
-   _, gpuLimits := res.Limits[corev1.ResourceName("nvidia.com/gpu")]
-   _, gpuRequests := res.Limits[corev1.ResourceName("nvidia.com/gpu")]
-   return gpuLimits || gpuRequests
-}
-
func (r *ModelReconciler) applyResourceProfile(model *kubeaiv1.Model) (bool, error) {
    split := strings.Split(model.Spec.ResourceProfile, ":")
    if len(split) != 2 {
@@ -561,6 +548,37 @@ func (r *ModelReconciler) applyResourceProfile(model *kubeaiv1.Model) (bool, err
        changed = true
    }

+   if model.Spec.Image == "" {
+       var serverImgs map[string]string
+       switch model.Spec.Engine {
+       case kubeaiv1.OLlamaEngine:
+           serverImgs = r.ModelServers.OLlama.Images
+       default:
+           serverImgs = r.ModelServers.VLLM.Images
+       }
+
+       // If no image name is provided for a profile, use the default image name.
+       const defaultImageName = "default"
+       imageName := defaultImageName
+       if profile.ImageName != "" {
+           imageName = profile.ImageName
+       }
+
+       if img, ok := serverImgs[imageName]; ok {
+           model.Spec.Image = img
+           changed = true
+       } else {
+           // If the specific profile image name does not exist, use the default image name.
+           if img, ok := serverImgs[defaultImageName]; ok {
+               model.Spec.Image = img
+               changed = true
+           } else {
+               return false, fmt.Errorf("missing default server image")
+           }
+       }
+   }
+
    return changed, nil
}

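One interaction between this fallback and the chart values worth noting: the CPU profile sets imageName: "cpu", but modelServers.OLlama.images defines only "default", so any Ollama-engine model on a CPU profile takes the fallback branch. A sketch using the existing catalog entry (enabled flipped on purely for illustration):

  qwen2-500m-cpu:
    enabled: true  # flipped from the catalog's false for this example
    features: ["TextGeneration"]
    owner: alibaba
    url: "ollama://qwen2:0.5b"
    engine: OLlama
    resourceProfile: CPU:1
    # imageName "cpu" is absent from OLlama's images map, so the
    # controller falls back to "default" and sets the image to
    # ollama/ollama:latest.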
