Finish resource profile functionality and add doc
nstogner committed Aug 31, 2024
1 parent 03e0aa5 commit 76bf2c4
Showing 7 changed files with 129 additions and 41 deletions.
5 changes: 5 additions & 0 deletions charts/kubeai/charts/crds/crds/kubeai.org_models.yaml
@@ -61,6 +61,11 @@ spec:
            - TextEmbedding
            type: string
          type: array
        image:
          description: |-
            Image to be used for the server process.
            Will be set from the ResourceProfile if provided.
          type: string
        maxReplicas:
          format: int32
          type: integer
1 change: 1 addition & 0 deletions charts/kubeai/values.yaml
@@ -15,6 +15,7 @@ modelServers:
  VLLM:
    images:
      # The key is the image name (referenced from resourceProfiles) and the value is the image.
      # The "default" image should always be specified.
      # "default" is used when no imageName is specified or if a specific image is not found.
      default: "vllm/vllm-openai:v0.5.5"
      cpu: "substratusai/vllm-openai-cpu:v0.5.5"
51 changes: 51 additions & 0 deletions docs/concepts/resource-profiles.md
@@ -0,0 +1,51 @@
# Resource Profiles

A resource profile maps a type of compute resource (e.g., an NVIDIA L4 GPU) to a collection of Kubernetes settings that are applied to inference server Pods. Resource profiles are defined in the KubeAI `config.yaml` file (via a ConfigMap).

Each Kubernetes Model resource specifies the resource profile it requires, along with the count of that resource:

```yaml
# model.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: llama-3.1-8b-instruct-fp8-l4
spec:
  engine: VLLM
  resourceProfile: NVIDIA_GPU_L4:1 # Specified as <profile>:<count>
  # ...
```
A given profile might need slightly different settings depending on the cluster/cloud that KubeAI is deployed in.

Example: A resource profile named `NVIDIA_GPU_L4` might contain the following settings on a GKE Kubernetes cluster:

```yaml
# KubeAI config.yaml
resourceProfiles:
  NVIDIA_GPU_L4:
    limits:
      # Typical across most Kubernetes clusters:
      nvidia.com/gpu: "1"
    requests:
      nvidia.com/gpu: "1"
    nodeSelector:
      # Specific to GKE:
      cloud.google.com/gke-accelerator: "nvidia-l4"
      cloud.google.com/gke-spot: "true"
    imageName: "nvidia-gpu"
```
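
For comparison, a hypothetical variant of the same profile for a cluster that schedules GPUs by instance type rather than by GKE-specific labels might look like the following (the node selector value is illustrative, not taken from a real KubeAI deployment):

```yaml
# KubeAI config.yaml (illustrative non-GKE variant)
resourceProfiles:
  NVIDIA_GPU_L4:
    limits:
      nvidia.com/gpu: "1"
    requests:
      nvidia.com/gpu: "1"
    nodeSelector:
      # Well-known Kubernetes label; the value is cluster-specific:
      node.kubernetes.io/instance-type: "g6.xlarge"
    imageName: "nvidia-gpu"
```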
In addition to node selectors and resource requirements, a resource profile may optionally specify an image name. This name maps to the container image that will be selected when serving a model on that resource:
```yaml
# KubeAI config.yaml
modelServers:
  VLLM:
    images:
      default: "vllm/vllm-openai:v0.5.5"
      nvidia-gpu: "vllm/vllm-openai:v0.5.5" # <--
      cpu: "vllm/vllm-openai-cpu:v0.5.5"
  OLlama:
    images:
      # ...
```
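
Putting the pieces together: with the `NVIDIA_GPU_L4` profile above, a vLLM model that omits `spec.image` has it populated from the `nvidia-gpu` entry (falling back to the `default` entry if that name is missing). A sketch of the resulting resolved Model:

```yaml
# Resolved by KubeAI (spec.image populated from the resource profile)
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: llama-3.1-8b-instruct-fp8-l4
spec:
  engine: VLLM
  resourceProfile: NVIDIA_GPU_L4:1
  image: "vllm/vllm-openai:v0.5.5" # modelServers.VLLM.images["nvidia-gpu"]
```
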
26 changes: 19 additions & 7 deletions docs/development.md
@@ -1,42 +1,54 @@
# Development

## Cloud Setup
## Optional: Cloud Setup

### GCP PubSub

If you are developing the PubSub messaging integration on GCP, set up test topics and subscriptions and uncomment the `.messaging.streams` section in `./hack/dev-config.yaml`.

```bash
gcloud auth login --update-adc

gcloud pubsub topics create test-kubeai-requests
gcloud pubsub subscriptions create test-kubeai-requests-sub --topic test-kubeai-requests
gcloud pubsub topics create test-kubeai-responses
gcloud pubsub subscriptions create test-kubeai-responses-sub --topic test-kubeai-responses
```
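
For reference, the uncommented `.messaging.streams` section in `./hack/dev-config.yaml` looks like:

```yaml
messaging:
  errorMaxBackoff: 30s
  streams:
    - requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
      responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
      maxHandlers: 1
```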

## Local Cluster
## Run in Local Cluster

```bash
kind create cluster
# OR
#./hack/create-dev-gke-cluster.yaml

# Generate CRDs from Go code.
make generate && make manifests

# When CRDs are changed, reapply them using kubectl:
kubectl apply -f ./charts/kubeai/charts/crds/crds

# Model with special address annotations:
kubectl apply -f ./hack/dev-model.yaml

# OPTION A #
# For developing in-cluster features: run KubeAI inside the cluster.
# Set replicaCount=0 if running out-of-cluster instead (using "go run").
# Change `-f` based on the cluster environment.
helm upgrade --install kubeai ./charts/kubeai \
    --set openwebui.enabled=true \
    --set image.tag=latest \
    --set image.pullPolicy=Always \
    --set image.repository=us-central1-docker.pkg.dev/substratus-dev/default/kubeai \
    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \
    --set replicaCount=1 \
    -f ./hack/dev-gke-helm-values.yaml

# Run in development mode.
# OPTION B #
# For quick local iteration (run KubeAI outside of the cluster):
CONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go --allow-pod-address-override

# In another terminal:
while true; do kubectl port-forward service/dev-model 7000:7000; done
############
```
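
Once KubeAI is running via either option, you can sanity-check the OpenAI-compatible API. The port and path below are assumptions based on a default local setup; adjust them to match your config:

```bash
# List registered models (port and path are assumed defaults):
curl http://localhost:8000/openai/v1/models
```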

## Running
17 changes: 10 additions & 7 deletions hack/dev-config.yaml
@@ -2,16 +2,19 @@ secretNames:
  huggingface: huggingface
modelServers:
  vLLM:
    gpuImage: "vllm/vllm-openai:latest"
    cpuImage: "us-central1-docker.pkg.dev/substratus-dev/default/vllm-cpu:v0.5.4-118-gfc93e561"
    images:
      default: "vllm/vllm-openai:latest"
      cpu: "us-central1-docker.pkg.dev/substratus-dev/default/vllm-cpu:v0.5.4-118-gfc93e561"
  ollama:
    image: "ollama/ollama:latest"
    images:
      default: "ollama/ollama:latest"
      cpu: "ollama/ollama:0.3.8"
messaging:
  errorMaxBackoff: 30s
  streams:
    - requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
      responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
      maxHandlers: 1
  streams: []
  #- requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
  #  responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
  #  maxHandlers: 1
resourceProfiles:
  CPU:
    requests:
10 changes: 10 additions & 0 deletions hack/dev-gke-helm-values.yaml
@@ -0,0 +1,10 @@
models:
  catalog:
    llama-3.1-8b-instruct-fp8-l4:
      enabled: true

resourceProfiles:
  NVIDIA_GPU_L4:
    nodeSelector:
      cloud.google.com/gke-accelerator: "nvidia-l4"
      cloud.google.com/gke-spot: "true"
60 changes: 33 additions & 27 deletions internal/modelcontroller/model_controller.go
@@ -548,38 +548,44 @@ func (r *ModelReconciler) applyResourceProfile(model *kubeaiv1.Model) (bool, err
        changed = true
    }

    if model.Spec.Image == "" {
        var serverImgs map[string]string
        switch model.Spec.Engine {
        case kubeaiv1.OLlamaEngine:
            serverImgs = r.ModelServers.OLlama.Images
        default:
            serverImgs = r.ModelServers.VLLM.Images
        }
    image, err := r.lookupServerImage(model, profile)
    if err != nil {
        return false, fmt.Errorf("looking up server image: %w", err)
    }
    if model.Spec.Image != image {
        model.Spec.Image = image
        changed = true
    }

        // If no image name is provided for a profile, use the default image name.
        const defaultImageName = "default"
        imageName := defaultImageName
        if profile.ImageName != "" {
            imageName = profile.ImageName
        }
    return changed, nil
}

        if img, ok := serverImgs[imageName]; ok {
            model.Spec.Image = img
            changed = true
        } else {
            // If the specific profile image name does not exist, use the default image name.
            if img, ok := serverImgs[defaultImageName]; ok {
                model.Spec.Image = img
                changed = true
            } else {
                return false, fmt.Errorf("missing default server image")
            }
        }

func (r *ModelReconciler) lookupServerImage(model *kubeaiv1.Model, profile config.ResourceProfile) (string, error) {
    var serverImgs map[string]string
    switch model.Spec.Engine {
    case kubeaiv1.OLlamaEngine:
        serverImgs = r.ModelServers.OLlama.Images
    default:
        serverImgs = r.ModelServers.VLLM.Images
    }

    // If no image name is provided for a profile, use the default image name.
    const defaultImageName = "default"
    imageName := defaultImageName
    if profile.ImageName != "" {
        imageName = profile.ImageName
    }

    return changed, nil
    if img, ok := serverImgs[imageName]; ok {
        return img, nil
    }

    // If the specific profile image name does not exist, use the default image name.
    if img, ok := serverImgs[defaultImageName]; ok {
        return img, nil
    }
    return "", fmt.Errorf("missing default server image")
}

func (r *ModelReconciler) applySelfLabels(model *kubeaiv1.Model) bool {
