diff --git a/charts/kubeai/charts/crds/crds/kubeai.org_models.yaml b/charts/kubeai/charts/crds/crds/kubeai.org_models.yaml
index 2e4b8577..3962334a 100644
--- a/charts/kubeai/charts/crds/crds/kubeai.org_models.yaml
+++ b/charts/kubeai/charts/crds/crds/kubeai.org_models.yaml
@@ -61,6 +61,11 @@ spec:
                 - TextEmbedding
                 type: string
               type: array
+            image:
+              description: |-
+                Image to be used for the server process.
+                Will be set from the ResourceProfile if provided.
+              type: string
             maxReplicas:
               format: int32
               type: integer
diff --git a/charts/kubeai/values.yaml b/charts/kubeai/values.yaml
index 610d7f05..f7aba49a 100644
--- a/charts/kubeai/values.yaml
+++ b/charts/kubeai/values.yaml
@@ -15,6 +15,7 @@ modelServers:
   VLLM:
     images:
       # The key is the image name (referenced from resourceProfiles) and the value is the image.
+      # The "default" image should always be specified.
       # "default" is used when no imageName is specified or if a specific image is not found.
       default: "vllm/vllm-openai:v0.5.5"
       cpu: "substratusai/vllm-openai-cpu:v0.5.5"
diff --git a/docs/concepts/resource-profiles.md b/docs/concepts/resource-profiles.md
new file mode 100644
index 00000000..0c7eb781
--- /dev/null
+++ b/docs/concepts/resource-profiles.md
@@ -0,0 +1,51 @@
+# Resource Profiles
+
+A resource profile maps a type of compute resource (e.g. an NVIDIA L4 GPU) to a collection of Kubernetes settings that are applied to inference server Pods. Resource profiles are defined in the KubeAI `config.yaml` file (via a ConfigMap).
+
+Each Kubernetes Model resource specifies the resource profile it requires, along with the count of that resource:
+
+```yaml
+# model.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: llama-3.1-8b-instruct-fp8-l4
+spec:
+  engine: VLLM
+  resourceProfile: NVIDIA_GPU_L4:1 # Specified as <resource-profile-name>:<count>
+  # ...
+```
+
+A given profile might need slightly different settings depending on the cluster/cloud that KubeAI is deployed on.
+
+Example: a resource profile named `NVIDIA_GPU_L4` might contain the following settings on a GKE Kubernetes cluster:
+
+```yaml
+# KubeAI config.yaml
+resourceProfiles:
+  NVIDIA_GPU_L4:
+    limits:
+      # Typical across most Kubernetes clusters:
+      nvidia.com/gpu: "1"
+    requests:
+      nvidia.com/gpu: "1"
+    nodeSelector:
+      # Specific to GKE:
+      cloud.google.com/gke-accelerator: "nvidia-l4"
+      cloud.google.com/gke-spot: "true"
+    imageName: "nvidia-gpu"
+```
+
+In addition to node selectors and resource requirements, a resource profile may optionally specify an image name. This name maps to the container image that will be selected when serving a model on that resource:
+
+```yaml
+# KubeAI config.yaml
+modelServers:
+  VLLM:
+    images:
+      default: "vllm/vllm-openai:v0.5.5"
+      nvidia-gpu: "vllm/vllm-openai:v0.5.5" # <-- Selected for profiles with imageName: "nvidia-gpu".
+      cpu: "substratusai/vllm-openai-cpu:v0.5.5"
+  OLlama:
+    images:
+      # ...
+```
diff --git a/docs/development.md b/docs/development.md
index 10ea3214..c77942b7 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -1,42 +1,54 @@
 # Development
 
-## Cloud Setup
+## Optional: Cloud Setup
+
+### GCP PubSub
+
+If you are developing the PubSub messaging integration on GCP, set up test topics and subscriptions and uncomment `.messaging.streams` in `./hack/dev-config.yaml`.
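+
+With the streams uncommented, the `messaging` section of `./hack/dev-config.yaml` should look roughly like the sketch below (based on the commented-out values in that file; substitute your own GCP project, topics, and subscriptions):
+
+```yaml
+messaging:
+  errorMaxBackoff: 30s
+  streams:
+    - requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
+      responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
+      maxHandlers: 1
+```
+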
 ```bash
+gcloud auth login --update-adc
+
 gcloud pubsub topics create test-kubeai-requests
 gcloud pubsub subscriptions create test-kubeai-requests-sub --topic test-kubeai-requests
 gcloud pubsub topics create test-kubeai-responses
 gcloud pubsub subscriptions create test-kubeai-responses-sub --topic test-kubeai-responses
 ```
 
-## Local Cluster
+## Run in Local Cluster
 
 ```bash
 kind create cluster
 # OR
 #./hack/create-dev-gke-cluster.yaml
 
+# Generate CRDs from Go code.
+make generate && make manifests
+
 # When CRDs are changed reapply using kubectl:
 kubectl apply -f ./charts/kubeai/charts/crds/crds
 
 # Model with special address annotations:
 kubectl apply -f ./hack/dev-model.yaml
 
-# For developing in-cluster features:
+# OPTION A #
+# Run KubeAI inside the cluster.
+# Change `-f` based on the cluster environment.
 helm upgrade --install kubeai ./charts/kubeai \
 	--set openwebui.enabled=true \
 	--set image.tag=latest \
 	--set image.pullPolicy=Always \
 	--set image.repository=us-central1-docker.pkg.dev/substratus-dev/default/kubeai \
-	--set replicaCount=1 # 0 if running out-of-cluster (using "go run")
-
-# -f ./helm-values.yaml \
+	--set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \
+	--set replicaCount=1 -f ./hack/dev-gke-helm-values.yaml
 
-# Run in development mode.
+# OPTION B #
+# For quick local iteration (run KubeAI outside of the cluster).
CONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go --allow-pod-address-override
 
 # In another terminal:
 while true; do kubectl port-forward service/dev-model 7000:7000; done
+############
 ```
 
 ## Running
diff --git a/hack/dev-config.yaml b/hack/dev-config.yaml
index 297b4511..d186d4dc 100644
--- a/hack/dev-config.yaml
+++ b/hack/dev-config.yaml
@@ -2,16 +2,19 @@ secretNames:
   huggingface: huggingface
 modelServers:
   vLLM:
-    gpuImage: "vllm/vllm-openai:latest"
-    cpuImage: "us-central1-docker.pkg.dev/substratus-dev/default/vllm-cpu:v0.5.4-118-gfc93e561"
+    images:
+      default: "vllm/vllm-openai:latest"
+      cpu: "us-central1-docker.pkg.dev/substratus-dev/default/vllm-cpu:v0.5.4-118-gfc93e561"
   ollama:
-    image: "ollama/ollama:latest"
+    images:
+      default: "ollama/ollama:latest"
+      cpu: "ollama/ollama:0.3.8"
 messaging:
   errorMaxBackoff: 30s
-  streams:
-    - requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
-      responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
-      maxHandlers: 1
+  streams: []
+  #- requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
+  #  responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
+  #  maxHandlers: 1
 resourceProfiles:
   CPU:
     requests:
diff --git a/hack/dev-gke-helm-values.yaml b/hack/dev-gke-helm-values.yaml
new file mode 100644
index 00000000..bcac53be
--- /dev/null
+++ b/hack/dev-gke-helm-values.yaml
@@ -0,0 +1,10 @@
+models:
+  catalog:
+    llama-3.1-8b-instruct-fp8-l4:
+      enabled: true
+
+resourceProfiles:
+  NVIDIA_GPU_L4:
+    nodeSelector:
+      cloud.google.com/gke-accelerator: "nvidia-l4"
+      cloud.google.com/gke-spot: "true"
\ No newline at end of file
diff --git a/internal/modelcontroller/model_controller.go b/internal/modelcontroller/model_controller.go
index cdf19812..3399a24d 100644
--- a/internal/modelcontroller/model_controller.go
+++ b/internal/modelcontroller/model_controller.go
@@ -548,38 +548,44 @@ func (r *ModelReconciler) applyResourceProfile(model *kubeaiv1.Model) (bool, err
 		changed = true
 	}
 
-	if model.Spec.Image == "" {
-		var serverImgs map[string]string
-		switch model.Spec.Engine {
-		case kubeaiv1.OLlamaEngine:
-			serverImgs = r.ModelServers.OLlama.Images
-		default:
-			serverImgs = r.ModelServers.VLLM.Images
-		}
+	image, err := r.lookupServerImage(model, profile)
+	if err != nil {
+		return false, fmt.Errorf("looking up server image: %w", err)
+	}
+	if model.Spec.Image != image {
+		model.Spec.Image = image
+		changed = true
+	}
 
-		// If no image name is provided for a profile, use the default image name.
-		const defaultImageName = "default"
-		imageName := defaultImageName
-		if profile.ImageName != "" {
-			imageName = profile.ImageName
-		}
+	return changed, nil
+}
 
-		if img, ok := serverImgs[imageName]; ok {
-			model.Spec.Image = img
-			changed = true
-		} else {
-			// If the specific profile image name does not exist, use the default image name.
-			if img, ok := serverImgs[defaultImageName]; ok {
-				model.Spec.Image = img
-				changed = true
-			} else {
-				return false, fmt.Errorf("missing default server image")
-			}
-		}
+// lookupServerImage resolves the server container image for a model based on
+// its engine and the image name referenced by its resource profile.
+func (r *ModelReconciler) lookupServerImage(model *kubeaiv1.Model, profile config.ResourceProfile) (string, error) {
+	var serverImgs map[string]string
+	switch model.Spec.Engine {
+	case kubeaiv1.OLlamaEngine:
+		serverImgs = r.ModelServers.OLlama.Images
+	default:
+		serverImgs = r.ModelServers.VLLM.Images
+	}
+	// If no image name is provided for a profile, use the default image name.
+	const defaultImageName = "default"
+	imageName := defaultImageName
+	if profile.ImageName != "" {
+		imageName = profile.ImageName
 	}
 
-	return changed, nil
+	if img, ok := serverImgs[imageName]; ok {
+		return img, nil
+	}
+
+	// If the specific profile image name does not exist, fall back to the default image.
+	if img, ok := serverImgs[defaultImageName]; ok {
+		return img, nil
+	}
+	return "", fmt.Errorf("missing default server image")
 }
 
 func (r *ModelReconciler) applySelfLabels(model *kubeaiv1.Model) bool {
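
For reviewers: the lookup order introduced above (the profile's `imageName` first, then the `default` entry) can be exercised in isolation. The sketch below is hypothetical — `lookupImage` and the example image map are illustrative names detached from the controller and config types — and simply restates the fallback logic:

```go
package main

import "fmt"

// lookupImage is a dependency-free restatement of the lookupServerImage
// fallback: prefer the profile's image name, then fall back to the
// "default" entry, and error out if neither is configured.
func lookupImage(images map[string]string, profileImageName string) (string, error) {
	const defaultImageName = "default"
	name := defaultImageName
	if profileImageName != "" {
		name = profileImageName
	}
	if img, ok := images[name]; ok {
		return img, nil
	}
	if img, ok := images[defaultImageName]; ok {
		return img, nil
	}
	return "", fmt.Errorf("missing default server image")
}

func main() {
	images := map[string]string{
		"default": "vllm/vllm-openai:v0.5.5",
		"cpu":     "substratusai/vllm-openai-cpu:v0.5.5",
	}
	// "nvidia-gpu" is not configured, so the lookup falls back to "default".
	img, err := lookupImage(images, "nvidia-gpu")
	fmt.Println(img, err) // vllm/vllm-openai:v0.5.5 <nil>
}
```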