Finish resource profile functionality and add doc
nstogner committed Aug 31, 2024
1 parent 03e0aa5 commit 76bf2c4
Showing 7 changed files with 129 additions and 41 deletions.
5 changes: 5 additions & 0 deletions charts/kubeai/charts/crds/crds/kubeai.org_models.yaml
@@ -61,6 +61,11 @@ spec:
            - TextEmbedding
            type: string
          type: array
        image:
          description: |-
            Image to be used for the server process.
            Will be set from the ResourceProfile if provided.
          type: string
        maxReplicas:
          format: int32
          type: integer
1 change: 1 addition & 0 deletions charts/kubeai/values.yaml
@@ -15,6 +15,7 @@ modelServers:
  VLLM:
    images:
      # The key is the image name (referenced from resourceProfiles) and the value is the image.
      # The "default" image should always be specified.
      # "default" is used when no imageName is specified or if a specific image is not found.
      default: "vllm/vllm-openai:v0.5.5"
      cpu: "substratusai/vllm-openai-cpu:v0.5.5"
51 changes: 51 additions & 0 deletions docs/concepts/resource-profiles.md
@@ -0,0 +1,51 @@
# Resource Profiles

A resource profile maps a type of compute resource (e.g., an NVIDIA L4 GPU) to a collection of Kubernetes settings that are applied to inference server Pods. Resource profiles are defined in the KubeAI `config.yaml` file (via a ConfigMap).

Each Kubernetes Model resource specifies the resource profile it requires, along with the count of that resource:

```yaml
# model.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: llama-3.1-8b-instruct-fp8-l4
spec:
  engine: VLLM
  resourceProfile: NVIDIA_GPU_L4:1 # Specified as <profile>:<count>
  # ...
```
A given profile might need slightly different settings depending on the cluster/cloud that KubeAI is deployed in.

Example: A resource profile named `NVIDIA_GPU_L4` might contain the following settings on a GKE Kubernetes cluster:

```yaml
# KubeAI config.yaml
resourceProfiles:
  NVIDIA_GPU_L4:
    limits:
      # Typical across most Kubernetes clusters:
      nvidia.com/gpu: "1"
    requests:
      nvidia.com/gpu: "1"
    nodeSelector:
      # Specific to GKE:
      cloud.google.com/gke-accelerator: "nvidia-l4"
      cloud.google.com/gke-spot: "true"
    imageName: "nvidia-gpu"
```
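
For comparison, a hypothetical variant of the same profile for a cluster that schedules GPUs by instance type rather than by GKE-specific labels might look like the following (the node selector value is illustrative, not taken from a real KubeAI deployment):

```yaml
# KubeAI config.yaml (illustrative non-GKE variant)
resourceProfiles:
  NVIDIA_GPU_L4:
    limits:
      nvidia.com/gpu: "1"
    requests:
      nvidia.com/gpu: "1"
    nodeSelector:
      # Well-known Kubernetes label; the value is cluster-specific:
      node.kubernetes.io/instance-type: "g6.xlarge"
    imageName: "nvidia-gpu"
```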
In addition to node selectors and resource requirements, a resource profile may optionally specify an image name. This name maps to the container image that will be selected when serving a model on that resource:
```yaml
# KubeAI config.yaml
modelServers:
  VLLM:
    images:
      default: "vllm/vllm-openai:v0.5.5"
      nvidia-gpu: "vllm/vllm-openai:v0.5.5" # <--
      cpu: "vllm/vllm-openai-cpu:v0.5.5"
  OLlama:
    images:
      # ...
```
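
Putting the pieces together: with the `NVIDIA_GPU_L4` profile above, a vLLM model that omits `spec.image` has it populated from the `nvidia-gpu` entry (falling back to the `default` entry if that name is missing). A sketch of the resulting resolved Model:

```yaml
# Resolved by KubeAI (spec.image populated from the resource profile)
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: llama-3.1-8b-instruct-fp8-l4
spec:
  engine: VLLM
  resourceProfile: NVIDIA_GPU_L4:1
  image: "vllm/vllm-openai:v0.5.5" # modelServers.VLLM.images["nvidia-gpu"]
```
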
26 changes: 19 additions & 7 deletions docs/development.md
@@ -1,42 +1,54 @@
# Development

## Cloud Setup
## Optional: Cloud Setup

### GCP PubSub

If you are developing the PubSub messaging integration on GCP, set up test topics and subscriptions and uncomment the `.messaging.streams` section in `./hack/dev-config.yaml`.

```bash
gcloud auth login --update-adc

gcloud pubsub topics create test-kubeai-requests
gcloud pubsub subscriptions create test-kubeai-requests-sub --topic test-kubeai-requests
gcloud pubsub topics create test-kubeai-responses
gcloud pubsub subscriptions create test-kubeai-responses-sub --topic test-kubeai-responses
```
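
For reference, the uncommented `.messaging.streams` section in `./hack/dev-config.yaml` looks like:

```yaml
messaging:
  errorMaxBackoff: 30s
  streams:
    - requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
      responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
      maxHandlers: 1
```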

## Local Cluster
## Run in Local Cluster

```bash
kind create cluster
# OR
#./hack/create-dev-gke-cluster.yaml

# Generate CRDs from Go code.
make generate && make manifests

# When CRDs are changed, reapply them using kubectl:
kubectl apply -f ./charts/kubeai/charts/crds/crds

# Model with special address annotations:
kubectl apply -f ./hack/dev-model.yaml

# OPTION A #
# For developing in-cluster features: run KubeAI inside the cluster.
# Set replicaCount=0 if running out-of-cluster instead (using "go run").
# Change `-f` based on the cluster environment.
helm upgrade --install kubeai ./charts/kubeai \
    --set openwebui.enabled=true \
    --set image.tag=latest \
    --set image.pullPolicy=Always \
    --set image.repository=us-central1-docker.pkg.dev/substratus-dev/default/kubeai \
    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \
    --set replicaCount=1 \
    -f ./hack/dev-gke-helm-values.yaml

# Run in development mode.
# OPTION B #
# For quick local iteration (run KubeAI outside of the cluster):
CONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go --allow-pod-address-override

# In another terminal:
while true; do kubectl port-forward service/dev-model 7000:7000; done
############
```
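
Once KubeAI is running via either option, you can sanity-check the OpenAI-compatible API. The port and path below are assumptions based on a default local setup; adjust them to match your config:

```bash
# List registered models (port and path are assumed defaults):
curl http://localhost:8000/openai/v1/models
```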

## Running
17 changes: 10 additions & 7 deletions hack/dev-config.yaml
@@ -2,16 +2,19 @@ secretNames:
  huggingface: huggingface
modelServers:
  vLLM:
    gpuImage: "vllm/vllm-openai:latest"
    cpuImage: "us-central1-docker.pkg.dev/substratus-dev/default/vllm-cpu:v0.5.4-118-gfc93e561"
    images:
      default: "vllm/vllm-openai:latest"
      cpu: "us-central1-docker.pkg.dev/substratus-dev/default/vllm-cpu:v0.5.4-118-gfc93e561"
  ollama:
    image: "ollama/ollama:latest"
    images:
      default: "ollama/ollama:latest"
      cpu: "ollama/ollama:0.3.8"
messaging:
  errorMaxBackoff: 30s
  streams:
    - requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
      responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
      maxHandlers: 1
  streams: []
  #- requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
  #  responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
  #  maxHandlers: 1
resourceProfiles:
  CPU:
    requests:
10 changes: 10 additions & 0 deletions hack/dev-gke-helm-values.yaml
@@ -0,0 +1,10 @@
models:
  catalog:
    llama-3.1-8b-instruct-fp8-l4:
      enabled: true

resourceProfiles:
  NVIDIA_GPU_L4:
    nodeSelector:
      cloud.google.com/gke-accelerator: "nvidia-l4"
      cloud.google.com/gke-spot: "true"
60 changes: 33 additions & 27 deletions internal/modelcontroller/model_controller.go
@@ -548,38 +548,44 @@ func (r *ModelReconciler) applyResourceProfile(model *kubeaiv1.Model) (bool, err
        changed = true
    }

    if model.Spec.Image == "" {
        var serverImgs map[string]string
        switch model.Spec.Engine {
        case kubeaiv1.OLlamaEngine:
            serverImgs = r.ModelServers.OLlama.Images
        default:
            serverImgs = r.ModelServers.VLLM.Images
        }
    image, err := r.lookupServerImage(model, profile)
    if err != nil {
        return false, fmt.Errorf("looking up server image: %w", err)
    }
    if model.Spec.Image != image {
        model.Spec.Image = image
        changed = true
    }

        // If no image name is provided for a profile, use the default image name.
        const defaultImageName = "default"
        imageName := defaultImageName
        if profile.ImageName != "" {
            imageName = profile.ImageName
        }
    return changed, nil
}

        if img, ok := serverImgs[imageName]; ok {
            model.Spec.Image = img
            changed = true
        } else {
            // If the specific profile image name does not exist, use the default image name.
            if img, ok := serverImgs[defaultImageName]; ok {
                model.Spec.Image = img
                changed = true
            } else {
                return false, fmt.Errorf("missing default server image")
            }
        }

func (r *ModelReconciler) lookupServerImage(model *kubeaiv1.Model, profile config.ResourceProfile) (string, error) {
    var serverImgs map[string]string
    switch model.Spec.Engine {
    case kubeaiv1.OLlamaEngine:
        serverImgs = r.ModelServers.OLlama.Images
    default:
        serverImgs = r.ModelServers.VLLM.Images
    }

    // If no image name is provided for a profile, use the default image name.
    const defaultImageName = "default"
    imageName := defaultImageName
    if profile.ImageName != "" {
        imageName = profile.ImageName
    }

    return changed, nil
    if img, ok := serverImgs[imageName]; ok {
        return img, nil
    }

    // If the specific profile image name does not exist, use the default image name.
    if img, ok := serverImgs[defaultImageName]; ok {
        return img, nil
    }
    return "", fmt.Errorf("missing default server image")
}

func (r *ModelReconciler) applySelfLabels(model *kubeaiv1.Model) bool {
