Start implementing server image names

nstogner committed Aug 31, 2024
1 parent 8765ad9 commit 03e0aa5
Showing 6 changed files with 63 additions and 37 deletions.
4 changes: 4 additions & 0 deletions api/v1/model_types.go
@@ -38,6 +38,10 @@ type ModelSpec struct {
    // ResourceProfile maps to specific pre-configured resources.
    ResourceProfile string `json:"resourceProfile,omitempty"`

+   // Image to be used for the server process.
+   // Will be set from the ResourceProfile if provided.
+   Image string `json:"image,omitempty"`
+
    // Resources to be allocated to the server process.
    // Will be set from the ResourceProfile if provided.
    Resources *corev1.ResourceRequirements `json:"resources,omitempty"`
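Note on the new field: the controller only fills spec.image when it is empty (see the applyResourceProfile change below), so setting it explicitly pins the server image regardless of the resource profile. A minimal sketch of such a Model manifest, assuming the kubeai.org/v1 API group from this repo's api/v1 package (the model name is hypothetical):

apiVersion: kubeai.org/v1  # assumed from the api/v1 package
kind: Model
metadata:
  name: opt-125m-pinned-image  # hypothetical
spec:
  features: ["TextGeneration"]
  owner: facebook
  url: hf://facebook/opt-125m
  engine: VLLM
  resourceProfile: CPU:1
  # Explicitly set image: applyResourceProfile leaves a non-empty
  # spec.image untouched, so profile-based lookup is skipped.
  image: substratusai/vllm-openai-cpu:v0.5.5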
16 changes: 8 additions & 8 deletions charts/kubeai/charts/models/values.yaml
@@ -9,7 +9,7 @@ catalog:
    features: ["TextEmbedding"]
    owner: intfloat
    url: "hf://intfloat/e5-mistral-7b-instruct"
-   server: VLLM
+   engine: VLLM
    resourceProfile: CPU:1
    args:
      - --gpu-memory-utilization=0.9
@@ -19,15 +19,15 @@ catalog:
    features: ["TextGeneration"]
    owner: google
    url: "ollama://gemma2:2b"
-   server: OLlama
+   engine: OLlama
    resourceProfile: CPU:2
  # Llama #
  llama-3.1-8b-instruct-cpu:
    enabled: false
    features: ["TextGeneration"]
    owner: "meta-llama"
    url: "hf://meta-llama/Meta-Llama-3.1-8B-Instruct"
-   server: VLLM
+   engine: VLLM
    resourceProfile: CPU:6
    env:
      VLLM_CPU_KVCACHE_SPACE: "4"
@@ -39,7 +39,7 @@ catalog:
    features: ["TextGeneration"]
    owner: "neuralmagic"
    url: "hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
-   server: VLLM
+   engine: VLLM
    resourceProfile: NVIDIA_GPU_L4:1
    args:
      - --max-model-len=16384
@@ -51,29 +51,29 @@ catalog:
    features: ["TextEmbedding"]
    owner: nomic
    url: "ollama://nomic-embed-text"
-   server: OLlama
+   engine: OLlama
    resourceProfile: CPU:1
  # Opt #
  opt-125m-cpu:
    enabled: false
    features: ["TextGeneration"]
    owner: facebook
    url: "hf://facebook/opt-125m"
-   server: VLLM
+   engine: VLLM
    resourceProfile: CPU:1
  opt-125m-l4:
    enabled: false
    features: ["TextGeneration"]
    owner: facebook
    url: "hf://facebook/opt-125m"
-   server: VLLM
+   engine: VLLM
    resourceProfile: NVIDIA_GPU_L4:1
  # Qwen #
  qwen2-500m-cpu:
    enabled: false
    features: ["TextGeneration"]
    owner: alibaba
    url: "ollama://qwen2:0.5b"
-   server: OLlama
+   engine: OLlama
    resourceProfile: CPU:1

16 changes: 10 additions & 6 deletions charts/kubeai/values.yaml
@@ -14,21 +14,24 @@ secrets:
modelServers:
  VLLM:
    images:
-     # The key is the resource profile name (with "*" matching), and the value is the image to use.
-     "*": "vllm/vllm-openai:v0.5.5"
-     "CPU*": "substratusai/vllm-openai-cpu:v0.5.5"
-     "NVIDIA_GPU*": "vllm/vllm-openai:v0.5.5"
-     "GOOGLE_TPU*": "substratusai/vllm-openai-tpu:v0.5.5"
+     # The key is the image name (referenced from resourceProfiles) and the value is the image.
+     # "default" is used when no imageName is specified or if a specific image is not found.
+     default: "vllm/vllm-openai:v0.5.5"
+     cpu: "substratusai/vllm-openai-cpu:v0.5.5"
+     nvidia-gpu: "vllm/vllm-openai:v0.5.5"
+     google-tpu: "substratusai/vllm-openai-tpu:v0.5.5"
  OLlama:
    images:
-     "*": "ollama/ollama:latest"
+     default: "ollama/ollama:latest"

resourceProfiles:
  CPU:
+   imageName: "cpu"
    requests:
      cpu: 1
      memory: "2Gi"
  NVIDIA_GPU_L4:
+   imageName: "nvidia-gpu"
    limits:
      nvidia.com/gpu: "1"
    requests:
@@ -40,6 +43,7 @@ resourceProfiles:
  # NVIDIA_GPU_A100:
  #   ...
  GOOGLE_TPU_V5E:
+   imageName: "google-tpu"
    requests:
      google.com/tpu: 4
    limits:
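Tracing a lookup through these values: a VLLM model on the NVIDIA_GPU_L4 profile picks up imageName "nvidia-gpu", and modelServers.VLLM.images["nvidia-gpu"] yields vllm/vllm-openai:v0.5.5; a profile with no imageName, or a name missing from the map, falls back to the "default" entry. A sketch, with the same assumptions as the Model manifest above:

apiVersion: kubeai.org/v1  # assumed
kind: Model
metadata:
  name: llama-l4-example  # hypothetical
spec:
  features: ["TextGeneration"]
  url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8
  engine: VLLM
  resourceProfile: NVIDIA_GPU_L4:1
  # No image set: the controller copies the profile's resources and
  # resolves imageName "nvidia-gpu" to vllm/vllm-openai:v0.5.5.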
2 changes: 1 addition & 1 deletion docs/model-management.md
@@ -13,7 +13,7 @@ spec:
  features: ["TextGeneration"]
  owner: neuralmagic
  url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8
- server: VLLM
+ engine: VLLM
  args:
    - --max-model-len=16384
    - --max-num-batched-token=16384
14 changes: 7 additions & 7 deletions internal/config/system.go
@@ -53,6 +53,7 @@ func (d *Duration) UnmarshalJSON(b []byte) error {
}

type ResourceProfile struct {
+   ImageName    string              `json:"imageName"`
    Requests     corev1.ResourceList `json:"requests,omitempty"`
    Limits       corev1.ResourceList `json:"limits,omitempty"`
    NodeSelector map[string]string   `json:"nodeSelector,omitempty"`
@@ -65,11 +66,10 @@ type MessageStream struct {
}

type ModelServers struct {
-   Ollama struct {
-       Image string `json:"image"`
-   } `json:"ollama"`
-   VLLM struct {
-       CPUImage string `json:"cpuImage"`
-       GPUImage string `json:"gpuImage"`
-   }
+   OLlama ModelServer `json:"OLlama"`
+   VLLM   ModelServer `json:"VLLM"`
}
+
+type ModelServer struct {
+   Images map[string]string `json:"images"`
+}
48 changes: 33 additions & 15 deletions internal/modelcontroller/model_controller.go
@@ -220,13 +220,6 @@ func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, index int32) *corev
    }
    args = append(args, m.Spec.Args...)

-   var image string
-   if usesGPUResources(*m.Spec.Resources) {
-       image = r.ModelServers.VLLM.GPUImage
-   } else {
-       image = r.ModelServers.VLLM.CPUImage
-   }
-
    env := []corev1.EnvVar{
        {
            // TODO: Conditionally set this token based on whether
@@ -266,7 +259,7 @@ func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, index int32) *corev
    Containers: []corev1.Container{
        {
            Name:  "server",
-           Image: image,
+           Image: m.Spec.Image,
            Args:  args,
            Env:   env,
            Resources: *m.Spec.Resources,
@@ -399,7 +392,7 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, index int32) *cor
    Containers: []corev1.Container{
        {
            Name:  "server",
-           Image: r.ModelServers.Ollama.Image,
+           Image: m.Spec.Image,
            Args:  m.Spec.Args,
            Env:   env,
            Resources: *m.Spec.Resources,
@@ -508,12 +501,6 @@ func (r *ModelReconciler) annotationsForModel(m *kubeaiv1.Model) map[string]stri
    return ann
}

-func usesGPUResources(res corev1.ResourceRequirements) bool {
-   _, gpuLimits := res.Limits[corev1.ResourceName("nvidia.com/gpu")]
-   _, gpuRequests := res.Limits[corev1.ResourceName("nvidia.com/gpu")]
-   return gpuLimits || gpuRequests
-}
-
func (r *ModelReconciler) applyResourceProfile(model *kubeaiv1.Model) (bool, error) {
    split := strings.Split(model.Spec.ResourceProfile, ":")
    if len(split) != 2 {
@@ -561,6 +548,37 @@ func (r *ModelReconciler) applyResourceProfile(model *kubeaiv1.Model) (bool, err
        changed = true
    }

+   if model.Spec.Image == "" {
+       var serverImgs map[string]string
+       switch model.Spec.Engine {
+       case kubeaiv1.OLlamaEngine:
+           serverImgs = r.ModelServers.OLlama.Images
+       default:
+           serverImgs = r.ModelServers.VLLM.Images
+       }
+
+       // If no image name is provided for a profile, use the default image name.
+       const defaultImageName = "default"
+       imageName := defaultImageName
+       if profile.ImageName != "" {
+           imageName = profile.ImageName
+       }
+
+       if img, ok := serverImgs[imageName]; ok {
+           model.Spec.Image = img
+           changed = true
+       } else {
+           // If the specific profile image name does not exist, use the default image name.
+           if img, ok := serverImgs[defaultImageName]; ok {
+               model.Spec.Image = img
+               changed = true
+           } else {
+               return false, fmt.Errorf("missing default server image")
+           }
+       }
+   }
+
    return changed, nil
}

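One interaction between this fallback and the chart values worth noting: the CPU profile sets imageName: "cpu", but modelServers.OLlama.images defines only "default", so any Ollama-engine model on a CPU profile takes the fallback branch. A sketch using the existing catalog entry (enabled flipped on purely for illustration):

  qwen2-500m-cpu:
    enabled: true  # flipped from the catalog's false for this example
    features: ["TextGeneration"]
    owner: alibaba
    url: "ollama://qwen2:0.5b"
    engine: OLlama
    resourceProfile: CPU:1
    # imageName "cpu" is absent from OLlama's images map, so the
    # controller falls back to "default" and sets the image to
    # ollama/ollama:latest.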
