Skip to content

Commit

Permalink
Improved Pod Management (#241)
Browse files Browse the repository at this point in the history
* Refactor Model Pod names to use non-indexed names.
* Improve scale-down - prioritize non-Ready Pods first.
* Ensure ready Pods stick around while rolling out updates.
* Add configurable `.modelRollouts.surge` field to system `config.yaml`.
* Add automated tests.

### Manual Tests

```bash
# Model .spec.replicas: 1
# config .modelRollouts.surge: 0
(base) ➜  kubeai git:(rolling-updates) ✗ k get pods -w
NAME                        READY   STATUS              RESTARTS   AGE
model-dev-c47f4cb94-qftfc   0/1     ContainerCreating   0          3s
model-dev-c47f4cb94-qftfc   0/1     Running             0          2m
model-dev-c47f4cb94-qftfc   0/1     Running             0          3m
model-dev-c47f4cb94-qftfc   1/1     Running             0          3m
model-dev-c47f4cb94-qftfc   1/1     Terminating         0          3m33s
model-dev-76975599f6-przkw   0/1     Pending             0          0s
model-dev-76975599f6-przkw   0/1     Pending             0          0s
model-dev-76975599f6-przkw   0/1     ContainerCreating   0          0s
model-dev-c47f4cb94-qftfc    0/1     Terminating         0          3m34s
model-dev-c47f4cb94-qftfc    0/1     Terminating         0          3m34s
model-dev-c47f4cb94-qftfc    0/1     Terminating         0          3m34s
model-dev-c47f4cb94-qftfc    0/1     Terminating         0          3m34s
model-dev-76975599f6-przkw   0/1     Running             0          4s
model-dev-76975599f6-przkw   0/1     Running             0          50s
model-dev-76975599f6-przkw   1/1     Running             0          50s
```

```bash
# Model .spec.replicas: 1
# config .modelRollouts.surge: 1
(base) ➜  kubeai git:(rolling-updates) ✗ k get pods -w
NAME                         READY   STATUS    RESTARTS   AGE
model-dev-69d677549c-2pqhq   1/1     Running   0          33s
model-dev-765bff5b74-2qdzn   0/1     Pending   0          0s
model-dev-765bff5b74-2qdzn   0/1     Pending   0          0s
model-dev-765bff5b74-2qdzn   0/1     ContainerCreating   0          0s
model-dev-765bff5b74-2qdzn   0/1     Running             0          1s
model-dev-765bff5b74-2qdzn   0/1     Running             0          30s
model-dev-765bff5b74-2qdzn   1/1     Running             0          30s
model-dev-69d677549c-2pqhq   1/1     Terminating         0          72s
model-dev-69d677549c-2pqhq   0/1     Terminating         0          73s
model-dev-69d677549c-2pqhq   0/1     Terminating         0          74s
model-dev-69d677549c-2pqhq   0/1     Terminating         0          74s
model-dev-69d677549c-2pqhq   0/1     Terminating         0          74s
```
  • Loading branch information
nstogner authored Sep 25, 2024
1 parent 109aa7d commit 09cf9ba
Show file tree
Hide file tree
Showing 16 changed files with 752 additions and 183 deletions.
2 changes: 2 additions & 0 deletions charts/kubeai/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ data:
{{- .Values.resourceProfiles | toYaml | nindent 6 }}
modelServers:
{{- .Values.modelServers | toYaml | nindent 6 }}
modelRollouts:
{{- .Values.modelRollouts | toYaml | nindent 6 }}
modelServerPods:
{{- if .Values.modelServerPods }}
{{- if .Values.modelServerPods.podSecurityContext }}
Expand Down
4 changes: 4 additions & 0 deletions charts/kubeai/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ modelServerPods:
drop:
- ALL

modelRollouts:
# The number of additional Pods to create when rolling out an update to a model.
surge: 1

resourceProfiles:
cpu:
imageName: "cpu"
Expand Down
6 changes: 4 additions & 2 deletions hack/dev-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ modelServers:
images:
default: "ollama/ollama:latest"
cpu: "ollama/ollama:0.3.8"
modelRollouts:
surge: 0
messaging:
errorMaxBackoff: 30s
streams: []
Expand All @@ -18,8 +20,8 @@ messaging:
resourceProfiles:
cpu:
requests:
cpu: 1
memory: 2Gi
cpu: 0.5
memory: 1Gi
nvidia-gpu-l4:
limits:
nvidia.com/gpu: "1"
Expand Down
7 changes: 7 additions & 0 deletions internal/config/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ type System struct {
ModelAutoscaling ModelAutoscaling `json:"modelAutoscaling" validate:"required"`

ModelServerPods ModelServerPods `json:"modelServerPods,omitempty"`

ModelRollouts ModelRollouts `json:"modelRollouts"`
}

func (s *System) DefaultAndValidate() error {
Expand All @@ -59,6 +61,11 @@ func (s *System) DefaultAndValidate() error {
return validator.New(validator.WithRequiredStructEnabled()).Struct(s)
}

// ModelRollouts holds the settings that control how updates to Model Pods
// are rolled out. It is read from the `modelRollouts` key of the system
// configuration.
type ModelRollouts struct {
// Surge is the number of additional Pods to create when rolling out an update.
// NOTE(review): no validation tag is present here, so a negative value does
// not appear to be rejected at this level — confirm how consumers handle it.
Surge int32 `json:"surge"`
}

type ModelAutoscaling struct {
// Interval is the time between each autoscaling check.
// Defaults to 10 seconds.
Expand Down
4 changes: 4 additions & 0 deletions internal/k8sutils/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ import (
"k8s.io/apimachinery/pkg/util/rand"
)

// PodIsScheduled reports whether the given Pod has a node name set in its
// spec (i.e. .spec.nodeName is non-empty).
func PodIsScheduled(pod *corev1.Pod) bool {
	return len(pod.Spec.NodeName) > 0
}

func PodIsReady(pod *corev1.Pod) bool {
for _, cond := range pod.Status.Conditions {
if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
Expand Down
1 change: 1 addition & 0 deletions internal/manager/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ func Run(ctx context.Context, k8sCfg *rest.Config, cfg config.System) error {
ResourceProfiles: cfg.ResourceProfiles,
ModelServers: cfg.ModelServers,
ModelServerPods: cfg.ModelServerPods,
ModelRollouts: cfg.ModelRollouts,
}
if err = modelReconciler.SetupWithManager(mgr); err != nil {
return fmt.Errorf("unable to create Model controller: %w", err)
Expand Down
28 changes: 18 additions & 10 deletions internal/modelcontroller/model_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"sort"
"strconv"
"strings"
"time"

"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/intstr"
Expand Down Expand Up @@ -51,6 +52,7 @@ type ModelReconciler struct {
ResourceProfiles map[string]config.ResourceProfile
ModelServers config.ModelServers
ModelServerPods config.ModelServerPods
ModelRollouts config.ModelRollouts
}

// +kubebuilder:rbac:groups=kubeai.org,resources=models,verbs=get;list;watch;create;update;patch;delete
Expand Down Expand Up @@ -99,8 +101,17 @@ func (r *ModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
}

plan := r.calculatePodPlan(allPods, model, modelConfig)
if err := plan.execute(ctx, r.Client, r.Scheme); err != nil {
return ctrl.Result{}, fmt.Errorf("executing pod plan: %w", err)
if plan.containsActions() {
changed, err := plan.execute(ctx, r.Client, r.Scheme)
if changed {
// Slow things down to wait for caches to sync.
// This is important because the pod plan has some calculations that
// assume the cache is up to date.
time.Sleep(3 * time.Second)
}
if err != nil {
return ctrl.Result{}, fmt.Errorf("executing pod plan: %w", err)
}
}

// Summarize all pods.
Expand All @@ -125,6 +136,7 @@ func (r *ModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl

// SetupWithManager sets up the controller with the Manager.
func (r *ModelReconciler) SetupWithManager(mgr ctrl.Manager) error {
// TODO: Set Model concurrency. Pod rollouts can be slow.
return ctrl.NewControllerManagedBy(mgr).
For(&kubeaiv1.Model{}).
Owns(&corev1.Pod{}).
Expand All @@ -140,7 +152,7 @@ func (r *ModelReconciler) apply(ctx context.Context, model *kubeaiv1.Model, obj
}
*/

func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, profile ModelConfig, name string) *corev1.Pod {
func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, profile ModelConfig) *corev1.Pod {
lbs := labelsForModel(m)
ann := r.annotationsForModel(m)
if _, ok := ann[kubeaiv1.ModelPodPortAnnotation]; !ok {
Expand Down Expand Up @@ -187,7 +199,6 @@ func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, profile ModelConfig

pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: m.Namespace,
Labels: lbs,
Annotations: ann,
Expand Down Expand Up @@ -279,7 +290,7 @@ func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, profile ModelConfig
return pod
}

func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, profile ModelConfig, name string) *corev1.Pod {
func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, profile ModelConfig) *corev1.Pod {
lbs := labelsForModel(m)
ann := r.annotationsForModel(m)

Expand Down Expand Up @@ -340,7 +351,6 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, profile ModelConf

pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: m.Namespace,
Labels: lbs,
Annotations: ann,
Expand Down Expand Up @@ -441,7 +451,7 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, profile ModelConf

}

func (r *ModelReconciler) fasterWhisperPodForModel(m *kubeaiv1.Model, profile ModelConfig, name string) *corev1.Pod {
func (r *ModelReconciler) fasterWhisperPodForModel(m *kubeaiv1.Model, profile ModelConfig) *corev1.Pod {
lbs := labelsForModel(m)
ann := r.annotationsForModel(m)
if _, ok := ann[kubeaiv1.ModelPodPortAnnotation]; !ok {
Expand Down Expand Up @@ -489,7 +499,6 @@ func (r *ModelReconciler) fasterWhisperPodForModel(m *kubeaiv1.Model, profile Mo

pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: m.Namespace,
Labels: lbs,
Annotations: ann,
Expand Down Expand Up @@ -579,7 +588,7 @@ func (r *ModelReconciler) fasterWhisperPodForModel(m *kubeaiv1.Model, profile Mo
return pod
}

func (r *ModelReconciler) infinityPodForModel(m *kubeaiv1.Model, profile ModelConfig, name string) *corev1.Pod {
func (r *ModelReconciler) infinityPodForModel(m *kubeaiv1.Model, profile ModelConfig) *corev1.Pod {
lbs := labelsForModel(m)
ann := r.annotationsForModel(m)

Expand Down Expand Up @@ -644,7 +653,6 @@ func (r *ModelReconciler) infinityPodForModel(m *kubeaiv1.Model, profile ModelCo

pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: m.Namespace,
Labels: lbs,
Annotations: ann,
Expand Down
Loading

0 comments on commit 09cf9ba

Please sign in to comment.