Add runtimeClassName as optional field in resource profile (#253)
Useful for k3s...
Also sync `manifests/` via `make manifests`
nstogner authored Sep 27, 2024
1 parent ebedde6 commit 3aaf99d
Showing 7 changed files with 74 additions and 14 deletions.
1 change: 1 addition & 0 deletions docs/how-to/configure-resource-profiles.md
@@ -36,6 +36,7 @@ resourceProfiles:
custom.com/gpu: "1"
cpu: "3"
memory: "12Gi"
runtimeClassName: "my-custom-runtime-class"
```

If you need to run custom model server images on your resource profile, make sure to also add those in the `modelServers` section:
13 changes: 7 additions & 6 deletions internal/config/system.go
@@ -130,12 +130,13 @@ func (d *Duration) UnmarshalJSON(b []byte) error {
}

type ResourceProfile struct {
ImageName string `json:"imageName"`
Requests corev1.ResourceList `json:"requests,omitempty"`
Limits corev1.ResourceList `json:"limits,omitempty"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
Affinity *corev1.Affinity `json:"affinity,omitempty"`
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
RuntimeClassName *string `json:"runtimeClassName,omitempty"`
}

type MessageStream struct {
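For context, a minimal sketch (using a stripped-down stand-in for the struct above, not the project's actual config loader) of why the new field is a `*string` with `omitempty`: when `runtimeClassName` is absent from the config, the pointer stays nil and the Pod later keeps the cluster's default runtime handler.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// profile is a stripped-down stand-in for ResourceProfile, kept only to
// illustrate the pointer semantics of the new field.
type profile struct {
	ImageName        string  `json:"imageName"`
	RuntimeClassName *string `json:"runtimeClassName,omitempty"`
}

func main() {
	var with, without profile
	_ = json.Unmarshal([]byte(`{"imageName":"img","runtimeClassName":"my-custom-runtime-class"}`), &with)
	_ = json.Unmarshal([]byte(`{"imageName":"img"}`), &without)

	fmt.Println(*with.RuntimeClassName)          // my-custom-runtime-class
	fmt.Println(without.RuntimeClassName == nil) // true: nothing to set on the Pod
}
```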
18 changes: 11 additions & 7 deletions internal/modelcontroller/model_controller.go
@@ -202,10 +202,11 @@ func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, profile ModelConfig
},
Spec: corev1.PodSpec{
NodeSelector: profile.NodeSelector,
- SecurityContext: r.ModelServerPods.ModelPodSecurityContext,
- ServiceAccountName: r.ModelServerPods.ModelServiceAccountName,
Affinity: profile.Affinity,
Tolerations: profile.Tolerations,
+ RuntimeClassName: profile.RuntimeClassName,
+ ServiceAccountName: r.ModelServerPods.ModelServiceAccountName,
+ SecurityContext: r.ModelServerPods.ModelPodSecurityContext,
Containers: []corev1.Container{
{
Name: "server",
@@ -355,10 +356,11 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, profile ModelConf
},
Spec: corev1.PodSpec{
NodeSelector: profile.NodeSelector,
- SecurityContext: r.ModelServerPods.ModelPodSecurityContext,
- ServiceAccountName: r.ModelServerPods.ModelServiceAccountName,
Affinity: profile.Affinity,
Tolerations: profile.Tolerations,
+ RuntimeClassName: profile.RuntimeClassName,
+ ServiceAccountName: r.ModelServerPods.ModelServiceAccountName,
+ SecurityContext: r.ModelServerPods.ModelPodSecurityContext,
Containers: []corev1.Container{
{
Name: "server",
@@ -503,10 +505,11 @@ func (r *ModelReconciler) fasterWhisperPodForModel(m *kubeaiv1.Model, profile Mo
},
Spec: corev1.PodSpec{
NodeSelector: profile.NodeSelector,
- SecurityContext: r.ModelServerPods.ModelPodSecurityContext,
- ServiceAccountName: r.ModelServerPods.ModelServiceAccountName,
Affinity: profile.Affinity,
Tolerations: profile.Tolerations,
+ RuntimeClassName: profile.RuntimeClassName,
+ ServiceAccountName: r.ModelServerPods.ModelServiceAccountName,
+ SecurityContext: r.ModelServerPods.ModelPodSecurityContext,
Containers: []corev1.Container{
{
Name: "server",
@@ -656,10 +659,11 @@ func (r *ModelReconciler) infinityPodForModel(m *kubeaiv1.Model, profile ModelCo
Annotations: ann,
},
Spec: corev1.PodSpec{
- ServiceAccountName: r.ModelServerPods.ModelServiceAccountName,
NodeSelector: profile.NodeSelector,
Affinity: profile.Affinity,
Tolerations: profile.Tolerations,
+ RuntimeClassName: profile.RuntimeClassName,
+ ServiceAccountName: r.ModelServerPods.ModelServiceAccountName,
SecurityContext: r.ModelServerPods.ModelPodSecurityContext,
Containers: []corev1.Container{
{
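The new `RuntimeClassName` on the PodSpec only takes effect if a RuntimeClass with that name exists in the cluster. As a hedged sketch of the k3s scenario mentioned in the commit message — the `nvidia` name and handler below are assumptions about a typical k3s + NVIDIA container runtime setup, not something this commit creates:

```go
// Package example sketches the cluster-side object a resource profile's
// runtimeClassName must refer to.
package example

import (
	nodev1 "k8s.io/api/node/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// nvidiaRuntimeClass is a hypothetical RuntimeClass for a k3s node whose
// containerd is configured with an "nvidia" runtime handler; a profile
// would reference it with runtimeClassName: "nvidia".
var nvidiaRuntimeClass = nodev1.RuntimeClass{
	ObjectMeta: metav1.ObjectMeta{Name: "nvidia"},
	Handler:    "nvidia",
}
```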
13 changes: 13 additions & 0 deletions manifests/models/gemma-2b-it-tpu.yaml
@@ -0,0 +1,13 @@
# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
name: gemma-2b-it-tpu
spec:
features: [TextGeneration]
owner: google
url: hf://google/gemma-2b-it
engine: VLLM
args:
- --disable-log-requests
resourceProfile: google-tpu-v5e-1x1:1
18 changes: 18 additions & 0 deletions manifests/models/llama-3.1-8b-instruct-tpu.yaml
@@ -0,0 +1,18 @@
# Source: models/templates/models.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
name: llama-3.1-8b-instruct-tpu
spec:
features: [TextGeneration]
owner: meta-llama
url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct
engine: VLLM
args:
- --disable-log-requests
- --swap-space=8
- --tensor-parallel-size=4
- --num-scheduler-steps=4
- --max-model-len=8192
- --distributed-executor-backend=ray
resourceProfile: google-tpu-v5e-2x2:4
23 changes: 22 additions & 1 deletion test/integration/main_test.go
@@ -15,7 +15,10 @@ import (
"gocloud.dev/pubsub"
_ "gocloud.dev/pubsub/mempubsub"
corev1 "k8s.io/api/core/v1"
nodev1 "k8s.io/api/node/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/envtest"
@@ -33,7 +36,13 @@ var (
testNS = "default"
// testHTTPClient is a client with a long timeout for use in tests
// where requests may be held for long periods of time on purpose.
testHTTPClient = &http.Client{Timeout: 5 * time.Minute}
cpuRuntimeClass = nodev1.RuntimeClass{
ObjectMeta: metav1.ObjectMeta{
Name: cpuRuntimeClassName,
},
Handler: "my-cpu-runtime-handler",
}
)

// Messenger //
@@ -55,6 +64,7 @@ const (
resourceProfileNvidiaGPU = "nvidia-gpu-l4"
testVLLMDefualtImage = "default-vllm-image:v1.2.3"
testVLLMCPUImage = "cpu-vllm-image:v1.2.3"
cpuRuntimeClassName = "my-cpu-runtime-class"
)

// sysCfg returns the System configuration for testing.
@@ -103,6 +113,7 @@ func sysCfg() config.System {
Effect: corev1.TaintEffectNoSchedule,
},
},
RuntimeClassName: ptr.To(cpuRuntimeClassName),
Affinity: &corev1.Affinity{
NodeAffinity: &corev1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
@@ -159,6 +170,9 @@ func TestMain(m *testing.M) {
testK8sClient, err = client.New(k8sCfg, client.Options{Scheme: manager.Scheme})
requireNoError(err)

err = installCommonResources()
requireNoError(err)

// Setup messenger requests.
testRequestsTopic, err = pubsub.OpenTopic(testCtx, memRequestsURL)
requireNoError(err)
@@ -208,3 +222,10 @@ func requireNoError(err error) {
log.Fatal(err)
}
}

func installCommonResources() error {
if err := testK8sClient.Create(testCtx, &cpuRuntimeClass); err != nil {
return err
}
return nil
}
2 changes: 2 additions & 0 deletions test/integration/model_profiles_test.go
@@ -8,6 +8,7 @@ import (
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
)

@@ -44,6 +45,7 @@ func TestModelProfiles(t *testing.T) {
// The Pod should have a single container named "server".
container := mustFindPodContainerByName(t, pod, "server")
assert.Equal(t, expectedResources, container.Resources)
assert.Equal(t, ptr.To(cpuRuntimeClassName), pod.Spec.RuntimeClassName)
assert.Contains(t, pod.Spec.Tolerations, sysCfg().ResourceProfiles[resourceProfileCPU].Tolerations[0])
assert.Equal(t, sysCfg().ResourceProfiles[resourceProfileCPU].Affinity, pod.Spec.Affinity)
assert.Equal(t, sysCfg().ResourceProfiles[resourceProfileCPU].NodeSelector, pod.Spec.NodeSelector)
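The assertion on `pod.Spec.RuntimeClassName` compares two `*string` values. A small illustrative note on the `k8s.io/utils/ptr` helper used here: `ptr.To` returns a pointer to a copy of its argument, and testify's `assert.Equal` compares the pointed-to values, so two distinct pointers to equal strings pass the check.

```go
package main

import (
	"fmt"

	"k8s.io/utils/ptr"
)

func main() {
	want := ptr.To("my-cpu-runtime-class")
	got := ptr.To("my-cpu-runtime-class")
	// Distinct pointers, equal contents: this is what the test's
	// assert.Equal relies on (it uses reflect.DeepEqual under the hood).
	fmt.Println(want == got, *want == *got) // false true
}
```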
