diff --git a/docs/how-to/configure-resource-profiles.md b/docs/how-to/configure-resource-profiles.md index 1209aa74..601e4f27 100644 --- a/docs/how-to/configure-resource-profiles.md +++ b/docs/how-to/configure-resource-profiles.md @@ -36,6 +36,7 @@ resourceProfiles: custom.com/gpu: "1" cpu: "3" memory: "12Gi" + runtimeClassName: "my-custom-runtime-class" ``` If you need to run custom model server images on your resource profile, make sure to also add those in the `modelServers` section: diff --git a/internal/config/system.go b/internal/config/system.go index 4f550744..a789ad42 100644 --- a/internal/config/system.go +++ b/internal/config/system.go @@ -130,12 +130,13 @@ func (d *Duration) UnmarshalJSON(b []byte) error { } type ResourceProfile struct { - ImageName string `json:"imageName"` - Requests corev1.ResourceList `json:"requests,omitempty"` - Limits corev1.ResourceList `json:"limits,omitempty"` - NodeSelector map[string]string `json:"nodeSelector,omitempty"` - Affinity *corev1.Affinity `json:"affinity,omitempty"` - Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + ImageName string `json:"imageName"` + Requests corev1.ResourceList `json:"requests,omitempty"` + Limits corev1.ResourceList `json:"limits,omitempty"` + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + Affinity *corev1.Affinity `json:"affinity,omitempty"` + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + RuntimeClassName *string `json:"runtimeClassName,omitempty"` } type MessageStream struct { diff --git a/internal/modelcontroller/model_controller.go b/internal/modelcontroller/model_controller.go index 75601a02..e96b82d2 100644 --- a/internal/modelcontroller/model_controller.go +++ b/internal/modelcontroller/model_controller.go @@ -202,10 +202,11 @@ func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, profile ModelConfig }, Spec: corev1.PodSpec{ NodeSelector: profile.NodeSelector, - SecurityContext: r.ModelServerPods.ModelPodSecurityContext, - ServiceAccountName: r.ModelServerPods.ModelServiceAccountName, Affinity: profile.Affinity, Tolerations: profile.Tolerations, + RuntimeClassName: profile.RuntimeClassName, + ServiceAccountName: r.ModelServerPods.ModelServiceAccountName, + SecurityContext: r.ModelServerPods.ModelPodSecurityContext, Containers: []corev1.Container{ { Name: "server", @@ -355,10 +356,11 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, profile ModelConf }, Spec: corev1.PodSpec{ NodeSelector: profile.NodeSelector, - SecurityContext: r.ModelServerPods.ModelPodSecurityContext, - ServiceAccountName: r.ModelServerPods.ModelServiceAccountName, Affinity: profile.Affinity, Tolerations: profile.Tolerations, + RuntimeClassName: profile.RuntimeClassName, + ServiceAccountName: r.ModelServerPods.ModelServiceAccountName, + SecurityContext: r.ModelServerPods.ModelPodSecurityContext, Containers: []corev1.Container{ { Name: "server", @@ -503,10 +505,11 @@ func (r *ModelReconciler) fasterWhisperPodForModel(m *kubeaiv1.Model, profile Mo }, Spec: corev1.PodSpec{ NodeSelector: profile.NodeSelector, - SecurityContext: r.ModelServerPods.ModelPodSecurityContext, - ServiceAccountName: r.ModelServerPods.ModelServiceAccountName, Affinity: profile.Affinity, Tolerations: profile.Tolerations, + RuntimeClassName: profile.RuntimeClassName, + ServiceAccountName: r.ModelServerPods.ModelServiceAccountName, + SecurityContext: r.ModelServerPods.ModelPodSecurityContext, Containers: []corev1.Container{ { Name: "server", @@ -656,10 +659,11 @@ func (r *ModelReconciler) infinityPodForModel(m *kubeaiv1.Model, profile ModelCo Annotations: ann, }, Spec: corev1.PodSpec{ - ServiceAccountName: r.ModelServerPods.ModelServiceAccountName, NodeSelector: profile.NodeSelector, Affinity: profile.Affinity, Tolerations: profile.Tolerations, + RuntimeClassName: profile.RuntimeClassName, + ServiceAccountName: r.ModelServerPods.ModelServiceAccountName, SecurityContext: r.ModelServerPods.ModelPodSecurityContext, Containers: []corev1.Container{ { diff --git a/manifests/models/gemma-2b-it-tpu.yaml b/manifests/models/gemma-2b-it-tpu.yaml new file mode 100644 index 00000000..3a933b7e --- /dev/null +++ b/manifests/models/gemma-2b-it-tpu.yaml @@ -0,0 +1,13 @@ +# Source: models/templates/models.yaml +apiVersion: kubeai.org/v1 +kind: Model +metadata: + name: gemma-2b-it-tpu +spec: + features: [TextGeneration] + owner: google + url: hf://google/gemma-2b-it + engine: VLLM + args: + - --disable-log-requests + resourceProfile: google-tpu-v5e-1x1:1 diff --git a/manifests/models/llama-3.1-8b-instruct-tpu.yaml b/manifests/models/llama-3.1-8b-instruct-tpu.yaml new file mode 100644 index 00000000..9efee364 --- /dev/null +++ b/manifests/models/llama-3.1-8b-instruct-tpu.yaml @@ -0,0 +1,18 @@ +# Source: models/templates/models.yaml +apiVersion: kubeai.org/v1 +kind: Model +metadata: + name: llama-3.1-8b-instruct-tpu +spec: + features: [TextGeneration] + owner: meta-llama + url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct + engine: VLLM + args: + - --disable-log-requests + - --swap-space=8 + - --tensor-parallel-size=4 + - --num-scheduler-steps=4 + - --max-model-len=8192 + - --distributed-executor-backend=ray + resourceProfile: google-tpu-v5e-2x2:4 diff --git a/test/integration/main_test.go b/test/integration/main_test.go index 3475df1c..03887660 100644 --- a/test/integration/main_test.go +++ b/test/integration/main_test.go @@ -15,7 +15,10 @@ import ( "gocloud.dev/pubsub" _ "gocloud.dev/pubsub/mempubsub" corev1 "k8s.io/api/core/v1" + nodev1 "k8s.io/api/node/v1" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" @@ -33,7 +36,13 @@ var ( testNS = "default" // testHTTPClient is a client with a long timeout for use in tests // where requests may be held for long periods of time on purpose. - testHTTPClient = &http.Client{Timeout: 5 * time.Minute} + testHTTPClient = &http.Client{Timeout: 5 * time.Minute} + cpuRuntimeClass = nodev1.RuntimeClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: cpuRuntimeClassName, + }, + Handler: "my-cpu-runtime-handler", + } ) // Messenger // @@ -55,6 +64,7 @@ const ( resourceProfileNvidiaGPU = "nvidia-gpu-l4" testVLLMDefualtImage = "default-vllm-image:v1.2.3" testVLLMCPUImage = "cpu-vllm-image:v1.2.3" + cpuRuntimeClassName = "my-cpu-runtime-class" ) // sysCfg returns the System configuration for testing. @@ -103,6 +113,7 @@ func sysCfg() config.System { Effect: corev1.TaintEffectNoSchedule, }, }, + RuntimeClassName: ptr.To(cpuRuntimeClassName), Affinity: &corev1.Affinity{ NodeAffinity: &corev1.NodeAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ @@ -159,6 +170,9 @@ func TestMain(m *testing.M) { testK8sClient, err = client.New(k8sCfg, client.Options{Scheme: manager.Scheme}) requireNoError(err) + err = installCommonResources() + requireNoError(err) + // Setup messenger requests. testRequestsTopic, err = pubsub.OpenTopic(testCtx, memRequestsURL) requireNoError(err) @@ -208,3 +222,10 @@ func requireNoError(err error) { log.Fatal(err) } } + +func installCommonResources() error { + if err := testK8sClient.Create(testCtx, &cpuRuntimeClass); err != nil { + return err + } + return nil +} diff --git a/test/integration/model_profiles_test.go b/test/integration/model_profiles_test.go index 97083ce4..3f200fd5 100644 --- a/test/integration/model_profiles_test.go +++ b/test/integration/model_profiles_test.go @@ -8,6 +8,7 @@ import ( "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -44,6 +45,7 @@ func TestModelProfiles(t *testing.T) { // The Pod should have a single container named "server". container := mustFindPodContainerByName(t, pod, "server") assert.Equal(t, expectedResources, container.Resources) + assert.Equal(t, ptr.To(cpuRuntimeClassName), pod.Spec.RuntimeClassName) assert.Contains(t, pod.Spec.Tolerations, sysCfg().ResourceProfiles[resourceProfileCPU].Tolerations[0]) assert.Equal(t, sysCfg().ResourceProfiles[resourceProfileCPU].Affinity, pod.Spec.Affinity) assert.Equal(t, sysCfg().ResourceProfiles[resourceProfileCPU].NodeSelector, pod.Spec.NodeSelector)