scheduler: optimize GPU allocate logic #2221

Merged
102 changes: 100 additions & 2 deletions apis/extension/device_share.go
@@ -18,8 +18,10 @@ package extension

import (
"encoding/json"
"fmt"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
@@ -32,6 +34,10 @@ const (
AnnotationDeviceAllocateHint = SchedulingDomainPrefix + "/device-allocate-hint"
// AnnotationDeviceJointAllocate guides the scheduler to jointly allocate devices
AnnotationDeviceJointAllocate = SchedulingDomainPrefix + "/device-joint-allocate"
// AnnotationGPUPartitionSpec represents the GPU partition spec that the pod requests
AnnotationGPUPartitionSpec = SchedulingDomainPrefix + "/gpu-partition-spec"
// AnnotationGPUPartitions represents the GPU partitions supported on the node
AnnotationGPUPartitions = SchedulingDomainPrefix + "/gpu-partitions"
)

const (
@@ -48,8 +54,9 @@ const (
)

const (
LabelGPUModel string = NodeDomainPrefix + "/gpu-model"
LabelGPUDriverVersion string = NodeDomainPrefix + "/gpu-driver-version"
LabelGPUPartitionPolicy string = NodeDomainPrefix + "/gpu-partition-policy"
LabelGPUModel string = NodeDomainPrefix + "/gpu-model"
LabelGPUDriverVersion string = NodeDomainPrefix + "/gpu-driver-version"
)

// DeviceAllocations would be injected into the Pod in the form of an annotation during the Pre-bind stage.
@@ -132,10 +139,19 @@ const (
type DeviceTopologyScope string

const (
DeviceTopologyScopeDevice DeviceTopologyScope = "Device"
DeviceTopologyScopePCIe DeviceTopologyScope = "PCIe"
DeviceTopologyScopeNUMANode DeviceTopologyScope = "NUMANode"
DeviceTopologyScopeNode DeviceTopologyScope = "Node"
)

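// DeviceTopologyScopeLevel ranks scopes from widest (Node=1) to narrowest (Device=4);
// a higher value means a tighter topology boundary.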
var DeviceTopologyScopeLevel = map[DeviceTopologyScope]int{
DeviceTopologyScopeDevice: 4,
DeviceTopologyScopePCIe: 3,
DeviceTopologyScopeNUMANode: 2,
DeviceTopologyScopeNode: 1,
}

type DeviceExclusivePolicy string

const (
@@ -145,6 +161,47 @@ const (
PCIExpressLevelDeviceExclusivePolicy DeviceExclusivePolicy = "PCIeLevel"
)

type GPUPartitionSpec struct {
AllocatePolicy GPUPartitionAllocatePolicy `json:"allocatePolicy,omitempty"`
RingBusBandwidth *resource.Quantity `json:"ringBusBandwidth,omitempty"`
}

type GPUPartitionAllocatePolicy string

const (
// GPUPartitionAllocatePolicyRestricted indicates that only partitions with the highest allocationScore will be considered.
GPUPartitionAllocatePolicyRestricted GPUPartitionAllocatePolicy = "Restricted"
// GPUPartitionAllocatePolicyBestEffort indicates that the scheduler tries its best to allocate a partition with a higher allocationScore.
GPUPartitionAllocatePolicyBestEffort GPUPartitionAllocatePolicy = "BestEffort"
)

type GPULinkType string

const (
GPUNVLink GPULinkType = "NVLink"
)

type GPUPartition struct {
Minors []int `json:"minors"`
GPULinkType GPULinkType `json:"gpuLinkType,omitempty"`
RingBusBandwidth *resource.Quantity `json:"ringBusBandwidth,omitempty"`
AllocationScore int `json:"allocationScore,omitempty"`
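// The two fields below are tagged json:"-": they are derived at runtime and never serialized into the annotation.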
MinorsHash int `json:"-"`
BinPackScore int `json:"-"`
}

// GPUPartitionTable will be annotated on the Device CR
type GPUPartitionTable map[int][]GPUPartition

type GPUPartitionPolicy string

const (
// GPUPartitionPolicyHonor indicates that the partitions annotated on the Device CR should be honored.
GPUPartitionPolicyHonor GPUPartitionPolicy = "Honor"
// GPUPartitionPolicyPrefer indicates that the partitions annotated on the Device CR are preferred.
GPUPartitionPolicyPrefer GPUPartitionPolicy = "Prefer"
)

func GetDeviceAllocations(podAnnotations map[string]string) (DeviceAllocations, error) {
deviceAllocations := DeviceAllocations{}
data, ok := podAnnotations[AnnotationDeviceAllocated]
@@ -234,3 +291,44 @@ func GetDeviceJointAllocate(annotations map[string]string) (*DeviceJointAllocate
}
return &jointAllocate, nil
}

func GetGPUPartitionSpec(annotations map[string]string) (*GPUPartitionSpec, error) {
val, ok := annotations[AnnotationGPUPartitionSpec]
if !ok {
return nil, nil
}
var spec GPUPartitionSpec
err := json.Unmarshal([]byte(val), &spec)
if err != nil {
return nil, err
}
if spec.AllocatePolicy == "" {
spec.AllocatePolicy = GPUPartitionAllocatePolicyBestEffort
}
return &spec, nil
}

func GetGPUPartitionTable(device *schedulingv1alpha1.Device) (GPUPartitionTable, error) {
if rawGPUPartitionTable, ok := device.Annotations[AnnotationGPUPartitions]; ok && rawGPUPartitionTable != "" {
gpuPartitionTable := GPUPartitionTable{}
err := json.Unmarshal([]byte(rawGPUPartitionTable), &gpuPartitionTable)
if err != nil {
return nil, err
}
if gpuPartitionTable == nil {
return nil, fmt.Errorf("invalid gpu partitions in device cr: %s", rawGPUPartitionTable)
}
return gpuPartitionTable, nil
}
return nil, nil
}

func GetGPUPartitionPolicy(device *schedulingv1alpha1.Device) GPUPartitionPolicy {
if device == nil {
return GPUPartitionPolicyPrefer
}
if allocatePolicy := device.Labels[LabelGPUPartitionPolicy]; GPUPartitionPolicy(allocatePolicy) == GPUPartitionPolicyHonor {
return GPUPartitionPolicyHonor
}
return GPUPartitionPolicyPrefer
}
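
Taken together, the new helpers round-trip the partition contract: a pod asks for a partition shape through the `gpu-partition-spec` annotation, the node's Device CR advertises its supported partitions through `gpu-partitions`, and the `gpu-partition-policy` label controls how strictly that table is honored. The sketch below shows how a caller might exercise the three getters; the pod and Device fixtures are hypothetical and not taken from this PR.

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/koordinator-sh/koordinator/apis/extension"
	schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
)

func main() {
	// Hypothetical pod annotations: request a Restricted partition with 200Gi ring-bus bandwidth.
	podAnnotations := map[string]string{
		extension.AnnotationGPUPartitionSpec: `{"allocatePolicy":"Restricted","ringBusBandwidth":"200Gi"}`,
	}
	spec, err := extension.GetGPUPartitionSpec(podAnnotations)
	if err != nil {
		panic(err)
	}
	fmt.Println(spec.AllocatePolicy) // "Restricted"

	// Hypothetical Device CR: two NVLink-connected GPUs form one partition under key 0,
	// and the Honor label tells the scheduler the table must be respected.
	device := &schedulingv1alpha1.Device{
		ObjectMeta: metav1.ObjectMeta{
			Labels: map[string]string{
				extension.LabelGPUPartitionPolicy: string(extension.GPUPartitionPolicyHonor),
			},
			Annotations: map[string]string{
				extension.AnnotationGPUPartitions: `{"0":[{"minors":[0,1],"gpuLinkType":"NVLink","ringBusBandwidth":"200Gi","allocationScore":10}]}`,
			},
		},
	}
	table, err := extension.GetGPUPartitionTable(device)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(table[0]), extension.GetGPUPartitionPolicy(device)) // 1 Honor
}
```

Note that GetGPUPartitionSpec defaults an empty allocatePolicy to BestEffort, so callers never observe an unset policy.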
198 changes: 198 additions & 0 deletions apis/extension/device_share_test.go
@@ -17,6 +17,7 @@ limitations under the License.
package extension

import (
"fmt"
"testing"

"github.com/stretchr/testify/assert"
@@ -27,6 +28,10 @@ import (
schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
)

var (
bandWidthOf200Gi = resource.MustParse("200Gi")
)

func Test_GetDeviceAllocations(t *testing.T) {
tests := []struct {
name string
@@ -138,3 +143,196 @@ func Test_SetDeviceAllocations(t *testing.T) {
})
}
}

func TestGetGPUPartitionSpec(t *testing.T) {
type args struct {
annotations map[string]string
}
tests := []struct {
name string
args args
want *GPUPartitionSpec
wantErr assert.ErrorAssertionFunc
}{
{
name: "nil partitionSpec",
args: args{
annotations: nil,
},
want: nil,
wantErr: assert.NoError,
},
{
name: "empty partitionSpec",
args: args{
annotations: map[string]string{
AnnotationGPUPartitionSpec: `{}`,
},
},
want: &GPUPartitionSpec{
AllocatePolicy: GPUPartitionAllocatePolicyBestEffort,
},
wantErr: assert.NoError,
},
{
name: "allocatePolicy BestEffort",
args: args{
annotations: map[string]string{
AnnotationGPUPartitionSpec: `{"allocatePolicy":"BestEffort"}`,
},
},
want: &GPUPartitionSpec{
AllocatePolicy: GPUPartitionAllocatePolicyBestEffort,
},
wantErr: assert.NoError,
},
{
name: "allocatePolicy Restricted",
args: args{
annotations: map[string]string{
AnnotationGPUPartitionSpec: `{"allocatePolicy":"Restricted"}`,
},
},
want: &GPUPartitionSpec{
AllocatePolicy: GPUPartitionAllocatePolicyRestricted,
},
wantErr: assert.NoError,
},
{
name: "allocatePolicy Restricted, ringAllReduceBandwidth specified",
args: args{
annotations: map[string]string{
AnnotationGPUPartitionSpec: `{"allocatePolicy":"Restricted", "ringBusBandwidth":"200Gi"}`,
},
},
want: &GPUPartitionSpec{
AllocatePolicy: GPUPartitionAllocatePolicyRestricted,
RingBusBandwidth: &bandWidthOf200Gi,
},
wantErr: assert.NoError,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := GetGPUPartitionSpec(tt.args.annotations)
if !tt.wantErr(t, err, fmt.Sprintf("GetGPUPartitionSpec(%v)", tt.args.annotations)) {
return
}
assert.Equalf(t, tt.want, got, "GetGPUPartitionSpec(%v)", tt.args.annotations)
})
}
}

func TestGetGPUPartitionTable(t *testing.T) {
tests := []struct {
name string
device *schedulingv1alpha1.Device
want GPUPartitionTable
wantErr assert.ErrorAssertionFunc
}{
{
name: "Valid GPU Partition Table",
device: &schedulingv1alpha1.Device{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
AnnotationGPUPartitions: `{"0": [{"minors": [0,1], "gpuLinkType": "NVLink","ringBusBandwidth": "200Gi", "allocationScore": 10}]}`,
},
},
},
want: GPUPartitionTable{
0: []GPUPartition{
{
Minors: []int{0, 1},
GPULinkType: GPUNVLink,
RingBusBandwidth: &bandWidthOf200Gi,
AllocationScore: 10,
MinorsHash: 0, // tagged json:"-", so never unmarshaled; computed at runtime.
BinPackScore: 0, // tagged json:"-", so never unmarshaled; computed at runtime.
},
},
},
wantErr: assert.NoError,
},
{
name: "No Annotation",
device: &schedulingv1alpha1.Device{},
want: nil,
wantErr: assert.NoError,
},
{
name: "Invalid JSON",
device: &schedulingv1alpha1.Device{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
AnnotationGPUPartitions: `Invalid JSON format`,
},
},
},
want: nil,
wantErr: assert.Error,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := GetGPUPartitionTable(tt.device)
if !tt.wantErr(t, err, fmt.Sprintf("GetGPUPartitionTable(%v)", tt.device)) {
return
}
assert.Equalf(t, tt.want, got, "GetGPUPartitionTable(%v)", tt.device)
})
}
}

// TestGetNodeLevelGPUAllocatePolicy tests the GetGPUPartitionPolicy function.
func TestGetNodeLevelGPUAllocatePolicy(t *testing.T) {
tests := []struct {
name string
node *schedulingv1alpha1.Device
expected GPUPartitionPolicy
}{
{
name: "Nil node",
node: nil,
expected: GPUPartitionPolicyPrefer,
},
{
name: "Node with Honor policy",
node: &schedulingv1alpha1.Device{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
LabelGPUPartitionPolicy: string(GPUPartitionPolicyHonor),
},
},
},
expected: GPUPartitionPolicyHonor,
},
{
name: "Node with Prefer policy",
node: &schedulingv1alpha1.Device{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
LabelGPUPartitionPolicy: string(GPUPartitionPolicyPrefer),
},
},
},
expected: GPUPartitionPolicyPrefer,
},
{
name: "Node without policy label",
node: &schedulingv1alpha1.Device{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{},
},
},
expected: GPUPartitionPolicyPrefer,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := GetGPUPartitionPolicy(tt.node); got != tt.expected {
t.Errorf("GetGPUPartitionPolicy() = %v, want %v", got, tt.expected)
}
})
}
}
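
The tests above cover only the read path. For the write path, here is a hedged sketch of how a component might build and serialize a partition table for the Device annotation; whether koordinator's device reporter does exactly this is not shown in this PR, so treat the fixture as an assumption.

```go
package main

import (
	"encoding/json"
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"

	"github.com/koordinator-sh/koordinator/apis/extension"
)

func main() {
	bw := resource.MustParse("200Gi")
	table := extension.GPUPartitionTable{
		// Key semantics follow the test fixture above; MinorsHash and BinPackScore
		// carry json:"-" tags, so they never reach the annotation.
		0: []extension.GPUPartition{
			{Minors: []int{0, 1}, GPULinkType: extension.GPUNVLink, RingBusBandwidth: &bw, AllocationScore: 10},
		},
	}
	raw, err := json.Marshal(table)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(raw))
	// {"0":[{"minors":[0,1],"gpuLinkType":"NVLink","ringBusBandwidth":"200Gi","allocationScore":10}]}
}
```

Because MinorsHash and BinPackScore are excluded from serialization, the annotation stays a stable, minimal contract between the reporter and the scheduler.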