diff --git a/apis/extension/device_share.go b/apis/extension/device_share.go
index d4653a8aef..015a7e3230 100644
--- a/apis/extension/device_share.go
+++ b/apis/extension/device_share.go
@@ -18,8 +18,10 @@ package extension
 
 import (
 	"encoding/json"
+	"fmt"
 
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
@@ -32,6 +34,10 @@ const (
 	AnnotationDeviceAllocateHint = SchedulingDomainPrefix + "/device-allocate-hint"
 	// AnnotationDeviceJointAllocate guides the scheduler joint-allocates devices
 	AnnotationDeviceJointAllocate = SchedulingDomainPrefix + "/device-joint-allocate"
+	// AnnotationGPUPartitionSpec represents the GPU partition spec that the pod requests
+	AnnotationGPUPartitionSpec = SchedulingDomainPrefix + "/gpu-partition-spec"
+	// AnnotationGPUPartitions represents the GPU partitions supported on the node
+	AnnotationGPUPartitions = SchedulingDomainPrefix + "/gpu-partitions"
 )
 
 const (
@@ -48,8 +54,9 @@ const (
 )
 
 const (
-	LabelGPUModel         string = NodeDomainPrefix + "/gpu-model"
-	LabelGPUDriverVersion string = NodeDomainPrefix + "/gpu-driver-version"
+	LabelGPUPartitionPolicy string = NodeDomainPrefix + "/gpu-partition-policy"
+	LabelGPUModel           string = NodeDomainPrefix + "/gpu-model"
+	LabelGPUDriverVersion   string = NodeDomainPrefix + "/gpu-driver-version"
 )
 
 // DeviceAllocations would be injected into Pod as form of annotation during Pre-bind stage.
@@ -132,10 +139,19 @@ const (
 type DeviceTopologyScope string
 
 const (
+	DeviceTopologyScopeDevice   DeviceTopologyScope = "Device"
 	DeviceTopologyScopePCIe     DeviceTopologyScope = "PCIe"
 	DeviceTopologyScopeNUMANode DeviceTopologyScope = "NUMANode"
+	DeviceTopologyScopeNode     DeviceTopologyScope = "Node"
 )
 
+var DeviceTopologyScopeLevel = map[DeviceTopologyScope]int{
+	DeviceTopologyScopeDevice:   4,
+	DeviceTopologyScopePCIe:     3,
+	DeviceTopologyScopeNUMANode: 2,
+	DeviceTopologyScopeNode:     1,
+}
+
 type DeviceExclusivePolicy string
 
 const (
@@ -145,6 +161,47 @@ const (
 	PCIExpressLevelDeviceExclusivePolicy DeviceExclusivePolicy = "PCIeLevel"
 )
 
+type GPUPartitionSpec struct {
+	AllocatePolicy   GPUPartitionAllocatePolicy `json:"allocatePolicy,omitempty"`
+	RingBusBandwidth *resource.Quantity         `json:"ringBusBandwidth,omitempty"`
+}
+
+type GPUPartitionAllocatePolicy string
+
+const (
+	// GPUPartitionAllocatePolicyRestricted indicates that only the partitions with the highest allocationScore will be considered.
+	GPUPartitionAllocatePolicyRestricted GPUPartitionAllocatePolicy = "Restricted"
+	// GPUPartitionAllocatePolicyBestEffort indicates that the scheduler tries its best to allocate a partition with a higher allocationScore.
+	GPUPartitionAllocatePolicyBestEffort GPUPartitionAllocatePolicy = "BestEffort"
+)
+
+type GPULinkType string
+
+const (
+	GPUNVLink GPULinkType = "NVLink"
+)
+
+type GPUPartition struct {
+	Minors           []int              `json:"minors"`
+	GPULinkType      GPULinkType        `json:"gpuLinkType,omitempty"`
+	RingBusBandwidth *resource.Quantity `json:"ringBusBandwidth,omitempty"`
+	AllocationScore  int                `json:"allocationScore,omitempty"`
+	MinorsHash       int                `json:"-"`
+	BinPackScore     int                `json:"-"`
+}
+
+// GPUPartitionTable will be annotated on Device
+type GPUPartitionTable map[int][]GPUPartition
+
+type GPUPartitionPolicy string
+
+const (
+	// GPUPartitionPolicyHonor indicates that the partitions annotated to the Device CR should be honored.
+	GPUPartitionPolicyHonor GPUPartitionPolicy = "Honor"
+	// GPUPartitionPolicyPrefer indicates that the partitions annotated to the Device CR are preferred.
+	GPUPartitionPolicyPrefer GPUPartitionPolicy = "Prefer"
+)
+
 func GetDeviceAllocations(podAnnotations map[string]string) (DeviceAllocations, error) {
 	deviceAllocations := DeviceAllocations{}
 	data, ok := podAnnotations[AnnotationDeviceAllocated]
@@ -234,3 +291,44 @@ func GetDeviceJointAllocate(annotations map[string]string) (*DeviceJointAllocate
 	}
 	return &jointAllocate, nil
 }
+
+func GetGPUPartitionSpec(annotations map[string]string) (*GPUPartitionSpec, error) {
+	val, ok := annotations[AnnotationGPUPartitionSpec]
+	if !ok {
+		return nil, nil
+	}
+	var spec GPUPartitionSpec
+	err := json.Unmarshal([]byte(val), &spec)
+	if err != nil {
+		return nil, err
+	}
+	if spec.AllocatePolicy == "" {
+		spec.AllocatePolicy = GPUPartitionAllocatePolicyBestEffort
+	}
+	return &spec, nil
+}
+
+func GetGPUPartitionTable(device *schedulingv1alpha1.Device) (GPUPartitionTable, error) {
+	if rawGPUPartitionTable, ok := device.Annotations[AnnotationGPUPartitions]; ok && rawGPUPartitionTable != "" {
+		gpuPartitionTable := GPUPartitionTable{}
+		err := json.Unmarshal([]byte(rawGPUPartitionTable), &gpuPartitionTable)
+		if err != nil {
+			return nil, err
+		}
+		if gpuPartitionTable == nil {
+			return nil, fmt.Errorf("invalid gpu partitions in device cr: %s", rawGPUPartitionTable)
+		}
+		return gpuPartitionTable, nil
+	}
+	return nil, nil
+}
+
+func GetGPUPartitionPolicy(device *schedulingv1alpha1.Device) GPUPartitionPolicy {
+	if device == nil {
+		return GPUPartitionPolicyPrefer
+	}
+	if allocatePolicy := device.Labels[LabelGPUPartitionPolicy]; GPUPartitionPolicy(allocatePolicy) == GPUPartitionPolicyHonor {
+		return GPUPartitionPolicyHonor
+	}
+	return GPUPartitionPolicyPrefer
+}
diff --git a/apis/extension/device_share_test.go b/apis/extension/device_share_test.go
index f94b0750dc..2001ec828b 100644
--- a/apis/extension/device_share_test.go
+++ b/apis/extension/device_share_test.go
@@ -17,6 +17,7 @@ limitations under the License.
 package extension
 
 import (
+	"fmt"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
@@ -27,6 +28,10 @@ import (
 	schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
 )
 
+var (
+	bandWidthOf200Gi = resource.MustParse("200Gi")
+)
+
 func Test_GetDeviceAllocations(t *testing.T) {
 	tests := []struct {
 		name string
@@ -138,3 +143,196 @@ func Test_SetDeviceAllocations(t *testing.T) {
 		})
 	}
 }
+
+func TestGetGPUPartitionSpec(t *testing.T) {
+	type args struct {
+		annotations map[string]string
+	}
+	tests := []struct {
+		name    string
+		args    args
+		want    *GPUPartitionSpec
+		wantErr assert.ErrorAssertionFunc
+	}{
+		{
+			name: "nil partitionSpec",
+			args: args{
+				annotations: nil,
+			},
+			want:    nil,
+			wantErr: assert.NoError,
+		},
+		{
+			name: "empty partitionSpec",
+			args: args{
+				annotations: map[string]string{
+					AnnotationGPUPartitionSpec: `{}`,
+				},
+			},
+			want: &GPUPartitionSpec{
+				AllocatePolicy: GPUPartitionAllocatePolicyBestEffort,
+			},
+			wantErr: assert.NoError,
+		},
+		{
+			name: "allocatePolicy BestEffort",
+			args: args{
+				annotations: map[string]string{
+					AnnotationGPUPartitionSpec: `{"allocatePolicy":"BestEffort"}`,
+				},
+			},
+			want: &GPUPartitionSpec{
+				AllocatePolicy: GPUPartitionAllocatePolicyBestEffort,
+			},
+			wantErr: assert.NoError,
+		},
+		{
+			name: "allocatePolicy Restricted",
+			args: args{
+				annotations: map[string]string{
+					AnnotationGPUPartitionSpec: `{"allocatePolicy":"Restricted"}`,
+				},
+			},
+			want: &GPUPartitionSpec{
+				AllocatePolicy: GPUPartitionAllocatePolicyRestricted,
+			},
+			wantErr: assert.NoError,
+		},
+		{
+			name: "allocatePolicy Restricted, ringBusBandwidth specified",
+			args: args{
+				annotations: map[string]string{
+					AnnotationGPUPartitionSpec: `{"allocatePolicy":"Restricted", "ringBusBandwidth":"200Gi"}`,
+				},
+			},
+			want: &GPUPartitionSpec{
+				AllocatePolicy:   GPUPartitionAllocatePolicyRestricted,
+				RingBusBandwidth: &bandWidthOf200Gi,
+			},
+			wantErr: assert.NoError,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := GetGPUPartitionSpec(tt.args.annotations)
+			if !tt.wantErr(t, err, fmt.Sprintf("GetGPUPartitionSpec(%v)", tt.args.annotations)) {
+				return
+			}
+			assert.Equalf(t, tt.want, got, "GetGPUPartitionSpec(%v)", tt.args.annotations)
+		})
+	}
+}
+
+func TestGetGPUPartitionTable(t *testing.T) {
+	tests := []struct {
+		name    string
+		device  *schedulingv1alpha1.Device
+		want    GPUPartitionTable
+		wantErr assert.ErrorAssertionFunc
+	}{
+		{
+			name: "Valid GPU Partition Table",
+			device: &schedulingv1alpha1.Device{
+				ObjectMeta: metav1.ObjectMeta{
+					Annotations: map[string]string{
+						AnnotationGPUPartitions: `{"0": [{"minors": [0,1], "gpuLinkType": "NVLink","ringBusBandwidth": "200Gi", "allocationScore": 10}]}`,
+					},
+				},
+			},
+			want: GPUPartitionTable{
+				0: []GPUPartition{
+					{
+						Minors:           []int{0, 1},
+						GPULinkType:      GPUNVLink,
+						RingBusBandwidth: &bandWidthOf200Gi,
+						AllocationScore:  10,
+						MinorsHash:       0, // This would be calculated in the actual implementation.
+						BinPackScore:     0, // This would also be calculated in the actual implementation if needed.
+					},
+				},
+			},
+			wantErr: assert.NoError,
+		},
+		{
+			name:    "No Annotation",
+			device:  &schedulingv1alpha1.Device{},
+			want:    nil,
+			wantErr: assert.NoError,
+		},
+		{
+			name: "Invalid JSON",
+			device: &schedulingv1alpha1.Device{
+				ObjectMeta: metav1.ObjectMeta{
+					Annotations: map[string]string{
+						AnnotationGPUPartitions: `Invalid JSON format`,
+					},
+				},
+			},
+			want:    nil,
+			wantErr: assert.Error,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := GetGPUPartitionTable(tt.device)
+			if !tt.wantErr(t, err, fmt.Sprintf("GetGPUPartitionTable(%v)", tt.device)) {
+				return
+			}
+			assert.Equalf(t, tt.want, got, "GetGPUPartitionTable(%v)", tt.device)
+		})
+	}
+}
+
+// TestGetNodeLevelGPUAllocatePolicy tests the GetGPUPartitionPolicy function.
+func TestGetNodeLevelGPUAllocatePolicy(t *testing.T) {
+	tests := []struct {
+		name     string
+		node     *schedulingv1alpha1.Device
+		expected GPUPartitionPolicy
+	}{
+		{
+			name:     "Nil node",
+			node:     nil,
+			expected: GPUPartitionPolicyPrefer,
+		},
+		{
+			name: "Node with Honor policy",
+			node: &schedulingv1alpha1.Device{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						LabelGPUPartitionPolicy: string(GPUPartitionPolicyHonor),
+					},
+				},
+			},
+			expected: GPUPartitionPolicyHonor,
+		},
+		{
+			name: "Node with Prefer policy",
+			node: &schedulingv1alpha1.Device{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						LabelGPUPartitionPolicy: string(GPUPartitionPolicyPrefer),
+					},
+				},
+			},
+			expected: GPUPartitionPolicyPrefer,
+		},
+		{
+			name: "Node without policy label",
+			node: &schedulingv1alpha1.Device{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{},
+				},
+			},
+			expected: GPUPartitionPolicyPrefer,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := GetGPUPartitionPolicy(tt.node); got != tt.expected {
+				t.Errorf("GetGPUPartitionPolicy() = %v, want %v", got, tt.expected)
+			}
+		})
+	}
+}
diff --git a/docs/proposals/scheduling/20241008-gpu-partition-api.md b/docs/proposals/scheduling/20241008-gpu-partition-api.md
new file mode 100644
index 0000000000..a5426fabb5
--- /dev/null
+++ b/docs/proposals/scheduling/20241008-gpu-partition-api.md
@@ -0,0 +1,234 @@
+---
+title: GPU Partitioning APIs
+authors:
+- "@ZiMengSheng"
+reviewers:
+- "@hormes"
+- "@songtao98"
+- "@saintube"
+creation-date: 2024-10-08
+last-updated: 2024-10-08
+status: provisional
+---
+
+# GPU Partitioning APIs
+
+## Summary
+This proposal outlines an enhancement to the GPU scheduling capabilities of Koordinator, focusing on NVIDIA GPUs operating under SharedNVSwitch mode. The primary objective is to introduce functionality that allows Pods to request specific GPU partitions based on predefined configurations (Partitions).
+
+## Motivation
+In virtualized environments, when NVIDIA FabricManager operates in SharedNVSwitch mode, NVIDIA imposes, for security reasons, certain requirements on the GPU configurations that can be allocated to a single VM, allowing only a few specific combinations of GPUs. NVIDIA refers to such a combination of GPUs as a Partition, and to a table consisting of several such Partitions as a Partition Table.
+
+The scheduler in Koordinator is responsible for selecting GPUs for Pods. This PR expands the existing GPU scheduling capabilities of Koordinator, enabling it to recognize the partition configurations of specific machines and user requirements regarding GPU partitioning.
+
+### Goals
+- Provide the API for a Pod to request a specific GPU Partition.
+- Allow nodes to report the permitted Partitions.
+
+### Non-Goals/Future Work
+- Describe what the Partition Table looks like for a specific GPU model.
+
+## User Story
+
+Typically, the rules for GPU partitioning are determined by the specific GPU model or system configuration, and may also be influenced by how the GPUs are configured on each individual node. The scheduler has no insight into the specifics of hardware models or GPU types; instead, it relies on node-level components to report these Partition Rules on the Device Custom Resource (CR) as follows:
+
+```yaml
+apiVersion: scheduling.koordinator.sh/v1alpha1
+kind: Device
+metadata:
+  annotations:
+    scheduling.koordinator.sh/gpu-partitions: |
+      {
+        "1": [
+          {
+            # Which GPUs are included
+            "minors": [
+              0
+            ],
+            # GPU Interconnect Type
+            "gpuLinkType": "NVLink",
+            # Here we take the bottleneck bandwidth between GPUs in the Ring algorithm. BusBandwidth can be referenced from https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md
+            "ringBusBandwidth": "400Gi",
+            # Indicate the overall allocation quality for the node after the partition has been assigned away.
+            "allocationScore": 1
+          },
+          ...
+        ],
+        "2": [
+          ...
+        ],
+        "4": [
+          ...
+        ],
+        "8": [
+          ...
+        ]
+      }
+  labels:
+    node.koordinator.sh/gpu-partition-policy: "Honor"
+  name: node-1
+```
+
+Users can specify the desired GPU partition requirements at the Pod level:
+
+```yaml
+kind: Pod
+metadata:
+  name: hello-gpu
+  annotations:
+    scheduling.koordinator.sh/gpu-partition-spec: |
+      {
+        "allocatePolicy": "Restricted",
+        "ringBusBandwidth": "200Gi"
+      }
+spec:
+  containers:
+  - name: main
+    resources:
+      limits:
+        nvidia.com/gpu: 1
+```
+
+## Proposal
+
+### GPUPartitionTable
+
+Before we proceed, let's define the term "Partition" step by step. A Partition refers to a combination of GPUs that can be allocated to a user, with the following attributes:
+
+```yaml
+{
+  # Which GPUs are included
+  "minors": [
+    0
+  ],
+  # GPU Interconnect Type
+  "gpuLinkType": "NVLink",
+  # GPU Interconnect Bandwidth
+  "ringBusBandwidth": "400Gi"
+}
+```
+
+Additionally, selecting a Partition inherently means forgoing potentially better alternatives, so there is a trade-off among Partitions. By examining the Partition Table, we can quantify the quality of an allocation by calculating the maximum number of GPUs and the greatest bandwidth still available in the remaining Partitions after one has been assigned. When no Partitions have been allocated yet, this ordering of allocation quality, which reflects the priority or desirability of each Partition, can be pre-established and attached to the Partition itself. We refer to this attribute as the AllocationScore.
+
+```yaml
+{
+  # Which GPUs are included
+  "minors": [
+    0
+  ],
+  # GPU Interconnect Type
+  "gpuLinkType": "NVLink",
+  # Ring All Reduce Bandwidth
+  "ringBusBandwidth": "400Gi",
+  # Indicate the overall allocation quality for the node after the partition has been assigned away.
+  "allocationScore": 1
+}
+```
+
+Combining all permitted Partitions yields a Partition Table. The key of the table is an integer indicating how many GPUs each Partition under that key contains.
+
+```yaml
+{
+  "1": [
+    {
+      # Which GPUs are included
+      "minors": [
+        0
+      ],
+      # GPU Interconnect Type
+      "gpuLinkType": "NVLink",
+      # GPU Interconnect Bandwidth
+      "ringBusBandwidth": "400Gi",
+      # Indicate the overall allocation quality for the node after the partition has been assigned away.
+      "allocationScore": 1
+    },
+    ...
+  ],
+  "2": [
+    ...
+  ],
+  "4": [
+    ...
+  ],
+  "8": [
+    ...
+  ]
+}
+```
+
+Finally, when the AllocationScores of Partitions are equal, an allocation with the least fragmentation should be produced based on the current allocation situation. This calculation is performed during the actual allocation process within the scheduler.
+
+The GPUPartitionTable structure is defined as follows:
+
+```go
+const (
+	// AnnotationGPUPartitions represents the GPU partitions supported on the node
+	AnnotationGPUPartitions = SchedulingDomainPrefix + "/gpu-partitions"
+)
+
+type GPULinkType string
+
+const (
+	GPUNVLink GPULinkType = "NVLink"
+)
+
+type GPUPartition struct {
+	Minors           []int              `json:"minors"`
+	GPULinkType      GPULinkType        `json:"gpuLinkType,omitempty"`
+	RingBusBandwidth *resource.Quantity `json:"ringBusBandwidth,omitempty"`
+	AllocationScore  int                `json:"allocationScore,omitempty"`
+}
+
+// GPUPartitionTable will be annotated on Device
+type GPUPartitionTable map[int][]GPUPartition
+```
+
+### GPU Partition Policy
+
+GPU Partition Policy indicates whether the Partitions annotated to the Device CR must be honored.
+
+```go
+const (
+	LabelGPUPartitionPolicy string = NodeDomainPrefix + "/gpu-partition-policy"
+)
+
+type GPUPartitionPolicy string
+
+const (
+	// GPUPartitionPolicyHonor indicates that the partitions annotated to the Device CR should be honored.
+	GPUPartitionPolicyHonor GPUPartitionPolicy = "Honor"
+	// GPUPartitionPolicyPrefer indicates that the partitions annotated to the Device CR are preferred.
+	GPUPartitionPolicyPrefer GPUPartitionPolicy = "Prefer"
+)
+```
+
+### GPUPartitionSpec API
+
+The GPUPartitionSpec structure is defined as follows:
+
+```go
+const (
+	// AnnotationGPUPartitionSpec represents the GPU partition spec that the pod requests
+	AnnotationGPUPartitionSpec = SchedulingDomainPrefix + "/gpu-partition-spec"
+)
+
+type GPUPartitionSpec struct {
+	AllocatePolicy   GPUPartitionAllocatePolicy `json:"allocatePolicy,omitempty"`
+	RingBusBandwidth *resource.Quantity         `json:"ringBusBandwidth,omitempty"`
+}
+
+type GPUPartitionAllocatePolicy string
+
+const (
+	// GPUPartitionAllocatePolicyRestricted indicates that only the partitions with the highest allocationScore will be considered.
+	GPUPartitionAllocatePolicyRestricted GPUPartitionAllocatePolicy = "Restricted"
+	// GPUPartitionAllocatePolicyBestEffort indicates that the scheduler tries its best to allocate a partition with a higher allocationScore.
+	GPUPartitionAllocatePolicyBestEffort GPUPartitionAllocatePolicy = "BestEffort"
+)
+```
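+
+### Partition Selection Sketch
+
+To make the intended interplay between `GPUPartitionSpec`, `GPUPartitionTable`, and `GPUPartitionPolicy` concrete, the following sketch shows one possible way a scheduler plugin could pick a Partition for a Pod that requests a given number of GPUs. It is only an illustration of the semantics described above, not the actual plugin implementation; the helper `choosePartition` and its selection rule are hypothetical.
+
+```go
+// choosePartition is a hypothetical helper that selects a GPUPartition for a
+// Pod requesting gpuWanted GPUs, using the types defined in this proposal.
+func choosePartition(table GPUPartitionTable, spec *GPUPartitionSpec, gpuWanted int) *GPUPartition {
+	candidates := table[gpuWanted]
+	best := -1
+	for i := range candidates {
+		p := &candidates[i]
+		// Skip partitions that cannot satisfy the requested ring bus bandwidth.
+		if spec != nil && spec.RingBusBandwidth != nil && p.RingBusBandwidth != nil &&
+			p.RingBusBandwidth.Cmp(*spec.RingBusBandwidth) < 0 {
+			continue
+		}
+		// Prefer the candidate with the highest allocationScore.
+		if best == -1 || p.AllocationScore > candidates[best].AllocationScore {
+			best = i
+		}
+	}
+	if best == -1 {
+		return nil
+	}
+	// This simplified sketch always returns the best-scored candidate. A real
+	// implementation would distinguish Restricted (only partitions with the
+	// highest allocationScore are acceptable) from BestEffort (lower-scored
+	// partitions may be used as a fallback), and break ties by fragmentation.
+	return &candidates[best]
+}
+```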