scheduler: optimize GPU allocate logic #2221

Merged
102 changes: 100 additions & 2 deletions apis/extension/device_share.go
@@ -18,8 +18,10 @@ package extension

import (
"encoding/json"
"fmt"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
@@ -32,6 +34,10 @@ const (
AnnotationDeviceAllocateHint = SchedulingDomainPrefix + "/device-allocate-hint"
// AnnotationDeviceJointAllocate guides the scheduler to jointly allocate devices
AnnotationDeviceJointAllocate = SchedulingDomainPrefix + "/device-joint-allocate"
// AnnotationGPUPartitionSpec represents the GPU partition spec that the pod requests
AnnotationGPUPartitionSpec = SchedulingDomainPrefix + "/gpu-partition-spec"
// AnnotationGPUPartitions represents the GPU partitions supported on the node
AnnotationGPUPartitions = SchedulingDomainPrefix + "/gpu-partitions"
)

const (
@@ -48,8 +54,9 @@ const (
)

const (
LabelGPUModel string = NodeDomainPrefix + "/gpu-model"
LabelGPUDriverVersion string = NodeDomainPrefix + "/gpu-driver-version"
LabelGPUPartitionPolicy string = NodeDomainPrefix + "/gpu-partition-policy"
LabelGPUModel string = NodeDomainPrefix + "/gpu-model"
LabelGPUDriverVersion string = NodeDomainPrefix + "/gpu-driver-version"
)

// DeviceAllocations would be injected into the Pod in the form of an annotation during the Pre-bind stage.
@@ -132,10 +139,19 @@ const (
type DeviceTopologyScope string

const (
DeviceTopologyScopeDevice DeviceTopologyScope = "Device"
DeviceTopologyScopePCIe DeviceTopologyScope = "PCIe"
DeviceTopologyScopeNUMANode DeviceTopologyScope = "NUMANode"
DeviceTopologyScopeNode DeviceTopologyScope = "Node"
)

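// DeviceTopologyScopeLevel ranks scopes from widest (Node=1) to narrowest (Device=4);
// a higher value means a tighter topology boundary.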
var DeviceTopologyScopeLevel = map[DeviceTopologyScope]int{
DeviceTopologyScopeDevice: 4,
DeviceTopologyScopePCIe: 3,
DeviceTopologyScopeNUMANode: 2,
DeviceTopologyScopeNode: 1,
}

type DeviceExclusivePolicy string

const (
@@ -145,6 +161,47 @@ const (
PCIExpressLevelDeviceExclusivePolicy DeviceExclusivePolicy = "PCIeLevel"
)

type GPUPartitionSpec struct {
AllocatePolicy GPUPartitionAllocatePolicy `json:"allocatePolicy,omitempty"`
RingBusBandwidth *resource.Quantity `json:"ringBusBandwidth,omitempty"`
}

type GPUPartitionAllocatePolicy string

const (
// GPUPartitionAllocatePolicyRestricted indicates that only partitions with the highest allocationScore will be considered.
GPUPartitionAllocatePolicyRestricted GPUPartitionAllocatePolicy = "Restricted"
// GPUPartitionAllocatePolicyBestEffort indicates that the scheduler tries its best to allocate a partition with a higher allocationScore.
GPUPartitionAllocatePolicyBestEffort GPUPartitionAllocatePolicy = "BestEffort"
)

type GPULinkType string

const (
GPUNVLink GPULinkType = "NVLink"
)

type GPUPartition struct {
Minors []int `json:"minors"`
GPULinkType GPULinkType `json:"gpuLinkType,omitempty"`
RingBusBandwidth *resource.Quantity `json:"ringBusBandwidth,omitempty"`
AllocationScore int `json:"allocationScore,omitempty"`
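// The two fields below are tagged json:"-": they are derived at runtime and never serialized into the annotation.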
MinorsHash int `json:"-"`
BinPackScore int `json:"-"`
}

// GPUPartitionTable will be annotated on the Device CR
type GPUPartitionTable map[int][]GPUPartition

type GPUPartitionPolicy string

const (
// GPUPartitionPolicyHonor indicates that the partitions annotated on the Device CR should be honored.
GPUPartitionPolicyHonor GPUPartitionPolicy = "Honor"
// GPUPartitionPolicyPrefer indicates that the partitions annotated on the Device CR are preferred.
GPUPartitionPolicyPrefer GPUPartitionPolicy = "Prefer"
)

func GetDeviceAllocations(podAnnotations map[string]string) (DeviceAllocations, error) {
deviceAllocations := DeviceAllocations{}
data, ok := podAnnotations[AnnotationDeviceAllocated]
@@ -234,3 +291,44 @@ func GetDeviceJointAllocate(annotations map[string]string) (*DeviceJointAllocate
}
return &jointAllocate, nil
}

func GetGPUPartitionSpec(annotations map[string]string) (*GPUPartitionSpec, error) {
val, ok := annotations[AnnotationGPUPartitionSpec]
if !ok {
return nil, nil
}
var spec GPUPartitionSpec
err := json.Unmarshal([]byte(val), &spec)
if err != nil {
return nil, err
}
if spec.AllocatePolicy == "" {
spec.AllocatePolicy = GPUPartitionAllocatePolicyBestEffort
}
return &spec, nil
}

func GetGPUPartitionTable(device *schedulingv1alpha1.Device) (GPUPartitionTable, error) {
if rawGPUPartitionTable, ok := device.Annotations[AnnotationGPUPartitions]; ok && rawGPUPartitionTable != "" {
gpuPartitionTable := GPUPartitionTable{}
err := json.Unmarshal([]byte(rawGPUPartitionTable), &gpuPartitionTable)
if err != nil {
return nil, err
}
if gpuPartitionTable == nil {
return nil, fmt.Errorf("invalid gpu partitions in device cr: %s", rawGPUPartitionTable)
}
return gpuPartitionTable, nil
}
return nil, nil
}

func GetGPUPartitionPolicy(device *schedulingv1alpha1.Device) GPUPartitionPolicy {
if device == nil {
return GPUPartitionPolicyPrefer
}
if allocatePolicy := device.Labels[LabelGPUPartitionPolicy]; GPUPartitionPolicy(allocatePolicy) == GPUPartitionPolicyHonor {
return GPUPartitionPolicyHonor
}
return GPUPartitionPolicyPrefer
}
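
Taken together, the new helpers round-trip the partition contract: a pod asks for a partition shape through the `gpu-partition-spec` annotation, the node's Device CR advertises its supported partitions through `gpu-partitions`, and the `gpu-partition-policy` label controls how strictly that table is honored. The sketch below shows how a caller might exercise the three getters; the pod and Device fixtures are hypothetical and not taken from this PR.

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/koordinator-sh/koordinator/apis/extension"
	schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
)

func main() {
	// Hypothetical pod annotations: request a Restricted partition with 200Gi ring-bus bandwidth.
	podAnnotations := map[string]string{
		extension.AnnotationGPUPartitionSpec: `{"allocatePolicy":"Restricted","ringBusBandwidth":"200Gi"}`,
	}
	spec, err := extension.GetGPUPartitionSpec(podAnnotations)
	if err != nil {
		panic(err)
	}
	fmt.Println(spec.AllocatePolicy) // "Restricted"

	// Hypothetical Device CR: two NVLink-connected GPUs form one partition under key 0,
	// and the Honor label tells the scheduler the table must be respected.
	device := &schedulingv1alpha1.Device{
		ObjectMeta: metav1.ObjectMeta{
			Labels: map[string]string{
				extension.LabelGPUPartitionPolicy: string(extension.GPUPartitionPolicyHonor),
			},
			Annotations: map[string]string{
				extension.AnnotationGPUPartitions: `{"0":[{"minors":[0,1],"gpuLinkType":"NVLink","ringBusBandwidth":"200Gi","allocationScore":10}]}`,
			},
		},
	}
	table, err := extension.GetGPUPartitionTable(device)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(table[0]), extension.GetGPUPartitionPolicy(device)) // 1 Honor
}
```

Note that GetGPUPartitionSpec defaults an empty allocatePolicy to BestEffort, so callers never observe an unset policy.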
198 changes: 198 additions & 0 deletions apis/extension/device_share_test.go
@@ -17,6 +17,7 @@ limitations under the License.
package extension

import (
"fmt"
"testing"

"github.com/stretchr/testify/assert"
@@ -27,6 +28,10 @@ import (
schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
)

var (
bandWidthOf200Gi = resource.MustParse("200Gi")
)

func Test_GetDeviceAllocations(t *testing.T) {
tests := []struct {
name string
@@ -138,3 +143,196 @@ func Test_SetDeviceAllocations(t *testing.T) {
})
}
}

func TestGetGPUPartitionSpec(t *testing.T) {
type args struct {
annotations map[string]string
}
tests := []struct {
name string
args args
want *GPUPartitionSpec
wantErr assert.ErrorAssertionFunc
}{
{
name: "nil partitionSpec",
args: args{
annotations: nil,
},
want: nil,
wantErr: assert.NoError,
},
{
name: "empty partitionSpec",
args: args{
annotations: map[string]string{
AnnotationGPUPartitionSpec: `{}`,
},
},
want: &GPUPartitionSpec{
AllocatePolicy: GPUPartitionAllocatePolicyBestEffort,
},
wantErr: assert.NoError,
},
{
name: "allocatePolicy BestEffort",
args: args{
annotations: map[string]string{
AnnotationGPUPartitionSpec: `{"allocatePolicy":"BestEffort"}`,
},
},
want: &GPUPartitionSpec{
AllocatePolicy: GPUPartitionAllocatePolicyBestEffort,
},
wantErr: assert.NoError,
},
{
name: "allocatePolicy Restricted",
args: args{
annotations: map[string]string{
AnnotationGPUPartitionSpec: `{"allocatePolicy":"Restricted"}`,
},
},
want: &GPUPartitionSpec{
AllocatePolicy: GPUPartitionAllocatePolicyRestricted,
},
wantErr: assert.NoError,
},
{
name: "allocatePolicy Restricted, ringAllReduceBandwidth specified",
args: args{
annotations: map[string]string{
AnnotationGPUPartitionSpec: `{"allocatePolicy":"Restricted", "ringBusBandwidth":"200Gi"}`,
},
},
want: &GPUPartitionSpec{
AllocatePolicy: GPUPartitionAllocatePolicyRestricted,
RingBusBandwidth: &bandWidthOf200Gi,
},
wantErr: assert.NoError,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := GetGPUPartitionSpec(tt.args.annotations)
if !tt.wantErr(t, err, fmt.Sprintf("GetGPUPartitionSpec(%v)", tt.args.annotations)) {
return
}
assert.Equalf(t, tt.want, got, "GetGPUPartitionSpec(%v)", tt.args.annotations)
})
}
}

func TestGetGPUPartitionTable(t *testing.T) {
tests := []struct {
name string
device *schedulingv1alpha1.Device
want GPUPartitionTable
wantErr assert.ErrorAssertionFunc
}{
{
name: "Valid GPU Partition Table",
device: &schedulingv1alpha1.Device{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
AnnotationGPUPartitions: `{"0": [{"minors": [0,1], "gpuLinkType": "NVLink","ringBusBandwidth": "200Gi", "allocationScore": 10}]}`,
},
},
},
want: GPUPartitionTable{
0: []GPUPartition{
{
Minors: []int{0, 1},
GPULinkType: GPUNVLink,
RingBusBandwidth: &bandWidthOf200Gi,
AllocationScore: 10,
MinorsHash: 0, // tagged json:"-", so never unmarshaled; computed at runtime.
BinPackScore: 0, // tagged json:"-", so never unmarshaled; computed at runtime.
},
},
},
wantErr: assert.NoError,
},
{
name: "No Annotation",
device: &schedulingv1alpha1.Device{},
want: nil,
wantErr: assert.NoError,
},
{
name: "Invalid JSON",
device: &schedulingv1alpha1.Device{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
AnnotationGPUPartitions: `Invalid JSON format`,
},
},
},
want: nil,
wantErr: assert.Error,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := GetGPUPartitionTable(tt.device)
if !tt.wantErr(t, err, fmt.Sprintf("GetGPUPartitionTable(%v)", tt.device)) {
return
}
assert.Equalf(t, tt.want, got, "GetGPUPartitionTable(%v)", tt.device)
})
}
}

// TestGetNodeLevelGPUAllocatePolicy tests the GetGPUPartitionPolicy function.
func TestGetNodeLevelGPUAllocatePolicy(t *testing.T) {
tests := []struct {
name string
node *schedulingv1alpha1.Device
expected GPUPartitionPolicy
}{
{
name: "Nil node",
node: nil,
expected: GPUPartitionPolicyPrefer,
},
{
name: "Node with Honor policy",
node: &schedulingv1alpha1.Device{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
LabelGPUPartitionPolicy: string(GPUPartitionPolicyHonor),
},
},
},
expected: GPUPartitionPolicyHonor,
},
{
name: "Node with Prefer policy",
node: &schedulingv1alpha1.Device{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
LabelGPUPartitionPolicy: string(GPUPartitionPolicyPrefer),
},
},
},
expected: GPUPartitionPolicyPrefer,
},
{
name: "Node without policy label",
node: &schedulingv1alpha1.Device{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{},
},
},
expected: GPUPartitionPolicyPrefer,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := GetGPUPartitionPolicy(tt.node); got != tt.expected {
t.Errorf("GetGPUPartitionPolicy() = %v, want %v", got, tt.expected)
}
})
}
}
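
The tests above cover only the read path. For the write path, here is a hedged sketch of how a component might build and serialize a partition table for the Device annotation; whether koordinator's device reporter does exactly this is not shown in this PR, so treat the fixture as an assumption.

```go
package main

import (
	"encoding/json"
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"

	"github.com/koordinator-sh/koordinator/apis/extension"
)

func main() {
	bw := resource.MustParse("200Gi")
	table := extension.GPUPartitionTable{
		// Key semantics follow the test fixture above; MinorsHash and BinPackScore
		// carry json:"-" tags, so they never reach the annotation.
		0: []extension.GPUPartition{
			{Minors: []int{0, 1}, GPULinkType: extension.GPUNVLink, RingBusBandwidth: &bw, AllocationScore: 10},
		},
	}
	raw, err := json.Marshal(table)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(raw))
	// {"0":[{"minors":[0,1],"gpuLinkType":"NVLink","ringBusBandwidth":"200Gi","allocationScore":10}]}
}
```

Because MinorsHash and BinPackScore are excluded from serialization, the annotation stays a stable, minimal contract between the reporter and the scheduler.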