Skip to content

Commit

Permalink
feature: Add podgroups statistics
Browse files Browse the repository at this point in the history
Signed-off-by: jessestutler <chenzicong4@huawei.com>
  • Loading branch information
JesseStutler committed Oct 25, 2024
1 parent 1ab6088 commit b8d8998
Show file tree
Hide file tree
Showing 23 changed files with 334 additions and 366 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ replace (
k8s.io/component-helpers => k8s.io/component-helpers v0.31.1
k8s.io/controller-manager => k8s.io/controller-manager v0.31.1
k8s.io/cri-api => k8s.io/cri-api v0.31.1
k8s.io/cri-client => k8s.io/cri-client v0.31.1
k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.31.1
k8s.io/dynamic-resource-allocation => k8s.io/dynamic-resource-allocation v0.31.1
k8s.io/endpointslice => k8s.io/endpointslice v0.31.1
Expand Down
5 changes: 4 additions & 1 deletion installer/helm/chart/volcano/templates/controllers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,11 @@ rules:
resources: ["secrets"]
verbs: ["get", "create", "delete", "update"]
- apiGroups: ["scheduling.incubator.k8s.io", "scheduling.volcano.sh"]
resources: ["podgroups", "queues", "queues/status"]
resources: ["podgroups", "queues"]
verbs: ["get", "list", "watch", "create", "delete", "update"]
- apiGroups: [ "scheduling.incubator.k8s.io", "scheduling.volcano.sh" ]
resources: [ "queues/status" ]
verbs: ["get", "watch", "patch"]
- apiGroups: ["flow.volcano.sh"]
resources: ["jobflows", "jobtemplates"]
verbs: ["get", "list", "watch", "create", "delete", "update"]
Expand Down
5 changes: 4 additions & 1 deletion installer/volcano-development.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4318,8 +4318,11 @@ rules:
resources: ["secrets"]
verbs: ["get", "create", "delete", "update"]
- apiGroups: ["scheduling.incubator.k8s.io", "scheduling.volcano.sh"]
resources: ["podgroups", "queues", "queues/status"]
resources: ["podgroups", "queues"]
verbs: ["get", "list", "watch", "create", "delete", "update"]
- apiGroups: [ "scheduling.incubator.k8s.io", "scheduling.volcano.sh" ]
resources: [ "queues/status" ]
verbs: ["get", "watch", "patch"]
- apiGroups: ["flow.volcano.sh"]
resources: ["jobflows", "jobtemplates"]
verbs: ["get", "list", "watch", "create", "delete", "update"]
Expand Down
30 changes: 23 additions & 7 deletions pkg/cli/queue/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,21 +63,37 @@ func GetQueue(ctx context.Context) error {
return err
}

PrintQueue(queue, os.Stdout)
// Although the featuregate called CustomResourceFieldSelectors is enabled by default after v1.31, there are still
// users using k8s versions lower than v1.31. Therefore we can only get all the podgroups from kube-apiserver
// and then filtering them.
pgList, err := queueClient.SchedulingV1beta1().PodGroups("").List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("failed to list podgroup for queue %s with err: %v", getQueueFlags.Name, err)
}

pgStats := &PodGroupStatistics{}
for _, pg := range pgList.Items {
if pg.Spec.Queue == getQueueFlags.Name {
pgStats.StatPodGroupCountsForQueue(&pg)
}
}

PrintQueue(queue, pgStats, os.Stdout)

return nil
}

// PrintQueue prints queue information.
func PrintQueue(queue *v1beta1.Queue, writer io.Writer) {
_, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s\n",
Name, Weight, State, Inqueue, Pending, Running, Unknown)
func PrintQueue(queue *v1beta1.Queue, pgStats *PodGroupStatistics, writer io.Writer) {
_, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s%-8s\n",
Name, Weight, State, Inqueue, Pending, Running, Unknown, Completed)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}
_, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d\n",
queue.Name, queue.Spec.Weight, queue.Status.State, queue.Status.Inqueue,
queue.Status.Pending, queue.Status.Running, queue.Status.Unknown)

_, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d%-8d\n",
queue.Name, queue.Spec.Weight, queue.Status.State, pgStats.Inqueue,
pgStats.Pending, pgStats.Running, pgStats.Unknown, pgStats.Completed)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}
Expand Down
36 changes: 29 additions & 7 deletions pkg/cli/queue/list.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ const (
// Inqueue status of queue
Inqueue string = "Inqueue"

// Completed status of the queue
Completed string = "Completed"

// State is state of queue
State string = "State"
)
Expand Down Expand Up @@ -81,22 +84,41 @@ func ListQueue(ctx context.Context) error {
fmt.Printf("No resources found\n")
return nil
}
PrintQueues(queues, os.Stdout)

// Although the featuregate called CustomResourceFieldSelectors is enabled by default after v1.31, there are still
// users using k8s versions lower than v1.31. Therefore we can only get all the podgroups from kube-apiserver
// and then filtering them.
pgList, err := jobClient.SchedulingV1beta1().PodGroups("").List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("failed to list podgroups with err: %v", err)
}

queueStats := make(map[string]*PodGroupStatistics, len(queues.Items))
for _, queue := range queues.Items {
queueStats[queue.Name] = &PodGroupStatistics{}
}

for _, pg := range pgList.Items {
queueStats[pg.Spec.Queue].StatPodGroupCountsForQueue(&pg)
}

PrintQueues(queues, queueStats, os.Stdout)

return nil
}

// PrintQueues prints queue information.
func PrintQueues(queues *v1beta1.QueueList, writer io.Writer) {
_, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s\n",
Name, Weight, State, Inqueue, Pending, Running, Unknown)
func PrintQueues(queues *v1beta1.QueueList, queueStats map[string]*PodGroupStatistics, writer io.Writer) {
_, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s%-8s\n",
Name, Weight, State, Inqueue, Pending, Running, Unknown, Completed)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}

for _, queue := range queues.Items {
_, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d\n",
queue.Name, queue.Spec.Weight, queue.Status.State, queue.Status.Inqueue,
queue.Status.Pending, queue.Status.Running, queue.Status.Unknown)
_, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d%-8d\n",
queue.Name, queue.Spec.Weight, queue.Status.State, queueStats[queue.Name].Inqueue, queueStats[queue.Name].Pending,
queueStats[queue.Name].Running, queueStats[queue.Name].Unknown, queueStats[queue.Name].Completed)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}
Expand Down
24 changes: 24 additions & 0 deletions pkg/cli/queue/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (

busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
"volcano.sh/apis/pkg/apis/helpers"
"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/apis/pkg/client/clientset/versioned"
)

Expand Down Expand Up @@ -62,3 +63,26 @@ func createQueueCommand(ctx context.Context, config *rest.Config, action busv1al

return nil
}

type PodGroupStatistics struct {
Inqueue int
Pending int
Running int
Unknown int
Completed int
}

func (pgStats *PodGroupStatistics) StatPodGroupCountsForQueue(pg *v1beta1.PodGroup) {
switch pg.Status.Phase {
case v1beta1.PodGroupInqueue:
pgStats.Inqueue++
case v1beta1.PodGroupPending:
pgStats.Pending++
case v1beta1.PodGroupRunning:
pgStats.Running++
case v1beta1.PodGroupUnknown:
pgStats.Unknown++
case v1beta1.PodGroupCompleted:
pgStats.Completed++
}
}
75 changes: 75 additions & 0 deletions pkg/controllers/metrics/podgroup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package metrics

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"volcano.sh/volcano/pkg/scheduler/metrics"
)

var (
pgPendingPhaseNum = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metrics.VolcanoNamespace,
Name: "podgroup_pending_phase_num",
Help: "Number of podgroup at pending phase",
}, []string{"queue_name"},
)

pgRunningPhaseNum = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metrics.VolcanoNamespace,
Name: "podgroup_running_phase_num",
Help: "Number of podgroup at running phase",
}, []string{"queue_name"},
)

pgUnknownPhaseNum = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metrics.VolcanoNamespace,
Name: "podgroup_unknown_phase_num",
Help: "Number of podgroup at unknown phase",
}, []string{"queue_name"},
)

pgInqueuePhaseNum = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metrics.VolcanoNamespace,
Name: "podgroup_inqueue_phase_num",
Help: "Number of podgroup at inqueue phase",
}, []string{"queue_name"},
)

pgCompletedPhaseNum = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: metrics.VolcanoNamespace,
Name: "podgroup_completed_phase_num",
Help: "Number of podgroup at completed phase",
}, []string{"queue_name"},
)
)

// UpdatePgPendingPhaseNum recored the num of podgroups at pending state in queue
func UpdatePgPendingPhaseNum(queueName string, num float64) {
pgPendingPhaseNum.WithLabelValues(queueName).Set(num)
}

// UpdatePgRunningPhaseNum recored the num of podgroups at running state in queue
func UpdatePgRunningPhaseNum(queueName string, num float64) {
pgRunningPhaseNum.WithLabelValues(queueName).Set(num)
}

// UpdatePgUnknownPhaseNum recored the num of podgroups at unknown state in queue
func UpdatePgUnknownPhaseNum(queueName string, num float64) {
pgUnknownPhaseNum.WithLabelValues(queueName).Set(num)
}

// UpdatePgInqueuePhaseNum recored the num of podgroups at inqueue state in queue
func UpdatePgInqueuePhaseNum(queueName string, num float64) {
pgInqueuePhaseNum.WithLabelValues(queueName).Set(num)
}

// UpdatePgCompletedPhaseNum recored the num of podgroups at completed state in queue
func UpdatePgCompletedPhaseNum(queueName string, num float64) {
pgCompletedPhaseNum.WithLabelValues(queueName).Set(num)
}
2 changes: 0 additions & 2 deletions pkg/controllers/queue/queue_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,6 @@ func (c *queuecontroller) Initialize(opt *framework.ControllerOption) error {
}

queuestate.SyncQueue = c.syncQueue
queuestate.OpenQueue = c.openQueue
queuestate.CloseQueue = c.closeQueue

c.syncHandler = c.handleQueue
c.syncCommandHandler = c.handleCommand
Expand Down
Loading

0 comments on commit b8d8998

Please sign in to comment.