From 5301d8ac38e86797f21953b260acdf59685cfeea Mon Sep 17 00:00:00 2001 From: liuqw Date: Tue, 23 Jul 2024 18:19:13 +0800 Subject: [PATCH 1/2] fix(controller): add statefulset gc for podgroup. Signed-off-by: liuqw --- .../chart/volcano/templates/controllers.yaml | 4 ++-- installer/volcano-development.yaml | 4 ++-- pkg/controllers/podgroup/pg_controller.go | 11 +++++++++- .../podgroup/pg_controller_handler.go | 20 +++++++++++++++++++ 4 files changed, 34 insertions(+), 5 deletions(-) diff --git a/installer/helm/chart/volcano/templates/controllers.yaml b/installer/helm/chart/volcano/templates/controllers.yaml index 6fa8a96cd3..1fe85fd757 100644 --- a/installer/helm/chart/volcano/templates/controllers.yaml +++ b/installer/helm/chart/volcano/templates/controllers.yaml @@ -72,10 +72,10 @@ rules: resources: ["networkpolicies"] verbs: ["get", "create", "delete"] - apiGroups: ["apps"] - resources: ["daemonsets", "statefulsets"] + resources: ["daemonsets"] verbs: ["get"] - apiGroups: ["apps"] - resources: ["replicasets"] + resources: ["replicasets", "statefulsets"] verbs: ["get", "list", "watch"] - apiGroups: ["batch"] resources: ["jobs"] diff --git a/installer/volcano-development.yaml b/installer/volcano-development.yaml index 1adc0dda5e..4874f35671 100644 --- a/installer/volcano-development.yaml +++ b/installer/volcano-development.yaml @@ -4304,10 +4304,10 @@ rules: resources: ["networkpolicies"] verbs: ["get", "create", "delete"] - apiGroups: ["apps"] - resources: ["daemonsets", "statefulsets"] + resources: ["daemonsets"] verbs: ["get"] - apiGroups: ["apps"] - resources: ["replicasets"] + resources: ["replicasets", "statefulsets"] verbs: ["get", "list", "watch"] - apiGroups: ["batch"] resources: ["jobs"] diff --git a/pkg/controllers/podgroup/pg_controller.go b/pkg/controllers/podgroup/pg_controller.go index 003e150da7..12cfd4a01f 100644 --- a/pkg/controllers/podgroup/pg_controller.go +++ b/pkg/controllers/podgroup/pg_controller.go @@ -51,6 +51,7 @@ type pgcontroller struct { podInformer coreinformers.PodInformer pgInformer schedulinginformer.PodGroupInformer rsInformer appinformers.ReplicaSetInformer + stsInformer appinformers.StatefulSetInformer informerFactory informers.SharedInformerFactory vcInformerFactory vcinformer.SharedInformerFactory @@ -64,7 +65,8 @@ type pgcontroller struct { pgSynced func() bool // A store of replicaset - rsSynced func() bool + rsSynced func() bool + stsSynced func() bool queue workqueue.RateLimitingInterface @@ -112,6 +114,13 @@ func (pg *pgcontroller) Initialize(opt *framework.ControllerOption) error { AddFunc: pg.addReplicaSet, UpdateFunc: pg.updateReplicaSet, }) + + pg.stsInformer = pg.informerFactory.Apps().V1().StatefulSets() + pg.stsSynced = pg.stsInformer.Informer().HasSynced + pg.stsInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: pg.addStatefulSet, + UpdateFunc: pg.updateStatefulSet, + }) } return nil } diff --git a/pkg/controllers/podgroup/pg_controller_handler.go b/pkg/controllers/podgroup/pg_controller_handler.go index 12541d85a3..760142f0b8 100644 --- a/pkg/controllers/podgroup/pg_controller_handler.go +++ b/pkg/controllers/podgroup/pg_controller_handler.go @@ -105,6 +105,26 @@ func (pg *pgcontroller) updateReplicaSet(oldObj, newObj interface{}) { pg.addReplicaSet(newObj) } +func (pg *pgcontroller) addStatefulSet(obj interface{}) { + sts, ok := obj.(*appsv1.StatefulSet) + if !ok { + klog.Errorf("Failed to convert %v to appsv1.StatefulSet", obj) + return + } + + if *sts.Spec.Replicas == 0 { + pgName := batchv1alpha1.PodgroupNamePrefix + string(sts.UID) + err := pg.vcClient.SchedulingV1beta1().PodGroups(sts.Namespace).Delete(context.TODO(), pgName, metav1.DeleteOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + klog.Errorf("Failed to delete PodGroup <%s/%s>: %v", sts.Namespace, pgName, err) + } + } +} + +func (pg *pgcontroller) updateStatefulSet(oldObj, newObj interface{}) { + pg.addStatefulSet(newObj) +} + func (pg *pgcontroller) updatePodAnnotations(pod *v1.Pod, pgName string) error { if pod.Annotations == nil { pod.Annotations = make(map[string]string) From cd7ba7f454edb7943c8266a5bfb87d66cf839202 Mon Sep 17 00:00:00 2001 From: liuqw Date: Tue, 23 Jul 2024 19:52:20 +0800 Subject: [PATCH 2/2] fix(controller): Fix podgroup not created finally when rolling upgrade of the statefulset. Signed-off-by: liuqw --- .../podgroup/pg_controller_handler.go | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pkg/controllers/podgroup/pg_controller_handler.go b/pkg/controllers/podgroup/pg_controller_handler.go index 760142f0b8..74b1f8420c 100644 --- a/pkg/controllers/podgroup/pg_controller_handler.go +++ b/pkg/controllers/podgroup/pg_controller_handler.go @@ -119,6 +119,27 @@ func (pg *pgcontroller) addStatefulSet(obj interface{}) { klog.Errorf("Failed to delete PodGroup <%s/%s>: %v", sts.Namespace, pgName, err) } } + + // In the rolling upgrade scenario, the addStatefulSet(replicas=0) event may be received before + // the updateStatefulSet(replicas=1) event, and after the addPod event for the new created pod. + // In this event, need to create PodGroup for the pod. + if *sts.Spec.Replicas > 0 { + selector := metav1.LabelSelector{MatchLabels: sts.Spec.Selector.MatchLabels} + podList, err := pg.kubeClient.CoreV1().Pods(sts.Namespace).List(context.TODO(), + metav1.ListOptions{LabelSelector: metav1.FormatLabelSelector(&selector)}) + if err != nil { + klog.Errorf("Failed to list pods for StatefulSet <%s/%s>: %v", sts.Namespace, sts.Name, err) + return + } + if podList != nil && len(podList.Items) > 0 { + pod := podList.Items[0] + klog.V(4).Infof("Try to create podgroup for pod %s/%s", pod.Namespace, pod.Name) + err := pg.createNormalPodPGIfNotExist(&pod) + if err != nil { + klog.Errorf("Failed to create PodGroup for pod <%s/%s>: %v", pod.Namespace, pod.Name, err) + } + } + } } func (pg *pgcontroller) updateStatefulSet(oldObj, newObj interface{}) {