Move rolling update logic under clusterOp #586

Merged: 6 commits, Jul 13, 2023
73 changes: 73 additions & 0 deletions controllers/solr_cluster_ops_util.go
@@ -156,6 +156,79 @@ func handleManagedCloudScaleUp(ctx context.Context, r *SolrCloudReconciler, inst
return
}

func determineRollingUpdateClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, outOfDatePods util.OutOfDatePodSegmentation, logger logr.Logger) (clusterLockAcquired bool, retryLaterDuration time.Duration, err error) {
if instance.Spec.UpdateStrategy.Method == solrv1beta1.ManagedUpdate && !outOfDatePods.IsEmpty() {
// Managed Rolling Upgrade!
originalStatefulSet := statefulSet.DeepCopy()
statefulSet.Annotations[util.ClusterOpsLockAnnotation] = util.UpdateLock
// No rolling update metadata is currently required
statefulSet.Annotations[util.ClusterOpsMetadataAnnotation] = ""
if err = r.Patch(ctx, statefulSet, client.StrategicMergeFrom(originalStatefulSet)); err != nil {
logger.Error(err, "Error while patching StatefulSet to start clusterOp", "clusterOp", util.UpdateLock, "clusterOpMetadata", "")
} else {
clusterLockAcquired = true
}
}
return
}
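
The new lock is just a pair of annotations on the StatefulSet. Below is a minimal, self-contained sketch of that pattern; the annotation keys and lock value are hypothetical stand-ins for util.ClusterOpsLockAnnotation, util.ClusterOpsMetadataAnnotation, and util.UpdateLock, and the "is a lock already held" check (which the Reconcile loop performs before calling this function in the PR) is folded into the helper so the example runs on its own.

```go
package main

import "fmt"

// Hypothetical stand-ins for the real annotation keys and lock value defined
// in controllers/util; the actual strings in the operator may differ.
const (
	clusterOpsLockAnnotation     = "solr.apache.org/clusterOpsLock"
	clusterOpsMetadataAnnotation = "solr.apache.org/clusterOpsMetadata"
	updateLock                   = "rollingUpdate"
)

// acquireUpdateLock mimics determineRollingUpdateClusterOpLockIfNecessary:
// stamp the lock annotations only when no other clusterOp currently holds the lock.
// (In the PR, the "already locked" check lives in the Reconcile loop instead.)
func acquireUpdateLock(annotations map[string]string) bool {
	if _, locked := annotations[clusterOpsLockAnnotation]; locked {
		return false // another cluster operation is in progress
	}
	annotations[clusterOpsLockAnnotation] = updateLock
	annotations[clusterOpsMetadataAnnotation] = "" // rolling updates need no metadata
	return true
}

func main() {
	sts := map[string]string{} // stands in for statefulSet.Annotations
	fmt.Println(acquireUpdateLock(sts)) // true: lock acquired, update runs in later reconciles
	fmt.Println(acquireUpdateLock(sts)) // false: lock already held, no other clusterOp may start
}
```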

// handleManagedCloudRollingUpdate does the logic of a managed and "locked" cloud rolling update operation.
// This will take many reconcile loops to complete, as it is deleting pods/moving replicas.
func handleManagedCloudRollingUpdate(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, outOfDatePods util.OutOfDatePodSegmentation, hasReadyPod bool, availableUpdatedPodCount int, logger logr.Logger) (retryLaterDuration time.Duration, err error) {
// Manage the updating of out-of-spec pods, if the Managed UpdateStrategy has been specified.
updateLogger := logger.WithName("ManagedUpdateSelector")

// First check if all pods are up to date. If so the rolling update is complete
if outOfDatePods.IsEmpty() {
// Once the rolling update is complete, finish the cluster operation by deleting the statefulSet annotations
originalStatefulSet := statefulSet.DeepCopy()
delete(statefulSet.Annotations, util.ClusterOpsLockAnnotation)
delete(statefulSet.Annotations, util.ClusterOpsMetadataAnnotation)
if err = r.Patch(ctx, statefulSet, client.StrategicMergeFrom(originalStatefulSet)); err != nil {
logger.Error(err, "Error while patching StatefulSet to finish the managed SolrCloud rollingUpdate clusterOp")
}

// TODO: Create event for the CRD.
} else {
// The out of date pods that have not been started should all be updated immediately.
// There is no use "safely" updating pods which have not been started yet.
podsToUpdate := append([]corev1.Pod{}, outOfDatePods.NotStarted...)
for _, pod := range outOfDatePods.NotStarted {
updateLogger.Info("Pod killed for update.", "pod", pod.Name, "reason", "The solr container in the pod has not yet started, thus it is safe to update.")
}

// Pick which pods should be deleted for an update.
// Don't exit on an error, which would only occur because of an HTTP Exception. Requeue later instead.
additionalPodsToUpdate, podsHaveReplicas, retryLater, clusterStateError :=
util.DeterminePodsSafeToUpdate(ctx, instance, int(*statefulSet.Spec.Replicas), outOfDatePods, hasReadyPod, availableUpdatedPodCount, updateLogger)
// If we do not have the clusterState, it's not safe to update pods that are running
if clusterStateError != nil {
retryLater = true
} else {
podsToUpdate = append(podsToUpdate, outOfDatePods.ScheduledForDeletion...)
podsToUpdate = append(podsToUpdate, additionalPodsToUpdate...)
}

// Only actually delete a running pod if it has been evicted, or doesn't need eviction (persistent storage)
for _, pod := range podsToUpdate {
retryLaterDurationTemp, errTemp := DeletePodForUpdate(ctx, r, instance, &pod, podsHaveReplicas[pod.Name], updateLogger)

// Use the retryLaterDuration of the pod that requires a retry the soonest (smallest duration > 0)
if retryLaterDurationTemp > 0 && (retryLaterDurationTemp < retryLaterDuration || retryLaterDuration == 0) {
retryLaterDuration = retryLaterDurationTemp
}
if errTemp != nil {
err = errTemp
}
}

if retryLater && retryLaterDuration == 0 {
retryLaterDuration = time.Second * 10
}
}
return
}
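
One detail worth noting is how the handler aggregates requeue times: each pod deletion may report its own retry duration, and the handler keeps the smallest positive one so the next reconcile happens as soon as the earliest pod is ready to be retried. A tiny illustrative helper, not part of the PR (the logic is inlined in the pod loop above):

```go
package main

import (
	"fmt"
	"time"
)

// minPositive keeps the smallest duration greater than zero, treating zero as
// "no retry requested yet" -- the same rule handleManagedCloudRollingUpdate
// applies inline when looping over podsToUpdate.
func minPositive(durations ...time.Duration) (retryLater time.Duration) {
	for _, d := range durations {
		if d > 0 && (d < retryLater || retryLater == 0) {
			retryLater = d
		}
	}
	return
}

func main() {
	// Pods reporting no retry (0), 30s, and 5s: requeue after 5s, the soonest retry.
	fmt.Println(minPositive(0, 30*time.Second, 5*time.Second))
}
```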

// clearClusterOp simply removes any clusterOp for the given statefulSet.
// This should only be used as a "break-glass" scenario. Do not use this to finish off successful clusterOps.
func clearClusterOp(ctx context.Context, r *SolrCloudReconciler, statefulSet *appsv1.StatefulSet, reason string, logger logr.Logger) (err error) {
81 changes: 22 additions & 59 deletions controllers/solrcloud_controller.go
@@ -285,6 +285,13 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
var security *util.SecurityConfig = nil
if instance.Spec.SolrSecurity != nil {
security, err = util.ReconcileSecurityConfig(ctx, &r.Client, instance)
if err == nil && security != nil {
// If authn enabled on Solr, we need to pass the auth header when making requests
ctx, err = security.AddAuthToContext(ctx)
if err != nil {
logger.Error(err, "failed to create Authorization header when reconciling")
}
}
if err != nil {
return requeueOrNot, err
}
@@ -443,12 +450,14 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
}

// We only want to do one cluster operation at a time, so we use a lock to ensure that.
// Update or a Scale at a time. We do not want to do both.

// Update or Scale, one-at-a-time. We do not want to do both.
hasReadyPod := newStatus.ReadyReplicas > 0
var retryLaterDuration time.Duration
if clusterOpLock, hasAnn := statefulSet.Annotations[util.ClusterOpsLockAnnotation]; hasAnn {
clusterOpMetadata := statefulSet.Annotations[util.ClusterOpsMetadataAnnotation]
switch clusterOpLock {
case util.UpdateLock:
retryLaterDuration, err = handleManagedCloudRollingUpdate(ctx, r, instance, statefulSet, outOfDatePods, hasReadyPod, availableUpdatedPodCount, logger)
case util.ScaleDownLock:
retryLaterDuration, err = handleManagedCloudScaleDown(ctx, r, instance, statefulSet, clusterOpMetadata, podList, logger)
case util.ScaleUpLock:
@@ -459,12 +468,22 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
err = clearClusterOp(ctx, r, statefulSet, "clusterOp not supported", logger)
}
} else {
lockAcquired := false
// Start cluster operations if needed.
// The operations will be actually run in future reconcile loops, but a clusterOpLock will be acquired here.
// And that lock will tell future reconcile loops that the operation needs to be done.
// If a non-managed scale needs to take place, this method will update the StatefulSet without starting
// a "locked" cluster operation
lockAcquired, retryLaterDuration, err = determineRollingUpdateClusterOpLockIfNecessary(ctx, r, instance, statefulSet, outOfDatePods, logger)
// Start cluster operations if needed.
// The operations will be actually run in future reconcile loops, but a clusterOpLock will be acquired here.
// And that lock will tell future reconcile loops that the operation needs to be done.
// If a non-managed scale needs to take place, this method will update the StatefulSet without starting
// a "locked" cluster operation
_, retryLaterDuration, err = determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet, podList, logger)
if !lockAcquired {
lockAcquired, retryLaterDuration, err = determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet, podList, logger)
}
// After a lock is acquired, the reconcile will be started again because the StatefulSet is being watched
}
if err != nil && retryLaterDuration == 0 {
retryLaterDuration = time.Second * 5
@@ -476,62 +495,6 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return requeueOrNot, err
}

// TODO: Move this logic into the ClusterOpLock, with the "rollingUpdate" lock
if instance.Spec.UpdateStrategy.Method == solrv1beta1.ManagedUpdate && len(outOfDatePods.NotStarted)+len(outOfDatePods.ScheduledForDeletion)+len(outOfDatePods.Running) > 0 {
// Manage the updating of out-of-spec pods, if the Managed UpdateStrategy has been specified.
updateLogger := logger.WithName("ManagedUpdateSelector")

// The out of date pods that have not been started should all be updated immediately.
// There is no use "safely" updating pods which have not been started yet.
podsToUpdate := append([]corev1.Pod{}, outOfDatePods.NotStarted...)
for _, pod := range outOfDatePods.NotStarted {
updateLogger.Info("Pod killed for update.", "pod", pod.Name, "reason", "The solr container in the pod has not yet started, thus it is safe to update.")
}

// If authn enabled on Solr, we need to pass the auth header
if security != nil {
ctx, err = security.AddAuthToContext(ctx)
if err != nil {
updateLogger.Error(err, "failed to create Authorization header when reconciling", "SolrCloud", instance.Name)
return requeueOrNot, err
}
}

// Pick which pods should be deleted for an update.
// Don't exit on an error, which would only occur because of an HTTP Exception. Requeue later instead.
additionalPodsToUpdate, podsHaveReplicas, retryLater, clusterStateError := util.DeterminePodsSafeToUpdate(ctx, instance, outOfDatePods, int(newStatus.ReadyReplicas), availableUpdatedPodCount, updateLogger)
// If we do not have the clusterState, it's not safe to update pods that are running
if clusterStateError != nil {
retryLater = true
} else {
podsToUpdate = append(podsToUpdate, outOfDatePods.ScheduledForDeletion...)
podsToUpdate = append(podsToUpdate, additionalPodsToUpdate...)
}

// Only actually delete a running pod if it has been evicted, or doesn't need eviction (persistent storage)
for _, pod := range podsToUpdate {
retryLaterDurationTemp, errTemp := DeletePodForUpdate(ctx, r, instance, &pod, podsHaveReplicas[pod.Name], updateLogger)

// Use the retryLaterDuration of the pod that requires a retry the soonest (smallest duration > 0)
if retryLaterDurationTemp > 0 && (retryLaterDurationTemp < retryLaterDuration || retryLaterDuration == 0) {
retryLaterDuration = retryLaterDurationTemp
}
if errTemp != nil {
err = errTemp
}
}

if err != nil || retryLaterDuration > 0 || retryLater {
if retryLaterDuration == 0 {
retryLaterDuration = time.Second * 10
}
updateRequeueAfter(&requeueOrNot, retryLaterDuration)
}
if err != nil {
return requeueOrNot, err
}
}

// Upsert or delete solrcloud-wide PodDisruptionBudget(s) based on 'Enabled' flag.
pdb := util.GeneratePodDisruptionBudget(instance, statefulSet.Spec.Selector.MatchLabels)
if instance.Spec.Availability.PodDisruptionBudget.Enabled != nil && *instance.Spec.Availability.PodDisruptionBudget.Enabled {
13 changes: 8 additions & 5 deletions controllers/util/solr_update_util.go
@@ -92,6 +92,10 @@ type OutOfDatePodSegmentation struct {
Running []corev1.Pod
}

func (seg OutOfDatePodSegmentation) IsEmpty() bool {
return len(seg.NotStarted)+len(seg.ScheduledForDeletion)+len(seg.Running) == 0
}

// DeterminePodsSafeToUpdate takes a list of solr Pods and returns a list of pods that are safe to upgrade now.
// This function MUST be idempotent and return the same list of pods given the same kubernetes/solr state.
//
@@ -101,17 +105,17 @@ type OutOfDatePodSegmentation struct {
// TODO:
// - Think about caching this for ~250 ms? Not a huge need to send these requests milliseconds apart.
// - Might be too much complexity for very little gain.
func DeterminePodsSafeToUpdate(ctx context.Context, cloud *solr.SolrCloud, outOfDatePods OutOfDatePodSegmentation, readyPods int, availableUpdatedPodCount int, logger logr.Logger) (podsToUpdate []corev1.Pod, podsHaveReplicas map[string]bool, retryLater bool, err error) {
func DeterminePodsSafeToUpdate(ctx context.Context, cloud *solr.SolrCloud, totalPods int, outOfDatePods OutOfDatePodSegmentation, hasReadyPod bool, availableUpdatedPodCount int, logger logr.Logger) (podsToUpdate []corev1.Pod, podsHaveReplicas map[string]bool, retryLater bool, err error) {
// Before fetching the cluster state, be sure that there is room to update at least 1 pod
maxPodsUnavailable, unavailableUpdatedPodCount, maxPodsToUpdate := calculateMaxPodsToUpdate(cloud, len(outOfDatePods.Running), len(outOfDatePods.NotStarted)+len(outOfDatePods.ScheduledForDeletion), availableUpdatedPodCount)
maxPodsUnavailable, unavailableUpdatedPodCount, maxPodsToUpdate := calculateMaxPodsToUpdate(cloud, totalPods, len(outOfDatePods.Running), len(outOfDatePods.NotStarted)+len(outOfDatePods.ScheduledForDeletion), availableUpdatedPodCount)
if maxPodsToUpdate <= 0 {
logger.Info("Pod update selection canceled. The number of updated pods unavailable equals or exceeds the calculated maxPodsUnavailable.",
"unavailableUpdatedPods", unavailableUpdatedPodCount, "outOfDatePodsNotStarted", len(outOfDatePods.NotStarted), "alreadyScheduledForDeletion", len(outOfDatePods.ScheduledForDeletion), "maxPodsUnavailable", maxPodsUnavailable)
} else {
clusterResp := &solr_api.SolrClusterStatusResponse{}
overseerResp := &solr_api.SolrOverseerStatusResponse{}

if readyPods > 0 {
if hasReadyPod {
queryParams := url.Values{}
queryParams.Add("action", "CLUSTERSTATUS")
err = solr_api.CallCollectionsApi(ctx, cloud, queryParams, clusterResp)
@@ -148,8 +152,7 @@ func DeterminePodsSafeToUpdate(ctx context.Context, cloud *solr.SolrCloud, outOf
}

// calculateMaxPodsToUpdate determines the maximum number of additional pods that can be updated.
func calculateMaxPodsToUpdate(cloud *solr.SolrCloud, outOfDatePodCount int, outOfDatePodsNotStartedCount int, availableUpdatedPodCount int) (maxPodsUnavailable int, unavailableUpdatedPodCount int, maxPodsToUpdate int) {
totalPods := int(*cloud.Spec.Replicas)
func calculateMaxPodsToUpdate(cloud *solr.SolrCloud, totalPods int, outOfDatePodCount int, outOfDatePodsNotStartedCount int, availableUpdatedPodCount int) (maxPodsUnavailable int, unavailableUpdatedPodCount int, maxPodsToUpdate int) {
// To calculate the number of updated pods that are unavailable, take the total pods and subtract those that are available and updated, and those that are not updated.
unavailableUpdatedPodCount = totalPods - availableUpdatedPodCount - outOfDatePodCount - outOfDatePodsNotStartedCount
// If the maxBatchNodeUpgradeSpec is passed as a decimal between 0 and 1, then calculate as a percentage of the number of nodes.
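
To make the new totalPods parameter concrete, here is a worked version of the first TestCalculateMaxPodsToUpgrade case below, assuming maxPodsToUpdate = maxPodsUnavailable - unavailableUpdatedPodCount - outOfDatePodsNotStartedCount. That relationship is inferred from the test expectations; the full body of calculateMaxPodsToUpdate is not shown in this diff.

```go
package main

import "fmt"

func main() {
	// First test case below: 10 total pods, maxPodsUnavailable fixed at 2 via intstr.FromInt(2).
	totalPods := 10
	outOfDatePodCount := 4            // out-of-date pods that are running
	outOfDatePodsNotStartedCount := 0 // out-of-date pods not yet started
	availableUpdatedPodCount := 4
	maxPodsUnavailable := 2

	// Updated-but-unavailable pods: whatever remains after removing the available
	// updated pods and every out-of-date pod from the total.
	unavailableUpdatedPodCount := totalPods - availableUpdatedPodCount - outOfDatePodCount - outOfDatePodsNotStartedCount

	// Assumed relationship, consistent with the test expectations in this PR.
	maxPodsToUpdate := maxPodsUnavailable - unavailableUpdatedPodCount - outOfDatePodsNotStartedCount

	fmt.Println(unavailableUpdatedPodCount, maxPodsToUpdate) // 2 0, matching the first assertions below
}
```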
14 changes: 7 additions & 7 deletions controllers/util/solr_update_util_test.go
@@ -524,38 +524,38 @@ func TestCalculateMaxPodsToUpgrade(t *testing.T) {
},
}

foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate := calculateMaxPodsToUpdate(solrCloud, 4, 0, 4)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate := calculateMaxPodsToUpdate(solrCloud, 10, 4, 0, 4)
assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromInt(2)")
assert.Equal(t, 2, foundUnavailableUpdatedPodCount, "Incorrect value of unavailableUpdatedPodCount")
assert.Equal(t, 0, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 4, 0, 3)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 4, 0, 3)
assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromInt(2)")
assert.Equal(t, 3, foundUnavailableUpdatedPodCount, "Incorrect value of unavailableUpdatedPodCount")
assert.Equal(t, -1, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 1, 3)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 1, 3)
assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromInt(2)")
assert.Equal(t, 3, foundUnavailableUpdatedPodCount, "Incorrect value of unavailableUpdatedPodCount")
assert.Equal(t, -2, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

maxPodsUnavailable = intstr.FromString("45%")
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 0, 5)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 0, 5)
assert.Equal(t, 4, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"45%\")")
assert.Equal(t, 2, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

maxPodsUnavailable = intstr.FromString("45%")
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 1, 2, 5)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 1, 2, 5)
assert.Equal(t, 4, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"45%\")")
assert.Equal(t, 0, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

maxPodsUnavailable = intstr.FromString("70%")
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 0, 2)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 0, 2)
assert.Equal(t, 7, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"70%\")")
assert.Equal(t, 2, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

solrCloud.Spec.UpdateStrategy.ManagedUpdateOptions.MaxPodsUnavailable = nil
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 0, 2)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 0, 2)
assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"25%\")")
assert.Equal(t, -3, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")
}
7 changes: 7 additions & 0 deletions helm/solr-operator/Chart.yaml
@@ -84,6 +84,13 @@ annotations:
url: https://github.com/apache/solr-operator/issues/570
- name: Github PR
url: https://github.com/apache/solr-operator/pull/578
- kind: changed
description: Managed Rolling Updates are now computed via a Cluster Lock, like scaling operations.
links:
- name: Github Issue
url: https://github.com/apache/solr-operator/issues/560
- name: Github PR
url: https://github.com/apache/solr-operator/pull/586
artifacthub.io/images: |
- name: solr-operator
image: apache/solr-operator:v0.8.0-prerelease