Move rolling update logic under clusterOp #586

Merged: 6 commits, Jul 13, 2023
73 changes: 73 additions & 0 deletions controllers/solr_cluster_ops_util.go
@@ -156,6 +156,79 @@ func handleManagedCloudScaleUp(ctx context.Context, r *SolrCloudReconciler, inst
return
}

func determineRollingUpdateClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, outOfDatePods util.OutOfDatePodSegmentation, logger logr.Logger) (clusterLockAcquired bool, retryLaterDuration time.Duration, err error) {
if instance.Spec.UpdateStrategy.Method == solrv1beta1.ManagedUpdate && !outOfDatePods.IsEmpty() {
// Managed Rolling Upgrade!
originalStatefulSet := statefulSet.DeepCopy()
statefulSet.Annotations[util.ClusterOpsLockAnnotation] = util.UpdateLock
// No rolling update metadata is currently required
statefulSet.Annotations[util.ClusterOpsMetadataAnnotation] = ""
if err = r.Patch(ctx, statefulSet, client.StrategicMergeFrom(originalStatefulSet)); err != nil {
logger.Error(err, "Error while patching StatefulSet to start clusterOp", "clusterOp", util.UpdateLock, "clusterOpMetadata", "")
} else {
clusterLockAcquired = true
}
}
return
}
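
The new lock is just a pair of annotations on the StatefulSet. Below is a minimal, self-contained sketch of that pattern; the annotation keys and lock value are hypothetical stand-ins for util.ClusterOpsLockAnnotation, util.ClusterOpsMetadataAnnotation, and util.UpdateLock, and the "is a lock already held" check (which the Reconcile loop performs before calling this function in the PR) is folded into the helper so the example runs on its own.

```go
package main

import "fmt"

// Hypothetical stand-ins for the real annotation keys and lock value defined
// in controllers/util; the actual strings in the operator may differ.
const (
	clusterOpsLockAnnotation     = "solr.apache.org/clusterOpsLock"
	clusterOpsMetadataAnnotation = "solr.apache.org/clusterOpsMetadata"
	updateLock                   = "rollingUpdate"
)

// acquireUpdateLock mimics determineRollingUpdateClusterOpLockIfNecessary:
// stamp the lock annotations only when no other clusterOp currently holds the lock.
// (In the PR, the "already locked" check lives in the Reconcile loop instead.)
func acquireUpdateLock(annotations map[string]string) bool {
	if _, locked := annotations[clusterOpsLockAnnotation]; locked {
		return false // another cluster operation is in progress
	}
	annotations[clusterOpsLockAnnotation] = updateLock
	annotations[clusterOpsMetadataAnnotation] = "" // rolling updates need no metadata
	return true
}

func main() {
	sts := map[string]string{} // stands in for statefulSet.Annotations
	fmt.Println(acquireUpdateLock(sts)) // true: lock acquired, update runs in later reconciles
	fmt.Println(acquireUpdateLock(sts)) // false: lock already held, no other clusterOp may start
}
```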

// handleManagedCloudRollingUpdate does the logic of a managed and "locked" cloud rolling update operation.
// This will take many reconcile loops to complete, as it is deleting pods/moving replicas.
func handleManagedCloudRollingUpdate(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, outOfDatePods util.OutOfDatePodSegmentation, hasReadyPod bool, availableUpdatedPodCount int, logger logr.Logger) (retryLaterDuration time.Duration, err error) {
// Manage the updating of out-of-spec pods, if the Managed UpdateStrategy has been specified.
updateLogger := logger.WithName("ManagedUpdateSelector")

// First check if all pods are up to date. If so the rolling update is complete
if outOfDatePods.IsEmpty() {
// Once the rolling update is complete, finish the cluster operation by deleting the statefulSet annotations
originalStatefulSet := statefulSet.DeepCopy()
delete(statefulSet.Annotations, util.ClusterOpsLockAnnotation)
delete(statefulSet.Annotations, util.ClusterOpsMetadataAnnotation)
if err = r.Patch(ctx, statefulSet, client.StrategicMergeFrom(originalStatefulSet)); err != nil {
logger.Error(err, "Error while patching StatefulSet to finish the managed SolrCloud rollingUpdate clusterOp")
}

// TODO: Create event for the CRD.
} else {
// The out of date pods that have not been started should all be updated immediately.
// There is no use "safely" updating pods which have not been started yet.
podsToUpdate := append([]corev1.Pod{}, outOfDatePods.NotStarted...)
for _, pod := range outOfDatePods.NotStarted {
updateLogger.Info("Pod killed for update.", "pod", pod.Name, "reason", "The solr container in the pod has not yet started, thus it is safe to update.")
}

// Pick which pods should be deleted for an update.
// Don't exit on an error, which would only occur because of an HTTP Exception. Requeue later instead.
additionalPodsToUpdate, podsHaveReplicas, retryLater, clusterStateError :=
util.DeterminePodsSafeToUpdate(ctx, instance, int(*statefulSet.Spec.Replicas), outOfDatePods, hasReadyPod, availableUpdatedPodCount, updateLogger)
// If we do not have the clusterState, it's not safe to update pods that are running
if clusterStateError != nil {
retryLater = true
} else {
podsToUpdate = append(podsToUpdate, outOfDatePods.ScheduledForDeletion...)
podsToUpdate = append(podsToUpdate, additionalPodsToUpdate...)
}

// Only actually delete a running pod if it has been evicted, or doesn't need eviction (persistent storage)
for _, pod := range podsToUpdate {
retryLaterDurationTemp, errTemp := DeletePodForUpdate(ctx, r, instance, &pod, podsHaveReplicas[pod.Name], updateLogger)

// Use the retryLaterDuration of the pod that requires a retry the soonest (smallest duration > 0)
if retryLaterDurationTemp > 0 && (retryLaterDurationTemp < retryLaterDuration || retryLaterDuration == 0) {
retryLaterDuration = retryLaterDurationTemp
}
if errTemp != nil {
err = errTemp
}
}

if retryLater && retryLaterDuration == 0 {
retryLaterDuration = time.Second * 10
}
}
return
}
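
One detail worth noting is how the handler aggregates requeue times: each pod deletion may report its own retry duration, and the handler keeps the smallest positive one so the next reconcile happens as soon as the earliest pod is ready to be retried. A tiny illustrative helper, not part of the PR (the logic is inlined in the pod loop above):

```go
package main

import (
	"fmt"
	"time"
)

// minPositive keeps the smallest duration greater than zero, treating zero as
// "no retry requested yet" -- the same rule handleManagedCloudRollingUpdate
// applies inline when looping over podsToUpdate.
func minPositive(durations ...time.Duration) (retryLater time.Duration) {
	for _, d := range durations {
		if d > 0 && (d < retryLater || retryLater == 0) {
			retryLater = d
		}
	}
	return
}

func main() {
	// Pods reporting no retry (0), 30s, and 5s: requeue after 5s, the soonest retry.
	fmt.Println(minPositive(0, 30*time.Second, 5*time.Second))
}
```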

// clearClusterOp simply removes any clusterOp for the given statefulSet.
// This should only be used as a "break-glass" scenario. Do not use this to finish off successful clusterOps.
func clearClusterOp(ctx context.Context, r *SolrCloudReconciler, statefulSet *appsv1.StatefulSet, reason string, logger logr.Logger) (err error) {
81 changes: 22 additions & 59 deletions controllers/solrcloud_controller.go
@@ -285,6 +285,13 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
var security *util.SecurityConfig = nil
if instance.Spec.SolrSecurity != nil {
security, err = util.ReconcileSecurityConfig(ctx, &r.Client, instance)
if err == nil && security != nil {
// If authn enabled on Solr, we need to pass the auth header when making requests
ctx, err = security.AddAuthToContext(ctx)
if err != nil {
logger.Error(err, "failed to create Authorization header when reconciling")
}
}
if err != nil {
return requeueOrNot, err
}
@@ -443,12 +450,14 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
}

// We only want to do one cluster operation at a time, so we use a lock to ensure that.
// Update or a Scale at a time. We do not want to do both.

// Update or Scale, one-at-a-time. We do not want to do both.
hasReadyPod := newStatus.ReadyReplicas > 0
var retryLaterDuration time.Duration
if clusterOpLock, hasAnn := statefulSet.Annotations[util.ClusterOpsLockAnnotation]; hasAnn {
clusterOpMetadata := statefulSet.Annotations[util.ClusterOpsMetadataAnnotation]
switch clusterOpLock {
case util.UpdateLock:
retryLaterDuration, err = handleManagedCloudRollingUpdate(ctx, r, instance, statefulSet, outOfDatePods, hasReadyPod, availableUpdatedPodCount, logger)
case util.ScaleDownLock:
retryLaterDuration, err = handleManagedCloudScaleDown(ctx, r, instance, statefulSet, clusterOpMetadata, podList, logger)
case util.ScaleUpLock:
@@ -459,12 +468,22 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
err = clearClusterOp(ctx, r, statefulSet, "clusterOp not supported", logger)
}
} else {
lockAcquired := false
// Start cluster operations if needed.
// The operations will be actually run in future reconcile loops, but a clusterOpLock will be acquired here.
// And that lock will tell future reconcile loops that the operation needs to be done.
// If a non-managed scale needs to take place, this method will update the StatefulSet without starting
// a "locked" cluster operation
lockAcquired, retryLaterDuration, err = determineRollingUpdateClusterOpLockIfNecessary(ctx, r, instance, statefulSet, outOfDatePods, logger)
// Start cluster operations if needed.
// The operations will be actually run in future reconcile loops, but a clusterOpLock will be acquired here.
// And that lock will tell future reconcile loops that the operation needs to be done.
// If a non-managed scale needs to take place, this method will update the StatefulSet without starting
// a "locked" cluster operation
_, retryLaterDuration, err = determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet, podList, logger)
if !lockAcquired {
lockAcquired, retryLaterDuration, err = determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet, podList, logger)
}
// After a lock is acquired, the reconcile will be started again because the StatefulSet is being watched
}
if err != nil && retryLaterDuration == 0 {
retryLaterDuration = time.Second * 5
@@ -476,62 +495,6 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return requeueOrNot, err
}

// TODO: Move this logic into the ClusterOpLock, with the "rollingUpdate" lock
if instance.Spec.UpdateStrategy.Method == solrv1beta1.ManagedUpdate && len(outOfDatePods.NotStarted)+len(outOfDatePods.ScheduledForDeletion)+len(outOfDatePods.Running) > 0 {
// Manage the updating of out-of-spec pods, if the Managed UpdateStrategy has been specified.
updateLogger := logger.WithName("ManagedUpdateSelector")

// The out of date pods that have not been started should all be updated immediately.
// There is no use "safely" updating pods which have not been started yet.
podsToUpdate := append([]corev1.Pod{}, outOfDatePods.NotStarted...)
for _, pod := range outOfDatePods.NotStarted {
updateLogger.Info("Pod killed for update.", "pod", pod.Name, "reason", "The solr container in the pod has not yet started, thus it is safe to update.")
}

// If authn enabled on Solr, we need to pass the auth header
if security != nil {
ctx, err = security.AddAuthToContext(ctx)
if err != nil {
updateLogger.Error(err, "failed to create Authorization header when reconciling", "SolrCloud", instance.Name)
return requeueOrNot, err
}
}

// Pick which pods should be deleted for an update.
// Don't exit on an error, which would only occur because of an HTTP Exception. Requeue later instead.
additionalPodsToUpdate, podsHaveReplicas, retryLater, clusterStateError := util.DeterminePodsSafeToUpdate(ctx, instance, outOfDatePods, int(newStatus.ReadyReplicas), availableUpdatedPodCount, updateLogger)
// If we do not have the clusterState, it's not safe to update pods that are running
if clusterStateError != nil {
retryLater = true
} else {
podsToUpdate = append(podsToUpdate, outOfDatePods.ScheduledForDeletion...)
podsToUpdate = append(podsToUpdate, additionalPodsToUpdate...)
}

// Only actually delete a running pod if it has been evicted, or doesn't need eviction (persistent storage)
for _, pod := range podsToUpdate {
retryLaterDurationTemp, errTemp := DeletePodForUpdate(ctx, r, instance, &pod, podsHaveReplicas[pod.Name], updateLogger)

// Use the retryLaterDuration of the pod that requires a retry the soonest (smallest duration > 0)
if retryLaterDurationTemp > 0 && (retryLaterDurationTemp < retryLaterDuration || retryLaterDuration == 0) {
retryLaterDuration = retryLaterDurationTemp
}
if errTemp != nil {
err = errTemp
}
}

if err != nil || retryLaterDuration > 0 || retryLater {
if retryLaterDuration == 0 {
retryLaterDuration = time.Second * 10
}
updateRequeueAfter(&requeueOrNot, retryLaterDuration)
}
if err != nil {
return requeueOrNot, err
}
}

// Upsert or delete solrcloud-wide PodDisruptionBudget(s) based on 'Enabled' flag.
pdb := util.GeneratePodDisruptionBudget(instance, statefulSet.Spec.Selector.MatchLabels)
if instance.Spec.Availability.PodDisruptionBudget.Enabled != nil && *instance.Spec.Availability.PodDisruptionBudget.Enabled {
13 changes: 8 additions & 5 deletions controllers/util/solr_update_util.go
@@ -92,6 +92,10 @@ type OutOfDatePodSegmentation struct {
Running []corev1.Pod
}

func (seg OutOfDatePodSegmentation) IsEmpty() bool {
return len(seg.NotStarted)+len(seg.ScheduledForDeletion)+len(seg.Running) == 0
}

// DeterminePodsSafeToUpdate takes a list of solr Pods and returns a list of pods that are safe to upgrade now.
// This function MUST be idempotent and return the same list of pods given the same kubernetes/solr state.
//
@@ -101,17 +105,17 @@ type OutOfDatePodSegmentation struct {
// TODO:
// - Think about caching this for ~250 ms? Not a huge need to send these requests milliseconds apart.
// - Might be too much complexity for very little gain.
func DeterminePodsSafeToUpdate(ctx context.Context, cloud *solr.SolrCloud, outOfDatePods OutOfDatePodSegmentation, readyPods int, availableUpdatedPodCount int, logger logr.Logger) (podsToUpdate []corev1.Pod, podsHaveReplicas map[string]bool, retryLater bool, err error) {
func DeterminePodsSafeToUpdate(ctx context.Context, cloud *solr.SolrCloud, totalPods int, outOfDatePods OutOfDatePodSegmentation, hasReadyPod bool, availableUpdatedPodCount int, logger logr.Logger) (podsToUpdate []corev1.Pod, podsHaveReplicas map[string]bool, retryLater bool, err error) {
// Before fetching the cluster state, be sure that there is room to update at least 1 pod
maxPodsUnavailable, unavailableUpdatedPodCount, maxPodsToUpdate := calculateMaxPodsToUpdate(cloud, len(outOfDatePods.Running), len(outOfDatePods.NotStarted)+len(outOfDatePods.ScheduledForDeletion), availableUpdatedPodCount)
maxPodsUnavailable, unavailableUpdatedPodCount, maxPodsToUpdate := calculateMaxPodsToUpdate(cloud, totalPods, len(outOfDatePods.Running), len(outOfDatePods.NotStarted)+len(outOfDatePods.ScheduledForDeletion), availableUpdatedPodCount)
if maxPodsToUpdate <= 0 {
logger.Info("Pod update selection canceled. The number of updated pods unavailable equals or exceeds the calculated maxPodsUnavailable.",
"unavailableUpdatedPods", unavailableUpdatedPodCount, "outOfDatePodsNotStarted", len(outOfDatePods.NotStarted), "alreadyScheduledForDeletion", len(outOfDatePods.ScheduledForDeletion), "maxPodsUnavailable", maxPodsUnavailable)
} else {
clusterResp := &solr_api.SolrClusterStatusResponse{}
overseerResp := &solr_api.SolrOverseerStatusResponse{}

if readyPods > 0 {
if hasReadyPod {
queryParams := url.Values{}
queryParams.Add("action", "CLUSTERSTATUS")
err = solr_api.CallCollectionsApi(ctx, cloud, queryParams, clusterResp)
@@ -148,8 +152,7 @@ func DeterminePodsSafeToUpdate(ctx context.Context, cloud *solr.SolrCloud, outOf
}

// calculateMaxPodsToUpdate determines the maximum number of additional pods that can be updated.
func calculateMaxPodsToUpdate(cloud *solr.SolrCloud, outOfDatePodCount int, outOfDatePodsNotStartedCount int, availableUpdatedPodCount int) (maxPodsUnavailable int, unavailableUpdatedPodCount int, maxPodsToUpdate int) {
totalPods := int(*cloud.Spec.Replicas)
func calculateMaxPodsToUpdate(cloud *solr.SolrCloud, totalPods int, outOfDatePodCount int, outOfDatePodsNotStartedCount int, availableUpdatedPodCount int) (maxPodsUnavailable int, unavailableUpdatedPodCount int, maxPodsToUpdate int) {
// To calculate the number of updated pods that are unavailable, take the total pods and subtract those that are available and updated, and those that are not updated.
unavailableUpdatedPodCount = totalPods - availableUpdatedPodCount - outOfDatePodCount - outOfDatePodsNotStartedCount
// If the maxBatchNodeUpgradeSpec is passed as a decimal between 0 and 1, then calculate as a percentage of the number of nodes.
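
To make the new totalPods parameter concrete, here is a worked version of the first TestCalculateMaxPodsToUpgrade case below, assuming maxPodsToUpdate = maxPodsUnavailable - unavailableUpdatedPodCount - outOfDatePodsNotStartedCount. That relationship is inferred from the test expectations; the full body of calculateMaxPodsToUpdate is not shown in this diff.

```go
package main

import "fmt"

func main() {
	// First test case below: 10 total pods, maxPodsUnavailable fixed at 2 via intstr.FromInt(2).
	totalPods := 10
	outOfDatePodCount := 4            // out-of-date pods that are running
	outOfDatePodsNotStartedCount := 0 // out-of-date pods not yet started
	availableUpdatedPodCount := 4
	maxPodsUnavailable := 2

	// Updated-but-unavailable pods: whatever remains after removing the available
	// updated pods and every out-of-date pod from the total.
	unavailableUpdatedPodCount := totalPods - availableUpdatedPodCount - outOfDatePodCount - outOfDatePodsNotStartedCount

	// Assumed relationship, consistent with the test expectations in this PR.
	maxPodsToUpdate := maxPodsUnavailable - unavailableUpdatedPodCount - outOfDatePodsNotStartedCount

	fmt.Println(unavailableUpdatedPodCount, maxPodsToUpdate) // 2 0, matching the first assertions below
}
```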
14 changes: 7 additions & 7 deletions controllers/util/solr_update_util_test.go
@@ -524,38 +524,38 @@ func TestCalculateMaxPodsToUpgrade(t *testing.T) {
},
}

foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate := calculateMaxPodsToUpdate(solrCloud, 4, 0, 4)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate := calculateMaxPodsToUpdate(solrCloud, 10, 4, 0, 4)
assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromInt(2)")
assert.Equal(t, 2, foundUnavailableUpdatedPodCount, "Incorrect value of unavailableUpdatedPodCount")
assert.Equal(t, 0, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 4, 0, 3)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 4, 0, 3)
assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromInt(2)")
assert.Equal(t, 3, foundUnavailableUpdatedPodCount, "Incorrect value of unavailableUpdatedPodCount")
assert.Equal(t, -1, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 1, 3)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 1, 3)
assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromInt(2)")
assert.Equal(t, 3, foundUnavailableUpdatedPodCount, "Incorrect value of unavailableUpdatedPodCount")
assert.Equal(t, -2, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

maxPodsUnavailable = intstr.FromString("45%")
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 0, 5)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 0, 5)
assert.Equal(t, 4, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"45%\")")
assert.Equal(t, 2, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

maxPodsUnavailable = intstr.FromString("45%")
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 1, 2, 5)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 1, 2, 5)
assert.Equal(t, 4, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"45%\")")
assert.Equal(t, 0, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

maxPodsUnavailable = intstr.FromString("70%")
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 0, 2)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 0, 2)
assert.Equal(t, 7, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"70%\")")
assert.Equal(t, 2, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")

solrCloud.Spec.UpdateStrategy.ManagedUpdateOptions.MaxPodsUnavailable = nil
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 0, 2)
foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 0, 2)
assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"25%\")")
assert.Equal(t, -3, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")
}
7 changes: 7 additions & 0 deletions helm/solr-operator/Chart.yaml
@@ -84,6 +84,13 @@ annotations:
url: https://github.com/apache/solr-operator/issues/570
- name: Github PR
url: https://github.com/apache/solr-operator/pull/578
- kind: changed
description: Managed Rolling Updates are now computed via a Cluster Lock, like scaling operations.
links:
- name: Github Issue
url: https://github.com/apache/solr-operator/issues/560
- name: Github PR
url: https://github.com/apache/solr-operator/pull/586
artifacthub.io/images: |
- name: solr-operator
image: apache/solr-operator:v0.8.0-prerelease