From 219f0fed368abe78bac4b8d93ad4ade3fadc6006 Mon Sep 17 00:00:00 2001 From: Nick Hudson Date: Thu, 26 Sep 2024 21:29:11 -0500 Subject: [PATCH] add function to remove stalled backups for paused instances --- .../src/cloudnativepg/cnpg_utils.rs | 58 ++++++++++++++++++- tembo-operator/src/cloudnativepg/hibernate.rs | 4 ++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/tembo-operator/src/cloudnativepg/cnpg_utils.rs b/tembo-operator/src/cloudnativepg/cnpg_utils.rs index c026fdba2..67a952047 100644 --- a/tembo-operator/src/cloudnativepg/cnpg_utils.rs +++ b/tembo-operator/src/cloudnativepg/cnpg_utils.rs @@ -1,5 +1,6 @@ pub use crate::{ apis::coredb_types::CoreDB, + cloudnativepg::backups::Backup, cloudnativepg::clusters::{Cluster, ClusterStatusConditionsStatus}, cloudnativepg::poolers::Pooler, cloudnativepg::scheduledbackups::ScheduledBackup, @@ -7,8 +8,9 @@ pub use crate::{ extensions::database_queries::is_not_restarting, patch_cdb_status_merge, requeue_normal_with_jitter, Context, RESTARTED_AT, }; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::Time; use kube::{ - api::{Patch, PatchParams}, + api::{DeleteParams, ListParams, Patch, PatchParams}, runtime::controller::Action, Api, ResourceExt, }; @@ -321,3 +323,57 @@ pub(crate) async fn is_image_updated( Ok(()) } + +// remove_stalled_backups function takes a CoreDB, Conext and removed any stalled +// backups. A backup is considered stalled if it's older than 24 hours and does not have a phase set. +#[instrument(skip(cdb, ctx), fields(trace_id, instance_name = %cdb.name_any()))] +pub(crate) async fn removed_stalled_backups( + cdb: &CoreDB, + ctx: &Arc, +) -> Result<(), Action> { + let name = cdb.name_any(); + let namespace = cdb.metadata.namespace.as_ref().ok_or_else(|| { + error!("Namespace is empty for instance: {}.", name); + Action::requeue(Duration::from_secs(300)) + })?; + + let backup_api: Api = Api::namespaced(ctx.client.clone(), namespace); + + // List all backups for the cluster + let lp = ListParams { + label_selector: Some(format!("cnpg.io/cluster={}", name.as_str())), + ..ListParams::default() + }; + let backups = backup_api.list(&lp).await.map_err(|e| { + error!("Error listing backups: {}", e); + Action::requeue(Duration::from_secs(300)) + })?; + + let twenty_four_hours_ago = Time(chrono::Utc::now() - chrono::Duration::hours(24)); + + // Filter backups that do not have a status set and are older than 24 hours + for backup in &backups.items { + if backup.status.is_none() { + if let Some(creation_time) = backup.metadata.creation_timestamp.as_ref() { + if creation_time < &twenty_four_hours_ago { + info!("Deleting stalled backup: {}", backup.name_any()); + match backup_api + .delete(&backup.name_any(), &DeleteParams::default()) + .await + { + Ok(_) => { + info!("Successfully deleted stalled backup: {}", backup.name_any()) + } + Err(e) => error!( + "Failed to delete stalled backup {}: {}", + backup.name_any(), + e + ), + } + } + } + } + } + + Ok(()) +} diff --git a/tembo-operator/src/cloudnativepg/hibernate.rs b/tembo-operator/src/cloudnativepg/hibernate.rs index 346b7aecd..408fe54bd 100644 --- a/tembo-operator/src/cloudnativepg/hibernate.rs +++ b/tembo-operator/src/cloudnativepg/hibernate.rs @@ -15,6 +15,7 @@ use k8s_openapi::api::apps::v1::Deployment; use crate::app_service::manager::get_appservice_deployment_objects; use crate::cloudnativepg::cnpg_utils::{ get_pooler_instances, patch_cluster_merge, patch_pooler_merge, patch_scheduled_backup_merge, + removed_stalled_backups, }; use std::sync::Arc; use std::time::Duration; @@ -203,6 +204,9 @@ pub async fn reconcile_cluster_hibernation(cdb: &CoreDB, ctx: &Arc) -> patch_cdb_status_merge(&coredbs, &name, patch_status).await?; if cdb.spec.stop { + // Only remove stalled backups if the instance is stopped/paused + info!("Remove any stalled backups for paused instance {}", name); + removed_stalled_backups(cdb, ctx).await?; info!("Fully reconciled stopped instance {}", name); return Err(requeue_normal_with_jitter()); }