Skip to content

Commit

Permalink
add function to remove stalled backups for paused instances
Browse files Browse the repository at this point in the history
  • Loading branch information
nhudson committed Sep 27, 2024
1 parent 7fc1136 commit 219f0fe
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 1 deletion.
58 changes: 57 additions & 1 deletion tembo-operator/src/cloudnativepg/cnpg_utils.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
pub use crate::{
apis::coredb_types::CoreDB,
cloudnativepg::backups::Backup,
cloudnativepg::clusters::{Cluster, ClusterStatusConditionsStatus},
cloudnativepg::poolers::Pooler,
cloudnativepg::scheduledbackups::ScheduledBackup,
controller,
extensions::database_queries::is_not_restarting,
patch_cdb_status_merge, requeue_normal_with_jitter, Context, RESTARTED_AT,
};
use k8s_openapi::apimachinery::pkg::apis::meta::v1::Time;
use kube::{
api::{Patch, PatchParams},
api::{DeleteParams, ListParams, Patch, PatchParams},
runtime::controller::Action,
Api, ResourceExt,
};
Expand Down Expand Up @@ -321,3 +323,57 @@ pub(crate) async fn is_image_updated(

Ok(())
}

// remove_stalled_backups function takes a CoreDB, Conext and removed any stalled
// backups. A backup is considered stalled if it's older than 24 hours and does not have a phase set.
#[instrument(skip(cdb, ctx), fields(trace_id, instance_name = %cdb.name_any()))]
pub(crate) async fn removed_stalled_backups(
cdb: &CoreDB,
ctx: &Arc<Context>,
) -> Result<(), Action> {
let name = cdb.name_any();
let namespace = cdb.metadata.namespace.as_ref().ok_or_else(|| {
error!("Namespace is empty for instance: {}.", name);
Action::requeue(Duration::from_secs(300))
})?;

let backup_api: Api<Backup> = Api::namespaced(ctx.client.clone(), namespace);

// List all backups for the cluster
let lp = ListParams {
label_selector: Some(format!("cnpg.io/cluster={}", name.as_str())),
..ListParams::default()
};
let backups = backup_api.list(&lp).await.map_err(|e| {
error!("Error listing backups: {}", e);
Action::requeue(Duration::from_secs(300))
})?;

let twenty_four_hours_ago = Time(chrono::Utc::now() - chrono::Duration::hours(24));

// Filter backups that do not have a status set and are older than 24 hours
for backup in &backups.items {
if backup.status.is_none() {
if let Some(creation_time) = backup.metadata.creation_timestamp.as_ref() {
if creation_time < &twenty_four_hours_ago {
info!("Deleting stalled backup: {}", backup.name_any());
match backup_api
.delete(&backup.name_any(), &DeleteParams::default())
.await
{
Ok(_) => {
info!("Successfully deleted stalled backup: {}", backup.name_any())
}
Err(e) => error!(
"Failed to delete stalled backup {}: {}",
backup.name_any(),
e
),
}
}
}
}
}

Ok(())
}
4 changes: 4 additions & 0 deletions tembo-operator/src/cloudnativepg/hibernate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use k8s_openapi::api::apps::v1::Deployment;
use crate::app_service::manager::get_appservice_deployment_objects;
use crate::cloudnativepg::cnpg_utils::{
get_pooler_instances, patch_cluster_merge, patch_pooler_merge, patch_scheduled_backup_merge,
removed_stalled_backups,
};
use std::sync::Arc;
use std::time::Duration;
Expand Down Expand Up @@ -203,6 +204,9 @@ pub async fn reconcile_cluster_hibernation(cdb: &CoreDB, ctx: &Arc<Context>) ->
patch_cdb_status_merge(&coredbs, &name, patch_status).await?;

if cdb.spec.stop {
// Only remove stalled backups if the instance is stopped/paused
info!("Remove any stalled backups for paused instance {}", name);
removed_stalled_backups(cdb, ctx).await?;
info!("Fully reconciled stopped instance {}", name);
return Err(requeue_normal_with_jitter());
}
Expand Down

0 comments on commit 219f0fe

Please sign in to comment.