From c624b0bd02c261fb1b65b7ffdc7635ecc13e5a79 Mon Sep 17 00:00:00 2001
From: Jean Cochrane
Date: Fri, 17 Nov 2023 12:08:00 -0600
Subject: [PATCH] Check for validity of run IDs before issuing delete operations in delete_current_year_model_runs.R

---
 R/delete_current_year_model_runs.R | 50 +++++++++++++++++++++++-------
 R/helpers.R                        | 35 ++++++++++-----------
 2 files changed, 54 insertions(+), 31 deletions(-)

diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R
index 9cd6165b..e6d9d228 100644
--- a/R/delete_current_year_model_runs.R
+++ b/R/delete_current_year_model_runs.R
@@ -1,22 +1,40 @@
 # Script to delete a list of model runs by ID from AWS.
 #
+# Accepts an arbitrary number of arguments, each of which should be the run ID
+# of a model run whose artifacts should be deleted.
+#
 # Assumes that model runs are restricted to the current assessment cycle, where
-# each assessment cycle starts in April.
+# each assessment cycle starts in April. Raises an error if no objects matching
+# a given ID for the current year could be located in S3. This error will get
+# raised before any deletion occurs, so if one or more IDs are invalid then
+# no objects will be deleted.
+#
+# Example usage:
 #
-# Raises an error if no objects matching the given ID were deleted.
+#   delete_current_year_model_runs.R 123 456 789
 
 library(glue)
 library(here)
 library(magrittr)
 source(here("R", "helpers.R"))
 
-# Slightly altered version of model_delete_run from helpers.R that raises an
-# error if no objects were deleted
-delete_run <- function(run_id, year) {
-  deleted_objs <- model_delete_run(run_id, year)
-  if (length(deleted_objs) == 0) {
-    error_msg <- "No objects match the run ID '{run_id}' for year {year}"
-    error_msg %>%
+# Function to check whether S3 artifacts exist for a given model run.
+# Defining this as a separate check from the deletion operation is helpful for
+# two reasons:
+#
+#   1. The aws.s3::delete_object API does not raise an error if an object does
+#      not exist, so a delete operation alone won't alert us for an incorrect
+#      ID
+#   2. Even if aws.s3::delete_object could raise an error for missing objects,
+#      we want to alert the caller that one or more of the IDs were incorrect
+#      before deleting any objects so that this script is nondestructive
+#      in the case of a malformed ID
+raise_if_run_id_is_invalid <- function(run_id, year) {
+  artifacts_exist <- model_get_s3_artifacts_for_run(run_id, year) %>%
+    sapply(aws.s3::object_exists)
+
+  if (!any(artifacts_exist)) {
+    "Model run {run_id} for year {year} is missing all S3 artifacts" %>%
       glue::glue() %>%
       stop()
   }
@@ -38,9 +56,17 @@ year <- if (current_month < "03") {
 
 run_ids <- commandArgs(trailingOnly = TRUE)
 
-log_msg <- "Deleting run IDs for year {year}: {run_ids}"
-log_msg %>%
+"Confirming artifacts exist for run IDs in year {year}: {run_ids}" %>%
+  glue::glue() %>%
+  print()
+
+# For a future improvement, it would probably be more user friendly to catch
+# the missing artifact errors raised by raise_if_run_id_is_invalid and compile
+# a list of all invalid run IDs before raising
+run_ids %>% sapply(raise_if_run_id_is_invalid, year = year)
+
+"Deleting S3 artifacts run IDs in year {year}: {run_ids}" %>%
   glue::glue() %>%
   print()
 
-run_ids %>% sapply(delete_run, year = year)
+run_ids %>% sapply(model_delete_run, year = year)
diff --git a/R/helpers.R b/R/helpers.R
index d8edbd8a..6bfd069b 100644
--- a/R/helpers.R
+++ b/R/helpers.R
@@ -28,18 +28,16 @@ model_file_dict <- function(run_id = NULL, year = NULL) {
   return(dict)
 }
 
-
-# Used to delete erroneous, incomplete, or otherwise unwanted runs
-# Use with caution! Deleted models are retained for a period of time before
-# being permanently deleted
-model_delete_run <- function(run_id, year) {
+# Get a vector of S3 paths to the artifacts for a given model run
+model_get_s3_artifacts_for_run <- function(run_id, year) {
   # Get paths of all run objects based on the file dictionary
   paths <- model_file_dict(run_id, year)
   s3_objs <- grep("s3://", unlist(paths), value = TRUE)
   bucket <- strsplit(s3_objs[1], "/")[[1]][3]
 
   # First get anything partitioned only by year
-  s3_objs_limited <- grep(".parquet$|.zip$|.rds$", s3_objs, value = TRUE)
+  s3_objs_limited <- grep(".parquet$|.zip$|.rds$", s3_objs, value = TRUE) %>%
+    unname()
 
   # Next get the prefix of anything partitioned by year and run_id
   s3_objs_dir_path <- file.path(
@@ -53,22 +51,21 @@ model_delete_run <- function(run_id, year) {
   )
   s3_objs_dir_path <- gsub(paste0("s3://", bucket, "/"), "", s3_objs_dir_path)
   s3_objs_dir_path <- gsub("//", "/", s3_objs_dir_path)
-  s3_objs_w_run_id <- unlist(purrr::map(
-    s3_objs_dir_path,
-    ~ aws.s3::get_bucket_df(bucket, .x)$Key
-  ))
-
-  # Delete current version of objects
-  del_objs_limited <- purrr::walk(s3_objs_limited, aws.s3::delete_object)
-  del_objs_w_run_id <- purrr::walk(
-    s3_objs_w_run_id,
-    aws.s3::delete_object,
-    bucket = bucket
-  )
+  s3_objs_w_run_id <- s3_objs_dir_path %>%
+    purrr::map(~ aws.s3::get_bucket_df(bucket, .x)$Key) %>%
+    unlist() %>%
+    purrr::map_chr(~ glue::glue("s3://{bucket}/{.x}"))
 
-  return(c(del_objs_limited, del_objs_w_run_id))
+  return(c(s3_objs_limited, s3_objs_w_run_id))
 }
 
+# Used to delete erroneous, incomplete, or otherwise unwanted runs
+# Use with caution! Deleted models are retained for a period of time before
+# being permanently deleted
+model_delete_run <- function(run_id, year) {
+  model_get_s3_artifacts_for_run(run_id, year) %>%
+    purrr::walk(aws.s3::delete_object)
+}
 
 # Used to fetch a run's output from S3 and populate it locally. Useful for
 # running reports and performing local troubleshooting