Skip to content

Commit

Permalink
Check for validity of run IDs before issuing delete operations in del…
Browse files Browse the repository at this point in the history
…ete_current_year_model_runs.R
  • Loading branch information
jeancochrane committed Nov 17, 2023
1 parent a090cc1 commit c624b0b
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 31 deletions.
50 changes: 38 additions & 12 deletions R/delete_current_year_model_runs.R
Original file line number Diff line number Diff line change
@@ -1,22 +1,40 @@
# Script to delete a list of model runs by ID from AWS.
#
# Accepts an arbitrary number of arguments, each of which should be the run ID
# of a model run whose artifacts should be deleted.
#
# Assumes that model runs are restricted to the current assessment cycle, where
# each assessment cycle starts in April.
# each assessment cycle starts in April. Raises an error if no objects matching
# a given ID for the current year could be located in S3. This error will get
# raised before any deletion occurs, so if one or more IDs are invalid then
# no objects will be deleted.
#
# Example usage:
#
# Raises an error if no objects matching the given ID were deleted.
# delete_current_year_model_runs.R 123 456 789

library(glue)
library(here)
library(magrittr)
source(here("R", "helpers.R"))

# Slightly altered version of model_delete_run from helpers.R that raises an
# error if no objects were deleted
delete_run <- function(run_id, year) {
deleted_objs <- model_delete_run(run_id, year)
if (length(deleted_objs) == 0) {
error_msg <- "No objects match the run ID '{run_id}' for year {year}"
error_msg %>%
# Function to check whether S3 artifacts exist for a given model run.
# Defining this as a separate check from the deletion operation is helpful for
# two reasons:
#
# 1. The aws.s3::delete_object API does not raise an error if an object does
# not exist, so a delete operation alone won't alert us to an incorrect
# ID
# 2. Even if aws.s3::delete_object could raise an error for missing objects,
# we want to alert the caller that one or more of the IDs were incorrect
# before deleting any objects so that this script is nondestructive
# in the case of a malformed ID
raise_if_run_id_is_invalid <- function(run_id, year) {
artifacts_exist <- model_get_s3_artifacts_for_run(run_id, year) %>%
sapply(aws.s3::object_exists)

if (!any(artifacts_exist)) {
"Model run {run_id} for year {year} is missing all S3 artifacts" %>%
glue::glue() %>%
stop()
}
Expand All @@ -38,9 +56,17 @@ year <- if (current_month < "03") {

run_ids <- commandArgs(trailingOnly = TRUE)

log_msg <- "Deleting run IDs for year {year}: {run_ids}"
log_msg %>%
"Confirming artifacts exist for run IDs in year {year}: {run_ids}" %>%
glue::glue() %>%
print()

# As a future improvement, it would probably be more user-friendly to catch
# the missing artifact errors raised by raise_if_run_id_is_invalid and compile
# a list of all invalid run IDs before raising a single error
run_ids %>% sapply(raise_if_run_id_is_invalid, year = year)

# Announce the deletion step. glue() recycles over vector inputs, so
# interpolating run_ids directly would emit one message per ID; toString()
# collapses the IDs into a single comma-separated list so that exactly one
# message is printed for the whole batch.
"Deleting S3 artifacts for run IDs in year {year}: {toString(run_ids)}" %>%
  glue::glue() %>%
  print()

run_ids %>% sapply(delete_run, year = year)
run_ids %>% sapply(model_delete_run, year = year)
35 changes: 16 additions & 19 deletions R/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,16 @@ model_file_dict <- function(run_id = NULL, year = NULL) {
return(dict)
}


# Used to delete erroneous, incomplete, or otherwise unwanted runs
# Use with caution! Deleted models are retained for a period of time before
# being permanently deleted
model_delete_run <- function(run_id, year) {
# Get a vector of S3 paths to the artifacts for a given model run
model_get_s3_artifacts_for_run <- function(run_id, year) {
# Get paths of all run objects based on the file dictionary
paths <- model_file_dict(run_id, year)
s3_objs <- grep("s3://", unlist(paths), value = TRUE)
bucket <- strsplit(s3_objs[1], "/")[[1]][3]

# First get anything partitioned only by year
s3_objs_limited <- grep(".parquet$|.zip$|.rds$", s3_objs, value = TRUE)
s3_objs_limited <- grep(".parquet$|.zip$|.rds$", s3_objs, value = TRUE) %>%
unname()

# Next get the prefix of anything partitioned by year and run_id
s3_objs_dir_path <- file.path(
Expand All @@ -53,22 +51,21 @@ model_delete_run <- function(run_id, year) {
)
s3_objs_dir_path <- gsub(paste0("s3://", bucket, "/"), "", s3_objs_dir_path)
s3_objs_dir_path <- gsub("//", "/", s3_objs_dir_path)
s3_objs_w_run_id <- unlist(purrr::map(
s3_objs_dir_path,
~ aws.s3::get_bucket_df(bucket, .x)$Key
))

# Delete current version of objects
del_objs_limited <- purrr::walk(s3_objs_limited, aws.s3::delete_object)
del_objs_w_run_id <- purrr::walk(
s3_objs_w_run_id,
aws.s3::delete_object,
bucket = bucket
)
s3_objs_w_run_id <- s3_objs_dir_path %>%
purrr::map(~ aws.s3::get_bucket_df(bucket, .x)$Key) %>%
unlist() %>%
purrr::map_chr(~ glue::glue("s3://{bucket}/{.x}"))

return(c(del_objs_limited, del_objs_w_run_id))
return(c(s3_objs_limited, s3_objs_w_run_id))
}

# Delete all S3 artifacts belonging to a single model run. Intended for
# cleaning up erroneous, incomplete, or otherwise unwanted runs.
#
# Use with caution! Deleted models are retained for a period of time before
# being permanently deleted
#
# Arguments:
#   run_id: ID of the model run whose artifacts should be deleted
#   year:   year the run belongs to; passed through unchanged to
#           model_get_s3_artifacts_for_run
#
# Returns the vector of artifact paths that were passed to
# aws.s3::delete_object (purrr::walk hands back its input).
model_delete_run <- function(run_id, year) {
  artifact_paths <- model_get_s3_artifacts_for_run(run_id, year)
  purrr::walk(artifact_paths, aws.s3::delete_object)
}

# Used to fetch a run's output from S3 and populate it locally. Useful for
# running reports and performing local troubleshooting
Expand Down

0 comments on commit c624b0b

Please sign in to comment.