From a35baca94ad94f1b68cfaf0f917d0427394534fa Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 16 Nov 2023 11:52:11 -0600 Subject: [PATCH 01/24] Add delete-model-runs workflow --- .github/workflows/delete-model-runs.yaml | 54 ++++++++++++++++++++++++ R/delete_current_year_model_runs.R | 24 +++++++++++ 2 files changed, 78 insertions(+) create mode 100644 .github/workflows/delete-model-runs.yaml create mode 100644 R/delete_current_year_model_runs.R diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml new file mode 100644 index 00000000..2c9119ee --- /dev/null +++ b/.github/workflows/delete-model-runs.yaml @@ -0,0 +1,54 @@ +# Workflow that can be manually dispatched to delete test model runs that +# do not need to be persisted indefinitely. +# +# Gated such that it's impossible to delete runs older than the current upcoming +# assessment cycle. + +name: delete-model-runs + +on: + workflow_dispatch: + inputs: + run-ids: + description: > + Comma-delimited list of IDs of model runs to delete. Note that the + workflow assumes these IDs correspond to model runs for the current + upcoming assessment cycle, and if that's not the case the deletion + script will raise an error. + required: true + type: string + +jobs: + delete-model-runs: + runs-on: ubuntu-latest + permissions: + # Needed to interact with GitHub's OIDC Token endpoint so we can auth AWS + contents: read + id-token: write + steps: + - name: Checkout repo code + uses: actions/checkout@v4 + + - name: Setup R + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - name: Setup renv + uses: r-lib/actions/setup-renv@v2 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + # TODO: Create a new role and add it to repo secrets + role-to-assume: ${{ secrets.AWS_IAM_ROLE_MODEL_DELETION_ARN }} + aws-region: us-east-1 + + - name: Delete model runs + run: | + for run_id in ${RUN_IDS//,/}; do + Rscript ./R/delete_current_year_model_runs.R "$run_id" + done + shell: bash + env: + RUN_IDS: ${{ inputs.run-ids }} diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R new file mode 100644 index 00000000..5bbee895 --- /dev/null +++ b/R/delete_current_year_model_runs.R @@ -0,0 +1,24 @@ +# Script to delete a list of model runs by ID from AWS. +# +# Assumes that model runs are restricted to the current upcoming +# assessment cycle year. + +library(here) +library(magrittr) +source(here("R", "helpers.R")) + +current_date <- as.POSIXct(Sys.Date()) +current_month <- current_date %>% format("%m") +current_year <- current_date %>% format("%Y") + +# The following heuristic determines the current upcoming assessment cycle year: +# +# * From March to December (post assessment), `year` = next year +# * From January to March (during assessment), `year` = current year +year <- if(current_month < "03") current_year else as.character(as.numeric(current_year) + 1) + +run_ids <- commandArgs(trailingOnly = TRUE) + +sprintf("Deleting run IDs for year %s: %s", year, run_ids) + +run_ids %>% sapply(model_delete_run, year = year) From c5c24048c0c06a8d64c056c5464c0393c9c22174 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 16 Nov 2023 11:53:35 -0600 Subject: [PATCH 02/24] Temporarily run a readonly version of the delete-model-runs workflow for testing --- .github/workflows/delete-model-runs.yaml | 15 +++------------ R/delete_current_year_model_runs.R | 2 +- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index 2c9119ee..f155455c 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -7,16 +7,7 @@ name: delete-model-runs on: - workflow_dispatch: - inputs: - run-ids: - description: > - Comma-delimited list of IDs of model runs to delete. Note that the - workflow assumes these IDs correspond to model runs for the current - upcoming assessment cycle, and if that's not the case the deletion - script will raise an error. - required: true - type: string + pull_request: jobs: delete-model-runs: @@ -41,7 +32,7 @@ jobs: uses: aws-actions/configure-aws-credentials@v4 with: # TODO: Create a new role and add it to repo secrets - role-to-assume: ${{ secrets.AWS_IAM_ROLE_MODEL_DELETION_ARN }} + role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }} aws-region: us-east-1 - name: Delete model runs @@ -51,4 +42,4 @@ jobs: done shell: bash env: - RUN_IDS: ${{ inputs.run-ids }} + RUN_IDS: 1,2,3,4,5 diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R index 5bbee895..f041fa64 100644 --- a/R/delete_current_year_model_runs.R +++ b/R/delete_current_year_model_runs.R @@ -21,4 +21,4 @@ run_ids <- commandArgs(trailingOnly = TRUE) sprintf("Deleting run IDs for year %s: %s", year, run_ids) -run_ids %>% sapply(model_delete_run, year = year) +# run_ids %>% sapply(model_delete_run, year = year) From 3f0b8c1fb89f23612eeb0a3811723e98f564e8f8 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 16 Nov 2023 12:27:05 -0600 Subject: [PATCH 03/24] Install libgit2-dev for git2r in delete-model-runs.yaml --- .github/workflows/delete-model-runs.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index f155455c..d5e215b2 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -25,6 +25,10 @@ jobs: with: use-public-rspm: true + - name: Install system dependencies + # TODO: Cache this step + run: sudo apt-get install libgit2-dev + - name: Setup renv uses: r-lib/actions/setup-renv@v2 From d70deea0651e4bbca5490746e6f6e48853ef1994 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 16 Nov 2023 13:40:18 -0600 Subject: [PATCH 04/24] Fix R styler error in delete_current_year_model_runs.R --- R/delete_current_year_model_runs.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R index f041fa64..f47bf83a 100644 --- a/R/delete_current_year_model_runs.R +++ b/R/delete_current_year_model_runs.R @@ -15,7 +15,7 @@ current_year <- current_date %>% format("%Y") # # * From March to December (post assessment), `year` = next year # * From January to March (during assessment), `year` = current year -year <- if(current_month < "03") current_year else as.character(as.numeric(current_year) + 1) +year <- if (current_month < "03") current_year else as.character(as.numeric(current_year) + 1) run_ids <- commandArgs(trailingOnly = TRUE) From 26c61bbb2c171a204ba6ae263f1ee07df5fbc18d Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 16 Nov 2023 16:00:20 -0600 Subject: [PATCH 05/24] Clean up docstrings and test against 2023-11-14-frosty-jacob model --- .github/workflows/delete-model-runs.yaml | 11 +++++------ R/delete_current_year_model_runs.R | 8 ++++---- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index d5e215b2..40221f85 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -1,8 +1,8 @@ # Workflow that can be manually dispatched to delete test model runs that # do not need to be persisted indefinitely. # -# Gated such that it's impossible to delete runs older than the current upcoming -# assessment cycle. +# Gated such that it's impossible to delete runs older than the current +# assessment cycle, where each assessment cycle starts in April. name: delete-model-runs @@ -26,8 +26,8 @@ jobs: use-public-rspm: true - name: Install system dependencies - # TODO: Cache this step run: sudo apt-get install libgit2-dev + shell: bash - name: Setup renv uses: r-lib/actions/setup-renv@v2 @@ -35,8 +35,7 @@ jobs: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - # TODO: Create a new role and add it to repo secrets - role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }} + role-to-assume: ${{ secrets.AWS_IAM_ROLE_MODEL_DELETION_ARN }} aws-region: us-east-1 - name: Delete model runs @@ -46,4 +45,4 @@ jobs: done shell: bash env: - RUN_IDS: 1,2,3,4,5 + RUN_IDS: 2023-11-14-frosty-jacob diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R index f47bf83a..e2168720 100644 --- a/R/delete_current_year_model_runs.R +++ b/R/delete_current_year_model_runs.R @@ -1,7 +1,7 @@ # Script to delete a list of model runs by ID from AWS. # -# Assumes that model runs are restricted to the current upcoming -# assessment cycle year. +# Assumes that model runs are restricted to the current assessment cycle, where +# each assessment cycle starts in April. library(here) library(magrittr) @@ -13,7 +13,7 @@ current_year <- current_date %>% format("%Y") # The following heuristic determines the current upcoming assessment cycle year: # -# * From March to December (post assessment), `year` = next year +# * From April to December (post assessment), `year` = next year # * From January to March (during assessment), `year` = current year year <- if (current_month < "03") current_year else as.character(as.numeric(current_year) + 1) @@ -21,4 +21,4 @@ run_ids <- commandArgs(trailingOnly = TRUE) sprintf("Deleting run IDs for year %s: %s", year, run_ids) -# run_ids %>% sapply(model_delete_run, year = year) +run_ids %>% sapply(model_delete_run, year = year) From 3297673b81f92ac8470b6d219ea17e2f73ece852 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 16 Nov 2023 16:08:59 -0600 Subject: [PATCH 06/24] Satisfy pre-commit --- R/delete_current_year_model_runs.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R index e2168720..c12517be 100644 --- a/R/delete_current_year_model_runs.R +++ b/R/delete_current_year_model_runs.R @@ -15,7 +15,11 @@ current_year <- current_date %>% format("%Y") # # * From April to December (post assessment), `year` = next year # * From January to March (during assessment), `year` = current year -year <- if (current_month < "03") current_year else as.character(as.numeric(current_year) + 1) +year <- if (current_month < "03") { + current_year +} else { + as.character(as.numeric(current_year) + 1) +} run_ids <- commandArgs(trailingOnly = TRUE) From 51843f3717e5491f0cf6937053ea049b13e9a4a3 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 16 Nov 2023 16:30:03 -0600 Subject: [PATCH 07/24] Revert to workflow_dispatch event trigger for delete-model-runs.yaml --- .github/workflows/delete-model-runs.yaml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index 40221f85..88f8785a 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -7,7 +7,16 @@ name: delete-model-runs on: - pull_request: + workflow_dispatch: + inputs: + run-ids: + description: > + Comma-delimited list of IDs of model runs to delete. Note that the + workflow assumes these IDs correspond to model runs for the current + upcoming assessment cycle, and if that's not the case the deletion + script will raise an error. + required: true + type: string jobs: delete-model-runs: @@ -45,4 +54,4 @@ jobs: done shell: bash env: - RUN_IDS: 2023-11-14-frosty-jacob + RUN_IDS: ${{ inputs.run-ids }} From 49ea1a653ee85a2b550a243d39147f28c70f98c0 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 16 Nov 2023 16:30:32 -0600 Subject: [PATCH 08/24] Revert "Revert to workflow_dispatch event trigger for delete-model-runs.yaml" This reverts commit 51843f3717e5491f0cf6937053ea049b13e9a4a3. --- .github/workflows/delete-model-runs.yaml | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index 88f8785a..40221f85 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -7,16 +7,7 @@ name: delete-model-runs on: - workflow_dispatch: - inputs: - run-ids: - description: > - Comma-delimited list of IDs of model runs to delete. Note that the - workflow assumes these IDs correspond to model runs for the current - upcoming assessment cycle, and if that's not the case the deletion - script will raise an error. - required: true - type: string + pull_request: jobs: delete-model-runs: @@ -54,4 +45,4 @@ jobs: done shell: bash env: - RUN_IDS: ${{ inputs.run-ids }} + RUN_IDS: 2023-11-14-frosty-jacob From 85bfcf8b1e2eab9d5bce635c332422bff04f3ab7 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 16 Nov 2023 16:33:56 -0600 Subject: [PATCH 09/24] Fix typo in Delete model runs step of delete-model-runs workflow --- .github/workflows/delete-model-runs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index 40221f85..07ff57bf 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -40,7 +40,7 @@ jobs: - name: Delete model runs run: | - for run_id in ${RUN_IDS//,/}; do + for run_id in ${RUN_IDS//,/ }; do Rscript ./R/delete_current_year_model_runs.R "$run_id" done shell: bash From 61b9352a92498d2143ef96e800b1571896730558 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 16 Nov 2023 16:34:25 -0600 Subject: [PATCH 10/24] Test obviously bogus value for run-ids to delete-model-runs workflow --- .github/workflows/delete-model-runs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index 07ff57bf..9489805f 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -45,4 +45,4 @@ jobs: done shell: bash env: - RUN_IDS: 2023-11-14-frosty-jacob + RUN_IDS: 2024-11-14-foo-bar From a3a28faef678440f599e7f29729fbbcc61c77c91 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 16 Nov 2023 16:56:56 -0600 Subject: [PATCH 11/24] Raise an error in delete_current_year_model_runs.R if no objects were deleted --- R/delete_current_year_model_runs.R | 22 ++++++++++++++++++++-- R/helpers.R | 10 ++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R index c12517be..9cd6165b 100644 --- a/R/delete_current_year_model_runs.R +++ b/R/delete_current_year_model_runs.R @@ -2,11 +2,26 @@ # # Assumes that model runs are restricted to the current assessment cycle, where # each assessment cycle starts in April. +# +# Raises an error if no objects matching the given ID were deleted. +library(glue) library(here) library(magrittr) source(here("R", "helpers.R")) +# Slightly altered version of model_delete_run from helpers.R that raises an +# error if no objects were deleted +delete_run <- function(run_id, year) { + deleted_objs <- model_delete_run(run_id, year) + if (length(deleted_objs) == 0) { + error_msg <- "No objects match the run ID '{run_id}' for year {year}" + error_msg %>% + glue::glue() %>% + stop() + } +} + current_date <- as.POSIXct(Sys.Date()) current_month <- current_date %>% format("%m") current_year <- current_date %>% format("%Y") @@ -23,6 +38,9 @@ year <- if (current_month < "03") { run_ids <- commandArgs(trailingOnly = TRUE) -sprintf("Deleting run IDs for year %s: %s", year, run_ids) +log_msg <- "Deleting run IDs for year {year}: {run_ids}" +log_msg %>% + glue::glue() %>% + print() -run_ids %>% sapply(model_delete_run, year = year) +run_ids %>% sapply(delete_run, year = year) diff --git a/R/helpers.R b/R/helpers.R index 0fbddd5f..d8edbd8a 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -59,8 +59,14 @@ model_delete_run <- function(run_id, year) { )) # Delete current version of objects - purrr::walk(s3_objs_limited, aws.s3::delete_object) - purrr::walk(s3_objs_w_run_id, aws.s3::delete_object, bucket = bucket) + del_objs_limited <- purrr::walk(s3_objs_limited, aws.s3::delete_object) + del_objs_w_run_id <- purrr::walk( + s3_objs_w_run_id, + aws.s3::delete_object, + bucket = bucket + ) + + return(c(del_objs_limited, del_objs_w_run_id)) } From 5c9ddf622458a2c42af35bdf685eec2b7db1be98 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 17 Nov 2023 12:00:37 -0600 Subject: [PATCH 12/24] Disable renv sandbox to speed up install/run times --- .github/workflows/delete-model-runs.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index 9489805f..341e8be5 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -29,6 +29,10 @@ jobs: run: sudo apt-get install libgit2-dev shell: bash + - name: Disable renv sandbox to speed up install time + run: echo "RENV_CONFIG_SANDBOX_ENABLED=FALSE" >> .Renviron + shell: bash + - name: Setup renv uses: r-lib/actions/setup-renv@v2 From a090cc129f80bb66a549f664414b6969a14553cc Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 17 Nov 2023 12:00:51 -0600 Subject: [PATCH 13/24] Run delete_current_year_model_runs.R on all run IDs at once --- .github/workflows/delete-model-runs.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index 341e8be5..ce2c727a 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -43,10 +43,7 @@ jobs: aws-region: us-east-1 - name: Delete model runs - run: | - for run_id in ${RUN_IDS//,/ }; do - Rscript ./R/delete_current_year_model_runs.R "$run_id" - done + run: Rscript ./R/delete_current_year_model_runs.R "$RUN_IDS" shell: bash env: RUN_IDS: 2024-11-14-foo-bar From c624b0bd02c261fb1b65b7ffdc7635ecc13e5a79 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 17 Nov 2023 12:08:00 -0600 Subject: [PATCH 14/24] Check for validity of run IDs before issuing delete operations in delete_current_year_model_runs.R --- R/delete_current_year_model_runs.R | 50 +++++++++++++++++++++++------- R/helpers.R | 35 ++++++++++----------- 2 files changed, 54 insertions(+), 31 deletions(-) diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R index 9cd6165b..e6d9d228 100644 --- a/R/delete_current_year_model_runs.R +++ b/R/delete_current_year_model_runs.R @@ -1,22 +1,40 @@ # Script to delete a list of model runs by ID from AWS. # +# Accepts an arbitrary number of arguments, each of which should be the run ID +# of a model run whose artifacts should be deleted. +# # Assumes that model runs are restricted to the current assessment cycle, where -# each assessment cycle starts in April. +# each assessment cycle starts in April. Raises an error if no objects matching +# a given ID for the current year could be located in S3. This error will get +# raised before any deletion occurs, so if one or more IDs are invalid then +# no objects will be deleted. +# +# Example usage: # -# Raises an error if no objects matching the given ID were deleted. +# delete_current_year_model_runs.R 123 456 789 library(glue) library(here) library(magrittr) source(here("R", "helpers.R")) -# Slightly altered version of model_delete_run from helpers.R that raises an -# error if no objects were deleted -delete_run <- function(run_id, year) { - deleted_objs <- model_delete_run(run_id, year) - if (length(deleted_objs) == 0) { - error_msg <- "No objects match the run ID '{run_id}' for year {year}" - error_msg %>% +# Function to check whether S3 artifacts exist for a given model run. +# Defining this as a separate check from the deletion operation is helpful for +# two reasons: +# +# 1. The aws.s3::delete_object API does not raise an error if an object does +# not exist, so a delete operation alone won't alert us for an incorrect +# ID +# 2. Even if aws.s3::delete_object could raise an error for missing objects, +# we want to alert the caller that one or more of the IDs were incorrect +# before deleting any objects so that this script is nondestructive +# in the case of a malformed ID +raise_if_run_id_is_invalid <- function(run_id, year) { + artifacts_exist <- model_get_s3_artifacts_for_run(run_id, year) %>% + sapply(aws.s3::object_exists) + + if (!any(artifacts_exist)) { + "Model run {run_id} for year {year} is missing all S3 artifacts" %>% glue::glue() %>% stop() } @@ -38,9 +56,17 @@ year <- if (current_month < "03") { run_ids <- commandArgs(trailingOnly = TRUE) -log_msg <- "Deleting run IDs for year {year}: {run_ids}" -log_msg %>% +"Confirming artifacts exist for run IDs in year {year}: {run_ids}" %>% + glue::glue() %>% + print() + +# For a future improvement, it would probably be more user friendly to catch +# the missing artifact errors raised by raise_if_run_id_is_invalid and compile +# a list of all invalid run IDs before raising +run_ids %>% sapply(raise_if_run_id_is_invalid, year = year) + +"Deleting S3 artifacts run IDs in year {year}: {run_ids}" %>% glue::glue() %>% print() -run_ids %>% sapply(delete_run, year = year) +run_ids %>% sapply(model_delete_run, year = year) diff --git a/R/helpers.R b/R/helpers.R index d8edbd8a..6bfd069b 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -28,18 +28,16 @@ model_file_dict <- function(run_id = NULL, year = NULL) { return(dict) } - -# Used to delete erroneous, incomplete, or otherwise unwanted runs -# Use with caution! Deleted models are retained for a period of time before -# being permanently deleted -model_delete_run <- function(run_id, year) { +# Get a vector of S3 paths to the artifacts for a given model run +model_get_s3_artifacts_for_run <- function(run_id, year) { # Get paths of all run objects based on the file dictionary paths <- model_file_dict(run_id, year) s3_objs <- grep("s3://", unlist(paths), value = TRUE) bucket <- strsplit(s3_objs[1], "/")[[1]][3] # First get anything partitioned only by year - s3_objs_limited <- grep(".parquet$|.zip$|.rds$", s3_objs, value = TRUE) + s3_objs_limited <- grep(".parquet$|.zip$|.rds$", s3_objs, value = TRUE) %>% + unname() # Next get the prefix of anything partitioned by year and run_id s3_objs_dir_path <- file.path( @@ -53,22 +51,21 @@ model_delete_run <- function(run_id, year) { ) s3_objs_dir_path <- gsub(paste0("s3://", bucket, "/"), "", s3_objs_dir_path) s3_objs_dir_path <- gsub("//", "/", s3_objs_dir_path) - s3_objs_w_run_id <- unlist(purrr::map( - s3_objs_dir_path, - ~ aws.s3::get_bucket_df(bucket, .x)$Key - )) - - # Delete current version of objects - del_objs_limited <- purrr::walk(s3_objs_limited, aws.s3::delete_object) - del_objs_w_run_id <- purrr::walk( - s3_objs_w_run_id, - aws.s3::delete_object, - bucket = bucket - ) + s3_objs_w_run_id <- s3_objs_dir_path %>% + purrr::map(~ aws.s3::get_bucket_df(bucket, .x)$Key) %>% + unlist() %>% + purrr::map_chr(~ glue::glue("s3://{bucket}/{.x}")) - return(c(del_objs_limited, del_objs_w_run_id)) + return(c(s3_objs_limited, s3_objs_w_run_id)) } +# Used to delete erroneous, incomplete, or otherwise unwanted runs +# Use with caution! Deleted models are retained for a period of time before +# being permanently deleted +model_delete_run <- function(run_id, year) { + model_get_s3_artifacts_for_run(run_id, year) %>% + purrr::walk(aws.s3::delete_object) +} # Used to fetch a run's output from S3 and populate it locally. Useful for # running reports and performing local troubleshooting From 17fcf90cd1e3bc0302816efa95ed71416d85cb40 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 17 Nov 2023 13:27:02 -0600 Subject: [PATCH 15/24] Refactor run ID validity check in delete_current_year_model_runs to be nondestructive --- R/delete_current_year_model_runs.R | 55 ++++++++++++++++-------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R index e6d9d228..6a17451f 100644 --- a/R/delete_current_year_model_runs.R +++ b/R/delete_current_year_model_runs.R @@ -18,28 +18,6 @@ library(here) library(magrittr) source(here("R", "helpers.R")) -# Function to check whether S3 artifacts exist for a given model run. -# Defining this as a separate check from the deletion operation is helpful for -# two reasons: -# -# 1. The aws.s3::delete_object API does not raise an error if an object does -# not exist, so a delete operation alone won't alert us for an incorrect -# ID -# 2. Even if aws.s3::delete_object could raise an error for missing objects, -# we want to alert the caller that one or more of the IDs were incorrect -# before deleting any objects so that this script is nondestructive -# in the case of a malformed ID -raise_if_run_id_is_invalid <- function(run_id, year) { - artifacts_exist <- model_get_s3_artifacts_for_run(run_id, year) %>% - sapply(aws.s3::object_exists) - - if (!any(artifacts_exist)) { - "Model run {run_id} for year {year} is missing all S3 artifacts" %>% - glue::glue() %>% - stop() - } -} - current_date <- as.POSIXct(Sys.Date()) current_month <- current_date %>% format("%m") current_year <- current_date %>% format("%Y") @@ -60,10 +38,35 @@ run_ids <- commandArgs(trailingOnly = TRUE) glue::glue() %>% print() -# For a future improvement, it would probably be more user friendly to catch -# the missing artifact errors raised by raise_if_run_id_is_invalid and compile -# a list of all invalid run IDs before raising -run_ids %>% sapply(raise_if_run_id_is_invalid, year = year) +# We consider a run ID to be valid if it has any matching data in S3 for +# the current year +run_id_is_valid <- function(run_id, year) { + return( + model_get_s3_artifacts_for_run(run_id, year) %>% + sapply(aws.s3::object_exists) %>% + any() + ) +} + +# We check for validity separate from the deletion operation for two reasons: +# +# 1. The aws.s3::delete_object API does not raise an error if an object does +# not exist, so a delete operation alone won't alert us for an incorrect +# ID +# 2. Even if aws.s3::delete_object could raise an error for missing objects, +# we want to alert the caller that one or more of the IDs were incorrect +# before deleting any objects so that this script is nondestructive +# in the case of a malformed ID +valid_run_ids <- run_ids %>% sapply(run_id_is_valid, year = year) + +if (!all(valid_run_ids)) { + invalid_run_ids <- run_ids[which(valid_run_ids == FALSE)] %>% + paste(collapse = ", ") + + "Some run IDs are missing all S3 artifacts for {year}: {invalid_run_ids}" %>% + glue::glue() %>% + stop() +} "Deleting S3 artifacts run IDs in year {year}: {run_ids}" %>% glue::glue() %>% From b6e695d49659e663b4e5a6093fcf081bca6d3870 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 17 Nov 2023 13:34:43 -0600 Subject: [PATCH 16/24] Try deleting multiple invalid run IDs --- .github/workflows/delete-model-runs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index ce2c727a..e23976aa 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -46,4 +46,4 @@ jobs: run: Rscript ./R/delete_current_year_model_runs.R "$RUN_IDS" shell: bash env: - RUN_IDS: 2024-11-14-foo-bar + RUN_IDS: 2024-11-14-foo-bar 2024-11-15-baz From 0e56190b3dad4776c2ab2676db8a6c4702656a85 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 17 Nov 2023 13:46:33 -0600 Subject: [PATCH 17/24] Refactor delete_current_year_model_runs.R to accept a comma-delimited string of run IDs --- .github/workflows/delete-model-runs.yaml | 2 +- R/delete_current_year_model_runs.R | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index e23976aa..c9dd7857 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -46,4 +46,4 @@ jobs: run: Rscript ./R/delete_current_year_model_runs.R "$RUN_IDS" shell: bash env: - RUN_IDS: 2024-11-14-foo-bar 2024-11-15-baz + RUN_IDS: 2024-11-14-foo-bar,2024-11-15-baz diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R index 6a17451f..3ceb9cd2 100644 --- a/R/delete_current_year_model_runs.R +++ b/R/delete_current_year_model_runs.R @@ -1,7 +1,7 @@ # Script to delete a list of model runs by ID from AWS. # -# Accepts an arbitrary number of arguments, each of which should be the run ID -# of a model run whose artifacts should be deleted. +# Accepts one argument, a comma-delimited list of run IDs for model runs +# whose artifacts should be deleted. # # Assumes that model runs are restricted to the current assessment cycle, where # each assessment cycle starts in April. Raises an error if no objects matching @@ -9,9 +9,9 @@ # raised before any deletion occurs, so if one or more IDs are invalid then # no objects will be deleted. # -# Example usage: +# Example usage (delete the runs 123, 456, and 789 in the current year): # -# delete_current_year_model_runs.R 123 456 789 +# delete_current_year_model_runs.R 123,456,789 library(glue) library(here) @@ -32,7 +32,14 @@ year <- if (current_month < "03") { as.character(as.numeric(current_year) + 1) } -run_ids <- commandArgs(trailingOnly = TRUE) +# Convert the comma-delimited input to a vector of run IDs. Accepting one or +# more positional arguments would be a cleaner UX, but since this script is +# intended to be called from a dispatched GitHub workflow, it's easier to parse +# one comma-delimited string than split a space-separated string passed as a +# workflow input +run_ids <- commandArgs(trailingOnly = TRUE) %>% + strsplit(split = ",", fixed = TRUE) %>% + unlist() "Confirming artifacts exist for run IDs in year {year}: {run_ids}" %>% glue::glue() %>% From cf4ff3c09a3182cbde1765326f0985d3e2afa52a Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 17 Nov 2023 13:50:14 -0600 Subject: [PATCH 18/24] Test a delete-model-runs workflow where one run is valid and one isn't --- .github/workflows/delete-model-runs.yaml | 2 +- R/delete_current_year_model_runs.R | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index c9dd7857..e939c021 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -46,4 +46,4 @@ jobs: run: Rscript ./R/delete_current_year_model_runs.R "$RUN_IDS" shell: bash env: - RUN_IDS: 2024-11-14-foo-bar,2024-11-15-baz + RUN_IDS: 2023-11-14-frosty-jacob,2024-11-15-baz diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R index 3ceb9cd2..0ba317da 100644 --- a/R/delete_current_year_model_runs.R +++ b/R/delete_current_year_model_runs.R @@ -37,11 +37,12 @@ year <- if (current_month < "03") { # intended to be called from a dispatched GitHub workflow, it's easier to parse # one comma-delimited string than split a space-separated string passed as a # workflow input -run_ids <- commandArgs(trailingOnly = TRUE) %>% +raw_run_ids <- commandArgs(trailingOnly = TRUE) +run_ids <- raw_run_ids %>% strsplit(split = ",", fixed = TRUE) %>% unlist() -"Confirming artifacts exist for run IDs in year {year}: {run_ids}" %>% +"Confirming artifacts exist for run IDs in year {year}: {raw_run_ids}" %>% glue::glue() %>% print() @@ -65,6 +66,9 @@ run_id_is_valid <- function(run_id, year) { # before deleting any objects so that this script is nondestructive # in the case of a malformed ID valid_run_ids <- run_ids %>% sapply(run_id_is_valid, year = year) +"Valid run IDs: {paste(valid_run_ids, collapse = ', ')}" %>% + glue::glue() %>% + print() if (!all(valid_run_ids)) { invalid_run_ids <- run_ids[which(valid_run_ids == FALSE)] %>% From e23d34c92e8d188b7de2b77d7b9333d4fe8b71d4 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 17 Nov 2023 14:20:01 -0600 Subject: [PATCH 19/24] Test a delete-model-runs workflow where all run IDs are valid --- .github/workflows/delete-model-runs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index e939c021..b44ab530 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -46,4 +46,4 @@ jobs: run: Rscript ./R/delete_current_year_model_runs.R "$RUN_IDS" shell: bash env: - RUN_IDS: 2023-11-14-frosty-jacob,2024-11-15-baz + RUN_IDS: 2023-11-14-frosty-jacob,2023-11-13-stupefied-liz From 6cdb449904480f76a3322559c92555315f13a479 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 17 Nov 2023 14:25:29 -0600 Subject: [PATCH 20/24] Revert to workflow_dispatch trigger for delete-model-runs.yaml --- .github/workflows/delete-model-runs.yaml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index b44ab530..a63c2ed6 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -7,7 +7,16 @@ name: delete-model-runs on: - pull_request: + workflow_dispatch: + inputs: + run-ids: + description: > + Comma-delimited list of IDs of model runs to delete (no spaces). Note + that the workflow assumes these IDs correspond to model runs for the + current upcoming assessment cycle, and if that's not the case the + deletion script will raise an error. + required: true + type: string jobs: delete-model-runs: @@ -46,4 +55,4 @@ jobs: run: Rscript ./R/delete_current_year_model_runs.R "$RUN_IDS" shell: bash env: - RUN_IDS: 2023-11-14-frosty-jacob,2023-11-13-stupefied-liz + RUN_IDS: ${{ inputs.run-ids }} From b02de7dbe7d41693a034a304511e0410d679826d Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 17 Nov 2023 14:28:43 -0600 Subject: [PATCH 21/24] Remove extraneous print statement from delete_current_year_model_runs.R --- R/delete_current_year_model_runs.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R index 0ba317da..80c74680 100644 --- a/R/delete_current_year_model_runs.R +++ b/R/delete_current_year_model_runs.R @@ -66,9 +66,6 @@ run_id_is_valid <- function(run_id, year) { # before deleting any objects so that this script is nondestructive # in the case of a malformed ID valid_run_ids <- run_ids %>% sapply(run_id_is_valid, year = year) -"Valid run IDs: {paste(valid_run_ids, collapse = ', ')}" %>% - glue::glue() %>% - print() if (!all(valid_run_ids)) { invalid_run_ids <- run_ids[which(valid_run_ids == FALSE)] %>% From a4895c20d2b44cad197df855ac7cdbeebbbe9616 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Mon, 20 Nov 2023 13:51:30 -0600 Subject: [PATCH 22/24] Clean up delete-model-runs and associated script in response to review --- .github/workflows/delete-model-runs.yaml | 9 +++++---- R/delete_current_year_model_runs.R | 12 ++++++------ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index a63c2ed6..687b25a6 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -11,12 +11,13 @@ on: inputs: run-ids: description: > - Comma-delimited list of IDs of model runs to delete (no spaces). Note + Run IDs: Space-delimited list of IDs of model runs to delete. Note that the workflow assumes these IDs correspond to model runs for the - current upcoming assessment cycle, and if that's not the case the - deletion script will raise an error. + current assessment cycle, and if that's not the case the deletion + script will raise an error. required: true type: string + default: 2024-01-01-foo-bar 2024-01-02-bar-baz jobs: delete-model-runs: @@ -52,7 +53,7 @@ jobs: aws-region: us-east-1 - name: Delete model runs - run: Rscript ./R/delete_current_year_model_runs.R "$RUN_IDS" + run: Rscript ./R/delete_current_year_model_runs.R "${RUN_IDS// /,}" shell: bash env: RUN_IDS: ${{ inputs.run-ids }} diff --git a/R/delete_current_year_model_runs.R b/R/delete_current_year_model_runs.R index 80c74680..a1a95019 100644 --- a/R/delete_current_year_model_runs.R +++ b/R/delete_current_year_model_runs.R @@ -4,7 +4,7 @@ # whose artifacts should be deleted. # # Assumes that model runs are restricted to the current assessment cycle, where -# each assessment cycle starts in April. Raises an error if no objects matching +# each assessment cycle starts in May. Raises an error if no objects matching # a given ID for the current year could be located in S3. This error will get # raised before any deletion occurs, so if one or more IDs are invalid then # no objects will be deleted. @@ -24,9 +24,9 @@ current_year <- current_date %>% format("%Y") # The following heuristic determines the current upcoming assessment cycle year: # -# * From April to December (post assessment), `year` = next year -# * From January to March (during assessment), `year` = current year -year <- if (current_month < "03") { +# * From May to December (post assessment), `year` = next year +# * From January to April (during assessment), `year` = current year +year <- if (current_month < "05") { current_year } else { as.character(as.numeric(current_year) + 1) @@ -35,8 +35,8 @@ year <- if (current_month < "03") { # Convert the comma-delimited input to a vector of run IDs. Accepting one or # more positional arguments would be a cleaner UX, but since this script is # intended to be called from a dispatched GitHub workflow, it's easier to parse -# one comma-delimited string than split a space-separated string passed as a -# workflow input +# one comma-delimited string than convert a space-separated string passed as a +# workflow input to an array of function arguments raw_run_ids <- commandArgs(trailingOnly = TRUE) run_ids <- raw_run_ids %>% strsplit(split = ",", fixed = TRUE) %>% From 09cfa5b4809cd2211c7c6809dc6af2f9363f94de Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Mon, 20 Nov 2023 13:52:37 -0600 Subject: [PATCH 23/24] Temporarily run delete-model-runs on pull_request event for testing --- .github/workflows/delete-model-runs.yaml | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index 687b25a6..67947a94 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -7,17 +7,7 @@ name: delete-model-runs on: - workflow_dispatch: - inputs: - run-ids: - description: > - Run IDs: Space-delimited list of IDs of model runs to delete. Note - that the workflow assumes these IDs correspond to model runs for the - current assessment cycle, and if that's not the case the deletion - script will raise an error. - required: true - type: string - default: 2024-01-01-foo-bar 2024-01-02-bar-baz + pull_request: jobs: delete-model-runs: @@ -56,4 +46,4 @@ jobs: run: Rscript ./R/delete_current_year_model_runs.R "${RUN_IDS// /,}" shell: bash env: - RUN_IDS: ${{ inputs.run-ids }} + RUN_IDS: 2024-01-01-foo-bar 2024-01-02-bar-baz From 759550db476b68ef31427c7cd4e2e11cbf364cb8 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Mon, 20 Nov 2023 13:55:53 -0600 Subject: [PATCH 24/24] Revert "Temporarily run delete-model-runs on pull_request event for testing" This reverts commit 09cfa5b4809cd2211c7c6809dc6af2f9363f94de. --- .github/workflows/delete-model-runs.yaml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/delete-model-runs.yaml b/.github/workflows/delete-model-runs.yaml index 67947a94..687b25a6 100644 --- a/.github/workflows/delete-model-runs.yaml +++ b/.github/workflows/delete-model-runs.yaml @@ -7,7 +7,17 @@ name: delete-model-runs on: - pull_request: + workflow_dispatch: + inputs: + run-ids: + description: > + Run IDs: Space-delimited list of IDs of model runs to delete. Note + that the workflow assumes these IDs correspond to model runs for the + current assessment cycle, and if that's not the case the deletion + script will raise an error. + required: true + type: string + default: 2024-01-01-foo-bar 2024-01-02-bar-baz jobs: delete-model-runs: @@ -46,4 +56,4 @@ jobs: run: Rscript ./R/delete_current_year_model_runs.R "${RUN_IDS// /,}" shell: bash env: - RUN_IDS: 2024-01-01-foo-bar 2024-01-02-bar-baz + RUN_IDS: ${{ inputs.run-ids }}