snapshotEngine: retire snapshot warmer (#602)
* sleep first

* move snapshot creation to maker script

* add nodes config to cm

* rm warmer stuff

* set snapshot info

* update node class

* retrofit maker to take snapshot

* make sure to delete all snapshots before and after

* only delete by history mode

* sweet spot for vs cleanup
orcutt989 authored Oct 3, 2023
1 parent fea7bb4 commit 40bdb62
Showing 4 changed files with 89 additions and 215 deletions.
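In short, the snapshot warmer is retired and snapshot-maker.sh now owns the whole VolumeSnapshot lifecycle itself: it clears out leftover snapshots, takes a fresh one, waits for it to become ready, hands off to the zip-and-upload job, and cleans up afterwards. A minimal sketch of the new flow (illustrative only; the variable names and the readiness check mirror the script in the diff below):

    # delete leftovers so nothing sits around accruing charges
    kubectl delete vs -l history_mode="${HISTORY_MODE}"
    # take a fresh snapshot from the templated manifest
    kubectl apply -f createVolumeSnapshot.yaml
    # poll until the snapshot reports readyToUse=true
    until [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==true)].metadata.name}' -l history_mode="${HISTORY_MODE}")" ]; do
      sleep 1m
    done
    # ...zip-and-upload job runs here...
    # delete snapshots again once the artifact is uploaded
    kubectl delete vs -l history_mode="${HISTORY_MODE}"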
142 changes: 0 additions & 142 deletions charts/snapshotEngine/scripts/snapshot-warmer.sh

This file was deleted.

3 changes: 2 additions & 1 deletion charts/snapshotEngine/templates/configmap.yaml
@@ -15,7 +15,8 @@ data:
SCHEMA_URL: {{ $.Values.schemaUrl }}
S3_BUCKET: {{ $.Values.s3BucketOverride }}
CLOUD_PROVIDER: {{ $.Values.cloudProvider }}
-  STORAGE_CLASS: {{$.Values.volumeSnapClass }}
+  STORAGE_CLASS: {{ $.Values.volumeSnapClass }}
+  NODES: {{ $.Values.nodes }}
kind: ConfigMap
metadata:
name: snapshot-configmap
47 changes: 0 additions & 47 deletions charts/snapshotEngine/templates/snapshot-warmer.yaml

This file was deleted.

112 changes: 87 additions & 25 deletions snapshotEngine/snapshot-maker.sh
@@ -1,10 +1,58 @@
#!/bin/bash

# Delete all volumesnapshots so they aren't sitting around accruing charges
kubectl delete vs -l history_mode="${HISTORY_MODE}"

PERSISTENT_VOLUME_CLAIM="var-volume-snapshot-${HISTORY_MODE}-node-0"

# For yq to work, the values resulting from the above cmds need to be exported.
# We don't export them inline because of
# https://github.com/koalaman/shellcheck/wiki/SC2155
export HISTORY_MODE
export PERSISTENT_VOLUME_CLAIM
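# (For context: SC2155 warns that `export FOO="$(cmd)"` masks cmd's exit
# status, so shellcheck recommends assigning and exporting in separate
# statements.)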

yq e -i '.metadata.namespace=strenv(NAMESPACE)' createVolumeSnapshot.yaml
yq e -i '.metadata.labels.history_mode=strenv(HISTORY_MODE)' createVolumeSnapshot.yaml
yq e -i '.spec.source.persistentVolumeClaimName=strenv(PERSISTENT_VOLUME_CLAIM)' createVolumeSnapshot.yaml
yq e -i '.spec.volumeSnapshotClassName=strenv(STORAGE_CLASS)' createVolumeSnapshot.yaml
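# For reference, after the yq edits above createVolumeSnapshot.yaml should look
# roughly like this (a sketch; the exact apiVersion depends on the CSI snapshot
# CRDs installed in the cluster):
#
#   apiVersion: snapshot.storage.k8s.io/v1
#   kind: VolumeSnapshot
#   metadata:
#     name: <set below from SNAPSHOT_NAME>
#     namespace: $NAMESPACE
#     labels:
#       history_mode: $HISTORY_MODE
#   spec:
#     volumeSnapshotClassName: $STORAGE_CLASS
#     source:
#       persistentVolumeClaimName: $PERSISTENT_VOLUME_CLAIM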

# Returns list of snapshots with a given status
# readyToUse true/false
getSnapshotNames() {
local readyToUse="${1##readyToUse=}"
shift
if [ -z "$readyToUse" ]; then
echo "Error: No jsonpath for volumesnapshots' ready status was provided."
exit 1
fi
kubectl get volumesnapshots -o jsonpath="{.items[?(.status.readyToUse==$readyToUse)].metadata.name}" --namespace "$NAMESPACE" "$@"
}
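
# Example usage (illustrative): list snapshots for this history mode that are
# still being created; extra arguments are passed through to kubectl:
#   getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}"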

# SLEEP_TIME=0m

# if [ "${HISTORY_MODE}" = "archive" ]; then
# SLEEP_TIME="${ARCHIVE_SLEEP_DELAY}"
# if [ "${ARCHIVE_SLEEP_DELAY}" != "0m" ]; then
# printf "%s artifactDelay.archive is set to %s sleeping...\n" "$(date "+%Y-%m-%d %H:%M:%S")" "${ARCHIVE_SLEEP_DELAY}"
# fi
# elif [ "${HISTORY_MODE}" = "rolling" ]; then
# SLEEP_TIME="${ROLLING_SLEEP_DELAY}"
# if [ "${ROLLING_SLEEP_DELAY}" != "0m" ]; then
# printf "%s artifactDelay.rolling is set to %s sleeping...\n" "$(date "+%Y-%m-%d %H:%M:%S")" "${ROLLING_SLEEP_DELAY}"
# fi
# fi

# if [ "${SLEEP_TIME}" = "0m" ]; then
# printf "%s artifactDelay.HISTORY_MODE was not set! No delay...\n" "$(date "+%Y-%m-%d %H:%M:%S")"
# fi

# sleep "${SLEEP_TIME}"

cd /

ZIP_AND_UPLOAD_JOB_NAME=zip-and-upload-"${HISTORY_MODE}"

-# Delete zip-and-upload job
+# Delete zip-and-upload job if still around
if kubectl get job "${ZIP_AND_UPLOAD_JOB_NAME}"; then
printf "%s Old zip-and-upload job exits. Attempting to delete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
if ! kubectl delete jobs "${ZIP_AND_UPLOAD_JOB_NAME}"; then
@@ -16,7 +64,7 @@ else
printf "%s No old zip-and-upload job detected for cleanup.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
fi

-# Delete old PVCs
+# Delete old PVCs if still around
if [ "${HISTORY_MODE}" = rolling ]; then
if [ "$(kubectl get pvc rolling-tarball-restore)" ]; then
printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
@@ -40,12 +88,34 @@ if [ "$(kubectl get pvc "${HISTORY_MODE}"-snap-volume)" ]; then
sleep 5
fi

# while [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==false)].metadata.name}' --namespace "${NAMESPACE}" -l history_mode="${HISTORY_MODE}")" ]; do
# printf "%s Snapshot already in progress...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
# sleep 10
# done
# Take volume snapshot
current_date=$(date "+%Y-%m-%d-%H-%M-%S" "$@")
export SNAPSHOT_NAME="$current_date-$HISTORY_MODE-node-snapshot"
# Update volume snapshot name
yq e -i '.metadata.name=strenv(SNAPSHOT_NAME)' createVolumeSnapshot.yaml

printf "%s EBS Snapshot finished!\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
printf "%s Creating snapshot ${SNAPSHOT_NAME} in ${NAMESPACE}.\n" "$(timestamp)"

# Create snapshot
if ! kubectl apply -f createVolumeSnapshot.yaml; then
printf "%s ERROR creating volumeSnapshot ${SNAPSHOT_NAME} in ${NAMESPACE} .\n" "$(timestamp)"
exit 1
fi

sleep 5

# Wait for snapshot to finish
until [ "$(getSnapshotNames readyToUse=true -l history_mode="${HISTORY_MODE}")" ]; do
printf "%s Snapshot in progress. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
until [ "$(getSnapshotNames readyToUse=true -l history_mode="${HISTORY_MODE}")" ]; do
sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for node
if [ "$(getSnapshotNames readyToUse=true -l history_mode="${HISTORY_MODE}")" ]; then
break
fi
done
done

printf "%s Snapshot finished!\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"

SNAPSHOTS=$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==true)].metadata.name}' -l history_mode="${HISTORY_MODE}")
NEWEST_SNAPSHOT=${SNAPSHOTS##* }
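# "${SNAPSHOTS##* }" strips everything up to the last space, leaving the final
# name in the space-separated list, e.g. SNAPSHOTS="a b c" yields "c". Since
# snapshot names begin with a %Y-%m-%d-%H-%M-%S timestamp and kubectl lists
# resources sorted by name, the last entry is the newest snapshot.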
@@ -126,6 +196,11 @@ then
exit 1
fi

sleep 5

# Delete all volumesnapshots so they aren't sitting around accruing charges
kubectl delete vs -l history_mode="${HISTORY_MODE}"

# TODO Check for PVC
printf "%s PersistentVolumeClaim ${HISTORY_MODE}-snap-volume created successfully in namespace ${NAMESPACE}.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"

@@ -183,23 +258,6 @@ if [ "${HISTORY_MODE}" = archive ]; then
yq eval -i "del(.spec.template.spec.containers[0].volumeMounts[2])" mainJob.yaml
fi

# # Switch alternate cloud provider secret name based on actual cloud provider
# if [[ -n "${CLOUD_PROVIDER}" ]]; then
# # Need to account for dynamic volumes removed above. For example if not rolling node then rolling volume is deleted.
# SECRET_NAME="${NAMESPACE}-secret"
# # Index of zip-and-upload container changes depending on if rolling job or archive job
# NUM_CONTAINERS=$(yq e '.spec.template.spec.containers | length' mainJob.yaml)
# # Index of mounts also changes depending on history mode
# NUM_CONTAINER_MOUNTS=$(yq e ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts | length" mainJob.yaml )
# # Secret volume mount is last item in list of volumeMounts for the zip and upload container
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts[$(( NUM_CONTAINER_MOUNTS - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
# # Index of job volumes change depending on history mode
# NUM_JOB_VOLUMES=$(yq e '.spec.template.spec.volumes | length' mainJob.yaml )
# # Setting job secret volume to value set by workflow
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].secret.secretName=strenv(SECRET_NAME)" mainJob.yaml
# fi

# Service account to be used by entire zip-and-upload job.
SERVICE_ACCOUNT="${SERVICE_ACCOUNT}" yq e -i '.spec.template.spec.serviceAccountName=strenv(SERVICE_ACCOUNT)' mainJob.yaml

@@ -218,7 +276,7 @@ sleep 20
while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do
printf "%s Waiting for zip-and-upload job to complete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do
-sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for job
+sleep 2m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for job
if [ "$(kubectl get pod -l job-name=zip-and-upload-"${HISTORY_MODE}" --namespace="${NAMESPACE}"| grep -i -e error -e evicted -e pending)" ] || \
[ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace="${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].type}')" ] ; then
printf "%s Zip-and-upload job failed. This job will end and a new snapshot will be taken.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
@@ -269,6 +327,9 @@ if ! [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMES
sleep 5
fi

# Delete all volumesnapshots so they aren't sitting around accruing charges
kubectl delete vs -l history_mode="${HISTORY_MODE}"

SLEEP_TIME=0m

if [ "${HISTORY_MODE}" = "archive" ]; then
@@ -296,3 +357,4 @@ sleep 5
kubectl delete -f volumeFromSnap.yaml | while IFS= read -r line; do printf '%s %s\n' "$(date "+%Y-%m-%d %H:%M:%S" "$@")" "$line"; done
sleep 5
kubectl delete job snapshot-maker --namespace "${NAMESPACE}"
