
Commit

Merge branch 'master' into pow-faucet
harryttd committed Aug 18, 2023
2 parents f771cc4 + 7e31daf commit 634fae4
Showing 10 changed files with 141 additions and 96 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,5 +15,6 @@ build

# Ignore mkchain generated files
*_values.yaml
+*-values.yaml

charts/tezos/charts
71 changes: 43 additions & 28 deletions charts/snapshotEngine/scripts/snapshot-warmer.sh
@@ -27,6 +27,7 @@ delete_old_volumesnapshots() {
local max_snapshots="${2##max_snapshots=}"

while [ "$(getNumberOfSnapshots readyToUse=true --selector="$selector")" -gt "$max_snapshots" ]; do
+sleep 5
NUMBER_OF_SNAPSHOTS=$(getNumberOfSnapshots readyToUse=true --selector="$selector")
printf "%s Number of snapshots with selector '$selector' is too high at $NUMBER_OF_SNAPSHOTS. Deleting 1.\n" "$(timestamp)"
SNAPSHOTS=$(getSnapshotNames readyToUse=true --selector="$selector")
@@ -37,31 +38,31 @@ delete_old_volumesnapshots() {
done
}
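
The loop above keeps deleting until the snapshot count for a selector drops to max_snapshots; the added sleep 5 gives the API a beat between checks. A minimal sketch of one pruning step, assuming standard kubectl behavior (the selector value and variable names here are illustrative, not the chart's code):

# Pick and delete the oldest VolumeSnapshot matching a selector;
# kubectl sorts the returned list with --sort-by before jsonpath indexing.
oldest=$(kubectl get volumesnapshots \
  --selector="history_mode=rolling" \
  --sort-by=.metadata.creationTimestamp \
  -o jsonpath='{.items[0].metadata.name}')
kubectl delete volumesnapshot "$oldest"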

-delete_stuck_volumesnapshots() {
-snapshot_list=$(kubectl get volumesnapshots -o jsonpath="{.items[*].metadata.name}")
-arr=(`echo ${snapshot_list}`);
-for snapshot_name in "${arr[@]}"; do
-snapshot_creation_time_iso8601=$(kubectl get volumesnapshots $snapshot_name -o jsonpath='{.metadata.creationTimestamp}')
-snapshot_creation_time_without_offset=${snapshot_creation_time_iso8601::-1}
-snapshot_creation_time_unix=$(date -ud "$(echo $snapshot_creation_time_without_offset | sed 's/T/ /')" +%s)
-current_date_unix=$(date -u +%s)
-snapshot_age_minutes=$(( (current_date_unix - snapshot_creation_time_unix) / 60 ))
-# Snapshots should never be older than 6 minutes
-# If they are then there's a problem on AWS' end and the snapshot needs to be deleted.
-if [ $snapshot_age_minutes -ge 6 ]; then
-printf "%s Snasphot %s is %s minutes old. It must be stuck. Attempting to delete...\n" "$(timestamp)" "$snapshot_name" "$snapshot_age_minutes"
-err=$(kubectl delete volumesnapshots $snapshot_name 2>&1 > /dev/null)
-if [ $? -ne 0 ]; then
-printf "%s ERROR##### Unable to delete stuck snapshot %s .\n" "$(timestamp)" "$snapshot_name"
-printf "%s Error was: \"%s\"\n" "$(timestamp)" "$err"
-sleep 10
-exit 1
-else
-printf "%s Successfully deleted stuck snapshot %s! \n" "$(timestamp)" "$snapshot_name"
-fi
-fi
-done
-}
+# delete_stuck_volumesnapshots() {
+#   snapshot_list=$(kubectl get volumesnapshots -o jsonpath="{.items[*].metadata.name}")
+#   arr=(`echo ${snapshot_list}`);
+#   for snapshot_name in "${arr[@]}"; do
+#     snapshot_creation_time_iso8601=$(kubectl get volumesnapshots $snapshot_name -o jsonpath='{.metadata.creationTimestamp}')
+#     snapshot_creation_time_without_offset=${snapshot_creation_time_iso8601::-1}
+#     snapshot_creation_time_unix=$(date -ud "$(echo $snapshot_creation_time_without_offset | sed 's/T/ /')" +%s)
+#     current_date_unix=$(date -u +%s)
+#     snapshot_age_minutes=$(( (current_date_unix - snapshot_creation_time_unix) / 60 ))
+#     # Snapshots should never be older than 6 minutes
+#     # If they are then there's a problem on AWS' end and the snapshot needs to be deleted.
+#     if [ $snapshot_age_minutes -ge 6 ]; then
+#       printf "%s Snasphot %s is %s minutes old. It must be stuck. Attempting to delete...\n" "$(timestamp)" "$snapshot_name" "$snapshot_age_minutes"
+#       err=$(kubectl delete volumesnapshots $snapshot_name 2>&1 > /dev/null)
+#       if [ $? -ne 0 ]; then
+#         printf "%s ERROR##### Unable to delete stuck snapshot %s .\n" "$(timestamp)" "$snapshot_name"
+#         printf "%s Error was: \"%s\"\n" "$(timestamp)" "$err"
+#         sleep 10
+#         exit 1
+#       else
+#         printf "%s Successfully deleted stuck snapshot %s! \n" "$(timestamp)" "$snapshot_name"
+#       fi
+#     fi
+#   done
+# }
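
The disabled function judged a snapshot "stuck" once its creationTimestamp was six or more minutes old; the suffix-trimming and sed munging suggest a date implementation that cannot parse the RFC 3339 "Z" suffix directly. A sketch of the same age check, assuming GNU date (which can):

# Compute a VolumeSnapshot's age in minutes. GNU date parses timestamps
# like 2023-08-18T12:34:56Z without preprocessing.
created=$(kubectl get volumesnapshot "$snapshot_name" -o jsonpath='{.metadata.creationTimestamp}')
age_minutes=$(( ($(date -u +%s) - $(date -ud "$created" +%s)) / 60 ))
if [ "$age_minutes" -ge 6 ]; then echo "snapshot looks stuck"; fi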

HISTORY_MODE="$(echo "$NODE_CONFIG" | jq -r ".history_mode")"
TARGET_VOLUME="$(echo "$NODE_CONFIG" | jq ".target_volume")"
@@ -83,12 +84,23 @@ yq e -i '.spec.volumeSnapshotClassName=strenv(VOLUME_SNAPSHOT_CLASS)' createVolu

while true; do

# Pause if nodes are not ready
until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do
printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do
sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for node
if [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; then
break
fi
done
done
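
The nested until loops print the "not ready" line once, then poll quietly; the added sleep 1m turns a busy-wait into a cheap poll. A functionally similar single-print loop, sketched with a hypothetical ready() helper rather than the script's inlined kubectl query:

# Sketch only; the real script repeats the readiness query shown above.
ready() {
  [ "$(kubectl get pods -n "${NAMESPACE}" -l appType=octez-node \
      -l node_class_history_mode="${HISTORY_MODE}" \
      -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}')" = "True" ]
}
if ! ready; then
  printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date -u "+%Y-%m-%d %H:%M:%S")"
  until ready; do sleep 60; done
fi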

# Remove unlabeled snapshots
delete_old_volumesnapshots selector='!history_mode' max_snapshots=0
# Maintain 4 snapshots of a certain history mode
delete_old_volumesnapshots selector="history_mode=$HISTORY_MODE" max_snapshots=4
# Check for and delete old stuck snapshots
-delete_stuck_volumesnapshots
+# delete_stuck_volumesnapshots

if ! [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; then
# EBS Snapshot name based on current time and date
@@ -113,7 +125,7 @@ while true; do
while [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; do
printf "%s Snapshot is still creating...\n" "$(timestamp)"
sleep 10
-delete_stuck_volumesnapshots
+# delete_stuck_volumesnapshots
done
end_time=$(date +%s)
elapsed=$((end_time - start_time))
@@ -122,6 +134,9 @@ while true; do
else
printf "%s Snapshot already in progress...\n" "$(timestamp)"
sleep 10
-delete_stuck_volumesnapshots
+# delete_stuck_volumesnapshots
fi

printf "%s Sleeping for 10m due to Digital Ocean rate limit.\n" "$(timestamp)"
sleep 10m
done
2 changes: 1 addition & 1 deletion charts/snapshotEngine/templates/configmap.yaml
@@ -15,7 +15,7 @@ data:
SCHEMA_URL: {{ $.Values.schemaUrl }}
S3_BUCKET: {{ $.Values.s3BucketOverride }}
CLOUD_PROVIDER: {{ $.Values.cloudProvider }}
-FQDN: {{ $.Values.fqdn }}
+STORAGE_CLASS: {{$.Values.volumeSnapClass }}
kind: ConfigMap
metadata:
name: snapshot-configmap
2 changes: 1 addition & 1 deletion rpc-auth/requirements.txt
@@ -22,7 +22,7 @@ pysodium==0.7.5
pytezos==2.5.11
python-dateutil==2.8.1
pytzdata==2020.1
-PyYAML==5.4
+PyYAML==6.0.1
redis==3.5.3
requests==2.26.0
secp256k1==0.13.2
33 changes: 20 additions & 13 deletions snapshotEngine/mainJob.yaml
@@ -53,17 +53,18 @@ spec:
# These loops wait on the RPC to come online and prevent log from printing same line
# over and over and over again. This prints one line and waits for the RPC to come online for a clean log.
-until wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
+until wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
printf "%s Waiting for node RPC to come online.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
-until wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
-if wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
+until wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
+sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for rpc
+if wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
break
fi
done
done
# If somehow we skip the above waiting loop, this kills the job if the RPC is not online.
-if ! wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
+if ! wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
printf "%s RPC is not online! Exiting...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
exit 1
@@ -76,15 +77,15 @@ spec:
# Tezos devs have advised us that it is safer to target HEAD~2 for rolling artifacts.
else
-HEAD_BLOCK=$(wget -qO- http://localhost:8732/chains/main/blocks/head/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/')
+HEAD_BLOCK=$(wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/')
TARGET="${HEAD_BLOCK}~2"
fi
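
Targeting HEAD~2 hands the artifact a block that is already final (under Tenderbake, a block with two blocks on top of it cannot be reorged). The octez RPC accepts the ~N suffix on block IDs, so the hash and level of that block can be fetched in one call; a sketch, with jq assumed available even though the job parses JSON with sed:

# Resolve the hash and level of HEAD~2 through the node RPC.
wget -qO- http://127.0.0.1:8732/chains/main/blocks/head~2/header | jq -r '.hash, .level'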
# Get BLOCK_HASH from RPC
-wget -qO- http://localhost:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH
+wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH
# Get BLOCK_HEIGHT from RPC
-wget -qO- http://localhost:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"level":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT
+wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"level":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT
# We need to check if the block is finalized for archive nodes since we aren't getting
# validation by a Tezos snapshot like our rolling tarball. We are just zipping up the data dir from an archive node.
@@ -117,13 +118,13 @@ spec:
fi
# Get BLOCK_TIMESTAMP from RPC
-wget -qO- http://localhost:8732/chains/main/blocks/head/header | sed -E 's/.*"timestamp":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP
+wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | sed -E 's/.*"timestamp":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP
# Old version string
/usr/local/bin/octez-node --version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_VERSION
# Get new version object from RPC
-wget -qO- http://localhost:8732/version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO
+wget -qO- http://127.0.0.1:8732/version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO
# Print variables for debug
printf "%s BLOCK_HASH is...$(cat /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH))\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
@@ -225,8 +226,10 @@ spec:
name: snapshot-cache-volume
- mountPath: /rolling-tarball-restore
name: rolling-tarball-restore
-- mountPath: /cloud-provider
-name: cloud-provider
+- mountPath: /aws-secrets
+name: aws-secrets
+- mountPath: /do-secrets
+name: do-secrets
env:
- name: HISTORY_MODE
value: ""
@@ -244,8 +247,12 @@ spec:
- name: rolling-tarball-restore
persistentVolumeClaim:
claimName: rolling-tarball-restore
-- name: cloud-provider
+- name: aws-secrets
secret:
-secretName: cloud-provider
+secretName: aws-secrets
+optional: true
+- name: do-secrets
+secret:
+secretName: do-secrets
+optional: true
backoffLimit: 0
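
Marking both secret volumes optional: true lets the job schedule when only one provider's secret exists in the namespace; the other mount is simply empty. How the entrypoint distinguishes the two is not shown in this commit; one plausible runtime check, purely as a sketch:

# Hypothetical provider detection based on which secret mount is populated.
if [ -n "$(ls -A /aws-secrets 2>/dev/null)" ]; then
  echo "using AWS credentials"
elif [ -n "$(ls -A /do-secrets 2>/dev/null)" ]; then
  echo "using DigitalOcean credentials"
fi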
2 changes: 1 addition & 1 deletion snapshotEngine/scratchVolume.yaml
@@ -4,7 +4,7 @@ metadata:
name: snapshot-cache-volume
namespace: ""
spec:
-storageClassName: ebs-sc
+storageClassName: do-block-storage
accessModes:
- ReadWriteOnce
resources:
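
The scratch PVC now requests DigitalOcean's block-storage CSI class instead of the AWS EBS class, matching the provider switch above. Before applying, it is worth confirming the class exists in the target cluster:

# Standard kubectl; fails with NotFound if the class is absent.
kubectl get storageclass do-block-storage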
