forked from kube-aws/kube-spot-termination-notice-handler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
entrypoint.sh
executable file
·104 lines (83 loc) · 4.68 KB
/
entrypoint.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/bin/sh
# How to test:
# NAMESPACE=default POD_NAME=kubesh-3976960141-b9b9t ./this_script
# Set VERBOSE=1 to get more output
# Optional inputs: POD_NAME defaults to the empty string and VERBOSE to 0
# when they are unset or empty in the environment.
: "${POD_NAME:=}"
: "${VERBOSE:=0}"
# verbose: predicate used to guard optional log output.
# Returns 0 when VERBOSE is numerically equal to 1, and 1 for anything else.
verbose () {
  if [ "${VERBOSE}" -eq 1 ]; then
    return 0
  fi
  return 1
}
# Startup banner: say what this script does and where to read more.
printf '%s\n' \
  'This script polls the "EC2 Spot Instance Termination Notices" endpoint to gracefully stop and then reschedule all the pods running on this Kubernetes node, up to 2 minutes before the EC2 Spot Instance backing this node is terminated.' \
  'See https://aws.amazon.com/blogs/aws/new-ec2-spot-instance-termination-notices/ for more information.'
# NAMESPACE and POD_NAME are mandatory. When either is empty, report the
# problem on stderr and stop with exit status 1.
[ -n "${NAMESPACE}" ] || {
  echo '[ERROR] Environment variable `NAMESPACE` has no value set. You must set it via PodSpec like described in http://stackoverflow.com/a/34418819' >&2
  exit 1
}
[ -n "${POD_NAME}" ] || {
  echo '[ERROR] Environment variable `POD_NAME` has no value set. You must set it via PodSpec like described in http://stackoverflow.com/a/34418819' >&2
  exit 1
}
# Ask the API server which node hosts this pod (.spec.nodeName of POD_NAME in
# NAMESPACE). An empty answer is fatal: report it on stderr and exit 1.
NODE_NAME="$(kubectl --namespace "${NAMESPACE}" get pod "${POD_NAME}" --output jsonpath="{.spec.nodeName}")"
[ -n "${NODE_NAME}" ] || {
  echo "[ERROR] Unable to fetch the name of the node running the pod \"${POD_NAME}\" in the namespace \"${NAMESPACE}\". Maybe a bug?: " >&2
  exit 1
}
# Gather some information about this instance from the metadata endpoints.
# Each URL can be overridden through the environment (AZ_URL, INSTANCE_ID_URL,
# INSTANCE_TYPE_URL).
# curl runs with --fail (-f) so that an HTTP error response produces no output:
# the variable is then left empty instead of holding the body of an error page,
# which would otherwise be pasted into the notification text further down.
AZ_URL=${AZ_URL:-http://169.254.169.254/latest/meta-data/placement/availability-zone}
AZ=$(curl -sf "${AZ_URL}")
# REGION is the availability zone with one trailing lowercase letter removed.
REGION=$(echo "${AZ}" | sed 's/[a-z]$//')
INSTANCE_ID_URL=${INSTANCE_ID_URL:-http://169.254.169.254/latest/meta-data/instance-id}
INSTANCE_ID=$(curl -sf "${INSTANCE_ID_URL}")
INSTANCE_TYPE_URL=${INSTANCE_TYPE_URL:-http://169.254.169.254/latest/meta-data/instance-type}
INSTANCE_TYPE=$(curl -sf "${INSTANCE_TYPE_URL}")
# CLUSTER is optional. When it is set, CLUSTER_INFO becomes " (<cluster>)" for
# use in the notification text; when it is empty only a warning is printed on
# stderr and CLUSTER_INFO is left untouched.
if [ -n "${CLUSTER}" ]; then
  CLUSTER_INFO=" (${CLUSTER})"
else
  echo "[WARNING] Environment variable CLUSTER has no name set. You can set this to get it reported in the Slack message." >&2
fi
# Announce what will happen on a termination notice, then settle the polling
# parameters: POLL_INTERVAL (seconds, default 5) and NOTICE_URL (default: the
# spot termination-time metadata endpoint). Both can be overridden through the
# environment.
printf '%s\n' "\`kubectl drain ${NODE_NAME}\` will be executed once a termination notice is made."
: "${POLL_INTERVAL:=5}"
: "${NOTICE_URL:=http://169.254.169.254/latest/meta-data/spot/termination-time}"
printf '%s\n' "Polling ${NOTICE_URL} every ${POLL_INTERVAL} second(s)"
# Poll NOTICE_URL until it answers with HTTP 200, sleeping POLL_INTERVAL
# seconds between attempts. The status code is taken from curl's
# -w '%{http_code}' output; the response body is discarded.
# To whom it may concern: http://superuser.com/questions/590099/can-i-make-curl-fail-with-an-exitcode-different-than-0-if-the-http-status-code-i
#
# The status is compared as a string on purpose. A numeric test
# ([ ... -ne 200 ]) fails with an error status when http_status is empty or
# not a number (for example when curl could not be executed at all); the loop
# would then end and the script would go on as if a termination notice had
# been received.
while http_status=$(curl -o /dev/null -w '%{http_code}' -sL "${NOTICE_URL}"); [ "${http_status}" != "200" ]; do
  verbose && echo "$(date): ${http_status}"
  sleep "${POLL_INTERVAL}"
done
# Log the current time with the last polled HTTP status, then assemble the
# one-line summary of the event piece by piece.
echo "$(date): ${http_status}"
MESSAGE="Spot Termination${CLUSTER_INFO}: ${NODE_NAME}"
MESSAGE="${MESSAGE}, Instance: ${INSTANCE_ID}"
MESSAGE="${MESSAGE}, Instance Type: ${INSTANCE_TYPE}"
MESSAGE="${MESSAGE}, AZ: ${AZ}"
# Get the names of the pods scheduled on this node.
# kubectl prints two columns (namespace, name) for every pod whose
# spec.nodeName matches NODE_NAME. On success the output is turned into a
# sorted list of "* <namespace>/<name>" lines: lines containing "NAME" (the
# header) are dropped and each run of blanks is squeezed into a single "/".
# PODS_STRING stays empty when kubectl fails.
PODS_STRING=""
if PODS_OUTPUT="$(kubectl get pods --all-namespaces \
    --output custom-columns="NAME:.metadata.namespace,NAME:.metadata.name" \
    --field-selector "spec.nodeName=${NODE_NAME}")"; then
  PODS_STRING="$(
    echo "$PODS_OUTPUT" \
      | grep -v "NAME" \
      | tr -s '[:blank:]' '/' \
      | sed 's/^/* /' \
      | sort
  )"
fi
# Notify Slack incoming-webhook
# Docs: https://api.slack.com/incoming-webhooks
# Setup: https://slack.com/apps/A0F7XDUAZ-incoming-webhooks
#
# You will have to set SLACK_URL as an environment variable via PodSpec.
# The URL should look something like: https://hooks.slack.com/services/T67UBFNHQ/B4Q7WQM52/1ctEoFjkjdjwsa22934
# SLACK_CHANNEL sends the alert to a specific Slack channel. If it is empty the
# alert goes to the webhook's default channel.
# Nothing is sent when SLACK_URL is empty.

# json_escape: print $1 in a form that is safe inside a JSON string literal.
#   $1 - the raw text
# Backslashes and double quotes are backslash-escaped and every line break is
# replaced by the two-character sequence \n. Other control characters are
# passed through unchanged.
json_escape () {
  printf '%s' "$1" \
    | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' \
    | awk '{ printf "%s%s", sep, $0; sep = "\\n" }'
}

if [ -n "${SLACK_URL}" ]; then
  # Escape every interpolated value first: the pod list spans several lines,
  # and a raw line break, quote or backslash inside a JSON string makes the
  # whole document invalid.
  color="danger"
  slack_channel=$(json_escape "${SLACK_CHANNEL}")
  slack_fallback=$(json_escape "${MESSAGE}")
  slack_cluster=$(json_escape "${CLUSTER_INFO}")
  slack_node=$(json_escape "${NODE_NAME}")
  slack_instance=$(json_escape "${INSTANCE_ID}")
  slack_type=$(json_escape "${INSTANCE_TYPE}")
  slack_az=$(json_escape "${AZ}")

  # construct the webhook payload
  CONTENT="payload={\"channel\":\"${slack_channel}\","
  CONTENT="$CONTENT\"attachments\":[{\"fallback\":\"${slack_fallback}\",\"title\":\":warning: Spot Termination${slack_cluster}\",\"color\":\"${color}\","
  CONTENT="$CONTENT\"fields\":[{\"title\":\"Node\",\"value\":\"${slack_node}\",\"short\":false},"
  CONTENT="$CONTENT{\"title\":\"Instance\",\"value\":\"${slack_instance}\",\"short\":true},"
  CONTENT="$CONTENT{\"title\":\"Instance Type\",\"value\":\"${slack_type}\",\"short\":true},"
  CONTENT="$CONTENT{\"title\":\"Availability Zone\",\"value\":\"${slack_az}\",\"short\":true}"
  if [ -n "$PODS_STRING" ]; then
    slack_pods=$(json_escape "${PODS_STRING}")
    CONTENT="$CONTENT,{\"title\":\"Pods\",\"value\":\"${slack_pods}\",\"short\":false}"
  fi
  CONTENT="$CONTENT]}]}"

  # send the webhook
  # CONTENT has the shape "payload=<json>". --data-urlencode keeps the name
  # before the first "=" and URL-encodes everything after it, so characters
  # such as "&", "+" or "%" inside the JSON cannot corrupt the form body.
  curl -s -X POST --data-urlencode "$CONTENT" "${SLACK_URL}"
fi
# Drain the node.
# https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/#use-kubectl-drain-to-remove-a-node-from-service
# GRACE_PERIOD (seconds, default 120) is handed to kubectl as --grace-period.
# NOTE(review): newer kubectl releases appear to rename --delete-local-data to
# --delete-emptydir-data; confirm against the kubectl version shipped with
# this script.
: "${GRACE_PERIOD:=120}"
kubectl drain "${NODE_NAME}" \
  --force \
  --ignore-daemonsets \
  --delete-local-data \
  --grace-period="${GRACE_PERIOD}"
# Sleep for 200 seconds to prevent this script from looping.
# The instance should be terminated by the end of the sleep.
sleep 200