-
Notifications
You must be signed in to change notification settings - Fork 24
/
autoheal.yml
172 lines (158 loc) · 5.09 KB
/
autoheal.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#
# Copyright (c) 2018 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This is an example of the configuration file used by the auto-heal service.
#
# This section contains the details needed to connect to the AWX or Ansible
# Tower server that will execute the Ansible playbooks.
#
awx:
  #
  # The URL of the AWX API endpoint, including the `/api` suffix, but not the
  # `/v1` or `/v2` suffixes.
  #
  address: https://my-awx.example.com/api

  #
  # The URL of the proxy server used to connect to the AWX API. If not present
  # or empty then no proxy server will be used.
  #
  proxy: http://my-proxy.example.com:3128

  #
  # Reference to the Kubernetes secret expected to contain the "username" and
  # "password" keys used to connect to the AWX API.
  #
  credentialsRef:
    namespace: my-namespace
    name: my-awx-credentials

  #
  # Alternatively the user name and password can also be specified directly
  # inside the configuration file. This is very convenient for development
  # environments, but it is not recommended for production environments.
  #
  # credentials:
  #   username: my-user
  #   password: my-password
  #
  # Note that if both `credentialsRef` and `credentials` are used, then only
  # `credentialsRef` will be used.
  #

  #
  # Reference to the Kubernetes secret that contains the trusted CA
  # certificates used to verify the TLS certificate presented by the AWX
  # server. If not present or empty then the global system trusted
  # certificates will be used.
  #
  tlsRef:
    namespace: my-namespace
    name: my-awx-tls

  #
  # Alternatively the trusted CA certificates can also be specified directly
  # inside the configuration file:
  #
  # tls:
  #   caCerts: |-
  #     -----BEGIN CERTIFICATE-----
  #     MIIFgzCCA2ugAwIBAgIPXZONMGc2yAYdGsdUhGkHMA0GCSqGSIb3DQEBCwUAMDsx
  #     CzAJBgNVBAYTAkVTMREwDwYDVQQKDAhGTk1ULVJDTTEZMBcGA1UECwwQQUMgUkFJ
  #     ...
  #     -----END CERTIFICATE-----
  #
  # Or loaded from a file:
  #
  # tls:
  #   caFile: /etc/pki/tls/certs/my-ca.pem
  #

  #
  # Whether to use an insecure connection to the AWX server. This should
  # always be set to `false` in production, but can be set to `true` when
  # developing. Defaults to `false`.
  #
  insecure: false

  #
  # The name of the AWX project that contains the auto-heal job templates.
  #
  project: "My project"

  #
  # How often to check the status of active jobs. Defaults to 5 minutes.
  #
  jobStatusCheckInterval: 5m
#
# This section describes how to throttle the execution of healing rules.
#
throttling:
  #
  # The time that the service will remember an executed healing action. If an
  # action is triggered more than once in the given interval it will be
  # executed only the first time. The rest of the times it will be logged and
  # ignored.
  #
  # Note that for throttling purposes actions are considered the same if they
  # have exactly the same fields with exactly the same values *after*
  # processing them as templates. For example, an action defined like this:
  #
  #   awxJob:
  #     template: "Restart {{ $labels.service }}"
  #
  # will have different values for the `template` field if the triggering
  # alerts have different `service` labels.
  #
  # The interval is a duration string, that is, a sequence of decimal numbers,
  # each with an optional fraction and a unit suffix, such as "300ms" or
  # "2h45m". Valid time units are "ns", "us" (or "µs"), "ms", "s", "m" and
  # "h". For more details see https://golang.org/pkg/time/#ParseDuration.
  #
  interval: 1h
#
# This section contains the healing rules.
#
rules:
  #
  # This rule runs the `Start node` AWX job template when the `NodeDown` alert
  # is fired. The playbook that handles this alert is expected to take the
  # name of the node from the `node` variable, so we need to translate the
  # `instance` label into the `node` variable passed in the `extraVars`, and
  # into the `limit` parameter.
  #
  - metadata:
      name: start-node
    labels:
      alertname: "NodeDown"
    awxJob:
      template: "Start node"
      extraVars:
        node: "{{ $labels.instance }}"
      limit: "{{ $labels.instance }}"

  #
  # This rule runs a batch job that uses Python to say hello when the
  # `NewFriend` alert is fired. The alert is expected to have a `name`
  # label containing the name of the new friend.
  #
  - metadata:
      name: say-hello
    labels:
      alertname: "NewFriend"
    batchJob:
      apiVersion: batch/v1
      kind: Job
      metadata:
        namespace: default
        name: hello
      spec:
        template:
          spec:
            containers:
              - name: python
                image: python
                command:
                  - python
                  - -c
                  # Quoted so the template braces and inner double quotes are
                  # always read as one plain string.
                  - 'print("Hello {{ $labels.name }}!")'
            restartPolicy: Never