Skip to content

Commit

Permalink
add prometheus metrics
Browse files Browse the repository at this point in the history
When enabled, birdwatcher serves a prometheus endpoint with metrics on service
state and configuration
  • Loading branch information
skoef committed Jan 10, 2024
1 parent 17dafeb commit a0b2069
Show file tree
Hide file tree
Showing 11 changed files with 228 additions and 19 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,13 @@ Each service under this section can have the following settings:
| fail | The amount of times the check command should fail before the service is considered to be down. Defaults to **1** |
| rise | The amount of times the check command should succeed before the service is considered to be up. Defaults to **1** |
| prefixes | Array of prefixes, mixed IPv4 and IPv6. At least 1 prefix is **required** per service |

## **[prometheus]**

Configuration for the prometheus exporter

| key | description |
| ------- | ---------------------------------------------------------------------------- |
| enabled | Boolean whether you want to export prometheus metrics. Defaults to **false** |
| port | Port to export prometheus metrics on. Defaults to **9091** |
| path | Path to the prometheus metrics. Defaults to **/metrics** |
21 changes: 18 additions & 3 deletions birdwatcher/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,19 @@ import (
type Config struct {
ConfigFile string
ReloadCommand string
Services map[string]*ServiceCheck
Prometheus struct {
Enabled bool
Port int
Path string
}
Services map[string]*ServiceCheck
}

const (
defaultConfigFile = "/etc/bird/birdwatcher.conf"
defaultReloadCommand = "/usr/sbin/birdc configure"
defaultConfigFile = "/etc/bird/birdwatcher.conf"
defaultReloadCommand = "/usr/sbin/birdc configure"
defaultPrometheusPort = 9091
defaultPrometheusPath = "/metrics"

defaultFunctionName = "match_route"
defaultCheckInterval = 1
Expand Down Expand Up @@ -53,6 +60,14 @@ func ReadConfig(conf *Config, configFile string) error {
conf.ReloadCommand = defaultReloadCommand
}

if conf.Prometheus.Path == "" {
conf.Prometheus.Path = defaultPrometheusPath
}

if conf.Prometheus.Port == 0 {
conf.Prometheus.Port = defaultPrometheusPort
}

if len(conf.Services) == 0 {
return errors.New("no services configured")
}
Expand Down
6 changes: 6 additions & 0 deletions birdwatcher/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ func TestConfig(t *testing.T) {

assert.Equal(t, defaultConfigFile, testConf.ConfigFile)
assert.Equal(t, defaultReloadCommand, testConf.ReloadCommand)
assert.False(t, testConf.Prometheus.Enabled)
assert.Equal(t, defaultPrometheusPort, testConf.Prometheus.Port)
assert.Equal(t, defaultPrometheusPath, testConf.Prometheus.Path)
assert.Equal(t, 1, len(testConf.Services))
assert.Equal(t, "foo", testConf.Services["foo"].name)
assert.Equal(t, defaultCheckInterval, testConf.Services["foo"].Interval)
Expand Down Expand Up @@ -123,6 +126,9 @@ func TestConfig(t *testing.T) {
}
assert.Equal(t, "/etc/birdwatcher.conf", testConf.ConfigFile)
assert.Equal(t, "/sbin/birdc configure", testConf.ReloadCommand)
assert.True(t, testConf.Prometheus.Enabled)
assert.Equal(t, 1234, testConf.Prometheus.Port)
assert.Equal(t, "/something", testConf.Prometheus.Path)
assert.Equal(t, "foo_bar", testConf.Services["foo"].FunctionName)
if assert.Equal(t, 1, len(testConf.Services["foo"].prefixes)) {
assert.Equal(t, "192.168.0.0/24", testConf.Services["foo"].prefixes[0].String())
Expand Down
29 changes: 21 additions & 8 deletions birdwatcher/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import (
"strings"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
log "github.com/sirupsen/logrus"
)

Expand All @@ -19,6 +21,15 @@ const (
reloadTimeout = 10 * time.Second
)

// prefixStateMetric exposes the health state of every prefix, labelled by the
// service that owns it and the prefix in CIDR notation. The gauge is set to 1
// when a prefix is added and to 0 when it is removed.
var prefixStateMetric = promauto.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: "birdwatcher",
		Subsystem: "prefix",
		Name:      "state",
		Help:      "Current health state per prefix",
	},
	[]string{"service", "prefix"},
)

// HealthCheck -- struct holding everything needed for the never-ending health
// check loop
type HealthCheck struct {
Expand Down Expand Up @@ -86,9 +97,9 @@ func (h *HealthCheck) handleAction(action *Action, status *chan string) {
for _, p := range action.Prefixes {
switch action.State {
case ServiceStateUp:
h.addPrefix(action.Service.FunctionName, p)
h.addPrefix(action.Service, p)
case ServiceStateDown:
h.removePrefix(action.Service.FunctionName, p)
h.removePrefix(action.Service, p)
default:
log.WithFields(log.Fields{
"state": action.State,
Expand Down Expand Up @@ -207,16 +218,18 @@ func (h *HealthCheck) applyConfig(config Config, prefixes PrefixCollection) erro
return err
}

func (h *HealthCheck) addPrefix(functionName string, prefix net.IPNet) {
h.ensurePrefixSet(functionName)
func (h *HealthCheck) addPrefix(svc *ServiceCheck, prefix net.IPNet) {
h.ensurePrefixSet(svc.FunctionName)

h.prefixes[functionName].Add(prefix)
h.prefixes[svc.FunctionName].Add(prefix)
prefixStateMetric.WithLabelValues(svc.Name(), prefix.String()).Set(1.0)
}

func (h *HealthCheck) removePrefix(functionName string, prefix net.IPNet) {
h.ensurePrefixSet(functionName)
func (h *HealthCheck) removePrefix(svc *ServiceCheck, prefix net.IPNet) {
h.ensurePrefixSet(svc.FunctionName)

h.prefixes[functionName].Remove(prefix)
h.prefixes[svc.FunctionName].Remove(prefix)
prefixStateMetric.WithLabelValues(svc.Name(), prefix.String()).Set(0.0)
}

func (h *HealthCheck) ensurePrefixSet(functionName string) {
Expand Down
24 changes: 19 additions & 5 deletions birdwatcher/healthcheck_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"net"
"testing"

"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/assert"
)

Expand All @@ -14,33 +15,46 @@ func TestHealthCheck_addPrefix(t *testing.T) {
// adding a prefix should initialise the prefixcollection
// and add the prefix under the right prefixset
_, prefix, _ := net.ParseCIDR("1.2.3.0/24")
hc.addPrefix("foo", *prefix)
hc.addPrefix(&ServiceCheck{name: "svc1", FunctionName: "foo"}, *prefix)
assert.Equal(t, 1, len(hc.prefixes))
assert.Equal(t, *prefix, hc.prefixes["foo"].prefixes[0])

assert.Equal(t, 1.0, testutil.ToFloat64(prefixStateMetric.WithLabelValues("svc1", "1.2.3.0/24")))

_, prefix, _ = net.ParseCIDR("2.3.4.0/24")
hc.addPrefix("bar", *prefix)
hc.addPrefix(&ServiceCheck{name: "svc2", FunctionName: "bar"}, *prefix)
assert.Equal(t, 2, len(hc.prefixes))
assert.Equal(t, *prefix, hc.prefixes["bar"].prefixes[0])

assert.Equal(t, 1.0, testutil.ToFloat64(prefixStateMetric.WithLabelValues("svc2", "2.3.4.0/24")))
}

func TestHealthCheck_removePrefix(t *testing.T) {
hc := HealthCheck{}
assert.Nil(t, hc.prefixes)
_, prefix, _ := net.ParseCIDR("1.2.3.0/24")
hc.addPrefix("foo", *prefix)

svc1 := &ServiceCheck{name: "svc1", FunctionName: "foo"}
hc.addPrefix(svc1, *prefix)
assert.Equal(t, 1, len(hc.prefixes))
assert.Equal(t, 1, len(hc.prefixes["foo"].prefixes))

assert.Equal(t, 1.0, testutil.ToFloat64(prefixStateMetric.WithLabelValues("svc1", "1.2.3.0/24")))

// this should initialise the prefixset but won't remove any prefixes
hc.removePrefix("bar", *prefix)
svc2 := &ServiceCheck{name: "svc2", FunctionName: "bar"}
hc.removePrefix(svc2, *prefix)
assert.Equal(t, 2, len(hc.prefixes))
assert.Equal(t, 1, len(hc.prefixes["foo"].prefixes))
assert.Equal(t, 0, len(hc.prefixes["bar"].prefixes))

assert.Equal(t, 0.0, testutil.ToFloat64(prefixStateMetric.WithLabelValues("svc2", "1.2.3.0/24")))

// remove the prefix from the right prefixset
hc.removePrefix("foo", *prefix)
hc.removePrefix(svc1, *prefix)
assert.Equal(t, 0, len(hc.prefixes["foo"].prefixes))

assert.Equal(t, 0.0, testutil.ToFloat64(prefixStateMetric.WithLabelValues("svc1", "1.2.3.0/24")))
}

func TestHealthCheckDidReloadBefore(t *testing.T) {
Expand Down
93 changes: 92 additions & 1 deletion birdwatcher/servicecheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,68 @@ package birdwatcher
import (
"context"
"errors"
"fmt"
"net"
"os/exec"
"strings"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
log "github.com/sirupsen/logrus"
)

// Prometheus collectors describing every configured service and the outcome
// of its health checks. Each collector is labelled with the service name.
var (
	// serviceInfoMetric publishes a service's configuration as label values;
	// the sample value itself is a constant 1.
	serviceInfoMetric = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "birdwatcher",
		Subsystem: "service",
		Name:      "info",
		Help:      "Services and their configuration",
	}, []string{"service", "function_name", "command", "interval", "timeout", "rise", "fail"})

	// serviceCheckDuration holds how long the most recent check took. The
	// value is written as float64(time.Duration), so the unit is nanoseconds;
	// the help text states that unit so it matches the exported value.
	serviceCheckDuration = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "birdwatcher",
		Subsystem: "service",
		Name:      "check_duration",
		Help:      "Service check duration in nanoseconds",
	}, []string{"service"})

	// serviceStateMetric is 1 while a service is considered up and 0 while it
	// is considered down.
	serviceStateMetric = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "birdwatcher",
		Subsystem: "service",
		Name:      "state",
		Help:      "Current health state per service",
	}, []string{"service"})

	// serviceTransitionMetric counts state changes in either direction
	// (up to down and down to up).
	serviceTransitionMetric = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: "birdwatcher",
		Subsystem: "service",
		Name:      "transition_total",
		Help:      "Number of transitions per service",
	}, []string{"service"})

	// serviceSuccessMetric counts checks whose command exited without error.
	serviceSuccessMetric = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: "birdwatcher",
		Subsystem: "service",
		Name:      "success_total",
		Help:      "Number of successful probes per service",
	}, []string{"service"})

	// serviceFailMetric counts checks that returned an error; timed out
	// checks are included here as well.
	serviceFailMetric = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: "birdwatcher",
		Subsystem: "service",
		Name:      "fail_total",
		Help:      "Number of failed probes per service",
	}, []string{"service"})

	// serviceTimeoutMetric counts the subset of failed checks that hit the
	// configured timeout (context.DeadlineExceeded).
	serviceTimeoutMetric = promauto.NewCounterVec(prometheus.CounterOpts{
		Namespace: "birdwatcher",
		Subsystem: "service",
		Name:      "timeout_total",
		Help:      "Number of timed out probes per service",
	}, []string{"service"})
)

// ServiceState represents the state the service is considered to be in
type ServiceState string

Expand Down Expand Up @@ -53,6 +107,17 @@ func (s *ServiceCheck) Start(action *chan *Action) {
"command": s.Command,
})

// set service info metric
serviceInfoMetric.With(prometheus.Labels{
"service": s.name,
"function_name": s.FunctionName,
"command": s.Command,
"interval": fmt.Sprint(s.Interval),
"timeout": s.Timeout.String(),
"rise": fmt.Sprint(s.Rise),
"fail": fmt.Sprint(s.Fail),
}).Set(1.0)

for {
select {
case <-s.stopped:
Expand All @@ -61,8 +126,11 @@ func (s *ServiceCheck) Start(action *chan *Action) {
return

case <-ticker.C:
beginCheck := time.Now()
// perform check synchronously to prevent checks from queueing up
err = s.performCheck()
// keep track of the time it took for the check to perform
serviceCheckDuration.WithLabelValues(s.name).Set(float64(time.Since(beginCheck)))

// based on the check result, decide if we're going up or down
//
Expand All @@ -71,6 +139,9 @@ func (s *ServiceCheck) Start(action *chan *Action) {
// reset downCounter
downCounter = 0

// update success metric
serviceSuccessMetric.WithLabelValues(s.name).Inc()

sLog.Debug("check command exited without error")

// are we up enough to consider service to be healthy
Expand All @@ -80,8 +151,14 @@ func (s *ServiceCheck) Start(action *chan *Action) {
"successes": upCounter,
}).Info("service transitioning to up")

// mark current state as up
s.state = ServiceStateUp

// update state metric
serviceStateMetric.WithLabelValues(s.name).Set(1)
// update transition metric
serviceTransitionMetric.WithLabelValues(s.name).Inc()

// send action on channel
*action <- s.getAction()
}
Expand All @@ -99,7 +176,15 @@ func (s *ServiceCheck) Start(action *chan *Action) {
// reset upcounter
upCounter = 0

sLog.Debug("check command failed or timed out")
// update fail metric
serviceFailMetric.WithLabelValues(s.name).Inc()
// if this was a timeout, increment that counter as well
if errors.Is(err, context.DeadlineExceeded) {
serviceTimeoutMetric.WithLabelValues(s.name).Inc()
sLog.Debug("check command timed out")
} else {
sLog.Debug("check command failed")
}

// are we down long enough to consider service down
if downCounter >= (s.Fail - 1) {
Expand All @@ -108,8 +193,14 @@ func (s *ServiceCheck) Start(action *chan *Action) {
"failures": downCounter,
}).Info("service transitioning to down")

// mark current state as down
s.state = ServiceStateDown

// update state metric
serviceStateMetric.WithLabelValues(s.name).Set(0)
// update transition metric
serviceTransitionMetric.WithLabelValues(s.name).Inc()

// send action on channel
*action <- s.getAction()
}
Expand Down
4 changes: 4 additions & 0 deletions birdwatcher/testdata/config/overridden
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
configfile = "/etc/birdwatcher.conf"
reloadcommand = "/sbin/birdc configure"
[prometheus]
enabled = true
port = 1234
path = "/something"

[services]
[services."foo"]
Expand Down
8 changes: 8 additions & 0 deletions dist/birdwatcher.conf
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@ configfile = "/etc/bird/birdwatcher.conf"
# reload command birdwatcher will call when configfile was updated
reloadcommand = "/usr/sbin/birdc configure"

# configuration for the prometheus metrics exporter
[prometheus]
enabled = false
# TCP port to expose the prometheus exporter on
port = 9091
# HTTP path to expose the prometheus exporter on
path = "/metrics"

[services]
# example service
#
Expand Down
9 changes: 9 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,22 @@ go 1.20
require (
github.com/BurntSushi/toml v1.3.2
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf
github.com/prometheus/client_golang v1.18.0
github.com/sirupsen/logrus v1.9.3
github.com/stretchr/testify v1.8.4
)

require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
golang.org/x/sys v0.15.0 // indirect
google.golang.org/protobuf v1.31.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
Loading

0 comments on commit a0b2069

Please sign in to comment.