From 779113000b52d04a36806f6c232a8d691d145e10 Mon Sep 17 00:00:00 2001 From: Anna Kapuscinska Date: Sun, 1 Sep 2024 20:03:20 +0200 Subject: [PATCH] Remove tetragon_msg_op_total metric tetragon_msg_op_total was counting events per opcode in the ring buffer queue. It wasn't particularly useful, as there are other metrics exposing similar numbers: * tetragon_bpf_missed_events_total counting missed events per opcode in BPF * tetragon_observer_ringbuf_queue_events_received_total counting total events received in the ring buffer queue * tetragon_events_total counting events per event type in gRPC If needed, in the future we can add an opcode label to metrics counting events in the observer: * tetragon_observer_ringbuf_events_received_total * tetragon_observer_ringbuf_queue_events_received_total * tetragon_observer_ringbuf_queue_events_lost_total We could also add a metric counting all events (not only missed) per opcode in BPF. However, it's unclear whether these would be useful - the ring buffer and the events queue shouldn't discriminate between different types of events, so having total counts of successful and missed events at each stage should be enough to troubleshoot capacity issues. There is still a per-event-type counter at the last stage, for monitoring overall data volume. 
Signed-off-by: Anna Kapuscinska --- contrib/upgrade-notes/latest.md | 2 ++ docs/content/en/docs/reference/metrics.md | 8 -------- pkg/metrics/opcodemetrics/opcodemetrics.go | 19 ------------------- pkg/observer/observer.go | 1 - 4 files changed, 2 insertions(+), 28 deletions(-) diff --git a/contrib/upgrade-notes/latest.md b/contrib/upgrade-notes/latest.md index dd6acfe6889..79ac1cef26d 100644 --- a/contrib/upgrade-notes/latest.md +++ b/contrib/upgrade-notes/latest.md @@ -53,3 +53,5 @@ tetragon: * `tetragon_ringbuf_perf_event_lost_total` -> `tetragon_observer_ringbuf_events_lost_total` * `tetragon_ringbuf_queue_received_total` -> `tetragon_observer_ringbuf_queue_events_received_total` * `tetragon_ringbuf_queue_lost_total` -> `tetragon_observer_ringbuf_queue_events_lost_total` +* `tetragon_msg_op_total` metric is removed. `tetragon_observer_ringbuf_queue_events_received_total` or + `tetragon_events_total` can be used as a replacement, depending on the use case. diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md index 1f2825cc717..733574a297d 100644 --- a/docs/content/en/docs/reference/metrics.md +++ b/docs/content/en/docs/reference/metrics.md @@ -200,14 +200,6 @@ The total number of Tetragon probe missed by program. | `attach` | `sys_panic` | | `policy` | `monitor_panic` | -### `tetragon_msg_op_total` - -The total number of times we encounter a given message opcode. For internal use only. 
- -| label | values | -| ----- | ------ | -| `msg_op` | `13, 14, 15, 16, 23, 24, 25, 26, 27, 5, 7` | - ### `tetragon_notify_overflowed_events_total` The total number of events dropped because listener buffer was full diff --git a/pkg/metrics/opcodemetrics/opcodemetrics.go b/pkg/metrics/opcodemetrics/opcodemetrics.go index cfb8b2c78b4..409923d97ee 100644 --- a/pkg/metrics/opcodemetrics/opcodemetrics.go +++ b/pkg/metrics/opcodemetrics/opcodemetrics.go @@ -13,13 +13,6 @@ import ( ) var ( - MsgOpsCount = prometheus.NewCounterVec(prometheus.CounterOpts{ - Namespace: consts.MetricsNamespace, - Name: "msg_op_total", - Help: "The total number of times we encounter a given message opcode. For internal use only.", - ConstLabels: nil, - }, []string{"msg_op"}) - LatencyStats = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: consts.MetricsNamespace, Name: "handling_latency", @@ -30,7 +23,6 @@ var ( ) func RegisterMetrics(group metrics.Group) { - group.MustRegister(MsgOpsCount) group.MustRegister(LatencyStats) } @@ -38,18 +30,7 @@ func InitMetrics() { // Initialize all metrics for opcode := range ops.OpCodeStrings { if opcode != ops.MSG_OP_UNDEF && opcode != ops.MSG_OP_TEST { - GetOpTotal(opcode).Add(0) LatencyStats.WithLabelValues(fmt.Sprint(int32(opcode))) } } } - -// Get a new handle on a msgOpsCount metric for an OpCode -func GetOpTotal(opcode ops.OpCode) prometheus.Counter { - return MsgOpsCount.WithLabelValues(fmt.Sprint(int32(opcode))) -} - -// Increment an msgOpsCount for an OpCode -func OpTotalInc(opcode ops.OpCode) { - GetOpTotal(opcode).Inc() -} diff --git a/pkg/observer/observer.go b/pkg/observer/observer.go index 91adb828cf5..0b9c6a26ea5 100644 --- a/pkg/observer/observer.go +++ b/pkg/observer/observer.go @@ -122,7 +122,6 @@ func (k *Observer) receiveEvent(data []byte) { } op, events, err := HandlePerfData(data) - opcodemetrics.OpTotalInc(ops.OpCode(op)) if err != nil { // Increment error metrics errormetrics.ErrorTotalInc(errormetrics.HandlerError)