From 779113000b52d04a36806f6c232a8d691d145e10 Mon Sep 17 00:00:00 2001
From: Anna Kapuscinska <anna@isovalent.com>
Date: Sun, 1 Sep 2024 20:03:20 +0200
Subject: [PATCH] Remove tetragon_msg_op_total metric

tetragon_msg_op_total was counting events per opcode in the ring buffer queue.
It wasn't particularly useful, as there are other metrics exposing similar
numbers:
* tetragon_bpf_missed_events_total counting missed events per opcode in BPF
* tetragon_observer_ringbuf_queue_events_received_total counting total events
  received in the ring buffer queue
* tetragon_events_total counting events per event type in grpc

If needed, in the future we can add an opcode label to the metrics counting
events in the observer:
* tetragon_observer_ringbuf_events_received_total
* tetragon_observer_ringbuf_queue_events_received_total
* tetragon_observer_ringbuf_queue_events_lost_total

We could also add a metric counting all events (not only missed ones) per
opcode in BPF. However, it's unclear whether such counts would be useful: the
ring buffer and the events queue shouldn't discriminate between different
types of events, so having total counts of successful and missed events at
each stage should be enough to troubleshoot capacity issues. There is still a
per-event-type counter at the last stage, for monitoring overall data volume.

Signed-off-by: Anna Kapuscinska <anna@isovalent.com>
---
 contrib/upgrade-notes/latest.md            |  2 ++
 docs/content/en/docs/reference/metrics.md  |  8 --------
 pkg/metrics/opcodemetrics/opcodemetrics.go | 19 -------------------
 pkg/observer/observer.go                   |  1 -
 4 files changed, 2 insertions(+), 28 deletions(-)

diff --git a/contrib/upgrade-notes/latest.md b/contrib/upgrade-notes/latest.md
index dd6acfe6889..79ac1cef26d 100644
--- a/contrib/upgrade-notes/latest.md
+++ b/contrib/upgrade-notes/latest.md
@@ -53,3 +53,5 @@ tetragon:
   * `tetragon_ringbuf_perf_event_lost_total` -> `tetragon_observer_ringbuf_events_lost_total`
   * `tetragon_ringbuf_queue_received_total` -> `tetragon_observer_ringbuf_queue_events_received_total`
   * `tetragon_ringbuf_queue_lost_total` -> `tetragon_observer_ringbuf_queue_events_lost_total`
+* `tetragon_msg_op_total` metric is removed. `tetragon_observer_ringbuf_queue_events_received_total` or
+  `tetragon_events_total` can be used as a replacement, depending on the use case.
diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md
index 1f2825cc717..733574a297d 100644
--- a/docs/content/en/docs/reference/metrics.md
+++ b/docs/content/en/docs/reference/metrics.md
@@ -200,14 +200,6 @@ The total number of Tetragon probe missed by program.
 | `attach` | `sys_panic` |
 | `policy` | `monitor_panic` |
 
-### `tetragon_msg_op_total`
-
-The total number of times we encounter a given message opcode. For internal use only.
-
-| label | values |
-| ----- | ------ |
-| `msg_op` | `13, 14, 15, 16, 23, 24, 25, 26, 27, 5, 7` |
-
 ### `tetragon_notify_overflowed_events_total`
 
 The total number of events dropped because listener buffer was full
diff --git a/pkg/metrics/opcodemetrics/opcodemetrics.go b/pkg/metrics/opcodemetrics/opcodemetrics.go
index cfb8b2c78b4..409923d97ee 100644
--- a/pkg/metrics/opcodemetrics/opcodemetrics.go
+++ b/pkg/metrics/opcodemetrics/opcodemetrics.go
@@ -13,13 +13,6 @@ import (
 )
 
 var (
-	MsgOpsCount = prometheus.NewCounterVec(prometheus.CounterOpts{
-		Namespace:   consts.MetricsNamespace,
-		Name:        "msg_op_total",
-		Help:        "The total number of times we encounter a given message opcode. For internal use only.",
-		ConstLabels: nil,
-	}, []string{"msg_op"})
-
 	LatencyStats = prometheus.NewHistogramVec(prometheus.HistogramOpts{
 		Namespace:   consts.MetricsNamespace,
 		Name:        "handling_latency",
@@ -30,7 +23,6 @@ var (
 )
 
 func RegisterMetrics(group metrics.Group) {
-	group.MustRegister(MsgOpsCount)
 	group.MustRegister(LatencyStats)
 }
 
@@ -38,18 +30,7 @@ func InitMetrics() {
 	// Initialize all metrics
 	for opcode := range ops.OpCodeStrings {
 		if opcode != ops.MSG_OP_UNDEF && opcode != ops.MSG_OP_TEST {
-			GetOpTotal(opcode).Add(0)
 			LatencyStats.WithLabelValues(fmt.Sprint(int32(opcode)))
 		}
 	}
 }
-
-// Get a new handle on a msgOpsCount metric for an OpCode
-func GetOpTotal(opcode ops.OpCode) prometheus.Counter {
-	return MsgOpsCount.WithLabelValues(fmt.Sprint(int32(opcode)))
-}
-
-// Increment an msgOpsCount for an OpCode
-func OpTotalInc(opcode ops.OpCode) {
-	GetOpTotal(opcode).Inc()
-}
diff --git a/pkg/observer/observer.go b/pkg/observer/observer.go
index 91adb828cf5..0b9c6a26ea5 100644
--- a/pkg/observer/observer.go
+++ b/pkg/observer/observer.go
@@ -122,7 +122,6 @@ func (k *Observer) receiveEvent(data []byte) {
 	}
 
 	op, events, err := HandlePerfData(data)
-	opcodemetrics.OpTotalInc(ops.OpCode(op))
 	if err != nil {
 		// Increment error metrics
 		errormetrics.ErrorTotalInc(errormetrics.HandlerError)