diff --git a/graphqlmetrics/pkg/telemetry/interceptor.go b/graphqlmetrics/pkg/telemetry/interceptor.go index 70d650c09b..a418702619 100644 --- a/graphqlmetrics/pkg/telemetry/interceptor.go +++ b/graphqlmetrics/pkg/telemetry/interceptor.go @@ -50,36 +50,34 @@ func checkIfClaimsAreSet(claims *utils.GraphAPITokenClaims) bool { } func (c *Config) ObservabilityInterceptor() connect.UnaryInterceptorFunc { - return connect.UnaryInterceptorFunc( - func(next connect.UnaryFunc) connect.UnaryFunc { - return connect.UnaryFunc(func(ctx context.Context, req connect.AnyRequest) (connect.AnyResponse, error) { - attributes := defaultAttributes(req) - // connect.CodeOK does not exist - var statusCode int = 0 + return func(next connect.UnaryFunc) connect.UnaryFunc { + return func(ctx context.Context, req connect.AnyRequest) (connect.AnyResponse, error) { + attributes := defaultAttributes(req) + // connect.CodeOK does not exist + var statusCode int = 0 - claims, err := utils.GetClaims(ctx) - if err != nil || checkIfClaimsAreSet(claims) { - // handling this error will happen in the service itself - statusCode = int(connect.CodeInvalidArgument) + claims, err := utils.GetClaims(ctx) + if err != nil || checkIfClaimsAreSet(claims) { + // handling this error will happen in the service itself + statusCode = int(connect.CodeInvalidArgument) - attributes = append(attributes, WgFederatedGraphId.String(claims.FederatedGraphID)) - attributes = append(attributes, WgOrganizationId.String(claims.OrganizationID)) - } else { - attributes = append(attributes, WgFederatedGraphId.String(claims.FederatedGraphID)) - attributes = append(attributes, WgOrganizationId.String(claims.OrganizationID)) - } + attributes = append(attributes, WgFederatedGraphId.String(claims.FederatedGraphID)) + attributes = append(attributes, WgOrganizationId.String(claims.OrganizationID)) + } else { + attributes = append(attributes, WgFederatedGraphId.String(claims.FederatedGraphID)) + attributes = append(attributes, WgOrganizationId.String(claims.OrganizationID)) + } - res, err := next(ctx, req) + res, err := next(ctx, req) - if err != nil { - statusCode = int(connect.CodeOf(err)) - } + if err != nil { + statusCode = int(connect.CodeOf(err)) + } - attributes = append(attributes, semconv.RPCGRPCStatusCodeKey.Int(statusCode)) - c.MetricStore.MeasureRequestCount(ctx, attributes...) + attributes = append(attributes, semconv.RPCGRPCStatusCodeKey.Int(statusCode)) + c.MetricStore.MeasureRequestCount(ctx, attributes...) - return res, err - }) - }, - ) + return res, err + } + } } diff --git a/router-tests/kafka_events_test.go b/router-tests/kafka_events_test.go index f30ae1cd19..7ab7e744ab 100644 --- a/router-tests/kafka_events_test.go +++ b/router-tests/kafka_events_test.go @@ -116,7 +116,7 @@ func TestKafkaEvents(t *testing.T) { go func() { wg.Wait() - require.NoError(t, client.Close()) + _ = client.Close() }() xEnv.WaitForSubscriptionCount(1, time.Second*10) @@ -413,7 +413,7 @@ func TestKafkaEvents(t *testing.T) { go func() { wg.Wait() - require.NoError(t, client.Close()) + _ = client.Close() }() xEnv.WaitForSubscriptionCount(1, time.Second*10) diff --git a/router-tests/prometheus_test.go b/router-tests/prometheus_test.go index d7f7abbe8a..4d5556907d 100644 --- a/router-tests/prometheus_test.go +++ b/router-tests/prometheus_test.go @@ -1,6 +1,7 @@ package integration import ( + "github.com/wundergraph/cosmo/router/core" "net/http" "testing" @@ -482,6 +483,63 @@ func TestPrometheus(t *testing.T) { }, }, responseContentLengthMetrics[1].Label) + planningTime := findMetricFamilyByName(mf, "router_graphql_operation_planning_time") + planningTimeMetrics := planningTime.GetMetric() + + require.Len(t, planningTimeMetrics, 1) + require.Len(t, planningTimeMetrics[0].Label, 12) + + require.Equal(t, []*io_prometheus_client.LabelPair{ + { + Name: PointerOf("otel_scope_name"), + Value: PointerOf("cosmo.router.prometheus"), + }, + { + Name: PointerOf("otel_scope_version"), + Value: PointerOf("0.0.1"), + }, + { + Name: PointerOf("wg_client_name"), + Value: PointerOf("unknown"), + }, + { + Name: PointerOf("wg_client_version"), + Value: PointerOf("missing"), + }, + { + Name: PointerOf("wg_engine_plan_cache_hit"), + Value: PointerOf("false"), + }, + { + Name: PointerOf("wg_federated_graph_id"), + Value: PointerOf("graph"), + }, + { + Name: PointerOf("wg_operation_name"), + Value: PointerOf("myQuery"), + }, + { + Name: PointerOf("wg_operation_protocol"), + Value: PointerOf("http"), + }, + { + Name: PointerOf("wg_operation_type"), + Value: PointerOf("query"), + }, + { + Name: PointerOf("wg_router_cluster_name"), + Value: PointerOf(""), + }, + { + Name: PointerOf("wg_router_config_version"), + Value: PointerOf(xEnv.RouterConfigVersionMain()), + }, + { + Name: PointerOf("wg_router_version"), + Value: PointerOf("dev"), + }, + }, planningTimeMetrics[0].Label) + }) }) @@ -496,13 +554,13 @@ func TestPrometheus(t *testing.T) { TraceExporter: exporter, MetricReader: metricReader, PrometheusRegistry: promRegistry, - OtelResourceAttributes: []config.CustomStaticAttribute{ + CustomResourceAttributes: []config.CustomStaticAttribute{ { Key: "custom.resource", Value: "value", }, }, - OtelAttributes: []config.CustomAttribute{ + CustomMetricAttributes: []config.CustomAttribute{ { Key: "custom", Default: "value_different", @@ -1020,13 +1078,13 @@ func TestPrometheus(t *testing.T) { TraceExporter: exporter, MetricReader: metricReader, PrometheusRegistry: promRegistry, - OtelResourceAttributes: []config.CustomStaticAttribute{ + CustomResourceAttributes: []config.CustomStaticAttribute{ { Key: "custom.resource", Value: "value", }, }, - OtelAttributes: []config.CustomAttribute{ + CustomMetricAttributes: []config.CustomAttribute{ { Key: "custom", Default: "value", @@ -1559,13 +1617,64 @@ func TestPrometheus(t *testing.T) { mf, err := promRegistry.Gather() require.NoError(t, err) - responseContentLength := findMetricFamilyByName(mf, "router_http_requests_error_total") - responseContentLengthMetrics := responseContentLength.GetMetric() + totalRequestsErrors := findMetricFamilyByName(mf, "router_http_requests_error_total") + totalRequestErrorsMetric := totalRequestsErrors.GetMetric() - require.Len(t, responseContentLengthMetrics, 3) - require.Len(t, responseContentLengthMetrics[0].Label, 12) - require.Len(t, responseContentLengthMetrics[1].Label, 16) - require.Len(t, responseContentLengthMetrics[2].Label, 16) + require.Len(t, totalRequestErrorsMetric, 2) + require.Len(t, totalRequestErrorsMetric[0].Label, 12) + require.Len(t, totalRequestErrorsMetric[1].Label, 15) + + // Error metric for the subgraph error + require.Equal(t, []*io_prometheus_client.LabelPair{ + { + Name: PointerOf("otel_scope_name"), + Value: PointerOf("cosmo.router.prometheus"), + }, + { + Name: PointerOf("otel_scope_version"), + Value: PointerOf("0.0.1"), + }, + { + Name: PointerOf("wg_client_name"), + Value: PointerOf("unknown"), + }, + { + Name: PointerOf("wg_client_version"), + Value: PointerOf("missing"), + }, + { + Name: PointerOf("wg_federated_graph_id"), + Value: PointerOf("graph"), + }, + { + Name: PointerOf("wg_operation_name"), + Value: PointerOf("myQuery"), + }, + { + Name: PointerOf("wg_operation_protocol"), + Value: PointerOf("http"), + }, + { + Name: PointerOf("wg_operation_type"), + Value: PointerOf("query"), + }, + { + Name: PointerOf("wg_request_error"), + Value: PointerOf("true"), + }, + { + Name: PointerOf("wg_router_cluster_name"), + Value: PointerOf(""), + }, + { + Name: PointerOf("wg_router_config_version"), + Value: PointerOf(xEnv.RouterConfigVersionMain()), + }, + { + Name: PointerOf("wg_router_version"), + Value: PointerOf("dev"), + }, + }, totalRequestErrorsMetric[0].Label) // Error metric for the subgraph error require.Equal(t, []*io_prometheus_client.LabelPair{ @@ -1621,10 +1730,6 @@ func TestPrometheus(t *testing.T) { Name: PointerOf("wg_router_version"), Value: PointerOf("dev"), }, - { - Name: PointerOf("wg_subgraph_error_extended_code"), - Value: PointerOf("UNAUTHORIZED"), - }, { Name: PointerOf("wg_subgraph_id"), Value: PointerOf("3"), @@ -1633,7 +1738,165 @@ func TestPrometheus(t *testing.T) { Name: PointerOf("wg_subgraph_name"), Value: PointerOf("products"), }, - }, responseContentLengthMetrics[1].Label) + }, totalRequestErrorsMetric[1].Label) + }) + }) + + t.Run("Custom slice metric attributes produces multiple metric series", func(t *testing.T) { + exporter := tracetest.NewInMemoryExporter(t) + metricReader := metric.NewManualReader() + promRegistry := prometheus.NewRegistry() + + testenv.Run(t, &testenv.Config{ + TraceExporter: exporter, + MetricReader: metricReader, + PrometheusRegistry: promRegistry, + CustomMetricAttributes: []config.CustomAttribute{ + { + Key: "error_codes", + ValueFrom: &config.CustomDynamicAttribute{ + ContextField: core.ContextFieldGraphQLErrorCodes, + }, + }, + }, + Subgraphs: testenv.SubgraphsConfig{ + Products: testenv.SubgraphConfig{ + Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusForbidden) + _, _ = w.Write([]byte(`{"errors":[{"message":"Unauthorized","extensions":{"code":"UNAUTHORIZED"}},{"message":"MyErrorMessage","extensions":{"code":"YOUR_ERROR_CODE"}}]}`)) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + res := xEnv.MakeGraphQLRequestOK(testenv.GraphQLRequest{ + Query: `query myQuery { employees { id details { forename surname } notes } }`, + }) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'products' at Path 'employees'.","extensions":{"errors":[{"message":"Unauthorized","extensions":{"code":"UNAUTHORIZED"}},{"message":"MyErrorMessage","extensions":{"code":"YOUR_ERROR_CODE"}}],"statusCode":403}}],"data":{"employees":[{"id":1,"details":{"forename":"Jens","surname":"Neuse"},"notes":null},{"id":2,"details":{"forename":"Dustin","surname":"Deus"},"notes":null},{"id":3,"details":{"forename":"Stefan","surname":"Avram"},"notes":null},{"id":4,"details":{"forename":"Björn","surname":"Schwenzer"},"notes":null},{"id":5,"details":{"forename":"Sergiy","surname":"Petrunin"},"notes":null},{"id":7,"details":{"forename":"Suvij","surname":"Surya"},"notes":null},{"id":8,"details":{"forename":"Nithin","surname":"Kumar"},"notes":null},{"id":10,"details":{"forename":"Eelco","surname":"Wiersma"},"notes":null},{"id":11,"details":{"forename":"Alexandra","surname":"Neuse"},"notes":null},{"id":12,"details":{"forename":"David","surname":"Stutt"},"notes":null}]}}`, res.Body) + + mf, err := promRegistry.Gather() + require.NoError(t, err) + + totalRequestsErrors := findMetricFamilyByName(mf, "router_http_requests_error_total") + totalRequestErrorsMetric := totalRequestsErrors.GetMetric() + + require.Len(t, totalRequestErrorsMetric, 3) + require.Len(t, totalRequestErrorsMetric[0].Label, 13) + require.Len(t, totalRequestErrorsMetric[1].Label, 13) + require.Len(t, totalRequestErrorsMetric[2].Label, 15) + + require.Equal(t, []*io_prometheus_client.LabelPair{ + { + Name: PointerOf("error_codes"), + Value: PointerOf("UNAUTHORIZED"), + }, + { + Name: PointerOf("otel_scope_name"), + Value: PointerOf("cosmo.router.prometheus"), + }, + { + Name: PointerOf("otel_scope_version"), + Value: PointerOf("0.0.1"), + }, + { + Name: PointerOf("wg_client_name"), + Value: PointerOf("unknown"), + }, + { + Name: PointerOf("wg_client_version"), + Value: PointerOf("missing"), + }, + { + Name: PointerOf("wg_federated_graph_id"), + Value: PointerOf("graph"), + }, + { + Name: PointerOf("wg_operation_name"), + Value: PointerOf("myQuery"), + }, + { + Name: PointerOf("wg_operation_protocol"), + Value: PointerOf("http"), + }, + { + Name: PointerOf("wg_operation_type"), + Value: PointerOf("query"), + }, + { + Name: PointerOf("wg_request_error"), + Value: PointerOf("true"), + }, + { + Name: PointerOf("wg_router_cluster_name"), + Value: PointerOf(""), + }, + { + Name: PointerOf("wg_router_config_version"), + Value: PointerOf(xEnv.RouterConfigVersionMain()), + }, + { + Name: PointerOf("wg_router_version"), + Value: PointerOf("dev"), + }, + }, totalRequestErrorsMetric[0].Label) + + // Error metric for the subgraph error + require.Equal(t, []*io_prometheus_client.LabelPair{ + { + Name: PointerOf("error_codes"), + Value: PointerOf("YOUR_ERROR_CODE"), + }, + { + Name: PointerOf("otel_scope_name"), + Value: PointerOf("cosmo.router.prometheus"), + }, + { + Name: PointerOf("otel_scope_version"), + Value: PointerOf("0.0.1"), + }, + { + Name: PointerOf("wg_client_name"), + Value: PointerOf("unknown"), + }, + { + Name: PointerOf("wg_client_version"), + Value: PointerOf("missing"), + }, + { + Name: PointerOf("wg_federated_graph_id"), + Value: PointerOf("graph"), + }, + { + Name: PointerOf("wg_operation_name"), + Value: PointerOf("myQuery"), + }, + { + Name: PointerOf("wg_operation_protocol"), + Value: PointerOf("http"), + }, + { + Name: PointerOf("wg_operation_type"), + Value: PointerOf("query"), + }, + { + Name: PointerOf("wg_request_error"), + Value: PointerOf("true"), + }, + { + Name: PointerOf("wg_router_cluster_name"), + Value: PointerOf(""), + }, + { + Name: PointerOf("wg_router_config_version"), + Value: PointerOf(xEnv.RouterConfigVersionMain()), + }, + { + Name: PointerOf("wg_router_version"), + Value: PointerOf("dev"), + }, + }, totalRequestErrorsMetric[1].Label) // Error metric for the subgraph error require.Equal(t, []*io_prometheus_client.LabelPair{ @@ -1689,10 +1952,6 @@ func TestPrometheus(t *testing.T) { Name: PointerOf("wg_router_version"), Value: PointerOf("dev"), }, - { - Name: PointerOf("wg_subgraph_error_extended_code"), - Value: PointerOf("YOUR_ERROR_CODE"), - }, { Name: PointerOf("wg_subgraph_id"), Value: PointerOf("3"), @@ -1701,7 +1960,7 @@ func TestPrometheus(t *testing.T) { Name: PointerOf("wg_subgraph_name"), Value: PointerOf("products"), }, - }, responseContentLengthMetrics[2].Label) + }, totalRequestErrorsMetric[2].Label) }) }) @@ -2208,6 +2467,8 @@ func TestPrometheus(t *testing.T) { }) } +// Creates a separate prometheus metric when service error codes are used as custom attributes + func findMetricFamilyByName(mf []*io_prometheus_client.MetricFamily, name string) *io_prometheus_client.MetricFamily { for _, m := range mf { if m.GetName() == name { diff --git a/router-tests/structured_logging_test.go b/router-tests/structured_logging_test.go index 0bbe942dc3..b0f0a8eab9 100644 --- a/router-tests/structured_logging_test.go +++ b/router-tests/structured_logging_test.go @@ -326,7 +326,7 @@ func TestAccessLogs(t *testing.T) { }) }) - t.Run("Log when operation parsing fails", func(t *testing.T) { + t.Run("Log as much information possible when operation parsing fails", func(t *testing.T) { t.Parallel() testenv.Run(t, &testenv.Config{ @@ -338,6 +338,12 @@ func TestAccessLogs(t *testing.T) { RequestHeader: "service-name", }, }, + { + Key: "error_message", + ValueFrom: &config.CustomDynamicAttribute{ + ContextField: core.ContextFieldResponseErrorMessage, + }, + }, { Key: "operation_hash", Default: "", @@ -407,14 +413,15 @@ func TestAccessLogs(t *testing.T) { require.Equal(t, requestLog.Len(), 1) requestContext := requestLog.All()[0].ContextMap() expectedValues := map[string]interface{}{ - "log_type": "request", - "status": int64(200), - "method": "POST", - "path": "/graphql", - "query": "", // http query is empty - "ip": "[REDACTED]", - "user_agent": "Go-http-client/1.1", - "service_name": "service-name", // From header + "log_type": "request", + "status": int64(200), + "method": "POST", + "path": "/graphql", + "query": "", // http query is empty + "ip": "[REDACTED]", + "user_agent": "Go-http-client/1.1", + "service_name": "service-name", // From header + "error_message": "unexpected token - got: EOF want one of: [RBRACE IDENT SPREAD]", } additionalExpectedKeys := []string{ "latency", @@ -429,7 +436,7 @@ func TestAccessLogs(t *testing.T) { }) }) - t.Run("Log when operation normalization fails", func(t *testing.T) { + t.Run("Log as much information possible when operation normalization fails", func(t *testing.T) { t.Parallel() testenv.Run(t, &testenv.Config{ @@ -441,6 +448,12 @@ func TestAccessLogs(t *testing.T) { RequestHeader: "service-name", }, }, + { + Key: "error_message", + ValueFrom: &config.CustomDynamicAttribute{ + ContextField: core.ContextFieldResponseErrorMessage, + }, + }, { Key: "operation_hash", Default: "", @@ -520,6 +533,7 @@ func TestAccessLogs(t *testing.T) { "service_name": "service-name", // From header "operation_type": "query", // From context "operation_name": "employees", // From context + "error_message": "field: notExists not defined on type: Query", } additionalExpectedKeys := []string{ "latency", @@ -535,7 +549,7 @@ func TestAccessLogs(t *testing.T) { }) }) - t.Run("Log when panic occurs on execution / error panic", func(t *testing.T) { + t.Run("Log as much information possible on execution / error panic", func(t *testing.T) { t.Parallel() testenv.Run(t, &testenv.Config{ @@ -547,6 +561,12 @@ func TestAccessLogs(t *testing.T) { RequestHeader: "service-name", }, }, + { + Key: "error_message", + ValueFrom: &config.CustomDynamicAttribute{ + ContextField: core.ContextFieldResponseErrorMessage, + }, + }, { Key: "operation_hash", Default: "", @@ -637,6 +657,7 @@ func TestAccessLogs(t *testing.T) { "operation_hash": "14226210703439426856", // From context "operation_name": "employees", // From context "operation_type": "query", // From context + "error_message": "implement me", } additionalExpectedKeys := []string{ "latency", @@ -655,7 +676,7 @@ func TestAccessLogs(t *testing.T) { }) }) - t.Run("Log when panic occurs on execution / string panic", func(t *testing.T) { + t.Run("Log as much information possible on execution / string panic", func(t *testing.T) { t.Parallel() testenv.Run(t, &testenv.Config{ @@ -667,6 +688,12 @@ func TestAccessLogs(t *testing.T) { RequestHeader: "service-name", }, }, + { + Key: "error_message", + ValueFrom: &config.CustomDynamicAttribute{ + ContextField: core.ContextFieldResponseErrorMessage, + }, + }, { Key: "operation_hash", Default: "", @@ -758,6 +785,7 @@ func TestAccessLogs(t *testing.T) { "operation_hash": "14226210703439426856", // From context "operation_name": "employees", // From context "operation_type": "query", // From context + "error_message": "implement me", } additionalExpectedKeys := []string{ "latency", diff --git a/router-tests/telemetry_test.go b/router-tests/telemetry_test.go index 5531a763a5..691d3d46ad 100644 --- a/router-tests/telemetry_test.go +++ b/router-tests/telemetry_test.go @@ -157,21 +157,25 @@ func TestTelemetry(t *testing.T) { require.Len(t, sn[3].Attributes(), 11) - require.Equal(t, "Operation - Plan", sn[4].Name()) - require.Equal(t, trace.SpanKindInternal, sn[4].SpanKind()) - require.Equal(t, sdktrace.Status{Code: codes.Unset}, sn[4].Status()) + require.Equal(t, "Operation - Validate", sn[3].Name()) + require.Equal(t, trace.SpanKindInternal, sn[3].SpanKind()) + require.Equal(t, sdktrace.Status{Code: codes.Unset}, sn[3].Status()) + require.Contains(t, sn[3].Attributes(), otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain())) require.Contains(t, sn[3].Attributes(), otel.WgRouterVersion.String("dev")) require.Contains(t, sn[3].Attributes(), otel.WgRouterClusterName.String("")) require.Contains(t, sn[3].Attributes(), otel.WgFederatedGraphID.String("graph")) - require.Contains(t, sn[3].Attributes(), otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain())) - require.Contains(t, sn[3].Attributes(), otel.WgValidationCacheHit.Bool(false)) + require.Contains(t, sn[3].Attributes(), otel.WgClientName.String("unknown")) + require.Contains(t, sn[3].Attributes(), otel.WgValidationCacheHit.Bool(false)) + require.Contains(t, sn[3].Attributes(), otel.WgClientVersion.String("missing")) + require.Contains(t, sn[3].Attributes(), otel.WgOperationProtocol.String("http")) require.Contains(t, sn[3].Attributes(), otel.WgOperationName.String("")) require.Contains(t, sn[3].Attributes(), otel.WgOperationType.String("query")) - require.Contains(t, sn[3].Attributes(), otel.WgOperationProtocol.String("http")) + require.Contains(t, sn[3].Attributes(), otel.WgOperationHash.String("14226210703439426856")) + require.Contains(t, sn[3].Attributes(), otel.WgValidationCacheHit.Bool(false)) // Span Resource attributes @@ -321,7 +325,14 @@ func TestTelemetry(t *testing.T) { // Span attributes - require.Len(t, sn[7].Attributes(), 5) + require.Len(t, sn[7].Attributes(), 11) + require.Contains(t, sn[7].Attributes(), otel.WgClientName.String("unknown")) + require.Contains(t, sn[7].Attributes(), otel.WgClientVersion.String("missing")) + require.Contains(t, sn[7].Attributes(), otel.WgOperationName.String("")) + require.Contains(t, sn[7].Attributes(), otel.WgOperationType.String("query")) + require.Contains(t, sn[7].Attributes(), otel.WgOperationProtocol.String("http")) + require.Contains(t, sn[7].Attributes(), otel.WgOperationHash.String("14226210703439426856")) + require.Contains(t, sn[7].Attributes(), otel.WgRouterVersion.String("dev")) require.Contains(t, sn[7].Attributes(), otel.WgRouterClusterName.String("")) require.Contains(t, sn[7].Attributes(), otel.WgFederatedGraphID.String("graph")) @@ -609,6 +620,33 @@ func TestTelemetry(t *testing.T) { }, } + operationPlanningTimeMetric := metricdata.Metrics{ + Name: "router.graphql.operation.planning_time", + Description: "Operation planning time in milliseconds", + Unit: "ms", + Data: metricdata.Histogram[float64]{ + Temporality: metricdata.CumulativeTemporality, + DataPoints: []metricdata.HistogramDataPoint[float64]{ + { + Attributes: attribute.NewSet( + otel.WgEnginePlanCacheHit.Bool(false), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("14226210703439426856"), + otel.WgOperationName.String(""), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + ), + Sum: 0, + }, + }, + }, + } + want := metricdata.ScopeMetrics{ Scope: instrumentation.Scope{ Name: "cosmo.router", @@ -621,6 +659,7 @@ func TestTelemetry(t *testing.T) { requestContentLengthMetric, responseContentLengthMetric, requestInFlightMetric, + operationPlanningTimeMetric, }, } @@ -638,14 +677,16 @@ func TestTelemetry(t *testing.T) { require.Contains(t, rm.Resource.Attributes(), attribute.String("service.name", "cosmo-router")) require.Equal(t, 1, len(rm.ScopeMetrics), "expected 1 ScopeMetrics, got %d", len(rm.ScopeMetrics)) - require.Equal(t, 5, len(rm.ScopeMetrics[0].Metrics), "expected 5 Metrics, got %d", len(rm.ScopeMetrics[0].Metrics)) + require.Equal(t, 6, len(rm.ScopeMetrics[0].Metrics), "expected 6 Metrics, got %d", len(rm.ScopeMetrics[0].Metrics)) metricdatatest.AssertEqual(t, want, rm.ScopeMetrics[0], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) metricdatatest.AssertEqual(t, httpRequestsMetric, rm.ScopeMetrics[0].Metrics[0], metricdatatest.IgnoreTimestamp()) + metricdatatest.AssertEqual(t, requestDurationMetric, rm.ScopeMetrics[0].Metrics[1], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) metricdatatest.AssertEqual(t, requestContentLengthMetric, rm.ScopeMetrics[0].Metrics[2], metricdatatest.IgnoreTimestamp()) metricdatatest.AssertEqual(t, responseContentLengthMetric, rm.ScopeMetrics[0].Metrics[3], metricdatatest.IgnoreTimestamp()) metricdatatest.AssertEqual(t, requestInFlightMetric, rm.ScopeMetrics[0].Metrics[4], metricdatatest.IgnoreTimestamp()) + metricdatatest.AssertEqual(t, operationPlanningTimeMetric, rm.ScopeMetrics[0].Metrics[5], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) // make a second request and assert that we're now hitting the validation cache @@ -715,8 +756,9 @@ func TestTelemetry(t *testing.T) { sn = exporter.GetSpans().Snapshots() - require.Len(t, sn, 8, "expected 8 spans, got %d", len(sn)) - require.NotEqualf(t, "Load Persisted Operation", sn[1].Name(), "excepted no span because it was a cache hit") + require.Len(t, sn, 9, "expected 9 spans, got %d", len(sn)) + require.Equalf(t, "Load Persisted Operation", sn[1].Name(), "A cache hit") + require.Contains(t, sn[1].Attributes(), otel.WgEnginePersistedOperationCacheHit.Bool(true)) }) }) @@ -729,13 +771,13 @@ func TestTelemetry(t *testing.T) { testenv.Run(t, &testenv.Config{ TraceExporter: exporter, MetricReader: metricReader, - OtelResourceAttributes: []config.CustomStaticAttribute{ + CustomResourceAttributes: []config.CustomStaticAttribute{ { Key: "custom.resource", Value: "value", }, }, - OtelAttributes: []config.CustomAttribute{ + CustomTelemetryAttributes: []config.CustomAttribute{ { Key: "custom", Default: "value", @@ -1050,6 +1092,34 @@ func TestTelemetry(t *testing.T) { }, } + operationPlanningTimeMetric := metricdata.Metrics{ + Name: "router.graphql.operation.planning_time", + Description: "Operation planning time in milliseconds", + Unit: "ms", + Data: metricdata.Histogram[float64]{ + Temporality: metricdata.CumulativeTemporality, + DataPoints: []metricdata.HistogramDataPoint[float64]{ + { + Attributes: attribute.NewSet( + attribute.String("custom", "value"), + otel.WgEnginePlanCacheHit.Bool(false), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("14226210703439426856"), + otel.WgOperationName.String(""), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + ), + Sum: 0, + }, + }, + }, + } + want := metricdata.ScopeMetrics{ Scope: instrumentation.Scope{ Name: "cosmo.router", @@ -1062,6 +1132,7 @@ func TestTelemetry(t *testing.T) { requestContentLengthMetric, responseContentLengthMetric, requestInFlightMetric, + operationPlanningTimeMetric, }, } @@ -1080,14 +1151,15 @@ func TestTelemetry(t *testing.T) { require.Contains(t, rm.Resource.Attributes(), attribute.String("service.name", "cosmo-router")) require.Equal(t, 1, len(rm.ScopeMetrics), "expected 1 ScopeMetrics, got %d", len(rm.ScopeMetrics)) - require.Equal(t, 5, len(rm.ScopeMetrics[0].Metrics), "expected 5 Metrics, got %d", len(rm.ScopeMetrics[0].Metrics)) + require.Equal(t, 6, len(rm.ScopeMetrics[0].Metrics), "expected 6 Metrics, got %d", len(rm.ScopeMetrics[0].Metrics)) metricdatatest.AssertEqual(t, want, rm.ScopeMetrics[0], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) - metricdatatest.AssertEqual(t, httpRequestsMetric, rm.ScopeMetrics[0].Metrics[0], metricdatatest.IgnoreTimestamp()) + metricdatatest.AssertEqual(t, requestDurationMetric, rm.ScopeMetrics[0].Metrics[1], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) metricdatatest.AssertEqual(t, requestContentLengthMetric, rm.ScopeMetrics[0].Metrics[2], metricdatatest.IgnoreTimestamp()) metricdatatest.AssertEqual(t, responseContentLengthMetric, rm.ScopeMetrics[0].Metrics[3], metricdatatest.IgnoreTimestamp()) metricdatatest.AssertEqual(t, requestInFlightMetric, rm.ScopeMetrics[0].Metrics[4], metricdatatest.IgnoreTimestamp()) + metricdatatest.AssertEqual(t, operationPlanningTimeMetric, rm.ScopeMetrics[0].Metrics[5], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) @@ -1101,13 +1173,13 @@ func TestTelemetry(t *testing.T) { testenv.Run(t, &testenv.Config{ TraceExporter: exporter, MetricReader: metricReader, - OtelResourceAttributes: []config.CustomStaticAttribute{ + CustomResourceAttributes: []config.CustomStaticAttribute{ { Key: "custom.resource", Value: "value", }, }, - OtelAttributes: []config.CustomAttribute{ + CustomTelemetryAttributes: []config.CustomAttribute{ { Key: "custom", Default: "value", @@ -1419,6 +1491,34 @@ func TestTelemetry(t *testing.T) { }, } + operationPlanningTimeMetric := metricdata.Metrics{ + Name: "router.graphql.operation.planning_time", + Description: "Operation planning time in milliseconds", + Unit: "ms", + Data: metricdata.Histogram[float64]{ + Temporality: metricdata.CumulativeTemporality, + DataPoints: []metricdata.HistogramDataPoint[float64]{ + { + Attributes: attribute.NewSet( + attribute.String("custom", "value"), + otel.WgEnginePlanCacheHit.Bool(false), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("14226210703439426856"), + otel.WgOperationName.String(""), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + ), + Sum: 0, + }, + }, + }, + } + want := metricdata.ScopeMetrics{ Scope: instrumentation.Scope{ Name: "cosmo.router", @@ -1431,6 +1531,7 @@ func TestTelemetry(t *testing.T) { requestContentLengthMetric, responseContentLengthMetric, requestInFlightMetric, + operationPlanningTimeMetric, }, } @@ -1449,14 +1550,16 @@ func TestTelemetry(t *testing.T) { require.Contains(t, rm.Resource.Attributes(), attribute.String("service.name", "cosmo-router")) require.Equal(t, 1, len(rm.ScopeMetrics), "expected 1 ScopeMetrics, got %d", len(rm.ScopeMetrics)) - require.Equal(t, 5, len(rm.ScopeMetrics[0].Metrics), "expected 5 Metrics, got %d", len(rm.ScopeMetrics[0].Metrics)) + require.Equal(t, 6, len(rm.ScopeMetrics[0].Metrics), "expected 6 Metrics, got %d", len(rm.ScopeMetrics[0].Metrics)) metricdatatest.AssertEqual(t, want, rm.ScopeMetrics[0], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) metricdatatest.AssertEqual(t, httpRequestsMetric, rm.ScopeMetrics[0].Metrics[0], metricdatatest.IgnoreTimestamp()) + metricdatatest.AssertEqual(t, requestDurationMetric, rm.ScopeMetrics[0].Metrics[1], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) metricdatatest.AssertEqual(t, requestContentLengthMetric, rm.ScopeMetrics[0].Metrics[2], metricdatatest.IgnoreTimestamp()) metricdatatest.AssertEqual(t, responseContentLengthMetric, rm.ScopeMetrics[0].Metrics[3], metricdatatest.IgnoreTimestamp()) metricdatatest.AssertEqual(t, requestInFlightMetric, rm.ScopeMetrics[0].Metrics[4], metricdatatest.IgnoreTimestamp()) + metricdatatest.AssertEqual(t, operationPlanningTimeMetric, rm.ScopeMetrics[0].Metrics[5], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) @@ -1518,7 +1621,7 @@ func TestTelemetry(t *testing.T) { require.Contains(t, sn[6].Attributes(), otel.WgFeatureFlag.String("myff")) require.Equal(t, "Operation - Execute", sn[7].Name()) - require.Len(t, sn[7].Attributes(), 6) + require.Len(t, sn[7].Attributes(), 12) require.Contains(t, sn[7].Attributes(), otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMyFF())) require.Contains(t, sn[7].Attributes(), otel.WgFeatureFlag.String("myff")) @@ -1765,6 +1868,34 @@ func TestTelemetry(t *testing.T) { }, } + operationPlanningTimeMetric := metricdata.Metrics{ + Name: "router.graphql.operation.planning_time", + Description: "Operation planning time in milliseconds", + Unit: "ms", + Data: metricdata.Histogram[float64]{ + Temporality: metricdata.CumulativeTemporality, + DataPoints: []metricdata.HistogramDataPoint[float64]{ + { + Attributes: attribute.NewSet( + otel.WgEnginePlanCacheHit.Bool(false), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("14226210703439426856"), + otel.WgOperationName.String(""), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMyFF()), + otel.WgRouterVersion.String("dev"), + otel.WgFeatureFlag.String("myff"), + ), + Sum: 0, + }, + }, + }, + } + want := metricdata.ScopeMetrics{ Scope: instrumentation.Scope{ Name: "cosmo.router", @@ -1777,19 +1908,21 @@ func TestTelemetry(t *testing.T) { requestContentLengthMetric, responseContentLengthMetric, requestInFlightMetric, + operationPlanningTimeMetric, }, } require.Equal(t, 1, len(rm.ScopeMetrics), "expected 1 ScopeMetrics, got %d", len(rm.ScopeMetrics)) - require.Equal(t, 5, len(rm.ScopeMetrics[0].Metrics), "expected 5 Metrics, got %d", len(rm.ScopeMetrics[0].Metrics)) + require.Equal(t, 6, len(rm.ScopeMetrics[0].Metrics), "expected 6 Metrics, got %d", len(rm.ScopeMetrics[0].Metrics)) metricdatatest.AssertEqual(t, want, rm.ScopeMetrics[0], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) metricdatatest.AssertEqual(t, httpRequestsMetric, rm.ScopeMetrics[0].Metrics[0], metricdatatest.IgnoreTimestamp()) + metricdatatest.AssertEqual(t, requestDurationMetric, rm.ScopeMetrics[0].Metrics[1], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) metricdatatest.AssertEqual(t, requestContentLengthMetric, rm.ScopeMetrics[0].Metrics[2], metricdatatest.IgnoreTimestamp()) metricdatatest.AssertEqual(t, responseContentLengthMetric, rm.ScopeMetrics[0].Metrics[3], metricdatatest.IgnoreTimestamp()) metricdatatest.AssertEqual(t, requestInFlightMetric, rm.ScopeMetrics[0].Metrics[4], metricdatatest.IgnoreTimestamp()) - + metricdatatest.AssertEqual(t, operationPlanningTimeMetric, rm.ScopeMetrics[0].Metrics[5], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) @@ -2063,11 +2196,13 @@ func TestTelemetry(t *testing.T) { require.Equal(t, "Engine - Fetch", sn[6].Name()) require.Equal(t, trace.SpanKindInternal, sn[6].SpanKind()) + require.Lenf(t, sn[6].Attributes(), 14, "expected 14 attributes, got %d", len(sn[8].Attributes())) require.Equal(t, sdktrace.Status{Code: codes.Unset}, sn[6].Status()) require.Equal(t, "Engine - Fetch", sn[8].Name()) require.Equal(t, trace.SpanKindInternal, sn[8].SpanKind()) require.Equal(t, codes.Error, sn[8].Status().Code) + require.Lenf(t, sn[8].Attributes(), 14, "expected 14 attributes, got %d", len(sn[8].Attributes())) require.Contains(t, sn[8].Status().Description, "connect: connection refused\nFailed to fetch from Subgraph 'products' at Path: 'employees'.") events := sn[8].Events() @@ -2112,6 +2247,8 @@ func TestTelemetry(t *testing.T) { require.Equal(t, trace.SpanKindInternal, sn[6].SpanKind()) require.Equal(t, sdktrace.Status{Code: codes.Unset}, sn[6].Status()) + require.Lenf(t, sn[6].Attributes(), 14, "expected 14 attributes, got %d", len(sn[6].Attributes())) + given := attribute.NewSet(sn[6].Attributes()...) want := attribute.NewSet([]attribute.KeyValue{ semconv.HTTPStatusCode(200), @@ -2136,6 +2273,8 @@ func TestTelemetry(t *testing.T) { require.Equal(t, "Engine - Fetch", sn[8].Name()) require.Equal(t, trace.SpanKindInternal, sn[8].SpanKind()) + require.Lenf(t, sn[8].Attributes(), 14, "expected 14 attributes, got %d", len(sn[6].Attributes())) + given = attribute.NewSet(sn[8].Attributes()...) want = attribute.NewSet([]attribute.KeyValue{ otel.WgSubgraphName.String("products"), @@ -2152,8 +2291,6 @@ func TestTelemetry(t *testing.T) { otel.WgOperationType.String("query"), otel.WgOperationProtocol.String("http"), otel.WgOperationHash.String("16884868987896027258"), - // Downstream errors - otel.WgSubgraphErrorExtendedCode.String("UNAUTHORIZED,YOUR_ERROR_CODE"), }...) require.True(t, given.Equals(&want)) @@ -2184,7 +2321,7 @@ func TestTelemetry(t *testing.T) { }) }) - t.Run("Operation parsing errors are traced", func(t *testing.T) { + t.Run("Operation parsing errors are tracked", func(t *testing.T) { t.Parallel() exporter := tracetest.NewInMemoryExporter(t) @@ -2204,6 +2341,17 @@ func TestTelemetry(t *testing.T) { require.Equal(t, codes.Error, sn[1].Status().Code) require.Contains(t, sn[1].Status().Description, "unexpected literal - got: UNDEFINED want one of: [ENUM TYPE UNION QUERY INPUT EXTEND SCHEMA SCALAR FRAGMENT INTERFACE DIRECTIVE]") + require.Lenf(t, sn[1].Attributes(), 8, "expected 14 attributes, got %d", len(sn[1].Attributes())) + + require.Contains(t, sn[1].Attributes(), otel.WgRouterVersion.String("dev")) + require.Contains(t, sn[1].Attributes(), otel.WgRouterClusterName.String("")) + require.Contains(t, sn[1].Attributes(), otel.WgFederatedGraphID.String("graph")) + require.Contains(t, sn[1].Attributes(), otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain())) + require.Contains(t, sn[1].Attributes(), otel.WgClientName.String("unknown")) + require.Contains(t, sn[1].Attributes(), otel.WgClientVersion.String("missing")) + require.Contains(t, sn[1].Attributes(), otel.WgOperationProtocol.String("http")) + require.Contains(t, sn[1].Attributes(), otel.WgRequestError.Bool(true)) + events := sn[1].Events() require.Len(t, events, 1, "expected 1 event because the GraphQL parsing failed") require.Equal(t, "exception", events[0].Name) @@ -2213,13 +2361,16 @@ func TestTelemetry(t *testing.T) { require.Equal(t, codes.Error, sn[2].Status().Code) require.Contains(t, sn[2].Status().Description, "unexpected literal - got: UNDEFINED want one of: [ENUM TYPE UNION QUERY INPUT EXTEND SCHEMA SCALAR FRAGMENT INTERFACE DIRECTIVE]") + require.Lenf(t, sn[2].Attributes(), 23, "expected 23 attributes, got %d", len(sn[2].Attributes())) + require.Contains(t, sn[2].Attributes(), otel.WgRequestError.Bool(true)) + events = sn[2].Events() require.Len(t, events, 1, "expected 1 event because the GraphQL request failed") require.Equal(t, "exception", events[0].Name) }) }) - t.Run("Operation normalization errors are traced", func(t *testing.T) { + t.Run("Operation normalization errors are tracked", func(t *testing.T) { t.Parallel() exporter := tracetest.NewInMemoryExporter(t) @@ -2487,4 +2638,623 @@ func TestTelemetry(t *testing.T) { require.Equal(t, "version", clientVersion) }) }) + + t.Run("Custom Metric Attributes", func(t *testing.T) { + + t.Run("Custom attributes are added to all metrics / subgraph error", func(t *testing.T) { + exporter := tracetest.NewInMemoryExporter(t) + metricReader := metric.NewManualReader() + + testenv.Run(t, &testenv.Config{ + TraceExporter: exporter, + MetricReader: metricReader, + CustomMetricAttributes: []config.CustomAttribute{ + { + Key: "from_header", + ValueFrom: &config.CustomDynamicAttribute{ + RequestHeader: "x-custom-header", + }, + }, + { + Key: "sha256", + ValueFrom: &config.CustomDynamicAttribute{ + ContextField: core.ContextFieldOperationSha256, + }, + }, + { + Key: "error_codes", + ValueFrom: &config.CustomDynamicAttribute{ + ContextField: core.ContextFieldGraphQLErrorCodes, + }, + }, + { + Key: "error_services", + ValueFrom: &config.CustomDynamicAttribute{ + ContextField: core.ContextFieldGraphQLErrorServices, + }, + }, + { + Key: "services", + ValueFrom: &config.CustomDynamicAttribute{ + ContextField: core.ContextFieldOperationServices, + }, + }, + }, + Subgraphs: testenv.SubgraphsConfig{ + Products: testenv.SubgraphConfig{ + Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusForbidden) + _, _ = w.Write([]byte(`{"errors":[{"message":"Unauthorized","path": ["foo"],"extensions":{"code":"UNAUTHORIZED"}},{"message":"MyErrorMessage","path": ["bar"],"extensions":{"code":"YOUR_ERROR_CODE"}}]}`)) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + res := xEnv.MakeGraphQLRequestOK(testenv.GraphQLRequest{ + Header: map[string][]string{ + "x-custom-header": {"custom-value"}, + }, + Query: `query myQuery { employees { id details { forename surname } notes } }`, + }) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'products' at Path 'employees'.","extensions":{"errors":[{"message":"Unauthorized","path":["foo"],"extensions":{"code":"UNAUTHORIZED"}},{"message":"MyErrorMessage","path":["bar"],"extensions":{"code":"YOUR_ERROR_CODE"}}],"statusCode":403}}],"data":{"employees":[{"id":1,"details":{"forename":"Jens","surname":"Neuse"},"notes":null},{"id":2,"details":{"forename":"Dustin","surname":"Deus"},"notes":null},{"id":3,"details":{"forename":"Stefan","surname":"Avram"},"notes":null},{"id":4,"details":{"forename":"Björn","surname":"Schwenzer"},"notes":null},{"id":5,"details":{"forename":"Sergiy","surname":"Petrunin"},"notes":null},{"id":7,"details":{"forename":"Suvij","surname":"Surya"},"notes":null},{"id":8,"details":{"forename":"Nithin","surname":"Kumar"},"notes":null},{"id":10,"details":{"forename":"Eelco","surname":"Wiersma"},"notes":null},{"id":11,"details":{"forename":"Alexandra","surname":"Neuse"},"notes":null},{"id":12,"details":{"forename":"David","surname":"Stutt"},"notes":null}]}}`, res.Body) + + /** + * Traces + */ + + sn := exporter.GetSpans().Snapshots() + require.Len(t, sn, 11, "expected 11 spans, got %d", len(sn)) + + // No additional attributes are added to the spans + + /** + * Metrics + */ + rm := metricdata.ResourceMetrics{} + err := metricReader.Collect(context.Background(), &rm) + require.NoError(t, err) + + require.Equal(t, 1, len(rm.ScopeMetrics), "expected 1 ScopeMetrics, got %d", len(rm.ScopeMetrics)) + require.Equal(t, 7, len(rm.ScopeMetrics[0].Metrics), "expected 7 Metrics, got %d", len(rm.ScopeMetrics[0].Metrics)) + + httpRequestsMetric := metricdata.Metrics{ + Name: "router.http.requests", + Description: "Total number of requests", + Unit: "", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + IsMonotonic: true, + DataPoints: []metricdata.DataPoint[int64]{ + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgSubgraphID.String("3"), + otel.WgSubgraphName.String("products"), + ), + Value: 1, + }, + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + semconv.HTTPStatusCode(200), + attribute.StringSlice("error_codes", []string{"UNAUTHORIZED", "YOUR_ERROR_CODE"}), + attribute.StringSlice("services", []string{"employees", "products"}), + attribute.StringSlice("error_services", []string{"products"}), + otel.WgRequestError.Bool(true), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + ), + Value: 1, + }, + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgSubgraphID.String("0"), + otel.WgSubgraphName.String("employees"), + ), + Value: 1, + }, + }, + }, + } + + requestDurationMetric := metricdata.Metrics{ + Name: "router.http.request.duration_milliseconds", + Description: "Server latency in milliseconds", + Unit: "ms", + Data: metricdata.Histogram[float64]{ + Temporality: metricdata.CumulativeTemporality, + DataPoints: []metricdata.HistogramDataPoint[float64]{ + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgSubgraphID.String("3"), + otel.WgSubgraphName.String("products"), + ), + Sum: 0, + }, + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + semconv.HTTPStatusCode(200), + attribute.StringSlice("error_codes", []string{"UNAUTHORIZED", "YOUR_ERROR_CODE"}), + attribute.StringSlice("services", []string{"employees", "products"}), + attribute.StringSlice("error_services", []string{"products"}), + otel.WgRequestError.Bool(true), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + ), + Sum: 0, + }, + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgSubgraphID.String("0"), + otel.WgSubgraphName.String("employees"), + ), + Sum: 0, + }, + }, + }, + } + + requestContentLengthMetric := metricdata.Metrics{ + Name: "router.http.request.content_length", + Description: "Total number of request bytes", + Unit: "bytes", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + IsMonotonic: true, + DataPoints: []metricdata.DataPoint[int64]{ + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgSubgraphID.String("3"), + otel.WgSubgraphName.String("products"), + ), + Value: 494, + }, + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + attribute.StringSlice("error_codes", []string{"UNAUTHORIZED", "YOUR_ERROR_CODE"}), + attribute.StringSlice("services", []string{"employees", "products"}), + attribute.StringSlice("error_services", []string{"products"}), + semconv.HTTPStatusCode(200), + otel.WgRequestError.Bool(true), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + ), + Value: 81, + }, + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgSubgraphID.String("0"), + otel.WgSubgraphName.String("employees"), + ), + Value: 66, + }, + }, + }, + } + + responseContentLengthMetric := metricdata.Metrics{ + Name: "router.http.response.content_length", + Description: "Total number of response bytes", + Unit: "bytes", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + IsMonotonic: true, + DataPoints: []metricdata.DataPoint[int64]{ + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + semconv.HTTPStatusCode(200), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgSubgraphID.String("0"), + otel.WgSubgraphName.String("employees"), + ), + Value: 863, + }, + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + semconv.HTTPStatusCode(403), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgSubgraphID.String("3"), + otel.WgSubgraphName.String("products"), + ), + Value: 177, + }, + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + attribute.StringSlice("error_codes", []string{"UNAUTHORIZED", "YOUR_ERROR_CODE"}), + attribute.StringSlice("services", []string{"employees", "products"}), + attribute.StringSlice("error_services", []string{"products"}), + semconv.HTTPStatusCode(200), + otel.WgRequestError.Bool(true), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + ), + Value: 1046, + }, + }, + }, + } + + requestInFlightMetric := metricdata.Metrics{ + Name: "router.http.requests.in_flight", + Description: "Number of requests in flight", + Unit: "", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + DataPoints: []metricdata.DataPoint[int64]{ + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgSubgraphID.String("0"), + otel.WgSubgraphName.String("employees"), + ), + Value: 0, + }, + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgSubgraphID.String("3"), + otel.WgSubgraphName.String("products"), + ), + Value: 0, + }, + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationProtocol.String("http"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + ), + Value: 0, + }, + }, + }, + } + + operationPlanningTimeMetric := metricdata.Metrics{ + Name: "router.graphql.operation.planning_time", + Description: "Operation planning time in milliseconds", + Unit: "ms", + Data: metricdata.Histogram[float64]{ + Temporality: metricdata.CumulativeTemporality, + DataPoints: []metricdata.HistogramDataPoint[float64]{ + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + otel.WgEnginePlanCacheHit.Bool(false), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + ), + Sum: 0, + }, + }, + }, + } + + failedRequestsMetric := metricdata.Metrics{ + Name: "router.http.requests.error", + Description: "Total number of failed requests", + Unit: "", + Data: metricdata.Sum[int64]{ + Temporality: metricdata.CumulativeTemporality, + IsMonotonic: true, + DataPoints: []metricdata.DataPoint[int64]{ + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + semconv.HTTPStatusCode(403), + otel.WgComponentName.String("engine-loader"), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgSubgraphID.String("3"), + otel.WgSubgraphName.String("products"), + ), + Value: 1, + }, + { + Attributes: attribute.NewSet( + attribute.String("from_header", "custom-value"), + attribute.String("sha256", "b0066f89f91315b4610ed127be677e6cea380494eb20c83cc121c97552ca44b2"), + attribute.StringSlice("error_codes", []string{"UNAUTHORIZED", "YOUR_ERROR_CODE"}), + attribute.StringSlice("services", []string{"employees", "products"}), + attribute.StringSlice("error_services", []string{"products"}), + otel.WgClientName.String("unknown"), + otel.WgClientVersion.String("missing"), + otel.WgFederatedGraphID.String("graph"), + otel.WgOperationHash.String("16884868987896027258"), + otel.WgOperationName.String("myQuery"), + otel.WgOperationProtocol.String("http"), + otel.WgOperationType.String("query"), + otel.WgRouterClusterName.String(""), + otel.WgRouterConfigVersion.String(xEnv.RouterConfigVersionMain()), + otel.WgRouterVersion.String("dev"), + otel.WgRequestError.Bool(true), + ), + Value: 1, + }, + }, + }, + } + + want := metricdata.ScopeMetrics{ + Scope: instrumentation.Scope{ + Name: "cosmo.router", + SchemaURL: "", + Version: "0.0.1", + }, + Metrics: []metricdata.Metrics{ + httpRequestsMetric, + requestDurationMetric, + requestContentLengthMetric, + responseContentLengthMetric, + requestInFlightMetric, + operationPlanningTimeMetric, + failedRequestsMetric, + }, + } + metricdatatest.AssertEqual(t, want, rm.ScopeMetrics[0], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) + }) + }) + + t.Run("Tracing is not affected by custom metric attributes", func(t *testing.T) { + t.Parallel() + + metricReader := metric.NewManualReader() + exporter := tracetest.NewInMemoryExporter(t) + defer exporter.Reset() + + testenv.Run(t, &testenv.Config{ + TraceExporter: exporter, + MetricReader: metricReader, + CustomMetricAttributes: []config.CustomAttribute{ + { + Key: "from_header", + ValueFrom: &config.CustomDynamicAttribute{ + RequestHeader: "x-custom-header", + }, + }, + { + Key: "services", + ValueFrom: &config.CustomDynamicAttribute{ + ContextField: core.ContextFieldOperationServices, + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + res := xEnv.MakeGraphQLRequestOK(testenv.GraphQLRequest{ + Header: map[string][]string{ + "x-custom-header": {"custom-value"}, + }, + Query: `query { employees { id } }`, + }) + require.JSONEq(t, employeesIDData, res.Body) + + sn := exporter.GetSpans().Snapshots() + require.Len(t, sn, 9, "expected 9 spans, got %d", len(sn)) + + /** + * Spans + */ + + // Pre-Handler Operation Read + + require.Equal(t, "HTTP - Read Body", sn[0].Name()) + require.Len(t, sn[0].Resource().Attributes(), 9) + require.Len(t, sn[0].Attributes(), 7) + + require.Equal(t, "Operation - Parse", sn[1].Name()) + require.Len(t, sn[1].Resource().Attributes(), 9) + require.Len(t, sn[1].Attributes(), 7) + + require.Equal(t, "Operation - Normalize", sn[2].Name()) + require.Len(t, sn[2].Resource().Attributes(), 9) + require.Len(t, sn[2].Attributes(), 10) + + require.Equal(t, "Operation - Validate", sn[3].Name()) + require.Len(t, sn[3].Resource().Attributes(), 9) + require.Len(t, sn[3].Attributes(), 11) + + require.Equal(t, "Operation - Plan", sn[4].Name()) + require.Len(t, sn[4].Resource().Attributes(), 9) + require.Len(t, sn[4].Attributes(), 12) + + // Engine Transport + require.Equal(t, "query unnamed", sn[5].Name()) + require.Len(t, sn[5].Resource().Attributes(), 9) + require.Len(t, sn[5].Attributes(), 21) + + require.Equal(t, "Engine - Fetch", sn[6].Name()) + require.Len(t, sn[6].Resource().Attributes(), 9) + require.Len(t, sn[6].Attributes(), 14) + + // GraphQL handler + require.Equal(t, "Operation - Execute", sn[7].Name()) + require.Len(t, sn[7].Resource().Attributes(), 9) + require.Len(t, sn[7].Attributes(), 11) + + // Root Server middleware + require.Equal(t, "query unnamed", sn[8].Name()) + require.Len(t, sn[8].Resource().Attributes(), 9) + require.Len(t, sn[8].Attributes(), 26) + }) + }) + + }) } diff --git a/router-tests/testenv/testenv.go b/router-tests/testenv/testenv.go index 4d4593b821..a3c34c259f 100644 --- a/router-tests/testenv/testenv.go +++ b/router-tests/testenv/testenv.go @@ -120,8 +120,9 @@ type Config struct { DisableParentBasedSampler bool TLSConfig *core.TlsConfig TraceExporter trace.SpanExporter - OtelAttributes []config.CustomAttribute - OtelResourceAttributes []config.CustomStaticAttribute + CustomMetricAttributes []config.CustomAttribute + CustomTelemetryAttributes []config.CustomAttribute + CustomResourceAttributes []config.CustomStaticAttribute MetricReader metric.Reader PrometheusRegistry *prometheus.Registry ShutdownDelay time.Duration @@ -700,8 +701,7 @@ func configureRouter(listenerAddr string, testConfig *Config, routerConfig *node c := core.TraceConfigFromTelemetry(&config.Telemetry{ ServiceName: "cosmo-router", - Attributes: testConfig.OtelAttributes, - ResourceAttributes: testConfig.OtelResourceAttributes, + ResourceAttributes: testConfig.CustomResourceAttributes, Tracing: config.Tracing{ Enabled: true, SamplingRate: 1, @@ -715,7 +715,13 @@ func configureRouter(listenerAddr string, testConfig *Config, routerConfig *node c.TestMemoryExporter = testConfig.TraceExporter - routerOpts = append(routerOpts, core.WithTracing(c)) + routerOpts = append(routerOpts, + core.WithTracing(c), + ) + } + + if testConfig.CustomTelemetryAttributes != nil { + routerOpts = append(routerOpts, core.WithTelemetryAttributes(testConfig.CustomTelemetryAttributes)) } var prometheusConfig rmetric.PrometheusConfig @@ -736,10 +742,10 @@ func configureRouter(listenerAddr string, testConfig *Config, routerConfig *node if testConfig.MetricReader != nil { c := core.MetricConfigFromTelemetry(&config.Telemetry{ ServiceName: "cosmo-router", - Attributes: testConfig.OtelAttributes, - ResourceAttributes: testConfig.OtelResourceAttributes, + ResourceAttributes: testConfig.CustomResourceAttributes, Tracing: config.Tracing{}, Metrics: config.Metrics{ + Attributes: testConfig.CustomMetricAttributes, Prometheus: config.Prometheus{ Enabled: true, }, diff --git a/router/config.yaml b/router/config.yaml index 91c9c2bd87..44e91f5c12 100644 --- a/router/config.yaml +++ b/router/config.yaml @@ -3,4 +3,4 @@ # See pkg/config/config.go for the full list of configuration options. # This file is used to debugging purposes only. -version: "1" +version: "1" \ No newline at end of file diff --git a/router/core/context.go b/router/core/context.go index c557500ea7..9d71044ea5 100644 --- a/router/core/context.go +++ b/router/core/context.go @@ -5,12 +5,10 @@ import ( "github.com/wundergraph/cosmo/router/pkg/config" "net/http" "net/url" - "strconv" "sync" "time" graphqlmetrics "github.com/wundergraph/cosmo/router/gen/proto/wg/cosmo/graphqlmetrics/v1" - "github.com/wundergraph/cosmo/router/pkg/otel" "github.com/wundergraph/graphql-go-tools/v2/pkg/engine/datasource/httpclient" "go.opentelemetry.io/otel/attribute" @@ -21,9 +19,12 @@ import ( "go.uber.org/zap" ) -type requestContextKey = struct{} -type subgraphResolverContextKey struct{} -type baseAttributesContextKey struct{} +type contextKey int + +const ( + requestContextKey contextKey = iota + subgraphResolverContextKey +) var _ RequestContext = (*requestContext)(nil) @@ -125,6 +126,49 @@ type RequestContext interface { Authentication() authentication.Authentication } +type requestTelemetryAttributes struct { + // attributes are the base attributes for traces and metrics + attributes []attribute.KeyValue + // metricAttributes are the attributes for metrics only + metricAttributes []attribute.KeyValue + // metricSetAttributes is map to quickly check if a metric attribute is set and to what key it is remapped + metricSetAttributes map[string]string +} + +func (r *requestTelemetryAttributes) AddCustomMetricStringSliceAttr(key string, values []string) { + if remapKey, ok := r.metricSetAttributes[key]; ok && len(values) > 0 { + r.metricAttributes = append(r.metricAttributes, attribute.StringSlice(remapKey, values)) + } +} + +func (r *requestTelemetryAttributes) AddCustomMetricStringAttr(key string, value string) { + if remapKey, ok := r.metricSetAttributes[key]; ok && value != "" { + r.metricAttributes = append(r.metricAttributes, attribute.String(remapKey, value)) + } +} + +func (r *requestTelemetryAttributes) AddCommonAttribute(vals ...attribute.KeyValue) { + r.attributes = append(r.attributes, vals...) +} + +func (r *requestTelemetryAttributes) CommonAttrs() []attribute.KeyValue { + return r.attributes +} + +func (r *requestTelemetryAttributes) MetricAttrs(includeCommon bool) []attribute.KeyValue { + if includeCommon { + attrs := make([]attribute.KeyValue, 0, len(r.attributes)+len(r.metricAttributes)) + attrs = append(attrs, r.attributes...) + attrs = append(attrs, r.metricAttributes...) + return attrs + } + return r.metricAttributes +} + +func (r *requestTelemetryAttributes) AddMetricAttribute(vals ...attribute.KeyValue) { + r.metricAttributes = append(r.attributes, vals...) +} + // requestContext is the default implementation of RequestContext // It is accessible to custom modules in the request lifecycle type requestContext struct { @@ -144,8 +188,14 @@ type requestContext struct { operation *operationContext // subgraphResolver can be used to resolve Subgraph by ID or by request subgraphResolver *SubgraphResolver - // dataSources the list of datasources involved in resolving the operation - dataSources []resolve.DataSourceInfo + // dataSourceNames the list of datasource involved in resolving the operation + dataSourceNames []string + // graphQLErrorServices are the services that produced the GraphQL errors + graphQLErrorServices []string + // graphQLErrorCodes are the error codes of the GraphQL errors + graphQLErrorCodes []string + // telemetry are the base telemetry information of the request + telemetry *requestTelemetryAttributes } func (c *requestContext) Operation() OperationContext { @@ -157,14 +207,14 @@ func (c *requestContext) Request() *http.Request { } func withRequestContext(ctx context.Context, operation *requestContext) context.Context { - return context.WithValue(ctx, requestContextKey{}, operation) + return context.WithValue(ctx, requestContextKey, operation) } func getRequestContext(ctx context.Context) *requestContext { if ctx == nil { return nil } - op := ctx.Value(requestContextKey{}) + op := ctx.Value(requestContextKey) if op == nil { return nil } @@ -379,35 +429,12 @@ type operationContext struct { argumentUsageInfo []*graphqlmetrics.ArgumentUsageInfo inputUsageInfo []*graphqlmetrics.InputUsageInfo - attributes []attribute.KeyValue - parsingTime time.Duration validationTime time.Duration planningTime time.Duration normalizationTime time.Duration } -func (o *operationContext) setAttributes() { - numberOfAttributes := 6 - if o.persistedID != "" { - numberOfAttributes += 1 - } - o.attributes = make([]attribute.KeyValue, numberOfAttributes) - o.attributes[0] = otel.WgClientName.String(o.clientInfo.Name) - o.attributes[1] = otel.WgClientVersion.String(o.clientInfo.Version) - o.attributes[2] = otel.WgOperationName.String(o.Name()) - o.attributes[3] = otel.WgOperationType.String(o.Type()) - o.attributes[4] = otel.WgOperationProtocol.String(o.Protocol().String()) - o.attributes[5] = otel.WgOperationHash.String(strconv.FormatUint(o.Hash(), 10)) - if o.persistedID != "" { - o.attributes[6] = otel.WgOperationPersistedID.String(o.PersistedID()) - } -} - -func (o *operationContext) Attributes() []attribute.KeyValue { - return o.attributes -} - func (o *operationContext) Variables() []byte { return o.variables } @@ -484,30 +511,34 @@ func (s *SubgraphResolver) BySubgraphRequest(subgraphRequest *http.Request) *Sub } func withSubgraphResolver(ctx context.Context, resolver *SubgraphResolver) context.Context { - return context.WithValue(ctx, subgraphResolverContextKey{}, resolver) + return context.WithValue(ctx, subgraphResolverContextKey, resolver) } func subgraphResolverFromContext(ctx context.Context) *SubgraphResolver { - resolver, _ := ctx.Value(subgraphResolverContextKey{}).(*SubgraphResolver) + resolver, _ := ctx.Value(subgraphResolverContextKey).(*SubgraphResolver) return resolver } -func withBaseAttributes(ctx context.Context, attributes []attribute.KeyValue) context.Context { - return context.WithValue(ctx, baseAttributesContextKey{}, attributes) -} - -func baseAttributesFromContext(ctx context.Context) []attribute.KeyValue { - attributes, _ := ctx.Value(baseAttributesContextKey{}).([]attribute.KeyValue) - return attributes +type requestContextOptions struct { + operationContext *operationContext + requestLogger *zap.Logger + metricSetAttributes map[string]string + w http.ResponseWriter + r *http.Request } -func buildRequestContext(w http.ResponseWriter, r *http.Request, opContext *operationContext, requestLogger *zap.Logger) *requestContext { +func buildRequestContext(opts requestContextOptions) *requestContext { return &requestContext{ - logger: requestLogger, - keys: map[string]any{}, - responseWriter: w, - request: r, - operation: opContext, - subgraphResolver: subgraphResolverFromContext(r.Context()), + logger: opts.requestLogger, + keys: map[string]any{}, + responseWriter: opts.w, + request: opts.r, + operation: opts.operationContext, + telemetry: &requestTelemetryAttributes{ + metricSetAttributes: opts.metricSetAttributes, + attributes: make([]attribute.KeyValue, 0), + metricAttributes: make([]attribute.KeyValue, 0), + }, + subgraphResolver: subgraphResolverFromContext(opts.r.Context()), } } diff --git a/router/core/engine_loader_hooks.go b/router/core/engine_loader_hooks.go index d644f30e95..ff7230bf44 100644 --- a/router/core/engine_loader_hooks.go +++ b/router/core/engine_loader_hooks.go @@ -5,9 +5,6 @@ import ( "errors" "fmt" "github.com/wundergraph/cosmo/router/internal/unique" - "slices" - "strings" - "github.com/wundergraph/cosmo/router/pkg/metric" rotel "github.com/wundergraph/cosmo/router/pkg/otel" "github.com/wundergraph/graphql-go-tools/v2/pkg/engine/resolve" @@ -16,6 +13,7 @@ import ( "go.opentelemetry.io/otel/codes" semconv "go.opentelemetry.io/otel/semconv/v1.21.0" "go.opentelemetry.io/otel/trace" + "slices" ) var ( @@ -55,13 +53,9 @@ func (f *EngineLoaderHooks) OnLoad(ctx context.Context, ds resolve.DataSourceInf return ctx } - var baseAttributes []attribute.KeyValue - - if attributes := baseAttributesFromContext(reqContext.Request().Context()); attributes != nil { - baseAttributes = append(baseAttributes, attributes...) - } - - ctx, span := f.tracer.Start(ctx, "Engine - Fetch", trace.WithAttributes(baseAttributes...)) + ctx, span := f.tracer.Start(ctx, "Engine - Fetch", + trace.WithAttributes(reqContext.telemetry.CommonAttrs()...), + ) subgraph := reqContext.SubgraphByID(ds.ID) if subgraph != nil { @@ -88,11 +82,12 @@ func (f *EngineLoaderHooks) OnFinished(ctx context.Context, statusCode int, ds r } span := trace.SpanFromContext(ctx) + defer span.End() activeSubgraph := reqContext.SubgraphByID(ds.ID) - baseAttributes := []attribute.KeyValue{ + attributes := []attribute.KeyValue{ // Subgraph response status code semconv.HTTPStatusCode(statusCode), rotel.WgComponentName.String("engine-loader"), @@ -101,11 +96,7 @@ func (f *EngineLoaderHooks) OnFinished(ctx context.Context, statusCode int, ds r } // Ensure common attributes are set - baseAttributes = append(baseAttributes, reqContext.operation.Attributes()...) - - if attributes := baseAttributesFromContext(reqContext.Request().Context()); attributes != nil { - baseAttributes = append(baseAttributes, attributes...) - } + attributes = append(attributes, reqContext.telemetry.CommonAttrs()...) if err != nil { @@ -150,19 +141,10 @@ func (f *EngineLoaderHooks) OnFinished(ctx context.Context, statusCode int, ds r slices.Sort(errorCodesAttr) if len(errorCodesAttr) > 0 { - // Create individual metrics for each error code - for _, code := range errorCodesAttr { - f.metricStore.MeasureRequestError(ctx, - // Add only the error code as an attribute - append(baseAttributes, rotel.WgSubgraphErrorExtendedCode.String(code))..., - ) - } - - // Add this after the metrics have been created - // The list might be used for post-processing - baseAttributes = append(baseAttributes, rotel.WgSubgraphErrorExtendedCode.String(strings.Join(errorCodesAttr, ","))) + mAttr := append(attributes, reqContext.telemetry.MetricAttrs(false)...) + f.metricStore.MeasureRequestError(ctx, mAttr...) } } - span.SetAttributes(baseAttributes...) + span.SetAttributes(attributes...) } diff --git a/router/core/errors.go b/router/core/errors.go index 4ee7d9af1a..ddf7e90372 100644 --- a/router/core/errors.go +++ b/router/core/errors.go @@ -91,22 +91,25 @@ func logInternalErrorsFromReport(report *operationreport.Report, requestLogger * } } -// trackResponseError sets the final response error on the request context and +// trackFinalResponseError sets the final response error on the request context and // attaches it to the span. This is used to process the error in the outer middleware // and therefore only intended to be used in the GraphQL handler. -func trackResponseError(ctx context.Context, err error) { +func trackFinalResponseError(ctx context.Context, err error) { if err == nil { return } - reqCtx := getRequestContext(ctx) - if reqCtx == nil { + span := trace.SpanFromContext(ctx) + requestContext := getRequestContext(ctx) + if requestContext == nil { return } - reqCtx.error = err + requestContext.error = err + requestContext.graphQLErrorServices = getAggregatedSubgraphServiceNames(requestContext.error) + requestContext.graphQLErrorCodes = getAggregatedSubgraphErrorCodes(requestContext.error) - rtrace.AttachErrToSpan(trace.SpanFromContext(ctx), err) + rtrace.AttachErrToSpan(span, err) } func getAggregatedSubgraphErrorCodes(err error) []string { @@ -130,6 +133,14 @@ func getAggregatedSubgraphErrorCodes(err error) []string { return nil } +func getSubgraphNames(ds []resolve.DataSourceInfo) []string { + operationServiceNames := make([]string, 0, len(ds)) + for _, ds := range ds { + operationServiceNames = append(operationServiceNames, ds.Name) + } + return operationServiceNames +} + func getAggregatedSubgraphServiceNames(err error) []string { if unwrapped, ok := err.(MultiError); ok { @@ -156,7 +167,7 @@ func propagateSubgraphErrors(ctx *resolve.Context) { err := ctx.SubgraphErrors() if err != nil { - trackResponseError(ctx.Context(), err) + trackFinalResponseError(ctx.Context(), err) } } diff --git a/router/core/graph_server.go b/router/core/graph_server.go index cdcadf3666..db95d56ee1 100644 --- a/router/core/graph_server.go +++ b/router/core/graph_server.go @@ -153,9 +153,6 @@ func newGraphServer(ctx context.Context, r *Router, routerConfig *nodev1.RouterC rmetric.WithOtlpMeterProvider(s.otlpMeterProvider), rmetric.WithLogger(s.logger), rmetric.WithProcessStartTime(s.processStartTime), - // Don't pass the router config version or feature flags here - // We scope the metrics to the feature flags and config version in the handler - rmetric.WithAttributes(baseOtelAttributes...), ) if err != nil { return nil, fmt.Errorf("failed to create metric handler: %w", err) @@ -427,26 +424,36 @@ func (s *graphServer) buildGraphMux(ctx context.Context, } computeSha256 := false - if s.accessLogsConfig != nil { - for _, aa := range s.accessLogsConfig.Attributes { - if aa.ValueFrom != nil && aa.ValueFrom.ContextField == ContextFieldOperationSha256 { + + // Currently, we only support custom attributes from the context for OTLP metrics + if len(s.metricConfig.Attributes) > 0 { + for _, customAttribute := range s.metricConfig.Attributes { + if customAttribute.ValueFrom != nil && customAttribute.ValueFrom.ContextField == ContextFieldOperationSha256 { computeSha256 = true break } } - if computeSha256 { - operationHashCacheConfig := &ristretto.Config[uint64, string]{ - MaxCost: s.engineExecutionConfiguration.OperationHashCacheSize, - NumCounters: s.engineExecutionConfiguration.OperationHashCacheSize * 10, - BufferItems: 64, - } - gm.operationHashCache, err = ristretto.NewCache[uint64, string](operationHashCacheConfig) - if err != nil { - return nil, fmt.Errorf("failed to create operation hash cache: %w", err) + } else if s.accessLogsConfig != nil { + for _, customAttribute := range s.accessLogsConfig.Attributes { + if customAttribute.ValueFrom != nil && customAttribute.ValueFrom.ContextField == ContextFieldOperationSha256 { + computeSha256 = true + break } } } + if computeSha256 { + operationHashCacheConfig := &ristretto.Config[uint64, string]{ + MaxCost: s.engineExecutionConfiguration.OperationHashCacheSize, + NumCounters: s.engineExecutionConfiguration.OperationHashCacheSize * 10, + BufferItems: 64, + } + gm.operationHashCache, err = ristretto.NewCache[uint64, string](operationHashCacheConfig) + if err != nil { + return nil, fmt.Errorf("failed to create operation hash cache: %w", err) + } + } + metrics := NewRouterMetrics(&routerMetricsConfig{ metrics: s.metricStore, gqlMetricsExporter: s.gqlMetricsExporter, @@ -456,7 +463,6 @@ func (s *graphServer) buildGraphMux(ctx context.Context, }) var traceHandler *rtrace.Middleware - var otelAttributesMappers []func(req *http.Request) []attribute.KeyValue if s.traceConfig.Enabled { spanStartOptions := []oteltrace.SpanStartOption{ @@ -473,10 +479,11 @@ func (s *graphServer) buildGraphMux(ctx context.Context, traceHandler = rtrace.NewMiddleware( rtrace.WithTracePreHandler( func(r *http.Request, w http.ResponseWriter, graphqlExecutionSpan oteltrace.Span) { + reqContext := getRequestContext(r.Context()) span := oteltrace.SpanFromContext(r.Context()) - if attributes := baseAttributesFromContext(r.Context()); attributes != nil { - span.SetAttributes(attributes...) - } + span.SetAttributes(reqContext.telemetry.CommonAttrs()...) + + // Set the trace ID in the response header if s.traceConfig.ResponseTraceHeader.Enabled { spanContext := graphqlExecutionSpan.SpanContext() traceID := spanContext.TraceID().String() @@ -495,14 +502,6 @@ func (s *graphServer) buildGraphMux(ctx context.Context, otelhttp.WithTracerProvider(s.tracerProvider), ), ) - - if s.traceConfig.SpanAttributesMapper != nil { - otelAttributesMappers = append(otelAttributesMappers, s.traceConfig.SpanAttributesMapper) - } - - otelAttributesMappers = append(otelAttributesMappers, func(req *http.Request) []attribute.KeyValue { - return baseOtelAttributes - }) } baseLogFields := []zapcore.Field{ @@ -513,13 +512,24 @@ func (s *graphServer) buildGraphMux(ctx context.Context, baseLogFields = append(baseLogFields, zap.String("feature_flag", featureFlagName)) } + // Currently, we only support custom attributes from the context for OTLP metrics + b := buildAttributesMap(s.metricConfig.Attributes) + // Enrich the request context with the subgraph information which is required for custom modules and tracing subgraphResolver := NewSubgraphResolver(subgraphs) httpRouter.Use(func(h http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { requestLogger := s.logger.With(logging.WithRequestID(middleware.GetReqID(r.Context()))) r = r.WithContext(withSubgraphResolver(r.Context(), subgraphResolver)) - r = r.WithContext(withRequestContext(r.Context(), buildRequestContext(w, r, nil, requestLogger))) + + reqContext := buildRequestContext(requestContextOptions{ + operationContext: nil, + requestLogger: requestLogger, + metricSetAttributes: b, + w: w, + r: r, + }) + r = r.WithContext(withRequestContext(r.Context(), reqContext)) // For debugging purposes, we can validate from what version of the config the request is coming from if s.setConfigVersionHeader { @@ -530,26 +540,44 @@ func (s *graphServer) buildGraphMux(ctx context.Context, }) }) - if s.traceConfig.Enabled { - httpRouter.Use(func(h http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + /** + * Initialize base attributes from headers and other sources + */ - var attributes []attribute.KeyValue - for _, mapper := range otelAttributesMappers { - attributes = append(attributes, mapper(r)...) - } + var commonAttrRequestMapper func(r *http.Request) []attribute.KeyValue - attributes = append(attributes, baseOtelAttributes...) + if len(s.telemetryAttributes) > 0 { + // Common attributes across traces and metrics + commonAttrRequestMapper = buildHeaderAttributesMapper(s.telemetryAttributes) + } - r = r.WithContext( - withBaseAttributes(r.Context(), attributes), - ) + var metricAttrRequestMapper func(r *http.Request) []attribute.KeyValue - h.ServeHTTP(w, r) - }) - }) + // Metric attributes are only used for OTLP metrics and Prometheus metrics + if s.metricConfig.IsEnabled() { + metricAttrRequestMapper = buildHeaderAttributesMapper(s.metricConfig.Attributes) } + httpRouter.Use(func(h http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + + reqContext := getRequestContext(r.Context()) + attributes := make([]attribute.KeyValue, 0, len(baseOtelAttributes)) + attributes = append(attributes, baseOtelAttributes...) + + if commonAttrRequestMapper != nil { + attributes = append(attributes, commonAttrRequestMapper(r)...) + } + if metricAttrRequestMapper != nil { + reqContext.telemetry.AddMetricAttribute(metricAttrRequestMapper(r)...) + } + + reqContext.telemetry.AddCommonAttribute(attributes...) + + h.ServeHTTP(w, r) + }) + }) + // Register the trace middleware before the request logger, so we can log the trace ID if traceHandler != nil { httpRouter.Use(traceHandler.Handler) @@ -560,7 +588,7 @@ func (s *graphServer) buildGraphMux(ctx context.Context, requestlogger.WithDefaultOptions(), requestlogger.WithNoTimeField(), requestlogger.WithFields(baseLogFields...), - requestlogger.WithRequestFields(s.accessLogsFieldHandler), + requestlogger.WithFieldsHandler(s.accessLogsFieldHandler), } if s.ipAnonymization.Enabled { @@ -731,6 +759,7 @@ func (s *graphServer) buildGraphMux(ctx context.Context, EpollKqueueConnBufferSize: s.engineExecutionConfiguration.EpollKqueueConnBufferSize, WebSocketConfiguration: s.webSocketConfiguration, ClientHeader: s.clientHeader, + Attributes: baseOtelAttributes, }) // When the playground path is equal to the graphql path, we need to handle @@ -780,8 +809,11 @@ func (s *graphServer) buildGraphMux(ctx context.Context, return gm, nil } -func (s *graphServer) accessLogsFieldHandler(request *http.Request) []zapcore.Field { - requestContext := getRequestContext(request.Context()) +func (s *graphServer) accessLogsFieldHandler(panicError any, request *http.Request) []zapcore.Field { + reqContext := getRequestContext(request.Context()) + if reqContext == nil { + return nil + } resFields := make([]zapcore.Field, 0, len(s.accessLogsConfig.Attributes)) resFields = append(resFields, zap.String("request_id", middleware.GetReqID(request.Context()))) @@ -791,72 +823,68 @@ func (s *graphServer) accessLogsFieldHandler(request *http.Request) []zapcore.Fi continue } - if field.ValueFrom.ContextField != "" && requestContext != nil && requestContext.operation != nil { + if field.ValueFrom.ContextField != "" && reqContext.operation != nil { switch field.ValueFrom.ContextField { case ContextFieldOperationName: - if v := NewStringLogField(requestContext.operation.name, field); v != zap.Skip() { + if v := NewStringLogField(reqContext.operation.name, field); v != zap.Skip() { resFields = append(resFields, v) } case ContextFieldOperationType: - if v := NewStringLogField(requestContext.operation.opType, field); v != zap.Skip() { + if v := NewStringLogField(reqContext.operation.opType, field); v != zap.Skip() { resFields = append(resFields, v) } case ContextFieldOperationPlanningTime: - if v := NewDurationLogField(requestContext.operation.planningTime, field); v != zap.Skip() { + if v := NewDurationLogField(reqContext.operation.planningTime, field); v != zap.Skip() { resFields = append(resFields, v) } case ContextFieldOperationNormalizationTime: - if v := NewDurationLogField(requestContext.operation.normalizationTime, field); v != zap.Skip() { + if v := NewDurationLogField(reqContext.operation.normalizationTime, field); v != zap.Skip() { resFields = append(resFields, v) } case ContextFieldOperationParsingTime: - if v := NewDurationLogField(requestContext.operation.parsingTime, field); v != zap.Skip() { + if v := NewDurationLogField(reqContext.operation.parsingTime, field); v != zap.Skip() { resFields = append(resFields, v) } case ContextFieldOperationValidationTime: - if v := NewDurationLogField(requestContext.operation.validationTime, field); v != zap.Skip() { + if v := NewDurationLogField(reqContext.operation.validationTime, field); v != zap.Skip() { resFields = append(resFields, v) } case ContextFieldOperationSha256: - if v := NewStringLogField(requestContext.operation.sha256Hash, field); v != zap.Skip() { + if v := NewStringLogField(reqContext.operation.sha256Hash, field); v != zap.Skip() { resFields = append(resFields, v) } case ContextFieldOperationHash: - if requestContext.operation.hash == 0 { - break + if reqContext.operation.hash != 0 { + if v := NewStringLogField(strconv.FormatUint(reqContext.operation.hash, 10), field); v != zap.Skip() { + resFields = append(resFields, v) + } } - resFields = append(resFields, NewStringLogField(strconv.FormatUint(requestContext.operation.hash, 10), field)) case ContextFieldPersistedOperationSha256: - if v := NewStringLogField(requestContext.operation.persistedID, field); v != zap.Skip() { + if v := NewStringLogField(reqContext.operation.persistedID, field); v != zap.Skip() { resFields = append(resFields, v) } case ContextFieldResponseErrorMessage: - if requestContext.error != nil { - if v := NewStringLogField(requestContext.error.Error(), field); v != zap.Skip() { - resFields = append(resFields, v) - } + var errMessage string + if panicError != nil { + errMessage = fmt.Sprintf("%v", panicError) + } else if reqContext.error != nil { + errMessage = reqContext.error.Error() } - case ContextFieldOperationServices: - var operationServiceNames []string - for _, ds := range requestContext.dataSources { - operationServiceNames = append(operationServiceNames, ds.Name) + if v := NewStringLogField(errMessage, field); v != zap.Skip() { + resFields = append(resFields, v) } - if v := NewStringSliceLogField(operationServiceNames, field); v != zap.Skip() { + + case ContextFieldOperationServices: + if v := NewStringSliceLogField(reqContext.dataSourceNames, field); v != zap.Skip() { resFields = append(resFields, v) } case ContextFieldGraphQLErrorServices: - if requestContext.error != nil { - errorServiceNames := getAggregatedSubgraphServiceNames(requestContext.error) - if v := NewStringSliceLogField(errorServiceNames, field); v != zap.Skip() { - resFields = append(resFields, v) - } + if v := NewStringSliceLogField(reqContext.graphQLErrorServices, field); v != zap.Skip() { + resFields = append(resFields, v) } case ContextFieldGraphQLErrorCodes: - if requestContext.error != nil { - errorCodes := getAggregatedSubgraphErrorCodes(requestContext.error) - if v := NewStringSliceLogField(errorCodes, field); v != zap.Skip() { - resFields = append(resFields, v) - } + if v := NewStringSliceLogField(reqContext.graphQLErrorCodes, field); v != zap.Skip() { + resFields = append(resFields, v) } } } diff --git a/router/core/graphql_handler.go b/router/core/graphql_handler.go index f3c4c8ee36..35b297e6d5 100644 --- a/router/core/graphql_handler.go +++ b/router/core/graphql_handler.go @@ -8,7 +8,6 @@ import ( "fmt" rErrors "github.com/wundergraph/cosmo/router/internal/errors" rotel "github.com/wundergraph/cosmo/router/pkg/otel" - "go.opentelemetry.io/otel/attribute" "io" "net/http" "strings" @@ -139,15 +138,9 @@ type GraphQLHandler struct { func (h *GraphQLHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { requestContext := getRequestContext(r.Context()) - var baseAttributes []attribute.KeyValue - - if attributes := baseAttributesFromContext(r.Context()); attributes != nil { - baseAttributes = append(baseAttributes, attributes...) - } - executionContext, graphqlExecutionSpan := h.tracer.Start(r.Context(), "Operation - Execute", trace.WithSpanKind(trace.SpanKindInternal), - trace.WithAttributes(baseAttributes...), + trace.WithAttributes(requestContext.telemetry.CommonAttrs()...), ) defer graphqlExecutionSpan.End() @@ -186,10 +179,11 @@ func (h *GraphQLHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { defer propagateSubgraphErrors(ctx) resp, err := h.executor.Resolver.ResolveGraphQLResponse(ctx, p.Response, nil, HeaderPropagationWriter(w, ctx.Context())) - requestContext.dataSources = p.Response.DataSources + requestContext.dataSourceNames = getSubgraphNames(p.Response.DataSources) + requestContext.telemetry.AddCustomMetricStringSliceAttr(ContextFieldOperationServices, requestContext.dataSourceNames) if err != nil { - trackResponseError(ctx.Context(), err) + trackFinalResponseError(ctx.Context(), err) h.WriteError(ctx, err, p.Response, w) return } @@ -206,7 +200,7 @@ func (h *GraphQLHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { ctx, writer, ok = GetSubscriptionResponseWriter(ctx, ctx.Variables, r, w) if !ok { requestContext.logger.Error("unable to get subscription response writer", zap.Error(errCouldNotFlushResponse)) - trackResponseError(r.Context(), errCouldNotFlushResponse) + trackFinalResponseError(r.Context(), errCouldNotFlushResponse) writeRequestErrors(r, w, http.StatusInternalServerError, graphqlerrors.RequestErrorsFromError(errCouldNotFlushResponse), requestContext.logger) return } @@ -215,25 +209,28 @@ func (h *GraphQLHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { defer h.websocketStats.ConnectionsDec() err := h.executor.Resolver.ResolveGraphQLSubscription(ctx, p.Response, writer) + requestContext.dataSourceNames = getSubgraphNames(p.Response.Response.DataSources) + requestContext.telemetry.AddCustomMetricStringSliceAttr(ContextFieldOperationServices, requestContext.dataSourceNames) + if err != nil { if errors.Is(err, context.Canceled) { requestContext.logger.Debug("context canceled: unable to resolve subscription response", zap.Error(err)) - trackResponseError(r.Context(), err) + trackFinalResponseError(r.Context(), err) return } else if errors.Is(err, ErrUnauthorized) { - trackResponseError(ctx.Context(), err) + trackFinalResponseError(ctx.Context(), err) writeRequestErrors(r, w, http.StatusUnauthorized, graphqlerrors.RequestErrorsFromError(err), requestContext.logger) return } requestContext.logger.Error("unable to resolve subscription response", zap.Error(err)) - trackResponseError(ctx.Context(), err) + trackFinalResponseError(ctx.Context(), err) writeRequestErrors(r, w, http.StatusInternalServerError, graphqlerrors.RequestErrorsFromError(errCouldNotResolveResponse), requestContext.logger) return } default: requestContext.logger.Error("unsupported plan kind") - trackResponseError(ctx.Context(), errOperationPlanUnsupported) + trackFinalResponseError(ctx.Context(), errOperationPlanUnsupported) writeRequestErrors(r, w, http.StatusInternalServerError, graphqlerrors.RequestErrorsFromError(errOperationPlanUnsupported), requestContext.logger) } } diff --git a/router/core/graphql_prehandler.go b/router/core/graphql_prehandler.go index 6ad7567ad3..3f050a399e 100644 --- a/router/core/graphql_prehandler.go +++ b/router/core/graphql_prehandler.go @@ -6,6 +6,8 @@ import ( "crypto/ecdsa" "fmt" "github.com/wundergraph/cosmo/router/pkg/config" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" "net/http" "strconv" "strings" @@ -18,7 +20,6 @@ import ( "github.com/wundergraph/graphql-go-tools/v2/pkg/engine/plan" "github.com/wundergraph/graphql-go-tools/v2/pkg/engine/resolve" "github.com/wundergraph/graphql-go-tools/v2/pkg/graphqlerrors" - "go.opentelemetry.io/otel/attribute" sdktrace "go.opentelemetry.io/otel/sdk/trace" "go.opentelemetry.io/otel/trace" "go.uber.org/zap" @@ -94,7 +95,6 @@ type httpOperation struct { body []byte files []httpclient.File requestLogger *zap.Logger - attributes []attribute.KeyValue routerSpan trace.Span operationMetrics *OperationMetrics traceTimings *art.TraceTimings @@ -170,7 +170,6 @@ func (h *PreHandler) Handler(next http.Handler) http.Handler { var ( // In GraphQL the statusCode does not always express the error state of the request // we use this flag to determine if we have an error for the request metrics - finalErr error writtenBytes int statusCode = http.StatusOK traceTimings *art.TraceTimings @@ -182,19 +181,46 @@ func (h *PreHandler) Handler(next http.Handler) http.Handler { routerSpan := trace.SpanFromContext(r.Context()) clientInfo := NewClientInfoFromRequest(r, h.clientHeader) - commonAttributes := []attribute.KeyValue{ + + requestContext.telemetry.AddCommonAttribute( otel.WgClientName.String(clientInfo.Name), otel.WgClientVersion.String(clientInfo.Version), otel.WgOperationProtocol.String(OperationProtocolHTTP.String()), - } + ) + + metrics := h.metrics.StartOperation(requestLogger, r.ContentLength, requestContext.telemetry.MetricAttrs(true)) + + routerSpan.SetAttributes(requestContext.telemetry.CommonAttrs()...) requestContext.operation = &operationContext{ clientInfo: clientInfo, } + defer func() { + requestContext.telemetry.AddCustomMetricStringSliceAttr(ContextFieldGraphQLErrorServices, requestContext.graphQLErrorServices) + requestContext.telemetry.AddCustomMetricStringSliceAttr(ContextFieldOperationServices, requestContext.dataSourceNames) + requestContext.telemetry.AddCustomMetricStringSliceAttr(ContextFieldGraphQLErrorCodes, requestContext.graphQLErrorCodes) + + metricAttrs := requestContext.telemetry.MetricAttrs(true) + + metrics.Finish( + r.Context(), + requestContext.error, + statusCode, + writtenBytes, + h.flushTelemetryAfterResponse, + requestContext.operation, + metricAttrs, + ) + + if h.flushTelemetryAfterResponse { + h.flushMetrics(r.Context(), requestLogger) + } + }() + executionOptions, traceOptions, err := h.parseRequestOptions(r, clientInfo, requestLogger) if err != nil { - finalErr = err + requestContext.error = err writeRequestErrors(r, w, http.StatusBadRequest, graphqlerrors.RequestErrorsFromError(err), requestLogger) return } @@ -208,21 +234,6 @@ func (h *PreHandler) Handler(next http.Handler) http.Handler { traceTimings = art.NewTraceTimings(r.Context()) } - if baseAttributes := baseAttributesFromContext(r.Context()); baseAttributes != nil { - commonAttributes = append(commonAttributes, baseAttributes...) - } - - metrics := h.metrics.StartOperation(clientInfo, requestLogger, r.ContentLength, commonAttributes) - - routerSpan.SetAttributes(commonAttributes...) - - defer func() { - metrics.Finish(finalErr, statusCode, writtenBytes, h.flushTelemetryAfterResponse) - if h.flushTelemetryAfterResponse { - h.flushMetrics(r.Context(), requestLogger) - } - }() - var body []byte var files []httpclient.File // XXX: This buffer needs to be returned to the pool only @@ -231,18 +242,18 @@ func (h *PreHandler) Handler(next http.Handler) http.Handler { if strings.Contains(r.Header.Get("Content-Type"), "multipart/form-data") { if !h.fileUploadEnabled { - finalErr = &httpGraphqlError{ + requestContext.error = &httpGraphqlError{ message: "file upload disabled", statusCode: http.StatusOK, } - writeOperationError(r, w, requestLogger, finalErr) + writeOperationError(r, w, requestLogger, requestContext.error) h.releaseBodyReadBuffer(buf) return } _, readMultiPartSpan := h.tracer.Start(r.Context(), "HTTP - Read Multipart", trace.WithSpanKind(trace.SpanKindInternal), - trace.WithAttributes(commonAttributes...), + trace.WithAttributes(requestContext.telemetry.CommonAttrs()...), ) multipartParser := NewMultipartParser(h.operationProcessor, h.maxUploadFiles, h.maxUploadFileSize) @@ -250,8 +261,8 @@ func (h *PreHandler) Handler(next http.Handler) http.Handler { var err error body, files, err = multipartParser.Parse(r, buf) if err != nil { - finalErr = err - writeOperationError(r, w, requestLogger, finalErr) + requestContext.error = err + writeOperationError(r, w, requestLogger, requestContext.error) h.releaseBodyReadBuffer(buf) readMultiPartSpan.End() return @@ -273,13 +284,13 @@ func (h *PreHandler) Handler(next http.Handler) http.Handler { } else if r.Method == http.MethodPost { _, readOperationBodySpan := h.tracer.Start(r.Context(), "HTTP - Read Body", trace.WithSpanKind(trace.SpanKindInternal), - trace.WithAttributes(commonAttributes...), + trace.WithAttributes(requestContext.telemetry.CommonAttrs()...), ) var err error body, err = h.operationProcessor.ReadBody(buf, r.Body) if err != nil { - finalErr = err + requestContext.error = err // Don't produce errors logs here because it can only be client side errors // e.g. too large body, slow client, aborted connection etc. @@ -294,10 +305,9 @@ func (h *PreHandler) Handler(next http.Handler) http.Handler { readOperationBodySpan.End() } - opContext, err := h.handleOperation(r, buf, &httpOperation{ + err = h.handleOperation(r, buf, &httpOperation{ requestContext: requestContext, requestLogger: requestLogger, - attributes: commonAttributes, routerSpan: routerSpan, operationMetrics: metrics, traceTimings: traceTimings, @@ -305,7 +315,7 @@ func (h *PreHandler) Handler(next http.Handler) http.Handler { body: body, }) if err != nil { - finalErr = err + requestContext.error = err // Mark the root span of the router as failed, so we can easily identify failed requests rtrace.AttachErrToSpan(routerSpan, err) @@ -318,12 +328,12 @@ func (h *PreHandler) Handler(next http.Handler) http.Handler { if h.accessController != nil { _, authenticateSpan := h.tracer.Start(r.Context(), "Authenticate", trace.WithSpanKind(trace.SpanKindServer), - trace.WithAttributes(commonAttributes...), + trace.WithAttributes(requestContext.telemetry.CommonAttrs()...), ) validatedReq, err := h.accessController.Access(w, r) if err != nil { - finalErr = err + requestContext.error = err requestLogger.Error("Failed to authenticate request", zap.Error(err)) // Mark the root span of the router as failed, so we can easily identify failed requests @@ -345,7 +355,6 @@ func (h *PreHandler) Handler(next http.Handler) http.Handler { } art.SetRequestTracingStats(r.Context(), traceOptions, traceTimings) - metrics.AddOperationContext(opContext) ww := middleware.NewWrapResponseWriter(w, r.ProtoMajor) @@ -359,20 +368,17 @@ func (h *PreHandler) Handler(next http.Handler) http.Handler { statusCode = ww.Status() writtenBytes = ww.BytesWritten() - // Evaluate the request after the request has been handled by the engine handler - finalErr = requestContext.error - // Mark the root span of the router as failed, so we can easily identify failed requests - if finalErr != nil { - rtrace.AttachErrToSpan(trace.SpanFromContext(r.Context()), finalErr) + if requestContext.error != nil { + rtrace.AttachErrToSpan(trace.SpanFromContext(r.Context()), requestContext.error) } }) } -func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpOperation *httpOperation) (*operationContext, error) { +func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpOperation *httpOperation) error { operationKit, err := h.operationProcessor.NewKit() if err != nil { - return nil, err + return err } defer func() { @@ -391,14 +397,14 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO // Handle the case when operation information are provided as GET parameters if req.Method == http.MethodGet { if err := operationKit.UnmarshalOperationFromURL(req.URL); err != nil { - return nil, &httpGraphqlError{ + return &httpGraphqlError{ message: fmt.Sprintf("error parsing request query params: %s", err), statusCode: http.StatusBadRequest, } } } else if req.Method == http.MethodPost { if err := operationKit.UnmarshalOperationFromBody(httpOperation.body); err != nil { - return nil, &httpGraphqlError{ + return &httpGraphqlError{ message: fmt.Sprintf("error parsing request body: %s", err), statusCode: http.StatusBadRequest, } @@ -413,12 +419,13 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO // Compute the operation sha256 hash as soon as possible for observability reasons if h.computeOperationSha256 { if err := operationKit.ComputeOperationSha256(); err != nil { - return nil, &httpGraphqlError{ + return &httpGraphqlError{ message: fmt.Sprintf("error hashing operation: %s", err), statusCode: http.StatusInternalServerError, } } requestContext.operation.sha256Hash = operationKit.parsedOperation.Sha256Hash + requestContext.telemetry.AddCustomMetricStringAttr(ContextFieldOperationSha256, requestContext.operation.sha256Hash) } requestContext.operation.extensions = operationKit.parsedOperation.Request.Extensions @@ -427,11 +434,26 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO var skipParse bool if operationKit.parsedOperation.IsPersistedOperation { - requestContext.operation.persistedID = operationKit.parsedOperation.GraphQLRequestExtensions.PersistedQuery.Sha256Hash - skipParse, err = operationKit.FetchPersistedOperation(req.Context(), requestContext.operation.clientInfo, httpOperation.attributes) + ctx, span := h.tracer.Start(req.Context(), "Load Persisted Operation", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(requestContext.telemetry.CommonAttrs()...), + ) + + skipParse, err = operationKit.FetchPersistedOperation(ctx, requestContext.operation.clientInfo) if err != nil { - return nil, err + span.RecordError(err) + span.SetAttributes(otel.WgEnginePersistedOperationCacheHit.Bool(operationKit.parsedOperation.PersistedOperationCacheHit)) + span.SetStatus(codes.Error, err.Error()) + + span.End() + + return err } + + span.SetAttributes(otel.WgEnginePersistedOperationCacheHit.Bool(operationKit.parsedOperation.PersistedOperationCacheHit)) + + span.End() + requestContext.operation.persistedOperationCacheHit = operationKit.parsedOperation.PersistedOperationCacheHit } @@ -441,7 +463,7 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO if !skipParse { _, engineParseSpan := h.tracer.Start(req.Context(), "Operation - Parse", trace.WithSpanKind(trace.SpanKindInternal), - trace.WithAttributes(httpOperation.attributes...), + trace.WithAttributes(requestContext.telemetry.CommonAttrs()...), ) httpOperation.traceTimings.StartParse() @@ -458,7 +480,7 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO engineParseSpan.End() - return nil, err + return err } requestContext.operation.parsingTime = time.Since(startParsing) @@ -469,34 +491,35 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO engineParseSpan.End() } + attributesAfterParse := []attribute.KeyValue{ + otel.WgOperationName.String(operationKit.parsedOperation.Request.OperationName), + otel.WgOperationType.String(operationKit.parsedOperation.Type), + } + + requestContext.telemetry.AddCommonAttribute(attributesAfterParse...) + requestContext.operation.name = operationKit.parsedOperation.Request.OperationName + requestContext.operation.opType = operationKit.parsedOperation.Type + // Set the router span name after we have the operation name + httpOperation.routerSpan.SetName(GetSpanName(operationKit.parsedOperation.Request.OperationName, operationKit.parsedOperation.Type)) + // Give the buffer back to the pool as soon as we're done with it h.releaseBodyReadBuffer(buf) if req.Method == http.MethodGet && operationKit.parsedOperation.Type == "mutation" { - return nil, &httpGraphqlError{ + return &httpGraphqlError{ message: "Mutations can only be sent over HTTP POST", statusCode: http.StatusMethodNotAllowed, } } - attributes := []attribute.KeyValue{ - otel.WgOperationName.String(operationKit.parsedOperation.Request.OperationName), - otel.WgOperationType.String(operationKit.parsedOperation.Type), - } - attributes = append(attributes, httpOperation.attributes...) - - // Set the router span name after we have the operation name - httpOperation.routerSpan.SetName(GetSpanName(operationKit.parsedOperation.Request.OperationName, operationKit.parsedOperation.Type)) - // Set the operation name and type to the operation metrics and the router span as early as possible - httpOperation.routerSpan.SetAttributes(attributes...) - httpOperation.operationMetrics.AddAttributes(attributes...) + httpOperation.routerSpan.SetAttributes(attributesAfterParse...) if err := h.operationBlocker.OperationIsBlocked(operationKit.parsedOperation); err != nil { - return nil, &httpGraphqlError{ + return &httpGraphqlError{ message: err.Error(), statusCode: http.StatusOK, } @@ -504,10 +527,13 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO if operationKit.parsedOperation.GraphQLRequestExtensions.PersistedQuery != nil && operationKit.parsedOperation.GraphQLRequestExtensions.PersistedQuery.Sha256Hash != "" { + + requestContext.operation.persistedID = operationKit.parsedOperation.GraphQLRequestExtensions.PersistedQuery.Sha256Hash persistedIDAttribute := otel.WgOperationPersistedID.String(operationKit.parsedOperation.GraphQLRequestExtensions.PersistedQuery.Sha256Hash) - attributes = append(attributes, persistedIDAttribute) + + requestContext.telemetry.AddCommonAttribute(persistedIDAttribute) + httpOperation.routerSpan.SetAttributes(persistedIDAttribute) - httpOperation.operationMetrics.AddAttributes(persistedIDAttribute) } /** @@ -522,7 +548,7 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO _, engineNormalizeSpan := h.tracer.Start(req.Context(), "Operation - Normalize", trace.WithSpanKind(trace.SpanKindInternal), - trace.WithAttributes(attributes...), + trace.WithAttributes(requestContext.telemetry.CommonAttrs()...), ) cached, err := operationKit.NormalizeOperation() @@ -536,19 +562,21 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO engineNormalizeSpan.End() - return nil, err + return err } + // Set the cache hit attribute on the span engineNormalizeSpan.SetAttributes(otel.WgNormalizationCacheHit.Bool(cached)) requestContext.operation.hash = operationKit.parsedOperation.ID requestContext.operation.normalizationCacheHit = operationKit.parsedOperation.NormalizationCacheHit - operationHashAttribute := otel.WgOperationHash.String(strconv.FormatUint(operationKit.parsedOperation.ID, 10)) - attributes = append(attributes, operationHashAttribute) + operationHashString := strconv.FormatUint(operationKit.parsedOperation.ID, 10) + operationHashAttribute := otel.WgOperationHash.String(operationHashString) + + requestContext.telemetry.AddCommonAttribute(operationHashAttribute) httpOperation.routerSpan.SetAttributes(operationHashAttribute) - httpOperation.operationMetrics.AddAttributes(operationHashAttribute) /** * Normalize the variables @@ -559,13 +587,14 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO rtrace.AttachErrToSpan(engineNormalizeSpan, err) requestContext.operation.normalizationTime = time.Since(startNormalization) + if !requestContext.operation.traceOptions.ExcludeNormalizeStats { httpOperation.traceTimings.EndNormalize() } engineNormalizeSpan.End() - return nil, err + return err } requestContext.operation.content = operationKit.parsedOperation.NormalizedRepresentation @@ -603,7 +632,7 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO _, engineValidateSpan := h.tracer.Start(req.Context(), "Operation - Validate", trace.WithSpanKind(trace.SpanKindInternal), - trace.WithAttributes(attributes...), + trace.WithAttributes(requestContext.telemetry.CommonAttrs()...), ) validationCached, err := operationKit.Validate(requestContext.operation.executionOptions.SkipLoader) if err != nil { @@ -617,7 +646,7 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO engineValidateSpan.End() - return nil, err + return err } engineValidateSpan.SetAttributes(otel.WgValidationCacheHit.Bool(validationCached)) @@ -642,7 +671,7 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO engineValidateSpan.End() - return nil, queryDepthErr + return queryDepthErr } } @@ -666,7 +695,7 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO _, enginePlanSpan := h.tracer.Start(req.Context(), "Operation - Plan", trace.WithSpanKind(trace.SpanKindInternal), trace.WithAttributes(otel.WgEngineRequestTracingEnabled.Bool(requestContext.operation.traceOptions.Enable)), - trace.WithAttributes(attributes...), + trace.WithAttributes(requestContext.telemetry.CommonAttrs()...), ) planOptions := PlanOptions{ @@ -677,8 +706,6 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO TrackSchemaUsageInfo: h.trackSchemaUsageInfo, } - requestContext.operation.setAttributes() - err = h.planner.plan(requestContext.operation, planOptions) if err != nil { @@ -692,7 +719,7 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO enginePlanSpan.End() - return nil, err + return err } enginePlanSpan.SetAttributes(otel.WgEnginePlanCacheHit.Bool(requestContext.operation.planCacheHit)) @@ -702,6 +729,15 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO enginePlanSpan.End() + httpOperation.operationMetrics.routerMetrics.MetricStore().MeasureOperationPlanningTime( + req.Context(), + requestContext.operation.planningTime, + append( + requestContext.telemetry.MetricAttrs(true), + otel.WgEnginePlanCacheHit.Bool(requestContext.operation.planCacheHit), + )..., + ) + // we could log the query plan only if query plans are calculated if (h.queryPlansEnabled && requestContext.operation.executionOptions.IncludeQueryPlanInResponse) || h.alwaysIncludeQueryPlan { @@ -720,7 +756,7 @@ func (h *PreHandler) handleOperation(req *http.Request, buf *bytes.Buffer, httpO } } - return requestContext.operation, nil + return nil } // flushMetrics flushes all metrics to the respective exporters diff --git a/router/core/operation_metrics.go b/router/core/operation_metrics.go index e19a3aee76..e7f432b269 100644 --- a/router/core/operation_metrics.go +++ b/router/core/operation_metrics.go @@ -30,58 +30,34 @@ type OperationMetrics struct { requestContentLength int64 routerMetrics RouterMetrics operationStartTime time.Time - metricBaseFields []attribute.KeyValue inflightMetric func() routerConfigVersion string - opContext *operationContext logger *zap.Logger trackUsageInfo bool } -func (m *OperationMetrics) AddOperationContext(opContext *operationContext) { - m.opContext = opContext -} +func (m *OperationMetrics) Finish(ctx context.Context, err error, statusCode int, responseSize int, exportSynchronous bool, opContext *operationContext, attr []attribute.KeyValue) { + latency := time.Since(m.operationStartTime) -func (m *OperationMetrics) Finish(err error, statusCode int, responseSize int, exportSynchronous bool) { m.inflightMetric() - ctx := context.Background() - rm := m.routerMetrics.MetricStore() if err != nil { // We don't store false values in the metrics, so only add the error attribute if it's true - m.metricBaseFields = append(m.metricBaseFields, otel.WgRequestError.Bool(true)) - rm.MeasureRequestError(ctx, m.metricBaseFields...) - } - - m.metricBaseFields = append(m.metricBaseFields, semconv.HTTPStatusCode(statusCode)) - rm.MeasureRequestCount(ctx, m.metricBaseFields...) - rm.MeasureRequestSize(ctx, m.requestContentLength, m.metricBaseFields...) - rm.MeasureLatency(ctx, - m.operationStartTime, - m.metricBaseFields..., - ) - rm.MeasureResponseSize(ctx, int64(responseSize), m.metricBaseFields...) - - if m.trackUsageInfo && m.opContext != nil { - m.routerMetrics.ExportSchemaUsageInfo(m.opContext, statusCode, err != nil, exportSynchronous) + attr = append(attr, otel.WgRequestError.Bool(true)) + rm.MeasureRequestError(ctx, attr...) } -} -func (m *OperationMetrics) AddAttributes(kv ...attribute.KeyValue) { - m.metricBaseFields = append(m.metricBaseFields, kv...) -} + attr = append(attr, semconv.HTTPStatusCode(statusCode)) + rm.MeasureRequestCount(ctx, attr...) + rm.MeasureRequestSize(ctx, m.requestContentLength, attr...) + rm.MeasureLatency(ctx, latency, attr...) + rm.MeasureResponseSize(ctx, int64(responseSize), attr...) -// AddClientInfo adds the client info to the operation metrics. If OperationMetrics -// is nil, it's a no-op. -func (m *OperationMetrics) AddClientInfo(info *ClientInfo) { - if info == nil { - return + if m.trackUsageInfo && opContext != nil { + m.routerMetrics.ExportSchemaUsageInfo(opContext, statusCode, err != nil, exportSynchronous) } - // Add client info to metrics base fields - m.metricBaseFields = append(m.metricBaseFields, otel.WgClientName.String(info.Name)) - m.metricBaseFields = append(m.metricBaseFields, otel.WgClientVersion.String(info.Version)) } type OperationMetricsOptions struct { @@ -100,7 +76,6 @@ func newOperationMetrics(opts OperationMetricsOptions) *OperationMetrics { inflightMetric := opts.RouterMetrics.MetricStore().MeasureInFlight(context.Background(), opts.Attributes...) return &OperationMetrics{ - metricBaseFields: opts.Attributes, requestContentLength: opts.RequestContentLength, operationStartTime: operationStartTime, inflightMetric: inflightMetric, diff --git a/router/core/operation_processor.go b/router/core/operation_processor.go index 43f7a0ae8d..b98abac8e2 100644 --- a/router/core/operation_processor.go +++ b/router/core/operation_processor.go @@ -19,6 +19,8 @@ import ( "github.com/pkg/errors" "github.com/tidwall/sjson" fastjson "github.com/wundergraph/astjson" + "github.com/wundergraph/cosmo/router/internal/persistedoperation" + "github.com/wundergraph/cosmo/router/internal/unsafebytes" "github.com/wundergraph/graphql-go-tools/v2/pkg/ast" "github.com/wundergraph/graphql-go-tools/v2/pkg/astnormalization" "github.com/wundergraph/graphql-go-tools/v2/pkg/astparser" @@ -28,10 +30,6 @@ import ( "github.com/wundergraph/graphql-go-tools/v2/pkg/middleware/operation_complexity" "github.com/wundergraph/graphql-go-tools/v2/pkg/operationreport" "github.com/wundergraph/graphql-go-tools/v2/pkg/variablesvalidation" - "go.opentelemetry.io/otel/attribute" - - "github.com/wundergraph/cosmo/router/internal/persistedoperation" - "github.com/wundergraph/cosmo/router/internal/unsafebytes" ) var ( @@ -346,7 +344,7 @@ func (o *OperationKit) ComputeOperationSha256() error { // FetchPersistedOperation fetches the persisted operation from the cache or the client. If the operation is fetched from the cache it returns true. // UnmarshalOperationFromBody or UnmarshalOperationFromURL must be called before calling this method. -func (o *OperationKit) FetchPersistedOperation(ctx context.Context, clientInfo *ClientInfo, commonTraceAttributes []attribute.KeyValue) (bool, error) { +func (o *OperationKit) FetchPersistedOperation(ctx context.Context, clientInfo *ClientInfo) (bool, error) { if o.operationProcessor.persistedOperationClient == nil { return false, &httpGraphqlError{ message: "could not resolve persisted query, feature is not configured", @@ -363,7 +361,8 @@ func (o *OperationKit) FetchPersistedOperation(ctx context.Context, clientInfo * if fromCache { return true, nil } - persistedOperationData, err := o.operationProcessor.persistedOperationClient.PersistedOperation(ctx, clientInfo.Name, o.parsedOperation.GraphQLRequestExtensions.PersistedQuery.Sha256Hash, commonTraceAttributes) + + persistedOperationData, err := o.operationProcessor.persistedOperationClient.PersistedOperation(ctx, clientInfo.Name, o.parsedOperation.GraphQLRequestExtensions.PersistedQuery.Sha256Hash) if err != nil { return false, err } diff --git a/router/core/operation_processor_test.go b/router/core/operation_processor_test.go index bfbcd4f72b..86604b283b 100644 --- a/router/core/operation_processor_test.go +++ b/router/core/operation_processor_test.go @@ -56,7 +56,7 @@ func TestOperationProcessorPersistentOperations(t *testing.T) { require.NoError(t, err) - _, err = kit.FetchPersistedOperation(context.Background(), clientInfo, nil) + _, err = kit.FetchPersistedOperation(context.Background(), clientInfo) if err != nil { require.EqualError(t, tc.ExpectedError, err.Error()) diff --git a/router/core/router.go b/router/core/router.go index 281bb310bf..03df6a6bd2 100644 --- a/router/core/router.go +++ b/router/core/router.go @@ -189,32 +189,23 @@ type ( accessLogsConfig *AccessLogsConfig // If connecting to localhost inside Docker fails, fallback to the docker internal address for the host localhostFallbackInsideDocker bool - - tlsServerConfig *tls.Config - tlsConfig *TlsConfig - + tlsServerConfig *tls.Config + tlsConfig *TlsConfig + telemetryAttributes []config.CustomAttribute // Poller - configPoller configpoller.ConfigPoller - selfRegister selfregister.SelfRegister - - registrationInfo *nodev1.RegistrationInfo - - securityConfiguration config.SecurityConfiguration - - customModules []Module - + configPoller configpoller.ConfigPoller + selfRegister selfregister.SelfRegister + registrationInfo *nodev1.RegistrationInfo + securityConfiguration config.SecurityConfiguration + customModules []Module engineExecutionConfiguration config.EngineExecutionConfiguration // should be removed once the users have migrated to the new overrides config overrideRoutingURLConfiguration config.OverrideRoutingURLConfiguration // the new overrides config - overrides config.OverridesConfiguration - - authorization *config.AuthorizationConfiguration - - rateLimit *config.RateLimitConfiguration - - webSocketConfiguration *config.WebSocketConfiguration - + overrides config.OverridesConfiguration + authorization *config.AuthorizationConfiguration + rateLimit *config.RateLimitConfiguration + webSocketConfiguration *config.WebSocketConfiguration subgraphErrorPropagation config.SubgraphErrorPropagationConfiguration clientHeader config.ClientHeader } @@ -834,8 +825,7 @@ func (r *Router) buildClients() error { } c, err := cdn.NewClient(provider.URL, r.graphApiToken, cdn.Options{ - Logger: r.logger, - TraceProvider: r.tracerProvider, + Logger: r.logger, }) if err != nil { return err @@ -870,8 +860,7 @@ func (r *Router) buildClients() error { } c, err := cdn.NewClient(r.cdnConfig.URL, r.graphApiToken, cdn.Options{ - Logger: r.logger, - TraceProvider: r.tracerProvider, + Logger: r.logger, }) if err != nil { return err @@ -1580,6 +1569,12 @@ func WithTLSConfig(cfg *TlsConfig) Option { } } +func WithTelemetryAttributes(attributes []config.CustomAttribute) Option { + return func(r *Router) { + r.telemetryAttributes = attributes + } +} + func WithConfigPollerConfig(cfg *RouterConfigPollerConfig) Option { return func(r *Router) { r.routerConfigPollerConfig = cfg @@ -1688,18 +1683,35 @@ func TraceConfigFromTelemetry(cfg *config.Telemetry) *rtrace.Config { Sampler: cfg.Tracing.SamplingRate, ParentBasedSampler: cfg.Tracing.ParentBasedSampler, WithNewRoot: cfg.Tracing.WithNewRoot, + Attributes: nil, ExportGraphQLVariables: rtrace.ExportGraphQLVariables{ Enabled: cfg.Tracing.ExportGraphQLVariables, }, - SpanAttributesMapper: buildAttributesMapper(cfg.Attributes), - ResourceAttributes: buildResourceAttributes(cfg.ResourceAttributes), - Exporters: exporters, - Propagators: propagators, - ResponseTraceHeader: cfg.Tracing.ResponseTraceHeader, + ResourceAttributes: buildResourceAttributes(cfg.ResourceAttributes), + Exporters: exporters, + Propagators: propagators, + ResponseTraceHeader: cfg.Tracing.ResponseTraceHeader, + } +} + +// buildAttributesMap returns a map of custom attributes to quickly check if a field is used in the custom attributes. +func buildAttributesMap(attributes []config.CustomAttribute) map[string]string { + result := make(map[string]string) + for _, attr := range attributes { + if attr.ValueFrom != nil && attr.ValueFrom.ContextField != "" { + result[attr.ValueFrom.ContextField] = attr.Key + } } + return result } -func buildAttributesMapper(attributes []config.CustomAttribute) func(req *http.Request) []attribute.KeyValue { +// buildHeaderAttributesMapper returns a function that maps custom attributes to the request headers. +func buildHeaderAttributesMapper(attributes []config.CustomAttribute) func(req *http.Request) []attribute.KeyValue { + + if len(attributes) == 0 { + return nil + } + return func(req *http.Request) []attribute.KeyValue { var result []attribute.KeyValue @@ -1748,7 +1760,7 @@ func MetricConfigFromTelemetry(cfg *config.Telemetry) *rmetric.Config { return &rmetric.Config{ Name: cfg.ServiceName, Version: Version, - AttributesMapper: buildAttributesMapper(cfg.Attributes), + Attributes: cfg.Metrics.Attributes, ResourceAttributes: buildResourceAttributes(cfg.ResourceAttributes), OpenTelemetry: rmetric.OpenTelemetry{ Enabled: cfg.Metrics.OTLP.Enabled, diff --git a/router/core/router_metrics.go b/router/core/router_metrics.go index 300b8419f9..f9546e3811 100644 --- a/router/core/router_metrics.go +++ b/router/core/router_metrics.go @@ -1,19 +1,18 @@ package core import ( - "strconv" - - "github.com/wundergraph/cosmo/router/internal/unsafebytes" - "github.com/wundergraph/cosmo/router/pkg/metric" "go.opentelemetry.io/otel/attribute" + "strconv" graphqlmetricsv1 "github.com/wundergraph/cosmo/router/gen/proto/wg/cosmo/graphqlmetrics/v1" "github.com/wundergraph/cosmo/router/internal/graphqlmetrics" + "github.com/wundergraph/cosmo/router/internal/unsafebytes" + "github.com/wundergraph/cosmo/router/pkg/metric" "go.uber.org/zap" ) type RouterMetrics interface { - StartOperation(clientInfo *ClientInfo, logger *zap.Logger, requestContentLength int64, metricAttributes []attribute.KeyValue) *OperationMetrics + StartOperation(logger *zap.Logger, requestContentLength int64, attributes []attribute.KeyValue) *OperationMetrics ExportSchemaUsageInfo(operationContext *operationContext, statusCode int, hasError bool, exportSynchronous bool) GqlMetricsExporter() *graphqlmetrics.Exporter MetricStore() metric.Provider @@ -50,16 +49,15 @@ func NewRouterMetrics(cfg *routerMetricsConfig) RouterMetrics { // StartOperation starts the metrics for a new GraphQL operation. The returned value is a OperationMetrics // where the caller must always call Finish() (usually via defer()). If the metrics are disabled, this // returns nil, but OperationMetrics is safe to call with a nil receiver. -func (m *routerMetrics) StartOperation(clientInfo *ClientInfo, logger *zap.Logger, requestContentLength int64, metricAttributes []attribute.KeyValue) *OperationMetrics { +func (m *routerMetrics) StartOperation(logger *zap.Logger, requestContentLength int64, attributes []attribute.KeyValue) *OperationMetrics { metrics := newOperationMetrics(OperationMetricsOptions{ RouterMetrics: m, - Attributes: metricAttributes, Logger: logger, RequestContentLength: requestContentLength, RouterConfigVersion: m.routerConfigVersion, TrackUsageInfo: m.exportEnabled, + Attributes: attributes, }) - metrics.AddClientInfo(clientInfo) return metrics } diff --git a/router/core/transport.go b/router/core/transport.go index 9ffc5068c1..2e729d98c0 100644 --- a/router/core/transport.go +++ b/router/core/transport.go @@ -81,17 +81,13 @@ func (ct *CustomTransport) measureSubgraphMetrics(req *http.Request) func(err er var baseFields []attribute.KeyValue + baseFields = append(baseFields, reqContext.telemetry.MetricAttrs(true)...) + if activeSubgraph != nil { baseFields = append(baseFields, otel.WgSubgraphName.String(activeSubgraph.Name)) baseFields = append(baseFields, otel.WgSubgraphID.String(activeSubgraph.Id)) } - if attributes := baseAttributesFromContext(req.Context()); attributes != nil { - baseFields = append(baseFields, attributes...) - } - - baseFields = append(baseFields, reqContext.operation.Attributes()...) - inFlightDone := ct.metricStore.MeasureInFlight(req.Context(), baseFields...) ct.metricStore.MeasureRequestSize(req.Context(), req.ContentLength, baseFields...) @@ -100,12 +96,14 @@ func (ct *CustomTransport) measureSubgraphMetrics(req *http.Request) func(err er return func(err error, resp *http.Response) { defer inFlightDone() + latency := time.Since(operationStartTime) + if err != nil { baseFields = append(baseFields, otel.WgRequestError.Bool(true)) } ct.metricStore.MeasureRequestCount(req.Context(), baseFields...) - ct.metricStore.MeasureLatency(req.Context(), operationStartTime, baseFields...) + ct.metricStore.MeasureLatency(req.Context(), latency, baseFields...) if resp != nil { baseFields = append(baseFields, semconv.HTTPStatusCode(resp.StatusCode)) @@ -373,21 +371,17 @@ func (t TransportFactory) RoundTripper(enableSingleFlight bool, transport http.R span := otrace.SpanFromContext(r.Context()) reqContext := getRequestContext(r.Context()) - var commonAttributeValues []attribute.KeyValue + var attributes []attribute.KeyValue subgraph := reqContext.ActiveSubgraph(r) if subgraph != nil { - commonAttributeValues = append(commonAttributeValues, otel.WgSubgraphID.String(subgraph.Id)) - commonAttributeValues = append(commonAttributeValues, otel.WgSubgraphName.String(subgraph.Name)) - } - - if attributes := baseAttributesFromContext(r.Context()); attributes != nil { - commonAttributeValues = append(commonAttributeValues, attributes...) + attributes = append(attributes, otel.WgSubgraphID.String(subgraph.Id)) + attributes = append(attributes, otel.WgSubgraphName.String(subgraph.Name)) } - commonAttributeValues = append(commonAttributeValues, reqContext.operation.Attributes()...) + attributes = append(attributes, reqContext.telemetry.CommonAttrs()...) - span.SetAttributes(commonAttributeValues...) + span.SetAttributes(attributes...) }), ) diff --git a/router/core/websocket.go b/router/core/websocket.go index 0d9360d466..afdc1e4d1b 100644 --- a/router/core/websocket.go +++ b/router/core/websocket.go @@ -6,6 +6,7 @@ import ( "context" "errors" "fmt" + "go.opentelemetry.io/otel/attribute" "net" "net/http" "regexp" @@ -58,6 +59,7 @@ type WebsocketMiddlewareOptions struct { WebSocketConfiguration *config.WebSocketConfiguration ClientHeader config.ClientHeader + Attributes []attribute.KeyValue } func NewWebsocketMiddleware(ctx context.Context, opts WebsocketMiddlewareOptions) func(http.Handler) http.Handler { @@ -77,6 +79,7 @@ func NewWebsocketMiddleware(ctx context.Context, opts WebsocketMiddlewareOptions config: opts.WebSocketConfiguration, clientHeader: opts.ClientHeader, handlerSem: semaphore.NewWeighted(128), + attributes: opts.Attributes, } if opts.WebSocketConfiguration != nil && opts.WebSocketConfiguration.AbsintheProtocol.Enabled { handler.absintheHandlerEnabled = true @@ -215,7 +218,8 @@ type WebsocketHandler struct { handlerSem *semaphore.Weighted connectionIDs atomic.Int64 - stats WebSocketsStatistics + stats WebSocketsStatistics + attributes []attribute.KeyValue readTimeout time.Duration @@ -322,6 +326,7 @@ func (h *WebsocketHandler) handleUpgradeRequest(w http.ResponseWriter, r *http.R Config: h.config, ForwardUpgradeHeaders: h.forwardUpgradeHeadersConfig, ForwardQueryParams: h.forwardQueryParamsConfig, + Attributes: h.attributes, }) err = handler.Initialize() if err != nil { @@ -647,6 +652,7 @@ type WebSocketConnectionHandlerOptions struct { InitRequestID string ForwardUpgradeHeaders forwardConfig ForwardQueryParams forwardConfig + Attributes []attribute.KeyValue } type WebSocketConnectionHandler struct { @@ -677,6 +683,8 @@ type WebSocketConnectionHandler struct { subscriptions sync.Map stats WebSocketsStatistics + attributes []attribute.KeyValue + forwardInitialPayload bool forwardUpgradeHeaders *forwardConfig @@ -718,6 +726,7 @@ func NewWebsocketConnectionHandler(ctx context.Context, opts WebSocketConnection forwardQueryParams: &opts.ForwardQueryParams, forwardInitialPayload: opts.Config != nil && opts.Config.ForwardInitialPayload, plannerOptions: opts.PlanOptions, + attributes: opts.Attributes, } } @@ -765,7 +774,7 @@ func (h *WebSocketConnectionHandler) parseAndPlan(payload []byte) (*ParsedOperat var skipParse bool if operationKit.parsedOperation.IsPersistedOperation { - skipParse, err = operationKit.FetchPersistedOperation(h.ctx, h.clientInfo, baseAttributesFromContext(h.ctx)) + skipParse, err = operationKit.FetchPersistedOperation(h.ctx, h.clientInfo) if err != nil { return nil, nil, err } @@ -827,8 +836,6 @@ func (h *WebSocketConnectionHandler) parseAndPlan(payload []byte) (*ParsedOperat opContext.initialPayload = h.initialPayload - opContext.setAttributes() - return operationKit.parsedOperation, opContext, nil } @@ -893,7 +900,14 @@ func (h *WebSocketConnectionHandler) executeSubscription(registration *Subscript resolveCtx.InitialPayload = operationCtx.initialPayload } - resolveCtx = resolveCtx.WithContext(withRequestContext(h.ctx, buildRequestContext(nil, registration.clientRequest, operationCtx, h.logger))) + reqContext := buildRequestContext(requestContextOptions{ + operationContext: operationCtx, + requestLogger: h.logger, + metricSetAttributes: nil, + w: nil, + r: registration.clientRequest, + }) + resolveCtx = resolveCtx.WithContext(withRequestContext(h.ctx, reqContext)) if h.graphqlHandler.authorizer != nil { resolveCtx = WithAuthorizationExtension(resolveCtx) resolveCtx.SetAuthorizer(h.graphqlHandler.authorizer) diff --git a/router/internal/persistedoperation/cdn/client.go b/router/internal/persistedoperation/cdn/client.go index b469b54299..91688d3274 100644 --- a/router/internal/persistedoperation/cdn/client.go +++ b/router/internal/persistedoperation/cdn/client.go @@ -9,9 +9,7 @@ import ( "github.com/wundergraph/cosmo/router/internal/httpclient" "github.com/wundergraph/cosmo/router/internal/jwt" "github.com/wundergraph/cosmo/router/internal/persistedoperation" - "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/codes" - sdktrace "go.opentelemetry.io/otel/sdk/trace" semconv12 "go.opentelemetry.io/otel/semconv/v1.12.0" semconv "go.opentelemetry.io/otel/semconv/v1.17.0" "go.opentelemetry.io/otel/trace" @@ -22,8 +20,7 @@ import ( ) type Options struct { - Logger *zap.Logger - TraceProvider *sdktrace.TracerProvider + Logger *zap.Logger } type client struct { @@ -37,28 +34,18 @@ type client struct { organizationID string httpClient *http.Client logger *zap.Logger - tracer trace.Tracer } -func (cdn *client) PersistedOperation(ctx context.Context, clientName string, sha256Hash string, attributes []attribute.KeyValue) ([]byte, error) { - - ctx, span := cdn.tracer.Start(ctx, "Load Persisted Operation", - trace.WithSpanKind(trace.SpanKindClient), - trace.WithAttributes(attributes...), - ) - defer span.End() - - content, err := cdn.persistedOperation(ctx, clientName, sha256Hash, attributes) +func (cdn *client) PersistedOperation(ctx context.Context, clientName string, sha256Hash string) ([]byte, error) { + content, err := cdn.persistedOperation(ctx, clientName, sha256Hash) if err != nil { - span.RecordError(err) - span.SetStatus(codes.Error, err.Error()) return nil, err } return content, nil } -func (cdn *client) persistedOperation(ctx context.Context, clientName string, sha256Hash string, attributes []attribute.KeyValue) ([]byte, error) { +func (cdn *client) persistedOperation(ctx context.Context, clientName string, sha256Hash string) ([]byte, error) { span := trace.SpanFromContext(ctx) @@ -161,10 +148,6 @@ func NewClient(endpoint string, token string, opts Options) (persistedoperation. organizationID: url.PathEscape(claims.OrganizationID), httpClient: httpclient.NewRetryableHTTPClient(logger), logger: logger, - tracer: opts.TraceProvider.Tracer( - "wundergraph/cosmo/router/cdn_persisted_operations_client", - trace.WithInstrumentationVersion("0.0.1"), - ), }, nil } diff --git a/router/internal/persistedoperation/client.go b/router/internal/persistedoperation/client.go index bf493977fb..98b88de6d2 100644 --- a/router/internal/persistedoperation/client.go +++ b/router/internal/persistedoperation/client.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "github.com/dgraph-io/ristretto" - "go.opentelemetry.io/otel/attribute" "go.uber.org/zap" ) @@ -23,7 +22,7 @@ func (e *PersistentOperationNotFoundError) Error() string { } type Client interface { - PersistedOperation(ctx context.Context, clientName string, sha256Hash string, attributes []attribute.KeyValue) ([]byte, error) + PersistedOperation(ctx context.Context, clientName string, sha256Hash string) ([]byte, error) Close() } @@ -69,12 +68,12 @@ func NewClient(opts *Options) (Client, error) { }, nil } -func (c client) PersistedOperation(ctx context.Context, clientName string, sha256Hash string, attributes []attribute.KeyValue) ([]byte, error) { +func (c client) PersistedOperation(ctx context.Context, clientName string, sha256Hash string) ([]byte, error) { if data := c.cache.Get(clientName, sha256Hash); data != nil { return data, nil } - content, err := c.providerClient.PersistedOperation(ctx, clientName, sha256Hash, attributes) + content, err := c.providerClient.PersistedOperation(ctx, clientName, sha256Hash) if err != nil { return nil, err } diff --git a/router/internal/persistedoperation/s3/client.go b/router/internal/persistedoperation/s3/client.go index 82b7bb7509..aa42a7b77b 100644 --- a/router/internal/persistedoperation/s3/client.go +++ b/router/internal/persistedoperation/s3/client.go @@ -10,8 +10,6 @@ import ( "github.com/minio/minio-go/v7" "github.com/minio/minio-go/v7/pkg/credentials" "github.com/wundergraph/cosmo/router/internal/persistedoperation" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/codes" sdktrace "go.opentelemetry.io/otel/sdk/trace" "go.opentelemetry.io/otel/trace" ) @@ -76,24 +74,16 @@ func NewClient(endpoint string, options *Options) (persistedoperation.Client, er return client, nil } -func (c Client) PersistedOperation(ctx context.Context, clientName, sha256Hash string, attributes []attribute.KeyValue) ([]byte, error) { - ctx, span := c.tracer.Start(ctx, "Load Persisted Operation", - trace.WithSpanKind(trace.SpanKindClient), - trace.WithAttributes(attributes...), - ) - defer span.End() - - content, err := c.persistedOperation(ctx, clientName, sha256Hash, attributes) +func (c Client) PersistedOperation(ctx context.Context, clientName, sha256Hash string) ([]byte, error) { + content, err := c.persistedOperation(ctx, clientName, sha256Hash) if err != nil { - span.RecordError(err) - span.SetStatus(codes.Error, err.Error()) return nil, err } return content, nil } -func (c Client) persistedOperation(ctx context.Context, clientName, sha256Hash string, attributes []attribute.KeyValue) ([]byte, error) { +func (c Client) persistedOperation(ctx context.Context, clientName, sha256Hash string) ([]byte, error) { objectPath := fmt.Sprintf("%s/%s.json", c.options.ObjectPathPrefix, sha256Hash) reader, err := c.client.GetObject(ctx, c.options.BucketName, objectPath, minio.GetObjectOptions{}) if err != nil { diff --git a/router/internal/requestlogger/requestlogger.go b/router/internal/requestlogger/requestlogger.go index 6bbc345549..d005015100 100644 --- a/router/internal/requestlogger/requestlogger.go +++ b/router/internal/requestlogger/requestlogger.go @@ -15,7 +15,7 @@ import ( "go.uber.org/zap/zapcore" ) -type ContextFunc func(r *http.Request) []zapcore.Field +type ContextFunc func(panic any, r *http.Request) []zapcore.Field // Option provides a functional approach to define // configuration for a handler; such as setting the logging @@ -42,7 +42,7 @@ type handler struct { skipPaths []string ipAnonymizationConfig *IPAnonymizationConfig traceID bool // optionally log Open Telemetry TraceID - context ContextFunc + fieldsHandler ContextFunc handler http.Handler logger *zap.Logger baseFields []zapcore.Field @@ -62,9 +62,9 @@ func WithAnonymization(ipConfig *IPAnonymizationConfig) Option { } } -func WithRequestFields(fn ContextFunc) Option { +func WithFieldsHandler(fn ContextFunc) Option { return func(r *handler) { - r.context = fn + r.fieldsHandler = fn } } @@ -87,7 +87,7 @@ func WithDefaultOptions() Option { r.utc = true r.skipPaths = []string{} r.traceID = true - r.context = nil + r.fieldsHandler = nil } } @@ -170,8 +170,8 @@ func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { // This is only called on panic so it is safe to call it here again // to gather all the fields that are needed for logging - if h.context != nil { - fields = append(fields, h.context(r)...) + if h.fieldsHandler != nil { + fields = append(fields, h.fieldsHandler(err, r)...) } if brokenPipe { @@ -182,7 +182,7 @@ func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { h.logger.Error("[Recovery from panic]", fields...) } - // rethrow the error to the recover middleware can handle it + // re-panic the error to the recover middleware can handle it panic(err) } @@ -198,8 +198,8 @@ func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { zap.Int("status", ww.Status()), } - if h.context != nil { - resFields = append(resFields, h.context(r)...) + if h.fieldsHandler != nil { + resFields = append(resFields, h.fieldsHandler(nil, r)...) } h.logger.Info(path, append(fields, resFields...)...) diff --git a/router/pkg/config/config.go b/router/pkg/config/config.go index 172c0aac8b..00dd9dbb0b 100644 --- a/router/pkg/config/config.go +++ b/router/pkg/config/config.go @@ -100,8 +100,9 @@ type MetricsOTLPExporter struct { } type Metrics struct { - OTLP MetricsOTLP `yaml:"otlp"` - Prometheus Prometheus `yaml:"prometheus"` + Attributes []CustomAttribute `yaml:"attributes"` + OTLP MetricsOTLP `yaml:"otlp"` + Prometheus Prometheus `yaml:"prometheus"` } type MetricsOTLP struct { diff --git a/router/pkg/config/config.schema.json b/router/pkg/config/config.schema.json index c0b718417c..dd26e37354 100644 --- a/router/pkg/config/config.schema.json +++ b/router/pkg/config/config.schema.json @@ -184,9 +184,7 @@ "properties": { "fallback_storage": { "description": "The fallback storage provider for the execution config in case the primary one fails.", - "required": [ - "enabled" - ], + "required": ["enabled"], "properties": { "enabled": { "type": "boolean", @@ -489,6 +487,7 @@ "description": "The fields to add to the logs. The fields are added to the logs as key-value pairs.", "items": { "type": "object", + "description": "The configuration for custom fields. Custom attributes can be created from request headers or context fields. Not every context fields are available at all request life-cycle stages. If a value is a list, the value is JSON encoded for OTLP. For Prometheus, the values are exploded into multiple metrics with unique labels. Keep in mind, that every new custom attribute increases the cardinality.", "additionalProperties": false, "required": ["key"], "properties": { @@ -566,39 +565,29 @@ }, "attributes": { "type": "array", - "description": "The attributes to add to OTEL metrics and traces. Because Prometheus metrics rely on the OpenTelemetry metrics, the attributes are also added to the Prometheus metrics.", + "description": "The default attributes to add to OTEL and Prometheus metrics. Because Prometheus metrics rely on the OpenTelemetry metrics, the attributes are also added to the Prometheus metrics.", "items": { "type": "object", + "description": "The configuration for custom attributes. Custom attributes can be created from request headers or static values. Keep in mind, that every new custom attribute increases the cardinality of the pipeline.", "additionalProperties": false, "required": ["key"], "properties": { "key": { "type": "string", - "description": "The key of the attribute." + "description": "The key of the field." }, "default": { "type": "string", - "description": "The default value of the attribute. If the value is not set, value_from is used. If both value and value_from are set, value_from has precedence and in case of a missing value_from, the default value is used." + "description": "The default value of the field. If the value is not set, value_from is used. If both value and value_from are set, value_from has precedence and in case of a missing value_from, the default value is used." }, "value_from": { "type": "object", - "description": "Defines a source for the attribute value e.g. from a request header. If both default and value_from are set, value_from has precedence.", + "description": "Defines a source for the field value e.g. from a request header. If both default and value_from are set, value_from has precedence.", "additionalProperties": false, "properties": { "request_header": { "type": "string", "description": "The name of the request header from which to extract the value. The value is only extracted when a request context is available otherwise the default value is used." - }, - "context": { - "type": "object", - "additionalProperties": false, - "description": "The context value from which to extract the value. The value is only extracted when a context is available otherwise the default value is used.", - "properties": { - "path": { - "type": "string", - "description": "The path to the context value. The path is used to extract the value from the context. The path is a dot-separated string e.g. 'request.operation.hash', 'request.operation.name' or 'response.error.errorCodes'." - } - } } } } @@ -737,6 +726,42 @@ "description": "The configuration for the collection and export of metrics. The metrics are collected and exported using the OpenTelemetry protocol (OTLP) and Prometheus.", "additionalProperties": false, "properties": { + "attributes": { + "type": "array", + "description": "The attributes to add to OTLP Metrics and Prometheus.", + "items": { + "type": "object", + "description": "The configuration for custom attributes. Custom attributes can be created from request headers, static values or context fields. Not every context fields are available at all request life-cycle stages. If a value is a list, the value is JSON encoded for OTLP. For Prometheus, the values are exploded into multiple metrics with unique labels. Keep in mind, that every new custom attribute increases the cardinality.", + "additionalProperties": false, + "required": ["key"], + "properties": { + "key": { + "type": "string", + "description": "The key of the field." + }, + "default": { + "type": "string", + "description": "The default value of the field. If the value is not set, value_from is used. If both value and value_from are set, value_from has precedence and in case of a missing value_from, the default value is used." + }, + "value_from": { + "type": "object", + "description": "Defines a source for the field value e.g. from a request header or request context. If both default and value_from are set, value_from has precedence.", + "additionalProperties": false, + "properties": { + "request_header": { + "type": "string", + "description": "The name of the request header from which to extract the value. The value is only extracted when a request context is available otherwise the default value is used." + }, + "context_field": { + "type": "string", + "description": "The field name of the context from which to extract the value. The value is only extracted when a context is available otherwise the default value is used.", + "enum": ["operation_service_names", "graphql_error_codes", "graphql_error_service_names", "operation_sha256"] + } + } + } + } + } + }, "otlp": { "type": "object", "description": "The configuration for the OpenTelemetry protocol (OTLP). The OTLP is used to collect and export the metrics.", diff --git a/router/pkg/config/testdata/config_defaults.json b/router/pkg/config/testdata/config_defaults.json index 7e60090e6d..0a7328ef96 100644 --- a/router/pkg/config/testdata/config_defaults.json +++ b/router/pkg/config/testdata/config_defaults.json @@ -29,6 +29,7 @@ "WithNewRoot": false }, "Metrics": { + "Attributes": null, "OTLP": { "Enabled": true, "RouterRuntime": true, diff --git a/router/pkg/config/testdata/config_full.json b/router/pkg/config/testdata/config_full.json index f3fdd867cb..0fa3b0bded 100644 --- a/router/pkg/config/testdata/config_full.json +++ b/router/pkg/config/testdata/config_full.json @@ -39,6 +39,7 @@ "WithNewRoot": false }, "Metrics": { + "Attributes": null, "OTLP": { "Enabled": true, "RouterRuntime": true, diff --git a/router/pkg/metric/config.go b/router/pkg/metric/config.go index 54129e6410..b9d1da4375 100644 --- a/router/pkg/metric/config.go +++ b/router/pkg/metric/config.go @@ -2,10 +2,10 @@ package metric import ( "github.com/prometheus/client_golang/prometheus" + "github.com/wundergraph/cosmo/router/pkg/config" "github.com/wundergraph/cosmo/router/pkg/otel/otelconfig" "go.opentelemetry.io/otel/attribute" sdkmetric "go.opentelemetry.io/otel/sdk/metric" - "net/http" "net/url" "regexp" ) @@ -81,11 +81,10 @@ type Config struct { // Prometheus includes the Prometheus configuration Prometheus PrometheusConfig - // AttributesMapper added to the global attributes for all metrics. - AttributesMapper func(req *http.Request) []attribute.KeyValue - // ResourceAttributes added to the global resource attributes for all metrics. ResourceAttributes []attribute.KeyValue + + Attributes []config.CustomAttribute } func (c *Config) IsEnabled() bool { @@ -98,8 +97,8 @@ func DefaultConfig(serviceVersion string) *Config { return &Config{ Name: DefaultServerName, Version: serviceVersion, - AttributesMapper: nil, ResourceAttributes: make([]attribute.KeyValue, 0), + Attributes: make([]config.CustomAttribute, 0), OpenTelemetry: OpenTelemetry{ Enabled: false, RouterRuntime: true, diff --git a/router/pkg/metric/measurements.go b/router/pkg/metric/measurements.go index 566d3686dd..661cdb8e21 100644 --- a/router/pkg/metric/measurements.go +++ b/router/pkg/metric/measurements.go @@ -81,5 +81,14 @@ func createMeasures(meter otelmetric.Meter) (*Measurements, error) { h.upDownCounters[InFlightRequestsUpDownCounter] = inFlightRequestsGauge + operationPlanningTime, err := meter.Float64Histogram( + OperationPlanningTime, + OperationPlanningTimeHistogramOptions..., + ) + if err != nil { + return nil, fmt.Errorf("failed to create operation planning time measure: %w", err) + } + h.histograms[OperationPlanningTime] = operationPlanningTime + return h, nil } diff --git a/router/pkg/metric/metric_store.go b/router/pkg/metric/metric_store.go index a8229d9104..4e51f4f055 100644 --- a/router/pkg/metric/metric_store.go +++ b/router/pkg/metric/metric_store.go @@ -21,6 +21,8 @@ const ( InFlightRequestsUpDownCounter = "router.http.requests.in_flight" // Number of requests in flight RequestError = "router.http.requests.error" // Total request error count + OperationPlanningTime = "router.graphql.operation.planning_time" // Time taken to plan the operation + unitBytes = "bytes" unitMilliseconds = "ms" ) @@ -32,7 +34,7 @@ var ( RequestCounterOptions = []otelmetric.Int64CounterOption{ otelmetric.WithDescription(RequestCounterDescription), } - RequestErrorCounterDescription = "Total number of failed request" + RequestErrorCounterDescription = "Total number of failed requests" RequestErrorCounterOptions = []otelmetric.Int64CounterOption{ otelmetric.WithDescription(RequestErrorCounterDescription), } @@ -55,6 +57,14 @@ var ( InFlightRequestsUpDownCounterOptions = []otelmetric.Int64UpDownCounterOption{ otelmetric.WithDescription(InFlightRequestsUpDownCounterDescription), } + + // GraphQL operation metrics + + OperationPlanningTimeHistogramDescription = "Operation planning time in milliseconds" + OperationPlanningTimeHistogramOptions = []otelmetric.Float64HistogramOption{ + otelmetric.WithUnit("ms"), + otelmetric.WithDescription(OperationPlanningTimeHistogramDescription), + } ) type ( @@ -69,8 +79,7 @@ type ( otlpRequestMetrics Provider promRequestMetrics Provider - baseAttributes []attribute.KeyValue - logger *zap.Logger + logger *zap.Logger } Provider interface { @@ -78,8 +87,9 @@ type ( MeasureRequestCount(ctx context.Context, attr ...attribute.KeyValue) MeasureRequestSize(ctx context.Context, contentLength int64, attr ...attribute.KeyValue) MeasureResponseSize(ctx context.Context, size int64, attr ...attribute.KeyValue) - MeasureLatency(ctx context.Context, requestStartTime time.Time, attr ...attribute.KeyValue) + MeasureLatency(ctx context.Context, latency time.Duration, attr ...attribute.KeyValue) MeasureRequestError(ctx context.Context, attr ...attribute.KeyValue) + MeasureOperationPlanningTime(ctx context.Context, planningTime time.Duration, attr ...attribute.KeyValue) Flush(ctx context.Context) error } @@ -99,7 +109,7 @@ func NewStore(opts ...Option) (Store, error) { } // Create OTLP metrics exported to OTEL - oltpMetrics, err := NewOtlpMetricStore(h.logger, h.otelMeterProvider, h.baseAttributes) + oltpMetrics, err := NewOtlpMetricStore(h.logger, h.otelMeterProvider) if err != nil { return nil, err } @@ -107,7 +117,7 @@ func NewStore(opts ...Option) (Store, error) { h.otlpRequestMetrics = oltpMetrics // Create prometheus metrics exported to Prometheus scrape endpoint - promMetrics, err := NewPromMetricStore(h.logger, h.promMeterProvider, h.baseAttributes) + promMetrics, err := NewPromMetricStore(h.logger, h.promMeterProvider) if err != nil { return nil, err } @@ -142,9 +152,9 @@ func (h *Metrics) MeasureResponseSize(ctx context.Context, size int64, attr ...a h.promRequestMetrics.MeasureResponseSize(ctx, size, attr...) } -func (h *Metrics) MeasureLatency(ctx context.Context, requestStartTime time.Time, attr ...attribute.KeyValue) { - h.otlpRequestMetrics.MeasureLatency(ctx, requestStartTime, attr...) - h.promRequestMetrics.MeasureLatency(ctx, requestStartTime, attr...) +func (h *Metrics) MeasureLatency(ctx context.Context, latency time.Duration, attr ...attribute.KeyValue) { + h.otlpRequestMetrics.MeasureLatency(ctx, latency, attr...) + h.promRequestMetrics.MeasureLatency(ctx, latency, attr...) } func (h *Metrics) MeasureRequestError(ctx context.Context, attr ...attribute.KeyValue) { @@ -152,6 +162,11 @@ func (h *Metrics) MeasureRequestError(ctx context.Context, attr ...attribute.Key h.promRequestMetrics.MeasureRequestError(ctx, attr...) } +func (h *Metrics) MeasureOperationPlanningTime(ctx context.Context, planningTime time.Duration, attr ...attribute.KeyValue) { + h.otlpRequestMetrics.MeasureOperationPlanningTime(ctx, planningTime, attr...) + h.promRequestMetrics.MeasureOperationPlanningTime(ctx, planningTime, attr...) +} + // Flush flushes the metrics to the backend synchronously. func (h *Metrics) Flush(ctx context.Context) error { @@ -179,13 +194,6 @@ func (h *Metrics) Shutdown(ctx context.Context) error { return err } -// WithAttributes adds attributes to the base attributes -func WithAttributes(attrs ...attribute.KeyValue) Option { - return func(h *Metrics) { - h.baseAttributes = append(h.baseAttributes, attrs...) - } -} - func WithLogger(logger *zap.Logger) Option { return func(h *Metrics) { h.logger = logger diff --git a/router/pkg/metric/noop_metrics.go b/router/pkg/metric/noop_metrics.go index d9fba5a8c1..afdb8ebb09 100644 --- a/router/pkg/metric/noop_metrics.go +++ b/router/pkg/metric/noop_metrics.go @@ -11,7 +11,8 @@ import ( type NoopMetrics struct{} -func (n NoopMetrics) MeasureRequestError(ctx context.Context, attr ...attribute.KeyValue) {} +func (n NoopMetrics) MeasureRequestError(ctx context.Context, attr ...attribute.KeyValue) { +} func NewNoopMetrics() Store { return &NoopMetrics{} @@ -21,7 +22,8 @@ func (n NoopMetrics) MeasureInFlight(ctx context.Context, attr ...attribute.KeyV return func() {} } -func (n NoopMetrics) MeasureRequestCount(ctx context.Context, attr ...attribute.KeyValue) {} +func (n NoopMetrics) MeasureRequestCount(ctx context.Context, attr ...attribute.KeyValue) { +} func (n NoopMetrics) MeasureRequestSize(ctx context.Context, contentLength int64, attr ...attribute.KeyValue) { } @@ -29,7 +31,10 @@ func (n NoopMetrics) MeasureRequestSize(ctx context.Context, contentLength int64 func (n NoopMetrics) MeasureResponseSize(ctx context.Context, size int64, attr ...attribute.KeyValue) { } -func (n NoopMetrics) MeasureLatency(ctx context.Context, requestStartTime time.Time, attr ...attribute.KeyValue) { +func (n NoopMetrics) MeasureLatency(ctx context.Context, latency time.Duration, attr ...attribute.KeyValue) { +} + +func (n NoopMetrics) MeasureOperationPlanningTime(ctx context.Context, planningTime time.Duration, attr ...attribute.KeyValue) { } func (n NoopMetrics) Flush(ctx context.Context) error { diff --git a/router/pkg/metric/otlp_metric_store.go b/router/pkg/metric/otlp_metric_store.go index dad4b560da..0b8a748be0 100644 --- a/router/pkg/metric/otlp_metric_store.go +++ b/router/pkg/metric/otlp_metric_store.go @@ -15,25 +15,23 @@ const ( ) type OtlpMetricStore struct { - meter otelmetric.Meter - baseAttributes []attribute.KeyValue - meterProvider *metric.MeterProvider - logger *zap.Logger + meter otelmetric.Meter + meterProvider *metric.MeterProvider + logger *zap.Logger measurements *Measurements } -func NewOtlpMetricStore(logger *zap.Logger, meterProvider *metric.MeterProvider, baseAttributes []attribute.KeyValue) (Provider, error) { +func NewOtlpMetricStore(logger *zap.Logger, meterProvider *metric.MeterProvider) (Provider, error) { meter := meterProvider.Meter(cosmoRouterMeterName, otelmetric.WithInstrumentationVersion(cosmoRouterMeterVersion), ) m := &OtlpMetricStore{ - meter: meter, - baseAttributes: baseAttributes, - logger: logger, - meterProvider: meterProvider, + meter: meter, + logger: logger, + meterProvider: meterProvider, } measures, err := createMeasures(meter) @@ -47,89 +45,75 @@ func NewOtlpMetricStore(logger *zap.Logger, meterProvider *metric.MeterProvider, } func (h *OtlpMetricStore) MeasureInFlight(ctx context.Context, attr ...attribute.KeyValue) func() { - var baseKeys []attribute.KeyValue - - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) - - baseAttributes := otelmetric.WithAttributes(baseKeys...) + attributes := make([]attribute.KeyValue, 0, len(attr)) + attributes = append(attributes, attr...) + attributeAddOpt := otelmetric.WithAttributes(attributes...) if c, ok := h.measurements.upDownCounters[InFlightRequestsUpDownCounter]; ok { - c.Add(ctx, 1, baseAttributes) + c.Add(ctx, 1, attributeAddOpt) } return func() { if c, ok := h.measurements.upDownCounters[InFlightRequestsUpDownCounter]; ok { - c.Add(ctx, -1, baseAttributes) + c.Add(ctx, -1, attributeAddOpt) } } } func (h *OtlpMetricStore) MeasureRequestCount(ctx context.Context, attr ...attribute.KeyValue) { - var baseKeys []attribute.KeyValue - - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) - - baseAttributes := otelmetric.WithAttributes(baseKeys...) - if c, ok := h.measurements.counters[RequestCounter]; ok { - c.Add(ctx, 1, baseAttributes) + attributes := make([]attribute.KeyValue, 0, len(attr)) + attributes = append(attributes, attr...) + c.Add(ctx, 1, otelmetric.WithAttributes(attributes...)) } } func (h *OtlpMetricStore) MeasureRequestSize(ctx context.Context, contentLength int64, attr ...attribute.KeyValue) { - var baseKeys []attribute.KeyValue - - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) - - baseAttributes := otelmetric.WithAttributes(baseKeys...) - if c, ok := h.measurements.counters[RequestContentLengthCounter]; ok { - c.Add(ctx, contentLength, baseAttributes) + attributes := make([]attribute.KeyValue, 0, len(attr)) + attributes = append(attributes, attr...) + c.Add(ctx, contentLength, otelmetric.WithAttributes(attributes...)) } } func (h *OtlpMetricStore) MeasureResponseSize(ctx context.Context, size int64, attr ...attribute.KeyValue) { - var baseKeys []attribute.KeyValue - - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) - - baseAttributes := otelmetric.WithAttributes(baseKeys...) - if c, ok := h.measurements.counters[ResponseContentLengthCounter]; ok { - c.Add(ctx, size, baseAttributes) + attributes := make([]attribute.KeyValue, 0, len(attr)) + attributes = append(attributes, attr...) + c.Add(ctx, size, otelmetric.WithAttributes(attributes...)) } } -func (h *OtlpMetricStore) MeasureLatency(ctx context.Context, requestStartTime time.Time, attr ...attribute.KeyValue) { - var baseKeys []attribute.KeyValue - - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) - - baseAttributes := otelmetric.WithAttributes(baseKeys...) +func (h *OtlpMetricStore) MeasureLatency(ctx context.Context, latency time.Duration, attr ...attribute.KeyValue) { + if c, ok := h.measurements.histograms[ServerLatencyHistogram]; ok { + attributes := make([]attribute.KeyValue, 0, len(attr)) + attributes = append(attributes, attr...) - // Use floating point division here for higher precision (instead of Millisecond method). - elapsedTime := float64(time.Since(requestStartTime)) / float64(time.Millisecond) + // Use floating point division here for higher precision (instead of Millisecond method). + elapsedTime := float64(latency) / float64(time.Millisecond) - if c, ok := h.measurements.histograms[ServerLatencyHistogram]; ok { - c.Record(ctx, elapsedTime, baseAttributes) + c.Record(ctx, elapsedTime, otelmetric.WithAttributes(attributes...)) } } func (h *OtlpMetricStore) MeasureRequestError(ctx context.Context, attr ...attribute.KeyValue) { - var baseKeys []attribute.KeyValue + if c, ok := h.measurements.counters[RequestError]; ok { + attributes := make([]attribute.KeyValue, 0, len(attr)) + attributes = append(attributes, attr...) - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) + c.Add(ctx, 1, otelmetric.WithAttributes(attributes...)) + } +} - baseAttributes := otelmetric.WithAttributes(baseKeys...) +func (h *OtlpMetricStore) MeasureOperationPlanningTime(ctx context.Context, planningTime time.Duration, attr ...attribute.KeyValue) { + if c, ok := h.measurements.histograms[OperationPlanningTime]; ok { + attributes := make([]attribute.KeyValue, 0, len(attr)) + attributes = append(attributes, attr...) - if c, ok := h.measurements.counters[RequestError]; ok { - c.Add(ctx, 1, baseAttributes) + // Use floating point division here for higher precision (instead of Millisecond method). + elapsedTime := float64(planningTime) / float64(time.Millisecond) + + c.Record(ctx, elapsedTime, otelmetric.WithAttributes(attributes...)) } } diff --git a/router/pkg/metric/prom_metric_store.go b/router/pkg/metric/prom_metric_store.go index 3475179a1d..ac9e4d4b5e 100644 --- a/router/pkg/metric/prom_metric_store.go +++ b/router/pkg/metric/prom_metric_store.go @@ -15,25 +15,23 @@ const ( ) type PromMetricStore struct { - meter otelmetric.Meter - baseAttributes []attribute.KeyValue - meterProvider *metric.MeterProvider - logger *zap.Logger + meter otelmetric.Meter + meterProvider *metric.MeterProvider + logger *zap.Logger measurements *Measurements } -func NewPromMetricStore(logger *zap.Logger, meterProvider *metric.MeterProvider, baseAttributes []attribute.KeyValue) (Provider, error) { +func NewPromMetricStore(logger *zap.Logger, meterProvider *metric.MeterProvider) (Provider, error) { meter := meterProvider.Meter(cosmoRouterPrometheusMeterName, otelmetric.WithInstrumentationVersion(cosmoRouterPrometheusMeterVersion), ) m := &PromMetricStore{ - meter: meter, - baseAttributes: baseAttributes, - logger: logger, - meterProvider: meterProvider, + meter: meter, + logger: logger, + meterProvider: meterProvider, } measures, err := createMeasures(meter) @@ -47,89 +45,126 @@ func NewPromMetricStore(logger *zap.Logger, meterProvider *metric.MeterProvider, } func (h *PromMetricStore) MeasureInFlight(ctx context.Context, attr ...attribute.KeyValue) func() { - var baseKeys []attribute.KeyValue - - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) - - baseAttributes := otelmetric.WithAttributes(baseKeys...) + dimensionsSet, otherAttributes := filterStringSliceAttr(attr) + otherAttributes = append(otherAttributes, otherAttributes...) + attributeAddOpt := otelmetric.WithAttributes(otherAttributes...) if c, ok := h.measurements.upDownCounters[InFlightRequestsUpDownCounter]; ok { - c.Add(ctx, 1, baseAttributes) + if len(dimensionsSet) == 0 { + c.Add(ctx, 1, attributeAddOpt) + } else { + explodeMetric(ctx, dimensionsSet, otherAttributes, func(ctx context.Context, attrs []attribute.KeyValue) { + c.Add(ctx, 1, otelmetric.WithAttributes(attrs...)) + }) + } } return func() { if c, ok := h.measurements.upDownCounters[InFlightRequestsUpDownCounter]; ok { - c.Add(ctx, -1, baseAttributes) + if len(dimensionsSet) == 0 { + c.Add(ctx, -1, attributeAddOpt) + return + } + explodeMetric(ctx, dimensionsSet, otherAttributes, func(ctx context.Context, attrs []attribute.KeyValue) { + c.Add(ctx, -1, otelmetric.WithAttributes(attrs...)) + }) } } } func (h *PromMetricStore) MeasureRequestCount(ctx context.Context, attr ...attribute.KeyValue) { - var baseKeys []attribute.KeyValue - - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) - - baseAttributes := otelmetric.WithAttributes(baseKeys...) - if c, ok := h.measurements.counters[RequestCounter]; ok { - c.Add(ctx, 1, baseAttributes) + dimensionsSet, otherAttributes := filterStringSliceAttr(attr) + otherAttributes = append(otherAttributes, otherAttributes...) + + if len(dimensionsSet) == 0 { + c.Add(ctx, 1, otelmetric.WithAttributes(otherAttributes...)) + return + } + explodeMetric(ctx, dimensionsSet, otherAttributes, func(ctx context.Context, attrs []attribute.KeyValue) { + c.Add(ctx, 1, otelmetric.WithAttributes(attrs...)) + }) } } func (h *PromMetricStore) MeasureRequestSize(ctx context.Context, contentLength int64, attr ...attribute.KeyValue) { - var baseKeys []attribute.KeyValue - - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) - - baseAttributes := otelmetric.WithAttributes(baseKeys...) - if c, ok := h.measurements.counters[RequestContentLengthCounter]; ok { - c.Add(ctx, contentLength, baseAttributes) + dimensionsSet, otherAttributes := filterStringSliceAttr(attr) + otherAttributes = append(otherAttributes, otherAttributes...) + + if len(dimensionsSet) == 0 { + c.Add(ctx, contentLength, otelmetric.WithAttributes(otherAttributes...)) + return + } + explodeMetric(ctx, dimensionsSet, otherAttributes, func(ctx context.Context, attrs []attribute.KeyValue) { + c.Add(ctx, contentLength, otelmetric.WithAttributes(attrs...)) + }) } } func (h *PromMetricStore) MeasureResponseSize(ctx context.Context, size int64, attr ...attribute.KeyValue) { - var baseKeys []attribute.KeyValue - - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) - - baseAttributes := otelmetric.WithAttributes(baseKeys...) - if c, ok := h.measurements.counters[ResponseContentLengthCounter]; ok { - c.Add(ctx, size, baseAttributes) + dimensionsSet, otherAttributes := filterStringSliceAttr(attr) + otherAttributes = append(otherAttributes, otherAttributes...) + + if len(dimensionsSet) == 0 { + c.Add(ctx, size, otelmetric.WithAttributes(otherAttributes...)) + return + } + explodeMetric(ctx, dimensionsSet, otherAttributes, func(ctx context.Context, attrs []attribute.KeyValue) { + c.Add(ctx, size, otelmetric.WithAttributes(attrs...)) + }) } } -func (h *PromMetricStore) MeasureLatency(ctx context.Context, requestStartTime time.Time, attr ...attribute.KeyValue) { - var baseKeys []attribute.KeyValue - - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) - - baseAttributes := otelmetric.WithAttributes(baseKeys...) +func (h *PromMetricStore) MeasureLatency(ctx context.Context, latency time.Duration, attr ...attribute.KeyValue) { + if c, ok := h.measurements.histograms[ServerLatencyHistogram]; ok { + dimensionsSet, otherAttributes := filterStringSliceAttr(attr) + otherAttributes = append(otherAttributes, otherAttributes...) - // Use floating point division here for higher precision (instead of Millisecond method). - elapsedTime := float64(time.Since(requestStartTime)) / float64(time.Millisecond) + // Use floating point division here for higher precision (instead of Millisecond method). + elapsedTime := float64(latency) / float64(time.Millisecond) - if c, ok := h.measurements.histograms[ServerLatencyHistogram]; ok { - c.Record(ctx, elapsedTime, baseAttributes) + if len(dimensionsSet) == 0 { + c.Record(ctx, elapsedTime, otelmetric.WithAttributes(otherAttributes...)) + return + } + explodeMetric(ctx, dimensionsSet, otherAttributes, func(ctx context.Context, attrs []attribute.KeyValue) { + c.Record(ctx, elapsedTime, otelmetric.WithAttributes(attrs...)) + }) } } func (h *PromMetricStore) MeasureRequestError(ctx context.Context, attr ...attribute.KeyValue) { - var baseKeys []attribute.KeyValue + dimensionsSet, otherAttributes := filterStringSliceAttr(attr) + otherAttributes = append(otherAttributes, otherAttributes...) - baseKeys = append(baseKeys, h.baseAttributes...) - baseKeys = append(baseKeys, attr...) + if c, ok := h.measurements.counters[RequestError]; ok { + if len(dimensionsSet) == 0 { + c.Add(ctx, 1, otelmetric.WithAttributes(otherAttributes...)) + return + } + explodeMetric(ctx, dimensionsSet, otherAttributes, func(ctx context.Context, attrs []attribute.KeyValue) { + c.Add(ctx, 1, otelmetric.WithAttributes(attrs...)) + }) + } +} - baseAttributes := otelmetric.WithAttributes(baseKeys...) +func (h *PromMetricStore) MeasureOperationPlanningTime(ctx context.Context, planningTime time.Duration, attr ...attribute.KeyValue) { + dimensionsSet, otherAttributes := filterStringSliceAttr(attr) + otherAttributes = append(otherAttributes, otherAttributes...) - if c, ok := h.measurements.counters[RequestError]; ok { - c.Add(ctx, 1, baseAttributes) + // Use floating point division here for higher precision (instead of Millisecond method). + elapsedTime := float64(planningTime) / float64(time.Millisecond) + + if c, ok := h.measurements.histograms[OperationPlanningTime]; ok { + if len(dimensionsSet) == 0 { + c.Record(ctx, elapsedTime, otelmetric.WithAttributes(otherAttributes...)) + return + } + explodeMetric(ctx, dimensionsSet, otherAttributes, func(ctx context.Context, attrs []attribute.KeyValue) { + c.Record(ctx, elapsedTime, otelmetric.WithAttributes(attrs...)) + }) } } @@ -140,3 +175,33 @@ func (h *PromMetricStore) Flush(ctx context.Context) error { func (h *PromMetricStore) Shutdown(ctx context.Context) error { return h.meterProvider.Shutdown(ctx) } + +// explodeMetric explodes the metric into multiple metrics with different label values in Prometheus. +func explodeMetric(ctx context.Context, sliceAttrs []attribute.KeyValue, extraAttrs []attribute.KeyValue, applyFunc func(ctx context.Context, attrs []attribute.KeyValue)) { + for _, attr := range sliceAttrs { + for _, v := range attr.Value.AsStringSlice() { + attrs := append(extraAttrs, attribute.KeyValue{ + Key: attr.Key, + Value: attribute.StringValue(v), + }) + applyFunc(ctx, attrs) + } + } +} + +// filterStringSliceAttr filters out the string slice attributes from the given attributes. +func isStringSliceAttr(kv attribute.KeyValue) bool { + return kv.Value.Type() == attribute.STRINGSLICE +} + +// filterStringSliceAttr filters out the string slice attributes from the given attributes. +func filterStringSliceAttr(kv []attribute.KeyValue) (stringSliceAttr []attribute.KeyValue, excludeAttr []attribute.KeyValue) { + for _, a := range kv { + if isStringSliceAttr(a) { + stringSliceAttr = append(stringSliceAttr, a) + } else { + excludeAttr = append(excludeAttr, a) + } + } + return +} diff --git a/router/pkg/trace/config.go b/router/pkg/trace/config.go index 748e3e83dd..63e52b70c7 100644 --- a/router/pkg/trace/config.go +++ b/router/pkg/trace/config.go @@ -5,7 +5,6 @@ import ( "github.com/wundergraph/cosmo/router/pkg/otel/otelconfig" "go.opentelemetry.io/otel/attribute" sdktrace "go.opentelemetry.io/otel/sdk/trace" - "net/http" "net/url" "time" ) @@ -64,11 +63,11 @@ type Config struct { ExportGraphQLVariables ExportGraphQLVariables Exporters []*ExporterConfig Propagators []Propagator - SpanAttributesMapper func(req *http.Request) []attribute.KeyValue ResourceAttributes []attribute.KeyValue // TestMemoryExporter is used for testing purposes. If set, the exporter will be used instead of the configured exporters. TestMemoryExporter sdktrace.SpanExporter ResponseTraceHeader config.ResponseTraceHeader + Attributes []config.CustomAttribute } func DefaultExporter(cfg *Config) *ExporterConfig { @@ -100,11 +99,11 @@ func DefaultConfig(serviceVersion string) *Config { Sampler: 1, WithNewRoot: false, ParentBasedSampler: true, + Attributes: make([]config.CustomAttribute, 0), ExportGraphQLVariables: ExportGraphQLVariables{ Enabled: true, }, - SpanAttributesMapper: nil, - ResourceAttributes: make([]attribute.KeyValue, 0), + ResourceAttributes: make([]attribute.KeyValue, 0), Exporters: []*ExporterConfig{ { Disabled: false,