diff --git a/charts/nopo11y/Chart.yaml b/charts/nopo11y/Chart.yaml
index 6989314..e0c047a 100644
--- a/charts/nopo11y/Chart.yaml
+++ b/charts/nopo11y/Chart.yaml
@@ -21,4 +21,4 @@ version: 1.0.2
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "1.0.1"
+appVersion: "1.1.0"
diff --git a/charts/nopo11y/templates/defaultAlerts.yaml b/charts/nopo11y/templates/defaultAlerts.yaml
index 3fe12b4..cd69fe8 100644
--- a/charts/nopo11y/templates/defaultAlerts.yaml
+++ b/charts/nopo11y/templates/defaultAlerts.yaml
@@ -6,13 +6,14 @@ metadata:
   name: {{ include "app.label" . }}-default-alert-rules
   labels:
     release: {{ .Values.prometheusReleaseLabel }}
+    managedby: nopo11y
 spec:
   groups:
   - name: {{ include "app.label" . }}-default-alert-rules
     rules:
 {{- if .Values.istioMetrics.enabled }}
     - alert: {{ include "app.label" . }}High5xxErrorRate
-      expr: sum(rate(istio_requests_total{app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", response_code=~"5.."}[5m])) by (instance) / sum(rate(istio_requests_total{app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}"}[5m])) by (instance) * 100 > {{ .Values.errorRate5xx }}
+      expr: sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", response_code=~"5.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}"}[5m])) by (instance) * 100 > {{ .Values.errorRate5xx }}
       annotations:
         description: {{ include "app.label" . }} service is experiencing high 5xx errors rate from last 5 minutes.
         summary: {{ include "app.label" . }} service is experiencing high 5xx error rate.
@@ -22,7 +23,7 @@ spec:
       labels:
         severity: critical
     - alert: {{ include "app.label" . }}High4xxErrorRate
-      expr: sum(rate(istio_requests_total{app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", response_code=~"4.."}[5m])) by (instance) / sum(rate(istio_requests_total{app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}"}[5m])) by (instance) * 100 > {{ .Values.errorRate4xx }}
+      expr: sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", response_code=~"4.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}"}[5m])) by (instance) * 100 > {{ .Values.errorRate4xx }}
       for: 5m
       annotations:
 {{- if .Values.grafanaURL }}
@@ -35,7 +36,7 @@ spec:
 {{- end }}
 {{- if .Values.nginxIngressMetrics.enabled }}
     - alert: {{ include "app.label" . }}IngressHigh5xxErrorRate
-      expr: sum(rate(nginx_ingress_controller_requests{ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",status=~"5.."}[5m])) / sum(rate(nginx_ingress_controller_requests{ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}"}[5m])) * 100 > {{ .Values.errorRate5xx }}
+      expr: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",status=~"5..", path="{{ .Values.nginxIngressMetrics.path }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}", path="{{ .Values.nginxIngressMetrics.path }}"}[5m])) * 100 > {{ .Values.errorRate5xx }}
       annotations:
         description: {{ include "app.label" . }} service is experiencing high 5xx errors rate from last 5 minutes.
         summary: {{ include "app.label" . }} is experiencing high 5xx error rate.
@@ -45,7 +46,7 @@ spec:
       labels:
         severity: critical
     - alert: {{ include "app.label" . }}IngressHigh4xxErrorRate
-      expr: sum(rate(nginx_ingress_controller_requests{ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",status=~"4.."}[5m])) / sum(rate(nginx_ingress_controller_requests{ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}"}[5m])) * 100 > {{ .Values.rrorRate4xx }}
+      expr: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",status=~"4..", path="{{ .Values.nginxIngressMetrics.path }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}", path="{{ .Values.nginxIngressMetrics.path }}"}[5m])) * 100 > {{ .Values.errorRate4xx }}
       for: 10m
       annotations:
         description: {{ include "app.label" . }} service is experiencing high 4xx errors rate from last 5 minutes.
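A note on the guard used throughout these expressions: {{- if hasKey .Values "cluster" }} ... {{- end }} only emits the cluster matcher when a cluster key exists in the rendered values (hasKey tests key presence, so an explicit cluster: "" would still emit an empty matcher). As a rough sketch, assuming app.label renders to myapp and the values set cluster: prod-eu (both placeholder names), the Istio 5xx selector comes out roughly as:

# cluster key present in values:
istio_requests_total{cluster="prod-eu",app="myapp", destination_app=~"myapp", response_code=~"5.."}
# cluster key absent (the guard renders to nothing):
istio_requests_total{app="myapp", destination_app=~"myapp", response_code=~"5.."}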
diff --git a/charts/nopo11y/templates/defaultDashboard.yaml b/charts/nopo11y/templates/defaultDashboard.yaml
index 759c4d3..76de762 100644
--- a/charts/nopo11y/templates/defaultDashboard.yaml
+++ b/charts/nopo11y/templates/defaultDashboard.yaml
@@ -147,7 +147,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(rate(nginx_ingress_controller_requests{ingress=\"{{ .Values.nginxIngressMetrics.ingressName }}\"}[$__rate_interval]))",
+          "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval]))",
           "legendFormat": "Requests/sec",
           "range": true,
           "refId": "A"
@@ -241,7 +241,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(rate(nginx_ingress_controller_request_duration_seconds_sum{ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_request_duration_seconds_count{ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\"}[$__rate_interval]))",
+          "expr": "sum(rate(nginx_ingress_controller_request_duration_seconds_sum{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_request_duration_seconds_count{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval]))",
           "legendFormat": "Latency",
           "range": true,
           "refId": "A"
@@ -335,7 +335,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(rate(nginx_ingress_controller_requests{ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\",status=~\"5..\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\"}[$__rate_interval])) * 100",
+          "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\",status=~\"5..\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) * 100",
           "legendFormat": "5xx error rate",
           "range": true,
           "refId": "A"
@@ -429,7 +429,7 @@ data:
             "uid": "prometheus"
          },
           "editorMode": "code",
-          "expr": "sum(rate(nginx_ingress_controller_requests{ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\",status=~\"2..|4..\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\"}[$__rate_interval])) * 100",
+          "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\",status=~\"2..|4..\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) * 100",
           "legendFormat": "Success ",
           "range": true,
           "refId": "A"
@@ -440,7 +440,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(rate(nginx_ingress_controller_requests{ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\",status=~\"5..\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\"}[$__rate_interval])) * 100",
+          "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\",status=~\"5..\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) * 100",
           "hide": false,
           "legendFormat": "Error",
           "range": true,
@@ -535,7 +535,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum (rate(nginx_ingress_controller_response_size_sum{ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\"}[$__rate_interval]))/sum(rate(nginx_ingress_controller_response_size_count{ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\"}[$__rate_interval]))",
+          "expr": "sum (rate(nginx_ingress_controller_response_size_sum{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval]))/sum(rate(nginx_ingress_controller_response_size_count{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval]))",
           "legendFormat": "Response Size",
           "range": true,
           "refId": "A"
@@ -639,7 +639,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(rate(istio_requests_total{app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval]))",
+          "expr": "sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval]))",
           "legendFormat": "Requests/sec",
           "range": true,
           "refId": "A"
@@ -733,7 +733,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(rate(istio_request_duration_milliseconds_sum{app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) / sum(rate(istio_request_duration_milliseconds_count{app=\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval]))",
+          "expr": "sum(rate(istio_request_duration_milliseconds_sum{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) / sum(rate(istio_request_duration_milliseconds_count{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval]))",
           "legendFormat": "Latency",
           "range": true,
           "refId": "A"
@@ -827,7 +827,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(rate(istio_requests_total{app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) * 100",
+          "expr": "sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) * 100",
           "legendFormat": "5xx error rate",
           "range": true,
           "refId": "A"
@@ -921,7 +921,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(rate(istio_requests_total{app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\", response_code=~\"2..|4..\"}[$__rate_interval])) / sum(rate(istio_requests_total{app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) * 100",
+          "expr": "sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\", response_code=~\"2..|4..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) * 100",
           "legendFormat": "Success ",
           "range": true,
           "refId": "A"
@@ -932,7 +932,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(rate(istio_requests_total{app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) * 100",
+          "expr": "sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) * 100",
           "hide": false,
           "legendFormat": "Error",
           "range": true,
@@ -1028,7 +1028,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(rate(istio_response_bytes_sum{app=\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) / sum(rate(istio_response_bytes_count{app=\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval]))",
+          "expr": "sum(rate(istio_response_bytes_sum{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) / sum(rate(istio_response_bytes_count{app=\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval]))",
           "legendFormat": "Response Size",
           "range": true,
           "refId": "A"
@@ -1136,7 +1136,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(\r\n container_memory_working_set_bytes{namespace=\"{{ .Release.Namespace }}\", container!=\"\", image!=\"\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"memory\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100",
+          "expr": "sum(\r\n container_memory_working_set_bytes{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", container!=\"\", image!=\"\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"memory\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
@@ -1230,7 +1230,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(\r\n container_memory_working_set_bytes{namespace=\"{{ .Release.Namespace }}\", container!=\"\", image!=\"\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"memory\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100",
+          "expr": "sum(\r\n container_memory_working_set_bytes{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", container!=\"\", image!=\"\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_limits{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"memory\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
@@ -1324,7 +1324,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(\r\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"{{ .Release.Namespace }}\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_requests{job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"cpu\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100",
+          "expr": "sum(\r\n rate(container_cpu_usage_seconds_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\"}[5m])\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"cpu\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
@@ -1418,7 +1418,7 @@ data:
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "sum(\r\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"{{ .Release.Namespace }}\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_limits{job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"cpu\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100",
+          "expr": "sum(\r\n rate(container_cpu_usage_seconds_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\"}[5m])\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_limits{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"cpu\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
@@ -1471,7 +1471,7 @@ data:
             "uid": "P8E80F9AEF21F6940"
           },
           "editorMode": "code",
-          "expr": "{ {{- if and .Values.logLabel .Values.logLabelValue }} {{ .Values.logLabel }}=\"{{ .Values.logLabelValue }}\"{{- else }} app=\"{{ include "app.label" . }}\"{{- end }}, container!=\"istio-proxy\"} |= ``",
+          "expr": "{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}{{- if and .Values.logLabel .Values.logLabelValue }} {{ .Values.logLabel }}=\"{{ .Values.logLabelValue }}\"{{- else }} app=\"{{ include "app.label" . }}\"{{- end }}, container!=\"istio-proxy\"} |= ``",
           "queryType": "range",
           "refId": "A"
         }
diff --git a/charts/nopo11y/templates/defaultSLOs.yaml b/charts/nopo11y/templates/defaultSLOs.yaml
index faba6e4..1172b77 100644
--- a/charts/nopo11y/templates/defaultSLOs.yaml
+++ b/charts/nopo11y/templates/defaultSLOs.yaml
@@ -5,6 +5,7 @@ kind: PrometheusServiceLevel
 metadata:
   labels:
     release: {{ .Values.prometheusReleaseLabel }}
+    managedby: nopo11y
   name: {{ include "app.label" . }}-availability-slo
   namespace: {{ .Values.namespace }}
 spec:
@@ -44,14 +45,15 @@ spec:
       objective: {{ .Values.availabilitySLO }}
       sli:
         events:
-          errorQuery: sum(rate(istio_requests_total{app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", response_code=~"5.."}[{{ printf "{{.window}}" }}]))
-          totalQuery: sum(rate(istio_requests_total{app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}"}[{{ printf "{{.window}}" }}]))
+          errorQuery: sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", response_code=~"5.."}[{{ printf "{{.window}}" }}]))
+          totalQuery: sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}"}[{{ printf "{{.window}}" }}]))
 ---
 apiVersion: sloth.slok.dev/v1
 kind: PrometheusServiceLevel
 metadata:
   labels:
     release: {{ .Values.prometheusReleaseLabel }}
+    managedby: nopo11y
   name: {{ include "app.label" . }}-latency-slo
   namespace: {{ .Values.namespace }}
 spec:
@@ -91,8 +93,8 @@ spec:
       objective: {{ .Values.latencySLO }}
       sli:
         events:
-          errorQuery: (sum(rate(istio_request_duration_milliseconds_bucket{app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", le="+Inf"}[{{ printf "{{.window}}" }}])) - sum(rate(istio_request_duration_milliseconds_bucket{app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", le="{{ .Values.latency }}"}[{{ printf "{{.window}}" }}])))
-          totalQuery: sum(rate(istio_request_duration_milliseconds_bucket{app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", le="+Inf"}[{{ printf "{{.window}}" }}]))
+          errorQuery: (sum(rate(istio_request_duration_milliseconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", le="+Inf"}[{{ printf "{{.window}}" }}])) - sum(rate(istio_request_duration_milliseconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", le="{{ .Values.latency }}"}[{{ printf "{{.window}}" }}])))
+          totalQuery: sum(rate(istio_request_duration_milliseconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", le="+Inf"}[{{ printf "{{.window}}" }}]))
 {{- end }}
 {{- end }}
 ---
@@ -103,6 +105,7 @@ kind: PrometheusServiceLevel
 metadata:
   labels:
     release: {{ .Values.prometheusReleaseLabel }}
+    managedby: nopo11y
   name: {{ include "app.label" . }}-ingress-availability-slo
   namespace: {{ .Values.namespace }}
 spec:
@@ -142,14 +145,15 @@ spec:
      objective: {{ .Values.availabilitySLO }}
       sli:
         events:
-          errorQuery: sum(rate(nginx_ingress_controller_requests{ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",status=~"5.."}[{{ printf "{{.window}}" }}]))
-          totalQuery: sum(rate(nginx_ingress_controller_requests{ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}"}[{{ printf "{{.window}}" }}]))
+          errorQuery: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",status=~"5..", path="{{ .Values.nginxIngressMetrics.path }}"}[{{ printf "{{.window}}" }}]))
+          totalQuery: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}", path="{{ .Values.nginxIngressMetrics.path }}"}[{{ printf "{{.window}}" }}]))
 ---
 apiVersion: sloth.slok.dev/v1
 kind: PrometheusServiceLevel
 metadata:
   labels:
     release: {{ .Values.prometheusReleaseLabel }}
+    managedby: nopo11y
   name: {{ include "app.label" . }}-ingress-latency-slo
   namespace: {{ .Values.namespace }}
 spec:
@@ -165,11 +169,11 @@ spec:
           dashboard: {{ .Values.grafanaURL }}/d/slo-detail?var-service={{ include "app.label" . }}
 {{- end }}
           summary: SLO to measure response time - {{ .Values.latencySLO }}% of the time requests should
-            be succesfully served in < 1s. When you receive this alert it means that
+            be succesfully served in < {{ .Values.latency }}ms. When you receive this alert it means that
             the SLO is at risk as your error budget is getting exhausted. To know more about
             ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/
           description: SLO to measure response time - {{ .Values.latencySLO }}% of the time requests should
-            be succesfully served in < 1s. When you receive this alert it means that
+            be succesfully served in < {{ .Values.latency }}ms. When you receive this alert it means that
             the SLO is at risk as your error budget is getting exhausted. To know more about
             ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/
         name: {{ include "app.label" . }}-ingress - latency SLO is at RISK
@@ -182,14 +186,14 @@ spec:
           alert_type: symptom
           severity: warning
         description: SLO to measure response time - {{ .Values.latencySLO }}% of the time requests should
-          be succesfully served in < 1s.. When you receive this alert it means that the
+          be succesfully served in < {{ .Values.latency }}ms. When you receive this alert it means that the
           SLO is at risk as your error budget is getting exhausted. To know more about
           ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/
       name: latency-{{ include "app.label" . }}-ingress
       objective: {{ .Values.latencySLO }}
       sli:
         events:
-          errorQuery: (sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",le="+Inf"}[{{ printf "{{.window}}" }}])) - sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",le="1"}[{{ printf "{{.window}}" }}])))
-          totalQuery: sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",le="+Inf"}[{{ printf "{{.window}}" }}]))
+          errorQuery: (sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",le="+Inf", path="{{ .Values.nginxIngressMetrics.path }}"}[{{ printf "{{.window}}" }}])) - sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",le="{{ divf .Values.latency 1000 }}", path="{{ .Values.nginxIngressMetrics.path }}"}[{{ printf "{{.window}}" }}])))
+          totalQuery: sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",le="+Inf", path="{{ .Values.nginxIngressMetrics.path }}"}[{{ printf "{{.window}}" }}]))
+{{- end }}
 {{- end }}
-{{- end }}
\ No newline at end of file
diff --git a/charts/nopo11y/values.yaml b/charts/nopo11y/values.yaml
index 59b574e..48e6933 100644
--- a/charts/nopo11y/values.yaml
+++ b/charts/nopo11y/values.yaml
@@ -16,4 +16,5 @@ istioMetrics:
   enabled: true
 nginxIngressMetrics:
   enabled: false
-  ingressName: "sample-ingress"
\ No newline at end of file
+  ingressName: "sample-ingress"
+  path: "/"
\ No newline at end of file
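Taken together, the change introduces two new chart inputs, an optional top-level cluster value and nginxIngressMetrics.path, and reuses .Values.latency for the SLO wording and bucket selection. A minimal sketch of an override file exercising them (the cluster name, ingress name, and the 500 ms figure are illustrative placeholders, not chart defaults):

# my-values.yaml -- illustrative overrides for the inputs touched by this change
cluster: "prod-eu-1"            # optional; omit the key entirely and the cluster matcher disappears from every query
latency: 500                    # milliseconds; used as le="500" for Istio and, via divf, le="0.5" for the ingress histogram
nginxIngressMetrics:
  enabled: true
  ingressName: "myapp-ingress"
  path: "/"                     # new: matched on every nginx_ingress_controller_* query

Rendering the chart twice with helm template (once with the cluster key set, once with it removed) is a quick way to confirm that the conditional matcher, the path label, and the managedby: nopo11y label land in the alerts, dashboard, and SLOs as intended.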