From e80710cb0bc5ca8dbaeed6224179a73502bf1f7b Mon Sep 17 00:00:00 2001 From: Roman Hros Date: Fri, 2 Feb 2024 09:05:17 +0100 Subject: [PATCH] Add missing cluster labels and aggregations for apiserver alerts Also, remove unused variable kubeAPILatencyWarningSeconds, which was not deleted in kubernetes-monitoring/kubernetes-mixin#451 Signed-off-by: Roman Hros --- alerts/kube_apiserver.libsonnet | 17 +++++++++-------- tests.yaml | 5 +++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/alerts/kube_apiserver.libsonnet b/alerts/kube_apiserver.libsonnet index 5639454d5..55448ac2b 100644 --- a/alerts/kube_apiserver.libsonnet +++ b/alerts/kube_apiserver.libsonnet @@ -4,8 +4,6 @@ local utils = import '../lib/utils.libsonnet'; _config+:: { kubeApiserverSelector: error 'must provide selector for kube-apiserver', - kubeAPILatencyWarningSeconds: 1, - certExpirationWarningSeconds: 7 * 24 * 3600, certExpirationCriticalSeconds: 1 * 24 * 3600, }, @@ -18,13 +16,16 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeAPIErrorBudgetBurn', expr: ||| - sum(apiserver_request:burnrate%s) > (%.2f * %.5f) - and - sum(apiserver_request:burnrate%s) > (%.2f * %.5f) + sum by(%s) (apiserver_request:burnrate%s) > (%.2f * %.5f) + and on(%s) + sum by(%s) (apiserver_request:burnrate%s) > (%.2f * %.5f) ||| % [ + $._config.clusterLabel, w.long, w.factor, (1 - $._config.SLOs.apiserver.target), + $._config.clusterLabel, + $._config.clusterLabel, w.short, w.factor, (1 - $._config.SLOs.apiserver.target), @@ -49,7 +50,7 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeClientCertificateExpiration', expr: ||| - apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationWarningSeconds)s + apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(%(clusterLabel)s, job) histogram_quantile(0.01, sum by (%(clusterLabel)s, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationWarningSeconds)s ||| % $._config, 'for': '5m', labels: { @@ -63,7 +64,7 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeClientCertificateExpiration', expr: ||| - apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationCriticalSeconds)s + apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(%(clusterLabel)s, job) histogram_quantile(0.01, sum by (%(clusterLabel)s, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationCriticalSeconds)s ||| % $._config, 'for': '5m', labels: { @@ -108,7 +109,7 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeAPITerminatedRequests', expr: ||| - sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) / ( sum(rate(apiserver_request_total{%(kubeApiserverSelector)s}[10m])) + sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) ) > 0.20 + sum by(%(clusterLabel)s) (rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) / ( sum by(%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s}[10m])) + sum by(%(clusterLabel)s) (rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) ) > 0.20 ||| % $._config, labels: { severity: 'warning', diff --git a/tests.yaml b/tests.yaml index f1893b63c..82dd4bb60 100644 --- a/tests.yaml +++ b/tests.yaml @@ -1125,9 +1125,9 @@ tests: - interval: 1m input_series: - - series: 'apiserver_request_terminations_total{job="kube-apiserver",apiserver="kube-apiserver"}' + - series: 'apiserver_request_terminations_total{cluster="kubernetes",job="kube-apiserver",apiserver="kube-apiserver"}' values: '1+1x10' - - series: 'apiserver_request_total{job="kube-apiserver",apiserver="kube-apiserver"}' + - series: 'apiserver_request_total{cluster="kubernetes",job="kube-apiserver",apiserver="kube-apiserver"}' values: '1+2x10' alert_rule_test: - eval_time: 5m # alert hasn't fired @@ -1137,6 +1137,7 @@ tests: exp_alerts: - exp_labels: severity: warning + cluster: "kubernetes" exp_annotations: summary: "The kubernetes apiserver has terminated 33.33% of its incoming requests." description: "The kubernetes apiserver has terminated 33.33% of its incoming requests."