diff --git a/alerts/apps_alerts.libsonnet b/alerts/apps_alerts.libsonnet index 1edac9b71..675f5aeec 100644 --- a/alerts/apps_alerts.libsonnet +++ b/alerts/apps_alerts.libsonnet @@ -23,7 +23,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").', + description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff")%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Pod is crash looping.', }, 'for': '15m', @@ -47,7 +49,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.', + description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Pod has been in a non-ready state for more than 15 minutes.', }, 'for': '15m', @@ -63,7 +67,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.', + description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Deployment generation mismatch due to possible roll-back', }, 'for': '15m', @@ -85,7 +91,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.', + description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Deployment has not matched the expected number of replicas.', }, 'for': '15m', @@ -100,7 +108,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.', + description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Deployment rollout is not progressing.', }, 'for': '15m', @@ -122,7 +132,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.', + description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'StatefulSet has not matched the expected number of replicas.', }, 'for': '15m', @@ -138,7 +150,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.', + description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'StatefulSet generation mismatch due to possible roll-back', }, 'for': '15m', @@ -168,7 +182,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.', + description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'StatefulSet update has not been rolled out.', }, 'for': '15m', @@ -205,7 +221,10 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %(kubeDaemonSetRolloutStuckFor)s.' % $._config, + description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %s%s.' % [ + $._config.kubeDaemonSetRolloutStuckFor, + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'DaemonSet rollout is stuck.', }, 'for': $._config.kubeDaemonSetRolloutStuckFor, @@ -218,7 +237,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").', + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}")%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Pod container waiting longer than 1 hour', }, 'for': '1h', @@ -235,7 +256,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.', + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'DaemonSet pods are not scheduled.', }, 'for': '10m', @@ -249,7 +272,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.', + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'DaemonSet pods are misscheduled.', }, 'for': '15m', @@ -265,7 +290,10 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%(kubeJobTimeoutDuration)s" | humanizeDuration }} to complete.' % $._config, + description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "%s" | humanizeDuration }} to complete%s.' % [ + $._config.kubeJobTimeoutDuration, + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Job did not complete in time', }, }, @@ -279,7 +307,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.', + description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Job failed to complete.', }, }, @@ -303,7 +333,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.', + description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'HPA has not matched desired number of replicas.', }, 'for': '15m', @@ -319,7 +351,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.', + description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'HPA is running at max replicas', }, 'for': '15m', diff --git a/alerts/kube_apiserver.libsonnet b/alerts/kube_apiserver.libsonnet index d67bbf3ed..e22bd8688 100644 --- a/alerts/kube_apiserver.libsonnet +++ b/alerts/kube_apiserver.libsonnet @@ -36,7 +36,9 @@ local utils = import '../lib/utils.libsonnet'; long: '%(long)s' % w, }, annotations: { - description: 'The API server is burning too much error budget.', + description: 'The API server is burning too much error budget%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'The API server is burning too much error budget.', }, 'for': '%(for)s' % w, @@ -111,7 +113,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.', + description: 'Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}%% available over the last 10m%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Kubernetes aggregated API is down.', }, }, @@ -128,7 +132,9 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.', + description: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.', }, 'for': '5m', diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet index 2b206dec1..1bc4ea558 100644 --- a/alerts/kubelet.libsonnet +++ b/alerts/kubelet.libsonnet @@ -1,3 +1,5 @@ +local utils = import '../lib/utils.libsonnet'; + { _config+:: { kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics', @@ -25,7 +27,9 @@ severity: 'warning', }, annotations: { - description: '{{ $labels.node }} has been unready for more than 15 minutes.', + description: '{{ $labels.node }} has been unready for more than 15 minutes%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Node is not ready.', }, 'for': '15m', @@ -41,7 +45,9 @@ severity: 'warning', }, annotations: { - description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.', + description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Node is unreachable.', }, 'for': '15m', @@ -52,12 +58,16 @@ // Some node has a capacity of 1 like AWS's Fargate and only exists while a pod is running on it. // We have to ignore this special node in the KubeletTooManyPods alert. expr: ||| - count by(%(clusterLabel)s, node) ( - (kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Running"} == 1) * on(instance,pod,namespace,%(clusterLabel)s) group_left(node) topk by(instance,pod,namespace,%(clusterLabel)s) (1, kube_pod_info{%(kubeStateMetricsSelector)s}) + count by (%(clusterLabel)s, node) ( + (kube_pod_status_phase{%(kubeStateMetricsSelector)s, phase="Running"} == 1) + * on (%(clusterLabel)s, namespace, pod) group_left (node) + group by (%(clusterLabel)s, namespace, pod, node) ( + kube_pod_info{%(kubeStateMetricsSelector)s} + ) ) / - max by(%(clusterLabel)s, node) ( - kube_node_status_capacity{%(kubeStateMetricsSelector)s,resource="pods"} != 1 + max by (%(clusterLabel)s, node) ( + kube_node_status_capacity{%(kubeStateMetricsSelector)s, resource="pods"} != 1 ) > 0.95 ||| % $._config, 'for': '15m', @@ -65,7 +75,9 @@ severity: 'info', }, annotations: { - description: "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.", + description: "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity%s." % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Kubelet is running at capacity.', }, }, @@ -79,7 +91,9 @@ severity: 'warning', }, annotations: { - description: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.', + description: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Node readiness status is flapping.', }, }, @@ -93,7 +107,9 @@ severity: 'warning', }, annotations: { - description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.', + description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Kubelet Pod Lifecycle Event Generator is taking too long to relist.', }, }, @@ -107,7 +123,9 @@ severity: 'warning', }, annotations: { - description: 'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.', + description: 'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Kubelet Pod startup latency is too high.', }, }, @@ -120,7 +138,9 @@ severity: 'warning', }, annotations: { - description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.', + description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Kubelet client certificate is about to expire.', }, }, @@ -133,7 +153,9 @@ severity: 'critical', }, annotations: { - description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.', + description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Kubelet client certificate is about to expire.', }, }, @@ -146,7 +168,9 @@ severity: 'warning', }, annotations: { - description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.', + description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Kubelet server certificate is about to expire.', }, }, @@ -159,7 +183,9 @@ severity: 'critical', }, annotations: { - description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.', + description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Kubelet server certificate is about to expire.', }, }, @@ -173,7 +199,9 @@ }, 'for': '15m', annotations: { - description: 'Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).', + description: 'Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes)%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Kubelet has failed to renew its client certificate.', }, }, @@ -187,7 +215,9 @@ }, 'for': '15m', annotations: { - description: 'Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).', + description: 'Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Kubelet has failed to renew its server certificate.', }, }, diff --git a/alerts/resource_alerts.libsonnet b/alerts/resource_alerts.libsonnet index a8a355f76..75abb51af 100644 --- a/alerts/resource_alerts.libsonnet +++ b/alerts/resource_alerts.libsonnet @@ -1,3 +1,5 @@ +local utils = import '../lib/utils.libsonnet'; + { _config+:: { kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics', @@ -158,7 +160,9 @@ severity: 'info', }, annotations: { - description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.', + description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Namespace quota is going to be full.', }, }, @@ -175,7 +179,9 @@ severity: 'info', }, annotations: { - description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.', + description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Namespace quota is fully used.', }, }, @@ -192,7 +198,9 @@ severity: 'warning', }, annotations: { - description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.', + description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Namespace quota has exceeded the limits.', }, }, @@ -209,7 +217,9 @@ severity: 'info', }, annotations: { - description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.', + description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Processes experience elevated CPU throttling.', }, }, diff --git a/alerts/system_alerts.libsonnet b/alerts/system_alerts.libsonnet index 2a434a30f..950c9bb8e 100644 --- a/alerts/system_alerts.libsonnet +++ b/alerts/system_alerts.libsonnet @@ -1,3 +1,5 @@ +local utils = import '../lib/utils.libsonnet'; + { _config+:: { notKubeDnsCoreDnsSelector: 'job!~"kube-dns|coredns"', @@ -19,7 +21,9 @@ severity: 'warning', }, annotations: { - description: 'There are {{ $value }} different semantic versions of Kubernetes components running.', + description: 'There are {{ $value }} different semantic versions of Kubernetes components running%s.' % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Different semantic versions of Kubernetes components running.', }, }, @@ -39,7 +43,9 @@ severity: 'warning', }, annotations: { - description: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'", + description: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors%s." % [ + utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config), + ], summary: 'Kubernetes API server client is experiencing errors.', }, },