From cc812bff31401df0f637119c16798da396c743f2 Mon Sep 17 00:00:00 2001 From: Florian Bergmann Date: Tue, 2 Jan 2024 13:21:21 +0100 Subject: [PATCH] Split inodes < 3% alert for workers and controlplanes. Worker nodes will be notified by ocm-agent instead of being actioned by SRE. --- .../10-managednotifications-cr.yaml | 6 +++ ...-filedescriptor-limits.PrometheusRule.yaml | 18 ++++++++ .../100-ocm-agent.PrometheusRule.yaml | 22 ++++++++++ ...naged-cluster-config-integration.yaml.tmpl | 44 +++++++++++++++++++ ...anaged-cluster-config-production.yaml.tmpl | 44 +++++++++++++++++++ ...osd-managed-cluster-config-stage.yaml.tmpl | 44 +++++++++++++++++++ 6 files changed, 178 insertions(+) diff --git a/deploy/ocm-agent-operator-managednotifications/10-managednotifications-cr.yaml b/deploy/ocm-agent-operator-managednotifications/10-managednotifications-cr.yaml index a690733763..4c1384c996 100644 --- a/deploy/ocm-agent-operator-managednotifications/10-managednotifications-cr.yaml +++ b/deploy/ocm-agent-operator-managednotifications/10-managednotifications-cr.yaml @@ -64,3 +64,9 @@ spec: resendWait: 1 severity: Error summary: Workload preventing machine deletion + - activeBody: |- + Your cluster requires you to take action. The available file system inodes for a worker node are currently at or below 3% and are predicted to be fully exhausted soon. Without action, this could impact the usability of this node. Please reduce the amount of inodes used on this mountpoint, either by adjusting application configuration or by moving some applications to other nodes. 
+ name: WorkerNodeFilesystemAlmostOutOfFiles + resendWait: 24 + severity: Error + summary: "Filesystem has less than 3% inodes left" diff --git a/deploy/sre-prometheus/100-node-filedescriptor-limits.PrometheusRule.yaml b/deploy/sre-prometheus/100-node-filedescriptor-limits.PrometheusRule.yaml index dbf6eb34ba..f1118ca71a 100644 --- a/deploy/sre-prometheus/100-node-filedescriptor-limits.PrometheusRule.yaml +++ b/deploy/sre-prometheus/100-node-filedescriptor-limits.PrometheusRule.yaml @@ -43,3 +43,21 @@ spec: annotations: message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. summary: Filesystem is predicted to run out of inodes within the next 4 hours. + - alert: ControlPlaneNodeFilesystemAlmostOutOfFiles + annotations: + message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 3% inodes left. 
+ # This is the same as the upstream alert, but groups by node type: worker node or non-worker node. The kube_node_role join keeps only nodes with an infra, control-plane or master role; group by (instance) reduces that side to one series per node for the group_left match. + # Original source: https://github.com/helm/charts/blob/master/stable/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.yaml#L133C1-L146C27 + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) * on (instance) group_left () group by (instance) ( + label_replace(kube_node_role{role=~"infra|control-plane|master"}, "instance", "$1", "node", "(.*)") + ) + for: 1h + labels: + severity: critical + namespace: openshift-monitoring diff --git a/deploy/sre-prometheus/ocm-agent/100-ocm-agent.PrometheusRule.yaml b/deploy/sre-prometheus/ocm-agent/100-ocm-agent.PrometheusRule.yaml index 13e5865fd0..dce117a705 100644 --- a/deploy/sre-prometheus/ocm-agent/100-ocm-agent.PrometheusRule.yaml +++ b/deploy/sre-prometheus/ocm-agent/100-ocm-agent.PrometheusRule.yaml @@ -112,3 +112,25 @@ spec: send_managed_notification: "true" annotations: message: "A non-openshift workload is preventing a node from draining." + - alert: WorkerNodeFilesystemAlmostOutOfFiles + annotations: + message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 3% inodes left. 
+ # This is the same as the upstream alert, but groups by node type: worker node or non-worker node. The unless clause matches on (instance) only, because its left-hand side still carries the per-filesystem labels while the right-hand side has just the instance label; this drops nodes that also hold an infra, control-plane or master role. + # Original source: https://github.com/helm/charts/blob/master/stable/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.yaml#L133C1-L146C27 + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) * on (instance) group_left () group by (instance) ( + label_replace(kube_node_role{role!~"infra|control-plane|master"}, "instance", "$1", "node", "(.*)") + ) unless on (instance) group by(instance) ( + label_replace(kube_node_role{role=~"infra|control-plane|master"}, "instance", "$1", "node", "(.*)") + ) + for: 1h + labels: + severity: critical + namespace: openshift-monitoring + managed_notification_template: WorkerNodeFilesystemAlmostOutOfFiles + send_managed_notification: "true" diff --git a/hack/00-osd-managed-cluster-config-integration.yaml.tmpl b/hack/00-osd-managed-cluster-config-integration.yaml.tmpl index f6a86366c7..08a52da50e 100644 --- a/hack/00-osd-managed-cluster-config-integration.yaml.tmpl +++ b/hack/00-osd-managed-cluster-config-integration.yaml.tmpl @@ -22113,6 +22113,16 @@ objects: resendWait: 1 severity: Error summary: Workload preventing machine deletion + - activeBody: Your cluster requires you to take action. The available file + system inodes for a worker node are currently at or below 3% and are predicted + to be fully exhausted soon. Without action, this could impact the usability + of this node. Please reduce the amount of inodes used on this mountpoint, + either by adjusting application configuration or by moving some applications + to other nodes. 
+ name: WorkerNodeFilesystemAlmostOutOfFiles + resendWait: 24 + severity: Error + summary: Filesystem has less than 3% inodes left - apiVersion: ocmagent.managed.openshift.io/v1alpha1 kind: ManagedNotification metadata: @@ -34350,6 +34360,21 @@ objects: filling up fast. summary: Filesystem is predicted to run out of inodes within the next 4 hours. + - alert: ControlPlaneNodeFilesystemAlmostOutOfFiles + annotations: + message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} + has only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 3% inodes left. + expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\ + \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\ + \ < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\ + \"} == 0\n) * on (instance) group_left () group by (instance) (\n label_replace(kube_node_role{role=~\"\ + infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\ + )\n)" + for: 1h + labels: + severity: critical + namespace: openshift-monitoring - apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -36048,6 +36073,25 @@ objects: send_managed_notification: 'true' annotations: message: A non-openshift workload is preventing a node from draining. + - alert: WorkerNodeFilesystemAlmostOutOfFiles + annotations: + message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} + has only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 3% inodes left. 
+ expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\ + \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\ + \ < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\ + \"} == 0\n) * on (instance) group_left () group by (instance) (\n label_replace(kube_node_role{role!~\"\ + infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\ + )\n) unless on (instance) group by(instance) (\n label_replace(kube_node_role{role=~\"\ + infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\ + )\n)" + for: 1h + labels: + severity: critical + namespace: openshift-monitoring + managed_notification_template: WorkerNodeFilesystemAlmostOutOfFiles + send_managed_notification: 'true' - apiVersion: hive.openshift.io/v1 kind: SelectorSyncSet metadata: diff --git a/hack/00-osd-managed-cluster-config-production.yaml.tmpl b/hack/00-osd-managed-cluster-config-production.yaml.tmpl index f6a86366c7..08a52da50e 100644 --- a/hack/00-osd-managed-cluster-config-production.yaml.tmpl +++ b/hack/00-osd-managed-cluster-config-production.yaml.tmpl @@ -22113,6 +22113,16 @@ objects: resendWait: 1 severity: Error summary: Workload preventing machine deletion + - activeBody: Your cluster requires you to take action. The available file + system inodes for a worker node are currently at or below 3% and are predicted + to be fully exhausted soon. Without action, this could impact the usability + of this node. Please reduce the amount of inodes used on this mountpoint, + either by adjusting application configuration or by moving some applications + to other nodes. + name: WorkerNodeFilesystemAlmostOutOfFiles + resendWait: 24 + severity: Error + summary: Filesystem has less than 3% inodes left - apiVersion: ocmagent.managed.openshift.io/v1alpha1 kind: ManagedNotification metadata: @@ -34350,6 +34360,21 @@ objects: filling up fast. summary: Filesystem is predicted to run out of inodes within the next 4 hours. 
+ - alert: ControlPlaneNodeFilesystemAlmostOutOfFiles + annotations: + message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} + has only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 3% inodes left. + expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\ + \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\ + \ < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\ + \"} == 0\n) * on (instance) group_left () group by (instance) (\n label_replace(kube_node_role{role=~\"\ + infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\ + )\n)" + for: 1h + labels: + severity: critical + namespace: openshift-monitoring - apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -36048,6 +36073,25 @@ objects: send_managed_notification: 'true' annotations: message: A non-openshift workload is preventing a node from draining. + - alert: WorkerNodeFilesystemAlmostOutOfFiles + annotations: + message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} + has only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 3% inodes left. 
+ expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\ + \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\ + \ < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\ + \"} == 0\n) * on (instance) group_left () group by (instance) (\n label_replace(kube_node_role{role!~\"\ + infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\ + )\n) unless on (instance) group by(instance) (\n label_replace(kube_node_role{role=~\"\ + infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\ + )\n)" + for: 1h + labels: + severity: critical + namespace: openshift-monitoring + managed_notification_template: WorkerNodeFilesystemAlmostOutOfFiles + send_managed_notification: 'true' - apiVersion: hive.openshift.io/v1 kind: SelectorSyncSet metadata: diff --git a/hack/00-osd-managed-cluster-config-stage.yaml.tmpl b/hack/00-osd-managed-cluster-config-stage.yaml.tmpl index f6a86366c7..08a52da50e 100644 --- a/hack/00-osd-managed-cluster-config-stage.yaml.tmpl +++ b/hack/00-osd-managed-cluster-config-stage.yaml.tmpl @@ -22113,6 +22113,16 @@ objects: resendWait: 1 severity: Error summary: Workload preventing machine deletion + - activeBody: Your cluster requires you to take action. The available file + system inodes for a worker node are currently at or below 3% and are predicted + to be fully exhausted soon. Without action, this could impact the usability + of this node. Please reduce the amount of inodes used on this mountpoint, + either by adjusting application configuration or by moving some applications + to other nodes. + name: WorkerNodeFilesystemAlmostOutOfFiles + resendWait: 24 + severity: Error + summary: Filesystem has less than 3% inodes left - apiVersion: ocmagent.managed.openshift.io/v1alpha1 kind: ManagedNotification metadata: @@ -34350,6 +34360,21 @@ objects: filling up fast. summary: Filesystem is predicted to run out of inodes within the next 4 hours. 
+ - alert: ControlPlaneNodeFilesystemAlmostOutOfFiles + annotations: + message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} + has only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 3% inodes left. + expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\ + \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\ + \ < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\ + \"} == 0\n) * on (instance) group_left () group by (instance) (\n label_replace(kube_node_role{role=~\"\ + infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\ + )\n)" + for: 1h + labels: + severity: critical + namespace: openshift-monitoring - apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -36048,6 +36073,25 @@ objects: send_managed_notification: 'true' annotations: message: A non-openshift workload is preventing a node from draining. + - alert: WorkerNodeFilesystemAlmostOutOfFiles + annotations: + message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} + has only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 3% inodes left. 
+ expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\ + \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\ + \ < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\ + \"} == 0\n) * on (instance) group_left () group by (instance) (\n label_replace(kube_node_role{role!~\"\ + infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\ + )\n) unless on (instance) group by(instance) (\n label_replace(kube_node_role{role=~\"\ + infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\ + )\n)" + for: 1h + labels: + severity: critical + namespace: openshift-monitoring + managed_notification_template: WorkerNodeFilesystemAlmostOutOfFiles + send_managed_notification: 'true' - apiVersion: hive.openshift.io/v1 kind: SelectorSyncSet metadata: