Merge pull request openshift#1972 from bergmannf/osd-20187-split-inodes-alert

[OSD-20187] Split inodes < 3% alert for workers and controlplanes.
sam-nguyen7 · Jan 2, 2024 · f4c5114 · f4c5114
2 parents ae9548f + cc812bf
commit f4c5114
Show file tree

Hide file tree

Showing 6 changed files with 178 additions and 0 deletions.
diff --git a/deploy/ocm-agent-operator-managednotifications/10-managednotifications-cr.yaml b/deploy/ocm-agent-operator-managednotifications/10-managednotifications-cr.yaml
@@ -64,3 +64,9 @@ spec:
       resendWait: 1
       severity: Error
       summary: Workload preventing machine deletion
+    - activeBody: |-
+        Your cluster requires you to take action. The available file system inodes for a worker node are currently at or below 3% and are predicted to be fully exhausted soon. Without action, this could impact the usability of this node. Please reduce the amount of inodes used on this mountpoint, either by adjusting application configuration or by moving some applications to other nodes.
+      name: WorkerNodeFilesystemAlmostOutOfFiles  # same value as the managed_notification_template label on the WorkerNodeFilesystemAlmostOutOfFiles alert rule
+      resendWait: 24  # NOTE(review): the unit is not visible in this hunk (presumably hours) - confirm
+      severity: Error
+      summary: "Filesystem has less than 3% inodes left"
diff --git a/deploy/sre-prometheus/100-node-filedescriptor-limits.PrometheusRule.yaml b/deploy/sre-prometheus/100-node-filedescriptor-limits.PrometheusRule.yaml
@@ -43,3 +43,21 @@ spec:
       annotations:
         message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
         summary: Filesystem is predicted to run out of inodes within the next 4 hours.
+    - alert: ControlPlaneNodeFilesystemAlmostOutOfFiles
+      annotations:
+        message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
+        summary: Filesystem has less than 3% inodes left.
+      # Same expression as the upstream alert, but joined against kube_node_role on the instance label so that it only fires for instances that have an infra, control-plane or master role.
+      # Original source: https://github.com/helm/charts/blob/master/stable/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.yaml#L133C1-L146C27
+      expr: |-
+        (
+          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        ) * on (instance) group_left () group by (instance) (
+          label_replace(kube_node_role{role=~"infra|control-plane|master"}, "instance", "$1", "node", "(.*)")
+        )
+      for: 1h
+      labels:
+        severity: critical
+        namespace: openshift-monitoring
diff --git a/deploy/sre-prometheus/ocm-agent/100-ocm-agent.PrometheusRule.yaml b/deploy/sre-prometheus/ocm-agent/100-ocm-agent.PrometheusRule.yaml
@@ -112,3 +112,25 @@ spec:
         send_managed_notification: "true"
       annotations:
         message: "A non-openshift workload is preventing a node from draining."
+    - alert: WorkerNodeFilesystemAlmostOutOfFiles
+      annotations:
+        message: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
+        summary: Filesystem has less than 3% inodes left.
+      # Same expression as the upstream alert, but restricted to worker nodes: keep instances that have a role other than infra/control-plane/master, then drop those that also have one of these roles. The `unless` must match `on (instance)` because its right-hand side only carries the instance label; with default all-label matching it would never exclude anything.
+      # Original source: https://github.com/helm/charts/blob/master/stable/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.yaml#L133C1-L146C27
+      expr: |-
+        (
+          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        ) * on (instance) group_left () group by (instance) (
+          label_replace(kube_node_role{role!~"infra|control-plane|master"}, "instance", "$1", "node", "(.*)")
+        ) unless on (instance) group by(instance) (
+          label_replace(kube_node_role{role=~"infra|control-plane|master"}, "instance", "$1", "node", "(.*)")
+        )
+      for: 1h
+      labels:
+        severity: critical
+        namespace: openshift-monitoring
+        managed_notification_template: WorkerNodeFilesystemAlmostOutOfFiles
+        send_managed_notification: "true"
diff --git a/hack/00-osd-managed-cluster-config-integration.yaml.tmpl b/hack/00-osd-managed-cluster-config-integration.yaml.tmpl
@@ -22113,6 +22113,16 @@ objects:
           resendWait: 1
           severity: Error
           summary: Workload preventing machine deletion
+        - activeBody: Your cluster requires you to take action. The available file
+            system inodes for a worker node are currently at or below 3% and are predicted
+            to be fully exhausted soon. Without action, this could impact the usability
+            of this node. Please reduce the amount of inodes used on this mountpoint,
+            either by adjusting application configuration or by moving some applications
+            to other nodes.
+          name: WorkerNodeFilesystemAlmostOutOfFiles  # mirrors the entry in deploy/ocm-agent-operator-managednotifications/10-managednotifications-cr.yaml
+          resendWait: 24
+          severity: Error
+          summary: Filesystem has less than 3% inodes left
     - apiVersion: ocmagent.managed.openshift.io/v1alpha1
       kind: ManagedNotification
       metadata:
@@ -34350,6 +34360,21 @@ objects:
                 filling up fast.
               summary: Filesystem is predicted to run out of inodes within the next
                 4 hours.
+          - alert: ControlPlaneNodeFilesystemAlmostOutOfFiles  # mirrors the rule in deploy/sre-prometheus/100-node-filedescriptor-limits.PrometheusRule.yaml
+            annotations:
+              message: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+                has only {{ printf "%.2f" $value }}% available inodes left.
+              summary: Filesystem has less than 3% inodes left.
+            expr: "(\n  node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\
+              \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\
+              \ < 3\nand\n  node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\
+              \"} == 0\n) * on (instance) group_left () group by (instance) (\n  label_replace(kube_node_role{role=~\"\
+              infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\
+              )\n)"
+            for: 1h
+            labels:
+              severity: critical
+              namespace: openshift-monitoring
     - apiVersion: monitoring.coreos.com/v1
       kind: PrometheusRule
       metadata:
@@ -36048,6 +36073,25 @@ objects:
               send_managed_notification: 'true'
             annotations:
               message: A non-openshift workload is preventing a node from draining.
+          - alert: WorkerNodeFilesystemAlmostOutOfFiles  # worker-only: the "unless on (instance)" in expr drops instances that also have an infra/control-plane/master role
+            annotations:
+              message: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+                has only {{ printf "%.2f" $value }}% available inodes left.
+              summary: Filesystem has less than 3% inodes left.
+            expr: "(\n  node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\
+              \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\
+              \ < 3\nand\n  node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\
+              \"} == 0\n) * on (instance) group_left () group by (instance) (\n  label_replace(kube_node_role{role!~\"\
+              infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\
+              )\n) unless on (instance) group by(instance) (\n  label_replace(kube_node_role{role=~\"\
+              infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\
+              )\n)"
+            for: 1h
+            labels:
+              severity: critical
+              namespace: openshift-monitoring
+              managed_notification_template: WorkerNodeFilesystemAlmostOutOfFiles
+              send_managed_notification: 'true'
 - apiVersion: hive.openshift.io/v1
   kind: SelectorSyncSet
   metadata:

diff --git a/hack/00-osd-managed-cluster-config-production.yaml.tmpl b/hack/00-osd-managed-cluster-config-production.yaml.tmpl
@@ -22113,6 +22113,16 @@ objects:
           resendWait: 1
           severity: Error
           summary: Workload preventing machine deletion
+        - activeBody: Your cluster requires you to take action. The available file
+            system inodes for a worker node are currently at or below 3% and are predicted
+            to be fully exhausted soon. Without action, this could impact the usability
+            of this node. Please reduce the amount of inodes used on this mountpoint,
+            either by adjusting application configuration or by moving some applications
+            to other nodes.
+          name: WorkerNodeFilesystemAlmostOutOfFiles  # mirrors the entry in deploy/ocm-agent-operator-managednotifications/10-managednotifications-cr.yaml
+          resendWait: 24
+          severity: Error
+          summary: Filesystem has less than 3% inodes left
     - apiVersion: ocmagent.managed.openshift.io/v1alpha1
       kind: ManagedNotification
       metadata:
@@ -34350,6 +34360,21 @@ objects:
                 filling up fast.
               summary: Filesystem is predicted to run out of inodes within the next
                 4 hours.
+          - alert: ControlPlaneNodeFilesystemAlmostOutOfFiles  # mirrors the rule in deploy/sre-prometheus/100-node-filedescriptor-limits.PrometheusRule.yaml
+            annotations:
+              message: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+                has only {{ printf "%.2f" $value }}% available inodes left.
+              summary: Filesystem has less than 3% inodes left.
+            expr: "(\n  node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\
+              \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\
+              \ < 3\nand\n  node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\
+              \"} == 0\n) * on (instance) group_left () group by (instance) (\n  label_replace(kube_node_role{role=~\"\
+              infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\
+              )\n)"
+            for: 1h
+            labels:
+              severity: critical
+              namespace: openshift-monitoring
     - apiVersion: monitoring.coreos.com/v1
       kind: PrometheusRule
       metadata:
@@ -36048,6 +36073,25 @@ objects:
               send_managed_notification: 'true'
             annotations:
               message: A non-openshift workload is preventing a node from draining.
+          - alert: WorkerNodeFilesystemAlmostOutOfFiles  # worker-only: the "unless on (instance)" in expr drops instances that also have an infra/control-plane/master role
+            annotations:
+              message: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+                has only {{ printf "%.2f" $value }}% available inodes left.
+              summary: Filesystem has less than 3% inodes left.
+            expr: "(\n  node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\
+              \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\
+              \ < 3\nand\n  node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\
+              \"} == 0\n) * on (instance) group_left () group by (instance) (\n  label_replace(kube_node_role{role!~\"\
+              infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\
+              )\n) unless on (instance) group by(instance) (\n  label_replace(kube_node_role{role=~\"\
+              infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\
+              )\n)"
+            for: 1h
+            labels:
+              severity: critical
+              namespace: openshift-monitoring
+              managed_notification_template: WorkerNodeFilesystemAlmostOutOfFiles
+              send_managed_notification: 'true'
 - apiVersion: hive.openshift.io/v1
   kind: SelectorSyncSet
   metadata:

diff --git a/hack/00-osd-managed-cluster-config-stage.yaml.tmpl b/hack/00-osd-managed-cluster-config-stage.yaml.tmpl
@@ -22113,6 +22113,16 @@ objects:
           resendWait: 1
           severity: Error
           summary: Workload preventing machine deletion
+        - activeBody: Your cluster requires you to take action. The available file
+            system inodes for a worker node are currently at or below 3% and are predicted
+            to be fully exhausted soon. Without action, this could impact the usability
+            of this node. Please reduce the amount of inodes used on this mountpoint,
+            either by adjusting application configuration or by moving some applications
+            to other nodes.
+          name: WorkerNodeFilesystemAlmostOutOfFiles  # mirrors the entry in deploy/ocm-agent-operator-managednotifications/10-managednotifications-cr.yaml
+          resendWait: 24
+          severity: Error
+          summary: Filesystem has less than 3% inodes left
     - apiVersion: ocmagent.managed.openshift.io/v1alpha1
       kind: ManagedNotification
       metadata:
@@ -34350,6 +34360,21 @@ objects:
                 filling up fast.
               summary: Filesystem is predicted to run out of inodes within the next
                 4 hours.
+          - alert: ControlPlaneNodeFilesystemAlmostOutOfFiles  # mirrors the rule in deploy/sre-prometheus/100-node-filedescriptor-limits.PrometheusRule.yaml
+            annotations:
+              message: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+                has only {{ printf "%.2f" $value }}% available inodes left.
+              summary: Filesystem has less than 3% inodes left.
+            expr: "(\n  node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\
+              \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\
+              \ < 3\nand\n  node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\
+              \"} == 0\n) * on (instance) group_left () group by (instance) (\n  label_replace(kube_node_role{role=~\"\
+              infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\
+              )\n)"
+            for: 1h
+            labels:
+              severity: critical
+              namespace: openshift-monitoring
     - apiVersion: monitoring.coreos.com/v1
       kind: PrometheusRule
       metadata:
@@ -36048,6 +36073,25 @@ objects:
               send_managed_notification: 'true'
             annotations:
               message: A non-openshift workload is preventing a node from draining.
+          - alert: WorkerNodeFilesystemAlmostOutOfFiles  # worker-only: the "unless on (instance)" in expr drops instances that also have an infra/control-plane/master role
+            annotations:
+              message: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+                has only {{ printf "%.2f" $value }}% available inodes left.
+              summary: Filesystem has less than 3% inodes left.
+            expr: "(\n  node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\
+              \"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100\
+              \ < 3\nand\n  node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\
+              \"} == 0\n) * on (instance) group_left () group by (instance) (\n  label_replace(kube_node_role{role!~\"\
+              infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\
+              )\n) unless on (instance) group by(instance) (\n  label_replace(kube_node_role{role=~\"\
+              infra|control-plane|master\"}, \"instance\", \"$1\", \"node\", \"(.*)\"\
+              )\n)"
+            for: 1h
+            labels:
+              severity: critical
+              namespace: openshift-monitoring
+              managed_notification_template: WorkerNodeFilesystemAlmostOutOfFiles
+              send_managed_notification: 'true'
 - apiVersion: hive.openshift.io/v1
   kind: SelectorSyncSet
   metadata: