Update InfraNodesNeedResizingSRE

sam-nguyen7 · Feb 8, 2024 · 337af07 · 337af07
1 parent a795041
commit 337af07
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 20 deletions.
diff --git a/deploy/sre-prometheus/100-infra-resizing.PrometheusRule.yaml b/deploy/sre-prometheus/100-infra-resizing.PrometheusRule.yaml
@@ -91,7 +91,7 @@ spec:
       ## If either the CPU or the memory resource consumption alert (see below) fires, then trigger an alert for SRE
       - expr: (
                 count(
-                  ALERTS{alertname="cpu-InfraNodesExcessiveResourceConsumptionSRE", alertstate="firing"}
+                  ALERTS{alertname="cpu-InfraNodesExcessiveResourceConsumptionSRE1h", alertstate="firing"}
                   OR
                   ALERTS{alertname="memory-InfraNodesExcessiveResourceConsumptionSRE", alertstate="firing"}
                 ) >= 1
@@ -103,7 +103,7 @@ spec:
         expr: sre:node_infra:excessive_consumption_cpu > 0
         for: 1h
         labels:
-          severity: critical
+          severity: warning
           namespace: openshift-monitoring
         annotations:
           message: "The cluster's infrastructure nodes have been consuming excessive CPU for 1 hours and may need to be vertically scaled to support the existing workers. See linked SOP for details."
@@ -113,7 +113,7 @@ spec:
         expr: sre:node_infra:excessive_consumption_cpu > 0
         for: 16h
         labels:
-          severity: critical
+          severity: warning
           namespace: openshift-monitoring
         annotations:
           message: "The cluster's infrastructure nodes have been consuming excessive CPU for 16 hours and may need to be vertically scaled to support the existing workers. See linked SOP for details."
@@ -122,14 +122,14 @@ spec:
         expr: sre:node_infra:excessive_consumption_memory > 0
         for: 24h
         labels:
-          severity: critical
+          severity: warning
           namespace: openshift-monitoring
         annotations:
           message: "The cluster's infrastructure nodes have been consuming excessive memory for 24 hours and may need to be vertically scaled to support the existing workers. See linked SOP for details."
       ## If the CPU- or memory-related "InfraNodesExcessiveResourceConsumptionSRE" alerts are firing, raise a critical ticket to SRE to scale the infra nodes up
       - alert: InfraNodesNeedResizingSRE
         expr: sre:node_infras:need_resize > 0
-        for: 2h
+        for: 5m
         labels:
           severity: critical
           namespace: openshift-monitoring

diff --git a/hack/00-osd-managed-cluster-config-integration.yaml.tmpl b/hack/00-osd-managed-cluster-config-integration.yaml.tmpl
@@ -34946,7 +34946,7 @@ objects:
               ="infra"} ) - 1 ) / count ( cluster:nodes_roles{label_node_role_kubernetes_io
               ="infra"} ) ) )
             record: sre:node_infra:excessive_consumption_memory
-          - expr: ( count( ALERTS{alertname="cpu-InfraNodesExcessiveResourceConsumptionSRE",
+          - expr: ( count( ALERTS{alertname="cpu-InfraNodesExcessiveResourceConsumptionSRE1h",
               alertstate="firing"} OR ALERTS{alertname="memory-InfraNodesExcessiveResourceConsumptionSRE",
               alertstate="firing"} ) >= 1 )
             record: sre:node_infras:need_resize
@@ -34956,7 +34956,7 @@ objects:
             expr: sre:node_infra:excessive_consumption_cpu > 0
             for: 1h
             labels:
-              severity: critical
+              severity: warning
               namespace: openshift-monitoring
             annotations:
               message: The cluster's infrastructure nodes have been consuming excessive
@@ -34966,7 +34966,7 @@ objects:
             expr: sre:node_infra:excessive_consumption_cpu > 0
             for: 16h
             labels:
-              severity: critical
+              severity: warning
               namespace: openshift-monitoring
             annotations:
               message: The cluster's infrastructure nodes have been consuming excessive
@@ -34976,15 +34976,15 @@ objects:
             expr: sre:node_infra:excessive_consumption_memory > 0
             for: 24h
             labels:
-              severity: critical
+              severity: warning
               namespace: openshift-monitoring
             annotations:
               message: The cluster's infrastructure nodes have been consuming excessive
                 memory for 24 hours and may need to be vertically scaled to support
                 the existing workers. See linked SOP for details.
           - alert: InfraNodesNeedResizingSRE
             expr: sre:node_infras:need_resize > 0
-            for: 2h
+            for: 5m
             labels:
               severity: critical
               namespace: openshift-monitoring

diff --git a/hack/00-osd-managed-cluster-config-production.yaml.tmpl b/hack/00-osd-managed-cluster-config-production.yaml.tmpl
@@ -34946,7 +34946,7 @@ objects:
               ="infra"} ) - 1 ) / count ( cluster:nodes_roles{label_node_role_kubernetes_io
               ="infra"} ) ) )
             record: sre:node_infra:excessive_consumption_memory
-          - expr: ( count( ALERTS{alertname="cpu-InfraNodesExcessiveResourceConsumptionSRE",
+          - expr: ( count( ALERTS{alertname="cpu-InfraNodesExcessiveResourceConsumptionSRE1h",
               alertstate="firing"} OR ALERTS{alertname="memory-InfraNodesExcessiveResourceConsumptionSRE",
               alertstate="firing"} ) >= 1 )
             record: sre:node_infras:need_resize
@@ -34956,7 +34956,7 @@ objects:
             expr: sre:node_infra:excessive_consumption_cpu > 0
             for: 1h
             labels:
-              severity: critical
+              severity: warning
               namespace: openshift-monitoring
             annotations:
               message: The cluster's infrastructure nodes have been consuming excessive
@@ -34966,7 +34966,7 @@ objects:
             expr: sre:node_infra:excessive_consumption_cpu > 0
             for: 16h
             labels:
-              severity: critical
+              severity: warning
               namespace: openshift-monitoring
             annotations:
               message: The cluster's infrastructure nodes have been consuming excessive
@@ -34976,15 +34976,15 @@ objects:
             expr: sre:node_infra:excessive_consumption_memory > 0
             for: 24h
             labels:
-              severity: critical
+              severity: warning
               namespace: openshift-monitoring
             annotations:
               message: The cluster's infrastructure nodes have been consuming excessive
                 memory for 24 hours and may need to be vertically scaled to support
                 the existing workers. See linked SOP for details.
           - alert: InfraNodesNeedResizingSRE
             expr: sre:node_infras:need_resize > 0
-            for: 2h
+            for: 5m
             labels:
               severity: critical
               namespace: openshift-monitoring

diff --git a/hack/00-osd-managed-cluster-config-stage.yaml.tmpl b/hack/00-osd-managed-cluster-config-stage.yaml.tmpl
@@ -34946,7 +34946,7 @@ objects:
               ="infra"} ) - 1 ) / count ( cluster:nodes_roles{label_node_role_kubernetes_io
               ="infra"} ) ) )
             record: sre:node_infra:excessive_consumption_memory
-          - expr: ( count( ALERTS{alertname="cpu-InfraNodesExcessiveResourceConsumptionSRE",
+          - expr: ( count( ALERTS{alertname="cpu-InfraNodesExcessiveResourceConsumptionSRE1h",
               alertstate="firing"} OR ALERTS{alertname="memory-InfraNodesExcessiveResourceConsumptionSRE",
               alertstate="firing"} ) >= 1 )
             record: sre:node_infras:need_resize
@@ -34956,7 +34956,7 @@ objects:
             expr: sre:node_infra:excessive_consumption_cpu > 0
             for: 1h
             labels:
-              severity: critical
+              severity: warning
               namespace: openshift-monitoring
             annotations:
               message: The cluster's infrastructure nodes have been consuming excessive
@@ -34966,7 +34966,7 @@ objects:
             expr: sre:node_infra:excessive_consumption_cpu > 0
             for: 16h
             labels:
-              severity: critical
+              severity: warning
               namespace: openshift-monitoring
             annotations:
               message: The cluster's infrastructure nodes have been consuming excessive
@@ -34976,15 +34976,15 @@ objects:
             expr: sre:node_infra:excessive_consumption_memory > 0
             for: 24h
             labels:
-              severity: critical
+              severity: warning
               namespace: openshift-monitoring
             annotations:
               message: The cluster's infrastructure nodes have been consuming excessive
                 memory for 24 hours and may need to be vertically scaled to support
                 the existing workers. See linked SOP for details.
           - alert: InfraNodesNeedResizingSRE
             expr: sre:node_infras:need_resize > 0
-            for: 2h
+            for: 5m
             labels:
               severity: critical
               namespace: openshift-monitoring