From 4c85bc9be200d428313bb825d12f8b6c8cfc0e41 Mon Sep 17 00:00:00 2001
From: Santosh <sapillai@redhat.com>
Date: Thu, 2 Jan 2025 13:36:47 +0530
Subject: [PATCH] remove MDSCacheUsageHigh prometheus alert

Customers are complaining about erroneous MDS cache usage alerts.
Ceph team suggested that `ceph_mds_mem_rss` might not be the right
metric to capture this cache usage. So this alert needs to be looked at
again. For the time being, we can just remove this alert due to the
increasing number of customer cases around this.

Signed-off-by: Santosh <sapillai@redhat.com>
---
 metrics/deploy/prometheus-ocs-rules.yaml | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml
index 85d1a27346..8656b5af2c 100644
--- a/metrics/deploy/prometheus-ocs-rules.yaml
+++ b/metrics/deploy/prometheus-ocs-rules.yaml
@@ -367,19 +367,6 @@ spec:
         severity: info
   - name: ceph-daemon-performance-alerts.rules
     rules:
-    - alert: MDSCacheUsageHigh
-      annotations:
-        description: MDS cache usage for the daemon {{ $labels.ceph_daemon }} has
-          exceeded above 95% of the requested value. Increase the memory request for
-          {{ $labels.ceph_daemon }} pod.
-        message: High MDS cache usage for the daemon {{ $labels.ceph_daemon }}.
-        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/CephMdsCacheUsageHigh.md
-        severity_level: error
-      expr: |
-        (ceph_mds_mem_rss * 1000) / on(ceph_daemon) group_left(job)(label_replace(kube_pod_container_resource_requests{container="mds", resource="memory"}, "ceph_daemon", "mds.$1", "pod", "rook-ceph-mds-(.*)-(.*)") * .5) > .95
-      for: 5m
-      labels:
-        severity: critical
     - alert: OSDCPULoadHigh
       annotations:
         description: CPU usage for osd on pod {{ $labels.pod }} has exceeded 80%.