From 4c85bc9be200d428313bb825d12f8b6c8cfc0e41 Mon Sep 17 00:00:00 2001 From: Santosh Date: Thu, 2 Jan 2025 13:36:47 +0530 Subject: [PATCH] remove MDSCacheUsageHigh prometheus alert Customers are complaining about erroneous MDS cache usage alerts. The Ceph team suggested that `ceph_mds_mem_rss` might not be the right metric to capture this cache usage. So this alert needs to be looked at again. For the time being, we can just remove this alert due to the increasing number of customer cases around this. Signed-off-by: Santosh --- metrics/deploy/prometheus-ocs-rules.yaml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml index 85d1a27346..8656b5af2c 100644 --- a/metrics/deploy/prometheus-ocs-rules.yaml +++ b/metrics/deploy/prometheus-ocs-rules.yaml @@ -367,19 +367,6 @@ spec: severity: info - name: ceph-daemon-performance-alerts.rules rules: - - alert: MDSCacheUsageHigh - annotations: - description: MDS cache usage for the daemon {{ $labels.ceph_daemon }} has - exceeded above 95% of the requested value. Increase the memory request for - {{ $labels.ceph_daemon }} pod. - message: High MDS cache usage for the daemon {{ $labels.ceph_daemon }}. - runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/CephMdsCacheUsageHigh.md - severity_level: error - expr: | - (ceph_mds_mem_rss * 1000) / on(ceph_daemon) group_left(job)(label_replace(kube_pod_container_resource_requests{container="mds", resource="memory"}, "ceph_daemon", "mds.$1", "pod", "rook-ceph-mds-(.*)-(.*)") * .5) > .95 - for: 5m - labels: - severity: critical - alert: OSDCPULoadHigh annotations: description: CPU usage for osd on pod {{ $labels.pod }} has exceeded 80%.