From a1e1805084014547fa469714ad4e6b61297d66e5 Mon Sep 17 00:00:00 2001
From: Igor Zolotarev
Date: Thu, 15 Aug 2024 17:39:01 +0300
Subject: [PATCH] Use membership instead of cache in disk_failure issues

---
 CHANGELOG.rst                          |  2 +
 cartridge/issues.lua                   | 45 +++++++---------
 test/integration/disk_failure_test.lua | 75 ++++++++++++--------------
 3 files changed, 55 insertions(+), 67 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index cfb5422b8..ea90add7c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -37,6 +37,8 @@ Changed
 
 - uuids in issues replaces with instance names and uris.
 
+- Use ``membership`` instead of cache in ``disk_failure`` issues
+
 -------------------------------------------------------------------------------
 [2.12.2] - 2024-06-24
 -------------------------------------------------------------------------------
diff --git a/cartridge/issues.lua b/cartridge/issues.lua
index 942333f77..2d85f96c1 100644
--- a/cartridge/issues.lua
+++ b/cartridge/issues.lua
@@ -530,17 +530,8 @@ local function list_on_instance(opts)
         end
     end
 
-    if type(box.cfg) == 'table' and not fio.lstat(box.cfg.memtx_dir) then
-        table.insert(ret, {
-            level = 'critical',
-            topic = 'disk_failure',
-            instance_uuid = instance_uuid,
-            replicaset_uuid = replicaset_uuid,
-            message = string.format(
-                'Disk error on instance %s. This issue stays until restart',
-                describe(self_uri)
-            ),
-        })
+    if type(box.cfg) == 'table' then
+        membership.set_payload('disk_failure', not fio.lstat(box.cfg.memtx_dir))
     end
 
     -- add custom issues from each role
@@ -564,7 +555,6 @@ local function list_on_instance(opts)
     return ret
 end
 
-local disk_failure_cache = {}
 local function list_on_cluster()
     local state, err = confapplier.get_state()
     if state == 'Unconfigured' and lua_api_proxy.can_call() then
@@ -699,8 +689,8 @@ local function list_on_cluster()
         end
     end
 
-    -- Check aliens in membership and unrecoverable instances
-    local unrecoverable_uuids = {}
+    -- Check aliens in membership, unrecoverable instances and disk_failures
+    local uuids_to_disable = {}
     for uri, member in membership.pairs() do
         local uuid = member.payload.uuid
         if member.status == 'alive'
@@ -731,7 +721,7 @@ local function list_on_cluster()
 
             ::uuid_found::
             if uuid ~= nil then -- still no uuid, skipping
-                table.insert(unrecoverable_uuids, uuid)
+                table.insert(uuids_to_disable, uuid)
                 table.insert(ret, {
                     level = 'warning',
                     topic = 'autodisable',
@@ -744,6 +734,19 @@ local function list_on_cluster()
                 })
             end
         end
+
+        if member.payload.disk_failure then
+            table.insert(ret, {
+                level = 'critical',
+                topic = 'disk_failure',
+                instance_uuid = uuid,
+                message = string.format(
+                    'Disk error on instance %s',
+                    describe(uri)
+                ),
+            })
+            table.insert(uuids_to_disable, uuid)
+        end
     end
 
     -- Get each instance issues (replication, failover, memory usage)
@@ -760,24 +763,12 @@ local function list_on_cluster()
         {uri_list = uri_list, timeout = 1}
     )
 
-    local uuids_to_disable = {}
     for _, issues in pairs(issues_map) do
         for _, issue in pairs(issues) do
             table.insert(ret, issue)
-            if issue.topic == 'disk_failure' then
-                table.insert(uuids_to_disable, issue.instance_uuid)
-                disk_failure_cache[issue.instance_uuid] = issue
-            end
         end
     end
 
-    for _, issue in pairs(disk_failure_cache) do
-        table.insert(ret, issue)
-    end
-
-    if vars.disable_unrecoverable then
-        uuids_to_disable = fun.chain(uuids_to_disable, unrecoverable_uuids):totable()
-    end
     if #uuids_to_disable > 0 then
         lua_api_topology.disable_servers(uuids_to_disable)
     end
diff --git a/test/integration/disk_failure_test.lua b/test/integration/disk_failure_test.lua
index 085975ac5..057201452 100644
--- a/test/integration/disk_failure_test.lua
+++ b/test/integration/disk_failure_test.lua
@@ -65,12 +65,10 @@ function g.test_disk_failure_disable()
         local expected_issues = {
             {
                 level = 'critical',
-                replicaset_uuid = sharded_storage_1.replicaset_uuid,
                 instance_uuid = sharded_storage_1.instance_uuid,
                 topic = 'disk_failure',
             }, {
                 level = 'critical',
-                replicaset_uuid = simple_storage_1.replicaset_uuid,
                 instance_uuid = simple_storage_1.instance_uuid,
                 topic = 'disk_failure',
             }
@@ -79,41 +77,44 @@ function g.test_disk_failure_disable()
         t.assert_covers(issues, expected_issues)
     end)
 
-    local resp = router:graphql({
-        query = [[
-            {
-                servers {
-                    uri
-                    disabled
+    t.helpers.retrying({}, function()
+        local resp = router:graphql({
+            query = [[
+                {
+                    servers {
+                        uri
+                        disabled
+                    }
                 }
-            }
-        ]]
-    })
+            ]]
+        })
 
-    table.sort(resp['data']['servers'], function(a, b) return a.uri < b.uri end)
+        table.sort(resp['data']['servers'], function(a, b) return a.uri < b.uri end)
+
+        t.assert_items_equals(resp['data']['servers'], {
+            {
+                uri = 'localhost:13301',
+                disabled = false,
+            },
+            {
+                uri = 'localhost:13302',
+                disabled = true,
+            },
+            {
+                uri = 'localhost:13303',
+                disabled = false,
+            },
+            {
+                uri = 'localhost:13304',
+                disabled = true,
+            },
+            {
+                uri = 'localhost:13305',
+                disabled = false,
+            },
+        })
+    end)
 
-    t.assert_items_equals(resp['data']['servers'], {
-        {
-            uri = 'localhost:13301',
-            disabled = false,
-        },
-        {
-            uri = 'localhost:13302',
-            disabled = true,
-        },
-        {
-            uri = 'localhost:13303',
-            disabled = false,
-        },
-        {
-            uri = 'localhost:13304',
-            disabled = true,
-        },
-        {
-            uri = 'localhost:13305',
-            disabled = false,
-        },
-    })
     -- first storage is disabled
     t.assert_not(sharded_storage_1:exec(function()
         return _G.vshard.storage.internal.is_enabled
@@ -138,12 +139,6 @@ function g.test_disk_failure_disable()
         }
     ]]):format(sharded_storage_1.instance_uuid, simple_storage_1.instance_uuid)})
 
-    -- restart router to remove issues
-    router:restart()
-    t.helpers.retrying({}, function()
-        t.assert_equals(helpers.list_cluster_issues(router), {})
-    end)
-
     -- vshard is enabled again
    t.assert(sharded_storage_1:exec(function()
        return _G.vshard.storage.internal.is_enabled
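
Illustrative sketch (not part of the patch): the fragment below condenses the two halves of the change into one place, assuming a Tarantool instance with the membership and fio modules available. The names publish_disk_state and collect_disk_failures are made up for illustration and stand in for the patched list_on_instance and list_on_cluster; cartridge's internal describe() helper is replaced here by the raw URI.

    local membership = require('membership')
    local fio = require('fio')

    -- Instance side: instead of caching a critical issue locally, publish a
    -- boolean flag in the membership payload. The flag is recomputed on every
    -- poll, so it clears itself once memtx_dir becomes accessible again.
    local function publish_disk_state()
        if type(box.cfg) == 'table' then
            membership.set_payload('disk_failure', not fio.lstat(box.cfg.memtx_dir))
        end
    end

    -- Cluster side: any node can read the flag from the membership payload,
    -- report a critical issue and collect the instance uuid for disabling.
    local function collect_disk_failures()
        local issues, uuids_to_disable = {}, {}
        for uri, member in membership.pairs() do
            if member.payload.disk_failure then
                table.insert(issues, {
                    level = 'critical',
                    topic = 'disk_failure',
                    instance_uuid = member.payload.uuid,
                    message = ('Disk error on instance %s'):format(uri),
                })
                table.insert(uuids_to_disable, member.payload.uuid)
            end
        end
        return issues, uuids_to_disable
    end

Because the flag is recomputed on every poll and spread through membership rather than stored in a per-router cache, the issue no longer "stays until restart": it disappears as soon as the instance reports a healthy disk again, which is why the test drops the router:restart() step.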