From a1e1805084014547fa469714ad4e6b61297d66e5 Mon Sep 17 00:00:00 2001
From: Igor Zolotarev
Date: Thu, 15 Aug 2024 17:39:01 +0300
Subject: [PATCH] Use membership instead of cache in disk_failure issues

---
 CHANGELOG.rst                          |  2 +
 cartridge/issues.lua                   | 45 +++++++---------
 test/integration/disk_failure_test.lua | 75 ++++++++++++--------------
 3 files changed, 55 insertions(+), 67 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index cfb5422b8..ea90add7c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -37,6 +37,8 @@ Changed
 
 - uuids in issues replaces with instance names and uris.
 
+- Use ``membership`` instead of cache in ``disk_failure`` issues
+
 -------------------------------------------------------------------------------
 [2.12.2] - 2024-06-24
 -------------------------------------------------------------------------------
diff --git a/cartridge/issues.lua b/cartridge/issues.lua
index 942333f77..2d85f96c1 100644
--- a/cartridge/issues.lua
+++ b/cartridge/issues.lua
@@ -530,17 +530,8 @@ local function list_on_instance(opts)
         end
     end
 
-    if type(box.cfg) == 'table' and not fio.lstat(box.cfg.memtx_dir) then
-        table.insert(ret, {
-            level = 'critical',
-            topic = 'disk_failure',
-            instance_uuid = instance_uuid,
-            replicaset_uuid = replicaset_uuid,
-            message = string.format(
-                'Disk error on instance %s. This issue stays until restart',
-                describe(self_uri)
-            ),
-        })
+    if type(box.cfg) == 'table' then
+        membership.set_payload('disk_failure', not fio.lstat(box.cfg.memtx_dir))
     end
 
     -- add custom issues from each role
@@ -564,7 +555,6 @@ local function list_on_instance(opts)
     return ret
 end
 
-local disk_failure_cache = {}
 local function list_on_cluster()
     local state, err = confapplier.get_state()
     if state == 'Unconfigured' and lua_api_proxy.can_call() then
@@ -699,8 +689,8 @@ local function list_on_cluster()
         end
     end
 
-    -- Check aliens in membership and unrecoverable instances
-    local unrecoverable_uuids = {}
+    -- Check aliens in membership, unrecoverable instances and disk_failures
+    local uuids_to_disable = {}
     for uri, member in membership.pairs() do
         local uuid = member.payload.uuid
         if member.status == 'alive'
@@ -731,7 +721,7 @@ local function list_on_cluster()
 
             ::uuid_found::
             if uuid ~= nil then -- still no uuid, skipping
-                table.insert(unrecoverable_uuids, uuid)
+                table.insert(uuids_to_disable, uuid)
                 table.insert(ret, {
                     level = 'warning',
                     topic = 'autodisable',
@@ -744,6 +734,19 @@ local function list_on_cluster()
                 })
             end
         end
+
+        if member.payload.disk_failure then
+            table.insert(ret, {
+                level = 'critical',
+                topic = 'disk_failure',
+                instance_uuid = uuid,
+                message = string.format(
+                    'Disk error on instance %s',
+                    describe(uri)
+                ),
+            })
+            table.insert(uuids_to_disable, uuid)
+        end
     end
 
     -- Get each instance issues (replication, failover, memory usage)
@@ -760,24 +763,12 @@ local function list_on_cluster()
         {uri_list = uri_list, timeout = 1}
     )
 
-    local uuids_to_disable = {}
     for _, issues in pairs(issues_map) do
         for _, issue in pairs(issues) do
             table.insert(ret, issue)
-            if issue.topic == 'disk_failure' then
-                table.insert(uuids_to_disable, issue.instance_uuid)
-                disk_failure_cache[issue.instance_uuid] = issue
-            end
         end
     end
 
-    for _, issue in pairs(disk_failure_cache) do
-        table.insert(ret, issue)
-    end
-
-    if vars.disable_unrecoverable then
-        uuids_to_disable = fun.chain(uuids_to_disable, unrecoverable_uuids):totable()
-    end
     if #uuids_to_disable > 0 then
         lua_api_topology.disable_servers(uuids_to_disable)
     end
diff --git a/test/integration/disk_failure_test.lua b/test/integration/disk_failure_test.lua
index 085975ac5..057201452 100644
--- a/test/integration/disk_failure_test.lua
+++ b/test/integration/disk_failure_test.lua
@@ -65,12 +65,10 @@ function g.test_disk_failure_disable()
         local expected_issues = {
             {
                 level = 'critical',
-                replicaset_uuid = sharded_storage_1.replicaset_uuid,
                 instance_uuid = sharded_storage_1.instance_uuid,
                 topic = 'disk_failure',
             }, {
                 level = 'critical',
-                replicaset_uuid = simple_storage_1.replicaset_uuid,
                 instance_uuid = simple_storage_1.instance_uuid,
                 topic = 'disk_failure',
             }
@@ -79,41 +77,44 @@ function g.test_disk_failure_disable()
         t.assert_covers(issues, expected_issues)
     end)
 
-    local resp = router:graphql({
-        query = [[
-            {
-                servers {
-                    uri
-                    disabled
+    t.helpers.retrying({}, function()
+        local resp = router:graphql({
+            query = [[
+                {
+                    servers {
+                        uri
+                        disabled
+                    }
                 }
-            }
-        ]]
-    })
+            ]]
+        })
 
-    table.sort(resp['data']['servers'], function(a, b) return a.uri < b.uri end)
+        table.sort(resp['data']['servers'], function(a, b) return a.uri < b.uri end)
+
+        t.assert_items_equals(resp['data']['servers'], {
+            {
+                uri = 'localhost:13301',
+                disabled = false,
+            },
+            {
+                uri = 'localhost:13302',
+                disabled = true,
+            },
+            {
+                uri = 'localhost:13303',
+                disabled = false,
+            },
+            {
+                uri = 'localhost:13304',
+                disabled = true,
+            },
+            {
+                uri = 'localhost:13305',
+                disabled = false,
+            },
+        })
+    end)
 
-    t.assert_items_equals(resp['data']['servers'], {
-        {
-            uri = 'localhost:13301',
-            disabled = false,
-        },
-        {
-            uri = 'localhost:13302',
-            disabled = true,
-        },
-        {
-            uri = 'localhost:13303',
-            disabled = false,
-        },
-        {
-            uri = 'localhost:13304',
-            disabled = true,
-        },
-        {
-            uri = 'localhost:13305',
-            disabled = false,
-        },
-    })
     -- first storage is disabled
     t.assert_not(sharded_storage_1:exec(function()
         return _G.vshard.storage.internal.is_enabled
@@ -138,12 +139,6 @@ function g.test_disk_failure_disable()
         }
     ]]):format(sharded_storage_1.instance_uuid, simple_storage_1.instance_uuid)})
 
-    -- restart router to remove issues
-    router:restart()
-    t.helpers.retrying({}, function()
-        t.assert_equals(helpers.list_cluster_issues(router), {})
-    end)
-
     -- vshard is enabled again
    t.assert(sharded_storage_1:exec(function()
        return _G.vshard.storage.internal.is_enabled
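
Illustrative sketch (not part of the patch): the fragment below condenses the two halves of the change into one place, assuming a Tarantool instance with the membership and fio modules available. The names publish_disk_state and collect_disk_failures are made up for illustration and stand in for the patched list_on_instance and list_on_cluster; cartridge's internal describe() helper is replaced here by the raw URI.

    local membership = require('membership')
    local fio = require('fio')

    -- Instance side: instead of caching a critical issue locally, publish a
    -- boolean flag in the membership payload. The flag is recomputed on every
    -- poll, so it clears itself once memtx_dir becomes accessible again.
    local function publish_disk_state()
        if type(box.cfg) == 'table' then
            membership.set_payload('disk_failure', not fio.lstat(box.cfg.memtx_dir))
        end
    end

    -- Cluster side: any node can read the flag from the membership payload,
    -- report a critical issue and collect the instance uuid for disabling.
    local function collect_disk_failures()
        local issues, uuids_to_disable = {}, {}
        for uri, member in membership.pairs() do
            if member.payload.disk_failure then
                table.insert(issues, {
                    level = 'critical',
                    topic = 'disk_failure',
                    instance_uuid = member.payload.uuid,
                    message = ('Disk error on instance %s'):format(uri),
                })
                table.insert(uuids_to_disable, member.payload.uuid)
            end
        end
        return issues, uuids_to_disable
    end

Because the flag is recomputed on every poll and spread through membership rather than stored in a per-router cache, the issue no longer "stays until restart": it disappears as soon as the instance reports a healthy disk again, which is why the test drops the router:restart() step.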