You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Currently failover fiber pings replicas, counts the number of failed requests. After closing #453 it'll also check the state of replication from master. But currently failover says All replicas are ok, when there was no exception during fiber step (and it's difficult to trigger one, everything is pcalled there). So, it just spams, that everything is ok after every change of the prioritized replica.
We should properly return the state of replicas from the failover_step. Here's the draft patch:
Details
diff --git a/vshard/router/init.lua b/vshard/router/init.lua
index e68550b..3a4474b 100644
--- a/vshard/router/init.lua+++ b/vshard/router/init.lua@@ -1268,6 +1268,7 @@ local function failover_ping(replica, opts)
end
local function failover_ping_round(router, curr_ts)
+ local all_replicas_ok = true
local opts = {timeout = router.failover_ping_timeout}
for _, replicaset in pairs(router.replicasets) do
local replica = replicaset.replica
@@ -1300,9 +1301,14 @@ local function failover_ping_round(router, curr_ts)
-- and closed.
replica:detach_conn()
replicaset:connect_replica(replica)
+ all_replicas_ok = false+ end+ if replica.health_status ~= consts.STATUS.GREEN then+ all_replicas_ok = false
end
end
end
+ return all_replicas_ok
end
@@ -1314,7 +1320,7 @@ end
--
local function failover_step(router)
local curr_ts = fiber_clock()
- failover_ping_round(router, curr_ts)+ local all_replicas_ok = failover_ping_round(router, curr_ts)
local id_to_update = failover_collect_to_update(router)
if #id_to_update == 0 then
return false
@@ -1347,7 +1353,7 @@ local function failover_step(router)
end
::continue::
end
- return replica_is_changed+ return replica_is_changed, all_replicas_ok
end
--
@@ -1384,19 +1390,19 @@ local function failover_service_f(router, service)
end
service:next_iter()
service:set_activity('updating replicas')
- local ok, replica_is_changed = pcall(failover_step, router)+ local ok, replica_is_changed, all_ok = pcall(failover_step, router)
if not ok then
log.error(service:set_status_error(
'Error during failovering: %s',
lerror.make(replica_is_changed)))
- replica_is_changed = true- elseif not prev_was_ok then+ all_ok = false+ elseif not prev_was_ok and all_ok then
log.info('All replicas are ok')
service:set_status_ok()
end
- prev_was_ok = not replica_is_changed+ prev_was_ok = all_ok
local logf
- if replica_is_changed then+ if replica_is_changed or not all_ok then
logf = log.info
else
-- In any case it is necessary to periodically log
The text was updated successfully, but these errors were encountered:
Currently failover fiber pings replicas, counts the number of failed requests. After closing #453 it'll also check the state of replication from master. But currently failover says
All replicas are ok
, when there was no exception during fiber step (and it's difficult to trigger one, everything is pcalled there). So, it just spams, that everything is ok after every change of the prioritized replica.We should properly return the state of replicas from the
failover_step
. Here's the draft patch:Details
The text was updated successfully, but these errors were encountered: