From bf620650c429d0b6c0bae8588f54ea56a0299fd6 Mon Sep 17 00:00:00 2001
From: Nikolay Shirokovskiy
Date: Fri, 19 Jan 2024 12:33:08 +0300
Subject: [PATCH] box: finish client fibers on shutdown

During graceful shutdown it is convenient to finish all client
(non-system) fibers first. Otherwise every subsystem would have to be
ready to handle requests from client fibers during or after its own
shutdown, which would make the code more complex.

We first cancel client fibers and then wait for them to finish. A fiber
may ignore the cancel and hang, which makes the shutdown hang as well,
but this is the approach we already chose for iproto shutdown.

Note that as a result of this approach the application will panic if it
is shut down while the initialization script is still executing (in
particular, while the script is running box.cfg).

Application/test changes needed to adapt to client fiber shutdown:

- Make code cancellable (only enough to pass the existing tests; we did
  not investigate all the places that should be made cancellable).
- Make the console stop echoing to the client before client fibers are
  shut down. Otherwise, since the console server fiber is a client
  fiber, on shutdown we would report to the peer that the fiber is
  cancelled, which breaks a lot of existing tests. This approach is on
  par with iproto shutdown.
- Some tests (7743, replication-luatest/shutdown, replication/anon,
  replication/force_recovery, etc.) exercise shutdown during execution
  of the init script. A panic is now expected there, so change them
  accordingly.
- Some tests (8530, errinj_vylog) use an error injection that blocks
  client fibers from finishing. These tests do not need graceful
  shutdown, so just kill tarantool instead.
- Change the vinyl/errinj test for gh-3225. We do not really need to
  check shutdown while the vinyl reader is blocked, because the reader
  executes small tasks (we assume the read syscall will not hang). Also
  change the vinyl dump shutdown test to slow the dump down instead of
  blocking it entirely, so that the client fibers in the test can
  finish in time.
- Other similar changes.

Also drop the code in replication shutdown that was only needed to
handle client requests during/after shutdown.
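For reference, the cancel-then-wait loop added to src/lib/core/fiber.c
boils down to roughly the following (a simplified sketch of the new
fiber_shutdown(); see the fiber.c hunk below for the exact code):

  void
  fiber_shutdown(void)
  {
          struct fiber *f;
          /* Ask every client (non-system) fiber to finish. */
          rlist_foreach_entry(f, &cord()->alive, link) {
                  if (!(f->flags & FIBER_IS_SYSTEM))
                          fiber_cancel(f);
          }
          /*
           * Park the calling fiber; fiber_loop() wakes it up when the
           * last client fiber exits.
           */
          cord()->shutdown_fiber = fiber();
          while (cord()->client_fiber_count != 0)
                  fiber_yield();
          cord()->shutdown_fiber = NULL;
  }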
Part of #8423 NO_CHANGELOG=internal NO_DOC=internal --- src/box/box.cc | 12 ++ src/box/lua/console.lua | 8 + src/box/memtx_engine.cc | 11 +- src/box/replication.cc | 34 +---- src/box/vy_quota.c | 4 + src/box/vy_scheduler.c | 12 +- src/lib/core/errinj.h | 1 - src/lib/core/fiber.c | 48 +++++- src/lib/core/fiber.h | 12 ++ src/lib/core/fiber_pool.c | 14 +- src/lua/fiber.c | 12 ++ src/lua/fiber.lua | 9 ++ src/lua/init.lua | 1 + src/main.cc | 9 -- .../gh_7743_term_initial_cfg_snap_test.lua | 16 +- .../gh_8530_alter_space_snapshot_test.lua | 6 + test/box/errinj.result | 1 - test/replication-luatest/shutdown_test.lua | 26 +--- test/replication-py/cluster.test.py | 1 + test/replication/anon.result | 2 +- test/replication/anon.test.lua | 2 +- test/replication/force_recovery.result | 2 +- test/replication/force_recovery.test.lua | 2 +- ...637-misc-error-on-replica-auth-fail.result | 13 ++ ...7-misc-error-on-replica-auth-fail.test.lua | 5 + test/replication/gh-4739-vclock-assert.result | 2 +- .../gh-4739-vclock-assert.test.lua | 2 +- .../gh-5613-bootstrap-prefer-booted.result | 2 +- .../gh-5613-bootstrap-prefer-booted.test.lua | 2 +- test/replication/gh-5806-xlog-cleanup.result | 2 +- .../replication/gh-5806-xlog-cleanup.test.lua | 2 +- test/replication/prune.result | 2 +- test/replication/prune.test.lua | 2 +- test/replication/replica_auth.lua | 3 + test/replication/replica_rejoin.result | 2 +- test/replication/replica_rejoin.test.lua | 2 +- test/unit/fiber.cc | 144 +++++++++++++++++- test/unit/fiber.result | 6 + test/vinyl/errinj.result | 26 +--- test/vinyl/errinj.test.lua | 17 +-- test/vinyl/errinj_vylog.result | 4 +- test/vinyl/errinj_vylog.test.lua | 4 +- test/xlog/panic_on_wal_error.result | 2 +- test/xlog/panic_on_wal_error.test.lua | 2 +- 44 files changed, 357 insertions(+), 134 deletions(-) diff --git a/src/box/box.cc b/src/box/box.cc index 9767349ac8ea..936d58f545e4 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -4941,6 +4941,8 @@ bootstrap_from_master(struct replica *master) try { applier_resume_to_state(applier, APPLIER_READY, TIMEOUT_INFINITY); + } catch (FiberIsCancelled *e) { + throw e; } catch (...) { return false; } @@ -4958,6 +4960,8 @@ bootstrap_from_master(struct replica *master) try { applier_resume_to_state(applier, APPLIER_FETCH_SNAPSHOT, TIMEOUT_INFINITY); + } catch (FiberIsCancelled *e) { + throw e; } catch (...) { return false; } @@ -5926,6 +5930,14 @@ box_storage_shutdown() if (!is_storage_initialized) return; iproto_shutdown(); + /* + * Finish client fibers after iproto_shutdown otherwise new fibers + * can be started through new iproto requests. Also we should + * finish client fibers before other subsystems shutdown so that + * we won't need to handle requests from client fibers after/during + * subsystem shutdown. + */ + fiber_shutdown(); replication_shutdown(); } diff --git a/src/box/lua/console.lua b/src/box/lua/console.lua index 90111ccade2b..06e5a8503386 100644 --- a/src/box/lua/console.lua +++ b/src/box/lua/console.lua @@ -985,7 +985,15 @@ local function client_handler(client, _peer) state:print(string.format("%-63s\n%-63s\n", "Tarantool ".. version.." (Lua console)", "type 'help' for interactive help")) + local on_shutdown = function() + -- Fiber is going to be cancelled on shutdown. Do not report + -- cancel induced error to the peer. 
+ client:close(); + end + state.fiber = fiber.self() + box.ctl.on_shutdown(on_shutdown) repl(state) + box.ctl.on_shutdown(nil, on_shutdown) session_internal.run_on_disconnect() end diff --git a/src/box/memtx_engine.cc b/src/box/memtx_engine.cc index a0530aae6b43..0ace4a528367 100644 --- a/src/box/memtx_engine.cc +++ b/src/box/memtx_engine.cc @@ -1028,12 +1028,19 @@ checkpoint_f(va_list ap) return -1; } - struct mh_i32_t *temp_space_ids = mh_i32_new(); + struct mh_i32_t *temp_space_ids; say_info("saving snapshot `%s'", snap->filename); - ERROR_INJECT_SLEEP(ERRINJ_SNAP_WRITE_DELAY); + ERROR_INJECT_WHILE(ERRINJ_SNAP_WRITE_DELAY, { + fiber_sleep(0.001); + if (fiber_is_cancelled()) { + diag_set(FiberIsCancelled); + goto fail; + } + }); ERROR_INJECT(ERRINJ_SNAP_SKIP_ALL_ROWS, goto done); struct space_read_view *space_rv; + temp_space_ids = mh_i32_new(); read_view_foreach_space(space_rv, &ckpt->rv) { FiberGCChecker gc_check; bool skip = false; diff --git a/src/box/replication.cc b/src/box/replication.cc index 261b5e5cd303..83bc4b6cd4db 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -62,18 +62,6 @@ double replication_sync_timeout = 300.0; /* seconds */ bool replication_skip_conflict = false; int replication_threads = 1; -/** - * Fiber executing replicaset_connect. NULL if the function - * is not being executed. - */ -static struct fiber *replication_connect_fiber; - -/** Condition that replicaset_connect finished execution. */ -static struct fiber_cond replication_connect_cond; - -/** If set then replication shutdown is started. */ -static bool replication_is_shutting_down; - bool cfg_replication_anon = true; struct tt_uuid cfg_bootstrap_leader_uuid; struct uri cfg_bootstrap_leader_uri; @@ -231,7 +219,6 @@ replication_init(int num_threads) diag_create(&replicaset.applier.diag); replication_threads = num_threads; - fiber_cond_create(&replication_connect_cond); /* The local instance is always part of the quorum. */ replicaset.healthy_count = 1; @@ -242,12 +229,6 @@ replication_init(int num_threads) void replication_shutdown(void) { - replication_is_shutting_down = true; - if (replication_connect_fiber != NULL) - fiber_cancel(replication_connect_fiber); - while (replication_connect_fiber != NULL) - fiber_cond_wait(&replication_connect_cond); - struct replica *replica; rlist_foreach_entry(replica, &replicaset.anon, in_anon) applier_stop(replica->applier); @@ -263,7 +244,6 @@ replication_free(void) diag_destroy(&replicaset.applier.diag); trigger_destroy(&replicaset.on_ack); trigger_destroy(&replicaset.on_relay_thread_start); - fiber_cond_destroy(&replication_connect_cond); fiber_cond_destroy(&replicaset.applier.cond); latch_destroy(&replicaset.applier.order_latch); applier_free(); @@ -1072,9 +1052,6 @@ void replicaset_connect(const struct uri_set *uris, bool connect_quorum, bool keep_connect) { - if (replication_is_shutting_down) - tnt_raise(ClientError, ER_SHUTDOWN); - if (uris->uri_count == 0) { /* Cleanup the replica set. 
*/ replicaset_update(NULL, 0, false); @@ -1087,12 +1064,6 @@ replicaset_connect(const struct uri_set *uris, tnt_raise(ClientError, ER_CFG, "replication", "too many replicas"); } - assert(replication_connect_fiber == NULL); - replication_connect_fiber = fiber(); - auto connect_fiber_guard = make_scoped_guard([&]{ - replication_connect_fiber = NULL; - fiber_cond_signal(&replication_connect_cond); - }); int count = 0; struct applier *appliers[VCLOCK_MAX] = {}; auto appliers_guard = make_scoped_guard([&]{ @@ -1342,6 +1313,11 @@ replicaset_sync(void) say_info("replica set sync complete"); box_set_orphan(false); } + /* + * If fiber is cancelled raise error here so that orphan status is + * correct. + */ + fiber_testcancel(); } void diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c index 8f86be915c4a..ebbbde09603e 100644 --- a/src/box/vy_quota.c +++ b/src/box/vy_quota.c @@ -346,6 +346,10 @@ vy_quota_use(struct vy_quota *q, enum vy_quota_consumer_type type, diag_set(ClientError, ER_VY_QUOTA_TIMEOUT); return -1; } + if (fiber_is_cancelled()) { + diag_set(FiberIsCancelled); + return -1; + } double wait_time = ev_monotonic_now(loop()) - wait_start; if (wait_time > q->too_long_threshold) { diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c index daddff071cef..677bf5749135 100644 --- a/src/box/vy_scheduler.c +++ b/src/box/vy_scheduler.c @@ -730,13 +730,20 @@ vy_scheduler_wait_checkpoint(struct vy_scheduler *scheduler) /* A dump error occurred, abort checkpoint. */ struct error *e = diag_last_error(&scheduler->diag); diag_set_error(diag_get(), e); - say_error("vinyl checkpoint failed: %s", e->errmsg); - return -1; + goto error; } fiber_cond_wait(&scheduler->dump_cond); + if (fiber_is_cancelled()) { + diag_set(FiberIsCancelled); + goto error; + } } say_info("vinyl checkpoint completed"); return 0; +error: + say_error("vinyl checkpoint failed: %s", + diag_last_error(diag_get())->errmsg); + return -1; } void @@ -886,6 +893,7 @@ vy_deferred_delete_batch_process_f(struct cmsg *cmsg) struct vy_deferred_delete_batch *batch = container_of(cmsg, struct vy_deferred_delete_batch, cmsg); struct vy_task *task = batch->task; + fiber_set_system(fiber(), true); /* * Wait for memory quota if necessary before starting to * process the batch (we can't yield between statements). 
diff --git a/src/lib/core/errinj.h b/src/lib/core/errinj.h index 89d81a606dbc..12e5412fdde7 100644 --- a/src/lib/core/errinj.h +++ b/src/lib/core/errinj.h @@ -104,7 +104,6 @@ struct errinj { _(ERRINJ_IPROTO_TX_DELAY, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_IPROTO_WRITE_ERROR_DELAY, ERRINJ_BOOL, {.bparam = false})\ _(ERRINJ_LOG_ROTATE, ERRINJ_BOOL, {.bparam = false}) \ - _(ERRINJ_MAIN_MAKE_FILE_ON_RETURN, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_MEMTX_DELAY_GC, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_NETBOX_DISABLE_ID, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_NETBOX_FLIP_FEATURE, ERRINJ_INT, {.iparam = -1}) \ diff --git a/src/lib/core/fiber.c b/src/lib/core/fiber.c index 9ec5d2fa69ce..26bc4459e0a7 100644 --- a/src/lib/core/fiber.c +++ b/src/lib/core/fiber.c @@ -1187,6 +1187,13 @@ fiber_loop(MAYBE_UNUSED void *data) assert(f != fiber); fiber_wakeup(f); } + if (!(fiber->flags & FIBER_IS_SYSTEM)) { + assert(cord()->client_fiber_count > 0); + cord()->client_fiber_count--; + if (cord()->shutdown_fiber != NULL && + cord()->client_fiber_count == 0) + fiber_wakeup(cord()->shutdown_fiber); + } fiber_on_stop(fiber); /* reset pending wakeups */ rlist_del(&fiber->state); @@ -1590,6 +1597,8 @@ fiber_new_ex(const char *name, const struct fiber_attr *fiber_attr, fiber_gc_checker_init(fiber); cord->next_fid++; assert(cord->next_fid > FIBER_ID_MAX_RESERVED); + if (!(fiber->flags & FIBER_IS_SYSTEM)) + cord()->client_fiber_count++; return fiber; @@ -1849,7 +1858,7 @@ cord_create(struct cord *cord, const char *name) cord->sched.name = NULL; fiber_set_name(&cord->sched, "sched"); cord->fiber = &cord->sched; - cord->sched.flags = FIBER_IS_RUNNING; + cord->sched.flags = FIBER_IS_RUNNING | FIBER_IS_SYSTEM; cord->sched.max_slice = zero_slice; cord->max_slice = default_slice; @@ -1884,6 +1893,8 @@ cord_create(struct cord *cord, const char *name) cord->sched.stack_watermark = NULL; #endif signal_stack_init(); + cord->shutdown_fiber = NULL; + cord->client_fiber_count = 0; } void @@ -2339,3 +2350,38 @@ fiber_lua_state(struct fiber *f) { return f->storage.lua.stack; } + +void +fiber_set_system(struct fiber *f, bool yesno) +{ + if (yesno) { + if (!(f->flags & FIBER_IS_SYSTEM)) { + f->flags |= FIBER_IS_SYSTEM; + assert(cord()->client_fiber_count > 0); + cord()->client_fiber_count--; + if (cord()->shutdown_fiber != NULL && + cord()->client_fiber_count == 0) + fiber_wakeup(cord()->shutdown_fiber); + } + } else { + if (f->flags & FIBER_IS_SYSTEM) { + f->flags &= ~FIBER_IS_SYSTEM; + cord()->client_fiber_count++; + } + } +} + +void +fiber_shutdown(void) +{ + assert(cord()->shutdown_fiber == NULL); + struct fiber *fiber; + rlist_foreach_entry(fiber, &cord()->alive, link) { + if (!(fiber->flags & FIBER_IS_SYSTEM)) + fiber_cancel(fiber); + } + cord()->shutdown_fiber = fiber(); + while (cord()->client_fiber_count != 0) + fiber_yield(); + cord()->shutdown_fiber = NULL; +} diff --git a/src/lib/core/fiber.h b/src/lib/core/fiber.h index 2835220501f2..e58553bdd2f9 100644 --- a/src/lib/core/fiber.h +++ b/src/lib/core/fiber.h @@ -852,6 +852,10 @@ struct cord { struct fiber *main_fiber; /** An event triggered to cancel cord main fiber. */ ev_async cancel_event; + /** Number of alive client (non system) fibers. */ + int client_fiber_count; + /** Fiber calling fiber_shutdown. NULL if there is no such. */ + struct fiber *shutdown_fiber; }; extern __thread struct cord *cord_ptr; @@ -1243,6 +1247,14 @@ fiber_check_gc(void); struct lua_State * fiber_lua_state(struct fiber *f); +/** Change whether fiber is system or not. 
*/ +void +fiber_set_system(struct fiber *f, bool yesno); + +/** Cancel all client (non system) fibers and wait until they finished. */ +void +fiber_shutdown(void); + #if defined(__cplusplus) } /* extern "C" */ diff --git a/src/lib/core/fiber_pool.c b/src/lib/core/fiber_pool.c index cdf067a7f70c..bd5f47284a52 100644 --- a/src/lib/core/fiber_pool.c +++ b/src/lib/core/fiber_pool.c @@ -61,7 +61,9 @@ fiber_pool_f(va_list ap) f->caller->flags |= FIBER_IS_READY; assert(f->caller->caller == &cord->sched); } + fiber_set_system(fiber(), false); cmsg_deliver(msg); + fiber_set_system(fiber(), true); fiber_check_gc(); /* * Normally fibers die after their function @@ -131,7 +133,17 @@ fiber_pool_cb(ev_loop *loop, struct ev_watcher *watcher, int events) f = rlist_shift_entry(&pool->idle, struct fiber, state); fiber_call(f); } else if (pool->size < pool->max_size) { - f = fiber_new(cord_name(cord()), fiber_pool_f); + /* + * We don't want fibers to be cancellable by client + * while they are in the pool. However system flag is + * reset during processing message from pool endpoint + * so that fiber is made cancellable back. + * + * If some message processing should not be cancellable + * by client then it can just set system flag during + * it's execution. + */ + f = fiber_new_system(cord_name(cord()), fiber_pool_f); if (f == NULL) { diag_log(); break; diff --git a/src/lua/fiber.c b/src/lua/fiber.c index 8a8f41308489..bed60bd1ff1a 100644 --- a/src/lua/fiber.c +++ b/src/lua/fiber.c @@ -883,6 +883,17 @@ lbox_fiber_stall(struct lua_State *L) return 0; } +/** Make fiber system. Takes the fiber as a single argument. */ +static int +lbox_fiber_set_system(struct lua_State *L) +{ + if (lua_gettop(L) != 1) + luaL_error(L, "fiber.set_system(id): bad arguments"); + struct fiber *fiber = lbox_checkfiber(L, 1); + fiber_set_system(fiber, true); + return 0; +} + /** Helper for fiber slice parsing. */ static struct fiber_slice lbox_fiber_slice_parse(struct lua_State *L, int idx) @@ -1018,6 +1029,7 @@ static const struct luaL_Reg fiberlib[] = { {"extend_slice", lbox_fiber_extend_slice}, /* Internal functions, to hide in fiber.lua. */ {"stall", lbox_fiber_stall}, + {"set_system", lbox_fiber_set_system}, {NULL, NULL} }; diff --git a/src/lua/fiber.lua b/src/lua/fiber.lua index faaae31cfd55..48eda368add5 100644 --- a/src/lua/fiber.lua +++ b/src/lua/fiber.lua @@ -73,7 +73,9 @@ fiber.clock = fiber_clock fiber.clock64 = fiber_clock64 local stall = fiber.stall +local fiber_set_system = fiber.set_system fiber.stall = nil +fiber.set_system = nil local worker_next_task = nil local worker_last_task @@ -101,15 +103,21 @@ local function worker_f() end end +local worker_name = 'tasks_worker_fiber' + local function worker_safe_f() pcall(worker_f) -- Worker_f never returns. If the execution is here, this -- fiber is probably canceled and now is not able to sleep. -- Create a new one. worker_fiber = fiber.new(worker_safe_f) + fiber_set_system(worker_fiber) + worker_fiber:name(worker_name) end worker_fiber = fiber.new(worker_safe_f) +fiber_set_system(worker_fiber) +worker_fiber:name(worker_name) local function worker_schedule_task(f, arg) local task = {f = f, arg = arg} @@ -125,6 +133,7 @@ end -- Start from '_' to hide it from auto completion. 
fiber._internal = fiber._internal or {} fiber._internal.schedule_task = worker_schedule_task +fiber._internal.set_system = fiber_set_system setmetatable(fiber, {__serialize = function(self) local res = table.copy(self) diff --git a/src/lua/init.lua b/src/lua/init.lua index 352e7a8c3778..e61772a6958a 100644 --- a/src/lua/init.lua +++ b/src/lua/init.lua @@ -157,6 +157,7 @@ local function exit(code) -- os.exit() never yields. After on_shutdown -- fiber completes, we will never wake up again. local TIMEOUT_INFINITY = 500 * 365 * 86400 + fiber._internal.set_system(fiber.self()) while true do fiber.sleep(TIMEOUT_INFINITY) end end rawset(os, "exit", exit) diff --git a/src/main.cc b/src/main.cc index 42f3a68a2f77..2bdf6759327d 100644 --- a/src/main.cc +++ b/src/main.cc @@ -1109,14 +1109,5 @@ main(int argc, char **argv) free((void *)instance.name); free((void *)instance.config); tarantool_free(); - ERROR_INJECT(ERRINJ_MAIN_MAKE_FILE_ON_RETURN, do { - int fd = open("tt_exit_file.txt.inprogress", - O_WRONLY | O_CREAT | O_TRUNC, -1); - if (fd < 0) - break; - dprintf(fd, "ExitCode: %d\n", exit_code); - close(fd); - rename("tt_exit_file.txt.inprogress", "tt_exit_file.txt"); - } while (false)); return exit_code; } diff --git a/test/box-luatest/gh_7743_term_initial_cfg_snap_test.lua b/test/box-luatest/gh_7743_term_initial_cfg_snap_test.lua index daebfcd69991..c3e87490b01d 100644 --- a/test/box-luatest/gh_7743_term_initial_cfg_snap_test.lua +++ b/test/box-luatest/gh_7743_term_initial_cfg_snap_test.lua @@ -20,7 +20,6 @@ g.test_sigterm_during_initial_snapshot = function() -- uses usleep() which is a pthread cancellation point. TARANTOOL_RUN_BEFORE_BOX_CFG = [[ box.ctl.set_on_shutdown_timeout(1000) - box.error.injection.set('ERRINJ_MAIN_MAKE_FILE_ON_RETURN', true) box.error.injection.set('ERRINJ_SNAP_WRITE_DELAY', true) ]] } @@ -30,18 +29,7 @@ g.test_sigterm_during_initial_snapshot = function() t.helpers.retrying({}, function() assert(g.server:grep_log('saving snapshot', nil, {filename = logname})) end) - g.server.process:kill('TERM') - local path = fio.pathjoin(g.server.workdir, 'tt_exit_file.txt') - local exit_text - t.helpers.retrying({}, function() - local f = fio.open(path, 'O_RDONLY') - if f == nil then - error('could not open') - end - exit_text = f:read() - f:close() - end) - g.server.process = nil g.server:stop() - t.assert_str_contains(exit_text, 'ExitCode: 0\n') + local panic_msg = "failed to create a checkpoint" + t.assert(g.server:grep_log(panic_msg, nil, {filename = logname})) end diff --git a/test/box-luatest/gh_8530_alter_space_snapshot_test.lua b/test/box-luatest/gh_8530_alter_space_snapshot_test.lua index cddc401f03ce..1e9eb0004d1a 100644 --- a/test/box-luatest/gh_8530_alter_space_snapshot_test.lua +++ b/test/box-luatest/gh_8530_alter_space_snapshot_test.lua @@ -49,6 +49,9 @@ g.test_build_index = function(cg) box.snapshot() t.assert_equals(f:status(), 'suspended') end) + -- Use KILL because server will hang on shutdown due to injection. + -- We don't need graceful shutdown for the test anyway. + cg.server.process:kill('KILL') cg.server:restart() cg.server:exec(function() local s = box.space.test @@ -69,6 +72,9 @@ g.test_change_format = function(cg) box.snapshot() t.assert_equals(f:status(), 'suspended') end) + -- Use KILL because server will hang on shutdown due to injection. + -- We don't need graceful shutdown for the test anyway. 
+ cg.server.process:kill('KILL') cg.server:restart() cg.server:exec(function() local s = box.space.test diff --git a/test/box/errinj.result b/test/box/errinj.result index 8c4c8150f1a5..979031600d13 100644 --- a/test/box/errinj.result +++ b/test/box/errinj.result @@ -77,7 +77,6 @@ evals - ERRINJ_IPROTO_TX_DELAY: false - ERRINJ_IPROTO_WRITE_ERROR_DELAY: false - ERRINJ_LOG_ROTATE: false - - ERRINJ_MAIN_MAKE_FILE_ON_RETURN: false - ERRINJ_MEMTX_DELAY_GC: false - ERRINJ_NETBOX_DISABLE_ID: false - ERRINJ_NETBOX_FLIP_FEATURE: -1 diff --git a/test/replication-luatest/shutdown_test.lua b/test/replication-luatest/shutdown_test.lua index 0b1a0f4b78d0..b4901803ff97 100644 --- a/test/replication-luatest/shutdown_test.lua +++ b/test/replication-luatest/shutdown_test.lua @@ -17,20 +17,6 @@ g.after_each(function(cg) end end) -local test_no_crash_on_shutdown = function(server) - server.process:kill() - local path = fio.pathjoin(server.workdir, 'tt_exit_file.txt') - t.helpers.retrying({}, function() - t.assert(fio.path.exists(path)) - end) - local fh, err = fio.open(path, 'O_RDONLY') - assert(fh, err) - local str, err = fh:read() - assert(str, err) - fh:close() - t.assert_str_contains(str, 'ExitCode: 0\n') -end - g.test_shutdown_on_rebootstrap = function(cg) t.tarantool.skip_if_not_debug() -- It is critical for test that we can connect to uri but cannot auth. @@ -40,13 +26,7 @@ g.test_shutdown_on_rebootstrap = function(cg) replication = 'no:way@' .. cg.master.net_box_uri, replication_timeout = 100, } - local env = { - -- There will be no connection to replica in test. - TARANTOOL_RUN_BEFORE_BOX_CFG = [[ - box.error.injection.set('ERRINJ_MAIN_MAKE_FILE_ON_RETURN', true) - ]], - } - cg.replica = server:new({box_cfg = cfg, env = env}) + cg.replica = server:new({box_cfg = cfg}) -- Can't not wait because replica will not be bootstrapped. cg.replica:start({wait_until_ready = false}) local retry_msg = string.format('will retry every %.2f second', @@ -56,5 +36,7 @@ g.test_shutdown_on_rebootstrap = function(cg) t.helpers.retrying({}, function() t.assert(cg.replica:grep_log(retry_msg, nil, {filename = log})) end) - test_no_crash_on_shutdown(cg.replica) + cg.replica:stop() + local panic_msg = "can't initialize storage: fiber is cancelled" + t.assert(cg.replica:grep_log(panic_msg, nil, {filename = log})) end diff --git a/test/replication-py/cluster.test.py b/test/replication-py/cluster.test.py index 514c874a39b9..8249a79876ab 100644 --- a/test/replication-py/cluster.test.py +++ b/test/replication-py/cluster.test.py @@ -230,6 +230,7 @@ def check_join(msg): failed.name = "failed" failed.deploy(True, wait=False) +failed.crash_expected = True line = "ER_READONLY" if failed.logfile_pos.seek_wait(line): print("'{}' exists in server log".format(line)) diff --git a/test/replication/anon.result b/test/replication/anon.result index 68e629f61b10..997e5f0280b1 100644 --- a/test/replication/anon.result +++ b/test/replication/anon.result @@ -407,7 +407,7 @@ test_run:cmd([[create server replica with rpl_master=replica_anon1,\ | --- | - true | ... -test_run:cmd('start server replica with wait_load=False, wait=False') +test_run:cmd('start server replica with wait_load=False, wait=False, crash_expected=True') | --- | - true | ... 
diff --git a/test/replication/anon.test.lua b/test/replication/anon.test.lua index 97b2e7d67f9b..a2fc8b47df1a 100644 --- a/test/replication/anon.test.lua +++ b/test/replication/anon.test.lua @@ -146,7 +146,7 @@ test_run:cmd('delete server replica_anon2') -- Check that joining to an anonymous replica is prohibited. test_run:cmd([[create server replica with rpl_master=replica_anon1,\ script="replication/replica.lua"]]) -test_run:cmd('start server replica with wait_load=False, wait=False') +test_run:cmd('start server replica with wait_load=False, wait=False, crash_expected=True') test_run:wait_log('replica', 'ER_UNSUPPORTED: Anonymous replica does not support registration of non%-anonymous nodes.', nil, 10) test_run:cmd('stop server replica') test_run:cmd('delete server replica') diff --git a/test/replication/force_recovery.result b/test/replication/force_recovery.result index e142e829ab5c..c278a218a37a 100644 --- a/test/replication/force_recovery.result +++ b/test/replication/force_recovery.result @@ -63,7 +63,7 @@ fio.unlink(xlog) box.cfg{force_recovery = true} --- ... -test_run:cmd("start server test with wait=False") +test_run:cmd("start server test with wait=False, crash_expected=True") --- - true ... diff --git a/test/replication/force_recovery.test.lua b/test/replication/force_recovery.test.lua index bd3b439d2a96..e6f7ae716088 100644 --- a/test/replication/force_recovery.test.lua +++ b/test/replication/force_recovery.test.lua @@ -27,7 +27,7 @@ fio.unlink(xlog) -- Check that even though box.cfg.force_recovery is set, -- replication will still fail due to LSN gap. box.cfg{force_recovery = true} -test_run:cmd("start server test with wait=False") +test_run:cmd("start server test with wait=False, crash_expected=True") test_run:cmd("switch test") test_run:wait_upstream(1, {message_re = 'Missing %.xlog file', status = 'loading'}) box.space.test:select() diff --git a/test/replication/gh-3637-misc-error-on-replica-auth-fail.result b/test/replication/gh-3637-misc-error-on-replica-auth-fail.result index 98880d8e4088..9008f88c1182 100644 --- a/test/replication/gh-3637-misc-error-on-replica-auth-fail.result +++ b/test/replication/gh-3637-misc-error-on-replica-auth-fail.result @@ -49,6 +49,19 @@ vclock[0] = nil _ = test_run:wait_vclock('replica_auth', vclock) --- ... +-- Wait server init script finish or server will panic on stop. +test_run:switch('replica_auth') +--- +- true +... +test_run:wait_cond(function() return _G.startup_finished == true end) +--- +- true +... +test_run:switch('default') +--- +- true +... test_run:cmd("stop server replica_auth") --- - true diff --git a/test/replication/gh-3637-misc-error-on-replica-auth-fail.test.lua b/test/replication/gh-3637-misc-error-on-replica-auth-fail.test.lua index c51a2f628977..6028796d74d9 100644 --- a/test/replication/gh-3637-misc-error-on-replica-auth-fail.test.lua +++ b/test/replication/gh-3637-misc-error-on-replica-auth-fail.test.lua @@ -24,6 +24,11 @@ vclock = test_run:get_vclock('default') vclock[0] = nil _ = test_run:wait_vclock('replica_auth', vclock) +-- Wait server init script finish or server will panic on stop. 
+test_run:switch('replica_auth') +test_run:wait_cond(function() return _G.startup_finished == true end) + +test_run:switch('default') test_run:cmd("stop server replica_auth") test_run:cmd("cleanup server replica_auth") test_run:cmd("delete server replica_auth") diff --git a/test/replication/gh-4739-vclock-assert.result b/test/replication/gh-4739-vclock-assert.result index 83896c4e16e9..21247e42b8a2 100644 --- a/test/replication/gh-4739-vclock-assert.result +++ b/test/replication/gh-4739-vclock-assert.result @@ -56,7 +56,7 @@ end, 10) -- Restart the remote instance. This will make the first instance -- resubscribe without entering orphan mode. -test_run:cmd('restart server rebootstrap2 with wait=False') +test_run:cmd('restart server rebootstrap2 with wait=False, crash_expected=True') | --- | - true | ... diff --git a/test/replication/gh-4739-vclock-assert.test.lua b/test/replication/gh-4739-vclock-assert.test.lua index 5755ad75285a..781b7bc04115 100644 --- a/test/replication/gh-4739-vclock-assert.test.lua +++ b/test/replication/gh-4739-vclock-assert.test.lua @@ -24,7 +24,7 @@ end, 10) -- Restart the remote instance. This will make the first instance -- resubscribe without entering orphan mode. -test_run:cmd('restart server rebootstrap2 with wait=False') +test_run:cmd('restart server rebootstrap2 with wait=False, crash_expected=True') test_run:cmd('switch rebootstrap1') -- Wait until resubscribe is sent test_run:wait_cond(function()\ diff --git a/test/replication/gh-5613-bootstrap-prefer-booted.result b/test/replication/gh-5613-bootstrap-prefer-booted.result index d31b66c191ff..077b2992c389 100644 --- a/test/replication/gh-5613-bootstrap-prefer-booted.result +++ b/test/replication/gh-5613-bootstrap-prefer-booted.result @@ -43,7 +43,7 @@ test_run:cmd('create server replica2 with script="replication/gh-5613-replica2.l | --- | - true | ... -test_run:cmd('start server replica2 with wait=False') +test_run:cmd('start server replica2 with wait=False, crash_expected=True') | --- | - true | ... diff --git a/test/replication/gh-5613-bootstrap-prefer-booted.test.lua b/test/replication/gh-5613-bootstrap-prefer-booted.test.lua index 6d4fcd14261c..9300d4e3d47a 100644 --- a/test/replication/gh-5613-bootstrap-prefer-booted.test.lua +++ b/test/replication/gh-5613-bootstrap-prefer-booted.test.lua @@ -17,7 +17,7 @@ box.cfg{read_only = true} test_run:switch('default') test_run:cmd('create server replica2 with script="replication/gh-5613-replica2.lua"') -test_run:cmd('start server replica2 with wait=False') +test_run:cmd('start server replica2 with wait=False, crash_expected=True') opts = {filename = 'gh-5613-replica2.log'} assert(test_run:wait_log(nil, 'ER_READONLY', nil, nil, opts) ~= nil) diff --git a/test/replication/gh-5806-xlog-cleanup.result b/test/replication/gh-5806-xlog-cleanup.result index aa709f8c8b64..21d6d18b6f9b 100644 --- a/test/replication/gh-5806-xlog-cleanup.result +++ b/test/replication/gh-5806-xlog-cleanup.result @@ -153,7 +153,7 @@ assert(not box.info.gc().is_paused) -- -- Start replica and wait for error. -test_run:cmd('start server replica with wait=False, wait_load=False') +test_run:cmd('start server replica with wait=False, wait_load=False, crash_expected=True') | --- | - true | ... 
diff --git a/test/replication/gh-5806-xlog-cleanup.test.lua b/test/replication/gh-5806-xlog-cleanup.test.lua index 3c4abe5ee4ea..310ab6b641ab 100644 --- a/test/replication/gh-5806-xlog-cleanup.test.lua +++ b/test/replication/gh-5806-xlog-cleanup.test.lua @@ -78,7 +78,7 @@ assert(not box.info.gc().is_paused) -- -- Start replica and wait for error. -test_run:cmd('start server replica with wait=False, wait_load=False') +test_run:cmd('start server replica with wait=False, wait_load=False, crash_expected=True') -- -- Wait error to appear, 60 seconds should be more than enough, diff --git a/test/replication/prune.result b/test/replication/prune.result index e25e9684e298..b2040cc198d0 100644 --- a/test/replication/prune.result +++ b/test/replication/prune.result @@ -137,7 +137,7 @@ test_run:cmd('stop server replica1') --- - true ... -test_run:cmd('start server replica1 with args="true", wait=False') +test_run:cmd('start server replica1 with args="true", wait=False, crash_expected=True') --- - true ... diff --git a/test/replication/prune.test.lua b/test/replication/prune.test.lua index 68300b270c34..fd24b707739d 100644 --- a/test/replication/prune.test.lua +++ b/test/replication/prune.test.lua @@ -66,7 +66,7 @@ test_run:cmd('eval replica1 "box.info.replication[1].upstream.message"') -- restart replica and check that replica isn't able to join to cluster test_run:cmd('stop server replica1') -test_run:cmd('start server replica1 with args="true", wait=False') +test_run:cmd('start server replica1 with args="true", wait=False, crash_expected=True') test_run:cmd('switch replica1') test_run:wait_upstream(1, {message_re = "Can't subscribe non%-anonymous replica"}) test_run:cmd('switch default') diff --git a/test/replication/replica_auth.lua b/test/replication/replica_auth.lua index 61d046fc47f0..72898c618651 100644 --- a/test/replication/replica_auth.lua +++ b/test/replication/replica_auth.lua @@ -4,9 +4,12 @@ local USER_PASS = arg[1] local TIMEOUT = arg[2] and tonumber(arg[2]) or 0.1 require('console').listen(os.getenv('ADMIN')) +_G.startup_finished = false box.cfg({ listen = os.getenv("LISTEN"), replication = USER_PASS .. "@" .. os.getenv("MASTER"), replication_timeout = TIMEOUT, }) + +_G.startup_finished = true diff --git a/test/replication/replica_rejoin.result b/test/replication/replica_rejoin.result index e489c150a6ed..0cccc7f0b459 100644 --- a/test/replication/replica_rejoin.result +++ b/test/replication/replica_rejoin.result @@ -238,7 +238,7 @@ test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*. box.cfg{checkpoint_count = checkpoint_count} --- ... -test_run:cmd("start server replica with wait=False") +test_run:cmd("start server replica with wait=False, crash_expected=True") --- - true ... 
diff --git a/test/replication/replica_rejoin.test.lua b/test/replication/replica_rejoin.test.lua index 2563177cf55d..f9d1b45f6316 100644 --- a/test/replication/replica_rejoin.test.lua +++ b/test/replication/replica_rejoin.test.lua @@ -90,7 +90,7 @@ for i = 1, 3 do box.space.test:insert{i * 100} end fio = require('fio') test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog') box.cfg{checkpoint_count = checkpoint_count} -test_run:cmd("start server replica with wait=False") +test_run:cmd("start server replica with wait=False, crash_expected=True") test_run:cmd("switch replica") test_run:wait_upstream(1, {message_re = 'Missing %.xlog file', status = 'loading'}) box.space.test:select() diff --git a/test/unit/fiber.cc b/test/unit/fiber.cc index 2ccd5a8ca75f..7caab3a8e273 100644 --- a/test/unit/fiber.cc +++ b/test/unit/fiber.cc @@ -581,6 +581,145 @@ fiber_test_leak_modes() say_logger_free(); } +static void +fiber_test_client_fiber_count(void) +{ + header(); + + int count = cord()->client_fiber_count; + + struct fiber *fiber1 = fiber_new("fiber1", wait_cancel_f); + fail_unless(fiber1 != NULL); + fail_unless(++count == cord()->client_fiber_count); + + struct fiber *fiber2 = fiber_new("fiber2", wait_cancel_f); + fail_unless(fiber2 != NULL); + fail_unless(++count == cord()->client_fiber_count); + + struct fiber *fiber3 = fiber_new_system("fiber3", wait_cancel_f); + fail_unless(fiber3 != NULL); + fail_unless(count == cord()->client_fiber_count); + + struct fiber *fiber4 = fiber_new_system("fiber4", wait_cancel_f); + fail_unless(fiber4 != NULL); + fail_unless(count == cord()->client_fiber_count); + + fiber_set_joinable(fiber1, true); + fiber_cancel(fiber1); + fiber_join(fiber1); + fail_unless(--count == cord()->client_fiber_count); + + fiber_set_joinable(fiber4, true); + fiber_cancel(fiber4); + fiber_join(fiber4); + fail_unless(count == cord()->client_fiber_count); + + fiber_set_joinable(fiber2, true); + fiber_cancel(fiber2); + fiber_join(fiber2); + fail_unless(--count == cord()->client_fiber_count); + + fiber_set_joinable(fiber3, true); + fiber_cancel(fiber3); + fiber_join(fiber3); + fail_unless(count == cord()->client_fiber_count); + + footer(); +} + +static void +fiber_test_set_system(void) +{ + header(); + + struct fiber *fiber1 = fiber_new("fiber1", wait_cancel_f); + fail_unless(fiber1 != NULL); + int count = cord()->client_fiber_count; + + fiber_set_system(fiber1, true); + fail_unless(--count == cord()->client_fiber_count); + fail_unless((fiber1->flags & FIBER_IS_SYSTEM) != 0); + + fiber_set_system(fiber1, true); + fail_unless(count == cord()->client_fiber_count); + fail_unless((fiber1->flags & FIBER_IS_SYSTEM) != 0); + + fiber_set_system(fiber1, false); + fail_unless(++count == cord()->client_fiber_count); + fail_unless((fiber1->flags & FIBER_IS_SYSTEM) == 0); + + fiber_set_system(fiber1, false); + fail_unless(count == cord()->client_fiber_count); + fail_unless((fiber1->flags & FIBER_IS_SYSTEM) == 0); + + struct fiber *fiber2 = fiber_new_system("fiber2", wait_cancel_f); + fail_unless(fiber2 != NULL); + count = cord()->client_fiber_count; + + fiber_set_system(fiber2, false); + fail_unless(++count == cord()->client_fiber_count); + fail_unless((fiber2->flags & FIBER_IS_SYSTEM) == 0); + + fiber_set_system(fiber2, false); + fail_unless(count == cord()->client_fiber_count); + fail_unless((fiber2->flags & FIBER_IS_SYSTEM) == 0); + + fiber_set_system(fiber2, true); + fail_unless(--count == cord()->client_fiber_count); + 
fail_unless((fiber2->flags & FIBER_IS_SYSTEM) != 0); + + fiber_set_system(fiber2, true); + fail_unless(count == cord()->client_fiber_count); + fail_unless((fiber2->flags & FIBER_IS_SYSTEM) != 0); + + fiber_set_joinable(fiber1, true); + fiber_cancel(fiber1); + fiber_join(fiber1); + fiber_set_joinable(fiber2, true); + fiber_cancel(fiber2); + fiber_join(fiber2); + + footer(); +} + +static int +hang_on_cancel_f(va_list ap) +{ + while (!fiber_is_cancelled()) + fiber_yield(); + fiber_set_system(fiber(), true); + while (true) + fiber_yield(); + return 0; +} + +static void +fiber_test_shutdown(void) +{ + header(); + + struct fiber *fiber1 = fiber_new("fiber1", wait_cancel_f); + fail_unless(fiber1 != NULL); + fiber_set_joinable(fiber1, true); + struct fiber *fiber2 = fiber_new_system("fiber2", wait_cancel_f); + fail_unless(fiber2 != NULL); + struct fiber *fiber3 = fiber_new("fiber3", hang_on_cancel_f); + fail_unless(fiber3 != NULL); + + fiber_shutdown(); + fail_unless((fiber1->flags & FIBER_IS_DEAD) != 0); + fail_unless((fiber2->flags & FIBER_IS_DEAD) == 0); + fail_unless((fiber3->flags & FIBER_IS_DEAD) == 0); + + fiber_join(fiber1); + + fiber_set_joinable(fiber2, true); + fiber_cancel(fiber2); + fiber_join(fiber2); + + footer(); +} + static int main_f(va_list ap) { @@ -597,6 +736,9 @@ main_f(va_list ap) cord_cancel_and_join_test(); fiber_test_defaults(); fiber_test_leak_modes(); + fiber_test_client_fiber_count(); + fiber_test_set_system(); + fiber_test_shutdown(); ev_break(loop(), EVBREAK_ALL); return 0; } @@ -611,7 +753,7 @@ int main() memory_init(); fiber_init(fiber_cxx_invoke); fiber_attr_create(&default_attr); - struct fiber *main = fiber_new_xc("main", main_f); + struct fiber *main = fiber_new_system_xc("main", main_f); fiber_wakeup(main); ev_run(loop(), 0); fiber_free(); diff --git a/test/unit/fiber.result b/test/unit/fiber.result index 0066138356d2..2b5469cfba87 100644 --- a/test/unit/fiber.result +++ b/test/unit/fiber.result @@ -40,3 +40,9 @@ OutOfMemory: Failed to allocate 42 bytes in allocator for exception *** fiber_test_leak: done *** *** fiber_test_leak *** *** fiber_test_leak: done *** + *** fiber_test_client_fiber_count *** + *** fiber_test_client_fiber_count: done *** + *** fiber_test_set_system *** + *** fiber_test_set_system: done *** + *** fiber_test_shutdown *** + *** fiber_test_shutdown: done *** diff --git a/test/vinyl/errinj.result b/test/vinyl/errinj.result index 18d10b077bd0..e30c34ab11c1 100644 --- a/test/vinyl/errinj.result +++ b/test/vinyl/errinj.result @@ -1145,8 +1145,8 @@ s:drop() --- ... -- --- Check that tarantool stops immediately even if a vinyl worker --- thread is blocked (see gh-3225). +-- Check that tarantool stops immediately if large snapshot write +-- is in progress. -- s = box.schema.space.create('test', {engine = 'vinyl'}) --- ... _ = s:create_index('pk') --- ... -s:replace{1, 1} --- -- [1, 1] ... -box.snapshot() --- -- ok ... -errinj.set('ERRINJ_VY_READ_PAGE_TIMEOUT', 9000) --- -- ok ... -_ = fiber.create(function() s:get(1) end) +for i = 1, 10000 do s:replace({i}) end --- ... -s:replace{1, 2} --- -- [1, 2] ... -errinj.set('ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT', 9000) +errinj.set('ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT', 0.01) --- - ok ... _ = fiber.create(function() box.snapshot() end) --- ... -test_run:cmd("restart server default") +test_run:cmd("restart server default") -- don't stuck box.space.test:drop() --- ...
diff --git a/test/vinyl/errinj.test.lua b/test/vinyl/errinj.test.lua index d698b4408481..0a7beac68273 100644 --- a/test/vinyl/errinj.test.lua +++ b/test/vinyl/errinj.test.lua @@ -414,23 +414,16 @@ box.schema.user.revoke('guest', 'replication') s:drop() -- --- Check that tarantool stops immediately even if a vinyl worker --- thread is blocked (see gh-3225). +-- Check that tarantool stops immediately if large snapshot write +-- is in progress. -- s = box.schema.space.create('test', {engine = 'vinyl'}) _ = s:create_index('pk') -s:replace{1, 1} -box.snapshot() - -errinj.set('ERRINJ_VY_READ_PAGE_TIMEOUT', 9000) -_ = fiber.create(function() s:get(1) end) - -s:replace{1, 2} - -errinj.set('ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT', 9000) +for i = 1, 10000 do s:replace({i}) end +errinj.set('ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT', 0.01) _ = fiber.create(function() box.snapshot() end) -test_run:cmd("restart server default") +test_run:cmd("restart server default") -- don't stuck box.space.test:drop() -- diff --git a/test/vinyl/errinj_vylog.result b/test/vinyl/errinj_vylog.result index b9ae9332e9b6..6ac76b2c816c 100644 --- a/test/vinyl/errinj_vylog.result +++ b/test/vinyl/errinj_vylog.result @@ -399,7 +399,9 @@ fiber.sleep(0.01) --- ... -- Should ignore the incomplete index on recovery. -test_run:cmd('restart server default') +-- Use KILL because server will hang on shutdown due to injection. +-- We don't need graceful shutdown for the test anyway. +test_run:cmd('restart server default with signal=KILL') s = box.space.test --- ... diff --git a/test/vinyl/errinj_vylog.test.lua b/test/vinyl/errinj_vylog.test.lua index 4401f301502c..54a69c65997b 100644 --- a/test/vinyl/errinj_vylog.test.lua +++ b/test/vinyl/errinj_vylog.test.lua @@ -198,7 +198,9 @@ _ = fiber.create(function() s:create_index('sk', {parts = {2, 'unsigned'}}) end) fiber.sleep(0.01) -- Should ignore the incomplete index on recovery. -test_run:cmd('restart server default') +-- Use KILL because server will hang on shutdown due to injection. +-- We don't need graceful shutdown for the test anyway. +test_run:cmd('restart server default with signal=KILL') s = box.space.test s.index[1] == nil diff --git a/test/xlog/panic_on_wal_error.result b/test/xlog/panic_on_wal_error.result index c4494ac87a84..0806a96ed20a 100644 --- a/test/xlog/panic_on_wal_error.result +++ b/test/xlog/panic_on_wal_error.result @@ -121,7 +121,7 @@ box.cfg.force_recovery -- try to start the replica, ha-ha -- (replication should fail, some rows are missing) -- -test_run:cmd("start server replica with wait=False") +test_run:cmd("start server replica with wait=False, crash_expected=True") --- - true ... diff --git a/test/xlog/panic_on_wal_error.test.lua b/test/xlog/panic_on_wal_error.test.lua index eea6aad300ea..77bcde7877de 100644 --- a/test/xlog/panic_on_wal_error.test.lua +++ b/test/xlog/panic_on_wal_error.test.lua @@ -57,7 +57,7 @@ box.cfg.force_recovery -- try to start the replica, ha-ha -- (replication should fail, some rows are missing) -- -test_run:cmd("start server replica with wait=False") +test_run:cmd("start server replica with wait=False, crash_expected=True") test_run:cmd("switch replica") -- Need to wait for box.info.replication[1] defined, otherwise test-run fails to -- wait for the upstream status sometimes.