From 433638885143639a4cd70e04d2d344fb6a370925 Mon Sep 17 00:00:00 2001 From: Crystal Lemire Date: Thu, 7 Dec 2023 15:28:26 -0800 Subject: [PATCH] [CORE-668,CORE-669] - Add health monitor flags (#802) --- protocol/app/app.go | 22 ++++------ protocol/app/flags/flags.go | 1 - protocol/daemons/flags/flags.go | 36 ++++++++++++++-- protocol/daemons/flags/flags_test.go | 12 +++++- .../client/client_integration_test.go | 7 +++- .../daemons/server/types/health_checker.go | 21 +++++----- .../daemons/server/types/health_monitor.go | 37 ++++++++++++----- .../server/types/health_monitor_test.go | 41 ++++++++++++++++++- protocol/docker-compose.yml | 22 ++++++---- .../clob/client/cli/cancel_order_cli_test.go | 8 ++++ .../clob/client/cli/liquidations_cli_test.go | 8 ++++ .../x/clob/client/cli/place_order_cli_test.go | 8 ++++ 12 files changed, 173 insertions(+), 50 deletions(-) diff --git a/protocol/app/app.go b/protocol/app/app.go index a13d66b9dc..6a6b04fb61 100644 --- a/protocol/app/app.go +++ b/protocol/app/app.go @@ -183,10 +183,6 @@ import ( var ( // DefaultNodeHome default home directories for the application daemon DefaultNodeHome string - - // MaximumDaemonUnhealthyDuration is the maximum amount of time that a daemon can be unhealthy before the - // application panics. - MaximumDaemonUnhealthyDuration = 5 * time.Minute ) var ( @@ -599,6 +595,7 @@ func New( daemonservertypes.DaemonStartupGracePeriod, daemonservertypes.HealthCheckPollFrequency, app.Logger(), + daemonFlags.Shared.PanicOnDaemonFailureEnabled, ) // Create a closure for starting daemons and daemon server. Daemon services are delayed until after the gRPC // service is started because daemons depend on the gRPC service being available. If a node is initialized @@ -606,6 +603,7 @@ func New( // daemons will not be able to connect to the cosmos gRPC query service and finish initialization, and the daemon // monitoring service will panic. 
app.startDaemons = func() { + maxDaemonUnhealthyDuration := time.Duration(daemonFlags.Shared.MaxDaemonUnhealthySeconds) * time.Second // Start server for handling gRPC messages from daemons. go app.Server.Start() @@ -613,7 +611,7 @@ func New( if daemonFlags.Liquidation.Enabled { app.LiquidationsClient = liquidationclient.NewClient(logger) go func() { - app.RegisterDaemonWithHealthMonitor(app.LiquidationsClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.LiquidationsClient, maxDaemonUnhealthyDuration) if err := app.LiquidationsClient.Start( // The client will use `context.Background` so that it can have a different context from // the main application. @@ -645,17 +643,15 @@ func New( constants.StaticExchangeDetails, &pricefeedclient.SubTaskRunnerImpl{}, ) - app.RegisterDaemonWithHealthMonitor(app.PriceFeedClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.PriceFeedClient, maxDaemonUnhealthyDuration) } // Start Bridge Daemon. // Non-validating full-nodes have no need to run the bridge daemon. if !appFlags.NonValidatingFullNode && daemonFlags.Bridge.Enabled { - // TODO(CORE-582): Re-enable bridge daemon registration once the bridge daemon is fixed in local / CI - // environments. app.BridgeClient = bridgeclient.NewClient(logger) go func() { - app.RegisterDaemonWithHealthMonitor(app.BridgeClient, MaximumDaemonUnhealthyDuration) + app.RegisterDaemonWithHealthMonitor(app.BridgeClient, maxDaemonUnhealthyDuration) if err := app.BridgeClient.Start( // The client will use `context.Background` so that it can have a different context from // the main application. @@ -1234,17 +1230,17 @@ func New( // the health of the daemon. If the daemon does not register, the method will panic. 
func (app *App) RegisterDaemonWithHealthMonitor( healthCheckableDaemon daemontypes.HealthCheckable, - maximumAcceptableUpdateDelay time.Duration, + maxDaemonUnhealthyDuration time.Duration, ) { - if err := app.DaemonHealthMonitor.RegisterService(healthCheckableDaemon, maximumAcceptableUpdateDelay); err != nil { + if err := app.DaemonHealthMonitor.RegisterService(healthCheckableDaemon, maxDaemonUnhealthyDuration); err != nil { app.Logger().Error( "Failed to register daemon service with update monitor", "error", err, "service", healthCheckableDaemon.ServiceName(), - "maximumAcceptableUpdateDelay", - maximumAcceptableUpdateDelay, + "maxDaemonUnhealthyDuration", + maxDaemonUnhealthyDuration, ) panic(err) } diff --git a/protocol/app/flags/flags.go b/protocol/app/flags/flags.go index 21333d95f1..0603997801 100644 --- a/protocol/app/flags/flags.go +++ b/protocol/app/flags/flags.go @@ -2,7 +2,6 @@ package flags import ( "fmt" - "github.com/cosmos/cosmos-sdk/server/config" servertypes "github.com/cosmos/cosmos-sdk/server/types" "github.com/spf13/cast" diff --git a/protocol/daemons/flags/flags.go b/protocol/daemons/flags/flags.go index eebdf7e631..e386f379ab 100644 --- a/protocol/daemons/flags/flags.go +++ b/protocol/daemons/flags/flags.go @@ -9,7 +9,9 @@ import ( // List of CLI flags for Server and Client. const ( // Flag names - FlagUnixSocketAddress = "unix-socket-address" + FlagUnixSocketAddress = "unix-socket-address" + FlagPanicOnDaemonFailureEnabled = "panic-on-daemon-failure-enabled" + FlagMaxDaemonUnhealthySeconds = "max-daemon-unhealthy-seconds" FlagPriceDaemonEnabled = "price-daemon-enabled" FlagPriceDaemonLoopDelayMs = "price-daemon-loop-delay-ms" @@ -28,6 +30,10 @@ const ( type SharedFlags struct { // SocketAddress is the location of the unix socket to communicate with the daemon gRPC service. SocketAddress string + // PanicOnDaemonFailureEnabled toggles whether the daemon should panic on failure. 
+ PanicOnDaemonFailureEnabled bool + // MaxDaemonUnhealthySeconds is the maximum allowable duration for which a daemon can be unhealthy. + MaxDaemonUnhealthySeconds uint32 } // BridgeFlags contains configuration flags for the Bridge Daemon. @@ -74,7 +80,9 @@ func GetDefaultDaemonFlags() DaemonFlags { if defaultDaemonFlags == nil { defaultDaemonFlags = &DaemonFlags{ Shared: SharedFlags{ - SocketAddress: "/tmp/daemons.sock", + SocketAddress: "/tmp/daemons.sock", + PanicOnDaemonFailureEnabled: true, + MaxDaemonUnhealthySeconds: 5 * 60, // 5 minutes. }, Bridge: BridgeFlags{ Enabled: true, @@ -109,8 +117,18 @@ func AddDaemonFlagsToCmd( cmd.Flags().String( FlagUnixSocketAddress, df.Shared.SocketAddress, - "Socket address for the price daemon to send updates to, if not set "+ - "will establish default location to ingest price updates from", + "Socket address for the daemons to send updates to, if not set "+ + "will establish default location to ingest daemon updates from", + ) + cmd.Flags().Bool( + FlagPanicOnDaemonFailureEnabled, + df.Shared.PanicOnDaemonFailureEnabled, + "Enables panicking when a daemon fails.", + ) + cmd.Flags().Uint32( + FlagMaxDaemonUnhealthySeconds, + df.Shared.MaxDaemonUnhealthySeconds, + "Maximum allowable duration for which a daemon can be unhealthy.", ) // Bridge Daemon. @@ -178,6 +196,16 @@ func GetDaemonFlagValuesFromOptions( result.Shared.SocketAddress = v } } + if option := appOpts.Get(FlagPanicOnDaemonFailureEnabled); option != nil { + if v, err := cast.ToBoolE(option); err == nil { + result.Shared.PanicOnDaemonFailureEnabled = v + } + } + if option := appOpts.Get(FlagMaxDaemonUnhealthySeconds); option != nil { + if v, err := cast.ToUint32E(option); err == nil { + result.Shared.MaxDaemonUnhealthySeconds = v + } + } // Bridge Daemon. 
if option := appOpts.Get(FlagBridgeDaemonEnabled); option != nil { diff --git a/protocol/daemons/flags/flags_test.go b/protocol/daemons/flags/flags_test.go index 5c79395a39..04191032f6 100644 --- a/protocol/daemons/flags/flags_test.go +++ b/protocol/daemons/flags/flags_test.go @@ -17,6 +17,8 @@ func TestAddDaemonFlagsToCmd(t *testing.T) { flags.AddDaemonFlagsToCmd(&cmd) tests := []string{ flags.FlagUnixSocketAddress, + flags.FlagPanicOnDaemonFailureEnabled, + flags.FlagMaxDaemonUnhealthySeconds, flags.FlagBridgeDaemonEnabled, flags.FlagBridgeDaemonLoopDelayMs, @@ -41,6 +43,8 @@ func TestGetDaemonFlagValuesFromOptions_Custom(t *testing.T) { optsMap := make(map[string]interface{}) optsMap[flags.FlagUnixSocketAddress] = "test-socket-address" + optsMap[flags.FlagPanicOnDaemonFailureEnabled] = false + optsMap[flags.FlagMaxDaemonUnhealthySeconds] = uint32(1234) optsMap[flags.FlagBridgeDaemonEnabled] = true optsMap[flags.FlagBridgeDaemonLoopDelayMs] = uint32(1111) @@ -64,6 +68,12 @@ func TestGetDaemonFlagValuesFromOptions_Custom(t *testing.T) { // Shared. require.Equal(t, optsMap[flags.FlagUnixSocketAddress], r.Shared.SocketAddress) + require.Equal(t, optsMap[flags.FlagPanicOnDaemonFailureEnabled], r.Shared.PanicOnDaemonFailureEnabled) + require.Equal( + t, + optsMap[flags.FlagMaxDaemonUnhealthySeconds], + r.Shared.MaxDaemonUnhealthySeconds, + ) // Bridge Daemon. require.Equal(t, optsMap[flags.FlagBridgeDaemonEnabled], r.Bridge.Enabled) @@ -81,7 +91,7 @@ func TestGetDaemonFlagValuesFromOptions_Custom(t *testing.T) { require.Equal(t, optsMap[flags.FlagPriceDaemonLoopDelayMs], r.Price.LoopDelayMs) } -func TestGetDaemonFlagValuesFromOptions_Defaul(t *testing.T) { +func TestGetDaemonFlagValuesFromOptions_Default(t *testing.T) { mockOpts := mocks.AppOptions{} mockOpts.On("Get", mock.Anything). 
Return(func(key string) interface{} { diff --git a/protocol/daemons/pricefeed/client/client_integration_test.go b/protocol/daemons/pricefeed/client/client_integration_test.go index 70e2dfd75c..45d32a28a4 100644 --- a/protocol/daemons/pricefeed/client/client_integration_test.go +++ b/protocol/daemons/pricefeed/client/client_integration_test.go @@ -5,7 +5,6 @@ package client_test import ( "fmt" "github.com/cometbft/cometbft/libs/log" - "github.com/dydxprotocol/v4-chain/protocol/app" appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" "github.com/dydxprotocol/v4-chain/protocol/daemons/flags" "github.com/dydxprotocol/v4-chain/protocol/daemons/pricefeed/client" @@ -285,6 +284,7 @@ func (s *PriceDaemonIntegrationTestSuite) SetupTest() { servertypes.DaemonStartupGracePeriod, servertypes.HealthCheckPollFrequency, log.TestingLogger(), + flags.GetDefaultDaemonFlags().Shared.PanicOnDaemonFailureEnabled, // Use default behavior for testing ) s.exchangePriceCache = pricefeedserver_types.NewMarketToExchangePrices(pricefeed_types.MaxPriceAge) @@ -337,7 +337,10 @@ func (s *PriceDaemonIntegrationTestSuite) startClient() { testExchangeToQueryDetails, &client.SubTaskRunnerImpl{}, ) - err := s.healthMonitor.RegisterService(s.pricefeedDaemon, app.MaximumDaemonUnhealthyDuration) + err := s.healthMonitor.RegisterService( + s.pricefeedDaemon, + time.Duration(s.daemonFlags.Shared.MaxDaemonUnhealthySeconds)*time.Second, + ) s.Require().NoError(err) } diff --git a/protocol/daemons/server/types/health_checker.go b/protocol/daemons/server/types/health_checker.go index 6c4d2727ab..228a6dfd8d 100644 --- a/protocol/daemons/server/types/health_checker.go +++ b/protocol/daemons/server/types/health_checker.go @@ -145,10 +145,10 @@ type healthChecker struct { // pollFrequency is the frequency at which the health-checkable service is polled. 
pollFrequency time.Duration - // maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health-checkable service to + // maxUnhealthyDuration is the maximum acceptable duration for a health-checkable service to // remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the // specified callback function. - maxAcceptableUnhealthyDuration time.Duration + maxUnhealthyDuration time.Duration // unhealthyCallback is the callback function to be executed if the health-checkable service remains // unhealthy for a period of time greater than or equal to the maximum acceptable unhealthy duration. @@ -174,7 +174,7 @@ func (hc *healthChecker) Poll() { streakDuration := hc.mutableState.ReportFailure(now, err) // If the service has been unhealthy for longer than the maximum acceptable unhealthy duration, execute the // callback function. - if streakDuration >= hc.maxAcceptableUnhealthyDuration { + if streakDuration >= hc.maxUnhealthyDuration { hc.unhealthyCallback(err) } } @@ -197,17 +197,18 @@ func StartNewHealthChecker( pollFrequency time.Duration, unhealthyCallback func(error), timeProvider libtime.TimeProvider, - maximumAcceptableUnhealthyDuration time.Duration, + maxUnhealthyDuration time.Duration, startupGracePeriod time.Duration, logger log.Logger, ) *healthChecker { checker := &healthChecker{ - healthCheckable: healthCheckable, - pollFrequency: pollFrequency, - unhealthyCallback: unhealthyCallback, - timeProvider: timeProvider, - maxAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration, - logger: logger, + healthCheckable: healthCheckable, + pollFrequency: pollFrequency, + unhealthyCallback: unhealthyCallback, + timeProvider: timeProvider, + maxUnhealthyDuration: maxUnhealthyDuration, + logger: logger, + mutableState: &healthCheckerMutableState{}, } // The first poll is scheduled after the startup grace period to allow the service to initialize. 
diff --git a/protocol/daemons/server/types/health_monitor.go b/protocol/daemons/server/types/health_monitor.go index 130398d18d..1ac3109589 100644 --- a/protocol/daemons/server/types/health_monitor.go +++ b/protocol/daemons/server/types/health_monitor.go @@ -122,9 +122,14 @@ type HealthMonitor struct { mutableState *healthMonitorMutableState // These fields are initialized in NewHealthMonitor and are not modified after initialization. - logger log.Logger + logger log.Logger + // startupGracePeriod is the grace period before the monitor starts polling the health-checkable services. startupGracePeriod time.Duration - pollingFrequency time.Duration + // pollingFrequency is the frequency at which the health-checkable services are polled. + pollingFrequency time.Duration + // enablePanics is used to toggle between panics or error logs when a daemon sustains an unhealthy state past the + // maximum allowable duration. + enablePanics bool } // NewHealthMonitor creates a new health monitor. @@ -132,12 +137,14 @@ func NewHealthMonitor( startupGracePeriod time.Duration, pollingFrequency time.Duration, logger log.Logger, + enablePanics bool, ) *HealthMonitor { return &HealthMonitor{ mutableState: newHealthMonitorMutableState(), logger: logger.With(cosmoslog.ModuleKey, HealthMonitorLogModuleName), startupGracePeriod: startupGracePeriod, pollingFrequency: pollingFrequency, + enablePanics: enablePanics, } } @@ -153,15 +160,15 @@ func (hm *HealthMonitor) DisableForTesting() { // health-checkable service before returning. 
func (hm *HealthMonitor) RegisterServiceWithCallback( hc types.HealthCheckable, - maximumAcceptableUnhealthyDuration time.Duration, + maxUnhealthyDuration time.Duration, callback func(error), ) error { - if maximumAcceptableUnhealthyDuration <= 0 { + if maxUnhealthyDuration <= 0 { return fmt.Errorf( "health check registration failure for service %v: "+ - "maximum acceptable unhealthy duration %v must be positive", + "maximum unhealthy duration %v must be positive", hc.ServiceName(), - maximumAcceptableUnhealthyDuration, + maxUnhealthyDuration, ) } @@ -171,7 +178,7 @@ func (hm *HealthMonitor) RegisterServiceWithCallback( hm.pollingFrequency, callback, &libtime.TimeProviderImpl{}, - maximumAcceptableUnhealthyDuration, + maxUnhealthyDuration, hm.startupGracePeriod, hm.logger, ) @@ -202,18 +209,26 @@ func LogErrorServiceNotResponding(hc types.HealthCheckable, logger log.Logger) f // RegisterService registers a new health-checkable service with the health check monitor. If the service // is unhealthy every time it is polled for a duration greater than or equal to the maximum acceptable unhealthy -// duration, the monitor will panic. +// duration, the monitor will panic or log an error, depending on the app configuration via the +// `panic-on-daemon-failure-enabled` flag. // This method is synchronized. It returns an error if the service was already registered or the monitor has // already been stopped. If the monitor has been stopped, this method will proactively stop the health-checkable // service before returning. func (hm *HealthMonitor) RegisterService( hc types.HealthCheckable, - maximumAcceptableUnhealthyDuration time.Duration, + maxDaemonUnhealthyDuration time.Duration, ) error { + // If the monitor is configured to panic, use the panic callback. Otherwise, use the error log callback. + // This behavior is configured via flag and defaults to panicking on daemon failure. 
+ callback := LogErrorServiceNotResponding(hc, hm.logger) + if hm.enablePanics { + callback = PanicServiceNotResponding(hc) + } + return hm.RegisterServiceWithCallback( hc, - maximumAcceptableUnhealthyDuration, - PanicServiceNotResponding(hc), + maxDaemonUnhealthyDuration, + callback, ) } diff --git a/protocol/daemons/server/types/health_monitor_test.go b/protocol/daemons/server/types/health_monitor_test.go index af991fd21c..4a8e701901 100644 --- a/protocol/daemons/server/types/health_monitor_test.go +++ b/protocol/daemons/server/types/health_monitor_test.go @@ -39,6 +39,7 @@ func createTestMonitor() (*types.HealthMonitor, *mocks.Logger) { ZeroDuration, 10*time.Millisecond, logger, + true, // enable panics here for stricter testing - a panic will definitely cause a test failure. ), logger } @@ -126,6 +127,44 @@ func TestRegisterServiceWithCallback_Mixed(t *testing.T) { } } +func TestHealthMonitor_DisablePanics_DoesNotPanic(t *testing.T) { + logger := &mocks.Logger{} + logger.On("With", "module", "daemon-health-monitor").Return(logger).Once() + logger.On( + "Error", + "health-checked service is unhealthy", + "service", + "test-service", + "error", + mock.Anything, + ).Return() + + hm := types.NewHealthMonitor( + ZeroDuration, + 10*time.Millisecond, + logger, + false, + ) + + hc := mockFailingHealthCheckerWithError("test-service", TestError1) + + err := hm.RegisterService(hc, 10*time.Millisecond) + require.NoError(t, err) + + defer func() { + hm.Stop() + }() + + // A 100ms sleep should be sufficient for the health monitor to detect the unhealthy service and trigger a callback. + time.Sleep(100 * time.Millisecond) + + // Assert. + // This test is confirmed to panic when panics are not disabled - but because the panic occurs in a separate + // go-routine, it cannot be easily captured with an assert. Instead, we do not try to capture the panic, but + // assert that the logger was called with the expected arguments. 
+ mock.AssertExpectationsForObjects(t, logger) +} + func TestRegisterServiceWithCallback_DoubleRegistrationFails(t *testing.T) { // Setup. ufm, logger := createTestMonitor() @@ -203,7 +242,7 @@ func TestRegisterValidResponseWithCallback_NegativeUnhealthyDuration(t *testing. ufm, _ := createTestMonitor() hc := mockFailingHealthCheckerWithError("test-service", TestError1) err := ufm.RegisterServiceWithCallback(hc, -50*time.Millisecond, func(error) {}) - require.ErrorContains(t, err, "maximum acceptable unhealthy duration -50ms must be positive") + require.ErrorContains(t, err, "maximum unhealthy duration -50ms must be positive") } func TestPanicServiceNotResponding(t *testing.T) { diff --git a/protocol/docker-compose.yml b/protocol/docker-compose.yml index 203f802f66..c303801158 100644 --- a/protocol/docker-compose.yml +++ b/protocol/docker-compose.yml @@ -10,13 +10,15 @@ services: - --log_level # Note that only this validator has a log-level of `info`; other validators use `error` by default. # Change to `debug` for more verbose log-level. - - info + - info - --home - /dydxprotocol/chain/.alice - - --p2p.persistent_peers + - --p2p.persistent_peers - "17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" # Effectively disable the daemon monitor because bridge daemon is flaky in localnet. 
environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} @@ -28,7 +30,7 @@ services: - "26657:26657" - "9090:9090" - "1317:1317" - + dydxprotocold1: image: local:dydxprotocol entrypoint: @@ -39,10 +41,12 @@ services: - error - --home - /dydxprotocol/chain/.bob - - --p2p.persistent_peers + - --p2p.persistent_peers - "17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} @@ -52,7 +56,7 @@ services: - ./localnet/dydxprotocol1:/dydxprotocol/chain/.bob/data ports: - "26658:26657" - + dydxprotocold2: image: local:dydxprotocol entrypoint: @@ -67,6 +71,8 @@ services: - "17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} @@ -74,7 +80,7 @@ services: - DAEMON_HOME=/dydxprotocol/chain/.carl volumes: - ./localnet/dydxprotocol2:/dydxprotocol/chain/.carl/data - + dydxprotocold3: image: local:dydxprotocol entrypoint: @@ -85,10 +91,12 @@ services: - error - --home - /dydxprotocol/chain/.dave - - --p2p.persistent_peers + - --p2p.persistent_peers - 
"17e5e45691f0d01449c84fd4ae87279578cdd7ec@dydxprotocold0:26656,b69182310be02559483e42c77b7b104352713166@dydxprotocold1:26656,47539956aaa8e624e0f1d926040e54908ad0eb44@dydxprotocold2:26656,5882428984d83b03d0c907c1f0af343534987052@dydxprotocold3:26656" - --bridge-daemon-eth-rpc-endpoint - "${ETH_RPC_ENDPOINT}" + - --max-daemon-unhealthy-seconds + - "4294967295" environment: # See https://docs.datadoghq.com/profiler/enabling/go/ for DD_ specific environment variables - DD_ENV=localnet_${USER} diff --git a/protocol/x/clob/client/cli/cancel_order_cli_test.go b/protocol/x/clob/client/cli/cancel_order_cli_test.go index 9f148f36ba..de7afeafe4 100644 --- a/protocol/x/clob/client/cli/cancel_order_cli_test.go +++ b/protocol/x/clob/client/cli/cancel_order_cli_test.go @@ -8,6 +8,7 @@ import ( appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" daemonflags "github.com/dydxprotocol/v4-chain/protocol/daemons/flags" "github.com/dydxprotocol/v4-chain/protocol/testutil/appoptions" + "math" "math/big" "testing" @@ -73,6 +74,13 @@ func (s *CancelOrderIntegrationTestSuite) SetupTest() { appOptions.Set(daemonflags.FlagPriceDaemonEnabled, false) appOptions.Set(daemonflags.FlagBridgeDaemonEnabled, false) + // Effectively disable the health monitor panic timeout for these tests. This is necessary + // because all clob cli tests are running in the same process and the total time to run is >> 5 minutes + // on CI, causing the panic to trigger for liquidations daemon go routines that haven't been properly + // cleaned up after a test run. + // TODO(CORE-29): Remove this once the liquidations daemon is refactored to be stoppable. + appOptions.Set(daemonflags.FlagMaxDaemonUnhealthySeconds, math.MaxUint32) + // Make sure the daemon is using the correct GRPC address. 
appOptions.Set(appflags.GrpcAddress, testval.AppConfig.GRPC.Address) }, diff --git a/protocol/x/clob/client/cli/liquidations_cli_test.go b/protocol/x/clob/client/cli/liquidations_cli_test.go index 01d02bf1a1..9c8223ae95 100644 --- a/protocol/x/clob/client/cli/liquidations_cli_test.go +++ b/protocol/x/clob/client/cli/liquidations_cli_test.go @@ -4,6 +4,7 @@ package cli_test import ( "fmt" + "math" appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" "math/big" @@ -74,6 +75,13 @@ func TestLiquidationOrderIntegrationTestSuite(t *testing.T) { appOptions.Set(daemonflags.FlagPriceDaemonEnabled, false) appOptions.Set(daemonflags.FlagBridgeDaemonEnabled, false) + // Effectively disable the health monitor panic timeout for these tests. This is necessary + // because all clob cli tests are running in the same process and the total time to run is >> 5 minutes + // on CI, causing the panic to trigger for liquidations daemon go routines that haven't been properly + // cleaned up after a test run. + // TODO(CORE-29): Remove this once the liquidations daemon is refactored to be stoppable. + appOptions.Set(daemonflags.FlagMaxDaemonUnhealthySeconds, math.MaxUint32) + // Make sure the daemon is using the correct GRPC address. appOptions.Set(appflags.GrpcAddress, testval.AppConfig.GRPC.Address) diff --git a/protocol/x/clob/client/cli/place_order_cli_test.go b/protocol/x/clob/client/cli/place_order_cli_test.go index bf6832efe8..5bbd5511fc 100644 --- a/protocol/x/clob/client/cli/place_order_cli_test.go +++ b/protocol/x/clob/client/cli/place_order_cli_test.go @@ -5,6 +5,7 @@ package cli_test import ( "fmt" appflags "github.com/dydxprotocol/v4-chain/protocol/app/flags" + "math" "math/big" "testing" @@ -68,6 +69,13 @@ func TestPlaceOrderIntegrationTestSuite(t *testing.T) { appOptions.Set(daemonflags.FlagPriceDaemonEnabled, false) appOptions.Set(daemonflags.FlagBridgeDaemonEnabled, false) + // Effectively disable the health monitor panic timeout for these tests. 
This is necessary + // because all clob cli tests are running in the same process and the total time to run is >> 5 minutes + // on CI, causing the panic to trigger for liquidations daemon goroutines that haven't been properly + // cleaned up after a test run. + // TODO(CORE-29): Remove this once the liquidations daemon is refactored to be stoppable. + appOptions.Set(daemonflags.FlagMaxDaemonUnhealthySeconds, math.MaxUint32) + // Make sure the daemon is using the correct GRPC address. appOptions.Set(appflags.GrpcAddress, testval.AppConfig.GRPC.Address) },