PR comments.
Crystal Lemire committed Dec 2, 2023
1 parent 7b8f83f commit cd77cee
Showing 7 changed files with 86 additions and 43 deletions.
12 changes: 6 additions & 6 deletions protocol/app/app.go
@@ -613,7 +613,7 @@ func New(
if daemonFlags.Liquidation.Enabled {
app.LiquidationsClient = liquidationclient.NewClient(logger)
go func() {
app.MonitorDaemon(app.LiquidationsClient, MaximumDaemonUnhealthyDuration)
app.RegisterDaemonWithHealthMonitor(app.LiquidationsClient, MaximumDaemonUnhealthyDuration)
if err := app.LiquidationsClient.Start(
// The client will use `context.Background` so that it can have a different context from
// the main application.
@@ -645,7 +645,7 @@ func New(
constants.StaticExchangeDetails,
&pricefeedclient.SubTaskRunnerImpl{},
)
app.MonitorDaemon(app.PriceFeedClient, MaximumDaemonUnhealthyDuration)
app.RegisterDaemonWithHealthMonitor(app.PriceFeedClient, MaximumDaemonUnhealthyDuration)
}

// Start Bridge Daemon.
@@ -655,7 +655,7 @@ func New(
// environments.
app.BridgeClient = bridgeclient.NewClient(logger)
go func() {
app.MonitorDaemon(app.BridgeClient, MaximumDaemonUnhealthyDuration)
app.RegisterDaemonWithHealthMonitor(app.BridgeClient, MaximumDaemonUnhealthyDuration)
if err := app.BridgeClient.Start(
// The client will use `context.Background` so that it can have a different context from
// the main application.
@@ -1230,9 +1230,9 @@ func New(
return app
}

// MonitorDaemon registers a daemon service with the update monitor. If the daemon does not register, the method will
// panic.
func (app *App) MonitorDaemon(
// RegisterDaemonWithHealthMonitor registers a daemon service with the update monitor, which will commence monitoring
// the health of the daemon. If the daemon does not register, the method will panic.
func (app *App) RegisterDaemonWithHealthMonitor(
healthCheckableDaemon daemontypes.HealthCheckable,
maximumAcceptableUpdateDelay time.Duration,
) {
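
Not part of the commit: a minimal, self-contained sketch of the behavior the renamed method is expected to provide — register each daemon with the health monitor and panic if the same service is registered twice, matching the error the test below asserts. The toyMonitor type, its RegisterService method, and the stubDaemon client are illustrative stand-ins, not the repo's actual API.

package main

import (
	"fmt"
	"time"
)

// healthCheckable is an illustrative stand-in for daemontypes.HealthCheckable.
type healthCheckable interface {
	ServiceName() string
	HealthCheck() error
}

// toyMonitor is an illustrative stand-in for the daemon health monitor.
type toyMonitor struct {
	services map[string]time.Duration
}

// RegisterService records the maximum acceptable unhealthy duration for a service,
// and fails if the service is already registered.
func (m *toyMonitor) RegisterService(hc healthCheckable, maxUnhealthy time.Duration) error {
	if _, ok := m.services[hc.ServiceName()]; ok {
		return fmt.Errorf("service %s already registered", hc.ServiceName())
	}
	m.services[hc.ServiceName()] = maxUnhealthy
	return nil
}

// registerDaemonWithHealthMonitor mirrors the app method's contract: a registration
// failure is fatal, so a mis-wired daemon is caught at startup rather than ignored.
func registerDaemonWithHealthMonitor(m *toyMonitor, hc healthCheckable, maxUnhealthy time.Duration) {
	if err := m.RegisterService(hc, maxUnhealthy); err != nil {
		panic(err)
	}
}

// stubDaemon is an illustrative daemon client.
type stubDaemon struct{}

func (stubDaemon) ServiceName() string { return "test-service" }
func (stubDaemon) HealthCheck() error  { return nil }

func main() {
	m := &toyMonitor{services: map[string]time.Duration{}}
	registerDaemonWithHealthMonitor(m, stubDaemon{}, 5*time.Minute)
	// Registering the same service a second time panics with
	// "service test-service already registered".
	// registerDaemonWithHealthMonitor(m, stubDaemon{}, 5*time.Minute)
}
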
6 changes: 3 additions & 3 deletions protocol/app/app_test.go
@@ -225,17 +225,17 @@ func TestModuleBasics(t *testing.T) {
require.Equal(t, expectedFieldTypes, actualFieldTypes, "Module basics does not match expected")
}

func TestMonitorDaemon_Panics(t *testing.T) {
func TestRegisterDaemonWithHealthMonitor_Panics(t *testing.T) {
app := testapp.DefaultTestApp(nil)
hc := &mocks.HealthCheckable{}
hc.On("ServiceName").Return("test-service")
hc.On("HealthCheck").Return(nil)

app.MonitorDaemon(hc, 5*time.Minute)
app.RegisterDaemonWithHealthMonitor(hc, 5*time.Minute)
// The second registration should fail, causing a panic.
require.PanicsWithError(
t,
"service test-service already registered",
func() { app.MonitorDaemon(hc, 5*time.Minute) },
func() { app.RegisterDaemonWithHealthMonitor(hc, 5*time.Minute) },
)
}
12 changes: 4 additions & 8 deletions protocol/daemons/server/bridge.go
@@ -30,16 +30,12 @@ func (s *Server) AddBridgeEvents(
response *api.AddBridgeEventsResponse,
err error,
) {
// Capture valid responses in metrics.
defer func() {
if err == nil {
s.reportValidResponse(types.PricefeedDaemonServiceName)
}
}()

s.reportValidResponse(types.BridgeDaemonServiceName)
if err = s.bridgeEventManager.AddBridgeEvents(req.BridgeEvents); err != nil {
return nil, err
}

// Capture valid responses in metrics.
s.reportValidResponse(types.BridgeDaemonServiceName)

return &api.AddBridgeEventsResponse{}, nil
}
11 changes: 4 additions & 7 deletions protocol/daemons/server/liquidation.go
@@ -33,13 +33,6 @@ func (s *Server) LiquidateSubaccounts(
response *api.LiquidateSubaccountsResponse,
err error,
) {
// Capture valid responses in metrics.
defer func() {
if err == nil {
s.reportValidResponse(types.PricefeedDaemonServiceName)
}
}()

telemetry.ModuleSetGauge(
metrics.LiquidationDaemon,
float32(len(req.SubaccountIds)),
@@ -49,5 +42,9 @@
)

s.liquidatableSubaccountIds.UpdateSubaccountIds(req.SubaccountIds)

// Capture valid responses in metrics.
s.reportValidResponse(types.LiquidationsDaemonServiceName)

return &api.LiquidateSubaccountsResponse{}, nil
}
12 changes: 5 additions & 7 deletions protocol/daemons/server/pricefeed.go
@@ -39,13 +39,6 @@ func (s *Server) UpdateMarketPrices(
response *api.UpdateMarketPricesResponse,
err error,
) {
// Capture valid responses in metrics.
defer func() {
if err == nil {
s.reportValidResponse(types.PricefeedDaemonServiceName)
}
}()

// Measure latency in ingesting and handling gRPC price update.
defer telemetry.ModuleMeasureSince(
metrics.PricefeedServer,
@@ -54,6 +47,8 @@
metrics.Latency,
)

// This panic is an unexpected condition because we initialize the market price cache in app initialization before
// starting the server or daemons.
if s.marketToExchange == nil {
panic(
errorsmod.Wrapf(
@@ -71,6 +66,9 @@

s.marketToExchange.UpdatePrices(req.MarketPriceUpdates)

// Capture valid responses in metrics.
s.reportValidResponse(types.PricefeedDaemonServiceName)

return &api.UpdateMarketPricesResponse{}, nil
}

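For context, not part of the diff: each of the three handlers above previously captured the valid-response metric in a deferred closure that hard-coded the pricefeed service name, so successful bridge and liquidation requests were attributed to the wrong daemon. The commit drops the defer and reports after the work succeeds, under each handler's own service name. A self-contained sketch of the resulting pattern; the service-name strings and helper names are illustrative.

package main

import (
	"errors"
	"fmt"
)

// reportValidResponse is an illustrative stand-in for the server's per-daemon metrics hook.
func reportValidResponse(serviceName string) {
	fmt.Printf("valid response recorded for %s\n", serviceName)
}

// handle shows the pattern adopted in bridge.go, liquidation.go, and pricefeed.go:
// perform the update first, then report a valid response tagged with the handler's
// own service name only when no error occurred.
func handle(serviceName string, update func() error) error {
	if err := update(); err != nil {
		return err
	}
	// Capture valid responses in metrics.
	reportValidResponse(serviceName)
	return nil
}

func main() {
	_ = handle("liquidation-daemon", func() error { return nil })                // reported
	_ = handle("bridge-daemon", func() error { return errors.New("bad event") }) // not reported
}
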
71 changes: 60 additions & 11 deletions protocol/daemons/server/types/health_checker.go
@@ -17,27 +17,36 @@ type timestampWithError struct {
err error
}

// Update updates the timestampWithError to reflect the current error. If the timestamp is zero, this is the first
// update, so set the timestamp.
func (u *timestampWithError) Update(timestamp time.Time, err error) {
// If the timestamp is zero, this is the first update, so set the timestamp.
if u.timestamp.IsZero() {
u.timestamp = timestamp
}

u.err = err
}

// Reset resets the timestampWithError to its zero value, indicating that the service is healthy.
func (u *timestampWithError) Reset() {
u.timestamp = time.Time{}
u.err = nil
}

// IsZero returns true if the timestampWithError is zero, indicating that the service is healthy.
func (u *timestampWithError) IsZero() bool {
return u.timestamp.IsZero() && u.err == nil
}

// Timestamp returns the timestamp associated with the timestampWithError, which is the timestamp of the first error
// in the current error streak.
func (u *timestampWithError) Timestamp() time.Time {
return u.timestamp
}

// Error returns the error associated with the timestampWithError, which is the most recent error in the current error
// streak.
func (u *timestampWithError) Error() error {
return u.err
}
@@ -48,16 +57,31 @@ type healthCheckerMutableState struct {
// lock is used to synchronize access to mutable state fields.
lock sync.Mutex

// mostRecentSuccess is the timestamp of the most recent successful health check.
// Access to mostRecentSuccess is synchronized.
mostRecentSuccess time.Time
// lastSuccessTimestamp is the timestamp of the most recent successful health check.
// Access to lastSuccessTimestamp is synchronized.
lastSuccessTimestamp time.Time

// mostRecentFailureStreakError tracks the timestamp of the first error in the most recent streak of errors, as well
// as the most recent error. It is updated on every error and reset every time the service sees a healthy response.
// This field is used to determine how long the daemon has been unhealthy. If this timestamp is zero, then either
// the service has never been unhealthy, or the most recent error streak ended before it could trigger a callback.
// Access to mostRecentFailureStreakError is synchronized.
mostRecentFailureStreakError timestampWithError

// timer triggers a health check poll for a health-checkable service.
timer *time.Timer

// stopped indicates whether the health checker has been stopped. Additional health checks cannot be scheduled
// after the health checker has been stopped.
stopped bool
}

// newHealthCheckerMutableState creates a new health checker mutable state scheduled to trigger a poll after the
// initial poll delay.
func newHealthCheckerMutableState(initialPollDelay time.Duration, pollFunc func()) *healthCheckerMutableState {
return &healthCheckerMutableState{
timer: time.AfterFunc(initialPollDelay, pollFunc),
}
}

// ReportSuccess updates the health checker's mutable state to reflect a successful health check and schedules the next
@@ -66,7 +90,7 @@ func (u *healthCheckerMutableState) ReportSuccess(now time.Time) {
u.lock.Lock()
defer u.lock.Unlock()

u.mostRecentSuccess = now
u.lastSuccessTimestamp = now

// Whenever the service is healthy, reset the first failure in streak timestamp.
u.mostRecentFailureStreakError.Reset()
@@ -83,6 +107,35 @@ func (u *healthCheckerMutableState) ReportFailure(now time.Time, err error) time
return now.Sub(u.mostRecentFailureStreakError.Timestamp())
}

// SchedulePoll schedules the next poll for the health-checkable service. If the service is stopped, the next poll
// will not be scheduled. This method is synchronized.
func (u *healthCheckerMutableState) SchedulePoll(nextPollDelay time.Duration) {
u.lock.Lock()
defer u.lock.Unlock()

// Don't schedule a poll if the health checker has been stopped.
if u.stopped {
return
}

// Schedule the next poll.
u.timer.Reset(nextPollDelay)
}

// Stop stops the health checker. This method is synchronized.
func (u *healthCheckerMutableState) Stop() {
u.lock.Lock()
defer u.lock.Unlock()

// Don't stop the health checker if it has already been stopped.
if u.stopped {
return
}

u.timer.Stop()
u.stopped = true
}

// healthChecker encapsulates the logic for monitoring the health of a health-checkable service.
type healthChecker struct {
// mutableState is the mutable state of the health checker. Access to these fields is synchronized.
@@ -94,9 +147,6 @@ type healthChecker struct {
// pollFrequency is the frequency at which the health-checkable service is polled.
pollFrequency time.Duration

// timer triggers a health check poll for a health-checkable service.
timer *time.Timer

// maxAcceptableUnhealthyDuration is the maximum acceptable duration for a health-checkable service to
// remain unhealthy. If the service remains unhealthy for this duration, the monitor will execute the
// specified callback function.
@@ -134,12 +184,12 @@ func (hc *healthChecker) Poll() {
}

// Schedule next poll.
hc.timer.Reset(hc.pollFrequency)
hc.mutableState.SchedulePoll(hc.pollFrequency)
}

// Stop stops the health checker. This method is not synchronized, as the timer does not need synchronization.
func (hc *healthChecker) Stop() {
hc.timer.Stop()
hc.mutableState.Stop()
}

// StartNewHealthChecker creates and starts a new health checker for a health-checkable service.
@@ -159,11 +209,10 @@ func StartNewHealthChecker(
timeProvider: timeProvider,
maxAcceptableUnhealthyDuration: maximumAcceptableUnhealthyDuration,
logger: logger,
mutableState: &healthCheckerMutableState{},
}

// The first poll is scheduled after the startup grace period to allow the service to initialize.
checker.timer = time.AfterFunc(startupGracePeriod, checker.Poll)
checker.mutableState = newHealthCheckerMutableState(startupGracePeriod, checker.Poll)

return checker
}
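
The structural change in this file is that the poll timer now lives inside healthCheckerMutableState next to a synchronized stopped flag, so a poll that completes concurrently with Stop can no longer re-arm the timer. A self-contained sketch of that scheduling discipline, with illustrative type and function names:

package main

import (
	"fmt"
	"sync"
	"time"
)

// pollScheduler sketches the synchronization introduced above: the timer sits behind a
// mutex next to a stopped flag, so scheduling after Stop becomes a no-op.
type pollScheduler struct {
	lock    sync.Mutex
	timer   *time.Timer
	stopped bool
}

// newPollScheduler arms the first poll after an initial delay (the startup grace period).
func newPollScheduler(initialDelay time.Duration, poll func()) *pollScheduler {
	return &pollScheduler{timer: time.AfterFunc(initialDelay, poll)}
}

// SchedulePoll re-arms the timer unless the scheduler has been stopped.
func (s *pollScheduler) SchedulePoll(delay time.Duration) {
	s.lock.Lock()
	defer s.lock.Unlock()
	if s.stopped {
		return
	}
	s.timer.Reset(delay)
}

// Stop cancels any pending poll and prevents future scheduling.
func (s *pollScheduler) Stop() {
	s.lock.Lock()
	defer s.lock.Unlock()
	if s.stopped {
		return
	}
	s.timer.Stop()
	s.stopped = true
}

func main() {
	s := newPollScheduler(10*time.Millisecond, func() { fmt.Println("poll") })
	time.Sleep(20 * time.Millisecond) // first poll fires
	s.SchedulePoll(10 * time.Millisecond)
	time.Sleep(20 * time.Millisecond) // rescheduled poll fires
	s.Stop()
	s.SchedulePoll(10 * time.Millisecond) // no-op: the scheduler is stopped
	time.Sleep(20 * time.Millisecond)     // nothing fires
}
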
5 changes: 4 additions & 1 deletion protocol/daemons/server/types/health_monitor.go
@@ -13,6 +13,9 @@ import (
const (
// HealthCheckPollFrequency is the frequency at which the health-checkable service is polled.
HealthCheckPollFrequency = 5 * time.Second

// HealthMonitorLogModuleName is the module name used for logging within the health monitor.
HealthMonitorLogModuleName = "daemon-health-monitor"
)

// healthMonitorMutableState tracks all mutable state associated with the health monitor. This state is gathered into
@@ -132,7 +135,7 @@ func NewHealthMonitor(
) *HealthMonitor {
return &HealthMonitor{
mutableState: newHealthMonitorMutableState(),
logger: logger.With(cosmoslog.ModuleKey, "health-monitor"),
logger: logger.With(cosmoslog.ModuleKey, HealthMonitorLogModuleName),
startupGracePeriod: startupGracePeriod,
pollingFrequency: pollingFrequency,
}
