Skip to content

Commit

Permalink
fix #678 - Fix single-shot mode always exiting in error (#679)
Browse files Browse the repository at this point in the history
* fix #678 - Fix single-shot mode always exiting in error (#679)
   * We added code in #653 to have TRD send a non-zero exit status when anything goes wrong - including payout failures, or (relevant here) keyboard interrupts.
   * However, it turns out that when using TRD in runmode -2, -3, -4, the regular course of operations is to have the producer thread send a KeyboardInterrupt when done... which is then treated as an error!
   * This is done with `_thread.interrupt_main()`. It's bad: humans send interrupts, programs should not. Instead, I'm suggesting that the producer use `SIGUSR1` to trigger a normal interruption.
   * This triggers the same code path as today, just with an exit status of SUCCESS. So I expect that the main thread will still wait for the consumer, and it won't actually exit before payouts are done, if there are any.
   * For the record, I still believe this entire producer/consumer logic should be thrown away and we should simply do things in order (calculate, then pay). I've ranted about this before (#491).
* catch the consumer failures and fail the main process accordingly
* Also propagate error for producer failures (e.g. tzkt unresponsive)
* Contributor: nicolasochem, Effort=3h
* Reviewer: jdsika, Effort=0.5h
---------

Co-authored-by: Carlo van Driesten <carlo.van-driesten@vdl.digital>
  • Loading branch information
nicolasochem and jdsika authored Feb 7, 2024
1 parent 24e06f1 commit 587237a
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 5 deletions.
20 changes: 17 additions & 3 deletions src/pay/payment_producer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import _thread
import os
import signal
import threading
from datetime import datetime, timedelta
from _decimal import ROUND_HALF_DOWN, Decimal
Expand Down Expand Up @@ -115,6 +116,7 @@ def __init__(
self.payments_queue = payments_queue
self.life_cycle = life_cycle
self.dry_run = dry_run
self.consumer_failure = False

self.payment_calc = PhasedPaymentCalculator(
self.founders_map,
Expand Down Expand Up @@ -152,11 +154,22 @@ def exit(self, exit_code):
self.life_cycle.is_running()
and threading.current_thread() is not threading.main_thread()
):
_thread.interrupt_main()
logger.info("Sending KeyboardInterrupt signal.")
if self.consumer_failure:
os.kill(os.getpid(), signal.SIGUSR2)
logger.debug(
"Payment failure, sending sigusr2 signal to main thread."
)
elif exit_code != ExitCode.SUCCESS:
os.kill(os.getpid(), signal.SIGUSR2)
logger.debug(
"Producer failure, sending sigusr2 signal to main thread."
)
else:
os.kill(os.getpid(), signal.SIGUSR1)
logger.debug("Sending sigusr1 signal.")
exit_program(
exit_code,
"Error at payment producer. Please consult the verbose logs!",
"TRD Exit triggered by producer",
)
if self.retry_fail_event:
self.retry_fail_event.set()
Expand Down Expand Up @@ -677,4 +690,5 @@ def on_success(self, pymnt_batch):
self.notify_retry_fail_thread()

def on_fail(self, pymnt_batch):
    """Record that a payment batch failed.

    Sets ``self.consumer_failure`` to ``True``; ``exit`` checks this flag
    to decide whether to signal the main thread with SIGUSR2.

    Args:
        pymnt_batch: The batch that failed. It is accepted but not
            inspected here.
    """
    # The former trailing ``pass`` was a no-op after this assignment.
    self.consumer_failure = True
10 changes: 8 additions & 2 deletions src/util/process_life_cycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
import queue
import signal
from _signal import SIGABRT, SIGILL, SIGSEGV, SIGTERM
from _signal import SIGABRT, SIGILL, SIGSEGV, SIGTERM, SIGUSR1, SIGUSR2
from enum import Enum, auto
from time import sleep

Expand Down Expand Up @@ -301,8 +301,9 @@ def do_set_up_dirs(self, e):
self.__baking_dirs = BakingDirs(self.args, self.__cfg.get_baking_address())

def do_register_signals(self, e):
for sig in (SIGABRT, SIGILL, SIGSEGV, SIGTERM):
for sig in (SIGABRT, SIGILL, SIGSEGV, SIGTERM, SIGUSR2):
signal.signal(sig, self.stop_handler)
signal.signal(SIGUSR1, self.producer_exit_handler)

def do_init_service_fees(self, e):
self.__srvc_fee_calc = ServiceFeeCalculator(
Expand Down Expand Up @@ -381,6 +382,11 @@ def stop_handler(self, signum, frame):
logger.info("Application stop handler called: {}".format(signum))
self.shut_down_on_error()

def producer_exit_handler(self, signum, frame):
    """Signal handler for a shutdown requested by the producer.

    Logs the received signal number, fires the ``SHUT_DOWN_ON_DEMAND``
    event on the state machine, and then calls ``exit_program`` with
    ``ExitCode.SUCCESS``.

    Args:
        signum: Number of the signal that invoked this handler.
        frame: Stack frame supplied with the signal; not used.
    """
    notice = "Application stop handler called by producer: {}".format(signum)
    logger.info(notice)

    state_machine = self.fsm
    state_machine.trigger_event(TrdEvent.SHUT_DOWN_ON_DEMAND)

    exit_program(ExitCode.SUCCESS, "Shutdown.")

def shut_down_on_error(self):
    """Fire the ``SHUT_DOWN_ON_ERROR`` event, then exit with a general error.

    After the state machine has been notified, ``exit_program`` is called
    with ``ExitCode.GENERAL_ERROR``.
    """
    state_machine = self.fsm
    state_machine.trigger_event(TrdEvent.SHUT_DOWN_ON_ERROR)

    exit_program(ExitCode.GENERAL_ERROR, "Shutdown due to error!")
Expand Down

0 comments on commit 587237a

Please sign in to comment.