diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4026020..d551562 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -49,7 +49,7 @@ jobs: - run: mix test --exclude skip:true --trace macos: - runs-on: macos-11 + runs-on: macos-14 name: Test - Elixir (MacOS) steps: - uses: actions/checkout@v4 diff --git a/lib/exile/process.ex b/lib/exile/process.ex index 566f8a8..542eb49 100644 --- a/lib/exile/process.ex +++ b/lib/exile/process.ex @@ -340,6 +340,8 @@ defmodule Exile.Process do @type exit_status :: non_neg_integer + @type os_pid :: pos_integer + @default_opts [env: [], stderr: :console] @default_buffer_size 65_535 @os_signal_timeout 1000 @@ -595,7 +597,7 @@ defmodule Exile.Process do This is meant only for debugging. Avoid interacting with the external process directly """ - @spec os_pid(t) :: pos_integer() + @spec os_pid(t) :: os_pid() def os_pid(process) do GenServer.call(process.pid, :os_pid, :infinity) end @@ -626,6 +628,9 @@ defmodule Exile.Process do {:noreply, exec(state)} end + # certain programs expects SIGTERM multiple times + @exit_seq [:no_signal, :sigterm, :sigterm, :sigkill] + @impl true def handle_cast({:prepare_exit, caller, timeout}, state) do state = @@ -648,9 +653,9 @@ defmodule Exile.Process do if timeout == :infinity do {:noreply, state} else - total_stages = 3 - stage_timeout = div(timeout, total_stages) - handle_info({:prepare_exit, :normal_exit, stage_timeout}, state) + step_timeout = div(timeout, length(@exit_seq)) + exit_seq = Enum.map(@exit_seq, &{&1, step_timeout}) + handle_info({:run_exit_sequence, exit_seq}, state) end end end @@ -735,28 +740,24 @@ defmodule Exile.Process do end @impl true - def handle_info({:prepare_exit, current_stage, timeout}, %{status: status, port: port} = state) do - cond do - status != :running -> + def handle_info({:run_exit_sequence, exit_seq}, %{status: status, port: port} = state) do + case exit_seq do + _ when status != :running -> + # we are done if port is not running {:noreply, state} - current_stage == :normal_exit -> - Elixir.Process.send_after(self(), {:prepare_exit, :sigterm, timeout}, timeout) - {:noreply, state} + [] -> + # This should never happen, since SIGKILL signal can not be + # ignored by the OS process + {:stop, :kill_timeout, state} - current_stage == :sigterm -> - signal(port, :sigterm) - Elixir.Process.send_after(self(), {:prepare_exit, :sigkill, timeout}, timeout) - {:noreply, state} + [{signal, timeout} | exit_seq] -> + if signal != :no_signal do + signal(port, signal) + end - current_stage == :sigkill -> - signal(port, :sigkill) - Elixir.Process.send_after(self(), {:prepare_exit, :stop, timeout}, timeout) + Elixir.Process.send_after(self(), {:run_exit_sequence, exit_seq}, timeout) {:noreply, state} - - # this should never happen, since sigkill signal can not be ignored by the OS process - current_stage == :stop -> - {:stop, :sigkill_timeout, state} end end diff --git a/lib/exile/watcher.ex b/lib/exile/watcher.ex index 7c8524b..47c9635 100644 --- a/lib/exile/watcher.ex +++ b/lib/exile/watcher.ex @@ -24,17 +24,17 @@ defmodule Exile.Watcher do end @impl true - def handle_info({:DOWN, ref, :process, pid, _reason}, %{ - pid: pid, - socket_path: socket_path, - os_pid: os_pid, - ref: ref - }) do + def handle_info({:DOWN, ref, :process, pid, _reason}, %{pid: pid, ref: ref} = state) do + %{socket_path: socket_path, os_pid: os_pid} = state _ = File.rm(socket_path) - # at max we wait for 50ms for program to exit - if process_exit?(os_pid, 50) do - :ok - else + + # Ideally we should skip checking if program is alive if the Exile + # process is terminated with reason `:normal`. But at the moment + # when owner process terminates we unconditionally exit + # potentially with the reason `:normal`, so we can not rely on + # `:normal` reason + + if !process_dead?(os_pid, 50) do attempt_graceful_exit(os_pid) end @@ -46,37 +46,60 @@ defmodule Exile.Watcher do # when watcher is attempted to be killed, we forcefully kill external os process. # This can happen when beam receive SIGTERM def handle_info({:EXIT, _, reason}, %{pid: pid, socket_path: socket_path, os_pid: os_pid}) do - Logger.debug(fn -> "Watcher exiting. reason: #{inspect(reason)}" end) + Logger.debug("Watcher exiting. reason: #{inspect(reason)}") File.rm!(socket_path) Elixir.Process.exit(pid, :watcher_exit) attempt_graceful_exit(os_pid) {:stop, reason, nil} end + # certain programs such as older version of `parallel` expects + # multiple `SIGTERM` signals + @term_seq [sigterm: 150, sigterm: 100, sigkill: 100] + + @spec attempt_graceful_exit(Exile.Process.os_pid()) :: :ok | no_return defp attempt_graceful_exit(os_pid) do - Logger.debug("Failed to stop external program gracefully. attempting SIGTERM") - Nif.nif_kill(os_pid, :sigterm) - process_exit?(os_pid, 100) && throw(:done) - - Logger.debug("Failed to stop external program with SIGTERM. attempting SIGKILL") - Nif.nif_kill(os_pid, :sigkill) - process_exit?(os_pid, 200) && throw(:done) - - Logger.error("failed to kill external process") - raise "Failed to kill external process" - catch - :done -> - Logger.debug(fn -> "External program exited successfully" end) - end + Logger.debug("Starting graceful termination sequence: #{inspect(@term_seq)}") + + dead? = + Enum.reduce_while(@term_seq, false, fn {signal, wait_time}, _dead? -> + Nif.nif_kill(os_pid, signal) + + # check process_status first before going to sleep + if process_dead?(os_pid, wait_time) do + Logger.debug("External program terminated successfully with signal: #{signal}") + {:halt, true} + else + Logger.debug("Failed to stop with signal: #{signal}, wait_time: #{wait_time}") + {:cont, false} + end + end) - defp process_exit?(os_pid), do: !Nif.nif_is_os_pid_alive(os_pid) + if dead? do + :ok + else + Logger.error("Failed to kill external process, os_pid #{os_pid}") + raise "Failed to kill external process" + end + end - defp process_exit?(os_pid, timeout) do - if process_exit?(os_pid) do + @spec process_dead?(Exile.Process.os_pid(), pos_integer) :: boolean + defp process_dead?(os_pid, wait_time) do + # check process_status first before going to sleep + if process_status(os_pid) == :dead do true else - :timer.sleep(timeout) - process_exit?(os_pid) + Elixir.Process.sleep(wait_time) + process_status(os_pid) == :dead + end + end + + @spec process_status(Exile.Process.os_pid()) :: :alive | :dead + defp process_status(os_pid) do + if Nif.nif_is_os_pid_alive(os_pid) do + :alive + else + :dead end end end diff --git a/test/exile/process_test.exs b/test/exile/process_test.exs index fdb8909..3ee5be6 100644 --- a/test/exile/process_test.exs +++ b/test/exile/process_test.exs @@ -208,7 +208,7 @@ defmodule Exile.ProcessTest do assert os_process_alive?(os_pid) # ensure the script set the correct signal handlers (handlers to ignore signal) - assert {:ok, "ignored signals\n"} = Process.read(s) + assert {:ok, "ignored signals\n" <> _} = Process.read(s) # exit without waiting for the exile process {os_pid, s}