Skip to content

Commit

Permalink
Merge pull request #48 from akash-akya/dev
Browse files Browse the repository at this point in the history
Send SIGTERM twice before sending SIGKILL
  • Loading branch information
akash-akya authored Dec 17, 2024
2 parents e3cb8a2 + f938e93 commit be9fd4a
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
- run: mix test --exclude skip:true --trace

macos:
runs-on: macos-11
runs-on: macos-14
name: Test - Elixir (MacOS)
steps:
- uses: actions/checkout@v4
Expand Down
43 changes: 22 additions & 21 deletions lib/exile/process.ex
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,8 @@ defmodule Exile.Process do

@type exit_status :: non_neg_integer

@type os_pid :: pos_integer

@default_opts [env: [], stderr: :console]
@default_buffer_size 65_535
@os_signal_timeout 1000
Expand Down Expand Up @@ -595,7 +597,7 @@ defmodule Exile.Process do
This is meant only for debugging. Avoid interacting with the
external process directly
"""
@spec os_pid(t) :: pos_integer()
@spec os_pid(t) :: os_pid()
def os_pid(process) do
GenServer.call(process.pid, :os_pid, :infinity)
end
Expand Down Expand Up @@ -626,6 +628,9 @@ defmodule Exile.Process do
{:noreply, exec(state)}
end

# certain programs expects SIGTERM multiple times
@exit_seq [:no_signal, :sigterm, :sigterm, :sigkill]

@impl true
def handle_cast({:prepare_exit, caller, timeout}, state) do
state =
Expand All @@ -648,9 +653,9 @@ defmodule Exile.Process do
if timeout == :infinity do
{:noreply, state}
else
total_stages = 3
stage_timeout = div(timeout, total_stages)
handle_info({:prepare_exit, :normal_exit, stage_timeout}, state)
step_timeout = div(timeout, length(@exit_seq))
exit_seq = Enum.map(@exit_seq, &{&1, step_timeout})
handle_info({:run_exit_sequence, exit_seq}, state)
end
end
end
Expand Down Expand Up @@ -735,28 +740,24 @@ defmodule Exile.Process do
end

@impl true
def handle_info({:prepare_exit, current_stage, timeout}, %{status: status, port: port} = state) do
cond do
status != :running ->
def handle_info({:run_exit_sequence, exit_seq}, %{status: status, port: port} = state) do
case exit_seq do
_ when status != :running ->
# we are done if port is not running
{:noreply, state}

current_stage == :normal_exit ->
Elixir.Process.send_after(self(), {:prepare_exit, :sigterm, timeout}, timeout)
{:noreply, state}
[] ->
# This should never happen, since SIGKILL signal can not be
# ignored by the OS process
{:stop, :kill_timeout, state}

current_stage == :sigterm ->
signal(port, :sigterm)
Elixir.Process.send_after(self(), {:prepare_exit, :sigkill, timeout}, timeout)
{:noreply, state}
[{signal, timeout} | exit_seq] ->
if signal != :no_signal do
signal(port, signal)
end

current_stage == :sigkill ->
signal(port, :sigkill)
Elixir.Process.send_after(self(), {:prepare_exit, :stop, timeout}, timeout)
Elixir.Process.send_after(self(), {:run_exit_sequence, exit_seq}, timeout)
{:noreply, state}

# this should never happen, since sigkill signal can not be ignored by the OS process
current_stage == :stop ->
{:stop, :sigkill_timeout, state}
end
end

Expand Down
83 changes: 53 additions & 30 deletions lib/exile/watcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,17 @@ defmodule Exile.Watcher do
end

@impl true
def handle_info({:DOWN, ref, :process, pid, _reason}, %{
pid: pid,
socket_path: socket_path,
os_pid: os_pid,
ref: ref
}) do
def handle_info({:DOWN, ref, :process, pid, _reason}, %{pid: pid, ref: ref} = state) do
%{socket_path: socket_path, os_pid: os_pid} = state
_ = File.rm(socket_path)
# at max we wait for 50ms for program to exit
if process_exit?(os_pid, 50) do
:ok
else

# Ideally we should skip checking if program is alive if the Exile
# process is terminated with reason `:normal`. But at the moment
# when owner process terminates we unconditionally exit
# potentially with the reason `:normal`, so we can not rely on
# `:normal` reason

if !process_dead?(os_pid, 50) do
attempt_graceful_exit(os_pid)
end

Expand All @@ -46,37 +46,60 @@ defmodule Exile.Watcher do
# when watcher is attempted to be killed, we forcefully kill external os process.
# This can happen when beam receive SIGTERM
def handle_info({:EXIT, _, reason}, %{pid: pid, socket_path: socket_path, os_pid: os_pid}) do
Logger.debug(fn -> "Watcher exiting. reason: #{inspect(reason)}" end)
Logger.debug("Watcher exiting. reason: #{inspect(reason)}")
File.rm!(socket_path)
Elixir.Process.exit(pid, :watcher_exit)
attempt_graceful_exit(os_pid)
{:stop, reason, nil}
end

# certain programs such as older version of `parallel` expects
# multiple `SIGTERM` signals
@term_seq [sigterm: 150, sigterm: 100, sigkill: 100]

@spec attempt_graceful_exit(Exile.Process.os_pid()) :: :ok | no_return
defp attempt_graceful_exit(os_pid) do
Logger.debug("Failed to stop external program gracefully. attempting SIGTERM")
Nif.nif_kill(os_pid, :sigterm)
process_exit?(os_pid, 100) && throw(:done)

Logger.debug("Failed to stop external program with SIGTERM. attempting SIGKILL")
Nif.nif_kill(os_pid, :sigkill)
process_exit?(os_pid, 200) && throw(:done)

Logger.error("failed to kill external process")
raise "Failed to kill external process"
catch
:done ->
Logger.debug(fn -> "External program exited successfully" end)
end
Logger.debug("Starting graceful termination sequence: #{inspect(@term_seq)}")

dead? =
Enum.reduce_while(@term_seq, false, fn {signal, wait_time}, _dead? ->
Nif.nif_kill(os_pid, signal)

# check process_status first before going to sleep
if process_dead?(os_pid, wait_time) do
Logger.debug("External program terminated successfully with signal: #{signal}")
{:halt, true}
else
Logger.debug("Failed to stop with signal: #{signal}, wait_time: #{wait_time}")
{:cont, false}
end
end)

defp process_exit?(os_pid), do: !Nif.nif_is_os_pid_alive(os_pid)
if dead? do
:ok
else
Logger.error("Failed to kill external process, os_pid #{os_pid}")
raise "Failed to kill external process"
end
end

defp process_exit?(os_pid, timeout) do
if process_exit?(os_pid) do
@spec process_dead?(Exile.Process.os_pid(), pos_integer) :: boolean
defp process_dead?(os_pid, wait_time) do
# check process_status first before going to sleep
if process_status(os_pid) == :dead do
true
else
:timer.sleep(timeout)
process_exit?(os_pid)
Elixir.Process.sleep(wait_time)
process_status(os_pid) == :dead
end
end

@spec process_status(Exile.Process.os_pid()) :: :alive | :dead
defp process_status(os_pid) do
if Nif.nif_is_os_pid_alive(os_pid) do
:alive
else
:dead
end
end
end
2 changes: 1 addition & 1 deletion test/exile/process_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ defmodule Exile.ProcessTest do
assert os_process_alive?(os_pid)

# ensure the script set the correct signal handlers (handlers to ignore signal)
assert {:ok, "ignored signals\n"} = Process.read(s)
assert {:ok, "ignored signals\n" <> _} = Process.read(s)

# exit without waiting for the exile process
{os_pid, s}
Expand Down

0 comments on commit be9fd4a

Please sign in to comment.