Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Send SIGTERM twice before sending SIGKILL #48

Merged
merged 2 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
- run: mix test --exclude skip:true --trace

macos:
runs-on: macos-11
runs-on: macos-14
name: Test - Elixir (MacOS)
steps:
- uses: actions/checkout@v4
Expand Down
43 changes: 22 additions & 21 deletions lib/exile/process.ex
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,8 @@ defmodule Exile.Process do

@type exit_status :: non_neg_integer

@type os_pid :: pos_integer

@default_opts [env: [], stderr: :console]
@default_buffer_size 65_535
@os_signal_timeout 1000
Expand Down Expand Up @@ -595,7 +597,7 @@ defmodule Exile.Process do
This is meant only for debugging. Avoid interacting with the
external process directly
"""
@spec os_pid(t) :: pos_integer()
@spec os_pid(t) :: os_pid()
def os_pid(process) do
GenServer.call(process.pid, :os_pid, :infinity)
end
Expand Down Expand Up @@ -626,6 +628,9 @@ defmodule Exile.Process do
{:noreply, exec(state)}
end

# certain programs expects SIGTERM multiple times
@exit_seq [:no_signal, :sigterm, :sigterm, :sigkill]

@impl true
def handle_cast({:prepare_exit, caller, timeout}, state) do
state =
Expand All @@ -648,9 +653,9 @@ defmodule Exile.Process do
if timeout == :infinity do
{:noreply, state}
else
total_stages = 3
stage_timeout = div(timeout, total_stages)
handle_info({:prepare_exit, :normal_exit, stage_timeout}, state)
step_timeout = div(timeout, length(@exit_seq))
exit_seq = Enum.map(@exit_seq, &{&1, step_timeout})
handle_info({:run_exit_sequence, exit_seq}, state)
end
end
end
Expand Down Expand Up @@ -735,28 +740,24 @@ defmodule Exile.Process do
end

@impl true
def handle_info({:prepare_exit, current_stage, timeout}, %{status: status, port: port} = state) do
cond do
status != :running ->
def handle_info({:run_exit_sequence, exit_seq}, %{status: status, port: port} = state) do
case exit_seq do
_ when status != :running ->
# we are done if port is not running
{:noreply, state}

current_stage == :normal_exit ->
Elixir.Process.send_after(self(), {:prepare_exit, :sigterm, timeout}, timeout)
{:noreply, state}
[] ->
# This should never happen, since SIGKILL signal can not be
# ignored by the OS process
{:stop, :kill_timeout, state}

current_stage == :sigterm ->
signal(port, :sigterm)
Elixir.Process.send_after(self(), {:prepare_exit, :sigkill, timeout}, timeout)
{:noreply, state}
[{signal, timeout} | exit_seq] ->
if signal != :no_signal do
signal(port, signal)
end

current_stage == :sigkill ->
signal(port, :sigkill)
Elixir.Process.send_after(self(), {:prepare_exit, :stop, timeout}, timeout)
Elixir.Process.send_after(self(), {:run_exit_sequence, exit_seq}, timeout)
{:noreply, state}

# this should never happen, since sigkill signal can not be ignored by the OS process
current_stage == :stop ->
{:stop, :sigkill_timeout, state}
end
end

Expand Down
83 changes: 53 additions & 30 deletions lib/exile/watcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,17 @@ defmodule Exile.Watcher do
end

@impl true
def handle_info({:DOWN, ref, :process, pid, _reason}, %{
pid: pid,
socket_path: socket_path,
os_pid: os_pid,
ref: ref
}) do
def handle_info({:DOWN, ref, :process, pid, _reason}, %{pid: pid, ref: ref} = state) do
%{socket_path: socket_path, os_pid: os_pid} = state
_ = File.rm(socket_path)
# at max we wait for 50ms for program to exit
if process_exit?(os_pid, 50) do
:ok
else

# Ideally we should skip checking if program is alive if the Exile
# process is terminated with reason `:normal`. But at the moment
# when owner process terminates we unconditionally exit
# potentially with the reason `:normal`, so we can not rely on
# `:normal` reason

if !process_dead?(os_pid, 50) do
attempt_graceful_exit(os_pid)
end

Expand All @@ -46,37 +46,60 @@ defmodule Exile.Watcher do
# when watcher is attempted to be killed, we forcefully kill external os process.
# This can happen when beam receive SIGTERM
def handle_info({:EXIT, _, reason}, %{pid: pid, socket_path: socket_path, os_pid: os_pid}) do
Logger.debug(fn -> "Watcher exiting. reason: #{inspect(reason)}" end)
Logger.debug("Watcher exiting. reason: #{inspect(reason)}")
File.rm!(socket_path)
Elixir.Process.exit(pid, :watcher_exit)
attempt_graceful_exit(os_pid)
{:stop, reason, nil}
end

# certain programs such as older version of `parallel` expects
# multiple `SIGTERM` signals
@term_seq [sigterm: 150, sigterm: 100, sigkill: 100]

@spec attempt_graceful_exit(Exile.Process.os_pid()) :: :ok | no_return
defp attempt_graceful_exit(os_pid) do
Logger.debug("Failed to stop external program gracefully. attempting SIGTERM")
Nif.nif_kill(os_pid, :sigterm)
process_exit?(os_pid, 100) && throw(:done)

Logger.debug("Failed to stop external program with SIGTERM. attempting SIGKILL")
Nif.nif_kill(os_pid, :sigkill)
process_exit?(os_pid, 200) && throw(:done)

Logger.error("failed to kill external process")
raise "Failed to kill external process"
catch
:done ->
Logger.debug(fn -> "External program exited successfully" end)
end
Logger.debug("Starting graceful termination sequence: #{inspect(@term_seq)}")

dead? =
Enum.reduce_while(@term_seq, false, fn {signal, wait_time}, _dead? ->
Nif.nif_kill(os_pid, signal)

# check process_status first before going to sleep
if process_dead?(os_pid, wait_time) do
Logger.debug("External program terminated successfully with signal: #{signal}")
{:halt, true}
else
Logger.debug("Failed to stop with signal: #{signal}, wait_time: #{wait_time}")
{:cont, false}
end
end)

defp process_exit?(os_pid), do: !Nif.nif_is_os_pid_alive(os_pid)
if dead? do
:ok
else
Logger.error("Failed to kill external process, os_pid #{os_pid}")
raise "Failed to kill external process"
end
end

defp process_exit?(os_pid, timeout) do
if process_exit?(os_pid) do
@spec process_dead?(Exile.Process.os_pid(), pos_integer) :: boolean
defp process_dead?(os_pid, wait_time) do
# check process_status first before going to sleep
if process_status(os_pid) == :dead do
true
else
:timer.sleep(timeout)
process_exit?(os_pid)
Elixir.Process.sleep(wait_time)
process_status(os_pid) == :dead
end
end

@spec process_status(Exile.Process.os_pid()) :: :alive | :dead
defp process_status(os_pid) do
if Nif.nif_is_os_pid_alive(os_pid) do
:alive
else
:dead
end
end
end
2 changes: 1 addition & 1 deletion test/exile/process_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ defmodule Exile.ProcessTest do
assert os_process_alive?(os_pid)

# ensure the script set the correct signal handlers (handlers to ignore signal)
assert {:ok, "ignored signals\n"} = Process.read(s)
assert {:ok, "ignored signals\n" <> _} = Process.read(s)

# exit without waiting for the exile process
{os_pid, s}
Expand Down