fix(mcp): skip killpg when child shares gateway's process group (#47134)

/reload-mcp -> shutdown_mcp_servers -> _kill_orphaned_mcp_children(include_active=True)
-> _send_signal -> killpg(pgid, SIGTERM). When a tracked MCP stdio child shares
the gateway's OWN process group, killpg delivers SIGTERM to the gateway itself,
firing its SIGTERM handler -> os._exit(0): /reload-mcp crashes the gateway.

Pre-compute the gateway's own pgid (os.getpgrp(), None on Windows/restricted)
and, in _send_signal, skip killpg when pgid == own pgid, falling through to the
per-pid os.kill path so the child is still reaped without self-signaling.

Adds a regression test (folded in) that pins the guard: with a tracked pgid
equal to the gateway's own pgid, killpg is never called for that pgid and the
per-pid kill fallback is used. Mutation-checked.

Co-authored-by: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com>
This commit is contained in:
kyssta-exe
2026-06-16 12:21:27 +00:00
parent 64131bf975
commit 23c47371d2
2 changed files with 80 additions and 9 deletions

View File

@@ -260,6 +260,56 @@ class TestStdioPgroupReaping:
assert fake_pid not in _orphan_stdio_pids
assert fake_pid not in _stdio_pgids
def test_killpg_skipped_when_pgid_matches_gateway_own_pgroup(self, monkeypatch):
"""#47134: when a tracked MCP child shares the gateway's OWN process
group, killpg(pgid) would signal the gateway itself and crash it.
The guard must skip killpg for that pgid and fall through to per-pid
os.kill instead."""
from tools.mcp_tool import (
_kill_orphaned_mcp_children,
_orphan_stdio_pids,
_stdio_pgids,
_lock,
)
if not hasattr(os, "killpg") or not hasattr(os, "getpgrp"):
pytest.skip("os.killpg/os.getpgrp not available on this platform")
self._reset_state()
gateway_pgid = 424242
fake_pid = 717171 # a child pid that resolves to the gateway's pgid
other_pid = 818181 # a normal child in its OWN (non-gateway) group
other_pgid = 818181
with _lock:
_orphan_stdio_pids.add(fake_pid)
_stdio_pgids[fake_pid] = gateway_pgid # == gateway's own pgid
_orphan_stdio_pids.add(other_pid)
_stdio_pgids[other_pid] = other_pgid # distinct group → killpg OK
fake_sigkill = 9
monkeypatch.setattr(signal, "SIGKILL", fake_sigkill, raising=False)
with patch("tools.mcp_tool.os.getpgrp", return_value=gateway_pgid), \
patch("tools.mcp_tool.os.killpg") as mock_killpg, \
patch("tools.mcp_tool.os.kill") as mock_kill, \
patch("gateway.status._pid_exists", return_value=True), \
patch("time.sleep"):
_kill_orphaned_mcp_children()
# killpg must NEVER be called for the gateway's own pgid (would self-kill).
killpg_pgids = [call.args[0] for call in mock_killpg.call_args_list]
assert gateway_pgid not in killpg_pgids, (
"killpg was called with the gateway's own pgid — self-kill (#47134)"
)
# The shared-pgid child must be reaped via per-pid kill instead.
mock_kill.assert_any_call(fake_pid, signal.SIGTERM)
mock_kill.assert_any_call(fake_pid, fake_sigkill)
# NEGATIVE CONTROL: a child in a DISTINCT group must STILL use killpg —
# the guard must skip only the gateway's own group, not all pgids.
assert other_pgid in killpg_pgids, (
"killpg must still be used for a non-gateway pgid (guard too broad)"
)
def test_killpg_failure_falls_back_to_kill(self, monkeypatch):
"""If killpg raises ProcessLookupError (pgroup gone), try os.kill."""
from tools.mcp_tool import (

View File

@@ -4643,21 +4643,42 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
if not pids:
return
# Pre-compute the gateway's own pgid so _send_signal can avoid killing it.
try:
_my_pgid = os.getpgrp()
except (AttributeError, OSError):
_my_pgid = None # Windows or restricted environment
def _send_signal(pid: int, sig: int, server_name: str) -> None:
"""SIGTERM/SIGKILL via pgroup on POSIX, fall back to pid signal."""
pgid = pgids.get(pid)
killpg = getattr(os, "killpg", None)
if pgid is not None and killpg is not None:
try:
killpg(pgid, sig)
return
except (ProcessLookupError, PermissionError, OSError) as exc:
# Pgroup gone (all members exited) or refused — fall back to
# the per-pid path so we still try the direct child if alive.
logger.debug(
"killpg(%d, %d) failed for MCP server '%s': %s; falling back to kill(pid)",
pgid, sig, server_name, exc,
if _my_pgid is not None and pgid == _my_pgid:
# The MCP child shares the gateway's own process group.
# Using killpg would deliver the signal to the gateway as
# well, crashing it (see #47134). Fall through to the
# per-pid kill() path instead. Warn because per-pid kill
# cannot reach grandchildren in this shared group — if the
# direct child has already exited, they may leak (inherent:
# group-killing them would also kill the gateway).
logger.warning(
"MCP server '%s' pgid %d matches gateway pgid; skipping "
"killpg to avoid self-kill and using per-pid kill — any "
"grandchildren in this group may not be reaped",
server_name, pgid,
)
else:
try:
killpg(pgid, sig)
return
except (ProcessLookupError, PermissionError, OSError) as exc:
# Pgroup gone (all members exited) or refused — fall back to
# the per-pid path so we still try the direct child if alive.
logger.debug(
"killpg(%d, %d) failed for MCP server '%s': %s; falling back to kill(pid)",
pgid, sig, server_name, exc,
)
try:
os.kill(pid, sig)
except (ProcessLookupError, PermissionError, OSError):