mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-24 03:29:17 +00:00
fix(mcp): skip killpg when child shares gateway's process group (#47134)
/reload-mcp -> shutdown_mcp_servers -> _kill_orphaned_mcp_children(include_active=True) -> _send_signal -> killpg(pgid, SIGTERM). When a tracked MCP stdio child shares the gateway's OWN process group, killpg delivers SIGTERM to the gateway itself, firing its SIGTERM handler -> os._exit(0): /reload-mcp crashes the gateway. Pre-compute the gateway's own pgid (os.getpgrp(), None on Windows/restricted) and, in _send_signal, skip killpg when pgid == own pgid, falling through to the per-pid os.kill path so the child is still reaped without self-signaling. Adds a regression test (folded in) that pins the guard: with a tracked pgid equal to the gateway's own pgid, killpg is never called for that pgid and the per-pid kill fallback is used. Mutation-checked. Co-authored-by: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com>
This commit is contained in:
@@ -260,6 +260,56 @@ class TestStdioPgroupReaping:
|
||||
assert fake_pid not in _orphan_stdio_pids
|
||||
assert fake_pid not in _stdio_pgids
|
||||
|
||||
def test_killpg_skipped_when_pgid_matches_gateway_own_pgroup(self, monkeypatch):
|
||||
"""#47134: when a tracked MCP child shares the gateway's OWN process
|
||||
group, killpg(pgid) would signal the gateway itself and crash it.
|
||||
The guard must skip killpg for that pgid and fall through to per-pid
|
||||
os.kill instead."""
|
||||
from tools.mcp_tool import (
|
||||
_kill_orphaned_mcp_children,
|
||||
_orphan_stdio_pids,
|
||||
_stdio_pgids,
|
||||
_lock,
|
||||
)
|
||||
|
||||
if not hasattr(os, "killpg") or not hasattr(os, "getpgrp"):
|
||||
pytest.skip("os.killpg/os.getpgrp not available on this platform")
|
||||
|
||||
self._reset_state()
|
||||
gateway_pgid = 424242
|
||||
fake_pid = 717171 # a child pid that resolves to the gateway's pgid
|
||||
other_pid = 818181 # a normal child in its OWN (non-gateway) group
|
||||
other_pgid = 818181
|
||||
with _lock:
|
||||
_orphan_stdio_pids.add(fake_pid)
|
||||
_stdio_pgids[fake_pid] = gateway_pgid # == gateway's own pgid
|
||||
_orphan_stdio_pids.add(other_pid)
|
||||
_stdio_pgids[other_pid] = other_pgid # distinct group → killpg OK
|
||||
|
||||
fake_sigkill = 9
|
||||
monkeypatch.setattr(signal, "SIGKILL", fake_sigkill, raising=False)
|
||||
|
||||
with patch("tools.mcp_tool.os.getpgrp", return_value=gateway_pgid), \
|
||||
patch("tools.mcp_tool.os.killpg") as mock_killpg, \
|
||||
patch("tools.mcp_tool.os.kill") as mock_kill, \
|
||||
patch("gateway.status._pid_exists", return_value=True), \
|
||||
patch("time.sleep"):
|
||||
_kill_orphaned_mcp_children()
|
||||
|
||||
# killpg must NEVER be called for the gateway's own pgid (would self-kill).
|
||||
killpg_pgids = [call.args[0] for call in mock_killpg.call_args_list]
|
||||
assert gateway_pgid not in killpg_pgids, (
|
||||
"killpg was called with the gateway's own pgid — self-kill (#47134)"
|
||||
)
|
||||
# The shared-pgid child must be reaped via per-pid kill instead.
|
||||
mock_kill.assert_any_call(fake_pid, signal.SIGTERM)
|
||||
mock_kill.assert_any_call(fake_pid, fake_sigkill)
|
||||
# NEGATIVE CONTROL: a child in a DISTINCT group must STILL use killpg —
|
||||
# the guard must skip only the gateway's own group, not all pgids.
|
||||
assert other_pgid in killpg_pgids, (
|
||||
"killpg must still be used for a non-gateway pgid (guard too broad)"
|
||||
)
|
||||
|
||||
def test_killpg_failure_falls_back_to_kill(self, monkeypatch):
|
||||
"""If killpg raises ProcessLookupError (pgroup gone), try os.kill."""
|
||||
from tools.mcp_tool import (
|
||||
|
||||
@@ -4643,21 +4643,42 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
|
||||
if not pids:
|
||||
return
|
||||
|
||||
# Pre-compute the gateway's own pgid so _send_signal can avoid killing it.
|
||||
try:
|
||||
_my_pgid = os.getpgrp()
|
||||
except (AttributeError, OSError):
|
||||
_my_pgid = None # Windows or restricted environment
|
||||
|
||||
def _send_signal(pid: int, sig: int, server_name: str) -> None:
|
||||
"""SIGTERM/SIGKILL via pgroup on POSIX, fall back to pid signal."""
|
||||
pgid = pgids.get(pid)
|
||||
killpg = getattr(os, "killpg", None)
|
||||
if pgid is not None and killpg is not None:
|
||||
try:
|
||||
killpg(pgid, sig)
|
||||
return
|
||||
except (ProcessLookupError, PermissionError, OSError) as exc:
|
||||
# Pgroup gone (all members exited) or refused — fall back to
|
||||
# the per-pid path so we still try the direct child if alive.
|
||||
logger.debug(
|
||||
"killpg(%d, %d) failed for MCP server '%s': %s; falling back to kill(pid)",
|
||||
pgid, sig, server_name, exc,
|
||||
if _my_pgid is not None and pgid == _my_pgid:
|
||||
# The MCP child shares the gateway's own process group.
|
||||
# Using killpg would deliver the signal to the gateway as
|
||||
# well, crashing it (see #47134). Fall through to the
|
||||
# per-pid kill() path instead. Warn because per-pid kill
|
||||
# cannot reach grandchildren in this shared group — if the
|
||||
# direct child has already exited, they may leak (inherent:
|
||||
# group-killing them would also kill the gateway).
|
||||
logger.warning(
|
||||
"MCP server '%s' pgid %d matches gateway pgid; skipping "
|
||||
"killpg to avoid self-kill and using per-pid kill — any "
|
||||
"grandchildren in this group may not be reaped",
|
||||
server_name, pgid,
|
||||
)
|
||||
else:
|
||||
try:
|
||||
killpg(pgid, sig)
|
||||
return
|
||||
except (ProcessLookupError, PermissionError, OSError) as exc:
|
||||
# Pgroup gone (all members exited) or refused — fall back to
|
||||
# the per-pid path so we still try the direct child if alive.
|
||||
logger.debug(
|
||||
"killpg(%d, %d) failed for MCP server '%s': %s; falling back to kill(pid)",
|
||||
pgid, sig, server_name, exc,
|
||||
)
|
||||
try:
|
||||
os.kill(pid, sig)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
|
||||
Reference in New Issue
Block a user