mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-24 11:38:29 +00:00
Profiles without their own messaging token inherit the default profile's token via os.getenv, hit a token collision, and exit with startup_failed. s6 restarts them immediately, creating ~30MB tirith sandbox dirs in /tmp on each cycle and filling the disk within hours (#51228).

Changes:
- gateway/restart.py: add GATEWAY_FATAL_CONFIG_EXIT_CODE = 78
- gateway/run.py: set exit_code=78 on non-retryable startup errors (token collision, no platforms)
- hermes_cli/service_manager.py: add _render_finish_script(), which translates exit 78 → exit 125 (s6 permanent failure)
- hermes_cli/container_boot.py: write the finish script alongside the run script during profile registration

The s6 finish script pattern follows docker/s6-rc.d/dashboard/finish.

Closes #51228
27 lines
1.0 KiB
Python
27 lines
1.0 KiB
Python
"""Shared gateway restart constants and parsing helpers."""
|
|
|
|
from hermes_cli.config import DEFAULT_CONFIG
|
|
|
|
# Exit status 75 is EX_TEMPFAIL in sysexits.h. It is used to ask the service
# manager to restart the gateway once a graceful drain/reload path completes.
GATEWAY_SERVICE_RESTART_EXIT_CODE = 75

# Exit status 78 is EX_CONFIG in sysexits.h: a fatal configuration error such
# as a token collision or no messaging platforms. The s6 finish script maps
# it to exit 125 (permanent failure) so the supervisor stops restarting the
# gateway. See #51228.
GATEWAY_FATAL_CONFIG_EXIT_CODE = 78

# Fallback drain timeout, read from the shared default configuration.
DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT = float(DEFAULT_CONFIG["agent"]["restart_drain_timeout"])
|
|
|
|
|
|
def parse_restart_drain_timeout(raw: object) -> float:
    """Parse a configured drain timeout, falling back to the shared default.

    Args:
        raw: The configured value. May be a number, a numeric string, or
            ``None``/blank text when nothing was configured.

    Returns:
        The timeout as a float, clamped so it is never negative. ``None``,
        ``False``, blank strings, and values that cannot be converted to a
        float all yield ``DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT``.
    """
    try:
        # Only None/False and blank text mean "not configured". A numeric
        # zero is falsy as well, but it is an explicit setting and must be
        # honoured exactly like the string "0" is.
        if raw is None or raw is False or not str(raw).strip():
            return DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT
        value = float(raw)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers ints too large to represent as a float.
        return DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT
    # Negative timeouts are meaningless; clamp them to zero.
    return max(0.0, value)
|