fix(update): honor RestartSec when polling for gateway respawn (#15707)
The post-graceful-drain is-active poll used a fixed 10s timeout, but systemd's hermes-gateway.service has RestartSec=30 — so systemd won't respawn the unit for 30s after exit-75, and our poll gives up during the cooldown. Result: every 'hermes update' printed

  ⚠ hermes-gateway drained but didn't relaunch — forcing restart

followed by a redundant 'systemctl restart' that kicked the newly-respawning gateway again (and re-started WhatsApp / Discord a second time in the process).

Fix: read RestartUSec from the unit via 'systemctl show' and set the poll budget to max(10s, RestartSec + 10s slack). Units without RestartSec set (or value=infinity) fall back to the original 10s.

Observed timeline from journalctl before fix:

  08:56:22.262 old PID exits 75
  08:56:32.707 systemd logs Stopped -> Started (10.4s gap, > 10s budget)

After fix the poll covers 40s — comfortably inside RestartSec + slack.

Validation:
- RestartUSec parser tested against '30s', '100ms', '1min 30s', 'infinity', '', 'garbage', '500us', '2min' — all correct.
- Against the live hermes-gateway.service: parses to 30.0s.
- tests/hermes_cli/test_update_gateway_restart.py: 41/41 pass.
This commit is contained in:
@@ -6070,6 +6070,50 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
return False
|
||||
_time.sleep(0.5)
|
||||
|
||||
def _service_restart_sec(
    scope_cmd_: list, svc_name_: str, default: float = 0.0,
) -> float:
    """Read the unit's ``RestartUSec`` (RestartSec) in seconds.

    After a graceful exit-75, systemd waits ``RestartSec`` before
    respawning the unit.  Callers that poll for ``is-active``
    must use a timeout >= ``RestartSec`` + transition slack, or
    they'll give up *during* the cooldown window and wrongly
    conclude the unit didn't relaunch.

    Args:
        scope_cmd_: argv prefix of the systemctl invocation; the
            arguments ``show <unit> --property=RestartUSec --value``
            are appended to it.
        svc_name_: name of the unit to query.
        default: value returned whenever the property cannot be
            read or cannot be parsed *completely*.

    Returns:
        The restart delay in seconds, or ``default`` when the command
        cannot be launched, times out, prints nothing, prints
        ``infinity``, or prints anything that is not fully understood.
    """
    try:
        _show = subprocess.run(
            scope_cmd_ + [
                "show", svc_name_,
                "--property=RestartUSec", "--value",
            ],
            capture_output=True, text=True, timeout=5,
        )
    except (OSError, subprocess.TimeoutExpired):
        # Best effort: a missing / non-executable systemctl or a hung
        # call must never break the update flow.
        return default

    raw = (_show.stdout or "").strip()
    if not raw or raw == "infinity":
        return default
    if raw == "0":
        # systemd prints a zero timespan as a bare "0" (no unit).
        return 0.0

    # Units systemd uses when displaying a timespan (systemd.time(7)).
    # Order matters because matching is suffix-based: "month" must be
    # tried before "h", and "ms"/"us" before "s".
    _units = (
        ("month", 2629800.0),   # 30.44 days
        ("min", 60.0),
        ("ms", 0.001),
        ("us", 0.000001),
        ("s", 1.0),
        ("h", 3600.0),
        ("d", 86400.0),
        ("w", 604800.0),
        ("y", 31557600.0),      # 365.25 days
    )

    total = 0.0
    for _part in raw.split():
        for _suf, _mult in _units:
            if _part.endswith(_suf):
                break
        else:
            # Unknown unit: a partial sum would under-estimate the
            # delay, so give up rather than guess.
            return default
        try:
            _value = float(_part[: -len(_suf)])
        except ValueError:
            return default
        # Reject negative, inf and nan (nan fails both comparisons).
        if not (0.0 <= _value < float("inf")):
            return default
        total += _value * _mult
    return total
|
||||
|
||||
# Drain budget for graceful SIGUSR1 restarts. The gateway drains
|
||||
# for up to ``agent.restart_drain_timeout`` (default 60s) before
|
||||
# exiting with code 75; we wait slightly longer so the drain
|
||||
@@ -6176,13 +6220,22 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
|
||||
if _graceful_ok:
|
||||
# Gateway exited 75; systemd should relaunch
|
||||
# via Restart=on-failure. Poll is-active for
|
||||
# up to ~10s because the unit's Stopped ->
|
||||
# Started transition can take a few seconds
|
||||
# after the old PID exits, and a one-shot
|
||||
# check races that window.
|
||||
# via Restart=on-failure. The unit's
|
||||
# RestartSec (default 30s on ours) gates the
|
||||
# respawn — poll past that + slack so we
|
||||
# don't give up mid-cooldown and falsely
|
||||
# print "drained but didn't relaunch". For
|
||||
# units without RestartSec set we fall back
|
||||
# to the original 10s budget.
|
||||
_restart_sec = _service_restart_sec(
|
||||
scope_cmd, svc_name, default=0.0,
|
||||
)
|
||||
_post_drain_timeout = max(
|
||||
10.0, _restart_sec + 10.0,
|
||||
)
|
||||
if _wait_for_service_active(
|
||||
scope_cmd, svc_name, timeout=10.0,
|
||||
scope_cmd, svc_name,
|
||||
timeout=_post_drain_timeout,
|
||||
):
|
||||
restarted_services.append(svc_name)
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user