From 5006b2204b329ab017c05c0822c50c13453237f2 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 25 Apr 2026 09:08:27 -0700 Subject: [PATCH] fix(update): honor RestartSec when polling for gateway respawn (#15707) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The post-graceful-drain is-active poll used a fixed 10s timeout, but systemd's hermes-gateway.service has RestartSec=30 — so systemd won't respawn the unit for 30s after exit-75, and our poll gives up during the cooldown. Result: every 'hermes update' printed ⚠ hermes-gateway drained but didn't relaunch — forcing restart followed by a redundant 'systemctl restart' that kicked the newly- respawning gateway again (and re-started WhatsApp / Discord a second time in the process). Fix: read RestartUSec from the unit via 'systemctl show' and set the poll budget to max(10s, RestartSec + 10s slack). Units without RestartSec set (or value=infinity) fall back to the original 10s. Observed timeline from journalctl before fix: 08:56:22.262 old PID exits 75 08:56:32.707 systemd logs Stopped -> Started (10.4s gap, > 10s budget) After fix the poll covers 40s — comfortably inside RestartSec + slack. Validation: - RestartUSec parser tested against '30s', '100ms', '1min 30s', 'infinity', '', 'garbage', '500us', '2min' — all correct. - Against the live hermes-gateway.service: parses to 30.0s. - tests/hermes_cli/test_update_gateway_restart.py: 41/41 pass. --- hermes_cli/main.py | 65 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 4a2f32d46..be7aac374 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -6070,6 +6070,50 @@ def _cmd_update_impl(args, gateway_mode: bool): return False _time.sleep(0.5) + def _service_restart_sec( + scope_cmd_: list, svc_name_: str, default: float = 0.0, + ) -> float: + """Read the unit's ``RestartUSec`` (RestartSec) in seconds. + + After a graceful exit-75, systemd waits ``RestartSec`` before + respawning the unit. Callers that poll for ``is-active`` + must use a timeout >= ``RestartSec`` + transition slack, or + they'll give up *during* the cooldown window and wrongly + conclude the unit didn't relaunch. + """ + try: + _show = subprocess.run( + scope_cmd_ + [ + "show", svc_name_, + "--property=RestartUSec", "--value", + ], + capture_output=True, text=True, timeout=5, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return default + raw = (_show.stdout or "").strip() + # systemd emits values like "30s", "100ms", "1min 30s", or + # "infinity". Parse conservatively; on any miss return default. + if not raw or raw == "infinity": + return default + total = 0.0 + matched = False + for part in raw.split(): + for _suf, _mult in ( + ("ms", 0.001), + ("us", 0.000001), + ("min", 60.0), + ("s", 1.0), + ): + if part.endswith(_suf): + try: + total += float(part[: -len(_suf)]) * _mult + matched = True + except ValueError: + pass + break + return total if matched else default + # Drain budget for graceful SIGUSR1 restarts. The gateway drains # for up to ``agent.restart_drain_timeout`` (default 60s) before # exiting with code 75; we wait slightly longer so the drain @@ -6176,13 +6220,22 @@ def _cmd_update_impl(args, gateway_mode: bool): if _graceful_ok: # Gateway exited 75; systemd should relaunch - # via Restart=on-failure. Poll is-active for - # up to ~10s because the unit's Stopped -> - # Started transition can take a few seconds - # after the old PID exits, and a one-shot - # check races that window. + # via Restart=on-failure. The unit's + # RestartSec (default 30s on ours) gates the + # respawn — poll past that + slack so we + # don't give up mid-cooldown and falsely + # print "drained but didn't relaunch". For + # units without RestartSec set we fall back + # to the original 10s budget. + _restart_sec = _service_restart_sec( + scope_cmd, svc_name, default=0.0, + ) + _post_drain_timeout = max( + 10.0, _restart_sec + 10.0, + ) if _wait_for_service_active( - scope_cmd, svc_name, timeout=10.0, + scope_cmd, svc_name, + timeout=_post_drain_timeout, ): restarted_services.append(svc_name) continue