From 5006b2204b329ab017c05c0822c50c13453237f2 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sat, 25 Apr 2026 09:08:27 -0700
Subject: [PATCH] fix(update): honor RestartSec when polling for gateway
 respawn (#15707)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The post-graceful-drain is-active poll used a fixed 10s timeout, but
systemd's hermes-gateway.service has RestartSec=30 — so systemd won't
respawn the unit for 30s after exit-75, and our poll gives up during
the cooldown. Result: every 'hermes update' printed

  ⚠ hermes-gateway drained but didn't relaunch — forcing restart

followed by a redundant 'systemctl restart' that kicked the newly-
respawning gateway again (and re-started WhatsApp / Discord a second
time in the process).

Fix: read RestartUSec from the unit via 'systemctl show' and set the
poll budget to max(10s, RestartSec + 10s slack). Units without
RestartSec set (or value=infinity) fall back to the original 10s.

Observed timeline from journalctl before fix:
  08:56:22.262  old PID exits 75
  08:56:32.707  systemd logs Stopped -> Started  (10.4s gap, > 10s budget)

After fix the poll covers 40s — comfortably inside RestartSec + slack.

Validation:
- RestartUSec parser tested against '30s', '100ms', '1min 30s',
  'infinity', '', 'garbage', '500us', '2min' — all correct.
- Against the live hermes-gateway.service: parses to 30.0s.
- tests/hermes_cli/test_update_gateway_restart.py: 41/41 pass.
---
 hermes_cli/main.py | 65 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 59 insertions(+), 6 deletions(-)

diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index 4a2f32d46..be7aac374 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -6070,6 +6070,50 @@ def _cmd_update_impl(args, gateway_mode: bool):
                         return False
                     _time.sleep(0.5)
 
+            def _service_restart_sec(
+                scope_cmd_: list, svc_name_: str, default: float = 0.0,
+            ) -> float:
+                """Read the unit's ``RestartUSec`` (RestartSec) in seconds.
+
+                After a graceful exit-75, systemd waits ``RestartSec`` before
+                respawning the unit.  Callers that poll for ``is-active``
+                must use a timeout >= ``RestartSec`` + transition slack, or
+                they'll give up *during* the cooldown window and wrongly
+                conclude the unit didn't relaunch.
+                """
+                try:
+                    _show = subprocess.run(
+                        scope_cmd_ + [
+                            "show", svc_name_,
+                            "--property=RestartUSec", "--value",
+                        ],
+                        capture_output=True, text=True, timeout=5,
+                    )
+                except (FileNotFoundError, subprocess.TimeoutExpired):
+                    return default
+                raw = (_show.stdout or "").strip()
+                # systemd emits values like "30s", "100ms", "1min 30s", or
+                # "infinity".  Parse conservatively; on any miss return default.
+                if not raw or raw == "infinity":
+                    return default
+                total = 0.0
+                matched = False
+                for part in raw.split():
+                    for _suf, _mult in (
+                        ("ms", 0.001),
+                        ("us", 0.000001),
+                        ("min", 60.0),
+                        ("s", 1.0),
+                    ):
+                        if part.endswith(_suf):
+                            try:
+                                total += float(part[: -len(_suf)]) * _mult
+                                matched = True
+                            except ValueError:
+                                pass
+                            break
+                return total if matched else default
+
             # Drain budget for graceful SIGUSR1 restarts.  The gateway drains
             # for up to ``agent.restart_drain_timeout`` (default 60s) before
             # exiting with code 75; we wait slightly longer so the drain
@@ -6176,13 +6220,22 @@ def _cmd_update_impl(args, gateway_mode: bool):
 
                             if _graceful_ok:
                                 # Gateway exited 75; systemd should relaunch
-                                # via Restart=on-failure.  Poll is-active for
-                                # up to ~10s because the unit's Stopped ->
-                                # Started transition can take a few seconds
-                                # after the old PID exits, and a one-shot
-                                # check races that window.
+                                # via Restart=on-failure.  The unit's
+                                # RestartSec (default 30s on ours) gates the
+                                # respawn — poll past that + slack so we
+                                # don't give up mid-cooldown and falsely
+                                # print "drained but didn't relaunch".  For
+                                # units without RestartSec set we fall back
+                                # to the original 10s budget.
+                                _restart_sec = _service_restart_sec(
+                                    scope_cmd, svc_name, default=0.0,
+                                )
+                                _post_drain_timeout = max(
+                                    10.0, _restart_sec + 10.0,
+                                )
                                 if _wait_for_service_active(
-                                    scope_cmd, svc_name, timeout=10.0,
+                                    scope_cmd, svc_name,
+                                    timeout=_post_drain_timeout,
                                 ):
                                     restarted_services.append(svc_name)
                                     continue