def _service_restart_sec(
    scope_cmd_: list, svc_name_: str, default: float = 0.0,
) -> float:
    """Return the unit's ``RestartUSec`` (``RestartSec``) in seconds.

    After a graceful exit-75, systemd waits ``RestartSec`` before
    respawning the unit. Callers that poll for ``is-active`` must use
    a timeout >= ``RestartSec`` + transition slack, or they'll give up
    *during* the cooldown window and wrongly conclude the unit didn't
    relaunch.

    Args:
        scope_cmd_: Command prefix to run (argv list); ``show
            <unit> --property=RestartUSec --value`` is appended to it.
        svc_name_: Name of the unit to query.
        default: Value returned whenever the delay cannot be
            determined.

    Returns:
        The restart delay in seconds, or ``default`` when the command
        cannot be launched, times out, prints nothing, prints
        ``infinity``, or prints any token that cannot be parsed.
    """
    try:
        shown = subprocess.run(
            [
                *scope_cmd_,
                "show", svc_name_,
                "--property=RestartUSec", "--value",
            ],
            capture_output=True, text=True, timeout=5,
        )
    except (OSError, subprocess.TimeoutExpired):
        # Best-effort probe: a missing or non-executable binary, or a
        # hung call, must not abort the caller.
        return default

    raw = (shown.stdout or "").strip()
    if not raw or raw == "infinity":
        return default
    if raw == "0":
        # A bare zero carries no unit suffix; it means "no delay".
        return 0.0

    # Seconds per unit suffix. Values are like "30s", "100ms",
    # "1min 30s" or "1h 30min"; month/year use the averaged lengths
    # from systemd.time(7). Micro-sign spellings are accepted
    # defensively alongside "us".
    unit_seconds = {
        "us": 0.000001,
        "\u00b5s": 0.000001,
        "\u03bcs": 0.000001,
        "ms": 0.001,
        "s": 1.0,
        "min": 60.0,
        "h": 3600.0,
        "d": 86400.0,
        "w": 604800.0,
        "month": 2629800.0,
        "y": 31557600.0,
    }
    suffix_chars = "abcdefghijklmnopqrstuvwxyz\u00b5\u03bc"

    total = 0.0
    for token in raw.split():
        # Split "<number><unit>" by peeling the alphabetic suffix off
        # the right; an exact unit lookup avoids "ms"/"s" or
        # "month"/"h" suffix ambiguity.
        number = token.rstrip(suffix_chars)
        multiplier = unit_seconds.get(token[len(number):])
        if multiplier is None:
            # Unknown unit: a partial sum would under-report the
            # cooldown, so give up entirely.
            return default
        try:
            total += float(number) * multiplier
        except ValueError:
            return default
    return total
Poll is-active for - # up to ~10s because the unit's Stopped -> - # Started transition can take a few seconds - # after the old PID exits, and a one-shot - # check races that window. + # via Restart=on-failure. The unit's + # RestartSec (default 30s on ours) gates the + # respawn — poll past that + slack so we + # don't give up mid-cooldown and falsely + # print "drained but didn't relaunch". For + # units without RestartSec set we fall back + # to the original 10s budget. + _restart_sec = _service_restart_sec( + scope_cmd, svc_name, default=0.0, + ) + _post_drain_timeout = max( + 10.0, _restart_sec + 10.0, + ) if _wait_for_service_active( - scope_cmd, svc_name, timeout=10.0, + scope_cmd, svc_name, + timeout=_post_drain_timeout, ): restarted_services.append(svc_name) continue