fix(update): kill stale dashboard processes instead of warning (#17832)

`hermes update` previously just printed a warning when it detected a
running `hermes dashboard` process from the previous version, telling
the user to kill and restart it themselves.  In practice dashboards get
started and forgotten, so the warning was routinely ignored and users
ended up with a silent frontend/backend mismatch (new JS bundle served
against the old in-memory Python backend, e.g. new auth headers the old
code doesn't recognise → every API call 401s).

The dashboard has no service manager, no PID file, and we don't record
the original launch args (--host, --port, --insecure, --tui, --no-open)
so we can't auto-restart it.  But we CAN stop it, which is what the
user wants — the failure mode when the stale process is left alive is
worse than the dashboard just being down.

- POSIX: SIGTERM, poll for ~3s, SIGKILL any survivors.
- Windows: `taskkill /PID <pid> /F`.
- Print each PID's outcome plus a one-line restart hint.
- Detection logic is unchanged (same ps / wmic scan, same guards
  against the `pgrep -f` greedy-match trap from #16872 and the
  #17049 wmic UnicodeDecodeError fix).

Also split the old monolithic `_warn_stale_dashboard_processes` into
`_find_stale_dashboard_pids` (scan) + `_kill_stale_dashboard_processes`
(kill), keeping the old name as an alias so any external callers still
work.

E2E verified: spawned a fake `hermes dashboard` cmdline via
`exec -a 'hermes dashboard …' sleep 300`, ran
`_kill_stale_dashboard_processes()`, confirmed SIGTERM exit (-15)
and that a post-scan returns an empty PID list.
This commit is contained in:
Teknium
2026-04-30 01:34:34 -07:00
committed by GitHub
parent fd0796947f
commit 5b85a7d351
2 changed files with 358 additions and 129 deletions

View File

@@ -5335,8 +5335,8 @@ def _build_web_ui(web_dir: Path, *, fatal: bool = False) -> bool:
return True
def _warn_stale_dashboard_processes() -> None:
"""Warn about running dashboard processes that still hold pre-update code.
def _find_stale_dashboard_pids() -> list[int]:
"""Return PIDs of ``hermes dashboard`` processes other than ourselves.
``hermes dashboard`` is a long-lived server process commonly started and
forgotten. When ``hermes update`` replaces files on disk, the running
@@ -5344,9 +5344,13 @@ def _warn_stale_dashboard_processes() -> None:
disk is updated, causing a silent frontend/backend mismatch (e.g. new
auth headers the old backend doesn't recognise → every API call 401s).
Unlike the gateway, the dashboard has no service manager (systemd /
launchd), so we can only warn — we don't auto-kill user-managed
background processes.
The dashboard has no service manager (systemd / launchd), no PID file,
and we can't know the original launch args — so the only sane action
after an update is to kill the stale process and let the user restart
it. This helper is just the detection step; see
``_kill_stale_dashboard_processes`` for the kill.
Returns an empty list on any scan error (missing ps/wmic, timeout, etc.).
"""
patterns = [
"hermes dashboard",
@@ -5372,7 +5376,7 @@ def _warn_stale_dashboard_processes() -> None:
encoding="utf-8", errors="ignore",
)
if result.returncode != 0 or result.stdout is None:
return
return []
current_cmd = ""
for line in result.stdout.split("\n"):
line = line.strip()
@@ -5414,20 +5418,110 @@ def _warn_stale_dashboard_processes() -> None:
and pid != self_pid):
dashboard_pids.append(pid)
except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
return
return []
if not dashboard_pids:
return dashboard_pids
def _kill_stale_dashboard_processes() -> None:
"""Kill ``hermes dashboard`` processes left over from the previous version.
Called at the end of ``hermes update``. The dashboard has no service
manager, so after an update the running process is guaranteed to be
serving stale Python against a freshly-updated JS bundle. Leaving it
alive produces silent frontend/backend mismatches (new auth headers the
old backend doesn't recognise → every API call 401s).
POSIX: SIGTERM, wait up to ~3s for graceful exit, SIGKILL any survivors.
Windows: ``taskkill /PID <pid> /F`` since there's no clean SIGTERM
equivalent for background console apps.
The dashboard isn't auto-restarted because we don't know the original
launch args (--host, --port, --insecure, --tui, --no-open). The user
restarts it manually; we print a hint with the previous port(s) where
possible.
"""
pids = _find_stale_dashboard_pids()
if not pids:
return
print()
print(f" {len(dashboard_pids)} dashboard process(es) still running "
f"with the previous version:")
for pid in dashboard_pids:
print(f" PID {pid}")
print(" The running backend may not match the updated frontend,")
print(" causing silent auth failures or empty data.")
print(" Restart them to pick up the changes:")
print(" kill <pid> && hermes dashboard --port <port> ...")
print(f"⟲ Stopping {len(pids)} stale dashboard process(es) "
f"(the running backend no longer matches the updated frontend)")
killed: list[int] = []
failed: list[tuple[int, str]] = []
if sys.platform == "win32":
for pid in pids:
try:
result = subprocess.run(
["taskkill", "/PID", str(pid), "/F"],
capture_output=True, text=True, timeout=10,
)
if result.returncode == 0:
killed.append(pid)
else:
failed.append((pid, (result.stderr or result.stdout or "").strip()))
except (FileNotFoundError, subprocess.TimeoutExpired, OSError) as e:
failed.append((pid, str(e)))
else:
import signal as _signal
import time as _time
# SIGTERM first — give each process a chance to shut down cleanly
# (uvicorn closes its socket, flushes logs, etc.).
for pid in pids:
try:
os.kill(pid, _signal.SIGTERM)
except ProcessLookupError:
# Already gone — count as killed.
killed.append(pid)
except (PermissionError, OSError) as e:
failed.append((pid, str(e)))
# Poll for exit up to ~3s total.
deadline = _time.monotonic() + 3.0
pending = [p for p in pids if p not in killed
and p not in {f[0] for f in failed}]
while pending and _time.monotonic() < deadline:
_time.sleep(0.1)
still_pending = []
for pid in pending:
try:
os.kill(pid, 0) # probe
except ProcessLookupError:
killed.append(pid)
except (PermissionError, OSError):
# Can't probe — assume still there.
still_pending.append(pid)
else:
still_pending.append(pid)
pending = still_pending
# SIGKILL any survivors.
for pid in pending:
try:
os.kill(pid, _signal.SIGKILL)
killed.append(pid)
except ProcessLookupError:
killed.append(pid)
except (PermissionError, OSError) as e:
failed.append((pid, str(e)))
for pid in killed:
print(f" ✓ stopped PID {pid}")
for pid, reason in failed:
print(f" ✗ failed to stop PID {pid}: {reason}")
if killed:
print(" Restart the dashboard when you're ready:")
print(" hermes dashboard --port <port>")
# Back-compat alias: some tests and any external callers may import the old
# warn-only name. The new behaviour (kill stale processes) replaces it.
_warn_stale_dashboard_processes = _kill_stale_dashboard_processes
def _update_via_zip(args):
@@ -5564,7 +5658,7 @@ def _update_via_zip(args):
print()
print("✓ Update complete!")
_warn_stale_dashboard_processes()
_kill_stale_dashboard_processes()
def _stash_local_changes_if_needed(git_cmd: list[str], cwd: Path) -> Optional[str]:
@@ -7381,9 +7475,12 @@ def _cmd_update_impl(args, gateway_mode: bool):
except Exception as e:
logger.debug("Legacy unit check during update failed: %s", e)
# Warn about stale dashboard processes — the dashboard has no
# service manager, so we can only tell the user to restart them.
_warn_stale_dashboard_processes()
# Kill stale dashboard processes — the dashboard has no service
# manager, so leaving it alive after a code update produces a
# silent frontend/backend mismatch. We can't auto-restart it
# (no saved launch args) but we can stop it, and a hint is
# printed for the user to re-launch.
_kill_stale_dashboard_processes()
print()
print("Tip: You can now select a provider and model:")

View File

@@ -1,19 +1,29 @@
"""Tests for _warn_stale_dashboard_processes — stale dashboard detection.
"""Tests for the stale-dashboard handling run at the end of ``hermes update``.
Ensures ``hermes update`` warns the user when dashboard processes from a
previous version are still running after files on disk have been replaced.
See #16872.
``hermes update`` detects ``hermes dashboard`` processes left over from the
previous version and kills them (SIGTERM + SIGKILL grace, or ``taskkill /F``
on Windows). Without this, the running backend silently serves stale Python
against a freshly-updated JS bundle, producing 401s / empty data.
History:
- #16872 introduced the warn-only helper (``_warn_stale_dashboard_processes``).
- #17049 fixed a Windows wmic UnicodeDecodeError crash on non-UTF-8 locales.
- This file now also covers the kill semantics that replaced the warning.
"""
from __future__ import annotations
import os
import sys
from unittest.mock import patch, MagicMock
from unittest.mock import patch, MagicMock, call
import pytest
from hermes_cli.main import _warn_stale_dashboard_processes
from hermes_cli.main import (
_find_stale_dashboard_pids,
_kill_stale_dashboard_processes,
_warn_stale_dashboard_processes, # back-compat alias
)
def _ps_line(pid: int, cmd: str) -> str:
@@ -21,11 +31,26 @@ def _ps_line(pid: int, cmd: str) -> str:
return f"{pid:>7} {cmd}"
class TestWarnStaleDashboardProcesses:
"""Unit tests for the stale dashboard process warning."""
def _ps_runner(stdout: str):
"""Build a subprocess.run side_effect that only stubs ps -A calls.
def test_no_warning_when_no_dashboard_running(self, capsys):
"""ps returns no matching processes — no warning should be printed."""
Any other subprocess.run invocation (e.g. taskkill on Windows) is
handed back as a successful no-op. This lets tests exercise the real
scan path without having to re-stub every unrelated subprocess call
made later in ``_kill_stale_dashboard_processes``.
"""
def _side_effect(args, *a, **kw):
if isinstance(args, (list, tuple)) and args and args[0] == "ps":
return MagicMock(returncode=0, stdout=stdout, stderr="")
# Any other subprocess.run (e.g. taskkill) — benign success stub.
return MagicMock(returncode=0, stdout="", stderr="")
return _side_effect
class TestFindStaleDashboardPids:
"""Unit tests for the ps/wmic-based detection step."""
def test_no_matches_returns_empty(self):
with patch("subprocess.run") as mock_run:
mock_run.return_value = MagicMock(
returncode=0,
@@ -35,26 +60,18 @@ class TestWarnStaleDashboardProcesses:
+ "\n",
stderr="",
)
_warn_stale_dashboard_processes()
output = capsys.readouterr().out
assert "dashboard process" not in output
assert _find_stale_dashboard_pids() == []
def test_warning_printed_for_running_dashboard(self, capsys):
"""ps finds a dashboard PID — warning with PID should appear."""
def test_matches_running_dashboard(self):
with patch("subprocess.run") as mock_run:
mock_run.return_value = MagicMock(
returncode=0,
stdout=_ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119") + "\n",
stderr="",
)
_warn_stale_dashboard_processes()
output = capsys.readouterr().out
assert "1 dashboard process" in output
assert "PID 12345" in output
assert "kill <pid>" in output
assert _find_stale_dashboard_pids() == [12345]
def test_multiple_dashboard_pids(self, capsys):
"""Multiple dashboard processes — all PIDs listed."""
def test_multiple_matches(self):
with patch("subprocess.run") as mock_run:
mock_run.return_value = MagicMock(
returncode=0,
@@ -65,15 +82,9 @@ class TestWarnStaleDashboardProcesses:
]) + "\n",
stderr="",
)
_warn_stale_dashboard_processes()
output = capsys.readouterr().out
assert "3 dashboard process" in output
assert "PID 12345" in output
assert "PID 12346" in output
assert "PID 12347" in output
assert sorted(_find_stale_dashboard_pids()) == [12345, 12346, 12347]
def test_self_pid_excluded(self, capsys):
"""The current process PID should not be reported."""
def test_self_pid_excluded(self):
with patch("subprocess.run") as mock_run:
mock_run.return_value = MagicMock(
returncode=0,
@@ -83,41 +94,51 @@ class TestWarnStaleDashboardProcesses:
]) + "\n",
stderr="",
)
_warn_stale_dashboard_processes()
output = capsys.readouterr().out
# The self PID may still appear inside an unrelated context, so anchor
# the check to "PID <self>" which is how the warning prints.
assert f"PID {os.getpid()}" not in output
assert "PID 12345" in output
pids = _find_stale_dashboard_pids()
assert os.getpid() not in pids
assert 12345 in pids
def test_ps_not_found_silently_ignored(self, capsys):
"""If ps is missing (FileNotFoundError), no crash, no warning."""
def test_ps_not_found_returns_empty(self):
with patch("subprocess.run", side_effect=FileNotFoundError):
_warn_stale_dashboard_processes()
output = capsys.readouterr().out
assert output == ""
assert _find_stale_dashboard_pids() == []
def test_ps_timeout_silently_ignored(self, capsys):
"""If ps times out, no crash, no warning."""
def test_ps_timeout_returns_empty(self):
import subprocess as sp
with patch("subprocess.run", side_effect=sp.TimeoutExpired("ps", 10)):
_warn_stale_dashboard_processes()
output = capsys.readouterr().out
assert output == ""
assert _find_stale_dashboard_pids() == []
def test_empty_ps_output_no_warning(self, capsys):
"""ps returns 0 but empty stdout — no warning."""
def test_unrelated_process_containing_word_dashboard_not_matched(self):
"""Guards against greedy pgrep-style matching catching chat sessions
or unrelated processes whose cmdline happens to contain 'dashboard'.
"""
with patch("subprocess.run") as mock_run:
mock_run.return_value = MagicMock(
returncode=0, stdout="\n", stderr=""
returncode=0,
stdout="\n".join([
_ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119"),
_ps_line(22222, "python3 -m hermes_cli.main chat -q 'rewrite my dashboard'"),
_ps_line(33333, "node /opt/grafana/dashboard-server.js"),
]) + "\n",
stderr="",
)
_warn_stale_dashboard_processes()
output = capsys.readouterr().out
assert "dashboard process" not in output
pids = _find_stale_dashboard_pids()
assert pids == [12345]
def test_invalid_pid_lines_skipped(self, capsys):
"""Malformed ps lines should be skipped gracefully."""
def test_grep_lines_ignored(self):
with patch("subprocess.run") as mock_run:
mock_run.return_value = MagicMock(
returncode=0,
stdout="\n".join([
_ps_line(99999, "grep hermes dashboard"),
_ps_line(12345, "hermes dashboard --port 9119"),
]) + "\n",
stderr="",
)
pids = _find_stale_dashboard_pids()
assert 99999 not in pids
assert 12345 in pids
def test_invalid_pid_lines_skipped(self):
with patch("subprocess.run") as mock_run:
mock_run.return_value = MagicMock(
returncode=0,
@@ -128,53 +149,166 @@ class TestWarnStaleDashboardProcesses:
]) + "\n",
stderr="",
)
_warn_stale_dashboard_processes()
output = capsys.readouterr().out
assert "PID 12345" in output
assert "1 dashboard process" in output
pids = _find_stale_dashboard_pids()
assert pids == [12345]
def test_unrelated_process_containing_word_dashboard_not_matched(self, capsys):
"""A process whose cmdline contains 'dashboard' but isn't a hermes
dashboard process must NOT be flagged. This guards against the old
``pgrep -f "hermes.*dashboard"`` greedy regex that matched e.g. a
chat session argv containing both words.
"""
with patch("subprocess.run") as mock_run:
mock_run.return_value = MagicMock(
returncode=0,
stdout="\n".join([
# Legitimate dashboard — should match.
_ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119"),
# hermes running something else, with "dashboard" as a
# substring of an unrelated arg — should NOT match.
_ps_line(22222, "python3 -m hermes_cli.main chat -q 'rewrite my dashboard'"),
# Completely unrelated process mentioning dashboard.
_ps_line(33333, "node /opt/grafana/dashboard-server.js"),
]) + "\n",
stderr="",
)
_warn_stale_dashboard_processes()
output = capsys.readouterr().out
assert "1 dashboard process" in output
assert "PID 12345" in output
assert "PID 22222" not in output
assert "PID 33333" not in output
def test_grep_lines_ignored(self, capsys):
"""Lines containing 'grep' (from a pipe in ps output) are ignored."""
with patch("subprocess.run") as mock_run:
mock_run.return_value = MagicMock(
returncode=0,
stdout="\n".join([
_ps_line(99999, "grep hermes dashboard"),
_ps_line(12345, "hermes dashboard --port 9119"),
]) + "\n",
stderr="",
)
_warn_stale_dashboard_processes()
output = capsys.readouterr().out
assert "PID 99999" not in output
assert "PID 12345" in output
@pytest.mark.skipif(sys.platform == "win32", reason="POSIX kill semantics")
class TestKillStaleDashboardPosix:
"""Kill path on Linux / macOS: SIGTERM then SIGKILL any survivors."""
def test_no_stale_processes_is_a_noop(self, capsys):
with patch("hermes_cli.main._find_stale_dashboard_pids", return_value=[]):
_kill_stale_dashboard_processes()
assert capsys.readouterr().out == ""
def test_sigterm_graceful_exit(self, capsys):
"""Processes that exit on SIGTERM (the probe gets ProcessLookupError)
are reported as stopped and SIGKILL is never sent."""
import signal as _signal
killed_signals: list[tuple[int, int]] = []
def fake_kill(pid, sig):
killed_signals.append((pid, sig))
if sig == 0:
# Probe after SIGTERM → "process gone".
raise ProcessLookupError
# SIGTERM itself: succeed silently.
with patch("hermes_cli.main._find_stale_dashboard_pids",
return_value=[12345, 12346]), \
patch("os.kill", side_effect=fake_kill), \
patch("time.sleep"):
_kill_stale_dashboard_processes()
# Both got SIGTERM.
sigterms = [pid for pid, sig in killed_signals if sig == _signal.SIGTERM]
assert sorted(sigterms) == [12345, 12346]
# No SIGKILL was needed.
assert not any(sig == _signal.SIGKILL for _, sig in killed_signals)
out = capsys.readouterr().out
assert "Stopping 2 stale dashboard" in out
assert "✓ stopped PID 12345" in out
assert "✓ stopped PID 12346" in out
assert "Restart the dashboard" in out
def test_sigkill_fallback_for_survivors(self, capsys):
"""If a process survives SIGTERM + the grace window, SIGKILL is sent."""
import signal as _signal
sent: list[tuple[int, int]] = []
def fake_kill(pid, sig):
sent.append((pid, sig))
# Simulate stubborn process: probe (sig 0) always succeeds,
# SIGTERM does nothing, SIGKILL is where it "dies".
if sig in (_signal.SIGTERM, 0, _signal.SIGKILL):
return
# Any other signal — also fine.
with patch("hermes_cli.main._find_stale_dashboard_pids",
return_value=[99999]), \
patch("os.kill", side_effect=fake_kill), \
patch("time.sleep"), \
patch("time.monotonic", side_effect=[0.0] + [10.0] * 20):
# monotonic jumps past the 3s deadline on the second read so the
# grace loop exits immediately after one iteration.
_kill_stale_dashboard_processes()
signals_sent = [sig for _, sig in sent]
assert _signal.SIGTERM in signals_sent
assert _signal.SIGKILL in signals_sent
out = capsys.readouterr().out
assert "✓ stopped PID 99999" in out
def test_permission_error_is_reported_not_raised(self, capsys):
"""os.kill raising PermissionError (e.g. another user's process)
must not abort hermes update — it's reported as a failure and we
move on."""
def fake_kill(pid, sig):
raise PermissionError("Operation not permitted")
with patch("hermes_cli.main._find_stale_dashboard_pids",
return_value=[12345]), \
patch("os.kill", side_effect=fake_kill), \
patch("time.sleep"):
_kill_stale_dashboard_processes() # must not raise
out = capsys.readouterr().out
assert "✗ failed to stop PID 12345" in out
assert "Operation not permitted" in out
def test_process_already_gone_counts_as_stopped(self, capsys):
"""ProcessLookupError on the initial SIGTERM means the process
already exited between detection and the kill — treat as success."""
def fake_kill(pid, sig):
raise ProcessLookupError
with patch("hermes_cli.main._find_stale_dashboard_pids",
return_value=[12345]), \
patch("os.kill", side_effect=fake_kill), \
patch("time.sleep"):
_kill_stale_dashboard_processes()
out = capsys.readouterr().out
assert "✓ stopped PID 12345" in out
assert "failed to stop" not in out
class TestKillStaleDashboardWindows:
"""Kill path on Windows: taskkill /F."""
def test_taskkill_invoked_for_each_pid(self, monkeypatch, capsys):
monkeypatch.setattr(sys, "platform", "win32")
def fake_run(args, *a, **kw):
# taskkill returns 0 on success
return MagicMock(returncode=0, stdout="", stderr="")
with patch("hermes_cli.main._find_stale_dashboard_pids",
return_value=[12345, 12346]), \
patch("subprocess.run", side_effect=fake_run) as mock_run:
_kill_stale_dashboard_processes()
# Each PID triggered a taskkill /PID <n> /F invocation.
taskkill_calls = [
c for c in mock_run.call_args_list
if c.args and isinstance(c.args[0], list) and c.args[0][:1] == ["taskkill"]
]
assert len(taskkill_calls) == 2
assert ["taskkill", "/PID", "12345", "/F"] in [c.args[0] for c in taskkill_calls]
assert ["taskkill", "/PID", "12346", "/F"] in [c.args[0] for c in taskkill_calls]
out = capsys.readouterr().out
assert "✓ stopped PID 12345" in out
assert "✓ stopped PID 12346" in out
def test_taskkill_failure_is_reported(self, monkeypatch, capsys):
monkeypatch.setattr(sys, "platform", "win32")
def fake_run(args, *a, **kw):
return MagicMock(returncode=128, stdout="",
stderr="ERROR: Access is denied.")
with patch("hermes_cli.main._find_stale_dashboard_pids",
return_value=[12345]), \
patch("subprocess.run", side_effect=fake_run):
_kill_stale_dashboard_processes() # must not raise
out = capsys.readouterr().out
assert "✗ failed to stop PID 12345" in out
assert "Access is denied" in out
class TestBackCompatAlias:
"""``_warn_stale_dashboard_processes`` is kept as an alias for the
new kill function so old imports don't break."""
def test_alias_is_the_kill_function(self):
assert _warn_stale_dashboard_processes is _kill_stale_dashboard_processes
class TestWindowsWmicEncoding:
@@ -188,7 +322,6 @@ class TestWindowsWmicEncoding:
UnicodeDecodeError on non-UTF-8 wmic output."""
monkeypatch.setattr(sys, "platform", "win32")
with patch("subprocess.run") as mock_run:
# Provide a minimal valid wmic /FORMAT:LIST response.
mock_run.return_value = MagicMock(
returncode=0,
stdout=(
@@ -197,10 +330,12 @@ class TestWindowsWmicEncoding:
),
stderr="",
)
_warn_stale_dashboard_processes()
_find_stale_dashboard_pids()
# The wmic call is the first subprocess.run invocation.
assert mock_run.called, "subprocess.run was not invoked"
kwargs = mock_run.call_args.kwargs
wmic_call = mock_run.call_args_list[0]
kwargs = wmic_call.kwargs
assert kwargs.get("encoding") == "utf-8", (
"encoding kwarg must be 'utf-8' so wmic output is decoded "
"deterministically rather than via the implicit reader-thread "
@@ -211,10 +346,10 @@ class TestWindowsWmicEncoding:
"down the reader thread (#17049)."
)
def test_wmic_returns_none_stdout_does_not_crash(self, monkeypatch, capsys):
def test_wmic_returns_none_stdout_does_not_crash(self, monkeypatch):
"""If subprocess.run returns successfully but stdout is None — which
is what Python 3.11 leaves behind when the reader thread silently
crashed on UnicodeDecodeError before this fix landed — the warning
crashed on UnicodeDecodeError before this fix landed — detection
must short-circuit instead of raising AttributeError on
``None.split('\\n')`` and aborting `hermes update` (#17049)."""
monkeypatch.setattr(sys, "platform", "win32")
@@ -223,7 +358,4 @@ class TestWindowsWmicEncoding:
returncode=0, stdout=None, stderr=""
)
# Must not raise.
_warn_stale_dashboard_processes()
output = capsys.readouterr().out
assert "dashboard process" not in output
assert _find_stale_dashboard_pids() == []