diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 5bf908004..408445521 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -5335,8 +5335,8 @@ def _build_web_ui(web_dir: Path, *, fatal: bool = False) -> bool: return True -def _warn_stale_dashboard_processes() -> None: - """Warn about running dashboard processes that still hold pre-update code. +def _find_stale_dashboard_pids() -> list[int]: + """Return PIDs of ``hermes dashboard`` processes other than ourselves. ``hermes dashboard`` is a long-lived server process commonly started and forgotten. When ``hermes update`` replaces files on disk, the running @@ -5344,9 +5344,13 @@ def _warn_stale_dashboard_processes() -> None: disk is updated, causing a silent frontend/backend mismatch (e.g. new auth headers the old backend doesn't recognise → every API call 401s). - Unlike the gateway, the dashboard has no service manager (systemd / - launchd), so we can only warn — we don't auto-kill user-managed - background processes. + The dashboard has no service manager (systemd / launchd), no PID file, + and we can't know the original launch args — so the only sane action + after an update is to kill the stale process and let the user restart + it. This helper is just the detection step; see + ``_kill_stale_dashboard_processes`` for the kill. + + Returns an empty list on any scan error (missing ps/wmic, timeout, etc.). """ patterns = [ "hermes dashboard", @@ -5372,7 +5376,7 @@ def _warn_stale_dashboard_processes() -> None: encoding="utf-8", errors="ignore", ) if result.returncode != 0 or result.stdout is None: - return + return [] current_cmd = "" for line in result.stdout.split("\n"): line = line.strip() @@ -5414,20 +5418,110 @@ def _warn_stale_dashboard_processes() -> None: and pid != self_pid): dashboard_pids.append(pid) except (FileNotFoundError, subprocess.TimeoutExpired, OSError): - return + return [] - if not dashboard_pids: + return dashboard_pids + + +def _kill_stale_dashboard_processes() -> None: + """Kill ``hermes dashboard`` processes left over from the previous version. + + Called at the end of ``hermes update``. The dashboard has no service + manager, so after an update the running process is guaranteed to be + serving stale Python against a freshly-updated JS bundle. Leaving it + alive produces silent frontend/backend mismatches (new auth headers the + old backend doesn't recognise → every API call 401s). + + POSIX: SIGTERM, wait up to ~3s for graceful exit, SIGKILL any survivors. + Windows: ``taskkill /PID /F`` since there's no clean SIGTERM + equivalent for background console apps. + + The dashboard isn't auto-restarted because we don't know the original + launch args (--host, --port, --insecure, --tui, --no-open). The user + restarts it manually; we print a hint with the previous port(s) where + possible. + """ + pids = _find_stale_dashboard_pids() + if not pids: return print() - print(f"⚠ {len(dashboard_pids)} dashboard process(es) still running " - f"with the previous version:") - for pid in dashboard_pids: - print(f" PID {pid}") - print(" The running backend may not match the updated frontend,") - print(" causing silent auth failures or empty data.") - print(" Restart them to pick up the changes:") - print(" kill && hermes dashboard --port ...") + print(f"⟲ Stopping {len(pids)} stale dashboard process(es) " + f"(the running backend no longer matches the updated frontend)") + + killed: list[int] = [] + failed: list[tuple[int, str]] = [] + + if sys.platform == "win32": + for pid in pids: + try: + result = subprocess.run( + ["taskkill", "/PID", str(pid), "/F"], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0: + killed.append(pid) + else: + failed.append((pid, (result.stderr or result.stdout or "").strip())) + except (FileNotFoundError, subprocess.TimeoutExpired, OSError) as e: + failed.append((pid, str(e))) + else: + import signal as _signal + import time as _time + + # SIGTERM first — give each process a chance to shut down cleanly + # (uvicorn closes its socket, flushes logs, etc.). + for pid in pids: + try: + os.kill(pid, _signal.SIGTERM) + except ProcessLookupError: + # Already gone — count as killed. + killed.append(pid) + except (PermissionError, OSError) as e: + failed.append((pid, str(e))) + + # Poll for exit up to ~3s total. + deadline = _time.monotonic() + 3.0 + pending = [p for p in pids if p not in killed + and p not in {f[0] for f in failed}] + while pending and _time.monotonic() < deadline: + _time.sleep(0.1) + still_pending = [] + for pid in pending: + try: + os.kill(pid, 0) # probe + except ProcessLookupError: + killed.append(pid) + except (PermissionError, OSError): + # Can't probe — assume still there. + still_pending.append(pid) + else: + still_pending.append(pid) + pending = still_pending + + # SIGKILL any survivors. + for pid in pending: + try: + os.kill(pid, _signal.SIGKILL) + killed.append(pid) + except ProcessLookupError: + killed.append(pid) + except (PermissionError, OSError) as e: + failed.append((pid, str(e))) + + for pid in killed: + print(f" ✓ stopped PID {pid}") + for pid, reason in failed: + print(f" ✗ failed to stop PID {pid}: {reason}") + + if killed: + print(" Restart the dashboard when you're ready:") + print(" hermes dashboard --port ") + + +# Back-compat alias: some tests and any external callers may import the old +# warn-only name. The new behaviour (kill stale processes) replaces it. +_warn_stale_dashboard_processes = _kill_stale_dashboard_processes def _update_via_zip(args): @@ -5564,7 +5658,7 @@ def _update_via_zip(args): print() print("✓ Update complete!") - _warn_stale_dashboard_processes() + _kill_stale_dashboard_processes() def _stash_local_changes_if_needed(git_cmd: list[str], cwd: Path) -> Optional[str]: @@ -7381,9 +7475,12 @@ def _cmd_update_impl(args, gateway_mode: bool): except Exception as e: logger.debug("Legacy unit check during update failed: %s", e) - # Warn about stale dashboard processes — the dashboard has no - # service manager, so we can only tell the user to restart them. - _warn_stale_dashboard_processes() + # Kill stale dashboard processes — the dashboard has no service + # manager, so leaving it alive after a code update produces a + # silent frontend/backend mismatch. We can't auto-restart it + # (no saved launch args) but we can stop it, and a hint is + # printed for the user to re-launch. + _kill_stale_dashboard_processes() print() print("Tip: You can now select a provider and model:") diff --git a/tests/hermes_cli/test_update_stale_dashboard.py b/tests/hermes_cli/test_update_stale_dashboard.py index 20f23e582..6a36af6d9 100644 --- a/tests/hermes_cli/test_update_stale_dashboard.py +++ b/tests/hermes_cli/test_update_stale_dashboard.py @@ -1,19 +1,29 @@ -"""Tests for _warn_stale_dashboard_processes — stale dashboard detection. +"""Tests for the stale-dashboard handling run at the end of ``hermes update``. -Ensures ``hermes update`` warns the user when dashboard processes from a -previous version are still running after files on disk have been replaced. -See #16872. +``hermes update`` detects ``hermes dashboard`` processes left over from the +previous version and kills them (SIGTERM + SIGKILL grace, or ``taskkill /F`` +on Windows). Without this, the running backend silently serves stale Python +against a freshly-updated JS bundle, producing 401s / empty data. + +History: +- #16872 introduced the warn-only helper (``_warn_stale_dashboard_processes``). +- #17049 fixed a Windows wmic UnicodeDecodeError crash on non-UTF-8 locales. +- This file now also covers the kill semantics that replaced the warning. """ from __future__ import annotations import os import sys -from unittest.mock import patch, MagicMock +from unittest.mock import patch, MagicMock, call import pytest -from hermes_cli.main import _warn_stale_dashboard_processes +from hermes_cli.main import ( + _find_stale_dashboard_pids, + _kill_stale_dashboard_processes, + _warn_stale_dashboard_processes, # back-compat alias +) def _ps_line(pid: int, cmd: str) -> str: @@ -21,11 +31,26 @@ def _ps_line(pid: int, cmd: str) -> str: return f"{pid:>7} {cmd}" -class TestWarnStaleDashboardProcesses: - """Unit tests for the stale dashboard process warning.""" +def _ps_runner(stdout: str): + """Build a subprocess.run side_effect that only stubs ps -A calls. - def test_no_warning_when_no_dashboard_running(self, capsys): - """ps returns no matching processes — no warning should be printed.""" + Any other subprocess.run invocation (e.g. taskkill on Windows) is + handed back as a successful no-op. This lets tests exercise the real + scan path without having to re-stub every unrelated subprocess call + made later in ``_kill_stale_dashboard_processes``. + """ + def _side_effect(args, *a, **kw): + if isinstance(args, (list, tuple)) and args and args[0] == "ps": + return MagicMock(returncode=0, stdout=stdout, stderr="") + # Any other subprocess.run (e.g. taskkill) — benign success stub. + return MagicMock(returncode=0, stdout="", stderr="") + return _side_effect + + +class TestFindStaleDashboardPids: + """Unit tests for the ps/wmic-based detection step.""" + + def test_no_matches_returns_empty(self): with patch("subprocess.run") as mock_run: mock_run.return_value = MagicMock( returncode=0, @@ -35,26 +60,18 @@ class TestWarnStaleDashboardProcesses: + "\n", stderr="", ) - _warn_stale_dashboard_processes() - output = capsys.readouterr().out - assert "dashboard process" not in output + assert _find_stale_dashboard_pids() == [] - def test_warning_printed_for_running_dashboard(self, capsys): - """ps finds a dashboard PID — warning with PID should appear.""" + def test_matches_running_dashboard(self): with patch("subprocess.run") as mock_run: mock_run.return_value = MagicMock( returncode=0, stdout=_ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119") + "\n", stderr="", ) - _warn_stale_dashboard_processes() - output = capsys.readouterr().out - assert "1 dashboard process" in output - assert "PID 12345" in output - assert "kill " in output + assert _find_stale_dashboard_pids() == [12345] - def test_multiple_dashboard_pids(self, capsys): - """Multiple dashboard processes — all PIDs listed.""" + def test_multiple_matches(self): with patch("subprocess.run") as mock_run: mock_run.return_value = MagicMock( returncode=0, @@ -65,15 +82,9 @@ class TestWarnStaleDashboardProcesses: ]) + "\n", stderr="", ) - _warn_stale_dashboard_processes() - output = capsys.readouterr().out - assert "3 dashboard process" in output - assert "PID 12345" in output - assert "PID 12346" in output - assert "PID 12347" in output + assert sorted(_find_stale_dashboard_pids()) == [12345, 12346, 12347] - def test_self_pid_excluded(self, capsys): - """The current process PID should not be reported.""" + def test_self_pid_excluded(self): with patch("subprocess.run") as mock_run: mock_run.return_value = MagicMock( returncode=0, @@ -83,41 +94,51 @@ class TestWarnStaleDashboardProcesses: ]) + "\n", stderr="", ) - _warn_stale_dashboard_processes() - output = capsys.readouterr().out - # The self PID may still appear inside an unrelated context, so anchor - # the check to "PID " which is how the warning prints. - assert f"PID {os.getpid()}" not in output - assert "PID 12345" in output + pids = _find_stale_dashboard_pids() + assert os.getpid() not in pids + assert 12345 in pids - def test_ps_not_found_silently_ignored(self, capsys): - """If ps is missing (FileNotFoundError), no crash, no warning.""" + def test_ps_not_found_returns_empty(self): with patch("subprocess.run", side_effect=FileNotFoundError): - _warn_stale_dashboard_processes() - output = capsys.readouterr().out - assert output == "" + assert _find_stale_dashboard_pids() == [] - def test_ps_timeout_silently_ignored(self, capsys): - """If ps times out, no crash, no warning.""" + def test_ps_timeout_returns_empty(self): import subprocess as sp - with patch("subprocess.run", side_effect=sp.TimeoutExpired("ps", 10)): - _warn_stale_dashboard_processes() - output = capsys.readouterr().out - assert output == "" + assert _find_stale_dashboard_pids() == [] - def test_empty_ps_output_no_warning(self, capsys): - """ps returns 0 but empty stdout — no warning.""" + def test_unrelated_process_containing_word_dashboard_not_matched(self): + """Guards against greedy pgrep-style matching catching chat sessions + or unrelated processes whose cmdline happens to contain 'dashboard'. + """ with patch("subprocess.run") as mock_run: mock_run.return_value = MagicMock( - returncode=0, stdout="\n", stderr="" + returncode=0, + stdout="\n".join([ + _ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119"), + _ps_line(22222, "python3 -m hermes_cli.main chat -q 'rewrite my dashboard'"), + _ps_line(33333, "node /opt/grafana/dashboard-server.js"), + ]) + "\n", + stderr="", ) - _warn_stale_dashboard_processes() - output = capsys.readouterr().out - assert "dashboard process" not in output + pids = _find_stale_dashboard_pids() + assert pids == [12345] - def test_invalid_pid_lines_skipped(self, capsys): - """Malformed ps lines should be skipped gracefully.""" + def test_grep_lines_ignored(self): + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="\n".join([ + _ps_line(99999, "grep hermes dashboard"), + _ps_line(12345, "hermes dashboard --port 9119"), + ]) + "\n", + stderr="", + ) + pids = _find_stale_dashboard_pids() + assert 99999 not in pids + assert 12345 in pids + + def test_invalid_pid_lines_skipped(self): with patch("subprocess.run") as mock_run: mock_run.return_value = MagicMock( returncode=0, @@ -128,53 +149,166 @@ class TestWarnStaleDashboardProcesses: ]) + "\n", stderr="", ) - _warn_stale_dashboard_processes() - output = capsys.readouterr().out - assert "PID 12345" in output - assert "1 dashboard process" in output + pids = _find_stale_dashboard_pids() + assert pids == [12345] - def test_unrelated_process_containing_word_dashboard_not_matched(self, capsys): - """A process whose cmdline contains 'dashboard' but isn't a hermes - dashboard process must NOT be flagged. This guards against the old - ``pgrep -f "hermes.*dashboard"`` greedy regex that matched e.g. a - chat session argv containing both words. - """ - with patch("subprocess.run") as mock_run: - mock_run.return_value = MagicMock( - returncode=0, - stdout="\n".join([ - # Legitimate dashboard — should match. - _ps_line(12345, "python3 -m hermes_cli.main dashboard --port 9119"), - # hermes running something else, with "dashboard" as a - # substring of an unrelated arg — should NOT match. - _ps_line(22222, "python3 -m hermes_cli.main chat -q 'rewrite my dashboard'"), - # Completely unrelated process mentioning dashboard. - _ps_line(33333, "node /opt/grafana/dashboard-server.js"), - ]) + "\n", - stderr="", - ) - _warn_stale_dashboard_processes() - output = capsys.readouterr().out - assert "1 dashboard process" in output - assert "PID 12345" in output - assert "PID 22222" not in output - assert "PID 33333" not in output - def test_grep_lines_ignored(self, capsys): - """Lines containing 'grep' (from a pipe in ps output) are ignored.""" - with patch("subprocess.run") as mock_run: - mock_run.return_value = MagicMock( - returncode=0, - stdout="\n".join([ - _ps_line(99999, "grep hermes dashboard"), - _ps_line(12345, "hermes dashboard --port 9119"), - ]) + "\n", - stderr="", - ) - _warn_stale_dashboard_processes() - output = capsys.readouterr().out - assert "PID 99999" not in output - assert "PID 12345" in output +@pytest.mark.skipif(sys.platform == "win32", reason="POSIX kill semantics") +class TestKillStaleDashboardPosix: + """Kill path on Linux / macOS: SIGTERM then SIGKILL any survivors.""" + + def test_no_stale_processes_is_a_noop(self, capsys): + with patch("hermes_cli.main._find_stale_dashboard_pids", return_value=[]): + _kill_stale_dashboard_processes() + assert capsys.readouterr().out == "" + + def test_sigterm_graceful_exit(self, capsys): + """Processes that exit on SIGTERM (the probe gets ProcessLookupError) + are reported as stopped and SIGKILL is never sent.""" + import signal as _signal + + killed_signals: list[tuple[int, int]] = [] + + def fake_kill(pid, sig): + killed_signals.append((pid, sig)) + if sig == 0: + # Probe after SIGTERM → "process gone". + raise ProcessLookupError + # SIGTERM itself: succeed silently. + + with patch("hermes_cli.main._find_stale_dashboard_pids", + return_value=[12345, 12346]), \ + patch("os.kill", side_effect=fake_kill), \ + patch("time.sleep"): + _kill_stale_dashboard_processes() + + # Both got SIGTERM. + sigterms = [pid for pid, sig in killed_signals if sig == _signal.SIGTERM] + assert sorted(sigterms) == [12345, 12346] + # No SIGKILL was needed. + assert not any(sig == _signal.SIGKILL for _, sig in killed_signals) + + out = capsys.readouterr().out + assert "Stopping 2 stale dashboard" in out + assert "✓ stopped PID 12345" in out + assert "✓ stopped PID 12346" in out + assert "Restart the dashboard" in out + + def test_sigkill_fallback_for_survivors(self, capsys): + """If a process survives SIGTERM + the grace window, SIGKILL is sent.""" + import signal as _signal + + sent: list[tuple[int, int]] = [] + + def fake_kill(pid, sig): + sent.append((pid, sig)) + # Simulate stubborn process: probe (sig 0) always succeeds, + # SIGTERM does nothing, SIGKILL is where it "dies". + if sig in (_signal.SIGTERM, 0, _signal.SIGKILL): + return + # Any other signal — also fine. + + with patch("hermes_cli.main._find_stale_dashboard_pids", + return_value=[99999]), \ + patch("os.kill", side_effect=fake_kill), \ + patch("time.sleep"), \ + patch("time.monotonic", side_effect=[0.0] + [10.0] * 20): + # monotonic jumps past the 3s deadline on the second read so the + # grace loop exits immediately after one iteration. + _kill_stale_dashboard_processes() + + signals_sent = [sig for _, sig in sent] + assert _signal.SIGTERM in signals_sent + assert _signal.SIGKILL in signals_sent + + out = capsys.readouterr().out + assert "✓ stopped PID 99999" in out + + def test_permission_error_is_reported_not_raised(self, capsys): + """os.kill raising PermissionError (e.g. another user's process) + must not abort hermes update — it's reported as a failure and we + move on.""" + def fake_kill(pid, sig): + raise PermissionError("Operation not permitted") + + with patch("hermes_cli.main._find_stale_dashboard_pids", + return_value=[12345]), \ + patch("os.kill", side_effect=fake_kill), \ + patch("time.sleep"): + _kill_stale_dashboard_processes() # must not raise + + out = capsys.readouterr().out + assert "✗ failed to stop PID 12345" in out + assert "Operation not permitted" in out + + def test_process_already_gone_counts_as_stopped(self, capsys): + """ProcessLookupError on the initial SIGTERM means the process + already exited between detection and the kill — treat as success.""" + def fake_kill(pid, sig): + raise ProcessLookupError + + with patch("hermes_cli.main._find_stale_dashboard_pids", + return_value=[12345]), \ + patch("os.kill", side_effect=fake_kill), \ + patch("time.sleep"): + _kill_stale_dashboard_processes() + + out = capsys.readouterr().out + assert "✓ stopped PID 12345" in out + assert "failed to stop" not in out + + +class TestKillStaleDashboardWindows: + """Kill path on Windows: taskkill /F.""" + + def test_taskkill_invoked_for_each_pid(self, monkeypatch, capsys): + monkeypatch.setattr(sys, "platform", "win32") + + def fake_run(args, *a, **kw): + # taskkill returns 0 on success + return MagicMock(returncode=0, stdout="", stderr="") + + with patch("hermes_cli.main._find_stale_dashboard_pids", + return_value=[12345, 12346]), \ + patch("subprocess.run", side_effect=fake_run) as mock_run: + _kill_stale_dashboard_processes() + + # Each PID triggered a taskkill /PID /F invocation. + taskkill_calls = [ + c for c in mock_run.call_args_list + if c.args and isinstance(c.args[0], list) and c.args[0][:1] == ["taskkill"] + ] + assert len(taskkill_calls) == 2 + assert ["taskkill", "/PID", "12345", "/F"] in [c.args[0] for c in taskkill_calls] + assert ["taskkill", "/PID", "12346", "/F"] in [c.args[0] for c in taskkill_calls] + + out = capsys.readouterr().out + assert "✓ stopped PID 12345" in out + assert "✓ stopped PID 12346" in out + + def test_taskkill_failure_is_reported(self, monkeypatch, capsys): + monkeypatch.setattr(sys, "platform", "win32") + + def fake_run(args, *a, **kw): + return MagicMock(returncode=128, stdout="", + stderr="ERROR: Access is denied.") + + with patch("hermes_cli.main._find_stale_dashboard_pids", + return_value=[12345]), \ + patch("subprocess.run", side_effect=fake_run): + _kill_stale_dashboard_processes() # must not raise + + out = capsys.readouterr().out + assert "✗ failed to stop PID 12345" in out + assert "Access is denied" in out + + +class TestBackCompatAlias: + """``_warn_stale_dashboard_processes`` is kept as an alias for the + new kill function so old imports don't break.""" + + def test_alias_is_the_kill_function(self): + assert _warn_stale_dashboard_processes is _kill_stale_dashboard_processes class TestWindowsWmicEncoding: @@ -188,7 +322,6 @@ class TestWindowsWmicEncoding: UnicodeDecodeError on non-UTF-8 wmic output.""" monkeypatch.setattr(sys, "platform", "win32") with patch("subprocess.run") as mock_run: - # Provide a minimal valid wmic /FORMAT:LIST response. mock_run.return_value = MagicMock( returncode=0, stdout=( @@ -197,10 +330,12 @@ class TestWindowsWmicEncoding: ), stderr="", ) - _warn_stale_dashboard_processes() + _find_stale_dashboard_pids() + # The wmic call is the first subprocess.run invocation. assert mock_run.called, "subprocess.run was not invoked" - kwargs = mock_run.call_args.kwargs + wmic_call = mock_run.call_args_list[0] + kwargs = wmic_call.kwargs assert kwargs.get("encoding") == "utf-8", ( "encoding kwarg must be 'utf-8' so wmic output is decoded " "deterministically rather than via the implicit reader-thread " @@ -211,10 +346,10 @@ class TestWindowsWmicEncoding: "down the reader thread (#17049)." ) - def test_wmic_returns_none_stdout_does_not_crash(self, monkeypatch, capsys): + def test_wmic_returns_none_stdout_does_not_crash(self, monkeypatch): """If subprocess.run returns successfully but stdout is None — which is what Python 3.11 leaves behind when the reader thread silently - crashed on UnicodeDecodeError before this fix landed — the warning + crashed on UnicodeDecodeError before this fix landed — detection must short-circuit instead of raising AttributeError on ``None.split('\\n')`` and aborting `hermes update` (#17049).""" monkeypatch.setattr(sys, "platform", "win32") @@ -223,7 +358,4 @@ class TestWindowsWmicEncoding: returncode=0, stdout=None, stderr="" ) # Must not raise. - _warn_stale_dashboard_processes() - - output = capsys.readouterr().out - assert "dashboard process" not in output + assert _find_stale_dashboard_pids() == []