fix(gateway): use git HEAD SHA, not file mtimes, for stale-code check (#19740)

The stale-code self-check (Issue #17648) used sentinel-file mtimes to
decide whether the gateway survived a `hermes update` with stale
`sys.modules`. That signal false-positives on any write to the
sentinel files — including agent-driven edits during Hermes-on-Hermes
dev sessions. Telling the agent to patch `run_agent.py` would flip
the check to True on the next user message and force a gateway
restart even though no update happened.

Switch the signal to `git rev-parse HEAD`. Agent file edits don't
move HEAD; `hermes update` (git pull) always does. Reading .git/HEAD
directly (no subprocess) with a 5s cache keeps the overhead negligible
on bursty chats. Non-git installs short-circuit to False — the
stale-modules class can't occur without a git-backed update path, so
there's nothing to detect.

The legacy `_compute_repo_mtime` helper is kept but unused by
detection, reserved as a fallback hook for future pip-install update
paths.

- _read_git_head_sha(): resolves HEAD across main checkout, worktree
  (follows `gitdir:` + `commondir` pointers), and packed-refs layouts.
- _current_git_sha_cached(): per-runner 5s SHA cache.
- _detect_stale_code(): boot SHA vs current SHA, returns False when
  either is unavailable.
- Tests cover all four layouts, the agent-edits-don't-trigger
  regression, and cache behavior.

Refs #17648.
This commit is contained in:
Teknium
2026-05-04 12:33:21 -07:00
committed by GitHub
parent a21f364ad7
commit d90f73bcec
2 changed files with 482 additions and 139 deletions

View File

@@ -101,10 +101,21 @@ _AUTO_CONTINUE_FRESHNESS_SECS_DEFAULT = 60 * 60
# already-loaded stale module object and raises ``ImportError`` — see
# Issue #17648. Rather than papering over the import failure site-by-site
# in every tool file, detect the stale state centrally and auto-restart
# so the gateway reloads with fresh code. The sentinel files below are
# the canonical repo-level markers that every update touches; if any is
# newer than the gateway's boot time, we know the running process is out
# of date.
# so the gateway reloads with fresh code.
#
# The signal we use is ``git rev-parse HEAD`` — the only thing ``hermes
# update`` moves that is NOT moved by agent-driven file edits. Earlier
# revisions of this check compared file mtimes across a sentinel set
# (run_agent.py, gateway/run.py, ...), but that produced false positives
# whenever the agent edited its own source files during a session:
# mtime jumps, stale-check fires, gateway restarts, user must retype.
# See the conversation at PR #<this> for the motivating incident.
#
# The legacy mtime sentinels are kept ONLY as a last-resort fallback for
# non-git installs (pip install from wheel, sparse clones with no .git
# dir). In those environments ``hermes update`` is not a supported path,
# so the check effectively no-ops — which is the safe behavior: better
# to ship one broken import than to restart on every agent-edit.
_STALE_CODE_SENTINELS: tuple[str, ...] = (
"hermes_cli/config.py",
"hermes_cli/__init__.py",
@@ -113,10 +124,106 @@ _STALE_CODE_SENTINELS: tuple[str, ...] = (
"pyproject.toml",
)
# Cache git HEAD reads across consecutive messages so a chat burst doesn't
# spawn one subprocess per message. 5s is long enough to collapse a burst
# and short enough that the real post-update detection still fires within
# the user's perceived "next message" window.
_GIT_SHA_CACHE_TTL_SECS = 5.0
def _read_git_head_sha(repo_root: Path) -> Optional[str]:
"""Return the git HEAD SHA for ``repo_root``, or None if unavailable.
Reads ``.git/HEAD`` directly (and follows one level of ref) instead
of shelling out to ``git`` — cheaper, no subprocess tax, works on
gateway hosts that don't have a ``git`` binary on PATH. Returns
None for non-git installs (no ``.git`` dir) or any I/O error; callers
treat None as "can't tell" and skip the check.
Supports the three layouts we care about:
1. Main checkout: ``<repo>/.git/`` is a directory.
2. Git worktree: ``<repo>/.git`` is a file ``gitdir: <path>`` that
points at ``<main>/.git/worktrees/<name>/``. The worktree's
gitdir has HEAD + index but NOT refs/heads/ — those live in
the main checkout, and ``<worktree-gitdir>/commondir`` points
at the main ``.git``. We search both locations for refs.
3. Packed refs: ``refs/heads/<branch>`` is absent on disk but
listed in ``<main-git-dir>/packed-refs``.
"""
try:
git_dir = repo_root / ".git"
# Worktrees store ``.git`` as a file pointing at gitdir: <path>
if git_dir.is_file():
try:
content = git_dir.read_text().strip()
if content.startswith("gitdir:"):
git_dir = Path(content.split(":", 1)[1].strip())
if not git_dir.is_absolute():
git_dir = (repo_root / git_dir).resolve()
except OSError:
return None
if not git_dir.is_dir():
return None
# Figure out the "common" git dir — the one that owns shared refs.
# For a worktree, commondir points at it (relative path, resolve
# against git_dir). For a main checkout, common_dir == git_dir.
common_dir = git_dir
commondir_file = git_dir / "commondir"
if commondir_file.is_file():
try:
rel = commondir_file.read_text().strip()
candidate = (git_dir / rel).resolve() if rel else git_dir
if candidate.is_dir():
common_dir = candidate
except OSError:
pass
head_path = git_dir / "HEAD"
if not head_path.is_file():
return None
head_content = head_path.read_text().strip()
if head_content.startswith("ref:"):
# Symbolic ref — follow one level (e.g. ref: refs/heads/main).
# Worktree-local refs (bisect, rebase-merge state) live under
# git_dir; shared refs (refs/heads/*, refs/tags/*) live under
# common_dir. Try git_dir first, then common_dir.
ref_rel = head_content.split(":", 1)[1].strip()
for base in (git_dir, common_dir) if git_dir != common_dir else (git_dir,):
ref_path = base / ref_rel
if ref_path.is_file():
try:
sha = ref_path.read_text().strip()
except OSError:
continue
if sha:
return sha
# Packed refs fallback — always stored in the common dir.
packed = common_dir / "packed-refs"
if packed.is_file():
try:
for line in packed.read_text().splitlines():
line = line.strip()
if not line or line.startswith("#") or line.startswith("^"):
continue
parts = line.split(None, 1)
if len(parts) == 2 and parts[1] == ref_rel:
return parts[0] or None
except OSError:
return None
return None
# Detached HEAD — content is the SHA directly.
return head_content or None
except Exception:
return None
def _compute_repo_mtime(repo_root: Path) -> float:
"""Return the newest mtime across the stale-code sentinel files.
Legacy fallback used only for non-git installs (``.git`` missing).
Missing files are ignored (they may not exist on older checkouts).
Returns 0.0 if no sentinel file is readable — treat that as "can't
tell", which downstream callers interpret as "not stale" to avoid
@@ -1005,6 +1112,7 @@ class GatewayRunner:
# running __init__ don't crash when _handle_message reads these.
_boot_wall_time: float = 0.0
_boot_repo_mtime: float = 0.0
_boot_git_sha: Optional[str] = None
_stale_code_restart_triggered: bool = False
def __init__(self, config: Optional[GatewayConfig] = None):
@@ -1020,15 +1128,23 @@ class GatewayRunner:
try:
self._boot_wall_time: float = time.time()
self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent
self._boot_git_sha: Optional[str] = _read_git_head_sha(
self._repo_root_for_staleness,
)
self._boot_repo_mtime: float = _compute_repo_mtime(
self._repo_root_for_staleness,
)
except Exception:
self._boot_wall_time = 0.0
self._repo_root_for_staleness = Path(".")
self._boot_git_sha = None
self._boot_repo_mtime = 0.0
self._stale_code_notified: set[str] = set()
self._stale_code_restart_triggered: bool = False
# Cached current-SHA read, refreshed at most every
# _GIT_SHA_CACHE_TTL_SECS so bursty chats don't hammer the filesystem.
self._cached_current_sha: Optional[str] = self._boot_git_sha
self._cached_current_sha_at: float = self._boot_wall_time
# Load ephemeral config from config.yaml / env vars.
# Both are injected at API-call time only and never persisted.
@@ -2737,36 +2853,69 @@ class GatewayRunner:
task.add_done_callback(self._background_tasks.discard)
return True
def _current_git_sha_cached(self) -> Optional[str]:
"""Return the current HEAD SHA, cached for _GIT_SHA_CACHE_TTL_SECS.
A bursty chat (user mashes "hello?" three times) would otherwise
re-read ``.git/HEAD`` on every message. Caching collapses that
into a single read and still re-checks within the user's
perceived "next message" window.
"""
now = time.time()
if (
self._cached_current_sha is not None
and (now - self._cached_current_sha_at) < _GIT_SHA_CACHE_TTL_SECS
):
return self._cached_current_sha
try:
sha = _read_git_head_sha(self._repo_root_for_staleness)
except Exception:
sha = None
self._cached_current_sha = sha
self._cached_current_sha_at = now
return sha
def _detect_stale_code(self) -> bool:
"""Return True if source files on disk are newer than the running process.
"""Return True if the git HEAD moved since this process booted.
A gateway that survives ``hermes update`` (manual SIGTERM never
escalated, systemd restart race, detached-process respawn failed,
etc.) keeps pre-update modules cached in ``sys.modules``. Later
imports of names added post-update — e.g. ``cfg_get`` from PR
#17304 — raise ImportError against the stale module object (see
Issue #17648). Detecting this at the source — "the code on disk
is newer than me" — lets us auto-restart instead of serving
broken responses until the user notices and runs
``hermes gateway restart`` manually.
Issue #17648).
Returns False when the boot-time snapshot is unavailable or no
sentinel file is readable, to avoid false-positive restart loops
in unusual checkouts (sparse clones, read-only filesystems).
We compare the git HEAD SHA at boot to the current SHA on disk.
``hermes update`` always moves HEAD forward via ``git pull``;
agent file edits (the agent patching ``run_agent.py`` or
``gateway/run.py`` during a self-dev session) never move HEAD.
That makes SHA comparison free of the false-positive class that
the old mtime check suffered from — the agent can edit any file
without triggering a phantom restart.
Returns False when:
- the boot SHA is unavailable (non-git install, first call
during partial init, etc.); we can't tell and refuse to loop
- the current SHA matches the boot SHA
- reading the current SHA fails for any reason
"""
if not self._boot_wall_time or not self._boot_repo_mtime:
if not self._boot_wall_time:
return False
if not self._boot_git_sha:
# Non-git install. ``hermes update`` is git-based, so a
# non-git install can't experience the stale-modules class
# this check exists to catch. Return False — no check, no
# false positives. (If we ever ship a pip-install update
# path, we'd add a persistent update marker here and compare
# its timestamp to self._boot_wall_time.)
return False
try:
current = _compute_repo_mtime(self._repo_root_for_staleness)
current = self._current_git_sha_cached()
except Exception:
return False
if current <= 0.0:
if not current:
return False
# 2-second slack guards against filesystems with coarse mtime
# resolution (FAT32, some NFS mounts). Real updates always move
# the newest-file mtime forward by minutes, so this doesn't hide
# genuine staleness.
return current > self._boot_repo_mtime + 2.0
return current != self._boot_git_sha
def _trigger_stale_code_restart(self) -> None:
"""Idempotently kick off a graceful restart after stale-code detection.
@@ -2782,12 +2931,17 @@ class GatewayRunner:
if self._stale_code_restart_triggered:
return
self._stale_code_restart_triggered = True
current_sha = None
try:
current_sha = self._current_git_sha_cached()
except Exception:
pass
logger.warning(
"Stale-code self-check: source files newer than gateway boot "
"time (boot=%.0f, newest=%.0f) — requesting graceful restart. "
"Stale-code self-check: git HEAD moved since gateway boot "
"(boot=%s, current=%s) — requesting graceful restart. "
"See Issue #17648.",
self._boot_repo_mtime,
_compute_repo_mtime(self._repo_root_for_staleness),
(self._boot_git_sha or "?")[:12],
(current_sha or "?")[:12],
)
try:
self.request_restart(detached=False, via_service=True)

View File

@@ -3,25 +3,34 @@
A gateway that survives ``hermes update`` keeps pre-update modules cached
in ``sys.modules``. Later imports of names added post-update (e.g.
``cfg_get`` from PR #17304) raise ImportError against the stale module
object. The self-check in ``GatewayRunner._detect_stale_code()`` detects
this by comparing boot-time sentinel-file mtimes against current ones,
and ``_trigger_stale_code_restart()`` triggers a graceful restart.
object.
The self-check compares the git HEAD SHA at boot to the current SHA on
disk. ``hermes update`` always moves HEAD forward via ``git pull``;
agent-driven file edits (Hermes editing ``run_agent.py`` / ``gateway/run.py``
during a self-dev session) never move HEAD — so the SHA signal is free of
the false-positive class that the earlier mtime-based check suffered from.
"""
import os
import time
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from gateway.run import (
GatewayRunner,
_compute_repo_mtime,
_read_git_head_sha,
_STALE_CODE_SENTINELS,
_GIT_SHA_CACHE_TTL_SECS,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_tmp_repo(tmp_path: Path) -> Path:
"""Create a fake repo with all stale-code sentinel files."""
for rel in _STALE_CODE_SENTINELS:
@@ -31,109 +40,303 @@ def _make_tmp_repo(tmp_path: Path) -> Path:
return tmp_path
def _make_runner(repo_root: Path, *, boot_mtime: float, boot_wall: float):
def _make_git_repo(tmp_path: Path, sha: str = "a" * 40, branch: str = "main") -> Path:
"""Stamp a minimal .git directory so _read_git_head_sha can resolve a SHA.
We don't run real git — just lay down the files the reader walks
(.git/HEAD pointing at refs/heads/<branch>, refs/heads/<branch>
containing the SHA).
"""
git_dir = tmp_path / ".git"
git_dir.mkdir(parents=True, exist_ok=True)
(git_dir / "HEAD").write_text(f"ref: refs/heads/{branch}\n")
refs_dir = git_dir / "refs" / "heads"
refs_dir.mkdir(parents=True, exist_ok=True)
(refs_dir / branch).write_text(f"{sha}\n")
return tmp_path
def _set_head_sha(repo_root: Path, sha: str, branch: str = "main") -> None:
"""Rewrite the current branch ref to a new SHA (simulates git pull)."""
(repo_root / ".git" / "refs" / "heads" / branch).write_text(f"{sha}\n")
def _make_runner(
repo_root: Path,
*,
boot_sha: str | None,
boot_wall: float = None,
boot_mtime: float = 0.0,
):
"""Bare GatewayRunner with just the stale-check attributes set."""
if boot_wall is None:
boot_wall = time.time()
runner = object.__new__(GatewayRunner)
runner._repo_root_for_staleness = repo_root
runner._boot_wall_time = boot_wall
runner._boot_git_sha = boot_sha
runner._boot_repo_mtime = boot_mtime
runner._stale_code_notified = set()
runner._stale_code_restart_triggered = False
runner._cached_current_sha = boot_sha
runner._cached_current_sha_at = boot_wall
return runner
def test_compute_repo_mtime_returns_newest(tmp_path):
"""_compute_repo_mtime returns the newest mtime across sentinel files."""
repo = _make_tmp_repo(tmp_path)
# ---------------------------------------------------------------------------
# _read_git_head_sha — raw SHA reader
# ---------------------------------------------------------------------------
# Stamp a baseline mtime across all sentinels
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
# Touch one file forward
newer = time.time()
os.utime(repo / "hermes_cli/config.py", (newer, newer))
result = _compute_repo_mtime(repo)
assert abs(result - newer) < 1.0 # within 1s (filesystem mtime resolution)
def test_read_git_head_sha_branch_ref(tmp_path):
"""Resolves ref: refs/heads/<branch> → SHA from refs/heads/<branch>."""
sha = "b" * 40
_make_git_repo(tmp_path, sha=sha, branch="main")
assert _read_git_head_sha(tmp_path) == sha
def test_compute_repo_mtime_missing_files_returns_zero(tmp_path):
"""Missing sentinel files return 0.0 (treated as 'can't tell' upstream)."""
# tmp_path has none of the sentinels
assert _compute_repo_mtime(tmp_path) == 0.0
def test_read_git_head_sha_detached_head(tmp_path):
"""Detached HEAD: .git/HEAD contains the SHA directly."""
sha = "c" * 40
git_dir = tmp_path / ".git"
git_dir.mkdir()
(git_dir / "HEAD").write_text(f"{sha}\n")
assert _read_git_head_sha(tmp_path) == sha
def test_compute_repo_mtime_partial_files_still_works(tmp_path):
"""Partial sentinel presence still returns newest of the readable ones."""
(tmp_path / "hermes_cli").mkdir()
target = tmp_path / "hermes_cli" / "config.py"
target.write_text("# partial\n")
target_mtime = time.time() - 50
os.utime(target, (target_mtime, target_mtime))
result = _compute_repo_mtime(tmp_path)
assert abs(result - target_mtime) < 1.0
def test_read_git_head_sha_packed_refs(tmp_path):
"""Falls back to packed-refs when refs/heads/<branch> is missing."""
sha = "d" * 40
git_dir = tmp_path / ".git"
git_dir.mkdir()
(git_dir / "HEAD").write_text("ref: refs/heads/main\n")
# No refs/heads/main file — only packed-refs
(git_dir / "packed-refs").write_text(
f"# pack-refs with: peeled fully-peeled sorted\n"
f"{sha} refs/heads/main\n"
)
assert _read_git_head_sha(tmp_path) == sha
def test_detect_stale_code_false_when_no_boot_snapshot(tmp_path):
"""No boot snapshot → can't tell → not stale (no restart loop)."""
repo = _make_tmp_repo(tmp_path)
runner = _make_runner(repo, boot_mtime=0.0, boot_wall=0.0)
def test_read_git_head_sha_worktree_gitdir_file(tmp_path):
"""Worktree: .git is a file with `gitdir: <path>` pointing to the real git dir.
Real git worktrees store shared refs (refs/heads/*) in the main
checkout's .git/ and write a ``commondir`` pointer into the
worktree-gitdir. The reader must follow commondir to resolve the
branch ref — this is the layout Hermes dev sessions actually use.
"""
sha = "e" * 40
# Main repo layout
main_repo = tmp_path / "main-repo"
main_git = main_repo / ".git"
(main_git / "refs" / "heads").mkdir(parents=True)
(main_git / "HEAD").write_text("ref: refs/heads/main\n")
(main_git / "refs" / "heads" / "main").write_text("0" * 40 + "\n")
# Worktree lives in main-repo/.git/worktrees/<name>/
worktree_git_dir = main_git / "worktrees" / "feature"
worktree_git_dir.mkdir(parents=True)
(worktree_git_dir / "HEAD").write_text("ref: refs/heads/feature\n")
# commondir points back at the main .git (relative path, "../..")
(worktree_git_dir / "commondir").write_text("../..\n")
# Feature branch ref lives in the shared refs/heads
(main_git / "refs" / "heads" / "feature").write_text(f"{sha}\n")
# Worktree checkout with .git file pointing at worktree_git_dir
worktree = tmp_path / "wt"
worktree.mkdir()
(worktree / ".git").write_text(f"gitdir: {worktree_git_dir}\n")
assert _read_git_head_sha(worktree) == sha
def test_read_git_head_sha_worktree_packed_refs_in_common(tmp_path):
"""Worktree + packed-refs in common dir: fallback still resolves."""
sha = "f" * 40
main_repo = tmp_path / "main-repo"
main_git = main_repo / ".git"
main_git.mkdir(parents=True)
(main_git / "HEAD").write_text("ref: refs/heads/main\n")
# packed-refs in the common (main) .git
(main_git / "packed-refs").write_text(
f"# pack-refs with: peeled fully-peeled sorted\n"
f"{sha} refs/heads/feature\n"
)
worktree_git_dir = main_git / "worktrees" / "feature"
worktree_git_dir.mkdir(parents=True)
(worktree_git_dir / "HEAD").write_text("ref: refs/heads/feature\n")
(worktree_git_dir / "commondir").write_text("../..\n")
worktree = tmp_path / "wt"
worktree.mkdir()
(worktree / ".git").write_text(f"gitdir: {worktree_git_dir}\n")
assert _read_git_head_sha(worktree) == sha
def test_read_git_head_sha_no_git_returns_none(tmp_path):
"""No .git dir → None (non-git install, safely disables the check)."""
assert _read_git_head_sha(tmp_path) is None
def test_read_git_head_sha_malformed_head_returns_none(tmp_path):
"""Empty HEAD file → None (don't loop on corrupt repos)."""
git_dir = tmp_path / ".git"
git_dir.mkdir()
(git_dir / "HEAD").write_text("")
assert _read_git_head_sha(tmp_path) is None
# ---------------------------------------------------------------------------
# _detect_stale_code — the main regression guard
# ---------------------------------------------------------------------------
def test_detect_stale_code_false_when_sha_unchanged(tmp_path):
"""Boot SHA == current SHA → not stale (no restart)."""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
# Force fresh read by expiring the cache
runner._cached_current_sha_at = 0.0
assert runner._detect_stale_code() is False
def test_detect_stale_code_false_when_files_unchanged(tmp_path):
"""Source files at boot mtime → not stale."""
repo = _make_tmp_repo(tmp_path)
# Freeze all sentinels to the same mtime
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
assert runner._detect_stale_code() is False
def test_detect_stale_code_true_after_update(tmp_path):
"""Sentinel files newer than boot snapshot → stale."""
repo = _make_tmp_repo(tmp_path)
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
# Simulate hermes update touching config.py
new_mtime = time.time()
os.utime(repo / "hermes_cli/config.py", (new_mtime, new_mtime))
def test_detect_stale_code_true_after_git_pull(tmp_path):
"""Boot SHA != current SHA → stale (hermes update happened)."""
boot_sha = "a" * 40
_make_git_repo(tmp_path, sha=boot_sha)
runner = _make_runner(tmp_path, boot_sha=boot_sha)
# Simulate git pull moving HEAD forward
_set_head_sha(tmp_path, "b" * 40)
runner._cached_current_sha_at = 0.0 # expire cache
assert runner._detect_stale_code() is True
def test_detect_stale_code_ignores_subsecond_drift(tmp_path):
"""2-second slack prevents false positives on coarse-mtime filesystems."""
repo = _make_tmp_repo(tmp_path)
baseline = time.time() - 100
def test_detect_stale_code_ignores_agent_file_edits(tmp_path):
"""THE CORE REGRESSION: agent edits to source files do NOT trigger restart.
This is the motivating incident for the SHA-based check. Under the
previous mtime-based scheme, any ``patch`` / ``write_file`` call
against run_agent.py / gateway/run.py / hermes_cli/config.py would
flip the stale-check to True and force a gateway restart on the
next message — even though no update actually happened. SHA
comparison decouples the two: git HEAD only moves on ``git pull``,
never on file writes.
"""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
_make_tmp_repo(tmp_path) # lay down sentinel files too
runner = _make_runner(tmp_path, boot_sha=sha)
# Simulate the agent editing run_agent.py and gateway/run.py with
# mtimes far into the future — exactly the scenario that used to
# false-positive the old mtime check.
future = time.time() + 10_000
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
p = tmp_path / rel
if p.is_file():
p.write_text("# agent just edited this\n")
os.utime(p, (future, future))
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
# Touch config.py 1s newer — within the 2s slack → not stale
os.utime(repo / "hermes_cli/config.py", (baseline + 1.0, baseline + 1.0))
# HEAD SHA has NOT moved — check must stay False.
runner._cached_current_sha_at = 0.0 # expire cache
assert runner._detect_stale_code() is False
# Touch 5s newer → stale
os.utime(repo / "hermes_cli/config.py", (baseline + 5.0, baseline + 5.0))
assert runner._detect_stale_code() is True
def test_detect_stale_code_false_for_non_git_install(tmp_path):
"""Non-git install (no .git dir) → check disabled, never fires."""
# No .git dir at all; runner's boot_sha is None
runner = _make_runner(tmp_path, boot_sha=None)
# Even if we pretended the current SHA differed, the check should
# short-circuit on boot_sha=None and return False.
assert runner._detect_stale_code() is False
def test_detect_stale_code_false_when_no_boot_wall_time(tmp_path):
"""No boot snapshot at all → can't tell → not stale (no restart loop)."""
runner = _make_runner(tmp_path, boot_sha="a" * 40, boot_wall=0.0)
assert runner._detect_stale_code() is False
def test_detect_stale_code_handles_disappearing_git_dir(tmp_path):
""".git vanishes mid-run → current_sha = None → not stale (don't loop)."""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
# Nuke the git dir after boot
import shutil
shutil.rmtree(tmp_path / ".git")
runner._cached_current_sha_at = 0.0 # expire cache
assert runner._detect_stale_code() is False
# ---------------------------------------------------------------------------
# SHA cache
# ---------------------------------------------------------------------------
def test_current_sha_cache_collapses_bursts(tmp_path, monkeypatch):
"""Consecutive calls inside the TTL window reuse the cached SHA."""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
read_calls = {"n": 0}
real_reader = _read_git_head_sha
def counting_reader(repo_root):
read_calls["n"] += 1
return real_reader(repo_root)
from gateway import run as run_mod
monkeypatch.setattr(run_mod, "_read_git_head_sha", counting_reader)
# Force cache expiry so the first call definitely reads
runner._cached_current_sha_at = 0.0
runner._current_git_sha_cached()
first_count = read_calls["n"]
# Immediate second/third calls should hit cache (no new read)
runner._current_git_sha_cached()
runner._current_git_sha_cached()
assert read_calls["n"] == first_count
def test_current_sha_cache_expires_after_ttl(tmp_path, monkeypatch):
"""After _GIT_SHA_CACHE_TTL_SECS elapses, a fresh read happens."""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
read_calls = {"n": 0}
real_reader = _read_git_head_sha
def counting_reader(repo_root):
read_calls["n"] += 1
return real_reader(repo_root)
from gateway import run as run_mod
monkeypatch.setattr(run_mod, "_read_git_head_sha", counting_reader)
runner._cached_current_sha_at = 0.0
runner._current_git_sha_cached()
first = read_calls["n"]
# Age the cache past the TTL
runner._cached_current_sha_at = time.time() - (_GIT_SHA_CACHE_TTL_SECS + 1.0)
runner._current_git_sha_cached()
assert read_calls["n"] == first + 1
# ---------------------------------------------------------------------------
# _trigger_stale_code_restart — idempotency preserved
# ---------------------------------------------------------------------------
def test_trigger_stale_code_restart_is_idempotent(tmp_path):
"""Calling _trigger_stale_code_restart twice only requests restart once."""
repo = _make_tmp_repo(tmp_path)
runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0)
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
calls = []
@@ -153,8 +356,9 @@ def test_trigger_stale_code_restart_is_idempotent(tmp_path):
def test_trigger_stale_code_restart_survives_request_failure(tmp_path):
"""If request_restart raises, we swallow and mark as triggered anyway."""
repo = _make_tmp_repo(tmp_path)
runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0)
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
def boom(*, detached=False, via_service=False):
raise RuntimeError("no event loop")
@@ -168,56 +372,41 @@ def test_trigger_stale_code_restart_survives_request_failure(tmp_path):
assert runner._stale_code_restart_triggered is True
def test_detect_stale_code_handles_disappearing_repo_root(tmp_path):
"""If the repo root vanishes after boot, return False (don't loop)."""
repo = _make_tmp_repo(tmp_path)
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
# Remove all sentinel files — _compute_repo_mtime returns 0.0
for rel in _STALE_CODE_SENTINELS:
(repo / rel).unlink(missing_ok=True)
assert runner._detect_stale_code() is False
# ---------------------------------------------------------------------------
# Class-level defaults — tests that build bare runners via object.__new__
# ---------------------------------------------------------------------------
def test_class_level_defaults_prevent_uninitialized_access():
"""Partial construction via object.__new__ must not crash _detect_stale_code."""
runner = object.__new__(GatewayRunner)
# Don't set any instance attrs — class-level defaults should kick in
runner._repo_root_for_staleness = Path(".")
# _boot_wall_time / _boot_repo_mtime fall through to class defaults (0.0)
# _boot_wall_time / _boot_git_sha fall through to class defaults
# (0.0 and None respectively)
assert runner._detect_stale_code() is False
# _stale_code_restart_triggered falls through to class default (False)
assert runner._stale_code_restart_triggered is False
def test_init_captures_boot_snapshot(monkeypatch, tmp_path):
"""GatewayRunner.__init__ captures a usable stale-code baseline."""
# Stub out the heavy parts of __init__ we don't need. We only want
# to prove the stale-code snapshot is captured before anything else.
from gateway import run as run_mod
# ---------------------------------------------------------------------------
# Legacy mtime reader kept for compatibility — light sanity check only
# ---------------------------------------------------------------------------
calls = {}
def test_compute_repo_mtime_still_returns_newest(tmp_path):
"""_compute_repo_mtime remains available for any legacy callers."""
repo = _make_tmp_repo(tmp_path)
def fake_compute(repo_root):
calls["repo_root"] = repo_root
return 1234567890.0
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
monkeypatch.setattr(run_mod, "_compute_repo_mtime", fake_compute)
newer = time.time()
os.utime(repo / "hermes_cli/config.py", (newer, newer))
# Build a runner without running the full __init__ — then manually
# exercise the stale-check init block that __init__ contains.
runner = object.__new__(GatewayRunner)
runner._boot_wall_time = time.time()
runner._repo_root_for_staleness = Path(run_mod.__file__).resolve().parent.parent
runner._boot_repo_mtime = run_mod._compute_repo_mtime(runner._repo_root_for_staleness)
runner._stale_code_notified = set()
runner._stale_code_restart_triggered = False
result = _compute_repo_mtime(repo)
assert abs(result - newer) < 1.0
assert runner._boot_repo_mtime == 1234567890.0
assert calls["repo_root"] == runner._repo_root_for_staleness
assert runner._boot_wall_time > 0
def test_compute_repo_mtime_missing_files_returns_zero(tmp_path):
"""Legacy sanity: missing sentinels → 0.0."""
assert _compute_repo_mtime(tmp_path) == 0.0