fix(codex): resync pool entry from auth.json after reauth (#17001)

When openai-codex tokens expire or the ChatGPT account hits a 429 window, the pool entry gets marked STATUS_EXHAUSTED with last_error_reset_at many hours in the future. If the user then runs `hermes model` / `hermes auth openai-codex` to reauth, fresh tokens land in ~/.hermes/auth.json, but the pool entry stays frozen behind its reset_at — every request keeps failing with 'credential pool: no available entries (all exhausted or empty)' until the original window elapses. _available_entries() already had auth.json/credentials-file resync branches for anthropic/claude_code and nous/device_code; openai-codex was missing. This commit adds _sync_codex_entry_from_auth_store(), mirroring the nous version (it reads state["tokens"]["access_token"], state["tokens"]["refresh_token"], and state["last_refresh"]), and wires it into the exhausted-entry resync loop. It also softens the 'codex CLI not found' doctor warning, downgrading it to an info line: native device-code OAuth does not require the Codex binary; only importing existing Codex CLI tokens does. Reported on Discord by p1aceho1der: Codex stalled indefinitely after a rate-limit reset, reauth didn't help, and doctor falsely warned that the codex CLI was required. Co-authored-by: teknium1 <teknium@users.noreply.github.com>
2026-04-28 05:43:09 -07:00
parent 529eb29b6a
commit 06164a7b28
3 changed files with 224 additions and 1 deletions
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@@ -456,6 +456,70 @@ class CredentialPool:
            logger.debug("Failed to sync from credentials file: %s", exc)
        return entry

+    def _sync_codex_entry_from_auth_store(self, entry: PooledCredential) -> PooledCredential:
+        """Sync a Codex device_code pool entry from auth.json if tokens differ.
+
+        When a Codex OAuth access token expires (or the ChatGPT account hits
+        its 5h/weekly quota), the pool entry gets marked ``STATUS_EXHAUSTED``
+        with a ``last_error_reset_at`` that can be many hours in the future.
+        Meanwhile the user may run ``hermes model`` / ``hermes auth`` which
+        performs a fresh device-code login and writes new tokens to
+        ``auth.json`` under ``_auth_store_lock``.  Without this sync the pool
+        entry stays frozen until ``last_error_reset_at`` elapses — even
+        though fresh credentials are sitting on disk — and every request
+        fails with "no available entries (all exhausted or empty)".
+
+        Mirrors the Nous/Anthropic resync paths above.  Only applies to
+        device_code-sourced entries; env/API-key-sourced entries have no
+        auth.json shadow to sync from.
+        """
+        if self.provider != "openai-codex" or entry.source != "device_code":
+            return entry
+        try:
+            with _auth_store_lock():
+                auth_store = _load_auth_store()
+                state = _load_provider_state(auth_store, "openai-codex")
+            if not isinstance(state, dict):
+                return entry
+            tokens = state.get("tokens")
+            if not isinstance(tokens, dict):
+                return entry
+            store_access = tokens.get("access_token", "")
+            store_refresh = tokens.get("refresh_token", "")
+            # Adopt auth.json tokens when either side differs.  Codex refresh
+            # tokens are single-use too, so a fresh refresh_token from
+            # another process means our entry's pair is consumed/stale.
+            entry_access = entry.access_token or ""
+            entry_refresh = entry.refresh_token or ""
+            if store_access and (
+                store_access != entry_access
+                or (store_refresh and store_refresh != entry_refresh)
+            ):
+                logger.debug(
+                    "Pool entry %s: syncing Codex tokens from auth.json "
+                    "(refreshed by another process)",
+                    entry.id,
+                )
+                field_updates: Dict[str, Any] = {
+                    "access_token": store_access,
+                    "refresh_token": store_refresh or entry.refresh_token,
+                    "last_status": None,
+                    "last_status_at": None,
+                    "last_error_code": None,
+                    "last_error_reason": None,
+                    "last_error_message": None,
+                    "last_error_reset_at": None,
+                }
+                if state.get("last_refresh"):
+                    field_updates["last_refresh"] = state["last_refresh"]
+                updated = replace(entry, **field_updates)
+                self._replace_entry(entry, updated)
+                self._persist()
+                return updated
+        except Exception as exc:
+            logger.debug("Failed to sync Codex entry from auth.json: %s", exc)
+        return entry
+
    def _sync_nous_entry_from_auth_store(self, entry: PooledCredential) -> PooledCredential:
        """Sync a Nous pool entry from auth.json if tokens differ.

@@ -788,6 +852,18 @@ class CredentialPool:
                if synced is not entry:
                    entry = synced
                    cleared_any = True
+            # For openai-codex entries, same pattern: the user may have
+            # re-authed via `hermes model` / `hermes auth` after a 429/401,
+            # leaving fresh tokens on disk while the pool entry is still
+            # frozen behind last_error_reset_at (can be hours in the
+            # future for ChatGPT weekly windows).
+            if (self.provider == "openai-codex"
+                    and entry.source == "device_code"
+                    and entry.last_status == STATUS_EXHAUSTED):
+                synced = self._sync_codex_entry_from_auth_store(entry)
+                if synced is not entry:
+                    entry = synced
+                    cleared_any = True
            if entry.last_status == STATUS_EXHAUSTED:
                exhausted_until = _exhausted_until(entry)
                if exhausted_until is not None and now < exhausted_until:
--- a/hermes_cli/doctor.py
+++ b/hermes_cli/doctor.py
@@ -517,7 +517,14 @@ def run_doctor(args):
    if shutil.which("codex"):
        check_ok("codex CLI")
    else:
-        check_warn("codex CLI not found", "(required for openai-codex login)")
+        # Native OAuth uses Hermes' own device-code flow — the Codex CLI is
+        # only needed if you want to import existing tokens from
+        # ~/.codex/auth.json.  Downgrade to info so users running
+        # `hermes auth openai-codex` aren't told they're missing something.
+        check_info(
+            "codex CLI not installed "
+            "(optional — only required to import tokens from an existing Codex CLI login)"
+        )

    # =========================================================================
    # Check: Directory structure
--- a/tests/agent/test_credential_pool.py
+++ b/tests/agent/test_credential_pool.py
@@ -1370,3 +1370,143 @@ def test_nous_exhausted_entry_recovers_via_auth_store_sync(tmp_path, monkeypatch
    assert len(available) == 1
    assert available[0].refresh_token == "refresh-FRESH"
    assert available[0].last_status is None
+
+
+# ── OpenAI Codex OAuth cross-process sync tests ────────────────────────────
+
+def _codex_auth_store(access: str, refresh: str) -> dict:
+    return {
+        "version": 1,
+        "active_provider": "openai-codex",
+        "providers": {
+            "openai-codex": {
+                "auth_mode": "chatgpt",
+                "tokens": {
+                    "access_token": access,
+                    "refresh_token": refresh,
+                    "id_token": "id-" + access,
+                },
+                "last_refresh": "2026-04-28T00:00:00Z",
+            }
+        },
+    }
+
+
+def test_sync_codex_entry_from_auth_store_adopts_newer_tokens(tmp_path, monkeypatch):
+    """When auth.json has newer Codex tokens, the pool entry should adopt them."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
+    _write_auth_store(tmp_path, _codex_auth_store("access-OLD", "refresh-OLD"))
+
+    from agent.credential_pool import load_pool
+
+    pool = load_pool("openai-codex")
+    entry = pool.select()
+    assert entry is not None
+    assert entry.access_token == "access-OLD"
+    assert entry.refresh_token == "refresh-OLD"
+
+    # Simulate `hermes auth openai-codex` replacing the token pair on disk.
+    _write_auth_store(tmp_path, _codex_auth_store("access-NEW", "refresh-NEW"))
+
+    synced = pool._sync_codex_entry_from_auth_store(entry)
+    assert synced is not entry
+    assert synced.access_token == "access-NEW"
+    assert synced.refresh_token == "refresh-NEW"
+    assert synced.last_status is None
+    assert synced.last_error_code is None
+    assert synced.last_error_reset_at is None
+
+
+def test_sync_codex_entry_noop_when_tokens_match(tmp_path, monkeypatch):
+    """When auth.json has the same tokens, sync should be a no-op."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
+    _write_auth_store(tmp_path, _codex_auth_store("access-same", "refresh-same"))
+
+    from agent.credential_pool import load_pool
+
+    pool = load_pool("openai-codex")
+    entry = pool.select()
+    assert entry is not None
+
+    synced = pool._sync_codex_entry_from_auth_store(entry)
+    assert synced is entry
+
+
+def test_codex_exhausted_entry_recovers_via_auth_store_sync(tmp_path, monkeypatch):
+    """An exhausted Codex entry should recover when auth.json has newer tokens.
+
+    Reproduces the Discord report (p1aceho1der, Apr 2026): after a Codex
+    rate-limit reset the user ran `hermes model` to reauth, but the pool
+    entry stayed marked EXHAUSTED with last_error_reset_at many hours in
+    the future — so `_available_entries` kept returning empty and every
+    request failed with "no available entries (all exhausted or empty)".
+    """
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
+    from agent.credential_pool import load_pool, STATUS_EXHAUSTED
+    from dataclasses import replace as dc_replace
+
+    _write_auth_store(tmp_path, _codex_auth_store("access-OLD", "refresh-OLD"))
+
+    pool = load_pool("openai-codex")
+    entry = pool.select()
+    assert entry is not None
+
+    # Mark entry as exhausted with last_error_reset_at one hour in the
+    # future (Codex 429 weekly-window pattern).
+    now = time.time()
+    exhausted = dc_replace(
+        entry,
+        last_status=STATUS_EXHAUSTED,
+        last_status_at=now,
+        last_error_code=429,
+        last_error_reset_at=now + 3600,
+    )
+    pool._replace_entry(entry, exhausted)
+    pool._persist()
+
+    # Sanity: before the reauth, _available_entries refuses to return
+    # this entry because last_error_reset_at is in the future.
+    # (clear_expired would only clear it AFTER exhausted_until elapsed.)
+    available_before = pool._available_entries(clear_expired=True, refresh=False)
+    assert available_before == []
+
+    # Simulate `hermes model` / `hermes auth` refreshing the tokens.
+    _write_auth_store(tmp_path, _codex_auth_store("access-FRESH", "refresh-FRESH"))
+
+    available = pool._available_entries(clear_expired=True, refresh=False)
+    assert len(available) == 1
+    assert available[0].access_token == "access-FRESH"
+    assert available[0].refresh_token == "refresh-FRESH"
+    assert available[0].last_status is None
+    assert available[0].last_error_reset_at is None
+
+
+def test_codex_exhausted_entry_stays_stuck_without_auth_store_update(tmp_path, monkeypatch):
+    """Regression guard: if auth.json tokens haven't changed, the exhausted
+    entry must stay stuck behind its reset window — sync must not spuriously
+    clear status just because the entry is STATUS_EXHAUSTED."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
+    from agent.credential_pool import load_pool, STATUS_EXHAUSTED
+    from dataclasses import replace as dc_replace
+
+    _write_auth_store(tmp_path, _codex_auth_store("access-same", "refresh-same"))
+
+    pool = load_pool("openai-codex")
+    entry = pool.select()
+    assert entry is not None
+
+    now = time.time()
+    exhausted = dc_replace(
+        entry,
+        last_status=STATUS_EXHAUSTED,
+        last_status_at=now,
+        last_error_code=429,
+        last_error_reset_at=now + 3600,
+    )
+    pool._replace_entry(entry, exhausted)
+    pool._persist()
+
+    # auth.json unchanged → sync returns same entry → exhausted_until check
+    # still skips it.
+    available = pool._available_entries(clear_expired=True, refresh=False)
+    assert available == []