fix(codex-oauth): quarantine terminal refresh errors so dead tokens are not replayed across sessions

When a Codex OAuth refresh token is permanently invalidated (HTTP 400/401/403, token revoked or reused), _mark_exhausted was called but auth.json was left with the dead credentials. On the next session, _seed_from_singletons re-read auth.json and re-seeded the pool with the same revoked token, triggering the same terminal failure in a loop. Add _is_terminal_codex_oauth_refresh_error to auth.py and a matching quarantine block in _refresh_entry: when a terminal error is detected and auth.json holds no newer tokens, clear access_token/refresh_token from auth.json and remove all device_code-sourced pool entries from memory. Mirrors the Nous quarantine added in c90556262 and the xAI quarantine in #28116. Also add a pre-refresh sync from auth.json before calling refresh_codex_oauth_pure, matching the xAI and Nous patterns, to avoid refresh_token_reused races when multiple Hermes processes share the same auth.json singleton. Salvaged from #27911 by @EloquentBrush0x — contributor's branch was severely stale (would have reverted ~5000 LOC across azure/kanban/i18n subsystems); fix re-applied surgically on current main with their predicate and tests preserved.
2026-05-18 10:31:13 -07:00
parent 9aae59feab
commit b570e0fdd0
3 changed files with 237 additions and 0 deletions
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@@ -797,6 +797,13 @@ class CredentialPool:
                    except Exception as wexc:
                        logger.debug("Failed to write refreshed token to credentials file: %s", wexc)
            elif self.provider == "openai-codex":
                # Adopt fresher tokens from auth.json before spending the
                # refresh_token — single-use tokens consumed by another Hermes
                # process sharing the same auth.json singleton would otherwise
                # trigger ``refresh_token_reused`` on the next POST.
                synced = self._sync_codex_entry_from_auth_store(entry)
                if synced is not entry:
                    entry = synced
                refreshed = auth_mod.refresh_codex_oauth_pure(
                    entry.access_token,
                    entry.refresh_token,
@@ -951,6 +958,72 @@ class CredentialPool:
                        self._current_id = None
                    self._persist()
                    return None
            # For openai-codex: same race as xAI/nous — another Hermes process
            # may have consumed the refresh token between our proactive sync
            # and the HTTP call.  Re-check auth.json and adopt the fresh tokens
            # if they have rotated since.
            if self.provider == "openai-codex":
                synced = self._sync_codex_entry_from_auth_store(entry)
                if synced.refresh_token != entry.refresh_token:
                    logger.debug(
                        "Codex OAuth refresh failed but auth.json has newer tokens — adopting"
                    )
                    updated = replace(
                        synced,
                        last_status=STATUS_OK,
                        last_status_at=None,
                        last_error_code=None,
                        last_error_reason=None,
                        last_error_message=None,
                        last_error_reset_at=None,
                    )
                    self._replace_entry(synced, updated)
                    self._persist()
                    return updated
                # Terminal error: auth.json has no newer tokens — the stored
                # refresh_token is dead.  Clear it from auth.json so the next
                # session does not re-seed the same revoked credentials, and
                # remove all singleton-seeded (device_code) entries from the
                # in-memory pool.  Mirrors the xAI and Nous quarantine paths.
                if auth_mod._is_terminal_codex_oauth_refresh_error(exc):
                    logger.debug(
                        "Codex OAuth refresh token is terminally invalid; clearing local token state"
                    )
                    try:
                        with _auth_store_lock():
                            auth_store = _load_auth_store()
                            state = _load_provider_state(auth_store, "openai-codex") or {}
                            if isinstance(state, dict):
                                tokens = state.get("tokens") or {}
                                if isinstance(tokens, dict):
                                    store_refresh = str(tokens.get("refresh_token") or "").strip()
                                    entry_refresh = str(entry.refresh_token or "").strip()
                                    if not store_refresh or store_refresh == entry_refresh:
                                        tokens.pop("access_token", None)
                                        tokens.pop("refresh_token", None)
                                        state["tokens"] = tokens
                                        state["last_auth_error"] = {
                                            "provider": "openai-codex",
                                            "code": getattr(exc, "code", "unknown"),
                                            "message": str(exc),
                                            "reason": "credential_pool_refresh_failure",
                                            "relogin_required": True,
                                            "at": datetime.now(timezone.utc).isoformat(),
                                        }
                                        _save_provider_state(auth_store, "openai-codex", state)
                                        _save_auth_store(auth_store)
                    except Exception as clear_exc:
                        logger.debug(
                            "Failed to clear terminal Codex OAuth state: %s", clear_exc
                        )
                    self._entries = [
                        item for item in self._entries
                        if item.source != "device_code"
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
                    self._persist()
                    return None
            # For nous: another process may have consumed the refresh token
            # between our proactive sync and the HTTP call.  Re-sync from
            # auth.json and adopt the fresh tokens if available.
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@@ -4061,6 +4061,29 @@ def _is_terminal_xai_oauth_refresh_error(exc: Exception) -> bool:
    )
 def _is_terminal_codex_oauth_refresh_error(exc: Exception) -> bool:
    """True when retrying the same Codex OAuth refresh token cannot succeed.
    ``codex_refresh_failed`` covers HTTP 400/401/403 from the token endpoint
    (invalid_grant, token revoked, refresh_token_reused).
    ``codex_auth_missing_refresh_token`` means the pool entry has no refresh
    token at all — retrying will never work.
    Both carry ``relogin_required=True``; transient failures (429, 5xx) do not.
    """
    return (
        isinstance(exc, AuthError)
        and exc.provider == "openai-codex"
        and exc.code in {
            "codex_refresh_failed",
            "codex_auth_missing_refresh_token",
            "invalid_grant",
            "invalid_token",
            "refresh_token_reused",
        }
        and bool(exc.relogin_required)
    )
 def _quarantine_nous_oauth_state(
    state: Dict[str, Any],
    error: AuthError,
--- a/tests/agent/test_credential_pool.py
+++ b/tests/agent/test_credential_pool.py
@@ -1963,3 +1963,144 @@ def test_xai_oauth_nonterminal_refresh_does_not_quarantine(tmp_path, monkeypatch
    tokens = auth_payload["providers"]["xai-oauth"].get("tokens", {})
    assert tokens.get("access_token") == "old-access-token"
    assert tokens.get("refresh_token") == "old-refresh-token"
 # ---------------------------------------------------------------------------
 # Codex OAuth terminal error quarantine
 # ---------------------------------------------------------------------------
 def _codex_auth_store(access_token: str, refresh_token: str) -> dict:
    return {
        "version": 1,
        "active_provider": "openai-codex",
        "providers": {
            "openai-codex": {
                "tokens": {
                    "access_token": access_token,
                    "refresh_token": refresh_token,
                },
            }
        },
    }
 def test_is_terminal_codex_oauth_refresh_error():
    from hermes_cli.auth import AuthError, _is_terminal_codex_oauth_refresh_error
    assert _is_terminal_codex_oauth_refresh_error(
        AuthError("Refresh failed", provider="openai-codex", code="codex_refresh_failed", relogin_required=True)
    )
    assert _is_terminal_codex_oauth_refresh_error(
        AuthError("No token", provider="openai-codex", code="codex_auth_missing_refresh_token", relogin_required=True)
    )
    assert _is_terminal_codex_oauth_refresh_error(
        AuthError("Revoked", provider="openai-codex", code="invalid_grant", relogin_required=True)
    )
    assert _is_terminal_codex_oauth_refresh_error(
        AuthError("Reused", provider="openai-codex", code="refresh_token_reused", relogin_required=True)
    )
    # transient 429/5xx: relogin_required=False -> not terminal
    assert not _is_terminal_codex_oauth_refresh_error(
        AuthError("Rate limit", provider="openai-codex", code="codex_refresh_failed", relogin_required=False)
    )
    # xAI error does not trigger Codex check
    assert not _is_terminal_codex_oauth_refresh_error(
        AuthError("Revoked", provider="xai-oauth", code="xai_refresh_failed", relogin_required=True)
    )
    # Generic exception
    assert not _is_terminal_codex_oauth_refresh_error(ValueError("oops"))
 def test_codex_oauth_terminal_refresh_clears_auth_json_and_removes_pool_entries(
    tmp_path, monkeypatch
 ):
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
    monkeypatch.delenv("OPENAI_API_KEY", raising=False)
    monkeypatch.delenv("CODEX_OAUTH_ACCESS_TOKEN", raising=False)
    _write_auth_store(tmp_path, _codex_auth_store("old-access-token", "old-refresh-token"))
    from agent.credential_pool import PooledCredential, load_pool
    import hermes_cli.auth as auth_mod
    from hermes_cli.auth import AuthError
    pool = load_pool("openai-codex")
    selected = pool.select()
    assert selected is not None
    assert selected.source == "device_code"
    # Add a manual API-key entry that must survive the quarantine.
    pool.add_entry(PooledCredential.from_dict("openai-codex", {
        "id": "manual-key",
        "source": "manual",
        "auth_type": "api_key",
        "access_token": "manual-codex-key",
    }))
    refresh_calls = {"count": 0}
    def _terminal_refresh_failure(*_args, **_kwargs):
        refresh_calls["count"] += 1
        raise AuthError(
            "Refresh session has been revoked",
            provider="openai-codex",
            code="codex_refresh_failed",
            relogin_required=True,
        )
    monkeypatch.setattr(auth_mod, "refresh_codex_oauth_pure", _terminal_refresh_failure)
    assert pool.try_refresh_current() is None
    # Only the manual entry survives.
    assert [entry.id for entry in pool.entries()] == ["manual-key"]
    # Auth.json tokens must be cleared.
    auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
    codex_state = auth_payload["providers"]["openai-codex"]
    tokens = codex_state.get("tokens", {})
    assert not tokens.get("access_token")
    assert not tokens.get("refresh_token")
    assert codex_state["last_auth_error"]["code"] == "codex_refresh_failed"
    assert codex_state["last_auth_error"]["relogin_required"] is True
    # Persisted pool must also have only the manual entry.
    assert [entry["id"] for entry in auth_payload["credential_pool"]["openai-codex"]] == ["manual-key"]
    # A second try_refresh_current must not call refresh_codex_oauth_pure again.
    assert pool.try_refresh_current() is None
    assert refresh_calls["count"] == 1
 def test_codex_oauth_nonterminal_refresh_does_not_quarantine(tmp_path, monkeypatch):
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
    monkeypatch.delenv("OPENAI_API_KEY", raising=False)
    monkeypatch.delenv("CODEX_OAUTH_ACCESS_TOKEN", raising=False)
    _write_auth_store(tmp_path, _codex_auth_store("old-access-token", "old-refresh-token"))
    from agent.credential_pool import load_pool
    import hermes_cli.auth as auth_mod
    from hermes_cli.auth import AuthError
    pool = load_pool("openai-codex")
    assert pool.select() is not None
    def _transient_failure(*_args, **_kwargs):
        raise AuthError(
            "Rate limited",
            provider="openai-codex",
            code="codex_refresh_failed",
            relogin_required=False,
        )
    monkeypatch.setattr(auth_mod, "refresh_codex_oauth_pure", _transient_failure)
    pool.try_refresh_current()
    # Tokens must NOT be cleared from auth.json.
    auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
    tokens = auth_payload["providers"]["openai-codex"].get("tokens", {})
    assert tokens.get("access_token") == "old-access-token"
    assert tokens.get("refresh_token") == "old-refresh-token"