fix(codex-oauth): quarantine terminal refresh errors so dead tokens are not replayed across sessions
When a Codex OAuth refresh token is permanently invalidated (HTTP 400/401/403,
token revoked or reused), _mark_exhausted was called but auth.json was left with
the dead credentials. On the next session, _seed_from_singletons re-read
auth.json and re-seeded the pool with the same revoked token, triggering the
same terminal failure in a loop.
Add _is_terminal_codex_oauth_refresh_error to auth.py and a matching quarantine
block in _refresh_entry: when a terminal error is detected and auth.json holds
no newer tokens, clear access_token/refresh_token from auth.json and remove all
device_code-sourced pool entries from memory. Mirrors the Nous quarantine added
in c90556262 and the xAI quarantine in #28116.
Also add a pre-refresh sync from auth.json before calling refresh_codex_oauth_pure,
matching the xAI and Nous patterns, to avoid refresh_token_reused races when
multiple Hermes processes share the same auth.json singleton.
Salvaged from #27911 by @EloquentBrush0x — contributor's branch was severely
stale (would have reverted ~5000 LOC across azure/kanban/i18n subsystems);
fix re-applied surgically on current main with their predicate and tests preserved.
This commit is contained in:
@@ -797,6 +797,13 @@ class CredentialPool:
|
|||||||
except Exception as wexc:
|
except Exception as wexc:
|
||||||
logger.debug("Failed to write refreshed token to credentials file: %s", wexc)
|
logger.debug("Failed to write refreshed token to credentials file: %s", wexc)
|
||||||
elif self.provider == "openai-codex":
|
elif self.provider == "openai-codex":
|
||||||
|
# Adopt fresher tokens from auth.json before spending the
|
||||||
|
# refresh_token — single-use tokens consumed by another Hermes
|
||||||
|
# process sharing the same auth.json singleton would otherwise
|
||||||
|
# trigger ``refresh_token_reused`` on the next POST.
|
||||||
|
synced = self._sync_codex_entry_from_auth_store(entry)
|
||||||
|
if synced is not entry:
|
||||||
|
entry = synced
|
||||||
refreshed = auth_mod.refresh_codex_oauth_pure(
|
refreshed = auth_mod.refresh_codex_oauth_pure(
|
||||||
entry.access_token,
|
entry.access_token,
|
||||||
entry.refresh_token,
|
entry.refresh_token,
|
||||||
@@ -951,6 +958,72 @@ class CredentialPool:
|
|||||||
self._current_id = None
|
self._current_id = None
|
||||||
self._persist()
|
self._persist()
|
||||||
return None
|
return None
|
||||||
|
# For openai-codex: same race as xAI/nous — another Hermes process
|
||||||
|
# may have consumed the refresh token between our proactive sync
|
||||||
|
# and the HTTP call. Re-check auth.json and adopt the fresh tokens
|
||||||
|
# if they have rotated since.
|
||||||
|
if self.provider == "openai-codex":
|
||||||
|
synced = self._sync_codex_entry_from_auth_store(entry)
|
||||||
|
if synced.refresh_token != entry.refresh_token:
|
||||||
|
logger.debug(
|
||||||
|
"Codex OAuth refresh failed but auth.json has newer tokens — adopting"
|
||||||
|
)
|
||||||
|
updated = replace(
|
||||||
|
synced,
|
||||||
|
last_status=STATUS_OK,
|
||||||
|
last_status_at=None,
|
||||||
|
last_error_code=None,
|
||||||
|
last_error_reason=None,
|
||||||
|
last_error_message=None,
|
||||||
|
last_error_reset_at=None,
|
||||||
|
)
|
||||||
|
self._replace_entry(synced, updated)
|
||||||
|
self._persist()
|
||||||
|
return updated
|
||||||
|
# Terminal error: auth.json has no newer tokens — the stored
|
||||||
|
# refresh_token is dead. Clear it from auth.json so the next
|
||||||
|
# session does not re-seed the same revoked credentials, and
|
||||||
|
# remove all singleton-seeded (device_code) entries from the
|
||||||
|
# in-memory pool. Mirrors the xAI and Nous quarantine paths.
|
||||||
|
if auth_mod._is_terminal_codex_oauth_refresh_error(exc):
|
||||||
|
logger.debug(
|
||||||
|
"Codex OAuth refresh token is terminally invalid; clearing local token state"
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
with _auth_store_lock():
|
||||||
|
auth_store = _load_auth_store()
|
||||||
|
state = _load_provider_state(auth_store, "openai-codex") or {}
|
||||||
|
if isinstance(state, dict):
|
||||||
|
tokens = state.get("tokens") or {}
|
||||||
|
if isinstance(tokens, dict):
|
||||||
|
store_refresh = str(tokens.get("refresh_token") or "").strip()
|
||||||
|
entry_refresh = str(entry.refresh_token or "").strip()
|
||||||
|
if not store_refresh or store_refresh == entry_refresh:
|
||||||
|
tokens.pop("access_token", None)
|
||||||
|
tokens.pop("refresh_token", None)
|
||||||
|
state["tokens"] = tokens
|
||||||
|
state["last_auth_error"] = {
|
||||||
|
"provider": "openai-codex",
|
||||||
|
"code": getattr(exc, "code", "unknown"),
|
||||||
|
"message": str(exc),
|
||||||
|
"reason": "credential_pool_refresh_failure",
|
||||||
|
"relogin_required": True,
|
||||||
|
"at": datetime.now(timezone.utc).isoformat(),
|
||||||
|
}
|
||||||
|
_save_provider_state(auth_store, "openai-codex", state)
|
||||||
|
_save_auth_store(auth_store)
|
||||||
|
except Exception as clear_exc:
|
||||||
|
logger.debug(
|
||||||
|
"Failed to clear terminal Codex OAuth state: %s", clear_exc
|
||||||
|
)
|
||||||
|
self._entries = [
|
||||||
|
item for item in self._entries
|
||||||
|
if item.source != "device_code"
|
||||||
|
]
|
||||||
|
if self._current_id == entry.id:
|
||||||
|
self._current_id = None
|
||||||
|
self._persist()
|
||||||
|
return None
|
||||||
# For nous: another process may have consumed the refresh token
|
# For nous: another process may have consumed the refresh token
|
||||||
# between our proactive sync and the HTTP call. Re-sync from
|
# between our proactive sync and the HTTP call. Re-sync from
|
||||||
# auth.json and adopt the fresh tokens if available.
|
# auth.json and adopt the fresh tokens if available.
|
||||||
|
|||||||
@@ -4061,6 +4061,29 @@ def _is_terminal_xai_oauth_refresh_error(exc: Exception) -> bool:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_terminal_codex_oauth_refresh_error(exc: Exception) -> bool:
|
||||||
|
"""True when retrying the same Codex OAuth refresh token cannot succeed.
|
||||||
|
|
||||||
|
``codex_refresh_failed`` covers HTTP 400/401/403 from the token endpoint
|
||||||
|
(invalid_grant, token revoked, refresh_token_reused).
|
||||||
|
``codex_auth_missing_refresh_token`` means the pool entry has no refresh
|
||||||
|
token at all — retrying will never work.
|
||||||
|
Both carry ``relogin_required=True``; transient failures (429, 5xx) do not.
|
||||||
|
"""
|
||||||
|
return (
|
||||||
|
isinstance(exc, AuthError)
|
||||||
|
and exc.provider == "openai-codex"
|
||||||
|
and exc.code in {
|
||||||
|
"codex_refresh_failed",
|
||||||
|
"codex_auth_missing_refresh_token",
|
||||||
|
"invalid_grant",
|
||||||
|
"invalid_token",
|
||||||
|
"refresh_token_reused",
|
||||||
|
}
|
||||||
|
and bool(exc.relogin_required)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _quarantine_nous_oauth_state(
|
def _quarantine_nous_oauth_state(
|
||||||
state: Dict[str, Any],
|
state: Dict[str, Any],
|
||||||
error: AuthError,
|
error: AuthError,
|
||||||
|
|||||||
@@ -1963,3 +1963,144 @@ def test_xai_oauth_nonterminal_refresh_does_not_quarantine(tmp_path, monkeypatch
|
|||||||
tokens = auth_payload["providers"]["xai-oauth"].get("tokens", {})
|
tokens = auth_payload["providers"]["xai-oauth"].get("tokens", {})
|
||||||
assert tokens.get("access_token") == "old-access-token"
|
assert tokens.get("access_token") == "old-access-token"
|
||||||
assert tokens.get("refresh_token") == "old-refresh-token"
|
assert tokens.get("refresh_token") == "old-refresh-token"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Codex OAuth terminal error quarantine
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _codex_auth_store(access_token: str, refresh_token: str) -> dict:
|
||||||
|
return {
|
||||||
|
"version": 1,
|
||||||
|
"active_provider": "openai-codex",
|
||||||
|
"providers": {
|
||||||
|
"openai-codex": {
|
||||||
|
"tokens": {
|
||||||
|
"access_token": access_token,
|
||||||
|
"refresh_token": refresh_token,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_terminal_codex_oauth_refresh_error():
|
||||||
|
from hermes_cli.auth import AuthError, _is_terminal_codex_oauth_refresh_error
|
||||||
|
|
||||||
|
assert _is_terminal_codex_oauth_refresh_error(
|
||||||
|
AuthError("Refresh failed", provider="openai-codex", code="codex_refresh_failed", relogin_required=True)
|
||||||
|
)
|
||||||
|
assert _is_terminal_codex_oauth_refresh_error(
|
||||||
|
AuthError("No token", provider="openai-codex", code="codex_auth_missing_refresh_token", relogin_required=True)
|
||||||
|
)
|
||||||
|
assert _is_terminal_codex_oauth_refresh_error(
|
||||||
|
AuthError("Revoked", provider="openai-codex", code="invalid_grant", relogin_required=True)
|
||||||
|
)
|
||||||
|
assert _is_terminal_codex_oauth_refresh_error(
|
||||||
|
AuthError("Reused", provider="openai-codex", code="refresh_token_reused", relogin_required=True)
|
||||||
|
)
|
||||||
|
# transient 429/5xx: relogin_required=False -> not terminal
|
||||||
|
assert not _is_terminal_codex_oauth_refresh_error(
|
||||||
|
AuthError("Rate limit", provider="openai-codex", code="codex_refresh_failed", relogin_required=False)
|
||||||
|
)
|
||||||
|
# xAI error does not trigger Codex check
|
||||||
|
assert not _is_terminal_codex_oauth_refresh_error(
|
||||||
|
AuthError("Revoked", provider="xai-oauth", code="xai_refresh_failed", relogin_required=True)
|
||||||
|
)
|
||||||
|
# Generic exception
|
||||||
|
assert not _is_terminal_codex_oauth_refresh_error(ValueError("oops"))
|
||||||
|
|
||||||
|
|
||||||
|
def test_codex_oauth_terminal_refresh_clears_auth_json_and_removes_pool_entries(
|
||||||
|
tmp_path, monkeypatch
|
||||||
|
):
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
|
||||||
|
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
|
||||||
|
monkeypatch.delenv("CODEX_OAUTH_ACCESS_TOKEN", raising=False)
|
||||||
|
|
||||||
|
_write_auth_store(tmp_path, _codex_auth_store("old-access-token", "old-refresh-token"))
|
||||||
|
|
||||||
|
from agent.credential_pool import PooledCredential, load_pool
|
||||||
|
import hermes_cli.auth as auth_mod
|
||||||
|
from hermes_cli.auth import AuthError
|
||||||
|
|
||||||
|
pool = load_pool("openai-codex")
|
||||||
|
selected = pool.select()
|
||||||
|
assert selected is not None
|
||||||
|
assert selected.source == "device_code"
|
||||||
|
|
||||||
|
# Add a manual API-key entry that must survive the quarantine.
|
||||||
|
pool.add_entry(PooledCredential.from_dict("openai-codex", {
|
||||||
|
"id": "manual-key",
|
||||||
|
"source": "manual",
|
||||||
|
"auth_type": "api_key",
|
||||||
|
"access_token": "manual-codex-key",
|
||||||
|
}))
|
||||||
|
|
||||||
|
refresh_calls = {"count": 0}
|
||||||
|
|
||||||
|
def _terminal_refresh_failure(*_args, **_kwargs):
|
||||||
|
refresh_calls["count"] += 1
|
||||||
|
raise AuthError(
|
||||||
|
"Refresh session has been revoked",
|
||||||
|
provider="openai-codex",
|
||||||
|
code="codex_refresh_failed",
|
||||||
|
relogin_required=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
monkeypatch.setattr(auth_mod, "refresh_codex_oauth_pure", _terminal_refresh_failure)
|
||||||
|
|
||||||
|
assert pool.try_refresh_current() is None
|
||||||
|
|
||||||
|
# Only the manual entry survives.
|
||||||
|
assert [entry.id for entry in pool.entries()] == ["manual-key"]
|
||||||
|
|
||||||
|
# Auth.json tokens must be cleared.
|
||||||
|
auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
|
||||||
|
codex_state = auth_payload["providers"]["openai-codex"]
|
||||||
|
tokens = codex_state.get("tokens", {})
|
||||||
|
assert not tokens.get("access_token")
|
||||||
|
assert not tokens.get("refresh_token")
|
||||||
|
assert codex_state["last_auth_error"]["code"] == "codex_refresh_failed"
|
||||||
|
assert codex_state["last_auth_error"]["relogin_required"] is True
|
||||||
|
|
||||||
|
# Persisted pool must also have only the manual entry.
|
||||||
|
assert [entry["id"] for entry in auth_payload["credential_pool"]["openai-codex"]] == ["manual-key"]
|
||||||
|
|
||||||
|
# A second try_refresh_current must not call refresh_codex_oauth_pure again.
|
||||||
|
assert pool.try_refresh_current() is None
|
||||||
|
assert refresh_calls["count"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_codex_oauth_nonterminal_refresh_does_not_quarantine(tmp_path, monkeypatch):
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
|
||||||
|
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
|
||||||
|
monkeypatch.delenv("CODEX_OAUTH_ACCESS_TOKEN", raising=False)
|
||||||
|
|
||||||
|
_write_auth_store(tmp_path, _codex_auth_store("old-access-token", "old-refresh-token"))
|
||||||
|
|
||||||
|
from agent.credential_pool import load_pool
|
||||||
|
import hermes_cli.auth as auth_mod
|
||||||
|
from hermes_cli.auth import AuthError
|
||||||
|
|
||||||
|
pool = load_pool("openai-codex")
|
||||||
|
assert pool.select() is not None
|
||||||
|
|
||||||
|
def _transient_failure(*_args, **_kwargs):
|
||||||
|
raise AuthError(
|
||||||
|
"Rate limited",
|
||||||
|
provider="openai-codex",
|
||||||
|
code="codex_refresh_failed",
|
||||||
|
relogin_required=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
monkeypatch.setattr(auth_mod, "refresh_codex_oauth_pure", _transient_failure)
|
||||||
|
|
||||||
|
pool.try_refresh_current()
|
||||||
|
|
||||||
|
# Tokens must NOT be cleared from auth.json.
|
||||||
|
auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
|
||||||
|
tokens = auth_payload["providers"]["openai-codex"].get("tokens", {})
|
||||||
|
assert tokens.get("access_token") == "old-access-token"
|
||||||
|
assert tokens.get("refresh_token") == "old-refresh-token"
|
||||||
|
|||||||
Reference in New Issue
Block a user