fix(codex): resync pool entry from auth.json after reauth (#17001)
When openai-codex tokens expire or the ChatGPT account hits a 429
window, the pool entry gets marked STATUS_EXHAUSTED with
last_error_reset_at many hours in the future. If the user then runs
`hermes model` / `hermes auth openai-codex` to reauth, fresh tokens
land in ~/.hermes/auth.json but the pool entry stays frozen behind
its last_error_reset_at — every request keeps failing with
'credential pool: no available entries (all exhausted or empty)'
until the original window elapses.
_available_entries() already had auth.json/credentials-file resync
branches for anthropic/claude_code and nous/device_code; openai-codex
was missing. Added _sync_codex_entry_from_auth_store() mirroring the
nous version (reads state["tokens"][{access,refresh}_token] +
state["last_refresh"]) and wired it into the exhausted-entry resync
loop.
Also softens the 'codex CLI not found' doctor warning — native
device-code OAuth does not require the Codex binary, only
importing existing Codex CLI tokens does. Downgraded to an info line.
Reported on Discord by p1aceho1der: Codex stalled indefinitely after
a rate-limit reset, reauth didn't help, and doctor falsely warned
that the codex CLI was required.
Co-authored-by: teknium1 <teknium@users.noreply.github.com>
This commit is contained in:
@@ -456,6 +456,70 @@ class CredentialPool:
|
||||
logger.debug("Failed to sync from credentials file: %s", exc)
|
||||
return entry
|
||||
|
||||
def _sync_codex_entry_from_auth_store(self, entry: PooledCredential) -> PooledCredential:
    """Adopt fresher openai-codex tokens from auth.json for a pool entry.

    A Codex entry that hit an expired token or a ChatGPT quota window is
    marked ``STATUS_EXHAUSTED`` with a ``last_error_reset_at`` that may lie
    hours ahead. If the user re-authenticates in the meantime
    (``hermes model`` / ``hermes auth``), the new token pair is written to
    ``auth.json`` while the pool entry would otherwise stay blocked until
    the reset time passes. This method closes that gap, in the same way as
    the Nous and Anthropic resync helpers.

    Only ``device_code`` entries of the ``openai-codex`` provider are
    considered; any other entry is returned untouched.

    Args:
        entry: The pool entry to compare against the on-disk token pair.

    Returns:
        A new entry carrying the on-disk tokens with all error/status
        fields cleared when the stored pair differs from the entry's;
        otherwise the very same ``entry`` object (callers rely on identity
        to detect whether a sync happened).
    """
    is_codex_device_entry = (
        self.provider == "openai-codex" and entry.source == "device_code"
    )
    if not is_codex_device_entry:
        return entry

    try:
        # Hold the auth-store lock only for the read of the provider state.
        with _auth_store_lock():
            state = _load_provider_state(_load_auth_store(), "openai-codex")
        if not isinstance(state, dict):
            return entry

        token_block = state.get("tokens")
        if not isinstance(token_block, dict):
            return entry

        disk_access = token_block.get("access_token", "")
        disk_refresh = token_block.get("refresh_token", "")
        current_access = entry.access_token or ""
        current_refresh = entry.refresh_token or ""

        # Codex refresh tokens are single-use, so a different refresh token
        # on disk means another process already consumed ours: treat a
        # mismatch on either half of the pair as "stale".
        access_changed = disk_access != current_access
        refresh_changed = bool(disk_refresh) and disk_refresh != current_refresh
        if disk_access and (access_changed or refresh_changed):
            logger.debug(
                "Pool entry %s: syncing Codex tokens from auth.json "
                "(refreshed by another process)",
                entry.id,
            )
            # Start from "every status/error field reset to None" ...
            changes: Dict[str, Any] = dict.fromkeys(
                (
                    "last_status",
                    "last_status_at",
                    "last_error_code",
                    "last_error_reason",
                    "last_error_message",
                    "last_error_reset_at",
                )
            )
            # ... then layer the fresh token pair on top. An empty refresh
            # token on disk keeps whatever the entry already had.
            changes["access_token"] = disk_access
            changes["refresh_token"] = disk_refresh or entry.refresh_token
            if state.get("last_refresh"):
                changes["last_refresh"] = state["last_refresh"]

            refreshed = replace(entry, **changes)
            self._replace_entry(entry, refreshed)
            self._persist()
            return refreshed
    except Exception as exc:
        # Best-effort: a failed sync must never break entry selection.
        logger.debug("Failed to sync Codex entry from auth.json: %s", exc)
    return entry
|
||||
|
||||
def _sync_nous_entry_from_auth_store(self, entry: PooledCredential) -> PooledCredential:
|
||||
"""Sync a Nous pool entry from auth.json if tokens differ.
|
||||
|
||||
@@ -788,6 +852,18 @@ class CredentialPool:
|
||||
if synced is not entry:
|
||||
entry = synced
|
||||
cleared_any = True
|
||||
# For openai-codex entries, same pattern: the user may have
|
||||
# re-authed via `hermes model` / `hermes auth` after a 429/401,
|
||||
# leaving fresh tokens on disk while the pool entry is still
|
||||
# frozen behind last_error_reset_at (can be hours in the
|
||||
# future for ChatGPT weekly windows).
|
||||
if (self.provider == "openai-codex"
|
||||
and entry.source == "device_code"
|
||||
and entry.last_status == STATUS_EXHAUSTED):
|
||||
synced = self._sync_codex_entry_from_auth_store(entry)
|
||||
if synced is not entry:
|
||||
entry = synced
|
||||
cleared_any = True
|
||||
if entry.last_status == STATUS_EXHAUSTED:
|
||||
exhausted_until = _exhausted_until(entry)
|
||||
if exhausted_until is not None and now < exhausted_until:
|
||||
|
||||
@@ -517,7 +517,14 @@ def run_doctor(args):
|
||||
if shutil.which("codex"):
|
||||
check_ok("codex CLI")
|
||||
else:
|
||||
check_warn("codex CLI not found", "(required for openai-codex login)")
|
||||
# Native OAuth uses Hermes' own device-code flow — the Codex CLI is
|
||||
# only needed if you want to import existing tokens from
|
||||
# ~/.codex/auth.json. Downgrade to info so users running
|
||||
# `hermes auth openai-codex` aren't told they're missing something.
|
||||
check_info(
|
||||
"codex CLI not installed "
|
||||
"(optional — only required to import tokens from an existing Codex CLI login)"
|
||||
)
|
||||
|
||||
# =========================================================================
|
||||
# Check: Directory structure
|
||||
|
||||
@@ -1370,3 +1370,143 @@ def test_nous_exhausted_entry_recovers_via_auth_store_sync(tmp_path, monkeypatch
|
||||
assert len(available) == 1
|
||||
assert available[0].refresh_token == "refresh-FRESH"
|
||||
assert available[0].last_status is None
|
||||
|
||||
|
||||
# ── OpenAI Codex OAuth cross-process sync tests ────────────────────────────
|
||||
|
||||
def _codex_auth_store(
    access: str,
    refresh: str,
    *,
    last_refresh: str = "2026-04-28T00:00:00Z",
) -> dict:
    """Build a minimal auth.json payload holding one openai-codex login.

    Args:
        access: Value stored as the provider's ``access_token``; also used
            to derive a distinct ``id_token`` (``"id-" + access``).
        refresh: Value stored as the provider's ``refresh_token``.
        last_refresh: Timestamp string stored as the provider's
            ``last_refresh``. Keyword-only; defaults to the fixed value the
            existing tests use, so current callers are unaffected.

    Returns:
        A dict with ``openai-codex`` as the active (and only) provider.
    """
    codex_state = {
        "auth_mode": "chatgpt",
        "tokens": {
            "access_token": access,
            "refresh_token": refresh,
            "id_token": "id-" + access,
        },
        "last_refresh": last_refresh,
    }
    return {
        "version": 1,
        "active_provider": "openai-codex",
        "providers": {"openai-codex": codex_state},
    }
|
||||
|
||||
|
||||
def test_sync_codex_entry_from_auth_store_adopts_newer_tokens(tmp_path, monkeypatch):
    """A direct sync call picks up a token pair that changed in auth.json."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
    _write_auth_store(tmp_path, _codex_auth_store("access-OLD", "refresh-OLD"))

    from agent.credential_pool import load_pool

    pool = load_pool("openai-codex")
    stale = pool.select()
    assert stale is not None
    assert stale.access_token == "access-OLD"
    assert stale.refresh_token == "refresh-OLD"

    # Stand-in for `hermes auth openai-codex` rewriting the pair on disk.
    _write_auth_store(tmp_path, _codex_auth_store("access-NEW", "refresh-NEW"))

    refreshed = pool._sync_codex_entry_from_auth_store(stale)

    # A new object signals that a sync actually took place.
    assert refreshed is not stale
    assert refreshed.access_token == "access-NEW"
    assert refreshed.refresh_token == "refresh-NEW"
    # Error bookkeeping must be wiped along with the token swap.
    assert refreshed.last_status is None
    assert refreshed.last_error_code is None
    assert refreshed.last_error_reset_at is None
|
||||
|
||||
|
||||
def test_sync_codex_entry_noop_when_tokens_match(tmp_path, monkeypatch):
    """With identical tokens on disk, sync hands back the very same entry."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
    _write_auth_store(tmp_path, _codex_auth_store("access-same", "refresh-same"))

    from agent.credential_pool import load_pool

    pool = load_pool("openai-codex")
    current = pool.select()
    assert current is not None

    # Identity (not equality) is the contract: callers test `is not entry`.
    result = pool._sync_codex_entry_from_auth_store(current)
    assert result is current
|
||||
|
||||
|
||||
def test_codex_exhausted_entry_recovers_via_auth_store_sync(tmp_path, monkeypatch):
    """An exhausted Codex entry becomes available again once auth.json changes.

    Covers the Discord report (p1aceho1der, Apr 2026): after a Codex
    rate-limit reset the user re-authenticated with `hermes model`, yet the
    pool entry remained EXHAUSTED with last_error_reset_at far in the
    future, so `_available_entries` returned nothing and every request
    failed with "no available entries (all exhausted or empty)".
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
    from agent.credential_pool import load_pool, STATUS_EXHAUSTED
    from dataclasses import replace as dc_replace

    _write_auth_store(tmp_path, _codex_auth_store("access-OLD", "refresh-OLD"))

    pool = load_pool("openai-codex")
    healthy = pool.select()
    assert healthy is not None

    # Freeze the entry the way a Codex 429 does: exhausted, with the reset
    # time one hour ahead.
    now = time.time()
    frozen_fields = {
        "last_status": STATUS_EXHAUSTED,
        "last_status_at": now,
        "last_error_code": 429,
        "last_error_reset_at": now + 3600,
    }
    frozen = dc_replace(healthy, **frozen_fields)
    pool._replace_entry(healthy, frozen)
    pool._persist()

    # Sanity check: before any reauth the entry is withheld, because its
    # reset time has not passed (clear_expired only helps afterwards).
    assert pool._available_entries(clear_expired=True, refresh=False) == []

    # Stand-in for `hermes model` / `hermes auth` writing fresh tokens.
    _write_auth_store(tmp_path, _codex_auth_store("access-FRESH", "refresh-FRESH"))

    available = pool._available_entries(clear_expired=True, refresh=False)
    assert len(available) == 1
    recovered = available[0]
    assert recovered.access_token == "access-FRESH"
    assert recovered.refresh_token == "refresh-FRESH"
    assert recovered.last_status is None
    assert recovered.last_error_reset_at is None
|
||||
|
||||
|
||||
def test_codex_exhausted_entry_stays_stuck_without_auth_store_update(tmp_path, monkeypatch):
    """Regression guard: unchanged auth.json must not unblock an exhausted entry.

    Being STATUS_EXHAUSTED alone is no reason for the sync to clear status;
    the entry has to remain behind its reset window.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
    from agent.credential_pool import load_pool, STATUS_EXHAUSTED
    from dataclasses import replace as dc_replace

    _write_auth_store(tmp_path, _codex_auth_store("access-same", "refresh-same"))

    pool = load_pool("openai-codex")
    healthy = pool.select()
    assert healthy is not None

    now = time.time()
    frozen_fields = {
        "last_status": STATUS_EXHAUSTED,
        "last_status_at": now,
        "last_error_code": 429,
        "last_error_reset_at": now + 3600,
    }
    frozen = dc_replace(healthy, **frozen_fields)
    pool._replace_entry(healthy, frozen)
    pool._persist()

    # auth.json is untouched, so the sync returns the same entry and the
    # exhausted-until check keeps filtering it out.
    still_available = pool._available_entries(clear_expired=True, refresh=False)
    assert still_available == []
|
||||
|
||||
Reference in New Issue
Block a user