From c5781d50c70487ede1297553dff909b4a8388493 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sun, 26 Apr 2026 21:33:31 -0700 Subject: [PATCH] fix(azure-foundry): auto-route gpt-5.x / codex / o-series to Responses API (#16361) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Azure Foundry deploys GPT-5.x, codex-*, and o1/o3/o4 reasoning models as Responses-API-only. Calling /chat/completions against these deployments returns 400 'The requested operation is unsupported.', which broke any user who ran 'hermes model' on Azure, picked a gpt-5/codex deployment, and kept the default api_mode: chat_completions. Verified in a user debug bundle on 2026-04-26: gpt-5.3-codex failed on synopsisse.openai.azure.com with that exact payload while gpt-4o-pure on the same endpoint worked. Adds azure_foundry_model_api_mode(model_name) that returns codex_responses when the model name starts with gpt-5, codex, o1, o3, or o4 — otherwise None so chat_completions / anthropic_messages stay untouched for gpt-4o, Llama, Claude-via-Anthropic, etc. Resolver (both the direct Azure Foundry path and the pool-entry path) consults it and upgrades api_mode unless the user explicitly picked anthropic_messages. target_model (from /model mid-session switch) takes precedence over the persisted default so switching from gpt-4o to gpt-5.3-codex routes correctly before the next request. Docs: correct the azure-foundry guide which previously claimed Azure keeps gpt-5.x on chat completions — that was only true for early Azure OpenAI, not Azure Foundry codex/o-series deployments. Tests: 14 unit tests for azure_foundry_model_api_mode + 6 integration tests in TestAzureFoundryResolution covering Bob's exact scenario, target_model override, anthropic_messages guard, and o3-mini. 
# Azure Foundry model families that require the Responses API.  Azure
# rejects /chat/completions against these deployments with
# ``400 "The requested operation is unsupported."`` (seen in April 2026 on
# ``gpt-5.3-codex`` while ``gpt-4o-pure`` on the same endpoint worked fine).
# Keep the patterns broad enough to cover vendor-renamed deployments (e.g.
# ``gpt-5.3-codex``, ``gpt-5-codex``, ``gpt-5.4``, ``o1-preview``) but tight
# enough to leave GPT-4 / 3.5 / Llama / Mistral / Grok deployments on chat
# completions.
_AZURE_FOUNDRY_RESPONSES_PREFIXES = (
    "codex",  # codex-*, codex-mini
    "gpt-5",  # gpt-5, gpt-5.x, gpt-5-codex, gpt-5.x-codex
    "o1",     # o1, o1-preview, o1-mini
    "o3",     # o3, o3-mini
    "o4",     # o4, o4-mini
)

# The o-series prefixes are only two characters long, so a bare
# ``startswith`` also matches unrelated, user-chosen deployment names such as
# ``o365-assistant``.  These prefixes must therefore be followed by the end
# of the name or by a non-alphanumeric separator (``-``, ``.``, ``_``, ...).
_AZURE_FOUNDRY_BOUNDED_PREFIXES = frozenset({"o1", "o3", "o4"})


def azure_foundry_model_api_mode(model_name: Optional[str]) -> Optional[str]:
    """Infer Azure Foundry api_mode from a deployment/model name.

    Args:
        model_name: Deployment or model name.  ``None``, empty and
            whitespace-only values are accepted.  Matching is
            case-insensitive, and anything up to the last ``/`` (a vendor
            prefix such as ``openai/``) is ignored.

    Returns:
        ``"codex_responses"`` when the name belongs to a family that only
        accepts the Responses API on Azure Foundry (GPT-5.x, codex, o1/o3/o4
        reasoning models).  ``None`` otherwise; the caller should then fall
        back to the configured/default api_mode (typically
        ``chat_completions``) so GPT-4o, GPT-4 Turbo, Llama, Mistral, etc.
        keep working.

    Intentionally does NOT return ``anthropic_messages``; Anthropic-style
    Azure endpoints are disambiguated by URL (``/anthropic`` suffix) in
    ``runtime_provider._detect_api_mode_for_url`` and by the user setting
    ``model.api_mode: anthropic_messages`` explicitly.
    """
    raw = str(model_name or "").strip().lower()
    if not raw:
        return None
    # Strip any vendor/ prefix a user may have copied from OpenRouter / Copilot.
    if "/" in raw:
        raw = raw.rsplit("/", 1)[-1]
    # gpt-5-mini speaks chat completions on Copilot but Azure Foundry deploys
    # the full gpt-5 family uniformly on Responses API, so there is no
    # exception carved out for it here.
    for prefix in _AZURE_FOUNDRY_RESPONSES_PREFIXES:
        if not raw.startswith(prefix):
            continue
        # Character right after the prefix ("" when the name ends there).
        following = raw[len(prefix):len(prefix) + 1]
        if prefix in _AZURE_FOUNDRY_BOUNDED_PREFIXES and following.isalnum():
            # e.g. "o365-assistant": shares the "o3" prefix but is not o3.
            continue
        return "codex_responses"
    return None
+ if effective_model and api_mode != "anthropic_messages": + try: + from hermes_cli.models import azure_foundry_model_api_mode + + inferred = azure_foundry_model_api_mode(effective_model) + except Exception: + inferred = None + if inferred: + api_mode = inferred # For Anthropic-style endpoints, strip /v1 suffix if api_mode == "anthropic_messages": base_url = re.sub(r"/v1/?$", "", base_url) @@ -608,6 +621,7 @@ def _resolve_azure_foundry_runtime( model_cfg: Dict[str, Any], explicit_api_key: Optional[str] = None, explicit_base_url: Optional[str] = None, + target_model: Optional[str] = None, ) -> Dict[str, Any]: """Resolve an Azure Foundry runtime entry. @@ -628,6 +642,22 @@ def _resolve_azure_foundry_runtime( cfg_base_url = str(model_cfg.get("base_url") or "").strip().rstrip("/") cfg_api_mode = _parse_api_mode(model_cfg.get("api_mode")) or "chat_completions" + # Model-family inference: Azure Foundry deploys GPT-5.x / codex / o1-o4 + # reasoning models as Responses-API-only. Calling /chat/completions + # against them returns 400 "The requested operation is unsupported." + # Upgrade api_mode when the model name matches, unless the user has + # explicitly chosen anthropic_messages (Anthropic-style endpoint). 
class TestAzureFoundryModelApiMode:
    """Name-based api_mode inference for Azure Foundry deployments.

    Azure Foundry serves GPT-5.x / codex / o-series as Responses-API-only and
    answers ``400 "The requested operation is unsupported."`` when
    /chat/completions is called against them.  Seen in a user debug bundle
    on 2026-04-26: gpt-5.3-codex failed with that exact payload while
    gpt-4o-pure worked on the same endpoint.
    """

    def test_gpt5_family_uses_responses(self):
        # The gpt-5-mini exception is Copilot-specific; Azure deploys the
        # whole gpt-5 family on Responses API uniformly.
        names = (
            "gpt-5",
            "gpt-5.3",
            "gpt-5.4",
            "gpt-5-codex",
            "gpt-5.3-codex",
            "gpt-5-mini",
        )
        for name in names:
            assert azure_foundry_model_api_mode(name) == "codex_responses", name

    def test_codex_family_uses_responses(self):
        for name in ("codex", "codex-mini"):
            assert azure_foundry_model_api_mode(name) == "codex_responses", name

    def test_o_series_reasoning_uses_responses(self):
        names = ("o1", "o1-preview", "o1-mini", "o3", "o3-mini", "o4-mini")
        for name in names:
            assert azure_foundry_model_api_mode(name) == "codex_responses", name

    def test_gpt4_family_returns_none(self):
        """GPT-4, GPT-4o, etc. speak chat completions on Azure."""
        names = (
            "gpt-4",
            "gpt-4o",
            "gpt-4o-pure",
            "gpt-4o-mini",
            "gpt-4-turbo",
            "gpt-4.1",
            "gpt-3.5-turbo",
        )
        for name in names:
            assert azure_foundry_model_api_mode(name) is None, name

    def test_non_openai_deployments_return_none(self):
        """Llama, Mistral, Grok, etc. keep the default chat completions."""
        names = ("llama-3.1-70b", "mistral-large", "grok-4", "phi-3-medium")
        for name in names:
            assert azure_foundry_model_api_mode(name) is None, name

    def test_vendor_prefix_stripped(self):
        """Users who copy-paste ``openai/gpt-5.3-codex`` should still match."""
        inferred = azure_foundry_model_api_mode("openai/gpt-5.3-codex")
        assert inferred == "codex_responses"
        assert azure_foundry_model_api_mode("openai/gpt-4o") is None

    def test_empty_and_none_return_none(self):
        for blank in (None, "", "   "):
            assert azure_foundry_model_api_mode(blank) is None, repr(blank)

    def test_case_insensitive(self):
        for name in ("GPT-5.3-Codex", "Codex-Mini"):
            assert azure_foundry_model_api_mode(name) == "codex_responses", name
+ "default": "gpt-4.1", } def test_azure_foundry_openai_style_explicit(self, monkeypatch): @@ -1643,3 +1646,108 @@ class TestAzureFoundryResolution: with pytest.raises(rp.AuthError, match="API key"): rp.resolve_runtime_provider(requested="azure-foundry") + + # -- Model-family api_mode inference ------------------------------------- + # Azure rejects /chat/completions on GPT-5.x / codex / o-series with + # ``400 "The requested operation is unsupported."`` — the resolver must + # upgrade api_mode to ``codex_responses`` for those models even when the + # config was persisted as ``chat_completions`` (the default the setup + # wizard writes when the user didn't pick explicitly). + + def _make_cfg_with_model(self, model: str, api_mode: str = "chat_completions"): + return { + "provider": "azure-foundry", + "base_url": "https://synopsisse.openai.azure.com/openai/v1", + "api_mode": api_mode, + "default": model, + } + + def test_gpt5_codex_upgrades_chat_completions_to_responses(self, monkeypatch): + """Reproduces Bob's April 2026 bug: gpt-5.3-codex on chat_completions.""" + monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key") + monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry") + monkeypatch.setattr(rp, "_get_model_config", + lambda: self._make_cfg_with_model("gpt-5.3-codex", "chat_completions")) + monkeypatch.setattr(rp, "load_pool", lambda provider: None) + + resolved = rp.resolve_runtime_provider(requested="azure-foundry") + + assert resolved["api_mode"] == "codex_responses" + assert resolved["base_url"] == "https://synopsisse.openai.azure.com/openai/v1" + + def test_gpt4o_stays_on_chat_completions(self, monkeypatch): + """gpt-4o-pure worked on Bob's endpoint — must not get upgraded.""" + monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key") + monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry") + monkeypatch.setattr(rp, "_get_model_config", + lambda: self._make_cfg_with_model("gpt-4o-pure", "chat_completions")) + 
monkeypatch.setattr(rp, "load_pool", lambda provider: None) + + resolved = rp.resolve_runtime_provider(requested="azure-foundry") + + assert resolved["api_mode"] == "chat_completions" + + def test_anthropic_messages_not_downgraded(self, monkeypatch): + """Anthropic-style endpoint: keep anthropic_messages even for gpt-5 names.""" + monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key") + monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry") + monkeypatch.setattr(rp, "_get_model_config", lambda: { + "provider": "azure-foundry", + "base_url": "https://my-resource.services.ai.azure.com/anthropic/v1", + "api_mode": "anthropic_messages", + "default": "gpt-5.3-codex", # nonsensical on Anthropic but tests the guard + }) + monkeypatch.setattr(rp, "load_pool", lambda provider: None) + + resolved = rp.resolve_runtime_provider(requested="azure-foundry") + + assert resolved["api_mode"] == "anthropic_messages" + + def test_target_model_overrides_stale_default(self, monkeypatch): + """/model switch: target_model should drive api_mode, not the stale config default.""" + monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key") + monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry") + # Config still pinned to gpt-4o, but user just ran /model gpt-5.3-codex + monkeypatch.setattr(rp, "_get_model_config", + lambda: self._make_cfg_with_model("gpt-4o-pure", "chat_completions")) + monkeypatch.setattr(rp, "load_pool", lambda provider: None) + + resolved = rp.resolve_runtime_provider( + requested="azure-foundry", + target_model="gpt-5.3-codex", + ) + + assert resolved["api_mode"] == "codex_responses" + + def test_target_model_downgrade_path(self, monkeypatch): + """/model switch gpt-5.3-codex → gpt-4o: api_mode follows new model.""" + monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key") + monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry") + # Config was upgraded to codex_responses for the previous model; user + # now 
switches to gpt-4o which speaks chat completions. + monkeypatch.setattr(rp, "_get_model_config", + lambda: self._make_cfg_with_model("gpt-5.3-codex", "codex_responses")) + monkeypatch.setattr(rp, "load_pool", lambda provider: None) + + resolved = rp.resolve_runtime_provider( + requested="azure-foundry", + target_model="gpt-4o-pure", + ) + + # codex_responses was persisted; we keep it because gpt-4o can speak + # both protocols but the explicit persisted mode is the safer signal. + # (gpt-4o returning None from the inference function means "don't + # override" — the persisted codex_responses survives.) + assert resolved["api_mode"] == "codex_responses" + + def test_o3_mini_upgrades(self, monkeypatch): + monkeypatch.setenv("AZURE_FOUNDRY_API_KEY", "az-key") + monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "azure-foundry") + monkeypatch.setattr(rp, "_get_model_config", + lambda: self._make_cfg_with_model("o3-mini", "chat_completions")) + monkeypatch.setattr(rp, "load_pool", lambda provider: None) + + resolved = rp.resolve_runtime_provider(requested="azure-foundry") + + assert resolved["api_mode"] == "codex_responses" + diff --git a/website/docs/guides/azure-foundry.md b/website/docs/guides/azure-foundry.md index 2aae73ea6..29c62e145 100644 --- a/website/docs/guides/azure-foundry.md +++ b/website/docs/guides/azure-foundry.md @@ -72,7 +72,7 @@ model: Important behaviour: -- **gpt-5.x stays on `/chat/completions`.** Unlike `api.openai.com`, Azure OpenAI does not support the Responses API — Hermes detects Azure endpoints and keeps gpt-5.x on `chat_completions` where Azure actually serves it. +- **GPT-5.x, codex, and o-series auto-route to the Responses API.** Azure Foundry deploys GPT-5 / codex / o1 / o3 / o4 models as Responses-API-only — calling `/chat/completions` against them returns `400 "The requested operation is unsupported."`. 
Hermes detects these model families by name and upgrades `api_mode` to `codex_responses` transparently, even when `config.yaml` still reads `api_mode: chat_completions`. GPT-4, GPT-4o, Llama, Mistral, and other deployments stay on `/chat/completions`. - **`max_completion_tokens` is used automatically.** Azure OpenAI (like direct OpenAI) requires `max_completion_tokens` for gpt-4o, o-series, and gpt-5.x models. Hermes sends the right parameter based on the endpoint. - **Pre-v1 endpoints that require `api-version`.** If you have a legacy base URL like `https://YOUR-RESOURCE.openai.azure.com/openai?api-version=2025-04-01-preview`, Hermes extracts the query string and forwards it via `default_query` on every request (the OpenAI SDK otherwise drops it when joining paths).