fix(auxiliary): generalize unsupported-parameter detector and harden max_tokens retry (#15633)

Generalize the temperature-specific 400 retry that shipped in PR #15621 so
the same reactive strategy covers any provider that rejects an arbitrary
request parameter —  — not just temperature.

- agent/auxiliary_client.py:
  * New _is_unsupported_parameter_error(exc, param): matches the same six
    phrasings the old temperature detector did plus 'unrecognized parameter'
    and 'invalid parameter', against any named param.
  * _is_unsupported_temperature_error is now a thin back-compat wrapper so
    existing imports and tests keep working.
  * The max_tokens → max_completion_tokens retry branch in call_llm and
    async_call_llm now (a) gates on 'max_tokens is not None' so we do not
    pop a key that was never set and silently substitute a None value on
    the retry, and (b) also matches the generic helper in addition to the
    legacy 'max_tokens' / 'unsupported_parameter' substring checks — picking
    up phrasings like 'Unknown parameter: max_tokens' that previously slipped
    through.

- tests/agent/test_unsupported_parameter_retry.py: 18 new tests covering
  the generic detector across params, the back-compat wrapper, and the two
  hardenings to the max_tokens retry branch (None gate + generic phrasing).

Credit: retry-generalization pattern from @nicholasrae's PR #15416. That PR
also proposed the reactive temperature retry which landed independently via
PR #15621 + #15623 (co-authored with @BlueBirdBack). This commit salvages
the remaining hardening ideas onto current main.
This commit is contained in:
Teknium
2026-04-25 05:50:34 -07:00
committed by GitHub
parent f92006ce1c
commit 3c1c65e754
2 changed files with 239 additions and 13 deletions

View File

@@ -1349,21 +1349,27 @@ def _is_auth_error(exc: Exception) -> bool:
return "error code: 401" in err_lower or "authenticationerror" in type(exc).__name__.lower()
def _is_unsupported_temperature_error(exc: Exception) -> bool:
"""Detect API errors where the selected model rejects `temperature`.
def _is_unsupported_parameter_error(exc: Exception, param: str) -> bool:
"""Detect provider 400s for an unsupported request parameter.
Triggered by provider responses like:
* OpenAI / Codex Responses — ``Unsupported parameter: temperature``
* Copilot reasoning models — ``unsupported_parameter`` with temperature
* OpenRouter reasoning models — ``does not support temperature``
* Anthropic Opus 4.7+ via OpenAI-compat — ``temperature is not supported``
Different OpenAI-compatible endpoints phrase the same class of error a few
ways: ``Unsupported parameter: X``, ``unsupported_parameter`` with a
``param`` field, ``X is not supported``, ``unknown parameter: X``,
``unrecognized request argument: X``. We match on both the parameter
name and a generic "unsupported/unknown/unrecognized parameter" marker so
call sites can reactively retry without the offending key instead of
surfacing a noisy auxiliary failure.
The same backend can accept temperature for some models and reject it for
others (e.g. gpt-5.4 accepts, gpt-5.5 rejects on the same endpoint), so we
react to the concrete error rather than maintaining a model allowlist.
Generalizes the temperature-specific detector that originally shipped
with PR #15621 so the same retry strategy can cover ``max_tokens``,
``seed``, ``top_p``, and any future quirk. Credit @nicholasrae (PR #15416)
for the generalization pattern.
"""
param_lower = (param or "").lower()
if not param_lower:
return False
err_lower = str(exc).lower()
if "temperature" not in err_lower:
if param_lower not in err_lower:
return False
return any(marker in err_lower for marker in (
"unsupported parameter",
@@ -1372,9 +1378,20 @@ def _is_unsupported_temperature_error(exc: Exception) -> bool:
"does not support",
"unknown parameter",
"unrecognized request argument",
"unrecognized parameter",
"invalid parameter",
))
def _is_unsupported_temperature_error(exc: Exception) -> bool:
"""Back-compat wrapper: detect API errors where the model rejects ``temperature``.
Delegates to :func:`_is_unsupported_parameter_error`; kept as a separate
public symbol because existing tests and call sites import it by name.
"""
return _is_unsupported_parameter_error(exc, "temperature")
def _evict_cached_clients(provider: str) -> None:
"""Drop cached auxiliary clients for a provider so fresh creds are used."""
normalized = _normalize_aux_provider(provider)
@@ -3012,7 +3029,11 @@ def call_llm(
kwargs = retry_kwargs
err_str = str(first_err)
if "max_tokens" in err_str or "unsupported_parameter" in err_str:
if max_tokens is not None and (
"max_tokens" in err_str
or "unsupported_parameter" in err_str
or _is_unsupported_parameter_error(first_err, "max_tokens")
):
kwargs.pop("max_tokens", None)
kwargs["max_completion_tokens"] = max_tokens
try:
@@ -3299,7 +3320,11 @@ async def async_call_llm(
kwargs = retry_kwargs
err_str = str(first_err)
if "max_tokens" in err_str or "unsupported_parameter" in err_str:
if max_tokens is not None and (
"max_tokens" in err_str
or "unsupported_parameter" in err_str
or _is_unsupported_parameter_error(first_err, "max_tokens")
):
kwargs.pop("max_tokens", None)
kwargs["max_completion_tokens"] = max_tokens
try:

View File

@@ -0,0 +1,201 @@
"""Regression tests for the generic unsupported-parameter detector in
``agent.auxiliary_client``.
The original temperature-specific detector (PR #15621) was generalized so the
same reactive-retry strategy covers any provider that rejects an arbitrary
request parameter — ``max_tokens``, ``seed``, ``top_p``, future quirks — not
just ``temperature``. Credit @nicholasrae (PR #15416) for the generalization
pattern.
These tests lock in:
* ``_is_unsupported_parameter_error(exc, param)`` across common phrasings
* the back-compat wrapper ``_is_unsupported_temperature_error`` still works
* the max_tokens retry branch no longer pops a key that was never set
(``max_tokens is None`` gate)
* the max_tokens retry branch matches via the generic helper on top of the
legacy ``"max_tokens"`` / ``"unsupported_parameter"`` substring checks
"""
from unittest.mock import patch, MagicMock, AsyncMock
import pytest
from agent.auxiliary_client import (
call_llm,
async_call_llm,
_is_unsupported_parameter_error,
_is_unsupported_temperature_error,
)
class TestIsUnsupportedParameterError:
"""The generic detector must match real provider phrasings for any param."""
@pytest.mark.parametrize("param,message", [
# temperature phrasings (regression coverage via the generic API)
("temperature", "HTTP 400: Unsupported parameter: temperature"),
("temperature", "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}"),
("temperature", "this model does not support temperature"),
# max_tokens phrasings
("max_tokens", "HTTP 400: Unsupported parameter: max_tokens"),
("max_tokens", "Unknown parameter: max_tokens — use max_completion_tokens"),
("max_tokens", "Invalid parameter: max_tokens is not supported"),
# arbitrary future params
("seed", "HTTP 400: unrecognized parameter: seed"),
("top_p", "Error: top_p is not supported for this model"),
])
def test_matches_real_provider_messages(self, param, message):
assert _is_unsupported_parameter_error(RuntimeError(message), param) is True
@pytest.mark.parametrize("param,message", [
# Param not mentioned at all
("temperature", "HTTP 400: max_tokens is too large"),
# Param mentioned but not flagged as unsupported
("temperature", "temperature must be between 0 and 2"),
# Totally unrelated 400
("max_tokens", "Rate limit exceeded"),
# Connection-level errors
("temperature", "Connection reset by peer"),
])
def test_does_not_match_unrelated_errors(self, param, message):
assert _is_unsupported_parameter_error(RuntimeError(message), param) is False
def test_empty_param_returns_false(self):
assert _is_unsupported_parameter_error(
RuntimeError("HTTP 400: Unsupported parameter: temperature"), ""
) is False
def test_temperature_wrapper_delegates_to_generic(self):
"""Back-compat: ``_is_unsupported_temperature_error`` still routes through."""
msg = "HTTP 400: Unsupported parameter: temperature"
assert _is_unsupported_temperature_error(RuntimeError(msg)) is True
# And the unrelated-case still holds
assert _is_unsupported_temperature_error(
RuntimeError("max_tokens is too large")) is False
def _dummy_response():
"""Sentinel — real code calls ``_validate_llm_response`` which we patch out."""
return {"ok": True}
class TestMaxTokensRetryHardening:
"""The max_tokens retry branch now (a) gates on ``max_tokens is not None``
and (b) also matches the generic phrasings via the helper.
"""
def test_sync_max_tokens_retry_skipped_when_max_tokens_is_none(self):
"""No max_tokens kwarg → must not pop/retry even if the error mentions it.
Before the hardening, ``kwargs.pop("max_tokens", None)`` was safe but
``kwargs["max_completion_tokens"] = max_tokens`` would set a None
value and hit the provider again. The gate skips the whole branch.
"""
client = MagicMock()
client.base_url = "https://api.openai.com/v1"
err = RuntimeError("HTTP 400: Unsupported parameter: max_tokens")
client.chat.completions.create.side_effect = err
with (
patch("agent.auxiliary_client._resolve_task_provider_model",
return_value=("openai-codex", "gpt-5.5", None, None, None)),
patch("agent.auxiliary_client._get_cached_client",
return_value=(client, "gpt-5.5")),
patch("agent.auxiliary_client._validate_llm_response",
side_effect=lambda resp, _task: resp),
):
with pytest.raises(RuntimeError):
call_llm(
task="session_search",
messages=[{"role": "user", "content": "hi"}],
temperature=0.3,
# max_tokens omitted on purpose
)
# Only the initial attempt — no retry because the gate blocked it
assert client.chat.completions.create.call_count == 1
def test_sync_max_tokens_retry_matches_generic_phrasing(self):
"""A 400 saying "Unknown parameter: max_tokens" (not the legacy
substring ``"max_tokens"`` bare + no ``unsupported_parameter`` token)
now triggers the retry via the generic helper.
"""
client = MagicMock()
client.base_url = "https://api.openai.com/v1"
err = RuntimeError("Unknown parameter: max_tokens")
response = _dummy_response()
client.chat.completions.create.side_effect = [err, response]
with (
patch("agent.auxiliary_client._resolve_task_provider_model",
return_value=("openai-codex", "gpt-5.5", None, None, None)),
patch("agent.auxiliary_client._get_cached_client",
return_value=(client, "gpt-5.5")),
patch("agent.auxiliary_client._validate_llm_response",
side_effect=lambda resp, _task: resp),
):
result = call_llm(
task="session_search",
messages=[{"role": "user", "content": "hi"}],
temperature=0.3,
max_tokens=512,
)
assert result is response
assert client.chat.completions.create.call_count == 2
second_call = client.chat.completions.create.call_args_list[1]
assert "max_tokens" not in second_call.kwargs
assert second_call.kwargs["max_completion_tokens"] == 512
@pytest.mark.asyncio
async def test_async_max_tokens_retry_skipped_when_max_tokens_is_none(self):
client = MagicMock()
client.base_url = "https://api.openai.com/v1"
err = RuntimeError("HTTP 400: Unsupported parameter: max_tokens")
client.chat.completions.create = AsyncMock(side_effect=err)
with (
patch("agent.auxiliary_client._resolve_task_provider_model",
return_value=("openai-codex", "gpt-5.5", None, None, None)),
patch("agent.auxiliary_client._get_cached_client",
return_value=(client, "gpt-5.5")),
patch("agent.auxiliary_client._validate_llm_response",
side_effect=lambda resp, _task: resp),
):
with pytest.raises(RuntimeError):
await async_call_llm(
task="session_search",
messages=[{"role": "user", "content": "hi"}],
temperature=0.3,
)
assert client.chat.completions.create.call_count == 1
@pytest.mark.asyncio
async def test_async_max_tokens_retry_matches_generic_phrasing(self):
client = MagicMock()
client.base_url = "https://api.openai.com/v1"
err = RuntimeError("Unknown parameter: max_tokens")
response = _dummy_response()
client.chat.completions.create = AsyncMock(side_effect=[err, response])
with (
patch("agent.auxiliary_client._resolve_task_provider_model",
return_value=("openai-codex", "gpt-5.5", None, None, None)),
patch("agent.auxiliary_client._get_cached_client",
return_value=(client, "gpt-5.5")),
patch("agent.auxiliary_client._validate_llm_response",
side_effect=lambda resp, _task: resp),
):
result = await async_call_llm(
task="session_search",
messages=[{"role": "user", "content": "hi"}],
temperature=0.3,
max_tokens=512,
)
assert result is response
assert client.chat.completions.create.await_count == 2
second_call = client.chat.completions.create.call_args_list[1]
assert "max_tokens" not in second_call.kwargs
assert second_call.kwargs["max_completion_tokens"] == 512