fix(auxiliary): generalize unsupported-parameter detector and harden max_tokens retry (#15633)
Generalize the temperature-specific 400 retry that shipped in PR #15621 so the same reactive strategy covers any provider that rejects an arbitrary request parameter — — not just temperature. - agent/auxiliary_client.py: * New _is_unsupported_parameter_error(exc, param): matches the same six phrasings the old temperature detector did plus 'unrecognized parameter' and 'invalid parameter', against any named param. * _is_unsupported_temperature_error is now a thin back-compat wrapper so existing imports and tests keep working. * The max_tokens → max_completion_tokens retry branch in call_llm and async_call_llm now (a) gates on 'max_tokens is not None' so we do not pop a key that was never set and silently substitute a None value on the retry, and (b) also matches the generic helper in addition to the legacy 'max_tokens' / 'unsupported_parameter' substring checks — picking up phrasings like 'Unknown parameter: max_tokens' that previously slipped through. - tests/agent/test_unsupported_parameter_retry.py: 18 new tests covering the generic detector across params, the back-compat wrapper, and the two hardenings to the max_tokens retry branch (None gate + generic phrasing). Credit: retry-generalization pattern from @nicholasrae's PR #15416. That PR also proposed the reactive temperature retry which landed independently via PR #15621 + #15623 (co-authored with @BlueBirdBack). This commit salvages the remaining hardening ideas onto current main.
This commit is contained in:
@@ -1349,21 +1349,27 @@ def _is_auth_error(exc: Exception) -> bool:
|
||||
return "error code: 401" in err_lower or "authenticationerror" in type(exc).__name__.lower()
|
||||
|
||||
|
||||
def _is_unsupported_temperature_error(exc: Exception) -> bool:
|
||||
"""Detect API errors where the selected model rejects `temperature`.
|
||||
def _is_unsupported_parameter_error(exc: Exception, param: str) -> bool:
|
||||
"""Detect provider 400s for an unsupported request parameter.
|
||||
|
||||
Triggered by provider responses like:
|
||||
* OpenAI / Codex Responses — ``Unsupported parameter: temperature``
|
||||
* Copilot reasoning models — ``unsupported_parameter`` with temperature
|
||||
* OpenRouter reasoning models — ``does not support temperature``
|
||||
* Anthropic Opus 4.7+ via OpenAI-compat — ``temperature is not supported``
|
||||
Different OpenAI-compatible endpoints phrase the same class of error a few
|
||||
ways: ``Unsupported parameter: X``, ``unsupported_parameter`` with a
|
||||
``param`` field, ``X is not supported``, ``unknown parameter: X``,
|
||||
``unrecognized request argument: X``. We match on both the parameter
|
||||
name and a generic "unsupported/unknown/unrecognized parameter" marker so
|
||||
call sites can reactively retry without the offending key instead of
|
||||
surfacing a noisy auxiliary failure.
|
||||
|
||||
The same backend can accept temperature for some models and reject it for
|
||||
others (e.g. gpt-5.4 accepts, gpt-5.5 rejects on the same endpoint), so we
|
||||
react to the concrete error rather than maintaining a model allowlist.
|
||||
Generalizes the temperature-specific detector that originally shipped
|
||||
with PR #15621 so the same retry strategy can cover ``max_tokens``,
|
||||
``seed``, ``top_p``, and any future quirk. Credit @nicholasrae (PR #15416)
|
||||
for the generalization pattern.
|
||||
"""
|
||||
param_lower = (param or "").lower()
|
||||
if not param_lower:
|
||||
return False
|
||||
err_lower = str(exc).lower()
|
||||
if "temperature" not in err_lower:
|
||||
if param_lower not in err_lower:
|
||||
return False
|
||||
return any(marker in err_lower for marker in (
|
||||
"unsupported parameter",
|
||||
@@ -1372,9 +1378,20 @@ def _is_unsupported_temperature_error(exc: Exception) -> bool:
|
||||
"does not support",
|
||||
"unknown parameter",
|
||||
"unrecognized request argument",
|
||||
"unrecognized parameter",
|
||||
"invalid parameter",
|
||||
))
|
||||
|
||||
|
||||
def _is_unsupported_temperature_error(exc: Exception) -> bool:
|
||||
"""Back-compat wrapper: detect API errors where the model rejects ``temperature``.
|
||||
|
||||
Delegates to :func:`_is_unsupported_parameter_error`; kept as a separate
|
||||
public symbol because existing tests and call sites import it by name.
|
||||
"""
|
||||
return _is_unsupported_parameter_error(exc, "temperature")
|
||||
|
||||
|
||||
def _evict_cached_clients(provider: str) -> None:
|
||||
"""Drop cached auxiliary clients for a provider so fresh creds are used."""
|
||||
normalized = _normalize_aux_provider(provider)
|
||||
@@ -3012,7 +3029,11 @@ def call_llm(
|
||||
kwargs = retry_kwargs
|
||||
|
||||
err_str = str(first_err)
|
||||
if "max_tokens" in err_str or "unsupported_parameter" in err_str:
|
||||
if max_tokens is not None and (
|
||||
"max_tokens" in err_str
|
||||
or "unsupported_parameter" in err_str
|
||||
or _is_unsupported_parameter_error(first_err, "max_tokens")
|
||||
):
|
||||
kwargs.pop("max_tokens", None)
|
||||
kwargs["max_completion_tokens"] = max_tokens
|
||||
try:
|
||||
@@ -3299,7 +3320,11 @@ async def async_call_llm(
|
||||
kwargs = retry_kwargs
|
||||
|
||||
err_str = str(first_err)
|
||||
if "max_tokens" in err_str or "unsupported_parameter" in err_str:
|
||||
if max_tokens is not None and (
|
||||
"max_tokens" in err_str
|
||||
or "unsupported_parameter" in err_str
|
||||
or _is_unsupported_parameter_error(first_err, "max_tokens")
|
||||
):
|
||||
kwargs.pop("max_tokens", None)
|
||||
kwargs["max_completion_tokens"] = max_tokens
|
||||
try:
|
||||
|
||||
201
tests/agent/test_unsupported_parameter_retry.py
Normal file
201
tests/agent/test_unsupported_parameter_retry.py
Normal file
@@ -0,0 +1,201 @@
|
||||
"""Regression tests for the generic unsupported-parameter detector in
|
||||
``agent.auxiliary_client``.
|
||||
|
||||
The original temperature-specific detector (PR #15621) was generalized so the
|
||||
same reactive-retry strategy covers any provider that rejects an arbitrary
|
||||
request parameter — ``max_tokens``, ``seed``, ``top_p``, future quirks — not
|
||||
just ``temperature``. Credit @nicholasrae (PR #15416) for the generalization
|
||||
pattern.
|
||||
|
||||
These tests lock in:
|
||||
* ``_is_unsupported_parameter_error(exc, param)`` across common phrasings
|
||||
* the back-compat wrapper ``_is_unsupported_temperature_error`` still works
|
||||
* the max_tokens retry branch no longer pops a key that was never set
|
||||
(``max_tokens is None`` gate)
|
||||
* the max_tokens retry branch matches via the generic helper on top of the
|
||||
legacy ``"max_tokens"`` / ``"unsupported_parameter"`` substring checks
|
||||
"""
|
||||
|
||||
from unittest.mock import patch, MagicMock, AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.auxiliary_client import (
|
||||
call_llm,
|
||||
async_call_llm,
|
||||
_is_unsupported_parameter_error,
|
||||
_is_unsupported_temperature_error,
|
||||
)
|
||||
|
||||
|
||||
class TestIsUnsupportedParameterError:
|
||||
"""The generic detector must match real provider phrasings for any param."""
|
||||
|
||||
@pytest.mark.parametrize("param,message", [
|
||||
# temperature phrasings (regression coverage via the generic API)
|
||||
("temperature", "HTTP 400: Unsupported parameter: temperature"),
|
||||
("temperature", "Error code: 400 - {'error': {'code': 'unsupported_parameter', 'param': 'temperature'}}"),
|
||||
("temperature", "this model does not support temperature"),
|
||||
# max_tokens phrasings
|
||||
("max_tokens", "HTTP 400: Unsupported parameter: max_tokens"),
|
||||
("max_tokens", "Unknown parameter: max_tokens — use max_completion_tokens"),
|
||||
("max_tokens", "Invalid parameter: max_tokens is not supported"),
|
||||
# arbitrary future params
|
||||
("seed", "HTTP 400: unrecognized parameter: seed"),
|
||||
("top_p", "Error: top_p is not supported for this model"),
|
||||
])
|
||||
def test_matches_real_provider_messages(self, param, message):
|
||||
assert _is_unsupported_parameter_error(RuntimeError(message), param) is True
|
||||
|
||||
@pytest.mark.parametrize("param,message", [
|
||||
# Param not mentioned at all
|
||||
("temperature", "HTTP 400: max_tokens is too large"),
|
||||
# Param mentioned but not flagged as unsupported
|
||||
("temperature", "temperature must be between 0 and 2"),
|
||||
# Totally unrelated 400
|
||||
("max_tokens", "Rate limit exceeded"),
|
||||
# Connection-level errors
|
||||
("temperature", "Connection reset by peer"),
|
||||
])
|
||||
def test_does_not_match_unrelated_errors(self, param, message):
|
||||
assert _is_unsupported_parameter_error(RuntimeError(message), param) is False
|
||||
|
||||
def test_empty_param_returns_false(self):
|
||||
assert _is_unsupported_parameter_error(
|
||||
RuntimeError("HTTP 400: Unsupported parameter: temperature"), ""
|
||||
) is False
|
||||
|
||||
def test_temperature_wrapper_delegates_to_generic(self):
|
||||
"""Back-compat: ``_is_unsupported_temperature_error`` still routes through."""
|
||||
msg = "HTTP 400: Unsupported parameter: temperature"
|
||||
assert _is_unsupported_temperature_error(RuntimeError(msg)) is True
|
||||
# And the unrelated-case still holds
|
||||
assert _is_unsupported_temperature_error(
|
||||
RuntimeError("max_tokens is too large")) is False
|
||||
|
||||
|
||||
def _dummy_response():
|
||||
"""Sentinel — real code calls ``_validate_llm_response`` which we patch out."""
|
||||
return {"ok": True}
|
||||
|
||||
|
||||
class TestMaxTokensRetryHardening:
|
||||
"""The max_tokens retry branch now (a) gates on ``max_tokens is not None``
|
||||
and (b) also matches the generic phrasings via the helper.
|
||||
"""
|
||||
|
||||
def test_sync_max_tokens_retry_skipped_when_max_tokens_is_none(self):
|
||||
"""No max_tokens kwarg → must not pop/retry even if the error mentions it.
|
||||
|
||||
Before the hardening, ``kwargs.pop("max_tokens", None)`` was safe but
|
||||
``kwargs["max_completion_tokens"] = max_tokens`` would set a None
|
||||
value and hit the provider again. The gate skips the whole branch.
|
||||
"""
|
||||
client = MagicMock()
|
||||
client.base_url = "https://api.openai.com/v1"
|
||||
err = RuntimeError("HTTP 400: Unsupported parameter: max_tokens")
|
||||
client.chat.completions.create.side_effect = err
|
||||
|
||||
with (
|
||||
patch("agent.auxiliary_client._resolve_task_provider_model",
|
||||
return_value=("openai-codex", "gpt-5.5", None, None, None)),
|
||||
patch("agent.auxiliary_client._get_cached_client",
|
||||
return_value=(client, "gpt-5.5")),
|
||||
patch("agent.auxiliary_client._validate_llm_response",
|
||||
side_effect=lambda resp, _task: resp),
|
||||
):
|
||||
with pytest.raises(RuntimeError):
|
||||
call_llm(
|
||||
task="session_search",
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
temperature=0.3,
|
||||
# max_tokens omitted on purpose
|
||||
)
|
||||
|
||||
# Only the initial attempt — no retry because the gate blocked it
|
||||
assert client.chat.completions.create.call_count == 1
|
||||
|
||||
def test_sync_max_tokens_retry_matches_generic_phrasing(self):
|
||||
"""A 400 saying "Unknown parameter: max_tokens" (not the legacy
|
||||
substring ``"max_tokens"`` bare + no ``unsupported_parameter`` token)
|
||||
now triggers the retry via the generic helper.
|
||||
"""
|
||||
client = MagicMock()
|
||||
client.base_url = "https://api.openai.com/v1"
|
||||
err = RuntimeError("Unknown parameter: max_tokens")
|
||||
response = _dummy_response()
|
||||
client.chat.completions.create.side_effect = [err, response]
|
||||
|
||||
with (
|
||||
patch("agent.auxiliary_client._resolve_task_provider_model",
|
||||
return_value=("openai-codex", "gpt-5.5", None, None, None)),
|
||||
patch("agent.auxiliary_client._get_cached_client",
|
||||
return_value=(client, "gpt-5.5")),
|
||||
patch("agent.auxiliary_client._validate_llm_response",
|
||||
side_effect=lambda resp, _task: resp),
|
||||
):
|
||||
result = call_llm(
|
||||
task="session_search",
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
temperature=0.3,
|
||||
max_tokens=512,
|
||||
)
|
||||
|
||||
assert result is response
|
||||
assert client.chat.completions.create.call_count == 2
|
||||
second_call = client.chat.completions.create.call_args_list[1]
|
||||
assert "max_tokens" not in second_call.kwargs
|
||||
assert second_call.kwargs["max_completion_tokens"] == 512
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_max_tokens_retry_skipped_when_max_tokens_is_none(self):
|
||||
client = MagicMock()
|
||||
client.base_url = "https://api.openai.com/v1"
|
||||
err = RuntimeError("HTTP 400: Unsupported parameter: max_tokens")
|
||||
client.chat.completions.create = AsyncMock(side_effect=err)
|
||||
|
||||
with (
|
||||
patch("agent.auxiliary_client._resolve_task_provider_model",
|
||||
return_value=("openai-codex", "gpt-5.5", None, None, None)),
|
||||
patch("agent.auxiliary_client._get_cached_client",
|
||||
return_value=(client, "gpt-5.5")),
|
||||
patch("agent.auxiliary_client._validate_llm_response",
|
||||
side_effect=lambda resp, _task: resp),
|
||||
):
|
||||
with pytest.raises(RuntimeError):
|
||||
await async_call_llm(
|
||||
task="session_search",
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
temperature=0.3,
|
||||
)
|
||||
|
||||
assert client.chat.completions.create.call_count == 1
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_max_tokens_retry_matches_generic_phrasing(self):
|
||||
client = MagicMock()
|
||||
client.base_url = "https://api.openai.com/v1"
|
||||
err = RuntimeError("Unknown parameter: max_tokens")
|
||||
response = _dummy_response()
|
||||
client.chat.completions.create = AsyncMock(side_effect=[err, response])
|
||||
|
||||
with (
|
||||
patch("agent.auxiliary_client._resolve_task_provider_model",
|
||||
return_value=("openai-codex", "gpt-5.5", None, None, None)),
|
||||
patch("agent.auxiliary_client._get_cached_client",
|
||||
return_value=(client, "gpt-5.5")),
|
||||
patch("agent.auxiliary_client._validate_llm_response",
|
||||
side_effect=lambda resp, _task: resp),
|
||||
):
|
||||
result = await async_call_llm(
|
||||
task="session_search",
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
temperature=0.3,
|
||||
max_tokens=512,
|
||||
)
|
||||
|
||||
assert result is response
|
||||
assert client.chat.completions.create.await_count == 2
|
||||
second_call = client.chat.completions.create.call_args_list[1]
|
||||
assert "max_tokens" not in second_call.kwargs
|
||||
assert second_call.kwargs["max_completion_tokens"] == 512
|
||||
Reference in New Issue
Block a user