Some providers (Xiaomi MiMo, some Alibaba endpoints, a long tail of OpenAI-compatible servers) follow the OpenAI spec strictly and require tool message `content` to be a string — they reject our list-type content (text + image_url parts) with HTTP 400 'text is not set' / 'tool message content must be a string'. Instead of an allowlist of known-good providers (maintenance burden, guaranteed to miss aggregators like OpenRouter where the underlying model determines support, not the aggregator name), this lands a reactive recovery: 1. New `FailoverReason.multimodal_tool_content_unsupported` with a small pattern list covering the common 400 wordings. 2. `AIAgent._try_strip_image_parts_from_tool_messages` walks the API message list, downgrades any `role:tool` message whose content is list-with-image to a plain text summary (preserves text parts) in place, AND records the active (provider, model) in a session-scoped `_no_list_tool_content_models` set. 3. `_tool_result_content_for_active_model` short-circuits to a text summary when (provider, model) is in the cache — so after the first 400 + retry, subsequent screenshots in the same session skip the round trip entirely. 4. Retry hook in `agent.conversation_loop` mirrors the existing `image_too_large` recovery: detect the reason, run the helper, retry once, fall through to the normal error path if no list-type tool content was actually present. Cache is transient (per-session) by design — next session retries in case the provider added support, no persistent state to maintain. Fixes #27344. Closes #27351 (allowlist approach superseded by reactive recovery).
This commit is contained in:
@@ -989,6 +989,7 @@ def run_conversation(
|
|||||||
copilot_auth_retry_attempted=False
|
copilot_auth_retry_attempted=False
|
||||||
thinking_sig_retry_attempted = False
|
thinking_sig_retry_attempted = False
|
||||||
image_shrink_retry_attempted = False
|
image_shrink_retry_attempted = False
|
||||||
|
multimodal_tool_content_retry_attempted = False
|
||||||
oauth_1m_beta_retry_attempted = False
|
oauth_1m_beta_retry_attempted = False
|
||||||
llama_cpp_grammar_retry_attempted = False
|
llama_cpp_grammar_retry_attempted = False
|
||||||
has_retried_429 = False
|
has_retried_429 = False
|
||||||
@@ -2060,6 +2061,31 @@ def run_conversation(
|
|||||||
"or shrink didn't reduce size; surfacing original error."
|
"or shrink didn't reduce size; surfacing original error."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Multimodal-tool-content recovery: providers that follow
|
||||||
|
# the OpenAI spec strictly (tool message content must be a
|
||||||
|
# string) reject our list-type content with a 400. Strip
|
||||||
|
# image parts from any list-type tool messages, mark the
|
||||||
|
# (provider, model) as no-list-tool-content for the rest
|
||||||
|
# of this session so future tool results preemptively
|
||||||
|
# downgrade, and retry once. See issue #27344.
|
||||||
|
if (
|
||||||
|
classified.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||||
|
and not multimodal_tool_content_retry_attempted
|
||||||
|
):
|
||||||
|
multimodal_tool_content_retry_attempted = True
|
||||||
|
if agent._try_strip_image_parts_from_tool_messages(api_messages):
|
||||||
|
agent._vprint(
|
||||||
|
f"{agent.log_prefix}📐 Provider rejected list-type tool content — "
|
||||||
|
f"downgraded screenshots to text and retrying...",
|
||||||
|
force=True,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
"multimodal-tool-content recovery: no list-type tool "
|
||||||
|
"messages with image parts found; surfacing original error."
|
||||||
|
)
|
||||||
|
|
||||||
# Anthropic OAuth subscription rejected the 1M-context beta
|
# Anthropic OAuth subscription rejected the 1M-context beta
|
||||||
# header ("long context beta is not yet available for this
|
# header ("long context beta is not yet available for this
|
||||||
# subscription"). Disable the beta for the rest of this
|
# subscription"). Disable the beta for the rest of this
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ class FailoverReason(enum.Enum):
|
|||||||
|
|
||||||
# Request format
|
# Request format
|
||||||
format_error = "format_error" # 400 bad request — abort or strip + retry
|
format_error = "format_error" # 400 bad request — abort or strip + retry
|
||||||
|
multimodal_tool_content_unsupported = "multimodal_tool_content_unsupported" # Provider rejected list-type content in tool messages (e.g. Xiaomi MiMo) — downgrade to text and retry
|
||||||
|
|
||||||
# Provider-specific
|
# Provider-specific
|
||||||
thinking_signature = "thinking_signature" # Anthropic thinking block sig invalid
|
thinking_signature = "thinking_signature" # Anthropic thinking block sig invalid
|
||||||
@@ -165,6 +166,32 @@ _IMAGE_TOO_LARGE_PATTERNS = [
|
|||||||
# the likely culprit; we still try the shrink path before giving up.
|
# the likely culprit; we still try the shrink path before giving up.
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Providers that follow the OpenAI spec strictly require tool message
|
||||||
|
# ``content`` to be a string. Some (Anthropic native, Codex Responses,
|
||||||
|
# Gemini native, first-party OpenAI) extend this to accept a content-parts
|
||||||
|
# list (text + image_url) so screenshots from computer_use survive. Others
|
||||||
|
# (Xiaomi MiMo, some Alibaba endpoints, a long tail of OpenAI-compatible
|
||||||
|
# providers) reject the list with a 400 — the patterns below are the most
|
||||||
|
# common error shapes we see. Recovery: strip image parts from tool
|
||||||
|
# messages in-place, record the (provider, model) for the rest of the
|
||||||
|
# session so we don't waste another call learning the same lesson, retry.
|
||||||
|
#
|
||||||
|
# See: https://github.com/NousResearch/hermes-agent/issues/27344
|
||||||
|
_MULTIMODAL_TOOL_CONTENT_PATTERNS = [
|
||||||
|
# Xiaomi MiMo: {"error":{"code":"400","message":"Param Incorrect","param":"text is not set"}}
|
||||||
|
"text is not set",
|
||||||
|
# Generic "tool message must be string" shapes
|
||||||
|
"tool message content must be a string",
|
||||||
|
"tool content must be a string",
|
||||||
|
"tool message must be a string",
|
||||||
|
# OpenAI-compat servers that reject list-type tool content with a
|
||||||
|
# schema-validation message
|
||||||
|
"expected string, got list",
|
||||||
|
"expected string, got array",
|
||||||
|
# Alibaba/DashScope variant
|
||||||
|
"tool_call.content must be string",
|
||||||
|
]
|
||||||
|
|
||||||
# Context overflow patterns
|
# Context overflow patterns
|
||||||
_CONTEXT_OVERFLOW_PATTERNS = [
|
_CONTEXT_OVERFLOW_PATTERNS = [
|
||||||
"context length",
|
"context length",
|
||||||
@@ -781,6 +808,19 @@ def _classify_400(
|
|||||||
) -> ClassifiedError:
|
) -> ClassifiedError:
|
||||||
"""Classify 400 Bad Request — context overflow, format error, or generic."""
|
"""Classify 400 Bad Request — context overflow, format error, or generic."""
|
||||||
|
|
||||||
|
# Multimodal tool content rejected from 400. Must be checked BEFORE
|
||||||
|
# image_too_large because the recovery is different (strip image parts
|
||||||
|
# from tool messages, mark the model as no-list-tool-content for the
|
||||||
|
# rest of the session) and BEFORE context_overflow because some of the
|
||||||
|
# patterns ("text is not set") are ambiguous in isolation but become
|
||||||
|
# specific when combined with a 400 on a request known to contain
|
||||||
|
# multimodal tool content.
|
||||||
|
if any(p in error_msg for p in _MULTIMODAL_TOOL_CONTENT_PATTERNS):
|
||||||
|
return result_fn(
|
||||||
|
FailoverReason.multimodal_tool_content_unsupported,
|
||||||
|
retryable=True,
|
||||||
|
)
|
||||||
|
|
||||||
# Image-too-large from 400 (Anthropic's 5 MB per-image check fires this way).
|
# Image-too-large from 400 (Anthropic's 5 MB per-image check fires this way).
|
||||||
# Must be checked BEFORE context_overflow because messages can trip both
|
# Must be checked BEFORE context_overflow because messages can trip both
|
||||||
# patterns ("exceeds" + "image") and image-shrink is a cheaper recovery.
|
# patterns ("exceeds" + "image") and image-shrink is a cheaper recovery.
|
||||||
@@ -922,6 +962,13 @@ def _classify_by_message(
|
|||||||
should_compress=True,
|
should_compress=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Multimodal tool content patterns (from message text when no status_code)
|
||||||
|
if any(p in error_msg for p in _MULTIMODAL_TOOL_CONTENT_PATTERNS):
|
||||||
|
return result_fn(
|
||||||
|
FailoverReason.multimodal_tool_content_unsupported,
|
||||||
|
retryable=True,
|
||||||
|
)
|
||||||
|
|
||||||
# Image-too-large patterns (from message text when no status_code)
|
# Image-too-large patterns (from message text when no status_code)
|
||||||
if any(p in error_msg for p in _IMAGE_TOO_LARGE_PATTERNS):
|
if any(p in error_msg for p in _IMAGE_TOO_LARGE_PATTERNS):
|
||||||
return result_fn(
|
return result_fn(
|
||||||
|
|||||||
93
run_agent.py
93
run_agent.py
@@ -3357,6 +3357,25 @@ class AIAgent:
|
|||||||
return content
|
return content
|
||||||
|
|
||||||
if self._model_supports_vision():
|
if self._model_supports_vision():
|
||||||
|
# Vision-capable on paper — but if we've already learned in this
|
||||||
|
# session that the active (provider, model) rejects list-type
|
||||||
|
# tool content (e.g. Xiaomi MiMo's 400 "text is not set"),
|
||||||
|
# short-circuit to a text summary so we don't burn another
|
||||||
|
# round-trip relearning the same lesson. Cache populated by
|
||||||
|
# the 400 recovery path in agent.conversation_loop. Transient
|
||||||
|
# per-session; next session retries.
|
||||||
|
key = (
|
||||||
|
(getattr(self, "provider", "") or "").strip().lower(),
|
||||||
|
(getattr(self, "model", "") or "").strip(),
|
||||||
|
)
|
||||||
|
no_list = getattr(self, "_no_list_tool_content_models", None)
|
||||||
|
if no_list and key in no_list:
|
||||||
|
logger.debug(
|
||||||
|
"Tool %s: model %s/%s known to reject list-type tool "
|
||||||
|
"content this session — sending text summary",
|
||||||
|
tool_name, key[0], key[1],
|
||||||
|
)
|
||||||
|
return _multimodal_text_summary(result)
|
||||||
return content
|
return content
|
||||||
|
|
||||||
summary = _multimodal_text_summary(result)
|
summary = _multimodal_text_summary(result)
|
||||||
@@ -3385,6 +3404,80 @@ class AIAgent:
|
|||||||
from agent.conversation_compression import try_shrink_image_parts_in_messages
|
from agent.conversation_compression import try_shrink_image_parts_in_messages
|
||||||
return try_shrink_image_parts_in_messages(api_messages)
|
return try_shrink_image_parts_in_messages(api_messages)
|
||||||
|
|
||||||
|
def _try_strip_image_parts_from_tool_messages(self, api_messages: list) -> bool:
|
||||||
|
"""Downgrade list-type tool messages to text summaries in-place.
|
||||||
|
|
||||||
|
Recovery path for providers that reject list-type tool message content
|
||||||
|
(e.g. Xiaomi MiMo's 400 "text is not set"; see issue #27344). Walks
|
||||||
|
``api_messages`` for any ``role: "tool"`` message whose ``content`` is
|
||||||
|
a list containing image parts, replaces the content with the existing
|
||||||
|
text part(s) (or a minimal placeholder if none survive), and records
|
||||||
|
the active (provider, model) in ``self._no_list_tool_content_models``
|
||||||
|
so subsequent ``_tool_result_content_for_active_model`` calls in this
|
||||||
|
session preemptively downgrade screenshots without a round-trip.
|
||||||
|
|
||||||
|
Returns True when at least one tool message was downgraded — the
|
||||||
|
caller (the 400 recovery branch in ``agent.conversation_loop``) uses
|
||||||
|
this to decide whether to retry the API call with the modified
|
||||||
|
history or surface the original error.
|
||||||
|
"""
|
||||||
|
if not isinstance(api_messages, list):
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Record (provider, model) so we don't relearn this lesson.
|
||||||
|
key = (
|
||||||
|
(getattr(self, "provider", "") or "").strip().lower(),
|
||||||
|
(getattr(self, "model", "") or "").strip(),
|
||||||
|
)
|
||||||
|
if not hasattr(self, "_no_list_tool_content_models"):
|
||||||
|
self._no_list_tool_content_models = set()
|
||||||
|
if key[1]: # only record when we actually have a model id
|
||||||
|
self._no_list_tool_content_models.add(key)
|
||||||
|
|
||||||
|
changed = False
|
||||||
|
for msg in api_messages:
|
||||||
|
if not isinstance(msg, dict) or msg.get("role") != "tool":
|
||||||
|
continue
|
||||||
|
content = msg.get("content")
|
||||||
|
if not isinstance(content, list):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Salvage any text parts so the model still sees some signal.
|
||||||
|
text_parts: List[str] = []
|
||||||
|
had_image = False
|
||||||
|
for part in content:
|
||||||
|
if not isinstance(part, dict):
|
||||||
|
if isinstance(part, str) and part.strip():
|
||||||
|
text_parts.append(part.strip())
|
||||||
|
continue
|
||||||
|
ptype = part.get("type")
|
||||||
|
if ptype == "image_url" or ptype == "input_image":
|
||||||
|
had_image = True
|
||||||
|
continue
|
||||||
|
if ptype in {"text", "input_text"}:
|
||||||
|
text = str(part.get("text") or "").strip()
|
||||||
|
if text:
|
||||||
|
text_parts.append(text)
|
||||||
|
|
||||||
|
if not had_image:
|
||||||
|
# List-type content but no image parts — leave alone (some
|
||||||
|
# providers reject ANY list content, but stripping a
|
||||||
|
# text-only list doesn't reduce ambiguity; let the caller
|
||||||
|
# surface the original error if this turns out to be the
|
||||||
|
# case).
|
||||||
|
continue
|
||||||
|
|
||||||
|
if text_parts:
|
||||||
|
msg["content"] = "\n\n".join(text_parts)
|
||||||
|
else:
|
||||||
|
msg["content"] = (
|
||||||
|
"[image content removed — provider does not accept "
|
||||||
|
"list-type tool message content]"
|
||||||
|
)
|
||||||
|
changed = True
|
||||||
|
|
||||||
|
return changed
|
||||||
|
|
||||||
def _anthropic_preserve_dots(self) -> bool:
|
def _anthropic_preserve_dots(self) -> bool:
|
||||||
"""True when using an anthropic-compatible endpoint that preserves dots in model names.
|
"""True when using an anthropic-compatible endpoint that preserves dots in model names.
|
||||||
Alibaba/DashScope keeps dots (e.g. qwen3.5-plus).
|
Alibaba/DashScope keeps dots (e.g. qwen3.5-plus).
|
||||||
|
|||||||
@@ -56,6 +56,7 @@ class TestFailoverReason:
|
|||||||
"overloaded", "server_error", "timeout",
|
"overloaded", "server_error", "timeout",
|
||||||
"context_overflow", "payload_too_large", "image_too_large",
|
"context_overflow", "payload_too_large", "image_too_large",
|
||||||
"model_not_found", "format_error",
|
"model_not_found", "format_error",
|
||||||
|
"multimodal_tool_content_unsupported",
|
||||||
"provider_policy_blocked",
|
"provider_policy_blocked",
|
||||||
"thinking_signature", "long_context_tier",
|
"thinking_signature", "long_context_tier",
|
||||||
"oauth_long_context_beta_forbidden",
|
"oauth_long_context_beta_forbidden",
|
||||||
@@ -1256,3 +1257,66 @@ class TestRateLimitErrorWithoutStatusCode:
|
|||||||
e.status_code = None
|
e.status_code = None
|
||||||
result = classify_api_error(e, provider="copilot", model="gpt-4o")
|
result = classify_api_error(e, provider="copilot", model="gpt-4o")
|
||||||
assert result.reason != FailoverReason.rate_limit
|
assert result.reason != FailoverReason.rate_limit
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ── Test: multimodal_tool_content_unsupported pattern ───────────────────
|
||||||
|
|
||||||
|
class TestMultimodalToolContentUnsupported:
|
||||||
|
"""Issue #27344 — providers that reject list-type tool message content
|
||||||
|
should be classified as ``multimodal_tool_content_unsupported`` so the
|
||||||
|
retry loop can downgrade screenshots to text and try again.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def test_xiaomi_mimo_text_is_not_set_pattern(self):
|
||||||
|
"""The actual Xiaomi MiMo 400 wording from the bug report."""
|
||||||
|
e = MockAPIError(
|
||||||
|
"Error code: 400 - {'error': {'code': '400', 'message': 'Param Incorrect', 'param': 'text is not set', 'type': ''}}",
|
||||||
|
status_code=400,
|
||||||
|
)
|
||||||
|
result = classify_api_error(e, provider="xiaomi", model="mimo-v2.5")
|
||||||
|
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||||
|
assert result.retryable is True
|
||||||
|
|
||||||
|
def test_generic_tool_message_must_be_string(self):
|
||||||
|
e = MockAPIError(
|
||||||
|
"tool message content must be a string",
|
||||||
|
status_code=400,
|
||||||
|
)
|
||||||
|
result = classify_api_error(e, provider="custom", model="some-model")
|
||||||
|
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||||
|
|
||||||
|
def test_expected_string_got_list(self):
|
||||||
|
e = MockAPIError(
|
||||||
|
"Schema validation failed: expected string, got list",
|
||||||
|
status_code=400,
|
||||||
|
)
|
||||||
|
result = classify_api_error(e, provider="custom", model="some-model")
|
||||||
|
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||||
|
|
||||||
|
def test_multimodal_tool_content_takes_priority_over_context_overflow(self):
|
||||||
|
"""Some providers return a 400 whose message contains BOTH
|
||||||
|
'text is not set' and a length-shaped phrase; the tool-content
|
||||||
|
recovery is cheaper than compression so it must win the priority.
|
||||||
|
"""
|
||||||
|
e = MockAPIError(
|
||||||
|
"text is not set; context length exceeded",
|
||||||
|
status_code=400,
|
||||||
|
)
|
||||||
|
result = classify_api_error(e, provider="xiaomi", model="mimo-v2.5")
|
||||||
|
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||||
|
|
||||||
|
def test_no_status_code_path_also_classifies(self):
|
||||||
|
"""When the error reaches us without a status code (transport
|
||||||
|
layer ate it) the message-only classifier branch must also
|
||||||
|
recognise the pattern.
|
||||||
|
"""
|
||||||
|
e = MockTransportError("tool_call.content must be string")
|
||||||
|
result = classify_api_error(e, provider="alibaba", model="qwen3.5-plus")
|
||||||
|
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||||
|
|
||||||
|
def test_unrelated_400_is_not_misclassified(self):
|
||||||
|
"""Make sure the patterns don't false-positive on normal 400s."""
|
||||||
|
e = MockAPIError("bad request: missing field 'model'", status_code=400)
|
||||||
|
result = classify_api_error(e, provider="openrouter", model="anthropic/claude-sonnet-4")
|
||||||
|
assert result.reason != FailoverReason.multimodal_tool_content_unsupported
|
||||||
|
|||||||
260
tests/run_agent/test_multimodal_tool_content_recovery.py
Normal file
260
tests/run_agent/test_multimodal_tool_content_recovery.py
Normal file
@@ -0,0 +1,260 @@
|
|||||||
|
"""Tests for reactive multimodal-tool-content recovery.
|
||||||
|
|
||||||
|
Covers the full chain for providers that reject list-type content in
|
||||||
|
``role: "tool"`` messages (Xiaomi MiMo's 400 "text is not set", etc.):
|
||||||
|
|
||||||
|
1. agent/error_classifier.py: 400 with the right wording classifies as
|
||||||
|
``FailoverReason.multimodal_tool_content_unsupported``.
|
||||||
|
2. run_agent._try_strip_image_parts_from_tool_messages downgrades tool
|
||||||
|
messages whose ``content`` is a list-with-image to a string text
|
||||||
|
summary, in-place, and records the active (provider, model) in
|
||||||
|
``self._no_list_tool_content_models`` so future tool results in this
|
||||||
|
session preemptively downgrade.
|
||||||
|
3. run_agent._tool_result_content_for_active_model short-circuits to a
|
||||||
|
text summary when the (provider, model) is in the cache, even though
|
||||||
|
``_model_supports_vision`` returns True — avoiding a wasted round
|
||||||
|
trip on every subsequent screenshot in the session.
|
||||||
|
|
||||||
|
The end-to-end retry loop wiring (`conversation_loop.py`) is exercised by
|
||||||
|
the classifier signal + helper-mutation tests; the integration only adds
|
||||||
|
a trivial flag-and-continue around the existing pattern used for
|
||||||
|
``image_too_large`` recovery.
|
||||||
|
|
||||||
|
See: https://github.com/NousResearch/hermes-agent/issues/27344
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from agent.error_classifier import FailoverReason, classify_api_error
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeApiError(Exception):
|
||||||
|
"""Stand-in for an openai.BadRequestError with status_code + body."""
|
||||||
|
|
||||||
|
def __init__(self, status_code: int, message: str, body: dict | None = None):
|
||||||
|
super().__init__(message)
|
||||||
|
self.status_code = status_code
|
||||||
|
self.body = body or {"error": {"message": message}}
|
||||||
|
self.response = None
|
||||||
|
|
||||||
|
|
||||||
|
def _make_agent(provider: str = "xiaomi", model: str = "mimo-v2.5"):
|
||||||
|
"""Build a bare AIAgent for method-level testing, no provider setup."""
|
||||||
|
from run_agent import AIAgent
|
||||||
|
agent = object.__new__(AIAgent)
|
||||||
|
agent.provider = provider
|
||||||
|
agent.model = model
|
||||||
|
return agent
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Strip helper ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestStripImagePartsHelper:
|
||||||
|
def test_no_messages_returns_false(self):
|
||||||
|
agent = _make_agent()
|
||||||
|
assert agent._try_strip_image_parts_from_tool_messages([]) is False
|
||||||
|
assert agent._try_strip_image_parts_from_tool_messages(None) is False
|
||||||
|
|
||||||
|
def test_no_tool_messages_returns_false(self):
|
||||||
|
agent = _make_agent()
|
||||||
|
msgs = [
|
||||||
|
{"role": "user", "content": "plain text"},
|
||||||
|
{"role": "assistant", "content": "ack"},
|
||||||
|
]
|
||||||
|
assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
|
||||||
|
|
||||||
|
def test_tool_message_with_string_content_unchanged(self):
|
||||||
|
agent = _make_agent()
|
||||||
|
msgs = [
|
||||||
|
{"role": "tool", "tool_call_id": "x", "content": "plain string result"},
|
||||||
|
]
|
||||||
|
assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
|
||||||
|
assert msgs[0]["content"] == "plain string result"
|
||||||
|
|
||||||
|
def test_tool_message_list_without_image_unchanged(self):
|
||||||
|
"""List content with only text parts is left alone — caller surfaces
|
||||||
|
the original error if this turns out to also be rejected."""
|
||||||
|
agent = _make_agent()
|
||||||
|
msgs = [
|
||||||
|
{"role": "tool", "tool_call_id": "x", "content": [
|
||||||
|
{"type": "text", "text": "hello"},
|
||||||
|
]},
|
||||||
|
]
|
||||||
|
assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
|
||||||
|
|
||||||
|
def test_tool_message_list_with_image_downgrades(self):
|
||||||
|
agent = _make_agent()
|
||||||
|
msgs = [
|
||||||
|
{"role": "tool", "tool_call_id": "x", "content": [
|
||||||
|
{"type": "text", "text": "AX summary: 5 buttons visible"},
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}},
|
||||||
|
]},
|
||||||
|
]
|
||||||
|
assert agent._try_strip_image_parts_from_tool_messages(msgs) is True
|
||||||
|
# Image stripped; text preserved as a string.
|
||||||
|
assert isinstance(msgs[0]["content"], str)
|
||||||
|
assert "AX summary" in msgs[0]["content"]
|
||||||
|
assert "image_url" not in msgs[0]["content"]
|
||||||
|
assert "iVBOR" not in msgs[0]["content"]
|
||||||
|
|
||||||
|
def test_tool_message_image_only_gets_placeholder(self):
|
||||||
|
"""If the list had nothing but image parts, leave a placeholder so
|
||||||
|
the assistant message has something to reference."""
|
||||||
|
agent = _make_agent()
|
||||||
|
msgs = [
|
||||||
|
{"role": "tool", "tool_call_id": "x", "content": [
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}},
|
||||||
|
]},
|
||||||
|
]
|
||||||
|
assert agent._try_strip_image_parts_from_tool_messages(msgs) is True
|
||||||
|
assert isinstance(msgs[0]["content"], str)
|
||||||
|
assert "image content removed" in msgs[0]["content"]
|
||||||
|
|
||||||
|
def test_records_provider_model_in_session_cache(self):
|
||||||
|
agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
|
||||||
|
msgs = [
|
||||||
|
{"role": "tool", "tool_call_id": "x", "content": [
|
||||||
|
{"type": "text", "text": "summary"},
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
|
||||||
|
]},
|
||||||
|
]
|
||||||
|
agent._try_strip_image_parts_from_tool_messages(msgs)
|
||||||
|
assert ("xiaomi", "mimo-v2.5") in agent._no_list_tool_content_models
|
||||||
|
|
||||||
|
def test_only_tool_messages_get_downgraded(self):
|
||||||
|
"""User / assistant messages with list-type content are out of
|
||||||
|
scope — they're handled by the existing image-routing path."""
|
||||||
|
agent = _make_agent()
|
||||||
|
msgs = [
|
||||||
|
{"role": "user", "content": [
|
||||||
|
{"type": "text", "text": "describe"},
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
|
||||||
|
]},
|
||||||
|
{"role": "tool", "tool_call_id": "x", "content": [
|
||||||
|
{"type": "text", "text": "summary"},
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,Y"}},
|
||||||
|
]},
|
||||||
|
]
|
||||||
|
agent._try_strip_image_parts_from_tool_messages(msgs)
|
||||||
|
# User message untouched.
|
||||||
|
assert isinstance(msgs[0]["content"], list)
|
||||||
|
assert any(p.get("type") == "image_url" for p in msgs[0]["content"])
|
||||||
|
# Tool message downgraded.
|
||||||
|
assert isinstance(msgs[1]["content"], str)
|
||||||
|
assert "summary" in msgs[1]["content"]
|
||||||
|
|
||||||
|
def test_skips_recording_when_no_model_id(self):
|
||||||
|
"""Don't poison the cache with empty keys when provider/model is
|
||||||
|
unset (e.g. lazy-initialised mid-handshake)."""
|
||||||
|
agent = _make_agent(provider="", model="")
|
||||||
|
msgs = [
|
||||||
|
{"role": "tool", "tool_call_id": "x", "content": [
|
||||||
|
{"type": "text", "text": "summary"},
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
|
||||||
|
]},
|
||||||
|
]
|
||||||
|
agent._try_strip_image_parts_from_tool_messages(msgs)
|
||||||
|
assert agent._no_list_tool_content_models == set()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Short-circuit on cached models ──────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestToolResultContentShortCircuit:
|
||||||
|
"""Once the session has learned that (provider, model) rejects list
|
||||||
|
content, ``_tool_result_content_for_active_model`` returns a text
|
||||||
|
summary even though ``_model_supports_vision`` reports True.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _multimodal_result(self, png_b64: str = "iVBORw0KGgoAAAA"):
|
||||||
|
return {
|
||||||
|
"_multimodal": True,
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "capture mode=som 800x600 app=Safari"},
|
||||||
|
{"type": "image_url",
|
||||||
|
"image_url": {"url": f"data:image/png;base64,{png_b64}"}},
|
||||||
|
],
|
||||||
|
"text_summary": "capture mode=som 800x600 app=Safari",
|
||||||
|
"meta": {"mode": "som", "width": 800, "height": 600, "elements": 5,
|
||||||
|
"png_bytes": 1024},
|
||||||
|
}
|
||||||
|
|
||||||
|
def test_returns_list_when_cache_empty_and_vision_supported(self, monkeypatch):
|
||||||
|
agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
|
||||||
|
agent._no_list_tool_content_models = set() # explicit empty
|
||||||
|
monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
|
||||||
|
out = agent._tool_result_content_for_active_model(
|
||||||
|
"computer_use", self._multimodal_result()
|
||||||
|
)
|
||||||
|
# Native multimodal path: returns the content parts list.
|
||||||
|
assert isinstance(out, list)
|
||||||
|
assert any(p.get("type") == "image_url" for p in out)
|
||||||
|
|
||||||
|
def test_returns_text_summary_when_model_in_cache(self, monkeypatch):
|
||||||
|
agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
|
||||||
|
agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")}
|
||||||
|
monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
|
||||||
|
out = agent._tool_result_content_for_active_model(
|
||||||
|
"computer_use", self._multimodal_result()
|
||||||
|
)
|
||||||
|
# Short-circuit: a plain string summary, no image_url present.
|
||||||
|
assert isinstance(out, str)
|
||||||
|
assert "data:image" not in out
|
||||||
|
assert "image_url" not in out
|
||||||
|
|
||||||
|
def test_cache_miss_on_different_model(self, monkeypatch):
|
||||||
|
"""Cache is per (provider, model). A cached entry for mimo-v2.5
|
||||||
|
must NOT affect a session running on a different model.
|
||||||
|
"""
|
||||||
|
agent = _make_agent(provider="xiaomi", model="mimo-v2.5-pro")
|
||||||
|
agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")}
|
||||||
|
monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
|
||||||
|
out = agent._tool_result_content_for_active_model(
|
||||||
|
"computer_use", self._multimodal_result()
|
||||||
|
)
|
||||||
|
assert isinstance(out, list)
|
||||||
|
|
||||||
|
def test_missing_cache_attribute_falls_through(self, monkeypatch):
|
||||||
|
"""Tests that build agents via ``object.__new__`` without calling
|
||||||
|
``__init__`` must not crash — the cache attribute may be absent.
|
||||||
|
"""
|
||||||
|
agent = _make_agent()
|
||||||
|
# Deliberately do not assign _no_list_tool_content_models.
|
||||||
|
monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
|
||||||
|
out = agent._tool_result_content_for_active_model(
|
||||||
|
"computer_use", self._multimodal_result()
|
||||||
|
)
|
||||||
|
assert isinstance(out, list)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Classifier ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestRecoveryEndToEndClassification:
|
||||||
|
"""Lock in that the patterns used by the recovery path classify to
|
||||||
|
the right ``FailoverReason``. (The recovery hook in
|
||||||
|
``agent.conversation_loop`` consumes this reason directly.)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def test_xiaomi_mimo_classifies(self):
|
||||||
|
err = _FakeApiError(
|
||||||
|
status_code=400,
|
||||||
|
message=(
|
||||||
|
"Error code: 400 - {'error': {'code': '400', 'message': "
|
||||||
|
"'Param Incorrect', 'param': 'text is not set', 'type': ''}}"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
result = classify_api_error(err, provider="xiaomi", model="mimo-v2.5")
|
||||||
|
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||||
|
assert result.retryable is True
|
||||||
|
|
||||||
|
def test_alibaba_variant_classifies(self):
|
||||||
|
err = _FakeApiError(
|
||||||
|
status_code=400,
|
||||||
|
message="tool_call.content must be string",
|
||||||
|
)
|
||||||
|
result = classify_api_error(err, provider="alibaba", model="qwen3.5-plus")
|
||||||
|
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
||||||
Reference in New Issue
Block a user