diff --git a/agent/message_sanitization.py b/agent/message_sanitization.py new file mode 100644 index 000000000..ff53d247a --- /dev/null +++ b/agent/message_sanitization.py @@ -0,0 +1,444 @@ +"""Message and tool-payload sanitization helpers. + +Pure functions extracted from ``run_agent.py`` so the AIAgent module can +stay focused on the conversation loop. These walk OpenAI-format message +lists and structured payloads, repairing or stripping problematic +characters that would otherwise crash ``json.dumps`` inside the OpenAI +SDK or be rejected by upstream APIs. + +All helpers are stateless and side-effect-free except for in-place +mutation of their input (where documented). Backward-compatible +re-exports from ``run_agent`` remain in place so existing imports +``from run_agent import _sanitize_surrogates`` keep working. +""" + +from __future__ import annotations + +import json +import logging +import re +from typing import Any + +logger = logging.getLogger(__name__) + +# Lone surrogate code points are invalid in UTF-8 and crash json.dumps +# inside the OpenAI SDK. Used by every surrogate-sanitization helper +# below as well as by run_agent and the CLI for paste-from-clipboard +# scrubbing. +_SURROGATE_RE = re.compile(r'[\ud800-\udfff]') + + +def _sanitize_surrogates(text: str) -> str: + """Replace lone surrogate code points with U+FFFD (replacement character). + + Surrogates are invalid in UTF-8 and will crash ``json.dumps()`` inside the + OpenAI SDK. This is a fast no-op when the text contains no surrogates. + """ + if _SURROGATE_RE.search(text): + return _SURROGATE_RE.sub('\ufffd', text) + return text + + +def _sanitize_structure_surrogates(payload: Any) -> bool: + """Replace surrogate code points in nested dict/list payloads in-place. + + Mirror of ``_sanitize_structure_non_ascii`` but for surrogate recovery. + Used to scrub nested structured fields (e.g. ``reasoning_details`` — an + array of dicts with ``summary``/``text`` strings) that flat per-field + checks don't reach. Returns True if any surrogates were replaced. + """ + found = False + + def _walk(node): + nonlocal found + if isinstance(node, dict): + for key, value in node.items(): + if isinstance(value, str): + if _SURROGATE_RE.search(value): + node[key] = _SURROGATE_RE.sub('\ufffd', value) + found = True + elif isinstance(value, (dict, list)): + _walk(value) + elif isinstance(node, list): + for idx, value in enumerate(node): + if isinstance(value, str): + if _SURROGATE_RE.search(value): + node[idx] = _SURROGATE_RE.sub('\ufffd', value) + found = True + elif isinstance(value, (dict, list)): + _walk(value) + + _walk(payload) + return found + + +def _sanitize_messages_surrogates(messages: list) -> bool: + """Sanitize surrogate characters from all string content in a messages list. + + Walks message dicts in-place. Returns True if any surrogates were found + and replaced, False otherwise. Covers content/text, name, tool call + metadata/arguments, AND any additional string or nested structured fields + (``reasoning``, ``reasoning_content``, ``reasoning_details``, etc.) so + retries don't fail on a non-content field. Byte-level reasoning models + (xiaomi/mimo, kimi, glm) can emit lone surrogates in reasoning output + that flow through to ``api_messages["reasoning_content"]`` on the next + turn and crash json.dumps inside the OpenAI SDK. + """ + found = False + for msg in messages: + if not isinstance(msg, dict): + continue + content = msg.get("content") + if isinstance(content, str) and _SURROGATE_RE.search(content): + msg["content"] = _SURROGATE_RE.sub('\ufffd', content) + found = True + elif isinstance(content, list): + for part in content: + if isinstance(part, dict): + text = part.get("text") + if isinstance(text, str) and _SURROGATE_RE.search(text): + part["text"] = _SURROGATE_RE.sub('\ufffd', text) + found = True + name = msg.get("name") + if isinstance(name, str) and _SURROGATE_RE.search(name): + msg["name"] = _SURROGATE_RE.sub('\ufffd', name) + found = True + tool_calls = msg.get("tool_calls") + if isinstance(tool_calls, list): + for tc in tool_calls: + if not isinstance(tc, dict): + continue + tc_id = tc.get("id") + if isinstance(tc_id, str) and _SURROGATE_RE.search(tc_id): + tc["id"] = _SURROGATE_RE.sub('\ufffd', tc_id) + found = True + fn = tc.get("function") + if isinstance(fn, dict): + fn_name = fn.get("name") + if isinstance(fn_name, str) and _SURROGATE_RE.search(fn_name): + fn["name"] = _SURROGATE_RE.sub('\ufffd', fn_name) + found = True + fn_args = fn.get("arguments") + if isinstance(fn_args, str) and _SURROGATE_RE.search(fn_args): + fn["arguments"] = _SURROGATE_RE.sub('\ufffd', fn_args) + found = True + # Walk any additional string / nested fields (reasoning, + # reasoning_content, reasoning_details, etc.) — surrogates from + # byte-level reasoning models (xiaomi/mimo, kimi, glm) can lurk + # in these fields and aren't covered by the per-field checks above. + # Matches _sanitize_messages_non_ascii's coverage (PR #10537). + for key, value in msg.items(): + if key in {"content", "name", "tool_calls", "role"}: + continue + if isinstance(value, str): + if _SURROGATE_RE.search(value): + msg[key] = _SURROGATE_RE.sub('\ufffd', value) + found = True + elif isinstance(value, (dict, list)): + if _sanitize_structure_surrogates(value): + found = True + return found + + +def _escape_invalid_chars_in_json_strings(raw: str) -> str: + """Escape unescaped control chars inside JSON string values. + + Walks the raw JSON character-by-character, tracking whether we are + inside a double-quoted string. Inside strings, replaces literal + control characters (0x00-0x1F) that aren't already part of an escape + sequence with their ``\\uXXXX`` equivalents. Pass-through for everything + else. + + Ported from #12093 — complements the other repair passes in + ``_repair_tool_call_arguments`` when ``json.loads(strict=False)`` is + not enough (e.g. llama.cpp backends that emit literal apostrophes or + tabs alongside other malformations). + """ + out: list[str] = [] + in_string = False + i = 0 + n = len(raw) + while i < n: + ch = raw[i] + if in_string: + if ch == "\\" and i + 1 < n: + # Already-escaped char — pass through as-is + out.append(ch) + out.append(raw[i + 1]) + i += 2 + continue + if ch == '"': + in_string = False + out.append(ch) + elif ord(ch) < 0x20: + out.append(f"\\u{ord(ch):04x}") + else: + out.append(ch) + else: + if ch == '"': + in_string = True + out.append(ch) + i += 1 + return "".join(out) + + +def _repair_tool_call_arguments(raw_args: str, tool_name: str = "?") -> str: + """Attempt to repair malformed tool_call argument JSON. + + Models like GLM-5.1 via Ollama can produce truncated JSON, trailing + commas, Python ``None``, etc. The API proxy rejects these with HTTP 400 + "invalid tool call arguments". This function applies common repairs; + if all fail it returns ``"{}"`` so the request succeeds (better than + crashing the session). All repairs are logged at WARNING level. + """ + raw_stripped = raw_args.strip() if isinstance(raw_args, str) else "" + + # Fast-path: empty / whitespace-only -> empty object + if not raw_stripped: + logger.warning("Sanitized empty tool_call arguments for %s", tool_name) + return "{}" + + # Python-literal None -> normalise to {} + if raw_stripped == "None": + logger.warning("Sanitized Python-None tool_call arguments for %s", tool_name) + return "{}" + + # Repair pass 0: llama.cpp backends sometimes emit literal control + # characters (tabs, newlines) inside JSON string values. json.loads + # with strict=False accepts these and lets us re-serialise the + # result into wire-valid JSON without any string surgery. This is + # the most common local-model repair case (#12068). + try: + parsed = json.loads(raw_stripped, strict=False) + reserialised = json.dumps(parsed, separators=(",", ":")) + if reserialised != raw_stripped: + logger.warning( + "Repaired unescaped control chars in tool_call arguments for %s", + tool_name, + ) + return reserialised + except (json.JSONDecodeError, TypeError, ValueError): + pass + + # Attempt common JSON repairs + fixed = raw_stripped + # 1. Strip trailing commas before } or ] + fixed = re.sub(r',\s*([}\]])', r'\1', fixed) + # 2. Close unclosed structures + open_curly = fixed.count('{') - fixed.count('}') + open_bracket = fixed.count('[') - fixed.count(']') + if open_curly > 0: + fixed += '}' * open_curly + if open_bracket > 0: + fixed += ']' * open_bracket + # 3. Remove excess closing braces/brackets (bounded to 50 iterations) + for _ in range(50): + try: + json.loads(fixed) + break + except json.JSONDecodeError: + if fixed.endswith('}') and fixed.count('}') > fixed.count('{'): + fixed = fixed[:-1] + elif fixed.endswith(']') and fixed.count(']') > fixed.count('['): + fixed = fixed[:-1] + else: + break + + try: + json.loads(fixed) + logger.warning( + "Repaired malformed tool_call arguments for %s: %s → %s", + tool_name, raw_stripped[:80], fixed[:80], + ) + return fixed + except json.JSONDecodeError: + pass + + # Repair pass 4: escape unescaped control chars inside JSON strings, + # then retry. Catches cases where strict=False alone fails because + # other malformations are present too. + try: + escaped = _escape_invalid_chars_in_json_strings(fixed) + if escaped != fixed: + json.loads(escaped) + logger.warning( + "Repaired control-char-laced tool_call arguments for %s: %s → %s", + tool_name, raw_stripped[:80], escaped[:80], + ) + return escaped + except (json.JSONDecodeError, TypeError, ValueError): + pass + + # Last resort: replace with empty object so the API request doesn't + # crash the entire session. + logger.warning( + "Unrepairable tool_call arguments for %s — " + "replaced with empty object (was: %s)", + tool_name, raw_stripped[:80], + ) + return "{}" + + +def _strip_non_ascii(text: str) -> str: + """Remove non-ASCII characters, replacing with closest ASCII equivalent or removing. + + Used as a last resort when the system encoding is ASCII and can't handle + any non-ASCII characters (e.g. LANG=C on Chromebooks). + """ + return text.encode('ascii', errors='ignore').decode('ascii') + + +def _sanitize_messages_non_ascii(messages: list) -> bool: + """Strip non-ASCII characters from all string content in a messages list. + + This is a last-resort recovery for systems with ASCII-only encoding + (LANG=C, Chromebooks, minimal containers). Returns True if any + non-ASCII content was found and sanitized. + """ + found = False + for msg in messages: + if not isinstance(msg, dict): + continue + # Sanitize content (string) + content = msg.get("content") + if isinstance(content, str): + sanitized = _strip_non_ascii(content) + if sanitized != content: + msg["content"] = sanitized + found = True + elif isinstance(content, list): + for part in content: + if isinstance(part, dict): + text = part.get("text") + if isinstance(text, str): + sanitized = _strip_non_ascii(text) + if sanitized != text: + part["text"] = sanitized + found = True + # Sanitize name field (can contain non-ASCII in tool results) + name = msg.get("name") + if isinstance(name, str): + sanitized = _strip_non_ascii(name) + if sanitized != name: + msg["name"] = sanitized + found = True + # Sanitize tool_calls + tool_calls = msg.get("tool_calls") + if isinstance(tool_calls, list): + for tc in tool_calls: + if isinstance(tc, dict): + fn = tc.get("function", {}) + if isinstance(fn, dict): + fn_args = fn.get("arguments") + if isinstance(fn_args, str): + sanitized = _strip_non_ascii(fn_args) + if sanitized != fn_args: + fn["arguments"] = sanitized + found = True + # Sanitize any additional top-level string fields (e.g. reasoning_content) + for key, value in msg.items(): + if key in {"content", "name", "tool_calls", "role"}: + continue + if isinstance(value, str): + sanitized = _strip_non_ascii(value) + if sanitized != value: + msg[key] = sanitized + found = True + return found + + +def _sanitize_tools_non_ascii(tools: list) -> bool: + """Strip non-ASCII characters from tool payloads in-place.""" + return _sanitize_structure_non_ascii(tools) + + +def _strip_images_from_messages(messages: list) -> bool: + """Remove image_url content parts from all messages in-place. + + Called when a server signals it does not support images (e.g. + "Only 'text' content type is supported."). Mutates messages so the + next API call sends text only. + + Preserves message alternation invariants: + * ``tool``-role messages whose content was entirely images are replaced + with a plaintext placeholder, NOT deleted — deleting them would leave + the paired ``tool_call_id`` on the prior assistant message unmatched, + which providers reject with HTTP 400. + * Non-tool messages whose content becomes empty are dropped. In + practice this only hits synthetic image-only user messages appended + for attachment delivery; real user turns always include text. + + Returns True if any image parts were removed. + """ + found = False + to_delete = [] + for i, msg in enumerate(messages): + if not isinstance(msg, dict): + continue + content = msg.get("content") + if not isinstance(content, list): + continue + new_parts = [] + for part in content: + if isinstance(part, dict) and part.get("type") in {"image_url", "image", "input_image"}: + found = True + else: + new_parts.append(part) + if len(new_parts) < len(content): + if new_parts: + msg["content"] = new_parts + elif msg.get("role") == "tool": + # Preserve tool_call_id linkage — providers require every + # assistant tool_call to have a matching tool response. + msg["content"] = "[image content removed — server does not support images]" + else: + # Synthetic image-only user/assistant message with no text; + # safe to drop. + to_delete.append(i) + for i in reversed(to_delete): + del messages[i] + return found + + +def _sanitize_structure_non_ascii(payload: Any) -> bool: + """Strip non-ASCII characters from nested dict/list payloads in-place.""" + found = False + + def _walk(node): + nonlocal found + if isinstance(node, dict): + for key, value in node.items(): + if isinstance(value, str): + sanitized = _strip_non_ascii(value) + if sanitized != value: + node[key] = sanitized + found = True + elif isinstance(value, (dict, list)): + _walk(value) + elif isinstance(node, list): + for idx, value in enumerate(node): + if isinstance(value, str): + sanitized = _strip_non_ascii(value) + if sanitized != value: + node[idx] = sanitized + found = True + elif isinstance(value, (dict, list)): + _walk(value) + + _walk(payload) + return found + + +__all__ = [ + "_SURROGATE_RE", + "_sanitize_surrogates", + "_sanitize_structure_surrogates", + "_sanitize_messages_surrogates", + "_escape_invalid_chars_in_json_strings", + "_repair_tool_call_arguments", + "_strip_non_ascii", + "_sanitize_messages_non_ascii", + "_sanitize_tools_non_ascii", + "_strip_images_from_messages", + "_sanitize_structure_non_ascii", +] diff --git a/run_agent.py b/run_agent.py index 325e1e13e..b4d88f59e 100644 --- a/run_agent.py +++ b/run_agent.py @@ -189,6 +189,19 @@ from agent.trajectory import ( convert_scratchpad_to_think, has_incomplete_scratchpad, save_trajectory as _save_trajectory_to_file, ) +from agent.message_sanitization import ( + _SURROGATE_RE, + _sanitize_surrogates, + _sanitize_structure_surrogates, + _sanitize_messages_surrogates, + _escape_invalid_chars_in_json_strings, + _repair_tool_call_arguments, + _strip_non_ascii, + _sanitize_messages_non_ascii, + _sanitize_tools_non_ascii, + _strip_images_from_messages, + _sanitize_structure_non_ascii, +) from utils import atomic_json_write, base_url_host_matches, base_url_hostname, env_var_enabled, normalize_proxy_url from hermes_cli.config import cfg_get @@ -465,12 +478,6 @@ def _paths_overlap(left: Path, right: Path) -> bool: return left_parts[:common_len] == right_parts[:common_len] - -_SURROGATE_RE = re.compile(r'[\ud800-\udfff]') - - - - def _is_multimodal_tool_result(value: Any) -> bool: """True if the value is a multimodal tool result envelope. @@ -617,414 +624,6 @@ def _trajectory_normalize_msg(msg: Dict[str, Any]) -> Dict[str, Any]: return msg -def _sanitize_surrogates(text: str) -> str: - """Replace lone surrogate code points with U+FFFD (replacement character). - - Surrogates are invalid in UTF-8 and will crash ``json.dumps()`` inside the - OpenAI SDK. This is a fast no-op when the text contains no surrogates. - """ - if _SURROGATE_RE.search(text): - return _SURROGATE_RE.sub('\ufffd', text) - return text - - -# _summarize_user_message_for_log is imported from agent.codex_responses_adapter -# (see import block above). Remains importable from run_agent for backward compat. - - -def _sanitize_structure_surrogates(payload: Any) -> bool: - """Replace surrogate code points in nested dict/list payloads in-place. - - Mirror of ``_sanitize_structure_non_ascii`` but for surrogate recovery. - Used to scrub nested structured fields (e.g. ``reasoning_details`` — an - array of dicts with ``summary``/``text`` strings) that flat per-field - checks don't reach. Returns True if any surrogates were replaced. - """ - found = False - - def _walk(node): - nonlocal found - if isinstance(node, dict): - for key, value in node.items(): - if isinstance(value, str): - if _SURROGATE_RE.search(value): - node[key] = _SURROGATE_RE.sub('\ufffd', value) - found = True - elif isinstance(value, (dict, list)): - _walk(value) - elif isinstance(node, list): - for idx, value in enumerate(node): - if isinstance(value, str): - if _SURROGATE_RE.search(value): - node[idx] = _SURROGATE_RE.sub('\ufffd', value) - found = True - elif isinstance(value, (dict, list)): - _walk(value) - - _walk(payload) - return found - - -def _sanitize_messages_surrogates(messages: list) -> bool: - """Sanitize surrogate characters from all string content in a messages list. - - Walks message dicts in-place. Returns True if any surrogates were found - and replaced, False otherwise. Covers content/text, name, tool call - metadata/arguments, AND any additional string or nested structured fields - (``reasoning``, ``reasoning_content``, ``reasoning_details``, etc.) so - retries don't fail on a non-content field. Byte-level reasoning models - (xiaomi/mimo, kimi, glm) can emit lone surrogates in reasoning output - that flow through to ``api_messages["reasoning_content"]`` on the next - turn and crash json.dumps inside the OpenAI SDK. - """ - found = False - for msg in messages: - if not isinstance(msg, dict): - continue - content = msg.get("content") - if isinstance(content, str) and _SURROGATE_RE.search(content): - msg["content"] = _SURROGATE_RE.sub('\ufffd', content) - found = True - elif isinstance(content, list): - for part in content: - if isinstance(part, dict): - text = part.get("text") - if isinstance(text, str) and _SURROGATE_RE.search(text): - part["text"] = _SURROGATE_RE.sub('\ufffd', text) - found = True - name = msg.get("name") - if isinstance(name, str) and _SURROGATE_RE.search(name): - msg["name"] = _SURROGATE_RE.sub('\ufffd', name) - found = True - tool_calls = msg.get("tool_calls") - if isinstance(tool_calls, list): - for tc in tool_calls: - if not isinstance(tc, dict): - continue - tc_id = tc.get("id") - if isinstance(tc_id, str) and _SURROGATE_RE.search(tc_id): - tc["id"] = _SURROGATE_RE.sub('\ufffd', tc_id) - found = True - fn = tc.get("function") - if isinstance(fn, dict): - fn_name = fn.get("name") - if isinstance(fn_name, str) and _SURROGATE_RE.search(fn_name): - fn["name"] = _SURROGATE_RE.sub('\ufffd', fn_name) - found = True - fn_args = fn.get("arguments") - if isinstance(fn_args, str) and _SURROGATE_RE.search(fn_args): - fn["arguments"] = _SURROGATE_RE.sub('\ufffd', fn_args) - found = True - # Walk any additional string / nested fields (reasoning, - # reasoning_content, reasoning_details, etc.) — surrogates from - # byte-level reasoning models (xiaomi/mimo, kimi, glm) can lurk - # in these fields and aren't covered by the per-field checks above. - # Matches _sanitize_messages_non_ascii's coverage (PR #10537). - for key, value in msg.items(): - if key in {"content", "name", "tool_calls", "role"}: - continue - if isinstance(value, str): - if _SURROGATE_RE.search(value): - msg[key] = _SURROGATE_RE.sub('\ufffd', value) - found = True - elif isinstance(value, (dict, list)): - if _sanitize_structure_surrogates(value): - found = True - return found - - -def _escape_invalid_chars_in_json_strings(raw: str) -> str: - """Escape unescaped control chars inside JSON string values. - - Walks the raw JSON character-by-character, tracking whether we are - inside a double-quoted string. Inside strings, replaces literal - control characters (0x00-0x1F) that aren't already part of an escape - sequence with their ``\\uXXXX`` equivalents. Pass-through for everything - else. - - Ported from #12093 — complements the other repair passes in - ``_repair_tool_call_arguments`` when ``json.loads(strict=False)`` is - not enough (e.g. llama.cpp backends that emit literal apostrophes or - tabs alongside other malformations). - """ - out: list[str] = [] - in_string = False - i = 0 - n = len(raw) - while i < n: - ch = raw[i] - if in_string: - if ch == "\\" and i + 1 < n: - # Already-escaped char — pass through as-is - out.append(ch) - out.append(raw[i + 1]) - i += 2 - continue - if ch == '"': - in_string = False - out.append(ch) - elif ord(ch) < 0x20: - out.append(f"\\u{ord(ch):04x}") - else: - out.append(ch) - else: - if ch == '"': - in_string = True - out.append(ch) - i += 1 - return "".join(out) - - -def _repair_tool_call_arguments(raw_args: str, tool_name: str = "?") -> str: - """Attempt to repair malformed tool_call argument JSON. - - Models like GLM-5.1 via Ollama can produce truncated JSON, trailing - commas, Python ``None``, etc. The API proxy rejects these with HTTP 400 - "invalid tool call arguments". This function applies common repairs; - if all fail it returns ``"{}"`` so the request succeeds (better than - crashing the session). All repairs are logged at WARNING level. - """ - raw_stripped = raw_args.strip() if isinstance(raw_args, str) else "" - - # Fast-path: empty / whitespace-only -> empty object - if not raw_stripped: - logger.warning("Sanitized empty tool_call arguments for %s", tool_name) - return "{}" - - # Python-literal None -> normalise to {} - if raw_stripped == "None": - logger.warning("Sanitized Python-None tool_call arguments for %s", tool_name) - return "{}" - - # Repair pass 0: llama.cpp backends sometimes emit literal control - # characters (tabs, newlines) inside JSON string values. json.loads - # with strict=False accepts these and lets us re-serialise the - # result into wire-valid JSON without any string surgery. This is - # the most common local-model repair case (#12068). - try: - parsed = json.loads(raw_stripped, strict=False) - reserialised = json.dumps(parsed, separators=(",", ":")) - if reserialised != raw_stripped: - logger.warning( - "Repaired unescaped control chars in tool_call arguments for %s", - tool_name, - ) - return reserialised - except (json.JSONDecodeError, TypeError, ValueError): - pass - - # Attempt common JSON repairs - fixed = raw_stripped - # 1. Strip trailing commas before } or ] - fixed = re.sub(r',\s*([}\]])', r'\1', fixed) - # 2. Close unclosed structures - open_curly = fixed.count('{') - fixed.count('}') - open_bracket = fixed.count('[') - fixed.count(']') - if open_curly > 0: - fixed += '}' * open_curly - if open_bracket > 0: - fixed += ']' * open_bracket - # 3. Remove excess closing braces/brackets (bounded to 50 iterations) - for _ in range(50): - try: - json.loads(fixed) - break - except json.JSONDecodeError: - if fixed.endswith('}') and fixed.count('}') > fixed.count('{'): - fixed = fixed[:-1] - elif fixed.endswith(']') and fixed.count(']') > fixed.count('['): - fixed = fixed[:-1] - else: - break - - try: - json.loads(fixed) - logger.warning( - "Repaired malformed tool_call arguments for %s: %s → %s", - tool_name, raw_stripped[:80], fixed[:80], - ) - return fixed - except json.JSONDecodeError: - pass - - # Repair pass 4: escape unescaped control chars inside JSON strings, - # then retry. Catches cases where strict=False alone fails because - # other malformations are present too. - try: - escaped = _escape_invalid_chars_in_json_strings(fixed) - if escaped != fixed: - json.loads(escaped) - logger.warning( - "Repaired control-char-laced tool_call arguments for %s: %s → %s", - tool_name, raw_stripped[:80], escaped[:80], - ) - return escaped - except (json.JSONDecodeError, TypeError, ValueError): - pass - - # Last resort: replace with empty object so the API request doesn't - # crash the entire session. - logger.warning( - "Unrepairable tool_call arguments for %s — " - "replaced with empty object (was: %s)", - tool_name, raw_stripped[:80], - ) - return "{}" - - -def _strip_non_ascii(text: str) -> str: - """Remove non-ASCII characters, replacing with closest ASCII equivalent or removing. - - Used as a last resort when the system encoding is ASCII and can't handle - any non-ASCII characters (e.g. LANG=C on Chromebooks). - """ - return text.encode('ascii', errors='ignore').decode('ascii') - - -def _sanitize_messages_non_ascii(messages: list) -> bool: - """Strip non-ASCII characters from all string content in a messages list. - - This is a last-resort recovery for systems with ASCII-only encoding - (LANG=C, Chromebooks, minimal containers). Returns True if any - non-ASCII content was found and sanitized. - """ - found = False - for msg in messages: - if not isinstance(msg, dict): - continue - # Sanitize content (string) - content = msg.get("content") - if isinstance(content, str): - sanitized = _strip_non_ascii(content) - if sanitized != content: - msg["content"] = sanitized - found = True - elif isinstance(content, list): - for part in content: - if isinstance(part, dict): - text = part.get("text") - if isinstance(text, str): - sanitized = _strip_non_ascii(text) - if sanitized != text: - part["text"] = sanitized - found = True - # Sanitize name field (can contain non-ASCII in tool results) - name = msg.get("name") - if isinstance(name, str): - sanitized = _strip_non_ascii(name) - if sanitized != name: - msg["name"] = sanitized - found = True - # Sanitize tool_calls - tool_calls = msg.get("tool_calls") - if isinstance(tool_calls, list): - for tc in tool_calls: - if isinstance(tc, dict): - fn = tc.get("function", {}) - if isinstance(fn, dict): - fn_args = fn.get("arguments") - if isinstance(fn_args, str): - sanitized = _strip_non_ascii(fn_args) - if sanitized != fn_args: - fn["arguments"] = sanitized - found = True - # Sanitize any additional top-level string fields (e.g. reasoning_content) - for key, value in msg.items(): - if key in {"content", "name", "tool_calls", "role"}: - continue - if isinstance(value, str): - sanitized = _strip_non_ascii(value) - if sanitized != value: - msg[key] = sanitized - found = True - return found - - -def _sanitize_tools_non_ascii(tools: list) -> bool: - """Strip non-ASCII characters from tool payloads in-place.""" - return _sanitize_structure_non_ascii(tools) - - -def _strip_images_from_messages(messages: list) -> bool: - """Remove image_url content parts from all messages in-place. - - Called when a server signals it does not support images (e.g. - "Only 'text' content type is supported."). Mutates messages so the - next API call sends text only. - - Preserves message alternation invariants: - * ``tool``-role messages whose content was entirely images are replaced - with a plaintext placeholder, NOT deleted — deleting them would leave - the paired ``tool_call_id`` on the prior assistant message unmatched, - which providers reject with HTTP 400. - * Non-tool messages whose content becomes empty are dropped. In - practice this only hits synthetic image-only user messages appended - for attachment delivery; real user turns always include text. - - Returns True if any image parts were removed. - """ - found = False - to_delete = [] - for i, msg in enumerate(messages): - if not isinstance(msg, dict): - continue - content = msg.get("content") - if not isinstance(content, list): - continue - new_parts = [] - for part in content: - if isinstance(part, dict) and part.get("type") in {"image_url", "image", "input_image"}: - found = True - else: - new_parts.append(part) - if len(new_parts) < len(content): - if new_parts: - msg["content"] = new_parts - elif msg.get("role") == "tool": - # Preserve tool_call_id linkage — providers require every - # assistant tool_call to have a matching tool response. - msg["content"] = "[image content removed — server does not support images]" - else: - # Synthetic image-only user/assistant message with no text; - # safe to drop. - to_delete.append(i) - for i in reversed(to_delete): - del messages[i] - return found - - -def _sanitize_structure_non_ascii(payload: Any) -> bool: - """Strip non-ASCII characters from nested dict/list payloads in-place.""" - found = False - - def _walk(node): - nonlocal found - if isinstance(node, dict): - for key, value in node.items(): - if isinstance(value, str): - sanitized = _strip_non_ascii(value) - if sanitized != value: - node[key] = sanitized - found = True - elif isinstance(value, (dict, list)): - _walk(value) - elif isinstance(node, list): - for idx, value in enumerate(node): - if isinstance(value, str): - sanitized = _strip_non_ascii(value) - if sanitized != value: - node[idx] = sanitized - found = True - elif isinstance(value, (dict, list)): - _walk(value) - - _walk(payload) - return found - - - - - # ========================================================================= # Large tool result handler — save oversized output to temp file # =========================================================================