fix(computer-use): cap AX elements array to prevent context blowup (#22865)
`computer_use(action='capture', mode='ax')` returned the full AX element list verbatim in the JSON response. Dense Electron / Obsidian / JetBrains UIs publish 500+ AX nodes (one reproduction in #22865 returned 597 elements against Obsidian), so a single capture could consume enough context to trigger compression failures or render the session unusable. The human-readable `_format_elements` summary is already capped at 40 lines, so the truncation gap was invisible to anyone reading the summary output. Add a `max_elements` argument to the tool schema, default 100, that trims the AX `elements` array. When the cap fires, the response surfaces `total_elements` and `truncated_elements` and appends a "raise max_elements or pass app= to narrow" hint to the summary so the model knows the JSON view is partial and can re-issue with a tighter scope. Validation is centralized in `_coerce_max_elements`: missing / non-integer / sub-1 inputs fall back to the default cap, so the protection can never be silently disabled by a malformed tool-call argument. The cap only affects AX-mode JSON; `mode='som'` and `mode='vision'` keep returning a screenshot + image-aware summary unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -75,6 +75,23 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = {
|
||||
"frontmost app's window or the whole screen."
|
||||
),
|
||||
},
|
||||
"max_elements": {
|
||||
"type": "integer",
|
||||
"description": (
|
||||
"Optional cap on the AX `elements` array returned by "
|
||||
"`action='capture'`. Default 100. Dense UIs (Electron "
|
||||
"apps such as Obsidian or VS Code, JetBrains IDEs) can "
|
||||
"publish 500+ AX nodes — capping prevents a single "
|
||||
"capture from blowing session context. When the cap "
|
||||
"trims the response, `total_elements` and "
|
||||
"`truncated_elements` are surfaced in the result so "
|
||||
"you can re-call with `app=` to narrow scope or raise "
|
||||
"`max_elements` when the full tree is required. Has no "
|
||||
"effect on `mode='som'` / `mode='vision'` (those return "
|
||||
"a screenshot, not the elements array)."
|
||||
),
|
||||
"minimum": 1,
|
||||
},
|
||||
# ── click / drag / scroll targeting ────────────────────
|
||||
"element": {
|
||||
"type": "integer",
|
||||
|
||||
@@ -317,7 +317,7 @@ def _dispatch(backend: ComputerUseBackend, action: str, args: Dict[str, Any]) ->
|
||||
if mode not in {"som", "vision", "ax"}:
|
||||
return json.dumps({"error": f"bad mode {mode!r}; use som|vision|ax"})
|
||||
cap = backend.capture(mode=mode, app=args.get("app"))
|
||||
return _capture_response(cap)
|
||||
return _capture_response(cap, max_elements=_coerce_max_elements(args.get("max_elements")))
|
||||
|
||||
if action == "wait":
|
||||
seconds = float(args.get("seconds", 1.0))
|
||||
@@ -416,16 +416,50 @@ def _text_response(res: ActionResult) -> str:
|
||||
return json.dumps(payload)
|
||||
|
||||
|
||||
def _capture_response(cap: CaptureResult) -> Any:
|
||||
# Default cap for the AX `elements` array returned by capture. Dense UIs
|
||||
# (Electron apps, Obsidian, JetBrains IDEs) can publish 500+ AX nodes, which
|
||||
# can exhaust session context after a single capture. The model-facing
|
||||
# `max_elements` argument lets callers raise this when they need the full tree.
|
||||
_DEFAULT_MAX_ELEMENTS = 100
|
||||
|
||||
|
||||
def _coerce_max_elements(value: Any) -> int:
|
||||
"""Validate the caller-supplied ``max_elements``.
|
||||
|
||||
Falls back to :data:`_DEFAULT_MAX_ELEMENTS` for missing / non-integer /
|
||||
sub-1 inputs so the cap can never be silently disabled by a malformed
|
||||
tool-call argument.
|
||||
"""
|
||||
if value is None:
|
||||
return _DEFAULT_MAX_ELEMENTS
|
||||
try:
|
||||
n = int(value)
|
||||
except (TypeError, ValueError):
|
||||
return _DEFAULT_MAX_ELEMENTS
|
||||
if n < 1:
|
||||
return _DEFAULT_MAX_ELEMENTS
|
||||
return n
|
||||
|
||||
|
||||
def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEMENTS) -> Any:
|
||||
total_elements = len(cap.elements)
|
||||
visible_elements = cap.elements[:max_elements]
|
||||
truncated_elements = max(0, total_elements - len(visible_elements))
|
||||
|
||||
element_index = _format_elements(cap.elements)
|
||||
summary_lines = [
|
||||
f"capture mode={cap.mode} {cap.width}x{cap.height}"
|
||||
+ (f" app={cap.app}" if cap.app else "")
|
||||
+ (f" window={cap.window_title!r}" if cap.window_title else ""),
|
||||
f"{len(cap.elements)} interactable element(s):",
|
||||
f"{total_elements} interactable element(s):",
|
||||
]
|
||||
if element_index:
|
||||
summary_lines.extend(element_index)
|
||||
if truncated_elements:
|
||||
summary_lines.append(
|
||||
f" (response truncated to {len(visible_elements)} of {total_elements} elements; "
|
||||
f"raise max_elements or pass app= to narrow)"
|
||||
)
|
||||
summary = "\n".join(summary_lines)
|
||||
|
||||
if cap.png_b64 and cap.mode != "ax":
|
||||
@@ -458,18 +492,22 @@ def _capture_response(cap: CaptureResult) -> Any:
|
||||
],
|
||||
"text_summary": summary,
|
||||
"meta": {"mode": cap.mode, "width": cap.width, "height": cap.height,
|
||||
"elements": len(cap.elements), "png_bytes": cap.png_bytes_len},
|
||||
"elements": total_elements, "png_bytes": cap.png_bytes_len},
|
||||
}
|
||||
# AX-only (or image missing): text path.
|
||||
return json.dumps({
|
||||
payload: Dict[str, Any] = {
|
||||
"mode": cap.mode,
|
||||
"width": cap.width,
|
||||
"height": cap.height,
|
||||
"app": cap.app,
|
||||
"window_title": cap.window_title,
|
||||
"elements": [_element_to_dict(e) for e in cap.elements],
|
||||
"elements": [_element_to_dict(e) for e in visible_elements],
|
||||
"total_elements": total_elements,
|
||||
"summary": summary,
|
||||
})
|
||||
}
|
||||
if truncated_elements:
|
||||
payload["truncated_elements"] = truncated_elements
|
||||
return json.dumps(payload)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user