fix(computer_use): correct type_text MCP tool name and implement drag action
Bug 3: The cua_backend type_text() method called the MCP tool 'type_text_chars', which does not exist in the current cua-driver. Changed it to 'type_text', which is the correct MCP tool name.
Bug 4: The drag() method returned a hardcoded 'not supported' error even though cua-driver exposes a 'drag' MCP tool. Implemented proper drag dispatching with coordinate-based and element-based targeting. Added dispatch-level validation for drag to ensure that from/to coordinates or from/to elements are provided before calling any backend.
Fixes #24170 (bugs 3 and 4).
This commit is contained in:
@@ -497,9 +497,25 @@ class CuaDriverBackend(ComputerUseBackend):
|
||||
button: str = "left",
|
||||
modifiers: Optional[List[str]] = None,
|
||||
) -> ActionResult:
|
||||
# cua-driver does not expose a drag tool.
|
||||
return ActionResult(ok=False, action="drag",
|
||||
message="drag is not supported by the cua-driver backend.")
|
||||
pid = self._active_pid
|
||||
if pid is None:
|
||||
return ActionResult(ok=False, action="drag",
|
||||
message="No active window — call capture() first.")
|
||||
args: Dict[str, Any] = {"pid": pid}
|
||||
if from_element is not None and to_element is not None:
|
||||
if self._active_window_id is None:
|
||||
return ActionResult(ok=False, action="drag",
|
||||
message="No active window_id for element-based drag.")
|
||||
args["from_element"] = from_element
|
||||
args["to_element"] = to_element
|
||||
args["window_id"] = self._active_window_id
|
||||
elif from_xy is not None and to_xy is not None:
|
||||
args["from_x"], args["from_y"] = int(from_xy[0]), int(from_xy[1])
|
||||
args["to_x"], args["to_y"] = int(to_xy[0]), int(to_xy[1])
|
||||
else:
|
||||
return ActionResult(ok=False, action="drag",
|
||||
message="drag requires from_element/to_element or from_coordinate/to_coordinate.")
|
||||
return self._action("drag", args)
|
||||
|
||||
def scroll(
|
||||
self,
|
||||
@@ -534,10 +550,7 @@ class CuaDriverBackend(ComputerUseBackend):
|
||||
if pid is None:
|
||||
return ActionResult(ok=False, action="type_text",
|
||||
message="No active window — call capture() first.")
|
||||
# Safari WebKit AXTextField does not accept AX attribute writes (type_text),
|
||||
# so use type_text_chars which synthesises individual key events instead.
|
||||
# This works universally across all macOS apps in background mode.
|
||||
return self._action("type_text_chars", {"pid": pid, "text": text})
|
||||
return self._action("type_text", {"pid": pid, "text": text})
|
||||
|
||||
def key(self, keys: str) -> ActionResult:
|
||||
pid = self._active_pid
|
||||
|
||||
@@ -357,6 +357,12 @@ def _dispatch(backend: ComputerUseBackend, action: str, args: Dict[str, Any]) ->
|
||||
return _maybe_follow_capture(backend, res, capture_after)
|
||||
|
||||
if action == "drag":
|
||||
has_elements = args.get("from_element") is not None and args.get("to_element") is not None
|
||||
has_coords = args.get("from_coordinate") and args.get("to_coordinate")
|
||||
if not has_elements and not has_coords:
|
||||
return json.dumps({
|
||||
"error": "drag requires from_coordinate/to_coordinate or from_element/to_element",
|
||||
})
|
||||
res = backend.drag(
|
||||
from_element=args.get("from_element"),
|
||||
to_element=args.get("to_element"),
|
||||
|
||||
Reference in New Issue
Block a user