openclaw/extensions/cua-computer/computer-tool.ts
f-trycua 438c80b758 refactor: move computer tool to cua-computer plugin
Move GUI automation from core to a standalone plugin. This keeps the
core lean and lets users opt in when they need computer-use capabilities.

Plugin provides:
- Screenshot capture
- Mouse clicks (left, right, double)
- Keyboard input (type, key, hotkey)
- Scrolling and dragging
- Cursor position/screen size queries

Enable via config:
```yaml
plugins:
  cua-computer:
    serverUrl: "http://localhost:8000"
```
2026-01-26 15:56:22 -08:00

313 lines
10 KiB
TypeScript

/**
* Computer tool for GUI automation via cua-computer-server.
*
* Enables agents to take screenshots, click, type, scroll, and perform
* other desktop automation actions on sandboxes or nodes running computer-server.
*
* @see https://github.com/trycua/cua/tree/main/libs/python/computer-server
*/
import { Type } from "@sinclair/typebox";
import type { ClawdbotConfig } from "../../src/config/config.js";
import { stringEnum } from "../../src/agents/schema/typebox.js";
import type { AnyAgentTool } from "../../src/agents/tools/common.js";
import { imageResult, jsonResult, readNumberParam, readStringParam } from "../../src/agents/tools/common.js";
import { ComputerServerClient, ComputerServerError } from "./computer-server-client.js";
/**
 * Names of every action the `computer` tool understands, in the order they
 * are advertised to the agent. Kept `as const` so the literal types survive.
 */
const COMPUTER_ACTIONS = [
  // Observation
  "screenshot",
  // Mouse buttons
  "click", "double_click", "right_click",
  // Keyboard
  "type", "key", "hotkey",
  // Pointer motion
  "scroll", "move", "drag",
  // Queries
  "get_screen_size", "get_cursor_position",
] as const;
/** Directions accepted by the `scroll` action's `direction` parameter. */
const SCROLL_DIRECTIONS = [
  "up",
  "down",
  "left",
  "right",
] as const;
/** Shorthand: an optional numeric tool parameter that carries only a description. */
const optionalNumber = (description: string) => Type.Optional(Type.Number({ description }));

/** Shorthand: an optional string tool parameter that carries only a description. */
const optionalString = (description: string) => Type.Optional(Type.String({ description }));

/**
 * Parameter schema for the `computer` tool.
 *
 * Only `action` is mandatory at the schema level; which of the remaining
 * fields are actually needed depends on the chosen action and is enforced
 * when the tool executes.
 */
const ComputerToolSchema = Type.Object({
  action: stringEnum(COMPUTER_ACTIONS, {
    description:
      "Action to perform: screenshot, click, double_click, right_click, type, key, hotkey, scroll, move, drag, get_screen_size, get_cursor_position",
  }),

  // Pointer target: read by click, double_click, right_click and move, and used as the drag start.
  x: optionalNumber("X coordinate in pixels"),
  y: optionalNumber("Y coordinate in pixels"),

  // Keyboard input: free text, a single key, or a key combination.
  text: optionalString("Text to type (for 'type' action)"),
  key: optionalString("Key to press (for 'key' action), e.g., 'Return', 'Tab', 'Escape'"),
  keys: Type.Optional(
    Type.Array(Type.String(), {
      description: "Keys for hotkey combination (for 'hotkey' action), e.g., ['cmd', 'c']",
    }),
  ),

  // Scrolling.
  direction: Type.Optional(
    stringEnum(SCROLL_DIRECTIONS, {
      description: "Scroll direction (for 'scroll' action): up, down, left, right",
    }),
  ),
  amount: optionalNumber("Scroll amount in clicks (for 'scroll' action), default: 1"),

  // Drag destination.
  end_x: optionalNumber("End X coordinate for drag action"),
  end_y: optionalNumber("End Y coordinate for drag action"),

  // Per-call override of the server to talk to.
  computer_server_url: optionalString(
    "URL of the computer-server (default: http://localhost:8000). Usually set automatically based on sandbox/node configuration.",
  ),
});
/**
 * Settings accepted by `createComputerTool`. Every field is optional.
 */
export type ComputerToolOptions = {
  /**
   * Clawdbot configuration object supplied by the host.
   */
  config?: ClawdbotConfig;
  /**
   * computer-server URL to use when a tool call does not pass its own
   * `computer_server_url`.
   */
  defaultServerUrl?: string;
};
/** Direction literals accepted by the `scroll` action. */
type ScrollDirection = (typeof SCROLL_DIRECTIONS)[number];

/** Pixel coordinate pair as supplied through the `x` / `y` tool parameters. */
type ScreenPoint = { x: number; y: number };

/** Narrow an arbitrary string to one of the supported scroll directions. */
function isScrollDirection(value: string): value is ScrollDirection {
  return (SCROLL_DIRECTIONS as readonly string[]).includes(value);
}

/** True when `value` is a non-empty array made up solely of strings. */
function isNonEmptyStringArray(value: unknown): value is string[] {
  return (
    Array.isArray(value) && value.length > 0 && value.every((item) => typeof item === "string")
  );
}

/** Read a mandatory numeric parameter, guarding explicitly instead of asserting with `!`. */
function requireNumberParam(args: Record<string, unknown>, key: string, label: string): number {
  const value = readNumberParam(args, key, { required: true, label });
  if (value === undefined) {
    throw new Error(`${label} required`);
  }
  return value;
}

/**
 * Read the optional `x` / `y` pair used by the click-style actions.
 *
 * Returns `undefined` when neither coordinate is given (the click then goes to
 * the current cursor position) and rejects a half-specified pair, which would
 * otherwise be forwarded to the server as `(x, undefined)`.
 */
function readOptionalPoint(args: Record<string, unknown>, action: string): ScreenPoint | undefined {
  const x = readNumberParam(args, "x");
  const y = readNumberParam(args, "y");
  if (x === undefined && y === undefined) {
    return undefined;
  }
  if (x === undefined || y === undefined) {
    throw new Error(
      `${action} requires both x and y, or neither to use the current cursor position`,
    );
  }
  return { x, y };
}

/**
 * Create the `computer` agent tool, which forwards GUI automation actions to a
 * computer-server through {@link ComputerServerClient}.
 *
 * For each call the server URL is resolved in this order: the
 * `computer_server_url` parameter, `options.defaultServerUrl`, then
 * `"http://localhost:8000"`.
 *
 * @param options - Optional defaults; only `defaultServerUrl` is consulted here.
 * @returns The tool definition (label, name, description, schema, `execute`).
 *
 * `execute` resolves to an image result for `screenshot` and a JSON result for
 * every other action. A {@link ComputerServerError} is converted into a JSON
 * result with `success: false`; any other error (including parameter
 * validation failures) is rethrown.
 */
export function createComputerTool(options?: ComputerToolOptions): AnyAgentTool {
  return {
    label: "Computer",
    name: "computer",
    description: `Control a computer's GUI - take screenshots, click, type, scroll, and more.
Use this tool to interact with desktop applications running in a sandbox or on a connected node.
**Actions:**
- \`screenshot\`: Capture the current screen state. Always do this first to see what's on screen.
- \`click\`: Left-click at coordinates (x, y)
- \`double_click\`: Double-click at coordinates (x, y)
- \`right_click\`: Right-click at coordinates (x, y)
- \`type\`: Type text at the current cursor position
- \`key\`: Press a single key (e.g., "Return", "Tab", "Escape")
- \`hotkey\`: Press a key combination (e.g., ["cmd", "c"] for copy)
- \`scroll\`: Scroll in a direction (up, down, left, right)
- \`move\`: Move cursor to coordinates without clicking
- \`drag\`: Drag from (x, y) to (end_x, end_y)
- \`get_screen_size\`: Get screen dimensions
- \`get_cursor_position\`: Get current cursor position
**Tips:**
- Always take a screenshot first to understand the current screen state
- Use coordinates from screenshots to click on UI elements
- After performing actions, take another screenshot to verify the result`,
    parameters: ComputerToolSchema,
    execute: async (_toolCallId, params) => {
      // Cast once; every reader helper below works on a plain record.
      const args = params as Record<string, unknown>;
      const action = readStringParam(args, "action", { required: true });

      // NOTE(review): `computer_server_url` is caller-supplied and takes precedence over
      // the configured default, so whoever drives the tool chooses which host receives
      // the requests. Consider restricting it to an allow-list if that is not intended.
      const serverUrl =
        readStringParam(args, "computer_server_url") ??
        options?.defaultServerUrl ??
        "http://localhost:8000";
      const client = new ComputerServerClient({ baseUrl: serverUrl });

      try {
        switch (action) {
          case "screenshot": {
            const result = await client.screenshot();
            return await imageResult({
              label: "Screenshot",
              path: "screenshot.png",
              base64: result.imageData,
              mimeType: "image/png",
              extraText: "Screenshot captured successfully",
            });
          }

          // The three click variants share parameter handling and result shape.
          case "click":
          case "double_click":
          case "right_click": {
            const point = readOptionalPoint(args, action);
            if (action === "click") {
              await client.click(point?.x, point?.y);
            } else if (action === "double_click") {
              await client.doubleClick(point?.x, point?.y);
            } else {
              await client.rightClick(point?.x, point?.y);
            }
            return jsonResult({
              success: true,
              action,
              coordinates: point ?? "current position",
            });
          }

          case "type": {
            const text = readStringParam(args, "text", { required: true, label: "text" });
            await client.type(text);
            return jsonResult({ success: true, action: "type", text });
          }

          case "key": {
            const key = readStringParam(args, "key", { required: true, label: "key" });
            await client.key(key);
            return jsonResult({ success: true, action: "key", key });
          }

          case "hotkey": {
            const keys = args.keys;
            // Reject missing, empty, or non-string entries before anything is sent.
            if (!isNonEmptyStringArray(keys)) {
              throw new Error("keys array required for hotkey action");
            }
            await client.hotkey(keys);
            return jsonResult({ success: true, action: "hotkey", keys });
          }

          case "scroll": {
            const direction = readStringParam(args, "direction", {
              required: true,
              label: "direction",
            });
            // Validate at runtime rather than asserting the union type with a cast.
            if (!isScrollDirection(direction)) {
              throw new Error(
                `Invalid scroll direction: ${direction} (expected one of: ${SCROLL_DIRECTIONS.join(", ")})`,
              );
            }
            const amount = readNumberParam(args, "amount") ?? 1;
            await client.scroll(direction, amount);
            return jsonResult({ success: true, action: "scroll", direction, amount });
          }

          case "move": {
            const x = requireNumberParam(args, "x", "x coordinate");
            const y = requireNumberParam(args, "y", "y coordinate");
            await client.moveCursor(x, y);
            return jsonResult({ success: true, action: "move", coordinates: { x, y } });
          }

          case "drag": {
            const x = requireNumberParam(args, "x", "start x coordinate");
            const y = requireNumberParam(args, "y", "start y coordinate");
            const endX = requireNumberParam(args, "end_x", "end x coordinate");
            const endY = requireNumberParam(args, "end_y", "end y coordinate");
            // Move to start position first, then drag
            await client.moveCursor(x, y);
            await client.dragTo(endX, endY);
            return jsonResult({
              success: true,
              action: "drag",
              from: { x, y },
              to: { x: endX, y: endY },
            });
          }

          case "get_screen_size": {
            const size = await client.getScreenSize();
            return jsonResult({
              success: true,
              action: "get_screen_size",
              width: size.width,
              height: size.height,
            });
          }

          case "get_cursor_position": {
            const pos = await client.getCursorPosition();
            return jsonResult({
              success: true,
              action: "get_cursor_position",
              x: pos.x,
              y: pos.y,
            });
          }

          default:
            throw new Error(`Unknown action: ${action}`);
        }
      } catch (error) {
        // Server-side failures are reported back to the agent as data; anything
        // else (e.g. parameter validation) propagates to the caller.
        if (error instanceof ComputerServerError) {
          return jsonResult({
            success: false,
            error: error.message,
            command: error.command,
          });
        }
        throw error;
      }
    },
  };
}