diff --git a/docs/guides/computer-server-setup.md b/docs/guides/computer-server-setup.md new file mode 100644 index 000000000..6823e9208 --- /dev/null +++ b/docs/guides/computer-server-setup.md @@ -0,0 +1,176 @@ +# Computer-Server Setup for GUI Automation + +This guide explains how to set up [cua-computer-server](https://github.com/trycua/cua/tree/main/libs/python/computer-server) for desktop GUI automation with Clawdbot agents. + +## Overview + +The `computer` tool enables agents to: +- Take screenshots of the desktop +- Click at screen coordinates +- Type text and press keys +- Scroll and drag +- Control native applications + +## Setup Options + +### Option 1: Sandbox Mode (Recommended) + +Use Cua's pre-built desktop sandbox images. Computer-server is already installed and running. + +![cua-xfce desktop sandbox](./cua-xfce-screenshot.png) + +```yaml +# In your clawdbot config +agents: + defaults: + sandbox: + docker: + image: ghcr.io/trycua/cua-xfce:latest +``` + +Available sandbox images: + +| Image | Description | +|-------|-------------| +| `ghcr.io/trycua/cua-xfce:latest` | Linux + XFCE desktop | +| `ghcr.io/trycua/cua-ubuntu:latest` | Linux + Kasm desktop | + +For more options including Windows and Android, see [Cua Desktop Sandboxes](https://cua.ai/docs/cua/guide/get-started/what-is-desktop-sandbox#docker-sandboxes). + +### Option 2: Node Mode (Linux) + +For Linux nodes, you can set up computer-server to run as a daemon using supervisor. + +#### 1. Install computer-server + +```bash +pip install cua-computer-server +``` + +#### 2. Create startup script + +Create `/usr/local/bin/start-computer-server.sh`: + +```bash +#!/bin/bash +set -e + +# Wait for X server to be ready +echo "Waiting for X server to start..." +while ! 
xdpyinfo -display :0 >/dev/null 2>&1; do + sleep 1 +done +echo "X server is ready" + +# Start computer-server +export DISPLAY=:0 +python3 -m computer_server --port 8000 +``` + +Make it executable: + +```bash +chmod +x /usr/local/bin/start-computer-server.sh +``` + +#### 3. Configure supervisor + +Create `/etc/supervisor/conf.d/computer-server.conf` (replace `YOUR_USERNAME` with the account that owns the X session): + +```ini +[program:computer-server] +command=/usr/local/bin/start-computer-server.sh +user=YOUR_USERNAME +autorestart=true +stdout_logfile=/var/log/computer-server.log +stderr_logfile=/var/log/computer-server.error.log +``` + +#### 4. Start the service + +```bash +sudo supervisorctl reread +sudo supervisorctl update +sudo supervisorctl start computer-server +``` + +### Option 3: Node Mode (Windows/macOS) + +**TBD** - Setup instructions for Windows and macOS nodes will be added in a future update. + +## Usage + +Once computer-server is running, agents can use the `computer` tool: + +``` +Use the computer tool to take a screenshot and then click on the "File" menu. +``` + +The agent will: +1. Call `computer` with `action: "screenshot"` to see the screen +2. Identify the coordinates of the "File" menu +3. Call `computer` with `action: "click"` and the coordinates + +## Configuration + +You can configure the computer-server URL in your Clawdbot config: + +```yaml +tools: + computer: + serverUrl: "http://localhost:8000" +``` + +Or per-agent: + +```yaml +agents: + my-agent: + tools: + computer: + serverUrl: "http://192.168.1.100:8000" +``` + +## MCP Alternative + +Computer-server also exposes an MCP (Model Context Protocol) interface at `/mcp`. This could be used as an alternative integration method if Clawdbot adds MCP client support in the future. 
+ +To use computer-server as a standalone MCP server: + +```bash +python3 -m computer_server --mcp +``` + +## Troubleshooting + +### Connection refused + +Ensure computer-server is running and accessible: + +```bash +curl http://localhost:8000/status +``` + +The response should look like: `{"status": "ok", "os_type": "...", "features": [...]}` + +### No display + +If computer-server fails with display errors, ensure that: +- The X server is running (Linux) +- The `DISPLAY` environment variable is set correctly +- The user has permission to access the display + +### Sandbox not connecting + +Ensure the sandbox container has port 8000 exposed and the network is accessible from Clawdbot. + +## Security Considerations + +- **Sandbox mode**: Isolated container - safe for untrusted workloads +- **Node mode**: Controls the actual device screen - use only with trusted agents +- **Gateway mode**: Not supported - would give agents control of your actual desktop + +## Related + +- [cua-computer-server](https://github.com/trycua/cua/tree/main/libs/python/computer-server) - Desktop automation server +- [Cua Desktop Sandboxes](https://cua.ai/docs/cua/guide/get-started/what-is-desktop-sandbox) - Pre-built sandbox images diff --git a/docs/guides/cua-xfce-screenshot.png b/docs/guides/cua-xfce-screenshot.png new file mode 100644 index 000000000..3bb4d3626 Binary files /dev/null and b/docs/guides/cua-xfce-screenshot.png differ diff --git a/extensions/cua-computer/clawdbot.plugin.json b/extensions/cua-computer/clawdbot.plugin.json new file mode 100644 index 000000000..bbccc202d --- /dev/null +++ b/extensions/cua-computer/clawdbot.plugin.json @@ -0,0 +1,22 @@ +{ + "id": "cua-computer", + "name": "Cua Computer", + "description": "GUI automation via cua-computer-server - screenshots, clicks, typing, scrolling", + "version": "0.1.0", + "configSchema": { + "type": "object", + "properties": { + "serverUrl": { + "type": "string", + "description": "computer-server URL (default: http://localhost:8000)" + } + } + }, + 
/**
 * HTTP client for communicating with cua-computer-server.
 *
 * computer-server provides desktop automation capabilities via HTTP POST /cmd endpoint.
 * Each command returns { success: boolean, ...result } or { success: false, error: string }.
 *
 * Note: computer-server also exposes an MCP interface at /mcp which could be used
 * if Clawdbot adds MCP client support in the future.
 *
 * @see https://github.com/trycua/cua/tree/main/libs/python/computer-server
 */

/** Base URL used when the caller does not configure one. */
const DEFAULT_BASE_URL = "http://localhost:8000";
/** Per-command request timeout (ms) used when the caller does not configure one. */
const DEFAULT_TIMEOUT_MS = 30000;
/** Fixed timeout (ms) for the `/status` probe issued by `healthCheck`. */
const HEALTH_CHECK_TIMEOUT_MS = 5000;
/** Horizontal scroll distance sent to the server per requested scroll click. */
const HORIZONTAL_SCROLL_STEP = 300;

/** Connection settings accepted by the `ComputerServerClient` constructor. */
export interface ComputerServerConfig {
  /** Base URL of the computer-server (default: http://localhost:8000) */
  baseUrl?: string;
  /** Request timeout in milliseconds (default: 30000) */
  timeoutMs?: number;
}

/** Result of the `screenshot` command. */
export interface ScreenshotResult {
  /** Base64-encoded PNG image data */
  imageData: string;
}

/** Screen dimensions as reported by `get_screen_size`. */
export interface ScreenSize {
  width: number;
  height: number;
}

/** Cursor coordinates as reported by `get_cursor_position`. */
export interface CursorPosition {
  x: number;
  y: number;
}

/** Decoded JSON payload of a `/cmd` response. */
export interface CommandResult {
  success: boolean;
  error?: string;
  [key: string]: unknown;
}

/**
 * Error raised when a command fails: non-2xx HTTP status, a response body that
 * cannot be decoded, or a payload whose `success` flag is falsy.
 */
export class ComputerServerError extends Error {
  constructor(
    message: string,
    /** Name of the command that was being sent. */
    public readonly command: string,
    /** HTTP status code, present only for non-2xx responses. */
    public readonly statusCode?: number,
  ) {
    super(message);
    this.name = "ComputerServerError";
  }
}

/** Runtime check for a plain (non-array, non-null) object. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Thin typed wrapper around the computer-server `/cmd` and `/status` endpoints.
 *
 * Every public action method resolves once the server reports success and
 * rejects with {@link ComputerServerError} when the server answers with an
 * error. Failures raised by `fetch` itself (connection errors, timeout aborts)
 * are not wrapped and propagate unchanged.
 */
export class ComputerServerClient {
  private readonly baseUrl: string;
  private readonly timeoutMs: number;

  constructor(config: ComputerServerConfig = {}) {
    // Drop trailing slashes so `${baseUrl}/cmd` never contains "//cmd".
    this.baseUrl = (config.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
    this.timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  }

  /**
   * Take a screenshot of the desktop.
   * @returns Base64-encoded PNG image data
   * @throws ComputerServerError if the response carries no string `image_data`
   */
  async screenshot(): Promise<ScreenshotResult> {
    const result = await this.call("screenshot");
    const imageData = result.image_data;
    if (typeof imageData !== "string") {
      throw new ComputerServerError("Response is missing 'image_data'", "screenshot");
    }
    return { imageData };
  }

  /**
   * Get the screen dimensions.
   * @throws ComputerServerError if the response carries no numeric `size.width` / `size.height`
   */
  async getScreenSize(): Promise<ScreenSize> {
    const result = await this.call("get_screen_size");
    const size = result.size;
    if (isRecord(size)) {
      const { width, height } = size;
      if (typeof width === "number" && typeof height === "number") {
        return { width, height };
      }
    }
    throw new ComputerServerError("Response is missing a numeric 'size'", "get_screen_size");
  }

  /**
   * Get the current cursor position.
   * @throws ComputerServerError if the response carries no numeric `position.x` / `position.y`
   */
  async getCursorPosition(): Promise<CursorPosition> {
    const result = await this.call("get_cursor_position");
    const position = result.position;
    if (isRecord(position)) {
      const { x, y } = position;
      if (typeof x === "number" && typeof y === "number") {
        return { x, y };
      }
    }
    throw new ComputerServerError(
      "Response is missing a numeric 'position'",
      "get_cursor_position",
    );
  }

  /**
   * Perform a left click at the specified coordinates.
   * If coordinates are omitted, clicks at the current cursor position.
   */
  async click(x?: number, y?: number): Promise<void> {
    await this.call("left_click", { x, y });
  }

  /**
   * Perform a double click at the specified coordinates.
   * Omitted coordinates are not sent to the server.
   */
  async doubleClick(x?: number, y?: number): Promise<void> {
    await this.call("double_click", { x, y });
  }

  /**
   * Perform a right click at the specified coordinates.
   * Omitted coordinates are not sent to the server.
   */
  async rightClick(x?: number, y?: number): Promise<void> {
    await this.call("right_click", { x, y });
  }

  /**
   * Move the cursor to the specified coordinates.
   */
  async moveCursor(x: number, y: number): Promise<void> {
    await this.call("move_cursor", { x, y });
  }

  /**
   * Type text using the keyboard.
   */
  async type(text: string): Promise<void> {
    await this.call("type_text", { text });
  }

  /**
   * Press a single key (e.g., "Return", "Tab", "Escape").
   */
  async key(key: string): Promise<void> {
    await this.call("press_key", { key });
  }

  /**
   * Press a combination of keys (e.g., ["cmd", "c"] for copy).
   */
  async hotkey(keys: string[]): Promise<void> {
    await this.call("hotkey", { keys });
  }

  /**
   * Scroll in a direction.
   * @param direction - "up", "down", "left", or "right"
   * @param clicks - Number of scroll clicks (default: 1)
   */
  async scroll(direction: "up" | "down" | "left" | "right", clicks = 1): Promise<void> {
    switch (direction) {
      case "down":
        await this.call("scroll_down", { clicks });
        return;
      case "up":
        await this.call("scroll_up", { clicks });
        return;
      default: {
        // Horizontal scroll: use scroll(x, y) where positive x = right, negative = left
        const distance = HORIZONTAL_SCROLL_STEP * clicks;
        const x = direction === "right" ? distance : -distance;
        await this.call("scroll", { x, y: 0 });
      }
    }
  }

  /**
   * Drag from current position to target coordinates.
   * @param button - Button name forwarded to the server (default: "left")
   * @param duration - Duration value forwarded to the server (default: 0.5)
   */
  async dragTo(x: number, y: number, button = "left", duration = 0.5): Promise<void> {
    await this.call("drag_to", { x, y, button, duration });
  }

  /**
   * Check if the computer-server is available and responding.
   * @returns `true` when GET /status answers with a 2xx status; `false` on any
   *          other status or when the request throws.
   */
  async healthCheck(): Promise<boolean> {
    try {
      const response = await fetch(`${this.baseUrl}/status`, {
        method: "GET",
        signal: AbortSignal.timeout(HEALTH_CHECK_TIMEOUT_MS),
      });
      return response.ok;
    } catch {
      // Best-effort probe: any failure simply means "not healthy".
      return false;
    }
  }

  /**
   * Send a command to the computer-server.
   *
   * @param command - Command name placed in the request body
   * @param params - Command parameters; entries whose value is `undefined` are dropped
   * @returns The decoded payload of a successful command
   * @throws ComputerServerError on a non-2xx status, an undecodable body, or a
   *         payload whose `success` flag is falsy
   */
  private async call(
    command: string,
    params: Record<string, unknown> = {},
  ): Promise<CommandResult> {
    // Filter out undefined values from params
    const filteredParams = Object.fromEntries(
      Object.entries(params).filter(([, value]) => value !== undefined),
    );

    const response = await fetch(`${this.baseUrl}/cmd`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ command, params: filteredParams }),
      signal: AbortSignal.timeout(this.timeoutMs),
    });

    if (!response.ok) {
      throw new ComputerServerError(
        `HTTP ${response.status}: ${response.statusText}`,
        command,
        response.status,
      );
    }

    // The /cmd endpoint returns SSE-style "data: {...}\n\n" format
    const text = await response.text();
    const payload = /^data:\s*(.+)$/m.exec(text)?.[1];
    if (payload === undefined) {
      throw new ComputerServerError(`Invalid response format from computer-server`, command);
    }

    // Malformed JSON is reported like any other bad response instead of
    // leaking a raw SyntaxError to callers that only handle ComputerServerError.
    let parsed: unknown;
    try {
      parsed = JSON.parse(payload);
    } catch {
      throw new ComputerServerError(`Invalid response format from computer-server`, command);
    }
    if (!isRecord(parsed)) {
      throw new ComputerServerError(`Invalid response format from computer-server`, command);
    }

    if (!parsed.success) {
      const message =
        typeof parsed.error === "string" ? parsed.error : `Command '${command}' failed`;
      throw new ComputerServerError(message, command);
    }

    // Shape was checked at runtime above; the compiler cannot follow that.
    return parsed as CommandResult;
  }
}
/**
 * Computer tool for GUI automation via cua-computer-server.
 *
 * Enables agents to take screenshots, click, type, scroll, and perform
 * other desktop automation actions on sandboxes or nodes running computer-server.
 *
 * @see https://github.com/trycua/cua/tree/main/libs/python/computer-server
 */

import { Type } from "@sinclair/typebox";

import type { ClawdbotConfig } from "../../src/config/config.js";
import { stringEnum } from "../../src/agents/schema/typebox.js";
import type { AnyAgentTool } from "../../src/agents/tools/common.js";
import { imageResult, jsonResult, readNumberParam, readStringParam } from "../../src/agents/tools/common.js";
import { ComputerServerClient, ComputerServerError } from "./computer-server-client.js";

/** Every value accepted for the tool's `action` parameter. */
const COMPUTER_ACTIONS = [
  "screenshot",
  "click",
  "double_click",
  "right_click",
  "type",
  "key",
  "hotkey",
  "scroll",
  "move",
  "drag",
  "get_screen_size",
  "get_cursor_position",
] as const;

/** Every value accepted for the tool's `direction` parameter. */
const SCROLL_DIRECTIONS = ["up", "down", "left", "right"] as const;

type ScrollDirection = (typeof SCROLL_DIRECTIONS)[number];
type PointerAction = "click" | "double_click" | "right_click";
type ToolParams = Record<string, unknown>;

/** Server URL used when neither the call nor the plugin options supply one. */
const DEFAULT_SERVER_URL = "http://localhost:8000";

const ComputerToolSchema = Type.Object({
  action: stringEnum(COMPUTER_ACTIONS, {
    description:
      "Action to perform: screenshot, click, double_click, right_click, type, key, hotkey, scroll, move, drag, get_screen_size, get_cursor_position",
  }),

  // Coordinates (for click, double_click, right_click, move, scroll)
  x: Type.Optional(Type.Number({ description: "X coordinate in pixels" })),
  y: Type.Optional(Type.Number({ description: "Y coordinate in pixels" })),

  // Typing
  text: Type.Optional(Type.String({ description: "Text to type (for 'type' action)" })),

  // Key press
  key: Type.Optional(
    Type.String({
      description: "Key to press (for 'key' action), e.g., 'Return', 'Tab', 'Escape'",
    }),
  ),

  // Hotkey
  keys: Type.Optional(
    Type.Array(Type.String(), {
      description: "Keys for hotkey combination (for 'hotkey' action), e.g., ['cmd', 'c']",
    }),
  ),

  // Scroll
  direction: Type.Optional(
    stringEnum(SCROLL_DIRECTIONS, {
      description: "Scroll direction (for 'scroll' action): up, down, left, right",
    }),
  ),
  amount: Type.Optional(
    Type.Number({
      description: "Scroll amount in clicks (for 'scroll' action), default: 1",
    }),
  ),

  // Drag
  end_x: Type.Optional(Type.Number({ description: "End X coordinate for drag action" })),
  end_y: Type.Optional(Type.Number({ description: "End Y coordinate for drag action" })),

  // Connection
  computer_server_url: Type.Optional(
    Type.String({
      description:
        "URL of the computer-server (default: http://localhost:8000). Usually set automatically based on sandbox/node configuration.",
    }),
  ),
});

export type ComputerToolOptions = {
  /** Default computer-server URL */
  defaultServerUrl?: string;
  /** Clawdbot configuration */
  config?: ClawdbotConfig;
};

/** Runtime check that a string is one of SCROLL_DIRECTIONS. */
function isScrollDirection(value: string): value is ScrollDirection {
  return (SCROLL_DIRECTIONS as readonly string[]).includes(value);
}

/** Runtime check for a non-empty array made only of strings. */
function isNonEmptyStringArray(value: unknown): value is string[] {
  return (
    Array.isArray(value) && value.length > 0 && value.every((item) => typeof item === "string")
  );
}

/**
 * Read a mandatory numeric parameter.
 * NOTE(review): readNumberParam presumably throws on its own when `required`
 * is set; the explicit guard only replaces the non-null assertion.
 */
function readRequiredNumber(params: ToolParams, key: string, label: string): number {
  const value = readNumberParam(params, key, { required: true, label });
  if (value === undefined) {
    throw new Error(`${label} required`);
  }
  return value;
}

/**
 * Shared implementation of click / double_click / right_click.
 * Coordinates are optional, but x and y must be given together; a lone
 * coordinate is rejected instead of being forwarded and then reported as
 * "current position".
 */
async function runPointerAction(
  action: PointerAction,
  params: ToolParams,
  perform: (x?: number, y?: number) => Promise<void>,
) {
  const x = readNumberParam(params, "x");
  const y = readNumberParam(params, "y");
  if ((x === undefined) !== (y === undefined)) {
    throw new Error(`x and y must be provided together for ${action} action`);
  }
  await perform(x, y);
  return jsonResult({
    success: true,
    action,
    coordinates: x !== undefined && y !== undefined ? { x, y } : "current position",
  });
}

/**
 * Build the `computer` agent tool.
 *
 * Each invocation creates a ComputerServerClient for the resolved server URL
 * (call parameter, then `options.defaultServerUrl`, then the localhost default)
 * and dispatches on `action`. Failures reported as ComputerServerError are
 * returned as a `{ success: false }` JSON result; any other error is rethrown.
 *
 * @param options - Optional default server URL and Clawdbot configuration
 * @returns The tool definition to register with the agent runtime
 */
export function createComputerTool(options?: ComputerToolOptions): AnyAgentTool {
  return {
    label: "Computer",
    name: "computer",
    description: `Control a computer's GUI - take screenshots, click, type, scroll, and more.

Use this tool to interact with desktop applications running in a sandbox or on a connected node.

**Actions:**
- \`screenshot\`: Capture the current screen state. Always do this first to see what's on screen.
- \`click\`: Left-click at coordinates (x, y)
- \`double_click\`: Double-click at coordinates (x, y)
- \`right_click\`: Right-click at coordinates (x, y)
- \`type\`: Type text at the current cursor position
- \`key\`: Press a single key (e.g., "Return", "Tab", "Escape")
- \`hotkey\`: Press a key combination (e.g., ["cmd", "c"] for copy)
- \`scroll\`: Scroll in a direction (up, down, left, right)
- \`move\`: Move cursor to coordinates without clicking
- \`drag\`: Drag from (x, y) to (end_x, end_y)
- \`get_screen_size\`: Get screen dimensions
- \`get_cursor_position\`: Get current cursor position

**Tips:**
- Always take a screenshot first to understand the current screen state
- Use coordinates from screenshots to click on UI elements
- After performing actions, take another screenshot to verify the result`,
    parameters: ComputerToolSchema,
    execute: async (_toolCallId, params) => {
      const args = params as ToolParams;
      const action = readStringParam(args, "action", {
        required: true,
      });
      // NOTE(review): computer_server_url comes from the tool call itself, so the
      // caller of the tool chooses which host receives the requests — confirm
      // this is acceptable for the deployment.
      const serverUrl =
        readStringParam(args, "computer_server_url") ??
        options?.defaultServerUrl ??
        DEFAULT_SERVER_URL;

      const client = new ComputerServerClient({ baseUrl: serverUrl });

      try {
        switch (action) {
          case "screenshot": {
            const result = await client.screenshot();
            return await imageResult({
              label: "Screenshot",
              path: "screenshot.png",
              base64: result.imageData,
              mimeType: "image/png",
              extraText: "Screenshot captured successfully",
            });
          }

          case "click":
            return await runPointerAction("click", args, (x, y) => client.click(x, y));

          case "double_click":
            return await runPointerAction("double_click", args, (x, y) =>
              client.doubleClick(x, y),
            );

          case "right_click":
            return await runPointerAction("right_click", args, (x, y) =>
              client.rightClick(x, y),
            );

          case "type": {
            const text = readStringParam(args, "text", {
              required: true,
              label: "text",
            });
            await client.type(text);
            return jsonResult({
              success: true,
              action: "type",
              text,
            });
          }

          case "key": {
            const key = readStringParam(args, "key", {
              required: true,
              label: "key",
            });
            await client.key(key);
            return jsonResult({
              success: true,
              action: "key",
              key,
            });
          }

          case "hotkey": {
            const keys = args.keys;
            if (!isNonEmptyStringArray(keys)) {
              throw new Error("keys array required for hotkey action");
            }
            await client.hotkey(keys);
            return jsonResult({
              success: true,
              action: "hotkey",
              keys,
            });
          }

          case "scroll": {
            const direction = readStringParam(args, "direction", {
              required: true,
              label: "direction",
            });
            // Without this check an unrecognised direction would fall into the
            // client's horizontal branch and scroll left.
            if (!isScrollDirection(direction)) {
              throw new Error(`Invalid scroll direction: ${direction}`);
            }
            const amount = readNumberParam(args, "amount") ?? 1;
            await client.scroll(direction, amount);
            return jsonResult({
              success: true,
              action: "scroll",
              direction,
              amount,
            });
          }

          case "move": {
            const x = readRequiredNumber(args, "x", "x coordinate");
            const y = readRequiredNumber(args, "y", "y coordinate");
            await client.moveCursor(x, y);
            return jsonResult({
              success: true,
              action: "move",
              coordinates: { x, y },
            });
          }

          case "drag": {
            const x = readRequiredNumber(args, "x", "start x coordinate");
            const y = readRequiredNumber(args, "y", "start y coordinate");
            const endX = readRequiredNumber(args, "end_x", "end x coordinate");
            const endY = readRequiredNumber(args, "end_y", "end y coordinate");
            // Move to start position first, then drag
            await client.moveCursor(x, y);
            await client.dragTo(endX, endY);
            return jsonResult({
              success: true,
              action: "drag",
              from: { x, y },
              to: { x: endX, y: endY },
            });
          }

          case "get_screen_size": {
            const size = await client.getScreenSize();
            return jsonResult({
              success: true,
              action: "get_screen_size",
              width: size.width,
              height: size.height,
            });
          }

          case "get_cursor_position": {
            const pos = await client.getCursorPosition();
            return jsonResult({
              success: true,
              action: "get_cursor_position",
              x: pos.x,
              y: pos.y,
            });
          }

          default:
            throw new Error(`Unknown action: ${action}`);
        }
      } catch (error) {
        // Server-reported failures become a structured tool result; everything
        // else (validation errors, network failures) propagates to the caller.
        if (error instanceof ComputerServerError) {
          return jsonResult({
            success: false,
            error: error.message,
            command: error.command,
          });
        }
        throw error;
      }
    },
  };
}
/**
 * Cua Computer Plugin
 *
 * Provides GUI automation via cua-computer-server - screenshots, clicks, typing, scrolling.
 *
 * @see https://github.com/trycua/cua/tree/main/libs/python/computer-server
 */

import type { ClawdbotPluginDefinition } from "../../src/plugins/types.js";
import { createComputerTool } from "./computer-tool.js";

/** Plugin-level settings read from `api.pluginConfig`. */
interface CuaComputerConfig {
  serverUrl?: string;
}

/**
 * Plugin definition: on registration it builds the `computer` tool, seeding it
 * with the configured server URL (if any) and the host configuration.
 */
const plugin: ClawdbotPluginDefinition = {
  id: "cua-computer",
  name: "Cua Computer",
  description: "GUI automation via cua-computer-server",

  register(api) {
    const settings = api.pluginConfig as CuaComputerConfig | undefined;
    const computerTool = createComputerTool({
      defaultServerUrl: settings?.serverUrl,
      config: api.config,
    });

    api.registerTool(computerTool);
  },
};

export default plugin;