Merge 438c80b758 into 09be5d45d5

2026-01-30 10:15:47 -06:00 · 2026-01-30 10:15:47 -06:00 · 0020a25dea
commit 0020a25dea
parent 09be5d45d5 438c80b758
7 changed files with 776 additions and 0 deletions
--- a/docs/guides/computer-server-setup.md
+++ b/docs/guides/computer-server-setup.md
@ -0,0 +1,176 @@
 # Computer-Server Setup for GUI Automation
 This guide explains how to set up [cua-computer-server](https://github.com/trycua/cua/tree/main/libs/python/computer-server) for desktop GUI automation with Clawdbot agents.
 ## Overview
 The `computer` tool enables agents to:
 - Take screenshots of the desktop
 - Click at screen coordinates
 - Type text and press keys
 - Scroll and drag
 - Control native applications
 ## Setup Options
 ### Option 1: Sandbox Mode (Recommended)
 Use Cua's pre-built desktop sandbox images. Computer-server is already installed and running.
 ![cua-xfce desktop sandbox](./cua-xfce-screenshot.png)
 ```yaml
 # In your clawdbot config
 agents:
  defaults:
    sandbox:
      docker:
        image: ghcr.io/trycua/cua-xfce:latest
 ```
 Available sandbox images:
 | Image | Description |
 |-------|-------------|
 | `ghcr.io/trycua/cua-xfce:latest` | Linux + XFCE desktop |
 | `ghcr.io/trycua/cua-ubuntu:latest` | Linux + Kasm desktop |
 For more options including Windows and Android, see [Cua Desktop Sandboxes](https://cua.ai/docs/cua/guide/get-started/what-is-desktop-sandbox#docker-sandboxes).
 ### Option 2: Node Mode (Linux)
 For Linux nodes, you can set up computer-server to run as a daemon using supervisor.
 #### 1. Install computer-server
 ```bash
 pip install cua-computer-server
 ```
 #### 2. Create startup script
 Create `/usr/local/bin/start-computer-server.sh`:
 ```bash
 #!/bin/bash
 # Startup wrapper for computer-server: blocks until the X server on :0
 # answers, then hands the process over to computer-server on port 8000.
 set -e
 # Wait for X server to be ready
 echo "Waiting for X server to start..."
 while ! xdpyinfo -display :0 >/dev/null 2>&1; do
    sleep 1
 done
 echo "X server is ready"
 # Start computer-server
 export DISPLAY=:0
 # `exec` replaces this shell with the Python process, so the stop/restart
 # signals supervisor sends reach computer-server itself instead of a
 # wrapper bash that would leave the server running as an orphan.
 exec python3 -m computer_server --port 8000
 ```
 Make it executable:
 ```bash
 chmod +x /usr/local/bin/start-computer-server.sh
 ```
 #### 3. Configure supervisor
 Create `/etc/supervisor/conf.d/computer-server.conf`:
 ```ini
 [program:computer-server]
 command=/usr/local/bin/start-computer-server.sh
 user=<your-user>
 autorestart=true
 stdout_logfile=/var/log/computer-server.log
 stderr_logfile=/var/log/computer-server.error.log
 ```
 #### 4. Start the service
 ```bash
 sudo supervisorctl reread
 sudo supervisorctl update
 sudo supervisorctl start computer-server
 ```
 ### Option 3: Node Mode (Windows/macOS)
 **TBD** - Setup instructions for Windows and macOS nodes will be added in a future update.
 ## Usage
 Once computer-server is running, agents can use the `computer` tool:
 ```
 Use the computer tool to take a screenshot and then click on the "File" menu.
 ```
 The agent will:
 1. Call `computer` with `action: "screenshot"` to see the screen
 2. Identify the coordinates of the "File" menu
 3. Call `computer` with `action: "click"` and the coordinates
 ## Configuration
 You can configure the computer-server URL in your Clawdbot config:
 ```yaml
 tools:
  computer:
    serverUrl: "http://localhost:8000"
 ```
 Or per-agent:
 ```yaml
 agents:
  my-agent:
    tools:
      computer:
        serverUrl: "http://192.168.1.100:8000"
 ```
 ## MCP Alternative
 Computer-server also exposes an MCP (Model Context Protocol) interface at `/mcp`. This could be used as an alternative integration method if Clawdbot adds MCP client support in the future.
 To use computer-server as a standalone MCP server:
 ```bash
 python -m computer_server --mcp
 ```
 ## Troubleshooting
 ### Connection refused
 Ensure computer-server is running and accessible:
 ```bash
 curl http://localhost:8000/status
 ```
 A healthy server responds with JSON like: `{"status": "ok", "os_type": "...", "features": [...]}`
 ### No display
 If computer-server fails with display errors, ensure:
 - X server is running (Linux)
 - The `DISPLAY` environment variable is set correctly
 - The user has permission to access the display
 ### Sandbox not connecting
 Ensure the sandbox container exposes port 8000 and that Clawdbot can reach the container over the network.
 ## Security Considerations
 - **Sandbox mode**: Isolated container - safe for untrusted workloads
 - **Node mode**: Controls the actual device screen - use only with trusted agents
 - **Gateway mode**: Not supported - would give agents control of your actual desktop
 ## Related
 - [cua-computer-server](https://github.com/trycua/cua/tree/main/libs/python/computer-server) - Desktop automation server
 - [Cua Desktop Sandboxes](https://cua.ai/docs/cua/guide/get-started/what-is-desktop-sandbox) - Pre-built sandbox images
--- a/docs/guides/cua-xfce-screenshot.png
+++ b/docs/guides/cua-xfce-screenshot.png
--- a/extensions/cua-computer/clawdbot.plugin.json
+++ b/extensions/cua-computer/clawdbot.plugin.json
@ -0,0 +1,22 @@
 {
  "id": "cua-computer",
  "name": "Cua Computer",
  "description": "GUI automation via cua-computer-server - screenshots, clicks, typing, scrolling",
  "version": "0.1.0",
  "configSchema": {
    "type": "object",
    "properties": {
      "serverUrl": {
        "type": "string",
        "description": "computer-server URL (default: http://localhost:8000)"
      }
    }
  },
  "uiHints": {
    "serverUrl": {
      "label": "Server URL",
      "placeholder": "http://localhost:8000",
      "help": "URL of the cua-computer-server instance"
    }
  }
 }
--- a/extensions/cua-computer/computer-server-client.ts
+++ b/extensions/cua-computer/computer-server-client.ts
@ -0,0 +1,227 @@
 /**
 * HTTP client for communicating with cua-computer-server.
 *
 * computer-server provides desktop automation capabilities via HTTP POST /cmd endpoint.
 * Each command returns { success: boolean, ...result } or { success: false, error: string }.
 *
 * Note: computer-server also exposes an MCP interface at /mcp which could be used
 * if Clawdbot adds MCP client support in the future.
 *
 * @see https://github.com/trycua/cua/tree/main/libs/python/computer-server
 */
 /**
  * Options accepted by the ComputerServerClient constructor.
  * Both fields are optional; an omitted field falls back to the default noted on it.
  */
 export interface ComputerServerConfig {
  /** Base URL of the computer-server (default: http://localhost:8000) */
  baseUrl?: string;
  /**
   * Request timeout in milliseconds (default: 30000).
   * Applied to command requests; healthCheck() uses its own fixed 5000 ms timeout.
   */
  timeoutMs?: number;
 }
 /**
  * Result of ComputerServerClient.screenshot().
  * `imageData` is taken from the `image_data` field of the server's response.
  */
 export interface ScreenshotResult {
  /** Base64-encoded PNG image data */
  imageData: string;
 }
 /**
  * Screen dimensions returned by ComputerServerClient.getScreenSize(), read from
  * the `size` field of the server's response.
  * NOTE(review): presumably measured in pixels — confirm against computer-server.
  */
 export interface ScreenSize {
  width: number;
  height: number;
 }
 /**
  * Cursor location returned by ComputerServerClient.getCursorPosition(), read
  * from the `position` field of the server's response.
  * NOTE(review): presumably screen coordinates in pixels — confirm against computer-server.
  */
 export interface CursorPosition {
  x: number;
  y: number;
 }
 /**
  * Parsed JSON payload of a single /cmd response.
  */
 export interface CommandResult {
  /** Whether the server reported the command as successful. */
  success: boolean;
  /** Error message supplied by the server when `success` is false. */
  error?: string;
  /** Command-specific payload fields (e.g. `image_data`, `size`, `position`). */
  [key: string]: unknown;
 }
 /**
  * Error describing a failed computer-server command.
  *
  * Carries the name of the command that was being executed and, when the
  * failure was reported at the HTTP level, the response status code.
  */
 export class ComputerServerError extends Error {
   /** Name of the command that failed (e.g. "left_click"). */
   public readonly command: string;
   /** HTTP status code of the failed response, if there was one. */
   public readonly statusCode?: number;
   /**
    * @param message - Human-readable failure description.
    * @param command - Command that was being executed.
    * @param statusCode - HTTP status code, when the failure came from the HTTP layer.
    */
   constructor(message: string, command: string, statusCode?: number) {
     super(message);
     this.command = command;
     this.statusCode = statusCode;
     this.name = "ComputerServerError";
   }
 }
 /**
  * HTTP client for cua-computer-server's `POST /cmd` endpoint.
  *
  * Every failure while executing a command — transport errors, timeouts,
  * non-2xx responses, malformed bodies, `success: false` results, and responses
  * missing the expected payload — is reported as a ComputerServerError, so
  * callers have a single error type to handle.
  */
 export class ComputerServerClient {
   private readonly baseUrl: string;
   private readonly timeoutMs: number;
   /**
    * @param config - Optional base URL (default: http://localhost:8000) and
    *   per-command timeout in milliseconds (default: 30000).
    */
   constructor(config: ComputerServerConfig = {}) {
     // Drop trailing slashes so `${baseUrl}/cmd` never turns into `...//cmd`.
     this.baseUrl = (config.baseUrl ?? "http://localhost:8000").replace(/\/+$/, "");
     this.timeoutMs = config.timeoutMs ?? 30000;
   }
   /**
    * Take a screenshot of the desktop.
    * @returns Base64-encoded PNG image data
    * @throws ComputerServerError if the command fails or the response has no `image_data` string.
    */
   async screenshot(): Promise<ScreenshotResult> {
     const result = await this.call("screenshot");
     const imageData = result.image_data;
     if (typeof imageData !== "string" || imageData.length === 0) {
       throw new ComputerServerError(
         "Response to 'screenshot' is missing 'image_data'",
         "screenshot",
       );
     }
     return { imageData };
   }
   /**
    * Get the screen dimensions.
    * @throws ComputerServerError if the command fails or the response has no numeric `size`.
    */
   async getScreenSize(): Promise<ScreenSize> {
     const command = "get_screen_size";
     const result = await this.call(command);
     const [width, height] = this.numericPair(command, "size", result.size, ["width", "height"]);
     return { width, height };
   }
   /**
    * Get the current cursor position.
    * @throws ComputerServerError if the command fails or the response has no numeric `position`.
    */
   async getCursorPosition(): Promise<CursorPosition> {
     const command = "get_cursor_position";
     const result = await this.call(command);
     const [x, y] = this.numericPair(command, "position", result.position, ["x", "y"]);
     return { x, y };
   }
   /**
    * Perform a left click at the specified coordinates.
    * If coordinates are omitted, clicks at the current cursor position.
    */
   async click(x?: number, y?: number): Promise<void> {
     await this.call("left_click", { x, y });
   }
   /**
    * Perform a double click at the specified coordinates.
    */
   async doubleClick(x?: number, y?: number): Promise<void> {
     await this.call("double_click", { x, y });
   }
   /**
    * Perform a right click at the specified coordinates.
    */
   async rightClick(x?: number, y?: number): Promise<void> {
     await this.call("right_click", { x, y });
   }
   /**
    * Move the cursor to the specified coordinates.
    */
   async moveCursor(x: number, y: number): Promise<void> {
     await this.call("move_cursor", { x, y });
   }
   /**
    * Type text using the keyboard.
    */
   async type(text: string): Promise<void> {
     await this.call("type_text", { text });
   }
   /**
    * Press a single key (e.g., "Return", "Tab", "Escape").
    */
   async key(key: string): Promise<void> {
     await this.call("press_key", { key });
   }
   /**
    * Press a combination of keys (e.g., ["cmd", "c"] for copy).
    */
   async hotkey(keys: string[]): Promise<void> {
     await this.call("hotkey", { keys });
   }
   /**
    * Scroll in a direction.
    * @param direction - "up", "down", "left", or "right"
    * @param clicks - Number of scroll clicks (default: 1)
    */
   async scroll(direction: "up" | "down" | "left" | "right", clicks = 1): Promise<void> {
     if (direction === "down") {
       await this.call("scroll_down", { clicks });
     } else if (direction === "up") {
       await this.call("scroll_up", { clicks });
     } else {
       // Horizontal scroll: use scroll(x, y) where positive x = right, negative = left
       const x = direction === "right" ? 300 * clicks : -300 * clicks;
       await this.call("scroll", { x, y: 0 });
     }
   }
   /**
    * Drag from current position to target coordinates.
    */
   async dragTo(x: number, y: number, button = "left", duration = 0.5): Promise<void> {
     await this.call("drag_to", { x, y, button, duration });
   }
   /**
    * Check if the computer-server is available and responding.
    * Never throws: any failure (including a timeout after 5000 ms) yields `false`.
    */
   async healthCheck(): Promise<boolean> {
     try {
       const response = await fetch(`${this.baseUrl}/status`, {
         method: "GET",
         signal: AbortSignal.timeout(5000),
       });
       return response.ok;
     } catch {
       return false;
     }
   }
   /**
    * Extract two numeric fields from a response payload object.
    * @throws ComputerServerError if `value` is not an object or either field is not a number.
    */
   private numericPair(
     command: string,
     field: string,
     value: unknown,
     keys: readonly [string, string],
   ): [number, number] {
     const record =
       typeof value === "object" && value !== null
         ? (value as Record<string, unknown>)
         : undefined;
     const first = record?.[keys[0]];
     const second = record?.[keys[1]];
     if (typeof first !== "number" || typeof second !== "number") {
       throw new ComputerServerError(
         `Response to '${command}' is missing numeric '${field}.${keys[0]}' / '${field}.${keys[1]}'`,
         command,
       );
     }
     return [first, second];
   }
   /**
    * Send a command to the computer-server.
    * @param command - Server-side command name (e.g. "left_click").
    * @param params - Command arguments; entries whose value is `undefined` are not sent.
    * @returns The parsed response object, guaranteed to have a truthy `success`.
    * @throws ComputerServerError on transport failure or timeout, non-2xx status,
    *   unparseable body, or a response whose `success` is falsy.
    */
   private async call(
     command: string,
     params: Record<string, unknown> = {},
   ): Promise<CommandResult> {
     // Filter out undefined values from params
     const filteredParams = Object.fromEntries(
       Object.entries(params).filter(([, v]) => v !== undefined),
     );
     let text: string;
     try {
       const response = await fetch(`${this.baseUrl}/cmd`, {
         method: "POST",
         headers: {
           "Content-Type": "application/json",
         },
         body: JSON.stringify({ command, params: filteredParams }),
         signal: AbortSignal.timeout(this.timeoutMs),
       });
       if (!response.ok) {
         throw new ComputerServerError(
           `HTTP ${response.status}: ${response.statusText}`,
           command,
           response.status,
         );
       }
       text = await response.text();
     } catch (error) {
       if (error instanceof ComputerServerError) {
         throw error;
       }
       // fetch rejects on network failure and on timeout abort; surface both
       // through the client's own error type so callers see the command name.
       const reason = error instanceof Error ? error.message : String(error);
       throw new ComputerServerError(`Request to computer-server failed: ${reason}`, command);
     }
     // The /cmd endpoint returns SSE-style "data: {...}\n\n" format
     const jsonMatch = text.match(/^data:\s*(.+)$/m);
     if (!jsonMatch) {
       throw new ComputerServerError(`Invalid response format from computer-server`, command);
     }
     let parsed: unknown;
     try {
       parsed = JSON.parse(jsonMatch[1] ?? "");
     } catch {
       throw new ComputerServerError(`Invalid JSON in computer-server response`, command);
     }
     if (typeof parsed !== "object" || parsed === null) {
       throw new ComputerServerError(`Invalid response format from computer-server`, command);
     }
     const result = parsed as CommandResult;
     if (!result.success) {
       throw new ComputerServerError(result.error ?? `Command '${command}' failed`, command);
     }
     return result;
   }
 }
--- a/extensions/cua-computer/computer-tool.ts
+++ b/extensions/cua-computer/computer-tool.ts
@ -0,0 +1,312 @@
 /**
 * Computer tool for GUI automation via cua-computer-server.
 *
 * Enables agents to take screenshots, click, type, scroll, and perform
 * other desktop automation actions on sandboxes or nodes running computer-server.
 *
 * @see https://github.com/trycua/cua/tree/main/libs/python/computer-server
 */
 import { Type } from "@sinclair/typebox";
 import type { ClawdbotConfig } from "../../src/config/config.js";
 import { stringEnum } from "../../src/agents/schema/typebox.js";
 import type { AnyAgentTool } from "../../src/agents/tools/common.js";
 import { imageResult, jsonResult, readNumberParam, readStringParam } from "../../src/agents/tools/common.js";
 import { ComputerServerClient, ComputerServerError } from "./computer-server-client.js";
 /**
  * Every value accepted for the tool's `action` parameter.
  * Each entry has a matching `case` in the tool's execute switch.
  */
 const COMPUTER_ACTIONS = [
  "screenshot",
  "click",
  "double_click",
  "right_click",
  "type",
  "key",
  "hotkey",
  "scroll",
  "move",
  "drag",
  "get_screen_size",
  "get_cursor_position",
 ] as const;
 /** Every value accepted for the `direction` parameter of the `scroll` action. */
 const SCROLL_DIRECTIONS = ["up", "down", "left", "right"] as const;
 /**
  * Parameter schema for the `computer` tool.
  * Only `action` is required at the schema level; which of the optional fields
  * an action actually needs is enforced by the tool's execute function.
  */
 const ComputerToolSchema = Type.Object({
  action: stringEnum(COMPUTER_ACTIONS, {
    description:
      "Action to perform: screenshot, click, double_click, right_click, type, key, hotkey, scroll, move, drag, get_screen_size, get_cursor_position",
  }),
  // Coordinates (for click, double_click, right_click, move, and the start point of drag)
  x: Type.Optional(Type.Number({ description: "X coordinate in pixels" })),
  y: Type.Optional(Type.Number({ description: "Y coordinate in pixels" })),
  // Typing
  text: Type.Optional(Type.String({ description: "Text to type (for 'type' action)" })),
  // Key press
  key: Type.Optional(
    Type.String({
      description: "Key to press (for 'key' action), e.g., 'Return', 'Tab', 'Escape'",
    }),
  ),
  // Hotkey
  keys: Type.Optional(
    Type.Array(Type.String(), {
      description: "Keys for hotkey combination (for 'hotkey' action), e.g., ['cmd', 'c']",
    }),
  ),
  // Scroll
  direction: Type.Optional(
    stringEnum(SCROLL_DIRECTIONS, {
      description: "Scroll direction (for 'scroll' action): up, down, left, right",
    }),
  ),
  amount: Type.Optional(
    Type.Number({
      description: "Scroll amount in clicks (for 'scroll' action), default: 1",
    }),
  ),
  // Drag
  end_x: Type.Optional(Type.Number({ description: "End X coordinate for drag action" })),
  end_y: Type.Optional(Type.Number({ description: "End Y coordinate for drag action" })),
  // Connection
  // NOTE(review): this is a model-supplied value and, when present, it takes
  // precedence over the configured default URL — so the model can direct the
  // tool's HTTP requests at any host. Confirm that this is intended.
  computer_server_url: Type.Optional(
    Type.String({
      description:
        "URL of the computer-server (default: http://localhost:8000). Usually set automatically based on sandbox/node configuration.",
    }),
  ),
 });
 /** Options accepted by createComputerTool. */
 export type ComputerToolOptions = {
  /**
   * Default computer-server URL, used when a tool call does not pass
   * `computer_server_url`. Falls back to http://localhost:8000 when omitted.
   */
  defaultServerUrl?: string;
  /**
   * Clawdbot configuration.
   * NOTE(review): createComputerTool does not currently read this field.
   */
  config?: ClawdbotConfig;
 };
 /** Directions the `scroll` action accepts; checked at runtime because params are model-supplied. */
 const ACCEPTED_SCROLL_DIRECTIONS = new Set<string>(["up", "down", "left", "right"]);
 /** Narrows an arbitrary string to one of the supported scroll directions. */
 function isScrollDirection(value: string): value is "up" | "down" | "left" | "right" {
   return ACCEPTED_SCROLL_DIRECTIONS.has(value);
 }
 /**
  * Reads the optional (x, y) target of a click-style action.
  * Returns undefined when neither coordinate is given (click at the current cursor position).
  * @throws Error when only one of the two coordinates is supplied.
  */
 function readOptionalPoint(
   params: Record<string, unknown>,
 ): { x: number; y: number } | undefined {
   const x = readNumberParam(params, "x");
   const y = readNumberParam(params, "y");
   if (x === undefined && y === undefined) {
     return undefined;
   }
   if (x === undefined || y === undefined) {
     throw new Error(
       "x and y must be provided together, or both omitted to use the current cursor position",
     );
   }
   return { x, y };
 }
 /** Reads a mandatory numeric parameter, failing with a clear message if it is absent. */
 function readRequiredNumber(
   params: Record<string, unknown>,
   key: string,
   label: string,
 ): number {
   const value = readNumberParam(params, key, { required: true, label });
   if (value == null) {
     throw new Error(`${label} required`);
   }
   return value;
 }
 /**
  * Builds the `computer` agent tool, which forwards GUI automation actions to a
  * computer-server instance.
  *
  * The server URL for each call is resolved as: the call's `computer_server_url`
  * parameter, else `options.defaultServerUrl`, else http://localhost:8000.
  *
  * Failures reported as ComputerServerError are returned as a JSON result with
  * `success: false`; any other error (including invalid parameters) is rethrown.
  *
  * @param options - Optional defaults for the tool.
  * @returns The tool definition to register with the agent runtime.
  */
 export function createComputerTool(options?: ComputerToolOptions): AnyAgentTool {
   return {
     label: "Computer",
     name: "computer",
     description: `Control a computer's GUI - take screenshots, click, type, scroll, and more.
 Use this tool to interact with desktop applications running in a sandbox or on a connected node.
 **Actions:**
 - \`screenshot\`: Capture the current screen state. Always do this first to see what's on screen.
 - \`click\`: Left-click at coordinates (x, y)
 - \`double_click\`: Double-click at coordinates (x, y)
 - \`right_click\`: Right-click at coordinates (x, y)
 - \`type\`: Type text at the current cursor position
 - \`key\`: Press a single key (e.g., "Return", "Tab", "Escape")
 - \`hotkey\`: Press a key combination (e.g., ["cmd", "c"] for copy)
 - \`scroll\`: Scroll in a direction (up, down, left, right)
 - \`move\`: Move cursor to coordinates without clicking
 - \`drag\`: Drag from (x, y) to (end_x, end_y)
 - \`get_screen_size\`: Get screen dimensions
 - \`get_cursor_position\`: Get current cursor position
 **Tips:**
 - Always take a screenshot first to understand the current screen state
 - Use coordinates from screenshots to click on UI elements
 - After performing actions, take another screenshot to verify the result`,
     parameters: ComputerToolSchema,
     execute: async (_toolCallId, params) => {
       const input = params as Record<string, unknown>;
       const action = readStringParam(input, "action", {
         required: true,
       });
       const serverUrl =
         readStringParam(input, "computer_server_url") ??
         options?.defaultServerUrl ??
         "http://localhost:8000";
       const client = new ComputerServerClient({ baseUrl: serverUrl });
       try {
         switch (action) {
           case "screenshot": {
             const result = await client.screenshot();
             return await imageResult({
               label: "Screenshot",
               path: "screenshot.png",
               base64: result.imageData,
               mimeType: "image/png",
               extraText: "Screenshot captured successfully",
             });
           }
           // The three click variants differ only in which client method they call.
           case "click":
           case "double_click":
           case "right_click": {
             const point = readOptionalPoint(input);
             if (action === "click") {
               await client.click(point?.x, point?.y);
             } else if (action === "double_click") {
               await client.doubleClick(point?.x, point?.y);
             } else {
               await client.rightClick(point?.x, point?.y);
             }
             return jsonResult({
               success: true,
               action,
               coordinates: point ?? "current position",
             });
           }
           case "type": {
             const text = readStringParam(input, "text", {
               required: true,
               label: "text",
             });
             await client.type(text);
             return jsonResult({
               success: true,
               action: "type",
               text,
             });
           }
           case "key": {
             const key = readStringParam(input, "key", {
               required: true,
               label: "key",
             });
             await client.key(key);
             return jsonResult({
               success: true,
               action: "key",
               key,
             });
           }
           case "hotkey": {
             const keys = input.keys;
             if (!Array.isArray(keys) || keys.length === 0) {
               throw new Error("keys array required for hotkey action");
             }
             if (!keys.every((k): k is string => typeof k === "string" && k.length > 0)) {
               throw new Error("keys must contain only non-empty strings for hotkey action");
             }
             await client.hotkey(keys);
             return jsonResult({
               success: true,
               action: "hotkey",
               keys,
             });
           }
           case "scroll": {
             const direction = readStringParam(input, "direction", {
               required: true,
               label: "direction",
             });
             // Without this check an unknown direction would reach the client's
             // horizontal branch and be treated as a scroll to the left.
             if (!isScrollDirection(direction)) {
               throw new Error(
                 `Invalid scroll direction: ${direction} (expected up, down, left, or right)`,
               );
             }
             const amount = readNumberParam(input, "amount") ?? 1;
             if (!Number.isFinite(amount) || amount <= 0) {
               throw new Error("amount must be a positive number of scroll clicks");
             }
             await client.scroll(direction, amount);
             return jsonResult({
               success: true,
               action: "scroll",
               direction,
               amount,
             });
           }
           case "move": {
             const x = readRequiredNumber(input, "x", "x coordinate");
             const y = readRequiredNumber(input, "y", "y coordinate");
             await client.moveCursor(x, y);
             return jsonResult({
               success: true,
               action: "move",
               coordinates: { x, y },
             });
           }
           case "drag": {
             const x = readRequiredNumber(input, "x", "start x coordinate");
             const y = readRequiredNumber(input, "y", "start y coordinate");
             const endX = readRequiredNumber(input, "end_x", "end x coordinate");
             const endY = readRequiredNumber(input, "end_y", "end y coordinate");
             // Move to start position first, then drag
             await client.moveCursor(x, y);
             await client.dragTo(endX, endY);
             return jsonResult({
               success: true,
               action: "drag",
               from: { x, y },
               to: { x: endX, y: endY },
             });
           }
           case "get_screen_size": {
             const size = await client.getScreenSize();
             return jsonResult({
               success: true,
               action: "get_screen_size",
               width: size.width,
               height: size.height,
             });
           }
           case "get_cursor_position": {
             const pos = await client.getCursorPosition();
             return jsonResult({
               success: true,
               action: "get_cursor_position",
               x: pos.x,
               y: pos.y,
             });
           }
           default:
             throw new Error(`Unknown action: ${action}`);
         }
       } catch (error) {
         // Server-side failures become a structured result the agent can react
         // to; anything else (e.g. invalid parameters) propagates to the caller.
         if (error instanceof ComputerServerError) {
           return jsonResult({
             success: false,
             error: error.message,
             command: error.command,
           });
         }
         throw error;
       }
     },
   };
 }
--- a/extensions/cua-computer/index.ts
+++ b/extensions/cua-computer/index.ts
@ -0,0 +1,33 @@
 /**
 * Cua Computer Plugin
 *
 * Provides GUI automation via cua-computer-server - screenshots, clicks, typing, scrolling.
 *
 * @see https://github.com/trycua/cua/tree/main/libs/python/computer-server
 */
 import type { ClawdbotPluginDefinition } from "../../src/plugins/types.js";
 import { createComputerTool } from "./computer-tool.js";
 /** Shape this plugin expects its plugin configuration (`api.pluginConfig`) to have. */
 interface CuaComputerConfig {
  /** computer-server URL, passed to the computer tool as its default server URL. */
  serverUrl?: string;
 }
 /**
  * Plugin definition for "cua-computer".
  * On registration it creates the computer tool, seeded with the plugin's
  * configured server URL (if any) and the host configuration, and registers it.
  */
 const plugin: ClawdbotPluginDefinition = {
   id: "cua-computer",
   name: "Cua Computer",
   description: "GUI automation via cua-computer-server",
   register(api) {
     // Plugin config may be absent; treat that the same as an empty config.
     const pluginConfig: CuaComputerConfig =
       (api.pluginConfig as CuaComputerConfig | undefined) ?? {};
     const computerTool = createComputerTool({
       defaultServerUrl: pluginConfig.serverUrl,
       config: api.config,
     });
     api.registerTool(computerTool);
   },
 };
 export default plugin;
--- a/extensions/cua-computer/package.json
+++ b/extensions/cua-computer/package.json
@ -0,0 +1,6 @@
 {
  "name": "@clawdbot/plugin-cua-computer",
  "version": "0.1.0",
  "type": "module",
  "main": "index.ts"
 }