This commit is contained in:
Francesco Bonacci 2026-01-30 10:15:47 -06:00 committed by GitHub
commit 0020a25dea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 776 additions and 0 deletions

View File

@ -0,0 +1,176 @@
# Computer-Server Setup for GUI Automation
This guide explains how to set up [cua-computer-server](https://github.com/trycua/cua/tree/main/libs/python/computer-server) for desktop GUI automation with Clawdbot agents.
## Overview
The `computer` tool enables agents to:
- Take screenshots of the desktop
- Click at screen coordinates
- Type text and press keys
- Scroll and drag
- Control native applications
## Setup Options
### Option 1: Sandbox Mode (Recommended)
Use Cua's pre-built desktop sandbox images. Computer-server is already installed and running.
![cua-xfce desktop sandbox](./cua-xfce-screenshot.png)
```yaml
# In your clawdbot config
agents:
  defaults:
    sandbox:
      docker:
        image: ghcr.io/trycua/cua-xfce:latest
```
Available sandbox images:
| Image | Description |
|-------|-------------|
| `ghcr.io/trycua/cua-xfce:latest` | Linux + XFCE desktop |
| `ghcr.io/trycua/cua-ubuntu:latest` | Linux + Kasm desktop |
For more options including Windows and Android, see [Cua Desktop Sandboxes](https://cua.ai/docs/cua/guide/get-started/what-is-desktop-sandbox#docker-sandboxes).
### Option 2: Node Mode (Linux)
For Linux nodes, you can set up computer-server to run as a daemon using supervisor.
#### 1. Install computer-server
```bash
pip install cua-computer-server
```
#### 2. Create startup script
Create `/usr/local/bin/start-computer-server.sh`:
```bash
#!/bin/bash
set -e
# Wait for X server to be ready
echo "Waiting for X server to start..."
while ! xdpyinfo -display :0 >/dev/null 2>&1; do
sleep 1
done
echo "X server is ready"
# Start computer-server
export DISPLAY=:0
python3 -m computer_server --port 8000
```
Make it executable:
```bash
chmod +x /usr/local/bin/start-computer-server.sh
```
#### 3. Configure supervisor
Create `/etc/supervisor/conf.d/computer-server.conf`:
```ini
[program:computer-server]
command=/usr/local/bin/start-computer-server.sh
user=<your-user>
autorestart=true
stdout_logfile=/var/log/computer-server.log
stderr_logfile=/var/log/computer-server.error.log
```
#### 4. Start the service
```bash
sudo supervisorctl reread
sudo supervisorctl update
sudo supervisorctl start computer-server
```
### Option 3: Node Mode (Windows/macOS)
**TBD** - Setup instructions for Windows and macOS nodes will be added in a future update.
## Usage
Once computer-server is running, agents can use the `computer` tool:
```
Use the computer tool to take a screenshot and then click on the "File" menu.
```
The agent will:
1. Call `computer` with `action: "screenshot"` to see the screen
2. Identify the coordinates of the "File" menu
3. Call `computer` with `action: "click"` and the coordinates
## Configuration
You can configure the computer-server URL in your Clawdbot config:
```yaml
tools:
  computer:
    serverUrl: "http://localhost:8000"
```
Or per-agent:
```yaml
agents:
  my-agent:
    tools:
      computer:
        serverUrl: "http://192.168.1.100:8000"
```
## MCP Alternative
Computer-server also exposes an MCP (Model Context Protocol) interface at `/mcp`. This could be used as an alternative integration method if Clawdbot adds MCP client support in the future.
To use computer-server as a standalone MCP server:
```bash
python -m computer_server --mcp
```
## Troubleshooting
### Connection refused
Ensure computer-server is running and accessible:
```bash
curl http://localhost:8000/status
```
This should return: `{"status": "ok", "os_type": "...", "features": [...]}`
### No display
If computer-server fails with display errors, ensure:
- X server is running (Linux)
- The `DISPLAY` environment variable is set correctly
- The user has permission to access the display
### Sandbox not connecting
Ensure the sandbox container has port 8000 exposed and the network is accessible from Clawdbot.
## Security Considerations
- **Sandbox mode**: Isolated container - safe for untrusted workloads
- **Node mode**: Controls the actual device screen - use only with trusted agents
- **Gateway mode**: Not supported - would give agents control of your actual desktop
## Related
- [cua-computer-server](https://github.com/trycua/cua/tree/main/libs/python/computer-server) - Desktop automation server
- [Cua Desktop Sandboxes](https://cua.ai/docs/cua/guide/get-started/what-is-desktop-sandbox) - Pre-built sandbox images

Binary file not shown.

After

Width:  |  Height:  |  Size: 81 KiB

View File

@ -0,0 +1,22 @@
{
"id": "cua-computer",
"name": "Cua Computer",
"description": "GUI automation via cua-computer-server - screenshots, clicks, typing, scrolling",
"version": "0.1.0",
"configSchema": {
"type": "object",
"properties": {
"serverUrl": {
"type": "string",
"description": "computer-server URL (default: http://localhost:8000)"
}
}
},
"uiHints": {
"serverUrl": {
"label": "Server URL",
"placeholder": "http://localhost:8000",
"help": "URL of the cua-computer-server instance"
}
}
}

View File

@ -0,0 +1,227 @@
/**
* HTTP client for communicating with cua-computer-server.
*
* computer-server provides desktop automation capabilities via HTTP POST /cmd endpoint.
* Each command returns { success: boolean, ...result } or { success: false, error: string }.
*
* Note: computer-server also exposes an MCP interface at /mcp which could be used
* if Clawdbot adds MCP client support in the future.
*
* @see https://github.com/trycua/cua/tree/main/libs/python/computer-server
*/
/**
 * Options accepted by the ComputerServerClient constructor.
 *
 * Both fields are optional; the constructor falls back to the documented
 * default whenever a field is omitted.
 */
export interface ComputerServerConfig {
/** Base URL of the computer-server (default: http://localhost:8000) */
baseUrl?: string;
/** Request timeout in milliseconds (default: 30000) */
timeoutMs?: number;
}
/**
 * Value returned by ComputerServerClient.screenshot().
 *
 * `imageData` is taken from the `image_data` field of the server response.
 */
export interface ScreenshotResult {
/** Base64-encoded PNG image data */
imageData: string;
}
/**
 * Screen dimensions returned by ComputerServerClient.getScreenSize().
 *
 * Values are copied from the `size` object of the server response.
 * NOTE(review): presumably pixels - confirm against the computer-server API.
 */
export interface ScreenSize {
width: number;
height: number;
}
/**
 * Cursor location returned by ComputerServerClient.getCursorPosition().
 *
 * Values are copied from the `position` object of the server response.
 * NOTE(review): presumably screen pixels - confirm against the computer-server API.
 */
export interface CursorPosition {
x: number;
y: number;
}
/**
 * Parsed JSON payload of a single /cmd response.
 *
 * The client throws when `success` is falsy, using `error` as the message
 * when it is present. Command-specific fields (for example `image_data`,
 * `size` or `position`) are reached through the index signature and are
 * therefore typed `unknown`.
 */
export interface CommandResult {
success: boolean;
error?: string;
[key: string]: unknown;
}
/**
 * Error type thrown by the client whenever a computer-server command fails.
 *
 * Besides the human-readable message it records which command was being
 * executed and, when one is supplied, the HTTP status code of the response.
 */
export class ComputerServerError extends Error {
  /** Name of the command that was being executed when the failure occurred. */
  public readonly command: string;
  /** HTTP status code associated with the failure, if the caller supplied one. */
  public readonly statusCode?: number;

  /**
   * @param message - Description of what went wrong.
   * @param command - Command name the failure belongs to.
   * @param statusCode - Optional HTTP status code of the failed response.
   */
  constructor(message: string, command: string, statusCode?: number) {
    super(message);
    this.command = command;
    this.statusCode = statusCode;
    // Override the default "Error" so logs and `error.name` checks identify this type.
    this.name = "ComputerServerError";
  }
}
/**
 * HTTP client for a cua-computer-server instance.
 *
 * Every automation method sends one `POST {baseUrl}/cmd` request whose JSON body is
 * `{ command, params }` and expects an SSE-style `data: {...}` line in the reply.
 * Every failure of such a request - transport error, timeout, non-2xx status,
 * malformed body, missing result fields, or `success: false` - is reported as a
 * ComputerServerError, so callers only have to handle a single error type.
 */
export class ComputerServerClient {
  /** Horizontal scroll distance sent to the server for each requested click. */
  private static readonly HORIZONTAL_SCROLL_STEP = 300;

  /** Server root without trailing slashes, so endpoint paths can be appended verbatim. */
  private readonly baseUrl: string;
  /** Timeout applied to each /cmd request, in milliseconds. */
  private readonly timeoutMs: number;

  /**
   * @param config - Optional connection settings. A missing or blank `baseUrl`
   *   falls back to `http://localhost:8000`; trailing slashes are removed so that
   *   `http://host:8000/` and `http://host:8000` address the same endpoints.
   */
  constructor(config: ComputerServerConfig = {}) {
    const normalized = config.baseUrl?.trim().replace(/\/+$/, "");
    this.baseUrl = normalized ? normalized : "http://localhost:8000";
    this.timeoutMs = config.timeoutMs ?? 30000;
  }

  /**
   * Take a screenshot of the desktop.
   * @returns Base64-encoded PNG image data
   * @throws ComputerServerError if the response carries no `image_data` string.
   */
  async screenshot(): Promise<ScreenshotResult> {
    const result = await this.call("screenshot");
    const imageData = result.image_data;
    if (typeof imageData !== "string" || imageData.length === 0) {
      throw new ComputerServerError("Screenshot response did not include image data", "screenshot");
    }
    return { imageData };
  }

  /**
   * Get the screen dimensions.
   * @throws ComputerServerError if the response lacks a numeric `size.width` / `size.height`.
   */
  async getScreenSize(): Promise<ScreenSize> {
    const command = "get_screen_size";
    const result = await this.call(command);
    return {
      width: ComputerServerClient.readNumber(result.size, "width", command),
      height: ComputerServerClient.readNumber(result.size, "height", command),
    };
  }

  /**
   * Get the current cursor position.
   * @throws ComputerServerError if the response lacks a numeric `position.x` / `position.y`.
   */
  async getCursorPosition(): Promise<CursorPosition> {
    const command = "get_cursor_position";
    const result = await this.call(command);
    return {
      x: ComputerServerClient.readNumber(result.position, "x", command),
      y: ComputerServerClient.readNumber(result.position, "y", command),
    };
  }

  /**
   * Perform a left click at the specified coordinates.
   * If coordinates are omitted, clicks at the current cursor position.
   */
  async click(x?: number, y?: number): Promise<void> {
    await this.call("left_click", { x, y });
  }

  /**
   * Perform a double click at the specified coordinates.
   */
  async doubleClick(x?: number, y?: number): Promise<void> {
    await this.call("double_click", { x, y });
  }

  /**
   * Perform a right click at the specified coordinates.
   */
  async rightClick(x?: number, y?: number): Promise<void> {
    await this.call("right_click", { x, y });
  }

  /**
   * Move the cursor to the specified coordinates.
   */
  async moveCursor(x: number, y: number): Promise<void> {
    await this.call("move_cursor", { x, y });
  }

  /**
   * Type text using the keyboard.
   */
  async type(text: string): Promise<void> {
    await this.call("type_text", { text });
  }

  /**
   * Press a single key (e.g., "Return", "Tab", "Escape").
   */
  async key(key: string): Promise<void> {
    await this.call("press_key", { key });
  }

  /**
   * Press a combination of keys (e.g., ["cmd", "c"] for copy).
   */
  async hotkey(keys: string[]): Promise<void> {
    await this.call("hotkey", { keys });
  }

  /**
   * Scroll in a direction.
   * @param direction - "up", "down", "left", or "right"
   * @param clicks - Number of scroll clicks (default: 1)
   * @throws ComputerServerError if `direction` is none of the four supported values
   *   (possible at runtime when the value comes from untyped input).
   */
  async scroll(direction: "up" | "down" | "left" | "right", clicks = 1): Promise<void> {
    switch (direction) {
      case "down":
        await this.call("scroll_down", { clicks });
        return;
      case "up":
        await this.call("scroll_up", { clicks });
        return;
      case "left":
      case "right": {
        // Horizontal scroll: use scroll(x, y) where positive x = right, negative = left
        const distance = ComputerServerClient.HORIZONTAL_SCROLL_STEP * clicks;
        await this.call("scroll", { x: direction === "right" ? distance : -distance, y: 0 });
        return;
      }
      default:
        // Reject unknown values explicitly instead of silently treating them as "left".
        throw new ComputerServerError(
          `Unsupported scroll direction: ${String(direction)}`,
          "scroll",
        );
    }
  }

  /**
   * Drag from current position to target coordinates.
   */
  async dragTo(x: number, y: number, button = "left", duration = 0.5): Promise<void> {
    await this.call("drag_to", { x, y, button, duration });
  }

  /**
   * Check if the computer-server is available and responding.
   * Never throws: any failure (including a 5 second timeout) yields `false`.
   */
  async healthCheck(): Promise<boolean> {
    try {
      const response = await fetch(`${this.baseUrl}/status`, {
        method: "GET",
        signal: AbortSignal.timeout(5000),
      });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * Extract a finite number from `container[field]`, or fail with a ComputerServerError.
   */
  private static readNumber(container: unknown, field: string, command: string): number {
    if (typeof container === "object" && container !== null) {
      const value = (container as Record<string, unknown>)[field];
      if (typeof value === "number" && Number.isFinite(value)) {
        return value;
      }
    }
    throw new ComputerServerError(
      `Response for '${command}' is missing numeric field '${field}'`,
      command,
    );
  }

  /**
   * Build a readable message for a transport-level failure, including the
   * underlying cause when the runtime attaches one to the thrown error.
   */
  private static describeFailure(error: unknown): string {
    if (!(error instanceof Error)) {
      return String(error);
    }
    const cause = (error as { cause?: unknown }).cause;
    return cause instanceof Error && cause.message
      ? `${error.message} (${cause.message})`
      : error.message;
  }

  /**
   * POST a serialized command to /cmd and return the raw response body.
   * Transport errors, timeouts and non-2xx statuses all surface as ComputerServerError.
   */
  private async postCommand(command: string, body: string): Promise<string> {
    try {
      const response = await fetch(`${this.baseUrl}/cmd`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
        },
        body,
        signal: AbortSignal.timeout(this.timeoutMs),
      });
      if (!response.ok) {
        throw new ComputerServerError(
          `HTTP ${response.status}: ${response.statusText}`,
          command,
          response.status,
        );
      }
      return await response.text();
    } catch (error) {
      if (error instanceof ComputerServerError) {
        throw error;
      }
      throw new ComputerServerError(
        `Request to computer-server failed: ${ComputerServerClient.describeFailure(error)}`,
        command,
      );
    }
  }

  /**
   * Send a command to the computer-server.
   *
   * @param command - Command name understood by the /cmd endpoint.
   * @param params - Command arguments; entries whose value is `undefined` are dropped.
   * @returns The parsed response payload (guaranteed to have a truthy `success`).
   * @throws ComputerServerError on any transport, format or command failure.
   */
  private async call(
    command: string,
    params: Record<string, unknown> = {},
  ): Promise<CommandResult> {
    // Filter out undefined values from params
    const filteredParams = Object.fromEntries(
      Object.entries(params).filter(([, value]) => value !== undefined),
    );
    const text = await this.postCommand(
      command,
      JSON.stringify({ command, params: filteredParams }),
    );

    // The /cmd endpoint returns SSE-style "data: {...}\n\n" format
    const jsonMatch = text.match(/^data:\s*(.+)$/m);
    if (!jsonMatch) {
      throw new ComputerServerError(`Invalid response format from computer-server`, command);
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(jsonMatch[1]);
    } catch {
      throw new ComputerServerError(`Invalid response format from computer-server`, command);
    }
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      throw new ComputerServerError(`Invalid response format from computer-server`, command);
    }

    const result = parsed as CommandResult;
    if (!result.success) {
      const reason =
        typeof result.error === "string" && result.error.length > 0
          ? result.error
          : `Command '${command}' failed`;
      throw new ComputerServerError(reason, command);
    }
    return result;
  }
}

View File

@ -0,0 +1,312 @@
/**
* Computer tool for GUI automation via cua-computer-server.
*
* Enables agents to take screenshots, click, type, scroll, and perform
* other desktop automation actions on sandboxes or nodes running computer-server.
*
* @see https://github.com/trycua/cua/tree/main/libs/python/computer-server
*/
import { Type } from "@sinclair/typebox";
import type { ClawdbotConfig } from "../../src/config/config.js";
import { stringEnum } from "../../src/agents/schema/typebox.js";
import type { AnyAgentTool } from "../../src/agents/tools/common.js";
import { imageResult, jsonResult, readNumberParam, readStringParam } from "../../src/agents/tools/common.js";
import { ComputerServerClient, ComputerServerError } from "./computer-server-client.js";
/**
 * Action names accepted by the `action` parameter of the computer tool.
 * Each entry has a matching `case` in the tool's execute switch.
 */
const COMPUTER_ACTIONS = [
"screenshot",
"click",
"double_click",
"right_click",
"type",
"key",
"hotkey",
"scroll",
"move",
"drag",
"get_screen_size",
"get_cursor_position",
] as const;
/** Direction names accepted by the `direction` parameter of the scroll action. */
const SCROLL_DIRECTIONS = ["up", "down", "left", "right"] as const;
/**
 * Parameter schema of the computer tool.
 *
 * Only `action` is mandatory at the schema level; every other field is
 * optional here and is checked by the handler of the action that needs it.
 */
const ComputerToolSchema = Type.Object({
action: stringEnum(COMPUTER_ACTIONS, {
description:
"Action to perform: screenshot, click, double_click, right_click, type, key, hotkey, scroll, move, drag, get_screen_size, get_cursor_position",
}),
// Coordinates (for click, double_click, right_click, move, and the start point of drag)
x: Type.Optional(Type.Number({ description: "X coordinate in pixels" })),
y: Type.Optional(Type.Number({ description: "Y coordinate in pixels" })),
// Typing
text: Type.Optional(Type.String({ description: "Text to type (for 'type' action)" })),
// Key press
key: Type.Optional(
Type.String({
description: "Key to press (for 'key' action), e.g., 'Return', 'Tab', 'Escape'",
}),
),
// Hotkey
keys: Type.Optional(
Type.Array(Type.String(), {
description: "Keys for hotkey combination (for 'hotkey' action), e.g., ['cmd', 'c']",
}),
),
// Scroll
direction: Type.Optional(
stringEnum(SCROLL_DIRECTIONS, {
description: "Scroll direction (for 'scroll' action): up, down, left, right",
}),
),
amount: Type.Optional(
Type.Number({
description: "Scroll amount in clicks (for 'scroll' action), default: 1",
}),
),
// Drag (end point; the start point is given by x / y)
end_x: Type.Optional(Type.Number({ description: "End X coordinate for drag action" })),
end_y: Type.Optional(Type.Number({ description: "End Y coordinate for drag action" })),
// Connection (per-call override of the computer-server address)
computer_server_url: Type.Optional(
Type.String({
description:
"URL of the computer-server (default: http://localhost:8000). Usually set automatically based on sandbox/node configuration.",
}),
),
});
/**
 * Options accepted by createComputerTool.
 */
export type ComputerToolOptions = {
/** Default computer-server URL, used when a tool call does not pass `computer_server_url` */
defaultServerUrl?: string;
/** Clawdbot configuration (accepted for callers; createComputerTool does not read it in this file) */
config?: ClawdbotConfig;
};
/** Server URL used when neither the tool call nor the tool options provide one. */
const FALLBACK_SERVER_URL = "http://localhost:8000";

/** Returns the trimmed string, or `undefined` when the value is missing or blank. */
function nonBlank(value: string | undefined): string | undefined {
  const trimmed = value?.trim();
  return trimmed ? trimmed : undefined;
}

/** Narrows a raw string to one of the four scroll directions the client supports. */
function isScrollDirection(value: string): value is "up" | "down" | "left" | "right" {
  return value === "up" || value === "down" || value === "left" || value === "right";
}

/** Reads a numeric parameter that the current action cannot run without. */
function requireNumber(args: Record<string, unknown>, key: string, label: string): number {
  const value = readNumberParam(args, key, { required: true, label });
  if (value === undefined) {
    throw new Error(`${label} required`);
  }
  return value;
}

/**
 * Reads the optional (x, y) pair used by the click actions.
 * Returns `undefined` when neither coordinate was given; rejects a lone coordinate,
 * which would otherwise be sent to the server while being reported as "current position".
 */
function readOptionalPoint(args: Record<string, unknown>): { x: number; y: number } | undefined {
  const x = readNumberParam(args, "x");
  const y = readNumberParam(args, "y");
  if (x === undefined && y === undefined) {
    return undefined;
  }
  if (x === undefined || y === undefined) {
    throw new Error("x and y must be provided together");
  }
  return { x, y };
}

/**
 * Create the `computer` agent tool, which forwards GUI actions to a computer-server.
 *
 * The server address is resolved per call, in order: the `computer_server_url`
 * parameter, `options.defaultServerUrl` (ignored when blank), then
 * `http://localhost:8000`.
 *
 * Failures reported by the client (ComputerServerError) are returned as a JSON
 * result with `success: false`; parameter validation errors are thrown.
 *
 * NOTE(review): `computer_server_url` is supplied by the agent, so a tool call can
 * direct HTTP requests at any host reachable from this process. Confirm this is
 * acceptable for the deployment or restrict the parameter upstream.
 *
 * @param options - Optional defaults for the tool.
 * @returns The tool definition to register with the agent runtime.
 */
export function createComputerTool(options?: ComputerToolOptions): AnyAgentTool {
  return {
    label: "Computer",
    name: "computer",
    description: `Control a computer's GUI - take screenshots, click, type, scroll, and more.
Use this tool to interact with desktop applications running in a sandbox or on a connected node.
**Actions:**
- \`screenshot\`: Capture the current screen state. Always do this first to see what's on screen.
- \`click\`: Left-click at coordinates (x, y)
- \`double_click\`: Double-click at coordinates (x, y)
- \`right_click\`: Right-click at coordinates (x, y)
- \`type\`: Type text at the current cursor position
- \`key\`: Press a single key (e.g., "Return", "Tab", "Escape")
- \`hotkey\`: Press a key combination (e.g., ["cmd", "c"] for copy)
- \`scroll\`: Scroll in a direction (up, down, left, right)
- \`move\`: Move cursor to coordinates without clicking
- \`drag\`: Drag from (x, y) to (end_x, end_y)
- \`get_screen_size\`: Get screen dimensions
- \`get_cursor_position\`: Get current cursor position
**Tips:**
- Always take a screenshot first to understand the current screen state
- Use coordinates from screenshots to click on UI elements
- After performing actions, take another screenshot to verify the result`,
    parameters: ComputerToolSchema,
    execute: async (_toolCallId, params) => {
      const args = params as Record<string, unknown>;
      const action = readStringParam(args, "action", {
        required: true,
      });
      const serverUrl =
        readStringParam(args, "computer_server_url") ??
        nonBlank(options?.defaultServerUrl) ??
        FALLBACK_SERVER_URL;
      const client = new ComputerServerClient({ baseUrl: serverUrl });

      try {
        switch (action) {
          case "screenshot": {
            const result = await client.screenshot();
            return await imageResult({
              label: "Screenshot",
              path: "screenshot.png",
              base64: result.imageData,
              mimeType: "image/png",
              extraText: "Screenshot captured successfully",
            });
          }
          case "click":
          case "double_click":
          case "right_click": {
            // The three click variants differ only in the client method they invoke.
            const point = readOptionalPoint(args);
            if (action === "click") {
              await client.click(point?.x, point?.y);
            } else if (action === "double_click") {
              await client.doubleClick(point?.x, point?.y);
            } else {
              await client.rightClick(point?.x, point?.y);
            }
            return jsonResult({
              success: true,
              action,
              coordinates: point ?? "current position",
            });
          }
          case "type": {
            const text = readStringParam(args, "text", {
              required: true,
              label: "text",
            });
            await client.type(text);
            return jsonResult({
              success: true,
              action: "type",
              text,
            });
          }
          case "key": {
            const key = readStringParam(args, "key", {
              required: true,
              label: "key",
            });
            await client.key(key);
            return jsonResult({
              success: true,
              action: "key",
              key,
            });
          }
          case "hotkey": {
            const rawKeys = args.keys;
            if (!Array.isArray(rawKeys) || rawKeys.length === 0) {
              throw new Error("keys array required for hotkey action");
            }
            const keys = rawKeys.filter(
              (entry): entry is string => typeof entry === "string" && entry.length > 0,
            );
            if (keys.length !== rawKeys.length) {
              throw new Error("keys must contain only non-empty strings");
            }
            await client.hotkey(keys);
            return jsonResult({
              success: true,
              action: "hotkey",
              keys,
            });
          }
          case "scroll": {
            const direction = readStringParam(args, "direction", {
              required: true,
              label: "direction",
            });
            // Validate instead of casting: an unknown value must not be sent as a scroll.
            if (!isScrollDirection(direction)) {
              throw new Error(
                `Invalid scroll direction: ${direction} (expected up, down, left, or right)`,
              );
            }
            const amount = readNumberParam(args, "amount") ?? 1;
            if (!(amount > 0)) {
              // A negative amount would reverse the requested direction.
              throw new Error("amount must be a positive number");
            }
            await client.scroll(direction, amount);
            return jsonResult({
              success: true,
              action: "scroll",
              direction,
              amount,
            });
          }
          case "move": {
            const x = requireNumber(args, "x", "x coordinate");
            const y = requireNumber(args, "y", "y coordinate");
            await client.moveCursor(x, y);
            return jsonResult({
              success: true,
              action: "move",
              coordinates: { x, y },
            });
          }
          case "drag": {
            const x = requireNumber(args, "x", "start x coordinate");
            const y = requireNumber(args, "y", "start y coordinate");
            const endX = requireNumber(args, "end_x", "end x coordinate");
            const endY = requireNumber(args, "end_y", "end y coordinate");
            // Move to start position first, then drag
            await client.moveCursor(x, y);
            await client.dragTo(endX, endY);
            return jsonResult({
              success: true,
              action: "drag",
              from: { x, y },
              to: { x: endX, y: endY },
            });
          }
          case "get_screen_size": {
            const size = await client.getScreenSize();
            return jsonResult({
              success: true,
              action: "get_screen_size",
              width: size.width,
              height: size.height,
            });
          }
          case "get_cursor_position": {
            const pos = await client.getCursorPosition();
            return jsonResult({
              success: true,
              action: "get_cursor_position",
              x: pos.x,
              y: pos.y,
            });
          }
          default:
            throw new Error(`Unknown action: ${action}`);
        }
      } catch (error) {
        // Server-side failures are returned to the agent; everything else propagates.
        if (error instanceof ComputerServerError) {
          return jsonResult({
            success: false,
            error: error.message,
            command: error.command,
          });
        }
        throw error;
      }
    },
  };
}

View File

@ -0,0 +1,33 @@
/**
* Cua Computer Plugin
*
* Provides GUI automation via cua-computer-server - screenshots, clicks, typing, scrolling.
*
* @see https://github.com/trycua/cua/tree/main/libs/python/computer-server
*/
import type { ClawdbotPluginDefinition } from "../../src/plugins/types.js";
import { createComputerTool } from "./computer-tool.js";
/**
 * Shape this plugin expects for its own configuration object.
 */
interface CuaComputerConfig {
/** computer-server URL handed to the computer tool as its default server address */
serverUrl?: string;
}
/**
 * Plugin definition for the Cua Computer integration.
 *
 * On registration it builds the `computer` tool, seeding it with the server URL
 * from the plugin configuration (when present) and the host configuration.
 */
const plugin: ClawdbotPluginDefinition = {
  id: "cua-computer",
  name: "Cua Computer",
  description: "GUI automation via cua-computer-server",
  register(api) {
    // The plugin config is untyped at this boundary; treat it as our optional shape.
    const pluginConfig = api.pluginConfig as CuaComputerConfig | undefined;
    const computerTool = createComputerTool({
      defaultServerUrl: pluginConfig?.serverUrl,
      config: api.config,
    });
    api.registerTool(computerTool);
  },
};

export default plugin;

View File

@ -0,0 +1,6 @@
{
"name": "@clawdbot/plugin-cua-computer",
"version": "0.1.0",
"type": "module",
"main": "index.ts"
}