openclaw/extensions/cua-computer/computer-server-client.ts
f-trycua 438c80b758 refactor: move computer tool to cua-computer plugin
Move GUI automation from core to a standalone plugin. This keeps the
core lean and lets users opt-in when they need computer-use capabilities.

Plugin provides:
- Screenshot capture
- Mouse clicks (left, right, double)
- Keyboard input (type, key, hotkey)
- Scrolling and dragging
- Cursor position/screen size queries

Enable via config:
```yaml
plugins:
  cua-computer:
    serverUrl: "http://localhost:8000"
```
2026-01-26 15:56:22 -08:00

228 lines
6.0 KiB
TypeScript

/**
* HTTP client for communicating with cua-computer-server.
*
* computer-server provides desktop automation capabilities via HTTP POST /cmd endpoint.
* Each command returns { success: boolean, ...result } or { success: false, error: string }.
*
* Note: computer-server also exposes an MCP interface at /mcp which could be used
* if Clawdbot adds MCP client support in the future.
*
* @see https://github.com/trycua/cua/tree/main/libs/python/computer-server
*/
export interface ComputerServerConfig {
/** Base URL of the computer-server (default: http://localhost:8000) */
baseUrl?: string;
/** Request timeout in milliseconds (default: 30000) */
timeoutMs?: number;
}
export interface ScreenshotResult {
/** Base64-encoded PNG image data */
imageData: string;
}
export interface ScreenSize {
width: number;
height: number;
}
export interface CursorPosition {
x: number;
y: number;
}
export interface CommandResult {
success: boolean;
error?: string;
[key: string]: unknown;
}
export class ComputerServerError extends Error {
constructor(
message: string,
public readonly command: string,
public readonly statusCode?: number,
) {
super(message);
this.name = "ComputerServerError";
}
}
export class ComputerServerClient {
private readonly baseUrl: string;
private readonly timeoutMs: number;
constructor(config: ComputerServerConfig = {}) {
this.baseUrl = config.baseUrl ?? "http://localhost:8000";
this.timeoutMs = config.timeoutMs ?? 30000;
}
/**
* Take a screenshot of the desktop.
* @returns Base64-encoded PNG image data
*/
async screenshot(): Promise<ScreenshotResult> {
const result = await this.call("screenshot");
return { imageData: result.image_data as string };
}
/**
* Get the screen dimensions.
*/
async getScreenSize(): Promise<ScreenSize> {
const result = await this.call("get_screen_size");
const size = result.size as { width: number; height: number };
return {
width: size.width,
height: size.height,
};
}
/**
* Get the current cursor position.
*/
async getCursorPosition(): Promise<CursorPosition> {
const result = await this.call("get_cursor_position");
const position = result.position as { x: number; y: number };
return {
x: position.x,
y: position.y,
};
}
/**
* Perform a left click at the specified coordinates.
* If coordinates are omitted, clicks at the current cursor position.
*/
async click(x?: number, y?: number): Promise<void> {
await this.call("left_click", { x, y });
}
/**
* Perform a double click at the specified coordinates.
*/
async doubleClick(x?: number, y?: number): Promise<void> {
await this.call("double_click", { x, y });
}
/**
* Perform a right click at the specified coordinates.
*/
async rightClick(x?: number, y?: number): Promise<void> {
await this.call("right_click", { x, y });
}
/**
* Move the cursor to the specified coordinates.
*/
async moveCursor(x: number, y: number): Promise<void> {
await this.call("move_cursor", { x, y });
}
/**
* Type text using the keyboard.
*/
async type(text: string): Promise<void> {
await this.call("type_text", { text });
}
/**
* Press a single key (e.g., "Return", "Tab", "Escape").
*/
async key(key: string): Promise<void> {
await this.call("press_key", { key });
}
/**
* Press a combination of keys (e.g., ["cmd", "c"] for copy).
*/
async hotkey(keys: string[]): Promise<void> {
await this.call("hotkey", { keys });
}
/**
* Scroll in a direction.
* @param direction - "up", "down", "left", or "right"
* @param clicks - Number of scroll clicks (default: 1)
*/
async scroll(direction: "up" | "down" | "left" | "right", clicks = 1): Promise<void> {
if (direction === "down") {
await this.call("scroll_down", { clicks });
} else if (direction === "up") {
await this.call("scroll_up", { clicks });
} else {
// Horizontal scroll: use scroll(x, y) where positive x = right, negative = left
const x = direction === "right" ? 300 * clicks : -300 * clicks;
await this.call("scroll", { x, y: 0 });
}
}
/**
* Drag from current position to target coordinates.
*/
async dragTo(x: number, y: number, button = "left", duration = 0.5): Promise<void> {
await this.call("drag_to", { x, y, button, duration });
}
/**
* Check if the computer-server is available and responding.
*/
async healthCheck(): Promise<boolean> {
try {
const response = await fetch(`${this.baseUrl}/status`, {
method: "GET",
signal: AbortSignal.timeout(5000),
});
return response.ok;
} catch {
return false;
}
}
/**
* Send a command to the computer-server.
*/
private async call(
command: string,
params: Record<string, unknown> = {},
): Promise<CommandResult> {
// Filter out undefined values from params
const filteredParams = Object.fromEntries(
Object.entries(params).filter(([, v]) => v !== undefined),
);
const response = await fetch(`${this.baseUrl}/cmd`, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ command, params: filteredParams }),
signal: AbortSignal.timeout(this.timeoutMs),
});
if (!response.ok) {
throw new ComputerServerError(
`HTTP ${response.status}: ${response.statusText}`,
command,
response.status,
);
}
// The /cmd endpoint returns SSE-style "data: {...}\n\n" format
const text = await response.text();
const jsonMatch = text.match(/^data:\s*(.+)$/m);
if (!jsonMatch) {
throw new ComputerServerError(`Invalid response format from computer-server`, command);
}
const result = JSON.parse(jsonMatch[1]) as CommandResult;
if (!result.success) {
throw new ComputerServerError(result.error ?? `Command '${command}' failed`, command);
}
return result;
}
}