diff --git a/docs/AGENTS.default.md b/docs/AGENTS.default.md index 62750c68b..1c1f625b7 100644 --- a/docs/AGENTS.default.md +++ b/docs/AGENTS.default.md @@ -112,4 +112,4 @@ git commit -m "Add Clawd workspace" - Canvas UI runs full-screen with native overlays. Avoid placing critical controls in the top-left/top-right/bottom edges; add explicit gutters in the layout and don’t rely on safe-area insets. - For browser-driven verification, use `clawdis browser` (tabs/status/screenshot) with the clawd-managed Chrome profile. - For DOM inspection, use `clawdis browser eval|query|dom|snapshot` (and `--json`/`--out` when you need machine output). -- For interactions, use `clawdis browser click|type|hover|drag|select|upload|press|wait|navigate|back|evaluate|run`. +- For interactions, use `clawdis browser click|type|hover|drag|select|upload|press|wait|navigate|back|evaluate|run` (click/type accept `--selector`). diff --git a/docs/browser.md b/docs/browser.md index 6297420fc..0358ef0b0 100644 --- a/docs/browser.md +++ b/docs/browser.md @@ -174,7 +174,9 @@ Actions: - `clawdis browser navigate https://example.com` - `clawdis browser resize 1280 720` - `clawdis browser click 12 --double` +- `clawdis browser click --selector 'button.save'` - `clawdis browser type 23 "hello" --submit` +- `clawdis browser type --selector "input[name=q]" "hello"` - `clawdis browser press Enter` - `clawdis browser hover 44` - `clawdis browser drag 10 11` @@ -191,6 +193,7 @@ Notes: - `upload` and `dialog` are **arming** calls; run them before the click/press that triggers the chooser/dialog. - The arm default timeout is **2 minutes** (clamped to max 2 minutes); pass `timeoutMs` if you need shorter. - `snapshot --format ai` returns AI snapshot markup used for ref-based actions. +- `click`/`type` accept `--selector` to target CSS selectors instead of AI refs. ## Security & privacy notes diff --git a/docs/tools.md b/docs/tools.md index e49f552d9..d682f4f02 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -26,6 +26,7 @@ Core actions: Notes: - Requires `browser.enabled=true` in `~/.clawdis/clawdis.json`. - Uses `browser.controlUrl` unless `controlUrl` is passed explicitly. +- `act` supports CSS selectors for `click`/`type` via `selector` (use `ref` for AI snapshot targets). ### `clawdis_canvas` Drive the node Canvas (present, eval, snapshot, A2UI). diff --git a/src/agents/clawdis-tools.ts b/src/agents/clawdis-tools.ts index c3fcc5f02..57aa1961b 100644 --- a/src/agents/clawdis-tools.ts +++ b/src/agents/clawdis-tools.ts @@ -356,6 +356,7 @@ const BrowserActSchema = Type.Object({ Type.Literal("close"), ]), ref: Type.Optional(Type.String()), + selector: Type.Optional(Type.String()), targetId: Type.Optional(Type.String()), doubleClick: Type.Optional(Type.Boolean()), button: Type.Optional(Type.String()), diff --git a/src/browser/client-actions-core.ts b/src/browser/client-actions-core.ts index 117167349..273a5be6a 100644 --- a/src/browser/client-actions-core.ts +++ b/src/browser/client-actions-core.ts @@ -14,7 +14,8 @@ export type BrowserFormField = { export type BrowserActRequest = | { kind: "click"; - ref: string; + ref?: string; + selector?: string; targetId?: string; doubleClick?: boolean; button?: string; @@ -22,7 +23,8 @@ export type BrowserActRequest = } | { kind: "type"; - ref: string; + ref?: string; + selector?: string; text: string; targetId?: string; submit?: boolean; diff --git a/src/browser/pw-ai.test.ts b/src/browser/pw-ai.test.ts index d769ed55d..36c43b5ee 100644 --- a/src/browser/pw-ai.test.ts +++ b/src/browser/pw-ai.test.ts @@ -29,7 +29,8 @@ function createPage(opts: { const click = vi.fn().mockResolvedValue(undefined); const dblclick = vi.fn().mockResolvedValue(undefined); - const locator = vi.fn().mockReturnValue({ click, dblclick }); + const fill = vi.fn().mockResolvedValue(undefined); + const locator = vi.fn().mockReturnValue({ click, dblclick, fill }); const page = { context: () => context, @@ -44,7 +45,7 @@ function createPage(opts: { }), }; - return { page, session, locator, click }; + return { page, session, locator, click, fill }; } function createBrowser(pages: unknown[]) { @@ -110,6 +111,45 @@ describe("pw-ai", () => { expect(p1.click).toHaveBeenCalledTimes(1); }); + it("clicks a css selector when provided", async () => { + const { chromium } = await import("playwright-core"); + const p1 = createPage({ targetId: "T1" }); + const browser = createBrowser([p1.page]); + ( + chromium.connectOverCDP as unknown as ReturnType + ).mockResolvedValue(browser); + + const mod = await importModule(); + await mod.clickViaPlaywright({ + cdpPort: 18792, + targetId: "T1", + selector: "button.save", + }); + + expect(p1.locator).toHaveBeenCalledWith("button.save"); + expect(p1.click).toHaveBeenCalledTimes(1); + }); + + it("types via css selector when provided", async () => { + const { chromium } = await import("playwright-core"); + const p1 = createPage({ targetId: "T1" }); + const browser = createBrowser([p1.page]); + ( + chromium.connectOverCDP as unknown as ReturnType + ).mockResolvedValue(browser); + + const mod = await importModule(); + await mod.typeViaPlaywright({ + cdpPort: 18792, + targetId: "T1", + selector: "input[name=q]", + text: "hello", + }); + + expect(p1.locator).toHaveBeenCalledWith("input[name=q]"); + expect(p1.fill).toHaveBeenCalledTimes(1); + }); + it("fails with a clear error when _snapshotForAI is missing", async () => { const { chromium } = await import("playwright-core"); const p1 = createPage({ targetId: "T1", hasSnapshotForAI: false }); diff --git a/src/browser/pw-tools-core.ts b/src/browser/pw-tools-core.ts index 409787b01..2218e0a19 100644 --- a/src/browser/pw-tools-core.ts +++ b/src/browser/pw-tools-core.ts @@ -10,6 +10,19 @@ import { let nextUploadArmId = 0; let nextDialogArmId = 0; +type LocatorPage = Parameters[0]; + +function resolveLocator( + page: LocatorPage, + opts: { ref?: string; selector?: string }, +) { + const selector = typeof opts.selector === "string" ? opts.selector.trim() : ""; + if (selector) return page.locator(selector); + const ref = typeof opts.ref === "string" ? opts.ref.trim() : ""; + if (ref) return refLocator(page, ref); + throw new Error("ref or selector is required"); +} + export async function snapshotAiViaPlaywright(opts: { cdpPort: number; targetId?: string; @@ -41,21 +54,22 @@ export async function snapshotAiViaPlaywright(opts: { export async function clickViaPlaywright(opts: { cdpPort: number; targetId?: string; - ref: string; + ref?: string; + selector?: string; doubleClick?: boolean; button?: "left" | "right" | "middle"; modifiers?: Array<"Alt" | "Control" | "ControlOrMeta" | "Meta" | "Shift">; timeoutMs?: number; }): Promise { - const ref = String(opts.ref ?? "").trim(); - if (!ref) throw new Error("ref is required"); - const page = await getPageForTargetId({ cdpPort: opts.cdpPort, targetId: opts.targetId, }); ensurePageState(page); - const locator = refLocator(page, ref); + const locator = resolveLocator(page, { + ref: opts.ref, + selector: opts.selector, + }); const timeout = Math.max( 500, Math.min(60_000, Math.floor(opts.timeoutMs ?? 8000)), @@ -142,18 +156,20 @@ export async function pressKeyViaPlaywright(opts: { export async function typeViaPlaywright(opts: { cdpPort: number; targetId?: string; - ref: string; + ref?: string; + selector?: string; text: string; submit?: boolean; slowly?: boolean; timeoutMs?: number; }): Promise { - const ref = String(opts.ref ?? "").trim(); - if (!ref) throw new Error("ref is required"); const text = String(opts.text ?? ""); const page = await getPageForTargetId(opts); ensurePageState(page); - const locator = refLocator(page, ref); + const locator = resolveLocator(page, { + ref: opts.ref, + selector: opts.selector, + }); const timeout = Math.max(500, Math.min(60_000, opts.timeoutMs ?? 8000)); if (opts.slowly) { await locator.click({ timeout }); diff --git a/src/browser/routes/agent.ts b/src/browser/routes/agent.ts index c88775067..df89dfb2a 100644 --- a/src/browser/routes/agent.ts +++ b/src/browser/routes/agent.ts @@ -139,7 +139,9 @@ export function registerBrowserAgentRoutes( switch (kind) { case "click": { const ref = toStringOrEmpty(body.ref); - if (!ref) return jsonError(res, 400, "ref is required"); + const selector = toStringOrEmpty(body.selector); + if (!ref && !selector) + return jsonError(res, 400, "ref or selector is required"); const doubleClick = toBoolean(body.doubleClick) ?? false; const buttonRaw = toStringOrEmpty(body.button) || ""; const button = buttonRaw ? parseClickButton(buttonRaw) : undefined; @@ -166,32 +168,38 @@ export function registerBrowserAgentRoutes( const modifiers = modifiersRaw.length ? (modifiersRaw as ClickModifier[]) : undefined; - await pw.clickViaPlaywright({ + const clickRequest: Parameters[0] = { cdpPort, targetId: tab.targetId, - ref, doubleClick, - button, - modifiers, - }); + }; + if (ref) clickRequest.ref = ref; + if (selector) clickRequest.selector = selector; + if (button) clickRequest.button = button; + if (modifiers) clickRequest.modifiers = modifiers; + await pw.clickViaPlaywright(clickRequest); return res.json({ ok: true, targetId: tab.targetId, url: tab.url }); } case "type": { const ref = toStringOrEmpty(body.ref); - if (!ref) return jsonError(res, 400, "ref is required"); + const selector = toStringOrEmpty(body.selector); + if (!ref && !selector) + return jsonError(res, 400, "ref or selector is required"); if (typeof body.text !== "string") return jsonError(res, 400, "text is required"); const text = body.text; const submit = toBoolean(body.submit) ?? false; const slowly = toBoolean(body.slowly) ?? false; - await pw.typeViaPlaywright({ + const typeRequest: Parameters[0] = { cdpPort, targetId: tab.targetId, - ref, text, submit, slowly, - }); + }; + if (ref) typeRequest.ref = ref; + if (selector) typeRequest.selector = selector; + await pw.typeViaPlaywright(typeRequest); return res.json({ ok: true, targetId: tab.targetId }); } case "press": { diff --git a/src/browser/server.test.ts b/src/browser/server.test.ts index f1a953abd..be7624cbc 100644 --- a/src/browser/server.test.ts +++ b/src/browser/server.test.ts @@ -318,7 +318,7 @@ describe("browser control server", () => { }), }).then((r) => r.json())) as { ok: boolean }; expect(click.ok).toBe(true); - expect(pwMocks.clickViaPlaywright).toHaveBeenCalledWith({ + expect(pwMocks.clickViaPlaywright).toHaveBeenNthCalledWith(1, { cdpPort: testPort + 1, targetId: "abcd1234", ref: "1", @@ -327,13 +327,29 @@ describe("browser control server", () => { modifiers: ["Shift"], }); + const clickSelector = (await realFetch(`${base}/act`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + kind: "click", + selector: "button.save", + }), + }).then((r) => r.json())) as { ok: boolean }; + expect(clickSelector.ok).toBe(true); + expect(pwMocks.clickViaPlaywright).toHaveBeenNthCalledWith(2, { + cdpPort: testPort + 1, + targetId: "abcd1234", + selector: "button.save", + doubleClick: false, + }); + const type = (await realFetch(`${base}/act`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ kind: "type", ref: "1", text: "" }), }).then((r) => r.json())) as { ok: boolean }; expect(type.ok).toBe(true); - expect(pwMocks.typeViaPlaywright).toHaveBeenCalledWith({ + expect(pwMocks.typeViaPlaywright).toHaveBeenNthCalledWith(1, { cdpPort: testPort + 1, targetId: "abcd1234", ref: "1", @@ -342,6 +358,26 @@ describe("browser control server", () => { slowly: false, }); + const typeSelector = (await realFetch(`${base}/act`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + kind: "type", + selector: "input[name=q]", + text: "hello", + submit: true, + }), + }).then((r) => r.json())) as { ok: boolean }; + expect(typeSelector.ok).toBe(true); + expect(pwMocks.typeViaPlaywright).toHaveBeenNthCalledWith(2, { + cdpPort: testPort + 1, + targetId: "abcd1234", + selector: "input[name=q]", + text: "hello", + submit: true, + slowly: false, + }); + const press = (await realFetch(`${base}/act`, { method: "POST", headers: { "Content-Type": "application/json" }, diff --git a/src/cli/browser-cli-actions-input.ts b/src/cli/browser-cli-actions-input.ts index fedd73b2e..31b0e13e5 100644 --- a/src/cli/browser-cli-actions-input.ts +++ b/src/cli/browser-cli-actions-input.ts @@ -114,15 +114,24 @@ export function registerBrowserActionInputCommands( browser .command("click") - .description("Click an element by ref from an ai snapshot (e.g. 76)") - .argument("", "Ref id from ai snapshot") + .description("Click an element by ai ref or CSS selector") + .argument("[ref]", "Ref id from ai snapshot") + .option("--selector ", "CSS selector (instead of ref)") .option("--target-id ", "CDP target id (or unique prefix)") .option("--double", "Double click", false) .option("--button ", "Mouse button to use") .option("--modifiers ", "Comma-separated modifiers (Shift,Alt,Meta)") - .action(async (ref: string, opts, cmd) => { + .action(async (ref: string | undefined, opts, cmd) => { const parent = parentOpts(cmd); const baseUrl = resolveBrowserControlUrl(parent?.url); + const selector = + typeof opts.selector === "string" ? opts.selector.trim() : ""; + const refValue = typeof ref === "string" ? ref.trim() : ""; + if (!selector && !refValue) { + defaultRuntime.error(danger("ref or --selector is required")); + defaultRuntime.exit(1); + return; + } const modifiers = opts.modifiers ? String(opts.modifiers) .split(",") @@ -132,7 +141,8 @@ export function registerBrowserActionInputCommands( try { const result = await browserAct(baseUrl, { kind: "click", - ref, + ref: refValue || undefined, + selector: selector || undefined, targetId: opts.targetId?.trim() || undefined, doubleClick: Boolean(opts.double), button: opts.button?.trim() || undefined, @@ -143,7 +153,11 @@ export function registerBrowserActionInputCommands( return; } const suffix = result.url ? ` on ${result.url}` : ""; - defaultRuntime.log(`clicked ref ${ref}${suffix}`); + if (selector) { + defaultRuntime.log(`clicked ${selector}${suffix}`); + } else { + defaultRuntime.log(`clicked ref ${refValue}${suffix}`); + } } catch (err) { defaultRuntime.error(danger(String(err))); defaultRuntime.exit(1); @@ -152,19 +166,29 @@ export function registerBrowserActionInputCommands( browser .command("type") - .description("Type into an element by ai ref") - .argument("", "Ref id from ai snapshot") + .description("Type into an element by ai ref or CSS selector") + .argument("[ref]", "Ref id from ai snapshot") .argument("", "Text to type") + .option("--selector ", "CSS selector (instead of ref)") .option("--submit", "Press Enter after typing", false) .option("--slowly", "Type slowly (human-like)", false) .option("--target-id ", "CDP target id (or unique prefix)") - .action(async (ref: string, text: string, opts, cmd) => { + .action(async (ref: string | undefined, text: string, opts, cmd) => { const parent = parentOpts(cmd); const baseUrl = resolveBrowserControlUrl(parent?.url); + const selector = + typeof opts.selector === "string" ? opts.selector.trim() : ""; + const refValue = typeof ref === "string" ? ref.trim() : ""; + if (!selector && !refValue) { + defaultRuntime.error(danger("ref or --selector is required")); + defaultRuntime.exit(1); + return; + } try { const result = await browserAct(baseUrl, { kind: "type", - ref, + ref: refValue || undefined, + selector: selector || undefined, text, submit: Boolean(opts.submit), slowly: Boolean(opts.slowly), @@ -174,7 +198,11 @@ export function registerBrowserActionInputCommands( defaultRuntime.log(JSON.stringify(result, null, 2)); return; } - defaultRuntime.log(`typed into ref ${ref}`); + if (selector) { + defaultRuntime.log(`typed into ${selector}`); + } else { + defaultRuntime.log(`typed into ref ${refValue}`); + } } catch (err) { defaultRuntime.error(danger(String(err))); defaultRuntime.exit(1); diff --git a/src/cli/browser-cli-examples.ts b/src/cli/browser-cli-examples.ts index e86d480b9..04c96cc65 100644 --- a/src/cli/browser-cli-examples.ts +++ b/src/cli/browser-cli-examples.ts @@ -17,7 +17,9 @@ export const browserActionExamples = [ "clawdis browser navigate https://example.com", "clawdis browser resize 1280 720", "clawdis browser click 12 --double", + "clawdis browser click --selector 'button.save'", 'clawdis browser type 23 "hello" --submit', + 'clawdis browser type --selector "input[name=q]" "hello"', "clawdis browser press Enter", "clawdis browser hover 44", "clawdis browser drag 10 11",