import { Type } from "@sinclair/typebox";

import type { ClawdbotConfig } from "../../config/config.js";
import { VERSION } from "../../version.js";
import { stringEnum } from "../schema/typebox.js";
import type { AnyAgentTool } from "./common.js";
import { jsonResult, readNumberParam, readStringParam } from "./common.js";

const SEARCH_PROVIDERS = ["brave"] as const;
const EXTRACT_MODES = ["markdown", "text"] as const;

const DEFAULT_SEARCH_COUNT = 5;
const MAX_SEARCH_COUNT = 10;
const DEFAULT_FETCH_MAX_CHARS = 50_000;
const DEFAULT_TIMEOUT_SECONDS = 30;
const DEFAULT_CACHE_TTL_MINUTES = 15;
const DEFAULT_CACHE_MAX_ENTRIES = 100;

const BRAVE_SEARCH_ENDPOINT = "https://api.search.brave.com/res/v1/web/search";

/** Shape of `config.tools.web.search`, derived from the config type. */
type WebSearchConfig = NonNullable<ClawdbotConfig["tools"]>["web"] extends infer Web
  ? Web extends { search?: infer Search }
    ? Search
    : undefined
  : undefined;

/** Shape of `config.tools.web.fetch`, derived from the config type. */
type WebFetchConfig = NonNullable<ClawdbotConfig["tools"]>["web"] extends infer Web
  ? Web extends { fetch?: infer Fetch }
    ? Fetch
    : undefined
  : undefined;

/** A cached payload plus the timestamps (ms since epoch) used for expiry. */
type CacheEntry<T> = {
  value: T;
  expiresAt: number;
  insertedAt: number;
};

// Module-level caches shared by every tool instance created from this file.
const SEARCH_CACHE = new Map<string, CacheEntry<Record<string, unknown>>>();
const FETCH_CACHE = new Map<string, CacheEntry<Record<string, unknown>>>();

const WebSearchSchema = Type.Object({
  query: Type.String({ description: "Search query string." }),
  count: Type.Optional(
    Type.Number({
      description: "Number of results to return (1-10).",
      minimum: 1,
      maximum: MAX_SEARCH_COUNT,
    }),
  ),
  country: Type.Optional(
    Type.String({
      description:
        "2-letter country code for region-specific results (e.g., 'DE', 'US', 'ALL'). Default: 'US'.",
    }),
  ),
  search_lang: Type.Optional(
    Type.String({
      description: "ISO language code for search results (e.g., 'de', 'en', 'fr').",
    }),
  ),
  ui_lang: Type.Optional(
    Type.String({
      description: "ISO language code for UI elements.",
    }),
  ),
});

const WebFetchSchema = Type.Object({
  url: Type.String({ description: "HTTP or HTTPS URL to fetch." }),
  extractMode: Type.Optional(
    stringEnum(EXTRACT_MODES, {
      description: 'Extraction mode ("markdown" or "text").',
      default: "markdown",
    }),
  ),
  maxChars: Type.Optional(
    Type.Number({
      description: "Maximum characters to return (truncates when exceeded).",
      minimum: 100,
    }),
  ),
});

type BraveSearchResult = {
  title?: string;
  url?: string;
  description?: string;
  age?: string;
};

type BraveSearchResponse = {
  web?: {
    results?: BraveSearchResult[];
  };
};

/** Returns `config.tools.web.search` when it is an object, otherwise `undefined`. */
function resolveSearchConfig(cfg?: ClawdbotConfig): WebSearchConfig {
  const search = cfg?.tools?.web?.search;
  if (!search || typeof search !== "object") return undefined;
  return search as WebSearchConfig;
}

/** Returns `config.tools.web.fetch` when it is an object, otherwise `undefined`. */
function resolveFetchConfig(cfg?: ClawdbotConfig): WebFetchConfig {
  const fetch = cfg?.tools?.web?.fetch;
  if (!fetch || typeof fetch !== "object") return undefined;
  return fetch as WebFetchConfig;
}

/**
 * Whether the web_search tool should be created.
 *
 * An explicit boolean `enabled` in config wins; otherwise the tool is enabled.
 * `sandboxed` is accepted for signature compatibility but does not currently
 * change the result (the original code returned `true` on both paths).
 */
function resolveSearchEnabled(params: { search?: WebSearchConfig; sandboxed?: boolean }): boolean {
  if (typeof params.search?.enabled === "boolean") return params.search.enabled;
  return true;
}

/** Whether the web_fetch tool should be created; defaults to enabled. */
function resolveFetchEnabled(params: { fetch?: WebFetchConfig; sandboxed?: boolean }): boolean {
  if (typeof params.fetch?.enabled === "boolean") return params.fetch.enabled;
  return true;
}

/** Whether Readability-based extraction is used for HTML; defaults to enabled. */
function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean {
  if (typeof fetch?.readability === "boolean") return fetch.readability;
  return true;
}

/**
 * Picks the Brave API key: trimmed config `apiKey` first, then the trimmed
 * `BRAVE_API_KEY` environment variable. Returns `undefined` when both are empty.
 */
function resolveSearchApiKey(search?: WebSearchConfig): string | undefined {
  const fromConfig =
    search && "apiKey" in search && typeof search.apiKey === "string" ? search.apiKey.trim() : "";
  const fromEnv = (process.env.BRAVE_API_KEY ?? "").trim();
  return fromConfig || fromEnv || undefined;
}

/** Payload returned (instead of throwing) when no Brave API key is available. */
function missingSearchKeyPayload() {
  return {
    error: "missing_brave_api_key",
    message:
      "web_search needs a Brave Search API key. Run `clawdbot configure --section web` to store it, or set BRAVE_API_KEY in the Gateway environment.",
    docs: "https://docs.clawd.bot/tools/web",
  };
}

/**
 * Resolves the configured search provider.
 *
 * "brave" is the only entry in SEARCH_PROVIDERS, so unknown or missing values
 * also resolve to it.
 */
function resolveSearchProvider(search?: WebSearchConfig): (typeof SEARCH_PROVIDERS)[number] {
  const raw =
    search && "provider" in search && typeof search.provider === "string"
      ? search.provider.trim().toLowerCase()
      : "";
  if (raw === "brave") return "brave";
  return "brave";
}

/** Finite numbers are floored and clamped to at least 1 second; anything else uses `fallback`. */
function resolveTimeoutSeconds(value: unknown, fallback: number): number {
  const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback;
  return Math.max(1, Math.floor(parsed));
}

/** Converts a TTL in minutes to milliseconds; negative values become 0 (which disables caching in writeCache). */
function resolveCacheTtlMs(value: unknown, fallbackMinutes: number): number {
  const minutes =
    typeof value === "number" && Number.isFinite(value) ? Math.max(0, value) : fallbackMinutes;
  return Math.round(minutes * 60_000);
}

/** Finite numbers are floored and clamped to at least 100 characters; anything else uses `fallback`. */
function resolveMaxChars(value: unknown, fallback: number): number {
  const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback;
  return Math.max(100, Math.floor(parsed));
}

/** Floors and clamps the requested result count into `[1, MAX_SEARCH_COUNT]`. */
function resolveSearchCount(value: unknown, fallback: number): number {
  const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback;
  return Math.max(1, Math.min(MAX_SEARCH_COUNT, Math.floor(parsed)));
}

/** Trims and lower-cases a cache key. Only suitable for case-insensitive keys. */
function normalizeCacheKey(value: string): string {
  return value.trim().toLowerCase();
}

/**
 * Looks up a non-expired cache entry.
 *
 * Expired entries are deleted on access. Returns `null` on a miss.
 */
function readCache<T>(
  cache: Map<string, CacheEntry<T>>,
  key: string,
): { value: T; cached: boolean } | null {
  const entry = cache.get(key);
  if (!entry) return null;
  if (Date.now() > entry.expiresAt) {
    cache.delete(key);
    return null;
  }
  return { value: entry.value, cached: true };
}

/**
 * Stores a value for `ttlMs` milliseconds; a non-positive TTL disables caching.
 *
 * When the cache is full the first-inserted key is evicted.
 */
function writeCache<T>(cache: Map<string, CacheEntry<T>>, key: string, value: T, ttlMs: number) {
  if (ttlMs <= 0) return;
  // Remove any previous entry first so that overwriting a key does not evict
  // an unrelated entry, and so the refreshed key moves to the newest position.
  cache.delete(key);
  if (cache.size >= DEFAULT_CACHE_MAX_ENTRIES) {
    const oldest = cache.keys().next();
    if (!oldest.done) cache.delete(oldest.value);
  }
  const now = Date.now();
  cache.set(key, {
    value,
    expiresAt: now + ttlMs,
    insertedAt: now,
  });
}

/**
 * Returns a signal that aborts after `timeoutMs`, or earlier if `signal` aborts.
 *
 * With a non-positive timeout the caller's signal (or a fresh, never-aborted
 * one) is returned unchanged.
 */
function withTimeout(signal: AbortSignal | undefined, timeoutMs: number): AbortSignal {
  if (timeoutMs <= 0) return signal ?? new AbortController().signal;
  const controller = new AbortController();
  if (signal?.aborted) {
    // An "abort" listener added now would never fire; propagate immediately.
    controller.abort();
    return controller.signal;
  }
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  signal?.addEventListener("abort", () => controller.abort(), { once: true });
  // Whichever path aborts the controller, the pending timer is released here.
  controller.signal.addEventListener("abort", () => clearTimeout(timer), { once: true });
  return controller.signal;
}

const NAMED_ENTITIES: Record<string, string> = {
  nbsp: " ",
  amp: "&",
  quot: '"',
  lt: "<",
  gt: ">",
};

/**
 * Decodes the named entities nbsp/amp/quot/lt/gt and numeric (decimal or hex)
 * character references.
 *
 * Decoding happens in a single pass so already-escaped text such as
 * `&amp;lt;` becomes `&lt;` rather than `<`. Numeric references outside the
 * Unicode range are left untouched.
 */
function decodeEntities(value: string): string {
  return value.replace(
    /&(?:#x([0-9a-f]+)|#(\d+)|(nbsp|amp|quot|lt|gt));/gi,
    (match: string, hex?: string, dec?: string, name?: string) => {
      if (name !== undefined) return NAMED_ENTITIES[name.toLowerCase()] ?? match;
      const codePoint =
        hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(dec ?? "", 10);
      // String.fromCodePoint throws on out-of-range input; keep the raw text instead.
      if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return match;
      return String.fromCodePoint(codePoint);
    },
  );
}

/** Removes anything that looks like an HTML tag, then decodes entities. */
function stripTags(value: string): string {
  return decodeEntities(value.replace(/<[^>]+>/g, ""));
}

/** Drops carriage returns, trailing spaces before newlines, runs of 3+ newlines and runs of 2+ spaces/tabs; trims. */
function normalizeWhitespace(value: string): string {
  return value
    .replace(/\r/g, "")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .replace(/[ \t]{2,}/g, " ")
    .trim();
}

/**
 * Regex-based HTML to Markdown conversion (links, headings, list items, line
 * breaks). Also extracts the `<title>` text when present.
 *
 * NOTE(review): link/heading/list labels pass through stripTags here and again
 * in the final stripTags call, so escaped markup inside labels is decoded
 * twice. Left as-is to avoid changing output for existing pages.
 */
function htmlToMarkdown(html: string): { text: string; title?: string } {
  const rawTitle = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1];
  const title = rawTitle !== undefined ? normalizeWhitespace(stripTags(rawTitle)) : undefined;

  // Drop non-content elements together with their bodies.
  let text = html
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, "");

  text = text.replace(
    /<a\s+[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi,
    (_match: string, href: string, body: string) => {
      const label = normalizeWhitespace(stripTags(body));
      if (!label) return href;
      return `[${label}](${href})`;
    },
  );

  text = text.replace(
    /<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi,
    (_match: string, level: string, body: string) => {
      const prefix = "#".repeat(Math.max(1, Math.min(6, Number.parseInt(level, 10))));
      const label = normalizeWhitespace(stripTags(body));
      return `\n${prefix} ${label}\n`;
    },
  );

  text = text.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_match: string, body: string) => {
    const label = normalizeWhitespace(stripTags(body));
    return label ? `\n- ${label}` : "";
  });

  text = text
    .replace(/<(br|hr)\s*\/?>/gi, "\n")
    .replace(/<\/(p|div|section|article|header|footer|table|tr|ul|ol)>/gi, "\n");

  text = stripTags(text);
  text = normalizeWhitespace(text);
  return { text, title };
}

/**
 * Fallback "text" extraction.
 *
 * NOTE(review): this currently returns exactly what htmlToMarkdown produces,
 * so Markdown syntax (`[label](href)`, `#`, `-`) appears in "text" mode when
 * this fallback is used. Confirm whether that is intended.
 */
function htmlToText(html: string): { text: string; title?: string } {
  const { text, title } = htmlToMarkdown(html);
  return { text, title };
}

/**
 * Limits `value` to at most `maxChars` UTF-16 code units.
 *
 * A cut that would land between the two halves of a surrogate pair is moved
 * back by one unit so the result never ends in a lone high surrogate.
 */
function truncateText(value: string, maxChars: number): { text: string; truncated: boolean } {
  // Guard against negative limits, which String.prototype.slice would treat as
  // an offset from the end of the string.
  const limit = Math.max(0, Math.floor(maxChars));
  if (value.length <= limit) return { text: value, truncated: false };
  let end = limit;
  if (end > 0) {
    const last = value.charCodeAt(end - 1);
    if (last >= 0xd800 && last <= 0xdbff) end -= 1;
  }
  return { text: value.slice(0, end), truncated: true };
}

/** Hostname of `url`, or `undefined` when the URL is missing or unparsable. */
function resolveSiteName(url: string | undefined): string | undefined {
  if (!url) return undefined;
  try {
    return new URL(url).hostname;
  } catch {
    return undefined;
  }
}

/** Reads a response body as text; a failed read yields an empty string (best effort). */
async function readResponseText(res: Response): Promise<string> {
  try {
    return await res.text();
  } catch {
    return "";
  }
}

/**
 * Extracts the main article content from HTML using Readability + linkedom.
 *
 * Both libraries are loaded lazily. Returns `null` when nothing readable is
 * found or when loading/parsing fails, so callers can fall back to the
 * regex-based converters.
 */
export async function extractReadableContent(params: {
  html: string;
  url: string;
  extractMode: (typeof EXTRACT_MODES)[number];
}): Promise<{ text: string; title?: string } | null> {
  try {
    const [{ Readability }, { parseHTML }] = await Promise.all([
      import("@mozilla/readability"),
      import("linkedom"),
    ]);
    const { document } = parseHTML(params.html);
    try {
      (document as { baseURI?: string }).baseURI = params.url;
    } catch {
      // Best-effort base URI for relative links.
    }
    const reader = new Readability(document, { charThreshold: 0 });
    const parsed = reader.parse();
    if (!parsed?.content) return null;
    const title = parsed.title || undefined;
    if (params.extractMode === "text") {
      const text = normalizeWhitespace(parsed.textContent ?? "");
      return { text, title };
    }
    const rendered = htmlToMarkdown(parsed.content);
    return { text: rendered.text, title: title ?? rendered.title };
  } catch {
    // Deliberate best effort: any failure means "no readable content".
    return null;
  }
}

/**
 * Runs a Brave web search, returning a JSON-serialisable payload.
 *
 * Results are cached per provider/query/count/locale combination; cache hits
 * carry `cached: true`.
 *
 * @throws Error when the provider is unsupported or the API responds with a
 *   non-2xx status.
 */
async function runWebSearch(params: {
  query: string;
  count: number;
  apiKey: string;
  timeoutSeconds: number;
  cacheTtlMs: number;
  provider: (typeof SEARCH_PROVIDERS)[number];
  country?: string;
  search_lang?: string;
  ui_lang?: string;
}): Promise<Record<string, unknown>> {
  const cacheKey = normalizeCacheKey(
    `${params.provider}:${params.query}:${params.count}:${params.country || "default"}:${params.search_lang || "default"}:${params.ui_lang || "default"}`,
  );
  const cached = readCache(SEARCH_CACHE, cacheKey);
  if (cached) return { ...cached.value, cached: true };

  const start = Date.now();

  if (params.provider !== "brave") {
    throw new Error("Unsupported web search provider.");
  }

  const url = new URL(BRAVE_SEARCH_ENDPOINT);
  url.searchParams.set("q", params.query);
  url.searchParams.set("count", String(params.count));
  if (params.country) {
    url.searchParams.set("country", params.country);
  }
  if (params.search_lang) {
    url.searchParams.set("search_lang", params.search_lang);
  }
  if (params.ui_lang) {
    url.searchParams.set("ui_lang", params.ui_lang);
  }

  const res = await fetch(url.toString(), {
    method: "GET",
    headers: {
      Accept: "application/json",
      "X-Subscription-Token": params.apiKey,
    },
    signal: withTimeout(undefined, params.timeoutSeconds * 1000),
  });

  if (!res.ok) {
    const detail = await readResponseText(res);
    throw new Error(`Brave Search API error (${res.status}): ${detail || res.statusText}`);
  }

  const data = (await res.json()) as BraveSearchResponse;
  const results = Array.isArray(data.web?.results) ? (data.web?.results ?? []) : [];
  const mapped = results.map((entry) => ({
    title: entry.title ?? "",
    url: entry.url ?? "",
    description: entry.description ?? "",
    published: entry.age ?? undefined,
    siteName: resolveSiteName(entry.url ?? ""),
  }));

  const payload = {
    query: params.query,
    provider: params.provider,
    count: mapped.length,
    tookMs: Date.now() - start,
    results: mapped,
  };
  writeCache(SEARCH_CACHE, cacheKey, payload, params.cacheTtlMs);
  return payload;
}

/**
 * Fetches a URL and converts the body to markdown/text, truncated to
 * `maxChars`.
 *
 * HTML goes through Readability when enabled, falling back to the regex
 * converters; JSON is pretty-printed; other content types are returned as-is.
 * Results are cached per URL/extract mode/max length.
 *
 * NOTE(review): nothing in this function restricts the target host, so
 * loopback and private-network addresses are fetchable. Confirm that SSRF
 * protection is enforced elsewhere if the URL can come from untrusted input.
 *
 * @throws Error when the URL is not valid http(s) or the response is non-2xx.
 */
async function runWebFetch(params: {
  url: string;
  extractMode: (typeof EXTRACT_MODES)[number];
  maxChars: number;
  timeoutSeconds: number;
  cacheTtlMs: number;
  userAgent: string;
  readabilityEnabled: boolean;
}): Promise<Record<string, unknown>> {
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(params.url);
  } catch {
    throw new Error("Invalid URL: must be http or https");
  }
  if (!["http:", "https:"].includes(parsedUrl.protocol)) {
    throw new Error("Invalid URL: must be http or https");
  }

  // URL paths and query strings are case-sensitive, so the key must not be
  // lower-cased. The URL parser already normalises scheme and host casing.
  const cacheKey = `fetch:${parsedUrl.toString()}:${params.extractMode}:${params.maxChars}`;
  const cached = readCache(FETCH_CACHE, cacheKey);
  if (cached) return { ...cached.value, cached: true };

  const start = Date.now();
  const res = await fetch(parsedUrl.toString(), {
    method: "GET",
    headers: {
      Accept: "*/*",
      "User-Agent": params.userAgent,
    },
    signal: withTimeout(undefined, params.timeoutSeconds * 1000),
  });

  if (!res.ok) {
    const detail = await readResponseText(res);
    throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`);
  }

  const contentType = res.headers.get("content-type") ?? "application/octet-stream";
  // Media types are case-insensitive; compare on a lower-cased copy while the
  // payload keeps the header value as received.
  const mediaType = contentType.toLowerCase();
  const body = await readResponseText(res);

  let title: string | undefined;
  let text = body;
  if (mediaType.includes("text/html")) {
    const readable = params.readabilityEnabled
      ? await extractReadableContent({
          html: body,
          url: res.url || params.url,
          extractMode: params.extractMode,
        })
      : null;
    if (readable?.text) {
      text = readable.text;
      title = readable.title;
    } else {
      // Readability disabled, failed, or produced nothing: regex fallback.
      const parsed = params.extractMode === "text" ? htmlToText(body) : htmlToMarkdown(body);
      text = parsed.text;
      title = parsed.title;
    }
  } else if (mediaType.includes("application/json")) {
    try {
      text = JSON.stringify(JSON.parse(body), null, 2);
    } catch {
      // Not valid JSON despite the header; return the raw body.
      text = body;
    }
  }

  const truncated = truncateText(text, params.maxChars);
  const payload = {
    url: params.url,
    finalUrl: res.url || params.url,
    status: res.status,
    contentType,
    title,
    extractMode: params.extractMode,
    truncated: truncated.truncated,
    length: truncated.text.length,
    fetchedAt: new Date().toISOString(),
    tookMs: Date.now() - start,
    text: truncated.text,
  };
  writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
  return payload;
}

/**
 * Builds the `web_search` agent tool, or returns `null` when search is
 * disabled in config.
 *
 * The API key is resolved on every call; when it is missing the tool returns
 * the `missing_brave_api_key` payload instead of throwing.
 */
export function createWebSearchTool(options?: {
  config?: ClawdbotConfig;
  sandboxed?: boolean;
}): AnyAgentTool | null {
  const search = resolveSearchConfig(options?.config);
  if (!resolveSearchEnabled({ search, sandboxed: options?.sandboxed })) return null;
  return {
    label: "Web Search",
    name: "web_search",
    description:
      "Search the web using Brave Search API. Supports region-specific and localized search via country and language parameters. Returns titles, URLs, and snippets for fast research.",
    parameters: WebSearchSchema,
    execute: async (_toolCallId, args) => {
      const apiKey = resolveSearchApiKey(search);
      if (!apiKey) {
        return jsonResult(missingSearchKeyPayload());
      }
      const params = args as Record<string, unknown>;
      const query = readStringParam(params, "query", { required: true });
      // Per-call count wins over the configured maxResults; resolveSearchCount
      // applies the default and clamps the range.
      const count = readNumberParam(params, "count", { integer: true }) ?? search?.maxResults;
      const country = readStringParam(params, "country");
      const search_lang = readStringParam(params, "search_lang");
      const ui_lang = readStringParam(params, "ui_lang");
      const result = await runWebSearch({
        query,
        count: resolveSearchCount(count, DEFAULT_SEARCH_COUNT),
        apiKey,
        timeoutSeconds: resolveTimeoutSeconds(search?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
        cacheTtlMs: resolveCacheTtlMs(search?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
        provider: resolveSearchProvider(search),
        country,
        search_lang,
        ui_lang,
      });
      return jsonResult(result);
    },
  };
}

/**
 * Builds the `web_fetch` agent tool, or returns `null` when fetch is disabled
 * in config.
 *
 * The User-Agent comes from config when it is a non-empty string, otherwise
 * `clawdbot/<VERSION>`.
 */
export function createWebFetchTool(options?: {
  config?: ClawdbotConfig;
  sandboxed?: boolean;
}): AnyAgentTool | null {
  const fetch = resolveFetchConfig(options?.config);
  if (!resolveFetchEnabled({ fetch, sandboxed: options?.sandboxed })) return null;
  const readabilityEnabled = resolveFetchReadabilityEnabled(fetch);
  const userAgent =
    (fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
    `clawdbot/${VERSION}`;
  return {
    label: "Web Fetch",
    name: "web_fetch",
    description:
      "Fetch and extract readable content from a URL (HTML → markdown/text). Use for lightweight page access without browser automation.",
    parameters: WebFetchSchema,
    execute: async (_toolCallId, args) => {
      const params = args as Record<string, unknown>;
      const url = readStringParam(params, "url", { required: true });
      // Anything other than the literal "text" falls back to markdown.
      const extractMode = readStringParam(params, "extractMode") === "text" ? "text" : "markdown";
      const maxChars = readNumberParam(params, "maxChars", { integer: true });
      const result = await runWebFetch({
        url,
        extractMode,
        maxChars: resolveMaxChars(maxChars ?? fetch?.maxChars, DEFAULT_FETCH_MAX_CHARS),
        timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
        cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
        userAgent,
        readabilityEnabled,
      });
      return jsonResult(result);
    },
  };
}