import { type Api, type AssistantMessage, complete, type Context, type Model } from "@mariozechner/pi-ai";
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
import { Type } from "@sinclair/typebox";
import type { ClawdbotConfig } from "../../config/config.js";
import { loadWebMedia } from "../../web/media.js";
import { resolveClawdbotAgentDir } from "../agent-paths.js";
import { getApiKeyForModel } from "../model-auth.js";
import { runWithImageModelFallback } from "../model-fallback.js";
import { ensureClawdbotModelsJson } from "../models-config.js";
import { extractAssistantText } from "../pi-embedded-utils.js";
import { resolveUserPath } from "../../utils.js";
import type { AnyAgentTool } from "./common.js";

/** Prompt used when the tool caller supplies no (or a blank) prompt. */
const DEFAULT_PROMPT = "Describe the image.";

/**
 * Reports whether the config names any image model.
 *
 * @returns `true` when `agent.imageModel` is non-blank after trimming, or
 *   when `agent.imageModelFallbacks` has at least one entry.
 */
function ensureImageToolConfigured(cfg?: ClawdbotConfig): boolean {
  const primary = cfg?.agent?.imageModel?.trim();
  const fallbacks = cfg?.agent?.imageModelFallbacks ?? [];
  return Boolean(primary || fallbacks.length > 0);
}

/**
 * Converts a size in megabytes to whole bytes (rounded down).
 *
 * @returns The byte count, or `undefined` when `value` is not a finite,
 *   strictly positive number.
 */
function megabytesToBytes(value: unknown): number | undefined {
  if (typeof value === "number" && Number.isFinite(value) && value > 0) {
    return Math.floor(value * 1024 * 1024);
  }
  return undefined;
}

/**
 * Picks the byte limit for loading media.
 *
 * A valid per-call `maxBytesMb` wins; otherwise `agent.mediaMaxMb` from the
 * config is used. Returns `undefined` when neither is a usable value.
 */
function pickMaxBytes(cfg?: ClawdbotConfig, maxBytesMb?: number): number | undefined {
  // `??` (not `||`) so a computed limit of 0 bytes is still returned as-is.
  return megabytesToBytes(maxBytesMb) ?? megabytesToBytes(cfg?.agent?.mediaMaxMb);
}

/**
 * Builds a single-message context holding the prompt text followed by the
 * base64-encoded image.
 */
function buildImageContext(prompt: string, base64: string, mimeType: string): Context {
  return {
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "image", data: base64, mimeType },
        ],
        timestamp: Date.now(),
      },
    ],
  };
}

/**
 * Sends the prompt and image to an image model and returns the reply text.
 *
 * Model selection is delegated to `runWithImageModelFallback`; the callback
 * below is invoked with each provider/model pair it decides to try.
 *
 * @returns The assistant text (or the placeholder `"(no text returned)"` when
 *   it is empty) along with the provider and model reported by the fallback
 *   runner.
 * @throws Error (inside the callback) when the model is not in the registry
 *   or does not list `"image"` among its inputs.
 */
async function runImagePrompt(params: {
  cfg?: ClawdbotConfig;
  modelOverride?: string;
  prompt: string;
  base64: string;
  mimeType: string;
}): Promise<{ text: string; provider: string; model: string }> {
  const agentDir = resolveClawdbotAgentDir();
  await ensureClawdbotModelsJson(params.cfg);
  const authStorage = discoverAuthStorage(agentDir);
  const modelRegistry = discoverModels(authStorage, agentDir);

  const result = await runWithImageModelFallback({
    cfg: params.cfg,
    modelOverride: params.modelOverride,
    run: async (provider, modelId) => {
      const model = modelRegistry.find(provider, modelId) as Model<Api> | null;
      if (!model) {
        throw new Error(`Unknown model: ${provider}/${modelId}`);
      }
      if (!model.input?.includes("image")) {
        throw new Error(`Model does not support images: ${provider}/${modelId}`);
      }

      const apiKey = await getApiKeyForModel(model, authStorage);
      authStorage.setRuntimeApiKey(model.provider, apiKey);

      const context = buildImageContext(params.prompt, params.base64, params.mimeType);
      const message = (await complete(model, context, {
        apiKey,
        maxTokens: 512,
        temperature: 0,
      })) as AssistantMessage;
      return message;
    },
  });

  const text = extractAssistantText(result.result);
  return {
    text: text || "(no text returned)",
    provider: result.provider,
    model: result.model,
  };
}

/**
 * Creates the `image` agent tool.
 *
 * @returns The tool definition, or `null` when the config names no image
 *   model (see `ensureImageToolConfigured`).
 */
export function createImageTool(options?: {
  config?: ClawdbotConfig;
}): AnyAgentTool | null {
  if (!ensureImageToolConfigured(options?.config)) return null;

  return {
    label: "Image",
    name: "image",
    description:
      "Analyze an image with the configured image model (agent.imageModel). Provide a prompt and image path or URL.",
    parameters: Type.Object({
      prompt: Type.Optional(Type.String()),
      image: Type.String(),
      model: Type.Optional(Type.String()),
      maxBytesMb: Type.Optional(Type.Number()),
    }),
    execute: async (_toolCallId, args) => {
      // Arguments are re-validated by hand below, so treat them as an opaque
      // string-keyed bag and narrow each field before use.
      const record =
        args && typeof args === "object" ? (args as Record<string, unknown>) : {};

      const imageRaw = typeof record.image === "string" ? record.image.trim() : "";
      if (!imageRaw) throw new Error("image required");

      const promptRaw =
        typeof record.prompt === "string" && record.prompt.trim()
          ? record.prompt.trim()
          : DEFAULT_PROMPT;
      const modelOverride =
        typeof record.model === "string" && record.model.trim()
          ? record.model.trim()
          : undefined;
      const maxBytesMb =
        typeof record.maxBytesMb === "number" ? record.maxBytesMb : undefined;
      const maxBytes = pickMaxBytes(options?.config, maxBytesMb);

      // Only "~"-prefixed inputs go through resolveUserPath; everything else
      // is handed to loadWebMedia unchanged.
      const resolvedImage = imageRaw.startsWith("~")
        ? resolveUserPath(imageRaw)
        : imageRaw;

      const media = await loadWebMedia(resolvedImage, maxBytes);
      if (media.kind !== "image") {
        throw new Error(`Unsupported media type: ${media.kind}`);
      }

      // Fall back to PNG when the loader reports no content type.
      const mimeType = media.contentType ?? "image/png";
      const base64 = media.buffer.toString("base64");

      const result = await runImagePrompt({
        cfg: options?.config,
        modelOverride,
        prompt: promptRaw,
        base64,
        mimeType,
      });

      return {
        content: [{ type: "text", text: result.text }],
        details: {
          model: `${result.provider}/${result.model}`,
          image: resolvedImage,
        },
      };
    },
  };
}