diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index 870453f38..97dc77771 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -618,7 +618,7 @@ export async function runEmbeddedPiAgent( usage, }; - const payloads = buildEmbeddedRunPayloads({ + const payloads = await buildEmbeddedRunPayloads({ assistantTexts: attempt.assistantTexts, toolMetas: attempt.toolMetas, lastAssistant: attempt.lastAssistant, diff --git a/src/agents/pi-embedded-runner/run/payloads.ts b/src/agents/pi-embedded-runner/run/payloads.ts index 14d304e41..89359c118 100644 --- a/src/agents/pi-embedded-runner/run/payloads.ts +++ b/src/agents/pi-embedded-runner/run/payloads.ts @@ -4,6 +4,7 @@ import type { ReasoningLevel, VerboseLevel } from "../../../auto-reply/thinking. import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../../../auto-reply/tokens.js"; import { formatToolAggregate } from "../../../auto-reply/tool-meta.js"; import type { MoltbotConfig } from "../../../config/config.js"; +import { saveMediaBuffer } from "../../../media/store.js"; import { formatAssistantErrorText, formatRawAssistantErrorForUi, @@ -12,15 +13,33 @@ import { normalizeTextForComparison, } from "../../pi-embedded-helpers.js"; import { + extractAssistantImages, extractAssistantText, extractAssistantThinking, formatReasoningMessage, } from "../../pi-embedded-utils.js"; import type { ToolResultFormat } from "../../pi-embedded-subscribe.js"; +/** + * Save base64 image data from OpenRouter image generation models. + * Uses the standard media store with "generated" subdirectory. + */ +async function saveGeneratedImage(image: { + mimeType: string; + data: string; +}): Promise { + try { + const buffer = Buffer.from(image.data, "base64"); + const saved = await saveMediaBuffer(buffer, image.mimeType, "generated"); + return saved.path; + } catch { + return null; + } +} + type ToolMetaEntry = { toolName: string; meta?: string }; -export function buildEmbeddedRunPayloads(params: { +export async function buildEmbeddedRunPayloads(params: { assistantTexts: string[]; toolMetas: ToolMetaEntry[]; lastAssistant: AssistantMessage | undefined; @@ -31,16 +50,18 @@ export function buildEmbeddedRunPayloads(params: { reasoningLevel?: ReasoningLevel; toolResultFormat?: ToolResultFormat; inlineToolResultsAllowed: boolean; -}): Array<{ - text?: string; - mediaUrl?: string; - mediaUrls?: string[]; - replyToId?: string; - isError?: boolean; - audioAsVoice?: boolean; - replyToTag?: boolean; - replyToCurrent?: boolean; -}> { +}): Promise< + Array<{ + text?: string; + mediaUrl?: string; + mediaUrls?: string[]; + replyToId?: string; + isError?: boolean; + audioAsVoice?: boolean; + replyToTag?: boolean; + replyToCurrent?: boolean; + }> +> { const replyItems: Array<{ text: string; media?: string[]; @@ -209,6 +230,21 @@ export function buildEmbeddedRunPayloads(params: { } } + // Extract and save generated images from OpenRouter image models + if (params.lastAssistant) { + const generatedImages = extractAssistantImages(params.lastAssistant); + for (const img of generatedImages) { + const filePath = await saveGeneratedImage(img); + if (filePath) { + // Add image as a separate media item + replyItems.push({ + text: "", + media: [filePath], + }); + } + } + } + const hasAudioAsVoiceTag = replyItems.some((item) => item.audioAsVoice); return replyItems .map((item) => ({ diff --git a/src/agents/pi-embedded-utils.ts b/src/agents/pi-embedded-utils.ts index 969b0a316..3c1e482aa 100644 --- a/src/agents/pi-embedded-utils.ts +++ b/src/agents/pi-embedded-utils.ts @@ -206,6 +206,32 @@ export function extractAssistantThinking(msg: AssistantMessage): string { return blocks.join("\n").trim(); } +/** + * Extract image blocks from assistant message content. + * Used for OpenRouter image generation models that return images as ImageContent. + */ +export function extractAssistantImages( + msg: AssistantMessage, +): Array<{ mimeType: string; data: string }> { + if (!Array.isArray(msg.content)) return []; + const images: Array<{ mimeType: string; data: string }> = []; + for (const block of msg.content) { + if (!block || typeof block !== "object") continue; + const record = block as unknown as Record; + if ( + record.type === "image" && + typeof record.data === "string" && + typeof record.mimeType === "string" + ) { + images.push({ + mimeType: record.mimeType, + data: record.data, + }); + } + } + return images; +} + export function formatReasoningMessage(text: string): string { const trimmed = text.trim(); if (!trimmed) return ""; diff --git a/src/agents/subagent-announce.ts b/src/agents/subagent-announce.ts index 444726efc..e3d930959 100644 --- a/src/agents/subagent-announce.ts +++ b/src/agents/subagent-announce.ts @@ -11,6 +11,7 @@ import { import { normalizeMainKey } from "../routing/session-key.js"; import { resolveQueueSettings } from "../auto-reply/reply/queue.js"; import { callGateway } from "../gateway/call.js"; +import { saveMediaBuffer } from "../media/store.js"; import { defaultRuntime } from "../runtime.js"; import { type DeliveryContext, @@ -20,7 +21,24 @@ import { } from "../utils/delivery-context.js"; import { isEmbeddedPiRunActive, queueEmbeddedPiMessage } from "./pi-embedded.js"; import { type AnnounceQueueItem, enqueueAnnounce } from "./subagent-announce-queue.js"; -import { readLatestAssistantReply } from "./tools/agent-step.js"; +import { readLatestAssistantReplyWithMedia } from "./tools/agent-step.js"; + +/** + * Save base64 image data from subagent image generation models. + * Uses the standard media store with "generated" subdirectory. + */ +async function saveGeneratedImage(image: { + mimeType: string; + data: string; +}): Promise { + try { + const buffer = Buffer.from(image.data, "base64"); + const saved = await saveMediaBuffer(buffer, image.mimeType, "generated"); + return saved.path; + } catch { + return null; + } +} function formatDurationShort(valueMs?: number) { if (!valueMs || !Number.isFinite(valueMs) || valueMs <= 0) return undefined; @@ -321,6 +339,7 @@ export async function runSubagentAnnounceFlow(params: { try { const requesterOrigin = normalizeDeliveryContext(params.requesterOrigin); let reply = params.roundOneReply; + let replyImages: Array<{ mimeType: string; data: string }> | undefined; let outcome: SubagentRunOutcome | undefined = params.outcome; if (!reply && params.waitForCompletion !== false) { const waitMs = Math.min(params.timeoutMs, 60_000); @@ -353,15 +372,20 @@ export async function runSubagentAnnounceFlow(params: { if (wait?.status === "timeout") { if (!outcome) outcome = { status: "timeout" }; } - reply = await readLatestAssistantReply({ + // Use the new function that also extracts images + const replyContent = await readLatestAssistantReplyWithMedia({ sessionKey: params.childSessionKey, }); + reply = replyContent.text; + replyImages = replyContent.images; } - if (!reply) { - reply = await readLatestAssistantReply({ + if (!reply && !replyImages) { + const replyContent = await readLatestAssistantReplyWithMedia({ sessionKey: params.childSessionKey, }); + reply = replyContent.text; + replyImages = replyContent.images; } if (!outcome) outcome = { status: "unknown" }; @@ -373,6 +397,39 @@ export async function runSubagentAnnounceFlow(params: { endedAt: params.endedAt, }); + // Handle generated images - send them directly to the user + const savedImagePaths: string[] = []; + if (replyImages && replyImages.length > 0) { + for (const img of replyImages) { + const filePath = await saveGeneratedImage(img); + if (filePath) { + savedImagePaths.push(filePath); + } + } + // Send images directly to the user if we have a delivery context + if (savedImagePaths.length > 0 && requesterOrigin?.to && requesterOrigin?.channel) { + try { + await callGateway({ + method: "send", + params: { + to: requesterOrigin.to, + message: "Here's what I generated:", // Non-empty message required + mediaUrls: savedImagePaths, + channel: requesterOrigin.channel, + accountId: requesterOrigin.accountId, + idempotencyKey: crypto.randomUUID(), + }, + timeoutMs: 30_000, + }); + defaultRuntime.log( + `[subagent] Images sent: ${savedImagePaths.length} image(s) to ${requesterOrigin.to}`, + ); + } catch (err) { + defaultRuntime.error?.(`Failed to send subagent images: ${String(err)}`); + } + } + } + // Build status label const statusLabel = outcome.status === "ok" @@ -385,17 +442,25 @@ export async function runSubagentAnnounceFlow(params: { // Build instructional message for main agent const taskLabel = params.label || params.task || "background task"; + // If we sent images, mention it in the findings + const imageNote = + savedImagePaths.length > 0 + ? `\n[${savedImagePaths.length} image(s) were generated and sent to the user]` + : ""; const triggerMessage = [ `A background task "${taskLabel}" just ${statusLabel}.`, "", "Findings:", reply || "(no output)", + imageNote, "", statsLine, "", "Summarize this naturally for the user. Keep it brief (1-2 sentences). Flow it into the conversation naturally.", "Do not mention technical details like tokens, stats, or that this was a background task.", - "You can respond with NO_REPLY if no announcement is needed (e.g., internal task with no user-facing result).", + savedImagePaths.length > 0 + ? "The generated image(s) have already been sent to the user. Just acknowledge the completion naturally." + : "You can respond with NO_REPLY if no announcement is needed (e.g., internal task with no user-facing result).", ].join("\n"); const queued = await maybeQueueSubagentAnnounce({ diff --git a/src/agents/tools/agent-step.ts b/src/agents/tools/agent-step.ts index e55c16a78..7192086e4 100644 --- a/src/agents/tools/agent-step.ts +++ b/src/agents/tools/agent-step.ts @@ -3,7 +3,16 @@ import crypto from "node:crypto"; import { callGateway } from "../../gateway/call.js"; import { INTERNAL_MESSAGE_CHANNEL } from "../../utils/message-channel.js"; import { AGENT_LANE_NESTED } from "../lanes.js"; -import { extractAssistantText, stripToolMessages } from "./sessions-helpers.js"; +import { + extractAssistantText, + extractAssistantImages, + stripToolMessages, +} from "./sessions-helpers.js"; + +export interface AssistantReplyContent { + text?: string; + images?: Array<{ mimeType: string; data: string }>; +} export async function readLatestAssistantReply(params: { sessionKey: string; @@ -18,6 +27,29 @@ export async function readLatestAssistantReply(params: { return last ? extractAssistantText(last) : undefined; } +/** + * Read the latest assistant reply including both text and images. + * Used for subagent announce flow where images need to be forwarded. + */ +export async function readLatestAssistantReplyWithMedia(params: { + sessionKey: string; + limit?: number; +}): Promise { + const history = (await callGateway({ + method: "chat.history", + params: { sessionKey: params.sessionKey, limit: params.limit ?? 50 }, + })) as { messages?: unknown[] }; + const filtered = stripToolMessages(Array.isArray(history?.messages) ? history.messages : []); + const last = filtered.length > 0 ? filtered[filtered.length - 1] : undefined; + if (!last) return {}; + const text = extractAssistantText(last); + const images = extractAssistantImages(last); + return { + text: text || undefined, + images: images.length > 0 ? images : undefined, + }; +} + export async function runAgentStep(params: { sessionKey: string; message: string; diff --git a/src/agents/tools/sessions-helpers.ts b/src/agents/tools/sessions-helpers.ts index c03ace571..0dfa85593 100644 --- a/src/agents/tools/sessions-helpers.ts +++ b/src/agents/tools/sessions-helpers.ts @@ -325,3 +325,32 @@ export function extractAssistantText(message: unknown): string | undefined { const joined = chunks.join("").trim(); return joined ? sanitizeUserFacingText(joined) : undefined; } + +/** + * Extract image blocks from an assistant message. + * Used for OpenRouter image generation models that return images. + */ +export function extractAssistantImages( + message: unknown, +): Array<{ mimeType: string; data: string }> { + if (!message || typeof message !== "object") return []; + if ((message as { role?: unknown }).role !== "assistant") return []; + const content = (message as { content?: unknown }).content; + if (!Array.isArray(content)) return []; + const images: Array<{ mimeType: string; data: string }> = []; + for (const block of content) { + if (!block || typeof block !== "object") continue; + const record = block as Record; + if ( + record.type === "image" && + typeof record.data === "string" && + typeof record.mimeType === "string" + ) { + images.push({ + mimeType: record.mimeType, + data: record.data, + }); + } + } + return images; +}