feat(agents): support OpenRouter image generation models

Add support for extracting and saving generated images from OpenRouter
models that return images (like gpt-5-image-mini).

- Extract base64 images from assistant responses via extractAssistantImages()
- Save generated images using standard media store ("generated" subdir)
- Send generated images to users via message channels
- Support both direct agent responses and subagent image generation

This enables use of OpenRouter's image generation models in Moltbot
conversations and subagent tasks.
This commit is contained in:
Clawd 2026-01-29 12:37:42 -06:00
parent c9fe062824
commit 7d11b9246b
6 changed files with 206 additions and 18 deletions

View File

@ -618,7 +618,7 @@ export async function runEmbeddedPiAgent(
usage, usage,
}; };
const payloads = buildEmbeddedRunPayloads({ const payloads = await buildEmbeddedRunPayloads({
assistantTexts: attempt.assistantTexts, assistantTexts: attempt.assistantTexts,
toolMetas: attempt.toolMetas, toolMetas: attempt.toolMetas,
lastAssistant: attempt.lastAssistant, lastAssistant: attempt.lastAssistant,

View File

@ -4,6 +4,7 @@ import type { ReasoningLevel, VerboseLevel } from "../../../auto-reply/thinking.
import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../../../auto-reply/tokens.js"; import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../../../auto-reply/tokens.js";
import { formatToolAggregate } from "../../../auto-reply/tool-meta.js"; import { formatToolAggregate } from "../../../auto-reply/tool-meta.js";
import type { MoltbotConfig } from "../../../config/config.js"; import type { MoltbotConfig } from "../../../config/config.js";
import { saveMediaBuffer } from "../../../media/store.js";
import { import {
formatAssistantErrorText, formatAssistantErrorText,
formatRawAssistantErrorForUi, formatRawAssistantErrorForUi,
@ -12,15 +13,33 @@ import {
normalizeTextForComparison, normalizeTextForComparison,
} from "../../pi-embedded-helpers.js"; } from "../../pi-embedded-helpers.js";
import { import {
extractAssistantImages,
extractAssistantText, extractAssistantText,
extractAssistantThinking, extractAssistantThinking,
formatReasoningMessage, formatReasoningMessage,
} from "../../pi-embedded-utils.js"; } from "../../pi-embedded-utils.js";
import type { ToolResultFormat } from "../../pi-embedded-subscribe.js"; import type { ToolResultFormat } from "../../pi-embedded-subscribe.js";
/**
 * Persist one generated image (base64 payload) through the media store.
 *
 * This is best-effort: any failure yields `null` so the caller can simply
 * skip the image instead of aborting the whole reply.
 *
 * @param image - `data` is the base64-encoded image content; `mimeType` is
 *   forwarded unchanged to `saveMediaBuffer`.
 * @returns The `path` reported by `saveMediaBuffer`, or `null` when the
 *   payload decodes to zero bytes or saving throws.
 */
async function saveGeneratedImage(image: {
  mimeType: string;
  data: string;
}): Promise<string | null> {
  try {
    const buffer = Buffer.from(image.data, "base64");
    // Buffer.from does not throw on empty/undecodable base64; it just yields
    // fewer (possibly zero) bytes. A zero-byte "image" is not worth storing
    // or delivering, so treat it like a failed save.
    if (buffer.length === 0) return null;
    // "generated" is the media-store subdirectory used for model output.
    const saved = await saveMediaBuffer(buffer, image.mimeType, "generated");
    return saved.path;
  } catch {
    return null;
  }
}
type ToolMetaEntry = { toolName: string; meta?: string }; type ToolMetaEntry = { toolName: string; meta?: string };
export function buildEmbeddedRunPayloads(params: { export async function buildEmbeddedRunPayloads(params: {
assistantTexts: string[]; assistantTexts: string[];
toolMetas: ToolMetaEntry[]; toolMetas: ToolMetaEntry[];
lastAssistant: AssistantMessage | undefined; lastAssistant: AssistantMessage | undefined;
@ -31,16 +50,18 @@ export function buildEmbeddedRunPayloads(params: {
reasoningLevel?: ReasoningLevel; reasoningLevel?: ReasoningLevel;
toolResultFormat?: ToolResultFormat; toolResultFormat?: ToolResultFormat;
inlineToolResultsAllowed: boolean; inlineToolResultsAllowed: boolean;
}): Array<{ }): Promise<
text?: string; Array<{
mediaUrl?: string; text?: string;
mediaUrls?: string[]; mediaUrl?: string;
replyToId?: string; mediaUrls?: string[];
isError?: boolean; replyToId?: string;
audioAsVoice?: boolean; isError?: boolean;
replyToTag?: boolean; audioAsVoice?: boolean;
replyToCurrent?: boolean; replyToTag?: boolean;
}> { replyToCurrent?: boolean;
}>
> {
const replyItems: Array<{ const replyItems: Array<{
text: string; text: string;
media?: string[]; media?: string[];
@ -209,6 +230,21 @@ export function buildEmbeddedRunPayloads(params: {
} }
} }
// Extract and save generated images from OpenRouter image models
if (params.lastAssistant) {
const generatedImages = extractAssistantImages(params.lastAssistant);
for (const img of generatedImages) {
const filePath = await saveGeneratedImage(img);
if (filePath) {
// Add image as a separate media item
replyItems.push({
text: "",
media: [filePath],
});
}
}
}
const hasAudioAsVoiceTag = replyItems.some((item) => item.audioAsVoice); const hasAudioAsVoiceTag = replyItems.some((item) => item.audioAsVoice);
return replyItems return replyItems
.map((item) => ({ .map((item) => ({

View File

@ -206,6 +206,32 @@ export function extractAssistantThinking(msg: AssistantMessage): string {
return blocks.join("\n").trim(); return blocks.join("\n").trim();
} }
/**
 * Collect the image blocks contained in an assistant message.
 *
 * A content entry counts as an image when it is an object whose `type` is
 * `"image"` and whose `data` and `mimeType` are both strings. Everything else
 * (text blocks, malformed entries, non-array content) is ignored. Used for
 * OpenRouter image generation models that return images as ImageContent.
 *
 * @param msg - Assistant message whose `content` is scanned.
 * @returns The `{ mimeType, data }` pairs in content order; empty when none.
 */
export function extractAssistantImages(
  msg: AssistantMessage,
): Array<{ mimeType: string; data: string }> {
  const content: unknown = msg.content;
  if (!Array.isArray(content)) return [];

  const found: Array<{ mimeType: string; data: string }> = [];
  content.forEach((entry: unknown) => {
    if (typeof entry !== "object" || entry === null) return;
    const { type, data, mimeType } = entry as Record<string, unknown>;
    if (type !== "image") return;
    if (typeof data !== "string" || typeof mimeType !== "string") return;
    // Copy only the two fields callers need; any extra keys are dropped.
    found.push({ mimeType, data });
  });
  return found;
}
export function formatReasoningMessage(text: string): string { export function formatReasoningMessage(text: string): string {
const trimmed = text.trim(); const trimmed = text.trim();
if (!trimmed) return ""; if (!trimmed) return "";

View File

@ -11,6 +11,7 @@ import {
import { normalizeMainKey } from "../routing/session-key.js"; import { normalizeMainKey } from "../routing/session-key.js";
import { resolveQueueSettings } from "../auto-reply/reply/queue.js"; import { resolveQueueSettings } from "../auto-reply/reply/queue.js";
import { callGateway } from "../gateway/call.js"; import { callGateway } from "../gateway/call.js";
import { saveMediaBuffer } from "../media/store.js";
import { defaultRuntime } from "../runtime.js"; import { defaultRuntime } from "../runtime.js";
import { import {
type DeliveryContext, type DeliveryContext,
@ -20,7 +21,24 @@ import {
} from "../utils/delivery-context.js"; } from "../utils/delivery-context.js";
import { isEmbeddedPiRunActive, queueEmbeddedPiMessage } from "./pi-embedded.js"; import { isEmbeddedPiRunActive, queueEmbeddedPiMessage } from "./pi-embedded.js";
import { type AnnounceQueueItem, enqueueAnnounce } from "./subagent-announce-queue.js"; import { type AnnounceQueueItem, enqueueAnnounce } from "./subagent-announce-queue.js";
import { readLatestAssistantReply } from "./tools/agent-step.js"; import { readLatestAssistantReplyWithMedia } from "./tools/agent-step.js";
/**
 * Persist one image produced by a subagent (base64 payload) through the
 * media store.
 *
 * This is best-effort: any failure yields `null` so the announce flow can
 * skip that image and continue.
 *
 * @param image - `data` is the base64-encoded image content; `mimeType` is
 *   forwarded unchanged to `saveMediaBuffer`.
 * @returns The `path` reported by `saveMediaBuffer`, or `null` when the
 *   payload decodes to zero bytes or saving throws.
 */
async function saveGeneratedImage(image: {
  mimeType: string;
  data: string;
}): Promise<string | null> {
  try {
    const buffer = Buffer.from(image.data, "base64");
    // Buffer.from does not throw on empty/undecodable base64; it just yields
    // fewer (possibly zero) bytes. Never store or send a zero-byte "image".
    if (buffer.length === 0) return null;
    // "generated" is the media-store subdirectory used for model output.
    const saved = await saveMediaBuffer(buffer, image.mimeType, "generated");
    return saved.path;
  } catch {
    return null;
  }
}
function formatDurationShort(valueMs?: number) { function formatDurationShort(valueMs?: number) {
if (!valueMs || !Number.isFinite(valueMs) || valueMs <= 0) return undefined; if (!valueMs || !Number.isFinite(valueMs) || valueMs <= 0) return undefined;
@ -321,6 +339,7 @@ export async function runSubagentAnnounceFlow(params: {
try { try {
const requesterOrigin = normalizeDeliveryContext(params.requesterOrigin); const requesterOrigin = normalizeDeliveryContext(params.requesterOrigin);
let reply = params.roundOneReply; let reply = params.roundOneReply;
let replyImages: Array<{ mimeType: string; data: string }> | undefined;
let outcome: SubagentRunOutcome | undefined = params.outcome; let outcome: SubagentRunOutcome | undefined = params.outcome;
if (!reply && params.waitForCompletion !== false) { if (!reply && params.waitForCompletion !== false) {
const waitMs = Math.min(params.timeoutMs, 60_000); const waitMs = Math.min(params.timeoutMs, 60_000);
@ -353,15 +372,20 @@ export async function runSubagentAnnounceFlow(params: {
if (wait?.status === "timeout") { if (wait?.status === "timeout") {
if (!outcome) outcome = { status: "timeout" }; if (!outcome) outcome = { status: "timeout" };
} }
reply = await readLatestAssistantReply({ // Use the new function that also extracts images
const replyContent = await readLatestAssistantReplyWithMedia({
sessionKey: params.childSessionKey, sessionKey: params.childSessionKey,
}); });
reply = replyContent.text;
replyImages = replyContent.images;
} }
if (!reply) { if (!reply && !replyImages) {
reply = await readLatestAssistantReply({ const replyContent = await readLatestAssistantReplyWithMedia({
sessionKey: params.childSessionKey, sessionKey: params.childSessionKey,
}); });
reply = replyContent.text;
replyImages = replyContent.images;
} }
if (!outcome) outcome = { status: "unknown" }; if (!outcome) outcome = { status: "unknown" };
@ -373,6 +397,39 @@ export async function runSubagentAnnounceFlow(params: {
endedAt: params.endedAt, endedAt: params.endedAt,
}); });
// Handle generated images - send them directly to the user
const savedImagePaths: string[] = [];
if (replyImages && replyImages.length > 0) {
for (const img of replyImages) {
const filePath = await saveGeneratedImage(img);
if (filePath) {
savedImagePaths.push(filePath);
}
}
// Send images directly to the user if we have a delivery context
if (savedImagePaths.length > 0 && requesterOrigin?.to && requesterOrigin?.channel) {
try {
await callGateway({
method: "send",
params: {
to: requesterOrigin.to,
message: "Here's what I generated:", // Non-empty message required
mediaUrls: savedImagePaths,
channel: requesterOrigin.channel,
accountId: requesterOrigin.accountId,
idempotencyKey: crypto.randomUUID(),
},
timeoutMs: 30_000,
});
defaultRuntime.log(
`[subagent] Images sent: ${savedImagePaths.length} image(s) to ${requesterOrigin.to}`,
);
} catch (err) {
defaultRuntime.error?.(`Failed to send subagent images: ${String(err)}`);
}
}
}
// Build status label // Build status label
const statusLabel = const statusLabel =
outcome.status === "ok" outcome.status === "ok"
@ -385,17 +442,25 @@ export async function runSubagentAnnounceFlow(params: {
// Build instructional message for main agent // Build instructional message for main agent
const taskLabel = params.label || params.task || "background task"; const taskLabel = params.label || params.task || "background task";
// If we sent images, mention it in the findings
const imageNote =
savedImagePaths.length > 0
? `\n[${savedImagePaths.length} image(s) were generated and sent to the user]`
: "";
const triggerMessage = [ const triggerMessage = [
`A background task "${taskLabel}" just ${statusLabel}.`, `A background task "${taskLabel}" just ${statusLabel}.`,
"", "",
"Findings:", "Findings:",
reply || "(no output)", reply || "(no output)",
imageNote,
"", "",
statsLine, statsLine,
"", "",
"Summarize this naturally for the user. Keep it brief (1-2 sentences). Flow it into the conversation naturally.", "Summarize this naturally for the user. Keep it brief (1-2 sentences). Flow it into the conversation naturally.",
"Do not mention technical details like tokens, stats, or that this was a background task.", "Do not mention technical details like tokens, stats, or that this was a background task.",
"You can respond with NO_REPLY if no announcement is needed (e.g., internal task with no user-facing result).", savedImagePaths.length > 0
? "The generated image(s) have already been sent to the user. Just acknowledge the completion naturally."
: "You can respond with NO_REPLY if no announcement is needed (e.g., internal task with no user-facing result).",
].join("\n"); ].join("\n");
const queued = await maybeQueueSubagentAnnounce({ const queued = await maybeQueueSubagentAnnounce({

View File

@ -3,7 +3,16 @@ import crypto from "node:crypto";
import { callGateway } from "../../gateway/call.js"; import { callGateway } from "../../gateway/call.js";
import { INTERNAL_MESSAGE_CHANNEL } from "../../utils/message-channel.js"; import { INTERNAL_MESSAGE_CHANNEL } from "../../utils/message-channel.js";
import { AGENT_LANE_NESTED } from "../lanes.js"; import { AGENT_LANE_NESTED } from "../lanes.js";
import { extractAssistantText, stripToolMessages } from "./sessions-helpers.js"; import {
extractAssistantText,
extractAssistantImages,
stripToolMessages,
} from "./sessions-helpers.js";
/**
 * Text and image content read from a single assistant reply.
 * Both fields are optional: a reply may carry only text, only images,
 * both, or neither.
 */
export interface AssistantReplyContent {
  /** Reply text; left undefined when the reply has no text. */
  text?: string;
  /**
   * Image blocks as `{ mimeType, data }` pairs; left undefined when the
   * reply has no images. NOTE(review): `data` is presumably base64 — confirm
   * against the producers of these blocks.
   */
  images?: Array<{ mimeType: string; data: string }>;
}
export async function readLatestAssistantReply(params: { export async function readLatestAssistantReply(params: {
sessionKey: string; sessionKey: string;
@ -18,6 +27,29 @@ export async function readLatestAssistantReply(params: {
return last ? extractAssistantText(last) : undefined; return last ? extractAssistantText(last) : undefined;
} }
/**
 * Fetch a session's chat history and return the text and image blocks of its
 * latest message (after the history has gone through `stripToolMessages`).
 * The subagent announce flow uses this so generated images can be forwarded
 * along with the text.
 *
 * @param params.sessionKey - Session whose history is requested.
 * @param params.limit - Number of history entries to request; defaults to 50.
 * @returns An empty object when no message remains after filtering;
 *   otherwise `text` and `images`, each `undefined` when empty.
 */
export async function readLatestAssistantReplyWithMedia(params: {
  sessionKey: string;
  limit?: number;
}): Promise<AssistantReplyContent> {
  const response = (await callGateway({
    method: "chat.history",
    params: { sessionKey: params.sessionKey, limit: params.limit ?? 50 },
  })) as { messages?: unknown[] };

  // Tolerate a missing or malformed `messages` field by using an empty history.
  const rawMessages = Array.isArray(response?.messages) ? response.messages : [];
  const visible = stripToolMessages(rawMessages);
  if (visible.length === 0) return {};

  const latest = visible[visible.length - 1];
  if (!latest) return {};

  const replyText = extractAssistantText(latest);
  const replyImages = extractAssistantImages(latest);
  const hasImages = replyImages.length > 0;
  return {
    text: replyText || undefined,
    images: hasImages ? replyImages : undefined,
  };
}
export async function runAgentStep(params: { export async function runAgentStep(params: {
sessionKey: string; sessionKey: string;
message: string; message: string;

View File

@ -325,3 +325,32 @@ export function extractAssistantText(message: unknown): string | undefined {
const joined = chunks.join("").trim(); const joined = chunks.join("").trim();
return joined ? sanitizeUserFacingText(joined) : undefined; return joined ? sanitizeUserFacingText(joined) : undefined;
} }
/**
 * Pull the image blocks out of an untyped chat message.
 *
 * Only messages whose `role` is `"assistant"` and whose `content` is an array
 * are considered. Within that array, an entry is an image when it is an
 * object with `type === "image"` and string `data` and `mimeType` fields.
 * Used for OpenRouter image generation models that return images.
 *
 * @param message - Any value; non-objects and non-assistant messages yield `[]`.
 * @returns The `{ mimeType, data }` pairs in content order; empty when none.
 */
export function extractAssistantImages(
  message: unknown,
): Array<{ mimeType: string; data: string }> {
  if (typeof message !== "object" || message === null) return [];

  const { role, content } = message as { role?: unknown; content?: unknown };
  if (role !== "assistant" || !Array.isArray(content)) return [];

  return content.flatMap((entry: unknown): Array<{ mimeType: string; data: string }> => {
    if (typeof entry !== "object" || entry === null) return [];
    const { type, data, mimeType } = entry as Record<string, unknown>;
    if (type !== "image") return [];
    if (typeof data !== "string" || typeof mimeType !== "string") return [];
    // Keep just the two fields of interest; other keys on the block are dropped.
    return [{ mimeType, data }];
  });
}