feat(agents): support OpenRouter image generation models
Add support for extracting and saving generated images from OpenRouter
models that return images (like gpt-5-image-mini).
- Extract base64 images from assistant responses via extractAssistantImages()
- Save generated images using standard media store ("generated" subdir)
- Send generated images to users via message channels
- Support both direct agent responses and subagent image generation
This enables use of OpenRouter's image generation models in Moltbot
conversations and subagent tasks.
This commit is contained in:
parent
c9fe062824
commit
7d11b9246b
@ -618,7 +618,7 @@ export async function runEmbeddedPiAgent(
|
|||||||
usage,
|
usage,
|
||||||
};
|
};
|
||||||
|
|
||||||
const payloads = buildEmbeddedRunPayloads({
|
const payloads = await buildEmbeddedRunPayloads({
|
||||||
assistantTexts: attempt.assistantTexts,
|
assistantTexts: attempt.assistantTexts,
|
||||||
toolMetas: attempt.toolMetas,
|
toolMetas: attempt.toolMetas,
|
||||||
lastAssistant: attempt.lastAssistant,
|
lastAssistant: attempt.lastAssistant,
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import type { ReasoningLevel, VerboseLevel } from "../../../auto-reply/thinking.
|
|||||||
import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../../../auto-reply/tokens.js";
|
import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../../../auto-reply/tokens.js";
|
||||||
import { formatToolAggregate } from "../../../auto-reply/tool-meta.js";
|
import { formatToolAggregate } from "../../../auto-reply/tool-meta.js";
|
||||||
import type { MoltbotConfig } from "../../../config/config.js";
|
import type { MoltbotConfig } from "../../../config/config.js";
|
||||||
|
import { saveMediaBuffer } from "../../../media/store.js";
|
||||||
import {
|
import {
|
||||||
formatAssistantErrorText,
|
formatAssistantErrorText,
|
||||||
formatRawAssistantErrorForUi,
|
formatRawAssistantErrorForUi,
|
||||||
@ -12,15 +13,33 @@ import {
|
|||||||
normalizeTextForComparison,
|
normalizeTextForComparison,
|
||||||
} from "../../pi-embedded-helpers.js";
|
} from "../../pi-embedded-helpers.js";
|
||||||
import {
|
import {
|
||||||
|
extractAssistantImages,
|
||||||
extractAssistantText,
|
extractAssistantText,
|
||||||
extractAssistantThinking,
|
extractAssistantThinking,
|
||||||
formatReasoningMessage,
|
formatReasoningMessage,
|
||||||
} from "../../pi-embedded-utils.js";
|
} from "../../pi-embedded-utils.js";
|
||||||
import type { ToolResultFormat } from "../../pi-embedded-subscribe.js";
|
import type { ToolResultFormat } from "../../pi-embedded-subscribe.js";
|
||||||
|
|
||||||
|
/**
 * Persist one base64-encoded image returned by an image generation model.
 *
 * The decoded bytes are passed to `saveMediaBuffer` using the "generated"
 * subdirectory. Saving is best-effort: a failure is logged and reported as
 * `null` so that one bad image does not abort the surrounding reply.
 *
 * @param image - MIME type plus base64 payload. A leading
 *   `data:<mime>;base64,` prefix is tolerated and removed before decoding.
 * @returns The `path` reported by the media store, or `null` when the payload
 *   is empty or the save fails.
 */
async function saveGeneratedImage(image: {
  mimeType: string;
  data: string;
}): Promise<string | null> {
  // In case the payload arrives as a full data URL, keep only the base64 part.
  const base64 = image.data.trim().replace(/^data:[^,]*;base64,/i, "");
  if (!base64) return null;
  try {
    const buffer = Buffer.from(base64, "base64");
    // Buffer.from does not throw on malformed base64; it may simply decode
    // to nothing. Never store a zero-byte "image".
    if (buffer.length === 0) return null;
    const saved = await saveMediaBuffer(buffer, image.mimeType, "generated");
    return saved.path;
  } catch (err) {
    // Stay best-effort, but leave a trace of why the image was dropped.
    console.warn(`Failed to save generated image (${image.mimeType}): ${String(err)}`);
    return null;
  }
}
|
||||||
|
|
||||||
type ToolMetaEntry = { toolName: string; meta?: string };
|
type ToolMetaEntry = { toolName: string; meta?: string };
|
||||||
|
|
||||||
export function buildEmbeddedRunPayloads(params: {
|
export async function buildEmbeddedRunPayloads(params: {
|
||||||
assistantTexts: string[];
|
assistantTexts: string[];
|
||||||
toolMetas: ToolMetaEntry[];
|
toolMetas: ToolMetaEntry[];
|
||||||
lastAssistant: AssistantMessage | undefined;
|
lastAssistant: AssistantMessage | undefined;
|
||||||
@ -31,16 +50,18 @@ export function buildEmbeddedRunPayloads(params: {
|
|||||||
reasoningLevel?: ReasoningLevel;
|
reasoningLevel?: ReasoningLevel;
|
||||||
toolResultFormat?: ToolResultFormat;
|
toolResultFormat?: ToolResultFormat;
|
||||||
inlineToolResultsAllowed: boolean;
|
inlineToolResultsAllowed: boolean;
|
||||||
}): Array<{
|
}): Promise<
|
||||||
text?: string;
|
Array<{
|
||||||
mediaUrl?: string;
|
text?: string;
|
||||||
mediaUrls?: string[];
|
mediaUrl?: string;
|
||||||
replyToId?: string;
|
mediaUrls?: string[];
|
||||||
isError?: boolean;
|
replyToId?: string;
|
||||||
audioAsVoice?: boolean;
|
isError?: boolean;
|
||||||
replyToTag?: boolean;
|
audioAsVoice?: boolean;
|
||||||
replyToCurrent?: boolean;
|
replyToTag?: boolean;
|
||||||
}> {
|
replyToCurrent?: boolean;
|
||||||
|
}>
|
||||||
|
> {
|
||||||
const replyItems: Array<{
|
const replyItems: Array<{
|
||||||
text: string;
|
text: string;
|
||||||
media?: string[];
|
media?: string[];
|
||||||
@ -209,6 +230,21 @@ export function buildEmbeddedRunPayloads(params: {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extract and save generated images from OpenRouter image models
|
||||||
|
if (params.lastAssistant) {
|
||||||
|
const generatedImages = extractAssistantImages(params.lastAssistant);
|
||||||
|
for (const img of generatedImages) {
|
||||||
|
const filePath = await saveGeneratedImage(img);
|
||||||
|
if (filePath) {
|
||||||
|
// Add image as a separate media item
|
||||||
|
replyItems.push({
|
||||||
|
text: "",
|
||||||
|
media: [filePath],
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const hasAudioAsVoiceTag = replyItems.some((item) => item.audioAsVoice);
|
const hasAudioAsVoiceTag = replyItems.some((item) => item.audioAsVoice);
|
||||||
return replyItems
|
return replyItems
|
||||||
.map((item) => ({
|
.map((item) => ({
|
||||||
|
|||||||
@ -206,6 +206,32 @@ export function extractAssistantThinking(msg: AssistantMessage): string {
|
|||||||
return blocks.join("\n").trim();
|
return blocks.join("\n").trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Collect the image blocks carried by an assistant message.
 *
 * Used for OpenRouter image generation models that return images as
 * ImageContent blocks inside `msg.content`.
 *
 * @param msg - Assistant message whose `content` is scanned. Non-array
 *   content yields an empty result.
 * @returns One `{ mimeType, data }` entry per block with `type === "image"`
 *   and non-blank string `data` and `mimeType`, in content order.
 */
export function extractAssistantImages(
  msg: AssistantMessage,
): Array<{ mimeType: string; data: string }> {
  if (!Array.isArray(msg.content)) return [];
  const images: Array<{ mimeType: string; data: string }> = [];
  for (const block of msg.content) {
    if (!block || typeof block !== "object") continue;
    const record = block as unknown as Record<string, unknown>;
    if (record.type !== "image") continue;
    const { data, mimeType } = record;
    if (typeof data !== "string" || typeof mimeType !== "string") continue;
    // A blank payload or MIME type cannot become a usable file; passing it on
    // would only produce an empty attachment downstream.
    if (!data.trim() || !mimeType.trim()) continue;
    images.push({ mimeType, data });
  }
  return images;
}
|
||||||
|
|
||||||
export function formatReasoningMessage(text: string): string {
|
export function formatReasoningMessage(text: string): string {
|
||||||
const trimmed = text.trim();
|
const trimmed = text.trim();
|
||||||
if (!trimmed) return "";
|
if (!trimmed) return "";
|
||||||
|
|||||||
@ -11,6 +11,7 @@ import {
|
|||||||
import { normalizeMainKey } from "../routing/session-key.js";
|
import { normalizeMainKey } from "../routing/session-key.js";
|
||||||
import { resolveQueueSettings } from "../auto-reply/reply/queue.js";
|
import { resolveQueueSettings } from "../auto-reply/reply/queue.js";
|
||||||
import { callGateway } from "../gateway/call.js";
|
import { callGateway } from "../gateway/call.js";
|
||||||
|
import { saveMediaBuffer } from "../media/store.js";
|
||||||
import { defaultRuntime } from "../runtime.js";
|
import { defaultRuntime } from "../runtime.js";
|
||||||
import {
|
import {
|
||||||
type DeliveryContext,
|
type DeliveryContext,
|
||||||
@ -20,7 +21,24 @@ import {
|
|||||||
} from "../utils/delivery-context.js";
|
} from "../utils/delivery-context.js";
|
||||||
import { isEmbeddedPiRunActive, queueEmbeddedPiMessage } from "./pi-embedded.js";
|
import { isEmbeddedPiRunActive, queueEmbeddedPiMessage } from "./pi-embedded.js";
|
||||||
import { type AnnounceQueueItem, enqueueAnnounce } from "./subagent-announce-queue.js";
|
import { type AnnounceQueueItem, enqueueAnnounce } from "./subagent-announce-queue.js";
|
||||||
import { readLatestAssistantReply } from "./tools/agent-step.js";
|
import { readLatestAssistantReplyWithMedia } from "./tools/agent-step.js";
|
||||||
|
|
||||||
|
/**
 * Persist one base64-encoded image produced by a subagent's image model.
 *
 * The decoded bytes are passed to `saveMediaBuffer` using the "generated"
 * subdirectory. Saving is best-effort: a failure is reported through
 * `defaultRuntime.error` and surfaces as `null`, so one bad image does not
 * abort the announce flow.
 *
 * @param image - MIME type plus base64 payload. A leading
 *   `data:<mime>;base64,` prefix is tolerated and removed before decoding.
 * @returns The `path` reported by the media store, or `null` when the payload
 *   is empty or the save fails.
 */
async function saveGeneratedImage(image: {
  mimeType: string;
  data: string;
}): Promise<string | null> {
  // In case the payload arrives as a full data URL, keep only the base64 part.
  const base64 = image.data.trim().replace(/^data:[^,]*;base64,/i, "");
  if (!base64) return null;
  try {
    const buffer = Buffer.from(base64, "base64");
    // Buffer.from does not throw on malformed base64; it may simply decode
    // to nothing. Never store a zero-byte "image".
    if (buffer.length === 0) return null;
    const saved = await saveMediaBuffer(buffer, image.mimeType, "generated");
    return saved.path;
  } catch (err) {
    // Stay best-effort, but record why the image was dropped.
    defaultRuntime.error?.(`Failed to save subagent image (${image.mimeType}): ${String(err)}`);
    return null;
  }
}
|
||||||
|
|
||||||
function formatDurationShort(valueMs?: number) {
|
function formatDurationShort(valueMs?: number) {
|
||||||
if (!valueMs || !Number.isFinite(valueMs) || valueMs <= 0) return undefined;
|
if (!valueMs || !Number.isFinite(valueMs) || valueMs <= 0) return undefined;
|
||||||
@ -321,6 +339,7 @@ export async function runSubagentAnnounceFlow(params: {
|
|||||||
try {
|
try {
|
||||||
const requesterOrigin = normalizeDeliveryContext(params.requesterOrigin);
|
const requesterOrigin = normalizeDeliveryContext(params.requesterOrigin);
|
||||||
let reply = params.roundOneReply;
|
let reply = params.roundOneReply;
|
||||||
|
let replyImages: Array<{ mimeType: string; data: string }> | undefined;
|
||||||
let outcome: SubagentRunOutcome | undefined = params.outcome;
|
let outcome: SubagentRunOutcome | undefined = params.outcome;
|
||||||
if (!reply && params.waitForCompletion !== false) {
|
if (!reply && params.waitForCompletion !== false) {
|
||||||
const waitMs = Math.min(params.timeoutMs, 60_000);
|
const waitMs = Math.min(params.timeoutMs, 60_000);
|
||||||
@ -353,15 +372,20 @@ export async function runSubagentAnnounceFlow(params: {
|
|||||||
if (wait?.status === "timeout") {
|
if (wait?.status === "timeout") {
|
||||||
if (!outcome) outcome = { status: "timeout" };
|
if (!outcome) outcome = { status: "timeout" };
|
||||||
}
|
}
|
||||||
reply = await readLatestAssistantReply({
|
// Use the new function that also extracts images
|
||||||
|
const replyContent = await readLatestAssistantReplyWithMedia({
|
||||||
sessionKey: params.childSessionKey,
|
sessionKey: params.childSessionKey,
|
||||||
});
|
});
|
||||||
|
reply = replyContent.text;
|
||||||
|
replyImages = replyContent.images;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!reply) {
|
if (!reply && !replyImages) {
|
||||||
reply = await readLatestAssistantReply({
|
const replyContent = await readLatestAssistantReplyWithMedia({
|
||||||
sessionKey: params.childSessionKey,
|
sessionKey: params.childSessionKey,
|
||||||
});
|
});
|
||||||
|
reply = replyContent.text;
|
||||||
|
replyImages = replyContent.images;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!outcome) outcome = { status: "unknown" };
|
if (!outcome) outcome = { status: "unknown" };
|
||||||
@ -373,6 +397,39 @@ export async function runSubagentAnnounceFlow(params: {
|
|||||||
endedAt: params.endedAt,
|
endedAt: params.endedAt,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Handle generated images - send them directly to the user
|
||||||
|
const savedImagePaths: string[] = [];
|
||||||
|
if (replyImages && replyImages.length > 0) {
|
||||||
|
for (const img of replyImages) {
|
||||||
|
const filePath = await saveGeneratedImage(img);
|
||||||
|
if (filePath) {
|
||||||
|
savedImagePaths.push(filePath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Send images directly to the user if we have a delivery context
|
||||||
|
if (savedImagePaths.length > 0 && requesterOrigin?.to && requesterOrigin?.channel) {
|
||||||
|
try {
|
||||||
|
await callGateway({
|
||||||
|
method: "send",
|
||||||
|
params: {
|
||||||
|
to: requesterOrigin.to,
|
||||||
|
message: "Here's what I generated:", // Non-empty message required
|
||||||
|
mediaUrls: savedImagePaths,
|
||||||
|
channel: requesterOrigin.channel,
|
||||||
|
accountId: requesterOrigin.accountId,
|
||||||
|
idempotencyKey: crypto.randomUUID(),
|
||||||
|
},
|
||||||
|
timeoutMs: 30_000,
|
||||||
|
});
|
||||||
|
defaultRuntime.log(
|
||||||
|
`[subagent] Images sent: ${savedImagePaths.length} image(s) to ${requesterOrigin.to}`,
|
||||||
|
);
|
||||||
|
} catch (err) {
|
||||||
|
defaultRuntime.error?.(`Failed to send subagent images: ${String(err)}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Build status label
|
// Build status label
|
||||||
const statusLabel =
|
const statusLabel =
|
||||||
outcome.status === "ok"
|
outcome.status === "ok"
|
||||||
@ -385,17 +442,25 @@ export async function runSubagentAnnounceFlow(params: {
|
|||||||
|
|
||||||
// Build instructional message for main agent
|
// Build instructional message for main agent
|
||||||
const taskLabel = params.label || params.task || "background task";
|
const taskLabel = params.label || params.task || "background task";
|
||||||
|
// If we sent images, mention it in the findings
|
||||||
|
const imageNote =
|
||||||
|
savedImagePaths.length > 0
|
||||||
|
? `\n[${savedImagePaths.length} image(s) were generated and sent to the user]`
|
||||||
|
: "";
|
||||||
const triggerMessage = [
|
const triggerMessage = [
|
||||||
`A background task "${taskLabel}" just ${statusLabel}.`,
|
`A background task "${taskLabel}" just ${statusLabel}.`,
|
||||||
"",
|
"",
|
||||||
"Findings:",
|
"Findings:",
|
||||||
reply || "(no output)",
|
reply || "(no output)",
|
||||||
|
imageNote,
|
||||||
"",
|
"",
|
||||||
statsLine,
|
statsLine,
|
||||||
"",
|
"",
|
||||||
"Summarize this naturally for the user. Keep it brief (1-2 sentences). Flow it into the conversation naturally.",
|
"Summarize this naturally for the user. Keep it brief (1-2 sentences). Flow it into the conversation naturally.",
|
||||||
"Do not mention technical details like tokens, stats, or that this was a background task.",
|
"Do not mention technical details like tokens, stats, or that this was a background task.",
|
||||||
"You can respond with NO_REPLY if no announcement is needed (e.g., internal task with no user-facing result).",
|
savedImagePaths.length > 0
|
||||||
|
? "The generated image(s) have already been sent to the user. Just acknowledge the completion naturally."
|
||||||
|
: "You can respond with NO_REPLY if no announcement is needed (e.g., internal task with no user-facing result).",
|
||||||
].join("\n");
|
].join("\n");
|
||||||
|
|
||||||
const queued = await maybeQueueSubagentAnnounce({
|
const queued = await maybeQueueSubagentAnnounce({
|
||||||
|
|||||||
@ -3,7 +3,16 @@ import crypto from "node:crypto";
|
|||||||
import { callGateway } from "../../gateway/call.js";
|
import { callGateway } from "../../gateway/call.js";
|
||||||
import { INTERNAL_MESSAGE_CHANNEL } from "../../utils/message-channel.js";
|
import { INTERNAL_MESSAGE_CHANNEL } from "../../utils/message-channel.js";
|
||||||
import { AGENT_LANE_NESTED } from "../lanes.js";
|
import { AGENT_LANE_NESTED } from "../lanes.js";
|
||||||
import { extractAssistantText, stripToolMessages } from "./sessions-helpers.js";
|
import {
|
||||||
|
extractAssistantText,
|
||||||
|
extractAssistantImages,
|
||||||
|
stripToolMessages,
|
||||||
|
} from "./sessions-helpers.js";
|
||||||
|
|
||||||
|
/**
 * Text and image content read from a single assistant reply.
 * Both fields are optional; a reply may carry either, both, or neither.
 */
export interface AssistantReplyContent {
|
||||||
|
/** Reply text; left undefined when the reply has no text. */
text?: string;
|
||||||
|
/** Image blocks as MIME type plus string payload (presumably base64 — confirm against the producer). */
images?: Array<{ mimeType: string; data: string }>;
|
||||||
|
}
|
||||||
|
|
||||||
export async function readLatestAssistantReply(params: {
|
export async function readLatestAssistantReply(params: {
|
||||||
sessionKey: string;
|
sessionKey: string;
|
||||||
@ -18,6 +27,29 @@ export async function readLatestAssistantReply(params: {
|
|||||||
return last ? extractAssistantText(last) : undefined;
|
return last ? extractAssistantText(last) : undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Load a session's chat history and return the text and images of its most
 * recent non-tool message.
 *
 * Used by the subagent announce flow, where generated images have to be
 * forwarded along with the reply text.
 *
 * @param params.sessionKey - Session whose history is requested.
 * @param params.limit - Number of history entries to request (default 50).
 * @returns The reply's text and images; a field is left undefined when the
 *   message has none, and an empty object is returned when there is no message.
 */
export async function readLatestAssistantReplyWithMedia(params: {
  sessionKey: string;
  limit?: number;
}): Promise<AssistantReplyContent> {
  const historyLimit = params.limit ?? 50;
  const response = (await callGateway({
    method: "chat.history",
    params: { sessionKey: params.sessionKey, limit: historyLimit },
  })) as { messages?: unknown[] };

  const rawMessages = Array.isArray(response?.messages) ? response.messages : [];
  const visible = stripToolMessages(rawMessages);
  if (visible.length === 0) return {};

  const latest = visible[visible.length - 1];
  if (!latest) return {};

  const replyText = extractAssistantText(latest);
  const replyImages = extractAssistantImages(latest);
  const hasImages = replyImages.length > 0;
  return {
    text: replyText ? replyText : undefined,
    images: hasImages ? replyImages : undefined,
  };
}
|
||||||
|
|
||||||
export async function runAgentStep(params: {
|
export async function runAgentStep(params: {
|
||||||
sessionKey: string;
|
sessionKey: string;
|
||||||
message: string;
|
message: string;
|
||||||
|
|||||||
@ -325,3 +325,32 @@ export function extractAssistantText(message: unknown): string | undefined {
|
|||||||
const joined = chunks.join("").trim();
|
const joined = chunks.join("").trim();
|
||||||
return joined ? sanitizeUserFacingText(joined) : undefined;
|
return joined ? sanitizeUserFacingText(joined) : undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extract image blocks from an assistant message of unknown shape.
 *
 * Used for OpenRouter image generation models that return images.
 *
 * @param message - Candidate message. Anything that is not an object with
 *   `role === "assistant"` and array `content` yields an empty result.
 * @returns One `{ mimeType, data }` entry per content block with
 *   `type === "image"` and non-blank string `data` and `mimeType`, in
 *   content order.
 */
export function extractAssistantImages(
  message: unknown,
): Array<{ mimeType: string; data: string }> {
  if (!message || typeof message !== "object") return [];
  if ((message as { role?: unknown }).role !== "assistant") return [];
  const content = (message as { content?: unknown }).content;
  if (!Array.isArray(content)) return [];

  const images: Array<{ mimeType: string; data: string }> = [];
  for (const block of content) {
    if (!block || typeof block !== "object") continue;
    const record = block as Record<string, unknown>;
    if (record.type !== "image") continue;
    const { data, mimeType } = record;
    if (typeof data !== "string" || typeof mimeType !== "string") continue;
    // A blank payload or MIME type cannot become a usable file; forwarding it
    // would only produce an empty attachment downstream.
    if (!data.trim() || !mimeType.trim()) continue;
    images.push({ mimeType, data });
  }
  return images;
}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user