feat(agents): support OpenRouter image generation models

Add support for extracting and saving images generated by OpenRouter models that return images (like gpt-5-image-mini):

- Extract base64 images from assistant responses via extractAssistantImages()
- Save generated images using the standard media store ("generated" subdir)
- Send generated images to users via message channels
- Support both direct agent responses and subagent image generation

This enables use of OpenRouter's image generation models in Moltbot conversations and subagent tasks.
2026-01-29 12:37:42 -06:00 · 2026-01-29 12:37:42 -06:00 · 7d11b9246b
commit 7d11b9246b
parent c9fe062824
6 changed files with 206 additions and 18 deletions
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@ -618,7 +618,7 @@ export async function runEmbeddedPiAgent(
            usage,
          };

-          const payloads = buildEmbeddedRunPayloads({
+          const payloads = await buildEmbeddedRunPayloads({
            assistantTexts: attempt.assistantTexts,
            toolMetas: attempt.toolMetas,
            lastAssistant: attempt.lastAssistant,
--- a/src/agents/pi-embedded-runner/run/payloads.ts
+++ b/src/agents/pi-embedded-runner/run/payloads.ts
@ -4,6 +4,7 @@ import type { ReasoningLevel, VerboseLevel } from "../../../auto-reply/thinking.
 import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../../../auto-reply/tokens.js";
 import { formatToolAggregate } from "../../../auto-reply/tool-meta.js";
 import type { MoltbotConfig } from "../../../config/config.js";
+import { saveMediaBuffer } from "../../../media/store.js";
 import {
  formatAssistantErrorText,
  formatRawAssistantErrorForUi,
@ -12,15 +13,33 @@ import {
  normalizeTextForComparison,
 } from "../../pi-embedded-helpers.js";
 import {
+  extractAssistantImages,
  extractAssistantText,
  extractAssistantThinking,
  formatReasoningMessage,
 } from "../../pi-embedded-utils.js";
 import type { ToolResultFormat } from "../../pi-embedded-subscribe.js";

+/**
+ * Persist one base64-encoded image returned by an image generation model.
+ *
+ * The payload is decoded and handed to `saveMediaBuffer` together with its
+ * MIME type and the "generated" subdirectory of the standard media store.
+ *
+ * @param image - MIME type and base64-encoded bytes of the generated image.
+ * @returns The path reported by `saveMediaBuffer`, or `null` when the payload
+ *   decodes to zero bytes or saving fails.
+ */
+async function saveGeneratedImage(image: {
+  mimeType: string;
+  data: string;
+}): Promise<string | null> {
+  try {
+    const buffer = Buffer.from(image.data, "base64");
+    // Buffer.from never throws on malformed base64; it simply yields fewer
+    // (possibly zero) bytes. Do not store an empty file as an "image".
+    if (buffer.length === 0) return null;
+    const saved = await saveMediaBuffer(buffer, image.mimeType, "generated");
+    return saved.path;
+  } catch {
+    // Best-effort by design: a failed save drops this image instead of
+    // failing the whole reply.
+    return null;
+  }
+}
+
 type ToolMetaEntry = { toolName: string; meta?: string };

-export function buildEmbeddedRunPayloads(params: {
+export async function buildEmbeddedRunPayloads(params: {
  assistantTexts: string[];
  toolMetas: ToolMetaEntry[];
  lastAssistant: AssistantMessage | undefined;
@ -31,16 +50,18 @@ export function buildEmbeddedRunPayloads(params: {
  reasoningLevel?: ReasoningLevel;
  toolResultFormat?: ToolResultFormat;
  inlineToolResultsAllowed: boolean;
-}): Array<{
-  text?: string;
-  mediaUrl?: string;
-  mediaUrls?: string[];
-  replyToId?: string;
-  isError?: boolean;
-  audioAsVoice?: boolean;
-  replyToTag?: boolean;
-  replyToCurrent?: boolean;
-}> {
+}): Promise<
+  Array<{
+    text?: string;
+    mediaUrl?: string;
+    mediaUrls?: string[];
+    replyToId?: string;
+    isError?: boolean;
+    audioAsVoice?: boolean;
+    replyToTag?: boolean;
+    replyToCurrent?: boolean;
+  }>
+> {
  const replyItems: Array<{
    text: string;
    media?: string[];
@ -209,6 +230,21 @@ export function buildEmbeddedRunPayloads(params: {
    }
  }

+  // Extract and save generated images from OpenRouter image models
+  if (params.lastAssistant) {
+    const generatedImages = extractAssistantImages(params.lastAssistant);
+    for (const img of generatedImages) {
+      const filePath = await saveGeneratedImage(img);
+      if (filePath) {
+        // Add image as a separate media item
+        replyItems.push({
+          text: "",
+          media: [filePath],
+        });
+      }
+    }
+  }
+
  const hasAudioAsVoiceTag = replyItems.some((item) => item.audioAsVoice);
  return replyItems
    .map((item) => ({
--- a/src/agents/pi-embedded-utils.ts
+++ b/src/agents/pi-embedded-utils.ts
@ -206,6 +206,32 @@ export function extractAssistantThinking(msg: AssistantMessage): string {
  return blocks.join("\n").trim();
 }

+/**
+ * Collect the image blocks found in an assistant message's content.
+ *
+ * A block counts as an image when its `type` is "image" and both `data` and
+ * `mimeType` are strings. Used for OpenRouter image generation models that
+ * return images as content blocks.
+ *
+ * @param msg - Assistant message whose `content` is scanned.
+ * @returns One `{ mimeType, data }` entry per image block, in content order;
+ *   empty when `content` is not an array or holds no image blocks.
+ */
+export function extractAssistantImages(
+  msg: AssistantMessage,
+): Array<{ mimeType: string; data: string }> {
+  const found: Array<{ mimeType: string; data: string }> = [];
+  if (!Array.isArray(msg.content)) return found;
+  for (const entry of msg.content) {
+    if (!entry || typeof entry !== "object") continue;
+    // The declared block type is widened so the image fields can be probed.
+    const { type, data, mimeType } = entry as unknown as Record<string, unknown>;
+    if (type !== "image") continue;
+    if (typeof data !== "string" || typeof mimeType !== "string") continue;
+    found.push({ mimeType, data });
+  }
+  return found;
+}
+
 export function formatReasoningMessage(text: string): string {
  const trimmed = text.trim();
  if (!trimmed) return "";
--- a/src/agents/subagent-announce.ts
+++ b/src/agents/subagent-announce.ts
@ -11,6 +11,7 @@ import {
 import { normalizeMainKey } from "../routing/session-key.js";
 import { resolveQueueSettings } from "../auto-reply/reply/queue.js";
 import { callGateway } from "../gateway/call.js";
+import { saveMediaBuffer } from "../media/store.js";
 import { defaultRuntime } from "../runtime.js";
 import {
  type DeliveryContext,
@ -20,7 +21,24 @@ import {
 } from "../utils/delivery-context.js";
 import { isEmbeddedPiRunActive, queueEmbeddedPiMessage } from "./pi-embedded.js";
 import { type AnnounceQueueItem, enqueueAnnounce } from "./subagent-announce-queue.js";
-import { readLatestAssistantReply } from "./tools/agent-step.js";
+import { readLatestAssistantReplyWithMedia } from "./tools/agent-step.js";
+
+/**
+ * Persist one base64-encoded image produced by a subagent's image model.
+ *
+ * The payload is decoded and handed to `saveMediaBuffer` together with its
+ * MIME type and the "generated" subdirectory of the standard media store.
+ *
+ * @param image - MIME type and base64-encoded bytes of the generated image.
+ * @returns The path reported by `saveMediaBuffer`, or `null` when the payload
+ *   decodes to zero bytes or saving fails (the failure is logged).
+ */
+async function saveGeneratedImage(image: {
+  mimeType: string;
+  data: string;
+}): Promise<string | null> {
+  try {
+    const buffer = Buffer.from(image.data, "base64");
+    // Buffer.from never throws on malformed base64; it simply yields fewer
+    // (possibly zero) bytes. Do not store an empty file as an "image".
+    if (buffer.length === 0) return null;
+    const saved = await saveMediaBuffer(buffer, image.mimeType, "generated");
+    return saved.path;
+  } catch (err) {
+    // Still best-effort, but surface the reason instead of dropping it silently.
+    defaultRuntime.error?.(`Failed to save generated subagent image: ${String(err)}`);
+    return null;
+  }
+}

 function formatDurationShort(valueMs?: number) {
  if (!valueMs || !Number.isFinite(valueMs) || valueMs <= 0) return undefined;
@ -321,6 +339,7 @@ export async function runSubagentAnnounceFlow(params: {
  try {
    const requesterOrigin = normalizeDeliveryContext(params.requesterOrigin);
    let reply = params.roundOneReply;
+    let replyImages: Array<{ mimeType: string; data: string }> | undefined;
    let outcome: SubagentRunOutcome | undefined = params.outcome;
    if (!reply && params.waitForCompletion !== false) {
      const waitMs = Math.min(params.timeoutMs, 60_000);
@ -353,15 +372,20 @@ export async function runSubagentAnnounceFlow(params: {
      if (wait?.status === "timeout") {
        if (!outcome) outcome = { status: "timeout" };
      }
-      reply = await readLatestAssistantReply({
+      // Use the new function that also extracts images
+      const replyContent = await readLatestAssistantReplyWithMedia({
        sessionKey: params.childSessionKey,
      });
+      reply = replyContent.text;
+      replyImages = replyContent.images;
    }

-    if (!reply) {
-      reply = await readLatestAssistantReply({
+    if (!reply && !replyImages) {
+      const replyContent = await readLatestAssistantReplyWithMedia({
        sessionKey: params.childSessionKey,
      });
+      reply = replyContent.text;
+      replyImages = replyContent.images;
    }

    if (!outcome) outcome = { status: "unknown" };
@ -373,6 +397,39 @@ export async function runSubagentAnnounceFlow(params: {
      endedAt: params.endedAt,
    });

+    // Handle generated images - send them directly to the user
+    const savedImagePaths: string[] = [];
+    if (replyImages && replyImages.length > 0) {
+      for (const img of replyImages) {
+        const filePath = await saveGeneratedImage(img);
+        if (filePath) {
+          savedImagePaths.push(filePath);
+        }
+      }
+      // Send images directly to the user if we have a delivery context
+      if (savedImagePaths.length > 0 && requesterOrigin?.to && requesterOrigin?.channel) {
+        try {
+          await callGateway({
+            method: "send",
+            params: {
+              to: requesterOrigin.to,
+              message: "Here's what I generated:", // Non-empty message required
+              mediaUrls: savedImagePaths,
+              channel: requesterOrigin.channel,
+              accountId: requesterOrigin.accountId,
+              idempotencyKey: crypto.randomUUID(),
+            },
+            timeoutMs: 30_000,
+          });
+          defaultRuntime.log(
+            `[subagent] Images sent: ${savedImagePaths.length} image(s) to ${requesterOrigin.to}`,
+          );
+        } catch (err) {
+          defaultRuntime.error?.(`Failed to send subagent images: ${String(err)}`);
+        }
+      }
+    }
+
    // Build status label
    const statusLabel =
      outcome.status === "ok"
@ -385,17 +442,25 @@ export async function runSubagentAnnounceFlow(params: {

    // Build instructional message for main agent
    const taskLabel = params.label || params.task || "background task";
+    // If we sent images, mention it in the findings
+    const imageNote =
+      savedImagePaths.length > 0
+        ? `\n[${savedImagePaths.length} image(s) were generated and sent to the user]`
+        : "";
    const triggerMessage = [
      `A background task "${taskLabel}" just ${statusLabel}.`,
      "",
      "Findings:",
      reply || "(no output)",
+      imageNote,
      "",
      statsLine,
      "",
      "Summarize this naturally for the user. Keep it brief (1-2 sentences). Flow it into the conversation naturally.",
      "Do not mention technical details like tokens, stats, or that this was a background task.",
-      "You can respond with NO_REPLY if no announcement is needed (e.g., internal task with no user-facing result).",
+      savedImagePaths.length > 0
+        ? "The generated image(s) have already been sent to the user. Just acknowledge the completion naturally."
+        : "You can respond with NO_REPLY if no announcement is needed (e.g., internal task with no user-facing result).",
    ].join("\n");

    const queued = await maybeQueueSubagentAnnounce({
--- a/src/agents/tools/agent-step.ts
+++ b/src/agents/tools/agent-step.ts
@ -3,7 +3,16 @@ import crypto from "node:crypto";
 import { callGateway } from "../../gateway/call.js";
 import { INTERNAL_MESSAGE_CHANNEL } from "../../utils/message-channel.js";
 import { AGENT_LANE_NESTED } from "../lanes.js";
-import { extractAssistantText, stripToolMessages } from "./sessions-helpers.js";
+import {
+  extractAssistantText,
+  extractAssistantImages,
+  stripToolMessages,
+} from "./sessions-helpers.js";
+
+/**
+ * Text and image parts read from a single assistant reply.
+ * Either field is omitted when the reply carries nothing of that kind.
+ */
+export interface AssistantReplyContent {
+  /** Reply text, when the message has any. */
+  text?: string;
+  /** Image blocks from the message: a MIME type plus the raw `data` string. */
+  images?: { mimeType: string; data: string }[];
+}

 export async function readLatestAssistantReply(params: {
  sessionKey: string;
@ -18,6 +27,29 @@ export async function readLatestAssistantReply(params: {
  return last ? extractAssistantText(last) : undefined;
 }

+/**
+ * Fetch the newest non-tool message of a session and return its text and images.
+ *
+ * Requests `chat.history` from the gateway (default limit 50), drops tool
+ * messages, and runs the assistant text/image extractors on the last
+ * remaining message. Used by the subagent announce flow, which forwards images.
+ *
+ * @param params.sessionKey - Session whose history is read.
+ * @param params.limit - Maximum number of history messages to request.
+ * @returns The extracted `text` and `images`; each is `undefined` when empty,
+ *   and the result is `{}` when no message remains after filtering.
+ */
+export async function readLatestAssistantReplyWithMedia(params: {
+  sessionKey: string;
+  limit?: number;
+}): Promise<AssistantReplyContent> {
+  const response = (await callGateway({
+    method: "chat.history",
+    params: { sessionKey: params.sessionKey, limit: params.limit ?? 50 },
+  })) as { messages?: unknown[] };
+  const rawMessages = Array.isArray(response?.messages) ? response.messages : [];
+  const visible = stripToolMessages(rawMessages);
+  if (visible.length === 0) return {};
+  const latest = visible[visible.length - 1];
+  if (!latest) return {};
+  const latestText = extractAssistantText(latest);
+  const latestImages = extractAssistantImages(latest);
+  return {
+    text: latestText ? latestText : undefined,
+    images: latestImages.length === 0 ? undefined : latestImages,
+  };
+}
+
 export async function runAgentStep(params: {
  sessionKey: string;
  message: string;
--- a/src/agents/tools/sessions-helpers.ts
+++ b/src/agents/tools/sessions-helpers.ts
@ -325,3 +325,32 @@ export function extractAssistantText(message: unknown): string | undefined {
  const joined = chunks.join("").trim();
  return joined ? sanitizeUserFacingText(joined) : undefined;
 }
+
+/**
+ * Collect the image blocks carried by an assistant message.
+ *
+ * Only messages whose `role` is "assistant" and whose `content` is an array
+ * are inspected. A block counts as an image when its `type` is "image" and
+ * both `data` and `mimeType` are strings. Used for OpenRouter image
+ * generation models that return images.
+ *
+ * @param message - Untyped history message to inspect.
+ * @returns One `{ mimeType, data }` entry per image block, in content order;
+ *   empty for anything that is not an assistant message with array content.
+ */
+export function extractAssistantImages(
+  message: unknown,
+): Array<{ mimeType: string; data: string }> {
+  if (!message || typeof message !== "object") return [];
+  const { role, content } = message as { role?: unknown; content?: unknown };
+  if (role !== "assistant" || !Array.isArray(content)) return [];
+  return content.flatMap((entry: unknown): Array<{ mimeType: string; data: string }> => {
+    if (!entry || typeof entry !== "object") return [];
+    const { type, data, mimeType } = entry as Record<string, unknown>;
+    if (type !== "image" || typeof data !== "string" || typeof mimeType !== "string") {
+      return [];
+    }
+    return [{ mimeType, data }];
+  });
+}