openclaw/src/telegram/bot/delivery.ts

import { type Bot, InputFile } from "grammy";
import {
  markdownToTelegramChunks,
  markdownToTelegramHtml,
  renderTelegramHtmlText,
} from "../format.js";
import { splitTelegramCaption } from "../caption.js";
import type { ReplyPayload } from "../../auto-reply/types.js";
import type { ReplyToMode } from "../../config/config.js";
import type { MarkdownTableMode } from "../../config/types.base.js";
import { danger, logVerbose } from "../../globals.js";
import { formatErrorMessage } from "../../infra/errors.js";
import { mediaKindFromMime } from "../../media/constants.js";
import { fetchRemoteMedia } from "../../media/fetch.js";
import { isGifMedia } from "../../media/mime.js";
import { saveMediaBuffer } from "../../media/store.js";
import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js";
import type { RuntimeEnv } from "../../runtime.js";
import { loadWebMedia } from "../../web/media.js";
import { resolveTelegramVoiceSend } from "../voice.js";
import { buildTelegramThreadParams, resolveTelegramReplyId } from "./helpers.js";
import type { TelegramContext } from "./types.js";

const PARSE_ERR_RE = /can't parse entities|parse entities|find end of the entity/i;

export async function deliverReplies(params: {
  replies: ReplyPayload[];
  chatId: string;
  token: string;
  runtime: RuntimeEnv;
  bot: Bot;
  replyToMode: ReplyToMode;
  textLimit: number;
  messageThreadId?: number;
  tableMode?: MarkdownTableMode;
  /** Callback invoked before sending a voice message to switch typing indicator. */
  onVoiceRecording?: () => Promise<void> | void;
}) {
  const { replies, chatId, runtime, bot, replyToMode, textLimit, messageThreadId } = params;
  const threadParams = buildTelegramThreadParams(messageThreadId);
  let hasReplied = false;
  for (const reply of replies) {
    // Track if hook wants to send audio after text
    let audioToSendAfter: string | undefined;

    // Run message_sending hook (allows plugins like TTS to generate audio)
    const hookRunner = getGlobalHookRunner();
    if (hookRunner && reply?.text?.trim()) {
      try {
        const hookResult = await hookRunner.runMessageSending(
          {
            to: chatId,
            content: reply.text,
            metadata: { channel: "telegram", threadId: messageThreadId },
          },
          {
            channelId: "telegram",
            accountId: undefined,
            conversationId: chatId,
          },
        );

        // Check if hook wants to cancel the message
        if (hookResult?.cancel) {
          continue; // Skip this reply
        }

        // Check if hook returned a MEDIA directive (TTS audio)
        if (hookResult?.content !== undefined) {
          const mediaMatch = hookResult.content.match(/^MEDIA:(.+)$/m);
          if (mediaMatch) {
            // Save audio path to send AFTER the text message
            audioToSendAfter = mediaMatch[1].trim();
          }
        }
      } catch (err) {
        // Hook errors shouldn't block message sending
        logVerbose(`[telegram delivery] hook error: ${String(err)}`);
      }
    }

    const hasMedia = Boolean(reply?.mediaUrl) || (reply?.mediaUrls?.length ?? 0) > 0;
    if (!reply?.text && !hasMedia) {
      if (reply?.audioAsVoice) {
        logVerbose("telegram reply has audioAsVoice without media/text; skipping");
        continue;
      }
      runtime.error?.(danger("reply missing text/media"));
      continue;
    }
    const replyToId = replyToMode === "off" ? undefined : resolveTelegramReplyId(reply.replyToId);
    const mediaList = reply.mediaUrls?.length
      ? reply.mediaUrls
      : reply.mediaUrl
        ? [reply.mediaUrl]
        : [];
    if (mediaList.length === 0) {
      const chunks = markdownToTelegramChunks(reply.text || "", textLimit, {
        tableMode: params.tableMode,
      });
      for (const chunk of chunks) {
        await sendTelegramText(bot, chatId, chunk.html, runtime, {
          replyToMessageId:
            replyToId && (replyToMode === "all" || !hasReplied) ? replyToId : undefined,
          messageThreadId,
          textMode: "html",
          plainText: chunk.text,
        });
        if (replyToId && !hasReplied) {
          hasReplied = true;
        }
      }

      // Send TTS audio after text (if hook generated one)
      if (audioToSendAfter) {
        try {
          const audioMedia = await loadWebMedia(audioToSendAfter);
          const audioFile = new InputFile(audioMedia.buffer, "voice.mp3");
          // Switch typing indicator to record_voice before sending
          await params.onVoiceRecording?.();
          const audioParams: Record<string, unknown> = {};
          if (threadParams) {
            audioParams.message_thread_id = threadParams.message_thread_id;
          }
          await bot.api.sendVoice(chatId, audioFile, audioParams);
          logVerbose(`[telegram delivery] TTS audio sent: ${audioToSendAfter}`);
        } catch (err) {
          logVerbose(`[telegram delivery] TTS audio send failed: ${String(err)}`);
        }
      }

      continue;
    }
    // media with optional caption on first item
    let first = true;
    // Track if we need to send a follow-up text message after media
    // (when caption exceeds Telegram's 1024-char limit)
    let pendingFollowUpText: string | undefined;
    for (const mediaUrl of mediaList) {
      const isFirstMedia = first;
      const media = await loadWebMedia(mediaUrl);
      const kind = mediaKindFromMime(media.contentType ?? undefined);
      const isGif = isGifMedia({
        contentType: media.contentType,
        fileName: media.fileName,
      });
      const fileName = media.fileName ?? (isGif ? "animation.gif" : "file");
      const file = new InputFile(media.buffer, fileName);
      // Caption only on first item; if text exceeds limit, defer to follow-up message.
      const { caption, followUpText } = splitTelegramCaption(
        isFirstMedia ? (reply.text ?? undefined) : undefined,
      );
      const htmlCaption = caption
        ? renderTelegramHtmlText(caption, { tableMode: params.tableMode })
        : undefined;
      if (followUpText) {
        pendingFollowUpText = followUpText;
      }
      first = false;
      const replyToMessageId =
        replyToId && (replyToMode === "all" || !hasReplied) ? replyToId : undefined;
      const mediaParams: Record<string, unknown> = {
        caption: htmlCaption,
        reply_to_message_id: replyToMessageId,
        ...(htmlCaption ? { parse_mode: "HTML" } : {}),
      };
      if (threadParams) {
        mediaParams.message_thread_id = threadParams.message_thread_id;
      }
      if (isGif) {
        await bot.api.sendAnimation(chatId, file, {
          ...mediaParams,
        });
      } else if (kind === "image") {
        await bot.api.sendPhoto(chatId, file, {
          ...mediaParams,
        });
      } else if (kind === "video") {
        await bot.api.sendVideo(chatId, file, {
          ...mediaParams,
        });
      } else if (kind === "audio") {
        const { useVoice } = resolveTelegramVoiceSend({
          wantsVoice: reply.audioAsVoice === true, // default false (backward compatible)
          contentType: media.contentType,
          fileName,
          logFallback: logVerbose,
        });
        if (useVoice) {
          // Voice message - displays as round playable bubble (opt-in via [[audio_as_voice]])
          // Switch typing indicator to record_voice before sending.
          await params.onVoiceRecording?.();
          await bot.api.sendVoice(chatId, file, {
            ...mediaParams,
          });
        } else {
          // Audio file - displays with metadata (title, duration) - DEFAULT
          await bot.api.sendAudio(chatId, file, {
            ...mediaParams,
          });
        }
      } else {
        await bot.api.sendDocument(chatId, file, {
          ...mediaParams,
        });
      }
      if (replyToId && !hasReplied) {
        hasReplied = true;
      }
      // Send deferred follow-up text right after the first media item.
      // Chunk it in case it's extremely long (same logic as text-only replies).
      if (pendingFollowUpText && isFirstMedia) {
        const chunks = markdownToTelegramChunks(pendingFollowUpText, textLimit, {
          tableMode: params.tableMode,
        });
        for (const chunk of chunks) {
          const replyToMessageIdFollowup =
            replyToId && (replyToMode === "all" || !hasReplied) ? replyToId : undefined;
          await sendTelegramText(bot, chatId, chunk.html, runtime, {
            replyToMessageId: replyToMessageIdFollowup,
            messageThreadId,
            textMode: "html",
            plainText: chunk.text,
          });
          if (replyToId && !hasReplied) {
            hasReplied = true;
          }
        }
        pendingFollowUpText = undefined;
      }
    }
  }
}

export async function resolveMedia(
  ctx: TelegramContext,
  maxBytes: number,
  token: string,
  proxyFetch?: typeof fetch,
): Promise<{ path: string; contentType?: string; placeholder: string } | null> {
  const msg = ctx.message;
  const m =
    msg.photo?.[msg.photo.length - 1] ?? msg.video ?? msg.document ?? msg.audio ?? msg.voice;
  if (!m?.file_id) return null;
  const file = await ctx.getFile();
  if (!file.file_path) {
    throw new Error("Telegram getFile returned no file_path");
  }
  const fetchImpl = proxyFetch ?? globalThis.fetch;
  if (!fetchImpl) {
    throw new Error("fetch is not available; set channels.telegram.proxy in config");
  }
  const url = `https://api.telegram.org/file/bot${token}/${file.file_path}`;
  const fetched = await fetchRemoteMedia({
    url,
    fetchImpl,
    filePathHint: file.file_path,
  });
  const saved = await saveMediaBuffer(fetched.buffer, fetched.contentType, "inbound", maxBytes);
  let placeholder = "<media:document>";
  if (msg.photo) placeholder = "<media:image>";
  else if (msg.video) placeholder = "<media:video>";
  else if (msg.audio || msg.voice) placeholder = "<media:audio>";
  return { path: saved.path, contentType: saved.contentType, placeholder };
}

function buildTelegramSendParams(opts?: {
  replyToMessageId?: number;
  messageThreadId?: number;
}): Record<string, unknown> {
  const threadParams = buildTelegramThreadParams(opts?.messageThreadId);
  const params: Record<string, unknown> = {};
  if (opts?.replyToMessageId) {
    params.reply_to_message_id = opts.replyToMessageId;
  }
  if (threadParams) {
    params.message_thread_id = threadParams.message_thread_id;
  }
  return params;
}

async function sendTelegramText(
  bot: Bot,
  chatId: string,
  text: string,
  runtime: RuntimeEnv,
  opts?: {
    replyToMessageId?: number;
    messageThreadId?: number;
    textMode?: "markdown" | "html";
    plainText?: string;
  },
): Promise<number | undefined> {
  const baseParams = buildTelegramSendParams({
    replyToMessageId: opts?.replyToMessageId,
    messageThreadId: opts?.messageThreadId,
  });
  const textMode = opts?.textMode ?? "markdown";
  const htmlText = textMode === "html" ? text : markdownToTelegramHtml(text);
  try {
    const res = await bot.api.sendMessage(chatId, htmlText, {
      parse_mode: "HTML",
      ...baseParams,
    });
    return res.message_id;
  } catch (err) {
    const errText = formatErrorMessage(err);
    if (PARSE_ERR_RE.test(errText)) {
      runtime.log?.(`telegram HTML parse failed; retrying without formatting: ${errText}`);
      const fallbackText = opts?.plainText ?? text;
      const res = await bot.api.sendMessage(chatId, fallbackText, {
        ...baseParams,
      });
      return res.message_id;
    }
    throw err;
  }
}