openclaw/src/telegram/bot/delivery.ts
Glucksberg df09e583aa feat(telegram-tts): add auto-TTS hook and provider switching
- Integrate message_sending hook into Telegram delivery path
- Send text first, then audio as voice message after
- Add /tts_provider command to switch between OpenAI and ElevenLabs
- Implement automatic fallback when primary provider fails
- Use gpt-4o-mini-tts as default OpenAI model
- Add hook integration to route-reply.ts for other channels

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 08:00:44 +00:00

319 lines
11 KiB
TypeScript

import { type Bot, InputFile } from "grammy";
import {
markdownToTelegramChunks,
markdownToTelegramHtml,
renderTelegramHtmlText,
} from "../format.js";
import { splitTelegramCaption } from "../caption.js";
import type { ReplyPayload } from "../../auto-reply/types.js";
import type { ReplyToMode } from "../../config/config.js";
import type { MarkdownTableMode } from "../../config/types.base.js";
import { danger, logVerbose } from "../../globals.js";
import { formatErrorMessage } from "../../infra/errors.js";
import { mediaKindFromMime } from "../../media/constants.js";
import { fetchRemoteMedia } from "../../media/fetch.js";
import { isGifMedia } from "../../media/mime.js";
import { saveMediaBuffer } from "../../media/store.js";
import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js";
import type { RuntimeEnv } from "../../runtime.js";
import { loadWebMedia } from "../../web/media.js";
import { resolveTelegramVoiceSend } from "../voice.js";
import { buildTelegramThreadParams, resolveTelegramReplyId } from "./helpers.js";
import type { TelegramContext } from "./types.js";
// Case-insensitive match for error text indicating that a message's entity
// markup could not be parsed. sendTelegramText tests the formatted send error
// against this pattern to decide whether to retry the message without parse_mode.
const PARSE_ERR_RE = /can't parse entities|parse entities|find end of the entity/i;
/** Matches a `MEDIA:<path>` directive that occupies a whole line of hook-returned content. */
const HOOK_MEDIA_DIRECTIVE_RE = /^MEDIA:(.+)$/m;

/**
 * Delivers a batch of replies to one Telegram chat, strictly in order.
 *
 * Per reply:
 * 1. If a global hook runner is registered and the reply has non-blank text, the
 *    `message_sending` hook runs first. The hook may cancel the reply or return a
 *    `MEDIA:<path>` directive naming audio to send after the text.
 * 2. A reply with neither text nor media is skipped (verbose log when it only
 *    carries `audioAsVoice`, `runtime.error` otherwise).
 * 3. Text-only replies are chunked with `markdownToTelegramChunks` and sent as
 *    HTML; the hook audio, if any, follows as a voice message.
 * 4. Media replies send every URL in order. Only the first item carries the
 *    caption; text that `splitTelegramCaption` returns as `followUpText` is sent
 *    as regular text right after the first item.
 *
 * Reply threading: with `replyToMode === "off"` no message is sent as a reply;
 * with `"all"` every message carries the reply id; otherwise only messages sent
 * before the first reply-carrying message has gone out do. That state is shared
 * across all replies of a single call.
 *
 * NOTE(review): hook-generated audio is only sent for text-only replies; for
 * replies that carry media it is discarded. Confirm this is intended.
 *
 * @param params.replies Replies to deliver.
 * @param params.chatId Target chat.
 * @param params.token Bot token (not read by this function).
 * @param params.runtime Receives `error` reports for empty replies and is passed to `sendTelegramText`.
 * @param params.bot grammY bot whose `api` performs the sends.
 * @param params.replyToMode Reply threading policy, see above.
 * @param params.textLimit Chunk size limit handed to `markdownToTelegramChunks`.
 * @param params.messageThreadId Optional thread to post into.
 * @param params.tableMode Table rendering mode for the markdown converters.
 * @param params.onVoiceRecording Invoked right before each voice message is sent.
 * @throws Whatever media loading or a Telegram send throws; only hook errors and
 *   failures while sending hook-generated audio are caught and logged.
 */
export async function deliverReplies(params: {
  replies: ReplyPayload[];
  chatId: string;
  token: string;
  runtime: RuntimeEnv;
  bot: Bot;
  replyToMode: ReplyToMode;
  textLimit: number;
  messageThreadId?: number;
  tableMode?: MarkdownTableMode;
  /** Callback invoked before sending a voice message to switch typing indicator. */
  onVoiceRecording?: () => Promise<void> | void;
}): Promise<void> {
  const { replies, chatId, runtime, bot, replyToMode, textLimit, messageThreadId } = params;
  const threadParams = buildTelegramThreadParams(messageThreadId);
  // Shared by every reply in this call: flips once a message carrying a reply id was sent.
  let hasReplied = false;

  type ReplyToId = ReturnType<typeof resolveTelegramReplyId> | undefined;

  // Chunks markdown and sends each chunk as HTML, applying the reply-threading policy.
  const sendTextChunks = async (markdown: string, replyToId: ReplyToId): Promise<void> => {
    const chunks = markdownToTelegramChunks(markdown, textLimit, {
      tableMode: params.tableMode,
    });
    for (const chunk of chunks) {
      await sendTelegramText(bot, chatId, chunk.html, runtime, {
        replyToMessageId:
          replyToId && (replyToMode === "all" || !hasReplied) ? replyToId : undefined,
        messageThreadId,
        textMode: "html",
        // Used by sendTelegramText when Telegram rejects the HTML.
        plainText: chunk.text,
      });
      if (replyToId) {
        hasReplied = true;
      }
    }
  };

  for (const reply of replies) {
    const hookOutcome = await runTelegramMessageSendingHook(reply, chatId, messageThreadId);
    if (hookOutcome.cancelled) {
      continue;
    }

    const hasMedia = Boolean(reply?.mediaUrl) || (reply?.mediaUrls?.length ?? 0) > 0;
    if (!reply?.text && !hasMedia) {
      if (reply?.audioAsVoice) {
        logVerbose("telegram reply has audioAsVoice without media/text; skipping");
      } else {
        runtime.error?.(danger("reply missing text/media"));
      }
      continue;
    }

    const replyToId = replyToMode === "off" ? undefined : resolveTelegramReplyId(reply.replyToId);
    const mediaList = reply.mediaUrls?.length
      ? reply.mediaUrls
      : reply.mediaUrl
        ? [reply.mediaUrl]
        : [];

    if (mediaList.length === 0) {
      await sendTextChunks(reply.text || "", replyToId);
      // Hook-generated audio goes out after the text it was generated from.
      if (hookOutcome.audioPath) {
        await sendHookVoiceMessage({
          bot,
          chatId,
          audioPath: hookOutcome.audioPath,
          threadParams,
          onVoiceRecording: params.onVoiceRecording,
        });
      }
      continue;
    }

    for (const [index, mediaUrl] of mediaList.entries()) {
      const isFirstMedia = index === 0;
      const media = await loadWebMedia(mediaUrl);
      // Caption only on the first item; text beyond Telegram's caption limit
      // (1024 chars) comes back as followUpText and is sent separately below.
      const { caption, followUpText } = splitTelegramCaption(
        isFirstMedia ? (reply.text ?? undefined) : undefined,
      );
      const htmlCaption = caption
        ? renderTelegramHtmlText(caption, { tableMode: params.tableMode })
        : undefined;
      const mediaParams: Record<string, unknown> = {
        caption: htmlCaption,
        reply_to_message_id:
          replyToId && (replyToMode === "all" || !hasReplied) ? replyToId : undefined,
        ...(htmlCaption ? { parse_mode: "HTML" } : {}),
      };
      if (threadParams) {
        mediaParams.message_thread_id = threadParams.message_thread_id;
      }

      await sendTelegramMediaFile({
        bot,
        chatId,
        media,
        wantsVoice: reply.audioAsVoice === true, // default false (backward compatible)
        mediaParams,
        onVoiceRecording: params.onVoiceRecording,
      });
      if (replyToId) {
        hasReplied = true;
      }

      // Deferred caption overflow follows the first media item immediately.
      if (isFirstMedia && followUpText) {
        await sendTextChunks(followUpText, replyToId);
      }
    }
  }
}

/**
 * Runs the `message_sending` hook for one reply and reports whether the reply was
 * cancelled and whether the hook returned a `MEDIA:<path>` directive.
 * The hook is skipped when no runner is registered or the reply text is blank.
 * Hook failures are logged verbosely and never block delivery.
 */
async function runTelegramMessageSendingHook(
  reply: ReplyPayload,
  chatId: string,
  messageThreadId: number | undefined,
): Promise<{ cancelled: boolean; audioPath?: string }> {
  const hookRunner = getGlobalHookRunner();
  const text = reply?.text;
  if (!hookRunner || !text?.trim()) {
    return { cancelled: false };
  }
  try {
    const hookResult = await hookRunner.runMessageSending(
      {
        to: chatId,
        content: text,
        metadata: { channel: "telegram", threadId: messageThreadId },
      },
      {
        channelId: "telegram",
        accountId: undefined,
        conversationId: chatId,
      },
    );
    if (hookResult?.cancel) {
      return { cancelled: true };
    }
    const directive = hookResult?.content?.match(HOOK_MEDIA_DIRECTIVE_RE);
    return { cancelled: false, audioPath: directive?.[1]?.trim() };
  } catch (err) {
    // Hook errors shouldn't block message sending.
    logVerbose(`[telegram delivery] hook error: ${formatErrorMessage(err)}`);
    return { cancelled: false };
  }
}

/**
 * Best-effort send of hook-generated audio as a Telegram voice message.
 * Any failure (loading, callback, or send) is logged verbosely and swallowed so
 * the already-delivered text reply is not reported as failed.
 */
async function sendHookVoiceMessage(args: {
  bot: Bot;
  chatId: string;
  audioPath: string;
  threadParams: ReturnType<typeof buildTelegramThreadParams>;
  onVoiceRecording: (() => Promise<void> | void) | undefined;
}): Promise<void> {
  const { bot, chatId, audioPath, threadParams, onVoiceRecording } = args;
  try {
    const audioMedia = await loadWebMedia(audioPath);
    const audioFile = new InputFile(audioMedia.buffer, "voice.mp3");
    // Switch typing indicator to record_voice before sending.
    await onVoiceRecording?.();
    const audioParams: Record<string, unknown> = {};
    if (threadParams) {
      audioParams.message_thread_id = threadParams.message_thread_id;
    }
    await bot.api.sendVoice(chatId, audioFile, audioParams);
    logVerbose(`[telegram delivery] TTS audio sent: ${audioPath}`);
  } catch (err) {
    logVerbose(`[telegram delivery] TTS audio send failed: ${formatErrorMessage(err)}`);
  }
}

/**
 * Uploads one loaded media item with the Telegram method matching its kind:
 * GIFs as animations, images as photos, videos as videos, audio as a voice
 * message or audio file (decided by `resolveTelegramVoiceSend`), and anything
 * else as a document.
 */
async function sendTelegramMediaFile(args: {
  bot: Bot;
  chatId: string;
  media: Awaited<ReturnType<typeof loadWebMedia>>;
  wantsVoice: boolean;
  mediaParams: Record<string, unknown>;
  onVoiceRecording: (() => Promise<void> | void) | undefined;
}): Promise<void> {
  const { bot, chatId, media, wantsVoice, mediaParams, onVoiceRecording } = args;
  const kind = mediaKindFromMime(media.contentType ?? undefined);
  const isGif = isGifMedia({
    contentType: media.contentType,
    fileName: media.fileName,
  });
  const fileName = media.fileName ?? (isGif ? "animation.gif" : "file");
  const file = new InputFile(media.buffer, fileName);

  // The GIF check wins over the MIME-derived kind.
  if (isGif) {
    await bot.api.sendAnimation(chatId, file, mediaParams);
    return;
  }
  switch (kind) {
    case "image":
      await bot.api.sendPhoto(chatId, file, mediaParams);
      return;
    case "video":
      await bot.api.sendVideo(chatId, file, mediaParams);
      return;
    case "audio": {
      const { useVoice } = resolveTelegramVoiceSend({
        wantsVoice,
        contentType: media.contentType,
        fileName,
        logFallback: logVerbose,
      });
      if (useVoice) {
        // Voice message (opt-in via [[audio_as_voice]]).
        // Switch typing indicator to record_voice before sending.
        await onVoiceRecording?.();
        await bot.api.sendVoice(chatId, file, mediaParams);
      } else {
        // Regular audio file - the default.
        await bot.api.sendAudio(chatId, file, mediaParams);
      }
      return;
    }
    default:
      await bot.api.sendDocument(chatId, file, mediaParams);
  }
}
/**
 * Downloads the attachment of an inbound Telegram message and stores it locally.
 *
 * The attachment is picked in this order: last entry of `photo`, then `video`,
 * `document`, `audio`, `voice`. The file is fetched from the Telegram file
 * endpoint (through `proxyFetch` when given, otherwise the global `fetch`) and
 * saved with `saveMediaBuffer` under "inbound", limited by `maxBytes`.
 *
 * @param ctx Telegram context providing `message` and `getFile()`.
 * @param maxBytes Size limit forwarded to `saveMediaBuffer`.
 * @param token Bot token used to build the download URL.
 * @param proxyFetch Optional fetch implementation to use instead of the global one.
 * @returns The stored path, content type and a `<media:...>` placeholder, or
 *   `null` when the message has no attachment with a `file_id`.
 * @throws If `getFile()` yields no `file_path`, or if no fetch implementation is available.
 */
export async function resolveMedia(
  ctx: TelegramContext,
  maxBytes: number,
  token: string,
  proxyFetch?: typeof fetch,
): Promise<{ path: string; contentType?: string; placeholder: string } | null> {
  const message = ctx.message;
  const lastPhoto = message.photo?.[message.photo.length - 1];
  const attachment =
    lastPhoto ?? message.video ?? message.document ?? message.audio ?? message.voice;
  if (!attachment?.file_id) {
    return null;
  }

  const telegramFile = await ctx.getFile();
  const remotePath = telegramFile.file_path;
  if (!remotePath) {
    throw new Error("Telegram getFile returned no file_path");
  }

  const fetchImpl = proxyFetch ?? globalThis.fetch;
  if (!fetchImpl) {
    throw new Error("fetch is not available; set channels.telegram.proxy in config");
  }

  const downloaded = await fetchRemoteMedia({
    url: `https://api.telegram.org/file/bot${token}/${remotePath}`,
    fetchImpl,
    filePathHint: remotePath,
  });
  const stored = await saveMediaBuffer(
    downloaded.buffer,
    downloaded.contentType,
    "inbound",
    maxBytes,
  );

  // The placeholder follows the message fields, with photo taking precedence;
  // anything that is not photo/video/audio/voice counts as a document.
  const placeholder = message.photo
    ? "<media:image>"
    : message.video
      ? "<media:video>"
      : message.audio || message.voice
        ? "<media:audio>"
        : "<media:document>";

  return { path: stored.path, contentType: stored.contentType, placeholder };
}
/**
 * Builds the optional parameters shared by Telegram send calls.
 *
 * `reply_to_message_id` is included only for a truthy `replyToMessageId`;
 * `message_thread_id` only when `buildTelegramThreadParams` yields thread params
 * for `messageThreadId`.
 *
 * @param opts Optional reply target and thread id.
 * @returns A fresh object holding just the applicable keys (possibly empty).
 */
function buildTelegramSendParams(opts?: {
  replyToMessageId?: number;
  messageThreadId?: number;
}): Record<string, unknown> {
  const thread = buildTelegramThreadParams(opts?.messageThreadId);
  const replyTarget = opts?.replyToMessageId;
  return {
    ...(replyTarget ? { reply_to_message_id: replyTarget } : {}),
    ...(thread ? { message_thread_id: thread.message_thread_id } : {}),
  };
}
/**
 * Sends one text message with HTML formatting, retrying unformatted when the
 * send error matches `PARSE_ERR_RE`.
 *
 * In "html" mode `text` is sent as given; otherwise (default "markdown") it is
 * first converted with `markdownToTelegramHtml`. On a parse error the retry
 * sends `opts.plainText` (or `text` when absent) without `parse_mode`, keeping
 * the same reply/thread parameters.
 *
 * @param bot grammY bot whose `api.sendMessage` is used.
 * @param chatId Target chat.
 * @param text Message body, HTML or markdown depending on `opts.textMode`.
 * @param runtime Its optional `log` is told about the unformatted retry.
 * @param opts Reply target, thread id, text mode and plain-text fallback.
 * @returns The `message_id` of the message that was sent.
 * @throws Any send error that does not match `PARSE_ERR_RE`, and any error from the retry.
 */
async function sendTelegramText(
  bot: Bot,
  chatId: string,
  text: string,
  runtime: RuntimeEnv,
  opts?: {
    replyToMessageId?: number;
    messageThreadId?: number;
    textMode?: "markdown" | "html";
    plainText?: string;
  },
): Promise<number | undefined> {
  const sendParams = buildTelegramSendParams({
    replyToMessageId: opts?.replyToMessageId,
    messageThreadId: opts?.messageThreadId,
  });
  const mode = opts?.textMode ?? "markdown";
  const formatted = mode === "html" ? text : markdownToTelegramHtml(text);

  try {
    const delivered = await bot.api.sendMessage(chatId, formatted, {
      parse_mode: "HTML",
      ...sendParams,
    });
    return delivered.message_id;
  } catch (err) {
    const reason = formatErrorMessage(err);
    // Only markup parse failures are recoverable; everything else propagates.
    if (!PARSE_ERR_RE.test(reason)) {
      throw err;
    }
    runtime.log?.(`telegram HTML parse failed; retrying without formatting: ${reason}`);
    const unformatted = opts?.plainText ?? text;
    const delivered = await bot.api.sendMessage(chatId, unformatted, {
      ...sendParams,
    });
    return delivered.message_id;
  }
}