From 938a9ab627c64b33c58663798218b2d73515bbf2 Mon Sep 17 00:00:00 2001 From: Glucksberg Date: Sun, 25 Jan 2026 19:42:06 +0000 Subject: [PATCH] fix(tts): generate audio when block streaming drops final reply When block streaming succeeds, final replies are dropped but TTS was only applied to final replies. Fix by accumulating block text during streaming and generating TTS-only audio after streaming completes. Also: - Change truncate vs skip behavior when summary OFF (now truncates) - Align TTS limits with Telegram max (4096 chars) - Improve /tts command help messages with examples - Add newline separator between accumulated blocks --- src/auto-reply/commands-registry.data.ts | 18 ++- src/auto-reply/reply/commands-tts.ts | 141 +++++++++---------- src/auto-reply/reply/dispatch-from-config.ts | 58 ++++++++ src/tts/tts.ts | 54 +++---- 4 files changed, 171 insertions(+), 100 deletions(-) diff --git a/src/auto-reply/commands-registry.data.ts b/src/auto-reply/commands-registry.data.ts index 12fec300b..35c00892c 100644 --- a/src/auto-reply/commands-registry.data.ts +++ b/src/auto-reply/commands-registry.data.ts @@ -181,9 +181,23 @@ function buildChatCommands(): ChatCommandDefinition[] { defineChatCommand({ key: "tts", nativeName: "tts", - description: "Configure text-to-speech.", + description: "Control text-to-speech (TTS).", textAlias: "/tts", - acceptsArgs: true, + args: [ + { + name: "action", + description: "on | off | status | provider | limit | summary | audio | help", + type: "string", + choices: ["on", "off", "status", "provider", "limit", "summary", "audio", "help"], + }, + { + name: "value", + description: "Provider, limit, or text", + type: "string", + captureRemaining: true, + }, + ], + argsMenu: "auto", }), defineChatCommand({ key: "whoami", diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts index 5c65fb94c..bba7e2b02 100644 --- a/src/auto-reply/reply/commands-tts.ts +++ b/src/auto-reply/reply/commands-tts.ts @@ -6,20 +6,18 @@ import { getTtsMaxLength, getTtsProvider, isSummarizationEnabled, + isTtsEnabled, isTtsProviderConfigured, - normalizeTtsAutoMode, - resolveTtsAutoMode, resolveTtsApiKey, resolveTtsConfig, resolveTtsPrefsPath, - resolveTtsProviderOrder, setLastTtsAttempt, setSummarizationEnabled, + setTtsEnabled, setTtsMaxLength, setTtsProvider, textToSpeech, } from "../../tts/tts.js"; -import { updateSessionStore } from "../../config/sessions.js"; type ParsedTtsCommand = { action: string; @@ -27,11 +25,11 @@ type ParsedTtsCommand = { }; function parseTtsCommand(normalized: string): ParsedTtsCommand | null { - // Accept `/tts` and `/tts [args]` as a single control surface. - if (normalized === "/tts") return { action: "status", args: "" }; + // Accept `/tts [args]` - return null for `/tts` alone to trigger inline menu. + if (normalized === "/tts") return null; if (!normalized.startsWith("/tts ")) return null; const rest = normalized.slice(5).trim(); - if (!rest) return { action: "status", args: "" }; + if (!rest) return null; const [action, ...tail] = rest.split(/\s+/); return { action: action.toLowerCase(), args: tail.join(" ").trim() }; } @@ -40,14 +38,27 @@ function ttsUsage(): ReplyPayload { // Keep usage in one place so help/validation stays consistent. return { text: - "⚙️ Usage: /tts [value]" + - "\nExamples:\n" + - "/tts always\n" + - "/tts provider openai\n" + - "/tts provider edge\n" + - "/tts limit 2000\n" + - "/tts summary off\n" + - "/tts audio Hello from Clawdbot", + `🔊 **TTS (Text-to-Speech) Help**\n\n` + + `**Commands:**\n` + + `• /tts on — Enable automatic TTS for replies\n` + + `• /tts off — Disable TTS\n` + + `• /tts status — Show current settings\n` + + `• /tts provider [name] — View/change provider\n` + + `• /tts limit [number] — View/change text limit\n` + + `• /tts summary [on|off] — View/change auto-summary\n` + + `• /tts audio — Generate audio from text\n\n` + + `**Providers:**\n` + + `• edge — Free, fast (default)\n` + + `• openai — High quality (requires API key)\n` + + `• elevenlabs — Premium voices (requires API key)\n\n` + + `**Text Limit (default: 1500, max: 4096):**\n` + + `When text exceeds the limit:\n` + + `• Summary ON: AI summarizes, then generates audio\n` + + `• Summary OFF: Truncates text, then generates audio\n\n` + + `**Examples:**\n` + + `/tts provider edge\n` + + `/tts limit 2000\n` + + `/tts audio Hello, this is a test!`, }; } @@ -72,35 +83,27 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand return { shouldContinue: false, reply: ttsUsage() }; } - const requestedAuto = normalizeTtsAutoMode( - action === "on" ? "always" : action === "off" ? "off" : action, - ); - if (requestedAuto) { - const entry = params.sessionEntry; - const sessionKey = params.sessionKey; - const store = params.sessionStore; - if (entry && store && sessionKey) { - entry.ttsAuto = requestedAuto; - entry.updatedAt = Date.now(); - store[sessionKey] = entry; - if (params.storePath) { - await updateSessionStore(params.storePath, (store) => { - store[sessionKey] = entry; - }); - } - } - const label = requestedAuto === "always" ? "enabled (always)" : requestedAuto; - return { - shouldContinue: false, - reply: { - text: requestedAuto === "off" ? "🔇 TTS disabled." : `🔊 TTS ${label}.`, - }, - }; + if (action === "on") { + setTtsEnabled(prefsPath, true); + return { shouldContinue: false, reply: { text: "🔊 TTS enabled." } }; + } + + if (action === "off") { + setTtsEnabled(prefsPath, false); + return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } }; } if (action === "audio") { if (!args.trim()) { - return { shouldContinue: false, reply: ttsUsage() }; + return { + shouldContinue: false, + reply: { + text: + `🎤 Generate audio from text.\n\n` + + `Usage: /tts audio \n` + + `Example: /tts audio Hello, this is a test!`, + }, + }; } const start = Date.now(); @@ -146,9 +149,6 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand if (action === "provider") { const currentProvider = getTtsProvider(config, prefsPath); if (!args.trim()) { - const fallback = resolveTtsProviderOrder(currentProvider) - .slice(1) - .filter((provider) => isTtsProviderConfigured(config, provider)); const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai")); const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs")); const hasEdge = isTtsProviderConfigured(config, "edge"); @@ -158,7 +158,6 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand text: `🎙️ TTS provider\n` + `Primary: ${currentProvider}\n` + - `Fallbacks: ${fallback.join(", ") || "none"}\n` + `OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` + `ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` + `Edge enabled: ${hasEdge ? "✅" : "❌"}\n` + @@ -173,18 +172,9 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand } setTtsProvider(prefsPath, requested); - const fallback = resolveTtsProviderOrder(requested) - .slice(1) - .filter((provider) => isTtsProviderConfigured(config, provider)); return { shouldContinue: false, - reply: { - text: - `✅ TTS provider set to ${requested} (fallbacks: ${fallback.join(", ") || "none"}).` + - (requested === "edge" - ? "\nEnable Edge TTS in config: messages.tts.edge.enabled = true." - : ""), - }, + reply: { text: `✅ TTS provider set to ${requested}.` }, }; } @@ -193,12 +183,22 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand const currentLimit = getTtsMaxLength(prefsPath); return { shouldContinue: false, - reply: { text: `📏 TTS limit: ${currentLimit} characters.` }, + reply: { + text: + `📏 TTS limit: ${currentLimit} characters.\n\n` + + `Text longer than this triggers summary (if enabled).\n` + + `Range: 100-4096 chars (Telegram max).\n\n` + + `To change: /tts limit \n` + + `Example: /tts limit 2000`, + }, }; } const next = Number.parseInt(args.trim(), 10); - if (!Number.isFinite(next) || next < 100 || next > 10_000) { - return { shouldContinue: false, reply: ttsUsage() }; + if (!Number.isFinite(next) || next < 100 || next > 4096) { + return { + shouldContinue: false, + reply: { text: "❌ Limit must be between 100 and 4096 characters." }, + }; } setTtsMaxLength(prefsPath, next); return { @@ -210,9 +210,17 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand if (action === "summary") { if (!args.trim()) { const enabled = isSummarizationEnabled(prefsPath); + const maxLen = getTtsMaxLength(prefsPath); return { shouldContinue: false, - reply: { text: `📝 TTS auto-summary: ${enabled ? "on" : "off"}.` }, + reply: { + text: + `📝 TTS auto-summary: ${enabled ? "on" : "off"}.\n\n` + + `When text exceeds ${maxLen} chars:\n` + + `• ON: summarizes text, then generates audio\n` + + `• OFF: truncates text, then generates audio\n\n` + + `To change: /tts summary on | off`, + }, }; } const requested = args.trim().toLowerCase(); @@ -229,27 +237,16 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand } if (action === "status") { - const sessionAuto = params.sessionEntry?.ttsAuto; - const autoMode = resolveTtsAutoMode({ config, prefsPath, sessionAuto }); - const enabled = autoMode !== "off"; + const enabled = isTtsEnabled(config, prefsPath); const provider = getTtsProvider(config, prefsPath); const hasKey = isTtsProviderConfigured(config, provider); - const providerStatus = - provider === "edge" - ? hasKey - ? "✅ enabled" - : "❌ disabled" - : hasKey - ? "✅ key" - : "❌ no key"; const maxLength = getTtsMaxLength(prefsPath); const summarize = isSummarizationEnabled(prefsPath); const last = getLastTtsAttempt(); - const autoLabel = sessionAuto ? `${autoMode} (session)` : autoMode; const lines = [ "📊 TTS status", - `Auto: ${enabled ? autoLabel : "off"}`, - `Provider: ${provider} (${providerStatus})`, + `State: ${enabled ? "✅ enabled" : "❌ disabled"}`, + `Provider: ${provider} (${hasKey ? "✅ configured" : "❌ not configured"})`, `Text limit: ${maxLength} chars`, `Auto-summary: ${summarize ? "on" : "off"}`, ]; diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts index f946c05f9..f1e11b416 100644 --- a/src/auto-reply/reply/dispatch-from-config.ts +++ b/src/auto-reply/reply/dispatch-from-config.ts @@ -266,12 +266,26 @@ export async function dispatchReplyFromConfig(params: { return { queuedFinal, counts }; } + // Track accumulated block text for TTS generation after streaming completes. + // When block streaming succeeds, there's no final reply, so we need to generate + // TTS audio separately from the accumulated block content. + let accumulatedBlockText = ""; + let blockCount = 0; + const replyResult = await (params.replyResolver ?? getReplyFromConfig)( ctx, { ...params.replyOptions, onBlockReply: (payload: ReplyPayload, context) => { const run = async () => { + // Accumulate block text for TTS generation after streaming + if (payload.text) { + if (accumulatedBlockText.length > 0) { + accumulatedBlockText += "\n"; + } + accumulatedBlockText += payload.text; + blockCount++; + } const ttsPayload = await maybeApplyTtsToPayload({ payload, cfg, @@ -327,6 +341,50 @@ export async function dispatchReplyFromConfig(params: { queuedFinal = dispatcher.sendFinalReply(ttsReply) || queuedFinal; } } + + // Generate TTS-only reply after block streaming completes (when there's no final reply). + // This handles the case where block streaming succeeds and drops final payloads, + // but we still want TTS audio to be generated from the accumulated block content. + if (replies.length === 0 && blockCount > 0 && accumulatedBlockText.trim()) { + const ttsSyntheticReply = await maybeApplyTtsToPayload({ + payload: { text: accumulatedBlockText }, + cfg, + channel: ttsChannel, + kind: "final", + inboundAudio, + ttsAuto: sessionTtsAuto, + }); + // Only send if TTS was actually applied (mediaUrl exists) + if (ttsSyntheticReply.mediaUrl) { + // Send TTS-only payload (no text, just audio) so it doesn't duplicate the block content + const ttsOnlyPayload: ReplyPayload = { + mediaUrl: ttsSyntheticReply.mediaUrl, + audioAsVoice: ttsSyntheticReply.audioAsVoice, + }; + if (shouldRouteToOriginating && originatingChannel && originatingTo) { + const result = await routeReply({ + payload: ttsOnlyPayload, + channel: originatingChannel, + to: originatingTo, + sessionKey: ctx.SessionKey, + accountId: ctx.AccountId, + threadId: ctx.MessageThreadId, + cfg, + }); + queuedFinal = result.ok || queuedFinal; + if (result.ok) routedFinalCount += 1; + if (!result.ok) { + logVerbose( + `dispatch-from-config: route-reply (tts-only) failed: ${result.error ?? "unknown error"}`, + ); + } + } else { + const didQueue = dispatcher.sendFinalReply(ttsOnlyPayload); + queuedFinal = didQueue || queuedFinal; + } + } + } + await dispatcher.waitForIdle(); const counts = dispatcher.getQueuedCounts(); diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 847876d04..9507c5535 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -40,7 +40,7 @@ import { resolveModel } from "../agents/pi-embedded-runner/model.js"; const DEFAULT_TIMEOUT_MS = 30_000; const DEFAULT_TTS_MAX_LENGTH = 1500; const DEFAULT_TTS_SUMMARIZE = true; -const DEFAULT_MAX_TEXT_LENGTH = 4000; +const DEFAULT_MAX_TEXT_LENGTH = 4096; const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io"; @@ -1386,32 +1386,34 @@ export async function maybeApplyTtsToPayload(params: { if (textForAudio.length > maxLength) { if (!isSummarizationEnabled(prefsPath)) { + // Truncate text when summarization is disabled logVerbose( - `TTS: skipping long text (${textForAudio.length} > ${maxLength}), summarization disabled.`, + `TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`, ); - return nextPayload; - } - - try { - const summary = await summarizeText({ - text: textForAudio, - targetLength: maxLength, - cfg: params.cfg, - config, - timeoutMs: config.timeoutMs, - }); - textForAudio = summary.summary; - wasSummarized = true; - if (textForAudio.length > config.maxTextLength) { - logVerbose( - `TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`, - ); - textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`; + textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`; + } else { + // Summarize text when enabled + try { + const summary = await summarizeText({ + text: textForAudio, + targetLength: maxLength, + cfg: params.cfg, + config, + timeoutMs: config.timeoutMs, + }); + textForAudio = summary.summary; + wasSummarized = true; + if (textForAudio.length > config.maxTextLength) { + logVerbose( + `TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`, + ); + textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`; + } + } catch (err) { + const error = err as Error; + logVerbose(`TTS: summarization failed, truncating instead: ${error.message}`); + textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`; } - } catch (err) { - const error = err as Error; - logVerbose(`TTS: summarization failed: ${error.message}`); - return nextPayload; } } @@ -1436,12 +1438,12 @@ export async function maybeApplyTtsToPayload(params: { const channelId = resolveChannelId(params.channel); const shouldVoice = channelId === "telegram" && result.voiceCompatible === true; - - return { + const finalPayload = { ...nextPayload, mediaUrl: result.audioPath, audioAsVoice: shouldVoice || params.payload.audioAsVoice, }; + return finalPayload; } lastTtsAttempt = {