diff --git a/src/auto-reply/commands-registry.data.ts b/src/auto-reply/commands-registry.data.ts index 12fec300b..35c00892c 100644 --- a/src/auto-reply/commands-registry.data.ts +++ b/src/auto-reply/commands-registry.data.ts @@ -181,9 +181,23 @@ function buildChatCommands(): ChatCommandDefinition[] { defineChatCommand({ key: "tts", nativeName: "tts", - description: "Configure text-to-speech.", + description: "Control text-to-speech (TTS).", textAlias: "/tts", - acceptsArgs: true, + args: [ + { + name: "action", + description: "on | off | status | provider | limit | summary | audio | help", + type: "string", + choices: ["on", "off", "status", "provider", "limit", "summary", "audio", "help"], + }, + { + name: "value", + description: "Provider, limit, or text", + type: "string", + captureRemaining: true, + }, + ], + argsMenu: "auto", }), defineChatCommand({ key: "whoami", diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts index 5c65fb94c..bba7e2b02 100644 --- a/src/auto-reply/reply/commands-tts.ts +++ b/src/auto-reply/reply/commands-tts.ts @@ -6,20 +6,18 @@ import { getTtsMaxLength, getTtsProvider, isSummarizationEnabled, + isTtsEnabled, isTtsProviderConfigured, - normalizeTtsAutoMode, - resolveTtsAutoMode, resolveTtsApiKey, resolveTtsConfig, resolveTtsPrefsPath, - resolveTtsProviderOrder, setLastTtsAttempt, setSummarizationEnabled, + setTtsEnabled, setTtsMaxLength, setTtsProvider, textToSpeech, } from "../../tts/tts.js"; -import { updateSessionStore } from "../../config/sessions.js"; type ParsedTtsCommand = { action: string; @@ -27,11 +25,11 @@ type ParsedTtsCommand = { }; function parseTtsCommand(normalized: string): ParsedTtsCommand | null { - // Accept `/tts` and `/tts [args]` as a single control surface. - if (normalized === "/tts") return { action: "status", args: "" }; + // Accept `/tts [args]` - return null for `/tts` alone to trigger inline menu. + if (normalized === "/tts") return null; if (!normalized.startsWith("/tts ")) return null; const rest = normalized.slice(5).trim(); - if (!rest) return { action: "status", args: "" }; + if (!rest) return null; const [action, ...tail] = rest.split(/\s+/); return { action: action.toLowerCase(), args: tail.join(" ").trim() }; } @@ -40,14 +38,27 @@ function ttsUsage(): ReplyPayload { // Keep usage in one place so help/validation stays consistent. return { text: - "⚙️ Usage: /tts [value]" + - "\nExamples:\n" + - "/tts always\n" + - "/tts provider openai\n" + - "/tts provider edge\n" + - "/tts limit 2000\n" + - "/tts summary off\n" + - "/tts audio Hello from Clawdbot", + `🔊 **TTS (Text-to-Speech) Help**\n\n` + + `**Commands:**\n` + + `• /tts on — Enable automatic TTS for replies\n` + + `• /tts off — Disable TTS\n` + + `• /tts status — Show current settings\n` + + `• /tts provider [name] — View/change provider\n` + + `• /tts limit [number] — View/change text limit\n` + + `• /tts summary [on|off] — View/change auto-summary\n` + + `• /tts audio — Generate audio from text\n\n` + + `**Providers:**\n` + + `• edge — Free, fast (default)\n` + + `• openai — High quality (requires API key)\n` + + `• elevenlabs — Premium voices (requires API key)\n\n` + + `**Text Limit (default: 1500, max: 4096):**\n` + + `When text exceeds the limit:\n` + + `• Summary ON: AI summarizes, then generates audio\n` + + `• Summary OFF: Truncates text, then generates audio\n\n` + + `**Examples:**\n` + + `/tts provider edge\n` + + `/tts limit 2000\n` + + `/tts audio Hello, this is a test!`, }; } @@ -72,35 +83,27 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand return { shouldContinue: false, reply: ttsUsage() }; } - const requestedAuto = normalizeTtsAutoMode( - action === "on" ? "always" : action === "off" ? "off" : action, - ); - if (requestedAuto) { - const entry = params.sessionEntry; - const sessionKey = params.sessionKey; - const store = params.sessionStore; - if (entry && store && sessionKey) { - entry.ttsAuto = requestedAuto; - entry.updatedAt = Date.now(); - store[sessionKey] = entry; - if (params.storePath) { - await updateSessionStore(params.storePath, (store) => { - store[sessionKey] = entry; - }); - } - } - const label = requestedAuto === "always" ? "enabled (always)" : requestedAuto; - return { - shouldContinue: false, - reply: { - text: requestedAuto === "off" ? "🔇 TTS disabled." : `🔊 TTS ${label}.`, - }, - }; + if (action === "on") { + setTtsEnabled(prefsPath, true); + return { shouldContinue: false, reply: { text: "🔊 TTS enabled." } }; + } + + if (action === "off") { + setTtsEnabled(prefsPath, false); + return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } }; } if (action === "audio") { if (!args.trim()) { - return { shouldContinue: false, reply: ttsUsage() }; + return { + shouldContinue: false, + reply: { + text: + `🎤 Generate audio from text.\n\n` + + `Usage: /tts audio \n` + + `Example: /tts audio Hello, this is a test!`, + }, + }; } const start = Date.now(); @@ -146,9 +149,6 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand if (action === "provider") { const currentProvider = getTtsProvider(config, prefsPath); if (!args.trim()) { - const fallback = resolveTtsProviderOrder(currentProvider) - .slice(1) - .filter((provider) => isTtsProviderConfigured(config, provider)); const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai")); const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs")); const hasEdge = isTtsProviderConfigured(config, "edge"); @@ -158,7 +158,6 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand text: `🎙️ TTS provider\n` + `Primary: ${currentProvider}\n` + - `Fallbacks: ${fallback.join(", ") || "none"}\n` + `OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` + `ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` + `Edge enabled: ${hasEdge ? "✅" : "❌"}\n` + @@ -173,18 +172,9 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand } setTtsProvider(prefsPath, requested); - const fallback = resolveTtsProviderOrder(requested) - .slice(1) - .filter((provider) => isTtsProviderConfigured(config, provider)); return { shouldContinue: false, - reply: { - text: - `✅ TTS provider set to ${requested} (fallbacks: ${fallback.join(", ") || "none"}).` + - (requested === "edge" - ? "\nEnable Edge TTS in config: messages.tts.edge.enabled = true." - : ""), - }, + reply: { text: `✅ TTS provider set to ${requested}.` }, }; } @@ -193,12 +183,22 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand const currentLimit = getTtsMaxLength(prefsPath); return { shouldContinue: false, - reply: { text: `📏 TTS limit: ${currentLimit} characters.` }, + reply: { + text: + `📏 TTS limit: ${currentLimit} characters.\n\n` + + `Text longer than this triggers summary (if enabled).\n` + + `Range: 100-4096 chars (Telegram max).\n\n` + + `To change: /tts limit \n` + + `Example: /tts limit 2000`, + }, }; } const next = Number.parseInt(args.trim(), 10); - if (!Number.isFinite(next) || next < 100 || next > 10_000) { - return { shouldContinue: false, reply: ttsUsage() }; + if (!Number.isFinite(next) || next < 100 || next > 4096) { + return { + shouldContinue: false, + reply: { text: "❌ Limit must be between 100 and 4096 characters." }, + }; } setTtsMaxLength(prefsPath, next); return { @@ -210,9 +210,17 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand if (action === "summary") { if (!args.trim()) { const enabled = isSummarizationEnabled(prefsPath); + const maxLen = getTtsMaxLength(prefsPath); return { shouldContinue: false, - reply: { text: `📝 TTS auto-summary: ${enabled ? "on" : "off"}.` }, + reply: { + text: + `📝 TTS auto-summary: ${enabled ? "on" : "off"}.\n\n` + + `When text exceeds ${maxLen} chars:\n` + + `• ON: summarizes text, then generates audio\n` + + `• OFF: truncates text, then generates audio\n\n` + + `To change: /tts summary on | off`, + }, }; } const requested = args.trim().toLowerCase(); @@ -229,27 +237,16 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand } if (action === "status") { - const sessionAuto = params.sessionEntry?.ttsAuto; - const autoMode = resolveTtsAutoMode({ config, prefsPath, sessionAuto }); - const enabled = autoMode !== "off"; + const enabled = isTtsEnabled(config, prefsPath); const provider = getTtsProvider(config, prefsPath); const hasKey = isTtsProviderConfigured(config, provider); - const providerStatus = - provider === "edge" - ? hasKey - ? "✅ enabled" - : "❌ disabled" - : hasKey - ? "✅ key" - : "❌ no key"; const maxLength = getTtsMaxLength(prefsPath); const summarize = isSummarizationEnabled(prefsPath); const last = getLastTtsAttempt(); - const autoLabel = sessionAuto ? `${autoMode} (session)` : autoMode; const lines = [ "📊 TTS status", - `Auto: ${enabled ? autoLabel : "off"}`, - `Provider: ${provider} (${providerStatus})`, + `State: ${enabled ? "✅ enabled" : "❌ disabled"}`, + `Provider: ${provider} (${hasKey ? "✅ configured" : "❌ not configured"})`, `Text limit: ${maxLength} chars`, `Auto-summary: ${summarize ? "on" : "off"}`, ]; diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts index f946c05f9..f1e11b416 100644 --- a/src/auto-reply/reply/dispatch-from-config.ts +++ b/src/auto-reply/reply/dispatch-from-config.ts @@ -266,12 +266,26 @@ export async function dispatchReplyFromConfig(params: { return { queuedFinal, counts }; } + // Track accumulated block text for TTS generation after streaming completes. + // When block streaming succeeds, there's no final reply, so we need to generate + // TTS audio separately from the accumulated block content. + let accumulatedBlockText = ""; + let blockCount = 0; + const replyResult = await (params.replyResolver ?? getReplyFromConfig)( ctx, { ...params.replyOptions, onBlockReply: (payload: ReplyPayload, context) => { const run = async () => { + // Accumulate block text for TTS generation after streaming + if (payload.text) { + if (accumulatedBlockText.length > 0) { + accumulatedBlockText += "\n"; + } + accumulatedBlockText += payload.text; + blockCount++; + } const ttsPayload = await maybeApplyTtsToPayload({ payload, cfg, @@ -327,6 +341,50 @@ export async function dispatchReplyFromConfig(params: { queuedFinal = dispatcher.sendFinalReply(ttsReply) || queuedFinal; } } + + // Generate TTS-only reply after block streaming completes (when there's no final reply). + // This handles the case where block streaming succeeds and drops final payloads, + // but we still want TTS audio to be generated from the accumulated block content. + if (replies.length === 0 && blockCount > 0 && accumulatedBlockText.trim()) { + const ttsSyntheticReply = await maybeApplyTtsToPayload({ + payload: { text: accumulatedBlockText }, + cfg, + channel: ttsChannel, + kind: "final", + inboundAudio, + ttsAuto: sessionTtsAuto, + }); + // Only send if TTS was actually applied (mediaUrl exists) + if (ttsSyntheticReply.mediaUrl) { + // Send TTS-only payload (no text, just audio) so it doesn't duplicate the block content + const ttsOnlyPayload: ReplyPayload = { + mediaUrl: ttsSyntheticReply.mediaUrl, + audioAsVoice: ttsSyntheticReply.audioAsVoice, + }; + if (shouldRouteToOriginating && originatingChannel && originatingTo) { + const result = await routeReply({ + payload: ttsOnlyPayload, + channel: originatingChannel, + to: originatingTo, + sessionKey: ctx.SessionKey, + accountId: ctx.AccountId, + threadId: ctx.MessageThreadId, + cfg, + }); + queuedFinal = result.ok || queuedFinal; + if (result.ok) routedFinalCount += 1; + if (!result.ok) { + logVerbose( + `dispatch-from-config: route-reply (tts-only) failed: ${result.error ?? "unknown error"}`, + ); + } + } else { + const didQueue = dispatcher.sendFinalReply(ttsOnlyPayload); + queuedFinal = didQueue || queuedFinal; + } + } + } + await dispatcher.waitForIdle(); const counts = dispatcher.getQueuedCounts(); diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 847876d04..9507c5535 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -40,7 +40,7 @@ import { resolveModel } from "../agents/pi-embedded-runner/model.js"; const DEFAULT_TIMEOUT_MS = 30_000; const DEFAULT_TTS_MAX_LENGTH = 1500; const DEFAULT_TTS_SUMMARIZE = true; -const DEFAULT_MAX_TEXT_LENGTH = 4000; +const DEFAULT_MAX_TEXT_LENGTH = 4096; const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io"; @@ -1386,32 +1386,34 @@ export async function maybeApplyTtsToPayload(params: { if (textForAudio.length > maxLength) { if (!isSummarizationEnabled(prefsPath)) { + // Truncate text when summarization is disabled logVerbose( - `TTS: skipping long text (${textForAudio.length} > ${maxLength}), summarization disabled.`, + `TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`, ); - return nextPayload; - } - - try { - const summary = await summarizeText({ - text: textForAudio, - targetLength: maxLength, - cfg: params.cfg, - config, - timeoutMs: config.timeoutMs, - }); - textForAudio = summary.summary; - wasSummarized = true; - if (textForAudio.length > config.maxTextLength) { - logVerbose( - `TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`, - ); - textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`; + textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`; + } else { + // Summarize text when enabled + try { + const summary = await summarizeText({ + text: textForAudio, + targetLength: maxLength, + cfg: params.cfg, + config, + timeoutMs: config.timeoutMs, + }); + textForAudio = summary.summary; + wasSummarized = true; + if (textForAudio.length > config.maxTextLength) { + logVerbose( + `TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`, + ); + textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`; + } + } catch (err) { + const error = err as Error; + logVerbose(`TTS: summarization failed, truncating instead: ${error.message}`); + textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`; } - } catch (err) { - const error = err as Error; - logVerbose(`TTS: summarization failed: ${error.message}`); - return nextPayload; } } @@ -1436,12 +1438,12 @@ export async function maybeApplyTtsToPayload(params: { const channelId = resolveChannelId(params.channel); const shouldVoice = channelId === "telegram" && result.voiceCompatible === true; - - return { + const finalPayload = { ...nextPayload, mediaUrl: result.audioPath, audioAsVoice: shouldVoice || params.payload.audioAsVoice, }; + return finalPayload; } lastTtsAttempt = {