fix(tts): generate audio when block streaming drops final reply

When block streaming succeeds, final replies are dropped — but TTS was only
applied to final replies, so no audio was ever generated. Fix by accumulating
block text during streaming and generating TTS-only audio after streaming
completes.

Also:
- Change long-text handling when summary is OFF: previously TTS was skipped entirely, now the text is truncated and audio is still generated
- Align TTS limits with Telegram max (4096 chars)
- Improve /tts command help messages with examples
- Add newline separator between accumulated blocks
This commit is contained in:
Glucksberg 2026-01-25 19:42:06 +00:00 committed by Shadow
parent 14f8acdecb
commit 938a9ab627
No known key found for this signature in database
4 changed files with 171 additions and 100 deletions

View File

@ -181,9 +181,23 @@ function buildChatCommands(): ChatCommandDefinition[] {
defineChatCommand({
key: "tts",
nativeName: "tts",
description: "Configure text-to-speech.",
description: "Control text-to-speech (TTS).",
textAlias: "/tts",
acceptsArgs: true,
args: [
{
name: "action",
description: "on | off | status | provider | limit | summary | audio | help",
type: "string",
choices: ["on", "off", "status", "provider", "limit", "summary", "audio", "help"],
},
{
name: "value",
description: "Provider, limit, or text",
type: "string",
captureRemaining: true,
},
],
argsMenu: "auto",
}),
defineChatCommand({
key: "whoami",

View File

@ -6,20 +6,18 @@ import {
getTtsMaxLength,
getTtsProvider,
isSummarizationEnabled,
isTtsEnabled,
isTtsProviderConfigured,
normalizeTtsAutoMode,
resolveTtsAutoMode,
resolveTtsApiKey,
resolveTtsConfig,
resolveTtsPrefsPath,
resolveTtsProviderOrder,
setLastTtsAttempt,
setSummarizationEnabled,
setTtsEnabled,
setTtsMaxLength,
setTtsProvider,
textToSpeech,
} from "../../tts/tts.js";
import { updateSessionStore } from "../../config/sessions.js";
type ParsedTtsCommand = {
action: string;
@ -27,11 +25,11 @@ type ParsedTtsCommand = {
};
function parseTtsCommand(normalized: string): ParsedTtsCommand | null {
// Accept `/tts` and `/tts <action> [args]` as a single control surface.
if (normalized === "/tts") return { action: "status", args: "" };
// Accept `/tts <action> [args]` - return null for `/tts` alone to trigger inline menu.
if (normalized === "/tts") return null;
if (!normalized.startsWith("/tts ")) return null;
const rest = normalized.slice(5).trim();
if (!rest) return { action: "status", args: "" };
if (!rest) return null;
const [action, ...tail] = rest.split(/\s+/);
return { action: action.toLowerCase(), args: tail.join(" ").trim() };
}
@ -40,14 +38,27 @@ function ttsUsage(): ReplyPayload {
// Keep usage in one place so help/validation stays consistent.
return {
text:
"⚙️ Usage: /tts <off|always|inbound|tagged|status|provider|limit|summary|audio> [value]" +
"\nExamples:\n" +
"/tts always\n" +
"/tts provider openai\n" +
"/tts provider edge\n" +
"/tts limit 2000\n" +
"/tts summary off\n" +
"/tts audio Hello from Clawdbot",
`🔊 **TTS (Text-to-Speech) Help**\n\n` +
`**Commands:**\n` +
`• /tts on — Enable automatic TTS for replies\n` +
`• /tts off — Disable TTS\n` +
`• /tts status — Show current settings\n` +
`• /tts provider [name] — View/change provider\n` +
`• /tts limit [number] — View/change text limit\n` +
`• /tts summary [on|off] — View/change auto-summary\n` +
`• /tts audio <text> — Generate audio from text\n\n` +
`**Providers:**\n` +
`• edge — Free, fast (default)\n` +
`• openai — High quality (requires API key)\n` +
`• elevenlabs — Premium voices (requires API key)\n\n` +
`**Text Limit (default: 1500, max: 4096):**\n` +
`When text exceeds the limit:\n` +
`• Summary ON: AI summarizes, then generates audio\n` +
`• Summary OFF: Truncates text, then generates audio\n\n` +
`**Examples:**\n` +
`/tts provider edge\n` +
`/tts limit 2000\n` +
`/tts audio Hello, this is a test!`,
};
}
@ -72,35 +83,27 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
return { shouldContinue: false, reply: ttsUsage() };
}
const requestedAuto = normalizeTtsAutoMode(
action === "on" ? "always" : action === "off" ? "off" : action,
);
if (requestedAuto) {
const entry = params.sessionEntry;
const sessionKey = params.sessionKey;
const store = params.sessionStore;
if (entry && store && sessionKey) {
entry.ttsAuto = requestedAuto;
entry.updatedAt = Date.now();
store[sessionKey] = entry;
if (params.storePath) {
await updateSessionStore(params.storePath, (store) => {
store[sessionKey] = entry;
});
}
}
const label = requestedAuto === "always" ? "enabled (always)" : requestedAuto;
return {
shouldContinue: false,
reply: {
text: requestedAuto === "off" ? "🔇 TTS disabled." : `🔊 TTS ${label}.`,
},
};
if (action === "on") {
setTtsEnabled(prefsPath, true);
return { shouldContinue: false, reply: { text: "🔊 TTS enabled." } };
}
if (action === "off") {
setTtsEnabled(prefsPath, false);
return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } };
}
if (action === "audio") {
if (!args.trim()) {
return { shouldContinue: false, reply: ttsUsage() };
return {
shouldContinue: false,
reply: {
text:
`🎤 Generate audio from text.\n\n` +
`Usage: /tts audio <text>\n` +
`Example: /tts audio Hello, this is a test!`,
},
};
}
const start = Date.now();
@ -146,9 +149,6 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
if (action === "provider") {
const currentProvider = getTtsProvider(config, prefsPath);
if (!args.trim()) {
const fallback = resolveTtsProviderOrder(currentProvider)
.slice(1)
.filter((provider) => isTtsProviderConfigured(config, provider));
const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
const hasEdge = isTtsProviderConfigured(config, "edge");
@ -158,7 +158,6 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
text:
`🎙️ TTS provider\n` +
`Primary: ${currentProvider}\n` +
`Fallbacks: ${fallback.join(", ") || "none"}\n` +
`OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
`ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
`Edge enabled: ${hasEdge ? "✅" : "❌"}\n` +
@ -173,18 +172,9 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
}
setTtsProvider(prefsPath, requested);
const fallback = resolveTtsProviderOrder(requested)
.slice(1)
.filter((provider) => isTtsProviderConfigured(config, provider));
return {
shouldContinue: false,
reply: {
text:
`✅ TTS provider set to ${requested} (fallbacks: ${fallback.join(", ") || "none"}).` +
(requested === "edge"
? "\nEnable Edge TTS in config: messages.tts.edge.enabled = true."
: ""),
},
reply: { text: `✅ TTS provider set to ${requested}.` },
};
}
@ -193,12 +183,22 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
const currentLimit = getTtsMaxLength(prefsPath);
return {
shouldContinue: false,
reply: { text: `📏 TTS limit: ${currentLimit} characters.` },
reply: {
text:
`📏 TTS limit: ${currentLimit} characters.\n\n` +
`Text longer than this triggers summary (if enabled).\n` +
`Range: 100-4096 chars (Telegram max).\n\n` +
`To change: /tts limit <number>\n` +
`Example: /tts limit 2000`,
},
};
}
const next = Number.parseInt(args.trim(), 10);
if (!Number.isFinite(next) || next < 100 || next > 10_000) {
return { shouldContinue: false, reply: ttsUsage() };
if (!Number.isFinite(next) || next < 100 || next > 4096) {
return {
shouldContinue: false,
reply: { text: "❌ Limit must be between 100 and 4096 characters." },
};
}
setTtsMaxLength(prefsPath, next);
return {
@ -210,9 +210,17 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
if (action === "summary") {
if (!args.trim()) {
const enabled = isSummarizationEnabled(prefsPath);
const maxLen = getTtsMaxLength(prefsPath);
return {
shouldContinue: false,
reply: { text: `📝 TTS auto-summary: ${enabled ? "on" : "off"}.` },
reply: {
text:
`📝 TTS auto-summary: ${enabled ? "on" : "off"}.\n\n` +
`When text exceeds ${maxLen} chars:\n` +
`• ON: summarizes text, then generates audio\n` +
`• OFF: truncates text, then generates audio\n\n` +
`To change: /tts summary on | off`,
},
};
}
const requested = args.trim().toLowerCase();
@ -229,27 +237,16 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
}
if (action === "status") {
const sessionAuto = params.sessionEntry?.ttsAuto;
const autoMode = resolveTtsAutoMode({ config, prefsPath, sessionAuto });
const enabled = autoMode !== "off";
const enabled = isTtsEnabled(config, prefsPath);
const provider = getTtsProvider(config, prefsPath);
const hasKey = isTtsProviderConfigured(config, provider);
const providerStatus =
provider === "edge"
? hasKey
? "✅ enabled"
: "❌ disabled"
: hasKey
? "✅ key"
: "❌ no key";
const maxLength = getTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath);
const last = getLastTtsAttempt();
const autoLabel = sessionAuto ? `${autoMode} (session)` : autoMode;
const lines = [
"📊 TTS status",
`Auto: ${enabled ? autoLabel : "off"}`,
`Provider: ${provider} (${providerStatus})`,
`State: ${enabled ? "✅ enabled" : "❌ disabled"}`,
`Provider: ${provider} (${hasKey ? "✅ configured" : "❌ not configured"})`,
`Text limit: ${maxLength} chars`,
`Auto-summary: ${summarize ? "on" : "off"}`,
];

View File

@ -266,12 +266,26 @@ export async function dispatchReplyFromConfig(params: {
return { queuedFinal, counts };
}
// Track accumulated block text for TTS generation after streaming completes.
// When block streaming succeeds, there's no final reply, so we need to generate
// TTS audio separately from the accumulated block content.
let accumulatedBlockText = "";
let blockCount = 0;
const replyResult = await (params.replyResolver ?? getReplyFromConfig)(
ctx,
{
...params.replyOptions,
onBlockReply: (payload: ReplyPayload, context) => {
const run = async () => {
// Accumulate block text for TTS generation after streaming
if (payload.text) {
if (accumulatedBlockText.length > 0) {
accumulatedBlockText += "\n";
}
accumulatedBlockText += payload.text;
blockCount++;
}
const ttsPayload = await maybeApplyTtsToPayload({
payload,
cfg,
@ -327,6 +341,50 @@ export async function dispatchReplyFromConfig(params: {
queuedFinal = dispatcher.sendFinalReply(ttsReply) || queuedFinal;
}
}
// Generate TTS-only reply after block streaming completes (when there's no final reply).
// This handles the case where block streaming succeeds and drops final payloads,
// but we still want TTS audio to be generated from the accumulated block content.
if (replies.length === 0 && blockCount > 0 && accumulatedBlockText.trim()) {
const ttsSyntheticReply = await maybeApplyTtsToPayload({
payload: { text: accumulatedBlockText },
cfg,
channel: ttsChannel,
kind: "final",
inboundAudio,
ttsAuto: sessionTtsAuto,
});
// Only send if TTS was actually applied (mediaUrl exists)
if (ttsSyntheticReply.mediaUrl) {
// Send TTS-only payload (no text, just audio) so it doesn't duplicate the block content
const ttsOnlyPayload: ReplyPayload = {
mediaUrl: ttsSyntheticReply.mediaUrl,
audioAsVoice: ttsSyntheticReply.audioAsVoice,
};
if (shouldRouteToOriginating && originatingChannel && originatingTo) {
const result = await routeReply({
payload: ttsOnlyPayload,
channel: originatingChannel,
to: originatingTo,
sessionKey: ctx.SessionKey,
accountId: ctx.AccountId,
threadId: ctx.MessageThreadId,
cfg,
});
queuedFinal = result.ok || queuedFinal;
if (result.ok) routedFinalCount += 1;
if (!result.ok) {
logVerbose(
`dispatch-from-config: route-reply (tts-only) failed: ${result.error ?? "unknown error"}`,
);
}
} else {
const didQueue = dispatcher.sendFinalReply(ttsOnlyPayload);
queuedFinal = didQueue || queuedFinal;
}
}
}
await dispatcher.waitForIdle();
const counts = dispatcher.getQueuedCounts();

View File

@ -40,7 +40,7 @@ import { resolveModel } from "../agents/pi-embedded-runner/model.js";
const DEFAULT_TIMEOUT_MS = 30_000;
const DEFAULT_TTS_MAX_LENGTH = 1500;
const DEFAULT_TTS_SUMMARIZE = true;
const DEFAULT_MAX_TEXT_LENGTH = 4000;
const DEFAULT_MAX_TEXT_LENGTH = 4096;
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
@ -1386,32 +1386,34 @@ export async function maybeApplyTtsToPayload(params: {
if (textForAudio.length > maxLength) {
if (!isSummarizationEnabled(prefsPath)) {
// Truncate text when summarization is disabled
logVerbose(
`TTS: skipping long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
`TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
);
return nextPayload;
}
try {
const summary = await summarizeText({
text: textForAudio,
targetLength: maxLength,
cfg: params.cfg,
config,
timeoutMs: config.timeoutMs,
});
textForAudio = summary.summary;
wasSummarized = true;
if (textForAudio.length > config.maxTextLength) {
logVerbose(
`TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
);
textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`;
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
} else {
// Summarize text when enabled
try {
const summary = await summarizeText({
text: textForAudio,
targetLength: maxLength,
cfg: params.cfg,
config,
timeoutMs: config.timeoutMs,
});
textForAudio = summary.summary;
wasSummarized = true;
if (textForAudio.length > config.maxTextLength) {
logVerbose(
`TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
);
textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`;
}
} catch (err) {
const error = err as Error;
logVerbose(`TTS: summarization failed, truncating instead: ${error.message}`);
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
}
} catch (err) {
const error = err as Error;
logVerbose(`TTS: summarization failed: ${error.message}`);
return nextPayload;
}
}
@ -1436,12 +1438,12 @@ export async function maybeApplyTtsToPayload(params: {
const channelId = resolveChannelId(params.channel);
const shouldVoice = channelId === "telegram" && result.voiceCompatible === true;
return {
const finalPayload = {
...nextPayload,
mediaUrl: result.audioPath,
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
};
return finalPayload;
}
lastTtsAttempt = {