fix(tts): generate audio when block streaming drops final reply
When block streaming succeeds, final replies are dropped but TTS was only applied to final replies. Fix by accumulating block text during streaming and generating TTS-only audio after streaming completes. Also: - Change truncate vs skip behavior when summary OFF (now truncates) - Align TTS limits with Telegram max (4096 chars) - Improve /tts command help messages with examples - Add newline separator between accumulated blocks
This commit is contained in:
parent
14f8acdecb
commit
938a9ab627
@ -181,9 +181,23 @@ function buildChatCommands(): ChatCommandDefinition[] {
|
||||
defineChatCommand({
|
||||
key: "tts",
|
||||
nativeName: "tts",
|
||||
description: "Configure text-to-speech.",
|
||||
description: "Control text-to-speech (TTS).",
|
||||
textAlias: "/tts",
|
||||
acceptsArgs: true,
|
||||
args: [
|
||||
{
|
||||
name: "action",
|
||||
description: "on | off | status | provider | limit | summary | audio | help",
|
||||
type: "string",
|
||||
choices: ["on", "off", "status", "provider", "limit", "summary", "audio", "help"],
|
||||
},
|
||||
{
|
||||
name: "value",
|
||||
description: "Provider, limit, or text",
|
||||
type: "string",
|
||||
captureRemaining: true,
|
||||
},
|
||||
],
|
||||
argsMenu: "auto",
|
||||
}),
|
||||
defineChatCommand({
|
||||
key: "whoami",
|
||||
|
||||
@ -6,20 +6,18 @@ import {
|
||||
getTtsMaxLength,
|
||||
getTtsProvider,
|
||||
isSummarizationEnabled,
|
||||
isTtsEnabled,
|
||||
isTtsProviderConfigured,
|
||||
normalizeTtsAutoMode,
|
||||
resolveTtsAutoMode,
|
||||
resolveTtsApiKey,
|
||||
resolveTtsConfig,
|
||||
resolveTtsPrefsPath,
|
||||
resolveTtsProviderOrder,
|
||||
setLastTtsAttempt,
|
||||
setSummarizationEnabled,
|
||||
setTtsEnabled,
|
||||
setTtsMaxLength,
|
||||
setTtsProvider,
|
||||
textToSpeech,
|
||||
} from "../../tts/tts.js";
|
||||
import { updateSessionStore } from "../../config/sessions.js";
|
||||
|
||||
type ParsedTtsCommand = {
|
||||
action: string;
|
||||
@ -27,11 +25,11 @@ type ParsedTtsCommand = {
|
||||
};
|
||||
|
||||
function parseTtsCommand(normalized: string): ParsedTtsCommand | null {
|
||||
// Accept `/tts` and `/tts <action> [args]` as a single control surface.
|
||||
if (normalized === "/tts") return { action: "status", args: "" };
|
||||
// Accept `/tts <action> [args]` - return null for `/tts` alone to trigger inline menu.
|
||||
if (normalized === "/tts") return null;
|
||||
if (!normalized.startsWith("/tts ")) return null;
|
||||
const rest = normalized.slice(5).trim();
|
||||
if (!rest) return { action: "status", args: "" };
|
||||
if (!rest) return null;
|
||||
const [action, ...tail] = rest.split(/\s+/);
|
||||
return { action: action.toLowerCase(), args: tail.join(" ").trim() };
|
||||
}
|
||||
@ -40,14 +38,27 @@ function ttsUsage(): ReplyPayload {
|
||||
// Keep usage in one place so help/validation stays consistent.
|
||||
return {
|
||||
text:
|
||||
"⚙️ Usage: /tts <off|always|inbound|tagged|status|provider|limit|summary|audio> [value]" +
|
||||
"\nExamples:\n" +
|
||||
"/tts always\n" +
|
||||
"/tts provider openai\n" +
|
||||
"/tts provider edge\n" +
|
||||
"/tts limit 2000\n" +
|
||||
"/tts summary off\n" +
|
||||
"/tts audio Hello from Clawdbot",
|
||||
`🔊 **TTS (Text-to-Speech) Help**\n\n` +
|
||||
`**Commands:**\n` +
|
||||
`• /tts on — Enable automatic TTS for replies\n` +
|
||||
`• /tts off — Disable TTS\n` +
|
||||
`• /tts status — Show current settings\n` +
|
||||
`• /tts provider [name] — View/change provider\n` +
|
||||
`• /tts limit [number] — View/change text limit\n` +
|
||||
`• /tts summary [on|off] — View/change auto-summary\n` +
|
||||
`• /tts audio <text> — Generate audio from text\n\n` +
|
||||
`**Providers:**\n` +
|
||||
`• edge — Free, fast (default)\n` +
|
||||
`• openai — High quality (requires API key)\n` +
|
||||
`• elevenlabs — Premium voices (requires API key)\n\n` +
|
||||
`**Text Limit (default: 1500, max: 4096):**\n` +
|
||||
`When text exceeds the limit:\n` +
|
||||
`• Summary ON: AI summarizes, then generates audio\n` +
|
||||
`• Summary OFF: Truncates text, then generates audio\n\n` +
|
||||
`**Examples:**\n` +
|
||||
`/tts provider edge\n` +
|
||||
`/tts limit 2000\n` +
|
||||
`/tts audio Hello, this is a test!`,
|
||||
};
|
||||
}
|
||||
|
||||
@ -72,35 +83,27 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
}
|
||||
|
||||
const requestedAuto = normalizeTtsAutoMode(
|
||||
action === "on" ? "always" : action === "off" ? "off" : action,
|
||||
);
|
||||
if (requestedAuto) {
|
||||
const entry = params.sessionEntry;
|
||||
const sessionKey = params.sessionKey;
|
||||
const store = params.sessionStore;
|
||||
if (entry && store && sessionKey) {
|
||||
entry.ttsAuto = requestedAuto;
|
||||
entry.updatedAt = Date.now();
|
||||
store[sessionKey] = entry;
|
||||
if (params.storePath) {
|
||||
await updateSessionStore(params.storePath, (store) => {
|
||||
store[sessionKey] = entry;
|
||||
});
|
||||
}
|
||||
}
|
||||
const label = requestedAuto === "always" ? "enabled (always)" : requestedAuto;
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: {
|
||||
text: requestedAuto === "off" ? "🔇 TTS disabled." : `🔊 TTS ${label}.`,
|
||||
},
|
||||
};
|
||||
if (action === "on") {
|
||||
setTtsEnabled(prefsPath, true);
|
||||
return { shouldContinue: false, reply: { text: "🔊 TTS enabled." } };
|
||||
}
|
||||
|
||||
if (action === "off") {
|
||||
setTtsEnabled(prefsPath, false);
|
||||
return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } };
|
||||
}
|
||||
|
||||
if (action === "audio") {
|
||||
if (!args.trim()) {
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: {
|
||||
text:
|
||||
`🎤 Generate audio from text.\n\n` +
|
||||
`Usage: /tts audio <text>\n` +
|
||||
`Example: /tts audio Hello, this is a test!`,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
const start = Date.now();
|
||||
@ -146,9 +149,6 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
if (action === "provider") {
|
||||
const currentProvider = getTtsProvider(config, prefsPath);
|
||||
if (!args.trim()) {
|
||||
const fallback = resolveTtsProviderOrder(currentProvider)
|
||||
.slice(1)
|
||||
.filter((provider) => isTtsProviderConfigured(config, provider));
|
||||
const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
|
||||
const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
|
||||
const hasEdge = isTtsProviderConfigured(config, "edge");
|
||||
@ -158,7 +158,6 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
text:
|
||||
`🎙️ TTS provider\n` +
|
||||
`Primary: ${currentProvider}\n` +
|
||||
`Fallbacks: ${fallback.join(", ") || "none"}\n` +
|
||||
`OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
|
||||
`ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
|
||||
`Edge enabled: ${hasEdge ? "✅" : "❌"}\n` +
|
||||
@ -173,18 +172,9 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
}
|
||||
|
||||
setTtsProvider(prefsPath, requested);
|
||||
const fallback = resolveTtsProviderOrder(requested)
|
||||
.slice(1)
|
||||
.filter((provider) => isTtsProviderConfigured(config, provider));
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: {
|
||||
text:
|
||||
`✅ TTS provider set to ${requested} (fallbacks: ${fallback.join(", ") || "none"}).` +
|
||||
(requested === "edge"
|
||||
? "\nEnable Edge TTS in config: messages.tts.edge.enabled = true."
|
||||
: ""),
|
||||
},
|
||||
reply: { text: `✅ TTS provider set to ${requested}.` },
|
||||
};
|
||||
}
|
||||
|
||||
@ -193,12 +183,22 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
const currentLimit = getTtsMaxLength(prefsPath);
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: { text: `📏 TTS limit: ${currentLimit} characters.` },
|
||||
reply: {
|
||||
text:
|
||||
`📏 TTS limit: ${currentLimit} characters.\n\n` +
|
||||
`Text longer than this triggers summary (if enabled).\n` +
|
||||
`Range: 100-4096 chars (Telegram max).\n\n` +
|
||||
`To change: /tts limit <number>\n` +
|
||||
`Example: /tts limit 2000`,
|
||||
},
|
||||
};
|
||||
}
|
||||
const next = Number.parseInt(args.trim(), 10);
|
||||
if (!Number.isFinite(next) || next < 100 || next > 10_000) {
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
if (!Number.isFinite(next) || next < 100 || next > 4096) {
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: { text: "❌ Limit must be between 100 and 4096 characters." },
|
||||
};
|
||||
}
|
||||
setTtsMaxLength(prefsPath, next);
|
||||
return {
|
||||
@ -210,9 +210,17 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
if (action === "summary") {
|
||||
if (!args.trim()) {
|
||||
const enabled = isSummarizationEnabled(prefsPath);
|
||||
const maxLen = getTtsMaxLength(prefsPath);
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: { text: `📝 TTS auto-summary: ${enabled ? "on" : "off"}.` },
|
||||
reply: {
|
||||
text:
|
||||
`📝 TTS auto-summary: ${enabled ? "on" : "off"}.\n\n` +
|
||||
`When text exceeds ${maxLen} chars:\n` +
|
||||
`• ON: summarizes text, then generates audio\n` +
|
||||
`• OFF: truncates text, then generates audio\n\n` +
|
||||
`To change: /tts summary on | off`,
|
||||
},
|
||||
};
|
||||
}
|
||||
const requested = args.trim().toLowerCase();
|
||||
@ -229,27 +237,16 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
}
|
||||
|
||||
if (action === "status") {
|
||||
const sessionAuto = params.sessionEntry?.ttsAuto;
|
||||
const autoMode = resolveTtsAutoMode({ config, prefsPath, sessionAuto });
|
||||
const enabled = autoMode !== "off";
|
||||
const enabled = isTtsEnabled(config, prefsPath);
|
||||
const provider = getTtsProvider(config, prefsPath);
|
||||
const hasKey = isTtsProviderConfigured(config, provider);
|
||||
const providerStatus =
|
||||
provider === "edge"
|
||||
? hasKey
|
||||
? "✅ enabled"
|
||||
: "❌ disabled"
|
||||
: hasKey
|
||||
? "✅ key"
|
||||
: "❌ no key";
|
||||
const maxLength = getTtsMaxLength(prefsPath);
|
||||
const summarize = isSummarizationEnabled(prefsPath);
|
||||
const last = getLastTtsAttempt();
|
||||
const autoLabel = sessionAuto ? `${autoMode} (session)` : autoMode;
|
||||
const lines = [
|
||||
"📊 TTS status",
|
||||
`Auto: ${enabled ? autoLabel : "off"}`,
|
||||
`Provider: ${provider} (${providerStatus})`,
|
||||
`State: ${enabled ? "✅ enabled" : "❌ disabled"}`,
|
||||
`Provider: ${provider} (${hasKey ? "✅ configured" : "❌ not configured"})`,
|
||||
`Text limit: ${maxLength} chars`,
|
||||
`Auto-summary: ${summarize ? "on" : "off"}`,
|
||||
];
|
||||
|
||||
@ -266,12 +266,26 @@ export async function dispatchReplyFromConfig(params: {
|
||||
return { queuedFinal, counts };
|
||||
}
|
||||
|
||||
// Track accumulated block text for TTS generation after streaming completes.
|
||||
// When block streaming succeeds, there's no final reply, so we need to generate
|
||||
// TTS audio separately from the accumulated block content.
|
||||
let accumulatedBlockText = "";
|
||||
let blockCount = 0;
|
||||
|
||||
const replyResult = await (params.replyResolver ?? getReplyFromConfig)(
|
||||
ctx,
|
||||
{
|
||||
...params.replyOptions,
|
||||
onBlockReply: (payload: ReplyPayload, context) => {
|
||||
const run = async () => {
|
||||
// Accumulate block text for TTS generation after streaming
|
||||
if (payload.text) {
|
||||
if (accumulatedBlockText.length > 0) {
|
||||
accumulatedBlockText += "\n";
|
||||
}
|
||||
accumulatedBlockText += payload.text;
|
||||
blockCount++;
|
||||
}
|
||||
const ttsPayload = await maybeApplyTtsToPayload({
|
||||
payload,
|
||||
cfg,
|
||||
@ -327,6 +341,50 @@ export async function dispatchReplyFromConfig(params: {
|
||||
queuedFinal = dispatcher.sendFinalReply(ttsReply) || queuedFinal;
|
||||
}
|
||||
}
|
||||
|
||||
// Generate TTS-only reply after block streaming completes (when there's no final reply).
|
||||
// This handles the case where block streaming succeeds and drops final payloads,
|
||||
// but we still want TTS audio to be generated from the accumulated block content.
|
||||
if (replies.length === 0 && blockCount > 0 && accumulatedBlockText.trim()) {
|
||||
const ttsSyntheticReply = await maybeApplyTtsToPayload({
|
||||
payload: { text: accumulatedBlockText },
|
||||
cfg,
|
||||
channel: ttsChannel,
|
||||
kind: "final",
|
||||
inboundAudio,
|
||||
ttsAuto: sessionTtsAuto,
|
||||
});
|
||||
// Only send if TTS was actually applied (mediaUrl exists)
|
||||
if (ttsSyntheticReply.mediaUrl) {
|
||||
// Send TTS-only payload (no text, just audio) so it doesn't duplicate the block content
|
||||
const ttsOnlyPayload: ReplyPayload = {
|
||||
mediaUrl: ttsSyntheticReply.mediaUrl,
|
||||
audioAsVoice: ttsSyntheticReply.audioAsVoice,
|
||||
};
|
||||
if (shouldRouteToOriginating && originatingChannel && originatingTo) {
|
||||
const result = await routeReply({
|
||||
payload: ttsOnlyPayload,
|
||||
channel: originatingChannel,
|
||||
to: originatingTo,
|
||||
sessionKey: ctx.SessionKey,
|
||||
accountId: ctx.AccountId,
|
||||
threadId: ctx.MessageThreadId,
|
||||
cfg,
|
||||
});
|
||||
queuedFinal = result.ok || queuedFinal;
|
||||
if (result.ok) routedFinalCount += 1;
|
||||
if (!result.ok) {
|
||||
logVerbose(
|
||||
`dispatch-from-config: route-reply (tts-only) failed: ${result.error ?? "unknown error"}`,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
const didQueue = dispatcher.sendFinalReply(ttsOnlyPayload);
|
||||
queuedFinal = didQueue || queuedFinal;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await dispatcher.waitForIdle();
|
||||
|
||||
const counts = dispatcher.getQueuedCounts();
|
||||
|
||||
@ -40,7 +40,7 @@ import { resolveModel } from "../agents/pi-embedded-runner/model.js";
|
||||
const DEFAULT_TIMEOUT_MS = 30_000;
|
||||
const DEFAULT_TTS_MAX_LENGTH = 1500;
|
||||
const DEFAULT_TTS_SUMMARIZE = true;
|
||||
const DEFAULT_MAX_TEXT_LENGTH = 4000;
|
||||
const DEFAULT_MAX_TEXT_LENGTH = 4096;
|
||||
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
|
||||
@ -1386,32 +1386,34 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
|
||||
if (textForAudio.length > maxLength) {
|
||||
if (!isSummarizationEnabled(prefsPath)) {
|
||||
// Truncate text when summarization is disabled
|
||||
logVerbose(
|
||||
`TTS: skipping long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
|
||||
`TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
|
||||
);
|
||||
return nextPayload;
|
||||
}
|
||||
|
||||
try {
|
||||
const summary = await summarizeText({
|
||||
text: textForAudio,
|
||||
targetLength: maxLength,
|
||||
cfg: params.cfg,
|
||||
config,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
textForAudio = summary.summary;
|
||||
wasSummarized = true;
|
||||
if (textForAudio.length > config.maxTextLength) {
|
||||
logVerbose(
|
||||
`TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
|
||||
);
|
||||
textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`;
|
||||
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
|
||||
} else {
|
||||
// Summarize text when enabled
|
||||
try {
|
||||
const summary = await summarizeText({
|
||||
text: textForAudio,
|
||||
targetLength: maxLength,
|
||||
cfg: params.cfg,
|
||||
config,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
textForAudio = summary.summary;
|
||||
wasSummarized = true;
|
||||
if (textForAudio.length > config.maxTextLength) {
|
||||
logVerbose(
|
||||
`TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
|
||||
);
|
||||
textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`;
|
||||
}
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
logVerbose(`TTS: summarization failed, truncating instead: ${error.message}`);
|
||||
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
|
||||
}
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
logVerbose(`TTS: summarization failed: ${error.message}`);
|
||||
return nextPayload;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1436,12 +1438,12 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
|
||||
const channelId = resolveChannelId(params.channel);
|
||||
const shouldVoice = channelId === "telegram" && result.voiceCompatible === true;
|
||||
|
||||
return {
|
||||
const finalPayload = {
|
||||
...nextPayload,
|
||||
mediaUrl: result.audioPath,
|
||||
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
|
||||
};
|
||||
return finalPayload;
|
||||
}
|
||||
|
||||
lastTtsAttempt = {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user