This commit is contained in:
Franco Viotti 2026-01-29 15:55:48 -03:00 committed by GitHub
commit fa7996edfc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 181 additions and 14 deletions

View File

@ -8,13 +8,14 @@ read_when:
# Text-to-speech (TTS)
Moltbot can convert outbound replies into audio using ElevenLabs, OpenAI, or Edge TTS.
Moltbot can convert outbound replies into audio using ElevenLabs, OpenAI, Telnyx, or Edge TTS.
It works anywhere Moltbot can send audio; Telegram gets a round voice-note bubble.
## Supported services
- **ElevenLabs** (primary or fallback provider)
- **OpenAI** (primary or fallback provider; also used for summaries)
- **Telnyx** (primary or fallback provider; great quality, cheaper than ElevenLabs)
- **Edge TTS** (primary or fallback provider; uses `node-edge-tts`, default when no API keys)
### Edge TTS notes
@ -31,9 +32,10 @@ does not publish limits, so assume similar or lower limits.
## Optional keys
If you want OpenAI or ElevenLabs:
If you want OpenAI, ElevenLabs, or Telnyx:
- `ELEVENLABS_API_KEY` (or `XI_API_KEY`)
- `OPENAI_API_KEY`
- `TELNYX_API_KEY`
Edge TTS does **not** require an API key. If no API keys are found, Moltbot defaults
to Edge TTS (unless disabled via `messages.tts.edge.enabled=false`).
@ -202,9 +204,9 @@ Then run:
- `tagged` only sends audio when the reply includes `[[tts]]` tags.
- `enabled`: legacy toggle (doctor migrates this to `auto`).
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
- `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic).
- `provider`: `"elevenlabs"`, `"openai"`, `"telnyx"`, or `"edge"` (fallback is automatic).
- If `provider` is **unset**, Moltbot prefers `openai` (if key), then `elevenlabs` (if key),
otherwise `edge`.
then `telnyx` (if key), otherwise `edge`.
- `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`.
- Accepts `provider/model` or a configured model alias.
- `modelOverrides`: allow the model to emit TTS directives (on by default).
@ -250,7 +252,7 @@ Here you go.
```
Available directive keys (when enabled):
- `provider` (`openai` | `elevenlabs` | `edge`)
- `provider` (`openai` | `elevenlabs` | `telnyx` | `edge`)
- `voice` (OpenAI voice) or `voiceId` (ElevenLabs)
- `model` (OpenAI TTS model or ElevenLabs model id)
- `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost`

View File

@ -50,7 +50,8 @@ function ttsUsage(): ReplyPayload {
`**Providers:**\n` +
`• edge — Free, fast (default)\n` +
`• openai — High quality (requires API key)\n` +
`• elevenlabs — Premium voices (requires API key)\n\n` +
`• elevenlabs — Premium voices (requires API key)\n` +
`• telnyx — Great quality, cheaper than ElevenLabs (requires API key)\n\n` +
`**Text Limit (default: 1500, max: 4096):**\n` +
`When text exceeds the limit:\n` +
`• Summary ON: AI summarizes, then generates audio\n` +
@ -151,6 +152,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
if (!args.trim()) {
const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
const hasTelnyx = Boolean(resolveTtsApiKey(config, "telnyx"));
const hasEdge = isTtsProviderConfigured(config, "edge");
return {
shouldContinue: false,
@ -160,18 +162,24 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
`Primary: ${currentProvider}\n` +
`OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
`ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
`Telnyx key: ${hasTelnyx ? "✅" : "❌"}\n` +
`Edge enabled: ${hasEdge ? "✅" : "❌"}\n` +
`Usage: /tts provider openai | elevenlabs | edge`,
`Usage: /tts provider openai | elevenlabs | telnyx | edge`,
},
};
}
const requested = args.trim().toLowerCase();
if (requested !== "openai" && requested !== "elevenlabs" && requested !== "edge") {
if (
requested !== "openai" &&
requested !== "elevenlabs" &&
requested !== "telnyx" &&
requested !== "edge"
) {
return { shouldContinue: false, reply: ttsUsage() };
}
setTtsProvider(prefsPath, requested);
setTtsProvider(prefsPath, requested as "openai" | "elevenlabs" | "telnyx" | "edge");
return {
shouldContinue: false,
reply: { text: `✅ TTS provider set to ${requested}.` },

View File

@ -1,4 +1,4 @@
export type TtsProvider = "elevenlabs" | "openai" | "edge";
export type TtsProvider = "elevenlabs" | "openai" | "edge" | "telnyx";
export type TtsMode = "final" | "all";
@ -73,6 +73,14 @@ export type TtsConfig = {
proxy?: string;
timeoutMs?: number;
};
/** Telnyx TTS configuration. */
telnyx?: {
apiKey?: string;
/** Voice ID (e.g. "Telnyx.NaturalHD.astra", "Telnyx.Kokoro.af_heart"). */
voice?: string;
/** WebSocket inactivity timeout in seconds (default: 20). */
inactivityTimeout?: number;
};
/** Optional path for local TTS user preferences JSON. */
prefsPath?: string;
/** Hard cap for text sent to TTS (chars). */

View File

@ -156,7 +156,7 @@ export const MarkdownConfigSchema = z
.strict()
.optional();
export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]);
export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge", "telnyx"]);
export const TtsModeSchema = z.enum(["final", "all"]);
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
export const TtsConfigSchema = z
@ -224,6 +224,14 @@ export const TtsConfigSchema = z
})
.strict()
.optional(),
telnyx: z
.object({
apiKey: z.string().optional(),
voice: z.string().optional(),
inactivityTimeout: z.number().int().min(1).max(300).optional(),
})
.strict()
.optional(),
prefsPath: z.string().optional(),
maxTextLength: z.number().int().min(1).optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),

View File

@ -202,6 +202,14 @@ describe("tts", () => {
expect(result.overrides.provider).toBe("edge");
});
// With model overrides enabled, a `[[tts:provider=telnyx]]` directive embedded in the
// reply text must be parsed into `overrides.provider === "telnyx"`.
it("accepts telnyx as provider override", () => {
const policy = resolveModelOverridePolicy({ enabled: true });
const input = "Hello [[tts:provider=telnyx]] world";
const result = parseTtsDirectives(input, policy);
expect(result.overrides.provider).toBe("telnyx");
});
it("keeps text intact when overrides are disabled", () => {
const policy = resolveModelOverridePolicy({ enabled: false });
const input = "Hello [[tts:voice=alloy]] world";
@ -426,6 +434,7 @@ describe("tts", () => {
OPENAI_API_KEY: undefined,
ELEVENLABS_API_KEY: undefined,
XI_API_KEY: undefined,
TELNYX_API_KEY: undefined,
},
() => {
const config = resolveTtsConfig(baseCfg);
@ -434,6 +443,22 @@ describe("tts", () => {
},
);
});
// Provider auto-selection: with the OpenAI and ElevenLabs key variables unset and only
// TELNYX_API_KEY present in the environment, getTtsProvider is expected to pick "telnyx".
// NOTE(review): assumes `baseCfg` sets no explicit provider or API keys and that the
// prefs file at the given path does not pin a provider — confirm against the fixture.
it("prefers Telnyx when OpenAI and ElevenLabs are missing and Telnyx key exists", () => {
withEnv(
{
OPENAI_API_KEY: undefined,
ELEVENLABS_API_KEY: undefined,
XI_API_KEY: undefined,
TELNYX_API_KEY: "test-telnyx-key",
},
() => {
const config = resolveTtsConfig(baseCfg);
const provider = getTtsProvider(config, "/tmp/tts-prefs-telnyx.json");
expect(provider).toBe("telnyx");
},
);
});
});
describe("maybeApplyTtsToPayload", () => {

View File

@ -13,6 +13,7 @@ import path from "node:path";
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
import { EdgeTTS } from "node-edge-tts";
import WebSocket from "ws";
import type { ReplyPayload } from "../auto-reply/types.js";
import { normalizeChannelId } from "../channels/plugins/index.js";
@ -51,6 +52,9 @@ const DEFAULT_OPENAI_VOICE = "alloy";
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
const DEFAULT_TELNYX_VOICE = "Telnyx.NaturalHD.astra";
const DEFAULT_TELNYX_INACTIVITY_TIMEOUT = 20;
const TELNYX_WS_URL = "wss://api.telnyx.com/v2/text-to-speech/speech";
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
@ -65,6 +69,7 @@ const TELEGRAM_OUTPUT = {
// ElevenLabs output formats use codec_sample_rate_bitrate naming.
// Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
elevenlabs: "opus_48000_64",
telnyx: "mp3_16000" as const,
extension: ".opus",
voiceCompatible: true,
};
@ -72,6 +77,7 @@ const TELEGRAM_OUTPUT = {
const DEFAULT_OUTPUT = {
openai: "mp3" as const,
elevenlabs: "mp3_44100_128",
telnyx: "mp3_16000" as const,
extension: ".mp3",
voiceCompatible: false,
};
@ -124,6 +130,11 @@ export type ResolvedTtsConfig = {
proxy?: string;
timeoutMs?: number;
};
telnyx: {
apiKey?: string;
voice: string;
inactivityTimeout: number;
};
prefsPath?: string;
maxTextLength: number;
timeoutMs: number;
@ -296,6 +307,11 @@ export function resolveTtsConfig(cfg: MoltbotConfig): ResolvedTtsConfig {
proxy: raw.edge?.proxy?.trim() || undefined,
timeoutMs: raw.edge?.timeoutMs,
},
telnyx: {
apiKey: raw.telnyx?.apiKey,
voice: raw.telnyx?.voice?.trim() || DEFAULT_TELNYX_VOICE,
inactivityTimeout: raw.telnyx?.inactivityTimeout ?? DEFAULT_TELNYX_INACTIVITY_TIMEOUT,
},
prefsPath: raw.prefsPath,
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
@ -412,6 +428,7 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
if (resolveTtsApiKey(config, "openai")) return "openai";
if (resolveTtsApiKey(config, "elevenlabs")) return "elevenlabs";
if (resolveTtsApiKey(config, "telnyx")) return "telnyx";
return "edge";
}
@ -474,10 +491,13 @@ export function resolveTtsApiKey(
if (provider === "openai") {
return config.openai.apiKey || process.env.OPENAI_API_KEY;
}
if (provider === "telnyx") {
return config.telnyx.apiKey || process.env.TELNYX_API_KEY;
}
return undefined;
}
export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const;
export const TTS_PROVIDERS = ["openai", "elevenlabs", "telnyx", "edge"] as const;
export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)];
@ -485,6 +505,7 @@ export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
/**
 * Reports whether `provider` is usable with the given resolved TTS config.
 *
 * Edge is gated solely by its `enabled` flag. Every other provider counts as
 * configured when `resolveTtsApiKey` yields a truthy key for it, so no
 * per-provider special case is needed here.
 *
 * @param config - Resolved TTS configuration.
 * @param provider - Provider to check.
 * @returns `true` when the provider can be attempted.
 */
export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean {
  if (provider === "edge") return config.edge.enabled;
  return Boolean(resolveTtsApiKey(config, provider));
}
@ -587,7 +608,12 @@ function parseTtsDirectives(
switch (key) {
case "provider":
if (!policy.allowProvider) break;
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
if (
rawValue === "openai" ||
rawValue === "elevenlabs" ||
rawValue === "edge" ||
rawValue === "telnyx"
) {
overrides.provider = rawValue;
} else {
warnings.push(`unsupported provider "${rawValue}"`);
@ -1076,6 +1102,79 @@ async function edgeTTS(params: {
await tts.ttsPromise(text, outputPath);
}
/**
 * Synthesizes speech over the Telnyx text-to-speech WebSocket endpoint.
 *
 * Opens a socket to `TELNYX_WS_URL`, sends the text as three JSON frames
 * (init, payload, stop), collects every base64 `audio` field received, and
 * resolves with the concatenated bytes once the socket closes.
 *
 * @param params.text - Text to synthesize.
 * @param params.apiKey - Telnyx API key, sent as a Bearer token.
 * @param params.voice - Voice id, passed as the `voice` query parameter.
 * @param params.inactivityTimeout - Seconds; sent as `inactivity_timeout` only
 *   when it differs from `DEFAULT_TELNYX_INACTIVITY_TIMEOUT`.
 * @param params.timeoutMs - Overall deadline for the request in milliseconds.
 * @returns The concatenated audio bytes.
 * @throws Error when the deadline elapses, the socket errors, or the socket
 *   closes without having delivered any audio.
 */
async function telnyxTTS(params: {
  text: string;
  apiKey: string;
  voice: string;
  inactivityTimeout: number;
  timeoutMs: number;
}): Promise<Buffer> {
  const { text, apiKey, voice, inactivityTimeout, timeoutMs } = params;

  const url = new URL(TELNYX_WS_URL);
  url.searchParams.set("voice", voice);
  if (inactivityTimeout !== DEFAULT_TELNYX_INACTIVITY_TIMEOUT) {
    url.searchParams.set("inactivity_timeout", String(inactivityTimeout));
  }

  return new Promise<Buffer>((resolve, reject) => {
    const ws = new WebSocket(url.toString(), {
      headers: { Authorization: `Bearer ${apiKey}` },
    });

    const audioChunks: Buffer[] = [];
    let settled = false;

    // Single exit point: the first outcome wins, later events are ignored,
    // and the deadline timer is always cleared.
    const settle = (outcome: Buffer | Error): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      if (outcome instanceof Error) {
        reject(outcome);
      } else {
        resolve(outcome);
      }
    };

    const timer = setTimeout(() => {
      settle(new Error("Telnyx TTS request timed out"));
      // terminate() rather than close(): a graceful close handshake would wait
      // on the very peer that just failed to respond in time.
      ws.terminate();
    }, timeoutMs);

    ws.on("open", () => {
      // Send initialization frame (required first)
      ws.send(JSON.stringify({ text: " " }));
      // Send text frame
      ws.send(JSON.stringify({ text }));
      // Send stop frame to signal completion
      ws.send(JSON.stringify({ text: "" }));
    });

    ws.on("message", (data: Buffer | string) => {
      let frame: unknown;
      try {
        frame = JSON.parse(data.toString());
      } catch {
        // Ignore non-JSON messages
        return;
      }
      if (typeof frame !== "object" || frame === null) return;
      // Narrow at runtime instead of trusting the wire format.
      const audio = (frame as { audio?: unknown }).audio;
      if (typeof audio === "string" && audio.length > 0) {
        audioChunks.push(Buffer.from(audio, "base64"));
      }
    });

    ws.on("close", () => {
      settle(
        audioChunks.length === 0
          ? new Error("Telnyx TTS returned no audio")
          : Buffer.concat(audioChunks),
      );
    });

    ws.on("error", (err) => {
      settle(new Error(`Telnyx TTS WebSocket error: ${err.message}`));
      // Make sure the underlying socket is released after a failure.
      ws.terminate();
    });
  });
}
export async function textToSpeech(params: {
text: string;
cfg: MoltbotConfig;
@ -1203,6 +1302,14 @@ export async function textToSpeech(params: {
voiceSettings,
timeoutMs: config.timeoutMs,
});
} else if (provider === "telnyx") {
audioBuffer = await telnyxTTS({
text: params.text,
apiKey,
voice: config.telnyx.voice,
inactivityTimeout: config.telnyx.inactivityTimeout,
timeoutMs: config.timeoutMs,
});
} else {
const openaiModelOverride = params.overrides?.openai?.model;
const openaiVoiceOverride = params.overrides?.openai?.voice;
@ -1228,7 +1335,12 @@ export async function textToSpeech(params: {
audioPath,
latencyMs,
provider,
outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
outputFormat:
provider === "openai"
? output.openai
: provider === "telnyx"
? output.telnyx
: output.elevenlabs,
voiceCompatible: output.voiceCompatible,
};
} catch (err) {
@ -1274,6 +1386,10 @@ export async function textToSpeechTelephony(params: {
lastError = "edge: unsupported for telephony";
continue;
}
if (provider === "telnyx") {
lastError = "telnyx: WebSocket API outputs MP3, telephony requires PCM";
continue;
}
const apiKey = resolveTtsApiKey(config, provider);
if (!apiKey) {