From 2ba4d4e66940b08970500d7bbcb4a5918fac1bc0 Mon Sep 17 00:00:00 2001 From: Franco Viotti Date: Wed, 28 Jan 2026 23:07:43 -0300 Subject: [PATCH] feat(tts): add Telnyx as TTS provider with WebSocket streaming --- src/config/types.tts.ts | 10 ++- src/config/zod-schema.core.ts | 10 ++- src/tts/tts.test.ts | 25 +++++++ src/tts/tts.ts | 123 +++++++++++++++++++++++++++++++++- 4 files changed, 163 insertions(+), 5 deletions(-) diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 4eb4989b9..c7824f177 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -1,4 +1,4 @@ -export type TtsProvider = "elevenlabs" | "openai" | "edge"; +export type TtsProvider = "elevenlabs" | "openai" | "edge" | "telnyx"; export type TtsMode = "final" | "all"; @@ -73,6 +73,14 @@ export type TtsConfig = { proxy?: string; timeoutMs?: number; }; + /** Telnyx TTS configuration. */ + telnyx?: { + apiKey?: string; + /** Voice ID (e.g. "Telnyx.NaturalHD.astra", "Telnyx.Kokoro.af_heart"). */ + voice?: string; + /** WebSocket inactivity timeout in seconds (default: 20). */ + inactivityTimeout?: number; + }; /** Optional path for local TTS user preferences JSON. */ prefsPath?: string; /** Hard cap for text sent to TTS (chars). 
*/ diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 4a8c80bcc..1d4a3f811 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -156,7 +156,7 @@ export const MarkdownConfigSchema = z .strict() .optional(); -export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]); +export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge", "telnyx"]); export const TtsModeSchema = z.enum(["final", "all"]); export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]); export const TtsConfigSchema = z @@ -224,6 +224,14 @@ export const TtsConfigSchema = z }) .strict() .optional(), + telnyx: z + .object({ + apiKey: z.string().optional(), + voice: z.string().optional(), + inactivityTimeout: z.number().int().min(1).max(300).optional(), + }) + .strict() + .optional(), prefsPath: z.string().optional(), maxTextLength: z.number().int().min(1).optional(), timeoutMs: z.number().int().min(1000).max(120000).optional(), diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts index 8462cba01..c1c987a67 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -202,6 +202,14 @@ describe("tts", () => { expect(result.overrides.provider).toBe("edge"); }); + it("accepts telnyx as provider override", () => { + const policy = resolveModelOverridePolicy({ enabled: true }); + const input = "Hello [[tts:provider=telnyx]] world"; + const result = parseTtsDirectives(input, policy); + + expect(result.overrides.provider).toBe("telnyx"); + }); + it("keeps text intact when overrides are disabled", () => { const policy = resolveModelOverridePolicy({ enabled: false }); const input = "Hello [[tts:voice=alloy]] world"; @@ -426,6 +434,7 @@ describe("tts", () => { OPENAI_API_KEY: undefined, ELEVENLABS_API_KEY: undefined, XI_API_KEY: undefined, + TELNYX_API_KEY: undefined, }, () => { const config = resolveTtsConfig(baseCfg); @@ -434,6 +443,22 @@ describe("tts", () => { }, ); }); + + it("prefers Telnyx when OpenAI 
and ElevenLabs are missing and Telnyx key exists", () => { + withEnv( + { + OPENAI_API_KEY: undefined, + ELEVENLABS_API_KEY: undefined, + XI_API_KEY: undefined, + TELNYX_API_KEY: "test-telnyx-key", + }, + () => { + const config = resolveTtsConfig(baseCfg); + const provider = getTtsProvider(config, "/tmp/tts-prefs-telnyx.json"); + expect(provider).toBe("telnyx"); + }, + ); + }); }); describe("maybeApplyTtsToPayload", () => { diff --git a/src/tts/tts.ts b/src/tts/tts.ts index af3d7fda5..fec413cb1 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -13,6 +13,7 @@ import path from "node:path"; import { completeSimple, type TextContent } from "@mariozechner/pi-ai"; import { EdgeTTS } from "node-edge-tts"; +import WebSocket from "ws"; import type { ReplyPayload } from "../auto-reply/types.js"; import { normalizeChannelId } from "../channels/plugins/index.js"; @@ -51,6 +52,9 @@ const DEFAULT_OPENAI_VOICE = "alloy"; const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural"; const DEFAULT_EDGE_LANG = "en-US"; const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; +const DEFAULT_TELNYX_VOICE = "Telnyx.NaturalHD.astra"; +const DEFAULT_TELNYX_INACTIVITY_TIMEOUT = 20; +const TELNYX_WS_URL = "wss://api.telnyx.com/v2/text-to-speech/speech"; const DEFAULT_ELEVENLABS_VOICE_SETTINGS = { stability: 0.5, @@ -65,6 +69,8 @@ const TELEGRAM_OUTPUT = { // ElevenLabs output formats use codec_sample_rate_bitrate naming. // Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram. elevenlabs: "opus_48000_64", + // Telnyx outputs MP3 only (16kHz); not ideal for Telegram voice bubbles but works. 
+ telnyx: "mp3_16000" as const, extension: ".opus", voiceCompatible: true, }; @@ -72,6 +78,7 @@ const TELEGRAM_OUTPUT = { const DEFAULT_OUTPUT = { openai: "mp3" as const, elevenlabs: "mp3_44100_128", + telnyx: "mp3_16000" as const, extension: ".mp3", voiceCompatible: false, }; @@ -124,6 +131,11 @@ export type ResolvedTtsConfig = { proxy?: string; timeoutMs?: number; }; + telnyx: { + apiKey?: string; + voice: string; + inactivityTimeout: number; + }; prefsPath?: string; maxTextLength: number; timeoutMs: number; @@ -296,6 +308,11 @@ export function resolveTtsConfig(cfg: MoltbotConfig): ResolvedTtsConfig { proxy: raw.edge?.proxy?.trim() || undefined, timeoutMs: raw.edge?.timeoutMs, }, + telnyx: { + apiKey: raw.telnyx?.apiKey, + voice: raw.telnyx?.voice?.trim() || DEFAULT_TELNYX_VOICE, + inactivityTimeout: raw.telnyx?.inactivityTimeout ?? DEFAULT_TELNYX_INACTIVITY_TIMEOUT, + }, prefsPath: raw.prefsPath, maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH, timeoutMs: raw.timeoutMs ?? 
DEFAULT_TIMEOUT_MS, @@ -412,6 +429,7 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt if (resolveTtsApiKey(config, "openai")) return "openai"; if (resolveTtsApiKey(config, "elevenlabs")) return "elevenlabs"; + if (resolveTtsApiKey(config, "telnyx")) return "telnyx"; return "edge"; } @@ -474,10 +492,13 @@ export function resolveTtsApiKey( if (provider === "openai") { return config.openai.apiKey || process.env.OPENAI_API_KEY; } + if (provider === "telnyx") { + return config.telnyx.apiKey || process.env.TELNYX_API_KEY; + } return undefined; } -export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const; +export const TTS_PROVIDERS = ["openai", "elevenlabs", "telnyx", "edge"] as const; export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] { return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)]; @@ -485,6 +506,7 @@ export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] { export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean { if (provider === "edge") return config.edge.enabled; + if (provider === "telnyx") return Boolean(resolveTtsApiKey(config, "telnyx")); return Boolean(resolveTtsApiKey(config, provider)); } @@ -587,7 +609,12 @@ function parseTtsDirectives( switch (key) { case "provider": if (!policy.allowProvider) break; - if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") { + if ( + rawValue === "openai" || + rawValue === "elevenlabs" || + rawValue === "edge" || + rawValue === "telnyx" + ) { overrides.provider = rawValue; } else { warnings.push(`unsupported provider "${rawValue}"`); @@ -1068,6 +1095,79 @@ async function edgeTTS(params: { await tts.ttsPromise(text, outputPath); } +async function telnyxTTS(params: { + text: string; + apiKey: string; + voice: string; + inactivityTimeout: number; + timeoutMs: number; +}): Promise<Buffer> { + const { text, apiKey, voice, inactivityTimeout,
timeoutMs } = params; + + return new Promise<Buffer>((resolve, reject) => { + const url = new URL(TELNYX_WS_URL); + url.searchParams.set("voice", voice); + if (inactivityTimeout !== DEFAULT_TELNYX_INACTIVITY_TIMEOUT) { + url.searchParams.set("inactivity_timeout", String(inactivityTimeout)); + } + + const ws = new WebSocket(url.toString(), { + headers: { Authorization: `Bearer ${apiKey}` }, + }); + + const audioChunks: Buffer[] = []; + let completed = false; + + const timeout = setTimeout(() => { + if (!completed) { + completed = true; + ws.close(); + reject(new Error("Telnyx TTS request timed out")); + } + }, timeoutMs); + + ws.on("open", () => { + // Send initialization frame (required first) + ws.send(JSON.stringify({ text: " " })); + // Send text frame + ws.send(JSON.stringify({ text })); + // Send stop frame to signal completion + ws.send(JSON.stringify({ text: "" })); + }); + + ws.on("message", (data: Buffer | string) => { + try { + const message = JSON.parse(data.toString()) as { audio?: string }; + if (message.audio) { + audioChunks.push(Buffer.from(message.audio, "base64")); + } + } catch { + // Ignore non-JSON messages + } + }); + + ws.on("close", () => { + clearTimeout(timeout); + if (!completed) { + completed = true; + if (audioChunks.length === 0) { + reject(new Error("Telnyx TTS returned no audio")); + } else { + resolve(Buffer.concat(audioChunks)); + } + } + }); + + ws.on("error", (err) => { + clearTimeout(timeout); + if (!completed) { + completed = true; + reject(new Error(`Telnyx TTS WebSocket error: ${err.message}`)); + } + }); + }); +} + export async function textToSpeech(params: { text: string; cfg: MoltbotConfig; @@ -1195,6 +1295,14 @@ export async function textToSpeech(params: { voiceSettings, timeoutMs: config.timeoutMs, }); + } else if (provider === "telnyx") { + audioBuffer = await telnyxTTS({ + text: params.text, + apiKey, + voice: config.telnyx.voice, + inactivityTimeout: config.telnyx.inactivityTimeout, + timeoutMs: config.timeoutMs, + }); } else
{ const openaiModelOverride = params.overrides?.openai?.model; const openaiVoiceOverride = params.overrides?.openai?.voice; @@ -1220,7 +1328,12 @@ export async function textToSpeech(params: { audioPath, latencyMs, provider, - outputFormat: provider === "openai" ? output.openai : output.elevenlabs, + outputFormat: + provider === "openai" + ? output.openai + : provider === "telnyx" + ? output.telnyx + : output.elevenlabs, voiceCompatible: output.voiceCompatible, }; } catch (err) { @@ -1266,6 +1379,10 @@ export async function textToSpeechTelephony(params: { lastError = "edge: unsupported for telephony"; continue; } + if (provider === "telnyx") { + lastError = "telnyx: unsupported for telephony (MP3 output only)"; + continue; + } const apiKey = resolveTtsApiKey(config, provider); if (!apiKey) {