feat(tts): add Telnyx as TTS provider with WebSocket streaming

This commit is contained in:
Franco Viotti 2026-01-28 23:07:43 -03:00
parent 109ac1c549
commit 2ba4d4e669
No known key found for this signature in database
4 changed files with 163 additions and 5 deletions

View File

@ -1,4 +1,4 @@
export type TtsProvider = "elevenlabs" | "openai" | "edge";
export type TtsProvider = "elevenlabs" | "openai" | "edge" | "telnyx";
export type TtsMode = "final" | "all";
@ -73,6 +73,14 @@ export type TtsConfig = {
proxy?: string;
timeoutMs?: number;
};
/** Telnyx TTS configuration. */
telnyx?: {
apiKey?: string;
/** Voice ID (e.g. "Telnyx.NaturalHD.astra", "Telnyx.Kokoro.af_heart"). */
voice?: string;
/** WebSocket inactivity timeout in seconds (default: 20). */
inactivityTimeout?: number;
};
/** Optional path for local TTS user preferences JSON. */
prefsPath?: string;
/** Hard cap for text sent to TTS (chars). */

View File

@ -156,7 +156,7 @@ export const MarkdownConfigSchema = z
.strict()
.optional();
export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]);
export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge", "telnyx"]);
export const TtsModeSchema = z.enum(["final", "all"]);
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
export const TtsConfigSchema = z
@ -224,6 +224,14 @@ export const TtsConfigSchema = z
})
.strict()
.optional(),
telnyx: z
.object({
apiKey: z.string().optional(),
voice: z.string().optional(),
inactivityTimeout: z.number().int().min(1).max(300).optional(),
})
.strict()
.optional(),
prefsPath: z.string().optional(),
maxTextLength: z.number().int().min(1).optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),

View File

@ -202,6 +202,14 @@ describe("tts", () => {
expect(result.overrides.provider).toBe("edge");
});
it("accepts telnyx as provider override", () => {
  // Overrides are enabled here so that the provider directive in the text is parsed.
  const { overrides } = parseTtsDirectives(
    "Hello [[tts:provider=telnyx]] world",
    resolveModelOverridePolicy({ enabled: true }),
  );
  expect(overrides.provider).toBe("telnyx");
});
it("keeps text intact when overrides are disabled", () => {
const policy = resolveModelOverridePolicy({ enabled: false });
const input = "Hello [[tts:voice=alloy]] world";
@ -426,6 +434,7 @@ describe("tts", () => {
OPENAI_API_KEY: undefined,
ELEVENLABS_API_KEY: undefined,
XI_API_KEY: undefined,
TELNYX_API_KEY: undefined,
},
() => {
const config = resolveTtsConfig(baseCfg);
@ -434,6 +443,22 @@ describe("tts", () => {
},
);
});
it("prefers Telnyx when OpenAI and ElevenLabs are missing and Telnyx key exists", () => {
  // Of the provider keys, only the Telnyx one is defined; `withEnv` is expected to apply
  // these values while the callback runs (helper is defined elsewhere in the test file).
  const envOverrides = {
    OPENAI_API_KEY: undefined,
    ELEVENLABS_API_KEY: undefined,
    XI_API_KEY: undefined,
    TELNYX_API_KEY: "test-telnyx-key",
  };
  withEnv(envOverrides, () => {
    const resolved = resolveTtsConfig(baseCfg);
    expect(getTtsProvider(resolved, "/tmp/tts-prefs-telnyx.json")).toBe("telnyx");
  });
});
});
describe("maybeApplyTtsToPayload", () => {

View File

@ -13,6 +13,7 @@ import path from "node:path";
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
import { EdgeTTS } from "node-edge-tts";
import WebSocket from "ws";
import type { ReplyPayload } from "../auto-reply/types.js";
import { normalizeChannelId } from "../channels/plugins/index.js";
@ -51,6 +52,9 @@ const DEFAULT_OPENAI_VOICE = "alloy";
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
// Voice used when `telnyx.voice` is unset or blank in the config.
const DEFAULT_TELNYX_VOICE = "Telnyx.NaturalHD.astra";
// WebSocket inactivity timeout, in seconds, used when the config does not set one.
const DEFAULT_TELNYX_INACTIVITY_TIMEOUT = 20;
// Telnyx text-to-speech WebSocket endpoint; the voice is appended as a query parameter.
const TELNYX_WS_URL = "wss://api.telnyx.com/v2/text-to-speech/speech";
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
@ -65,6 +69,8 @@ const TELEGRAM_OUTPUT = {
// ElevenLabs output formats use codec_sample_rate_bitrate naming.
// Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
elevenlabs: "opus_48000_64",
// Telnyx outputs MP3 only (16kHz); not ideal for Telegram voice bubbles but works.
telnyx: "mp3_16000" as const,
extension: ".opus",
voiceCompatible: true,
};
@ -72,6 +78,7 @@ const TELEGRAM_OUTPUT = {
// Default output profile: one format identifier per provider, plus the file extension and
// the voice-compatibility flag reported alongside the result.
// NOTE(review): presumably used when no channel-specific profile (e.g. Telegram) applies — confirm.
const DEFAULT_OUTPUT = {
openai: "mp3" as const,
elevenlabs: "mp3_44100_128",
// Telnyx format identifier (MP3 at 16kHz).
telnyx: "mp3_16000" as const,
extension: ".mp3",
voiceCompatible: false,
};
@ -124,6 +131,11 @@ export type ResolvedTtsConfig = {
proxy?: string;
timeoutMs?: number;
};
telnyx: {
apiKey?: string;
voice: string;
inactivityTimeout: number;
};
prefsPath?: string;
maxTextLength: number;
timeoutMs: number;
@ -296,6 +308,11 @@ export function resolveTtsConfig(cfg: MoltbotConfig): ResolvedTtsConfig {
proxy: raw.edge?.proxy?.trim() || undefined,
timeoutMs: raw.edge?.timeoutMs,
},
telnyx: {
apiKey: raw.telnyx?.apiKey,
voice: raw.telnyx?.voice?.trim() || DEFAULT_TELNYX_VOICE,
inactivityTimeout: raw.telnyx?.inactivityTimeout ?? DEFAULT_TELNYX_INACTIVITY_TIMEOUT,
},
prefsPath: raw.prefsPath,
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
@ -412,6 +429,7 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
if (resolveTtsApiKey(config, "openai")) return "openai";
if (resolveTtsApiKey(config, "elevenlabs")) return "elevenlabs";
if (resolveTtsApiKey(config, "telnyx")) return "telnyx";
return "edge";
}
@ -474,10 +492,13 @@ export function resolveTtsApiKey(
if (provider === "openai") {
return config.openai.apiKey || process.env.OPENAI_API_KEY;
}
if (provider === "telnyx") {
return config.telnyx.apiKey || process.env.TELNYX_API_KEY;
}
return undefined;
}
export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const;
export const TTS_PROVIDERS = ["openai", "elevenlabs", "telnyx", "edge"] as const;
export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)];
@ -485,6 +506,7 @@ export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
/**
 * Reports whether a TTS provider can be used with the given resolved config.
 *
 * Edge is gated by its `enabled` flag; every other provider (Telnyx included) counts as
 * configured when `resolveTtsApiKey` yields a non-empty key for it.
 */
export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean {
  return provider === "edge"
    ? config.edge.enabled
    : Boolean(resolveTtsApiKey(config, provider));
}
@ -587,7 +609,12 @@ function parseTtsDirectives(
switch (key) {
case "provider":
if (!policy.allowProvider) break;
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
if (
rawValue === "openai" ||
rawValue === "elevenlabs" ||
rawValue === "edge" ||
rawValue === "telnyx"
) {
overrides.provider = rawValue;
} else {
warnings.push(`unsupported provider "${rawValue}"`);
@ -1068,6 +1095,79 @@ async function edgeTTS(params: {
await tts.ttsPromise(text, outputPath);
}
/**
 * Synthesizes speech through the Telnyx text-to-speech WebSocket endpoint.
 *
 * Opens a socket to `TELNYX_WS_URL`, sends an initialization frame, the text frame and an
 * empty stop frame, then collects the base64 `audio` payload of every JSON message until
 * the socket closes.
 *
 * @param params.text - Text to synthesize.
 * @param params.apiKey - Telnyx API key, sent as a Bearer token in the handshake.
 * @param params.voice - Voice ID, passed as the `voice` query parameter.
 * @param params.inactivityTimeout - Sent as `inactivity_timeout` only when it differs from
 *   `DEFAULT_TELNYX_INACTIVITY_TIMEOUT`.
 * @param params.timeoutMs - Overall deadline for the request, in milliseconds.
 * @returns The received audio chunks concatenated into one buffer.
 * @throws Rejects when the deadline passes, the socket reports an error, or the socket
 *   closes without having delivered any audio.
 */
async function telnyxTTS(params: {
  text: string;
  apiKey: string;
  voice: string;
  inactivityTimeout: number;
  timeoutMs: number;
}): Promise<Buffer> {
  const { text, apiKey, voice, inactivityTimeout, timeoutMs } = params;

  return new Promise<Buffer>((resolve, reject) => {
    const url = new URL(TELNYX_WS_URL);
    url.searchParams.set("voice", voice);
    if (inactivityTimeout !== DEFAULT_TELNYX_INACTIVITY_TIMEOUT) {
      url.searchParams.set("inactivity_timeout", String(inactivityTimeout));
    }

    const ws = new WebSocket(url.toString(), {
      headers: { Authorization: `Bearer ${apiKey}` },
    });

    const audioChunks: Buffer[] = [];
    let settled = false;

    const timer = setTimeout(() => {
      fail(new Error("Telnyx TTS request timed out"));
    }, timeoutMs);

    /** Marks the promise as settled exactly once; returns false if it already was. */
    function claimSettlement(): boolean {
      if (settled) return false;
      settled = true;
      clearTimeout(timer);
      return true;
    }

    /** Rejects the promise and force-closes the socket so no connection is left behind. */
    function fail(error: Error): void {
      if (!claimSettlement()) return;
      // terminate() rather than close(): a graceful close waits on the peer, which is
      // exactly what cannot be relied on after a timeout or a socket error.
      ws.terminate();
      reject(error);
    }

    ws.on("open", () => {
      // Send initialization frame (required first)
      ws.send(JSON.stringify({ text: " " }));
      // Send text frame
      ws.send(JSON.stringify({ text }));
      // Send stop frame to signal completion
      ws.send(JSON.stringify({ text: "" }));
    });

    ws.on("message", (data: Buffer | string) => {
      try {
        const message = JSON.parse(data.toString()) as { audio?: unknown };
        if (typeof message.audio === "string" && message.audio) {
          audioChunks.push(Buffer.from(message.audio, "base64"));
        }
      } catch {
        // Ignore non-JSON messages
      }
    });

    ws.on("close", () => {
      if (!claimSettlement()) return;
      if (audioChunks.length === 0) {
        reject(new Error("Telnyx TTS returned no audio"));
      } else {
        resolve(Buffer.concat(audioChunks));
      }
    });

    // The listener stays attached after settlement so that a late "error" event (for
    // example from aborting a pending handshake) is absorbed instead of being unhandled.
    ws.on("error", (err) => {
      fail(new Error(`Telnyx TTS WebSocket error: ${err.message}`));
    });
  });
}
export async function textToSpeech(params: {
text: string;
cfg: MoltbotConfig;
@ -1195,6 +1295,14 @@ export async function textToSpeech(params: {
voiceSettings,
timeoutMs: config.timeoutMs,
});
} else if (provider === "telnyx") {
audioBuffer = await telnyxTTS({
text: params.text,
apiKey,
voice: config.telnyx.voice,
inactivityTimeout: config.telnyx.inactivityTimeout,
timeoutMs: config.timeoutMs,
});
} else {
const openaiModelOverride = params.overrides?.openai?.model;
const openaiVoiceOverride = params.overrides?.openai?.voice;
@ -1220,7 +1328,12 @@ export async function textToSpeech(params: {
audioPath,
latencyMs,
provider,
outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
outputFormat:
provider === "openai"
? output.openai
: provider === "telnyx"
? output.telnyx
: output.elevenlabs,
voiceCompatible: output.voiceCompatible,
};
} catch (err) {
@ -1266,6 +1379,10 @@ export async function textToSpeechTelephony(params: {
lastError = "edge: unsupported for telephony";
continue;
}
if (provider === "telnyx") {
lastError = "telnyx: unsupported for telephony (MP3 output only)";
continue;
}
const apiKey = resolveTtsApiKey(config, provider);
if (!apiKey) {