Merge bf5f4e9b7a into 4583f88626

2026-01-29 15:55:48 -03:00 · 2026-01-29 15:55:48 -03:00 · fa7996edfc
commit fa7996edfc
parent 4583f88626 bf5f4e9b7a
6 changed files with 181 additions and 14 deletions
--- a/docs/tts.md
+++ b/docs/tts.md
@ -8,13 +8,14 @@ read_when:

 # Text-to-speech (TTS)

-Moltbot can convert outbound replies into audio using ElevenLabs, OpenAI, or Edge TTS.
+Moltbot can convert outbound replies into audio using ElevenLabs, OpenAI, Telnyx, or Edge TTS.
 It works anywhere Moltbot can send audio; Telegram gets a round voice-note bubble.

 ## Supported services

 - **ElevenLabs** (primary or fallback provider)
 - **OpenAI** (primary or fallback provider; also used for summaries)
+- **Telnyx** (primary or fallback provider; great quality, cheaper than ElevenLabs)
 - **Edge TTS** (primary or fallback provider; uses `node-edge-tts`, default when no API keys)

 ### Edge TTS notes
@ -31,9 +32,10 @@ does not publish limits, so assume similar or lower limits.

 ## Optional keys

-If you want OpenAI or ElevenLabs:
+If you want OpenAI, ElevenLabs, or Telnyx:
 - `ELEVENLABS_API_KEY` (or `XI_API_KEY`)
 - `OPENAI_API_KEY`
+- `TELNYX_API_KEY`

 Edge TTS does **not** require an API key. If no API keys are found, Moltbot defaults
 to Edge TTS (unless disabled via `messages.tts.edge.enabled=false`).
@ -202,9 +204,9 @@ Then run:
  - `tagged` only sends audio when the reply includes `[[tts]]` tags.
 - `enabled`: legacy toggle (doctor migrates this to `auto`).
 - `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
- `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic).
+- `provider`: `"elevenlabs"`, `"openai"`, `"telnyx"`, or `"edge"` (fallback is automatic).
 - If `provider` is **unset**, Moltbot prefers `openai` (if key), then `elevenlabs` (if key),
-  otherwise `edge`.
+  then `telnyx` (if key), otherwise `edge`.
 - `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`.
  - Accepts `provider/model` or a configured model alias.
 - `modelOverrides`: allow the model to emit TTS directives (on by default).
@ -250,7 +252,7 @@ Here you go.
 ```

 Available directive keys (when enabled):
- `provider` (`openai` | `elevenlabs` | `edge`)
+- `provider` (`openai` | `elevenlabs` | `telnyx` | `edge`)
 - `voice` (OpenAI voice) or `voiceId` (ElevenLabs)
 - `model` (OpenAI TTS model or ElevenLabs model id)
 - `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost`
--- a/src/auto-reply/reply/commands-tts.ts
+++ b/src/auto-reply/reply/commands-tts.ts
@ -50,7 +50,8 @@ function ttsUsage(): ReplyPayload {
      `**Providers:**\n` +
      `• edge — Free, fast (default)\n` +
      `• openai — High quality (requires API key)\n` +
-      `• elevenlabs — Premium voices (requires API key)\n\n` +
+      `• elevenlabs — Premium voices (requires API key)\n` +
+      `• telnyx — Great quality, cheaper than ElevenLabs (requires API key)\n\n` +
      `**Text Limit (default: 1500, max: 4096):**\n` +
      `When text exceeds the limit:\n` +
      `• Summary ON: AI summarizes, then generates audio\n` +
@ -151,6 +152,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
    if (!args.trim()) {
      const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
      const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
+      const hasTelnyx = Boolean(resolveTtsApiKey(config, "telnyx"));
      const hasEdge = isTtsProviderConfigured(config, "edge");
      return {
        shouldContinue: false,
@ -160,18 +162,24 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
            `Primary: ${currentProvider}\n` +
            `OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
            `ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
+            `Telnyx key: ${hasTelnyx ? "✅" : "❌"}\n` +
            `Edge enabled: ${hasEdge ? "✅" : "❌"}\n` +
-            `Usage: /tts provider openai | elevenlabs | edge`,
+            `Usage: /tts provider openai | elevenlabs | telnyx | edge`,
        },
      };
    }

    const requested = args.trim().toLowerCase();
-    if (requested !== "openai" && requested !== "elevenlabs" && requested !== "edge") {
+    if (
+      requested !== "openai" &&
+      requested !== "elevenlabs" &&
+      requested !== "telnyx" &&
+      requested !== "edge"
+    ) {
      return { shouldContinue: false, reply: ttsUsage() };
    }

-    setTtsProvider(prefsPath, requested);
+    setTtsProvider(prefsPath, requested as "openai" | "elevenlabs" | "telnyx" | "edge");
    return {
      shouldContinue: false,
      reply: { text: `✅ TTS provider set to ${requested}.` },
--- a/src/config/types.tts.ts
+++ b/src/config/types.tts.ts
@ -1,4 +1,4 @@
-export type TtsProvider = "elevenlabs" | "openai" | "edge";
+export type TtsProvider = "elevenlabs" | "openai" | "edge" | "telnyx";

 export type TtsMode = "final" | "all";

@ -73,6 +73,14 @@ export type TtsConfig = {
    proxy?: string;
    timeoutMs?: number;
  };
+  /** Telnyx TTS configuration. */
+  telnyx?: {
+    apiKey?: string;
+    /** Voice ID (e.g. "Telnyx.NaturalHD.astra", "Telnyx.Kokoro.af_heart"). */
+    voice?: string;
+    /** WebSocket inactivity timeout in seconds (default: 20). */
+    inactivityTimeout?: number;
+  };
  /** Optional path for local TTS user preferences JSON. */
  prefsPath?: string;
  /** Hard cap for text sent to TTS (chars). */
--- a/src/config/zod-schema.core.ts
+++ b/src/config/zod-schema.core.ts
@ -156,7 +156,7 @@ export const MarkdownConfigSchema = z
  .strict()
  .optional();

-export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]);
+export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge", "telnyx"]);
 export const TtsModeSchema = z.enum(["final", "all"]);
 export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
 export const TtsConfigSchema = z
@ -224,6 +224,14 @@ export const TtsConfigSchema = z
      })
      .strict()
      .optional(),
+    telnyx: z
+      .object({
+        apiKey: z.string().optional(),
+        voice: z.string().optional(),
+        inactivityTimeout: z.number().int().min(1).max(300).optional(),
+      })
+      .strict()
+      .optional(),
    prefsPath: z.string().optional(),
    maxTextLength: z.number().int().min(1).optional(),
    timeoutMs: z.number().int().min(1000).max(120000).optional(),
--- a/src/tts/tts.test.ts
+++ b/src/tts/tts.test.ts
@ -202,6 +202,14 @@ describe("tts", () => {
      expect(result.overrides.provider).toBe("edge");
    });

+    it("accepts telnyx as provider override", () => {
+      const policy = resolveModelOverridePolicy({ enabled: true });
+      const input = "Hello [[tts:provider=telnyx]] world";
+      const result = parseTtsDirectives(input, policy);
+
+      expect(result.overrides.provider).toBe("telnyx");
+    });
+
    it("keeps text intact when overrides are disabled", () => {
      const policy = resolveModelOverridePolicy({ enabled: false });
      const input = "Hello [[tts:voice=alloy]] world";
@ -426,6 +434,7 @@ describe("tts", () => {
          OPENAI_API_KEY: undefined,
          ELEVENLABS_API_KEY: undefined,
          XI_API_KEY: undefined,
+          TELNYX_API_KEY: undefined,
        },
        () => {
          const config = resolveTtsConfig(baseCfg);
@ -434,6 +443,22 @@ describe("tts", () => {
        },
      );
    });
+
+    it("prefers Telnyx when OpenAI and ElevenLabs are missing and Telnyx key exists", () => {
+      withEnv(
+        {
+          OPENAI_API_KEY: undefined,
+          ELEVENLABS_API_KEY: undefined,
+          XI_API_KEY: undefined,
+          TELNYX_API_KEY: "test-telnyx-key",
+        },
+        () => {
+          const config = resolveTtsConfig(baseCfg);
+          const provider = getTtsProvider(config, "/tmp/tts-prefs-telnyx.json");
+          expect(provider).toBe("telnyx");
+        },
+      );
+    });
  });

  describe("maybeApplyTtsToPayload", () => {
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@ -13,6 +13,7 @@ import path from "node:path";

 import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
 import { EdgeTTS } from "node-edge-tts";
+import WebSocket from "ws";

 import type { ReplyPayload } from "../auto-reply/types.js";
 import { normalizeChannelId } from "../channels/plugins/index.js";
@ -51,6 +52,9 @@ const DEFAULT_OPENAI_VOICE = "alloy";
 const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
 const DEFAULT_EDGE_LANG = "en-US";
 const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
+const DEFAULT_TELNYX_VOICE = "Telnyx.NaturalHD.astra";
+const DEFAULT_TELNYX_INACTIVITY_TIMEOUT = 20;
+const TELNYX_WS_URL = "wss://api.telnyx.com/v2/text-to-speech/speech";

 const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
  stability: 0.5,
@ -65,6 +69,7 @@ const TELEGRAM_OUTPUT = {
  // ElevenLabs output formats use codec_sample_rate_bitrate naming.
  // Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
  elevenlabs: "opus_48000_64",
+  telnyx: "mp3_16000" as const,
  extension: ".opus",
  voiceCompatible: true,
 };
@ -72,6 +77,7 @@ const TELEGRAM_OUTPUT = {
 const DEFAULT_OUTPUT = {
  openai: "mp3" as const,
  elevenlabs: "mp3_44100_128",
+  telnyx: "mp3_16000" as const,
  extension: ".mp3",
  voiceCompatible: false,
 };
@ -124,6 +130,11 @@ export type ResolvedTtsConfig = {
    proxy?: string;
    timeoutMs?: number;
  };
+  telnyx: {
+    apiKey?: string;
+    voice: string;
+    inactivityTimeout: number;
+  };
  prefsPath?: string;
  maxTextLength: number;
  timeoutMs: number;
@ -296,6 +307,11 @@ export function resolveTtsConfig(cfg: MoltbotConfig): ResolvedTtsConfig {
      proxy: raw.edge?.proxy?.trim() || undefined,
      timeoutMs: raw.edge?.timeoutMs,
    },
+    telnyx: {
+      apiKey: raw.telnyx?.apiKey,
+      voice: raw.telnyx?.voice?.trim() || DEFAULT_TELNYX_VOICE,
+      inactivityTimeout: raw.telnyx?.inactivityTimeout ?? DEFAULT_TELNYX_INACTIVITY_TIMEOUT,
+    },
    prefsPath: raw.prefsPath,
    maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
    timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
@ -412,6 +428,7 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt

  if (resolveTtsApiKey(config, "openai")) return "openai";
  if (resolveTtsApiKey(config, "elevenlabs")) return "elevenlabs";
+  if (resolveTtsApiKey(config, "telnyx")) return "telnyx";
  return "edge";
 }

@ -474,10 +491,13 @@ export function resolveTtsApiKey(
  if (provider === "openai") {
    return config.openai.apiKey || process.env.OPENAI_API_KEY;
  }
+  if (provider === "telnyx") {
+    return config.telnyx.apiKey || process.env.TELNYX_API_KEY;
+  }
  return undefined;
 }

-export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const;
+export const TTS_PROVIDERS = ["openai", "elevenlabs", "telnyx", "edge"] as const;

 export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
  return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)];
@ -485,6 +505,7 @@ export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {

 /**
  * Reports whether `provider` can be used with the given resolved config.
  *
  * Edge has no API key, so it is gated by `config.edge.enabled`. Every other
  * provider (including Telnyx) counts as configured when `resolveTtsApiKey`
  * yields a non-empty key for it.
  */
 export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean {
  if (provider === "edge") return config.edge.enabled;
  return Boolean(resolveTtsApiKey(config, provider));
 }

@ -587,7 +608,12 @@ function parseTtsDirectives(
        switch (key) {
          case "provider":
            if (!policy.allowProvider) break;
-            if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
+            if (
+              rawValue === "openai" ||
+              rawValue === "elevenlabs" ||
+              rawValue === "edge" ||
+              rawValue === "telnyx"
+            ) {
              overrides.provider = rawValue;
            } else {
              warnings.push(`unsupported provider "${rawValue}"`);
@ -1076,6 +1102,79 @@ async function edgeTTS(params: {
  await tts.ttsPromise(text, outputPath);
 }

+/**
+ * Synthesizes `text` through the Telnyx text-to-speech WebSocket endpoint and
+ * returns the audio as a single buffer.
+ *
+ * The socket is opened against `TELNYX_WS_URL` with the voice as a query
+ * parameter and the API key as a Bearer token. Once open, three JSON frames
+ * are sent back to back: an initialization frame (`" "`), the text itself, and
+ * an empty-string stop frame. Incoming JSON messages carrying a base64 `audio`
+ * field are decoded and collected; the promise settles when the socket closes.
+ *
+ * @param params.text - Text to synthesize.
+ * @param params.apiKey - Telnyx API key, sent in the `Authorization` header.
+ * @param params.voice - Voice identifier, sent as the `voice` query parameter.
+ * @param params.inactivityTimeout - WebSocket inactivity timeout; sent as
+ *   `inactivity_timeout` only when it differs from the built-in default.
+ * @param params.timeoutMs - Overall deadline for the request, in milliseconds.
+ * @returns The decoded audio chunks concatenated in arrival order.
+ * @throws Error when the deadline passes, the WebSocket reports an error, or
+ *   the socket closes before any audio was received.
+ */
+async function telnyxTTS(params: {
+  text: string;
+  apiKey: string;
+  voice: string;
+  inactivityTimeout: number;
+  timeoutMs: number;
+}): Promise<Buffer> {
+  const { text, apiKey, voice, inactivityTimeout, timeoutMs } = params;
+
+  return new Promise<Buffer>((resolve, reject) => {
+    const url = new URL(TELNYX_WS_URL);
+    url.searchParams.set("voice", voice);
+    if (inactivityTimeout !== DEFAULT_TELNYX_INACTIVITY_TIMEOUT) {
+      url.searchParams.set("inactivity_timeout", String(inactivityTimeout));
+    }
+
+    const ws = new WebSocket(url.toString(), {
+      headers: { Authorization: `Bearer ${apiKey}` },
+    });
+
+    const audioChunks: Buffer[] = [];
+    let completed = false;
+
+    // Single exit point: settles the promise at most once, stops the deadline
+    // timer, and makes sure the socket is really gone. `terminate()` destroys
+    // the connection immediately instead of waiting for a close handshake, and
+    // is a no-op when the socket is already closed.
+    const settle = (failure?: Error): void => {
+      if (completed) return;
+      completed = true;
+      clearTimeout(timeout);
+      ws.terminate();
+      if (failure) {
+        reject(failure);
+      } else if (audioChunks.length === 0) {
+        reject(new Error("Telnyx TTS returned no audio"));
+      } else {
+        resolve(Buffer.concat(audioChunks));
+      }
+    };
+
+    const timeout = setTimeout(() => {
+      settle(new Error("Telnyx TTS request timed out"));
+    }, timeoutMs);
+
+    ws.on("open", () => {
+      // Send initialization frame (required first)
+      ws.send(JSON.stringify({ text: " " }));
+      // Send text frame
+      ws.send(JSON.stringify({ text }));
+      // Send stop frame to signal completion
+      ws.send(JSON.stringify({ text: "" }));
+    });
+
+    ws.on("message", (data: Buffer | string) => {
+      try {
+        const message = JSON.parse(data.toString()) as { audio?: string };
+        if (message.audio) {
+          audioChunks.push(Buffer.from(message.audio, "base64"));
+        }
+      } catch {
+        // Ignore non-JSON messages
+      }
+    });
+
+    // A close without a prior failure means the server finished streaming.
+    ws.on("close", () => {
+      settle();
+    });
+
+    // The listener stays attached after settling so that late errors (for
+    // example from aborting a handshake on timeout) are swallowed, not thrown.
+    ws.on("error", (err) => {
+      settle(new Error(`Telnyx TTS WebSocket error: ${err.message}`));
+    });
+  });
+}
+
 export async function textToSpeech(params: {
  text: string;
  cfg: MoltbotConfig;
@ -1203,6 +1302,14 @@ export async function textToSpeech(params: {
          voiceSettings,
          timeoutMs: config.timeoutMs,
        });
+      } else if (provider === "telnyx") {
+        audioBuffer = await telnyxTTS({
+          text: params.text,
+          apiKey,
+          voice: config.telnyx.voice,
+          inactivityTimeout: config.telnyx.inactivityTimeout,
+          timeoutMs: config.timeoutMs,
+        });
      } else {
        const openaiModelOverride = params.overrides?.openai?.model;
        const openaiVoiceOverride = params.overrides?.openai?.voice;
@ -1228,7 +1335,12 @@ export async function textToSpeech(params: {
        audioPath,
        latencyMs,
        provider,
-        outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
+        outputFormat:
+          provider === "openai"
+            ? output.openai
+            : provider === "telnyx"
+              ? output.telnyx
+              : output.elevenlabs,
        voiceCompatible: output.voiceCompatible,
      };
    } catch (err) {
@ -1274,6 +1386,10 @@ export async function textToSpeechTelephony(params: {
        lastError = "edge: unsupported for telephony";
        continue;
      }
+      if (provider === "telnyx") {
+        lastError = "telnyx: WebSocket API outputs MP3, telephony requires PCM";
+        continue;
+      }

      const apiKey = resolveTtsApiKey(config, provider);
      if (!apiKey) {