diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md index 6bb13fb54..dc49c16c8 100644 --- a/docs/nodes/media-understanding.md +++ b/docs/nodes/media-understanding.md @@ -120,7 +120,7 @@ working option**: - `whisper` (Python CLI; downloads models automatically) 2) **Gemini CLI** (`gemini`) using `read_many_files` 3) **Provider keys** - - Audio: OpenAI → Groq → Deepgram → Google + - Audio: OpenAI → Groq → Deepgram → Google → Telnyx - Image: OpenAI → Anthropic → Google → MiniMax - Video: Google @@ -145,6 +145,7 @@ lists, Moltbot can infer defaults: - `google` (Gemini API): **image + audio + video** - `groq`: **audio** - `deepgram`: **audio** +- `telnyx`: **audio** For CLI entries, **set `capabilities` explicitly** to avoid surprising matches. If you omit `capabilities`, the entry is eligible for the list it appears in. @@ -153,7 +154,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. | Capability | Provider integration | Notes | |------------|----------------------|-------| | Image | OpenAI / Anthropic / Google / others via `pi-ai` | Any image-capable model in the registry works. | -| Audio | OpenAI, Groq, Deepgram, Google | Provider transcription (Whisper/Deepgram/Gemini). | +| Audio | OpenAI, Groq, Deepgram, Google, Telnyx | Provider transcription (Whisper/Deepgram/Gemini/Telnyx). | | Video | Google (Gemini API) | Provider video understanding. | ## Recommended providers @@ -162,9 +163,10 @@ If you omit `capabilities`, the entry is eligible for the list it appears in. - Good defaults: `openai/gpt-5.2`, `anthropic/claude-opus-4-5`, `google/gemini-3-pro-preview`. **Audio** -- `openai/gpt-4o-mini-transcribe`, `groq/whisper-large-v3-turbo`, or `deepgram/nova-3`. +- `openai/gpt-4o-mini-transcribe`, `groq/whisper-large-v3-turbo`, `deepgram/nova-3`, or `telnyx/openai/whisper-large-v3-turbo`. - CLI fallback: `whisper-cli` (whisper-cpp) or `whisper`. 
- Deepgram setup: [Deepgram (audio transcription)](/providers/deepgram). +- Telnyx setup: [Telnyx (audio transcription)](/providers/telnyx). **Video** - `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer). diff --git a/docs/providers/index.md b/docs/providers/index.md index c18ad70fb..b217958c1 100644 --- a/docs/providers/index.md +++ b/docs/providers/index.md @@ -50,6 +50,7 @@ See [Venice AI](/providers/venice). ## Transcription providers - [Deepgram (audio transcription)](/providers/deepgram) +- [Telnyx (audio transcription)](/providers/telnyx) ## Community tools diff --git a/docs/providers/telnyx.md b/docs/providers/telnyx.md new file mode 100644 index 000000000..ec0d99196 --- /dev/null +++ b/docs/providers/telnyx.md @@ -0,0 +1,73 @@ +--- +summary: "Telnyx speech-to-text for inbound voice notes" +read_when: + - You want Telnyx speech-to-text for audio attachments + - You need a quick Telnyx STT config example +--- +# Telnyx (Audio Transcription) + +Telnyx provides speech-to-text via their AI API powered by Whisper. In Moltbot it is used +for **inbound audio/voice note transcription** via `tools.media.audio`. + +When enabled, Moltbot uploads the audio file to Telnyx and injects the transcript +into the reply pipeline (`{{Transcript}}` + `[Audio]` block). This is **not streaming**; +it uses the pre-recorded transcription endpoint. + +Website: https://telnyx.com +Docs: https://developers.telnyx.com/docs/voice/programmable-voice/stt-standalone + +## Quick start + +1) Set your API key: +``` +TELNYX_API_KEY=KEY... 
+``` + +2) Enable the provider: +```json5 +{ + tools: { + media: { + audio: { + enabled: true, + models: [{ provider: "telnyx", model: "openai/whisper-large-v3-turbo" }] + } + } + } +} +``` + +## Options + +- `model`: Telnyx model id (default: `openai/whisper-large-v3-turbo`) +- `language`: language hint (optional) + +Example with language: +```json5 +{ + tools: { + media: { + audio: { + enabled: true, + models: [ + { provider: "telnyx", model: "openai/whisper-large-v3-turbo", language: "en" } + ] + } + } + } +} +``` + +## Available models + +Telnyx offers Whisper-based transcription models: + +- `openai/whisper-large-v3-turbo` (default) - Fast, high-quality transcription +- `openai/whisper-large-v3` - Higher accuracy, slightly slower + +## Notes + +- Authentication follows the standard provider auth order; `TELNYX_API_KEY` is the simplest path. +- The API follows OpenAI's transcription format, making it compatible with existing tooling. +- Override endpoints or headers with `tools.media.audio.baseUrl` and `tools.media.audio.headers` when using a proxy. +- Output follows the same audio rules as other providers (size caps, timeouts, transcript injection). 
diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts
index b4e443d20..802a680ab 100644
--- a/src/media-understanding/defaults.ts
+++ b/src/media-understanding/defaults.ts
@@ -31,6 +31,7 @@ export const DEFAULT_AUDIO_MODELS: Record = {
   groq: "whisper-large-v3-turbo",
   openai: "gpt-4o-mini-transcribe",
   deepgram: "nova-3",
+  telnyx: "openai/whisper-large-v3-turbo", // Default audio model id for the telnyx provider.
 };
 export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
 export const DEFAULT_MEDIA_CONCURRENCY = 2;
diff --git a/src/media-understanding/providers/index.ts b/src/media-understanding/providers/index.ts
index a20ba92fb..e9e6f13bc 100644
--- a/src/media-understanding/providers/index.ts
+++ b/src/media-understanding/providers/index.ts
@@ -6,6 +6,7 @@ import { googleProvider } from "./google/index.js";
 import { groqProvider } from "./groq/index.js";
 import { minimaxProvider } from "./minimax/index.js";
 import { openaiProvider } from "./openai/index.js";
+import { telnyxProvider } from "./telnyx/index.js";
 
 const PROVIDERS: MediaUnderstandingProvider[] = [
   groqProvider,
@@ -14,6 +15,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [
   anthropicProvider,
   minimaxProvider,
   deepgramProvider,
+  telnyxProvider, // Adds the Telnyx audio provider to the PROVIDERS list.
 ];
 
 export function normalizeMediaProviderId(id: string): string {
diff --git a/src/media-understanding/providers/telnyx/index.test.ts b/src/media-understanding/providers/telnyx/index.test.ts
new file mode 100644
index 000000000..38f0e23f3
--- /dev/null
+++ b/src/media-understanding/providers/telnyx/index.test.ts
@@ -0,0 +1,83 @@
+import { describe, expect, it } from "vitest";
+
+import { telnyxProvider } from "./index.js";
+
+const resolveRequestUrl = (input: RequestInfo | URL) => { // Reduces a fetch input (string, URL, or request object) to its URL string.
+  if (typeof input === "string") return input;
+  if (input instanceof URL) return input.toString();
+  return input.url;
+};
+
+describe("telnyxProvider", () => { // Unit tests driven through an injected fetchFn stub, so no network access is needed.
+  it("has correct id and capabilities", () => {
+    expect(telnyxProvider.id).toBe("telnyx");
+    expect(telnyxProvider.capabilities).toEqual(["audio"]);
+    expect(telnyxProvider.transcribeAudio).toBeDefined();
+  });
+
+  it("uses the correct Telnyx API base URL", async () => {
+    let seenUrl: string | null = null;
+    const fetchFn = async (input: RequestInfo | URL, _init?: RequestInit) => { // Stub fetch: records the requested URL and answers with a canned JSON transcript.
+      seenUrl = resolveRequestUrl(input);
+      return new Response(JSON.stringify({ text: "transcribed text" }), {
+        status: 200,
+        headers: { "content-type": "application/json" },
+      });
+    };
+
+    const result = await telnyxProvider.transcribeAudio!({ // No baseUrl here, so the provider default must be applied.
+      buffer: Buffer.from("audio-bytes"),
+      fileName: "voice.ogg",
+      apiKey: "test-telnyx-key",
+      timeoutMs: 5000,
+      fetchFn,
+    });
+
+    expect(seenUrl).toBe("https://api.telnyx.com/v2/ai/audio/transcriptions"); // Provider default base URL followed by /audio/transcriptions.
+    expect(result.text).toBe("transcribed text");
+  });
+
+  it("allows overriding the base URL", async () => {
+    let seenUrl: string | null = null;
+    const fetchFn = async (input: RequestInfo | URL, _init?: RequestInit) => { // Stub fetch: only the requested URL matters for this case.
+      seenUrl = resolveRequestUrl(input);
+      return new Response(JSON.stringify({ text: "ok" }), {
+        status: 200,
+        headers: { "content-type": "application/json" },
+      });
+    };
+
+    await telnyxProvider.transcribeAudio!({
+      buffer: Buffer.from("audio"),
+      fileName: "note.mp3",
+      apiKey: "test-key",
+      timeoutMs: 1000,
+      baseUrl: "https://custom.telnyx.example/v1", // Explicit override that should replace the provider default.
+      fetchFn,
+    });
+
+    expect(seenUrl).toBe("https://custom.telnyx.example/v1/audio/transcriptions");
+  });
+
+  it("sends the correct authorization header", async () => {
+    let seenAuth: string | null = null;
+    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => { // Stub fetch: captures the Authorization header from the request init.
+      const headers = new Headers(init?.headers); // Wrap in Headers so any accepted header shape can be read with get().
+      seenAuth = headers.get("authorization");
+      return new Response(JSON.stringify({ text: "ok" }), {
+        status: 200,
+        headers: { "content-type": "application/json" },
+      });
+    };
+
+    await telnyxProvider.transcribeAudio!({
+      buffer: Buffer.from("audio"),
+      fileName: "note.mp3",
+      apiKey: "KEY_TELNYX_12345",
+      timeoutMs: 1000,
+      fetchFn,
+    });
+
+    expect(seenAuth).toBe("Bearer KEY_TELNYX_12345"); // The API key is expected as a Bearer token.
+  });
+});
diff --git a/src/media-understanding/providers/telnyx/index.ts b/src/media-understanding/providers/telnyx/index.ts
new file mode 100644
index 000000000..2f387deab
--- /dev/null
+++ b/src/media-understanding/providers/telnyx/index.ts
@@ -0,0 +1,14 @@
+import type { MediaUnderstandingProvider } from "../../types.js";
+import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";
+
+const DEFAULT_TELNYX_AUDIO_BASE_URL = "https://api.telnyx.com/v2/ai"; // Base URL applied when a request carries no baseUrl of its own.
+
+export const telnyxProvider: MediaUnderstandingProvider = { // Audio-only provider that delegates to the shared OpenAI-compatible transcription helper.
+  id: "telnyx",
+  capabilities: ["audio"],
+  transcribeAudio: (req) =>
+    transcribeOpenAiCompatibleAudio({
+      ...req, // Forward every request field unchanged; only baseUrl is defaulted below.
+      baseUrl: req.baseUrl ?? DEFAULT_TELNYX_AUDIO_BASE_URL, // A caller-supplied baseUrl wins; fall back only on null or undefined.
+    }),
+};
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index ffc6e4d64..1d26efed7 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -49,7 +49,7 @@ import {
 import { describeImageWithModel } from "./providers/image.js";
 import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
 
-const AUTO_AUDIO_KEY_PROVIDERS = ["openai", "groq", "deepgram", "google"] as const;
+const AUTO_AUDIO_KEY_PROVIDERS = ["openai", "groq", "deepgram", "google", "telnyx"] as const; // telnyx is appended last; NOTE(review): docs list this as the auto-detect order, confirm the runner walks the array in order.
 const AUTO_IMAGE_KEY_PROVIDERS = ["openai", "anthropic", "google", "minimax"] as const;
 const AUTO_VIDEO_KEY_PROVIDERS = ["google"] as const;
 const DEFAULT_IMAGE_MODELS: Record = {