diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f15cc3b0..342521f05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Status: unreleased. ### Changes - Rebrand: rename the npm package/CLI to `moltbot`, add a `moltbot` compatibility shim, and move extensions to the `@moltbot/*` scope. +- TTS: add Smallest AI provider (Lightning v3.1 TTS + Pulse STT) with native telephony support. - Commands: group /help and /commands output with Telegram paging. (#2504) Thanks @hougangdev. - macOS: limit project-local `node_modules/.bin` PATH preference to debug builds (reduce PATH hijacking risk). - macOS: finish Moltbot app rename for macOS sources, bundle identifiers, and shared kit paths. (#2844) Thanks @fal3. diff --git a/docs/tts.md b/docs/tts.md index c3899ac08..15b766815 100644 --- a/docs/tts.md +++ b/docs/tts.md @@ -8,13 +8,14 @@ read_when: # Text-to-speech (TTS) -Moltbot can convert outbound replies into audio using ElevenLabs, OpenAI, or Edge TTS. +Moltbot can convert outbound replies into audio using ElevenLabs, OpenAI, Smallest AI, or Edge TTS. It works anywhere Moltbot can send audio; Telegram gets a round voice-note bubble. ## Supported services - **ElevenLabs** (primary or fallback provider) - **OpenAI** (primary or fallback provider; also used for summaries) +- **Smallest AI** (primary or fallback provider; fast Lightning model, native telephony support) - **Edge TTS** (primary or fallback provider; uses `node-edge-tts`, default when no API keys) ### Edge TTS notes @@ -31,9 +32,10 @@ does not publish limits, so assume similar or lower limits. citeturn0searc ## Optional keys -If you want OpenAI or ElevenLabs: +If you want OpenAI, ElevenLabs, or Smallest AI: - `ELEVENLABS_API_KEY` (or `XI_API_KEY`) - `OPENAI_API_KEY` +- `SMALLEST_API_KEY` Edge TTS does **not** require an API key. If no API keys are found, Moltbot defaults to Edge TTS (unless disabled via `messages.tts.edge.enabled=false`). 
@@ -48,6 +50,7 @@ so that provider must also be authenticated if you enable summaries. - [OpenAI Audio API reference](https://platform.openai.com/docs/api-reference/audio) - [ElevenLabs Text to Speech](https://elevenlabs.io/docs/api-reference/text-to-speech) - [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication) +- [Smallest AI Waves TTS](https://waves-docs.smallest.ai/) - [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts) - [Microsoft Speech output formats](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs) @@ -115,6 +118,37 @@ Full schema is in [Gateway configuration](/gateway/configuration). } ``` +### Smallest AI primary (Lightning v3.1) + +```json5 +{ + messages: { + tts: { + auto: "always", + provider: "smallestai", + smallestai: { + voiceId: "lauren", // or "emily", "jasmine", "arman", custom ID + model: "lightning-v3.1", // "lightning-v3.1" (latest), "lightning", or "waves" + sampleRate: 24000, + outputFormat: "mp3", // "mp3", "wav", "pcm", or "mulaw" (telephony) + speed: 1.0, + language: "en", + consistency: 0.5, + similarity: 0, + enhancement: 1 + } + } + } +} +``` + +Smallest AI notes: +- `lightning-v3.1` is the latest model, optimized for low latency (ideal for real-time) +- `waves` model offers higher quality speech +- Native `mulaw` @ 8kHz support for telephony (no resampling needed) +- 30+ voices available: `lauren`, `emily`, `jasmine`, `arman`, `james`, `george`, `karen`, etc. +- Get full voice list: `curl -H "Authorization: Bearer $SMALLEST_API_KEY" https://waves-api.smallest.ai/api/v1/lightning/get_voices` + ### Edge TTS primary (no API key) ```json5 @@ -202,9 +236,9 @@ Then run: - `tagged` only sends audio when the reply includes `[[tts]]` tags. - `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). -- `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic). 
+- `provider`: `"elevenlabs"`, `"openai"`, `"smallestai"`, or `"edge"` (fallback is automatic). - If `provider` is **unset**, Moltbot prefers `openai` (if key), then `elevenlabs` (if key), - otherwise `edge`. + then `smallestai` (if key), otherwise `edge`. - `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`. - Accepts `provider/model` or a configured model alias. - `modelOverrides`: allow the model to emit TTS directives (on by default). diff --git a/src/agents/model-auth.ts b/src/agents/model-auth.ts index 96e4e4ae6..a6141c1ba 100644 --- a/src/agents/model-auth.ts +++ b/src/agents/model-auth.ts @@ -274,6 +274,7 @@ export function resolveEnvApiKey(provider: string): EnvApiKeyResult | null { google: "GEMINI_API_KEY", groq: "GROQ_API_KEY", deepgram: "DEEPGRAM_API_KEY", + smallestai: "SMALLEST_API_KEY", cerebras: "CEREBRAS_API_KEY", xai: "XAI_API_KEY", openrouter: "OPENROUTER_API_KEY", diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 4eb4989b9..e2de15968 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -1,4 +1,4 @@ -export type TtsProvider = "elevenlabs" | "openai" | "edge"; +export type TtsProvider = "elevenlabs" | "openai" | "edge" | "smallestai"; export type TtsMode = "final" | "all"; @@ -73,6 +73,29 @@ export type TtsConfig = { proxy?: string; timeoutMs?: number; }; + /** Smallest AI (Waves) configuration. */ + smallestai?: { + apiKey?: string; + baseUrl?: string; + /** Voice ID (e.g. "lauren", "emily", "jasmine", "arman", or custom voice ID). */ + voiceId?: string; + /** Model: "lightning-v3.1" (latest), "lightning", or "waves". */ + model?: "lightning-v3.1" | "lightning" | "waves"; + /** Sample rate in Hz (8000, 16000, 22050, 24000, 44100, 48000). */ + sampleRate?: number; + /** Output format: "mp3", "wav", "pcm" (raw), or "mulaw" (telephony). */ + outputFormat?: "mp3" | "wav" | "pcm" | "mulaw"; + /** Speed multiplier (0.5 to 2.0). 
*/ + speed?: number; + /** Language code (e.g. "en", "hi"). Default: "en". */ + language?: string; + /** Consistency (0 to 1). Default: 0.5. */ + consistency?: number; + /** Similarity (0 to 1). Default: 0. */ + similarity?: number; + /** Enhancement (0 or 1). Default: 1. */ + enhancement?: number; + }; /** Optional path for local TTS user preferences JSON. */ prefsPath?: string; /** Hard cap for text sent to TTS (chars). */ diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 4a8c80bcc..4670d582b 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -156,7 +156,7 @@ export const MarkdownConfigSchema = z .strict() .optional(); -export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]); +export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge", "smallestai"]); export const TtsModeSchema = z.enum(["final", "all"]); export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]); export const TtsConfigSchema = z @@ -224,6 +224,22 @@ export const TtsConfigSchema = z }) .strict() .optional(), + smallestai: z + .object({ + apiKey: z.string().optional(), + baseUrl: z.string().optional(), + voiceId: z.string().optional(), + model: z.enum(["lightning-v3.1", "lightning", "waves"]).optional(), + sampleRate: z.number().int().min(8000).max(48000).optional(), + outputFormat: z.enum(["mp3", "wav", "pcm", "mulaw"]).optional(), + speed: z.number().min(0.5).max(2).optional(), + language: z.string().optional(), + consistency: z.number().min(0).max(1).optional(), + similarity: z.number().min(0).max(1).optional(), + enhancement: z.number().min(0).max(1).optional(), + }) + .strict() + .optional(), prefsPath: z.string().optional(), maxTextLength: z.number().int().min(1).optional(), timeoutMs: z.number().int().min(1000).max(120000).optional(), diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts index b4e443d20..2d313c400 100644 --- 
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";

/** Base URL used when the request does not carry its own. */
export const DEFAULT_SMALLEST_AUDIO_BASE_URL = "https://waves-api.smallest.ai/api/v1";
/** Smallest AI Pulse STT model identifier. */
export const DEFAULT_SMALLEST_AUDIO_MODEL = "pulse";

/** Content-Type sent when the request has no (or a blank) MIME type. */
const FALLBACK_AUDIO_MIME = "audio/wav";
/** Language query value sent when the request has no (or a blank) language. */
const FALLBACK_LANGUAGE = "en";

/** Fields this module reads from (or tolerates in) the Pulse response body. */
type SmallestTranscriptResponse = {
  status?: string;
  transcription?: string;
  text?: string;
  audio_length?: number;
  metadata?: {
    duration?: number;
    fileSize?: number;
  };
};

/**
 * Transcribe audio using Smallest AI's Pulse STT API.
 *
 * Sends `POST {baseUrl}/pulse/get_text?model=…&language=…` with the raw audio
 * bytes as the request body and the audio MIME type as `Content-Type`.
 * A caller-supplied `authorization` header wins; otherwise a Bearer header is
 * built from `params.apiKey`.
 *
 * NOTE(review): the model is always DEFAULT_SMALLEST_AUDIO_MODEL; if the request
 * type carries a caller-selected model it is not consulted here — confirm that is intended.
 *
 * @param params - Transcription request (audio buffer, credentials, optional overrides).
 * @returns The trimmed transcript and the model name that produced it.
 * @throws Error when the HTTP status is not OK, the body is not JSON, or the
 *   response contains no non-empty `transcription`/`text` string.
 * @see https://waves-docs.smallest.ai/v4.0.0/content/api-references/pulse-stt
 */
export async function transcribeSmallestAiAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_SMALLEST_AUDIO_BASE_URL);

  const queryParams = new URLSearchParams({
    model: DEFAULT_SMALLEST_AUDIO_MODEL,
    language: params.language?.trim() || FALLBACK_LANGUAGE,
  });
  const url = `${baseUrl}/pulse/get_text?${queryParams.toString()}`;

  const headers = new Headers(params.headers);
  // `||` rather than `??`: a blank MIME string must not become an empty Content-Type.
  headers.set("Content-Type", params.mime?.trim() || FALLBACK_AUDIO_MIME);
  if (!headers.has("authorization")) {
    headers.set("authorization", `Bearer ${params.apiKey}`);
  }

  const res = await fetchWithTimeout(
    url,
    {
      method: "POST",
      headers,
      // Hand fetch a plain Uint8Array copy of the audio buffer.
      body: new Uint8Array(params.buffer),
    },
    params.timeoutMs,
    fetchFn,
  );

  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Smallest AI Pulse STT failed (HTTP ${res.status})${suffix}`);
  }

  let payload: SmallestTranscriptResponse | null;
  try {
    payload = (await res.json()) as SmallestTranscriptResponse | null;
  } catch {
    // Surface a provider-specific message instead of a bare JSON SyntaxError.
    throw new Error("Smallest AI Pulse STT returned a non-JSON response");
  }

  const rawTranscript = payload?.transcription ?? payload?.text;
  // The payload is untyped at runtime; only call string methods on an actual string.
  const transcript = typeof rawTranscript === "string" ? rawTranscript.trim() : "";
  if (!transcript) {
    throw new Error("Smallest AI Pulse STT response missing transcription");
  }
  return { text: transcript, model: DEFAULT_SMALLEST_AUDIO_MODEL };
}
text intact when overrides are disabled", () => { const policy = resolveModelOverridePolicy({ enabled: false }); const input = "Hello [[tts:voice=alloy]] world"; @@ -359,7 +367,12 @@ describe("tts", () => { }; const restoreEnv = (snapshot: Record) => { - const keys = ["OPENAI_API_KEY", "ELEVENLABS_API_KEY", "XI_API_KEY"] as const; + const keys = [ + "OPENAI_API_KEY", + "ELEVENLABS_API_KEY", + "XI_API_KEY", + "SMALLEST_API_KEY", + ] as const; for (const key of keys) { const value = snapshot[key]; if (value === undefined) { @@ -375,6 +388,7 @@ describe("tts", () => { OPENAI_API_KEY: process.env.OPENAI_API_KEY, ELEVENLABS_API_KEY: process.env.ELEVENLABS_API_KEY, XI_API_KEY: process.env.XI_API_KEY, + SMALLEST_API_KEY: process.env.SMALLEST_API_KEY, }; try { for (const [key, value] of Object.entries(env)) { @@ -426,6 +440,7 @@ describe("tts", () => { OPENAI_API_KEY: undefined, ELEVENLABS_API_KEY: undefined, XI_API_KEY: undefined, + SMALLEST_API_KEY: undefined, }, () => { const config = resolveTtsConfig(baseCfg); @@ -434,6 +449,22 @@ describe("tts", () => { }, ); }); + + it("prefers Smallest AI when OpenAI and ElevenLabs are missing and Smallest key exists", () => { + withEnv( + { + OPENAI_API_KEY: undefined, + ELEVENLABS_API_KEY: undefined, + XI_API_KEY: undefined, + SMALLEST_API_KEY: "test-smallest-key", + }, + () => { + const config = resolveTtsConfig(baseCfg); + const provider = getTtsProvider(config, "/tmp/tts-prefs-smallest.json"); + expect(provider).toBe("smallestai"); + }, + ); + }); }); describe("maybeApplyTtsToPayload", () => { diff --git a/src/tts/tts.ts b/src/tts/tts.ts index af3d7fda5..09a02ad17 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -52,6 +52,19 @@ const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural"; const DEFAULT_EDGE_LANG = "en-US"; const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; +// Smallest AI defaults (Lightning v3.1) +// See: https://waves-docs.smallest.ai/ +const DEFAULT_SMALLESTAI_BASE_URL = 
"https://waves-api.smallest.ai/api/v1"; +const DEFAULT_SMALLESTAI_VOICE_ID = "lauren"; +const DEFAULT_SMALLESTAI_MODEL = "lightning-v3.1" as const; +const DEFAULT_SMALLESTAI_SAMPLE_RATE = 24000; +const DEFAULT_SMALLESTAI_OUTPUT_FORMAT = "mp3" as const; +const DEFAULT_SMALLESTAI_SPEED = 1.0; +const DEFAULT_SMALLESTAI_CONSISTENCY = 0.5; +const DEFAULT_SMALLESTAI_SIMILARITY = 0; +const DEFAULT_SMALLESTAI_ENHANCEMENT = 1; +const DEFAULT_SMALLESTAI_LANGUAGE = "en"; + const DEFAULT_ELEVENLABS_VOICE_SETTINGS = { stability: 0.5, similarityBoost: 0.75, @@ -79,6 +92,8 @@ const DEFAULT_OUTPUT = { const TELEPHONY_OUTPUT = { openai: { format: "pcm" as const, sampleRate: 24000 }, elevenlabs: { format: "pcm_22050", sampleRate: 22050 }, + // Smallest AI natively supports mulaw@8kHz - perfect for telephony! + smallestai: { format: "mulaw" as const, sampleRate: 8000 }, }; const TTS_AUTO_MODES = new Set(["off", "always", "inbound", "tagged"]); @@ -124,6 +139,19 @@ export type ResolvedTtsConfig = { proxy?: string; timeoutMs?: number; }; + smallestai: { + apiKey?: string; + baseUrl: string; + voiceId: string; + model: "lightning-v3.1" | "lightning" | "waves"; + sampleRate: number; + outputFormat: "mp3" | "wav" | "pcm" | "mulaw"; + speed: number; + language: string; + consistency: number; + similarity: number; + enhancement: number; + }; prefsPath?: string; maxTextLength: number; timeoutMs: number; @@ -296,6 +324,19 @@ export function resolveTtsConfig(cfg: MoltbotConfig): ResolvedTtsConfig { proxy: raw.edge?.proxy?.trim() || undefined, timeoutMs: raw.edge?.timeoutMs, }, + smallestai: { + apiKey: raw.smallestai?.apiKey, + baseUrl: raw.smallestai?.baseUrl?.trim() || DEFAULT_SMALLESTAI_BASE_URL, + voiceId: raw.smallestai?.voiceId?.trim() || DEFAULT_SMALLESTAI_VOICE_ID, + model: raw.smallestai?.model ?? DEFAULT_SMALLESTAI_MODEL, + sampleRate: raw.smallestai?.sampleRate ?? DEFAULT_SMALLESTAI_SAMPLE_RATE, + outputFormat: raw.smallestai?.outputFormat ?? 
/**
 * Report whether a TTS provider can be used with the resolved configuration.
 *
 * Edge needs no API key, so it is governed solely by `config.edge.enabled`.
 * Every other provider (including "smallestai") is considered configured
 * when `resolveTtsApiKey` yields a truthy key for it.
 *
 * @param config - Resolved TTS configuration.
 * @param provider - Provider to check.
 * @returns `true` when the provider is usable, `false` otherwise.
 */
export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean {
  if (provider === "edge") return config.edge.enabled;
  return Boolean(resolveTtsApiKey(config, provider));
}
/** Sample rates (Hz) that smallestAiTTS accepts before sending a request. */
const SMALLESTAI_VALID_SAMPLE_RATES: readonly number[] = [8000, 16000, 22050, 24000, 44100, 48000];

/** Path under the base URL used for each Smallest AI model. */
const SMALLESTAI_SPEECH_ENDPOINTS = {
  "lightning-v3.1": "lightning-v3.1/get_speech",
  lightning: "lightning/get_speech",
  waves: "waves/get_speech",
} as const;

/**
 * Smallest AI (Waves) TTS provider.
 *
 * Validates the arguments, POSTs a JSON request to the model's `get_speech`
 * endpoint with a Bearer token, and returns the response body as a Buffer.
 * The request is aborted after `timeoutMs`.
 *
 * Request shape depends on the model:
 * - `lightning-v3.1` sends `output_format`, `language`, `consistency`,
 *   `similarity` and `enhancement`.
 * - `lightning` / `waves` send `add_wav_header` (true only for "wav") and, for
 *   "mulaw", `encoding: "pcm_mulaw"`. That body has no field that can express
 *   "mp3", so the combination is rejected up front rather than returning audio
 *   the caller would mislabel.
 *
 * @param params - Synthesis arguments; `language`, `consistency`, `similarity`
 *   and `enhancement` fall back to the module defaults when omitted.
 * @returns The audio bytes returned by the API.
 * @throws Error when `speed` is not a number in [0.5, 2.0], `sampleRate` is not
 *   one of the accepted rates, "mp3" is requested for a non-v3.1 model, the API
 *   answers with a non-OK status, or the response body is empty.
 * @see https://waves-docs.smallest.ai/
 */
async function smallestAiTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  voiceId: string;
  model: "lightning-v3.1" | "lightning" | "waves";
  sampleRate: number;
  outputFormat: "mp3" | "wav" | "pcm" | "mulaw";
  speed: number;
  language?: string;
  consistency?: number;
  similarity?: number;
  enhancement?: number;
  timeoutMs: number;
}): Promise<Buffer> {
  const {
    text,
    apiKey,
    baseUrl,
    voiceId,
    model,
    sampleRate,
    outputFormat,
    speed,
    language = DEFAULT_SMALLESTAI_LANGUAGE,
    consistency = DEFAULT_SMALLESTAI_CONSISTENCY,
    similarity = DEFAULT_SMALLESTAI_SIMILARITY,
    enhancement = DEFAULT_SMALLESTAI_ENHANCEMENT,
    timeoutMs,
  } = params;

  // Written as a negated in-range test so NaN (which fails every comparison) is rejected too.
  if (!(speed >= 0.5 && speed <= 2.0)) {
    throw new Error("Smallest AI speed must be between 0.5 and 2.0");
  }

  if (!SMALLESTAI_VALID_SAMPLE_RATES.includes(sampleRate)) {
    throw new Error(
      `Invalid sample rate: ${sampleRate}. Valid: ${SMALLESTAI_VALID_SAMPLE_RATES.join(", ")}`,
    );
  }

  const usesV31Schema = model === "lightning-v3.1";
  if (!usesV31Schema && outputFormat === "mp3") {
    throw new Error(
      `Smallest AI model "${model}" cannot be asked for "mp3" output; ` +
        `use "wav", "pcm" or "mulaw", or switch to "lightning-v3.1"`,
    );
  }

  const url = `${baseUrl.replace(/\/+$/, "")}/${SMALLESTAI_SPEECH_ENDPOINTS[model]}`;

  const body: Record<string, unknown> = {
    text,
    voice_id: voiceId,
    sample_rate: sampleRate,
    speed,
  };
  if (usesV31Schema) {
    // v3.1 takes the output format directly plus the extra voice controls.
    body.output_format = outputFormat;
    body.language = language;
    body.consistency = consistency;
    body.similarity = similarity;
    body.enhancement = enhancement;
  } else {
    // Legacy models: WAV header toggle, and an explicit encoding only for mulaw.
    body.add_wav_header = outputFormat === "wav";
    if (outputFormat === "mulaw") {
      body.encoding = "pcm_mulaw";
    }
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const response = await fetch(url, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(body),
      signal: controller.signal,
    });

    if (!response.ok) {
      const errorText = await response.text().catch(() => "");
      throw new Error(`Smallest AI TTS API error (${response.status}): ${errorText}`);
    }

    const audio = Buffer.from(await response.arrayBuffer());
    if (audio.length === 0) {
      // An empty body is never playable audio; fail so the caller can react.
      throw new Error("Smallest AI TTS API returned no audio data");
    }
    return audio;
  } finally {
    clearTimeout(timeout);
  }
}
modelIdOverride = params.overrides?.elevenlabs?.modelId; @@ -1195,6 +1355,33 @@ export async function textToSpeech(params: { voiceSettings, timeoutMs: config.timeoutMs, }); + } else if (provider === "smallestai") { + // Choose output format based on config (mp3 is default for v3.1) + const smallestOutputFormat = config.smallestai.outputFormat; + audioBuffer = await smallestAiTTS({ + text: params.text, + apiKey, + baseUrl: config.smallestai.baseUrl, + voiceId: config.smallestai.voiceId, + model: config.smallestai.model, + sampleRate: config.smallestai.sampleRate, + outputFormat: smallestOutputFormat, + speed: config.smallestai.speed, + language: config.smallestai.language, + consistency: config.smallestai.consistency, + similarity: config.smallestai.similarity, + enhancement: config.smallestai.enhancement, + timeoutMs: config.timeoutMs, + }); + // Determine extension based on output format + outputExtension = + smallestOutputFormat === "mp3" + ? ".mp3" + : smallestOutputFormat === "wav" + ? ".wav" + : smallestOutputFormat === "mulaw" || smallestOutputFormat === "pcm" + ? ".raw" + : ".wav"; // Default to .wav } else { const openaiModelOverride = params.overrides?.openai?.model; const openaiVoiceOverride = params.overrides?.openai?.voice; @@ -1211,7 +1398,7 @@ export async function textToSpeech(params: { const latencyMs = Date.now() - providerStart; const tempDir = mkdtempSync(path.join(tmpdir(), "tts-")); - const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`); + const audioPath = path.join(tempDir, `voice-${Date.now()}${outputExtension}`); writeFileSync(audioPath, audioBuffer); scheduleCleanup(tempDir); @@ -1220,7 +1407,12 @@ export async function textToSpeech(params: { audioPath, latencyMs, provider, - outputFormat: provider === "openai" ? output.openai : output.elevenlabs, + outputFormat: + provider === "openai" + ? output.openai + : provider === "smallestai" + ? 
config.smallestai.outputFormat + : output.elevenlabs, voiceCompatible: output.voiceCompatible, }; } catch (err) { @@ -1299,6 +1491,35 @@ export async function textToSpeechTelephony(params: { }; } + // Smallest AI natively supports mulaw@8kHz - ideal for telephony! + if (provider === "smallestai") { + const output = TELEPHONY_OUTPUT.smallestai; + const audioBuffer = await smallestAiTTS({ + text: params.text, + apiKey, + baseUrl: config.smallestai.baseUrl, + voiceId: config.smallestai.voiceId, + model: config.smallestai.model, + sampleRate: output.sampleRate, + outputFormat: output.format, + speed: config.smallestai.speed, + language: config.smallestai.language, + consistency: config.smallestai.consistency, + similarity: config.smallestai.similarity, + enhancement: config.smallestai.enhancement, + timeoutMs: config.timeoutMs, + }); + + return { + success: true, + audioBuffer, + latencyMs: Date.now() - providerStart, + provider, + outputFormat: output.format, + sampleRate: output.sampleRate, + }; + } + const output = TELEPHONY_OUTPUT.openai; const audioBuffer = await openaiTTS({ text: params.text,