This commit is contained in:
Tofu 2026-01-29 11:33:29 -05:00 committed by GitHub
commit b220fcf108
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 434 additions and 12 deletions

View File

@ -7,6 +7,7 @@ Status: beta.
### Changes
- Rebrand: rename the npm package/CLI to `moltbot`, add a `moltbot` compatibility shim, and move extensions to the `@moltbot/*` scope.
- TTS: add Smallest AI provider (Lightning v3.1 TTS + Pulse STT) with native telephony support.
- Commands: group /help and /commands output with Telegram paging. (#2504) Thanks @hougangdev.
- macOS: limit project-local `node_modules/.bin` PATH preference to debug builds (reduce PATH hijacking risk).
- macOS: finish Moltbot app rename for macOS sources, bundle identifiers, and shared kit paths. (#2844) Thanks @fal3.

View File

@ -8,13 +8,14 @@ read_when:
# Text-to-speech (TTS)
Moltbot can convert outbound replies into audio using ElevenLabs, OpenAI, or Edge TTS.
Moltbot can convert outbound replies into audio using ElevenLabs, OpenAI, Smallest AI, or Edge TTS.
It works anywhere Moltbot can send audio; Telegram gets a round voice-note bubble.
## Supported services
- **ElevenLabs** (primary or fallback provider)
- **OpenAI** (primary or fallback provider; also used for summaries)
- **Smallest AI** (primary or fallback provider; fast Lightning model, native telephony support)
- **Edge TTS** (primary or fallback provider; uses `node-edge-tts`, default when no API keys)
### Edge TTS notes
@ -31,9 +32,10 @@ does not publish limits, so assume similar or lower limits.
## Optional keys
If you want OpenAI or ElevenLabs:
If you want OpenAI, ElevenLabs, or Smallest AI:
- `ELEVENLABS_API_KEY` (or `XI_API_KEY`)
- `OPENAI_API_KEY`
- `SMALLEST_API_KEY`
Edge TTS does **not** require an API key. If no API keys are found, Moltbot defaults
to Edge TTS (unless disabled via `messages.tts.edge.enabled=false`).
@ -48,6 +50,7 @@ so that provider must also be authenticated if you enable summaries.
- [OpenAI Audio API reference](https://platform.openai.com/docs/api-reference/audio)
- [ElevenLabs Text to Speech](https://elevenlabs.io/docs/api-reference/text-to-speech)
- [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication)
- [Smallest AI Waves TTS](https://waves-docs.smallest.ai/)
- [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts)
- [Microsoft Speech output formats](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs)
@ -115,6 +118,42 @@ Full schema is in [Gateway configuration](/gateway/configuration).
}
```
### Smallest AI primary (Lightning v3.1)
```json5
{
messages: {
tts: {
auto: "always",
provider: "smallestai",
smallestai: {
voiceId: "lauren", // or "emily", "jasmine", "arman", custom ID
model: "lightning-v3.1", // "lightning-v3.1" (latest), "lightning", or "waves"
sampleRate: 24000,
outputFormat: "mp3", // "mp3", "wav", "pcm", or "mulaw" (telephony)
speed: 1.0,
language: "en",
consistency: 0.5,
similarity: 0,
enhancement: 1
}
}
}
}
```
Smallest AI notes:
- `lightning-v3.1` is the latest model, optimized for low latency (ideal for real-time use)
- `waves` model offers higher quality speech
- Native `mulaw` @ 8kHz support for telephony (no resampling needed)
- 20+ voices available:
- **US Female:** `sophia`, `sandra`, `rachel`, `lauren`, `hannah`, `vanessa`, `brooke`, `megan`
- **US Male:** `robert`, `johnny`, `ethan`, `lucas`, `daniel`
- **British Male:** `edward`
- **Indian Female:** `advika`, `aisha`, `yuvika`, `ishani`, `anuja`
- **Indian Male:** `vaibhav`, `hitesh`, `gaurav`, `vivaan`, `arjun`, `kunal`, `siddharth`
- Get full voice list: `curl -H "Authorization: Bearer $SMALLEST_API_KEY" https://waves-api.smallest.ai/api/v1/lightning/get_voices`
### Edge TTS primary (no API key)
```json5
@ -202,9 +241,9 @@ Then run:
- `tagged` only sends audio when the reply includes `[[tts]]` tags.
- `enabled`: legacy toggle (doctor migrates this to `auto`).
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
- `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic).
- `provider`: `"elevenlabs"`, `"openai"`, `"smallestai"`, or `"edge"` (fallback is automatic).
- If `provider` is **unset**, Moltbot prefers `openai` (if key), then `elevenlabs` (if key),
otherwise `edge`.
then `smallestai` (if key), otherwise `edge`.
- `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`.
- Accepts `provider/model` or a configured model alias.
- `modelOverrides`: allow the model to emit TTS directives (on by default).

View File

@ -274,6 +274,7 @@ export function resolveEnvApiKey(provider: string): EnvApiKeyResult | null {
google: "GEMINI_API_KEY",
groq: "GROQ_API_KEY",
deepgram: "DEEPGRAM_API_KEY",
smallestai: "SMALLEST_API_KEY",
cerebras: "CEREBRAS_API_KEY",
xai: "XAI_API_KEY",
openrouter: "OPENROUTER_API_KEY",

View File

@ -1,4 +1,4 @@
export type TtsProvider = "elevenlabs" | "openai" | "edge";
export type TtsProvider = "elevenlabs" | "openai" | "edge" | "smallestai";
export type TtsMode = "final" | "all";
@ -73,6 +73,29 @@ export type TtsConfig = {
proxy?: string;
timeoutMs?: number;
};
/** Smallest AI (Waves) configuration. */
smallestai?: {
apiKey?: string;
baseUrl?: string;
/** Voice ID (e.g. "lauren", "emily", "jasmine", "arman", or custom voice ID). */
voiceId?: string;
/** Model: "lightning-v3.1" (latest), "lightning", or "waves". */
model?: "lightning-v3.1" | "lightning" | "waves";
/** Sample rate in Hz (8000, 16000, 22050, 24000, 44100, 48000). */
sampleRate?: number;
/** Output format: "mp3", "wav", "pcm" (raw), or "mulaw" (telephony). */
outputFormat?: "mp3" | "wav" | "pcm" | "mulaw";
/** Speed multiplier (0.5 to 2.0). */
speed?: number;
/** Language code (e.g. "en", "hi"). Default: "en". */
language?: string;
/** Consistency (0 to 1). Default: 0.5. */
consistency?: number;
/** Similarity (0 to 1). Default: 0. */
similarity?: number;
/** Enhancement (0 or 1). Default: 1. */
enhancement?: number;
};
/** Optional path for local TTS user preferences JSON. */
prefsPath?: string;
/** Hard cap for text sent to TTS (chars). */

View File

@ -156,7 +156,7 @@ export const MarkdownConfigSchema = z
.strict()
.optional();
export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]);
export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge", "smallestai"]);
export const TtsModeSchema = z.enum(["final", "all"]);
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
export const TtsConfigSchema = z
@ -224,6 +224,22 @@ export const TtsConfigSchema = z
})
.strict()
.optional(),
smallestai: z
.object({
apiKey: z.string().optional(),
baseUrl: z.string().optional(),
voiceId: z.string().optional(),
model: z.enum(["lightning-v3.1", "lightning", "waves"]).optional(),
sampleRate: z.number().int().min(8000).max(48000).optional(),
outputFormat: z.enum(["mp3", "wav", "pcm", "mulaw"]).optional(),
speed: z.number().min(0.5).max(2).optional(),
language: z.string().optional(),
consistency: z.number().min(0).max(1).optional(),
similarity: z.number().min(0).max(1).optional(),
enhancement: z.number().min(0).max(1).optional(),
})
.strict()
.optional(),
prefsPath: z.string().optional(),
maxTextLength: z.number().int().min(1).optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),

View File

@ -31,6 +31,7 @@ export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
groq: "whisper-large-v3-turbo",
openai: "gpt-4o-mini-transcribe",
deepgram: "nova-3",
smallestai: "pulse",
};
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
export const DEFAULT_MEDIA_CONCURRENCY = 2;

View File

@ -6,6 +6,7 @@ import { googleProvider } from "./google/index.js";
import { groqProvider } from "./groq/index.js";
import { minimaxProvider } from "./minimax/index.js";
import { openaiProvider } from "./openai/index.js";
import { smallestaiProvider } from "./smallestai/index.js";
const PROVIDERS: MediaUnderstandingProvider[] = [
groqProvider,
@ -14,6 +15,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [
anthropicProvider,
minimaxProvider,
deepgramProvider,
smallestaiProvider,
];
export function normalizeMediaProviderId(id: string): string {

View File

@ -0,0 +1,79 @@
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
/** Base URL used when the request does not supply one. */
export const DEFAULT_SMALLEST_AUDIO_BASE_URL = "https://waves-api.smallest.ai/api/v1";
/** Smallest AI Pulse STT model identifier; sent as the `model` query parameter and echoed in the result. */
export const DEFAULT_SMALLEST_AUDIO_MODEL = "pulse";

/** Language sent when the request carries no (non-blank) language. */
const DEFAULT_SMALLEST_AUDIO_LANGUAGE = "en";
/** Content-Type sent when the request carries no MIME type. */
const DEFAULT_SMALLEST_AUDIO_MIME = "audio/wav";

/**
 * Shape of the JSON body parsed from a Pulse STT response.
 * Only `transcription` and `text` are read in this file; the remaining
 * fields are declared but not consumed here.
 */
type SmallestTranscriptResponse = {
  status?: string;
  transcription?: string;
  text?: string;
  audio_length?: number;
  metadata?: {
    duration?: number;
    fileSize?: number;
  };
};

/**
 * Transcribe audio using Smallest AI's Pulse STT API.
 *
 * Endpoint: POST {baseUrl}/pulse/get_text?model=pulse&language=...
 * The audio is sent as the raw request body (no multipart wrapper); the
 * Content-Type header is the caller's MIME type, or "audio/wav" when absent.
 * @see https://waves-docs.smallest.ai/v4.0.0/content/api-references/pulse-stt
 *
 * @param params - Transcription request. Fields read here: `buffer`, `apiKey`,
 *   `baseUrl`, `headers`, `language`, `mime`, `timeoutMs`, `fetchFn`.
 * @returns The trimmed transcript plus the model identifier that produced it.
 * @throws Error when the HTTP status is not OK, or when the response carries
 *   no non-blank `transcription` / `text`.
 */
export async function transcribeSmallestAiAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_SMALLEST_AUDIO_BASE_URL);

  // Query string: the model is fixed, the language falls back to English.
  const queryParams = new URLSearchParams();
  queryParams.set("model", DEFAULT_SMALLEST_AUDIO_MODEL);
  queryParams.set("language", params.language?.trim() || DEFAULT_SMALLEST_AUDIO_LANGUAGE);
  const url = `${baseUrl}/pulse/get_text?${queryParams.toString()}`;

  // Caller-supplied headers win for authorization; Content-Type is always set
  // from the audio MIME type because the body is the raw audio itself.
  const headers = new Headers(params.headers);
  headers.set("Content-Type", params.mime ?? DEFAULT_SMALLEST_AUDIO_MIME);
  if (!headers.has("authorization")) {
    headers.set("authorization", `Bearer ${params.apiKey}`);
  }

  // Convert Buffer to Uint8Array for the fetch body.
  const audioBytes = new Uint8Array(params.buffer);

  const res = await fetchWithTimeout(
    url,
    {
      method: "POST",
      headers,
      body: audioBytes,
    },
    params.timeoutMs,
    fetchFn,
  );

  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Smallest AI Pulse STT failed (HTTP ${res.status})${suffix}`);
  }

  const payload = (await res.json()) as SmallestTranscriptResponse;
  // Fall back to `text` when `transcription` is missing OR blank; `??` alone
  // would stop at an empty string and report a missing transcript.
  const transcript = payload.transcription?.trim() || payload.text?.trim();
  if (!transcript) {
    throw new Error("Smallest AI Pulse STT response missing transcription");
  }

  return { text: transcript, model: DEFAULT_SMALLEST_AUDIO_MODEL };
}

View File

@ -0,0 +1,8 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeSmallestAiAudio } from "./audio.js";
/**
 * Media-understanding provider entry for Smallest AI.
 * Declares only the "audio" capability and delegates transcription to
 * `transcribeSmallestAiAudio`.
 */
export const smallestaiProvider: MediaUnderstandingProvider = {
id: "smallestai",
capabilities: ["audio"],
transcribeAudio: transcribeSmallestAiAudio,
};

View File

@ -49,7 +49,7 @@ import {
import { describeImageWithModel } from "./providers/image.js";
import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
const AUTO_AUDIO_KEY_PROVIDERS = ["openai", "groq", "deepgram", "google"] as const;
const AUTO_AUDIO_KEY_PROVIDERS = ["openai", "groq", "deepgram", "google", "smallestai"] as const;
const AUTO_IMAGE_KEY_PROVIDERS = ["openai", "anthropic", "google", "minimax"] as const;
const AUTO_VIDEO_KEY_PROVIDERS = ["google"] as const;
const DEFAULT_IMAGE_MODELS: Record<string, string> = {

View File

@ -202,6 +202,14 @@ describe("tts", () => {
expect(result.overrides.provider).toBe("edge");
});
it("accepts smallestai as provider override", () => {
  // With overrides enabled, an inline provider directive naming Smallest AI
  // must surface on the parsed overrides.
  const { overrides } = parseTtsDirectives(
    "Hello [[tts:provider=smallestai]] world",
    resolveModelOverridePolicy({ enabled: true }),
  );

  expect(overrides.provider).toBe("smallestai");
});
it("keeps text intact when overrides are disabled", () => {
const policy = resolveModelOverridePolicy({ enabled: false });
const input = "Hello [[tts:voice=alloy]] world";
@ -359,7 +367,12 @@ describe("tts", () => {
};
const restoreEnv = (snapshot: Record<string, string | undefined>) => {
const keys = ["OPENAI_API_KEY", "ELEVENLABS_API_KEY", "XI_API_KEY"] as const;
const keys = [
"OPENAI_API_KEY",
"ELEVENLABS_API_KEY",
"XI_API_KEY",
"SMALLEST_API_KEY",
] as const;
for (const key of keys) {
const value = snapshot[key];
if (value === undefined) {
@ -375,6 +388,7 @@ describe("tts", () => {
OPENAI_API_KEY: process.env.OPENAI_API_KEY,
ELEVENLABS_API_KEY: process.env.ELEVENLABS_API_KEY,
XI_API_KEY: process.env.XI_API_KEY,
SMALLEST_API_KEY: process.env.SMALLEST_API_KEY,
};
try {
for (const [key, value] of Object.entries(env)) {
@ -426,6 +440,7 @@ describe("tts", () => {
OPENAI_API_KEY: undefined,
ELEVENLABS_API_KEY: undefined,
XI_API_KEY: undefined,
SMALLEST_API_KEY: undefined,
},
() => {
const config = resolveTtsConfig(baseCfg);
@ -434,6 +449,22 @@ describe("tts", () => {
},
);
});
it("prefers Smallest AI when OpenAI and ElevenLabs are missing and Smallest key exists", () => {
  // Only the Smallest AI key is present, so automatic provider selection
  // should land on it.
  const onlySmallestKey = {
    OPENAI_API_KEY: undefined,
    ELEVENLABS_API_KEY: undefined,
    XI_API_KEY: undefined,
    SMALLEST_API_KEY: "test-smallest-key",
  };

  withEnv(onlySmallestKey, () => {
    const resolved = resolveTtsConfig(baseCfg);
    const chosen = getTtsProvider(resolved, "/tmp/tts-prefs-smallest.json");
    expect(chosen).toBe("smallestai");
  });
});
});
describe("maybeApplyTtsToPayload", () => {

View File

@ -52,6 +52,19 @@ const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
// Smallest AI defaults (Lightning v3.1)
// See: https://waves-docs.smallest.ai/
// resolveTtsConfig falls back to these when the matching `smallestai` config
// field is unset; language/consistency/similarity/enhancement also serve as
// parameter defaults inside smallestAiTTS.
const DEFAULT_SMALLESTAI_BASE_URL = "https://waves-api.smallest.ai/api/v1";
const DEFAULT_SMALLESTAI_VOICE_ID = "lauren";
const DEFAULT_SMALLESTAI_MODEL = "lightning-v3.1" as const;
// Sample rate in Hz.
const DEFAULT_SMALLESTAI_SAMPLE_RATE = 24000;
const DEFAULT_SMALLESTAI_OUTPUT_FORMAT = "mp3" as const;
// Speed multiplier; smallestAiTTS accepts 0.5 to 2.0.
const DEFAULT_SMALLESTAI_SPEED = 1.0;
const DEFAULT_SMALLESTAI_CONSISTENCY = 0.5;
const DEFAULT_SMALLESTAI_SIMILARITY = 0;
const DEFAULT_SMALLESTAI_ENHANCEMENT = 1;
const DEFAULT_SMALLESTAI_LANGUAGE = "en";
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
similarityBoost: 0.75,
@ -79,6 +92,8 @@ const DEFAULT_OUTPUT = {
// Per-provider audio format and sample rate (Hz) requested by
// textToSpeechTelephony.
const TELEPHONY_OUTPUT = {
openai: { format: "pcm" as const, sampleRate: 24000 },
elevenlabs: { format: "pcm_22050", sampleRate: 22050 },
// Smallest AI is asked for mulaw at 8 kHz directly, so the telephony path
// requests that format as-is.
smallestai: { format: "mulaw" as const, sampleRate: 8000 },
};
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
@ -124,6 +139,19 @@ export type ResolvedTtsConfig = {
proxy?: string;
timeoutMs?: number;
};
smallestai: {
apiKey?: string;
baseUrl: string;
voiceId: string;
model: "lightning-v3.1" | "lightning" | "waves";
sampleRate: number;
outputFormat: "mp3" | "wav" | "pcm" | "mulaw";
speed: number;
language: string;
consistency: number;
similarity: number;
enhancement: number;
};
prefsPath?: string;
maxTextLength: number;
timeoutMs: number;
@ -296,6 +324,19 @@ export function resolveTtsConfig(cfg: MoltbotConfig): ResolvedTtsConfig {
proxy: raw.edge?.proxy?.trim() || undefined,
timeoutMs: raw.edge?.timeoutMs,
},
smallestai: {
apiKey: raw.smallestai?.apiKey,
baseUrl: raw.smallestai?.baseUrl?.trim() || DEFAULT_SMALLESTAI_BASE_URL,
voiceId: raw.smallestai?.voiceId?.trim() || DEFAULT_SMALLESTAI_VOICE_ID,
model: raw.smallestai?.model ?? DEFAULT_SMALLESTAI_MODEL,
sampleRate: raw.smallestai?.sampleRate ?? DEFAULT_SMALLESTAI_SAMPLE_RATE,
outputFormat: raw.smallestai?.outputFormat ?? DEFAULT_SMALLESTAI_OUTPUT_FORMAT,
speed: raw.smallestai?.speed ?? DEFAULT_SMALLESTAI_SPEED,
language: raw.smallestai?.language?.trim() || DEFAULT_SMALLESTAI_LANGUAGE,
consistency: raw.smallestai?.consistency ?? DEFAULT_SMALLESTAI_CONSISTENCY,
similarity: raw.smallestai?.similarity ?? DEFAULT_SMALLESTAI_SIMILARITY,
enhancement: raw.smallestai?.enhancement ?? DEFAULT_SMALLESTAI_ENHANCEMENT,
},
prefsPath: raw.prefsPath,
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
@ -412,6 +453,7 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
if (resolveTtsApiKey(config, "openai")) return "openai";
if (resolveTtsApiKey(config, "elevenlabs")) return "elevenlabs";
if (resolveTtsApiKey(config, "smallestai")) return "smallestai";
return "edge";
}
@ -474,10 +516,13 @@ export function resolveTtsApiKey(
if (provider === "openai") {
return config.openai.apiKey || process.env.OPENAI_API_KEY;
}
if (provider === "smallestai") {
return config.smallestai.apiKey || process.env.SMALLEST_API_KEY;
}
return undefined;
}
export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const;
export const TTS_PROVIDERS = ["openai", "elevenlabs", "smallestai", "edge"] as const;
export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)];
@ -485,6 +530,7 @@ export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
/**
 * Reports whether `provider` is usable with the resolved TTS config.
 *
 * Edge is gated solely by `config.edge.enabled`. Every other provider
 * (OpenAI, ElevenLabs, Smallest AI) counts as configured when
 * `resolveTtsApiKey` yields a non-empty key, so no per-provider branch is
 * needed for them.
 *
 * @param config - Resolved TTS configuration.
 * @param provider - Provider to check.
 * @returns `true` when the provider can be attempted.
 */
export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean {
  if (provider === "edge") return config.edge.enabled;
  return Boolean(resolveTtsApiKey(config, provider));
}
@ -587,7 +633,12 @@ function parseTtsDirectives(
switch (key) {
case "provider":
if (!policy.allowProvider) break;
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
if (
rawValue === "openai" ||
rawValue === "elevenlabs" ||
rawValue === "edge" ||
rawValue === "smallestai"
) {
overrides.provider = rawValue;
} else {
warnings.push(`unsupported provider "${rawValue}"`);
@ -1044,6 +1095,114 @@ async function openaiTTS(params: {
}
}
/**
 * Smallest AI (Waves) TTS provider.
 * Synthesizes `text` through the endpoint matching `model` and resolves with
 * the raw response bytes.
 *
 * Request shape depends on the model:
 * - "lightning-v3.1": sends `output_format`, `language`, `consistency`,
 *   `similarity` and `enhancement`.
 * - "lightning" / "waves": sends `add_wav_header` (true only for "wav") and,
 *   for "mulaw", `encoding: "pcm_mulaw"`.
 *   NOTE(review): "mp3" is not forwarded for these models; confirm what the
 *   API returns in that case.
 *
 * @see https://waves-docs.smallest.ai/
 *
 * @param params - Synthesis options; `language`, `consistency`, `similarity`
 *   and `enhancement` fall back to the module defaults when omitted.
 * @returns The response body as a Buffer.
 * @throws Error when `speed` is not a number within 0.5 to 2.0, when
 *   `sampleRate` is not one of the accepted rates, or when the API answers
 *   with a non-OK status. The request is aborted after `timeoutMs`.
 */
async function smallestAiTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  voiceId: string;
  model: "lightning-v3.1" | "lightning" | "waves";
  sampleRate: number;
  outputFormat: "mp3" | "wav" | "pcm" | "mulaw";
  speed: number;
  language?: string;
  consistency?: number;
  similarity?: number;
  enhancement?: number;
  timeoutMs: number;
}): Promise<Buffer> {
  const {
    text,
    apiKey,
    baseUrl,
    voiceId,
    model,
    sampleRate,
    outputFormat,
    speed,
    language = DEFAULT_SMALLESTAI_LANGUAGE,
    consistency = DEFAULT_SMALLESTAI_CONSISTENCY,
    similarity = DEFAULT_SMALLESTAI_SIMILARITY,
    enhancement = DEFAULT_SMALLESTAI_ENHANCEMENT,
    timeoutMs,
  } = params;

  // Written as a negated in-range test so NaN (false for every comparison)
  // is rejected instead of slipping through to the API.
  if (!(speed >= 0.5 && speed <= 2.0)) {
    throw new Error("Smallest AI speed must be between 0.5 and 2.0");
  }

  const validSampleRates = [8000, 16000, 22050, 24000, 44100, 48000];
  if (!validSampleRates.includes(sampleRate)) {
    throw new Error(`Invalid sample rate: ${sampleRate}. Valid: ${validSampleRates.join(", ")}`);
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const isV31 = model === "lightning-v3.1";

    // Each model has its own endpoint; anything that is not a Lightning
    // variant is routed to the Waves endpoint.
    const endpoint = isV31
      ? "lightning-v3.1/get_speech"
      : model === "lightning"
        ? "lightning/get_speech"
        : "waves/get_speech";
    const url = `${baseUrl.replace(/\/+$/, "")}/${endpoint}`;

    // Fields shared by every model.
    const body: Record<string, unknown> = {
      text,
      voice_id: voiceId,
      sample_rate: sampleRate,
      speed,
    };

    if (isV31) {
      // v3.1 takes the output format directly plus the tuning parameters.
      body.output_format = outputFormat;
      body.language = language;
      body.consistency = consistency;
      body.similarity = similarity;
      body.enhancement = enhancement;
    } else {
      // Legacy models express the format through add_wav_header / encoding.
      body.add_wav_header = outputFormat === "wav";
      if (outputFormat === "mulaw") {
        body.encoding = "pcm_mulaw";
      }
    }

    const response = await fetch(url, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(body),
      signal: controller.signal,
    });

    if (!response.ok) {
      // Best-effort read of the error body; omit the separator when it is empty.
      const errorText = await response.text().catch(() => "");
      const suffix = errorText ? `: ${errorText}` : "";
      throw new Error(`Smallest AI TTS API error (${response.status})${suffix}`);
    }

    return Buffer.from(await response.arrayBuffer());
  } finally {
    clearTimeout(timeout);
  }
}
function inferEdgeExtension(outputFormat: string): string {
const normalized = outputFormat.toLowerCase();
if (normalized.includes("webm")) return ".webm";
@ -1180,6 +1339,7 @@ export async function textToSpeech(params: {
}
let audioBuffer: Buffer;
let outputExtension = output.extension;
if (provider === "elevenlabs") {
const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
const modelIdOverride = params.overrides?.elevenlabs?.modelId;
@ -1203,6 +1363,33 @@ export async function textToSpeech(params: {
voiceSettings,
timeoutMs: config.timeoutMs,
});
} else if (provider === "smallestai") {
// Choose output format based on config (mp3 is default for v3.1)
const smallestOutputFormat = config.smallestai.outputFormat;
audioBuffer = await smallestAiTTS({
text: params.text,
apiKey,
baseUrl: config.smallestai.baseUrl,
voiceId: config.smallestai.voiceId,
model: config.smallestai.model,
sampleRate: config.smallestai.sampleRate,
outputFormat: smallestOutputFormat,
speed: config.smallestai.speed,
language: config.smallestai.language,
consistency: config.smallestai.consistency,
similarity: config.smallestai.similarity,
enhancement: config.smallestai.enhancement,
timeoutMs: config.timeoutMs,
});
// Determine extension based on output format
outputExtension =
smallestOutputFormat === "mp3"
? ".mp3"
: smallestOutputFormat === "wav"
? ".wav"
: smallestOutputFormat === "mulaw" || smallestOutputFormat === "pcm"
? ".raw"
: ".wav"; // Default to .wav
} else {
const openaiModelOverride = params.overrides?.openai?.model;
const openaiVoiceOverride = params.overrides?.openai?.voice;
@ -1219,7 +1406,7 @@ export async function textToSpeech(params: {
const latencyMs = Date.now() - providerStart;
const tempDir = mkdtempSync(path.join(tmpdir(), "tts-"));
const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`);
const audioPath = path.join(tempDir, `voice-${Date.now()}${outputExtension}`);
writeFileSync(audioPath, audioBuffer);
scheduleCleanup(tempDir);
@ -1228,7 +1415,12 @@ export async function textToSpeech(params: {
audioPath,
latencyMs,
provider,
outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
outputFormat:
provider === "openai"
? output.openai
: provider === "smallestai"
? config.smallestai.outputFormat
: output.elevenlabs,
voiceCompatible: output.voiceCompatible,
};
} catch (err) {
@ -1307,6 +1499,35 @@ export async function textToSpeechTelephony(params: {
};
}
// Smallest AI natively supports mulaw@8kHz - ideal for telephony!
if (provider === "smallestai") {
const output = TELEPHONY_OUTPUT.smallestai;
const audioBuffer = await smallestAiTTS({
text: params.text,
apiKey,
baseUrl: config.smallestai.baseUrl,
voiceId: config.smallestai.voiceId,
model: config.smallestai.model,
sampleRate: output.sampleRate,
outputFormat: output.format,
speed: config.smallestai.speed,
language: config.smallestai.language,
consistency: config.smallestai.consistency,
similarity: config.smallestai.similarity,
enhancement: config.smallestai.enhancement,
timeoutMs: config.timeoutMs,
});
return {
success: true,
audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: output.format,
sampleRate: output.sampleRate,
};
}
const output = TELEPHONY_OUTPUT.openai;
const audioBuffer = await openaiTTS({
text: params.text,