feat(tts): add speed and baseUrl config for OpenAI TTS

Add configurable speed (0.25-4.0) and baseUrl parameters for OpenAI TTS,
matching the existing pattern used by ElevenLabs (voiceSettings.speed).

This enables:
- Adjustable playback speed for OpenAI TTS output
- Custom OpenAI-compatible TTS endpoints via config (e.g., Kokoro, LocalAI)

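The baseUrl config option complements the existing OPENAI_TTS_BASE_URL
environment variable, allowing per-config endpoint selection. When baseUrl is
set in config, it is used for the request; otherwise the previously resolved
base URL (getOpenAITtsBaseUrl()) is used, as before.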

Changes:
- types.tts.ts: Add speed and baseUrl to OpenAI config type
- zod-schema.core.ts: Add validation for speed (0.25-4.0) and baseUrl
- tts.ts: Pass speed (default 1.0 when unset) and baseUrl through to API calls;
  update validation to allow any model/voice when a baseUrl is configured, in
  addition to the existing custom-endpoint check
This commit is contained in:
Jeremy Corbello 2026-01-28 19:14:57 -06:00
parent fdcac0ccf4
commit 1be2ac4cbb
3 changed files with 27 additions and 8 deletions

View File

@ -56,8 +56,12 @@ export type TtsConfig = {
/** OpenAI configuration. */ /** OpenAI configuration. */
openai?: { openai?: {
apiKey?: string; apiKey?: string;
/** Custom base URL for OpenAI-compatible TTS servers (e.g., Kokoro, LocalAI). */
baseUrl?: string;
model?: string; model?: string;
voice?: string; voice?: string;
/** Speed multiplier (0.25-4.0 for OpenAI, server-dependent for compatible APIs). */
speed?: number;
}; };
/** Microsoft Edge (node-edge-tts) configuration. */ /** Microsoft Edge (node-edge-tts) configuration. */
edge?: { edge?: {

View File

@ -204,8 +204,10 @@ export const TtsConfigSchema = z
openai: z openai: z
.object({ .object({
apiKey: z.string().optional(), apiKey: z.string().optional(),
baseUrl: z.string().url().optional(),
model: z.string().optional(), model: z.string().optional(),
voice: z.string().optional(), voice: z.string().optional(),
speed: z.number().min(0.25).max(4.0).optional(),
}) })
.strict() .strict()
.optional(), .optional(),

View File

@ -108,8 +108,10 @@ export type ResolvedTtsConfig = {
}; };
openai: { openai: {
apiKey?: string; apiKey?: string;
baseUrl?: string;
model: string; model: string;
voice: string; voice: string;
speed: number;
}; };
edge: { edge: {
enabled: boolean; enabled: boolean;
@ -280,8 +282,10 @@ export function resolveTtsConfig(cfg: MoltbotConfig): ResolvedTtsConfig {
}, },
openai: { openai: {
apiKey: raw.openai?.apiKey, apiKey: raw.openai?.apiKey,
baseUrl: raw.openai?.baseUrl?.trim() || undefined,
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL, model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE, voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
speed: raw.openai?.speed ?? 1.0,
}, },
edge: { edge: {
enabled: raw.edge?.enabled ?? true, enabled: raw.edge?.enabled ?? true,
@ -784,15 +788,15 @@ export const OPENAI_TTS_VOICES = [
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number]; type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
function isValidOpenAIModel(model: string): boolean { function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
// Allow any model when using custom endpoint (e.g., Kokoro, LocalAI) // Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
if (isCustomOpenAIEndpoint()) return true; if (baseUrl || isCustomOpenAIEndpoint()) return true;
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]); return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
} }
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice { function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices) // Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
if (isCustomOpenAIEndpoint()) return true; if (baseUrl || isCustomOpenAIEndpoint()) return true;
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice); return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
} }
@ -1001,25 +1005,29 @@ async function elevenLabsTTS(params: {
async function openaiTTS(params: { async function openaiTTS(params: {
text: string; text: string;
apiKey: string; apiKey: string;
baseUrl?: string;
model: string; model: string;
voice: string; voice: string;
speed: number;
responseFormat: "mp3" | "opus" | "pcm"; responseFormat: "mp3" | "opus" | "pcm";
timeoutMs: number; timeoutMs: number;
}): Promise<Buffer> { }): Promise<Buffer> {
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params; const { text, apiKey, baseUrl, model, voice, speed, responseFormat, timeoutMs } = params;
if (!isValidOpenAIModel(model)) { if (!isValidOpenAIModel(model, baseUrl)) {
throw new Error(`Invalid model: ${model}`); throw new Error(`Invalid model: ${model}`);
} }
if (!isValidOpenAIVoice(voice)) { if (!isValidOpenAIVoice(voice, baseUrl)) {
throw new Error(`Invalid voice: ${voice}`); throw new Error(`Invalid voice: ${voice}`);
} }
const controller = new AbortController(); const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs); const timeout = setTimeout(() => controller.abort(), timeoutMs);
const effectiveBaseUrl = baseUrl || getOpenAITtsBaseUrl();
try { try {
const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, { const response = await fetch(`${effectiveBaseUrl}/audio/speech`, {
method: "POST", method: "POST",
headers: { headers: {
Authorization: `Bearer ${apiKey}`, Authorization: `Bearer ${apiKey}`,
@ -1029,6 +1037,7 @@ async function openaiTTS(params: {
model, model,
input: text, input: text,
voice, voice,
speed,
response_format: responseFormat, response_format: responseFormat,
}), }),
signal: controller.signal, signal: controller.signal,
@ -1209,8 +1218,10 @@ export async function textToSpeech(params: {
audioBuffer = await openaiTTS({ audioBuffer = await openaiTTS({
text: params.text, text: params.text,
apiKey, apiKey,
baseUrl: config.openai.baseUrl,
model: openaiModelOverride ?? config.openai.model, model: openaiModelOverride ?? config.openai.model,
voice: openaiVoiceOverride ?? config.openai.voice, voice: openaiVoiceOverride ?? config.openai.voice,
speed: config.openai.speed,
responseFormat: output.openai, responseFormat: output.openai,
timeoutMs: config.timeoutMs, timeoutMs: config.timeoutMs,
}); });
@ -1311,8 +1322,10 @@ export async function textToSpeechTelephony(params: {
const audioBuffer = await openaiTTS({ const audioBuffer = await openaiTTS({
text: params.text, text: params.text,
apiKey, apiKey,
baseUrl: config.openai.baseUrl,
model: config.openai.model, model: config.openai.model,
voice: config.openai.voice, voice: config.openai.voice,
speed: config.openai.speed,
responseFormat: output.format, responseFormat: output.format,
timeoutMs: config.timeoutMs, timeoutMs: config.timeoutMs,
}); });