feat(tts): add speed and baseUrl config for OpenAI TTS

Add configurable speed (0.25-4.0) and baseUrl parameters for OpenAI TTS,
matching the existing pattern used by ElevenLabs (voiceSettings.speed).

This enables:
- Adjustable playback speed for OpenAI TTS output
- Custom OpenAI-compatible TTS endpoints via config (e.g., Kokoro, LocalAI)

The baseUrl config option complements the existing OPENAI_TTS_BASE_URL
environment variable, allowing per-config endpoint selection. When the config
value is set, it takes precedence over the environment variable.

Changes:
- types.tts.ts: Add speed and baseUrl to OpenAI config type
- zod-schema.core.ts: Add validation for speed (0.25-4.0) and baseUrl (must be a valid URL)
- tts.ts: Pass speed and baseUrl through to API calls, update validation
  to allow any model/voice when using custom endpoints
This commit is contained in:
Jeremy Corbello 2026-01-28 19:14:57 -06:00
parent fdcac0ccf4
commit 1be2ac4cbb
3 changed files with 27 additions and 8 deletions

View File

@ -56,8 +56,12 @@ export type TtsConfig = {
/** OpenAI configuration. */
openai?: {
apiKey?: string;
/** Custom base URL for OpenAI-compatible TTS servers (e.g., Kokoro, LocalAI). */
baseUrl?: string;
model?: string;
voice?: string;
/** Speed multiplier. Config validation enforces 0.25-4.0 for every endpoint, including OpenAI-compatible servers. */
speed?: number;
};
/** Microsoft Edge (node-edge-tts) configuration. */
edge?: {

View File

@ -204,8 +204,10 @@ export const TtsConfigSchema = z
openai: z
.object({
apiKey: z.string().optional(),
baseUrl: z.string().url().optional(),
model: z.string().optional(),
voice: z.string().optional(),
speed: z.number().min(0.25).max(4.0).optional(),
})
.strict()
.optional(),

View File

@ -108,8 +108,10 @@ export type ResolvedTtsConfig = {
};
openai: {
apiKey?: string;
baseUrl?: string;
model: string;
voice: string;
speed: number;
};
edge: {
enabled: boolean;
@ -280,8 +282,10 @@ export function resolveTtsConfig(cfg: MoltbotConfig): ResolvedTtsConfig {
},
openai: {
apiKey: raw.openai?.apiKey,
baseUrl: raw.openai?.baseUrl?.trim() || undefined,
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
speed: raw.openai?.speed ?? 1.0,
},
edge: {
enabled: raw.edge?.enabled ?? true,
@ -784,15 +788,15 @@ export const OPENAI_TTS_VOICES = [
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
function isValidOpenAIModel(model: string): boolean {
function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
// Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
if (isCustomOpenAIEndpoint()) return true;
if (baseUrl || isCustomOpenAIEndpoint()) return true;
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
}
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
if (isCustomOpenAIEndpoint()) return true;
if (baseUrl || isCustomOpenAIEndpoint()) return true;
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
}
@ -1001,25 +1005,29 @@ async function elevenLabsTTS(params: {
async function openaiTTS(params: {
text: string;
apiKey: string;
baseUrl?: string;
model: string;
voice: string;
speed: number;
responseFormat: "mp3" | "opus" | "pcm";
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
const { text, apiKey, baseUrl, model, voice, speed, responseFormat, timeoutMs } = params;
if (!isValidOpenAIModel(model)) {
if (!isValidOpenAIModel(model, baseUrl)) {
throw new Error(`Invalid model: ${model}`);
}
if (!isValidOpenAIVoice(voice)) {
if (!isValidOpenAIVoice(voice, baseUrl)) {
throw new Error(`Invalid voice: ${voice}`);
}
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
const effectiveBaseUrl = baseUrl || getOpenAITtsBaseUrl();
try {
const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, {
const response = await fetch(`${effectiveBaseUrl}/audio/speech`, {
method: "POST",
headers: {
Authorization: `Bearer ${apiKey}`,
@ -1029,6 +1037,7 @@ async function openaiTTS(params: {
model,
input: text,
voice,
speed,
response_format: responseFormat,
}),
signal: controller.signal,
@ -1209,8 +1218,10 @@ export async function textToSpeech(params: {
audioBuffer = await openaiTTS({
text: params.text,
apiKey,
baseUrl: config.openai.baseUrl,
model: openaiModelOverride ?? config.openai.model,
voice: openaiVoiceOverride ?? config.openai.voice,
speed: config.openai.speed,
responseFormat: output.openai,
timeoutMs: config.timeoutMs,
});
@ -1311,8 +1322,10 @@ export async function textToSpeechTelephony(params: {
const audioBuffer = await openaiTTS({
text: params.text,
apiKey,
baseUrl: config.openai.baseUrl,
model: config.openai.model,
voice: config.openai.voice,
speed: config.openai.speed,
responseFormat: output.format,
timeoutMs: config.timeoutMs,
});