feat(tts): add speed and baseUrl config for OpenAI TTS
Add configurable speed (0.25-4.0) and baseUrl parameters for OpenAI TTS,
matching the existing pattern used by ElevenLabs (voiceSettings.speed).

This enables:
- Adjustable playback speed for OpenAI TTS output
- Custom OpenAI-compatible TTS endpoints via config (e.g., Kokoro, LocalAI)

The baseUrl config option complements the existing OPENAI_TTS_BASE_URL
environment variable, allowing per-config endpoint selection.

Changes:
- types.tts.ts: Add speed and baseUrl to OpenAI config type
- zod-schema.core.ts: Add validation for speed (0.25-4.0) and baseUrl
- tts.ts: Pass speed and baseUrl through to API calls, update validation
  to allow any model/voice when using custom endpoints
This commit is contained in:
parent
fdcac0ccf4
commit
1be2ac4cbb
@ -56,8 +56,12 @@ export type TtsConfig = {
|
||||
/** OpenAI configuration. */
|
||||
openai?: {
|
||||
apiKey?: string;
|
||||
/** Custom base URL for OpenAI-compatible TTS servers (e.g., Kokoro, LocalAI). */
|
||||
baseUrl?: string;
|
||||
model?: string;
|
||||
voice?: string;
|
||||
/** Speed multiplier. The config schema only accepts 0.25-4.0, for OpenAI and OpenAI-compatible APIs alike. */
|
||||
speed?: number;
|
||||
};
|
||||
/** Microsoft Edge (node-edge-tts) configuration. */
|
||||
edge?: {
|
||||
|
||||
@ -204,8 +204,10 @@ export const TtsConfigSchema = z
|
||||
openai: z
|
||||
.object({
|
||||
apiKey: z.string().optional(),
|
||||
baseUrl: z.string().url().optional(),
|
||||
model: z.string().optional(),
|
||||
voice: z.string().optional(),
|
||||
speed: z.number().min(0.25).max(4.0).optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
|
||||
@ -108,8 +108,10 @@ export type ResolvedTtsConfig = {
|
||||
};
|
||||
openai: {
|
||||
apiKey?: string;
|
||||
baseUrl?: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
speed: number;
|
||||
};
|
||||
edge: {
|
||||
enabled: boolean;
|
||||
@ -280,8 +282,10 @@ export function resolveTtsConfig(cfg: MoltbotConfig): ResolvedTtsConfig {
|
||||
},
|
||||
openai: {
|
||||
apiKey: raw.openai?.apiKey,
|
||||
baseUrl: raw.openai?.baseUrl?.trim() || undefined,
|
||||
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
|
||||
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
|
||||
speed: raw.openai?.speed ?? 1.0,
|
||||
},
|
||||
edge: {
|
||||
enabled: raw.edge?.enabled ?? true,
|
||||
@ -784,15 +788,15 @@ export const OPENAI_TTS_VOICES = [
|
||||
|
||||
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
|
||||
function isValidOpenAIModel(model: string): boolean {
|
||||
function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
|
||||
// Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
|
||||
if (isCustomOpenAIEndpoint()) return true;
|
||||
if (baseUrl || isCustomOpenAIEndpoint()) return true;
|
||||
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
|
||||
}
|
||||
|
||||
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
|
||||
function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
|
||||
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
|
||||
if (isCustomOpenAIEndpoint()) return true;
|
||||
if (baseUrl || isCustomOpenAIEndpoint()) return true;
|
||||
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
|
||||
}
|
||||
|
||||
@ -1001,25 +1005,29 @@ async function elevenLabsTTS(params: {
|
||||
async function openaiTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
baseUrl?: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
speed: number;
|
||||
responseFormat: "mp3" | "opus" | "pcm";
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
|
||||
const { text, apiKey, baseUrl, model, voice, speed, responseFormat, timeoutMs } = params;
|
||||
|
||||
if (!isValidOpenAIModel(model)) {
|
||||
if (!isValidOpenAIModel(model, baseUrl)) {
|
||||
throw new Error(`Invalid model: ${model}`);
|
||||
}
|
||||
if (!isValidOpenAIVoice(voice)) {
|
||||
if (!isValidOpenAIVoice(voice, baseUrl)) {
|
||||
throw new Error(`Invalid voice: ${voice}`);
|
||||
}
|
||||
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||
|
||||
const effectiveBaseUrl = baseUrl || getOpenAITtsBaseUrl();
|
||||
|
||||
try {
|
||||
const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, {
|
||||
const response = await fetch(`${effectiveBaseUrl}/audio/speech`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
@ -1029,6 +1037,7 @@ async function openaiTTS(params: {
|
||||
model,
|
||||
input: text,
|
||||
voice,
|
||||
speed,
|
||||
response_format: responseFormat,
|
||||
}),
|
||||
signal: controller.signal,
|
||||
@ -1209,8 +1218,10 @@ export async function textToSpeech(params: {
|
||||
audioBuffer = await openaiTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
baseUrl: config.openai.baseUrl,
|
||||
model: openaiModelOverride ?? config.openai.model,
|
||||
voice: openaiVoiceOverride ?? config.openai.voice,
|
||||
speed: config.openai.speed,
|
||||
responseFormat: output.openai,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
@ -1311,8 +1322,10 @@ export async function textToSpeechTelephony(params: {
|
||||
const audioBuffer = await openaiTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
baseUrl: config.openai.baseUrl,
|
||||
model: config.openai.model,
|
||||
voice: config.openai.voice,
|
||||
speed: config.openai.speed,
|
||||
responseFormat: output.format,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
|
||||
Loading…
Reference in New Issue
Block a user