feat(tts): add speed and baseUrl config for OpenAI TTS
Add configurable speed (0.25-4.0) and baseUrl parameters for OpenAI TTS, matching the existing pattern used by ElevenLabs (voiceSettings.speed).

This enables:
- Adjustable playback speed for OpenAI TTS output
- Custom OpenAI-compatible TTS endpoints via config (e.g., Kokoro, LocalAI)

The baseUrl config option complements the existing OPENAI_TTS_BASE_URL environment variable, allowing per-config endpoint selection.

Changes:
- types.tts.ts: Add speed and baseUrl to OpenAI config type
- zod-schema.core.ts: Add validation for speed (0.25-4.0) and baseUrl
- tts.ts: Pass speed and baseUrl through to API calls, update validation to allow any model/voice when using custom endpoints
This commit is contained in:
parent
fdcac0ccf4
commit
1be2ac4cbb
@ -56,8 +56,12 @@ export type TtsConfig = {
|
|||||||
/** OpenAI configuration. */
|
/** OpenAI configuration. */
|
||||||
openai?: {
|
openai?: {
|
||||||
apiKey?: string;
|
apiKey?: string;
|
||||||
|
/** Custom base URL for OpenAI-compatible TTS servers (e.g., Kokoro, LocalAI). */
|
||||||
|
baseUrl?: string;
|
||||||
model?: string;
|
model?: string;
|
||||||
voice?: string;
|
voice?: string;
|
||||||
|
/** Speed multiplier (0.25-4.0 for OpenAI, server-dependent for compatible APIs). */
|
||||||
|
speed?: number;
|
||||||
};
|
};
|
||||||
/** Microsoft Edge (node-edge-tts) configuration. */
|
/** Microsoft Edge (node-edge-tts) configuration. */
|
||||||
edge?: {
|
edge?: {
|
||||||
|
|||||||
@ -204,8 +204,10 @@ export const TtsConfigSchema = z
|
|||||||
openai: z
|
openai: z
|
||||||
.object({
|
.object({
|
||||||
apiKey: z.string().optional(),
|
apiKey: z.string().optional(),
|
||||||
|
baseUrl: z.string().url().optional(),
|
||||||
model: z.string().optional(),
|
model: z.string().optional(),
|
||||||
voice: z.string().optional(),
|
voice: z.string().optional(),
|
||||||
|
speed: z.number().min(0.25).max(4.0).optional(),
|
||||||
})
|
})
|
||||||
.strict()
|
.strict()
|
||||||
.optional(),
|
.optional(),
|
||||||
|
|||||||
@ -108,8 +108,10 @@ export type ResolvedTtsConfig = {
|
|||||||
};
|
};
|
||||||
openai: {
|
openai: {
|
||||||
apiKey?: string;
|
apiKey?: string;
|
||||||
|
baseUrl?: string;
|
||||||
model: string;
|
model: string;
|
||||||
voice: string;
|
voice: string;
|
||||||
|
speed: number;
|
||||||
};
|
};
|
||||||
edge: {
|
edge: {
|
||||||
enabled: boolean;
|
enabled: boolean;
|
||||||
@ -280,8 +282,10 @@ export function resolveTtsConfig(cfg: MoltbotConfig): ResolvedTtsConfig {
|
|||||||
},
|
},
|
||||||
openai: {
|
openai: {
|
||||||
apiKey: raw.openai?.apiKey,
|
apiKey: raw.openai?.apiKey,
|
||||||
|
baseUrl: raw.openai?.baseUrl?.trim() || undefined,
|
||||||
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
|
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
|
||||||
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
|
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
|
||||||
|
speed: raw.openai?.speed ?? 1.0,
|
||||||
},
|
},
|
||||||
edge: {
|
edge: {
|
||||||
enabled: raw.edge?.enabled ?? true,
|
enabled: raw.edge?.enabled ?? true,
|
||||||
@ -784,15 +788,15 @@ export const OPENAI_TTS_VOICES = [
|
|||||||
|
|
||||||
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
|
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||||
|
|
||||||
function isValidOpenAIModel(model: string): boolean {
|
function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
|
||||||
// Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
|
// Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
|
||||||
if (isCustomOpenAIEndpoint()) return true;
|
if (baseUrl || isCustomOpenAIEndpoint()) return true;
|
||||||
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
|
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
|
||||||
}
|
}
|
||||||
|
|
||||||
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
|
function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
|
||||||
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
|
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
|
||||||
if (isCustomOpenAIEndpoint()) return true;
|
if (baseUrl || isCustomOpenAIEndpoint()) return true;
|
||||||
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
|
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1001,25 +1005,29 @@ async function elevenLabsTTS(params: {
|
|||||||
async function openaiTTS(params: {
|
async function openaiTTS(params: {
|
||||||
text: string;
|
text: string;
|
||||||
apiKey: string;
|
apiKey: string;
|
||||||
|
baseUrl?: string;
|
||||||
model: string;
|
model: string;
|
||||||
voice: string;
|
voice: string;
|
||||||
|
speed: number;
|
||||||
responseFormat: "mp3" | "opus" | "pcm";
|
responseFormat: "mp3" | "opus" | "pcm";
|
||||||
timeoutMs: number;
|
timeoutMs: number;
|
||||||
}): Promise<Buffer> {
|
}): Promise<Buffer> {
|
||||||
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
|
const { text, apiKey, baseUrl, model, voice, speed, responseFormat, timeoutMs } = params;
|
||||||
|
|
||||||
if (!isValidOpenAIModel(model)) {
|
if (!isValidOpenAIModel(model, baseUrl)) {
|
||||||
throw new Error(`Invalid model: ${model}`);
|
throw new Error(`Invalid model: ${model}`);
|
||||||
}
|
}
|
||||||
if (!isValidOpenAIVoice(voice)) {
|
if (!isValidOpenAIVoice(voice, baseUrl)) {
|
||||||
throw new Error(`Invalid voice: ${voice}`);
|
throw new Error(`Invalid voice: ${voice}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const controller = new AbortController();
|
const controller = new AbortController();
|
||||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||||
|
|
||||||
|
const effectiveBaseUrl = baseUrl || getOpenAITtsBaseUrl();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, {
|
const response = await fetch(`${effectiveBaseUrl}/audio/speech`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
Authorization: `Bearer ${apiKey}`,
|
Authorization: `Bearer ${apiKey}`,
|
||||||
@ -1029,6 +1037,7 @@ async function openaiTTS(params: {
|
|||||||
model,
|
model,
|
||||||
input: text,
|
input: text,
|
||||||
voice,
|
voice,
|
||||||
|
speed,
|
||||||
response_format: responseFormat,
|
response_format: responseFormat,
|
||||||
}),
|
}),
|
||||||
signal: controller.signal,
|
signal: controller.signal,
|
||||||
@ -1209,8 +1218,10 @@ export async function textToSpeech(params: {
|
|||||||
audioBuffer = await openaiTTS({
|
audioBuffer = await openaiTTS({
|
||||||
text: params.text,
|
text: params.text,
|
||||||
apiKey,
|
apiKey,
|
||||||
|
baseUrl: config.openai.baseUrl,
|
||||||
model: openaiModelOverride ?? config.openai.model,
|
model: openaiModelOverride ?? config.openai.model,
|
||||||
voice: openaiVoiceOverride ?? config.openai.voice,
|
voice: openaiVoiceOverride ?? config.openai.voice,
|
||||||
|
speed: config.openai.speed,
|
||||||
responseFormat: output.openai,
|
responseFormat: output.openai,
|
||||||
timeoutMs: config.timeoutMs,
|
timeoutMs: config.timeoutMs,
|
||||||
});
|
});
|
||||||
@ -1311,8 +1322,10 @@ export async function textToSpeechTelephony(params: {
|
|||||||
const audioBuffer = await openaiTTS({
|
const audioBuffer = await openaiTTS({
|
||||||
text: params.text,
|
text: params.text,
|
||||||
apiKey,
|
apiKey,
|
||||||
|
baseUrl: config.openai.baseUrl,
|
||||||
model: config.openai.model,
|
model: config.openai.model,
|
||||||
voice: config.openai.voice,
|
voice: config.openai.voice,
|
||||||
|
speed: config.openai.speed,
|
||||||
responseFormat: output.format,
|
responseFormat: output.format,
|
||||||
timeoutMs: config.timeoutMs,
|
timeoutMs: config.timeoutMs,
|
||||||
});
|
});
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user