diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 4eb4989b9..5614c05e2 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -21,6 +21,8 @@ export type TtsModelOverrideConfig = { allowNormalization?: boolean; /** Allow model-provided seed override. */ allowSeed?: boolean; + /** Allow model-provided instructions override (OpenAI gpt-4o-mini-tts). */ + allowInstructions?: boolean; }; export type TtsConfig = { @@ -58,6 +60,8 @@ export type TtsConfig = { apiKey?: string; model?: string; voice?: string; + /** Instructions for gpt-4o-mini-tts model to control tone, style, etc. */ + instructions?: string; }; /** Microsoft Edge (node-edge-tts) configuration. */ edge?: { diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 847876d04..bced3871f 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -110,6 +110,7 @@ export type ResolvedTtsConfig = { apiKey?: string; model: string; voice: string; + instructions?: string; }; edge: { enabled: boolean; @@ -148,6 +149,7 @@ type ResolvedTtsModelOverrides = { allowVoiceSettings: boolean; allowNormalization: boolean; allowSeed: boolean; + allowInstructions: boolean; }; type TtsDirectiveOverrides = { @@ -156,6 +158,7 @@ type TtsDirectiveOverrides = { openai?: { voice?: string; model?: string; + instructions?: string; }; elevenlabs?: { voiceId?: string; @@ -230,6 +233,7 @@ function resolveModelOverridePolicy( allowVoiceSettings: false, allowNormalization: false, allowSeed: false, + allowInstructions: false, }; } const allow = (value?: boolean) => value ?? true; @@ -242,6 +246,7 @@ function resolveModelOverridePolicy( allowVoiceSettings: allow(overrides?.allowVoiceSettings), allowNormalization: allow(overrides?.allowNormalization), allowSeed: allow(overrides?.allowSeed), + allowInstructions: allow(overrides?.allowInstructions), }; } @@ -282,6 +287,7 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig { apiKey: raw.openai?.apiKey, model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL, voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE, + instructions: raw.openai?.instructions?.trim() || undefined, }, edge: { enabled: raw.edge?.enabled ?? true, @@ -732,6 +738,12 @@ function parseTtsDirectives( seed: normalizeSeed(Number.parseInt(rawValue, 10)), }; break; + case "instructions": + case "openai_instructions": + case "openaiinstructions": + if (!policy.allowInstructions) break; + overrides.openai = { ...overrides.openai, instructions: rawValue }; + break; default: break; } @@ -997,8 +1009,9 @@ async function openaiTTS(params: { voice: string; responseFormat: "mp3" | "opus" | "pcm"; timeoutMs: number; + instructions?: string; }): Promise { - const { text, apiKey, model, voice, responseFormat, timeoutMs } = params; + const { text, apiKey, model, voice, responseFormat, timeoutMs, instructions } = params; if (!isValidOpenAIModel(model)) { throw new Error(`Invalid model: ${model}`); @@ -1010,6 +1023,18 @@ async function openaiTTS(params: { const controller = new AbortController(); const timeout = setTimeout(() => controller.abort(), timeoutMs); + // instructions parameter only supported by gpt-4o-mini-tts model + const supportsInstructions = model === "gpt-4o-mini-tts"; + const body: Record = { + model, + input: text, + voice, + response_format: responseFormat, + }; + if (supportsInstructions && instructions) { + body.instructions = instructions; + } + try { const response = await fetch(`${OPENAI_TTS_BASE_URL}/audio/speech`, { method: "POST", @@ -1017,12 +1042,7 @@ async function openaiTTS(params: { Authorization: `Bearer ${apiKey}`, "Content-Type": "application/json", }, - body: JSON.stringify({ - model, - input: text, - voice, - response_format: responseFormat, - }), + body: JSON.stringify(body), signal: controller.signal, }); @@ -1198,6 +1218,7 @@ export async function textToSpeech(params: { } else { const openaiModelOverride = params.overrides?.openai?.model; const openaiVoiceOverride = params.overrides?.openai?.voice; + const openaiInstructionsOverride = params.overrides?.openai?.instructions; audioBuffer = await openaiTTS({ text: params.text, apiKey, @@ -1205,6 +1226,7 @@ export async function textToSpeech(params: { voice: openaiVoiceOverride ?? config.openai.voice, responseFormat: output.openai, timeoutMs: config.timeoutMs, + instructions: openaiInstructionsOverride ?? config.openai.instructions, }); } @@ -1307,6 +1329,7 @@ export async function textToSpeechTelephony(params: { voice: config.openai.voice, responseFormat: output.format, timeoutMs: config.timeoutMs, + instructions: config.openai.instructions, }); return {