feat(tts): add instructions parameter for OpenAI gpt-4o-mini-tts

OpenAI's gpt-4o-mini-tts model supports an 'instructions' parameter
for controlling tone, style, accent, speed, and other speech characteristics.

Changes:
- Add 'instructions' to TtsConfig.openai for default instructions
  (the value is trimmed; an empty or whitespace-only string is treated as unset)
- Add 'allowInstructions' to TtsModelOverrideConfig for policy control
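- Support [[tts:instructions=...]] directive for per-message overrides
  (aliases: openai_instructions, openaiinstructions; ignored when the
  allowInstructions policy is false)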
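- Only pass instructions when the model is exactly gpt-4o-mini-tts (the older
  tts-1 and tts-1-hd models don't support it); for any other model the
  parameter is silently omitted from the request body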

Usage example:
  tts:
    provider: openai
    openai:
      model: gpt-4o-mini-tts
      voice: coral
      instructions: "Speak in a friendly, casual tone"
This commit is contained in:
Kenneth 2026-01-27 09:55:57 +08:00
parent 27174f5d82
commit 904072c4a5
2 changed files with 34 additions and 7 deletions

View File

@ -21,6 +21,8 @@ export type TtsModelOverrideConfig = {
allowNormalization?: boolean; allowNormalization?: boolean;
/** Allow model-provided seed override. */ /** Allow model-provided seed override. */
allowSeed?: boolean; allowSeed?: boolean;
/** Allow model-provided instructions override (OpenAI gpt-4o-mini-tts). */
allowInstructions?: boolean;
}; };
export type TtsConfig = { export type TtsConfig = {
@ -58,6 +60,8 @@ export type TtsConfig = {
apiKey?: string; apiKey?: string;
model?: string; model?: string;
voice?: string; voice?: string;
/** Instructions for gpt-4o-mini-tts model to control tone, style, etc. */
instructions?: string;
}; };
/** Microsoft Edge (node-edge-tts) configuration. */ /** Microsoft Edge (node-edge-tts) configuration. */
edge?: { edge?: {

View File

@ -110,6 +110,7 @@ export type ResolvedTtsConfig = {
apiKey?: string; apiKey?: string;
model: string; model: string;
voice: string; voice: string;
instructions?: string;
}; };
edge: { edge: {
enabled: boolean; enabled: boolean;
@ -148,6 +149,7 @@ type ResolvedTtsModelOverrides = {
allowVoiceSettings: boolean; allowVoiceSettings: boolean;
allowNormalization: boolean; allowNormalization: boolean;
allowSeed: boolean; allowSeed: boolean;
allowInstructions: boolean;
}; };
type TtsDirectiveOverrides = { type TtsDirectiveOverrides = {
@ -156,6 +158,7 @@ type TtsDirectiveOverrides = {
openai?: { openai?: {
voice?: string; voice?: string;
model?: string; model?: string;
instructions?: string;
}; };
elevenlabs?: { elevenlabs?: {
voiceId?: string; voiceId?: string;
@ -230,6 +233,7 @@ function resolveModelOverridePolicy(
allowVoiceSettings: false, allowVoiceSettings: false,
allowNormalization: false, allowNormalization: false,
allowSeed: false, allowSeed: false,
allowInstructions: false,
}; };
} }
const allow = (value?: boolean) => value ?? true; const allow = (value?: boolean) => value ?? true;
@ -242,6 +246,7 @@ function resolveModelOverridePolicy(
allowVoiceSettings: allow(overrides?.allowVoiceSettings), allowVoiceSettings: allow(overrides?.allowVoiceSettings),
allowNormalization: allow(overrides?.allowNormalization), allowNormalization: allow(overrides?.allowNormalization),
allowSeed: allow(overrides?.allowSeed), allowSeed: allow(overrides?.allowSeed),
allowInstructions: allow(overrides?.allowInstructions),
}; };
} }
@ -282,6 +287,7 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
apiKey: raw.openai?.apiKey, apiKey: raw.openai?.apiKey,
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL, model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE, voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
instructions: raw.openai?.instructions?.trim() || undefined,
}, },
edge: { edge: {
enabled: raw.edge?.enabled ?? true, enabled: raw.edge?.enabled ?? true,
@ -732,6 +738,12 @@ function parseTtsDirectives(
seed: normalizeSeed(Number.parseInt(rawValue, 10)), seed: normalizeSeed(Number.parseInt(rawValue, 10)),
}; };
break; break;
case "instructions":
case "openai_instructions":
case "openaiinstructions":
if (!policy.allowInstructions) break;
overrides.openai = { ...overrides.openai, instructions: rawValue };
break;
default: default:
break; break;
} }
@ -997,8 +1009,9 @@ async function openaiTTS(params: {
voice: string; voice: string;
responseFormat: "mp3" | "opus" | "pcm"; responseFormat: "mp3" | "opus" | "pcm";
timeoutMs: number; timeoutMs: number;
instructions?: string;
}): Promise<Buffer> { }): Promise<Buffer> {
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params; const { text, apiKey, model, voice, responseFormat, timeoutMs, instructions } = params;
if (!isValidOpenAIModel(model)) { if (!isValidOpenAIModel(model)) {
throw new Error(`Invalid model: ${model}`); throw new Error(`Invalid model: ${model}`);
@ -1010,6 +1023,18 @@ async function openaiTTS(params: {
const controller = new AbortController(); const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs); const timeout = setTimeout(() => controller.abort(), timeoutMs);
// instructions parameter only supported by gpt-4o-mini-tts model
const supportsInstructions = model === "gpt-4o-mini-tts";
const body: Record<string, unknown> = {
model,
input: text,
voice,
response_format: responseFormat,
};
if (supportsInstructions && instructions) {
body.instructions = instructions;
}
try { try {
const response = await fetch(`${OPENAI_TTS_BASE_URL}/audio/speech`, { const response = await fetch(`${OPENAI_TTS_BASE_URL}/audio/speech`, {
method: "POST", method: "POST",
@ -1017,12 +1042,7 @@ async function openaiTTS(params: {
Authorization: `Bearer ${apiKey}`, Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
body: JSON.stringify({ body: JSON.stringify(body),
model,
input: text,
voice,
response_format: responseFormat,
}),
signal: controller.signal, signal: controller.signal,
}); });
@ -1198,6 +1218,7 @@ export async function textToSpeech(params: {
} else { } else {
const openaiModelOverride = params.overrides?.openai?.model; const openaiModelOverride = params.overrides?.openai?.model;
const openaiVoiceOverride = params.overrides?.openai?.voice; const openaiVoiceOverride = params.overrides?.openai?.voice;
const openaiInstructionsOverride = params.overrides?.openai?.instructions;
audioBuffer = await openaiTTS({ audioBuffer = await openaiTTS({
text: params.text, text: params.text,
apiKey, apiKey,
@ -1205,6 +1226,7 @@ export async function textToSpeech(params: {
voice: openaiVoiceOverride ?? config.openai.voice, voice: openaiVoiceOverride ?? config.openai.voice,
responseFormat: output.openai, responseFormat: output.openai,
timeoutMs: config.timeoutMs, timeoutMs: config.timeoutMs,
instructions: openaiInstructionsOverride ?? config.openai.instructions,
}); });
} }
@ -1307,6 +1329,7 @@ export async function textToSpeechTelephony(params: {
voice: config.openai.voice, voice: config.openai.voice,
responseFormat: output.format, responseFormat: output.format,
timeoutMs: config.timeoutMs, timeoutMs: config.timeoutMs,
instructions: config.openai.instructions,
}); });
return { return {