feat(tts): add instructions parameter for OpenAI gpt-4o-mini-tts
OpenAI's gpt-4o-mini-tts model supports an 'instructions' parameter
for controlling tone, style, accent, speed, and other speech characteristics.
Changes:
- Add 'instructions' to TtsConfig.openai for default instructions
- Add 'allowInstructions' to TtsModelOverrideConfig for policy control
- Support [[tts:instructions=...]] directive for per-message overrides
- Only pass instructions when model is gpt-4o-mini-tts (older models don't support it)
Usage example:
tts:
provider: openai
openai:
model: gpt-4o-mini-tts
voice: coral
instructions: "Speak in a friendly, casual tone"
This commit is contained in:
parent
27174f5d82
commit
904072c4a5
@ -21,6 +21,8 @@ export type TtsModelOverrideConfig = {
|
||||
allowNormalization?: boolean;
|
||||
/** Allow model-provided seed override. */
|
||||
allowSeed?: boolean;
|
||||
/** Allow model-provided instructions override (OpenAI gpt-4o-mini-tts). */
|
||||
allowInstructions?: boolean;
|
||||
};
|
||||
|
||||
export type TtsConfig = {
|
||||
@ -58,6 +60,8 @@ export type TtsConfig = {
|
||||
apiKey?: string;
|
||||
model?: string;
|
||||
voice?: string;
|
||||
/** Instructions for gpt-4o-mini-tts model to control tone, style, etc. */
|
||||
instructions?: string;
|
||||
};
|
||||
/** Microsoft Edge (node-edge-tts) configuration. */
|
||||
edge?: {
|
||||
|
||||
@ -110,6 +110,7 @@ export type ResolvedTtsConfig = {
|
||||
apiKey?: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
instructions?: string;
|
||||
};
|
||||
edge: {
|
||||
enabled: boolean;
|
||||
@ -148,6 +149,7 @@ type ResolvedTtsModelOverrides = {
|
||||
allowVoiceSettings: boolean;
|
||||
allowNormalization: boolean;
|
||||
allowSeed: boolean;
|
||||
allowInstructions: boolean;
|
||||
};
|
||||
|
||||
type TtsDirectiveOverrides = {
|
||||
@ -156,6 +158,7 @@ type TtsDirectiveOverrides = {
|
||||
openai?: {
|
||||
voice?: string;
|
||||
model?: string;
|
||||
instructions?: string;
|
||||
};
|
||||
elevenlabs?: {
|
||||
voiceId?: string;
|
||||
@ -230,6 +233,7 @@ function resolveModelOverridePolicy(
|
||||
allowVoiceSettings: false,
|
||||
allowNormalization: false,
|
||||
allowSeed: false,
|
||||
allowInstructions: false,
|
||||
};
|
||||
}
|
||||
const allow = (value?: boolean) => value ?? true;
|
||||
@ -242,6 +246,7 @@ function resolveModelOverridePolicy(
|
||||
allowVoiceSettings: allow(overrides?.allowVoiceSettings),
|
||||
allowNormalization: allow(overrides?.allowNormalization),
|
||||
allowSeed: allow(overrides?.allowSeed),
|
||||
allowInstructions: allow(overrides?.allowInstructions),
|
||||
};
|
||||
}
|
||||
|
||||
@ -282,6 +287,7 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
|
||||
apiKey: raw.openai?.apiKey,
|
||||
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
|
||||
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
|
||||
instructions: raw.openai?.instructions?.trim() || undefined,
|
||||
},
|
||||
edge: {
|
||||
enabled: raw.edge?.enabled ?? true,
|
||||
@ -732,6 +738,12 @@ function parseTtsDirectives(
|
||||
seed: normalizeSeed(Number.parseInt(rawValue, 10)),
|
||||
};
|
||||
break;
|
||||
case "instructions":
|
||||
case "openai_instructions":
|
||||
case "openaiinstructions":
|
||||
if (!policy.allowInstructions) break;
|
||||
overrides.openai = { ...overrides.openai, instructions: rawValue };
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -997,8 +1009,9 @@ async function openaiTTS(params: {
|
||||
voice: string;
|
||||
responseFormat: "mp3" | "opus" | "pcm";
|
||||
timeoutMs: number;
|
||||
instructions?: string;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
|
||||
const { text, apiKey, model, voice, responseFormat, timeoutMs, instructions } = params;
|
||||
|
||||
if (!isValidOpenAIModel(model)) {
|
||||
throw new Error(`Invalid model: ${model}`);
|
||||
@ -1010,6 +1023,18 @@ async function openaiTTS(params: {
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||
|
||||
// instructions parameter only supported by gpt-4o-mini-tts model
|
||||
const supportsInstructions = model === "gpt-4o-mini-tts";
|
||||
const body: Record<string, unknown> = {
|
||||
model,
|
||||
input: text,
|
||||
voice,
|
||||
response_format: responseFormat,
|
||||
};
|
||||
if (supportsInstructions && instructions) {
|
||||
body.instructions = instructions;
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch(`${OPENAI_TTS_BASE_URL}/audio/speech`, {
|
||||
method: "POST",
|
||||
@ -1017,12 +1042,7 @@ async function openaiTTS(params: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model,
|
||||
input: text,
|
||||
voice,
|
||||
response_format: responseFormat,
|
||||
}),
|
||||
body: JSON.stringify(body),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
@ -1198,6 +1218,7 @@ export async function textToSpeech(params: {
|
||||
} else {
|
||||
const openaiModelOverride = params.overrides?.openai?.model;
|
||||
const openaiVoiceOverride = params.overrides?.openai?.voice;
|
||||
const openaiInstructionsOverride = params.overrides?.openai?.instructions;
|
||||
audioBuffer = await openaiTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
@ -1205,6 +1226,7 @@ export async function textToSpeech(params: {
|
||||
voice: openaiVoiceOverride ?? config.openai.voice,
|
||||
responseFormat: output.openai,
|
||||
timeoutMs: config.timeoutMs,
|
||||
instructions: openaiInstructionsOverride ?? config.openai.instructions,
|
||||
});
|
||||
}
|
||||
|
||||
@ -1307,6 +1329,7 @@ export async function textToSpeechTelephony(params: {
|
||||
voice: config.openai.voice,
|
||||
responseFormat: output.format,
|
||||
timeoutMs: config.timeoutMs,
|
||||
instructions: config.openai.instructions,
|
||||
});
|
||||
|
||||
return {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user