feat(tts): add instructions parameter for OpenAI gpt-4o-mini-tts
OpenAI's gpt-4o-mini-tts model supports an 'instructions' parameter
for controlling tone, style, accent, speed, and other speech characteristics.
Changes:
- Add 'instructions' to TtsConfig.openai for default instructions
- Add 'allowInstructions' to TtsModelOverrideConfig for policy control
- Support [[tts:instructions=...]] directive (also accepted as 'openai_instructions' or 'openaiinstructions') for per-message overrides
- Only pass instructions when the model is exactly gpt-4o-mini-tts; for any other model the parameter is omitted from the request (other models don't support it)
Usage example:
tts:
  provider: openai
  openai:
    model: gpt-4o-mini-tts
    voice: coral
    instructions: "Speak in a friendly, casual tone"
This commit is contained in:
parent
27174f5d82
commit
904072c4a5
@ -21,6 +21,8 @@ export type TtsModelOverrideConfig = {
|
|||||||
allowNormalization?: boolean;
|
allowNormalization?: boolean;
|
||||||
/** Allow model-provided seed override. */
|
/** Allow model-provided seed override. */
|
||||||
allowSeed?: boolean;
|
allowSeed?: boolean;
|
||||||
|
/** Allow model-provided instructions override (OpenAI gpt-4o-mini-tts). */
|
||||||
|
allowInstructions?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type TtsConfig = {
|
export type TtsConfig = {
|
||||||
@ -58,6 +60,8 @@ export type TtsConfig = {
|
|||||||
apiKey?: string;
|
apiKey?: string;
|
||||||
model?: string;
|
model?: string;
|
||||||
voice?: string;
|
voice?: string;
|
||||||
|
/** Instructions for gpt-4o-mini-tts model to control tone, style, etc. */
|
||||||
|
instructions?: string;
|
||||||
};
|
};
|
||||||
/** Microsoft Edge (node-edge-tts) configuration. */
|
/** Microsoft Edge (node-edge-tts) configuration. */
|
||||||
edge?: {
|
edge?: {
|
||||||
|
|||||||
@ -110,6 +110,7 @@ export type ResolvedTtsConfig = {
|
|||||||
apiKey?: string;
|
apiKey?: string;
|
||||||
model: string;
|
model: string;
|
||||||
voice: string;
|
voice: string;
|
||||||
|
instructions?: string;
|
||||||
};
|
};
|
||||||
edge: {
|
edge: {
|
||||||
enabled: boolean;
|
enabled: boolean;
|
||||||
@ -148,6 +149,7 @@ type ResolvedTtsModelOverrides = {
|
|||||||
allowVoiceSettings: boolean;
|
allowVoiceSettings: boolean;
|
||||||
allowNormalization: boolean;
|
allowNormalization: boolean;
|
||||||
allowSeed: boolean;
|
allowSeed: boolean;
|
||||||
|
allowInstructions: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
type TtsDirectiveOverrides = {
|
type TtsDirectiveOverrides = {
|
||||||
@ -156,6 +158,7 @@ type TtsDirectiveOverrides = {
|
|||||||
openai?: {
|
openai?: {
|
||||||
voice?: string;
|
voice?: string;
|
||||||
model?: string;
|
model?: string;
|
||||||
|
instructions?: string;
|
||||||
};
|
};
|
||||||
elevenlabs?: {
|
elevenlabs?: {
|
||||||
voiceId?: string;
|
voiceId?: string;
|
||||||
@ -230,6 +233,7 @@ function resolveModelOverridePolicy(
|
|||||||
allowVoiceSettings: false,
|
allowVoiceSettings: false,
|
||||||
allowNormalization: false,
|
allowNormalization: false,
|
||||||
allowSeed: false,
|
allowSeed: false,
|
||||||
|
allowInstructions: false,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
const allow = (value?: boolean) => value ?? true;
|
const allow = (value?: boolean) => value ?? true;
|
||||||
@ -242,6 +246,7 @@ function resolveModelOverridePolicy(
|
|||||||
allowVoiceSettings: allow(overrides?.allowVoiceSettings),
|
allowVoiceSettings: allow(overrides?.allowVoiceSettings),
|
||||||
allowNormalization: allow(overrides?.allowNormalization),
|
allowNormalization: allow(overrides?.allowNormalization),
|
||||||
allowSeed: allow(overrides?.allowSeed),
|
allowSeed: allow(overrides?.allowSeed),
|
||||||
|
allowInstructions: allow(overrides?.allowInstructions),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -282,6 +287,7 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
|
|||||||
apiKey: raw.openai?.apiKey,
|
apiKey: raw.openai?.apiKey,
|
||||||
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
|
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
|
||||||
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
|
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
|
||||||
|
instructions: raw.openai?.instructions?.trim() || undefined,
|
||||||
},
|
},
|
||||||
edge: {
|
edge: {
|
||||||
enabled: raw.edge?.enabled ?? true,
|
enabled: raw.edge?.enabled ?? true,
|
||||||
@ -732,6 +738,12 @@ function parseTtsDirectives(
|
|||||||
seed: normalizeSeed(Number.parseInt(rawValue, 10)),
|
seed: normalizeSeed(Number.parseInt(rawValue, 10)),
|
||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
|
case "instructions":
|
||||||
|
case "openai_instructions":
|
||||||
|
case "openaiinstructions":
|
||||||
|
if (!policy.allowInstructions) break;
|
||||||
|
overrides.openai = { ...overrides.openai, instructions: rawValue };
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -997,8 +1009,9 @@ async function openaiTTS(params: {
|
|||||||
voice: string;
|
voice: string;
|
||||||
responseFormat: "mp3" | "opus" | "pcm";
|
responseFormat: "mp3" | "opus" | "pcm";
|
||||||
timeoutMs: number;
|
timeoutMs: number;
|
||||||
|
instructions?: string;
|
||||||
}): Promise<Buffer> {
|
}): Promise<Buffer> {
|
||||||
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
|
const { text, apiKey, model, voice, responseFormat, timeoutMs, instructions } = params;
|
||||||
|
|
||||||
if (!isValidOpenAIModel(model)) {
|
if (!isValidOpenAIModel(model)) {
|
||||||
throw new Error(`Invalid model: ${model}`);
|
throw new Error(`Invalid model: ${model}`);
|
||||||
@ -1010,6 +1023,18 @@ async function openaiTTS(params: {
|
|||||||
const controller = new AbortController();
|
const controller = new AbortController();
|
||||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||||
|
|
||||||
|
// instructions parameter only supported by gpt-4o-mini-tts model
|
||||||
|
const supportsInstructions = model === "gpt-4o-mini-tts";
|
||||||
|
const body: Record<string, unknown> = {
|
||||||
|
model,
|
||||||
|
input: text,
|
||||||
|
voice,
|
||||||
|
response_format: responseFormat,
|
||||||
|
};
|
||||||
|
if (supportsInstructions && instructions) {
|
||||||
|
body.instructions = instructions;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await fetch(`${OPENAI_TTS_BASE_URL}/audio/speech`, {
|
const response = await fetch(`${OPENAI_TTS_BASE_URL}/audio/speech`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
@ -1017,12 +1042,7 @@ async function openaiTTS(params: {
|
|||||||
Authorization: `Bearer ${apiKey}`,
|
Authorization: `Bearer ${apiKey}`,
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
},
|
},
|
||||||
body: JSON.stringify({
|
body: JSON.stringify(body),
|
||||||
model,
|
|
||||||
input: text,
|
|
||||||
voice,
|
|
||||||
response_format: responseFormat,
|
|
||||||
}),
|
|
||||||
signal: controller.signal,
|
signal: controller.signal,
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -1198,6 +1218,7 @@ export async function textToSpeech(params: {
|
|||||||
} else {
|
} else {
|
||||||
const openaiModelOverride = params.overrides?.openai?.model;
|
const openaiModelOverride = params.overrides?.openai?.model;
|
||||||
const openaiVoiceOverride = params.overrides?.openai?.voice;
|
const openaiVoiceOverride = params.overrides?.openai?.voice;
|
||||||
|
const openaiInstructionsOverride = params.overrides?.openai?.instructions;
|
||||||
audioBuffer = await openaiTTS({
|
audioBuffer = await openaiTTS({
|
||||||
text: params.text,
|
text: params.text,
|
||||||
apiKey,
|
apiKey,
|
||||||
@ -1205,6 +1226,7 @@ export async function textToSpeech(params: {
|
|||||||
voice: openaiVoiceOverride ?? config.openai.voice,
|
voice: openaiVoiceOverride ?? config.openai.voice,
|
||||||
responseFormat: output.openai,
|
responseFormat: output.openai,
|
||||||
timeoutMs: config.timeoutMs,
|
timeoutMs: config.timeoutMs,
|
||||||
|
instructions: openaiInstructionsOverride ?? config.openai.instructions,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1307,6 +1329,7 @@ export async function textToSpeechTelephony(params: {
|
|||||||
voice: config.openai.voice,
|
voice: config.openai.voice,
|
||||||
responseFormat: output.format,
|
responseFormat: output.format,
|
||||||
timeoutMs: config.timeoutMs,
|
timeoutMs: config.timeoutMs,
|
||||||
|
instructions: config.openai.instructions,
|
||||||
});
|
});
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user